diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,350034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.4340240102082447, + "eval_steps": 0, + "global_step": 50000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 8.680480204164894e-06, + "grad_norm": 15.5625, + "learning_rate": 0.0, + "loss": 1.2891, + "step": 1 + }, + { + "epoch": 1.736096040832979e-05, + "grad_norm": 22.625, + "learning_rate": 2e-06, + "loss": 1.6406, + "step": 2 + }, + { + "epoch": 2.6041440612494685e-05, + "grad_norm": 10.75, + "learning_rate": 4e-06, + "loss": 1.1016, + "step": 3 + }, + { + "epoch": 3.472192081665958e-05, + "grad_norm": 10.6875, + "learning_rate": 6e-06, + "loss": 0.9922, + "step": 4 + }, + { + "epoch": 4.3402401020824474e-05, + "grad_norm": 18.125, + "learning_rate": 8e-06, + "loss": 1.4375, + "step": 5 + }, + { + "epoch": 5.208288122498937e-05, + "grad_norm": 10.1875, + "learning_rate": 1e-05, + "loss": 1.0, + "step": 6 + }, + { + "epoch": 6.076336142915426e-05, + "grad_norm": 12.9375, + "learning_rate": 1.2e-05, + "loss": 1.2344, + "step": 7 + }, + { + "epoch": 6.944384163331915e-05, + "grad_norm": 15.8125, + "learning_rate": 1.4e-05, + "loss": 1.1875, + "step": 8 + }, + { + "epoch": 7.812432183748404e-05, + "grad_norm": 13.3125, + "learning_rate": 1.6e-05, + "loss": 1.0469, + "step": 9 + }, + { + "epoch": 8.680480204164895e-05, + "grad_norm": 11.375, + "learning_rate": 1.8e-05, + "loss": 0.9844, + "step": 10 + }, + { + "epoch": 9.548528224581384e-05, + "grad_norm": 11.125, + "learning_rate": 2e-05, + "loss": 1.0391, + "step": 11 + }, + { + "epoch": 0.00010416576244997874, + "grad_norm": 12.0625, + "learning_rate": 2.2e-05, + "loss": 1.0469, + "step": 12 + }, + { + "epoch": 0.00011284624265414363, + "grad_norm": 7.25, + "learning_rate": 2.4e-05, + "loss": 0.8203, + "step": 13 + }, + { + "epoch": 0.00012152672285830852, + "grad_norm": 18.5, + "learning_rate": 2.6e-05, + "loss": 1.0312, + "step": 14 + }, + { + "epoch": 0.0001302072030624734, + "grad_norm": 7.4375, + "learning_rate": 2.8e-05, + "loss": 0.9141, + "step": 15 + }, + { + "epoch": 0.0001388876832666383, + "grad_norm": 4.5625, + "learning_rate": 3e-05, + "loss": 0.7305, + "step": 16 + }, + { + "epoch": 0.0001475681634708032, + "grad_norm": 4.15625, + "learning_rate": 3.2e-05, + "loss": 0.7031, + "step": 17 + }, + { + "epoch": 0.0001562486436749681, + "grad_norm": 4.6875, + "learning_rate": 3.4000000000000007e-05, + "loss": 0.5977, + "step": 18 + }, + { + "epoch": 0.000164929123879133, + "grad_norm": 3.75, + "learning_rate": 3.6e-05, + "loss": 0.625, + "step": 19 + }, + { + "epoch": 0.0001736096040832979, + "grad_norm": 2.65625, + "learning_rate": 3.8e-05, + "loss": 0.5898, + "step": 20 + }, + { + "epoch": 0.00018229008428746277, + "grad_norm": 2.25, + "learning_rate": 4e-05, + "loss": 0.582, + "step": 21 + }, + { + "epoch": 0.00019097056449162767, + "grad_norm": 2.65625, + "learning_rate": 4.2000000000000004e-05, + "loss": 0.5977, + "step": 22 + }, + { + "epoch": 0.00019965104469579258, + "grad_norm": 2.09375, + "learning_rate": 4.4e-05, + "loss": 0.4551, + "step": 23 + }, + { + "epoch": 0.00020833152489995748, + "grad_norm": 1.625, + "learning_rate": 4.6e-05, + "loss": 0.4844, + "step": 24 + }, + { + "epoch": 0.00021701200510412235, + "grad_norm": 1.7421875, + "learning_rate": 4.8e-05, + "loss": 0.5156, + "step": 25 + }, + 
{ + "epoch": 0.00022569248530828726, + "grad_norm": 3.703125, + "learning_rate": 5e-05, + "loss": 0.4688, + "step": 26 + }, + { + "epoch": 0.00023437296551245216, + "grad_norm": 1.296875, + "learning_rate": 5.2e-05, + "loss": 0.457, + "step": 27 + }, + { + "epoch": 0.00024305344571661704, + "grad_norm": 2.0625, + "learning_rate": 5.4e-05, + "loss": 0.5938, + "step": 28 + }, + { + "epoch": 0.0002517339259207819, + "grad_norm": 1.2890625, + "learning_rate": 5.6e-05, + "loss": 0.4648, + "step": 29 + }, + { + "epoch": 0.0002604144061249468, + "grad_norm": 1.125, + "learning_rate": 5.800000000000001e-05, + "loss": 0.4805, + "step": 30 + }, + { + "epoch": 0.0002690948863291117, + "grad_norm": 1.265625, + "learning_rate": 6e-05, + "loss": 0.5234, + "step": 31 + }, + { + "epoch": 0.0002777753665332766, + "grad_norm": 1.0078125, + "learning_rate": 6.2e-05, + "loss": 0.4297, + "step": 32 + }, + { + "epoch": 0.0002864558467374415, + "grad_norm": 1.0078125, + "learning_rate": 6.4e-05, + "loss": 0.3555, + "step": 33 + }, + { + "epoch": 0.0002951363269416064, + "grad_norm": 0.984375, + "learning_rate": 6.6e-05, + "loss": 0.3633, + "step": 34 + }, + { + "epoch": 0.00030381680714577133, + "grad_norm": 0.78125, + "learning_rate": 6.800000000000001e-05, + "loss": 0.3398, + "step": 35 + }, + { + "epoch": 0.0003124972873499362, + "grad_norm": 0.87890625, + "learning_rate": 7.000000000000001e-05, + "loss": 0.3574, + "step": 36 + }, + { + "epoch": 0.0003211777675541011, + "grad_norm": 0.6015625, + "learning_rate": 7.2e-05, + "loss": 0.3145, + "step": 37 + }, + { + "epoch": 0.000329858247758266, + "grad_norm": 0.6875, + "learning_rate": 7.4e-05, + "loss": 0.3906, + "step": 38 + }, + { + "epoch": 0.0003385387279624309, + "grad_norm": 0.73046875, + "learning_rate": 7.6e-05, + "loss": 0.3477, + "step": 39 + }, + { + "epoch": 0.0003472192081665958, + "grad_norm": 0.71484375, + "learning_rate": 7.8e-05, + "loss": 0.3477, + "step": 40 + }, + { + "epoch": 0.0003558996883707607, + "grad_norm": 0.59765625, + "learning_rate": 8e-05, + "loss": 0.3711, + "step": 41 + }, + { + "epoch": 0.00036458016857492554, + "grad_norm": 0.5546875, + "learning_rate": 8.2e-05, + "loss": 0.3398, + "step": 42 + }, + { + "epoch": 0.00037326064877909044, + "grad_norm": 0.5703125, + "learning_rate": 8.400000000000001e-05, + "loss": 0.3145, + "step": 43 + }, + { + "epoch": 0.00038194112898325535, + "grad_norm": 0.55078125, + "learning_rate": 8.599999999999999e-05, + "loss": 0.332, + "step": 44 + }, + { + "epoch": 0.00039062160918742025, + "grad_norm": 0.671875, + "learning_rate": 8.8e-05, + "loss": 0.4297, + "step": 45 + }, + { + "epoch": 0.00039930208939158515, + "grad_norm": 0.68359375, + "learning_rate": 8.999999999999999e-05, + "loss": 0.3945, + "step": 46 + }, + { + "epoch": 0.00040798256959575005, + "grad_norm": 0.482421875, + "learning_rate": 9.2e-05, + "loss": 0.3066, + "step": 47 + }, + { + "epoch": 0.00041666304979991496, + "grad_norm": 0.5703125, + "learning_rate": 9.400000000000001e-05, + "loss": 0.2988, + "step": 48 + }, + { + "epoch": 0.0004253435300040798, + "grad_norm": 0.4609375, + "learning_rate": 9.6e-05, + "loss": 0.3203, + "step": 49 + }, + { + "epoch": 0.0004340240102082447, + "grad_norm": 0.6015625, + "learning_rate": 9.800000000000001e-05, + "loss": 0.3828, + "step": 50 + }, + { + "epoch": 0.0004427044904124096, + "grad_norm": 0.48828125, + "learning_rate": 0.0001, + "loss": 0.2773, + "step": 51 + }, + { + "epoch": 0.0004513849706165745, + "grad_norm": 0.392578125, + "learning_rate": 0.000102, + "loss": 0.3672, + "step": 
52 + }, + { + "epoch": 0.0004600654508207394, + "grad_norm": 0.5859375, + "learning_rate": 0.000104, + "loss": 0.3965, + "step": 53 + }, + { + "epoch": 0.0004687459310249043, + "grad_norm": 0.70703125, + "learning_rate": 0.000106, + "loss": 0.6133, + "step": 54 + }, + { + "epoch": 0.00047742641122906917, + "grad_norm": 0.357421875, + "learning_rate": 0.000108, + "loss": 0.3184, + "step": 55 + }, + { + "epoch": 0.00048610689143323407, + "grad_norm": 0.5703125, + "learning_rate": 0.00011, + "loss": 0.4648, + "step": 56 + }, + { + "epoch": 0.000494787371637399, + "grad_norm": 0.408203125, + "learning_rate": 0.000112, + "loss": 0.3496, + "step": 57 + }, + { + "epoch": 0.0005034678518415638, + "grad_norm": 0.4375, + "learning_rate": 0.000114, + "loss": 0.3008, + "step": 58 + }, + { + "epoch": 0.0005121483320457287, + "grad_norm": 0.86328125, + "learning_rate": 0.00011600000000000001, + "loss": 0.3867, + "step": 59 + }, + { + "epoch": 0.0005208288122498936, + "grad_norm": 0.455078125, + "learning_rate": 0.000118, + "loss": 0.3359, + "step": 60 + }, + { + "epoch": 0.0005295092924540585, + "grad_norm": 0.412109375, + "learning_rate": 0.00012, + "loss": 0.3926, + "step": 61 + }, + { + "epoch": 0.0005381897726582234, + "grad_norm": 0.361328125, + "learning_rate": 0.000122, + "loss": 0.375, + "step": 62 + }, + { + "epoch": 0.0005468702528623883, + "grad_norm": 0.51171875, + "learning_rate": 0.000124, + "loss": 0.3574, + "step": 63 + }, + { + "epoch": 0.0005555507330665532, + "grad_norm": 0.3125, + "learning_rate": 0.000126, + "loss": 0.2695, + "step": 64 + }, + { + "epoch": 0.0005642312132707181, + "grad_norm": 0.408203125, + "learning_rate": 0.000128, + "loss": 0.3594, + "step": 65 + }, + { + "epoch": 0.000572911693474883, + "grad_norm": 0.3984375, + "learning_rate": 0.00013000000000000002, + "loss": 0.3438, + "step": 66 + }, + { + "epoch": 0.000581592173679048, + "grad_norm": 0.369140625, + "learning_rate": 0.000132, + "loss": 0.3047, + "step": 67 + }, + { + "epoch": 0.0005902726538832129, + "grad_norm": 0.306640625, + "learning_rate": 0.000134, + "loss": 0.2773, + "step": 68 + }, + { + "epoch": 0.0005989531340873778, + "grad_norm": 0.443359375, + "learning_rate": 0.00013600000000000003, + "loss": 0.3242, + "step": 69 + }, + { + "epoch": 0.0006076336142915427, + "grad_norm": 0.380859375, + "learning_rate": 0.00013800000000000002, + "loss": 0.3203, + "step": 70 + }, + { + "epoch": 0.0006163140944957075, + "grad_norm": 0.314453125, + "learning_rate": 0.00014000000000000001, + "loss": 0.3047, + "step": 71 + }, + { + "epoch": 0.0006249945746998724, + "grad_norm": 0.5703125, + "learning_rate": 0.00014199999999999998, + "loss": 0.3887, + "step": 72 + }, + { + "epoch": 0.0006336750549040373, + "grad_norm": 0.3828125, + "learning_rate": 0.000144, + "loss": 0.332, + "step": 73 + }, + { + "epoch": 0.0006423555351082022, + "grad_norm": 0.35546875, + "learning_rate": 0.000146, + "loss": 0.3203, + "step": 74 + }, + { + "epoch": 0.0006510360153123671, + "grad_norm": 0.369140625, + "learning_rate": 0.000148, + "loss": 0.3555, + "step": 75 + }, + { + "epoch": 0.000659716495516532, + "grad_norm": 0.4765625, + "learning_rate": 0.00015, + "loss": 0.377, + "step": 76 + }, + { + "epoch": 0.0006683969757206969, + "grad_norm": 0.376953125, + "learning_rate": 0.000152, + "loss": 0.2656, + "step": 77 + }, + { + "epoch": 0.0006770774559248618, + "grad_norm": 0.484375, + "learning_rate": 0.000154, + "loss": 0.4102, + "step": 78 + }, + { + "epoch": 0.0006857579361290267, + "grad_norm": 0.46875, + "learning_rate": 0.000156, + 
"loss": 0.2949, + "step": 79 + }, + { + "epoch": 0.0006944384163331916, + "grad_norm": 0.306640625, + "learning_rate": 0.000158, + "loss": 0.3906, + "step": 80 + }, + { + "epoch": 0.0007031188965373565, + "grad_norm": 0.294921875, + "learning_rate": 0.00016, + "loss": 0.332, + "step": 81 + }, + { + "epoch": 0.0007117993767415214, + "grad_norm": 0.65234375, + "learning_rate": 0.000162, + "loss": 0.3164, + "step": 82 + }, + { + "epoch": 0.0007204798569456863, + "grad_norm": 0.296875, + "learning_rate": 0.000164, + "loss": 0.3535, + "step": 83 + }, + { + "epoch": 0.0007291603371498511, + "grad_norm": 0.400390625, + "learning_rate": 0.00016600000000000002, + "loss": 0.3438, + "step": 84 + }, + { + "epoch": 0.000737840817354016, + "grad_norm": 0.314453125, + "learning_rate": 0.00016800000000000002, + "loss": 0.3066, + "step": 85 + }, + { + "epoch": 0.0007465212975581809, + "grad_norm": 0.318359375, + "learning_rate": 0.00017, + "loss": 0.3262, + "step": 86 + }, + { + "epoch": 0.0007552017777623458, + "grad_norm": 0.404296875, + "learning_rate": 0.00017199999999999998, + "loss": 0.334, + "step": 87 + }, + { + "epoch": 0.0007638822579665107, + "grad_norm": 0.4296875, + "learning_rate": 0.000174, + "loss": 0.3242, + "step": 88 + }, + { + "epoch": 0.0007725627381706756, + "grad_norm": 0.310546875, + "learning_rate": 0.000176, + "loss": 0.2539, + "step": 89 + }, + { + "epoch": 0.0007812432183748405, + "grad_norm": 0.271484375, + "learning_rate": 0.000178, + "loss": 0.3184, + "step": 90 + }, + { + "epoch": 0.0007899236985790054, + "grad_norm": 0.34375, + "learning_rate": 0.00017999999999999998, + "loss": 0.3398, + "step": 91 + }, + { + "epoch": 0.0007986041787831703, + "grad_norm": 0.287109375, + "learning_rate": 0.000182, + "loss": 0.3516, + "step": 92 + }, + { + "epoch": 0.0008072846589873352, + "grad_norm": 0.328125, + "learning_rate": 0.000184, + "loss": 0.2852, + "step": 93 + }, + { + "epoch": 0.0008159651391915001, + "grad_norm": 0.357421875, + "learning_rate": 0.000186, + "loss": 0.3184, + "step": 94 + }, + { + "epoch": 0.000824645619395665, + "grad_norm": 0.404296875, + "learning_rate": 0.00018800000000000002, + "loss": 0.2871, + "step": 95 + }, + { + "epoch": 0.0008333260995998299, + "grad_norm": 0.267578125, + "learning_rate": 0.00019, + "loss": 0.2637, + "step": 96 + }, + { + "epoch": 0.0008420065798039947, + "grad_norm": 0.318359375, + "learning_rate": 0.000192, + "loss": 0.2559, + "step": 97 + }, + { + "epoch": 0.0008506870600081596, + "grad_norm": 0.36328125, + "learning_rate": 0.000194, + "loss": 0.4199, + "step": 98 + }, + { + "epoch": 0.0008593675402123245, + "grad_norm": 0.5703125, + "learning_rate": 0.00019600000000000002, + "loss": 0.3438, + "step": 99 + }, + { + "epoch": 0.0008680480204164894, + "grad_norm": 0.38671875, + "learning_rate": 0.00019800000000000002, + "loss": 0.3398, + "step": 100 + }, + { + "epoch": 0.0008767285006206543, + "grad_norm": 0.625, + "learning_rate": 0.0002, + "loss": 0.3555, + "step": 101 + }, + { + "epoch": 0.0008854089808248192, + "grad_norm": 0.3359375, + "learning_rate": 0.000202, + "loss": 0.249, + "step": 102 + }, + { + "epoch": 0.0008940894610289841, + "grad_norm": 0.6015625, + "learning_rate": 0.000204, + "loss": 0.3535, + "step": 103 + }, + { + "epoch": 0.000902769941233149, + "grad_norm": 0.314453125, + "learning_rate": 0.000206, + "loss": 0.3887, + "step": 104 + }, + { + "epoch": 0.0009114504214373139, + "grad_norm": 0.255859375, + "learning_rate": 0.000208, + "loss": 0.2383, + "step": 105 + }, + { + "epoch": 0.0009201309016414788, + 
"grad_norm": 0.314453125, + "learning_rate": 0.00021, + "loss": 0.3086, + "step": 106 + }, + { + "epoch": 0.0009288113818456437, + "grad_norm": 0.375, + "learning_rate": 0.000212, + "loss": 0.3809, + "step": 107 + }, + { + "epoch": 0.0009374918620498086, + "grad_norm": 0.333984375, + "learning_rate": 0.000214, + "loss": 0.2969, + "step": 108 + }, + { + "epoch": 0.0009461723422539735, + "grad_norm": 0.291015625, + "learning_rate": 0.000216, + "loss": 0.3691, + "step": 109 + }, + { + "epoch": 0.0009548528224581383, + "grad_norm": 0.271484375, + "learning_rate": 0.000218, + "loss": 0.332, + "step": 110 + }, + { + "epoch": 0.0009635333026623032, + "grad_norm": 0.400390625, + "learning_rate": 0.00022, + "loss": 0.3828, + "step": 111 + }, + { + "epoch": 0.0009722137828664681, + "grad_norm": 0.263671875, + "learning_rate": 0.000222, + "loss": 0.4023, + "step": 112 + }, + { + "epoch": 0.0009808942630706332, + "grad_norm": 0.333984375, + "learning_rate": 0.000224, + "loss": 0.3145, + "step": 113 + }, + { + "epoch": 0.000989574743274798, + "grad_norm": 0.3203125, + "learning_rate": 0.00022600000000000002, + "loss": 0.3945, + "step": 114 + }, + { + "epoch": 0.000998255223478963, + "grad_norm": 0.23046875, + "learning_rate": 0.000228, + "loss": 0.2969, + "step": 115 + }, + { + "epoch": 0.0010069357036831276, + "grad_norm": 0.32421875, + "learning_rate": 0.00023, + "loss": 0.3438, + "step": 116 + }, + { + "epoch": 0.0010156161838872925, + "grad_norm": 0.59375, + "learning_rate": 0.00023200000000000003, + "loss": 0.6211, + "step": 117 + }, + { + "epoch": 0.0010242966640914575, + "grad_norm": 0.35546875, + "learning_rate": 0.00023400000000000002, + "loss": 0.3145, + "step": 118 + }, + { + "epoch": 0.0010329771442956224, + "grad_norm": 0.30859375, + "learning_rate": 0.000236, + "loss": 0.3477, + "step": 119 + }, + { + "epoch": 0.0010416576244997873, + "grad_norm": 0.265625, + "learning_rate": 0.00023799999999999998, + "loss": 0.3789, + "step": 120 + }, + { + "epoch": 0.0010503381047039522, + "grad_norm": 0.21875, + "learning_rate": 0.00024, + "loss": 0.2422, + "step": 121 + }, + { + "epoch": 0.001059018584908117, + "grad_norm": 0.345703125, + "learning_rate": 0.000242, + "loss": 0.2988, + "step": 122 + }, + { + "epoch": 0.001067699065112282, + "grad_norm": 0.318359375, + "learning_rate": 0.000244, + "loss": 0.2949, + "step": 123 + }, + { + "epoch": 0.0010763795453164469, + "grad_norm": 0.2392578125, + "learning_rate": 0.000246, + "loss": 0.2451, + "step": 124 + }, + { + "epoch": 0.0010850600255206118, + "grad_norm": 0.296875, + "learning_rate": 0.000248, + "loss": 0.291, + "step": 125 + }, + { + "epoch": 0.0010937405057247767, + "grad_norm": 0.30078125, + "learning_rate": 0.00025, + "loss": 0.3516, + "step": 126 + }, + { + "epoch": 0.0011024209859289416, + "grad_norm": 0.296875, + "learning_rate": 0.000252, + "loss": 0.3301, + "step": 127 + }, + { + "epoch": 0.0011111014661331065, + "grad_norm": 0.251953125, + "learning_rate": 0.000254, + "loss": 0.252, + "step": 128 + }, + { + "epoch": 0.0011197819463372714, + "grad_norm": 0.228515625, + "learning_rate": 0.000256, + "loss": 0.2539, + "step": 129 + }, + { + "epoch": 0.0011284624265414363, + "grad_norm": 0.76953125, + "learning_rate": 0.00025800000000000004, + "loss": 0.3535, + "step": 130 + }, + { + "epoch": 0.0011371429067456012, + "grad_norm": 0.296875, + "learning_rate": 0.00026000000000000003, + "loss": 0.3242, + "step": 131 + }, + { + "epoch": 0.001145823386949766, + "grad_norm": 0.380859375, + "learning_rate": 0.000262, + "loss": 0.3184, + "step": 
132 + }, + { + "epoch": 0.001154503867153931, + "grad_norm": 0.427734375, + "learning_rate": 0.000264, + "loss": 0.5039, + "step": 133 + }, + { + "epoch": 0.001163184347358096, + "grad_norm": 0.27734375, + "learning_rate": 0.000266, + "loss": 0.2656, + "step": 134 + }, + { + "epoch": 0.0011718648275622608, + "grad_norm": 0.314453125, + "learning_rate": 0.000268, + "loss": 0.3652, + "step": 135 + }, + { + "epoch": 0.0011805453077664257, + "grad_norm": 0.255859375, + "learning_rate": 0.00027, + "loss": 0.3086, + "step": 136 + }, + { + "epoch": 0.0011892257879705906, + "grad_norm": 0.275390625, + "learning_rate": 0.00027200000000000005, + "loss": 0.3145, + "step": 137 + }, + { + "epoch": 0.0011979062681747555, + "grad_norm": 0.353515625, + "learning_rate": 0.00027400000000000005, + "loss": 0.4375, + "step": 138 + }, + { + "epoch": 0.0012065867483789204, + "grad_norm": 0.2578125, + "learning_rate": 0.00027600000000000004, + "loss": 0.3203, + "step": 139 + }, + { + "epoch": 0.0012152672285830853, + "grad_norm": 0.2275390625, + "learning_rate": 0.00027800000000000004, + "loss": 0.2949, + "step": 140 + }, + { + "epoch": 0.0012239477087872502, + "grad_norm": 0.322265625, + "learning_rate": 0.00028000000000000003, + "loss": 0.3613, + "step": 141 + }, + { + "epoch": 0.001232628188991415, + "grad_norm": 0.337890625, + "learning_rate": 0.00028199999999999997, + "loss": 0.3828, + "step": 142 + }, + { + "epoch": 0.0012413086691955798, + "grad_norm": 0.21484375, + "learning_rate": 0.00028399999999999996, + "loss": 0.3125, + "step": 143 + }, + { + "epoch": 0.0012499891493997447, + "grad_norm": 0.248046875, + "learning_rate": 0.00028599999999999996, + "loss": 0.2773, + "step": 144 + }, + { + "epoch": 0.0012586696296039096, + "grad_norm": 0.296875, + "learning_rate": 0.000288, + "loss": 0.293, + "step": 145 + }, + { + "epoch": 0.0012673501098080745, + "grad_norm": 0.2333984375, + "learning_rate": 0.00029, + "loss": 0.2461, + "step": 146 + }, + { + "epoch": 0.0012760305900122394, + "grad_norm": 0.2255859375, + "learning_rate": 0.000292, + "loss": 0.2461, + "step": 147 + }, + { + "epoch": 0.0012847110702164043, + "grad_norm": 0.27734375, + "learning_rate": 0.000294, + "loss": 0.3887, + "step": 148 + }, + { + "epoch": 0.0012933915504205692, + "grad_norm": 0.2373046875, + "learning_rate": 0.000296, + "loss": 0.3105, + "step": 149 + }, + { + "epoch": 0.0013020720306247341, + "grad_norm": 0.1806640625, + "learning_rate": 0.000298, + "loss": 0.2402, + "step": 150 + }, + { + "epoch": 0.001310752510828899, + "grad_norm": 0.255859375, + "learning_rate": 0.0003, + "loss": 0.2734, + "step": 151 + }, + { + "epoch": 0.001319432991033064, + "grad_norm": 0.259765625, + "learning_rate": 0.000302, + "loss": 0.3125, + "step": 152 + }, + { + "epoch": 0.0013281134712372288, + "grad_norm": 0.251953125, + "learning_rate": 0.000304, + "loss": 0.3223, + "step": 153 + }, + { + "epoch": 0.0013367939514413937, + "grad_norm": 0.23828125, + "learning_rate": 0.000306, + "loss": 0.2695, + "step": 154 + }, + { + "epoch": 0.0013454744316455586, + "grad_norm": 0.19921875, + "learning_rate": 0.000308, + "loss": 0.2441, + "step": 155 + }, + { + "epoch": 0.0013541549118497235, + "grad_norm": 0.244140625, + "learning_rate": 0.00031, + "loss": 0.375, + "step": 156 + }, + { + "epoch": 0.0013628353920538884, + "grad_norm": 0.3125, + "learning_rate": 0.000312, + "loss": 0.3242, + "step": 157 + }, + { + "epoch": 0.0013715158722580533, + "grad_norm": 0.24609375, + "learning_rate": 0.000314, + "loss": 0.293, + "step": 158 + }, + { + "epoch": 
0.0013801963524622182, + "grad_norm": 0.2421875, + "learning_rate": 0.000316, + "loss": 0.3027, + "step": 159 + }, + { + "epoch": 0.0013888768326663832, + "grad_norm": 0.21484375, + "learning_rate": 0.00031800000000000003, + "loss": 0.2217, + "step": 160 + }, + { + "epoch": 0.001397557312870548, + "grad_norm": 0.265625, + "learning_rate": 0.00032, + "loss": 0.3691, + "step": 161 + }, + { + "epoch": 0.001406237793074713, + "grad_norm": 0.29296875, + "learning_rate": 0.000322, + "loss": 0.2891, + "step": 162 + }, + { + "epoch": 0.0014149182732788779, + "grad_norm": 0.2353515625, + "learning_rate": 0.000324, + "loss": 0.3301, + "step": 163 + }, + { + "epoch": 0.0014235987534830428, + "grad_norm": 0.2421875, + "learning_rate": 0.000326, + "loss": 0.334, + "step": 164 + }, + { + "epoch": 0.0014322792336872077, + "grad_norm": 0.23828125, + "learning_rate": 0.000328, + "loss": 0.2988, + "step": 165 + }, + { + "epoch": 0.0014409597138913726, + "grad_norm": 0.2177734375, + "learning_rate": 0.00033, + "loss": 0.2578, + "step": 166 + }, + { + "epoch": 0.0014496401940955375, + "grad_norm": 0.2734375, + "learning_rate": 0.00033200000000000005, + "loss": 0.3008, + "step": 167 + }, + { + "epoch": 0.0014583206742997022, + "grad_norm": 0.25, + "learning_rate": 0.00033400000000000004, + "loss": 0.293, + "step": 168 + }, + { + "epoch": 0.001467001154503867, + "grad_norm": 0.255859375, + "learning_rate": 0.00033600000000000004, + "loss": 0.2832, + "step": 169 + }, + { + "epoch": 0.001475681634708032, + "grad_norm": 0.25390625, + "learning_rate": 0.00033800000000000003, + "loss": 0.2676, + "step": 170 + }, + { + "epoch": 0.0014843621149121969, + "grad_norm": 0.2578125, + "learning_rate": 0.00034, + "loss": 0.2734, + "step": 171 + }, + { + "epoch": 0.0014930425951163618, + "grad_norm": 0.251953125, + "learning_rate": 0.000342, + "loss": 0.3125, + "step": 172 + }, + { + "epoch": 0.0015017230753205267, + "grad_norm": 0.333984375, + "learning_rate": 0.00034399999999999996, + "loss": 0.3008, + "step": 173 + }, + { + "epoch": 0.0015104035555246916, + "grad_norm": 0.2890625, + "learning_rate": 0.000346, + "loss": 0.332, + "step": 174 + }, + { + "epoch": 0.0015190840357288565, + "grad_norm": 0.26953125, + "learning_rate": 0.000348, + "loss": 0.2539, + "step": 175 + }, + { + "epoch": 0.0015277645159330214, + "grad_norm": 0.478515625, + "learning_rate": 0.00035, + "loss": 0.3203, + "step": 176 + }, + { + "epoch": 0.0015364449961371863, + "grad_norm": 0.302734375, + "learning_rate": 0.000352, + "loss": 0.3066, + "step": 177 + }, + { + "epoch": 0.0015451254763413512, + "grad_norm": 0.330078125, + "learning_rate": 0.000354, + "loss": 0.3047, + "step": 178 + }, + { + "epoch": 0.001553805956545516, + "grad_norm": 0.265625, + "learning_rate": 0.000356, + "loss": 0.3184, + "step": 179 + }, + { + "epoch": 0.001562486436749681, + "grad_norm": 0.251953125, + "learning_rate": 0.000358, + "loss": 0.3105, + "step": 180 + }, + { + "epoch": 0.001571166916953846, + "grad_norm": 0.3671875, + "learning_rate": 0.00035999999999999997, + "loss": 0.3125, + "step": 181 + }, + { + "epoch": 0.0015798473971580108, + "grad_norm": 0.201171875, + "learning_rate": 0.000362, + "loss": 0.3223, + "step": 182 + }, + { + "epoch": 0.0015885278773621757, + "grad_norm": 0.3671875, + "learning_rate": 0.000364, + "loss": 0.4199, + "step": 183 + }, + { + "epoch": 0.0015972083575663406, + "grad_norm": 0.248046875, + "learning_rate": 0.000366, + "loss": 0.3281, + "step": 184 + }, + { + "epoch": 0.0016058888377705055, + "grad_norm": 0.228515625, + 
"learning_rate": 0.000368, + "loss": 0.2891, + "step": 185 + }, + { + "epoch": 0.0016145693179746704, + "grad_norm": 0.23828125, + "learning_rate": 0.00037, + "loss": 0.3555, + "step": 186 + }, + { + "epoch": 0.0016232497981788353, + "grad_norm": 0.294921875, + "learning_rate": 0.000372, + "loss": 0.3457, + "step": 187 + }, + { + "epoch": 0.0016319302783830002, + "grad_norm": 0.2578125, + "learning_rate": 0.000374, + "loss": 0.2988, + "step": 188 + }, + { + "epoch": 0.0016406107585871651, + "grad_norm": 0.25390625, + "learning_rate": 0.00037600000000000003, + "loss": 0.3242, + "step": 189 + }, + { + "epoch": 0.00164929123879133, + "grad_norm": 0.27734375, + "learning_rate": 0.000378, + "loss": 0.2949, + "step": 190 + }, + { + "epoch": 0.001657971718995495, + "grad_norm": 0.2578125, + "learning_rate": 0.00038, + "loss": 0.3672, + "step": 191 + }, + { + "epoch": 0.0016666521991996598, + "grad_norm": 0.19921875, + "learning_rate": 0.000382, + "loss": 0.3047, + "step": 192 + }, + { + "epoch": 0.0016753326794038245, + "grad_norm": 0.16796875, + "learning_rate": 0.000384, + "loss": 0.2715, + "step": 193 + }, + { + "epoch": 0.0016840131596079894, + "grad_norm": 0.2734375, + "learning_rate": 0.000386, + "loss": 0.3086, + "step": 194 + }, + { + "epoch": 0.0016926936398121543, + "grad_norm": 0.28515625, + "learning_rate": 0.000388, + "loss": 0.2617, + "step": 195 + }, + { + "epoch": 0.0017013741200163192, + "grad_norm": 0.2412109375, + "learning_rate": 0.00039000000000000005, + "loss": 0.3262, + "step": 196 + }, + { + "epoch": 0.0017100546002204841, + "grad_norm": 0.2373046875, + "learning_rate": 0.00039200000000000004, + "loss": 0.252, + "step": 197 + }, + { + "epoch": 0.001718735080424649, + "grad_norm": 0.228515625, + "learning_rate": 0.00039400000000000004, + "loss": 0.3047, + "step": 198 + }, + { + "epoch": 0.001727415560628814, + "grad_norm": 0.201171875, + "learning_rate": 0.00039600000000000003, + "loss": 0.332, + "step": 199 + }, + { + "epoch": 0.0017360960408329788, + "grad_norm": 0.2421875, + "learning_rate": 0.000398, + "loss": 0.2695, + "step": 200 + }, + { + "epoch": 0.0017447765210371437, + "grad_norm": 0.287109375, + "learning_rate": 0.0004, + "loss": 0.3867, + "step": 201 + }, + { + "epoch": 0.0017534570012413086, + "grad_norm": 0.203125, + "learning_rate": 0.000402, + "loss": 0.2598, + "step": 202 + }, + { + "epoch": 0.0017621374814454735, + "grad_norm": 0.2431640625, + "learning_rate": 0.000404, + "loss": 0.3203, + "step": 203 + }, + { + "epoch": 0.0017708179616496384, + "grad_norm": 0.251953125, + "learning_rate": 0.00040600000000000006, + "loss": 0.3438, + "step": 204 + }, + { + "epoch": 0.0017794984418538033, + "grad_norm": 0.1923828125, + "learning_rate": 0.000408, + "loss": 0.248, + "step": 205 + }, + { + "epoch": 0.0017881789220579682, + "grad_norm": 0.216796875, + "learning_rate": 0.00041, + "loss": 0.2852, + "step": 206 + }, + { + "epoch": 0.0017968594022621332, + "grad_norm": 0.287109375, + "learning_rate": 0.000412, + "loss": 0.3652, + "step": 207 + }, + { + "epoch": 0.001805539882466298, + "grad_norm": 0.265625, + "learning_rate": 0.000414, + "loss": 0.3359, + "step": 208 + }, + { + "epoch": 0.001814220362670463, + "grad_norm": 0.29296875, + "learning_rate": 0.000416, + "loss": 0.2773, + "step": 209 + }, + { + "epoch": 0.0018229008428746279, + "grad_norm": 0.2392578125, + "learning_rate": 0.00041799999999999997, + "loss": 0.3047, + "step": 210 + }, + { + "epoch": 0.0018315813230787928, + "grad_norm": 0.24609375, + "learning_rate": 0.00042, + "loss": 0.3613, + "step": 
211 + }, + { + "epoch": 0.0018402618032829577, + "grad_norm": 0.2470703125, + "learning_rate": 0.000422, + "loss": 0.2676, + "step": 212 + }, + { + "epoch": 0.0018489422834871226, + "grad_norm": 0.2431640625, + "learning_rate": 0.000424, + "loss": 0.3633, + "step": 213 + }, + { + "epoch": 0.0018576227636912875, + "grad_norm": 0.5234375, + "learning_rate": 0.000426, + "loss": 0.4062, + "step": 214 + }, + { + "epoch": 0.0018663032438954524, + "grad_norm": 0.306640625, + "learning_rate": 0.000428, + "loss": 0.3223, + "step": 215 + }, + { + "epoch": 0.0018749837240996173, + "grad_norm": 0.2490234375, + "learning_rate": 0.00043, + "loss": 0.293, + "step": 216 + }, + { + "epoch": 0.0018836642043037822, + "grad_norm": 0.22265625, + "learning_rate": 0.000432, + "loss": 0.3086, + "step": 217 + }, + { + "epoch": 0.001892344684507947, + "grad_norm": 0.1904296875, + "learning_rate": 0.00043400000000000003, + "loss": 0.2656, + "step": 218 + }, + { + "epoch": 0.0019010251647121118, + "grad_norm": 0.1982421875, + "learning_rate": 0.000436, + "loss": 0.248, + "step": 219 + }, + { + "epoch": 0.0019097056449162767, + "grad_norm": 0.1728515625, + "learning_rate": 0.000438, + "loss": 0.3379, + "step": 220 + }, + { + "epoch": 0.0019183861251204416, + "grad_norm": 0.216796875, + "learning_rate": 0.00044, + "loss": 0.3301, + "step": 221 + }, + { + "epoch": 0.0019270666053246065, + "grad_norm": 0.1474609375, + "learning_rate": 0.000442, + "loss": 0.2812, + "step": 222 + }, + { + "epoch": 0.0019357470855287714, + "grad_norm": 0.1240234375, + "learning_rate": 0.000444, + "loss": 0.2188, + "step": 223 + }, + { + "epoch": 0.0019444275657329363, + "grad_norm": 0.21484375, + "learning_rate": 0.000446, + "loss": 0.3242, + "step": 224 + }, + { + "epoch": 0.0019531080459371012, + "grad_norm": 0.193359375, + "learning_rate": 0.000448, + "loss": 0.2773, + "step": 225 + }, + { + "epoch": 0.0019617885261412663, + "grad_norm": 0.224609375, + "learning_rate": 0.00045000000000000004, + "loss": 0.293, + "step": 226 + }, + { + "epoch": 0.001970469006345431, + "grad_norm": 0.259765625, + "learning_rate": 0.00045200000000000004, + "loss": 0.3281, + "step": 227 + }, + { + "epoch": 0.001979149486549596, + "grad_norm": 0.2412109375, + "learning_rate": 0.00045400000000000003, + "loss": 0.3184, + "step": 228 + }, + { + "epoch": 0.001987829966753761, + "grad_norm": 0.2333984375, + "learning_rate": 0.000456, + "loss": 0.3164, + "step": 229 + }, + { + "epoch": 0.001996510446957926, + "grad_norm": 0.1796875, + "learning_rate": 0.000458, + "loss": 0.2734, + "step": 230 + }, + { + "epoch": 0.0020051909271620906, + "grad_norm": 0.265625, + "learning_rate": 0.00046, + "loss": 0.4395, + "step": 231 + }, + { + "epoch": 0.0020138714073662553, + "grad_norm": 0.25, + "learning_rate": 0.000462, + "loss": 0.291, + "step": 232 + }, + { + "epoch": 0.0020225518875704204, + "grad_norm": 0.2119140625, + "learning_rate": 0.00046400000000000006, + "loss": 0.332, + "step": 233 + }, + { + "epoch": 0.002031232367774585, + "grad_norm": 0.275390625, + "learning_rate": 0.00046600000000000005, + "loss": 0.3457, + "step": 234 + }, + { + "epoch": 0.00203991284797875, + "grad_norm": 0.197265625, + "learning_rate": 0.00046800000000000005, + "loss": 0.3086, + "step": 235 + }, + { + "epoch": 0.002048593328182915, + "grad_norm": 0.283203125, + "learning_rate": 0.00047, + "loss": 0.4199, + "step": 236 + }, + { + "epoch": 0.00205727380838708, + "grad_norm": 0.26171875, + "learning_rate": 0.000472, + "loss": 0.3477, + "step": 237 + }, + { + "epoch": 0.0020659542885912447, + 
"grad_norm": 0.23046875, + "learning_rate": 0.000474, + "loss": 0.3145, + "step": 238 + }, + { + "epoch": 0.00207463476879541, + "grad_norm": 0.2197265625, + "learning_rate": 0.00047599999999999997, + "loss": 0.2617, + "step": 239 + }, + { + "epoch": 0.0020833152489995745, + "grad_norm": 0.1845703125, + "learning_rate": 0.00047799999999999996, + "loss": 0.3809, + "step": 240 + }, + { + "epoch": 0.0020919957292037396, + "grad_norm": 0.1748046875, + "learning_rate": 0.00048, + "loss": 0.2715, + "step": 241 + }, + { + "epoch": 0.0021006762094079043, + "grad_norm": 0.1923828125, + "learning_rate": 0.000482, + "loss": 0.2578, + "step": 242 + }, + { + "epoch": 0.0021093566896120694, + "grad_norm": 0.1943359375, + "learning_rate": 0.000484, + "loss": 0.2578, + "step": 243 + }, + { + "epoch": 0.002118037169816234, + "grad_norm": 0.33203125, + "learning_rate": 0.000486, + "loss": 0.2793, + "step": 244 + }, + { + "epoch": 0.0021267176500203992, + "grad_norm": 0.255859375, + "learning_rate": 0.000488, + "loss": 0.3828, + "step": 245 + }, + { + "epoch": 0.002135398130224564, + "grad_norm": 0.177734375, + "learning_rate": 0.00049, + "loss": 0.2695, + "step": 246 + }, + { + "epoch": 0.002144078610428729, + "grad_norm": 0.1748046875, + "learning_rate": 0.000492, + "loss": 0.2656, + "step": 247 + }, + { + "epoch": 0.0021527590906328937, + "grad_norm": 0.138671875, + "learning_rate": 0.000494, + "loss": 0.2432, + "step": 248 + }, + { + "epoch": 0.002161439570837059, + "grad_norm": 0.1650390625, + "learning_rate": 0.000496, + "loss": 0.3125, + "step": 249 + }, + { + "epoch": 0.0021701200510412235, + "grad_norm": 0.15234375, + "learning_rate": 0.000498, + "loss": 0.2695, + "step": 250 + }, + { + "epoch": 0.0021788005312453887, + "grad_norm": 0.1845703125, + "learning_rate": 0.0005, + "loss": 0.3184, + "step": 251 + }, + { + "epoch": 0.0021874810114495533, + "grad_norm": 0.2412109375, + "learning_rate": 0.0005020000000000001, + "loss": 0.3027, + "step": 252 + }, + { + "epoch": 0.0021961614916537185, + "grad_norm": 0.2158203125, + "learning_rate": 0.000504, + "loss": 0.2988, + "step": 253 + }, + { + "epoch": 0.002204841971857883, + "grad_norm": 0.193359375, + "learning_rate": 0.000506, + "loss": 0.293, + "step": 254 + }, + { + "epoch": 0.0022135224520620483, + "grad_norm": 0.1826171875, + "learning_rate": 0.000508, + "loss": 0.2656, + "step": 255 + }, + { + "epoch": 0.002222202932266213, + "grad_norm": 0.21875, + "learning_rate": 0.00051, + "loss": 0.3066, + "step": 256 + }, + { + "epoch": 0.002230883412470378, + "grad_norm": 0.185546875, + "learning_rate": 0.000512, + "loss": 0.3027, + "step": 257 + }, + { + "epoch": 0.0022395638926745428, + "grad_norm": 0.2275390625, + "learning_rate": 0.000514, + "loss": 0.2969, + "step": 258 + }, + { + "epoch": 0.0022482443728787074, + "grad_norm": 0.451171875, + "learning_rate": 0.0005160000000000001, + "loss": 0.3887, + "step": 259 + }, + { + "epoch": 0.0022569248530828726, + "grad_norm": 0.1689453125, + "learning_rate": 0.000518, + "loss": 0.2695, + "step": 260 + }, + { + "epoch": 0.0022656053332870373, + "grad_norm": 0.294921875, + "learning_rate": 0.0005200000000000001, + "loss": 0.2812, + "step": 261 + }, + { + "epoch": 0.0022742858134912024, + "grad_norm": 0.158203125, + "learning_rate": 0.000522, + "loss": 0.3105, + "step": 262 + }, + { + "epoch": 0.002282966293695367, + "grad_norm": 0.1845703125, + "learning_rate": 0.000524, + "loss": 0.3516, + "step": 263 + }, + { + "epoch": 0.002291646773899532, + "grad_norm": 0.236328125, + "learning_rate": 0.000526, + "loss": 
0.2578, + "step": 264 + }, + { + "epoch": 0.002300327254103697, + "grad_norm": 0.23828125, + "learning_rate": 0.000528, + "loss": 0.252, + "step": 265 + }, + { + "epoch": 0.002309007734307862, + "grad_norm": 0.2080078125, + "learning_rate": 0.0005300000000000001, + "loss": 0.3516, + "step": 266 + }, + { + "epoch": 0.0023176882145120267, + "grad_norm": 0.224609375, + "learning_rate": 0.000532, + "loss": 0.2969, + "step": 267 + }, + { + "epoch": 0.002326368694716192, + "grad_norm": 0.1796875, + "learning_rate": 0.0005340000000000001, + "loss": 0.3203, + "step": 268 + }, + { + "epoch": 0.0023350491749203565, + "grad_norm": 0.68359375, + "learning_rate": 0.000536, + "loss": 0.3242, + "step": 269 + }, + { + "epoch": 0.0023437296551245216, + "grad_norm": 0.2138671875, + "learning_rate": 0.0005380000000000001, + "loss": 0.2832, + "step": 270 + }, + { + "epoch": 0.0023524101353286863, + "grad_norm": 0.1748046875, + "learning_rate": 0.00054, + "loss": 0.3145, + "step": 271 + }, + { + "epoch": 0.0023610906155328514, + "grad_norm": 0.2197265625, + "learning_rate": 0.0005420000000000001, + "loss": 0.2637, + "step": 272 + }, + { + "epoch": 0.002369771095737016, + "grad_norm": 0.34765625, + "learning_rate": 0.0005440000000000001, + "loss": 0.4414, + "step": 273 + }, + { + "epoch": 0.002378451575941181, + "grad_norm": 0.232421875, + "learning_rate": 0.000546, + "loss": 0.2695, + "step": 274 + }, + { + "epoch": 0.002387132056145346, + "grad_norm": 0.302734375, + "learning_rate": 0.0005480000000000001, + "loss": 0.3809, + "step": 275 + }, + { + "epoch": 0.002395812536349511, + "grad_norm": 0.1904296875, + "learning_rate": 0.00055, + "loss": 0.3242, + "step": 276 + }, + { + "epoch": 0.0024044930165536757, + "grad_norm": 0.291015625, + "learning_rate": 0.0005520000000000001, + "loss": 0.3281, + "step": 277 + }, + { + "epoch": 0.002413173496757841, + "grad_norm": 0.224609375, + "learning_rate": 0.000554, + "loss": 0.2695, + "step": 278 + }, + { + "epoch": 0.0024218539769620055, + "grad_norm": 0.2451171875, + "learning_rate": 0.0005560000000000001, + "loss": 0.3359, + "step": 279 + }, + { + "epoch": 0.0024305344571661706, + "grad_norm": 0.21484375, + "learning_rate": 0.000558, + "loss": 0.2969, + "step": 280 + }, + { + "epoch": 0.0024392149373703353, + "grad_norm": 0.1767578125, + "learning_rate": 0.0005600000000000001, + "loss": 0.2363, + "step": 281 + }, + { + "epoch": 0.0024478954175745004, + "grad_norm": 0.166015625, + "learning_rate": 0.0005620000000000001, + "loss": 0.3203, + "step": 282 + }, + { + "epoch": 0.002456575897778665, + "grad_norm": 0.1376953125, + "learning_rate": 0.0005639999999999999, + "loss": 0.2402, + "step": 283 + }, + { + "epoch": 0.00246525637798283, + "grad_norm": 0.173828125, + "learning_rate": 0.000566, + "loss": 0.2949, + "step": 284 + }, + { + "epoch": 0.002473936858186995, + "grad_norm": 0.171875, + "learning_rate": 0.0005679999999999999, + "loss": 0.3457, + "step": 285 + }, + { + "epoch": 0.0024826173383911596, + "grad_norm": 0.2021484375, + "learning_rate": 0.00057, + "loss": 0.3047, + "step": 286 + }, + { + "epoch": 0.0024912978185953247, + "grad_norm": 0.181640625, + "learning_rate": 0.0005719999999999999, + "loss": 0.3555, + "step": 287 + }, + { + "epoch": 0.0024999782987994894, + "grad_norm": 0.1640625, + "learning_rate": 0.000574, + "loss": 0.2539, + "step": 288 + }, + { + "epoch": 0.0025086587790036545, + "grad_norm": 0.1728515625, + "learning_rate": 0.000576, + "loss": 0.3066, + "step": 289 + }, + { + "epoch": 0.0025173392592078192, + "grad_norm": 0.2138671875, + 
"learning_rate": 0.000578, + "loss": 0.3379, + "step": 290 + }, + { + "epoch": 0.0025260197394119843, + "grad_norm": 0.2275390625, + "learning_rate": 0.00058, + "loss": 0.2832, + "step": 291 + }, + { + "epoch": 0.002534700219616149, + "grad_norm": 0.259765625, + "learning_rate": 0.0005819999999999999, + "loss": 0.3867, + "step": 292 + }, + { + "epoch": 0.002543380699820314, + "grad_norm": 0.283203125, + "learning_rate": 0.000584, + "loss": 0.3398, + "step": 293 + }, + { + "epoch": 0.002552061180024479, + "grad_norm": 0.201171875, + "learning_rate": 0.0005859999999999999, + "loss": 0.3203, + "step": 294 + }, + { + "epoch": 0.002560741660228644, + "grad_norm": 0.134765625, + "learning_rate": 0.000588, + "loss": 0.25, + "step": 295 + }, + { + "epoch": 0.0025694221404328086, + "grad_norm": 0.201171875, + "learning_rate": 0.00059, + "loss": 0.3105, + "step": 296 + }, + { + "epoch": 0.0025781026206369738, + "grad_norm": 0.1708984375, + "learning_rate": 0.000592, + "loss": 0.3027, + "step": 297 + }, + { + "epoch": 0.0025867831008411384, + "grad_norm": 0.16796875, + "learning_rate": 0.000594, + "loss": 0.3047, + "step": 298 + }, + { + "epoch": 0.0025954635810453036, + "grad_norm": 0.203125, + "learning_rate": 0.000596, + "loss": 0.457, + "step": 299 + }, + { + "epoch": 0.0026041440612494682, + "grad_norm": 0.1513671875, + "learning_rate": 0.000598, + "loss": 0.2793, + "step": 300 + }, + { + "epoch": 0.0026128245414536334, + "grad_norm": 0.177734375, + "learning_rate": 0.0006, + "loss": 0.3379, + "step": 301 + }, + { + "epoch": 0.002621505021657798, + "grad_norm": 0.1591796875, + "learning_rate": 0.000602, + "loss": 0.2676, + "step": 302 + }, + { + "epoch": 0.002630185501861963, + "grad_norm": 0.232421875, + "learning_rate": 0.000604, + "loss": 0.3223, + "step": 303 + }, + { + "epoch": 0.002638865982066128, + "grad_norm": 0.1630859375, + "learning_rate": 0.000606, + "loss": 0.2617, + "step": 304 + }, + { + "epoch": 0.002647546462270293, + "grad_norm": 0.189453125, + "learning_rate": 0.000608, + "loss": 0.3066, + "step": 305 + }, + { + "epoch": 0.0026562269424744577, + "grad_norm": 0.1767578125, + "learning_rate": 0.00061, + "loss": 0.2637, + "step": 306 + }, + { + "epoch": 0.002664907422678623, + "grad_norm": 0.15625, + "learning_rate": 0.000612, + "loss": 0.3105, + "step": 307 + }, + { + "epoch": 0.0026735879028827875, + "grad_norm": 0.1962890625, + "learning_rate": 0.000614, + "loss": 0.3164, + "step": 308 + }, + { + "epoch": 0.002682268383086952, + "grad_norm": 0.1953125, + "learning_rate": 0.000616, + "loss": 0.2539, + "step": 309 + }, + { + "epoch": 0.0026909488632911173, + "grad_norm": 0.19140625, + "learning_rate": 0.0006180000000000001, + "loss": 0.2969, + "step": 310 + }, + { + "epoch": 0.002699629343495282, + "grad_norm": 0.1845703125, + "learning_rate": 0.00062, + "loss": 0.2598, + "step": 311 + }, + { + "epoch": 0.002708309823699447, + "grad_norm": 0.21484375, + "learning_rate": 0.000622, + "loss": 0.2734, + "step": 312 + }, + { + "epoch": 0.0027169903039036118, + "grad_norm": 0.1435546875, + "learning_rate": 0.000624, + "loss": 0.3359, + "step": 313 + }, + { + "epoch": 0.002725670784107777, + "grad_norm": 0.2333984375, + "learning_rate": 0.000626, + "loss": 0.3359, + "step": 314 + }, + { + "epoch": 0.0027343512643119416, + "grad_norm": 0.2109375, + "learning_rate": 0.000628, + "loss": 0.3008, + "step": 315 + }, + { + "epoch": 0.0027430317445161067, + "grad_norm": 0.1865234375, + "learning_rate": 0.00063, + "loss": 0.3086, + "step": 316 + }, + { + "epoch": 0.0027517122247202714, + 
"grad_norm": 0.27734375, + "learning_rate": 0.000632, + "loss": 0.3105, + "step": 317 + }, + { + "epoch": 0.0027603927049244365, + "grad_norm": 0.1884765625, + "learning_rate": 0.000634, + "loss": 0.248, + "step": 318 + }, + { + "epoch": 0.002769073185128601, + "grad_norm": 0.2294921875, + "learning_rate": 0.0006360000000000001, + "loss": 0.3008, + "step": 319 + }, + { + "epoch": 0.0027777536653327663, + "grad_norm": 0.1806640625, + "learning_rate": 0.000638, + "loss": 0.2617, + "step": 320 + }, + { + "epoch": 0.002786434145536931, + "grad_norm": 0.16796875, + "learning_rate": 0.00064, + "loss": 0.248, + "step": 321 + }, + { + "epoch": 0.002795114625741096, + "grad_norm": 0.32421875, + "learning_rate": 0.000642, + "loss": 0.3457, + "step": 322 + }, + { + "epoch": 0.002803795105945261, + "grad_norm": 0.2001953125, + "learning_rate": 0.000644, + "loss": 0.3047, + "step": 323 + }, + { + "epoch": 0.002812475586149426, + "grad_norm": 0.1845703125, + "learning_rate": 0.000646, + "loss": 0.2969, + "step": 324 + }, + { + "epoch": 0.0028211560663535906, + "grad_norm": 0.1787109375, + "learning_rate": 0.000648, + "loss": 0.248, + "step": 325 + }, + { + "epoch": 0.0028298365465577557, + "grad_norm": 0.220703125, + "learning_rate": 0.0006500000000000001, + "loss": 0.3164, + "step": 326 + }, + { + "epoch": 0.0028385170267619204, + "grad_norm": 0.17578125, + "learning_rate": 0.000652, + "loss": 0.25, + "step": 327 + }, + { + "epoch": 0.0028471975069660855, + "grad_norm": 0.357421875, + "learning_rate": 0.0006540000000000001, + "loss": 0.3594, + "step": 328 + }, + { + "epoch": 0.00285587798717025, + "grad_norm": 0.130859375, + "learning_rate": 0.000656, + "loss": 0.2471, + "step": 329 + }, + { + "epoch": 0.0028645584673744153, + "grad_norm": 0.2470703125, + "learning_rate": 0.0006580000000000001, + "loss": 0.3359, + "step": 330 + }, + { + "epoch": 0.00287323894757858, + "grad_norm": 0.26171875, + "learning_rate": 0.00066, + "loss": 0.3184, + "step": 331 + }, + { + "epoch": 0.002881919427782745, + "grad_norm": 0.2734375, + "learning_rate": 0.000662, + "loss": 0.3438, + "step": 332 + }, + { + "epoch": 0.00289059990798691, + "grad_norm": 0.166015625, + "learning_rate": 0.0006640000000000001, + "loss": 0.2617, + "step": 333 + }, + { + "epoch": 0.002899280388191075, + "grad_norm": 0.173828125, + "learning_rate": 0.000666, + "loss": 0.3105, + "step": 334 + }, + { + "epoch": 0.0029079608683952396, + "grad_norm": 0.2099609375, + "learning_rate": 0.0006680000000000001, + "loss": 0.3359, + "step": 335 + }, + { + "epoch": 0.0029166413485994043, + "grad_norm": 0.2099609375, + "learning_rate": 0.00067, + "loss": 0.3301, + "step": 336 + }, + { + "epoch": 0.0029253218288035694, + "grad_norm": 0.158203125, + "learning_rate": 0.0006720000000000001, + "loss": 0.3008, + "step": 337 + }, + { + "epoch": 0.002934002309007734, + "grad_norm": 0.6015625, + "learning_rate": 0.000674, + "loss": 0.4102, + "step": 338 + }, + { + "epoch": 0.0029426827892118992, + "grad_norm": 0.2255859375, + "learning_rate": 0.0006760000000000001, + "loss": 0.2773, + "step": 339 + }, + { + "epoch": 0.002951363269416064, + "grad_norm": 0.2060546875, + "learning_rate": 0.0006780000000000001, + "loss": 0.2969, + "step": 340 + }, + { + "epoch": 0.002960043749620229, + "grad_norm": 0.19921875, + "learning_rate": 0.00068, + "loss": 0.3203, + "step": 341 + }, + { + "epoch": 0.0029687242298243937, + "grad_norm": 0.166015625, + "learning_rate": 0.0006820000000000001, + "loss": 0.3066, + "step": 342 + }, + { + "epoch": 0.002977404710028559, + "grad_norm": 
0.1796875, + "learning_rate": 0.000684, + "loss": 0.3789, + "step": 343 + }, + { + "epoch": 0.0029860851902327235, + "grad_norm": 0.1455078125, + "learning_rate": 0.0006860000000000001, + "loss": 0.3242, + "step": 344 + }, + { + "epoch": 0.0029947656704368887, + "grad_norm": 0.1826171875, + "learning_rate": 0.0006879999999999999, + "loss": 0.2988, + "step": 345 + }, + { + "epoch": 0.0030034461506410533, + "grad_norm": 0.1552734375, + "learning_rate": 0.00069, + "loss": 0.2695, + "step": 346 + }, + { + "epoch": 0.0030121266308452185, + "grad_norm": 0.25, + "learning_rate": 0.000692, + "loss": 0.2871, + "step": 347 + }, + { + "epoch": 0.003020807111049383, + "grad_norm": 0.13671875, + "learning_rate": 0.000694, + "loss": 0.2812, + "step": 348 + }, + { + "epoch": 0.0030294875912535483, + "grad_norm": 0.357421875, + "learning_rate": 0.000696, + "loss": 0.4082, + "step": 349 + }, + { + "epoch": 0.003038168071457713, + "grad_norm": 0.1650390625, + "learning_rate": 0.0006979999999999999, + "loss": 0.2793, + "step": 350 + }, + { + "epoch": 0.003046848551661878, + "grad_norm": 0.19921875, + "learning_rate": 0.0007, + "loss": 0.2891, + "step": 351 + }, + { + "epoch": 0.0030555290318660428, + "grad_norm": 0.181640625, + "learning_rate": 0.0007019999999999999, + "loss": 0.3535, + "step": 352 + }, + { + "epoch": 0.003064209512070208, + "grad_norm": 0.1689453125, + "learning_rate": 0.000704, + "loss": 0.2656, + "step": 353 + }, + { + "epoch": 0.0030728899922743726, + "grad_norm": 0.27734375, + "learning_rate": 0.0007059999999999999, + "loss": 0.2383, + "step": 354 + }, + { + "epoch": 0.0030815704724785377, + "grad_norm": 0.2265625, + "learning_rate": 0.000708, + "loss": 0.3086, + "step": 355 + }, + { + "epoch": 0.0030902509526827024, + "grad_norm": 0.1904296875, + "learning_rate": 0.00071, + "loss": 0.3086, + "step": 356 + }, + { + "epoch": 0.0030989314328868675, + "grad_norm": 0.1611328125, + "learning_rate": 0.000712, + "loss": 0.291, + "step": 357 + }, + { + "epoch": 0.003107611913091032, + "grad_norm": 0.16796875, + "learning_rate": 0.000714, + "loss": 0.291, + "step": 358 + }, + { + "epoch": 0.0031162923932951973, + "grad_norm": 0.1875, + "learning_rate": 0.000716, + "loss": 0.2988, + "step": 359 + }, + { + "epoch": 0.003124972873499362, + "grad_norm": 0.16015625, + "learning_rate": 0.000718, + "loss": 0.3398, + "step": 360 + }, + { + "epoch": 0.0031336533537035267, + "grad_norm": 0.1611328125, + "learning_rate": 0.0007199999999999999, + "loss": 0.2793, + "step": 361 + }, + { + "epoch": 0.003142333833907692, + "grad_norm": 0.1396484375, + "learning_rate": 0.000722, + "loss": 0.3047, + "step": 362 + }, + { + "epoch": 0.0031510143141118565, + "grad_norm": 0.185546875, + "learning_rate": 0.000724, + "loss": 0.3281, + "step": 363 + }, + { + "epoch": 0.0031596947943160216, + "grad_norm": 0.154296875, + "learning_rate": 0.000726, + "loss": 0.2969, + "step": 364 + }, + { + "epoch": 0.0031683752745201863, + "grad_norm": 0.181640625, + "learning_rate": 0.000728, + "loss": 0.3652, + "step": 365 + }, + { + "epoch": 0.0031770557547243514, + "grad_norm": 0.1279296875, + "learning_rate": 0.00073, + "loss": 0.2344, + "step": 366 + }, + { + "epoch": 0.003185736234928516, + "grad_norm": 0.1455078125, + "learning_rate": 0.000732, + "loss": 0.2891, + "step": 367 + }, + { + "epoch": 0.003194416715132681, + "grad_norm": 0.1884765625, + "learning_rate": 0.000734, + "loss": 0.3438, + "step": 368 + }, + { + "epoch": 0.003203097195336846, + "grad_norm": 0.181640625, + "learning_rate": 0.000736, + "loss": 0.3164, + "step": 
369 + }, + { + "epoch": 0.003211777675541011, + "grad_norm": 0.1669921875, + "learning_rate": 0.000738, + "loss": 0.2832, + "step": 370 + }, + { + "epoch": 0.0032204581557451757, + "grad_norm": 0.18359375, + "learning_rate": 0.00074, + "loss": 0.2695, + "step": 371 + }, + { + "epoch": 0.003229138635949341, + "grad_norm": 0.22265625, + "learning_rate": 0.000742, + "loss": 0.2559, + "step": 372 + }, + { + "epoch": 0.0032378191161535055, + "grad_norm": 0.2060546875, + "learning_rate": 0.000744, + "loss": 0.2949, + "step": 373 + }, + { + "epoch": 0.0032464995963576706, + "grad_norm": 0.1494140625, + "learning_rate": 0.000746, + "loss": 0.2832, + "step": 374 + }, + { + "epoch": 0.0032551800765618353, + "grad_norm": 0.1669921875, + "learning_rate": 0.000748, + "loss": 0.2832, + "step": 375 + }, + { + "epoch": 0.0032638605567660004, + "grad_norm": 0.279296875, + "learning_rate": 0.00075, + "loss": 0.2812, + "step": 376 + }, + { + "epoch": 0.003272541036970165, + "grad_norm": 0.48828125, + "learning_rate": 0.0007520000000000001, + "loss": 0.4453, + "step": 377 + }, + { + "epoch": 0.0032812215171743302, + "grad_norm": 0.1630859375, + "learning_rate": 0.000754, + "loss": 0.3223, + "step": 378 + }, + { + "epoch": 0.003289901997378495, + "grad_norm": 0.150390625, + "learning_rate": 0.000756, + "loss": 0.2695, + "step": 379 + }, + { + "epoch": 0.00329858247758266, + "grad_norm": 0.236328125, + "learning_rate": 0.000758, + "loss": 0.3164, + "step": 380 + }, + { + "epoch": 0.0033072629577868247, + "grad_norm": 0.25390625, + "learning_rate": 0.00076, + "loss": 0.291, + "step": 381 + }, + { + "epoch": 0.00331594343799099, + "grad_norm": 0.220703125, + "learning_rate": 0.000762, + "loss": 0.3047, + "step": 382 + }, + { + "epoch": 0.0033246239181951545, + "grad_norm": 0.1923828125, + "learning_rate": 0.000764, + "loss": 0.2832, + "step": 383 + }, + { + "epoch": 0.0033333043983993197, + "grad_norm": 0.2021484375, + "learning_rate": 0.0007660000000000001, + "loss": 0.291, + "step": 384 + }, + { + "epoch": 0.0033419848786034843, + "grad_norm": 0.173828125, + "learning_rate": 0.000768, + "loss": 0.2305, + "step": 385 + }, + { + "epoch": 0.003350665358807649, + "grad_norm": 0.25390625, + "learning_rate": 0.0007700000000000001, + "loss": 0.3105, + "step": 386 + }, + { + "epoch": 0.003359345839011814, + "grad_norm": 0.1611328125, + "learning_rate": 0.000772, + "loss": 0.3145, + "step": 387 + }, + { + "epoch": 0.003368026319215979, + "grad_norm": 0.171875, + "learning_rate": 0.0007740000000000001, + "loss": 0.373, + "step": 388 + }, + { + "epoch": 0.003376706799420144, + "grad_norm": 0.1533203125, + "learning_rate": 0.000776, + "loss": 0.2158, + "step": 389 + }, + { + "epoch": 0.0033853872796243086, + "grad_norm": 0.2060546875, + "learning_rate": 0.000778, + "loss": 0.2676, + "step": 390 + }, + { + "epoch": 0.0033940677598284738, + "grad_norm": 0.21484375, + "learning_rate": 0.0007800000000000001, + "loss": 0.3438, + "step": 391 + }, + { + "epoch": 0.0034027482400326384, + "grad_norm": 0.1962890625, + "learning_rate": 0.000782, + "loss": 0.25, + "step": 392 + }, + { + "epoch": 0.0034114287202368036, + "grad_norm": 0.1865234375, + "learning_rate": 0.0007840000000000001, + "loss": 0.2832, + "step": 393 + }, + { + "epoch": 0.0034201092004409682, + "grad_norm": 0.1484375, + "learning_rate": 0.000786, + "loss": 0.293, + "step": 394 + }, + { + "epoch": 0.0034287896806451334, + "grad_norm": 0.1943359375, + "learning_rate": 0.0007880000000000001, + "loss": 0.3242, + "step": 395 + }, + { + "epoch": 0.003437470160849298, + 
"grad_norm": 0.1376953125, + "learning_rate": 0.00079, + "loss": 0.293, + "step": 396 + }, + { + "epoch": 0.003446150641053463, + "grad_norm": 0.146484375, + "learning_rate": 0.0007920000000000001, + "loss": 0.2266, + "step": 397 + }, + { + "epoch": 0.003454831121257628, + "grad_norm": 0.1640625, + "learning_rate": 0.0007940000000000001, + "loss": 0.3711, + "step": 398 + }, + { + "epoch": 0.003463511601461793, + "grad_norm": 0.21484375, + "learning_rate": 0.000796, + "loss": 0.3184, + "step": 399 + }, + { + "epoch": 0.0034721920816659577, + "grad_norm": 0.2021484375, + "learning_rate": 0.0007980000000000001, + "loss": 0.334, + "step": 400 + }, + { + "epoch": 0.003480872561870123, + "grad_norm": 0.1953125, + "learning_rate": 0.0008, + "loss": 0.3398, + "step": 401 + }, + { + "epoch": 0.0034895530420742875, + "grad_norm": 0.1689453125, + "learning_rate": 0.0008020000000000001, + "loss": 0.2578, + "step": 402 + }, + { + "epoch": 0.0034982335222784526, + "grad_norm": 0.12890625, + "learning_rate": 0.000804, + "loss": 0.2734, + "step": 403 + }, + { + "epoch": 0.0035069140024826173, + "grad_norm": 0.2197265625, + "learning_rate": 0.0008060000000000001, + "loss": 0.332, + "step": 404 + }, + { + "epoch": 0.0035155944826867824, + "grad_norm": 0.1806640625, + "learning_rate": 0.000808, + "loss": 0.2969, + "step": 405 + }, + { + "epoch": 0.003524274962890947, + "grad_norm": 0.205078125, + "learning_rate": 0.0008100000000000001, + "loss": 0.249, + "step": 406 + }, + { + "epoch": 0.003532955443095112, + "grad_norm": 0.244140625, + "learning_rate": 0.0008120000000000001, + "loss": 0.3672, + "step": 407 + }, + { + "epoch": 0.003541635923299277, + "grad_norm": 0.1484375, + "learning_rate": 0.0008139999999999999, + "loss": 0.2617, + "step": 408 + }, + { + "epoch": 0.003550316403503442, + "grad_norm": 0.1806640625, + "learning_rate": 0.000816, + "loss": 0.3086, + "step": 409 + }, + { + "epoch": 0.0035589968837076067, + "grad_norm": 0.13671875, + "learning_rate": 0.0008179999999999999, + "loss": 0.2363, + "step": 410 + }, + { + "epoch": 0.003567677363911772, + "grad_norm": 0.12158203125, + "learning_rate": 0.00082, + "loss": 0.3008, + "step": 411 + }, + { + "epoch": 0.0035763578441159365, + "grad_norm": 0.138671875, + "learning_rate": 0.0008219999999999999, + "loss": 0.2852, + "step": 412 + }, + { + "epoch": 0.003585038324320101, + "grad_norm": 0.1708984375, + "learning_rate": 0.000824, + "loss": 0.3027, + "step": 413 + }, + { + "epoch": 0.0035937188045242663, + "grad_norm": 0.1455078125, + "learning_rate": 0.000826, + "loss": 0.2451, + "step": 414 + }, + { + "epoch": 0.003602399284728431, + "grad_norm": 0.1416015625, + "learning_rate": 0.000828, + "loss": 0.3164, + "step": 415 + }, + { + "epoch": 0.003611079764932596, + "grad_norm": 0.1201171875, + "learning_rate": 0.00083, + "loss": 0.2734, + "step": 416 + }, + { + "epoch": 0.003619760245136761, + "grad_norm": 0.12060546875, + "learning_rate": 0.000832, + "loss": 0.2734, + "step": 417 + }, + { + "epoch": 0.003628440725340926, + "grad_norm": 0.1904296875, + "learning_rate": 0.000834, + "loss": 0.3262, + "step": 418 + }, + { + "epoch": 0.0036371212055450906, + "grad_norm": 0.11669921875, + "learning_rate": 0.0008359999999999999, + "loss": 0.2969, + "step": 419 + }, + { + "epoch": 0.0036458016857492557, + "grad_norm": 0.1591796875, + "learning_rate": 0.000838, + "loss": 0.2539, + "step": 420 + }, + { + "epoch": 0.0036544821659534204, + "grad_norm": 0.1083984375, + "learning_rate": 0.00084, + "loss": 0.2363, + "step": 421 + }, + { + "epoch": 
0.0036631626461575855, + "grad_norm": 0.142578125, + "learning_rate": 0.000842, + "loss": 0.3066, + "step": 422 + }, + { + "epoch": 0.00367184312636175, + "grad_norm": 0.1328125, + "learning_rate": 0.000844, + "loss": 0.2969, + "step": 423 + }, + { + "epoch": 0.0036805236065659153, + "grad_norm": 0.1923828125, + "learning_rate": 0.000846, + "loss": 0.2988, + "step": 424 + }, + { + "epoch": 0.00368920408677008, + "grad_norm": 0.12890625, + "learning_rate": 0.000848, + "loss": 0.2393, + "step": 425 + }, + { + "epoch": 0.003697884566974245, + "grad_norm": 0.134765625, + "learning_rate": 0.00085, + "loss": 0.2812, + "step": 426 + }, + { + "epoch": 0.00370656504717841, + "grad_norm": 0.1474609375, + "learning_rate": 0.000852, + "loss": 0.2422, + "step": 427 + }, + { + "epoch": 0.003715245527382575, + "grad_norm": 0.26953125, + "learning_rate": 0.000854, + "loss": 0.334, + "step": 428 + }, + { + "epoch": 0.0037239260075867396, + "grad_norm": 0.1328125, + "learning_rate": 0.000856, + "loss": 0.3047, + "step": 429 + }, + { + "epoch": 0.0037326064877909047, + "grad_norm": 0.228515625, + "learning_rate": 0.000858, + "loss": 0.3066, + "step": 430 + }, + { + "epoch": 0.0037412869679950694, + "grad_norm": 0.19140625, + "learning_rate": 0.00086, + "loss": 0.2852, + "step": 431 + }, + { + "epoch": 0.0037499674481992346, + "grad_norm": 0.17578125, + "learning_rate": 0.000862, + "loss": 0.2754, + "step": 432 + }, + { + "epoch": 0.0037586479284033992, + "grad_norm": 0.1796875, + "learning_rate": 0.000864, + "loss": 0.2402, + "step": 433 + }, + { + "epoch": 0.0037673284086075644, + "grad_norm": 0.12109375, + "learning_rate": 0.000866, + "loss": 0.2734, + "step": 434 + }, + { + "epoch": 0.003776008888811729, + "grad_norm": 0.193359375, + "learning_rate": 0.0008680000000000001, + "loss": 0.2871, + "step": 435 + }, + { + "epoch": 0.003784689369015894, + "grad_norm": 0.142578125, + "learning_rate": 0.00087, + "loss": 0.3008, + "step": 436 + }, + { + "epoch": 0.003793369849220059, + "grad_norm": 0.1669921875, + "learning_rate": 0.000872, + "loss": 0.2715, + "step": 437 + }, + { + "epoch": 0.0038020503294242235, + "grad_norm": 0.16015625, + "learning_rate": 0.000874, + "loss": 0.3086, + "step": 438 + }, + { + "epoch": 0.0038107308096283887, + "grad_norm": 0.11767578125, + "learning_rate": 0.000876, + "loss": 0.2275, + "step": 439 + }, + { + "epoch": 0.0038194112898325533, + "grad_norm": 0.189453125, + "learning_rate": 0.000878, + "loss": 0.375, + "step": 440 + }, + { + "epoch": 0.0038280917700367185, + "grad_norm": 0.1572265625, + "learning_rate": 0.00088, + "loss": 0.3242, + "step": 441 + }, + { + "epoch": 0.003836772250240883, + "grad_norm": 0.1123046875, + "learning_rate": 0.000882, + "loss": 0.2852, + "step": 442 + }, + { + "epoch": 0.0038454527304450483, + "grad_norm": 0.11962890625, + "learning_rate": 0.000884, + "loss": 0.3105, + "step": 443 + }, + { + "epoch": 0.003854133210649213, + "grad_norm": 0.1376953125, + "learning_rate": 0.0008860000000000001, + "loss": 0.2891, + "step": 444 + }, + { + "epoch": 0.003862813690853378, + "grad_norm": 0.1669921875, + "learning_rate": 0.000888, + "loss": 0.3516, + "step": 445 + }, + { + "epoch": 0.0038714941710575428, + "grad_norm": 0.1689453125, + "learning_rate": 0.0008900000000000001, + "loss": 0.3945, + "step": 446 + }, + { + "epoch": 0.003880174651261708, + "grad_norm": 0.181640625, + "learning_rate": 0.000892, + "loss": 0.209, + "step": 447 + }, + { + "epoch": 0.0038888551314658726, + "grad_norm": 0.1376953125, + "learning_rate": 0.000894, + "loss": 0.2715, + 
"step": 448 + }, + { + "epoch": 0.0038975356116700377, + "grad_norm": 0.1396484375, + "learning_rate": 0.000896, + "loss": 0.3066, + "step": 449 + }, + { + "epoch": 0.0039062160918742024, + "grad_norm": 0.12060546875, + "learning_rate": 0.000898, + "loss": 0.2559, + "step": 450 + }, + { + "epoch": 0.0039148965720783675, + "grad_norm": 0.1416015625, + "learning_rate": 0.0009000000000000001, + "loss": 0.2617, + "step": 451 + }, + { + "epoch": 0.003923577052282533, + "grad_norm": 0.1640625, + "learning_rate": 0.000902, + "loss": 0.2988, + "step": 452 + }, + { + "epoch": 0.003932257532486697, + "grad_norm": 0.1494140625, + "learning_rate": 0.0009040000000000001, + "loss": 0.3164, + "step": 453 + }, + { + "epoch": 0.003940938012690862, + "grad_norm": 0.267578125, + "learning_rate": 0.000906, + "loss": 0.3789, + "step": 454 + }, + { + "epoch": 0.003949618492895027, + "grad_norm": 0.15234375, + "learning_rate": 0.0009080000000000001, + "loss": 0.2773, + "step": 455 + }, + { + "epoch": 0.003958298973099192, + "grad_norm": 0.1123046875, + "learning_rate": 0.00091, + "loss": 0.2051, + "step": 456 + }, + { + "epoch": 0.0039669794533033565, + "grad_norm": 0.142578125, + "learning_rate": 0.000912, + "loss": 0.2422, + "step": 457 + }, + { + "epoch": 0.003975659933507522, + "grad_norm": 0.1513671875, + "learning_rate": 0.0009140000000000001, + "loss": 0.2441, + "step": 458 + }, + { + "epoch": 0.003984340413711687, + "grad_norm": 0.1162109375, + "learning_rate": 0.000916, + "loss": 0.2676, + "step": 459 + }, + { + "epoch": 0.003993020893915852, + "grad_norm": 0.15625, + "learning_rate": 0.0009180000000000001, + "loss": 0.3477, + "step": 460 + }, + { + "epoch": 0.004001701374120016, + "grad_norm": 0.173828125, + "learning_rate": 0.00092, + "loss": 0.2578, + "step": 461 + }, + { + "epoch": 0.004010381854324181, + "grad_norm": 0.201171875, + "learning_rate": 0.0009220000000000001, + "loss": 0.3086, + "step": 462 + }, + { + "epoch": 0.004019062334528346, + "grad_norm": 0.1611328125, + "learning_rate": 0.000924, + "loss": 0.2227, + "step": 463 + }, + { + "epoch": 0.004027742814732511, + "grad_norm": 0.2138671875, + "learning_rate": 0.0009260000000000001, + "loss": 0.4023, + "step": 464 + }, + { + "epoch": 0.004036423294936676, + "grad_norm": 0.1416015625, + "learning_rate": 0.0009280000000000001, + "loss": 0.2969, + "step": 465 + }, + { + "epoch": 0.004045103775140841, + "grad_norm": 0.177734375, + "learning_rate": 0.00093, + "loss": 0.2773, + "step": 466 + }, + { + "epoch": 0.004053784255345006, + "grad_norm": 0.2138671875, + "learning_rate": 0.0009320000000000001, + "loss": 0.3027, + "step": 467 + }, + { + "epoch": 0.00406246473554917, + "grad_norm": 0.1328125, + "learning_rate": 0.000934, + "loss": 0.2617, + "step": 468 + }, + { + "epoch": 0.004071145215753335, + "grad_norm": 0.173828125, + "learning_rate": 0.0009360000000000001, + "loss": 0.2539, + "step": 469 + }, + { + "epoch": 0.0040798256959575, + "grad_norm": 0.306640625, + "learning_rate": 0.0009379999999999999, + "loss": 0.3242, + "step": 470 + }, + { + "epoch": 0.0040885061761616655, + "grad_norm": 0.12158203125, + "learning_rate": 0.00094, + "loss": 0.2334, + "step": 471 + }, + { + "epoch": 0.00409718665636583, + "grad_norm": 0.1298828125, + "learning_rate": 0.000942, + "loss": 0.2891, + "step": 472 + }, + { + "epoch": 0.004105867136569995, + "grad_norm": 0.19921875, + "learning_rate": 0.000944, + "loss": 0.4492, + "step": 473 + }, + { + "epoch": 0.00411454761677416, + "grad_norm": 0.1474609375, + "learning_rate": 0.000946, + "loss": 0.3164, + 
"step": 474 + }, + { + "epoch": 0.004123228096978325, + "grad_norm": 0.173828125, + "learning_rate": 0.000948, + "loss": 0.3027, + "step": 475 + }, + { + "epoch": 0.004131908577182489, + "grad_norm": 0.1728515625, + "learning_rate": 0.00095, + "loss": 0.2754, + "step": 476 + }, + { + "epoch": 0.0041405890573866545, + "grad_norm": 0.1376953125, + "learning_rate": 0.0009519999999999999, + "loss": 0.2275, + "step": 477 + }, + { + "epoch": 0.00414926953759082, + "grad_norm": 0.2373046875, + "learning_rate": 0.000954, + "loss": 0.2891, + "step": 478 + }, + { + "epoch": 0.004157950017794985, + "grad_norm": 0.1943359375, + "learning_rate": 0.0009559999999999999, + "loss": 0.332, + "step": 479 + }, + { + "epoch": 0.004166630497999149, + "grad_norm": 0.16796875, + "learning_rate": 0.000958, + "loss": 0.3027, + "step": 480 + }, + { + "epoch": 0.004175310978203314, + "grad_norm": 0.1318359375, + "learning_rate": 0.00096, + "loss": 0.2295, + "step": 481 + }, + { + "epoch": 0.004183991458407479, + "grad_norm": 0.1181640625, + "learning_rate": 0.000962, + "loss": 0.2285, + "step": 482 + }, + { + "epoch": 0.004192671938611644, + "grad_norm": 0.1494140625, + "learning_rate": 0.000964, + "loss": 0.2969, + "step": 483 + }, + { + "epoch": 0.004201352418815809, + "grad_norm": 0.2099609375, + "learning_rate": 0.000966, + "loss": 0.3027, + "step": 484 + }, + { + "epoch": 0.004210032899019974, + "grad_norm": 0.1640625, + "learning_rate": 0.000968, + "loss": 0.3477, + "step": 485 + }, + { + "epoch": 0.004218713379224139, + "grad_norm": 0.17578125, + "learning_rate": 0.0009699999999999999, + "loss": 0.2617, + "step": 486 + }, + { + "epoch": 0.004227393859428304, + "grad_norm": 0.208984375, + "learning_rate": 0.000972, + "loss": 0.4824, + "step": 487 + }, + { + "epoch": 0.004236074339632468, + "grad_norm": 0.1806640625, + "learning_rate": 0.000974, + "loss": 0.293, + "step": 488 + }, + { + "epoch": 0.004244754819836633, + "grad_norm": 0.326171875, + "learning_rate": 0.000976, + "loss": 0.4102, + "step": 489 + }, + { + "epoch": 0.0042534353000407985, + "grad_norm": 0.138671875, + "learning_rate": 0.000978, + "loss": 0.2637, + "step": 490 + }, + { + "epoch": 0.004262115780244963, + "grad_norm": 0.11279296875, + "learning_rate": 0.00098, + "loss": 0.2578, + "step": 491 + }, + { + "epoch": 0.004270796260449128, + "grad_norm": 0.11962890625, + "learning_rate": 0.000982, + "loss": 0.1992, + "step": 492 + }, + { + "epoch": 0.004279476740653293, + "grad_norm": 0.1533203125, + "learning_rate": 0.000984, + "loss": 0.252, + "step": 493 + }, + { + "epoch": 0.004288157220857458, + "grad_norm": 0.177734375, + "learning_rate": 0.0009860000000000001, + "loss": 0.2598, + "step": 494 + }, + { + "epoch": 0.004296837701061622, + "grad_norm": 0.142578125, + "learning_rate": 0.000988, + "loss": 0.2422, + "step": 495 + }, + { + "epoch": 0.0043055181812657875, + "grad_norm": 0.1279296875, + "learning_rate": 0.00099, + "loss": 0.2578, + "step": 496 + }, + { + "epoch": 0.004314198661469953, + "grad_norm": 0.1455078125, + "learning_rate": 0.000992, + "loss": 0.2676, + "step": 497 + }, + { + "epoch": 0.004322879141674118, + "grad_norm": 0.169921875, + "learning_rate": 0.000994, + "loss": 0.2852, + "step": 498 + }, + { + "epoch": 0.004331559621878282, + "grad_norm": 0.1357421875, + "learning_rate": 0.000996, + "loss": 0.2754, + "step": 499 + }, + { + "epoch": 0.004340240102082447, + "grad_norm": 0.1962890625, + "learning_rate": 0.000998, + "loss": 0.2754, + "step": 500 + }, + { + "epoch": 0.004348920582286612, + "grad_norm": 0.1669921875, + 
"learning_rate": 0.001, + "loss": 0.3984, + "step": 501 + }, + { + "epoch": 0.004357601062490777, + "grad_norm": 0.11865234375, + "learning_rate": 0.001002, + "loss": 0.2656, + "step": 502 + }, + { + "epoch": 0.004366281542694942, + "grad_norm": 0.2451171875, + "learning_rate": 0.0010040000000000001, + "loss": 0.3008, + "step": 503 + }, + { + "epoch": 0.004374962022899107, + "grad_norm": 0.1630859375, + "learning_rate": 0.001006, + "loss": 0.2676, + "step": 504 + }, + { + "epoch": 0.004383642503103272, + "grad_norm": 0.123046875, + "learning_rate": 0.001008, + "loss": 0.2461, + "step": 505 + }, + { + "epoch": 0.004392322983307437, + "grad_norm": 0.126953125, + "learning_rate": 0.00101, + "loss": 0.2793, + "step": 506 + }, + { + "epoch": 0.004401003463511601, + "grad_norm": 0.11669921875, + "learning_rate": 0.001012, + "loss": 0.3027, + "step": 507 + }, + { + "epoch": 0.004409683943715766, + "grad_norm": 0.1630859375, + "learning_rate": 0.001014, + "loss": 0.2793, + "step": 508 + }, + { + "epoch": 0.004418364423919931, + "grad_norm": 0.142578125, + "learning_rate": 0.001016, + "loss": 0.2559, + "step": 509 + }, + { + "epoch": 0.0044270449041240965, + "grad_norm": 0.1298828125, + "learning_rate": 0.001018, + "loss": 0.2812, + "step": 510 + }, + { + "epoch": 0.004435725384328261, + "grad_norm": 0.158203125, + "learning_rate": 0.00102, + "loss": 0.2793, + "step": 511 + }, + { + "epoch": 0.004444405864532426, + "grad_norm": 0.142578125, + "learning_rate": 0.0010220000000000001, + "loss": 0.2891, + "step": 512 + }, + { + "epoch": 0.004453086344736591, + "grad_norm": 0.146484375, + "learning_rate": 0.001024, + "loss": 0.334, + "step": 513 + }, + { + "epoch": 0.004461766824940756, + "grad_norm": 0.0986328125, + "learning_rate": 0.001026, + "loss": 0.252, + "step": 514 + }, + { + "epoch": 0.00447044730514492, + "grad_norm": 0.150390625, + "learning_rate": 0.001028, + "loss": 0.2432, + "step": 515 + }, + { + "epoch": 0.0044791277853490855, + "grad_norm": 0.1650390625, + "learning_rate": 0.00103, + "loss": 0.3223, + "step": 516 + }, + { + "epoch": 0.004487808265553251, + "grad_norm": 0.15234375, + "learning_rate": 0.0010320000000000001, + "loss": 0.2148, + "step": 517 + }, + { + "epoch": 0.004496488745757415, + "grad_norm": 0.1357421875, + "learning_rate": 0.001034, + "loss": 0.3164, + "step": 518 + }, + { + "epoch": 0.00450516922596158, + "grad_norm": 0.2734375, + "learning_rate": 0.001036, + "loss": 0.3379, + "step": 519 + }, + { + "epoch": 0.004513849706165745, + "grad_norm": 0.12890625, + "learning_rate": 0.001038, + "loss": 0.2656, + "step": 520 + }, + { + "epoch": 0.00452253018636991, + "grad_norm": 0.142578125, + "learning_rate": 0.0010400000000000001, + "loss": 0.293, + "step": 521 + }, + { + "epoch": 0.0045312106665740745, + "grad_norm": 0.1318359375, + "learning_rate": 0.001042, + "loss": 0.2656, + "step": 522 + }, + { + "epoch": 0.00453989114677824, + "grad_norm": 0.1376953125, + "learning_rate": 0.001044, + "loss": 0.3262, + "step": 523 + }, + { + "epoch": 0.004548571626982405, + "grad_norm": 0.15625, + "learning_rate": 0.001046, + "loss": 0.293, + "step": 524 + }, + { + "epoch": 0.00455725210718657, + "grad_norm": 0.12890625, + "learning_rate": 0.001048, + "loss": 0.2207, + "step": 525 + }, + { + "epoch": 0.004565932587390734, + "grad_norm": 0.1474609375, + "learning_rate": 0.0010500000000000002, + "loss": 0.3086, + "step": 526 + }, + { + "epoch": 0.004574613067594899, + "grad_norm": 0.2236328125, + "learning_rate": 0.001052, + "loss": 0.2676, + "step": 527 + }, + { + "epoch": 
0.004583293547799064, + "grad_norm": 0.2236328125, + "learning_rate": 0.001054, + "loss": 0.332, + "step": 528 + }, + { + "epoch": 0.0045919740280032295, + "grad_norm": 0.171875, + "learning_rate": 0.001056, + "loss": 0.2715, + "step": 529 + }, + { + "epoch": 0.004600654508207394, + "grad_norm": 0.1513671875, + "learning_rate": 0.0010580000000000001, + "loss": 0.3008, + "step": 530 + }, + { + "epoch": 0.004609334988411559, + "grad_norm": 0.2080078125, + "learning_rate": 0.0010600000000000002, + "loss": 0.2969, + "step": 531 + }, + { + "epoch": 0.004618015468615724, + "grad_norm": 0.1572265625, + "learning_rate": 0.001062, + "loss": 0.2246, + "step": 532 + }, + { + "epoch": 0.004626695948819889, + "grad_norm": 0.146484375, + "learning_rate": 0.001064, + "loss": 0.249, + "step": 533 + }, + { + "epoch": 0.004635376429024053, + "grad_norm": 0.1552734375, + "learning_rate": 0.001066, + "loss": 0.2832, + "step": 534 + }, + { + "epoch": 0.0046440569092282185, + "grad_norm": 0.17578125, + "learning_rate": 0.0010680000000000002, + "loss": 0.2578, + "step": 535 + }, + { + "epoch": 0.004652737389432384, + "grad_norm": 0.0888671875, + "learning_rate": 0.00107, + "loss": 0.2383, + "step": 536 + }, + { + "epoch": 0.004661417869636549, + "grad_norm": 0.138671875, + "learning_rate": 0.001072, + "loss": 0.2734, + "step": 537 + }, + { + "epoch": 0.004670098349840713, + "grad_norm": 0.1650390625, + "learning_rate": 0.001074, + "loss": 0.291, + "step": 538 + }, + { + "epoch": 0.004678778830044878, + "grad_norm": 0.1591796875, + "learning_rate": 0.0010760000000000001, + "loss": 0.2676, + "step": 539 + }, + { + "epoch": 0.004687459310249043, + "grad_norm": 0.10205078125, + "learning_rate": 0.0010780000000000002, + "loss": 0.2285, + "step": 540 + }, + { + "epoch": 0.0046961397904532074, + "grad_norm": 0.15234375, + "learning_rate": 0.00108, + "loss": 0.2988, + "step": 541 + }, + { + "epoch": 0.0047048202706573726, + "grad_norm": 0.1630859375, + "learning_rate": 0.001082, + "loss": 0.3203, + "step": 542 + }, + { + "epoch": 0.004713500750861538, + "grad_norm": 0.138671875, + "learning_rate": 0.0010840000000000001, + "loss": 0.2793, + "step": 543 + }, + { + "epoch": 0.004722181231065703, + "grad_norm": 0.189453125, + "learning_rate": 0.0010860000000000002, + "loss": 0.293, + "step": 544 + }, + { + "epoch": 0.004730861711269867, + "grad_norm": 0.1875, + "learning_rate": 0.0010880000000000002, + "loss": 0.25, + "step": 545 + }, + { + "epoch": 0.004739542191474032, + "grad_norm": 0.134765625, + "learning_rate": 0.00109, + "loss": 0.1982, + "step": 546 + }, + { + "epoch": 0.004748222671678197, + "grad_norm": 0.1591796875, + "learning_rate": 0.001092, + "loss": 0.2773, + "step": 547 + }, + { + "epoch": 0.004756903151882362, + "grad_norm": 0.1435546875, + "learning_rate": 0.0010940000000000001, + "loss": 0.2891, + "step": 548 + }, + { + "epoch": 0.004765583632086527, + "grad_norm": 0.1484375, + "learning_rate": 0.0010960000000000002, + "loss": 0.2285, + "step": 549 + }, + { + "epoch": 0.004774264112290692, + "grad_norm": 0.1611328125, + "learning_rate": 0.001098, + "loss": 0.2949, + "step": 550 + }, + { + "epoch": 0.004782944592494857, + "grad_norm": 0.220703125, + "learning_rate": 0.0011, + "loss": 0.3047, + "step": 551 + }, + { + "epoch": 0.004791625072699022, + "grad_norm": 0.1103515625, + "learning_rate": 0.0011020000000000001, + "loss": 0.2793, + "step": 552 + }, + { + "epoch": 0.004800305552903186, + "grad_norm": 0.181640625, + "learning_rate": 0.0011040000000000002, + "loss": 0.3809, + "step": 553 + }, + { + 
"epoch": 0.004808986033107351, + "grad_norm": 0.1455078125, + "learning_rate": 0.0011060000000000002, + "loss": 0.2539, + "step": 554 + }, + { + "epoch": 0.0048176665133115165, + "grad_norm": 0.17578125, + "learning_rate": 0.001108, + "loss": 0.2559, + "step": 555 + }, + { + "epoch": 0.004826346993515682, + "grad_norm": 0.1416015625, + "learning_rate": 0.00111, + "loss": 0.3125, + "step": 556 + }, + { + "epoch": 0.004835027473719846, + "grad_norm": 0.12353515625, + "learning_rate": 0.0011120000000000001, + "loss": 0.2441, + "step": 557 + }, + { + "epoch": 0.004843707953924011, + "grad_norm": 0.130859375, + "learning_rate": 0.0011140000000000002, + "loss": 0.2598, + "step": 558 + }, + { + "epoch": 0.004852388434128176, + "grad_norm": 0.1279296875, + "learning_rate": 0.001116, + "loss": 0.2852, + "step": 559 + }, + { + "epoch": 0.004861068914332341, + "grad_norm": 0.1064453125, + "learning_rate": 0.001118, + "loss": 0.2754, + "step": 560 + }, + { + "epoch": 0.0048697493945365055, + "grad_norm": 0.2314453125, + "learning_rate": 0.0011200000000000001, + "loss": 0.248, + "step": 561 + }, + { + "epoch": 0.004878429874740671, + "grad_norm": 0.1279296875, + "learning_rate": 0.0011220000000000002, + "loss": 0.2832, + "step": 562 + }, + { + "epoch": 0.004887110354944836, + "grad_norm": 0.11376953125, + "learning_rate": 0.0011240000000000002, + "loss": 0.2275, + "step": 563 + }, + { + "epoch": 0.004895790835149001, + "grad_norm": 0.177734375, + "learning_rate": 0.0011259999999999998, + "loss": 0.457, + "step": 564 + }, + { + "epoch": 0.004904471315353165, + "grad_norm": 0.169921875, + "learning_rate": 0.0011279999999999999, + "loss": 0.3164, + "step": 565 + }, + { + "epoch": 0.00491315179555733, + "grad_norm": 0.1552734375, + "learning_rate": 0.00113, + "loss": 0.2754, + "step": 566 + }, + { + "epoch": 0.004921832275761495, + "grad_norm": 0.1591796875, + "learning_rate": 0.001132, + "loss": 0.2559, + "step": 567 + }, + { + "epoch": 0.00493051275596566, + "grad_norm": 0.1591796875, + "learning_rate": 0.001134, + "loss": 0.25, + "step": 568 + }, + { + "epoch": 0.004939193236169825, + "grad_norm": 0.1630859375, + "learning_rate": 0.0011359999999999999, + "loss": 0.2363, + "step": 569 + }, + { + "epoch": 0.00494787371637399, + "grad_norm": 0.435546875, + "learning_rate": 0.001138, + "loss": 0.8203, + "step": 570 + }, + { + "epoch": 0.004956554196578155, + "grad_norm": 0.1455078125, + "learning_rate": 0.00114, + "loss": 0.2598, + "step": 571 + }, + { + "epoch": 0.004965234676782319, + "grad_norm": 0.1455078125, + "learning_rate": 0.001142, + "loss": 0.2969, + "step": 572 + }, + { + "epoch": 0.004973915156986484, + "grad_norm": 0.205078125, + "learning_rate": 0.0011439999999999998, + "loss": 0.293, + "step": 573 + }, + { + "epoch": 0.0049825956371906495, + "grad_norm": 0.1748046875, + "learning_rate": 0.0011459999999999999, + "loss": 0.291, + "step": 574 + }, + { + "epoch": 0.004991276117394815, + "grad_norm": 0.1689453125, + "learning_rate": 0.001148, + "loss": 0.252, + "step": 575 + }, + { + "epoch": 0.004999956597598979, + "grad_norm": 0.126953125, + "learning_rate": 0.00115, + "loss": 0.2773, + "step": 576 + }, + { + "epoch": 0.005008637077803144, + "grad_norm": 0.1611328125, + "learning_rate": 0.001152, + "loss": 0.2578, + "step": 577 + }, + { + "epoch": 0.005017317558007309, + "grad_norm": 0.205078125, + "learning_rate": 0.0011539999999999999, + "loss": 0.25, + "step": 578 + }, + { + "epoch": 0.005025998038211474, + "grad_norm": 0.111328125, + "learning_rate": 0.001156, + "loss": 0.2461, + "step": 
579 + }, + { + "epoch": 0.0050346785184156384, + "grad_norm": 0.1162109375, + "learning_rate": 0.001158, + "loss": 0.2168, + "step": 580 + }, + { + "epoch": 0.0050433589986198036, + "grad_norm": 0.08544921875, + "learning_rate": 0.00116, + "loss": 0.2559, + "step": 581 + }, + { + "epoch": 0.005052039478823969, + "grad_norm": 0.412109375, + "learning_rate": 0.0011619999999999998, + "loss": 0.4023, + "step": 582 + }, + { + "epoch": 0.005060719959028134, + "grad_norm": 0.177734375, + "learning_rate": 0.0011639999999999999, + "loss": 0.3223, + "step": 583 + }, + { + "epoch": 0.005069400439232298, + "grad_norm": 0.130859375, + "learning_rate": 0.001166, + "loss": 0.2754, + "step": 584 + }, + { + "epoch": 0.005078080919436463, + "grad_norm": 0.09716796875, + "learning_rate": 0.001168, + "loss": 0.2891, + "step": 585 + }, + { + "epoch": 0.005086761399640628, + "grad_norm": 0.11328125, + "learning_rate": 0.00117, + "loss": 0.2578, + "step": 586 + }, + { + "epoch": 0.005095441879844793, + "grad_norm": 0.1455078125, + "learning_rate": 0.0011719999999999999, + "loss": 0.2871, + "step": 587 + }, + { + "epoch": 0.005104122360048958, + "grad_norm": 0.111328125, + "learning_rate": 0.001174, + "loss": 0.2617, + "step": 588 + }, + { + "epoch": 0.005112802840253123, + "grad_norm": 0.1171875, + "learning_rate": 0.001176, + "loss": 0.3145, + "step": 589 + }, + { + "epoch": 0.005121483320457288, + "grad_norm": 0.10595703125, + "learning_rate": 0.001178, + "loss": 0.2734, + "step": 590 + }, + { + "epoch": 0.005130163800661453, + "grad_norm": 0.09716796875, + "learning_rate": 0.00118, + "loss": 0.2578, + "step": 591 + }, + { + "epoch": 0.005138844280865617, + "grad_norm": 0.1279296875, + "learning_rate": 0.0011819999999999999, + "loss": 0.3203, + "step": 592 + }, + { + "epoch": 0.005147524761069782, + "grad_norm": 0.12109375, + "learning_rate": 0.001184, + "loss": 0.2637, + "step": 593 + }, + { + "epoch": 0.0051562052412739475, + "grad_norm": 0.158203125, + "learning_rate": 0.001186, + "loss": 0.2695, + "step": 594 + }, + { + "epoch": 0.005164885721478112, + "grad_norm": 0.380859375, + "learning_rate": 0.001188, + "loss": 0.3398, + "step": 595 + }, + { + "epoch": 0.005173566201682277, + "grad_norm": 0.154296875, + "learning_rate": 0.0011899999999999999, + "loss": 0.2734, + "step": 596 + }, + { + "epoch": 0.005182246681886442, + "grad_norm": 0.1396484375, + "learning_rate": 0.001192, + "loss": 0.3418, + "step": 597 + }, + { + "epoch": 0.005190927162090607, + "grad_norm": 0.162109375, + "learning_rate": 0.001194, + "loss": 0.2695, + "step": 598 + }, + { + "epoch": 0.005199607642294771, + "grad_norm": 0.1552734375, + "learning_rate": 0.001196, + "loss": 0.2656, + "step": 599 + }, + { + "epoch": 0.0052082881224989365, + "grad_norm": 0.16015625, + "learning_rate": 0.001198, + "loss": 0.3262, + "step": 600 + }, + { + "epoch": 0.005216968602703102, + "grad_norm": 0.158203125, + "learning_rate": 0.0012, + "loss": 0.2891, + "step": 601 + }, + { + "epoch": 0.005225649082907267, + "grad_norm": 0.228515625, + "learning_rate": 0.001202, + "loss": 0.3535, + "step": 602 + }, + { + "epoch": 0.005234329563111431, + "grad_norm": 0.1796875, + "learning_rate": 0.001204, + "loss": 0.2656, + "step": 603 + }, + { + "epoch": 0.005243010043315596, + "grad_norm": 0.1318359375, + "learning_rate": 0.001206, + "loss": 0.2539, + "step": 604 + }, + { + "epoch": 0.005251690523519761, + "grad_norm": 0.1572265625, + "learning_rate": 0.001208, + "loss": 0.2812, + "step": 605 + }, + { + "epoch": 0.005260371003723926, + "grad_norm": 0.12109375, + 
"learning_rate": 0.00121, + "loss": 0.2598, + "step": 606 + }, + { + "epoch": 0.005269051483928091, + "grad_norm": 0.248046875, + "learning_rate": 0.001212, + "loss": 0.293, + "step": 607 + }, + { + "epoch": 0.005277731964132256, + "grad_norm": 1.828125, + "learning_rate": 0.001214, + "loss": 0.7344, + "step": 608 + }, + { + "epoch": 0.005286412444336421, + "grad_norm": 0.12158203125, + "learning_rate": 0.001216, + "loss": 0.2539, + "step": 609 + }, + { + "epoch": 0.005295092924540586, + "grad_norm": 0.1396484375, + "learning_rate": 0.001218, + "loss": 0.3047, + "step": 610 + }, + { + "epoch": 0.00530377340474475, + "grad_norm": 0.1220703125, + "learning_rate": 0.00122, + "loss": 0.2188, + "step": 611 + }, + { + "epoch": 0.005312453884948915, + "grad_norm": 0.1484375, + "learning_rate": 0.001222, + "loss": 0.2852, + "step": 612 + }, + { + "epoch": 0.0053211343651530804, + "grad_norm": 0.23828125, + "learning_rate": 0.001224, + "loss": 0.2969, + "step": 613 + }, + { + "epoch": 0.005329814845357246, + "grad_norm": 0.302734375, + "learning_rate": 0.001226, + "loss": 0.3613, + "step": 614 + }, + { + "epoch": 0.00533849532556141, + "grad_norm": 0.265625, + "learning_rate": 0.001228, + "loss": 0.2617, + "step": 615 + }, + { + "epoch": 0.005347175805765575, + "grad_norm": 0.08935546875, + "learning_rate": 0.00123, + "loss": 0.2539, + "step": 616 + }, + { + "epoch": 0.00535585628596974, + "grad_norm": 0.134765625, + "learning_rate": 0.001232, + "loss": 0.2988, + "step": 617 + }, + { + "epoch": 0.005364536766173904, + "grad_norm": 0.1396484375, + "learning_rate": 0.001234, + "loss": 0.3164, + "step": 618 + }, + { + "epoch": 0.005373217246378069, + "grad_norm": 0.16796875, + "learning_rate": 0.0012360000000000001, + "loss": 0.2598, + "step": 619 + }, + { + "epoch": 0.0053818977265822346, + "grad_norm": 0.12451171875, + "learning_rate": 0.001238, + "loss": 0.3203, + "step": 620 + }, + { + "epoch": 0.0053905782067864, + "grad_norm": 0.11474609375, + "learning_rate": 0.00124, + "loss": 0.2891, + "step": 621 + }, + { + "epoch": 0.005399258686990564, + "grad_norm": 0.1162109375, + "learning_rate": 0.001242, + "loss": 0.3086, + "step": 622 + }, + { + "epoch": 0.005407939167194729, + "grad_norm": 0.1005859375, + "learning_rate": 0.001244, + "loss": 0.3008, + "step": 623 + }, + { + "epoch": 0.005416619647398894, + "grad_norm": 0.07958984375, + "learning_rate": 0.001246, + "loss": 0.2578, + "step": 624 + }, + { + "epoch": 0.005425300127603059, + "grad_norm": 0.12890625, + "learning_rate": 0.001248, + "loss": 0.2832, + "step": 625 + }, + { + "epoch": 0.0054339806078072235, + "grad_norm": 0.1318359375, + "learning_rate": 0.00125, + "loss": 0.3555, + "step": 626 + }, + { + "epoch": 0.005442661088011389, + "grad_norm": 0.07373046875, + "learning_rate": 0.001252, + "loss": 0.2354, + "step": 627 + }, + { + "epoch": 0.005451341568215554, + "grad_norm": 0.142578125, + "learning_rate": 0.0012540000000000001, + "loss": 0.2852, + "step": 628 + }, + { + "epoch": 0.005460022048419719, + "grad_norm": 0.11181640625, + "learning_rate": 0.001256, + "loss": 0.3789, + "step": 629 + }, + { + "epoch": 0.005468702528623883, + "grad_norm": 0.11376953125, + "learning_rate": 0.001258, + "loss": 0.2852, + "step": 630 + }, + { + "epoch": 0.005477383008828048, + "grad_norm": 0.146484375, + "learning_rate": 0.00126, + "loss": 0.3027, + "step": 631 + }, + { + "epoch": 0.005486063489032213, + "grad_norm": 0.1298828125, + "learning_rate": 0.001262, + "loss": 0.2871, + "step": 632 + }, + { + "epoch": 0.0054947439692363785, + "grad_norm": 
0.11474609375, + "learning_rate": 0.001264, + "loss": 0.2988, + "step": 633 + }, + { + "epoch": 0.005503424449440543, + "grad_norm": 0.39453125, + "learning_rate": 0.001266, + "loss": 0.4531, + "step": 634 + }, + { + "epoch": 0.005512104929644708, + "grad_norm": 0.09716796875, + "learning_rate": 0.001268, + "loss": 0.252, + "step": 635 + }, + { + "epoch": 0.005520785409848873, + "grad_norm": 0.1474609375, + "learning_rate": 0.00127, + "loss": 0.291, + "step": 636 + }, + { + "epoch": 0.005529465890053038, + "grad_norm": 0.1875, + "learning_rate": 0.0012720000000000001, + "loss": 0.293, + "step": 637 + }, + { + "epoch": 0.005538146370257202, + "grad_norm": 0.2060546875, + "learning_rate": 0.001274, + "loss": 0.3555, + "step": 638 + }, + { + "epoch": 0.0055468268504613675, + "grad_norm": 0.34765625, + "learning_rate": 0.001276, + "loss": 0.375, + "step": 639 + }, + { + "epoch": 0.005555507330665533, + "grad_norm": 0.1416015625, + "learning_rate": 0.001278, + "loss": 0.2236, + "step": 640 + }, + { + "epoch": 0.005564187810869698, + "grad_norm": 0.10400390625, + "learning_rate": 0.00128, + "loss": 0.3066, + "step": 641 + }, + { + "epoch": 0.005572868291073862, + "grad_norm": 0.08740234375, + "learning_rate": 0.0012820000000000002, + "loss": 0.2891, + "step": 642 + }, + { + "epoch": 0.005581548771278027, + "grad_norm": 0.1279296875, + "learning_rate": 0.001284, + "loss": 0.3242, + "step": 643 + }, + { + "epoch": 0.005590229251482192, + "grad_norm": 0.146484375, + "learning_rate": 0.001286, + "loss": 0.3398, + "step": 644 + }, + { + "epoch": 0.0055989097316863565, + "grad_norm": 0.07763671875, + "learning_rate": 0.001288, + "loss": 0.2148, + "step": 645 + }, + { + "epoch": 0.005607590211890522, + "grad_norm": 0.1025390625, + "learning_rate": 0.0012900000000000001, + "loss": 0.293, + "step": 646 + }, + { + "epoch": 0.005616270692094687, + "grad_norm": 0.11279296875, + "learning_rate": 0.001292, + "loss": 0.3281, + "step": 647 + }, + { + "epoch": 0.005624951172298852, + "grad_norm": 0.091796875, + "learning_rate": 0.001294, + "loss": 0.3105, + "step": 648 + }, + { + "epoch": 0.005633631652503016, + "grad_norm": 0.12109375, + "learning_rate": 0.001296, + "loss": 0.3555, + "step": 649 + }, + { + "epoch": 0.005642312132707181, + "grad_norm": 0.14453125, + "learning_rate": 0.0012980000000000001, + "loss": 0.3477, + "step": 650 + }, + { + "epoch": 0.005650992612911346, + "grad_norm": 0.10205078125, + "learning_rate": 0.0013000000000000002, + "loss": 0.2988, + "step": 651 + }, + { + "epoch": 0.0056596730931155114, + "grad_norm": 0.0908203125, + "learning_rate": 0.001302, + "loss": 0.2539, + "step": 652 + }, + { + "epoch": 0.005668353573319676, + "grad_norm": 0.11572265625, + "learning_rate": 0.001304, + "loss": 0.3086, + "step": 653 + }, + { + "epoch": 0.005677034053523841, + "grad_norm": 0.11083984375, + "learning_rate": 0.001306, + "loss": 0.3906, + "step": 654 + }, + { + "epoch": 0.005685714533728006, + "grad_norm": 0.1376953125, + "learning_rate": 0.0013080000000000001, + "loss": 0.4121, + "step": 655 + }, + { + "epoch": 0.005694395013932171, + "grad_norm": 0.11572265625, + "learning_rate": 0.0013100000000000002, + "loss": 0.3281, + "step": 656 + }, + { + "epoch": 0.005703075494136335, + "grad_norm": 0.1396484375, + "learning_rate": 0.001312, + "loss": 0.3574, + "step": 657 + }, + { + "epoch": 0.0057117559743405, + "grad_norm": 0.0849609375, + "learning_rate": 0.001314, + "loss": 0.2422, + "step": 658 + }, + { + "epoch": 0.0057204364545446655, + "grad_norm": 0.1376953125, + "learning_rate": 
0.0013160000000000001, + "loss": 0.2676, + "step": 659 + }, + { + "epoch": 0.005729116934748831, + "grad_norm": 0.1328125, + "learning_rate": 0.0013180000000000002, + "loss": 0.3477, + "step": 660 + }, + { + "epoch": 0.005737797414952995, + "grad_norm": 0.11572265625, + "learning_rate": 0.00132, + "loss": 0.3008, + "step": 661 + }, + { + "epoch": 0.00574647789515716, + "grad_norm": 0.134765625, + "learning_rate": 0.001322, + "loss": 0.2852, + "step": 662 + }, + { + "epoch": 0.005755158375361325, + "grad_norm": 0.11279296875, + "learning_rate": 0.001324, + "loss": 0.2812, + "step": 663 + }, + { + "epoch": 0.00576383885556549, + "grad_norm": 0.0986328125, + "learning_rate": 0.0013260000000000001, + "loss": 0.3086, + "step": 664 + }, + { + "epoch": 0.0057725193357696545, + "grad_norm": 0.08154296875, + "learning_rate": 0.0013280000000000002, + "loss": 0.2188, + "step": 665 + }, + { + "epoch": 0.00578119981597382, + "grad_norm": 0.12158203125, + "learning_rate": 0.00133, + "loss": 0.2891, + "step": 666 + }, + { + "epoch": 0.005789880296177985, + "grad_norm": 0.1181640625, + "learning_rate": 0.001332, + "loss": 0.332, + "step": 667 + }, + { + "epoch": 0.00579856077638215, + "grad_norm": 0.1953125, + "learning_rate": 0.0013340000000000001, + "loss": 0.3223, + "step": 668 + }, + { + "epoch": 0.005807241256586314, + "grad_norm": 0.119140625, + "learning_rate": 0.0013360000000000002, + "loss": 0.3359, + "step": 669 + }, + { + "epoch": 0.005815921736790479, + "grad_norm": 0.1103515625, + "learning_rate": 0.0013380000000000002, + "loss": 0.2461, + "step": 670 + }, + { + "epoch": 0.005824602216994644, + "grad_norm": 0.103515625, + "learning_rate": 0.00134, + "loss": 0.2871, + "step": 671 + }, + { + "epoch": 0.005833282697198809, + "grad_norm": 0.10546875, + "learning_rate": 0.001342, + "loss": 0.2656, + "step": 672 + }, + { + "epoch": 0.005841963177402974, + "grad_norm": 0.10595703125, + "learning_rate": 0.0013440000000000001, + "loss": 0.2559, + "step": 673 + }, + { + "epoch": 0.005850643657607139, + "grad_norm": 0.10400390625, + "learning_rate": 0.0013460000000000002, + "loss": 0.2656, + "step": 674 + }, + { + "epoch": 0.005859324137811304, + "grad_norm": 0.13671875, + "learning_rate": 0.001348, + "loss": 0.2793, + "step": 675 + }, + { + "epoch": 0.005868004618015468, + "grad_norm": 0.111328125, + "learning_rate": 0.00135, + "loss": 0.3008, + "step": 676 + }, + { + "epoch": 0.005876685098219633, + "grad_norm": 0.10595703125, + "learning_rate": 0.0013520000000000001, + "loss": 0.2451, + "step": 677 + }, + { + "epoch": 0.0058853655784237985, + "grad_norm": 0.1142578125, + "learning_rate": 0.0013540000000000002, + "loss": 0.2812, + "step": 678 + }, + { + "epoch": 0.005894046058627964, + "grad_norm": 0.1044921875, + "learning_rate": 0.0013560000000000002, + "loss": 0.373, + "step": 679 + }, + { + "epoch": 0.005902726538832128, + "grad_norm": 0.11083984375, + "learning_rate": 0.001358, + "loss": 0.2168, + "step": 680 + }, + { + "epoch": 0.005911407019036293, + "grad_norm": 0.0966796875, + "learning_rate": 0.00136, + "loss": 0.2793, + "step": 681 + }, + { + "epoch": 0.005920087499240458, + "grad_norm": 0.12890625, + "learning_rate": 0.0013620000000000001, + "loss": 0.293, + "step": 682 + }, + { + "epoch": 0.005928767979444623, + "grad_norm": 0.0927734375, + "learning_rate": 0.0013640000000000002, + "loss": 0.249, + "step": 683 + }, + { + "epoch": 0.0059374484596487875, + "grad_norm": 0.10205078125, + "learning_rate": 0.001366, + "loss": 0.2461, + "step": 684 + }, + { + "epoch": 0.005946128939852953, + 
"grad_norm": 0.1357421875, + "learning_rate": 0.001368, + "loss": 0.3066, + "step": 685 + }, + { + "epoch": 0.005954809420057118, + "grad_norm": 0.095703125, + "learning_rate": 0.0013700000000000001, + "loss": 0.2598, + "step": 686 + }, + { + "epoch": 0.005963489900261283, + "grad_norm": 0.14453125, + "learning_rate": 0.0013720000000000002, + "loss": 0.2314, + "step": 687 + }, + { + "epoch": 0.005972170380465447, + "grad_norm": 0.1201171875, + "learning_rate": 0.0013740000000000002, + "loss": 0.3086, + "step": 688 + }, + { + "epoch": 0.005980850860669612, + "grad_norm": 0.185546875, + "learning_rate": 0.0013759999999999998, + "loss": 0.2734, + "step": 689 + }, + { + "epoch": 0.005989531340873777, + "grad_norm": 0.1142578125, + "learning_rate": 0.0013779999999999999, + "loss": 0.1924, + "step": 690 + }, + { + "epoch": 0.0059982118210779424, + "grad_norm": 0.1787109375, + "learning_rate": 0.00138, + "loss": 0.2891, + "step": 691 + }, + { + "epoch": 0.006006892301282107, + "grad_norm": 0.1904296875, + "learning_rate": 0.001382, + "loss": 0.3047, + "step": 692 + }, + { + "epoch": 0.006015572781486272, + "grad_norm": 0.1357421875, + "learning_rate": 0.001384, + "loss": 0.2773, + "step": 693 + }, + { + "epoch": 0.006024253261690437, + "grad_norm": 0.10693359375, + "learning_rate": 0.0013859999999999999, + "loss": 0.21, + "step": 694 + }, + { + "epoch": 0.006032933741894601, + "grad_norm": 0.1279296875, + "learning_rate": 0.001388, + "loss": 0.2793, + "step": 695 + }, + { + "epoch": 0.006041614222098766, + "grad_norm": 0.134765625, + "learning_rate": 0.00139, + "loss": 0.3359, + "step": 696 + }, + { + "epoch": 0.006050294702302931, + "grad_norm": 0.12890625, + "learning_rate": 0.001392, + "loss": 0.2432, + "step": 697 + }, + { + "epoch": 0.0060589751825070965, + "grad_norm": 0.1259765625, + "learning_rate": 0.0013939999999999998, + "loss": 0.3105, + "step": 698 + }, + { + "epoch": 0.006067655662711261, + "grad_norm": 0.1259765625, + "learning_rate": 0.0013959999999999999, + "loss": 0.3457, + "step": 699 + }, + { + "epoch": 0.006076336142915426, + "grad_norm": 0.12109375, + "learning_rate": 0.001398, + "loss": 0.3086, + "step": 700 + }, + { + "epoch": 0.006085016623119591, + "grad_norm": 0.162109375, + "learning_rate": 0.0014, + "loss": 0.249, + "step": 701 + }, + { + "epoch": 0.006093697103323756, + "grad_norm": 0.1337890625, + "learning_rate": 0.001402, + "loss": 0.2812, + "step": 702 + }, + { + "epoch": 0.00610237758352792, + "grad_norm": 0.12890625, + "learning_rate": 0.0014039999999999999, + "loss": 0.3145, + "step": 703 + }, + { + "epoch": 0.0061110580637320855, + "grad_norm": 0.10302734375, + "learning_rate": 0.001406, + "loss": 0.2617, + "step": 704 + }, + { + "epoch": 0.006119738543936251, + "grad_norm": 0.126953125, + "learning_rate": 0.001408, + "loss": 0.2539, + "step": 705 + }, + { + "epoch": 0.006128419024140416, + "grad_norm": 0.0908203125, + "learning_rate": 0.00141, + "loss": 0.2432, + "step": 706 + }, + { + "epoch": 0.00613709950434458, + "grad_norm": 0.12158203125, + "learning_rate": 0.0014119999999999998, + "loss": 0.3047, + "step": 707 + }, + { + "epoch": 0.006145779984548745, + "grad_norm": 0.1318359375, + "learning_rate": 0.001414, + "loss": 0.4062, + "step": 708 + }, + { + "epoch": 0.00615446046475291, + "grad_norm": 0.107421875, + "learning_rate": 0.001416, + "loss": 0.2832, + "step": 709 + }, + { + "epoch": 0.006163140944957075, + "grad_norm": 0.0830078125, + "learning_rate": 0.001418, + "loss": 0.2578, + "step": 710 + }, + { + "epoch": 0.00617182142516124, + "grad_norm": 
0.099609375, + "learning_rate": 0.00142, + "loss": 0.2949, + "step": 711 + }, + { + "epoch": 0.006180501905365405, + "grad_norm": 0.11279296875, + "learning_rate": 0.0014219999999999999, + "loss": 0.2949, + "step": 712 + }, + { + "epoch": 0.00618918238556957, + "grad_norm": 0.09228515625, + "learning_rate": 0.001424, + "loss": 0.252, + "step": 713 + }, + { + "epoch": 0.006197862865773735, + "grad_norm": 0.1416015625, + "learning_rate": 0.001426, + "loss": 0.3047, + "step": 714 + }, + { + "epoch": 0.006206543345977899, + "grad_norm": 0.1328125, + "learning_rate": 0.001428, + "loss": 0.25, + "step": 715 + }, + { + "epoch": 0.006215223826182064, + "grad_norm": 0.1513671875, + "learning_rate": 0.00143, + "loss": 0.3164, + "step": 716 + }, + { + "epoch": 0.0062239043063862295, + "grad_norm": 0.10009765625, + "learning_rate": 0.001432, + "loss": 0.2812, + "step": 717 + }, + { + "epoch": 0.006232584786590395, + "grad_norm": 0.115234375, + "learning_rate": 0.001434, + "loss": 0.2852, + "step": 718 + }, + { + "epoch": 0.006241265266794559, + "grad_norm": 0.10888671875, + "learning_rate": 0.001436, + "loss": 0.3008, + "step": 719 + }, + { + "epoch": 0.006249945746998724, + "grad_norm": 0.1376953125, + "learning_rate": 0.001438, + "loss": 0.2246, + "step": 720 + }, + { + "epoch": 0.006258626227202889, + "grad_norm": 0.12890625, + "learning_rate": 0.0014399999999999999, + "loss": 0.2949, + "step": 721 + }, + { + "epoch": 0.006267306707407053, + "grad_norm": 0.09716796875, + "learning_rate": 0.001442, + "loss": 0.2383, + "step": 722 + }, + { + "epoch": 0.0062759871876112185, + "grad_norm": 0.1044921875, + "learning_rate": 0.001444, + "loss": 0.2734, + "step": 723 + }, + { + "epoch": 0.006284667667815384, + "grad_norm": 0.0947265625, + "learning_rate": 0.001446, + "loss": 0.207, + "step": 724 + }, + { + "epoch": 0.006293348148019549, + "grad_norm": 0.11328125, + "learning_rate": 0.001448, + "loss": 0.3086, + "step": 725 + }, + { + "epoch": 0.006302028628223713, + "grad_norm": 0.11083984375, + "learning_rate": 0.00145, + "loss": 0.2188, + "step": 726 + }, + { + "epoch": 0.006310709108427878, + "grad_norm": 0.1396484375, + "learning_rate": 0.001452, + "loss": 0.3105, + "step": 727 + }, + { + "epoch": 0.006319389588632043, + "grad_norm": 0.10546875, + "learning_rate": 0.001454, + "loss": 0.3066, + "step": 728 + }, + { + "epoch": 0.006328070068836208, + "grad_norm": 0.12158203125, + "learning_rate": 0.001456, + "loss": 0.293, + "step": 729 + }, + { + "epoch": 0.0063367505490403726, + "grad_norm": 0.10302734375, + "learning_rate": 0.001458, + "loss": 0.2402, + "step": 730 + }, + { + "epoch": 0.006345431029244538, + "grad_norm": 0.11669921875, + "learning_rate": 0.00146, + "loss": 0.2773, + "step": 731 + }, + { + "epoch": 0.006354111509448703, + "grad_norm": 0.11865234375, + "learning_rate": 0.001462, + "loss": 0.2559, + "step": 732 + }, + { + "epoch": 0.006362791989652868, + "grad_norm": 0.126953125, + "learning_rate": 0.001464, + "loss": 0.2988, + "step": 733 + }, + { + "epoch": 0.006371472469857032, + "grad_norm": 0.09423828125, + "learning_rate": 0.001466, + "loss": 0.2246, + "step": 734 + }, + { + "epoch": 0.006380152950061197, + "grad_norm": 0.1318359375, + "learning_rate": 0.001468, + "loss": 0.3301, + "step": 735 + }, + { + "epoch": 0.006388833430265362, + "grad_norm": 0.115234375, + "learning_rate": 0.00147, + "loss": 0.2422, + "step": 736 + }, + { + "epoch": 0.0063975139104695275, + "grad_norm": 0.1591796875, + "learning_rate": 0.001472, + "loss": 0.2539, + "step": 737 + }, + { + "epoch": 
0.006406194390673692, + "grad_norm": 0.08984375, + "learning_rate": 0.001474, + "loss": 0.2773, + "step": 738 + }, + { + "epoch": 0.006414874870877857, + "grad_norm": 0.07568359375, + "learning_rate": 0.001476, + "loss": 0.2539, + "step": 739 + }, + { + "epoch": 0.006423555351082022, + "grad_norm": 0.1474609375, + "learning_rate": 0.001478, + "loss": 0.3477, + "step": 740 + }, + { + "epoch": 0.006432235831286187, + "grad_norm": 0.12890625, + "learning_rate": 0.00148, + "loss": 0.3477, + "step": 741 + }, + { + "epoch": 0.006440916311490351, + "grad_norm": 0.16796875, + "learning_rate": 0.001482, + "loss": 0.2773, + "step": 742 + }, + { + "epoch": 0.0064495967916945165, + "grad_norm": 0.12060546875, + "learning_rate": 0.001484, + "loss": 0.2754, + "step": 743 + }, + { + "epoch": 0.006458277271898682, + "grad_norm": 0.115234375, + "learning_rate": 0.0014860000000000001, + "loss": 0.3145, + "step": 744 + }, + { + "epoch": 0.006466957752102847, + "grad_norm": 0.1259765625, + "learning_rate": 0.001488, + "loss": 0.3066, + "step": 745 + }, + { + "epoch": 0.006475638232307011, + "grad_norm": 0.0908203125, + "learning_rate": 0.00149, + "loss": 0.2393, + "step": 746 + }, + { + "epoch": 0.006484318712511176, + "grad_norm": 0.10791015625, + "learning_rate": 0.001492, + "loss": 0.2461, + "step": 747 + }, + { + "epoch": 0.006492999192715341, + "grad_norm": 0.10546875, + "learning_rate": 0.001494, + "loss": 0.2617, + "step": 748 + }, + { + "epoch": 0.0065016796729195055, + "grad_norm": 0.08935546875, + "learning_rate": 0.001496, + "loss": 0.249, + "step": 749 + }, + { + "epoch": 0.006510360153123671, + "grad_norm": 0.11865234375, + "learning_rate": 0.001498, + "loss": 0.2656, + "step": 750 + }, + { + "epoch": 0.006519040633327836, + "grad_norm": 0.12109375, + "learning_rate": 0.0015, + "loss": 0.2578, + "step": 751 + }, + { + "epoch": 0.006527721113532001, + "grad_norm": 0.1162109375, + "learning_rate": 0.001502, + "loss": 0.2559, + "step": 752 + }, + { + "epoch": 0.006536401593736165, + "grad_norm": 0.0888671875, + "learning_rate": 0.0015040000000000001, + "loss": 0.2285, + "step": 753 + }, + { + "epoch": 0.00654508207394033, + "grad_norm": 0.1591796875, + "learning_rate": 0.001506, + "loss": 0.3125, + "step": 754 + }, + { + "epoch": 0.006553762554144495, + "grad_norm": 0.12353515625, + "learning_rate": 0.001508, + "loss": 0.2451, + "step": 755 + }, + { + "epoch": 0.0065624430343486605, + "grad_norm": 0.11181640625, + "learning_rate": 0.00151, + "loss": 0.3027, + "step": 756 + }, + { + "epoch": 0.006571123514552825, + "grad_norm": 0.1015625, + "learning_rate": 0.001512, + "loss": 0.2734, + "step": 757 + }, + { + "epoch": 0.00657980399475699, + "grad_norm": 0.12451171875, + "learning_rate": 0.001514, + "loss": 0.3281, + "step": 758 + }, + { + "epoch": 0.006588484474961155, + "grad_norm": 0.1201171875, + "learning_rate": 0.001516, + "loss": 0.2305, + "step": 759 + }, + { + "epoch": 0.00659716495516532, + "grad_norm": 0.1025390625, + "learning_rate": 0.001518, + "loss": 0.2314, + "step": 760 + }, + { + "epoch": 0.006605845435369484, + "grad_norm": 0.140625, + "learning_rate": 0.00152, + "loss": 0.2773, + "step": 761 + }, + { + "epoch": 0.0066145259155736495, + "grad_norm": 0.12255859375, + "learning_rate": 0.0015220000000000001, + "loss": 0.2227, + "step": 762 + }, + { + "epoch": 0.006623206395777815, + "grad_norm": 0.1123046875, + "learning_rate": 0.001524, + "loss": 0.2617, + "step": 763 + }, + { + "epoch": 0.00663188687598198, + "grad_norm": 0.1005859375, + "learning_rate": 0.001526, + "loss": 0.25, + 
"step": 764 + }, + { + "epoch": 0.006640567356186144, + "grad_norm": 0.08740234375, + "learning_rate": 0.001528, + "loss": 0.207, + "step": 765 + }, + { + "epoch": 0.006649247836390309, + "grad_norm": 0.09814453125, + "learning_rate": 0.0015300000000000001, + "loss": 0.2598, + "step": 766 + }, + { + "epoch": 0.006657928316594474, + "grad_norm": 0.09130859375, + "learning_rate": 0.0015320000000000002, + "loss": 0.2578, + "step": 767 + }, + { + "epoch": 0.006666608796798639, + "grad_norm": 0.10009765625, + "learning_rate": 0.001534, + "loss": 0.2617, + "step": 768 + }, + { + "epoch": 0.0066752892770028036, + "grad_norm": 0.10693359375, + "learning_rate": 0.001536, + "loss": 0.3008, + "step": 769 + }, + { + "epoch": 0.006683969757206969, + "grad_norm": 0.1279296875, + "learning_rate": 0.001538, + "loss": 0.3613, + "step": 770 + }, + { + "epoch": 0.006692650237411134, + "grad_norm": 0.10205078125, + "learning_rate": 0.0015400000000000001, + "loss": 0.2422, + "step": 771 + }, + { + "epoch": 0.006701330717615298, + "grad_norm": 0.1982421875, + "learning_rate": 0.001542, + "loss": 0.3379, + "step": 772 + }, + { + "epoch": 0.006710011197819463, + "grad_norm": 0.1357421875, + "learning_rate": 0.001544, + "loss": 0.2695, + "step": 773 + }, + { + "epoch": 0.006718691678023628, + "grad_norm": 0.1318359375, + "learning_rate": 0.001546, + "loss": 0.3477, + "step": 774 + }, + { + "epoch": 0.006727372158227793, + "grad_norm": 0.1162109375, + "learning_rate": 0.0015480000000000001, + "loss": 0.252, + "step": 775 + }, + { + "epoch": 0.006736052638431958, + "grad_norm": 0.09130859375, + "learning_rate": 0.0015500000000000002, + "loss": 0.2637, + "step": 776 + }, + { + "epoch": 0.006744733118636123, + "grad_norm": 0.099609375, + "learning_rate": 0.001552, + "loss": 0.2559, + "step": 777 + }, + { + "epoch": 0.006753413598840288, + "grad_norm": 0.44921875, + "learning_rate": 0.001554, + "loss": 0.4492, + "step": 778 + }, + { + "epoch": 0.006762094079044453, + "grad_norm": 0.1044921875, + "learning_rate": 0.001556, + "loss": 0.2188, + "step": 779 + }, + { + "epoch": 0.006770774559248617, + "grad_norm": 0.244140625, + "learning_rate": 0.0015580000000000001, + "loss": 0.2656, + "step": 780 + }, + { + "epoch": 0.006779455039452782, + "grad_norm": 0.1533203125, + "learning_rate": 0.0015600000000000002, + "loss": 0.2793, + "step": 781 + }, + { + "epoch": 0.0067881355196569475, + "grad_norm": 0.130859375, + "learning_rate": 0.001562, + "loss": 0.25, + "step": 782 + }, + { + "epoch": 0.006796815999861113, + "grad_norm": 0.146484375, + "learning_rate": 0.001564, + "loss": 0.2217, + "step": 783 + }, + { + "epoch": 0.006805496480065277, + "grad_norm": 0.146484375, + "learning_rate": 0.0015660000000000001, + "loss": 0.2422, + "step": 784 + }, + { + "epoch": 0.006814176960269442, + "grad_norm": 0.1201171875, + "learning_rate": 0.0015680000000000002, + "loss": 0.2812, + "step": 785 + }, + { + "epoch": 0.006822857440473607, + "grad_norm": 0.1259765625, + "learning_rate": 0.00157, + "loss": 0.2188, + "step": 786 + }, + { + "epoch": 0.006831537920677772, + "grad_norm": 0.134765625, + "learning_rate": 0.001572, + "loss": 0.2305, + "step": 787 + }, + { + "epoch": 0.0068402184008819365, + "grad_norm": 0.1162109375, + "learning_rate": 0.001574, + "loss": 0.2812, + "step": 788 + }, + { + "epoch": 0.006848898881086102, + "grad_norm": 0.091796875, + "learning_rate": 0.0015760000000000001, + "loss": 0.2168, + "step": 789 + }, + { + "epoch": 0.006857579361290267, + "grad_norm": 0.10498046875, + "learning_rate": 0.0015780000000000002, + 
"loss": 0.252, + "step": 790 + }, + { + "epoch": 0.006866259841494432, + "grad_norm": 0.126953125, + "learning_rate": 0.00158, + "loss": 0.3359, + "step": 791 + }, + { + "epoch": 0.006874940321698596, + "grad_norm": 0.1416015625, + "learning_rate": 0.001582, + "loss": 0.3125, + "step": 792 + }, + { + "epoch": 0.006883620801902761, + "grad_norm": 0.09765625, + "learning_rate": 0.0015840000000000001, + "loss": 0.293, + "step": 793 + }, + { + "epoch": 0.006892301282106926, + "grad_norm": 0.0693359375, + "learning_rate": 0.0015860000000000002, + "loss": 0.2617, + "step": 794 + }, + { + "epoch": 0.0069009817623110915, + "grad_norm": 0.119140625, + "learning_rate": 0.0015880000000000002, + "loss": 0.3008, + "step": 795 + }, + { + "epoch": 0.006909662242515256, + "grad_norm": 0.1025390625, + "learning_rate": 0.00159, + "loss": 0.2949, + "step": 796 + }, + { + "epoch": 0.006918342722719421, + "grad_norm": 0.1455078125, + "learning_rate": 0.001592, + "loss": 0.3262, + "step": 797 + }, + { + "epoch": 0.006927023202923586, + "grad_norm": 0.1376953125, + "learning_rate": 0.0015940000000000001, + "loss": 0.2637, + "step": 798 + }, + { + "epoch": 0.00693570368312775, + "grad_norm": 0.10205078125, + "learning_rate": 0.0015960000000000002, + "loss": 0.25, + "step": 799 + }, + { + "epoch": 0.006944384163331915, + "grad_norm": 0.1455078125, + "learning_rate": 0.001598, + "loss": 0.2148, + "step": 800 + }, + { + "epoch": 0.0069530646435360804, + "grad_norm": 0.10107421875, + "learning_rate": 0.0016, + "loss": 0.3086, + "step": 801 + }, + { + "epoch": 0.006961745123740246, + "grad_norm": 0.10302734375, + "learning_rate": 0.0016020000000000001, + "loss": 0.2891, + "step": 802 + }, + { + "epoch": 0.00697042560394441, + "grad_norm": 0.09326171875, + "learning_rate": 0.0016040000000000002, + "loss": 0.2207, + "step": 803 + }, + { + "epoch": 0.006979106084148575, + "grad_norm": 0.10986328125, + "learning_rate": 0.0016060000000000002, + "loss": 0.2852, + "step": 804 + }, + { + "epoch": 0.00698778656435274, + "grad_norm": 0.10546875, + "learning_rate": 0.001608, + "loss": 0.2793, + "step": 805 + }, + { + "epoch": 0.006996467044556905, + "grad_norm": 0.14453125, + "learning_rate": 0.00161, + "loss": 0.291, + "step": 806 + }, + { + "epoch": 0.007005147524761069, + "grad_norm": 0.115234375, + "learning_rate": 0.0016120000000000002, + "loss": 0.3086, + "step": 807 + }, + { + "epoch": 0.0070138280049652345, + "grad_norm": 0.103515625, + "learning_rate": 0.0016140000000000002, + "loss": 0.2373, + "step": 808 + }, + { + "epoch": 0.0070225084851694, + "grad_norm": 0.0966796875, + "learning_rate": 0.001616, + "loss": 0.209, + "step": 809 + }, + { + "epoch": 0.007031188965373565, + "grad_norm": 0.1611328125, + "learning_rate": 0.001618, + "loss": 0.3398, + "step": 810 + }, + { + "epoch": 0.007039869445577729, + "grad_norm": 0.080078125, + "learning_rate": 0.0016200000000000001, + "loss": 0.1973, + "step": 811 + }, + { + "epoch": 0.007048549925781894, + "grad_norm": 0.162109375, + "learning_rate": 0.0016220000000000002, + "loss": 0.3281, + "step": 812 + }, + { + "epoch": 0.007057230405986059, + "grad_norm": 0.1845703125, + "learning_rate": 0.0016240000000000002, + "loss": 0.2832, + "step": 813 + }, + { + "epoch": 0.007065910886190224, + "grad_norm": 0.1435546875, + "learning_rate": 0.0016259999999999998, + "loss": 0.2793, + "step": 814 + }, + { + "epoch": 0.007074591366394389, + "grad_norm": 0.08154296875, + "learning_rate": 0.0016279999999999999, + "loss": 0.2275, + "step": 815 + }, + { + "epoch": 0.007083271846598554, + 
"grad_norm": 0.0771484375, + "learning_rate": 0.00163, + "loss": 0.2295, + "step": 816 + }, + { + "epoch": 0.007091952326802719, + "grad_norm": 0.11474609375, + "learning_rate": 0.001632, + "loss": 0.2988, + "step": 817 + }, + { + "epoch": 0.007100632807006884, + "grad_norm": 0.1884765625, + "learning_rate": 0.001634, + "loss": 0.4414, + "step": 818 + }, + { + "epoch": 0.007109313287211048, + "grad_norm": 0.134765625, + "learning_rate": 0.0016359999999999999, + "loss": 0.3105, + "step": 819 + }, + { + "epoch": 0.007117993767415213, + "grad_norm": 0.10205078125, + "learning_rate": 0.001638, + "loss": 0.2227, + "step": 820 + }, + { + "epoch": 0.0071266742476193785, + "grad_norm": 0.10205078125, + "learning_rate": 0.00164, + "loss": 0.2852, + "step": 821 + }, + { + "epoch": 0.007135354727823544, + "grad_norm": 0.10498046875, + "learning_rate": 0.001642, + "loss": 0.208, + "step": 822 + }, + { + "epoch": 0.007144035208027708, + "grad_norm": 0.1181640625, + "learning_rate": 0.0016439999999999998, + "loss": 0.3652, + "step": 823 + }, + { + "epoch": 0.007152715688231873, + "grad_norm": 0.11572265625, + "learning_rate": 0.001646, + "loss": 0.2832, + "step": 824 + }, + { + "epoch": 0.007161396168436038, + "grad_norm": 0.099609375, + "learning_rate": 0.001648, + "loss": 0.2715, + "step": 825 + }, + { + "epoch": 0.007170076648640202, + "grad_norm": 0.10791015625, + "learning_rate": 0.00165, + "loss": 0.2305, + "step": 826 + }, + { + "epoch": 0.0071787571288443675, + "grad_norm": 0.111328125, + "learning_rate": 0.001652, + "loss": 0.252, + "step": 827 + }, + { + "epoch": 0.007187437609048533, + "grad_norm": 0.09423828125, + "learning_rate": 0.0016539999999999999, + "loss": 0.25, + "step": 828 + }, + { + "epoch": 0.007196118089252698, + "grad_norm": 0.107421875, + "learning_rate": 0.001656, + "loss": 0.2734, + "step": 829 + }, + { + "epoch": 0.007204798569456862, + "grad_norm": 0.10302734375, + "learning_rate": 0.001658, + "loss": 0.21, + "step": 830 + }, + { + "epoch": 0.007213479049661027, + "grad_norm": 0.10400390625, + "learning_rate": 0.00166, + "loss": 0.2412, + "step": 831 + }, + { + "epoch": 0.007222159529865192, + "grad_norm": 0.11572265625, + "learning_rate": 0.0016619999999999998, + "loss": 0.252, + "step": 832 + }, + { + "epoch": 0.007230840010069357, + "grad_norm": 0.087890625, + "learning_rate": 0.001664, + "loss": 0.2734, + "step": 833 + }, + { + "epoch": 0.007239520490273522, + "grad_norm": 0.0966796875, + "learning_rate": 0.001666, + "loss": 0.208, + "step": 834 + }, + { + "epoch": 0.007248200970477687, + "grad_norm": 0.0888671875, + "learning_rate": 0.001668, + "loss": 0.2422, + "step": 835 + }, + { + "epoch": 0.007256881450681852, + "grad_norm": 0.07373046875, + "learning_rate": 0.00167, + "loss": 0.2539, + "step": 836 + }, + { + "epoch": 0.007265561930886017, + "grad_norm": 0.09765625, + "learning_rate": 0.0016719999999999999, + "loss": 0.3086, + "step": 837 + }, + { + "epoch": 0.007274242411090181, + "grad_norm": 0.0859375, + "learning_rate": 0.001674, + "loss": 0.2324, + "step": 838 + }, + { + "epoch": 0.007282922891294346, + "grad_norm": 0.0927734375, + "learning_rate": 0.001676, + "loss": 0.2734, + "step": 839 + }, + { + "epoch": 0.0072916033714985114, + "grad_norm": 0.14453125, + "learning_rate": 0.001678, + "loss": 0.293, + "step": 840 + }, + { + "epoch": 0.0073002838517026766, + "grad_norm": 0.0908203125, + "learning_rate": 0.00168, + "loss": 0.2246, + "step": 841 + }, + { + "epoch": 0.007308964331906841, + "grad_norm": 0.1298828125, + "learning_rate": 0.001682, + "loss": 
0.2734, + "step": 842 + }, + { + "epoch": 0.007317644812111006, + "grad_norm": 0.1279296875, + "learning_rate": 0.001684, + "loss": 0.3281, + "step": 843 + }, + { + "epoch": 0.007326325292315171, + "grad_norm": 0.10693359375, + "learning_rate": 0.001686, + "loss": 0.3008, + "step": 844 + }, + { + "epoch": 0.007335005772519336, + "grad_norm": 0.115234375, + "learning_rate": 0.001688, + "loss": 0.25, + "step": 845 + }, + { + "epoch": 0.0073436862527235, + "grad_norm": 0.12890625, + "learning_rate": 0.0016899999999999999, + "loss": 0.3027, + "step": 846 + }, + { + "epoch": 0.0073523667329276655, + "grad_norm": 0.0927734375, + "learning_rate": 0.001692, + "loss": 0.2207, + "step": 847 + }, + { + "epoch": 0.007361047213131831, + "grad_norm": 0.1259765625, + "learning_rate": 0.001694, + "loss": 0.2793, + "step": 848 + }, + { + "epoch": 0.007369727693335995, + "grad_norm": 0.11376953125, + "learning_rate": 0.001696, + "loss": 0.2031, + "step": 849 + }, + { + "epoch": 0.00737840817354016, + "grad_norm": 0.130859375, + "learning_rate": 0.001698, + "loss": 0.2871, + "step": 850 + }, + { + "epoch": 0.007387088653744325, + "grad_norm": 0.1455078125, + "learning_rate": 0.0017, + "loss": 0.2949, + "step": 851 + }, + { + "epoch": 0.00739576913394849, + "grad_norm": 0.11572265625, + "learning_rate": 0.001702, + "loss": 0.2285, + "step": 852 + }, + { + "epoch": 0.0074044496141526545, + "grad_norm": 0.1044921875, + "learning_rate": 0.001704, + "loss": 0.2275, + "step": 853 + }, + { + "epoch": 0.00741313009435682, + "grad_norm": 0.12060546875, + "learning_rate": 0.001706, + "loss": 0.3008, + "step": 854 + }, + { + "epoch": 0.007421810574560985, + "grad_norm": 0.1171875, + "learning_rate": 0.001708, + "loss": 0.3184, + "step": 855 + }, + { + "epoch": 0.00743049105476515, + "grad_norm": 0.10546875, + "learning_rate": 0.00171, + "loss": 0.2695, + "step": 856 + }, + { + "epoch": 0.007439171534969314, + "grad_norm": 0.09130859375, + "learning_rate": 0.001712, + "loss": 0.2637, + "step": 857 + }, + { + "epoch": 0.007447852015173479, + "grad_norm": 0.12890625, + "learning_rate": 0.001714, + "loss": 0.3008, + "step": 858 + }, + { + "epoch": 0.007456532495377644, + "grad_norm": 3.140625, + "learning_rate": 0.001716, + "loss": 0.6523, + "step": 859 + }, + { + "epoch": 0.0074652129755818095, + "grad_norm": 0.1474609375, + "learning_rate": 0.001718, + "loss": 0.3145, + "step": 860 + }, + { + "epoch": 0.007473893455785974, + "grad_norm": 0.10791015625, + "learning_rate": 0.00172, + "loss": 0.291, + "step": 861 + }, + { + "epoch": 0.007482573935990139, + "grad_norm": 0.11279296875, + "learning_rate": 0.001722, + "loss": 0.3359, + "step": 862 + }, + { + "epoch": 0.007491254416194304, + "grad_norm": 0.09423828125, + "learning_rate": 0.001724, + "loss": 0.2363, + "step": 863 + }, + { + "epoch": 0.007499934896398469, + "grad_norm": 0.099609375, + "learning_rate": 0.001726, + "loss": 0.2734, + "step": 864 + }, + { + "epoch": 0.007508615376602633, + "grad_norm": 0.11962890625, + "learning_rate": 0.001728, + "loss": 0.2812, + "step": 865 + }, + { + "epoch": 0.0075172958568067985, + "grad_norm": 0.0888671875, + "learning_rate": 0.00173, + "loss": 0.2695, + "step": 866 + }, + { + "epoch": 0.007525976337010964, + "grad_norm": 0.09521484375, + "learning_rate": 0.001732, + "loss": 0.2734, + "step": 867 + }, + { + "epoch": 0.007534656817215129, + "grad_norm": 0.271484375, + "learning_rate": 0.001734, + "loss": 0.2832, + "step": 868 + }, + { + "epoch": 0.007543337297419293, + "grad_norm": 0.1044921875, + "learning_rate": 
0.0017360000000000001, + "loss": 0.2988, + "step": 869 + }, + { + "epoch": 0.007552017777623458, + "grad_norm": 0.0966796875, + "learning_rate": 0.001738, + "loss": 0.2539, + "step": 870 + }, + { + "epoch": 0.007560698257827623, + "grad_norm": 0.126953125, + "learning_rate": 0.00174, + "loss": 0.2578, + "step": 871 + }, + { + "epoch": 0.007569378738031788, + "grad_norm": 0.1416015625, + "learning_rate": 0.001742, + "loss": 0.248, + "step": 872 + }, + { + "epoch": 0.007578059218235953, + "grad_norm": 0.12353515625, + "learning_rate": 0.001744, + "loss": 0.3066, + "step": 873 + }, + { + "epoch": 0.007586739698440118, + "grad_norm": 0.09423828125, + "learning_rate": 0.001746, + "loss": 0.2148, + "step": 874 + }, + { + "epoch": 0.007595420178644283, + "grad_norm": 0.125, + "learning_rate": 0.001748, + "loss": 0.3125, + "step": 875 + }, + { + "epoch": 0.007604100658848447, + "grad_norm": 0.11669921875, + "learning_rate": 0.00175, + "loss": 0.2441, + "step": 876 + }, + { + "epoch": 0.007612781139052612, + "grad_norm": 0.1474609375, + "learning_rate": 0.001752, + "loss": 0.2617, + "step": 877 + }, + { + "epoch": 0.007621461619256777, + "grad_norm": 0.103515625, + "learning_rate": 0.0017540000000000001, + "loss": 0.2715, + "step": 878 + }, + { + "epoch": 0.0076301420994609424, + "grad_norm": 0.78515625, + "learning_rate": 0.001756, + "loss": 0.6016, + "step": 879 + }, + { + "epoch": 0.007638822579665107, + "grad_norm": 0.08251953125, + "learning_rate": 0.001758, + "loss": 0.1846, + "step": 880 + }, + { + "epoch": 0.007647503059869272, + "grad_norm": 0.1376953125, + "learning_rate": 0.00176, + "loss": 0.3008, + "step": 881 + }, + { + "epoch": 0.007656183540073437, + "grad_norm": 0.12060546875, + "learning_rate": 0.0017620000000000001, + "loss": 0.2715, + "step": 882 + }, + { + "epoch": 0.007664864020277602, + "grad_norm": 0.1318359375, + "learning_rate": 0.001764, + "loss": 0.293, + "step": 883 + }, + { + "epoch": 0.007673544500481766, + "grad_norm": 0.130859375, + "learning_rate": 0.001766, + "loss": 0.2695, + "step": 884 + }, + { + "epoch": 0.007682224980685931, + "grad_norm": 0.1259765625, + "learning_rate": 0.001768, + "loss": 0.2812, + "step": 885 + }, + { + "epoch": 0.0076909054608900965, + "grad_norm": 0.150390625, + "learning_rate": 0.00177, + "loss": 0.375, + "step": 886 + }, + { + "epoch": 0.007699585941094262, + "grad_norm": 0.0830078125, + "learning_rate": 0.0017720000000000001, + "loss": 0.2578, + "step": 887 + }, + { + "epoch": 0.007708266421298426, + "grad_norm": 0.095703125, + "learning_rate": 0.001774, + "loss": 0.2637, + "step": 888 + }, + { + "epoch": 0.007716946901502591, + "grad_norm": 0.1376953125, + "learning_rate": 0.001776, + "loss": 0.2656, + "step": 889 + }, + { + "epoch": 0.007725627381706756, + "grad_norm": 0.07421875, + "learning_rate": 0.001778, + "loss": 0.2393, + "step": 890 + }, + { + "epoch": 0.007734307861910921, + "grad_norm": 0.15234375, + "learning_rate": 0.0017800000000000001, + "loss": 0.3184, + "step": 891 + }, + { + "epoch": 0.0077429883421150855, + "grad_norm": 0.0859375, + "learning_rate": 0.0017820000000000002, + "loss": 0.2676, + "step": 892 + }, + { + "epoch": 0.007751668822319251, + "grad_norm": 0.1328125, + "learning_rate": 0.001784, + "loss": 0.3086, + "step": 893 + }, + { + "epoch": 0.007760349302523416, + "grad_norm": 0.09814453125, + "learning_rate": 0.001786, + "loss": 0.293, + "step": 894 + }, + { + "epoch": 0.007769029782727581, + "grad_norm": 0.072265625, + "learning_rate": 0.001788, + "loss": 0.1904, + "step": 895 + }, + { + "epoch": 
0.007777710262931745, + "grad_norm": 0.11474609375, + "learning_rate": 0.0017900000000000001, + "loss": 0.3086, + "step": 896 + }, + { + "epoch": 0.00778639074313591, + "grad_norm": 0.08935546875, + "learning_rate": 0.001792, + "loss": 0.2305, + "step": 897 + }, + { + "epoch": 0.007795071223340075, + "grad_norm": 0.11083984375, + "learning_rate": 0.001794, + "loss": 0.3105, + "step": 898 + }, + { + "epoch": 0.0078037517035442405, + "grad_norm": 0.11865234375, + "learning_rate": 0.001796, + "loss": 0.3223, + "step": 899 + }, + { + "epoch": 0.007812432183748405, + "grad_norm": 0.19140625, + "learning_rate": 0.0017980000000000001, + "loss": 0.3691, + "step": 900 + }, + { + "epoch": 0.00782111266395257, + "grad_norm": 0.16796875, + "learning_rate": 0.0018000000000000002, + "loss": 0.3047, + "step": 901 + }, + { + "epoch": 0.007829793144156735, + "grad_norm": 0.1318359375, + "learning_rate": 0.001802, + "loss": 0.2031, + "step": 902 + }, + { + "epoch": 0.0078384736243609, + "grad_norm": 0.10986328125, + "learning_rate": 0.001804, + "loss": 0.2422, + "step": 903 + }, + { + "epoch": 0.007847154104565065, + "grad_norm": 0.51953125, + "learning_rate": 0.001806, + "loss": 0.3965, + "step": 904 + }, + { + "epoch": 0.007855834584769229, + "grad_norm": 0.138671875, + "learning_rate": 0.0018080000000000001, + "loss": 0.2441, + "step": 905 + }, + { + "epoch": 0.007864515064973394, + "grad_norm": 0.11083984375, + "learning_rate": 0.0018100000000000002, + "loss": 0.2656, + "step": 906 + }, + { + "epoch": 0.007873195545177559, + "grad_norm": 0.1435546875, + "learning_rate": 0.001812, + "loss": 0.2188, + "step": 907 + }, + { + "epoch": 0.007881876025381724, + "grad_norm": 0.10498046875, + "learning_rate": 0.001814, + "loss": 0.3027, + "step": 908 + }, + { + "epoch": 0.007890556505585889, + "grad_norm": 0.51171875, + "learning_rate": 0.0018160000000000001, + "loss": 0.4023, + "step": 909 + }, + { + "epoch": 0.007899236985790054, + "grad_norm": 0.08251953125, + "learning_rate": 0.0018180000000000002, + "loss": 0.2598, + "step": 910 + }, + { + "epoch": 0.00790791746599422, + "grad_norm": 0.09375, + "learning_rate": 0.00182, + "loss": 0.2715, + "step": 911 + }, + { + "epoch": 0.007916597946198384, + "grad_norm": 0.0791015625, + "learning_rate": 0.001822, + "loss": 0.2275, + "step": 912 + }, + { + "epoch": 0.007925278426402548, + "grad_norm": 0.0908203125, + "learning_rate": 0.001824, + "loss": 0.2891, + "step": 913 + }, + { + "epoch": 0.007933958906606713, + "grad_norm": 0.09619140625, + "learning_rate": 0.0018260000000000001, + "loss": 0.293, + "step": 914 + }, + { + "epoch": 0.007942639386810878, + "grad_norm": 0.083984375, + "learning_rate": 0.0018280000000000002, + "loss": 0.2402, + "step": 915 + }, + { + "epoch": 0.007951319867015043, + "grad_norm": 0.08984375, + "learning_rate": 0.00183, + "loss": 0.2598, + "step": 916 + }, + { + "epoch": 0.007960000347219208, + "grad_norm": 0.080078125, + "learning_rate": 0.001832, + "loss": 0.3125, + "step": 917 + }, + { + "epoch": 0.007968680827423373, + "grad_norm": 0.11865234375, + "learning_rate": 0.0018340000000000001, + "loss": 0.2578, + "step": 918 + }, + { + "epoch": 0.007977361307627539, + "grad_norm": 0.333984375, + "learning_rate": 0.0018360000000000002, + "loss": 0.4102, + "step": 919 + }, + { + "epoch": 0.007986041787831704, + "grad_norm": 0.28515625, + "learning_rate": 0.0018380000000000002, + "loss": 0.7422, + "step": 920 + }, + { + "epoch": 0.007994722268035867, + "grad_norm": 0.10009765625, + "learning_rate": 0.00184, + "loss": 0.2617, + "step": 921 + }, 
+ { + "epoch": 0.008003402748240032, + "grad_norm": 0.0966796875, + "learning_rate": 0.001842, + "loss": 0.2441, + "step": 922 + }, + { + "epoch": 0.008012083228444197, + "grad_norm": 0.0927734375, + "learning_rate": 0.0018440000000000002, + "loss": 0.2227, + "step": 923 + }, + { + "epoch": 0.008020763708648362, + "grad_norm": 0.0693359375, + "learning_rate": 0.0018460000000000002, + "loss": 0.2061, + "step": 924 + }, + { + "epoch": 0.008029444188852528, + "grad_norm": 0.087890625, + "learning_rate": 0.001848, + "loss": 0.252, + "step": 925 + }, + { + "epoch": 0.008038124669056693, + "grad_norm": 0.09814453125, + "learning_rate": 0.00185, + "loss": 0.2559, + "step": 926 + }, + { + "epoch": 0.008046805149260858, + "grad_norm": 0.07666015625, + "learning_rate": 0.0018520000000000001, + "loss": 0.2197, + "step": 927 + }, + { + "epoch": 0.008055485629465021, + "grad_norm": 0.0859375, + "learning_rate": 0.0018540000000000002, + "loss": 0.2617, + "step": 928 + }, + { + "epoch": 0.008064166109669186, + "grad_norm": 0.099609375, + "learning_rate": 0.0018560000000000002, + "loss": 0.2773, + "step": 929 + }, + { + "epoch": 0.008072846589873351, + "grad_norm": 0.119140625, + "learning_rate": 0.001858, + "loss": 0.2676, + "step": 930 + }, + { + "epoch": 0.008081527070077517, + "grad_norm": 0.09033203125, + "learning_rate": 0.00186, + "loss": 0.252, + "step": 931 + }, + { + "epoch": 0.008090207550281682, + "grad_norm": 0.0859375, + "learning_rate": 0.0018620000000000002, + "loss": 0.2363, + "step": 932 + }, + { + "epoch": 0.008098888030485847, + "grad_norm": 0.07275390625, + "learning_rate": 0.0018640000000000002, + "loss": 0.2188, + "step": 933 + }, + { + "epoch": 0.008107568510690012, + "grad_norm": 0.1845703125, + "learning_rate": 0.001866, + "loss": 0.2637, + "step": 934 + }, + { + "epoch": 0.008116248990894177, + "grad_norm": 0.080078125, + "learning_rate": 0.001868, + "loss": 0.2314, + "step": 935 + }, + { + "epoch": 0.00812492947109834, + "grad_norm": 0.0947265625, + "learning_rate": 0.0018700000000000001, + "loss": 0.2969, + "step": 936 + }, + { + "epoch": 0.008133609951302505, + "grad_norm": 0.10595703125, + "learning_rate": 0.0018720000000000002, + "loss": 0.3047, + "step": 937 + }, + { + "epoch": 0.00814229043150667, + "grad_norm": 0.111328125, + "learning_rate": 0.0018740000000000002, + "loss": 0.2773, + "step": 938 + }, + { + "epoch": 0.008150970911710836, + "grad_norm": 0.078125, + "learning_rate": 0.0018759999999999998, + "loss": 0.2422, + "step": 939 + }, + { + "epoch": 0.008159651391915, + "grad_norm": 0.10107421875, + "learning_rate": 0.001878, + "loss": 0.3203, + "step": 940 + }, + { + "epoch": 0.008168331872119166, + "grad_norm": 0.083984375, + "learning_rate": 0.00188, + "loss": 0.2773, + "step": 941 + }, + { + "epoch": 0.008177012352323331, + "grad_norm": 0.07958984375, + "learning_rate": 0.001882, + "loss": 0.2637, + "step": 942 + }, + { + "epoch": 0.008185692832527496, + "grad_norm": 0.09228515625, + "learning_rate": 0.001884, + "loss": 0.2383, + "step": 943 + }, + { + "epoch": 0.00819437331273166, + "grad_norm": 0.08935546875, + "learning_rate": 0.0018859999999999999, + "loss": 0.3105, + "step": 944 + }, + { + "epoch": 0.008203053792935825, + "grad_norm": 0.09375, + "learning_rate": 0.001888, + "loss": 0.252, + "step": 945 + }, + { + "epoch": 0.00821173427313999, + "grad_norm": 0.09814453125, + "learning_rate": 0.00189, + "loss": 0.293, + "step": 946 + }, + { + "epoch": 0.008220414753344155, + "grad_norm": 0.107421875, + "learning_rate": 0.001892, + "loss": 0.332, + "step": 947 
+ }, + { + "epoch": 0.00822909523354832, + "grad_norm": 0.078125, + "learning_rate": 0.0018939999999999999, + "loss": 0.2168, + "step": 948 + }, + { + "epoch": 0.008237775713752485, + "grad_norm": 0.0947265625, + "learning_rate": 0.001896, + "loss": 0.2676, + "step": 949 + }, + { + "epoch": 0.00824645619395665, + "grad_norm": 0.08642578125, + "learning_rate": 0.001898, + "loss": 0.2266, + "step": 950 + }, + { + "epoch": 0.008255136674160815, + "grad_norm": 0.0966796875, + "learning_rate": 0.0019, + "loss": 0.2578, + "step": 951 + }, + { + "epoch": 0.008263817154364979, + "grad_norm": 0.08203125, + "learning_rate": 0.001902, + "loss": 0.2324, + "step": 952 + }, + { + "epoch": 0.008272497634569144, + "grad_norm": 0.10302734375, + "learning_rate": 0.0019039999999999999, + "loss": 0.3008, + "step": 953 + }, + { + "epoch": 0.008281178114773309, + "grad_norm": 0.11474609375, + "learning_rate": 0.001906, + "loss": 0.3066, + "step": 954 + }, + { + "epoch": 0.008289858594977474, + "grad_norm": 0.1142578125, + "learning_rate": 0.001908, + "loss": 0.3516, + "step": 955 + }, + { + "epoch": 0.00829853907518164, + "grad_norm": 0.068359375, + "learning_rate": 0.00191, + "loss": 0.2148, + "step": 956 + }, + { + "epoch": 0.008307219555385804, + "grad_norm": 0.11328125, + "learning_rate": 0.0019119999999999999, + "loss": 0.291, + "step": 957 + }, + { + "epoch": 0.00831590003558997, + "grad_norm": 0.14453125, + "learning_rate": 0.001914, + "loss": 0.5, + "step": 958 + }, + { + "epoch": 0.008324580515794133, + "grad_norm": 0.107421875, + "learning_rate": 0.001916, + "loss": 0.2656, + "step": 959 + }, + { + "epoch": 0.008333260995998298, + "grad_norm": 0.10693359375, + "learning_rate": 0.001918, + "loss": 0.3086, + "step": 960 + }, + { + "epoch": 0.008341941476202463, + "grad_norm": 0.0830078125, + "learning_rate": 0.00192, + "loss": 0.2246, + "step": 961 + }, + { + "epoch": 0.008350621956406628, + "grad_norm": 0.07666015625, + "learning_rate": 0.0019219999999999999, + "loss": 0.2422, + "step": 962 + }, + { + "epoch": 0.008359302436610793, + "grad_norm": 0.09814453125, + "learning_rate": 0.001924, + "loss": 0.2637, + "step": 963 + }, + { + "epoch": 0.008367982916814959, + "grad_norm": 0.10205078125, + "learning_rate": 0.001926, + "loss": 0.2578, + "step": 964 + }, + { + "epoch": 0.008376663397019124, + "grad_norm": 0.11376953125, + "learning_rate": 0.001928, + "loss": 0.2402, + "step": 965 + }, + { + "epoch": 0.008385343877223289, + "grad_norm": 0.11962890625, + "learning_rate": 0.00193, + "loss": 0.2617, + "step": 966 + }, + { + "epoch": 0.008394024357427452, + "grad_norm": 0.091796875, + "learning_rate": 0.001932, + "loss": 0.2148, + "step": 967 + }, + { + "epoch": 0.008402704837631617, + "grad_norm": 0.12109375, + "learning_rate": 0.001934, + "loss": 0.332, + "step": 968 + }, + { + "epoch": 0.008411385317835782, + "grad_norm": 0.10986328125, + "learning_rate": 0.001936, + "loss": 0.2139, + "step": 969 + }, + { + "epoch": 0.008420065798039948, + "grad_norm": 0.1044921875, + "learning_rate": 0.001938, + "loss": 0.2812, + "step": 970 + }, + { + "epoch": 0.008428746278244113, + "grad_norm": 0.07177734375, + "learning_rate": 0.0019399999999999999, + "loss": 0.25, + "step": 971 + }, + { + "epoch": 0.008437426758448278, + "grad_norm": 0.1025390625, + "learning_rate": 0.001942, + "loss": 0.2656, + "step": 972 + }, + { + "epoch": 0.008446107238652443, + "grad_norm": 0.083984375, + "learning_rate": 0.001944, + "loss": 0.2422, + "step": 973 + }, + { + "epoch": 0.008454787718856608, + "grad_norm": 0.0966796875, + 
"learning_rate": 0.001946, + "loss": 0.2617, + "step": 974 + }, + { + "epoch": 0.008463468199060771, + "grad_norm": 0.1044921875, + "learning_rate": 0.001948, + "loss": 0.3086, + "step": 975 + }, + { + "epoch": 0.008472148679264936, + "grad_norm": 0.0810546875, + "learning_rate": 0.00195, + "loss": 0.2832, + "step": 976 + }, + { + "epoch": 0.008480829159469102, + "grad_norm": 0.095703125, + "learning_rate": 0.001952, + "loss": 0.2617, + "step": 977 + }, + { + "epoch": 0.008489509639673267, + "grad_norm": 0.078125, + "learning_rate": 0.001954, + "loss": 0.2461, + "step": 978 + }, + { + "epoch": 0.008498190119877432, + "grad_norm": 0.080078125, + "learning_rate": 0.001956, + "loss": 0.2168, + "step": 979 + }, + { + "epoch": 0.008506870600081597, + "grad_norm": 0.07958984375, + "learning_rate": 0.001958, + "loss": 0.2266, + "step": 980 + }, + { + "epoch": 0.008515551080285762, + "grad_norm": 0.10107421875, + "learning_rate": 0.00196, + "loss": 0.2373, + "step": 981 + }, + { + "epoch": 0.008524231560489925, + "grad_norm": 0.08984375, + "learning_rate": 0.001962, + "loss": 0.2734, + "step": 982 + }, + { + "epoch": 0.00853291204069409, + "grad_norm": 0.23046875, + "learning_rate": 0.001964, + "loss": 0.3574, + "step": 983 + }, + { + "epoch": 0.008541592520898256, + "grad_norm": 0.08935546875, + "learning_rate": 0.001966, + "loss": 0.2324, + "step": 984 + }, + { + "epoch": 0.00855027300110242, + "grad_norm": 0.1484375, + "learning_rate": 0.001968, + "loss": 0.2676, + "step": 985 + }, + { + "epoch": 0.008558953481306586, + "grad_norm": 0.083984375, + "learning_rate": 0.00197, + "loss": 0.2139, + "step": 986 + }, + { + "epoch": 0.008567633961510751, + "grad_norm": 0.1572265625, + "learning_rate": 0.0019720000000000002, + "loss": 0.2793, + "step": 987 + }, + { + "epoch": 0.008576314441714916, + "grad_norm": 0.12353515625, + "learning_rate": 0.001974, + "loss": 0.3164, + "step": 988 + }, + { + "epoch": 0.008584994921919081, + "grad_norm": 0.07666015625, + "learning_rate": 0.001976, + "loss": 0.2207, + "step": 989 + }, + { + "epoch": 0.008593675402123245, + "grad_norm": 0.07861328125, + "learning_rate": 0.001978, + "loss": 0.2324, + "step": 990 + }, + { + "epoch": 0.00860235588232741, + "grad_norm": 0.08544921875, + "learning_rate": 0.00198, + "loss": 0.2295, + "step": 991 + }, + { + "epoch": 0.008611036362531575, + "grad_norm": 0.1259765625, + "learning_rate": 0.001982, + "loss": 0.3203, + "step": 992 + }, + { + "epoch": 0.00861971684273574, + "grad_norm": 0.123046875, + "learning_rate": 0.001984, + "loss": 0.2656, + "step": 993 + }, + { + "epoch": 0.008628397322939905, + "grad_norm": 0.10302734375, + "learning_rate": 0.001986, + "loss": 0.2637, + "step": 994 + }, + { + "epoch": 0.00863707780314407, + "grad_norm": 0.08935546875, + "learning_rate": 0.001988, + "loss": 0.2383, + "step": 995 + }, + { + "epoch": 0.008645758283348235, + "grad_norm": 0.07666015625, + "learning_rate": 0.00199, + "loss": 0.2178, + "step": 996 + }, + { + "epoch": 0.0086544387635524, + "grad_norm": 0.1083984375, + "learning_rate": 0.001992, + "loss": 0.2832, + "step": 997 + }, + { + "epoch": 0.008663119243756564, + "grad_norm": 0.0908203125, + "learning_rate": 0.001994, + "loss": 0.252, + "step": 998 + }, + { + "epoch": 0.008671799723960729, + "grad_norm": 0.10791015625, + "learning_rate": 0.001996, + "loss": 0.2734, + "step": 999 + }, + { + "epoch": 0.008680480204164894, + "grad_norm": 0.2236328125, + "learning_rate": 0.001998, + "loss": 0.5977, + "step": 1000 + }, + { + "epoch": 0.00868916068436906, + "grad_norm": 
0.10546875, + "learning_rate": 0.002, + "loss": 0.2871, + "step": 1001 + }, + { + "epoch": 0.008697841164573224, + "grad_norm": 0.08154296875, + "learning_rate": 0.0019999999995078867, + "loss": 0.2734, + "step": 1002 + }, + { + "epoch": 0.00870652164477739, + "grad_norm": 0.10595703125, + "learning_rate": 0.0019999999980315473, + "loss": 0.2891, + "step": 1003 + }, + { + "epoch": 0.008715202124981555, + "grad_norm": 0.076171875, + "learning_rate": 0.001999999995570981, + "loss": 0.2578, + "step": 1004 + }, + { + "epoch": 0.008723882605185718, + "grad_norm": 0.07861328125, + "learning_rate": 0.001999999992126188, + "loss": 0.2578, + "step": 1005 + }, + { + "epoch": 0.008732563085389883, + "grad_norm": 0.1923828125, + "learning_rate": 0.001999999987697169, + "loss": 0.2949, + "step": 1006 + }, + { + "epoch": 0.008741243565594048, + "grad_norm": 0.1435546875, + "learning_rate": 0.0019999999822839236, + "loss": 0.3164, + "step": 1007 + }, + { + "epoch": 0.008749924045798213, + "grad_norm": 0.1279296875, + "learning_rate": 0.0019999999758864516, + "loss": 0.2676, + "step": 1008 + }, + { + "epoch": 0.008758604526002379, + "grad_norm": 0.07861328125, + "learning_rate": 0.0019999999685047527, + "loss": 0.248, + "step": 1009 + }, + { + "epoch": 0.008767285006206544, + "grad_norm": 0.0849609375, + "learning_rate": 0.001999999960138828, + "loss": 0.2754, + "step": 1010 + }, + { + "epoch": 0.008775965486410709, + "grad_norm": 0.10302734375, + "learning_rate": 0.0019999999507886767, + "loss": 0.3027, + "step": 1011 + }, + { + "epoch": 0.008784645966614874, + "grad_norm": 0.08984375, + "learning_rate": 0.0019999999404542988, + "loss": 0.2285, + "step": 1012 + }, + { + "epoch": 0.008793326446819037, + "grad_norm": 0.06884765625, + "learning_rate": 0.0019999999291356943, + "loss": 0.2617, + "step": 1013 + }, + { + "epoch": 0.008802006927023202, + "grad_norm": 0.111328125, + "learning_rate": 0.001999999916832864, + "loss": 0.2617, + "step": 1014 + }, + { + "epoch": 0.008810687407227367, + "grad_norm": 0.080078125, + "learning_rate": 0.001999999903545807, + "loss": 0.3086, + "step": 1015 + }, + { + "epoch": 0.008819367887431533, + "grad_norm": 0.08984375, + "learning_rate": 0.0019999998892745235, + "loss": 0.2383, + "step": 1016 + }, + { + "epoch": 0.008828048367635698, + "grad_norm": 0.1005859375, + "learning_rate": 0.0019999998740190135, + "loss": 0.3086, + "step": 1017 + }, + { + "epoch": 0.008836728847839863, + "grad_norm": 0.10205078125, + "learning_rate": 0.001999999857779278, + "loss": 0.2539, + "step": 1018 + }, + { + "epoch": 0.008845409328044028, + "grad_norm": 0.08447265625, + "learning_rate": 0.0019999998405553156, + "loss": 0.2002, + "step": 1019 + }, + { + "epoch": 0.008854089808248193, + "grad_norm": 0.08837890625, + "learning_rate": 0.0019999998223471267, + "loss": 0.332, + "step": 1020 + }, + { + "epoch": 0.008862770288452356, + "grad_norm": 0.091796875, + "learning_rate": 0.0019999998031547117, + "loss": 0.2363, + "step": 1021 + }, + { + "epoch": 0.008871450768656522, + "grad_norm": 0.091796875, + "learning_rate": 0.0019999997829780705, + "loss": 0.25, + "step": 1022 + }, + { + "epoch": 0.008880131248860687, + "grad_norm": 0.0849609375, + "learning_rate": 0.001999999761817203, + "loss": 0.2412, + "step": 1023 + }, + { + "epoch": 0.008888811729064852, + "grad_norm": 0.068359375, + "learning_rate": 0.0019999997396721093, + "loss": 0.2246, + "step": 1024 + }, + { + "epoch": 0.008897492209269017, + "grad_norm": 0.0693359375, + "learning_rate": 0.001999999716542789, + "loss": 0.2324, + "step": 
1025 + }, + { + "epoch": 0.008906172689473182, + "grad_norm": 0.08837890625, + "learning_rate": 0.001999999692429243, + "loss": 0.3105, + "step": 1026 + }, + { + "epoch": 0.008914853169677347, + "grad_norm": 0.10009765625, + "learning_rate": 0.001999999667331471, + "loss": 0.2422, + "step": 1027 + }, + { + "epoch": 0.008923533649881512, + "grad_norm": 0.09716796875, + "learning_rate": 0.001999999641249473, + "loss": 0.2539, + "step": 1028 + }, + { + "epoch": 0.008932214130085676, + "grad_norm": 0.103515625, + "learning_rate": 0.0019999996141832482, + "loss": 0.2812, + "step": 1029 + }, + { + "epoch": 0.00894089461028984, + "grad_norm": 0.076171875, + "learning_rate": 0.001999999586132798, + "loss": 0.2539, + "step": 1030 + }, + { + "epoch": 0.008949575090494006, + "grad_norm": 0.1298828125, + "learning_rate": 0.0019999995570981212, + "loss": 0.2891, + "step": 1031 + }, + { + "epoch": 0.008958255570698171, + "grad_norm": 0.10693359375, + "learning_rate": 0.001999999527079219, + "loss": 0.25, + "step": 1032 + }, + { + "epoch": 0.008966936050902336, + "grad_norm": 0.06689453125, + "learning_rate": 0.0019999994960760905, + "loss": 0.2266, + "step": 1033 + }, + { + "epoch": 0.008975616531106501, + "grad_norm": 0.09228515625, + "learning_rate": 0.0019999994640887363, + "loss": 0.2754, + "step": 1034 + }, + { + "epoch": 0.008984297011310666, + "grad_norm": 0.076171875, + "learning_rate": 0.001999999431117156, + "loss": 0.209, + "step": 1035 + }, + { + "epoch": 0.00899297749151483, + "grad_norm": 0.091796875, + "learning_rate": 0.00199999939716135, + "loss": 0.249, + "step": 1036 + }, + { + "epoch": 0.009001657971718995, + "grad_norm": 0.12451171875, + "learning_rate": 0.0019999993622213177, + "loss": 0.2422, + "step": 1037 + }, + { + "epoch": 0.00901033845192316, + "grad_norm": 0.09033203125, + "learning_rate": 0.0019999993262970603, + "loss": 0.2363, + "step": 1038 + }, + { + "epoch": 0.009019018932127325, + "grad_norm": 0.0908203125, + "learning_rate": 0.0019999992893885766, + "loss": 0.2871, + "step": 1039 + }, + { + "epoch": 0.00902769941233149, + "grad_norm": 0.08740234375, + "learning_rate": 0.0019999992514958677, + "loss": 0.2734, + "step": 1040 + }, + { + "epoch": 0.009036379892535655, + "grad_norm": 0.10791015625, + "learning_rate": 0.0019999992126189326, + "loss": 0.3496, + "step": 1041 + }, + { + "epoch": 0.00904506037273982, + "grad_norm": 0.09716796875, + "learning_rate": 0.001999999172757772, + "loss": 0.2598, + "step": 1042 + }, + { + "epoch": 0.009053740852943986, + "grad_norm": 0.1337890625, + "learning_rate": 0.001999999131912386, + "loss": 0.2422, + "step": 1043 + }, + { + "epoch": 0.009062421333148149, + "grad_norm": 0.0927734375, + "learning_rate": 0.0019999990900827747, + "loss": 0.2305, + "step": 1044 + }, + { + "epoch": 0.009071101813352314, + "grad_norm": 0.08740234375, + "learning_rate": 0.0019999990472689376, + "loss": 0.2324, + "step": 1045 + }, + { + "epoch": 0.00907978229355648, + "grad_norm": 0.0732421875, + "learning_rate": 0.001999999003470875, + "loss": 0.2129, + "step": 1046 + }, + { + "epoch": 0.009088462773760644, + "grad_norm": 0.103515625, + "learning_rate": 0.0019999989586885875, + "loss": 0.2891, + "step": 1047 + }, + { + "epoch": 0.00909714325396481, + "grad_norm": 0.09033203125, + "learning_rate": 0.0019999989129220745, + "loss": 0.2891, + "step": 1048 + }, + { + "epoch": 0.009105823734168975, + "grad_norm": 0.08154296875, + "learning_rate": 0.001999998866171336, + "loss": 0.2285, + "step": 1049 + }, + { + "epoch": 0.00911450421437314, + "grad_norm": 
0.10498046875, + "learning_rate": 0.001999998818436372, + "loss": 0.2402, + "step": 1050 + }, + { + "epoch": 0.009123184694577305, + "grad_norm": 0.08935546875, + "learning_rate": 0.0019999987697171834, + "loss": 0.2363, + "step": 1051 + }, + { + "epoch": 0.009131865174781468, + "grad_norm": 0.09521484375, + "learning_rate": 0.0019999987200137693, + "loss": 0.2012, + "step": 1052 + }, + { + "epoch": 0.009140545654985633, + "grad_norm": 0.14453125, + "learning_rate": 0.00199999866932613, + "loss": 0.252, + "step": 1053 + }, + { + "epoch": 0.009149226135189798, + "grad_norm": 0.08740234375, + "learning_rate": 0.001999998617654266, + "loss": 0.3047, + "step": 1054 + }, + { + "epoch": 0.009157906615393964, + "grad_norm": 0.09912109375, + "learning_rate": 0.001999998564998177, + "loss": 0.2266, + "step": 1055 + }, + { + "epoch": 0.009166587095598129, + "grad_norm": 0.08349609375, + "learning_rate": 0.001999998511357863, + "loss": 0.2539, + "step": 1056 + }, + { + "epoch": 0.009175267575802294, + "grad_norm": 0.11279296875, + "learning_rate": 0.001999998456733324, + "loss": 0.2422, + "step": 1057 + }, + { + "epoch": 0.009183948056006459, + "grad_norm": 0.09521484375, + "learning_rate": 0.0019999984011245604, + "loss": 0.2266, + "step": 1058 + }, + { + "epoch": 0.009192628536210622, + "grad_norm": 0.091796875, + "learning_rate": 0.0019999983445315723, + "loss": 0.3203, + "step": 1059 + }, + { + "epoch": 0.009201309016414787, + "grad_norm": 0.10400390625, + "learning_rate": 0.001999998286954359, + "loss": 0.3066, + "step": 1060 + }, + { + "epoch": 0.009209989496618953, + "grad_norm": 0.11083984375, + "learning_rate": 0.0019999982283929216, + "loss": 0.3086, + "step": 1061 + }, + { + "epoch": 0.009218669976823118, + "grad_norm": 0.068359375, + "learning_rate": 0.0019999981688472593, + "loss": 0.2227, + "step": 1062 + }, + { + "epoch": 0.009227350457027283, + "grad_norm": 0.11474609375, + "learning_rate": 0.0019999981083173727, + "loss": 0.248, + "step": 1063 + }, + { + "epoch": 0.009236030937231448, + "grad_norm": 0.0703125, + "learning_rate": 0.0019999980468032616, + "loss": 0.2451, + "step": 1064 + }, + { + "epoch": 0.009244711417435613, + "grad_norm": 0.1083984375, + "learning_rate": 0.001999997984304926, + "loss": 0.2676, + "step": 1065 + }, + { + "epoch": 0.009253391897639778, + "grad_norm": 0.0859375, + "learning_rate": 0.0019999979208223666, + "loss": 0.2402, + "step": 1066 + }, + { + "epoch": 0.009262072377843942, + "grad_norm": 0.09716796875, + "learning_rate": 0.0019999978563555827, + "loss": 0.2871, + "step": 1067 + }, + { + "epoch": 0.009270752858048107, + "grad_norm": 0.1318359375, + "learning_rate": 0.0019999977909045744, + "loss": 0.2773, + "step": 1068 + }, + { + "epoch": 0.009279433338252272, + "grad_norm": 0.103515625, + "learning_rate": 0.0019999977244693425, + "loss": 0.2285, + "step": 1069 + }, + { + "epoch": 0.009288113818456437, + "grad_norm": 0.08837890625, + "learning_rate": 0.0019999976570498866, + "loss": 0.248, + "step": 1070 + }, + { + "epoch": 0.009296794298660602, + "grad_norm": 0.0751953125, + "learning_rate": 0.0019999975886462063, + "loss": 0.2852, + "step": 1071 + }, + { + "epoch": 0.009305474778864767, + "grad_norm": 0.154296875, + "learning_rate": 0.001999997519258303, + "loss": 0.3711, + "step": 1072 + }, + { + "epoch": 0.009314155259068932, + "grad_norm": 0.07373046875, + "learning_rate": 0.001999997448886175, + "loss": 0.2314, + "step": 1073 + }, + { + "epoch": 0.009322835739273097, + "grad_norm": 0.08837890625, + "learning_rate": 0.001999997377529824, + 
"loss": 0.3281, + "step": 1074 + }, + { + "epoch": 0.00933151621947726, + "grad_norm": 0.09814453125, + "learning_rate": 0.001999997305189249, + "loss": 0.2773, + "step": 1075 + }, + { + "epoch": 0.009340196699681426, + "grad_norm": 0.09912109375, + "learning_rate": 0.0019999972318644503, + "loss": 0.2266, + "step": 1076 + }, + { + "epoch": 0.009348877179885591, + "grad_norm": 0.1435546875, + "learning_rate": 0.0019999971575554287, + "loss": 0.2715, + "step": 1077 + }, + { + "epoch": 0.009357557660089756, + "grad_norm": 0.0947265625, + "learning_rate": 0.0019999970822621835, + "loss": 0.332, + "step": 1078 + }, + { + "epoch": 0.009366238140293921, + "grad_norm": 0.0869140625, + "learning_rate": 0.001999997005984715, + "loss": 0.2432, + "step": 1079 + }, + { + "epoch": 0.009374918620498086, + "grad_norm": 0.11962890625, + "learning_rate": 0.0019999969287230234, + "loss": 0.2949, + "step": 1080 + }, + { + "epoch": 0.009383599100702252, + "grad_norm": 0.076171875, + "learning_rate": 0.0019999968504771084, + "loss": 0.2461, + "step": 1081 + }, + { + "epoch": 0.009392279580906415, + "grad_norm": 0.087890625, + "learning_rate": 0.0019999967712469708, + "loss": 0.1953, + "step": 1082 + }, + { + "epoch": 0.00940096006111058, + "grad_norm": 0.1083984375, + "learning_rate": 0.0019999966910326096, + "loss": 0.3008, + "step": 1083 + }, + { + "epoch": 0.009409640541314745, + "grad_norm": 0.1025390625, + "learning_rate": 0.001999996609834026, + "loss": 0.2852, + "step": 1084 + }, + { + "epoch": 0.00941832102151891, + "grad_norm": 0.0810546875, + "learning_rate": 0.0019999965276512196, + "loss": 0.2441, + "step": 1085 + }, + { + "epoch": 0.009427001501723075, + "grad_norm": 0.1171875, + "learning_rate": 0.0019999964444841903, + "loss": 0.252, + "step": 1086 + }, + { + "epoch": 0.00943568198192724, + "grad_norm": 0.08935546875, + "learning_rate": 0.001999996360332939, + "loss": 0.2578, + "step": 1087 + }, + { + "epoch": 0.009444362462131406, + "grad_norm": 0.12890625, + "learning_rate": 0.0019999962751974646, + "loss": 0.3242, + "step": 1088 + }, + { + "epoch": 0.00945304294233557, + "grad_norm": 0.08349609375, + "learning_rate": 0.0019999961890777677, + "loss": 0.248, + "step": 1089 + }, + { + "epoch": 0.009461723422539734, + "grad_norm": 0.1064453125, + "learning_rate": 0.001999996101973849, + "loss": 0.2324, + "step": 1090 + }, + { + "epoch": 0.0094704039027439, + "grad_norm": 0.08203125, + "learning_rate": 0.0019999960138857077, + "loss": 0.2402, + "step": 1091 + }, + { + "epoch": 0.009479084382948064, + "grad_norm": 0.1083984375, + "learning_rate": 0.001999995924813345, + "loss": 0.3281, + "step": 1092 + }, + { + "epoch": 0.00948776486315223, + "grad_norm": 0.134765625, + "learning_rate": 0.0019999958347567594, + "loss": 0.2031, + "step": 1093 + }, + { + "epoch": 0.009496445343356395, + "grad_norm": 0.076171875, + "learning_rate": 0.0019999957437159517, + "loss": 0.1943, + "step": 1094 + }, + { + "epoch": 0.00950512582356056, + "grad_norm": 0.0751953125, + "learning_rate": 0.001999995651690923, + "loss": 0.2305, + "step": 1095 + }, + { + "epoch": 0.009513806303764725, + "grad_norm": 0.11572265625, + "learning_rate": 0.0019999955586816726, + "loss": 0.2969, + "step": 1096 + }, + { + "epoch": 0.00952248678396889, + "grad_norm": 0.0771484375, + "learning_rate": 0.0019999954646882, + "loss": 0.2031, + "step": 1097 + }, + { + "epoch": 0.009531167264173053, + "grad_norm": 0.1279296875, + "learning_rate": 0.001999995369710506, + "loss": 0.2988, + "step": 1098 + }, + { + "epoch": 0.009539847744377218, + 
"grad_norm": 0.09130859375, + "learning_rate": 0.0019999952737485907, + "loss": 0.2891, + "step": 1099 + }, + { + "epoch": 0.009548528224581384, + "grad_norm": 0.08642578125, + "learning_rate": 0.0019999951768024543, + "loss": 0.2773, + "step": 1100 + }, + { + "epoch": 0.009557208704785549, + "grad_norm": 0.2431640625, + "learning_rate": 0.0019999950788720964, + "loss": 0.1904, + "step": 1101 + }, + { + "epoch": 0.009565889184989714, + "grad_norm": 0.0859375, + "learning_rate": 0.0019999949799575176, + "loss": 0.1973, + "step": 1102 + }, + { + "epoch": 0.009574569665193879, + "grad_norm": 0.09033203125, + "learning_rate": 0.0019999948800587175, + "loss": 0.2344, + "step": 1103 + }, + { + "epoch": 0.009583250145398044, + "grad_norm": 0.083984375, + "learning_rate": 0.001999994779175697, + "loss": 0.2412, + "step": 1104 + }, + { + "epoch": 0.00959193062560221, + "grad_norm": 0.08154296875, + "learning_rate": 0.0019999946773084556, + "loss": 0.2578, + "step": 1105 + }, + { + "epoch": 0.009600611105806373, + "grad_norm": 0.1357421875, + "learning_rate": 0.001999994574456993, + "loss": 0.248, + "step": 1106 + }, + { + "epoch": 0.009609291586010538, + "grad_norm": 0.0810546875, + "learning_rate": 0.0019999944706213103, + "loss": 0.209, + "step": 1107 + }, + { + "epoch": 0.009617972066214703, + "grad_norm": 0.06884765625, + "learning_rate": 0.001999994365801407, + "loss": 0.2578, + "step": 1108 + }, + { + "epoch": 0.009626652546418868, + "grad_norm": 0.10009765625, + "learning_rate": 0.001999994259997284, + "loss": 0.3359, + "step": 1109 + }, + { + "epoch": 0.009635333026623033, + "grad_norm": 0.08349609375, + "learning_rate": 0.00199999415320894, + "loss": 0.2891, + "step": 1110 + }, + { + "epoch": 0.009644013506827198, + "grad_norm": 0.08447265625, + "learning_rate": 0.001999994045436376, + "loss": 0.2441, + "step": 1111 + }, + { + "epoch": 0.009652693987031363, + "grad_norm": 0.0927734375, + "learning_rate": 0.0019999939366795927, + "loss": 0.252, + "step": 1112 + }, + { + "epoch": 0.009661374467235527, + "grad_norm": 0.0830078125, + "learning_rate": 0.001999993826938589, + "loss": 0.2305, + "step": 1113 + }, + { + "epoch": 0.009670054947439692, + "grad_norm": 0.09130859375, + "learning_rate": 0.0019999937162133658, + "loss": 0.2402, + "step": 1114 + }, + { + "epoch": 0.009678735427643857, + "grad_norm": 0.083984375, + "learning_rate": 0.0019999936045039224, + "loss": 0.252, + "step": 1115 + }, + { + "epoch": 0.009687415907848022, + "grad_norm": 0.09423828125, + "learning_rate": 0.00199999349181026, + "loss": 0.2451, + "step": 1116 + }, + { + "epoch": 0.009696096388052187, + "grad_norm": 0.1201171875, + "learning_rate": 0.0019999933781323785, + "loss": 0.3516, + "step": 1117 + }, + { + "epoch": 0.009704776868256352, + "grad_norm": 0.08740234375, + "learning_rate": 0.0019999932634702775, + "loss": 0.2422, + "step": 1118 + }, + { + "epoch": 0.009713457348460517, + "grad_norm": 0.126953125, + "learning_rate": 0.0019999931478239573, + "loss": 0.2305, + "step": 1119 + }, + { + "epoch": 0.009722137828664682, + "grad_norm": 0.21875, + "learning_rate": 0.0019999930311934183, + "loss": 0.7617, + "step": 1120 + }, + { + "epoch": 0.009730818308868846, + "grad_norm": 0.0751953125, + "learning_rate": 0.00199999291357866, + "loss": 0.2266, + "step": 1121 + }, + { + "epoch": 0.009739498789073011, + "grad_norm": 0.09326171875, + "learning_rate": 0.0019999927949796836, + "loss": 0.2773, + "step": 1122 + }, + { + "epoch": 0.009748179269277176, + "grad_norm": 0.11083984375, + "learning_rate": 
0.0019999926753964882, + "loss": 0.2578, + "step": 1123 + }, + { + "epoch": 0.009756859749481341, + "grad_norm": 0.09130859375, + "learning_rate": 0.0019999925548290745, + "loss": 0.2734, + "step": 1124 + }, + { + "epoch": 0.009765540229685506, + "grad_norm": 0.09619140625, + "learning_rate": 0.0019999924332774425, + "loss": 0.2617, + "step": 1125 + }, + { + "epoch": 0.009774220709889671, + "grad_norm": 0.07568359375, + "learning_rate": 0.001999992310741592, + "loss": 0.2158, + "step": 1126 + }, + { + "epoch": 0.009782901190093837, + "grad_norm": 0.0810546875, + "learning_rate": 0.001999992187221524, + "loss": 0.2188, + "step": 1127 + }, + { + "epoch": 0.009791581670298002, + "grad_norm": 0.08251953125, + "learning_rate": 0.001999992062717238, + "loss": 0.2412, + "step": 1128 + }, + { + "epoch": 0.009800262150502165, + "grad_norm": 0.08056640625, + "learning_rate": 0.0019999919372287334, + "loss": 0.248, + "step": 1129 + }, + { + "epoch": 0.00980894263070633, + "grad_norm": 0.06494140625, + "learning_rate": 0.001999991810756012, + "loss": 0.1885, + "step": 1130 + }, + { + "epoch": 0.009817623110910495, + "grad_norm": 0.0673828125, + "learning_rate": 0.0019999916832990727, + "loss": 0.2188, + "step": 1131 + }, + { + "epoch": 0.00982630359111466, + "grad_norm": 0.07421875, + "learning_rate": 0.001999991554857916, + "loss": 0.2637, + "step": 1132 + }, + { + "epoch": 0.009834984071318826, + "grad_norm": 0.10302734375, + "learning_rate": 0.001999991425432542, + "loss": 0.2695, + "step": 1133 + }, + { + "epoch": 0.00984366455152299, + "grad_norm": 0.0830078125, + "learning_rate": 0.001999991295022951, + "loss": 0.2158, + "step": 1134 + }, + { + "epoch": 0.009852345031727156, + "grad_norm": 0.07568359375, + "learning_rate": 0.0019999911636291432, + "loss": 0.2637, + "step": 1135 + }, + { + "epoch": 0.00986102551193132, + "grad_norm": 0.197265625, + "learning_rate": 0.0019999910312511185, + "loss": 0.2559, + "step": 1136 + }, + { + "epoch": 0.009869705992135484, + "grad_norm": 0.08740234375, + "learning_rate": 0.0019999908978888766, + "loss": 0.2344, + "step": 1137 + }, + { + "epoch": 0.00987838647233965, + "grad_norm": 0.10400390625, + "learning_rate": 0.0019999907635424186, + "loss": 0.2256, + "step": 1138 + }, + { + "epoch": 0.009887066952543815, + "grad_norm": 0.08984375, + "learning_rate": 0.0019999906282117444, + "loss": 0.2363, + "step": 1139 + }, + { + "epoch": 0.00989574743274798, + "grad_norm": 0.07568359375, + "learning_rate": 0.001999990491896854, + "loss": 0.2324, + "step": 1140 + }, + { + "epoch": 0.009904427912952145, + "grad_norm": 0.09521484375, + "learning_rate": 0.0019999903545977475, + "loss": 0.3125, + "step": 1141 + }, + { + "epoch": 0.00991310839315631, + "grad_norm": 0.0673828125, + "learning_rate": 0.001999990216314425, + "loss": 0.2109, + "step": 1142 + }, + { + "epoch": 0.009921788873360475, + "grad_norm": 0.08203125, + "learning_rate": 0.0019999900770468863, + "loss": 0.2109, + "step": 1143 + }, + { + "epoch": 0.009930469353564638, + "grad_norm": 0.0791015625, + "learning_rate": 0.0019999899367951325, + "loss": 0.25, + "step": 1144 + }, + { + "epoch": 0.009939149833768804, + "grad_norm": 0.08154296875, + "learning_rate": 0.001999989795559163, + "loss": 0.2119, + "step": 1145 + }, + { + "epoch": 0.009947830313972969, + "grad_norm": 0.1279296875, + "learning_rate": 0.001999989653338978, + "loss": 0.3125, + "step": 1146 + }, + { + "epoch": 0.009956510794177134, + "grad_norm": 0.087890625, + "learning_rate": 0.0019999895101345784, + "loss": 0.2715, + "step": 1147 + }, + { + 
"epoch": 0.009965191274381299, + "grad_norm": 0.09130859375, + "learning_rate": 0.0019999893659459634, + "loss": 0.2129, + "step": 1148 + }, + { + "epoch": 0.009973871754585464, + "grad_norm": 0.11181640625, + "learning_rate": 0.001999989220773134, + "loss": 0.2539, + "step": 1149 + }, + { + "epoch": 0.00998255223478963, + "grad_norm": 0.09033203125, + "learning_rate": 0.0019999890746160895, + "loss": 0.2363, + "step": 1150 + }, + { + "epoch": 0.009991232714993794, + "grad_norm": 0.11083984375, + "learning_rate": 0.0019999889274748303, + "loss": 0.3008, + "step": 1151 + }, + { + "epoch": 0.009999913195197958, + "grad_norm": 0.0869140625, + "learning_rate": 0.0019999887793493566, + "loss": 0.2637, + "step": 1152 + }, + { + "epoch": 0.010008593675402123, + "grad_norm": 0.10009765625, + "learning_rate": 0.0019999886302396693, + "loss": 0.2695, + "step": 1153 + }, + { + "epoch": 0.010017274155606288, + "grad_norm": 0.0810546875, + "learning_rate": 0.0019999884801457676, + "loss": 0.2539, + "step": 1154 + }, + { + "epoch": 0.010025954635810453, + "grad_norm": 0.078125, + "learning_rate": 0.0019999883290676523, + "loss": 0.2578, + "step": 1155 + }, + { + "epoch": 0.010034635116014618, + "grad_norm": 0.0859375, + "learning_rate": 0.0019999881770053226, + "loss": 0.2334, + "step": 1156 + }, + { + "epoch": 0.010043315596218783, + "grad_norm": 0.09912109375, + "learning_rate": 0.00199998802395878, + "loss": 0.2949, + "step": 1157 + }, + { + "epoch": 0.010051996076422948, + "grad_norm": 0.0947265625, + "learning_rate": 0.001999987869928024, + "loss": 0.2422, + "step": 1158 + }, + { + "epoch": 0.010060676556627112, + "grad_norm": 0.10205078125, + "learning_rate": 0.001999987714913055, + "loss": 0.2832, + "step": 1159 + }, + { + "epoch": 0.010069357036831277, + "grad_norm": 0.08349609375, + "learning_rate": 0.001999987558913872, + "loss": 0.2422, + "step": 1160 + }, + { + "epoch": 0.010078037517035442, + "grad_norm": 0.07861328125, + "learning_rate": 0.0019999874019304767, + "loss": 0.2432, + "step": 1161 + }, + { + "epoch": 0.010086717997239607, + "grad_norm": 0.130859375, + "learning_rate": 0.001999987243962869, + "loss": 0.2969, + "step": 1162 + }, + { + "epoch": 0.010095398477443772, + "grad_norm": 0.09228515625, + "learning_rate": 0.0019999870850110485, + "loss": 0.2812, + "step": 1163 + }, + { + "epoch": 0.010104078957647937, + "grad_norm": 0.07470703125, + "learning_rate": 0.0019999869250750158, + "loss": 0.2109, + "step": 1164 + }, + { + "epoch": 0.010112759437852102, + "grad_norm": 0.08447265625, + "learning_rate": 0.0019999867641547707, + "loss": 0.2559, + "step": 1165 + }, + { + "epoch": 0.010121439918056268, + "grad_norm": 0.087890625, + "learning_rate": 0.0019999866022503135, + "loss": 0.2363, + "step": 1166 + }, + { + "epoch": 0.010130120398260431, + "grad_norm": 0.123046875, + "learning_rate": 0.0019999864393616448, + "loss": 0.3125, + "step": 1167 + }, + { + "epoch": 0.010138800878464596, + "grad_norm": 0.10791015625, + "learning_rate": 0.0019999862754887642, + "loss": 0.2441, + "step": 1168 + }, + { + "epoch": 0.010147481358668761, + "grad_norm": 0.08984375, + "learning_rate": 0.0019999861106316723, + "loss": 0.2236, + "step": 1169 + }, + { + "epoch": 0.010156161838872926, + "grad_norm": 0.076171875, + "learning_rate": 0.001999985944790369, + "loss": 0.2812, + "step": 1170 + }, + { + "epoch": 0.010164842319077091, + "grad_norm": 0.10595703125, + "learning_rate": 0.0019999857779648546, + "loss": 0.3379, + "step": 1171 + }, + { + "epoch": 0.010173522799281257, + "grad_norm": 0.091796875, 
+ "learning_rate": 0.0019999856101551292, + "loss": 0.2324, + "step": 1172 + }, + { + "epoch": 0.010182203279485422, + "grad_norm": 0.1337890625, + "learning_rate": 0.001999985441361193, + "loss": 0.4375, + "step": 1173 + }, + { + "epoch": 0.010190883759689587, + "grad_norm": 0.0927734375, + "learning_rate": 0.0019999852715830465, + "loss": 0.2617, + "step": 1174 + }, + { + "epoch": 0.01019956423989375, + "grad_norm": 0.078125, + "learning_rate": 0.0019999851008206896, + "loss": 0.1914, + "step": 1175 + }, + { + "epoch": 0.010208244720097915, + "grad_norm": 0.0908203125, + "learning_rate": 0.0019999849290741225, + "loss": 0.252, + "step": 1176 + }, + { + "epoch": 0.01021692520030208, + "grad_norm": 0.083984375, + "learning_rate": 0.0019999847563433454, + "loss": 0.2031, + "step": 1177 + }, + { + "epoch": 0.010225605680506246, + "grad_norm": 0.08251953125, + "learning_rate": 0.001999984582628358, + "loss": 0.1982, + "step": 1178 + }, + { + "epoch": 0.01023428616071041, + "grad_norm": 0.0869140625, + "learning_rate": 0.001999984407929162, + "loss": 0.2598, + "step": 1179 + }, + { + "epoch": 0.010242966640914576, + "grad_norm": 0.1279296875, + "learning_rate": 0.001999984232245756, + "loss": 0.2656, + "step": 1180 + }, + { + "epoch": 0.010251647121118741, + "grad_norm": 0.08740234375, + "learning_rate": 0.0019999840555781404, + "loss": 0.2246, + "step": 1181 + }, + { + "epoch": 0.010260327601322906, + "grad_norm": 0.0791015625, + "learning_rate": 0.0019999838779263166, + "loss": 0.1943, + "step": 1182 + }, + { + "epoch": 0.01026900808152707, + "grad_norm": 0.11767578125, + "learning_rate": 0.001999983699290283, + "loss": 0.2344, + "step": 1183 + }, + { + "epoch": 0.010277688561731235, + "grad_norm": 0.111328125, + "learning_rate": 0.0019999835196700413, + "loss": 0.332, + "step": 1184 + }, + { + "epoch": 0.0102863690419354, + "grad_norm": 0.154296875, + "learning_rate": 0.001999983339065591, + "loss": 0.3359, + "step": 1185 + }, + { + "epoch": 0.010295049522139565, + "grad_norm": 0.095703125, + "learning_rate": 0.001999983157476933, + "loss": 0.332, + "step": 1186 + }, + { + "epoch": 0.01030373000234373, + "grad_norm": 0.09765625, + "learning_rate": 0.0019999829749040663, + "loss": 0.293, + "step": 1187 + }, + { + "epoch": 0.010312410482547895, + "grad_norm": 0.09228515625, + "learning_rate": 0.001999982791346992, + "loss": 0.2217, + "step": 1188 + }, + { + "epoch": 0.01032109096275206, + "grad_norm": 0.1494140625, + "learning_rate": 0.00199998260680571, + "loss": 0.2539, + "step": 1189 + }, + { + "epoch": 0.010329771442956224, + "grad_norm": 0.0869140625, + "learning_rate": 0.0019999824212802203, + "loss": 0.252, + "step": 1190 + }, + { + "epoch": 0.010338451923160389, + "grad_norm": 0.1279296875, + "learning_rate": 0.0019999822347705233, + "loss": 0.2422, + "step": 1191 + }, + { + "epoch": 0.010347132403364554, + "grad_norm": 0.1259765625, + "learning_rate": 0.0019999820472766197, + "loss": 0.2949, + "step": 1192 + }, + { + "epoch": 0.010355812883568719, + "grad_norm": 0.1181640625, + "learning_rate": 0.001999981858798509, + "loss": 0.2852, + "step": 1193 + }, + { + "epoch": 0.010364493363772884, + "grad_norm": 0.07666015625, + "learning_rate": 0.0019999816693361916, + "loss": 0.1719, + "step": 1194 + }, + { + "epoch": 0.010373173843977049, + "grad_norm": 0.08203125, + "learning_rate": 0.0019999814788896676, + "loss": 0.2383, + "step": 1195 + }, + { + "epoch": 0.010381854324181214, + "grad_norm": 0.109375, + "learning_rate": 0.0019999812874589382, + "loss": 0.2754, + "step": 1196 + }, + { + 
"epoch": 0.01039053480438538, + "grad_norm": 0.1474609375, + "learning_rate": 0.0019999810950440023, + "loss": 0.2734, + "step": 1197 + }, + { + "epoch": 0.010399215284589543, + "grad_norm": 0.10595703125, + "learning_rate": 0.00199998090164486, + "loss": 0.2422, + "step": 1198 + }, + { + "epoch": 0.010407895764793708, + "grad_norm": 0.0859375, + "learning_rate": 0.001999980707261513, + "loss": 0.2539, + "step": 1199 + }, + { + "epoch": 0.010416576244997873, + "grad_norm": 0.08251953125, + "learning_rate": 0.00199998051189396, + "loss": 0.2715, + "step": 1200 + }, + { + "epoch": 0.010425256725202038, + "grad_norm": 0.11767578125, + "learning_rate": 0.0019999803155422023, + "loss": 0.3281, + "step": 1201 + }, + { + "epoch": 0.010433937205406203, + "grad_norm": 0.11962890625, + "learning_rate": 0.0019999801182062392, + "loss": 0.4082, + "step": 1202 + }, + { + "epoch": 0.010442617685610368, + "grad_norm": 0.10302734375, + "learning_rate": 0.0019999799198860716, + "loss": 0.2559, + "step": 1203 + }, + { + "epoch": 0.010451298165814533, + "grad_norm": 0.08544921875, + "learning_rate": 0.0019999797205816996, + "loss": 0.252, + "step": 1204 + }, + { + "epoch": 0.010459978646018699, + "grad_norm": 0.0791015625, + "learning_rate": 0.001999979520293123, + "loss": 0.2266, + "step": 1205 + }, + { + "epoch": 0.010468659126222862, + "grad_norm": 0.10205078125, + "learning_rate": 0.001999979319020343, + "loss": 0.2344, + "step": 1206 + }, + { + "epoch": 0.010477339606427027, + "grad_norm": 0.10498046875, + "learning_rate": 0.0019999791167633583, + "loss": 0.2695, + "step": 1207 + }, + { + "epoch": 0.010486020086631192, + "grad_norm": 0.09814453125, + "learning_rate": 0.00199997891352217, + "loss": 0.2246, + "step": 1208 + }, + { + "epoch": 0.010494700566835357, + "grad_norm": 0.08154296875, + "learning_rate": 0.0019999787092967788, + "loss": 0.2188, + "step": 1209 + }, + { + "epoch": 0.010503381047039522, + "grad_norm": 0.07373046875, + "learning_rate": 0.0019999785040871842, + "loss": 0.2344, + "step": 1210 + }, + { + "epoch": 0.010512061527243688, + "grad_norm": 0.11767578125, + "learning_rate": 0.0019999782978933865, + "loss": 0.2695, + "step": 1211 + }, + { + "epoch": 0.010520742007447853, + "grad_norm": 0.10302734375, + "learning_rate": 0.0019999780907153865, + "loss": 0.2852, + "step": 1212 + }, + { + "epoch": 0.010529422487652016, + "grad_norm": 0.1005859375, + "learning_rate": 0.0019999778825531838, + "loss": 0.2598, + "step": 1213 + }, + { + "epoch": 0.010538102967856181, + "grad_norm": 0.0810546875, + "learning_rate": 0.0019999776734067787, + "loss": 0.2891, + "step": 1214 + }, + { + "epoch": 0.010546783448060346, + "grad_norm": 0.080078125, + "learning_rate": 0.0019999774632761713, + "loss": 0.3008, + "step": 1215 + }, + { + "epoch": 0.010555463928264511, + "grad_norm": 0.07958984375, + "learning_rate": 0.001999977252161362, + "loss": 0.2324, + "step": 1216 + }, + { + "epoch": 0.010564144408468677, + "grad_norm": 0.09033203125, + "learning_rate": 0.001999977040062352, + "loss": 0.2734, + "step": 1217 + }, + { + "epoch": 0.010572824888672842, + "grad_norm": 0.08349609375, + "learning_rate": 0.0019999768269791393, + "loss": 0.2539, + "step": 1218 + }, + { + "epoch": 0.010581505368877007, + "grad_norm": 0.06396484375, + "learning_rate": 0.0019999766129117267, + "loss": 0.1826, + "step": 1219 + }, + { + "epoch": 0.010590185849081172, + "grad_norm": 0.07470703125, + "learning_rate": 0.0019999763978601126, + "loss": 0.2695, + "step": 1220 + }, + { + "epoch": 0.010598866329285335, + "grad_norm": 
0.08056640625, + "learning_rate": 0.0019999761818242975, + "loss": 0.2637, + "step": 1221 + }, + { + "epoch": 0.0106075468094895, + "grad_norm": 0.0986328125, + "learning_rate": 0.0019999759648042827, + "loss": 0.2471, + "step": 1222 + }, + { + "epoch": 0.010616227289693666, + "grad_norm": 0.09814453125, + "learning_rate": 0.0019999757468000673, + "loss": 0.3125, + "step": 1223 + }, + { + "epoch": 0.01062490776989783, + "grad_norm": 0.09521484375, + "learning_rate": 0.0019999755278116518, + "loss": 0.2871, + "step": 1224 + }, + { + "epoch": 0.010633588250101996, + "grad_norm": 0.0869140625, + "learning_rate": 0.001999975307839037, + "loss": 0.2949, + "step": 1225 + }, + { + "epoch": 0.010642268730306161, + "grad_norm": 0.07421875, + "learning_rate": 0.0019999750868822225, + "loss": 0.2178, + "step": 1226 + }, + { + "epoch": 0.010650949210510326, + "grad_norm": 0.1513671875, + "learning_rate": 0.0019999748649412088, + "loss": 0.4844, + "step": 1227 + }, + { + "epoch": 0.010659629690714491, + "grad_norm": 0.10205078125, + "learning_rate": 0.001999974642015996, + "loss": 0.2461, + "step": 1228 + }, + { + "epoch": 0.010668310170918655, + "grad_norm": 0.07373046875, + "learning_rate": 0.001999974418106585, + "loss": 0.2158, + "step": 1229 + }, + { + "epoch": 0.01067699065112282, + "grad_norm": 0.09228515625, + "learning_rate": 0.0019999741932129745, + "loss": 0.2246, + "step": 1230 + }, + { + "epoch": 0.010685671131326985, + "grad_norm": 0.08349609375, + "learning_rate": 0.0019999739673351668, + "loss": 0.2393, + "step": 1231 + }, + { + "epoch": 0.01069435161153115, + "grad_norm": 0.078125, + "learning_rate": 0.00199997374047316, + "loss": 0.2773, + "step": 1232 + }, + { + "epoch": 0.010703032091735315, + "grad_norm": 0.08447265625, + "learning_rate": 0.0019999735126269565, + "loss": 0.2539, + "step": 1233 + }, + { + "epoch": 0.01071171257193948, + "grad_norm": 0.083984375, + "learning_rate": 0.0019999732837965548, + "loss": 0.1895, + "step": 1234 + }, + { + "epoch": 0.010720393052143645, + "grad_norm": 0.068359375, + "learning_rate": 0.001999973053981956, + "loss": 0.2236, + "step": 1235 + }, + { + "epoch": 0.010729073532347809, + "grad_norm": 0.10205078125, + "learning_rate": 0.00199997282318316, + "loss": 0.2451, + "step": 1236 + }, + { + "epoch": 0.010737754012551974, + "grad_norm": 0.10498046875, + "learning_rate": 0.001999972591400168, + "loss": 0.2598, + "step": 1237 + }, + { + "epoch": 0.010746434492756139, + "grad_norm": 0.126953125, + "learning_rate": 0.001999972358632979, + "loss": 0.25, + "step": 1238 + }, + { + "epoch": 0.010755114972960304, + "grad_norm": 0.11083984375, + "learning_rate": 0.001999972124881594, + "loss": 0.2441, + "step": 1239 + }, + { + "epoch": 0.010763795453164469, + "grad_norm": 0.08349609375, + "learning_rate": 0.0019999718901460126, + "loss": 0.2988, + "step": 1240 + }, + { + "epoch": 0.010772475933368634, + "grad_norm": 0.08154296875, + "learning_rate": 0.0019999716544262356, + "loss": 0.1875, + "step": 1241 + }, + { + "epoch": 0.0107811564135728, + "grad_norm": 0.06787109375, + "learning_rate": 0.001999971417722263, + "loss": 0.2295, + "step": 1242 + }, + { + "epoch": 0.010789836893776964, + "grad_norm": 0.08056640625, + "learning_rate": 0.001999971180034095, + "loss": 0.248, + "step": 1243 + }, + { + "epoch": 0.010798517373981128, + "grad_norm": 0.0927734375, + "learning_rate": 0.0019999709413617327, + "loss": 0.2734, + "step": 1244 + }, + { + "epoch": 0.010807197854185293, + "grad_norm": 0.0869140625, + "learning_rate": 0.001999970701705175, + "loss": 
0.2246, + "step": 1245 + }, + { + "epoch": 0.010815878334389458, + "grad_norm": 0.07080078125, + "learning_rate": 0.0019999704610644234, + "loss": 0.2051, + "step": 1246 + }, + { + "epoch": 0.010824558814593623, + "grad_norm": 0.07958984375, + "learning_rate": 0.0019999702194394777, + "loss": 0.3223, + "step": 1247 + }, + { + "epoch": 0.010833239294797788, + "grad_norm": 0.11181640625, + "learning_rate": 0.001999969976830338, + "loss": 0.2441, + "step": 1248 + }, + { + "epoch": 0.010841919775001953, + "grad_norm": 0.076171875, + "learning_rate": 0.001999969733237004, + "loss": 0.209, + "step": 1249 + }, + { + "epoch": 0.010850600255206119, + "grad_norm": 0.07666015625, + "learning_rate": 0.001999969488659477, + "loss": 0.209, + "step": 1250 + }, + { + "epoch": 0.010859280735410284, + "grad_norm": 0.08544921875, + "learning_rate": 0.001999969243097757, + "loss": 0.2207, + "step": 1251 + }, + { + "epoch": 0.010867961215614447, + "grad_norm": 0.0791015625, + "learning_rate": 0.0019999689965518445, + "loss": 0.2871, + "step": 1252 + }, + { + "epoch": 0.010876641695818612, + "grad_norm": 0.0966796875, + "learning_rate": 0.0019999687490217387, + "loss": 0.2441, + "step": 1253 + }, + { + "epoch": 0.010885322176022777, + "grad_norm": 0.076171875, + "learning_rate": 0.0019999685005074415, + "loss": 0.2539, + "step": 1254 + }, + { + "epoch": 0.010894002656226942, + "grad_norm": 0.08642578125, + "learning_rate": 0.0019999682510089514, + "loss": 0.2324, + "step": 1255 + }, + { + "epoch": 0.010902683136431108, + "grad_norm": 0.0791015625, + "learning_rate": 0.00199996800052627, + "loss": 0.2266, + "step": 1256 + }, + { + "epoch": 0.010911363616635273, + "grad_norm": 0.07763671875, + "learning_rate": 0.001999967749059397, + "loss": 0.2285, + "step": 1257 + }, + { + "epoch": 0.010920044096839438, + "grad_norm": 0.10986328125, + "learning_rate": 0.0019999674966083326, + "loss": 0.2734, + "step": 1258 + }, + { + "epoch": 0.010928724577043603, + "grad_norm": 0.0869140625, + "learning_rate": 0.0019999672431730777, + "loss": 0.2109, + "step": 1259 + }, + { + "epoch": 0.010937405057247766, + "grad_norm": 0.1123046875, + "learning_rate": 0.0019999669887536316, + "loss": 0.2773, + "step": 1260 + }, + { + "epoch": 0.010946085537451931, + "grad_norm": 0.1572265625, + "learning_rate": 0.001999966733349996, + "loss": 0.2598, + "step": 1261 + }, + { + "epoch": 0.010954766017656097, + "grad_norm": 0.0693359375, + "learning_rate": 0.001999966476962169, + "loss": 0.2188, + "step": 1262 + }, + { + "epoch": 0.010963446497860262, + "grad_norm": 0.072265625, + "learning_rate": 0.0019999662195901535, + "loss": 0.2324, + "step": 1263 + }, + { + "epoch": 0.010972126978064427, + "grad_norm": 0.11083984375, + "learning_rate": 0.0019999659612339477, + "loss": 0.3516, + "step": 1264 + }, + { + "epoch": 0.010980807458268592, + "grad_norm": 0.09130859375, + "learning_rate": 0.001999965701893553, + "loss": 0.2305, + "step": 1265 + }, + { + "epoch": 0.010989487938472757, + "grad_norm": 0.087890625, + "learning_rate": 0.001999965441568969, + "loss": 0.2617, + "step": 1266 + }, + { + "epoch": 0.01099816841867692, + "grad_norm": 0.1005859375, + "learning_rate": 0.0019999651802601968, + "loss": 0.2578, + "step": 1267 + }, + { + "epoch": 0.011006848898881086, + "grad_norm": 0.3671875, + "learning_rate": 0.0019999649179672356, + "loss": 0.3828, + "step": 1268 + }, + { + "epoch": 0.01101552937908525, + "grad_norm": 0.1103515625, + "learning_rate": 0.0019999646546900863, + "loss": 0.3105, + "step": 1269 + }, + { + "epoch": 
0.011024209859289416, + "grad_norm": 0.111328125, + "learning_rate": 0.0019999643904287496, + "loss": 0.2461, + "step": 1270 + }, + { + "epoch": 0.011032890339493581, + "grad_norm": 0.0888671875, + "learning_rate": 0.0019999641251832252, + "loss": 0.2539, + "step": 1271 + }, + { + "epoch": 0.011041570819697746, + "grad_norm": 0.09814453125, + "learning_rate": 0.001999963858953514, + "loss": 0.2891, + "step": 1272 + }, + { + "epoch": 0.011050251299901911, + "grad_norm": 0.11376953125, + "learning_rate": 0.001999963591739615, + "loss": 0.2793, + "step": 1273 + }, + { + "epoch": 0.011058931780106076, + "grad_norm": 0.111328125, + "learning_rate": 0.00199996332354153, + "loss": 0.293, + "step": 1274 + }, + { + "epoch": 0.01106761226031024, + "grad_norm": 0.1484375, + "learning_rate": 0.001999963054359258, + "loss": 0.25, + "step": 1275 + }, + { + "epoch": 0.011076292740514405, + "grad_norm": 0.0869140625, + "learning_rate": 0.0019999627841928006, + "loss": 0.2266, + "step": 1276 + }, + { + "epoch": 0.01108497322071857, + "grad_norm": 0.09130859375, + "learning_rate": 0.001999962513042157, + "loss": 0.2637, + "step": 1277 + }, + { + "epoch": 0.011093653700922735, + "grad_norm": 0.0732421875, + "learning_rate": 0.001999962240907328, + "loss": 0.2383, + "step": 1278 + }, + { + "epoch": 0.0111023341811269, + "grad_norm": 0.150390625, + "learning_rate": 0.0019999619677883137, + "loss": 0.3633, + "step": 1279 + }, + { + "epoch": 0.011111014661331065, + "grad_norm": 0.07763671875, + "learning_rate": 0.0019999616936851147, + "loss": 0.2578, + "step": 1280 + }, + { + "epoch": 0.01111969514153523, + "grad_norm": 0.1005859375, + "learning_rate": 0.001999961418597731, + "loss": 0.2598, + "step": 1281 + }, + { + "epoch": 0.011128375621739395, + "grad_norm": 0.09423828125, + "learning_rate": 0.0019999611425261634, + "loss": 0.2832, + "step": 1282 + }, + { + "epoch": 0.011137056101943559, + "grad_norm": 0.099609375, + "learning_rate": 0.0019999608654704118, + "loss": 0.2578, + "step": 1283 + }, + { + "epoch": 0.011145736582147724, + "grad_norm": 0.08154296875, + "learning_rate": 0.0019999605874304756, + "loss": 0.3125, + "step": 1284 + }, + { + "epoch": 0.011154417062351889, + "grad_norm": 0.416015625, + "learning_rate": 0.001999960308406357, + "loss": 0.6797, + "step": 1285 + }, + { + "epoch": 0.011163097542556054, + "grad_norm": 0.07470703125, + "learning_rate": 0.0019999600283980546, + "loss": 0.1934, + "step": 1286 + }, + { + "epoch": 0.01117177802276022, + "grad_norm": 0.09130859375, + "learning_rate": 0.00199995974740557, + "loss": 0.291, + "step": 1287 + }, + { + "epoch": 0.011180458502964384, + "grad_norm": 0.09326171875, + "learning_rate": 0.001999959465428903, + "loss": 0.2852, + "step": 1288 + }, + { + "epoch": 0.01118913898316855, + "grad_norm": 0.0673828125, + "learning_rate": 0.0019999591824680536, + "loss": 0.1953, + "step": 1289 + }, + { + "epoch": 0.011197819463372713, + "grad_norm": 0.09423828125, + "learning_rate": 0.001999958898523022, + "loss": 0.2324, + "step": 1290 + }, + { + "epoch": 0.011206499943576878, + "grad_norm": 0.095703125, + "learning_rate": 0.0019999586135938095, + "loss": 0.2422, + "step": 1291 + }, + { + "epoch": 0.011215180423781043, + "grad_norm": 0.0966796875, + "learning_rate": 0.0019999583276804154, + "loss": 0.2656, + "step": 1292 + }, + { + "epoch": 0.011223860903985208, + "grad_norm": 0.08251953125, + "learning_rate": 0.0019999580407828402, + "loss": 0.252, + "step": 1293 + }, + { + "epoch": 0.011232541384189373, + "grad_norm": 0.07275390625, + "learning_rate": 
0.0019999577529010854, + "loss": 0.209, + "step": 1294 + }, + { + "epoch": 0.011241221864393539, + "grad_norm": 0.08154296875, + "learning_rate": 0.0019999574640351494, + "loss": 0.2393, + "step": 1295 + }, + { + "epoch": 0.011249902344597704, + "grad_norm": 0.10009765625, + "learning_rate": 0.0019999571741850337, + "loss": 0.2217, + "step": 1296 + }, + { + "epoch": 0.011258582824801869, + "grad_norm": 0.1484375, + "learning_rate": 0.0019999568833507383, + "loss": 0.2539, + "step": 1297 + }, + { + "epoch": 0.011267263305006032, + "grad_norm": 0.09619140625, + "learning_rate": 0.001999956591532264, + "loss": 0.2617, + "step": 1298 + }, + { + "epoch": 0.011275943785210197, + "grad_norm": 0.08203125, + "learning_rate": 0.00199995629872961, + "loss": 0.1953, + "step": 1299 + }, + { + "epoch": 0.011284624265414362, + "grad_norm": 0.08935546875, + "learning_rate": 0.0019999560049427777, + "loss": 0.2246, + "step": 1300 + }, + { + "epoch": 0.011293304745618528, + "grad_norm": 0.11181640625, + "learning_rate": 0.0019999557101717668, + "loss": 0.248, + "step": 1301 + }, + { + "epoch": 0.011301985225822693, + "grad_norm": 0.0673828125, + "learning_rate": 0.0019999554144165778, + "loss": 0.1865, + "step": 1302 + }, + { + "epoch": 0.011310665706026858, + "grad_norm": 0.142578125, + "learning_rate": 0.0019999551176772116, + "loss": 0.3086, + "step": 1303 + }, + { + "epoch": 0.011319346186231023, + "grad_norm": 0.1123046875, + "learning_rate": 0.0019999548199536674, + "loss": 0.25, + "step": 1304 + }, + { + "epoch": 0.011328026666435188, + "grad_norm": 0.07373046875, + "learning_rate": 0.0019999545212459465, + "loss": 0.1973, + "step": 1305 + }, + { + "epoch": 0.011336707146639351, + "grad_norm": 0.09716796875, + "learning_rate": 0.001999954221554049, + "loss": 0.3066, + "step": 1306 + }, + { + "epoch": 0.011345387626843517, + "grad_norm": 0.09521484375, + "learning_rate": 0.001999953920877975, + "loss": 0.2637, + "step": 1307 + }, + { + "epoch": 0.011354068107047682, + "grad_norm": 0.1025390625, + "learning_rate": 0.0019999536192177245, + "loss": 0.2031, + "step": 1308 + }, + { + "epoch": 0.011362748587251847, + "grad_norm": 0.1083984375, + "learning_rate": 0.0019999533165732985, + "loss": 0.3164, + "step": 1309 + }, + { + "epoch": 0.011371429067456012, + "grad_norm": 0.08984375, + "learning_rate": 0.001999953012944697, + "loss": 0.2266, + "step": 1310 + }, + { + "epoch": 0.011380109547660177, + "grad_norm": 0.09033203125, + "learning_rate": 0.001999952708331921, + "loss": 0.252, + "step": 1311 + }, + { + "epoch": 0.011388790027864342, + "grad_norm": 0.078125, + "learning_rate": 0.0019999524027349697, + "loss": 0.2246, + "step": 1312 + }, + { + "epoch": 0.011397470508068505, + "grad_norm": 0.09423828125, + "learning_rate": 0.0019999520961538437, + "loss": 0.2617, + "step": 1313 + }, + { + "epoch": 0.01140615098827267, + "grad_norm": 0.10302734375, + "learning_rate": 0.001999951788588544, + "loss": 0.3242, + "step": 1314 + }, + { + "epoch": 0.011414831468476836, + "grad_norm": 0.0908203125, + "learning_rate": 0.0019999514800390704, + "loss": 0.248, + "step": 1315 + }, + { + "epoch": 0.011423511948681, + "grad_norm": 0.07470703125, + "learning_rate": 0.0019999511705054234, + "loss": 0.2227, + "step": 1316 + }, + { + "epoch": 0.011432192428885166, + "grad_norm": 0.111328125, + "learning_rate": 0.0019999508599876036, + "loss": 0.2812, + "step": 1317 + }, + { + "epoch": 0.011440872909089331, + "grad_norm": 0.08154296875, + "learning_rate": 0.001999950548485611, + "loss": 0.2598, + "step": 1318 + }, + { + 
"epoch": 0.011449553389293496, + "grad_norm": 0.11767578125, + "learning_rate": 0.0019999502359994456, + "loss": 0.2617, + "step": 1319 + }, + { + "epoch": 0.011458233869497661, + "grad_norm": 0.10595703125, + "learning_rate": 0.0019999499225291087, + "loss": 0.2988, + "step": 1320 + }, + { + "epoch": 0.011466914349701825, + "grad_norm": 0.07861328125, + "learning_rate": 0.0019999496080746, + "loss": 0.2539, + "step": 1321 + }, + { + "epoch": 0.01147559482990599, + "grad_norm": 0.06884765625, + "learning_rate": 0.0019999492926359194, + "loss": 0.2148, + "step": 1322 + }, + { + "epoch": 0.011484275310110155, + "grad_norm": 0.06689453125, + "learning_rate": 0.001999948976213068, + "loss": 0.2422, + "step": 1323 + }, + { + "epoch": 0.01149295579031432, + "grad_norm": 0.07177734375, + "learning_rate": 0.001999948658806046, + "loss": 0.2109, + "step": 1324 + }, + { + "epoch": 0.011501636270518485, + "grad_norm": 0.09326171875, + "learning_rate": 0.0019999483404148535, + "loss": 0.332, + "step": 1325 + }, + { + "epoch": 0.01151031675072265, + "grad_norm": 0.1298828125, + "learning_rate": 0.0019999480210394914, + "loss": 0.2793, + "step": 1326 + }, + { + "epoch": 0.011518997230926815, + "grad_norm": 0.302734375, + "learning_rate": 0.0019999477006799595, + "loss": 0.457, + "step": 1327 + }, + { + "epoch": 0.01152767771113098, + "grad_norm": 0.06201171875, + "learning_rate": 0.0019999473793362583, + "loss": 0.2266, + "step": 1328 + }, + { + "epoch": 0.011536358191335144, + "grad_norm": 0.1005859375, + "learning_rate": 0.001999947057008388, + "loss": 0.3301, + "step": 1329 + }, + { + "epoch": 0.011545038671539309, + "grad_norm": 0.1015625, + "learning_rate": 0.001999946733696349, + "loss": 0.2354, + "step": 1330 + }, + { + "epoch": 0.011553719151743474, + "grad_norm": 0.0888671875, + "learning_rate": 0.001999946409400142, + "loss": 0.2246, + "step": 1331 + }, + { + "epoch": 0.01156239963194764, + "grad_norm": 0.09619140625, + "learning_rate": 0.001999946084119768, + "loss": 0.2383, + "step": 1332 + }, + { + "epoch": 0.011571080112151804, + "grad_norm": 0.11669921875, + "learning_rate": 0.001999945757855225, + "loss": 0.2656, + "step": 1333 + }, + { + "epoch": 0.01157976059235597, + "grad_norm": 0.07861328125, + "learning_rate": 0.0019999454306065157, + "loss": 0.2422, + "step": 1334 + }, + { + "epoch": 0.011588441072560135, + "grad_norm": 0.07958984375, + "learning_rate": 0.0019999451023736394, + "loss": 0.2559, + "step": 1335 + }, + { + "epoch": 0.0115971215527643, + "grad_norm": 0.107421875, + "learning_rate": 0.0019999447731565965, + "loss": 0.2754, + "step": 1336 + }, + { + "epoch": 0.011605802032968463, + "grad_norm": 0.0810546875, + "learning_rate": 0.0019999444429553877, + "loss": 0.2129, + "step": 1337 + }, + { + "epoch": 0.011614482513172628, + "grad_norm": 0.08984375, + "learning_rate": 0.0019999441117700134, + "loss": 0.2324, + "step": 1338 + }, + { + "epoch": 0.011623162993376793, + "grad_norm": 0.0751953125, + "learning_rate": 0.0019999437796004733, + "loss": 0.2217, + "step": 1339 + }, + { + "epoch": 0.011631843473580959, + "grad_norm": 0.10595703125, + "learning_rate": 0.0019999434464467686, + "loss": 0.2598, + "step": 1340 + }, + { + "epoch": 0.011640523953785124, + "grad_norm": 0.1357421875, + "learning_rate": 0.001999943112308899, + "loss": 0.2539, + "step": 1341 + }, + { + "epoch": 0.011649204433989289, + "grad_norm": 0.091796875, + "learning_rate": 0.001999942777186865, + "loss": 0.3066, + "step": 1342 + }, + { + "epoch": 0.011657884914193454, + "grad_norm": 0.1015625, + 
"learning_rate": 0.0019999424410806674, + "loss": 0.2969, + "step": 1343 + }, + { + "epoch": 0.011666565394397617, + "grad_norm": 0.076171875, + "learning_rate": 0.001999942103990306, + "loss": 0.2051, + "step": 1344 + }, + { + "epoch": 0.011675245874601782, + "grad_norm": 0.09619140625, + "learning_rate": 0.0019999417659157816, + "loss": 0.2539, + "step": 1345 + }, + { + "epoch": 0.011683926354805948, + "grad_norm": 0.09375, + "learning_rate": 0.0019999414268570947, + "loss": 0.2354, + "step": 1346 + }, + { + "epoch": 0.011692606835010113, + "grad_norm": 0.08740234375, + "learning_rate": 0.001999941086814245, + "loss": 0.3164, + "step": 1347 + }, + { + "epoch": 0.011701287315214278, + "grad_norm": 0.1015625, + "learning_rate": 0.001999940745787233, + "loss": 0.2832, + "step": 1348 + }, + { + "epoch": 0.011709967795418443, + "grad_norm": 0.0654296875, + "learning_rate": 0.00199994040377606, + "loss": 0.2266, + "step": 1349 + }, + { + "epoch": 0.011718648275622608, + "grad_norm": 0.07861328125, + "learning_rate": 0.001999940060780725, + "loss": 0.248, + "step": 1350 + }, + { + "epoch": 0.011727328755826773, + "grad_norm": 0.0986328125, + "learning_rate": 0.0019999397168012295, + "loss": 0.2188, + "step": 1351 + }, + { + "epoch": 0.011736009236030936, + "grad_norm": 0.07275390625, + "learning_rate": 0.0019999393718375734, + "loss": 0.2236, + "step": 1352 + }, + { + "epoch": 0.011744689716235102, + "grad_norm": 0.08740234375, + "learning_rate": 0.001999939025889757, + "loss": 0.2539, + "step": 1353 + }, + { + "epoch": 0.011753370196439267, + "grad_norm": 0.08203125, + "learning_rate": 0.0019999386789577808, + "loss": 0.207, + "step": 1354 + }, + { + "epoch": 0.011762050676643432, + "grad_norm": 0.10498046875, + "learning_rate": 0.001999938331041645, + "loss": 0.2363, + "step": 1355 + }, + { + "epoch": 0.011770731156847597, + "grad_norm": 0.09033203125, + "learning_rate": 0.0019999379821413507, + "loss": 0.2461, + "step": 1356 + }, + { + "epoch": 0.011779411637051762, + "grad_norm": 0.08642578125, + "learning_rate": 0.0019999376322568977, + "loss": 0.25, + "step": 1357 + }, + { + "epoch": 0.011788092117255927, + "grad_norm": 0.1015625, + "learning_rate": 0.001999937281388286, + "loss": 0.2598, + "step": 1358 + }, + { + "epoch": 0.011796772597460092, + "grad_norm": 0.08984375, + "learning_rate": 0.0019999369295355166, + "loss": 0.2539, + "step": 1359 + }, + { + "epoch": 0.011805453077664256, + "grad_norm": 0.13671875, + "learning_rate": 0.00199993657669859, + "loss": 0.2285, + "step": 1360 + }, + { + "epoch": 0.01181413355786842, + "grad_norm": 0.08056640625, + "learning_rate": 0.001999936222877506, + "loss": 0.2812, + "step": 1361 + }, + { + "epoch": 0.011822814038072586, + "grad_norm": 0.0966796875, + "learning_rate": 0.001999935868072265, + "loss": 0.2637, + "step": 1362 + }, + { + "epoch": 0.011831494518276751, + "grad_norm": 0.08740234375, + "learning_rate": 0.001999935512282868, + "loss": 0.2695, + "step": 1363 + }, + { + "epoch": 0.011840174998480916, + "grad_norm": 0.07568359375, + "learning_rate": 0.001999935155509315, + "loss": 0.2422, + "step": 1364 + }, + { + "epoch": 0.011848855478685081, + "grad_norm": 0.0791015625, + "learning_rate": 0.0019999347977516066, + "loss": 0.2559, + "step": 1365 + }, + { + "epoch": 0.011857535958889246, + "grad_norm": 0.06982421875, + "learning_rate": 0.0019999344390097423, + "loss": 0.2158, + "step": 1366 + }, + { + "epoch": 0.01186621643909341, + "grad_norm": 0.095703125, + "learning_rate": 0.0019999340792837238, + "loss": 0.3008, + "step": 1367 + }, 
+ { + "epoch": 0.011874896919297575, + "grad_norm": 0.068359375, + "learning_rate": 0.001999933718573551, + "loss": 0.2207, + "step": 1368 + }, + { + "epoch": 0.01188357739950174, + "grad_norm": 0.08447265625, + "learning_rate": 0.0019999333568792243, + "loss": 0.1973, + "step": 1369 + }, + { + "epoch": 0.011892257879705905, + "grad_norm": 0.08544921875, + "learning_rate": 0.0019999329942007437, + "loss": 0.2295, + "step": 1370 + }, + { + "epoch": 0.01190093835991007, + "grad_norm": 0.1142578125, + "learning_rate": 0.00199993263053811, + "loss": 0.2734, + "step": 1371 + }, + { + "epoch": 0.011909618840114235, + "grad_norm": 0.09521484375, + "learning_rate": 0.001999932265891323, + "loss": 0.252, + "step": 1372 + }, + { + "epoch": 0.0119182993203184, + "grad_norm": 0.1328125, + "learning_rate": 0.0019999319002603844, + "loss": 0.2402, + "step": 1373 + }, + { + "epoch": 0.011926979800522566, + "grad_norm": 0.1240234375, + "learning_rate": 0.0019999315336452933, + "loss": 0.2812, + "step": 1374 + }, + { + "epoch": 0.011935660280726729, + "grad_norm": 0.08447265625, + "learning_rate": 0.001999931166046051, + "loss": 0.2432, + "step": 1375 + }, + { + "epoch": 0.011944340760930894, + "grad_norm": 0.10498046875, + "learning_rate": 0.001999930797462657, + "loss": 0.2148, + "step": 1376 + }, + { + "epoch": 0.01195302124113506, + "grad_norm": 0.0859375, + "learning_rate": 0.0019999304278951126, + "loss": 0.2324, + "step": 1377 + }, + { + "epoch": 0.011961701721339224, + "grad_norm": 0.0927734375, + "learning_rate": 0.001999930057343418, + "loss": 0.2402, + "step": 1378 + }, + { + "epoch": 0.01197038220154339, + "grad_norm": 0.0859375, + "learning_rate": 0.0019999296858075727, + "loss": 0.2852, + "step": 1379 + }, + { + "epoch": 0.011979062681747555, + "grad_norm": 0.09814453125, + "learning_rate": 0.0019999293132875783, + "loss": 0.2344, + "step": 1380 + }, + { + "epoch": 0.01198774316195172, + "grad_norm": 0.142578125, + "learning_rate": 0.0019999289397834344, + "loss": 0.3809, + "step": 1381 + }, + { + "epoch": 0.011996423642155885, + "grad_norm": 0.0869140625, + "learning_rate": 0.001999928565295142, + "loss": 0.2051, + "step": 1382 + }, + { + "epoch": 0.012005104122360048, + "grad_norm": 0.0908203125, + "learning_rate": 0.001999928189822701, + "loss": 0.2402, + "step": 1383 + }, + { + "epoch": 0.012013784602564213, + "grad_norm": 0.0771484375, + "learning_rate": 0.0019999278133661126, + "loss": 0.2012, + "step": 1384 + }, + { + "epoch": 0.012022465082768378, + "grad_norm": 0.0888671875, + "learning_rate": 0.001999927435925376, + "loss": 0.2363, + "step": 1385 + }, + { + "epoch": 0.012031145562972544, + "grad_norm": 0.07861328125, + "learning_rate": 0.0019999270575004925, + "loss": 0.25, + "step": 1386 + }, + { + "epoch": 0.012039826043176709, + "grad_norm": 0.34375, + "learning_rate": 0.0019999266780914623, + "loss": 0.3281, + "step": 1387 + }, + { + "epoch": 0.012048506523380874, + "grad_norm": 0.08251953125, + "learning_rate": 0.001999926297698286, + "loss": 0.252, + "step": 1388 + }, + { + "epoch": 0.012057187003585039, + "grad_norm": 0.08349609375, + "learning_rate": 0.0019999259163209636, + "loss": 0.2715, + "step": 1389 + }, + { + "epoch": 0.012065867483789202, + "grad_norm": 0.07568359375, + "learning_rate": 0.001999925533959496, + "loss": 0.2148, + "step": 1390 + }, + { + "epoch": 0.012074547963993367, + "grad_norm": 0.07373046875, + "learning_rate": 0.0019999251506138834, + "loss": 0.2402, + "step": 1391 + }, + { + "epoch": 0.012083228444197533, + "grad_norm": 0.06396484375, + 
"learning_rate": 0.0019999247662841257, + "loss": 0.2324, + "step": 1392 + }, + { + "epoch": 0.012091908924401698, + "grad_norm": 0.13671875, + "learning_rate": 0.0019999243809702243, + "loss": 0.3066, + "step": 1393 + }, + { + "epoch": 0.012100589404605863, + "grad_norm": 0.09326171875, + "learning_rate": 0.001999923994672179, + "loss": 0.3828, + "step": 1394 + }, + { + "epoch": 0.012109269884810028, + "grad_norm": 0.0673828125, + "learning_rate": 0.00199992360738999, + "loss": 0.2207, + "step": 1395 + }, + { + "epoch": 0.012117950365014193, + "grad_norm": 0.07958984375, + "learning_rate": 0.0019999232191236583, + "loss": 0.252, + "step": 1396 + }, + { + "epoch": 0.012126630845218358, + "grad_norm": 0.06591796875, + "learning_rate": 0.0019999228298731844, + "loss": 0.1924, + "step": 1397 + }, + { + "epoch": 0.012135311325422522, + "grad_norm": 0.0654296875, + "learning_rate": 0.0019999224396385676, + "loss": 0.2207, + "step": 1398 + }, + { + "epoch": 0.012143991805626687, + "grad_norm": 0.0693359375, + "learning_rate": 0.00199992204841981, + "loss": 0.2363, + "step": 1399 + }, + { + "epoch": 0.012152672285830852, + "grad_norm": 0.09912109375, + "learning_rate": 0.0019999216562169107, + "loss": 0.3672, + "step": 1400 + }, + { + "epoch": 0.012161352766035017, + "grad_norm": 0.10986328125, + "learning_rate": 0.0019999212630298704, + "loss": 0.3086, + "step": 1401 + }, + { + "epoch": 0.012170033246239182, + "grad_norm": 0.08935546875, + "learning_rate": 0.0019999208688586904, + "loss": 0.25, + "step": 1402 + }, + { + "epoch": 0.012178713726443347, + "grad_norm": 0.09033203125, + "learning_rate": 0.00199992047370337, + "loss": 0.2695, + "step": 1403 + }, + { + "epoch": 0.012187394206647512, + "grad_norm": 0.08544921875, + "learning_rate": 0.00199992007756391, + "loss": 0.2324, + "step": 1404 + }, + { + "epoch": 0.012196074686851677, + "grad_norm": 0.07177734375, + "learning_rate": 0.001999919680440311, + "loss": 0.2598, + "step": 1405 + }, + { + "epoch": 0.01220475516705584, + "grad_norm": 0.06640625, + "learning_rate": 0.0019999192823325737, + "loss": 0.2637, + "step": 1406 + }, + { + "epoch": 0.012213435647260006, + "grad_norm": 0.0810546875, + "learning_rate": 0.001999918883240698, + "loss": 0.1895, + "step": 1407 + }, + { + "epoch": 0.012222116127464171, + "grad_norm": 0.07568359375, + "learning_rate": 0.0019999184831646847, + "loss": 0.2578, + "step": 1408 + }, + { + "epoch": 0.012230796607668336, + "grad_norm": 0.0986328125, + "learning_rate": 0.0019999180821045335, + "loss": 0.2598, + "step": 1409 + }, + { + "epoch": 0.012239477087872501, + "grad_norm": 0.06494140625, + "learning_rate": 0.001999917680060246, + "loss": 0.1953, + "step": 1410 + }, + { + "epoch": 0.012248157568076666, + "grad_norm": 0.1044921875, + "learning_rate": 0.001999917277031822, + "loss": 0.2676, + "step": 1411 + }, + { + "epoch": 0.012256838048280832, + "grad_norm": 0.1025390625, + "learning_rate": 0.0019999168730192615, + "loss": 0.334, + "step": 1412 + }, + { + "epoch": 0.012265518528484997, + "grad_norm": 0.07666015625, + "learning_rate": 0.001999916468022566, + "loss": 0.2871, + "step": 1413 + }, + { + "epoch": 0.01227419900868916, + "grad_norm": 0.099609375, + "learning_rate": 0.001999916062041735, + "loss": 0.3027, + "step": 1414 + }, + { + "epoch": 0.012282879488893325, + "grad_norm": 0.10009765625, + "learning_rate": 0.0019999156550767694, + "loss": 0.2539, + "step": 1415 + }, + { + "epoch": 0.01229155996909749, + "grad_norm": 0.08935546875, + "learning_rate": 0.0019999152471276696, + "loss": 0.2441, + 
"step": 1416 + }, + { + "epoch": 0.012300240449301655, + "grad_norm": 0.07421875, + "learning_rate": 0.0019999148381944355, + "loss": 0.2598, + "step": 1417 + }, + { + "epoch": 0.01230892092950582, + "grad_norm": 0.0888671875, + "learning_rate": 0.001999914428277069, + "loss": 0.2734, + "step": 1418 + }, + { + "epoch": 0.012317601409709986, + "grad_norm": 0.09716796875, + "learning_rate": 0.0019999140173755686, + "loss": 0.2578, + "step": 1419 + }, + { + "epoch": 0.01232628188991415, + "grad_norm": 0.080078125, + "learning_rate": 0.0019999136054899367, + "loss": 0.2266, + "step": 1420 + }, + { + "epoch": 0.012334962370118314, + "grad_norm": 0.062255859375, + "learning_rate": 0.0019999131926201723, + "loss": 0.2217, + "step": 1421 + }, + { + "epoch": 0.01234364285032248, + "grad_norm": 0.078125, + "learning_rate": 0.0019999127787662767, + "loss": 0.2832, + "step": 1422 + }, + { + "epoch": 0.012352323330526644, + "grad_norm": 0.08837890625, + "learning_rate": 0.0019999123639282495, + "loss": 0.2715, + "step": 1423 + }, + { + "epoch": 0.01236100381073081, + "grad_norm": 0.091796875, + "learning_rate": 0.001999911948106092, + "loss": 0.2617, + "step": 1424 + }, + { + "epoch": 0.012369684290934975, + "grad_norm": 0.07177734375, + "learning_rate": 0.001999911531299804, + "loss": 0.2109, + "step": 1425 + }, + { + "epoch": 0.01237836477113914, + "grad_norm": 0.056396484375, + "learning_rate": 0.0019999111135093864, + "loss": 0.1924, + "step": 1426 + }, + { + "epoch": 0.012387045251343305, + "grad_norm": 0.06787109375, + "learning_rate": 0.00199991069473484, + "loss": 0.1992, + "step": 1427 + }, + { + "epoch": 0.01239572573154747, + "grad_norm": 0.09423828125, + "learning_rate": 0.0019999102749761644, + "loss": 0.2578, + "step": 1428 + }, + { + "epoch": 0.012404406211751633, + "grad_norm": 0.10009765625, + "learning_rate": 0.00199990985423336, + "loss": 0.25, + "step": 1429 + }, + { + "epoch": 0.012413086691955798, + "grad_norm": 0.08984375, + "learning_rate": 0.0019999094325064285, + "loss": 0.2891, + "step": 1430 + }, + { + "epoch": 0.012421767172159964, + "grad_norm": 0.10986328125, + "learning_rate": 0.001999909009795369, + "loss": 0.2812, + "step": 1431 + }, + { + "epoch": 0.012430447652364129, + "grad_norm": 0.14453125, + "learning_rate": 0.001999908586100183, + "loss": 0.3008, + "step": 1432 + }, + { + "epoch": 0.012439128132568294, + "grad_norm": 0.08349609375, + "learning_rate": 0.00199990816142087, + "loss": 0.2559, + "step": 1433 + }, + { + "epoch": 0.012447808612772459, + "grad_norm": 0.08056640625, + "learning_rate": 0.001999907735757431, + "loss": 0.2217, + "step": 1434 + }, + { + "epoch": 0.012456489092976624, + "grad_norm": 0.0869140625, + "learning_rate": 0.001999907309109867, + "loss": 0.2676, + "step": 1435 + }, + { + "epoch": 0.01246516957318079, + "grad_norm": 0.115234375, + "learning_rate": 0.0019999068814781774, + "loss": 0.3359, + "step": 1436 + }, + { + "epoch": 0.012473850053384953, + "grad_norm": 0.09912109375, + "learning_rate": 0.001999906452862363, + "loss": 0.2041, + "step": 1437 + }, + { + "epoch": 0.012482530533589118, + "grad_norm": 0.1298828125, + "learning_rate": 0.001999906023262425, + "loss": 0.25, + "step": 1438 + }, + { + "epoch": 0.012491211013793283, + "grad_norm": 0.08447265625, + "learning_rate": 0.0019999055926783627, + "loss": 0.2021, + "step": 1439 + }, + { + "epoch": 0.012499891493997448, + "grad_norm": 0.0810546875, + "learning_rate": 0.0019999051611101775, + "loss": 0.2031, + "step": 1440 + }, + { + "epoch": 0.012508571974201613, + "grad_norm": 
0.068359375, + "learning_rate": 0.0019999047285578697, + "loss": 0.2324, + "step": 1441 + }, + { + "epoch": 0.012517252454405778, + "grad_norm": 0.08154296875, + "learning_rate": 0.0019999042950214394, + "loss": 0.2539, + "step": 1442 + }, + { + "epoch": 0.012525932934609943, + "grad_norm": 0.0859375, + "learning_rate": 0.001999903860500887, + "loss": 0.2715, + "step": 1443 + }, + { + "epoch": 0.012534613414814107, + "grad_norm": 0.0625, + "learning_rate": 0.0019999034249962135, + "loss": 0.1484, + "step": 1444 + }, + { + "epoch": 0.012543293895018272, + "grad_norm": 0.06494140625, + "learning_rate": 0.001999902988507419, + "loss": 0.2256, + "step": 1445 + }, + { + "epoch": 0.012551974375222437, + "grad_norm": 0.1103515625, + "learning_rate": 0.0019999025510345045, + "loss": 0.2207, + "step": 1446 + }, + { + "epoch": 0.012560654855426602, + "grad_norm": 0.08837890625, + "learning_rate": 0.00199990211257747, + "loss": 0.2422, + "step": 1447 + }, + { + "epoch": 0.012569335335630767, + "grad_norm": 0.058837890625, + "learning_rate": 0.001999901673136316, + "loss": 0.2314, + "step": 1448 + }, + { + "epoch": 0.012578015815834932, + "grad_norm": 0.08203125, + "learning_rate": 0.0019999012327110433, + "loss": 0.208, + "step": 1449 + }, + { + "epoch": 0.012586696296039097, + "grad_norm": 0.0908203125, + "learning_rate": 0.0019999007913016516, + "loss": 0.2715, + "step": 1450 + }, + { + "epoch": 0.012595376776243263, + "grad_norm": 0.28125, + "learning_rate": 0.001999900348908142, + "loss": 0.3086, + "step": 1451 + }, + { + "epoch": 0.012604057256447426, + "grad_norm": 0.0703125, + "learning_rate": 0.001999899905530515, + "loss": 0.2344, + "step": 1452 + }, + { + "epoch": 0.012612737736651591, + "grad_norm": 0.11279296875, + "learning_rate": 0.001999899461168771, + "loss": 0.332, + "step": 1453 + }, + { + "epoch": 0.012621418216855756, + "grad_norm": 0.10400390625, + "learning_rate": 0.0019998990158229106, + "loss": 0.2891, + "step": 1454 + }, + { + "epoch": 0.012630098697059921, + "grad_norm": 0.08154296875, + "learning_rate": 0.001999898569492934, + "loss": 0.2969, + "step": 1455 + }, + { + "epoch": 0.012638779177264086, + "grad_norm": 0.08154296875, + "learning_rate": 0.001999898122178842, + "loss": 0.2051, + "step": 1456 + }, + { + "epoch": 0.012647459657468252, + "grad_norm": 0.09521484375, + "learning_rate": 0.0019998976738806345, + "loss": 0.2539, + "step": 1457 + }, + { + "epoch": 0.012656140137672417, + "grad_norm": 0.09033203125, + "learning_rate": 0.001999897224598313, + "loss": 0.2344, + "step": 1458 + }, + { + "epoch": 0.012664820617876582, + "grad_norm": 0.06787109375, + "learning_rate": 0.0019998967743318774, + "loss": 0.252, + "step": 1459 + }, + { + "epoch": 0.012673501098080745, + "grad_norm": 0.08740234375, + "learning_rate": 0.001999896323081328, + "loss": 0.2324, + "step": 1460 + }, + { + "epoch": 0.01268218157828491, + "grad_norm": 0.09814453125, + "learning_rate": 0.0019998958708466654, + "loss": 0.3281, + "step": 1461 + }, + { + "epoch": 0.012690862058489075, + "grad_norm": 0.08935546875, + "learning_rate": 0.00199989541762789, + "loss": 0.2461, + "step": 1462 + }, + { + "epoch": 0.01269954253869324, + "grad_norm": 0.10205078125, + "learning_rate": 0.001999894963425003, + "loss": 0.3086, + "step": 1463 + }, + { + "epoch": 0.012708223018897406, + "grad_norm": 0.06640625, + "learning_rate": 0.001999894508238004, + "loss": 0.1689, + "step": 1464 + }, + { + "epoch": 0.01271690349910157, + "grad_norm": 0.08251953125, + "learning_rate": 0.001999894052066894, + "loss": 0.2852, + 
"step": 1465 + }, + { + "epoch": 0.012725583979305736, + "grad_norm": 0.0625, + "learning_rate": 0.0019998935949116737, + "loss": 0.2129, + "step": 1466 + }, + { + "epoch": 0.0127342644595099, + "grad_norm": 0.10107421875, + "learning_rate": 0.0019998931367723426, + "loss": 0.3242, + "step": 1467 + }, + { + "epoch": 0.012742944939714064, + "grad_norm": 0.06787109375, + "learning_rate": 0.0019998926776489028, + "loss": 0.2441, + "step": 1468 + }, + { + "epoch": 0.01275162541991823, + "grad_norm": 0.08447265625, + "learning_rate": 0.001999892217541353, + "loss": 0.2256, + "step": 1469 + }, + { + "epoch": 0.012760305900122395, + "grad_norm": 0.09765625, + "learning_rate": 0.001999891756449695, + "loss": 0.2539, + "step": 1470 + }, + { + "epoch": 0.01276898638032656, + "grad_norm": 0.08740234375, + "learning_rate": 0.001999891294373929, + "loss": 0.2578, + "step": 1471 + }, + { + "epoch": 0.012777666860530725, + "grad_norm": 0.078125, + "learning_rate": 0.001999890831314055, + "loss": 0.2383, + "step": 1472 + }, + { + "epoch": 0.01278634734073489, + "grad_norm": 0.09033203125, + "learning_rate": 0.0019998903672700744, + "loss": 0.1992, + "step": 1473 + }, + { + "epoch": 0.012795027820939055, + "grad_norm": 0.08935546875, + "learning_rate": 0.001999889902241987, + "loss": 0.2656, + "step": 1474 + }, + { + "epoch": 0.012803708301143218, + "grad_norm": 0.095703125, + "learning_rate": 0.0019998894362297935, + "loss": 0.252, + "step": 1475 + }, + { + "epoch": 0.012812388781347384, + "grad_norm": 0.119140625, + "learning_rate": 0.0019998889692334947, + "loss": 0.3223, + "step": 1476 + }, + { + "epoch": 0.012821069261551549, + "grad_norm": 0.0849609375, + "learning_rate": 0.0019998885012530903, + "loss": 0.25, + "step": 1477 + }, + { + "epoch": 0.012829749741755714, + "grad_norm": 0.08984375, + "learning_rate": 0.001999888032288582, + "loss": 0.1875, + "step": 1478 + }, + { + "epoch": 0.012838430221959879, + "grad_norm": 0.0693359375, + "learning_rate": 0.001999887562339969, + "loss": 0.1943, + "step": 1479 + }, + { + "epoch": 0.012847110702164044, + "grad_norm": 0.13671875, + "learning_rate": 0.001999887091407253, + "loss": 0.2422, + "step": 1480 + }, + { + "epoch": 0.01285579118236821, + "grad_norm": 0.0732421875, + "learning_rate": 0.001999886619490434, + "loss": 0.2129, + "step": 1481 + }, + { + "epoch": 0.012864471662572374, + "grad_norm": 0.10888671875, + "learning_rate": 0.001999886146589512, + "loss": 0.2676, + "step": 1482 + }, + { + "epoch": 0.012873152142776538, + "grad_norm": 0.08447265625, + "learning_rate": 0.0019998856727044883, + "loss": 0.2246, + "step": 1483 + }, + { + "epoch": 0.012881832622980703, + "grad_norm": 0.0810546875, + "learning_rate": 0.0019998851978353634, + "loss": 0.2988, + "step": 1484 + }, + { + "epoch": 0.012890513103184868, + "grad_norm": 0.09814453125, + "learning_rate": 0.0019998847219821377, + "loss": 0.2344, + "step": 1485 + }, + { + "epoch": 0.012899193583389033, + "grad_norm": 0.140625, + "learning_rate": 0.001999884245144811, + "loss": 0.2539, + "step": 1486 + }, + { + "epoch": 0.012907874063593198, + "grad_norm": 0.10107421875, + "learning_rate": 0.0019998837673233846, + "loss": 0.2324, + "step": 1487 + }, + { + "epoch": 0.012916554543797363, + "grad_norm": 0.0693359375, + "learning_rate": 0.001999883288517859, + "loss": 0.2461, + "step": 1488 + }, + { + "epoch": 0.012925235024001528, + "grad_norm": 0.08203125, + "learning_rate": 0.0019998828087282343, + "loss": 0.2432, + "step": 1489 + }, + { + "epoch": 0.012933915504205694, + "grad_norm": 0.06689453125, 
+ "learning_rate": 0.0019998823279545113, + "loss": 0.25, + "step": 1490 + }, + { + "epoch": 0.012942595984409857, + "grad_norm": 0.076171875, + "learning_rate": 0.0019998818461966906, + "loss": 0.2148, + "step": 1491 + }, + { + "epoch": 0.012951276464614022, + "grad_norm": 0.1083984375, + "learning_rate": 0.001999881363454773, + "loss": 0.3281, + "step": 1492 + }, + { + "epoch": 0.012959956944818187, + "grad_norm": 0.10986328125, + "learning_rate": 0.001999880879728758, + "loss": 0.2969, + "step": 1493 + }, + { + "epoch": 0.012968637425022352, + "grad_norm": 0.0791015625, + "learning_rate": 0.0019998803950186475, + "loss": 0.2285, + "step": 1494 + }, + { + "epoch": 0.012977317905226517, + "grad_norm": 0.058349609375, + "learning_rate": 0.001999879909324441, + "loss": 0.1855, + "step": 1495 + }, + { + "epoch": 0.012985998385430682, + "grad_norm": 0.5703125, + "learning_rate": 0.001999879422646139, + "loss": 0.5859, + "step": 1496 + }, + { + "epoch": 0.012994678865634848, + "grad_norm": 0.103515625, + "learning_rate": 0.001999878934983743, + "loss": 0.2129, + "step": 1497 + }, + { + "epoch": 0.013003359345839011, + "grad_norm": 0.06787109375, + "learning_rate": 0.0019998784463372524, + "loss": 0.1992, + "step": 1498 + }, + { + "epoch": 0.013012039826043176, + "grad_norm": 0.08203125, + "learning_rate": 0.0019998779567066685, + "loss": 0.1768, + "step": 1499 + }, + { + "epoch": 0.013020720306247341, + "grad_norm": 0.10546875, + "learning_rate": 0.0019998774660919916, + "loss": 0.248, + "step": 1500 + }, + { + "epoch": 0.013029400786451506, + "grad_norm": 0.220703125, + "learning_rate": 0.001999876974493222, + "loss": 0.332, + "step": 1501 + }, + { + "epoch": 0.013038081266655671, + "grad_norm": 0.09716796875, + "learning_rate": 0.001999876481910361, + "loss": 0.2617, + "step": 1502 + }, + { + "epoch": 0.013046761746859837, + "grad_norm": 0.1044921875, + "learning_rate": 0.001999875988343408, + "loss": 0.252, + "step": 1503 + }, + { + "epoch": 0.013055442227064002, + "grad_norm": 0.08349609375, + "learning_rate": 0.001999875493792364, + "loss": 0.2129, + "step": 1504 + }, + { + "epoch": 0.013064122707268167, + "grad_norm": 0.07958984375, + "learning_rate": 0.0019998749982572304, + "loss": 0.2002, + "step": 1505 + }, + { + "epoch": 0.01307280318747233, + "grad_norm": 0.09423828125, + "learning_rate": 0.0019998745017380066, + "loss": 0.2734, + "step": 1506 + }, + { + "epoch": 0.013081483667676495, + "grad_norm": 0.08544921875, + "learning_rate": 0.0019998740042346938, + "loss": 0.2461, + "step": 1507 + }, + { + "epoch": 0.01309016414788066, + "grad_norm": 0.07373046875, + "learning_rate": 0.0019998735057472922, + "loss": 0.2148, + "step": 1508 + }, + { + "epoch": 0.013098844628084826, + "grad_norm": 0.07421875, + "learning_rate": 0.0019998730062758025, + "loss": 0.21, + "step": 1509 + }, + { + "epoch": 0.01310752510828899, + "grad_norm": 0.076171875, + "learning_rate": 0.001999872505820225, + "loss": 0.2461, + "step": 1510 + }, + { + "epoch": 0.013116205588493156, + "grad_norm": 0.0732421875, + "learning_rate": 0.0019998720043805603, + "loss": 0.2539, + "step": 1511 + }, + { + "epoch": 0.013124886068697321, + "grad_norm": 0.0810546875, + "learning_rate": 0.0019998715019568093, + "loss": 0.2656, + "step": 1512 + }, + { + "epoch": 0.013133566548901486, + "grad_norm": 0.0712890625, + "learning_rate": 0.0019998709985489726, + "loss": 0.2236, + "step": 1513 + }, + { + "epoch": 0.01314224702910565, + "grad_norm": 0.07373046875, + "learning_rate": 0.0019998704941570503, + "loss": 0.1973, + "step": 
1514 + }, + { + "epoch": 0.013150927509309815, + "grad_norm": 0.1181640625, + "learning_rate": 0.001999869988781043, + "loss": 0.2598, + "step": 1515 + }, + { + "epoch": 0.01315960798951398, + "grad_norm": 0.134765625, + "learning_rate": 0.0019998694824209517, + "loss": 0.2695, + "step": 1516 + }, + { + "epoch": 0.013168288469718145, + "grad_norm": 0.11767578125, + "learning_rate": 0.0019998689750767764, + "loss": 0.2578, + "step": 1517 + }, + { + "epoch": 0.01317696894992231, + "grad_norm": 0.08447265625, + "learning_rate": 0.001999868466748518, + "loss": 0.2363, + "step": 1518 + }, + { + "epoch": 0.013185649430126475, + "grad_norm": 0.09228515625, + "learning_rate": 0.001999867957436177, + "loss": 0.293, + "step": 1519 + }, + { + "epoch": 0.01319432991033064, + "grad_norm": 0.06787109375, + "learning_rate": 0.001999867447139754, + "loss": 0.1914, + "step": 1520 + }, + { + "epoch": 0.013203010390534804, + "grad_norm": 0.09228515625, + "learning_rate": 0.0019998669358592494, + "loss": 0.2451, + "step": 1521 + }, + { + "epoch": 0.013211690870738969, + "grad_norm": 0.09326171875, + "learning_rate": 0.0019998664235946636, + "loss": 0.2334, + "step": 1522 + }, + { + "epoch": 0.013220371350943134, + "grad_norm": 0.09716796875, + "learning_rate": 0.0019998659103459978, + "loss": 0.2617, + "step": 1523 + }, + { + "epoch": 0.013229051831147299, + "grad_norm": 0.06591796875, + "learning_rate": 0.001999865396113252, + "loss": 0.168, + "step": 1524 + }, + { + "epoch": 0.013237732311351464, + "grad_norm": 0.08349609375, + "learning_rate": 0.0019998648808964266, + "loss": 0.248, + "step": 1525 + }, + { + "epoch": 0.01324641279155563, + "grad_norm": 0.0771484375, + "learning_rate": 0.001999864364695523, + "loss": 0.2334, + "step": 1526 + }, + { + "epoch": 0.013255093271759794, + "grad_norm": 0.05859375, + "learning_rate": 0.0019998638475105406, + "loss": 0.1562, + "step": 1527 + }, + { + "epoch": 0.01326377375196396, + "grad_norm": 0.07275390625, + "learning_rate": 0.0019998633293414813, + "loss": 0.2109, + "step": 1528 + }, + { + "epoch": 0.013272454232168123, + "grad_norm": 0.08056640625, + "learning_rate": 0.0019998628101883446, + "loss": 0.1924, + "step": 1529 + }, + { + "epoch": 0.013281134712372288, + "grad_norm": 0.10595703125, + "learning_rate": 0.001999862290051132, + "loss": 0.2324, + "step": 1530 + }, + { + "epoch": 0.013289815192576453, + "grad_norm": 0.0771484375, + "learning_rate": 0.001999861768929843, + "loss": 0.2256, + "step": 1531 + }, + { + "epoch": 0.013298495672780618, + "grad_norm": 0.07373046875, + "learning_rate": 0.0019998612468244787, + "loss": 0.2188, + "step": 1532 + }, + { + "epoch": 0.013307176152984783, + "grad_norm": 0.0693359375, + "learning_rate": 0.00199986072373504, + "loss": 0.1934, + "step": 1533 + }, + { + "epoch": 0.013315856633188948, + "grad_norm": 0.08251953125, + "learning_rate": 0.0019998601996615265, + "loss": 0.2559, + "step": 1534 + }, + { + "epoch": 0.013324537113393113, + "grad_norm": 0.07080078125, + "learning_rate": 0.00199985967460394, + "loss": 0.209, + "step": 1535 + }, + { + "epoch": 0.013333217593597279, + "grad_norm": 0.09375, + "learning_rate": 0.00199985914856228, + "loss": 0.2441, + "step": 1536 + }, + { + "epoch": 0.013341898073801442, + "grad_norm": 0.0849609375, + "learning_rate": 0.001999858621536548, + "loss": 0.2598, + "step": 1537 + }, + { + "epoch": 0.013350578554005607, + "grad_norm": 0.080078125, + "learning_rate": 0.001999858093526744, + "loss": 0.2539, + "step": 1538 + }, + { + "epoch": 0.013359259034209772, + "grad_norm": 
0.08740234375, + "learning_rate": 0.0019998575645328687, + "loss": 0.1963, + "step": 1539 + }, + { + "epoch": 0.013367939514413937, + "grad_norm": 0.0888671875, + "learning_rate": 0.0019998570345549226, + "loss": 0.2266, + "step": 1540 + }, + { + "epoch": 0.013376619994618102, + "grad_norm": 0.10009765625, + "learning_rate": 0.0019998565035929065, + "loss": 0.3203, + "step": 1541 + }, + { + "epoch": 0.013385300474822268, + "grad_norm": 0.07275390625, + "learning_rate": 0.0019998559716468208, + "loss": 0.2109, + "step": 1542 + }, + { + "epoch": 0.013393980955026433, + "grad_norm": 0.0888671875, + "learning_rate": 0.001999855438716666, + "loss": 0.2598, + "step": 1543 + }, + { + "epoch": 0.013402661435230596, + "grad_norm": 0.3671875, + "learning_rate": 0.0019998549048024427, + "loss": 0.3496, + "step": 1544 + }, + { + "epoch": 0.013411341915434761, + "grad_norm": 0.08203125, + "learning_rate": 0.001999854369904152, + "loss": 0.2373, + "step": 1545 + }, + { + "epoch": 0.013420022395638926, + "grad_norm": 0.07275390625, + "learning_rate": 0.001999853834021794, + "loss": 0.209, + "step": 1546 + }, + { + "epoch": 0.013428702875843091, + "grad_norm": 0.1298828125, + "learning_rate": 0.001999853297155369, + "loss": 0.2148, + "step": 1547 + }, + { + "epoch": 0.013437383356047257, + "grad_norm": 0.08349609375, + "learning_rate": 0.0019998527593048784, + "loss": 0.1836, + "step": 1548 + }, + { + "epoch": 0.013446063836251422, + "grad_norm": 0.10498046875, + "learning_rate": 0.0019998522204703224, + "loss": 0.3203, + "step": 1549 + }, + { + "epoch": 0.013454744316455587, + "grad_norm": 0.07861328125, + "learning_rate": 0.001999851680651701, + "loss": 0.1914, + "step": 1550 + }, + { + "epoch": 0.013463424796659752, + "grad_norm": 0.1025390625, + "learning_rate": 0.0019998511398490156, + "loss": 0.2812, + "step": 1551 + }, + { + "epoch": 0.013472105276863915, + "grad_norm": 0.244140625, + "learning_rate": 0.0019998505980622664, + "loss": 0.3457, + "step": 1552 + }, + { + "epoch": 0.01348078575706808, + "grad_norm": 0.08740234375, + "learning_rate": 0.0019998500552914546, + "loss": 0.2598, + "step": 1553 + }, + { + "epoch": 0.013489466237272246, + "grad_norm": 0.1044921875, + "learning_rate": 0.0019998495115365797, + "loss": 0.2246, + "step": 1554 + }, + { + "epoch": 0.01349814671747641, + "grad_norm": 0.12255859375, + "learning_rate": 0.001999848966797643, + "loss": 0.293, + "step": 1555 + }, + { + "epoch": 0.013506827197680576, + "grad_norm": 0.080078125, + "learning_rate": 0.0019998484210746455, + "loss": 0.2305, + "step": 1556 + }, + { + "epoch": 0.013515507677884741, + "grad_norm": 0.091796875, + "learning_rate": 0.0019998478743675865, + "loss": 0.2129, + "step": 1557 + }, + { + "epoch": 0.013524188158088906, + "grad_norm": 0.21875, + "learning_rate": 0.001999847326676468, + "loss": 0.3477, + "step": 1558 + }, + { + "epoch": 0.013532868638293071, + "grad_norm": 0.0693359375, + "learning_rate": 0.0019998467780012897, + "loss": 0.1992, + "step": 1559 + }, + { + "epoch": 0.013541549118497235, + "grad_norm": 0.0927734375, + "learning_rate": 0.0019998462283420527, + "loss": 0.2812, + "step": 1560 + }, + { + "epoch": 0.0135502295987014, + "grad_norm": 0.0732421875, + "learning_rate": 0.001999845677698757, + "loss": 0.2383, + "step": 1561 + }, + { + "epoch": 0.013558910078905565, + "grad_norm": 0.271484375, + "learning_rate": 0.001999845126071404, + "loss": 0.543, + "step": 1562 + }, + { + "epoch": 0.01356759055910973, + "grad_norm": 0.07177734375, + "learning_rate": 0.0019998445734599937, + "loss": 
0.2324, + "step": 1563 + }, + { + "epoch": 0.013576271039313895, + "grad_norm": 0.0830078125, + "learning_rate": 0.001999844019864527, + "loss": 0.2422, + "step": 1564 + }, + { + "epoch": 0.01358495151951806, + "grad_norm": 0.1455078125, + "learning_rate": 0.001999843465285004, + "loss": 0.2891, + "step": 1565 + }, + { + "epoch": 0.013593631999722225, + "grad_norm": 0.0673828125, + "learning_rate": 0.001999842909721426, + "loss": 0.209, + "step": 1566 + }, + { + "epoch": 0.01360231247992639, + "grad_norm": 0.109375, + "learning_rate": 0.0019998423531737935, + "loss": 0.3027, + "step": 1567 + }, + { + "epoch": 0.013610992960130554, + "grad_norm": 0.08447265625, + "learning_rate": 0.001999841795642107, + "loss": 0.2891, + "step": 1568 + }, + { + "epoch": 0.013619673440334719, + "grad_norm": 0.06787109375, + "learning_rate": 0.0019998412371263668, + "loss": 0.2422, + "step": 1569 + }, + { + "epoch": 0.013628353920538884, + "grad_norm": 0.12353515625, + "learning_rate": 0.0019998406776265735, + "loss": 0.3398, + "step": 1570 + }, + { + "epoch": 0.013637034400743049, + "grad_norm": 0.0966796875, + "learning_rate": 0.001999840117142728, + "loss": 0.2324, + "step": 1571 + }, + { + "epoch": 0.013645714880947214, + "grad_norm": 0.169921875, + "learning_rate": 0.001999839555674831, + "loss": 0.3418, + "step": 1572 + }, + { + "epoch": 0.01365439536115138, + "grad_norm": 0.09375, + "learning_rate": 0.001999838993222883, + "loss": 0.3398, + "step": 1573 + }, + { + "epoch": 0.013663075841355544, + "grad_norm": 0.072265625, + "learning_rate": 0.0019998384297868848, + "loss": 0.2129, + "step": 1574 + }, + { + "epoch": 0.013671756321559708, + "grad_norm": 0.07666015625, + "learning_rate": 0.0019998378653668368, + "loss": 0.2637, + "step": 1575 + }, + { + "epoch": 0.013680436801763873, + "grad_norm": 0.08740234375, + "learning_rate": 0.0019998372999627395, + "loss": 0.2422, + "step": 1576 + }, + { + "epoch": 0.013689117281968038, + "grad_norm": 0.0859375, + "learning_rate": 0.001999836733574594, + "loss": 0.2285, + "step": 1577 + }, + { + "epoch": 0.013697797762172203, + "grad_norm": 0.0927734375, + "learning_rate": 0.0019998361662023996, + "loss": 0.2246, + "step": 1578 + }, + { + "epoch": 0.013706478242376368, + "grad_norm": 0.10205078125, + "learning_rate": 0.0019998355978461586, + "loss": 0.2578, + "step": 1579 + }, + { + "epoch": 0.013715158722580533, + "grad_norm": 0.0810546875, + "learning_rate": 0.0019998350285058706, + "loss": 0.2188, + "step": 1580 + }, + { + "epoch": 0.013723839202784699, + "grad_norm": 0.0927734375, + "learning_rate": 0.001999834458181537, + "loss": 0.2598, + "step": 1581 + }, + { + "epoch": 0.013732519682988864, + "grad_norm": 0.08740234375, + "learning_rate": 0.0019998338868731573, + "loss": 0.3125, + "step": 1582 + }, + { + "epoch": 0.013741200163193027, + "grad_norm": 0.1455078125, + "learning_rate": 0.0019998333145807333, + "loss": 0.3594, + "step": 1583 + }, + { + "epoch": 0.013749880643397192, + "grad_norm": 0.09033203125, + "learning_rate": 0.0019998327413042654, + "loss": 0.2178, + "step": 1584 + }, + { + "epoch": 0.013758561123601357, + "grad_norm": 0.07373046875, + "learning_rate": 0.001999832167043753, + "loss": 0.1699, + "step": 1585 + }, + { + "epoch": 0.013767241603805522, + "grad_norm": 0.08349609375, + "learning_rate": 0.0019998315917991983, + "loss": 0.2402, + "step": 1586 + }, + { + "epoch": 0.013775922084009688, + "grad_norm": 0.07177734375, + "learning_rate": 0.0019998310155706013, + "loss": 0.1895, + "step": 1587 + }, + { + "epoch": 0.013784602564213853, + 
"grad_norm": 0.07373046875, + "learning_rate": 0.0019998304383579625, + "loss": 0.2227, + "step": 1588 + }, + { + "epoch": 0.013793283044418018, + "grad_norm": 0.06640625, + "learning_rate": 0.0019998298601612828, + "loss": 0.2012, + "step": 1589 + }, + { + "epoch": 0.013801963524622183, + "grad_norm": 0.078125, + "learning_rate": 0.001999829280980562, + "loss": 0.2109, + "step": 1590 + }, + { + "epoch": 0.013810644004826346, + "grad_norm": 0.072265625, + "learning_rate": 0.001999828700815802, + "loss": 0.2383, + "step": 1591 + }, + { + "epoch": 0.013819324485030511, + "grad_norm": 0.08740234375, + "learning_rate": 0.001999828119667003, + "loss": 0.2539, + "step": 1592 + }, + { + "epoch": 0.013828004965234677, + "grad_norm": 0.08251953125, + "learning_rate": 0.001999827537534165, + "loss": 0.2617, + "step": 1593 + }, + { + "epoch": 0.013836685445438842, + "grad_norm": 0.06298828125, + "learning_rate": 0.0019998269544172897, + "loss": 0.2422, + "step": 1594 + }, + { + "epoch": 0.013845365925643007, + "grad_norm": 0.06591796875, + "learning_rate": 0.0019998263703163766, + "loss": 0.2656, + "step": 1595 + }, + { + "epoch": 0.013854046405847172, + "grad_norm": 0.061767578125, + "learning_rate": 0.0019998257852314274, + "loss": 0.2168, + "step": 1596 + }, + { + "epoch": 0.013862726886051337, + "grad_norm": 0.076171875, + "learning_rate": 0.001999825199162442, + "loss": 0.2598, + "step": 1597 + }, + { + "epoch": 0.0138714073662555, + "grad_norm": 0.1044921875, + "learning_rate": 0.001999824612109421, + "loss": 0.248, + "step": 1598 + }, + { + "epoch": 0.013880087846459666, + "grad_norm": 0.0927734375, + "learning_rate": 0.001999824024072366, + "loss": 0.2871, + "step": 1599 + }, + { + "epoch": 0.01388876832666383, + "grad_norm": 0.08935546875, + "learning_rate": 0.0019998234350512762, + "loss": 0.2871, + "step": 1600 + }, + { + "epoch": 0.013897448806867996, + "grad_norm": 0.0654296875, + "learning_rate": 0.0019998228450461533, + "loss": 0.25, + "step": 1601 + }, + { + "epoch": 0.013906129287072161, + "grad_norm": 0.07861328125, + "learning_rate": 0.0019998222540569977, + "loss": 0.2168, + "step": 1602 + }, + { + "epoch": 0.013914809767276326, + "grad_norm": 0.10791015625, + "learning_rate": 0.0019998216620838102, + "loss": 0.2793, + "step": 1603 + }, + { + "epoch": 0.013923490247480491, + "grad_norm": 0.0888671875, + "learning_rate": 0.0019998210691265913, + "loss": 0.2207, + "step": 1604 + }, + { + "epoch": 0.013932170727684656, + "grad_norm": 0.0947265625, + "learning_rate": 0.0019998204751853414, + "loss": 0.291, + "step": 1605 + }, + { + "epoch": 0.01394085120788882, + "grad_norm": 0.0869140625, + "learning_rate": 0.0019998198802600614, + "loss": 0.2441, + "step": 1606 + }, + { + "epoch": 0.013949531688092985, + "grad_norm": 0.07861328125, + "learning_rate": 0.001999819284350752, + "loss": 0.2559, + "step": 1607 + }, + { + "epoch": 0.01395821216829715, + "grad_norm": 0.08935546875, + "learning_rate": 0.001999818687457413, + "loss": 0.252, + "step": 1608 + }, + { + "epoch": 0.013966892648501315, + "grad_norm": 0.06640625, + "learning_rate": 0.0019998180895800465, + "loss": 0.1865, + "step": 1609 + }, + { + "epoch": 0.01397557312870548, + "grad_norm": 0.115234375, + "learning_rate": 0.0019998174907186524, + "loss": 0.2539, + "step": 1610 + }, + { + "epoch": 0.013984253608909645, + "grad_norm": 0.07568359375, + "learning_rate": 0.0019998168908732317, + "loss": 0.2324, + "step": 1611 + }, + { + "epoch": 0.01399293408911381, + "grad_norm": 0.08984375, + "learning_rate": 0.0019998162900437843, + 
"loss": 0.2402, + "step": 1612 + }, + { + "epoch": 0.014001614569317975, + "grad_norm": 0.10791015625, + "learning_rate": 0.0019998156882303116, + "loss": 0.2812, + "step": 1613 + }, + { + "epoch": 0.014010295049522139, + "grad_norm": 0.08544921875, + "learning_rate": 0.0019998150854328134, + "loss": 0.2246, + "step": 1614 + }, + { + "epoch": 0.014018975529726304, + "grad_norm": 0.09375, + "learning_rate": 0.0019998144816512917, + "loss": 0.2637, + "step": 1615 + }, + { + "epoch": 0.014027656009930469, + "grad_norm": 0.08642578125, + "learning_rate": 0.0019998138768857463, + "loss": 0.2656, + "step": 1616 + }, + { + "epoch": 0.014036336490134634, + "grad_norm": 0.07275390625, + "learning_rate": 0.0019998132711361778, + "loss": 0.25, + "step": 1617 + }, + { + "epoch": 0.0140450169703388, + "grad_norm": 0.10986328125, + "learning_rate": 0.0019998126644025865, + "loss": 0.2578, + "step": 1618 + }, + { + "epoch": 0.014053697450542964, + "grad_norm": 0.0810546875, + "learning_rate": 0.001999812056684974, + "loss": 0.2812, + "step": 1619 + }, + { + "epoch": 0.01406237793074713, + "grad_norm": 0.07666015625, + "learning_rate": 0.0019998114479833407, + "loss": 0.1621, + "step": 1620 + }, + { + "epoch": 0.014071058410951293, + "grad_norm": 0.08984375, + "learning_rate": 0.0019998108382976868, + "loss": 0.2383, + "step": 1621 + }, + { + "epoch": 0.014079738891155458, + "grad_norm": 0.06591796875, + "learning_rate": 0.001999810227628014, + "loss": 0.2246, + "step": 1622 + }, + { + "epoch": 0.014088419371359623, + "grad_norm": 0.08642578125, + "learning_rate": 0.0019998096159743214, + "loss": 0.2314, + "step": 1623 + }, + { + "epoch": 0.014097099851563788, + "grad_norm": 0.08984375, + "learning_rate": 0.001999809003336611, + "loss": 0.2637, + "step": 1624 + }, + { + "epoch": 0.014105780331767953, + "grad_norm": 0.07958984375, + "learning_rate": 0.0019998083897148832, + "loss": 0.2246, + "step": 1625 + }, + { + "epoch": 0.014114460811972119, + "grad_norm": 0.08984375, + "learning_rate": 0.001999807775109138, + "loss": 0.2324, + "step": 1626 + }, + { + "epoch": 0.014123141292176284, + "grad_norm": 0.07275390625, + "learning_rate": 0.0019998071595193766, + "loss": 0.2129, + "step": 1627 + }, + { + "epoch": 0.014131821772380449, + "grad_norm": 0.0810546875, + "learning_rate": 0.0019998065429455997, + "loss": 0.2207, + "step": 1628 + }, + { + "epoch": 0.014140502252584612, + "grad_norm": 0.07373046875, + "learning_rate": 0.001999805925387808, + "loss": 0.2344, + "step": 1629 + }, + { + "epoch": 0.014149182732788777, + "grad_norm": 0.099609375, + "learning_rate": 0.0019998053068460016, + "loss": 0.25, + "step": 1630 + }, + { + "epoch": 0.014157863212992942, + "grad_norm": 0.1171875, + "learning_rate": 0.001999804687320182, + "loss": 0.2207, + "step": 1631 + }, + { + "epoch": 0.014166543693197108, + "grad_norm": 0.078125, + "learning_rate": 0.0019998040668103498, + "loss": 0.2695, + "step": 1632 + }, + { + "epoch": 0.014175224173401273, + "grad_norm": 0.14453125, + "learning_rate": 0.001999803445316505, + "loss": 0.4668, + "step": 1633 + }, + { + "epoch": 0.014183904653605438, + "grad_norm": 0.07568359375, + "learning_rate": 0.0019998028228386485, + "loss": 0.2412, + "step": 1634 + }, + { + "epoch": 0.014192585133809603, + "grad_norm": 0.083984375, + "learning_rate": 0.0019998021993767818, + "loss": 0.2314, + "step": 1635 + }, + { + "epoch": 0.014201265614013768, + "grad_norm": 0.10498046875, + "learning_rate": 0.0019998015749309044, + "loss": 0.2754, + "step": 1636 + }, + { + "epoch": 0.014209946094217931, 
+ "grad_norm": 0.1298828125, + "learning_rate": 0.0019998009495010177, + "loss": 0.293, + "step": 1637 + }, + { + "epoch": 0.014218626574422097, + "grad_norm": 0.1005859375, + "learning_rate": 0.001999800323087122, + "loss": 0.3281, + "step": 1638 + }, + { + "epoch": 0.014227307054626262, + "grad_norm": 0.0654296875, + "learning_rate": 0.0019997996956892185, + "loss": 0.2246, + "step": 1639 + }, + { + "epoch": 0.014235987534830427, + "grad_norm": 0.1123046875, + "learning_rate": 0.0019997990673073078, + "loss": 0.2578, + "step": 1640 + }, + { + "epoch": 0.014244668015034592, + "grad_norm": 0.06396484375, + "learning_rate": 0.0019997984379413894, + "loss": 0.1748, + "step": 1641 + }, + { + "epoch": 0.014253348495238757, + "grad_norm": 0.17578125, + "learning_rate": 0.0019997978075914657, + "loss": 0.875, + "step": 1642 + }, + { + "epoch": 0.014262028975442922, + "grad_norm": 0.07763671875, + "learning_rate": 0.0019997971762575366, + "loss": 0.2168, + "step": 1643 + }, + { + "epoch": 0.014270709455647087, + "grad_norm": 0.111328125, + "learning_rate": 0.0019997965439396024, + "loss": 0.2344, + "step": 1644 + }, + { + "epoch": 0.01427938993585125, + "grad_norm": 0.08837890625, + "learning_rate": 0.001999795910637665, + "loss": 0.2314, + "step": 1645 + }, + { + "epoch": 0.014288070416055416, + "grad_norm": 0.2294921875, + "learning_rate": 0.0019997952763517236, + "loss": 0.3125, + "step": 1646 + }, + { + "epoch": 0.014296750896259581, + "grad_norm": 0.09765625, + "learning_rate": 0.00199979464108178, + "loss": 0.2949, + "step": 1647 + }, + { + "epoch": 0.014305431376463746, + "grad_norm": 0.0712890625, + "learning_rate": 0.0019997940048278344, + "loss": 0.2158, + "step": 1648 + }, + { + "epoch": 0.014314111856667911, + "grad_norm": 0.064453125, + "learning_rate": 0.001999793367589887, + "loss": 0.1777, + "step": 1649 + }, + { + "epoch": 0.014322792336872076, + "grad_norm": 0.08251953125, + "learning_rate": 0.00199979272936794, + "loss": 0.2109, + "step": 1650 + }, + { + "epoch": 0.014331472817076241, + "grad_norm": 0.09765625, + "learning_rate": 0.0019997920901619927, + "loss": 0.2637, + "step": 1651 + }, + { + "epoch": 0.014340153297280405, + "grad_norm": 0.0810546875, + "learning_rate": 0.0019997914499720465, + "loss": 0.2773, + "step": 1652 + }, + { + "epoch": 0.01434883377748457, + "grad_norm": 0.08154296875, + "learning_rate": 0.001999790808798102, + "loss": 0.1914, + "step": 1653 + }, + { + "epoch": 0.014357514257688735, + "grad_norm": 0.126953125, + "learning_rate": 0.001999790166640159, + "loss": 0.2402, + "step": 1654 + }, + { + "epoch": 0.0143661947378929, + "grad_norm": 0.091796875, + "learning_rate": 0.0019997895234982197, + "loss": 0.2461, + "step": 1655 + }, + { + "epoch": 0.014374875218097065, + "grad_norm": 0.0791015625, + "learning_rate": 0.001999788879372284, + "loss": 0.248, + "step": 1656 + }, + { + "epoch": 0.01438355569830123, + "grad_norm": 0.11279296875, + "learning_rate": 0.001999788234262353, + "loss": 0.2812, + "step": 1657 + }, + { + "epoch": 0.014392236178505395, + "grad_norm": 0.08154296875, + "learning_rate": 0.0019997875881684266, + "loss": 0.2793, + "step": 1658 + }, + { + "epoch": 0.01440091665870956, + "grad_norm": 0.091796875, + "learning_rate": 0.0019997869410905062, + "loss": 0.2295, + "step": 1659 + }, + { + "epoch": 0.014409597138913724, + "grad_norm": 0.091796875, + "learning_rate": 0.001999786293028592, + "loss": 0.2402, + "step": 1660 + }, + { + "epoch": 0.014418277619117889, + "grad_norm": 0.059814453125, + "learning_rate": 0.0019997856439826858, + 
"loss": 0.1855, + "step": 1661 + }, + { + "epoch": 0.014426958099322054, + "grad_norm": 0.1005859375, + "learning_rate": 0.001999784993952787, + "loss": 0.2949, + "step": 1662 + }, + { + "epoch": 0.01443563857952622, + "grad_norm": 0.09814453125, + "learning_rate": 0.001999784342938897, + "loss": 0.209, + "step": 1663 + }, + { + "epoch": 0.014444319059730384, + "grad_norm": 0.0615234375, + "learning_rate": 0.0019997836909410164, + "loss": 0.25, + "step": 1664 + }, + { + "epoch": 0.01445299953993455, + "grad_norm": 0.10986328125, + "learning_rate": 0.0019997830379591456, + "loss": 0.2656, + "step": 1665 + }, + { + "epoch": 0.014461680020138715, + "grad_norm": 0.072265625, + "learning_rate": 0.001999782383993286, + "loss": 0.2041, + "step": 1666 + }, + { + "epoch": 0.01447036050034288, + "grad_norm": 0.08544921875, + "learning_rate": 0.0019997817290434376, + "loss": 0.2422, + "step": 1667 + }, + { + "epoch": 0.014479040980547043, + "grad_norm": 0.0615234375, + "learning_rate": 0.0019997810731096012, + "loss": 0.1904, + "step": 1668 + }, + { + "epoch": 0.014487721460751208, + "grad_norm": 0.08447265625, + "learning_rate": 0.001999780416191778, + "loss": 0.2539, + "step": 1669 + }, + { + "epoch": 0.014496401940955373, + "grad_norm": 0.08056640625, + "learning_rate": 0.0019997797582899687, + "loss": 0.2754, + "step": 1670 + }, + { + "epoch": 0.014505082421159539, + "grad_norm": 0.08203125, + "learning_rate": 0.0019997790994041734, + "loss": 0.2207, + "step": 1671 + }, + { + "epoch": 0.014513762901363704, + "grad_norm": 0.0712890625, + "learning_rate": 0.001999778439534393, + "loss": 0.2285, + "step": 1672 + }, + { + "epoch": 0.014522443381567869, + "grad_norm": 0.1025390625, + "learning_rate": 0.001999777778680629, + "loss": 0.3398, + "step": 1673 + }, + { + "epoch": 0.014531123861772034, + "grad_norm": 0.08837890625, + "learning_rate": 0.001999777116842881, + "loss": 0.1904, + "step": 1674 + }, + { + "epoch": 0.014539804341976197, + "grad_norm": 0.10986328125, + "learning_rate": 0.001999776454021151, + "loss": 0.2734, + "step": 1675 + }, + { + "epoch": 0.014548484822180362, + "grad_norm": 0.091796875, + "learning_rate": 0.0019997757902154387, + "loss": 0.2617, + "step": 1676 + }, + { + "epoch": 0.014557165302384528, + "grad_norm": 0.09375, + "learning_rate": 0.001999775125425745, + "loss": 0.3008, + "step": 1677 + }, + { + "epoch": 0.014565845782588693, + "grad_norm": 0.57421875, + "learning_rate": 0.0019997744596520705, + "loss": 0.6094, + "step": 1678 + }, + { + "epoch": 0.014574526262792858, + "grad_norm": 0.06201171875, + "learning_rate": 0.001999773792894416, + "loss": 0.207, + "step": 1679 + }, + { + "epoch": 0.014583206742997023, + "grad_norm": 0.091796875, + "learning_rate": 0.001999773125152783, + "loss": 0.2314, + "step": 1680 + }, + { + "epoch": 0.014591887223201188, + "grad_norm": 0.076171875, + "learning_rate": 0.001999772456427171, + "loss": 0.1758, + "step": 1681 + }, + { + "epoch": 0.014600567703405353, + "grad_norm": 0.08154296875, + "learning_rate": 0.0019997717867175826, + "loss": 0.2109, + "step": 1682 + }, + { + "epoch": 0.014609248183609517, + "grad_norm": 0.1728515625, + "learning_rate": 0.0019997711160240164, + "loss": 0.2617, + "step": 1683 + }, + { + "epoch": 0.014617928663813682, + "grad_norm": 0.0673828125, + "learning_rate": 0.001999770444346474, + "loss": 0.2305, + "step": 1684 + }, + { + "epoch": 0.014626609144017847, + "grad_norm": 0.0732421875, + "learning_rate": 0.001999769771684956, + "loss": 0.1992, + "step": 1685 + }, + { + "epoch": 0.014635289624222012, + 
"grad_norm": 0.09326171875, + "learning_rate": 0.0019997690980394635, + "loss": 0.2969, + "step": 1686 + }, + { + "epoch": 0.014643970104426177, + "grad_norm": 0.09326171875, + "learning_rate": 0.001999768423409997, + "loss": 0.2988, + "step": 1687 + }, + { + "epoch": 0.014652650584630342, + "grad_norm": 0.0673828125, + "learning_rate": 0.0019997677477965577, + "loss": 0.1943, + "step": 1688 + }, + { + "epoch": 0.014661331064834507, + "grad_norm": 0.07568359375, + "learning_rate": 0.0019997670711991455, + "loss": 0.2129, + "step": 1689 + }, + { + "epoch": 0.014670011545038672, + "grad_norm": 0.07958984375, + "learning_rate": 0.0019997663936177613, + "loss": 0.1992, + "step": 1690 + }, + { + "epoch": 0.014678692025242836, + "grad_norm": 0.10888671875, + "learning_rate": 0.0019997657150524067, + "loss": 0.2539, + "step": 1691 + }, + { + "epoch": 0.014687372505447, + "grad_norm": 0.0771484375, + "learning_rate": 0.0019997650355030815, + "loss": 0.2246, + "step": 1692 + }, + { + "epoch": 0.014696052985651166, + "grad_norm": 0.06787109375, + "learning_rate": 0.001999764354969787, + "loss": 0.2275, + "step": 1693 + }, + { + "epoch": 0.014704733465855331, + "grad_norm": 0.07958984375, + "learning_rate": 0.0019997636734525237, + "loss": 0.2148, + "step": 1694 + }, + { + "epoch": 0.014713413946059496, + "grad_norm": 0.12060546875, + "learning_rate": 0.001999762990951292, + "loss": 0.3242, + "step": 1695 + }, + { + "epoch": 0.014722094426263661, + "grad_norm": 0.064453125, + "learning_rate": 0.001999762307466093, + "loss": 0.2383, + "step": 1696 + }, + { + "epoch": 0.014730774906467826, + "grad_norm": 0.07666015625, + "learning_rate": 0.0019997616229969276, + "loss": 0.248, + "step": 1697 + }, + { + "epoch": 0.01473945538667199, + "grad_norm": 0.0947265625, + "learning_rate": 0.0019997609375437967, + "loss": 0.2988, + "step": 1698 + }, + { + "epoch": 0.014748135866876155, + "grad_norm": 0.0712890625, + "learning_rate": 0.0019997602511067003, + "loss": 0.2617, + "step": 1699 + }, + { + "epoch": 0.01475681634708032, + "grad_norm": 0.078125, + "learning_rate": 0.0019997595636856397, + "loss": 0.2891, + "step": 1700 + }, + { + "epoch": 0.014765496827284485, + "grad_norm": 0.07763671875, + "learning_rate": 0.001999758875280616, + "loss": 0.2002, + "step": 1701 + }, + { + "epoch": 0.01477417730748865, + "grad_norm": 0.09423828125, + "learning_rate": 0.0019997581858916293, + "loss": 0.2656, + "step": 1702 + }, + { + "epoch": 0.014782857787692815, + "grad_norm": 0.09228515625, + "learning_rate": 0.0019997574955186804, + "loss": 0.3457, + "step": 1703 + }, + { + "epoch": 0.01479153826789698, + "grad_norm": 0.11572265625, + "learning_rate": 0.0019997568041617703, + "loss": 0.2344, + "step": 1704 + }, + { + "epoch": 0.014800218748101146, + "grad_norm": 0.07666015625, + "learning_rate": 0.0019997561118208994, + "loss": 0.1973, + "step": 1705 + }, + { + "epoch": 0.014808899228305309, + "grad_norm": 0.0810546875, + "learning_rate": 0.001999755418496069, + "loss": 0.2695, + "step": 1706 + }, + { + "epoch": 0.014817579708509474, + "grad_norm": 0.07666015625, + "learning_rate": 0.00199975472418728, + "loss": 0.2354, + "step": 1707 + }, + { + "epoch": 0.01482626018871364, + "grad_norm": 0.10986328125, + "learning_rate": 0.001999754028894532, + "loss": 0.1934, + "step": 1708 + }, + { + "epoch": 0.014834940668917804, + "grad_norm": 0.07080078125, + "learning_rate": 0.001999753332617827, + "loss": 0.208, + "step": 1709 + }, + { + "epoch": 0.01484362114912197, + "grad_norm": 0.09033203125, + "learning_rate": 
0.0019997526353571654, + "loss": 0.2617, + "step": 1710 + }, + { + "epoch": 0.014852301629326135, + "grad_norm": 0.0703125, + "learning_rate": 0.0019997519371125474, + "loss": 0.2393, + "step": 1711 + }, + { + "epoch": 0.0148609821095303, + "grad_norm": 0.10888671875, + "learning_rate": 0.001999751237883975, + "loss": 0.2715, + "step": 1712 + }, + { + "epoch": 0.014869662589734465, + "grad_norm": 0.1455078125, + "learning_rate": 0.001999750537671447, + "loss": 0.2852, + "step": 1713 + }, + { + "epoch": 0.014878343069938628, + "grad_norm": 0.083984375, + "learning_rate": 0.0019997498364749656, + "loss": 0.2266, + "step": 1714 + }, + { + "epoch": 0.014887023550142793, + "grad_norm": 0.07421875, + "learning_rate": 0.001999749134294532, + "loss": 0.2285, + "step": 1715 + }, + { + "epoch": 0.014895704030346959, + "grad_norm": 0.1025390625, + "learning_rate": 0.0019997484311301454, + "loss": 0.252, + "step": 1716 + }, + { + "epoch": 0.014904384510551124, + "grad_norm": 0.0771484375, + "learning_rate": 0.001999747726981808, + "loss": 0.2578, + "step": 1717 + }, + { + "epoch": 0.014913064990755289, + "grad_norm": 0.07080078125, + "learning_rate": 0.00199974702184952, + "loss": 0.2168, + "step": 1718 + }, + { + "epoch": 0.014921745470959454, + "grad_norm": 0.1083984375, + "learning_rate": 0.001999746315733282, + "loss": 0.2305, + "step": 1719 + }, + { + "epoch": 0.014930425951163619, + "grad_norm": 0.08056640625, + "learning_rate": 0.001999745608633095, + "loss": 0.2852, + "step": 1720 + }, + { + "epoch": 0.014939106431367784, + "grad_norm": 0.0693359375, + "learning_rate": 0.00199974490054896, + "loss": 0.2275, + "step": 1721 + }, + { + "epoch": 0.014947786911571947, + "grad_norm": 0.09130859375, + "learning_rate": 0.001999744191480877, + "loss": 0.2969, + "step": 1722 + }, + { + "epoch": 0.014956467391776113, + "grad_norm": 0.171875, + "learning_rate": 0.0019997434814288477, + "loss": 0.3086, + "step": 1723 + }, + { + "epoch": 0.014965147871980278, + "grad_norm": 0.08349609375, + "learning_rate": 0.001999742770392872, + "loss": 0.2559, + "step": 1724 + }, + { + "epoch": 0.014973828352184443, + "grad_norm": 0.10791015625, + "learning_rate": 0.001999742058372951, + "loss": 0.3418, + "step": 1725 + }, + { + "epoch": 0.014982508832388608, + "grad_norm": 0.09326171875, + "learning_rate": 0.0019997413453690864, + "loss": 0.2539, + "step": 1726 + }, + { + "epoch": 0.014991189312592773, + "grad_norm": 0.087890625, + "learning_rate": 0.0019997406313812774, + "loss": 0.3047, + "step": 1727 + }, + { + "epoch": 0.014999869792796938, + "grad_norm": 0.07958984375, + "learning_rate": 0.0019997399164095263, + "loss": 0.2402, + "step": 1728 + }, + { + "epoch": 0.015008550273001102, + "grad_norm": 0.0771484375, + "learning_rate": 0.0019997392004538327, + "loss": 0.2021, + "step": 1729 + }, + { + "epoch": 0.015017230753205267, + "grad_norm": 0.1015625, + "learning_rate": 0.001999738483514198, + "loss": 0.2539, + "step": 1730 + }, + { + "epoch": 0.015025911233409432, + "grad_norm": 0.0712890625, + "learning_rate": 0.0019997377655906227, + "loss": 0.2344, + "step": 1731 + }, + { + "epoch": 0.015034591713613597, + "grad_norm": 0.103515625, + "learning_rate": 0.0019997370466831076, + "loss": 0.3164, + "step": 1732 + }, + { + "epoch": 0.015043272193817762, + "grad_norm": 0.337890625, + "learning_rate": 0.001999736326791654, + "loss": 0.4219, + "step": 1733 + }, + { + "epoch": 0.015051952674021927, + "grad_norm": 0.09619140625, + "learning_rate": 0.0019997356059162615, + "loss": 0.2715, + "step": 1734 + }, + { + "epoch": 
0.015060633154226092, + "grad_norm": 0.1083984375, + "learning_rate": 0.0019997348840569322, + "loss": 0.2334, + "step": 1735 + }, + { + "epoch": 0.015069313634430257, + "grad_norm": 0.080078125, + "learning_rate": 0.0019997341612136665, + "loss": 0.1836, + "step": 1736 + }, + { + "epoch": 0.01507799411463442, + "grad_norm": 0.11181640625, + "learning_rate": 0.001999733437386465, + "loss": 0.2773, + "step": 1737 + }, + { + "epoch": 0.015086674594838586, + "grad_norm": 0.138671875, + "learning_rate": 0.001999732712575328, + "loss": 0.2461, + "step": 1738 + }, + { + "epoch": 0.015095355075042751, + "grad_norm": 0.07568359375, + "learning_rate": 0.001999731986780257, + "loss": 0.2461, + "step": 1739 + }, + { + "epoch": 0.015104035555246916, + "grad_norm": 0.0703125, + "learning_rate": 0.0019997312600012526, + "loss": 0.1787, + "step": 1740 + }, + { + "epoch": 0.015112716035451081, + "grad_norm": 0.08203125, + "learning_rate": 0.0019997305322383163, + "loss": 0.2168, + "step": 1741 + }, + { + "epoch": 0.015121396515655246, + "grad_norm": 0.1357421875, + "learning_rate": 0.0019997298034914474, + "loss": 0.2324, + "step": 1742 + }, + { + "epoch": 0.015130076995859412, + "grad_norm": 0.08251953125, + "learning_rate": 0.0019997290737606478, + "loss": 0.2422, + "step": 1743 + }, + { + "epoch": 0.015138757476063577, + "grad_norm": 0.0751953125, + "learning_rate": 0.001999728343045918, + "loss": 0.2354, + "step": 1744 + }, + { + "epoch": 0.01514743795626774, + "grad_norm": 0.10009765625, + "learning_rate": 0.001999727611347259, + "loss": 0.2451, + "step": 1745 + }, + { + "epoch": 0.015156118436471905, + "grad_norm": 0.12158203125, + "learning_rate": 0.0019997268786646713, + "loss": 0.3359, + "step": 1746 + }, + { + "epoch": 0.01516479891667607, + "grad_norm": 0.103515625, + "learning_rate": 0.0019997261449981553, + "loss": 0.2656, + "step": 1747 + }, + { + "epoch": 0.015173479396880235, + "grad_norm": 0.09326171875, + "learning_rate": 0.0019997254103477128, + "loss": 0.2559, + "step": 1748 + }, + { + "epoch": 0.0151821598770844, + "grad_norm": 0.1162109375, + "learning_rate": 0.0019997246747133443, + "loss": 0.2168, + "step": 1749 + }, + { + "epoch": 0.015190840357288566, + "grad_norm": 0.06884765625, + "learning_rate": 0.00199972393809505, + "loss": 0.208, + "step": 1750 + }, + { + "epoch": 0.01519952083749273, + "grad_norm": 0.076171875, + "learning_rate": 0.0019997232004928312, + "loss": 0.2227, + "step": 1751 + }, + { + "epoch": 0.015208201317696894, + "grad_norm": 0.111328125, + "learning_rate": 0.001999722461906689, + "loss": 0.3418, + "step": 1752 + }, + { + "epoch": 0.01521688179790106, + "grad_norm": 0.07275390625, + "learning_rate": 0.001999721722336623, + "loss": 0.2227, + "step": 1753 + }, + { + "epoch": 0.015225562278105224, + "grad_norm": 0.080078125, + "learning_rate": 0.0019997209817826356, + "loss": 0.2188, + "step": 1754 + }, + { + "epoch": 0.01523424275830939, + "grad_norm": 0.10595703125, + "learning_rate": 0.001999720240244727, + "loss": 0.2363, + "step": 1755 + }, + { + "epoch": 0.015242923238513555, + "grad_norm": 0.07080078125, + "learning_rate": 0.0019997194977228972, + "loss": 0.2207, + "step": 1756 + }, + { + "epoch": 0.01525160371871772, + "grad_norm": 0.142578125, + "learning_rate": 0.001999718754217148, + "loss": 0.21, + "step": 1757 + }, + { + "epoch": 0.015260284198921885, + "grad_norm": 0.12255859375, + "learning_rate": 0.00199971800972748, + "loss": 0.2637, + "step": 1758 + }, + { + "epoch": 0.01526896467912605, + "grad_norm": 0.0791015625, + "learning_rate": 
0.0019997172642538933, + "loss": 0.2266, + "step": 1759 + }, + { + "epoch": 0.015277645159330213, + "grad_norm": 0.0771484375, + "learning_rate": 0.00199971651779639, + "loss": 0.1797, + "step": 1760 + }, + { + "epoch": 0.015286325639534378, + "grad_norm": 0.083984375, + "learning_rate": 0.00199971577035497, + "loss": 0.2559, + "step": 1761 + }, + { + "epoch": 0.015295006119738544, + "grad_norm": 0.068359375, + "learning_rate": 0.0019997150219296343, + "loss": 0.2344, + "step": 1762 + }, + { + "epoch": 0.015303686599942709, + "grad_norm": 0.0869140625, + "learning_rate": 0.001999714272520384, + "loss": 0.2578, + "step": 1763 + }, + { + "epoch": 0.015312367080146874, + "grad_norm": 0.06396484375, + "learning_rate": 0.0019997135221272192, + "loss": 0.1914, + "step": 1764 + }, + { + "epoch": 0.015321047560351039, + "grad_norm": 0.0693359375, + "learning_rate": 0.001999712770750142, + "loss": 0.2695, + "step": 1765 + }, + { + "epoch": 0.015329728040555204, + "grad_norm": 0.08203125, + "learning_rate": 0.0019997120183891516, + "loss": 0.2197, + "step": 1766 + }, + { + "epoch": 0.01533840852075937, + "grad_norm": 0.07568359375, + "learning_rate": 0.0019997112650442504, + "loss": 0.2773, + "step": 1767 + }, + { + "epoch": 0.015347089000963533, + "grad_norm": 0.0888671875, + "learning_rate": 0.001999710510715438, + "loss": 0.3086, + "step": 1768 + }, + { + "epoch": 0.015355769481167698, + "grad_norm": 0.0869140625, + "learning_rate": 0.0019997097554027158, + "loss": 0.2812, + "step": 1769 + }, + { + "epoch": 0.015364449961371863, + "grad_norm": 0.06298828125, + "learning_rate": 0.0019997089991060845, + "loss": 0.1846, + "step": 1770 + }, + { + "epoch": 0.015373130441576028, + "grad_norm": 0.07080078125, + "learning_rate": 0.001999708241825545, + "loss": 0.2354, + "step": 1771 + }, + { + "epoch": 0.015381810921780193, + "grad_norm": 0.0849609375, + "learning_rate": 0.0019997074835610977, + "loss": 0.2305, + "step": 1772 + }, + { + "epoch": 0.015390491401984358, + "grad_norm": 0.08056640625, + "learning_rate": 0.0019997067243127443, + "loss": 0.2148, + "step": 1773 + }, + { + "epoch": 0.015399171882188523, + "grad_norm": 0.1591796875, + "learning_rate": 0.001999705964080485, + "loss": 0.334, + "step": 1774 + }, + { + "epoch": 0.015407852362392687, + "grad_norm": 0.0947265625, + "learning_rate": 0.001999705202864321, + "loss": 0.2178, + "step": 1775 + }, + { + "epoch": 0.015416532842596852, + "grad_norm": 0.06298828125, + "learning_rate": 0.0019997044406642526, + "loss": 0.1758, + "step": 1776 + }, + { + "epoch": 0.015425213322801017, + "grad_norm": 0.08154296875, + "learning_rate": 0.0019997036774802813, + "loss": 0.1992, + "step": 1777 + }, + { + "epoch": 0.015433893803005182, + "grad_norm": 0.1689453125, + "learning_rate": 0.001999702913312407, + "loss": 0.2871, + "step": 1778 + }, + { + "epoch": 0.015442574283209347, + "grad_norm": 0.095703125, + "learning_rate": 0.0019997021481606312, + "loss": 0.2256, + "step": 1779 + }, + { + "epoch": 0.015451254763413512, + "grad_norm": 0.09765625, + "learning_rate": 0.001999701382024955, + "loss": 0.2598, + "step": 1780 + }, + { + "epoch": 0.015459935243617677, + "grad_norm": 0.1513671875, + "learning_rate": 0.0019997006149053793, + "loss": 0.2578, + "step": 1781 + }, + { + "epoch": 0.015468615723821843, + "grad_norm": 0.06982421875, + "learning_rate": 0.0019996998468019035, + "loss": 0.2188, + "step": 1782 + }, + { + "epoch": 0.015477296204026006, + "grad_norm": 0.1337890625, + "learning_rate": 0.00199969907771453, + "loss": 0.2969, + "step": 1783 + }, + { 
+ "epoch": 0.015485976684230171, + "grad_norm": 0.09521484375, + "learning_rate": 0.0019996983076432592, + "loss": 0.2266, + "step": 1784 + }, + { + "epoch": 0.015494657164434336, + "grad_norm": 0.0712890625, + "learning_rate": 0.0019996975365880916, + "loss": 0.2168, + "step": 1785 + }, + { + "epoch": 0.015503337644638501, + "grad_norm": 0.06396484375, + "learning_rate": 0.0019996967645490283, + "loss": 0.2324, + "step": 1786 + }, + { + "epoch": 0.015512018124842666, + "grad_norm": 0.11279296875, + "learning_rate": 0.0019996959915260706, + "loss": 0.2695, + "step": 1787 + }, + { + "epoch": 0.015520698605046832, + "grad_norm": 0.072265625, + "learning_rate": 0.001999695217519218, + "loss": 0.1855, + "step": 1788 + }, + { + "epoch": 0.015529379085250997, + "grad_norm": 0.0849609375, + "learning_rate": 0.0019996944425284733, + "loss": 0.1904, + "step": 1789 + }, + { + "epoch": 0.015538059565455162, + "grad_norm": 0.0693359375, + "learning_rate": 0.0019996936665538354, + "loss": 0.2578, + "step": 1790 + }, + { + "epoch": 0.015546740045659325, + "grad_norm": 0.08447265625, + "learning_rate": 0.0019996928895953067, + "loss": 0.2324, + "step": 1791 + }, + { + "epoch": 0.01555542052586349, + "grad_norm": 0.07470703125, + "learning_rate": 0.001999692111652887, + "loss": 0.2695, + "step": 1792 + }, + { + "epoch": 0.015564101006067655, + "grad_norm": 0.166015625, + "learning_rate": 0.0019996913327265777, + "loss": 0.2695, + "step": 1793 + }, + { + "epoch": 0.01557278148627182, + "grad_norm": 0.0830078125, + "learning_rate": 0.001999690552816379, + "loss": 0.2773, + "step": 1794 + }, + { + "epoch": 0.015581461966475986, + "grad_norm": 0.0693359375, + "learning_rate": 0.0019996897719222924, + "loss": 0.1982, + "step": 1795 + }, + { + "epoch": 0.01559014244668015, + "grad_norm": 0.1005859375, + "learning_rate": 0.001999688990044319, + "loss": 0.2656, + "step": 1796 + }, + { + "epoch": 0.015598822926884316, + "grad_norm": 0.07861328125, + "learning_rate": 0.0019996882071824586, + "loss": 0.2305, + "step": 1797 + }, + { + "epoch": 0.015607503407088481, + "grad_norm": 0.07275390625, + "learning_rate": 0.0019996874233367133, + "loss": 0.1904, + "step": 1798 + }, + { + "epoch": 0.015616183887292644, + "grad_norm": 0.06201171875, + "learning_rate": 0.0019996866385070832, + "loss": 0.2578, + "step": 1799 + }, + { + "epoch": 0.01562486436749681, + "grad_norm": 0.072265625, + "learning_rate": 0.001999685852693569, + "loss": 0.2266, + "step": 1800 + }, + { + "epoch": 0.015633544847700975, + "grad_norm": 0.07958984375, + "learning_rate": 0.0019996850658961724, + "loss": 0.2285, + "step": 1801 + }, + { + "epoch": 0.01564222532790514, + "grad_norm": 0.0693359375, + "learning_rate": 0.0019996842781148934, + "loss": 0.2041, + "step": 1802 + }, + { + "epoch": 0.015650905808109305, + "grad_norm": 0.0791015625, + "learning_rate": 0.001999683489349733, + "loss": 0.2832, + "step": 1803 + }, + { + "epoch": 0.01565958628831347, + "grad_norm": 0.0859375, + "learning_rate": 0.0019996826996006925, + "loss": 0.248, + "step": 1804 + }, + { + "epoch": 0.015668266768517635, + "grad_norm": 0.064453125, + "learning_rate": 0.001999681908867773, + "loss": 0.1719, + "step": 1805 + }, + { + "epoch": 0.0156769472487218, + "grad_norm": 0.08447265625, + "learning_rate": 0.001999681117150974, + "loss": 0.2578, + "step": 1806 + }, + { + "epoch": 0.015685627728925965, + "grad_norm": 0.056884765625, + "learning_rate": 0.0019996803244502976, + "loss": 0.207, + "step": 1807 + }, + { + "epoch": 0.01569430820913013, + "grad_norm": 0.0751953125, + 
"learning_rate": 0.0019996795307657446, + "loss": 0.2363, + "step": 1808 + }, + { + "epoch": 0.015702988689334296, + "grad_norm": 0.09130859375, + "learning_rate": 0.001999678736097315, + "loss": 0.2617, + "step": 1809 + }, + { + "epoch": 0.015711669169538457, + "grad_norm": 0.103515625, + "learning_rate": 0.0019996779404450105, + "loss": 0.2354, + "step": 1810 + }, + { + "epoch": 0.015720349649742622, + "grad_norm": 0.08251953125, + "learning_rate": 0.001999677143808832, + "loss": 0.3027, + "step": 1811 + }, + { + "epoch": 0.015729030129946787, + "grad_norm": 0.083984375, + "learning_rate": 0.00199967634618878, + "loss": 0.2637, + "step": 1812 + }, + { + "epoch": 0.015737710610150953, + "grad_norm": 0.06982421875, + "learning_rate": 0.0019996755475848553, + "loss": 0.293, + "step": 1813 + }, + { + "epoch": 0.015746391090355118, + "grad_norm": 0.07666015625, + "learning_rate": 0.001999674747997059, + "loss": 0.2422, + "step": 1814 + }, + { + "epoch": 0.015755071570559283, + "grad_norm": 0.0888671875, + "learning_rate": 0.001999673947425392, + "loss": 0.3125, + "step": 1815 + }, + { + "epoch": 0.015763752050763448, + "grad_norm": 0.056884765625, + "learning_rate": 0.001999673145869855, + "loss": 0.1943, + "step": 1816 + }, + { + "epoch": 0.015772432530967613, + "grad_norm": 0.072265625, + "learning_rate": 0.001999672343330449, + "loss": 0.2061, + "step": 1817 + }, + { + "epoch": 0.015781113011171778, + "grad_norm": 0.07177734375, + "learning_rate": 0.0019996715398071744, + "loss": 0.1641, + "step": 1818 + }, + { + "epoch": 0.015789793491375943, + "grad_norm": 0.0947265625, + "learning_rate": 0.0019996707353000334, + "loss": 0.248, + "step": 1819 + }, + { + "epoch": 0.01579847397158011, + "grad_norm": 0.080078125, + "learning_rate": 0.0019996699298090253, + "loss": 0.2539, + "step": 1820 + }, + { + "epoch": 0.015807154451784274, + "grad_norm": 0.064453125, + "learning_rate": 0.001999669123334152, + "loss": 0.2168, + "step": 1821 + }, + { + "epoch": 0.01581583493198844, + "grad_norm": 0.0693359375, + "learning_rate": 0.001999668315875414, + "loss": 0.2168, + "step": 1822 + }, + { + "epoch": 0.015824515412192604, + "grad_norm": 0.0751953125, + "learning_rate": 0.001999667507432812, + "loss": 0.209, + "step": 1823 + }, + { + "epoch": 0.01583319589239677, + "grad_norm": 0.09228515625, + "learning_rate": 0.0019996666980063474, + "loss": 0.2578, + "step": 1824 + }, + { + "epoch": 0.015841876372600934, + "grad_norm": 0.0869140625, + "learning_rate": 0.001999665887596021, + "loss": 0.2207, + "step": 1825 + }, + { + "epoch": 0.015850556852805096, + "grad_norm": 0.0810546875, + "learning_rate": 0.0019996650762018333, + "loss": 0.2656, + "step": 1826 + }, + { + "epoch": 0.01585923733300926, + "grad_norm": 0.08154296875, + "learning_rate": 0.001999664263823785, + "loss": 0.2383, + "step": 1827 + }, + { + "epoch": 0.015867917813213426, + "grad_norm": 0.095703125, + "learning_rate": 0.001999663450461878, + "loss": 0.2324, + "step": 1828 + }, + { + "epoch": 0.01587659829341759, + "grad_norm": 0.0654296875, + "learning_rate": 0.001999662636116112, + "loss": 0.2578, + "step": 1829 + }, + { + "epoch": 0.015885278773621756, + "grad_norm": 0.099609375, + "learning_rate": 0.001999661820786489, + "loss": 0.2324, + "step": 1830 + }, + { + "epoch": 0.01589395925382592, + "grad_norm": 0.11865234375, + "learning_rate": 0.0019996610044730094, + "loss": 0.332, + "step": 1831 + }, + { + "epoch": 0.015902639734030086, + "grad_norm": 0.06298828125, + "learning_rate": 0.0019996601871756737, + "loss": 0.1855, + "step": 1832 
+ }, + { + "epoch": 0.01591132021423425, + "grad_norm": 0.095703125, + "learning_rate": 0.0019996593688944827, + "loss": 0.2676, + "step": 1833 + }, + { + "epoch": 0.015920000694438417, + "grad_norm": 0.09130859375, + "learning_rate": 0.0019996585496294384, + "loss": 0.2598, + "step": 1834 + }, + { + "epoch": 0.015928681174642582, + "grad_norm": 0.087890625, + "learning_rate": 0.001999657729380541, + "loss": 0.2256, + "step": 1835 + }, + { + "epoch": 0.015937361654846747, + "grad_norm": 0.06494140625, + "learning_rate": 0.001999656908147791, + "loss": 0.1865, + "step": 1836 + }, + { + "epoch": 0.015946042135050912, + "grad_norm": 0.07568359375, + "learning_rate": 0.0019996560859311904, + "loss": 0.2246, + "step": 1837 + }, + { + "epoch": 0.015954722615255077, + "grad_norm": 0.08935546875, + "learning_rate": 0.001999655262730739, + "loss": 0.248, + "step": 1838 + }, + { + "epoch": 0.015963403095459242, + "grad_norm": 0.08154296875, + "learning_rate": 0.0019996544385464383, + "loss": 0.2168, + "step": 1839 + }, + { + "epoch": 0.015972083575663407, + "grad_norm": 0.09423828125, + "learning_rate": 0.001999653613378289, + "loss": 0.2129, + "step": 1840 + }, + { + "epoch": 0.01598076405586757, + "grad_norm": 0.06982421875, + "learning_rate": 0.0019996527872262913, + "loss": 0.1846, + "step": 1841 + }, + { + "epoch": 0.015989444536071734, + "grad_norm": 0.05810546875, + "learning_rate": 0.0019996519600904475, + "loss": 0.1777, + "step": 1842 + }, + { + "epoch": 0.0159981250162759, + "grad_norm": 0.09033203125, + "learning_rate": 0.001999651131970758, + "loss": 0.2715, + "step": 1843 + }, + { + "epoch": 0.016006805496480064, + "grad_norm": 0.1357421875, + "learning_rate": 0.001999650302867223, + "loss": 0.3555, + "step": 1844 + }, + { + "epoch": 0.01601548597668423, + "grad_norm": 0.09130859375, + "learning_rate": 0.0019996494727798444, + "loss": 0.1914, + "step": 1845 + }, + { + "epoch": 0.016024166456888395, + "grad_norm": 0.0908203125, + "learning_rate": 0.0019996486417086226, + "loss": 0.291, + "step": 1846 + }, + { + "epoch": 0.01603284693709256, + "grad_norm": 0.0869140625, + "learning_rate": 0.0019996478096535584, + "loss": 0.2422, + "step": 1847 + }, + { + "epoch": 0.016041527417296725, + "grad_norm": 0.201171875, + "learning_rate": 0.0019996469766146528, + "loss": 0.2559, + "step": 1848 + }, + { + "epoch": 0.01605020789750089, + "grad_norm": 0.0712890625, + "learning_rate": 0.001999646142591907, + "loss": 0.1982, + "step": 1849 + }, + { + "epoch": 0.016058888377705055, + "grad_norm": 0.07861328125, + "learning_rate": 0.0019996453075853218, + "loss": 0.2227, + "step": 1850 + }, + { + "epoch": 0.01606756885790922, + "grad_norm": 0.11572265625, + "learning_rate": 0.0019996444715948978, + "loss": 0.2715, + "step": 1851 + }, + { + "epoch": 0.016076249338113385, + "grad_norm": 0.07373046875, + "learning_rate": 0.001999643634620636, + "loss": 0.2227, + "step": 1852 + }, + { + "epoch": 0.01608492981831755, + "grad_norm": 0.059326171875, + "learning_rate": 0.001999642796662538, + "loss": 0.2012, + "step": 1853 + }, + { + "epoch": 0.016093610298521716, + "grad_norm": 0.10009765625, + "learning_rate": 0.0019996419577206033, + "loss": 0.3027, + "step": 1854 + }, + { + "epoch": 0.01610229077872588, + "grad_norm": 0.07568359375, + "learning_rate": 0.001999641117794834, + "loss": 0.2227, + "step": 1855 + }, + { + "epoch": 0.016110971258930042, + "grad_norm": 0.076171875, + "learning_rate": 0.001999640276885231, + "loss": 0.2344, + "step": 1856 + }, + { + "epoch": 0.016119651739134207, + "grad_norm": 
0.10205078125, + "learning_rate": 0.0019996394349917944, + "loss": 0.2539, + "step": 1857 + }, + { + "epoch": 0.016128332219338373, + "grad_norm": 0.07568359375, + "learning_rate": 0.0019996385921145264, + "loss": 0.1982, + "step": 1858 + }, + { + "epoch": 0.016137012699542538, + "grad_norm": 0.08251953125, + "learning_rate": 0.0019996377482534265, + "loss": 0.2344, + "step": 1859 + }, + { + "epoch": 0.016145693179746703, + "grad_norm": 0.09716796875, + "learning_rate": 0.0019996369034084964, + "loss": 0.2676, + "step": 1860 + }, + { + "epoch": 0.016154373659950868, + "grad_norm": 0.07080078125, + "learning_rate": 0.001999636057579737, + "loss": 0.2129, + "step": 1861 + }, + { + "epoch": 0.016163054140155033, + "grad_norm": 0.0869140625, + "learning_rate": 0.001999635210767149, + "loss": 0.2422, + "step": 1862 + }, + { + "epoch": 0.016171734620359198, + "grad_norm": 0.07568359375, + "learning_rate": 0.0019996343629707335, + "loss": 0.2334, + "step": 1863 + }, + { + "epoch": 0.016180415100563363, + "grad_norm": 0.0849609375, + "learning_rate": 0.001999633514190491, + "loss": 0.2051, + "step": 1864 + }, + { + "epoch": 0.01618909558076753, + "grad_norm": 0.062255859375, + "learning_rate": 0.001999632664426424, + "loss": 0.1699, + "step": 1865 + }, + { + "epoch": 0.016197776060971694, + "grad_norm": 0.07421875, + "learning_rate": 0.001999631813678531, + "loss": 0.2266, + "step": 1866 + }, + { + "epoch": 0.01620645654117586, + "grad_norm": 0.0830078125, + "learning_rate": 0.001999630961946815, + "loss": 0.2539, + "step": 1867 + }, + { + "epoch": 0.016215137021380024, + "grad_norm": 0.07470703125, + "learning_rate": 0.0019996301092312756, + "loss": 0.2031, + "step": 1868 + }, + { + "epoch": 0.01622381750158419, + "grad_norm": 0.0751953125, + "learning_rate": 0.0019996292555319144, + "loss": 0.2168, + "step": 1869 + }, + { + "epoch": 0.016232497981788354, + "grad_norm": 0.087890625, + "learning_rate": 0.001999628400848732, + "loss": 0.2305, + "step": 1870 + }, + { + "epoch": 0.01624117846199252, + "grad_norm": 0.0771484375, + "learning_rate": 0.00199962754518173, + "loss": 0.2432, + "step": 1871 + }, + { + "epoch": 0.01624985894219668, + "grad_norm": 0.09033203125, + "learning_rate": 0.001999626688530908, + "loss": 0.2275, + "step": 1872 + }, + { + "epoch": 0.016258539422400846, + "grad_norm": 0.06591796875, + "learning_rate": 0.001999625830896268, + "loss": 0.209, + "step": 1873 + }, + { + "epoch": 0.01626721990260501, + "grad_norm": 0.08154296875, + "learning_rate": 0.0019996249722778114, + "loss": 0.2285, + "step": 1874 + }, + { + "epoch": 0.016275900382809176, + "grad_norm": 0.080078125, + "learning_rate": 0.001999624112675538, + "loss": 0.1973, + "step": 1875 + }, + { + "epoch": 0.01628458086301334, + "grad_norm": 0.0791015625, + "learning_rate": 0.001999623252089449, + "loss": 0.252, + "step": 1876 + }, + { + "epoch": 0.016293261343217506, + "grad_norm": 0.0712890625, + "learning_rate": 0.0019996223905195455, + "loss": 0.2354, + "step": 1877 + }, + { + "epoch": 0.01630194182342167, + "grad_norm": 0.053955078125, + "learning_rate": 0.0019996215279658286, + "loss": 0.1934, + "step": 1878 + }, + { + "epoch": 0.016310622303625837, + "grad_norm": 0.10595703125, + "learning_rate": 0.0019996206644282994, + "loss": 0.2812, + "step": 1879 + }, + { + "epoch": 0.01631930278383, + "grad_norm": 0.07861328125, + "learning_rate": 0.001999619799906958, + "loss": 0.2012, + "step": 1880 + }, + { + "epoch": 0.016327983264034167, + "grad_norm": 0.0810546875, + "learning_rate": 0.0019996189344018067, + "loss": 
0.25, + "step": 1881 + }, + { + "epoch": 0.016336663744238332, + "grad_norm": 0.06787109375, + "learning_rate": 0.001999618067912845, + "loss": 0.1758, + "step": 1882 + }, + { + "epoch": 0.016345344224442497, + "grad_norm": 0.0703125, + "learning_rate": 0.0019996172004400747, + "loss": 0.2031, + "step": 1883 + }, + { + "epoch": 0.016354024704646662, + "grad_norm": 0.083984375, + "learning_rate": 0.0019996163319834967, + "loss": 0.2334, + "step": 1884 + }, + { + "epoch": 0.016362705184850827, + "grad_norm": 0.080078125, + "learning_rate": 0.001999615462543111, + "loss": 0.2412, + "step": 1885 + }, + { + "epoch": 0.016371385665054992, + "grad_norm": 0.09619140625, + "learning_rate": 0.0019996145921189206, + "loss": 0.2285, + "step": 1886 + }, + { + "epoch": 0.016380066145259154, + "grad_norm": 0.10791015625, + "learning_rate": 0.0019996137207109244, + "loss": 0.2773, + "step": 1887 + }, + { + "epoch": 0.01638874662546332, + "grad_norm": 0.08251953125, + "learning_rate": 0.0019996128483191246, + "loss": 0.2129, + "step": 1888 + }, + { + "epoch": 0.016397427105667484, + "grad_norm": 0.0830078125, + "learning_rate": 0.001999611974943521, + "loss": 0.2402, + "step": 1889 + }, + { + "epoch": 0.01640610758587165, + "grad_norm": 0.1298828125, + "learning_rate": 0.001999611100584116, + "loss": 0.2852, + "step": 1890 + }, + { + "epoch": 0.016414788066075815, + "grad_norm": 0.103515625, + "learning_rate": 0.0019996102252409094, + "loss": 0.2324, + "step": 1891 + }, + { + "epoch": 0.01642346854627998, + "grad_norm": 0.08544921875, + "learning_rate": 0.001999609348913903, + "loss": 0.2158, + "step": 1892 + }, + { + "epoch": 0.016432149026484145, + "grad_norm": 0.07421875, + "learning_rate": 0.001999608471603097, + "loss": 0.2236, + "step": 1893 + }, + { + "epoch": 0.01644082950668831, + "grad_norm": 0.08935546875, + "learning_rate": 0.001999607593308492, + "loss": 0.2227, + "step": 1894 + }, + { + "epoch": 0.016449509986892475, + "grad_norm": 0.07763671875, + "learning_rate": 0.001999606714030091, + "loss": 0.1816, + "step": 1895 + }, + { + "epoch": 0.01645819046709664, + "grad_norm": 0.0771484375, + "learning_rate": 0.0019996058337678926, + "loss": 0.2051, + "step": 1896 + }, + { + "epoch": 0.016466870947300805, + "grad_norm": 0.08203125, + "learning_rate": 0.001999604952521899, + "loss": 0.2207, + "step": 1897 + }, + { + "epoch": 0.01647555142750497, + "grad_norm": 0.0869140625, + "learning_rate": 0.0019996040702921114, + "loss": 0.2617, + "step": 1898 + }, + { + "epoch": 0.016484231907709136, + "grad_norm": 0.076171875, + "learning_rate": 0.0019996031870785297, + "loss": 0.2168, + "step": 1899 + }, + { + "epoch": 0.0164929123879133, + "grad_norm": 0.07568359375, + "learning_rate": 0.001999602302881156, + "loss": 0.2246, + "step": 1900 + }, + { + "epoch": 0.016501592868117466, + "grad_norm": 0.08251953125, + "learning_rate": 0.00199960141769999, + "loss": 0.2656, + "step": 1901 + }, + { + "epoch": 0.01651027334832163, + "grad_norm": 0.1494140625, + "learning_rate": 0.001999600531535034, + "loss": 0.2715, + "step": 1902 + }, + { + "epoch": 0.016518953828525793, + "grad_norm": 0.0703125, + "learning_rate": 0.0019995996443862886, + "loss": 0.1953, + "step": 1903 + }, + { + "epoch": 0.016527634308729958, + "grad_norm": 0.07373046875, + "learning_rate": 0.0019995987562537545, + "loss": 0.2168, + "step": 1904 + }, + { + "epoch": 0.016536314788934123, + "grad_norm": 0.08740234375, + "learning_rate": 0.001999597867137432, + "loss": 0.2246, + "step": 1905 + }, + { + "epoch": 0.016544995269138288, + 
"grad_norm": 0.07177734375, + "learning_rate": 0.0019995969770373236, + "loss": 0.1924, + "step": 1906 + }, + { + "epoch": 0.016553675749342453, + "grad_norm": 0.0830078125, + "learning_rate": 0.001999596085953429, + "loss": 0.2119, + "step": 1907 + }, + { + "epoch": 0.016562356229546618, + "grad_norm": 0.07275390625, + "learning_rate": 0.0019995951938857497, + "loss": 0.2227, + "step": 1908 + }, + { + "epoch": 0.016571036709750783, + "grad_norm": 0.07568359375, + "learning_rate": 0.0019995943008342867, + "loss": 0.2129, + "step": 1909 + }, + { + "epoch": 0.01657971718995495, + "grad_norm": 0.07958984375, + "learning_rate": 0.0019995934067990407, + "loss": 0.2734, + "step": 1910 + }, + { + "epoch": 0.016588397670159113, + "grad_norm": 0.11181640625, + "learning_rate": 0.001999592511780013, + "loss": 0.3398, + "step": 1911 + }, + { + "epoch": 0.01659707815036328, + "grad_norm": 0.0888671875, + "learning_rate": 0.0019995916157772046, + "loss": 0.2266, + "step": 1912 + }, + { + "epoch": 0.016605758630567444, + "grad_norm": 0.07666015625, + "learning_rate": 0.001999590718790616, + "loss": 0.248, + "step": 1913 + }, + { + "epoch": 0.01661443911077161, + "grad_norm": 0.0732421875, + "learning_rate": 0.001999589820820249, + "loss": 0.1787, + "step": 1914 + }, + { + "epoch": 0.016623119590975774, + "grad_norm": 0.08837890625, + "learning_rate": 0.0019995889218661035, + "loss": 0.2422, + "step": 1915 + }, + { + "epoch": 0.01663180007117994, + "grad_norm": 0.0927734375, + "learning_rate": 0.0019995880219281816, + "loss": 0.2324, + "step": 1916 + }, + { + "epoch": 0.016640480551384104, + "grad_norm": 0.10107421875, + "learning_rate": 0.001999587121006483, + "loss": 0.252, + "step": 1917 + }, + { + "epoch": 0.016649161031588266, + "grad_norm": 0.08349609375, + "learning_rate": 0.00199958621910101, + "loss": 0.1973, + "step": 1918 + }, + { + "epoch": 0.01665784151179243, + "grad_norm": 0.099609375, + "learning_rate": 0.001999585316211763, + "loss": 0.2617, + "step": 1919 + }, + { + "epoch": 0.016666521991996596, + "grad_norm": 0.064453125, + "learning_rate": 0.001999584412338743, + "loss": 0.2158, + "step": 1920 + }, + { + "epoch": 0.01667520247220076, + "grad_norm": 0.09765625, + "learning_rate": 0.001999583507481951, + "loss": 0.2715, + "step": 1921 + }, + { + "epoch": 0.016683882952404926, + "grad_norm": 0.0693359375, + "learning_rate": 0.001999582601641388, + "loss": 0.2285, + "step": 1922 + }, + { + "epoch": 0.01669256343260909, + "grad_norm": 0.07568359375, + "learning_rate": 0.001999581694817055, + "loss": 0.1934, + "step": 1923 + }, + { + "epoch": 0.016701243912813257, + "grad_norm": 0.115234375, + "learning_rate": 0.0019995807870089527, + "loss": 0.2656, + "step": 1924 + }, + { + "epoch": 0.01670992439301742, + "grad_norm": 0.08251953125, + "learning_rate": 0.001999579878217083, + "loss": 0.1914, + "step": 1925 + }, + { + "epoch": 0.016718604873221587, + "grad_norm": 0.08935546875, + "learning_rate": 0.0019995789684414456, + "loss": 0.1963, + "step": 1926 + }, + { + "epoch": 0.016727285353425752, + "grad_norm": 0.08984375, + "learning_rate": 0.0019995780576820424, + "loss": 0.2246, + "step": 1927 + }, + { + "epoch": 0.016735965833629917, + "grad_norm": 0.11669921875, + "learning_rate": 0.0019995771459388745, + "loss": 0.2246, + "step": 1928 + }, + { + "epoch": 0.016744646313834082, + "grad_norm": 0.083984375, + "learning_rate": 0.001999576233211942, + "loss": 0.2422, + "step": 1929 + }, + { + "epoch": 0.016753326794038247, + "grad_norm": 0.068359375, + "learning_rate": 0.0019995753195012466, + 
"loss": 0.25, + "step": 1930 + }, + { + "epoch": 0.016762007274242412, + "grad_norm": 0.087890625, + "learning_rate": 0.0019995744048067893, + "loss": 0.2559, + "step": 1931 + }, + { + "epoch": 0.016770687754446578, + "grad_norm": 0.078125, + "learning_rate": 0.0019995734891285707, + "loss": 0.2441, + "step": 1932 + }, + { + "epoch": 0.01677936823465074, + "grad_norm": 0.08154296875, + "learning_rate": 0.001999572572466592, + "loss": 0.2295, + "step": 1933 + }, + { + "epoch": 0.016788048714854904, + "grad_norm": 0.09130859375, + "learning_rate": 0.0019995716548208546, + "loss": 0.2031, + "step": 1934 + }, + { + "epoch": 0.01679672919505907, + "grad_norm": 0.095703125, + "learning_rate": 0.001999570736191359, + "loss": 0.3203, + "step": 1935 + }, + { + "epoch": 0.016805409675263235, + "grad_norm": 0.0625, + "learning_rate": 0.0019995698165781064, + "loss": 0.168, + "step": 1936 + }, + { + "epoch": 0.0168140901554674, + "grad_norm": 0.09326171875, + "learning_rate": 0.0019995688959810977, + "loss": 0.2812, + "step": 1937 + }, + { + "epoch": 0.016822770635671565, + "grad_norm": 0.1162109375, + "learning_rate": 0.001999567974400334, + "loss": 0.2539, + "step": 1938 + }, + { + "epoch": 0.01683145111587573, + "grad_norm": 0.10205078125, + "learning_rate": 0.0019995670518358163, + "loss": 0.3047, + "step": 1939 + }, + { + "epoch": 0.016840131596079895, + "grad_norm": 0.08447265625, + "learning_rate": 0.001999566128287546, + "loss": 0.2295, + "step": 1940 + }, + { + "epoch": 0.01684881207628406, + "grad_norm": 0.138671875, + "learning_rate": 0.001999565203755523, + "loss": 0.2695, + "step": 1941 + }, + { + "epoch": 0.016857492556488225, + "grad_norm": 0.0810546875, + "learning_rate": 0.001999564278239749, + "loss": 0.1846, + "step": 1942 + }, + { + "epoch": 0.01686617303669239, + "grad_norm": 0.095703125, + "learning_rate": 0.0019995633517402253, + "loss": 0.2812, + "step": 1943 + }, + { + "epoch": 0.016874853516896555, + "grad_norm": 0.0859375, + "learning_rate": 0.001999562424256953, + "loss": 0.2246, + "step": 1944 + }, + { + "epoch": 0.01688353399710072, + "grad_norm": 0.087890625, + "learning_rate": 0.001999561495789932, + "loss": 0.2197, + "step": 1945 + }, + { + "epoch": 0.016892214477304886, + "grad_norm": 0.10693359375, + "learning_rate": 0.001999560566339165, + "loss": 0.2656, + "step": 1946 + }, + { + "epoch": 0.01690089495750905, + "grad_norm": 0.09912109375, + "learning_rate": 0.001999559635904651, + "loss": 0.2168, + "step": 1947 + }, + { + "epoch": 0.016909575437713216, + "grad_norm": 0.1142578125, + "learning_rate": 0.0019995587044863926, + "loss": 0.2422, + "step": 1948 + }, + { + "epoch": 0.016918255917917378, + "grad_norm": 0.1181640625, + "learning_rate": 0.0019995577720843907, + "loss": 0.2363, + "step": 1949 + }, + { + "epoch": 0.016926936398121543, + "grad_norm": 0.09619140625, + "learning_rate": 0.0019995568386986452, + "loss": 0.1846, + "step": 1950 + }, + { + "epoch": 0.016935616878325708, + "grad_norm": 0.080078125, + "learning_rate": 0.0019995559043291585, + "loss": 0.2422, + "step": 1951 + }, + { + "epoch": 0.016944297358529873, + "grad_norm": 0.103515625, + "learning_rate": 0.0019995549689759305, + "loss": 0.21, + "step": 1952 + }, + { + "epoch": 0.016952977838734038, + "grad_norm": 0.07666015625, + "learning_rate": 0.001999554032638963, + "loss": 0.25, + "step": 1953 + }, + { + "epoch": 0.016961658318938203, + "grad_norm": 0.11083984375, + "learning_rate": 0.0019995530953182566, + "loss": 0.2461, + "step": 1954 + }, + { + "epoch": 0.01697033879914237, + "grad_norm": 
0.10302734375, + "learning_rate": 0.0019995521570138125, + "loss": 0.2227, + "step": 1955 + }, + { + "epoch": 0.016979019279346533, + "grad_norm": 0.0849609375, + "learning_rate": 0.001999551217725632, + "loss": 0.2344, + "step": 1956 + }, + { + "epoch": 0.0169876997595507, + "grad_norm": 0.07666015625, + "learning_rate": 0.001999550277453715, + "loss": 0.1963, + "step": 1957 + }, + { + "epoch": 0.016996380239754864, + "grad_norm": 0.0849609375, + "learning_rate": 0.001999549336198064, + "loss": 0.2832, + "step": 1958 + }, + { + "epoch": 0.01700506071995903, + "grad_norm": 0.10107421875, + "learning_rate": 0.0019995483939586793, + "loss": 0.2441, + "step": 1959 + }, + { + "epoch": 0.017013741200163194, + "grad_norm": 0.6796875, + "learning_rate": 0.0019995474507355617, + "loss": 0.3984, + "step": 1960 + }, + { + "epoch": 0.01702242168036736, + "grad_norm": 0.07568359375, + "learning_rate": 0.0019995465065287127, + "loss": 0.21, + "step": 1961 + }, + { + "epoch": 0.017031102160571524, + "grad_norm": 0.07275390625, + "learning_rate": 0.0019995455613381332, + "loss": 0.207, + "step": 1962 + }, + { + "epoch": 0.01703978264077569, + "grad_norm": 0.0634765625, + "learning_rate": 0.001999544615163824, + "loss": 0.1768, + "step": 1963 + }, + { + "epoch": 0.01704846312097985, + "grad_norm": 0.07421875, + "learning_rate": 0.0019995436680057864, + "loss": 0.2637, + "step": 1964 + }, + { + "epoch": 0.017057143601184016, + "grad_norm": 0.08740234375, + "learning_rate": 0.0019995427198640217, + "loss": 0.2441, + "step": 1965 + }, + { + "epoch": 0.01706582408138818, + "grad_norm": 0.10546875, + "learning_rate": 0.00199954177073853, + "loss": 0.2275, + "step": 1966 + }, + { + "epoch": 0.017074504561592346, + "grad_norm": 0.0849609375, + "learning_rate": 0.0019995408206293134, + "loss": 0.2285, + "step": 1967 + }, + { + "epoch": 0.01708318504179651, + "grad_norm": 0.20703125, + "learning_rate": 0.0019995398695363724, + "loss": 0.3086, + "step": 1968 + }, + { + "epoch": 0.017091865522000677, + "grad_norm": 0.11376953125, + "learning_rate": 0.001999538917459708, + "loss": 0.2314, + "step": 1969 + }, + { + "epoch": 0.01710054600220484, + "grad_norm": 0.1181640625, + "learning_rate": 0.0019995379643993213, + "loss": 0.2314, + "step": 1970 + }, + { + "epoch": 0.017109226482409007, + "grad_norm": 0.058837890625, + "learning_rate": 0.0019995370103552137, + "loss": 0.1719, + "step": 1971 + }, + { + "epoch": 0.017117906962613172, + "grad_norm": 0.09619140625, + "learning_rate": 0.0019995360553273856, + "loss": 0.2227, + "step": 1972 + }, + { + "epoch": 0.017126587442817337, + "grad_norm": 0.0810546875, + "learning_rate": 0.0019995350993158387, + "loss": 0.2617, + "step": 1973 + }, + { + "epoch": 0.017135267923021502, + "grad_norm": 0.08935546875, + "learning_rate": 0.001999534142320573, + "loss": 0.293, + "step": 1974 + }, + { + "epoch": 0.017143948403225667, + "grad_norm": 0.1044921875, + "learning_rate": 0.001999533184341591, + "loss": 0.2559, + "step": 1975 + }, + { + "epoch": 0.017152628883429832, + "grad_norm": 0.0810546875, + "learning_rate": 0.001999532225378893, + "loss": 0.2109, + "step": 1976 + }, + { + "epoch": 0.017161309363633998, + "grad_norm": 0.1142578125, + "learning_rate": 0.00199953126543248, + "loss": 0.3164, + "step": 1977 + }, + { + "epoch": 0.017169989843838163, + "grad_norm": 0.07177734375, + "learning_rate": 0.001999530304502353, + "loss": 0.2129, + "step": 1978 + }, + { + "epoch": 0.017178670324042328, + "grad_norm": 0.09130859375, + "learning_rate": 0.001999529342588513, + "loss": 0.2314, 
+ "step": 1979 + }, + { + "epoch": 0.01718735080424649, + "grad_norm": 0.142578125, + "learning_rate": 0.0019995283796909614, + "loss": 0.3008, + "step": 1980 + }, + { + "epoch": 0.017196031284450655, + "grad_norm": 0.10595703125, + "learning_rate": 0.0019995274158096992, + "loss": 0.2109, + "step": 1981 + }, + { + "epoch": 0.01720471176465482, + "grad_norm": 0.083984375, + "learning_rate": 0.0019995264509447275, + "loss": 0.2441, + "step": 1982 + }, + { + "epoch": 0.017213392244858985, + "grad_norm": 0.076171875, + "learning_rate": 0.0019995254850960465, + "loss": 0.2041, + "step": 1983 + }, + { + "epoch": 0.01722207272506315, + "grad_norm": 0.1357421875, + "learning_rate": 0.0019995245182636585, + "loss": 0.2578, + "step": 1984 + }, + { + "epoch": 0.017230753205267315, + "grad_norm": 0.08544921875, + "learning_rate": 0.001999523550447564, + "loss": 0.2354, + "step": 1985 + }, + { + "epoch": 0.01723943368547148, + "grad_norm": 0.09423828125, + "learning_rate": 0.001999522581647764, + "loss": 0.2441, + "step": 1986 + }, + { + "epoch": 0.017248114165675645, + "grad_norm": 0.0703125, + "learning_rate": 0.0019995216118642595, + "loss": 0.2324, + "step": 1987 + }, + { + "epoch": 0.01725679464587981, + "grad_norm": 0.0966796875, + "learning_rate": 0.001999520641097052, + "loss": 0.2656, + "step": 1988 + }, + { + "epoch": 0.017265475126083975, + "grad_norm": 0.0947265625, + "learning_rate": 0.001999519669346142, + "loss": 0.1973, + "step": 1989 + }, + { + "epoch": 0.01727415560628814, + "grad_norm": 0.11376953125, + "learning_rate": 0.0019995186966115307, + "loss": 0.2383, + "step": 1990 + }, + { + "epoch": 0.017282836086492306, + "grad_norm": 0.08935546875, + "learning_rate": 0.0019995177228932194, + "loss": 0.2295, + "step": 1991 + }, + { + "epoch": 0.01729151656669647, + "grad_norm": 0.09130859375, + "learning_rate": 0.001999516748191209, + "loss": 0.2676, + "step": 1992 + }, + { + "epoch": 0.017300197046900636, + "grad_norm": 0.06591796875, + "learning_rate": 0.0019995157725055004, + "loss": 0.2188, + "step": 1993 + }, + { + "epoch": 0.0173088775271048, + "grad_norm": 0.07470703125, + "learning_rate": 0.001999514795836095, + "loss": 0.2324, + "step": 1994 + }, + { + "epoch": 0.017317558007308963, + "grad_norm": 0.53515625, + "learning_rate": 0.0019995138181829936, + "loss": 0.4336, + "step": 1995 + }, + { + "epoch": 0.017326238487513128, + "grad_norm": 0.0908203125, + "learning_rate": 0.001999512839546198, + "loss": 0.293, + "step": 1996 + }, + { + "epoch": 0.017334918967717293, + "grad_norm": 0.078125, + "learning_rate": 0.001999511859925708, + "loss": 0.1982, + "step": 1997 + }, + { + "epoch": 0.017343599447921458, + "grad_norm": 0.09716796875, + "learning_rate": 0.0019995108793215257, + "loss": 0.2363, + "step": 1998 + }, + { + "epoch": 0.017352279928125623, + "grad_norm": 0.109375, + "learning_rate": 0.0019995098977336517, + "loss": 0.2559, + "step": 1999 + }, + { + "epoch": 0.01736096040832979, + "grad_norm": 0.0869140625, + "learning_rate": 0.0019995089151620873, + "loss": 0.2012, + "step": 2000 + }, + { + "epoch": 0.017369640888533953, + "grad_norm": 0.1787109375, + "learning_rate": 0.0019995079316068335, + "loss": 0.2598, + "step": 2001 + }, + { + "epoch": 0.01737832136873812, + "grad_norm": 0.10888671875, + "learning_rate": 0.0019995069470678914, + "loss": 0.3105, + "step": 2002 + }, + { + "epoch": 0.017387001848942284, + "grad_norm": 0.0693359375, + "learning_rate": 0.0019995059615452613, + "loss": 0.2422, + "step": 2003 + }, + { + "epoch": 0.01739568232914645, + "grad_norm": 
0.08056640625, + "learning_rate": 0.001999504975038946, + "loss": 0.2021, + "step": 2004 + }, + { + "epoch": 0.017404362809350614, + "grad_norm": 0.07080078125, + "learning_rate": 0.001999503987548945, + "loss": 0.248, + "step": 2005 + }, + { + "epoch": 0.01741304328955478, + "grad_norm": 0.1044921875, + "learning_rate": 0.00199950299907526, + "loss": 0.2773, + "step": 2006 + }, + { + "epoch": 0.017421723769758944, + "grad_norm": 0.08642578125, + "learning_rate": 0.0019995020096178918, + "loss": 0.1973, + "step": 2007 + }, + { + "epoch": 0.01743040424996311, + "grad_norm": 0.09765625, + "learning_rate": 0.0019995010191768427, + "loss": 0.248, + "step": 2008 + }, + { + "epoch": 0.017439084730167274, + "grad_norm": 0.078125, + "learning_rate": 0.0019995000277521118, + "loss": 0.2002, + "step": 2009 + }, + { + "epoch": 0.017447765210371436, + "grad_norm": 0.078125, + "learning_rate": 0.0019994990353437016, + "loss": 0.2676, + "step": 2010 + }, + { + "epoch": 0.0174564456905756, + "grad_norm": 0.07177734375, + "learning_rate": 0.0019994980419516125, + "loss": 0.1768, + "step": 2011 + }, + { + "epoch": 0.017465126170779766, + "grad_norm": 0.0673828125, + "learning_rate": 0.0019994970475758463, + "loss": 0.2109, + "step": 2012 + }, + { + "epoch": 0.01747380665098393, + "grad_norm": 0.068359375, + "learning_rate": 0.001999496052216403, + "loss": 0.2539, + "step": 2013 + }, + { + "epoch": 0.017482487131188097, + "grad_norm": 0.06884765625, + "learning_rate": 0.001999495055873285, + "loss": 0.2285, + "step": 2014 + }, + { + "epoch": 0.01749116761139226, + "grad_norm": 0.064453125, + "learning_rate": 0.0019994940585464924, + "loss": 0.2168, + "step": 2015 + }, + { + "epoch": 0.017499848091596427, + "grad_norm": 0.0693359375, + "learning_rate": 0.0019994930602360264, + "loss": 0.2188, + "step": 2016 + }, + { + "epoch": 0.017508528571800592, + "grad_norm": 0.087890625, + "learning_rate": 0.0019994920609418885, + "loss": 0.2695, + "step": 2017 + }, + { + "epoch": 0.017517209052004757, + "grad_norm": 0.09423828125, + "learning_rate": 0.00199949106066408, + "loss": 0.2812, + "step": 2018 + }, + { + "epoch": 0.017525889532208922, + "grad_norm": 0.08447265625, + "learning_rate": 0.001999490059402601, + "loss": 0.2139, + "step": 2019 + }, + { + "epoch": 0.017534570012413087, + "grad_norm": 0.0703125, + "learning_rate": 0.0019994890571574534, + "loss": 0.25, + "step": 2020 + }, + { + "epoch": 0.017543250492617252, + "grad_norm": 0.0751953125, + "learning_rate": 0.0019994880539286383, + "loss": 0.2383, + "step": 2021 + }, + { + "epoch": 0.017551930972821417, + "grad_norm": 0.10400390625, + "learning_rate": 0.0019994870497161564, + "loss": 0.2393, + "step": 2022 + }, + { + "epoch": 0.017560611453025583, + "grad_norm": 0.10302734375, + "learning_rate": 0.001999486044520009, + "loss": 0.2422, + "step": 2023 + }, + { + "epoch": 0.017569291933229748, + "grad_norm": 0.07421875, + "learning_rate": 0.0019994850383401974, + "loss": 0.2109, + "step": 2024 + }, + { + "epoch": 0.017577972413433913, + "grad_norm": 0.080078125, + "learning_rate": 0.001999484031176722, + "loss": 0.2363, + "step": 2025 + }, + { + "epoch": 0.017586652893638074, + "grad_norm": 0.1259765625, + "learning_rate": 0.0019994830230295846, + "loss": 0.7656, + "step": 2026 + }, + { + "epoch": 0.01759533337384224, + "grad_norm": 0.1005859375, + "learning_rate": 0.001999482013898786, + "loss": 0.2129, + "step": 2027 + }, + { + "epoch": 0.017604013854046405, + "grad_norm": 0.07080078125, + "learning_rate": 0.0019994810037843275, + "loss": 0.2246, + 
"step": 2028 + }, + { + "epoch": 0.01761269433425057, + "grad_norm": 0.0810546875, + "learning_rate": 0.00199947999268621, + "loss": 0.2461, + "step": 2029 + }, + { + "epoch": 0.017621374814454735, + "grad_norm": 0.057861328125, + "learning_rate": 0.0019994789806044347, + "loss": 0.1875, + "step": 2030 + }, + { + "epoch": 0.0176300552946589, + "grad_norm": 0.10107421875, + "learning_rate": 0.001999477967539003, + "loss": 0.2891, + "step": 2031 + }, + { + "epoch": 0.017638735774863065, + "grad_norm": 0.17578125, + "learning_rate": 0.0019994769534899155, + "loss": 0.293, + "step": 2032 + }, + { + "epoch": 0.01764741625506723, + "grad_norm": 0.06494140625, + "learning_rate": 0.0019994759384571736, + "loss": 0.2148, + "step": 2033 + }, + { + "epoch": 0.017656096735271395, + "grad_norm": 0.08740234375, + "learning_rate": 0.001999474922440778, + "loss": 0.3027, + "step": 2034 + }, + { + "epoch": 0.01766477721547556, + "grad_norm": 0.06396484375, + "learning_rate": 0.00199947390544073, + "loss": 0.2227, + "step": 2035 + }, + { + "epoch": 0.017673457695679726, + "grad_norm": 0.09033203125, + "learning_rate": 0.0019994728874570314, + "loss": 0.2617, + "step": 2036 + }, + { + "epoch": 0.01768213817588389, + "grad_norm": 0.09326171875, + "learning_rate": 0.0019994718684896826, + "loss": 0.2051, + "step": 2037 + }, + { + "epoch": 0.017690818656088056, + "grad_norm": 0.072265625, + "learning_rate": 0.0019994708485386848, + "loss": 0.1777, + "step": 2038 + }, + { + "epoch": 0.01769949913629222, + "grad_norm": 0.087890625, + "learning_rate": 0.0019994698276040394, + "loss": 0.25, + "step": 2039 + }, + { + "epoch": 0.017708179616496386, + "grad_norm": 0.08447265625, + "learning_rate": 0.001999468805685747, + "loss": 0.2305, + "step": 2040 + }, + { + "epoch": 0.017716860096700548, + "grad_norm": 0.0673828125, + "learning_rate": 0.001999467782783809, + "loss": 0.2383, + "step": 2041 + }, + { + "epoch": 0.017725540576904713, + "grad_norm": 0.154296875, + "learning_rate": 0.001999466758898227, + "loss": 0.2041, + "step": 2042 + }, + { + "epoch": 0.017734221057108878, + "grad_norm": 0.0927734375, + "learning_rate": 0.0019994657340290014, + "loss": 0.25, + "step": 2043 + }, + { + "epoch": 0.017742901537313043, + "grad_norm": 0.41015625, + "learning_rate": 0.0019994647081761335, + "loss": 0.4082, + "step": 2044 + }, + { + "epoch": 0.01775158201751721, + "grad_norm": 0.107421875, + "learning_rate": 0.0019994636813396244, + "loss": 0.2031, + "step": 2045 + }, + { + "epoch": 0.017760262497721373, + "grad_norm": 0.07421875, + "learning_rate": 0.0019994626535194755, + "loss": 0.2734, + "step": 2046 + }, + { + "epoch": 0.01776894297792554, + "grad_norm": 0.0634765625, + "learning_rate": 0.0019994616247156877, + "loss": 0.2168, + "step": 2047 + }, + { + "epoch": 0.017777623458129704, + "grad_norm": 0.07080078125, + "learning_rate": 0.0019994605949282622, + "loss": 0.1797, + "step": 2048 + }, + { + "epoch": 0.01778630393833387, + "grad_norm": 0.08251953125, + "learning_rate": 0.0019994595641572004, + "loss": 0.2754, + "step": 2049 + }, + { + "epoch": 0.017794984418538034, + "grad_norm": 0.08740234375, + "learning_rate": 0.001999458532402503, + "loss": 0.2695, + "step": 2050 + }, + { + "epoch": 0.0178036648987422, + "grad_norm": 0.0654296875, + "learning_rate": 0.001999457499664171, + "loss": 0.2207, + "step": 2051 + }, + { + "epoch": 0.017812345378946364, + "grad_norm": 0.07666015625, + "learning_rate": 0.001999456465942206, + "loss": 0.2188, + "step": 2052 + }, + { + "epoch": 0.01782102585915053, + "grad_norm": 
0.0732421875, + "learning_rate": 0.0019994554312366087, + "loss": 0.1982, + "step": 2053 + }, + { + "epoch": 0.017829706339354694, + "grad_norm": 0.08447265625, + "learning_rate": 0.0019994543955473808, + "loss": 0.2334, + "step": 2054 + }, + { + "epoch": 0.01783838681955886, + "grad_norm": 0.07421875, + "learning_rate": 0.001999453358874523, + "loss": 0.2383, + "step": 2055 + }, + { + "epoch": 0.017847067299763025, + "grad_norm": 0.1064453125, + "learning_rate": 0.0019994523212180366, + "loss": 0.2559, + "step": 2056 + }, + { + "epoch": 0.017855747779967186, + "grad_norm": 0.0888671875, + "learning_rate": 0.001999451282577922, + "loss": 0.2158, + "step": 2057 + }, + { + "epoch": 0.01786442826017135, + "grad_norm": 0.0712890625, + "learning_rate": 0.001999450242954182, + "loss": 0.209, + "step": 2058 + }, + { + "epoch": 0.017873108740375517, + "grad_norm": 0.08837890625, + "learning_rate": 0.001999449202346816, + "loss": 0.2275, + "step": 2059 + }, + { + "epoch": 0.01788178922057968, + "grad_norm": 0.06201171875, + "learning_rate": 0.001999448160755826, + "loss": 0.2188, + "step": 2060 + }, + { + "epoch": 0.017890469700783847, + "grad_norm": 0.07080078125, + "learning_rate": 0.001999447118181213, + "loss": 0.2129, + "step": 2061 + }, + { + "epoch": 0.017899150180988012, + "grad_norm": 0.1875, + "learning_rate": 0.001999446074622978, + "loss": 0.168, + "step": 2062 + }, + { + "epoch": 0.017907830661192177, + "grad_norm": 0.08642578125, + "learning_rate": 0.001999445030081123, + "loss": 0.2471, + "step": 2063 + }, + { + "epoch": 0.017916511141396342, + "grad_norm": 0.06396484375, + "learning_rate": 0.001999443984555648, + "loss": 0.2217, + "step": 2064 + }, + { + "epoch": 0.017925191621600507, + "grad_norm": 0.054931640625, + "learning_rate": 0.001999442938046554, + "loss": 0.1797, + "step": 2065 + }, + { + "epoch": 0.017933872101804672, + "grad_norm": 0.0771484375, + "learning_rate": 0.0019994418905538436, + "loss": 0.2695, + "step": 2066 + }, + { + "epoch": 0.017942552582008837, + "grad_norm": 0.07275390625, + "learning_rate": 0.0019994408420775166, + "loss": 0.1992, + "step": 2067 + }, + { + "epoch": 0.017951233062213003, + "grad_norm": 0.099609375, + "learning_rate": 0.001999439792617575, + "loss": 0.2334, + "step": 2068 + }, + { + "epoch": 0.017959913542417168, + "grad_norm": 0.072265625, + "learning_rate": 0.001999438742174019, + "loss": 0.1738, + "step": 2069 + }, + { + "epoch": 0.017968594022621333, + "grad_norm": 0.0654296875, + "learning_rate": 0.0019994376907468505, + "loss": 0.1914, + "step": 2070 + }, + { + "epoch": 0.017977274502825498, + "grad_norm": 0.1171875, + "learning_rate": 0.001999436638336071, + "loss": 0.2852, + "step": 2071 + }, + { + "epoch": 0.01798595498302966, + "grad_norm": 0.0732421875, + "learning_rate": 0.0019994355849416805, + "loss": 0.2539, + "step": 2072 + }, + { + "epoch": 0.017994635463233825, + "grad_norm": 0.08203125, + "learning_rate": 0.0019994345305636807, + "loss": 0.2461, + "step": 2073 + }, + { + "epoch": 0.01800331594343799, + "grad_norm": 0.08349609375, + "learning_rate": 0.001999433475202073, + "loss": 0.2129, + "step": 2074 + }, + { + "epoch": 0.018011996423642155, + "grad_norm": 0.06494140625, + "learning_rate": 0.0019994324188568583, + "loss": 0.1943, + "step": 2075 + }, + { + "epoch": 0.01802067690384632, + "grad_norm": 0.06591796875, + "learning_rate": 0.001999431361528038, + "loss": 0.2256, + "step": 2076 + }, + { + "epoch": 0.018029357384050485, + "grad_norm": 0.10888671875, + "learning_rate": 0.0019994303032156127, + "loss": 0.2656, + 
"step": 2077 + }, + { + "epoch": 0.01803803786425465, + "grad_norm": 0.07958984375, + "learning_rate": 0.0019994292439195847, + "loss": 0.2754, + "step": 2078 + }, + { + "epoch": 0.018046718344458815, + "grad_norm": 0.09716796875, + "learning_rate": 0.0019994281836399536, + "loss": 0.1924, + "step": 2079 + }, + { + "epoch": 0.01805539882466298, + "grad_norm": 0.078125, + "learning_rate": 0.0019994271223767214, + "loss": 0.2393, + "step": 2080 + }, + { + "epoch": 0.018064079304867146, + "grad_norm": 0.06494140625, + "learning_rate": 0.0019994260601298897, + "loss": 0.1875, + "step": 2081 + }, + { + "epoch": 0.01807275978507131, + "grad_norm": 0.06201171875, + "learning_rate": 0.001999424996899459, + "loss": 0.208, + "step": 2082 + }, + { + "epoch": 0.018081440265275476, + "grad_norm": 0.0966796875, + "learning_rate": 0.0019994239326854304, + "loss": 0.2539, + "step": 2083 + }, + { + "epoch": 0.01809012074547964, + "grad_norm": 0.06640625, + "learning_rate": 0.0019994228674878054, + "loss": 0.1719, + "step": 2084 + }, + { + "epoch": 0.018098801225683806, + "grad_norm": 0.0791015625, + "learning_rate": 0.0019994218013065852, + "loss": 0.1914, + "step": 2085 + }, + { + "epoch": 0.01810748170588797, + "grad_norm": 0.08349609375, + "learning_rate": 0.001999420734141771, + "loss": 0.2676, + "step": 2086 + }, + { + "epoch": 0.018116162186092133, + "grad_norm": 0.197265625, + "learning_rate": 0.0019994196659933634, + "loss": 0.2852, + "step": 2087 + }, + { + "epoch": 0.018124842666296298, + "grad_norm": 0.0830078125, + "learning_rate": 0.001999418596861364, + "loss": 0.2266, + "step": 2088 + }, + { + "epoch": 0.018133523146500463, + "grad_norm": 0.11962890625, + "learning_rate": 0.001999417526745774, + "loss": 0.2812, + "step": 2089 + }, + { + "epoch": 0.01814220362670463, + "grad_norm": 0.0830078125, + "learning_rate": 0.0019994164556465946, + "loss": 0.2207, + "step": 2090 + }, + { + "epoch": 0.018150884106908793, + "grad_norm": 0.062255859375, + "learning_rate": 0.001999415383563827, + "loss": 0.2158, + "step": 2091 + }, + { + "epoch": 0.01815956458711296, + "grad_norm": 0.083984375, + "learning_rate": 0.001999414310497472, + "loss": 0.2061, + "step": 2092 + }, + { + "epoch": 0.018168245067317124, + "grad_norm": 0.06787109375, + "learning_rate": 0.001999413236447531, + "loss": 0.2012, + "step": 2093 + }, + { + "epoch": 0.01817692554752129, + "grad_norm": 0.0947265625, + "learning_rate": 0.0019994121614140053, + "loss": 0.2363, + "step": 2094 + }, + { + "epoch": 0.018185606027725454, + "grad_norm": 0.0947265625, + "learning_rate": 0.001999411085396896, + "loss": 0.1875, + "step": 2095 + }, + { + "epoch": 0.01819428650792962, + "grad_norm": 0.060546875, + "learning_rate": 0.0019994100083962044, + "loss": 0.1924, + "step": 2096 + }, + { + "epoch": 0.018202966988133784, + "grad_norm": 0.055908203125, + "learning_rate": 0.001999408930411932, + "loss": 0.1816, + "step": 2097 + }, + { + "epoch": 0.01821164746833795, + "grad_norm": 0.0654296875, + "learning_rate": 0.0019994078514440784, + "loss": 0.1846, + "step": 2098 + }, + { + "epoch": 0.018220327948542114, + "grad_norm": 0.0751953125, + "learning_rate": 0.0019994067714926463, + "loss": 0.2422, + "step": 2099 + }, + { + "epoch": 0.01822900842874628, + "grad_norm": 0.0849609375, + "learning_rate": 0.0019994056905576364, + "loss": 0.2461, + "step": 2100 + }, + { + "epoch": 0.018237688908950445, + "grad_norm": 0.10888671875, + "learning_rate": 0.0019994046086390504, + "loss": 0.2158, + "step": 2101 + }, + { + "epoch": 0.01824636938915461, + "grad_norm": 
0.0576171875, + "learning_rate": 0.0019994035257368883, + "loss": 0.2168, + "step": 2102 + }, + { + "epoch": 0.01825504986935877, + "grad_norm": 0.0859375, + "learning_rate": 0.001999402441851153, + "loss": 0.2402, + "step": 2103 + }, + { + "epoch": 0.018263730349562936, + "grad_norm": 0.06640625, + "learning_rate": 0.001999401356981844, + "loss": 0.209, + "step": 2104 + }, + { + "epoch": 0.0182724108297671, + "grad_norm": 0.059326171875, + "learning_rate": 0.0019994002711289636, + "loss": 0.1914, + "step": 2105 + }, + { + "epoch": 0.018281091309971267, + "grad_norm": 0.07666015625, + "learning_rate": 0.001999399184292512, + "loss": 0.2285, + "step": 2106 + }, + { + "epoch": 0.018289771790175432, + "grad_norm": 0.07763671875, + "learning_rate": 0.0019993980964724913, + "loss": 0.1816, + "step": 2107 + }, + { + "epoch": 0.018298452270379597, + "grad_norm": 0.078125, + "learning_rate": 0.0019993970076689024, + "loss": 0.2227, + "step": 2108 + }, + { + "epoch": 0.018307132750583762, + "grad_norm": 0.0947265625, + "learning_rate": 0.001999395917881746, + "loss": 0.1885, + "step": 2109 + }, + { + "epoch": 0.018315813230787927, + "grad_norm": 0.07763671875, + "learning_rate": 0.0019993948271110245, + "loss": 0.2402, + "step": 2110 + }, + { + "epoch": 0.018324493710992092, + "grad_norm": 0.07421875, + "learning_rate": 0.001999393735356738, + "loss": 0.248, + "step": 2111 + }, + { + "epoch": 0.018333174191196257, + "grad_norm": 0.107421875, + "learning_rate": 0.001999392642618888, + "loss": 0.2305, + "step": 2112 + }, + { + "epoch": 0.018341854671400423, + "grad_norm": 0.0966796875, + "learning_rate": 0.0019993915488974753, + "loss": 0.2266, + "step": 2113 + }, + { + "epoch": 0.018350535151604588, + "grad_norm": 0.1044921875, + "learning_rate": 0.001999390454192502, + "loss": 0.2402, + "step": 2114 + }, + { + "epoch": 0.018359215631808753, + "grad_norm": 0.07177734375, + "learning_rate": 0.001999389358503969, + "loss": 0.1973, + "step": 2115 + }, + { + "epoch": 0.018367896112012918, + "grad_norm": 0.0771484375, + "learning_rate": 0.001999388261831877, + "loss": 0.1836, + "step": 2116 + }, + { + "epoch": 0.018376576592217083, + "grad_norm": 0.08056640625, + "learning_rate": 0.0019993871641762273, + "loss": 0.2305, + "step": 2117 + }, + { + "epoch": 0.018385257072421245, + "grad_norm": 0.07373046875, + "learning_rate": 0.0019993860655370217, + "loss": 0.2441, + "step": 2118 + }, + { + "epoch": 0.01839393755262541, + "grad_norm": 0.09716796875, + "learning_rate": 0.001999384965914261, + "loss": 0.2344, + "step": 2119 + }, + { + "epoch": 0.018402618032829575, + "grad_norm": 0.08154296875, + "learning_rate": 0.001999383865307946, + "loss": 0.2275, + "step": 2120 + }, + { + "epoch": 0.01841129851303374, + "grad_norm": 0.09228515625, + "learning_rate": 0.001999382763718079, + "loss": 0.208, + "step": 2121 + }, + { + "epoch": 0.018419978993237905, + "grad_norm": 0.0830078125, + "learning_rate": 0.00199938166114466, + "loss": 0.2256, + "step": 2122 + }, + { + "epoch": 0.01842865947344207, + "grad_norm": 0.072265625, + "learning_rate": 0.0019993805575876907, + "loss": 0.1816, + "step": 2123 + }, + { + "epoch": 0.018437339953646235, + "grad_norm": 0.0869140625, + "learning_rate": 0.001999379453047172, + "loss": 0.25, + "step": 2124 + }, + { + "epoch": 0.0184460204338504, + "grad_norm": 0.07666015625, + "learning_rate": 0.0019993783475231062, + "loss": 0.2266, + "step": 2125 + }, + { + "epoch": 0.018454700914054566, + "grad_norm": 0.09521484375, + "learning_rate": 0.0019993772410154937, + "loss": 0.2285, + 
"step": 2126 + }, + { + "epoch": 0.01846338139425873, + "grad_norm": 0.06591796875, + "learning_rate": 0.0019993761335243354, + "loss": 0.2344, + "step": 2127 + }, + { + "epoch": 0.018472061874462896, + "grad_norm": 0.130859375, + "learning_rate": 0.0019993750250496332, + "loss": 0.2539, + "step": 2128 + }, + { + "epoch": 0.01848074235466706, + "grad_norm": 0.08251953125, + "learning_rate": 0.001999373915591388, + "loss": 0.2344, + "step": 2129 + }, + { + "epoch": 0.018489422834871226, + "grad_norm": 0.08837890625, + "learning_rate": 0.0019993728051496003, + "loss": 0.2559, + "step": 2130 + }, + { + "epoch": 0.01849810331507539, + "grad_norm": 0.0810546875, + "learning_rate": 0.0019993716937242727, + "loss": 0.2109, + "step": 2131 + }, + { + "epoch": 0.018506783795279556, + "grad_norm": 0.06494140625, + "learning_rate": 0.0019993705813154054, + "loss": 0.1807, + "step": 2132 + }, + { + "epoch": 0.01851546427548372, + "grad_norm": 0.09375, + "learning_rate": 0.0019993694679230002, + "loss": 0.2383, + "step": 2133 + }, + { + "epoch": 0.018524144755687883, + "grad_norm": 0.10400390625, + "learning_rate": 0.001999368353547058, + "loss": 0.2441, + "step": 2134 + }, + { + "epoch": 0.018532825235892048, + "grad_norm": 0.080078125, + "learning_rate": 0.00199936723818758, + "loss": 0.1758, + "step": 2135 + }, + { + "epoch": 0.018541505716096213, + "grad_norm": 0.1474609375, + "learning_rate": 0.0019993661218445677, + "loss": 0.207, + "step": 2136 + }, + { + "epoch": 0.01855018619630038, + "grad_norm": 0.138671875, + "learning_rate": 0.0019993650045180217, + "loss": 0.2871, + "step": 2137 + }, + { + "epoch": 0.018558866676504544, + "grad_norm": 0.07568359375, + "learning_rate": 0.001999363886207944, + "loss": 0.2305, + "step": 2138 + }, + { + "epoch": 0.01856754715670871, + "grad_norm": 0.1943359375, + "learning_rate": 0.001999362766914335, + "loss": 0.3477, + "step": 2139 + }, + { + "epoch": 0.018576227636912874, + "grad_norm": 0.078125, + "learning_rate": 0.001999361646637197, + "loss": 0.1924, + "step": 2140 + }, + { + "epoch": 0.01858490811711704, + "grad_norm": 0.07958984375, + "learning_rate": 0.0019993605253765304, + "loss": 0.2129, + "step": 2141 + }, + { + "epoch": 0.018593588597321204, + "grad_norm": 0.07421875, + "learning_rate": 0.0019993594031323366, + "loss": 0.2168, + "step": 2142 + }, + { + "epoch": 0.01860226907752537, + "grad_norm": 0.06640625, + "learning_rate": 0.001999358279904617, + "loss": 0.1826, + "step": 2143 + }, + { + "epoch": 0.018610949557729534, + "grad_norm": 0.0693359375, + "learning_rate": 0.0019993571556933725, + "loss": 0.2129, + "step": 2144 + }, + { + "epoch": 0.0186196300379337, + "grad_norm": 0.08935546875, + "learning_rate": 0.001999356030498605, + "loss": 0.1885, + "step": 2145 + }, + { + "epoch": 0.018628310518137865, + "grad_norm": 0.07958984375, + "learning_rate": 0.0019993549043203144, + "loss": 0.2441, + "step": 2146 + }, + { + "epoch": 0.01863699099834203, + "grad_norm": 0.0703125, + "learning_rate": 0.0019993537771585035, + "loss": 0.2402, + "step": 2147 + }, + { + "epoch": 0.018645671478546195, + "grad_norm": 0.072265625, + "learning_rate": 0.001999352649013172, + "loss": 0.1748, + "step": 2148 + }, + { + "epoch": 0.018654351958750356, + "grad_norm": 0.1064453125, + "learning_rate": 0.001999351519884323, + "loss": 0.2637, + "step": 2149 + }, + { + "epoch": 0.01866303243895452, + "grad_norm": 0.06298828125, + "learning_rate": 0.001999350389771956, + "loss": 0.1865, + "step": 2150 + }, + { + "epoch": 0.018671712919158687, + "grad_norm": 0.0673828125, + 
"learning_rate": 0.0019993492586760733, + "loss": 0.2188, + "step": 2151 + }, + { + "epoch": 0.018680393399362852, + "grad_norm": 0.0625, + "learning_rate": 0.0019993481265966756, + "loss": 0.1885, + "step": 2152 + }, + { + "epoch": 0.018689073879567017, + "grad_norm": 0.34375, + "learning_rate": 0.0019993469935337643, + "loss": 0.3555, + "step": 2153 + }, + { + "epoch": 0.018697754359771182, + "grad_norm": 0.08251953125, + "learning_rate": 0.0019993458594873407, + "loss": 0.2539, + "step": 2154 + }, + { + "epoch": 0.018706434839975347, + "grad_norm": 0.2099609375, + "learning_rate": 0.001999344724457406, + "loss": 0.2129, + "step": 2155 + }, + { + "epoch": 0.018715115320179512, + "grad_norm": 0.0830078125, + "learning_rate": 0.0019993435884439613, + "loss": 0.2109, + "step": 2156 + }, + { + "epoch": 0.018723795800383677, + "grad_norm": 0.08984375, + "learning_rate": 0.001999342451447008, + "loss": 0.2285, + "step": 2157 + }, + { + "epoch": 0.018732476280587843, + "grad_norm": 0.12109375, + "learning_rate": 0.0019993413134665474, + "loss": 0.2695, + "step": 2158 + }, + { + "epoch": 0.018741156760792008, + "grad_norm": 0.130859375, + "learning_rate": 0.0019993401745025804, + "loss": 0.2168, + "step": 2159 + }, + { + "epoch": 0.018749837240996173, + "grad_norm": 0.0869140625, + "learning_rate": 0.0019993390345551085, + "loss": 0.2383, + "step": 2160 + }, + { + "epoch": 0.018758517721200338, + "grad_norm": 0.091796875, + "learning_rate": 0.001999337893624133, + "loss": 0.2021, + "step": 2161 + }, + { + "epoch": 0.018767198201404503, + "grad_norm": 0.0888671875, + "learning_rate": 0.0019993367517096555, + "loss": 0.2227, + "step": 2162 + }, + { + "epoch": 0.018775878681608668, + "grad_norm": 0.111328125, + "learning_rate": 0.0019993356088116765, + "loss": 0.2852, + "step": 2163 + }, + { + "epoch": 0.01878455916181283, + "grad_norm": 0.3359375, + "learning_rate": 0.001999334464930198, + "loss": 0.2988, + "step": 2164 + }, + { + "epoch": 0.018793239642016995, + "grad_norm": 0.07958984375, + "learning_rate": 0.0019993333200652203, + "loss": 0.2207, + "step": 2165 + }, + { + "epoch": 0.01880192012222116, + "grad_norm": 0.11376953125, + "learning_rate": 0.001999332174216745, + "loss": 0.2773, + "step": 2166 + }, + { + "epoch": 0.018810600602425325, + "grad_norm": 0.068359375, + "learning_rate": 0.001999331027384774, + "loss": 0.2002, + "step": 2167 + }, + { + "epoch": 0.01881928108262949, + "grad_norm": 0.08251953125, + "learning_rate": 0.0019993298795693082, + "loss": 0.2441, + "step": 2168 + }, + { + "epoch": 0.018827961562833655, + "grad_norm": 0.0859375, + "learning_rate": 0.0019993287307703486, + "loss": 0.2852, + "step": 2169 + }, + { + "epoch": 0.01883664204303782, + "grad_norm": 0.08642578125, + "learning_rate": 0.0019993275809878966, + "loss": 0.2236, + "step": 2170 + }, + { + "epoch": 0.018845322523241986, + "grad_norm": 0.07177734375, + "learning_rate": 0.001999326430221953, + "loss": 0.1709, + "step": 2171 + }, + { + "epoch": 0.01885400300344615, + "grad_norm": 0.06982421875, + "learning_rate": 0.0019993252784725204, + "loss": 0.2197, + "step": 2172 + }, + { + "epoch": 0.018862683483650316, + "grad_norm": 0.078125, + "learning_rate": 0.0019993241257395987, + "loss": 0.1836, + "step": 2173 + }, + { + "epoch": 0.01887136396385448, + "grad_norm": 0.0771484375, + "learning_rate": 0.0019993229720231902, + "loss": 0.2344, + "step": 2174 + }, + { + "epoch": 0.018880044444058646, + "grad_norm": 0.12890625, + "learning_rate": 0.0019993218173232946, + "loss": 0.3477, + "step": 2175 + }, + { + 
"epoch": 0.01888872492426281, + "grad_norm": 0.058349609375, + "learning_rate": 0.001999320661639915, + "loss": 0.1797, + "step": 2176 + }, + { + "epoch": 0.018897405404466976, + "grad_norm": 0.0693359375, + "learning_rate": 0.001999319504973051, + "loss": 0.2363, + "step": 2177 + }, + { + "epoch": 0.01890608588467114, + "grad_norm": 0.107421875, + "learning_rate": 0.0019993183473227057, + "loss": 0.2275, + "step": 2178 + }, + { + "epoch": 0.018914766364875307, + "grad_norm": 0.0908203125, + "learning_rate": 0.0019993171886888788, + "loss": 0.2344, + "step": 2179 + }, + { + "epoch": 0.018923446845079468, + "grad_norm": 0.07568359375, + "learning_rate": 0.0019993160290715725, + "loss": 0.2656, + "step": 2180 + }, + { + "epoch": 0.018932127325283633, + "grad_norm": 0.10205078125, + "learning_rate": 0.0019993148684707873, + "loss": 0.2197, + "step": 2181 + }, + { + "epoch": 0.0189408078054878, + "grad_norm": 0.10107421875, + "learning_rate": 0.001999313706886525, + "loss": 0.1885, + "step": 2182 + }, + { + "epoch": 0.018949488285691964, + "grad_norm": 0.07763671875, + "learning_rate": 0.0019993125443187867, + "loss": 0.2617, + "step": 2183 + }, + { + "epoch": 0.01895816876589613, + "grad_norm": 0.07958984375, + "learning_rate": 0.001999311380767574, + "loss": 0.2344, + "step": 2184 + }, + { + "epoch": 0.018966849246100294, + "grad_norm": 0.08154296875, + "learning_rate": 0.0019993102162328877, + "loss": 0.2148, + "step": 2185 + }, + { + "epoch": 0.01897552972630446, + "grad_norm": 0.11474609375, + "learning_rate": 0.001999309050714729, + "loss": 0.2734, + "step": 2186 + }, + { + "epoch": 0.018984210206508624, + "grad_norm": 0.080078125, + "learning_rate": 0.0019993078842131, + "loss": 0.1836, + "step": 2187 + }, + { + "epoch": 0.01899289068671279, + "grad_norm": 0.08642578125, + "learning_rate": 0.001999306716728001, + "loss": 0.209, + "step": 2188 + }, + { + "epoch": 0.019001571166916954, + "grad_norm": 0.09521484375, + "learning_rate": 0.0019993055482594338, + "loss": 0.249, + "step": 2189 + }, + { + "epoch": 0.01901025164712112, + "grad_norm": 0.07177734375, + "learning_rate": 0.0019993043788073994, + "loss": 0.1807, + "step": 2190 + }, + { + "epoch": 0.019018932127325285, + "grad_norm": 0.06201171875, + "learning_rate": 0.001999303208371899, + "loss": 0.1836, + "step": 2191 + }, + { + "epoch": 0.01902761260752945, + "grad_norm": 0.0947265625, + "learning_rate": 0.0019993020369529347, + "loss": 0.2734, + "step": 2192 + }, + { + "epoch": 0.019036293087733615, + "grad_norm": 0.06494140625, + "learning_rate": 0.001999300864550507, + "loss": 0.2031, + "step": 2193 + }, + { + "epoch": 0.01904497356793778, + "grad_norm": 0.07666015625, + "learning_rate": 0.0019992996911646172, + "loss": 0.2305, + "step": 2194 + }, + { + "epoch": 0.01905365404814194, + "grad_norm": 0.0830078125, + "learning_rate": 0.0019992985167952672, + "loss": 0.2266, + "step": 2195 + }, + { + "epoch": 0.019062334528346107, + "grad_norm": 0.08203125, + "learning_rate": 0.0019992973414424578, + "loss": 0.2227, + "step": 2196 + }, + { + "epoch": 0.019071015008550272, + "grad_norm": 0.0732421875, + "learning_rate": 0.00199929616510619, + "loss": 0.2363, + "step": 2197 + }, + { + "epoch": 0.019079695488754437, + "grad_norm": 0.103515625, + "learning_rate": 0.001999294987786466, + "loss": 0.291, + "step": 2198 + }, + { + "epoch": 0.019088375968958602, + "grad_norm": 0.0732421875, + "learning_rate": 0.0019992938094832856, + "loss": 0.1934, + "step": 2199 + }, + { + "epoch": 0.019097056449162767, + "grad_norm": 0.08447265625, + 
"learning_rate": 0.0019992926301966515, + "loss": 0.3066, + "step": 2200 + }, + { + "epoch": 0.019105736929366932, + "grad_norm": 0.1064453125, + "learning_rate": 0.0019992914499265646, + "loss": 0.2695, + "step": 2201 + }, + { + "epoch": 0.019114417409571097, + "grad_norm": 0.0625, + "learning_rate": 0.001999290268673026, + "loss": 0.1787, + "step": 2202 + }, + { + "epoch": 0.019123097889775263, + "grad_norm": 0.07470703125, + "learning_rate": 0.0019992890864360367, + "loss": 0.2461, + "step": 2203 + }, + { + "epoch": 0.019131778369979428, + "grad_norm": 0.1005859375, + "learning_rate": 0.001999287903215599, + "loss": 0.2578, + "step": 2204 + }, + { + "epoch": 0.019140458850183593, + "grad_norm": 0.08837890625, + "learning_rate": 0.0019992867190117133, + "loss": 0.2695, + "step": 2205 + }, + { + "epoch": 0.019149139330387758, + "grad_norm": 0.0869140625, + "learning_rate": 0.001999285533824381, + "loss": 0.2363, + "step": 2206 + }, + { + "epoch": 0.019157819810591923, + "grad_norm": 0.07470703125, + "learning_rate": 0.0019992843476536034, + "loss": 0.1836, + "step": 2207 + }, + { + "epoch": 0.019166500290796088, + "grad_norm": 0.06494140625, + "learning_rate": 0.0019992831604993826, + "loss": 0.1533, + "step": 2208 + }, + { + "epoch": 0.019175180771000253, + "grad_norm": 0.06494140625, + "learning_rate": 0.001999281972361719, + "loss": 0.1768, + "step": 2209 + }, + { + "epoch": 0.01918386125120442, + "grad_norm": 0.06787109375, + "learning_rate": 0.001999280783240614, + "loss": 0.2354, + "step": 2210 + }, + { + "epoch": 0.01919254173140858, + "grad_norm": 0.0693359375, + "learning_rate": 0.001999279593136069, + "loss": 0.2168, + "step": 2211 + }, + { + "epoch": 0.019201222211612745, + "grad_norm": 0.1376953125, + "learning_rate": 0.001999278402048085, + "loss": 0.25, + "step": 2212 + }, + { + "epoch": 0.01920990269181691, + "grad_norm": 0.07275390625, + "learning_rate": 0.001999277209976664, + "loss": 0.207, + "step": 2213 + }, + { + "epoch": 0.019218583172021075, + "grad_norm": 0.07275390625, + "learning_rate": 0.001999276016921807, + "loss": 0.2305, + "step": 2214 + }, + { + "epoch": 0.01922726365222524, + "grad_norm": 0.07861328125, + "learning_rate": 0.0019992748228835153, + "loss": 0.2383, + "step": 2215 + }, + { + "epoch": 0.019235944132429406, + "grad_norm": 0.10546875, + "learning_rate": 0.00199927362786179, + "loss": 0.3027, + "step": 2216 + }, + { + "epoch": 0.01924462461263357, + "grad_norm": 0.103515625, + "learning_rate": 0.0019992724318566328, + "loss": 0.2266, + "step": 2217 + }, + { + "epoch": 0.019253305092837736, + "grad_norm": 0.05859375, + "learning_rate": 0.0019992712348680447, + "loss": 0.1816, + "step": 2218 + }, + { + "epoch": 0.0192619855730419, + "grad_norm": 0.06787109375, + "learning_rate": 0.001999270036896027, + "loss": 0.2158, + "step": 2219 + }, + { + "epoch": 0.019270666053246066, + "grad_norm": 0.10205078125, + "learning_rate": 0.0019992688379405813, + "loss": 0.2734, + "step": 2220 + }, + { + "epoch": 0.01927934653345023, + "grad_norm": 0.0693359375, + "learning_rate": 0.0019992676380017086, + "loss": 0.1787, + "step": 2221 + }, + { + "epoch": 0.019288027013654396, + "grad_norm": 0.0966796875, + "learning_rate": 0.0019992664370794104, + "loss": 0.2559, + "step": 2222 + }, + { + "epoch": 0.01929670749385856, + "grad_norm": 0.08447265625, + "learning_rate": 0.001999265235173688, + "loss": 0.2441, + "step": 2223 + }, + { + "epoch": 0.019305387974062727, + "grad_norm": 0.10205078125, + "learning_rate": 0.0019992640322845424, + "loss": 0.2432, + "step": 2224 
+ }, + { + "epoch": 0.01931406845426689, + "grad_norm": 0.076171875, + "learning_rate": 0.001999262828411975, + "loss": 0.1992, + "step": 2225 + }, + { + "epoch": 0.019322748934471053, + "grad_norm": 0.1005859375, + "learning_rate": 0.0019992616235559877, + "loss": 0.2344, + "step": 2226 + }, + { + "epoch": 0.01933142941467522, + "grad_norm": 0.09326171875, + "learning_rate": 0.0019992604177165815, + "loss": 0.2402, + "step": 2227 + }, + { + "epoch": 0.019340109894879384, + "grad_norm": 0.1455078125, + "learning_rate": 0.001999259210893757, + "loss": 0.2617, + "step": 2228 + }, + { + "epoch": 0.01934879037508355, + "grad_norm": 0.06298828125, + "learning_rate": 0.0019992580030875166, + "loss": 0.1934, + "step": 2229 + }, + { + "epoch": 0.019357470855287714, + "grad_norm": 0.099609375, + "learning_rate": 0.001999256794297861, + "loss": 0.1885, + "step": 2230 + }, + { + "epoch": 0.01936615133549188, + "grad_norm": 0.07568359375, + "learning_rate": 0.001999255584524792, + "loss": 0.2188, + "step": 2231 + }, + { + "epoch": 0.019374831815696044, + "grad_norm": 0.125, + "learning_rate": 0.0019992543737683104, + "loss": 0.2617, + "step": 2232 + }, + { + "epoch": 0.01938351229590021, + "grad_norm": 0.07958984375, + "learning_rate": 0.0019992531620284175, + "loss": 0.1914, + "step": 2233 + }, + { + "epoch": 0.019392192776104374, + "grad_norm": 0.07666015625, + "learning_rate": 0.001999251949305115, + "loss": 0.209, + "step": 2234 + }, + { + "epoch": 0.01940087325630854, + "grad_norm": 0.10693359375, + "learning_rate": 0.0019992507355984044, + "loss": 0.209, + "step": 2235 + }, + { + "epoch": 0.019409553736512705, + "grad_norm": 0.11181640625, + "learning_rate": 0.001999249520908286, + "loss": 0.2656, + "step": 2236 + }, + { + "epoch": 0.01941823421671687, + "grad_norm": 0.515625, + "learning_rate": 0.0019992483052347623, + "loss": 0.625, + "step": 2237 + }, + { + "epoch": 0.019426914696921035, + "grad_norm": 0.10498046875, + "learning_rate": 0.0019992470885778343, + "loss": 0.2559, + "step": 2238 + }, + { + "epoch": 0.0194355951771252, + "grad_norm": 0.0927734375, + "learning_rate": 0.001999245870937503, + "loss": 0.248, + "step": 2239 + }, + { + "epoch": 0.019444275657329365, + "grad_norm": 0.11083984375, + "learning_rate": 0.00199924465231377, + "loss": 0.2148, + "step": 2240 + }, + { + "epoch": 0.019452956137533527, + "grad_norm": 0.08984375, + "learning_rate": 0.0019992434327066365, + "loss": 0.207, + "step": 2241 + }, + { + "epoch": 0.019461636617737692, + "grad_norm": 0.1318359375, + "learning_rate": 0.001999242212116104, + "loss": 0.3105, + "step": 2242 + }, + { + "epoch": 0.019470317097941857, + "grad_norm": 0.080078125, + "learning_rate": 0.0019992409905421732, + "loss": 0.2178, + "step": 2243 + }, + { + "epoch": 0.019478997578146022, + "grad_norm": 0.083984375, + "learning_rate": 0.0019992397679848466, + "loss": 0.2598, + "step": 2244 + }, + { + "epoch": 0.019487678058350187, + "grad_norm": 0.083984375, + "learning_rate": 0.0019992385444441247, + "loss": 0.2109, + "step": 2245 + }, + { + "epoch": 0.019496358538554352, + "grad_norm": 0.1044921875, + "learning_rate": 0.001999237319920009, + "loss": 0.2383, + "step": 2246 + }, + { + "epoch": 0.019505039018758517, + "grad_norm": 0.07275390625, + "learning_rate": 0.001999236094412501, + "loss": 0.2383, + "step": 2247 + }, + { + "epoch": 0.019513719498962682, + "grad_norm": 0.06640625, + "learning_rate": 0.0019992348679216017, + "loss": 0.207, + "step": 2248 + }, + { + "epoch": 0.019522399979166848, + "grad_norm": 0.125, + "learning_rate": 
0.001999233640447313, + "loss": 0.2656, + "step": 2249 + }, + { + "epoch": 0.019531080459371013, + "grad_norm": 0.07958984375, + "learning_rate": 0.0019992324119896356, + "loss": 0.207, + "step": 2250 + }, + { + "epoch": 0.019539760939575178, + "grad_norm": 0.07666015625, + "learning_rate": 0.001999231182548571, + "loss": 0.2188, + "step": 2251 + }, + { + "epoch": 0.019548441419779343, + "grad_norm": 0.08447265625, + "learning_rate": 0.001999229952124121, + "loss": 0.1982, + "step": 2252 + }, + { + "epoch": 0.019557121899983508, + "grad_norm": 0.1015625, + "learning_rate": 0.0019992287207162866, + "loss": 0.2812, + "step": 2253 + }, + { + "epoch": 0.019565802380187673, + "grad_norm": 0.1064453125, + "learning_rate": 0.001999227488325069, + "loss": 0.2598, + "step": 2254 + }, + { + "epoch": 0.01957448286039184, + "grad_norm": 0.1767578125, + "learning_rate": 0.00199922625495047, + "loss": 0.2266, + "step": 2255 + }, + { + "epoch": 0.019583163340596003, + "grad_norm": 0.08935546875, + "learning_rate": 0.0019992250205924903, + "loss": 0.248, + "step": 2256 + }, + { + "epoch": 0.019591843820800165, + "grad_norm": 0.07568359375, + "learning_rate": 0.001999223785251132, + "loss": 0.1768, + "step": 2257 + }, + { + "epoch": 0.01960052430100433, + "grad_norm": 0.21875, + "learning_rate": 0.001999222548926396, + "loss": 0.3125, + "step": 2258 + }, + { + "epoch": 0.019609204781208495, + "grad_norm": 0.083984375, + "learning_rate": 0.0019992213116182835, + "loss": 0.2324, + "step": 2259 + }, + { + "epoch": 0.01961788526141266, + "grad_norm": 0.09033203125, + "learning_rate": 0.0019992200733267964, + "loss": 0.248, + "step": 2260 + }, + { + "epoch": 0.019626565741616826, + "grad_norm": 0.0791015625, + "learning_rate": 0.0019992188340519357, + "loss": 0.2402, + "step": 2261 + }, + { + "epoch": 0.01963524622182099, + "grad_norm": 0.061767578125, + "learning_rate": 0.0019992175937937023, + "loss": 0.1953, + "step": 2262 + }, + { + "epoch": 0.019643926702025156, + "grad_norm": 0.09228515625, + "learning_rate": 0.0019992163525520985, + "loss": 0.2158, + "step": 2263 + }, + { + "epoch": 0.01965260718222932, + "grad_norm": 0.10205078125, + "learning_rate": 0.0019992151103271254, + "loss": 0.2285, + "step": 2264 + }, + { + "epoch": 0.019661287662433486, + "grad_norm": 0.091796875, + "learning_rate": 0.0019992138671187836, + "loss": 0.249, + "step": 2265 + }, + { + "epoch": 0.01966996814263765, + "grad_norm": 0.7109375, + "learning_rate": 0.0019992126229270756, + "loss": 0.5078, + "step": 2266 + }, + { + "epoch": 0.019678648622841816, + "grad_norm": 0.08203125, + "learning_rate": 0.001999211377752002, + "loss": 0.2207, + "step": 2267 + }, + { + "epoch": 0.01968732910304598, + "grad_norm": 0.10205078125, + "learning_rate": 0.001999210131593564, + "loss": 0.2002, + "step": 2268 + }, + { + "epoch": 0.019696009583250147, + "grad_norm": 0.150390625, + "learning_rate": 0.0019992088844517637, + "loss": 0.2061, + "step": 2269 + }, + { + "epoch": 0.01970469006345431, + "grad_norm": 0.142578125, + "learning_rate": 0.001999207636326602, + "loss": 0.2207, + "step": 2270 + }, + { + "epoch": 0.019713370543658477, + "grad_norm": 0.0927734375, + "learning_rate": 0.00199920638721808, + "loss": 0.2227, + "step": 2271 + }, + { + "epoch": 0.01972205102386264, + "grad_norm": 0.12451171875, + "learning_rate": 0.0019992051371262, + "loss": 0.2539, + "step": 2272 + }, + { + "epoch": 0.019730731504066804, + "grad_norm": 0.125, + "learning_rate": 0.0019992038860509624, + "loss": 0.2539, + "step": 2273 + }, + { + "epoch": 
0.01973941198427097, + "grad_norm": 0.051025390625, + "learning_rate": 0.001999202633992369, + "loss": 0.1719, + "step": 2274 + }, + { + "epoch": 0.019748092464475134, + "grad_norm": 0.09814453125, + "learning_rate": 0.0019992013809504213, + "loss": 0.2051, + "step": 2275 + }, + { + "epoch": 0.0197567729446793, + "grad_norm": 0.0859375, + "learning_rate": 0.00199920012692512, + "loss": 0.2393, + "step": 2276 + }, + { + "epoch": 0.019765453424883464, + "grad_norm": 0.0732421875, + "learning_rate": 0.001999198871916467, + "loss": 0.2148, + "step": 2277 + }, + { + "epoch": 0.01977413390508763, + "grad_norm": 0.080078125, + "learning_rate": 0.001999197615924464, + "loss": 0.2236, + "step": 2278 + }, + { + "epoch": 0.019782814385291794, + "grad_norm": 0.083984375, + "learning_rate": 0.001999196358949112, + "loss": 0.2012, + "step": 2279 + }, + { + "epoch": 0.01979149486549596, + "grad_norm": 0.10595703125, + "learning_rate": 0.0019991951009904123, + "loss": 0.2383, + "step": 2280 + }, + { + "epoch": 0.019800175345700124, + "grad_norm": 0.08544921875, + "learning_rate": 0.0019991938420483666, + "loss": 0.2207, + "step": 2281 + }, + { + "epoch": 0.01980885582590429, + "grad_norm": 0.0966796875, + "learning_rate": 0.001999192582122975, + "loss": 0.2461, + "step": 2282 + }, + { + "epoch": 0.019817536306108455, + "grad_norm": 0.056640625, + "learning_rate": 0.0019991913212142414, + "loss": 0.1719, + "step": 2283 + }, + { + "epoch": 0.01982621678631262, + "grad_norm": 0.080078125, + "learning_rate": 0.001999190059322165, + "loss": 0.2109, + "step": 2284 + }, + { + "epoch": 0.019834897266516785, + "grad_norm": 0.1357421875, + "learning_rate": 0.0019991887964467474, + "loss": 0.6211, + "step": 2285 + }, + { + "epoch": 0.01984357774672095, + "grad_norm": 0.08984375, + "learning_rate": 0.001999187532587991, + "loss": 0.2656, + "step": 2286 + }, + { + "epoch": 0.019852258226925115, + "grad_norm": 0.06494140625, + "learning_rate": 0.0019991862677458962, + "loss": 0.2461, + "step": 2287 + }, + { + "epoch": 0.019860938707129277, + "grad_norm": 0.06640625, + "learning_rate": 0.001999185001920465, + "loss": 0.1777, + "step": 2288 + }, + { + "epoch": 0.019869619187333442, + "grad_norm": 0.09228515625, + "learning_rate": 0.001999183735111699, + "loss": 0.3047, + "step": 2289 + }, + { + "epoch": 0.019878299667537607, + "grad_norm": 0.087890625, + "learning_rate": 0.0019991824673195983, + "loss": 0.2637, + "step": 2290 + }, + { + "epoch": 0.019886980147741772, + "grad_norm": 0.1015625, + "learning_rate": 0.0019991811985441655, + "loss": 0.2539, + "step": 2291 + }, + { + "epoch": 0.019895660627945937, + "grad_norm": 0.054931640625, + "learning_rate": 0.001999179928785402, + "loss": 0.1865, + "step": 2292 + }, + { + "epoch": 0.019904341108150102, + "grad_norm": 0.07373046875, + "learning_rate": 0.001999178658043309, + "loss": 0.2363, + "step": 2293 + }, + { + "epoch": 0.019913021588354268, + "grad_norm": 0.1552734375, + "learning_rate": 0.0019991773863178874, + "loss": 0.2871, + "step": 2294 + }, + { + "epoch": 0.019921702068558433, + "grad_norm": 0.0712890625, + "learning_rate": 0.0019991761136091387, + "loss": 0.2344, + "step": 2295 + }, + { + "epoch": 0.019930382548762598, + "grad_norm": 0.08642578125, + "learning_rate": 0.001999174839917065, + "loss": 0.2422, + "step": 2296 + }, + { + "epoch": 0.019939063028966763, + "grad_norm": 0.08740234375, + "learning_rate": 0.0019991735652416668, + "loss": 0.2324, + "step": 2297 + }, + { + "epoch": 0.019947743509170928, + "grad_norm": 0.0693359375, + "learning_rate": 
0.001999172289582946, + "loss": 0.25, + "step": 2298 + }, + { + "epoch": 0.019956423989375093, + "grad_norm": 0.078125, + "learning_rate": 0.001999171012940904, + "loss": 0.2422, + "step": 2299 + }, + { + "epoch": 0.01996510446957926, + "grad_norm": 0.080078125, + "learning_rate": 0.001999169735315542, + "loss": 0.1914, + "step": 2300 + }, + { + "epoch": 0.019973784949783423, + "grad_norm": 0.06494140625, + "learning_rate": 0.0019991684567068615, + "loss": 0.252, + "step": 2301 + }, + { + "epoch": 0.01998246542998759, + "grad_norm": 0.0712890625, + "learning_rate": 0.001999167177114864, + "loss": 0.1748, + "step": 2302 + }, + { + "epoch": 0.01999114591019175, + "grad_norm": 0.07568359375, + "learning_rate": 0.0019991658965395503, + "loss": 0.2441, + "step": 2303 + }, + { + "epoch": 0.019999826390395915, + "grad_norm": 0.083984375, + "learning_rate": 0.001999164614980923, + "loss": 0.25, + "step": 2304 + }, + { + "epoch": 0.02000850687060008, + "grad_norm": 0.080078125, + "learning_rate": 0.001999163332438982, + "loss": 0.1992, + "step": 2305 + }, + { + "epoch": 0.020017187350804246, + "grad_norm": 0.08544921875, + "learning_rate": 0.00199916204891373, + "loss": 0.2949, + "step": 2306 + }, + { + "epoch": 0.02002586783100841, + "grad_norm": 0.0654296875, + "learning_rate": 0.0019991607644051683, + "loss": 0.1953, + "step": 2307 + }, + { + "epoch": 0.020034548311212576, + "grad_norm": 0.189453125, + "learning_rate": 0.0019991594789132974, + "loss": 0.3125, + "step": 2308 + }, + { + "epoch": 0.02004322879141674, + "grad_norm": 0.080078125, + "learning_rate": 0.0019991581924381193, + "loss": 0.1992, + "step": 2309 + }, + { + "epoch": 0.020051909271620906, + "grad_norm": 0.08349609375, + "learning_rate": 0.001999156904979635, + "loss": 0.2217, + "step": 2310 + }, + { + "epoch": 0.02006058975182507, + "grad_norm": 0.07080078125, + "learning_rate": 0.0019991556165378462, + "loss": 0.2041, + "step": 2311 + }, + { + "epoch": 0.020069270232029236, + "grad_norm": 0.0634765625, + "learning_rate": 0.0019991543271127552, + "loss": 0.2021, + "step": 2312 + }, + { + "epoch": 0.0200779507122334, + "grad_norm": 0.0849609375, + "learning_rate": 0.0019991530367043614, + "loss": 0.2109, + "step": 2313 + }, + { + "epoch": 0.020086631192437567, + "grad_norm": 0.08251953125, + "learning_rate": 0.001999151745312668, + "loss": 0.2422, + "step": 2314 + }, + { + "epoch": 0.02009531167264173, + "grad_norm": 0.068359375, + "learning_rate": 0.0019991504529376755, + "loss": 0.1924, + "step": 2315 + }, + { + "epoch": 0.020103992152845897, + "grad_norm": 0.076171875, + "learning_rate": 0.0019991491595793857, + "loss": 0.2559, + "step": 2316 + }, + { + "epoch": 0.020112672633050062, + "grad_norm": 0.07568359375, + "learning_rate": 0.0019991478652378, + "loss": 0.2119, + "step": 2317 + }, + { + "epoch": 0.020121353113254224, + "grad_norm": 0.0869140625, + "learning_rate": 0.0019991465699129196, + "loss": 0.2344, + "step": 2318 + }, + { + "epoch": 0.02013003359345839, + "grad_norm": 0.1083984375, + "learning_rate": 0.001999145273604746, + "loss": 0.2109, + "step": 2319 + }, + { + "epoch": 0.020138714073662554, + "grad_norm": 0.06689453125, + "learning_rate": 0.0019991439763132808, + "loss": 0.1826, + "step": 2320 + }, + { + "epoch": 0.02014739455386672, + "grad_norm": 0.0810546875, + "learning_rate": 0.001999142678038525, + "loss": 0.2305, + "step": 2321 + }, + { + "epoch": 0.020156075034070884, + "grad_norm": 0.08984375, + "learning_rate": 0.001999141378780481, + "loss": 0.209, + "step": 2322 + }, + { + "epoch": 
0.02016475551427505, + "grad_norm": 0.0810546875, + "learning_rate": 0.0019991400785391487, + "loss": 0.2598, + "step": 2323 + }, + { + "epoch": 0.020173435994479214, + "grad_norm": 0.09619140625, + "learning_rate": 0.0019991387773145306, + "loss": 0.2148, + "step": 2324 + }, + { + "epoch": 0.02018211647468338, + "grad_norm": 0.07763671875, + "learning_rate": 0.001999137475106627, + "loss": 0.2188, + "step": 2325 + }, + { + "epoch": 0.020190796954887544, + "grad_norm": 0.08203125, + "learning_rate": 0.0019991361719154414, + "loss": 0.2461, + "step": 2326 + }, + { + "epoch": 0.02019947743509171, + "grad_norm": 0.06298828125, + "learning_rate": 0.0019991348677409733, + "loss": 0.1836, + "step": 2327 + }, + { + "epoch": 0.020208157915295875, + "grad_norm": 0.12109375, + "learning_rate": 0.0019991335625832254, + "loss": 0.252, + "step": 2328 + }, + { + "epoch": 0.02021683839550004, + "grad_norm": 0.052490234375, + "learning_rate": 0.001999132256442198, + "loss": 0.1572, + "step": 2329 + }, + { + "epoch": 0.020225518875704205, + "grad_norm": 0.09228515625, + "learning_rate": 0.001999130949317893, + "loss": 0.2188, + "step": 2330 + }, + { + "epoch": 0.02023419935590837, + "grad_norm": 0.09423828125, + "learning_rate": 0.0019991296412103124, + "loss": 0.2363, + "step": 2331 + }, + { + "epoch": 0.020242879836112535, + "grad_norm": 0.0703125, + "learning_rate": 0.0019991283321194567, + "loss": 0.2119, + "step": 2332 + }, + { + "epoch": 0.0202515603163167, + "grad_norm": 0.0751953125, + "learning_rate": 0.001999127022045328, + "loss": 0.1855, + "step": 2333 + }, + { + "epoch": 0.020260240796520862, + "grad_norm": 0.0693359375, + "learning_rate": 0.001999125710987928, + "loss": 0.2188, + "step": 2334 + }, + { + "epoch": 0.020268921276725027, + "grad_norm": 0.099609375, + "learning_rate": 0.0019991243989472564, + "loss": 0.2559, + "step": 2335 + }, + { + "epoch": 0.020277601756929192, + "grad_norm": 0.068359375, + "learning_rate": 0.001999123085923317, + "loss": 0.1797, + "step": 2336 + }, + { + "epoch": 0.020286282237133357, + "grad_norm": 0.09814453125, + "learning_rate": 0.001999121771916109, + "loss": 0.2402, + "step": 2337 + }, + { + "epoch": 0.020294962717337522, + "grad_norm": 0.1005859375, + "learning_rate": 0.001999120456925636, + "loss": 0.2617, + "step": 2338 + }, + { + "epoch": 0.020303643197541688, + "grad_norm": 0.09130859375, + "learning_rate": 0.001999119140951898, + "loss": 0.252, + "step": 2339 + }, + { + "epoch": 0.020312323677745853, + "grad_norm": 0.09521484375, + "learning_rate": 0.001999117823994897, + "loss": 0.2207, + "step": 2340 + }, + { + "epoch": 0.020321004157950018, + "grad_norm": 0.0859375, + "learning_rate": 0.0019991165060546335, + "loss": 0.2539, + "step": 2341 + }, + { + "epoch": 0.020329684638154183, + "grad_norm": 0.0771484375, + "learning_rate": 0.0019991151871311106, + "loss": 0.1953, + "step": 2342 + }, + { + "epoch": 0.020338365118358348, + "grad_norm": 0.060546875, + "learning_rate": 0.0019991138672243282, + "loss": 0.165, + "step": 2343 + }, + { + "epoch": 0.020347045598562513, + "grad_norm": 0.08447265625, + "learning_rate": 0.001999112546334289, + "loss": 0.2002, + "step": 2344 + }, + { + "epoch": 0.02035572607876668, + "grad_norm": 0.0849609375, + "learning_rate": 0.0019991112244609932, + "loss": 0.252, + "step": 2345 + }, + { + "epoch": 0.020364406558970843, + "grad_norm": 0.07861328125, + "learning_rate": 0.0019991099016044432, + "loss": 0.2441, + "step": 2346 + }, + { + "epoch": 0.02037308703917501, + "grad_norm": 0.0869140625, + "learning_rate": 
0.0019991085777646403, + "loss": 0.2188, + "step": 2347 + }, + { + "epoch": 0.020381767519379174, + "grad_norm": 0.09130859375, + "learning_rate": 0.0019991072529415856, + "loss": 0.2236, + "step": 2348 + }, + { + "epoch": 0.020390447999583335, + "grad_norm": 0.09765625, + "learning_rate": 0.0019991059271352806, + "loss": 0.3047, + "step": 2349 + }, + { + "epoch": 0.0203991284797875, + "grad_norm": 0.0810546875, + "learning_rate": 0.001999104600345727, + "loss": 0.248, + "step": 2350 + }, + { + "epoch": 0.020407808959991666, + "grad_norm": 0.09130859375, + "learning_rate": 0.001999103272572926, + "loss": 0.2344, + "step": 2351 + }, + { + "epoch": 0.02041648944019583, + "grad_norm": 0.0634765625, + "learning_rate": 0.0019991019438168793, + "loss": 0.1533, + "step": 2352 + }, + { + "epoch": 0.020425169920399996, + "grad_norm": 0.1015625, + "learning_rate": 0.001999100614077588, + "loss": 0.252, + "step": 2353 + }, + { + "epoch": 0.02043385040060416, + "grad_norm": 0.09521484375, + "learning_rate": 0.0019990992833550537, + "loss": 0.2246, + "step": 2354 + }, + { + "epoch": 0.020442530880808326, + "grad_norm": 0.1416015625, + "learning_rate": 0.001999097951649278, + "loss": 0.2539, + "step": 2355 + }, + { + "epoch": 0.02045121136101249, + "grad_norm": 0.06689453125, + "learning_rate": 0.001999096618960263, + "loss": 0.1904, + "step": 2356 + }, + { + "epoch": 0.020459891841216656, + "grad_norm": 0.09130859375, + "learning_rate": 0.0019990952852880087, + "loss": 0.2373, + "step": 2357 + }, + { + "epoch": 0.02046857232142082, + "grad_norm": 0.08740234375, + "learning_rate": 0.0019990939506325175, + "loss": 0.2617, + "step": 2358 + }, + { + "epoch": 0.020477252801624986, + "grad_norm": 0.06787109375, + "learning_rate": 0.0019990926149937908, + "loss": 0.1689, + "step": 2359 + }, + { + "epoch": 0.02048593328182915, + "grad_norm": 0.1044921875, + "learning_rate": 0.0019990912783718296, + "loss": 0.2246, + "step": 2360 + }, + { + "epoch": 0.020494613762033317, + "grad_norm": 0.07568359375, + "learning_rate": 0.001999089940766636, + "loss": 0.2129, + "step": 2361 + }, + { + "epoch": 0.020503294242237482, + "grad_norm": 0.06298828125, + "learning_rate": 0.001999088602178211, + "loss": 0.1787, + "step": 2362 + }, + { + "epoch": 0.020511974722441647, + "grad_norm": 0.095703125, + "learning_rate": 0.001999087262606556, + "loss": 0.2598, + "step": 2363 + }, + { + "epoch": 0.020520655202645812, + "grad_norm": 0.057373046875, + "learning_rate": 0.001999085922051673, + "loss": 0.1963, + "step": 2364 + }, + { + "epoch": 0.020529335682849974, + "grad_norm": 0.06103515625, + "learning_rate": 0.0019990845805135634, + "loss": 0.2051, + "step": 2365 + }, + { + "epoch": 0.02053801616305414, + "grad_norm": 0.08984375, + "learning_rate": 0.001999083237992228, + "loss": 0.25, + "step": 2366 + }, + { + "epoch": 0.020546696643258304, + "grad_norm": 0.07275390625, + "learning_rate": 0.0019990818944876686, + "loss": 0.2402, + "step": 2367 + }, + { + "epoch": 0.02055537712346247, + "grad_norm": 0.09619140625, + "learning_rate": 0.0019990805499998866, + "loss": 0.2227, + "step": 2368 + }, + { + "epoch": 0.020564057603666634, + "grad_norm": 0.09228515625, + "learning_rate": 0.0019990792045288842, + "loss": 0.2383, + "step": 2369 + }, + { + "epoch": 0.0205727380838708, + "grad_norm": 0.072265625, + "learning_rate": 0.001999077858074662, + "loss": 0.2148, + "step": 2370 + }, + { + "epoch": 0.020581418564074964, + "grad_norm": 0.078125, + "learning_rate": 0.001999076510637222, + "loss": 0.2266, + "step": 2371 + }, + { + "epoch": 
0.02059009904427913, + "grad_norm": 0.1005859375, + "learning_rate": 0.001999075162216565, + "loss": 0.2598, + "step": 2372 + }, + { + "epoch": 0.020598779524483295, + "grad_norm": 0.083984375, + "learning_rate": 0.001999073812812693, + "loss": 0.2148, + "step": 2373 + }, + { + "epoch": 0.02060746000468746, + "grad_norm": 0.09033203125, + "learning_rate": 0.001999072462425608, + "loss": 0.2422, + "step": 2374 + }, + { + "epoch": 0.020616140484891625, + "grad_norm": 0.11083984375, + "learning_rate": 0.00199907111105531, + "loss": 0.2402, + "step": 2375 + }, + { + "epoch": 0.02062482096509579, + "grad_norm": 0.0849609375, + "learning_rate": 0.0019990697587018015, + "loss": 0.2285, + "step": 2376 + }, + { + "epoch": 0.020633501445299955, + "grad_norm": 0.080078125, + "learning_rate": 0.001999068405365084, + "loss": 0.2041, + "step": 2377 + }, + { + "epoch": 0.02064218192550412, + "grad_norm": 0.09619140625, + "learning_rate": 0.001999067051045159, + "loss": 0.2227, + "step": 2378 + }, + { + "epoch": 0.020650862405708285, + "grad_norm": 0.06689453125, + "learning_rate": 0.0019990656957420277, + "loss": 0.1934, + "step": 2379 + }, + { + "epoch": 0.020659542885912447, + "grad_norm": 0.08056640625, + "learning_rate": 0.001999064339455692, + "loss": 0.2324, + "step": 2380 + }, + { + "epoch": 0.020668223366116612, + "grad_norm": 0.08056640625, + "learning_rate": 0.0019990629821861525, + "loss": 0.2061, + "step": 2381 + }, + { + "epoch": 0.020676903846320777, + "grad_norm": 0.0576171875, + "learning_rate": 0.0019990616239334113, + "loss": 0.1826, + "step": 2382 + }, + { + "epoch": 0.020685584326524942, + "grad_norm": 0.083984375, + "learning_rate": 0.0019990602646974697, + "loss": 0.2363, + "step": 2383 + }, + { + "epoch": 0.020694264806729108, + "grad_norm": 0.08056640625, + "learning_rate": 0.0019990589044783296, + "loss": 0.2305, + "step": 2384 + }, + { + "epoch": 0.020702945286933273, + "grad_norm": 0.099609375, + "learning_rate": 0.001999057543275992, + "loss": 0.2617, + "step": 2385 + }, + { + "epoch": 0.020711625767137438, + "grad_norm": 0.0703125, + "learning_rate": 0.0019990561810904585, + "loss": 0.21, + "step": 2386 + }, + { + "epoch": 0.020720306247341603, + "grad_norm": 0.08642578125, + "learning_rate": 0.001999054817921731, + "loss": 0.2354, + "step": 2387 + }, + { + "epoch": 0.020728986727545768, + "grad_norm": 0.10205078125, + "learning_rate": 0.0019990534537698106, + "loss": 0.2393, + "step": 2388 + }, + { + "epoch": 0.020737667207749933, + "grad_norm": 0.0791015625, + "learning_rate": 0.0019990520886346984, + "loss": 0.2344, + "step": 2389 + }, + { + "epoch": 0.020746347687954098, + "grad_norm": 0.09130859375, + "learning_rate": 0.001999050722516397, + "loss": 0.2266, + "step": 2390 + }, + { + "epoch": 0.020755028168158263, + "grad_norm": 0.08642578125, + "learning_rate": 0.001999049355414907, + "loss": 0.2617, + "step": 2391 + }, + { + "epoch": 0.02076370864836243, + "grad_norm": 0.09375, + "learning_rate": 0.00199904798733023, + "loss": 0.2197, + "step": 2392 + }, + { + "epoch": 0.020772389128566594, + "grad_norm": 0.08056640625, + "learning_rate": 0.0019990466182623675, + "loss": 0.2539, + "step": 2393 + }, + { + "epoch": 0.02078106960877076, + "grad_norm": 0.142578125, + "learning_rate": 0.0019990452482113214, + "loss": 0.3516, + "step": 2394 + }, + { + "epoch": 0.02078975008897492, + "grad_norm": 0.0751953125, + "learning_rate": 0.0019990438771770925, + "loss": 0.2188, + "step": 2395 + }, + { + "epoch": 0.020798430569179086, + "grad_norm": 0.10302734375, + "learning_rate": 
0.001999042505159683, + "loss": 0.2656, + "step": 2396 + }, + { + "epoch": 0.02080711104938325, + "grad_norm": 0.12158203125, + "learning_rate": 0.0019990411321590944, + "loss": 0.25, + "step": 2397 + }, + { + "epoch": 0.020815791529587416, + "grad_norm": 0.0791015625, + "learning_rate": 0.001999039758175328, + "loss": 0.1895, + "step": 2398 + }, + { + "epoch": 0.02082447200979158, + "grad_norm": 0.07373046875, + "learning_rate": 0.0019990383832083846, + "loss": 0.2148, + "step": 2399 + }, + { + "epoch": 0.020833152489995746, + "grad_norm": 0.455078125, + "learning_rate": 0.001999037007258267, + "loss": 0.3008, + "step": 2400 + }, + { + "epoch": 0.02084183297019991, + "grad_norm": 0.06982421875, + "learning_rate": 0.0019990356303249755, + "loss": 0.1992, + "step": 2401 + }, + { + "epoch": 0.020850513450404076, + "grad_norm": 0.08740234375, + "learning_rate": 0.0019990342524085123, + "loss": 0.2246, + "step": 2402 + }, + { + "epoch": 0.02085919393060824, + "grad_norm": 0.08154296875, + "learning_rate": 0.001999032873508879, + "loss": 0.1934, + "step": 2403 + }, + { + "epoch": 0.020867874410812406, + "grad_norm": 0.1611328125, + "learning_rate": 0.0019990314936260767, + "loss": 0.5469, + "step": 2404 + }, + { + "epoch": 0.02087655489101657, + "grad_norm": 0.09521484375, + "learning_rate": 0.0019990301127601065, + "loss": 0.2227, + "step": 2405 + }, + { + "epoch": 0.020885235371220737, + "grad_norm": 0.0810546875, + "learning_rate": 0.001999028730910971, + "loss": 0.208, + "step": 2406 + }, + { + "epoch": 0.020893915851424902, + "grad_norm": 0.095703125, + "learning_rate": 0.0019990273480786714, + "loss": 0.2344, + "step": 2407 + }, + { + "epoch": 0.020902596331629067, + "grad_norm": 0.072265625, + "learning_rate": 0.0019990259642632085, + "loss": 0.1797, + "step": 2408 + }, + { + "epoch": 0.020911276811833232, + "grad_norm": 0.07958984375, + "learning_rate": 0.0019990245794645847, + "loss": 0.2227, + "step": 2409 + }, + { + "epoch": 0.020919957292037397, + "grad_norm": 0.058349609375, + "learning_rate": 0.001999023193682801, + "loss": 0.1953, + "step": 2410 + }, + { + "epoch": 0.02092863777224156, + "grad_norm": 0.09326171875, + "learning_rate": 0.001999021806917859, + "loss": 0.1855, + "step": 2411 + }, + { + "epoch": 0.020937318252445724, + "grad_norm": 0.064453125, + "learning_rate": 0.0019990204191697603, + "loss": 0.209, + "step": 2412 + }, + { + "epoch": 0.02094599873264989, + "grad_norm": 0.10595703125, + "learning_rate": 0.0019990190304385066, + "loss": 0.2246, + "step": 2413 + }, + { + "epoch": 0.020954679212854054, + "grad_norm": 0.0908203125, + "learning_rate": 0.001999017640724099, + "loss": 0.2695, + "step": 2414 + }, + { + "epoch": 0.02096335969305822, + "grad_norm": 0.0751953125, + "learning_rate": 0.001999016250026539, + "loss": 0.2148, + "step": 2415 + }, + { + "epoch": 0.020972040173262384, + "grad_norm": 0.09716796875, + "learning_rate": 0.001999014858345829, + "loss": 0.291, + "step": 2416 + }, + { + "epoch": 0.02098072065346655, + "grad_norm": 0.10009765625, + "learning_rate": 0.001999013465681969, + "loss": 0.248, + "step": 2417 + }, + { + "epoch": 0.020989401133670715, + "grad_norm": 0.0849609375, + "learning_rate": 0.001999012072034962, + "loss": 0.2344, + "step": 2418 + }, + { + "epoch": 0.02099808161387488, + "grad_norm": 0.0732421875, + "learning_rate": 0.0019990106774048087, + "loss": 0.1865, + "step": 2419 + }, + { + "epoch": 0.021006762094079045, + "grad_norm": 0.064453125, + "learning_rate": 0.0019990092817915106, + "loss": 0.2285, + "step": 2420 + }, + { + 
"epoch": 0.02101544257428321, + "grad_norm": 0.08349609375, + "learning_rate": 0.0019990078851950697, + "loss": 0.207, + "step": 2421 + }, + { + "epoch": 0.021024123054487375, + "grad_norm": 0.0732421875, + "learning_rate": 0.0019990064876154873, + "loss": 0.2168, + "step": 2422 + }, + { + "epoch": 0.02103280353469154, + "grad_norm": 0.08740234375, + "learning_rate": 0.001999005089052765, + "loss": 0.2773, + "step": 2423 + }, + { + "epoch": 0.021041484014895705, + "grad_norm": 0.07177734375, + "learning_rate": 0.0019990036895069043, + "loss": 0.2305, + "step": 2424 + }, + { + "epoch": 0.02105016449509987, + "grad_norm": 0.056640625, + "learning_rate": 0.0019990022889779064, + "loss": 0.1992, + "step": 2425 + }, + { + "epoch": 0.021058844975304032, + "grad_norm": 0.078125, + "learning_rate": 0.001999000887465773, + "loss": 0.2676, + "step": 2426 + }, + { + "epoch": 0.021067525455508197, + "grad_norm": 0.08544921875, + "learning_rate": 0.001998999484970506, + "loss": 0.252, + "step": 2427 + }, + { + "epoch": 0.021076205935712362, + "grad_norm": 0.06982421875, + "learning_rate": 0.0019989980814921066, + "loss": 0.1914, + "step": 2428 + }, + { + "epoch": 0.021084886415916528, + "grad_norm": 0.0478515625, + "learning_rate": 0.0019989966770305764, + "loss": 0.168, + "step": 2429 + }, + { + "epoch": 0.021093566896120693, + "grad_norm": 0.2080078125, + "learning_rate": 0.001998995271585917, + "loss": 0.3555, + "step": 2430 + }, + { + "epoch": 0.021102247376324858, + "grad_norm": 0.0791015625, + "learning_rate": 0.0019989938651581297, + "loss": 0.2266, + "step": 2431 + }, + { + "epoch": 0.021110927856529023, + "grad_norm": 0.06787109375, + "learning_rate": 0.001998992457747216, + "loss": 0.2041, + "step": 2432 + }, + { + "epoch": 0.021119608336733188, + "grad_norm": 0.0693359375, + "learning_rate": 0.0019989910493531785, + "loss": 0.1973, + "step": 2433 + }, + { + "epoch": 0.021128288816937353, + "grad_norm": 0.171875, + "learning_rate": 0.001998989639976017, + "loss": 0.3125, + "step": 2434 + }, + { + "epoch": 0.021136969297141518, + "grad_norm": 0.083984375, + "learning_rate": 0.0019989882296157346, + "loss": 0.2207, + "step": 2435 + }, + { + "epoch": 0.021145649777345683, + "grad_norm": 0.06201171875, + "learning_rate": 0.0019989868182723317, + "loss": 0.1973, + "step": 2436 + }, + { + "epoch": 0.02115433025754985, + "grad_norm": 0.1591796875, + "learning_rate": 0.001998985405945811, + "loss": 0.2695, + "step": 2437 + }, + { + "epoch": 0.021163010737754014, + "grad_norm": 0.060546875, + "learning_rate": 0.0019989839926361727, + "loss": 0.1904, + "step": 2438 + }, + { + "epoch": 0.02117169121795818, + "grad_norm": 0.08740234375, + "learning_rate": 0.001998982578343419, + "loss": 0.2012, + "step": 2439 + }, + { + "epoch": 0.021180371698162344, + "grad_norm": 0.0751953125, + "learning_rate": 0.001998981163067552, + "loss": 0.2188, + "step": 2440 + }, + { + "epoch": 0.02118905217836651, + "grad_norm": 0.07568359375, + "learning_rate": 0.0019989797468085726, + "loss": 0.2441, + "step": 2441 + }, + { + "epoch": 0.02119773265857067, + "grad_norm": 0.0791015625, + "learning_rate": 0.001998978329566482, + "loss": 0.2324, + "step": 2442 + }, + { + "epoch": 0.021206413138774836, + "grad_norm": 0.0732421875, + "learning_rate": 0.001998976911341282, + "loss": 0.2285, + "step": 2443 + }, + { + "epoch": 0.021215093618979, + "grad_norm": 0.061279296875, + "learning_rate": 0.001998975492132975, + "loss": 0.208, + "step": 2444 + }, + { + "epoch": 0.021223774099183166, + "grad_norm": 0.0693359375, + 
"learning_rate": 0.001998974071941562, + "loss": 0.1875, + "step": 2445 + }, + { + "epoch": 0.02123245457938733, + "grad_norm": 0.078125, + "learning_rate": 0.001998972650767044, + "loss": 0.2266, + "step": 2446 + }, + { + "epoch": 0.021241135059591496, + "grad_norm": 0.09716796875, + "learning_rate": 0.001998971228609423, + "loss": 0.2617, + "step": 2447 + }, + { + "epoch": 0.02124981553979566, + "grad_norm": 0.07958984375, + "learning_rate": 0.001998969805468701, + "loss": 0.2012, + "step": 2448 + }, + { + "epoch": 0.021258496019999826, + "grad_norm": 0.0703125, + "learning_rate": 0.0019989683813448787, + "loss": 0.2373, + "step": 2449 + }, + { + "epoch": 0.02126717650020399, + "grad_norm": 0.08056640625, + "learning_rate": 0.001998966956237958, + "loss": 0.2637, + "step": 2450 + }, + { + "epoch": 0.021275856980408157, + "grad_norm": 0.125, + "learning_rate": 0.0019989655301479413, + "loss": 0.3086, + "step": 2451 + }, + { + "epoch": 0.021284537460612322, + "grad_norm": 0.07861328125, + "learning_rate": 0.001998964103074829, + "loss": 0.2002, + "step": 2452 + }, + { + "epoch": 0.021293217940816487, + "grad_norm": 0.10205078125, + "learning_rate": 0.0019989626750186226, + "loss": 0.2178, + "step": 2453 + }, + { + "epoch": 0.021301898421020652, + "grad_norm": 0.09912109375, + "learning_rate": 0.0019989612459793246, + "loss": 0.2441, + "step": 2454 + }, + { + "epoch": 0.021310578901224817, + "grad_norm": 0.0751953125, + "learning_rate": 0.001998959815956936, + "loss": 0.209, + "step": 2455 + }, + { + "epoch": 0.021319259381428982, + "grad_norm": 0.07421875, + "learning_rate": 0.001998958384951458, + "loss": 0.2334, + "step": 2456 + }, + { + "epoch": 0.021327939861633144, + "grad_norm": 0.072265625, + "learning_rate": 0.0019989569529628936, + "loss": 0.2012, + "step": 2457 + }, + { + "epoch": 0.02133662034183731, + "grad_norm": 0.09521484375, + "learning_rate": 0.001998955519991243, + "loss": 0.2227, + "step": 2458 + }, + { + "epoch": 0.021345300822041474, + "grad_norm": 0.058837890625, + "learning_rate": 0.001998954086036508, + "loss": 0.1768, + "step": 2459 + }, + { + "epoch": 0.02135398130224564, + "grad_norm": 0.099609375, + "learning_rate": 0.00199895265109869, + "loss": 0.2168, + "step": 2460 + }, + { + "epoch": 0.021362661782449804, + "grad_norm": 0.0966796875, + "learning_rate": 0.001998951215177791, + "loss": 0.2227, + "step": 2461 + }, + { + "epoch": 0.02137134226265397, + "grad_norm": 0.10986328125, + "learning_rate": 0.001998949778273813, + "loss": 0.2363, + "step": 2462 + }, + { + "epoch": 0.021380022742858135, + "grad_norm": 0.09814453125, + "learning_rate": 0.001998948340386757, + "loss": 0.2285, + "step": 2463 + }, + { + "epoch": 0.0213887032230623, + "grad_norm": 0.0654296875, + "learning_rate": 0.001998946901516624, + "loss": 0.1904, + "step": 2464 + }, + { + "epoch": 0.021397383703266465, + "grad_norm": 0.09423828125, + "learning_rate": 0.001998945461663416, + "loss": 0.2402, + "step": 2465 + }, + { + "epoch": 0.02140606418347063, + "grad_norm": 0.08056640625, + "learning_rate": 0.0019989440208271355, + "loss": 0.2021, + "step": 2466 + }, + { + "epoch": 0.021414744663674795, + "grad_norm": 0.08056640625, + "learning_rate": 0.001998942579007783, + "loss": 0.1895, + "step": 2467 + }, + { + "epoch": 0.02142342514387896, + "grad_norm": 0.0771484375, + "learning_rate": 0.0019989411362053605, + "loss": 0.1699, + "step": 2468 + }, + { + "epoch": 0.021432105624083125, + "grad_norm": 0.09716796875, + "learning_rate": 0.0019989396924198696, + "loss": 0.208, + "step": 2469 + }, + { + 
"epoch": 0.02144078610428729, + "grad_norm": 0.0751953125, + "learning_rate": 0.0019989382476513114, + "loss": 0.2051, + "step": 2470 + }, + { + "epoch": 0.021449466584491456, + "grad_norm": 0.1181640625, + "learning_rate": 0.001998936801899688, + "loss": 0.248, + "step": 2471 + }, + { + "epoch": 0.021458147064695617, + "grad_norm": 0.07861328125, + "learning_rate": 0.001998935355165001, + "loss": 0.1963, + "step": 2472 + }, + { + "epoch": 0.021466827544899782, + "grad_norm": 0.05859375, + "learning_rate": 0.0019989339074472515, + "loss": 0.1738, + "step": 2473 + }, + { + "epoch": 0.021475508025103947, + "grad_norm": 0.06005859375, + "learning_rate": 0.0019989324587464416, + "loss": 0.1709, + "step": 2474 + }, + { + "epoch": 0.021484188505308113, + "grad_norm": 0.057373046875, + "learning_rate": 0.0019989310090625725, + "loss": 0.1768, + "step": 2475 + }, + { + "epoch": 0.021492868985512278, + "grad_norm": 0.1171875, + "learning_rate": 0.001998929558395646, + "loss": 0.252, + "step": 2476 + }, + { + "epoch": 0.021501549465716443, + "grad_norm": 0.07763671875, + "learning_rate": 0.0019989281067456636, + "loss": 0.2109, + "step": 2477 + }, + { + "epoch": 0.021510229945920608, + "grad_norm": 0.109375, + "learning_rate": 0.001998926654112627, + "loss": 0.2852, + "step": 2478 + }, + { + "epoch": 0.021518910426124773, + "grad_norm": 0.0947265625, + "learning_rate": 0.001998925200496538, + "loss": 0.2227, + "step": 2479 + }, + { + "epoch": 0.021527590906328938, + "grad_norm": 0.1142578125, + "learning_rate": 0.0019989237458973978, + "loss": 0.2383, + "step": 2480 + }, + { + "epoch": 0.021536271386533103, + "grad_norm": 0.06884765625, + "learning_rate": 0.0019989222903152074, + "loss": 0.2051, + "step": 2481 + }, + { + "epoch": 0.02154495186673727, + "grad_norm": 0.0908203125, + "learning_rate": 0.00199892083374997, + "loss": 0.2266, + "step": 2482 + }, + { + "epoch": 0.021553632346941434, + "grad_norm": 0.06884765625, + "learning_rate": 0.001998919376201686, + "loss": 0.2275, + "step": 2483 + }, + { + "epoch": 0.0215623128271456, + "grad_norm": 0.09814453125, + "learning_rate": 0.001998917917670357, + "loss": 0.2188, + "step": 2484 + }, + { + "epoch": 0.021570993307349764, + "grad_norm": 0.07568359375, + "learning_rate": 0.0019989164581559853, + "loss": 0.2793, + "step": 2485 + }, + { + "epoch": 0.02157967378755393, + "grad_norm": 0.06884765625, + "learning_rate": 0.0019989149976585717, + "loss": 0.2168, + "step": 2486 + }, + { + "epoch": 0.021588354267758094, + "grad_norm": 0.06396484375, + "learning_rate": 0.001998913536178118, + "loss": 0.1738, + "step": 2487 + }, + { + "epoch": 0.021597034747962256, + "grad_norm": 0.08837890625, + "learning_rate": 0.0019989120737146267, + "loss": 0.2461, + "step": 2488 + }, + { + "epoch": 0.02160571522816642, + "grad_norm": 0.0888671875, + "learning_rate": 0.0019989106102680982, + "loss": 0.2715, + "step": 2489 + }, + { + "epoch": 0.021614395708370586, + "grad_norm": 0.1015625, + "learning_rate": 0.0019989091458385345, + "loss": 0.2617, + "step": 2490 + }, + { + "epoch": 0.02162307618857475, + "grad_norm": 0.060546875, + "learning_rate": 0.0019989076804259372, + "loss": 0.1895, + "step": 2491 + }, + { + "epoch": 0.021631756668778916, + "grad_norm": 0.08544921875, + "learning_rate": 0.001998906214030308, + "loss": 0.2344, + "step": 2492 + }, + { + "epoch": 0.02164043714898308, + "grad_norm": 0.06884765625, + "learning_rate": 0.0019989047466516486, + "loss": 0.1641, + "step": 2493 + }, + { + "epoch": 0.021649117629187246, + "grad_norm": 0.0673828125, + 
"learning_rate": 0.0019989032782899607, + "loss": 0.2266, + "step": 2494 + }, + { + "epoch": 0.02165779810939141, + "grad_norm": 0.08544921875, + "learning_rate": 0.0019989018089452454, + "loss": 0.2441, + "step": 2495 + }, + { + "epoch": 0.021666478589595577, + "grad_norm": 0.080078125, + "learning_rate": 0.0019989003386175043, + "loss": 0.1973, + "step": 2496 + }, + { + "epoch": 0.021675159069799742, + "grad_norm": 0.07421875, + "learning_rate": 0.00199889886730674, + "loss": 0.2148, + "step": 2497 + }, + { + "epoch": 0.021683839550003907, + "grad_norm": 0.0908203125, + "learning_rate": 0.001998897395012953, + "loss": 0.248, + "step": 2498 + }, + { + "epoch": 0.021692520030208072, + "grad_norm": 0.08203125, + "learning_rate": 0.0019988959217361454, + "loss": 0.2578, + "step": 2499 + }, + { + "epoch": 0.021701200510412237, + "grad_norm": 0.08984375, + "learning_rate": 0.0019988944474763188, + "loss": 0.2422, + "step": 2500 + }, + { + "epoch": 0.021709880990616402, + "grad_norm": 0.060546875, + "learning_rate": 0.0019988929722334742, + "loss": 0.1768, + "step": 2501 + }, + { + "epoch": 0.021718561470820567, + "grad_norm": 0.06982421875, + "learning_rate": 0.0019988914960076144, + "loss": 0.2207, + "step": 2502 + }, + { + "epoch": 0.02172724195102473, + "grad_norm": 0.080078125, + "learning_rate": 0.00199889001879874, + "loss": 0.2021, + "step": 2503 + }, + { + "epoch": 0.021735922431228894, + "grad_norm": 0.083984375, + "learning_rate": 0.0019988885406068534, + "loss": 0.2197, + "step": 2504 + }, + { + "epoch": 0.02174460291143306, + "grad_norm": 0.08349609375, + "learning_rate": 0.0019988870614319554, + "loss": 0.2148, + "step": 2505 + }, + { + "epoch": 0.021753283391637224, + "grad_norm": 0.076171875, + "learning_rate": 0.001998885581274048, + "loss": 0.2227, + "step": 2506 + }, + { + "epoch": 0.02176196387184139, + "grad_norm": 0.1572265625, + "learning_rate": 0.001998884100133133, + "loss": 0.2471, + "step": 2507 + }, + { + "epoch": 0.021770644352045555, + "grad_norm": 0.08935546875, + "learning_rate": 0.0019988826180092115, + "loss": 0.2031, + "step": 2508 + }, + { + "epoch": 0.02177932483224972, + "grad_norm": 0.0849609375, + "learning_rate": 0.001998881134902286, + "loss": 0.2197, + "step": 2509 + }, + { + "epoch": 0.021788005312453885, + "grad_norm": 0.09375, + "learning_rate": 0.0019988796508123574, + "loss": 0.1982, + "step": 2510 + }, + { + "epoch": 0.02179668579265805, + "grad_norm": 0.0908203125, + "learning_rate": 0.0019988781657394273, + "loss": 0.2402, + "step": 2511 + }, + { + "epoch": 0.021805366272862215, + "grad_norm": 0.09619140625, + "learning_rate": 0.001998876679683498, + "loss": 0.2246, + "step": 2512 + }, + { + "epoch": 0.02181404675306638, + "grad_norm": 0.07666015625, + "learning_rate": 0.0019988751926445702, + "loss": 0.2207, + "step": 2513 + }, + { + "epoch": 0.021822727233270545, + "grad_norm": 0.091796875, + "learning_rate": 0.0019988737046226462, + "loss": 0.2383, + "step": 2514 + }, + { + "epoch": 0.02183140771347471, + "grad_norm": 0.08154296875, + "learning_rate": 0.0019988722156177273, + "loss": 0.2305, + "step": 2515 + }, + { + "epoch": 0.021840088193678876, + "grad_norm": 0.08740234375, + "learning_rate": 0.0019988707256298156, + "loss": 0.2383, + "step": 2516 + }, + { + "epoch": 0.02184876867388304, + "grad_norm": 0.07421875, + "learning_rate": 0.001998869234658912, + "loss": 0.207, + "step": 2517 + }, + { + "epoch": 0.021857449154087206, + "grad_norm": 0.07421875, + "learning_rate": 0.0019988677427050187, + "loss": 0.1738, + "step": 2518 + }, + { + 
"epoch": 0.021866129634291367, + "grad_norm": 0.0869140625, + "learning_rate": 0.0019988662497681373, + "loss": 0.2002, + "step": 2519 + }, + { + "epoch": 0.021874810114495533, + "grad_norm": 0.07568359375, + "learning_rate": 0.001998864755848269, + "loss": 0.1963, + "step": 2520 + }, + { + "epoch": 0.021883490594699698, + "grad_norm": 0.068359375, + "learning_rate": 0.001998863260945416, + "loss": 0.2061, + "step": 2521 + }, + { + "epoch": 0.021892171074903863, + "grad_norm": 0.07958984375, + "learning_rate": 0.0019988617650595793, + "loss": 0.208, + "step": 2522 + }, + { + "epoch": 0.021900851555108028, + "grad_norm": 0.09423828125, + "learning_rate": 0.001998860268190761, + "loss": 0.2334, + "step": 2523 + }, + { + "epoch": 0.021909532035312193, + "grad_norm": 0.08837890625, + "learning_rate": 0.0019988587703389624, + "loss": 0.1836, + "step": 2524 + }, + { + "epoch": 0.021918212515516358, + "grad_norm": 0.0712890625, + "learning_rate": 0.0019988572715041856, + "loss": 0.1738, + "step": 2525 + }, + { + "epoch": 0.021926892995720523, + "grad_norm": 0.06884765625, + "learning_rate": 0.001998855771686432, + "loss": 0.1689, + "step": 2526 + }, + { + "epoch": 0.02193557347592469, + "grad_norm": 0.07421875, + "learning_rate": 0.001998854270885703, + "loss": 0.2305, + "step": 2527 + }, + { + "epoch": 0.021944253956128854, + "grad_norm": 0.07763671875, + "learning_rate": 0.001998852769102001, + "loss": 0.2363, + "step": 2528 + }, + { + "epoch": 0.02195293443633302, + "grad_norm": 0.08935546875, + "learning_rate": 0.0019988512663353265, + "loss": 0.2188, + "step": 2529 + }, + { + "epoch": 0.021961614916537184, + "grad_norm": 0.08203125, + "learning_rate": 0.001998849762585682, + "loss": 0.1953, + "step": 2530 + }, + { + "epoch": 0.02197029539674135, + "grad_norm": 0.08984375, + "learning_rate": 0.0019988482578530693, + "loss": 0.2236, + "step": 2531 + }, + { + "epoch": 0.021978975876945514, + "grad_norm": 0.09228515625, + "learning_rate": 0.0019988467521374893, + "loss": 0.2285, + "step": 2532 + }, + { + "epoch": 0.02198765635714968, + "grad_norm": 0.0791015625, + "learning_rate": 0.001998845245438944, + "loss": 0.1895, + "step": 2533 + }, + { + "epoch": 0.02199633683735384, + "grad_norm": 0.087890625, + "learning_rate": 0.001998843737757435, + "loss": 0.2246, + "step": 2534 + }, + { + "epoch": 0.022005017317558006, + "grad_norm": 0.07421875, + "learning_rate": 0.001998842229092964, + "loss": 0.2158, + "step": 2535 + }, + { + "epoch": 0.02201369779776217, + "grad_norm": 0.061767578125, + "learning_rate": 0.0019988407194455327, + "loss": 0.2119, + "step": 2536 + }, + { + "epoch": 0.022022378277966336, + "grad_norm": 0.06884765625, + "learning_rate": 0.0019988392088151428, + "loss": 0.2051, + "step": 2537 + }, + { + "epoch": 0.0220310587581705, + "grad_norm": 0.06689453125, + "learning_rate": 0.0019988376972017957, + "loss": 0.1777, + "step": 2538 + }, + { + "epoch": 0.022039739238374666, + "grad_norm": 0.0986328125, + "learning_rate": 0.001998836184605493, + "loss": 0.248, + "step": 2539 + }, + { + "epoch": 0.02204841971857883, + "grad_norm": 0.10302734375, + "learning_rate": 0.001998834671026237, + "loss": 0.2188, + "step": 2540 + }, + { + "epoch": 0.022057100198782997, + "grad_norm": 0.06494140625, + "learning_rate": 0.001998833156464029, + "loss": 0.1963, + "step": 2541 + }, + { + "epoch": 0.022065780678987162, + "grad_norm": 0.078125, + "learning_rate": 0.00199883164091887, + "loss": 0.2041, + "step": 2542 + }, + { + "epoch": 0.022074461159191327, + "grad_norm": 0.08984375, + "learning_rate": 
0.0019988301243907625, + "loss": 0.2246, + "step": 2543 + }, + { + "epoch": 0.022083141639395492, + "grad_norm": 0.08447265625, + "learning_rate": 0.001998828606879708, + "loss": 0.2109, + "step": 2544 + }, + { + "epoch": 0.022091822119599657, + "grad_norm": 0.57421875, + "learning_rate": 0.001998827088385708, + "loss": 0.2988, + "step": 2545 + }, + { + "epoch": 0.022100502599803822, + "grad_norm": 0.30078125, + "learning_rate": 0.001998825568908764, + "loss": 0.2324, + "step": 2546 + }, + { + "epoch": 0.022109183080007987, + "grad_norm": 0.09130859375, + "learning_rate": 0.001998824048448878, + "loss": 0.1836, + "step": 2547 + }, + { + "epoch": 0.022117863560212152, + "grad_norm": 0.08935546875, + "learning_rate": 0.0019988225270060516, + "loss": 0.2402, + "step": 2548 + }, + { + "epoch": 0.022126544040416314, + "grad_norm": 0.06787109375, + "learning_rate": 0.0019988210045802863, + "loss": 0.1641, + "step": 2549 + }, + { + "epoch": 0.02213522452062048, + "grad_norm": 0.07861328125, + "learning_rate": 0.0019988194811715837, + "loss": 0.2129, + "step": 2550 + }, + { + "epoch": 0.022143905000824644, + "grad_norm": 0.0751953125, + "learning_rate": 0.0019988179567799456, + "loss": 0.2041, + "step": 2551 + }, + { + "epoch": 0.02215258548102881, + "grad_norm": 0.0888671875, + "learning_rate": 0.0019988164314053737, + "loss": 0.2637, + "step": 2552 + }, + { + "epoch": 0.022161265961232975, + "grad_norm": 0.1015625, + "learning_rate": 0.00199881490504787, + "loss": 0.2656, + "step": 2553 + }, + { + "epoch": 0.02216994644143714, + "grad_norm": 0.09423828125, + "learning_rate": 0.0019988133777074355, + "loss": 0.2402, + "step": 2554 + }, + { + "epoch": 0.022178626921641305, + "grad_norm": 0.07275390625, + "learning_rate": 0.0019988118493840727, + "loss": 0.2227, + "step": 2555 + }, + { + "epoch": 0.02218730740184547, + "grad_norm": 0.087890625, + "learning_rate": 0.001998810320077782, + "loss": 0.2148, + "step": 2556 + }, + { + "epoch": 0.022195987882049635, + "grad_norm": 0.06982421875, + "learning_rate": 0.0019988087897885665, + "loss": 0.1885, + "step": 2557 + }, + { + "epoch": 0.0222046683622538, + "grad_norm": 0.11083984375, + "learning_rate": 0.001998807258516427, + "loss": 0.2285, + "step": 2558 + }, + { + "epoch": 0.022213348842457965, + "grad_norm": 0.078125, + "learning_rate": 0.001998805726261365, + "loss": 0.1758, + "step": 2559 + }, + { + "epoch": 0.02222202932266213, + "grad_norm": 0.42578125, + "learning_rate": 0.001998804193023383, + "loss": 0.2637, + "step": 2560 + }, + { + "epoch": 0.022230709802866296, + "grad_norm": 0.1083984375, + "learning_rate": 0.001998802658802482, + "loss": 0.2324, + "step": 2561 + }, + { + "epoch": 0.02223939028307046, + "grad_norm": 0.068359375, + "learning_rate": 0.001998801123598664, + "loss": 0.2256, + "step": 2562 + }, + { + "epoch": 0.022248070763274626, + "grad_norm": 0.1318359375, + "learning_rate": 0.001998799587411931, + "loss": 0.2344, + "step": 2563 + }, + { + "epoch": 0.02225675124347879, + "grad_norm": 0.087890625, + "learning_rate": 0.001998798050242284, + "loss": 0.2246, + "step": 2564 + }, + { + "epoch": 0.022265431723682953, + "grad_norm": 0.07666015625, + "learning_rate": 0.0019987965120897245, + "loss": 0.209, + "step": 2565 + }, + { + "epoch": 0.022274112203887118, + "grad_norm": 0.0625, + "learning_rate": 0.001998794972954255, + "loss": 0.1699, + "step": 2566 + }, + { + "epoch": 0.022282792684091283, + "grad_norm": 0.0615234375, + "learning_rate": 0.001998793432835877, + "loss": 0.1943, + "step": 2567 + }, + { + "epoch": 
0.022291473164295448, + "grad_norm": 0.259765625, + "learning_rate": 0.0019987918917345917, + "loss": 0.2676, + "step": 2568 + }, + { + "epoch": 0.022300153644499613, + "grad_norm": 0.08447265625, + "learning_rate": 0.0019987903496504014, + "loss": 0.2266, + "step": 2569 + }, + { + "epoch": 0.022308834124703778, + "grad_norm": 0.0625, + "learning_rate": 0.001998788806583307, + "loss": 0.1748, + "step": 2570 + }, + { + "epoch": 0.022317514604907943, + "grad_norm": 0.0986328125, + "learning_rate": 0.001998787262533311, + "loss": 0.2275, + "step": 2571 + }, + { + "epoch": 0.02232619508511211, + "grad_norm": 0.08984375, + "learning_rate": 0.001998785717500415, + "loss": 0.2656, + "step": 2572 + }, + { + "epoch": 0.022334875565316274, + "grad_norm": 0.06640625, + "learning_rate": 0.00199878417148462, + "loss": 0.2295, + "step": 2573 + }, + { + "epoch": 0.02234355604552044, + "grad_norm": 0.08447265625, + "learning_rate": 0.001998782624485928, + "loss": 0.2412, + "step": 2574 + }, + { + "epoch": 0.022352236525724604, + "grad_norm": 0.0673828125, + "learning_rate": 0.0019987810765043413, + "loss": 0.2061, + "step": 2575 + }, + { + "epoch": 0.02236091700592877, + "grad_norm": 0.06982421875, + "learning_rate": 0.001998779527539861, + "loss": 0.2061, + "step": 2576 + }, + { + "epoch": 0.022369597486132934, + "grad_norm": 0.1005859375, + "learning_rate": 0.001998777977592489, + "loss": 0.2188, + "step": 2577 + }, + { + "epoch": 0.0223782779663371, + "grad_norm": 0.0703125, + "learning_rate": 0.0019987764266622267, + "loss": 0.2217, + "step": 2578 + }, + { + "epoch": 0.022386958446541264, + "grad_norm": 0.0947265625, + "learning_rate": 0.0019987748747490757, + "loss": 0.2256, + "step": 2579 + }, + { + "epoch": 0.022395638926745426, + "grad_norm": 0.1171875, + "learning_rate": 0.0019987733218530387, + "loss": 0.2891, + "step": 2580 + }, + { + "epoch": 0.02240431940694959, + "grad_norm": 0.083984375, + "learning_rate": 0.001998771767974116, + "loss": 0.2246, + "step": 2581 + }, + { + "epoch": 0.022412999887153756, + "grad_norm": 0.1162109375, + "learning_rate": 0.0019987702131123108, + "loss": 0.3809, + "step": 2582 + }, + { + "epoch": 0.02242168036735792, + "grad_norm": 0.10693359375, + "learning_rate": 0.0019987686572676237, + "loss": 0.1729, + "step": 2583 + }, + { + "epoch": 0.022430360847562086, + "grad_norm": 0.0927734375, + "learning_rate": 0.0019987671004400563, + "loss": 0.2168, + "step": 2584 + }, + { + "epoch": 0.02243904132776625, + "grad_norm": 0.10107421875, + "learning_rate": 0.001998765542629611, + "loss": 0.2207, + "step": 2585 + }, + { + "epoch": 0.022447721807970417, + "grad_norm": 0.0869140625, + "learning_rate": 0.001998763983836289, + "loss": 0.2314, + "step": 2586 + }, + { + "epoch": 0.02245640228817458, + "grad_norm": 0.087890625, + "learning_rate": 0.0019987624240600924, + "loss": 0.2715, + "step": 2587 + }, + { + "epoch": 0.022465082768378747, + "grad_norm": 0.10400390625, + "learning_rate": 0.001998760863301023, + "loss": 0.2617, + "step": 2588 + }, + { + "epoch": 0.022473763248582912, + "grad_norm": 0.0810546875, + "learning_rate": 0.0019987593015590817, + "loss": 0.1777, + "step": 2589 + }, + { + "epoch": 0.022482443728787077, + "grad_norm": 0.072265625, + "learning_rate": 0.001998757738834271, + "loss": 0.1631, + "step": 2590 + }, + { + "epoch": 0.022491124208991242, + "grad_norm": 0.08349609375, + "learning_rate": 0.0019987561751265925, + "loss": 0.1875, + "step": 2591 + }, + { + "epoch": 0.022499804689195407, + "grad_norm": 0.0751953125, + "learning_rate": 
0.001998754610436048, + "loss": 0.2021, + "step": 2592 + }, + { + "epoch": 0.022508485169399572, + "grad_norm": 0.05712890625, + "learning_rate": 0.001998753044762638, + "loss": 0.1973, + "step": 2593 + }, + { + "epoch": 0.022517165649603738, + "grad_norm": 0.10107421875, + "learning_rate": 0.0019987514781063657, + "loss": 0.2188, + "step": 2594 + }, + { + "epoch": 0.022525846129807903, + "grad_norm": 0.11767578125, + "learning_rate": 0.001998749910467232, + "loss": 0.2969, + "step": 2595 + }, + { + "epoch": 0.022534526610012064, + "grad_norm": 0.0771484375, + "learning_rate": 0.001998748341845239, + "loss": 0.209, + "step": 2596 + }, + { + "epoch": 0.02254320709021623, + "grad_norm": 0.0859375, + "learning_rate": 0.001998746772240389, + "loss": 0.1836, + "step": 2597 + }, + { + "epoch": 0.022551887570420395, + "grad_norm": 0.07861328125, + "learning_rate": 0.0019987452016526825, + "loss": 0.2061, + "step": 2598 + }, + { + "epoch": 0.02256056805062456, + "grad_norm": 0.06396484375, + "learning_rate": 0.0019987436300821218, + "loss": 0.1992, + "step": 2599 + }, + { + "epoch": 0.022569248530828725, + "grad_norm": 0.0849609375, + "learning_rate": 0.0019987420575287083, + "loss": 0.2207, + "step": 2600 + }, + { + "epoch": 0.02257792901103289, + "grad_norm": 0.0986328125, + "learning_rate": 0.0019987404839924444, + "loss": 0.1787, + "step": 2601 + }, + { + "epoch": 0.022586609491237055, + "grad_norm": 0.1396484375, + "learning_rate": 0.001998738909473331, + "loss": 0.2402, + "step": 2602 + }, + { + "epoch": 0.02259528997144122, + "grad_norm": 0.0859375, + "learning_rate": 0.001998737333971371, + "loss": 0.1992, + "step": 2603 + }, + { + "epoch": 0.022603970451645385, + "grad_norm": 0.10888671875, + "learning_rate": 0.0019987357574865645, + "loss": 0.1963, + "step": 2604 + }, + { + "epoch": 0.02261265093184955, + "grad_norm": 0.0986328125, + "learning_rate": 0.001998734180018914, + "loss": 0.2012, + "step": 2605 + }, + { + "epoch": 0.022621331412053716, + "grad_norm": 0.083984375, + "learning_rate": 0.001998732601568422, + "loss": 0.1973, + "step": 2606 + }, + { + "epoch": 0.02263001189225788, + "grad_norm": 0.08935546875, + "learning_rate": 0.0019987310221350895, + "loss": 0.2178, + "step": 2607 + }, + { + "epoch": 0.022638692372462046, + "grad_norm": 0.0888671875, + "learning_rate": 0.001998729441718918, + "loss": 0.1875, + "step": 2608 + }, + { + "epoch": 0.02264737285266621, + "grad_norm": 0.11669921875, + "learning_rate": 0.0019987278603199096, + "loss": 0.3105, + "step": 2609 + }, + { + "epoch": 0.022656053332870376, + "grad_norm": 0.275390625, + "learning_rate": 0.0019987262779380655, + "loss": 0.3203, + "step": 2610 + }, + { + "epoch": 0.022664733813074538, + "grad_norm": 0.0908203125, + "learning_rate": 0.0019987246945733883, + "loss": 0.2656, + "step": 2611 + }, + { + "epoch": 0.022673414293278703, + "grad_norm": 0.09521484375, + "learning_rate": 0.001998723110225879, + "loss": 0.2188, + "step": 2612 + }, + { + "epoch": 0.022682094773482868, + "grad_norm": 0.08740234375, + "learning_rate": 0.00199872152489554, + "loss": 0.207, + "step": 2613 + }, + { + "epoch": 0.022690775253687033, + "grad_norm": 0.095703125, + "learning_rate": 0.0019987199385823723, + "loss": 0.2266, + "step": 2614 + }, + { + "epoch": 0.022699455733891198, + "grad_norm": 0.0859375, + "learning_rate": 0.001998718351286378, + "loss": 0.2168, + "step": 2615 + }, + { + "epoch": 0.022708136214095363, + "grad_norm": 0.103515625, + "learning_rate": 0.001998716763007559, + "loss": 0.2891, + "step": 2616 + }, + { + "epoch": 
0.02271681669429953, + "grad_norm": 0.06396484375, + "learning_rate": 0.0019987151737459164, + "loss": 0.1689, + "step": 2617 + }, + { + "epoch": 0.022725497174503693, + "grad_norm": 0.1875, + "learning_rate": 0.0019987135835014525, + "loss": 0.2578, + "step": 2618 + }, + { + "epoch": 0.02273417765470786, + "grad_norm": 0.10791015625, + "learning_rate": 0.0019987119922741698, + "loss": 0.2334, + "step": 2619 + }, + { + "epoch": 0.022742858134912024, + "grad_norm": 0.1015625, + "learning_rate": 0.001998710400064068, + "loss": 0.2734, + "step": 2620 + }, + { + "epoch": 0.02275153861511619, + "grad_norm": 0.09375, + "learning_rate": 0.0019987088068711507, + "loss": 0.2949, + "step": 2621 + }, + { + "epoch": 0.022760219095320354, + "grad_norm": 0.08544921875, + "learning_rate": 0.0019987072126954187, + "loss": 0.1973, + "step": 2622 + }, + { + "epoch": 0.02276889957552452, + "grad_norm": 0.0791015625, + "learning_rate": 0.001998705617536874, + "loss": 0.1865, + "step": 2623 + }, + { + "epoch": 0.022777580055728684, + "grad_norm": 0.1015625, + "learning_rate": 0.0019987040213955186, + "loss": 0.2891, + "step": 2624 + }, + { + "epoch": 0.02278626053593285, + "grad_norm": 0.07763671875, + "learning_rate": 0.001998702424271354, + "loss": 0.2266, + "step": 2625 + }, + { + "epoch": 0.02279494101613701, + "grad_norm": 0.0791015625, + "learning_rate": 0.0019987008261643817, + "loss": 0.2227, + "step": 2626 + }, + { + "epoch": 0.022803621496341176, + "grad_norm": 0.08984375, + "learning_rate": 0.0019986992270746035, + "loss": 0.25, + "step": 2627 + }, + { + "epoch": 0.02281230197654534, + "grad_norm": 0.13671875, + "learning_rate": 0.0019986976270020213, + "loss": 0.25, + "step": 2628 + }, + { + "epoch": 0.022820982456749506, + "grad_norm": 0.07666015625, + "learning_rate": 0.0019986960259466375, + "loss": 0.1924, + "step": 2629 + }, + { + "epoch": 0.02282966293695367, + "grad_norm": 0.060546875, + "learning_rate": 0.0019986944239084527, + "loss": 0.1797, + "step": 2630 + }, + { + "epoch": 0.022838343417157837, + "grad_norm": 0.10009765625, + "learning_rate": 0.0019986928208874694, + "loss": 0.2559, + "step": 2631 + }, + { + "epoch": 0.022847023897362, + "grad_norm": 0.06591796875, + "learning_rate": 0.001998691216883689, + "loss": 0.1719, + "step": 2632 + }, + { + "epoch": 0.022855704377566167, + "grad_norm": 0.083984375, + "learning_rate": 0.0019986896118971134, + "loss": 0.209, + "step": 2633 + }, + { + "epoch": 0.022864384857770332, + "grad_norm": 0.5546875, + "learning_rate": 0.001998688005927744, + "loss": 0.5664, + "step": 2634 + }, + { + "epoch": 0.022873065337974497, + "grad_norm": 0.06787109375, + "learning_rate": 0.001998686398975584, + "loss": 0.2236, + "step": 2635 + }, + { + "epoch": 0.022881745818178662, + "grad_norm": 0.0908203125, + "learning_rate": 0.001998684791040633, + "loss": 0.2129, + "step": 2636 + }, + { + "epoch": 0.022890426298382827, + "grad_norm": 0.087890625, + "learning_rate": 0.0019986831821228943, + "loss": 0.2383, + "step": 2637 + }, + { + "epoch": 0.022899106778586992, + "grad_norm": 0.1748046875, + "learning_rate": 0.001998681572222369, + "loss": 0.2266, + "step": 2638 + }, + { + "epoch": 0.022907787258791158, + "grad_norm": 0.078125, + "learning_rate": 0.001998679961339059, + "loss": 0.2344, + "step": 2639 + }, + { + "epoch": 0.022916467738995323, + "grad_norm": 0.11669921875, + "learning_rate": 0.001998678349472966, + "loss": 0.252, + "step": 2640 + }, + { + "epoch": 0.022925148219199488, + "grad_norm": 0.0703125, + "learning_rate": 0.001998676736624092, + "loss": 
0.1807, + "step": 2641 + }, + { + "epoch": 0.02293382869940365, + "grad_norm": 0.09716796875, + "learning_rate": 0.0019986751227924386, + "loss": 0.2344, + "step": 2642 + }, + { + "epoch": 0.022942509179607815, + "grad_norm": 0.062255859375, + "learning_rate": 0.001998673507978008, + "loss": 0.1982, + "step": 2643 + }, + { + "epoch": 0.02295118965981198, + "grad_norm": 0.1005859375, + "learning_rate": 0.001998671892180801, + "loss": 0.2734, + "step": 2644 + }, + { + "epoch": 0.022959870140016145, + "grad_norm": 0.06494140625, + "learning_rate": 0.0019986702754008203, + "loss": 0.2051, + "step": 2645 + }, + { + "epoch": 0.02296855062022031, + "grad_norm": 0.07470703125, + "learning_rate": 0.0019986686576380667, + "loss": 0.2305, + "step": 2646 + }, + { + "epoch": 0.022977231100424475, + "grad_norm": 0.1357421875, + "learning_rate": 0.001998667038892543, + "loss": 0.2363, + "step": 2647 + }, + { + "epoch": 0.02298591158062864, + "grad_norm": 0.06982421875, + "learning_rate": 0.0019986654191642508, + "loss": 0.1924, + "step": 2648 + }, + { + "epoch": 0.022994592060832805, + "grad_norm": 0.0859375, + "learning_rate": 0.001998663798453191, + "loss": 0.1934, + "step": 2649 + }, + { + "epoch": 0.02300327254103697, + "grad_norm": 0.07177734375, + "learning_rate": 0.0019986621767593663, + "loss": 0.2188, + "step": 2650 + }, + { + "epoch": 0.023011953021241136, + "grad_norm": 0.11572265625, + "learning_rate": 0.001998660554082778, + "loss": 0.2539, + "step": 2651 + }, + { + "epoch": 0.0230206335014453, + "grad_norm": 0.158203125, + "learning_rate": 0.0019986589304234284, + "loss": 0.2852, + "step": 2652 + }, + { + "epoch": 0.023029313981649466, + "grad_norm": 0.95703125, + "learning_rate": 0.0019986573057813183, + "loss": 0.4863, + "step": 2653 + }, + { + "epoch": 0.02303799446185363, + "grad_norm": 0.15625, + "learning_rate": 0.0019986556801564505, + "loss": 0.2031, + "step": 2654 + }, + { + "epoch": 0.023046674942057796, + "grad_norm": 0.1259765625, + "learning_rate": 0.0019986540535488263, + "loss": 0.1895, + "step": 2655 + }, + { + "epoch": 0.02305535542226196, + "grad_norm": 0.0927734375, + "learning_rate": 0.0019986524259584474, + "loss": 0.2363, + "step": 2656 + }, + { + "epoch": 0.023064035902466123, + "grad_norm": 0.6484375, + "learning_rate": 0.001998650797385316, + "loss": 0.5977, + "step": 2657 + }, + { + "epoch": 0.023072716382670288, + "grad_norm": 0.11669921875, + "learning_rate": 0.001998649167829433, + "loss": 0.293, + "step": 2658 + }, + { + "epoch": 0.023081396862874453, + "grad_norm": 0.072265625, + "learning_rate": 0.0019986475372908014, + "loss": 0.2168, + "step": 2659 + }, + { + "epoch": 0.023090077343078618, + "grad_norm": 0.09765625, + "learning_rate": 0.001998645905769422, + "loss": 0.2051, + "step": 2660 + }, + { + "epoch": 0.023098757823282783, + "grad_norm": 0.062255859375, + "learning_rate": 0.001998644273265297, + "loss": 0.1738, + "step": 2661 + }, + { + "epoch": 0.02310743830348695, + "grad_norm": 0.11767578125, + "learning_rate": 0.0019986426397784283, + "loss": 0.209, + "step": 2662 + }, + { + "epoch": 0.023116118783691113, + "grad_norm": 0.060546875, + "learning_rate": 0.0019986410053088174, + "loss": 0.2031, + "step": 2663 + }, + { + "epoch": 0.02312479926389528, + "grad_norm": 0.953125, + "learning_rate": 0.001998639369856466, + "loss": 0.7188, + "step": 2664 + }, + { + "epoch": 0.023133479744099444, + "grad_norm": 0.09375, + "learning_rate": 0.0019986377334213763, + "loss": 0.1875, + "step": 2665 + }, + { + "epoch": 0.02314216022430361, + "grad_norm": 
0.1455078125, + "learning_rate": 0.00199863609600355, + "loss": 0.3125, + "step": 2666 + }, + { + "epoch": 0.023150840704507774, + "grad_norm": 0.08642578125, + "learning_rate": 0.0019986344576029885, + "loss": 0.2158, + "step": 2667 + }, + { + "epoch": 0.02315952118471194, + "grad_norm": 0.06640625, + "learning_rate": 0.0019986328182196936, + "loss": 0.168, + "step": 2668 + }, + { + "epoch": 0.023168201664916104, + "grad_norm": 0.06494140625, + "learning_rate": 0.0019986311778536683, + "loss": 0.2891, + "step": 2669 + }, + { + "epoch": 0.02317688214512027, + "grad_norm": 0.0625, + "learning_rate": 0.0019986295365049126, + "loss": 0.1934, + "step": 2670 + }, + { + "epoch": 0.023185562625324434, + "grad_norm": 0.08984375, + "learning_rate": 0.0019986278941734295, + "loss": 0.2559, + "step": 2671 + }, + { + "epoch": 0.0231942431055286, + "grad_norm": 0.06640625, + "learning_rate": 0.0019986262508592204, + "loss": 0.2383, + "step": 2672 + }, + { + "epoch": 0.02320292358573276, + "grad_norm": 0.083984375, + "learning_rate": 0.001998624606562287, + "loss": 0.2402, + "step": 2673 + }, + { + "epoch": 0.023211604065936926, + "grad_norm": 0.0947265625, + "learning_rate": 0.001998622961282631, + "loss": 0.2178, + "step": 2674 + }, + { + "epoch": 0.02322028454614109, + "grad_norm": 0.08447265625, + "learning_rate": 0.0019986213150202546, + "loss": 0.2578, + "step": 2675 + }, + { + "epoch": 0.023228965026345257, + "grad_norm": 0.0751953125, + "learning_rate": 0.00199861966777516, + "loss": 0.2363, + "step": 2676 + }, + { + "epoch": 0.02323764550654942, + "grad_norm": 0.0810546875, + "learning_rate": 0.001998618019547348, + "loss": 0.2617, + "step": 2677 + }, + { + "epoch": 0.023246325986753587, + "grad_norm": 0.0673828125, + "learning_rate": 0.0019986163703368206, + "loss": 0.2246, + "step": 2678 + }, + { + "epoch": 0.023255006466957752, + "grad_norm": 0.236328125, + "learning_rate": 0.0019986147201435803, + "loss": 0.2207, + "step": 2679 + }, + { + "epoch": 0.023263686947161917, + "grad_norm": 0.1181640625, + "learning_rate": 0.0019986130689676283, + "loss": 0.2227, + "step": 2680 + }, + { + "epoch": 0.023272367427366082, + "grad_norm": 0.0888671875, + "learning_rate": 0.001998611416808966, + "loss": 0.248, + "step": 2681 + }, + { + "epoch": 0.023281047907570247, + "grad_norm": 0.10791015625, + "learning_rate": 0.0019986097636675963, + "loss": 0.2002, + "step": 2682 + }, + { + "epoch": 0.023289728387774412, + "grad_norm": 0.080078125, + "learning_rate": 0.0019986081095435208, + "loss": 0.2402, + "step": 2683 + }, + { + "epoch": 0.023298408867978578, + "grad_norm": 0.1396484375, + "learning_rate": 0.0019986064544367404, + "loss": 0.2168, + "step": 2684 + }, + { + "epoch": 0.023307089348182743, + "grad_norm": 0.07763671875, + "learning_rate": 0.0019986047983472574, + "loss": 0.1953, + "step": 2685 + }, + { + "epoch": 0.023315769828386908, + "grad_norm": 0.095703125, + "learning_rate": 0.001998603141275074, + "loss": 0.2168, + "step": 2686 + }, + { + "epoch": 0.023324450308591073, + "grad_norm": 0.1298828125, + "learning_rate": 0.001998601483220192, + "loss": 0.2236, + "step": 2687 + }, + { + "epoch": 0.023333130788795235, + "grad_norm": 0.0810546875, + "learning_rate": 0.0019985998241826126, + "loss": 0.2109, + "step": 2688 + }, + { + "epoch": 0.0233418112689994, + "grad_norm": 0.107421875, + "learning_rate": 0.0019985981641623377, + "loss": 0.248, + "step": 2689 + }, + { + "epoch": 0.023350491749203565, + "grad_norm": 0.08154296875, + "learning_rate": 0.0019985965031593697, + "loss": 0.1982, + "step": 
2690 + }, + { + "epoch": 0.02335917222940773, + "grad_norm": 0.08740234375, + "learning_rate": 0.00199859484117371, + "loss": 0.2285, + "step": 2691 + }, + { + "epoch": 0.023367852709611895, + "grad_norm": 0.0966796875, + "learning_rate": 0.00199859317820536, + "loss": 0.2021, + "step": 2692 + }, + { + "epoch": 0.02337653318981606, + "grad_norm": 0.19140625, + "learning_rate": 0.0019985915142543224, + "loss": 0.3047, + "step": 2693 + }, + { + "epoch": 0.023385213670020225, + "grad_norm": 0.08154296875, + "learning_rate": 0.001998589849320599, + "loss": 0.1816, + "step": 2694 + }, + { + "epoch": 0.02339389415022439, + "grad_norm": 0.0859375, + "learning_rate": 0.0019985881834041906, + "loss": 0.2598, + "step": 2695 + }, + { + "epoch": 0.023402574630428555, + "grad_norm": 0.0654296875, + "learning_rate": 0.0019985865165050997, + "loss": 0.2002, + "step": 2696 + }, + { + "epoch": 0.02341125511063272, + "grad_norm": 0.07763671875, + "learning_rate": 0.0019985848486233286, + "loss": 0.2188, + "step": 2697 + }, + { + "epoch": 0.023419935590836886, + "grad_norm": 0.09765625, + "learning_rate": 0.0019985831797588783, + "loss": 0.2617, + "step": 2698 + }, + { + "epoch": 0.02342861607104105, + "grad_norm": 0.1142578125, + "learning_rate": 0.001998581509911751, + "loss": 0.2314, + "step": 2699 + }, + { + "epoch": 0.023437296551245216, + "grad_norm": 0.0751953125, + "learning_rate": 0.0019985798390819483, + "loss": 0.252, + "step": 2700 + }, + { + "epoch": 0.02344597703144938, + "grad_norm": 0.0751953125, + "learning_rate": 0.001998578167269472, + "loss": 0.2246, + "step": 2701 + }, + { + "epoch": 0.023454657511653546, + "grad_norm": 0.07568359375, + "learning_rate": 0.001998576494474325, + "loss": 0.248, + "step": 2702 + }, + { + "epoch": 0.023463337991857708, + "grad_norm": 0.0869140625, + "learning_rate": 0.0019985748206965076, + "loss": 0.2754, + "step": 2703 + }, + { + "epoch": 0.023472018472061873, + "grad_norm": 0.056640625, + "learning_rate": 0.0019985731459360224, + "loss": 0.2021, + "step": 2704 + }, + { + "epoch": 0.023480698952266038, + "grad_norm": 0.1083984375, + "learning_rate": 0.001998571470192871, + "loss": 0.332, + "step": 2705 + }, + { + "epoch": 0.023489379432470203, + "grad_norm": 0.09130859375, + "learning_rate": 0.001998569793467055, + "loss": 0.2695, + "step": 2706 + }, + { + "epoch": 0.02349805991267437, + "grad_norm": 0.09765625, + "learning_rate": 0.0019985681157585772, + "loss": 0.248, + "step": 2707 + }, + { + "epoch": 0.023506740392878533, + "grad_norm": 0.10986328125, + "learning_rate": 0.0019985664370674385, + "loss": 0.1719, + "step": 2708 + }, + { + "epoch": 0.0235154208730827, + "grad_norm": 0.1962890625, + "learning_rate": 0.0019985647573936413, + "loss": 0.2168, + "step": 2709 + }, + { + "epoch": 0.023524101353286864, + "grad_norm": 0.73828125, + "learning_rate": 0.0019985630767371866, + "loss": 0.498, + "step": 2710 + }, + { + "epoch": 0.02353278183349103, + "grad_norm": 0.111328125, + "learning_rate": 0.0019985613950980778, + "loss": 0.165, + "step": 2711 + }, + { + "epoch": 0.023541462313695194, + "grad_norm": 0.07958984375, + "learning_rate": 0.001998559712476315, + "loss": 0.208, + "step": 2712 + }, + { + "epoch": 0.02355014279389936, + "grad_norm": 0.0791015625, + "learning_rate": 0.001998558028871901, + "loss": 0.2246, + "step": 2713 + }, + { + "epoch": 0.023558823274103524, + "grad_norm": 0.07568359375, + "learning_rate": 0.0019985563442848375, + "loss": 0.252, + "step": 2714 + }, + { + "epoch": 0.02356750375430769, + "grad_norm": 0.1328125, + 
"learning_rate": 0.0019985546587151263, + "loss": 0.1953, + "step": 2715 + }, + { + "epoch": 0.023576184234511854, + "grad_norm": 0.16796875, + "learning_rate": 0.001998552972162769, + "loss": 0.2656, + "step": 2716 + }, + { + "epoch": 0.02358486471471602, + "grad_norm": 0.1376953125, + "learning_rate": 0.0019985512846277674, + "loss": 0.293, + "step": 2717 + }, + { + "epoch": 0.023593545194920185, + "grad_norm": 0.076171875, + "learning_rate": 0.0019985495961101244, + "loss": 0.2227, + "step": 2718 + }, + { + "epoch": 0.023602225675124346, + "grad_norm": 0.10302734375, + "learning_rate": 0.0019985479066098404, + "loss": 0.25, + "step": 2719 + }, + { + "epoch": 0.02361090615532851, + "grad_norm": 0.0888671875, + "learning_rate": 0.0019985462161269184, + "loss": 0.2168, + "step": 2720 + }, + { + "epoch": 0.023619586635532677, + "grad_norm": 0.1083984375, + "learning_rate": 0.0019985445246613596, + "loss": 0.209, + "step": 2721 + }, + { + "epoch": 0.02362826711573684, + "grad_norm": 0.09375, + "learning_rate": 0.001998542832213166, + "loss": 0.2246, + "step": 2722 + }, + { + "epoch": 0.023636947595941007, + "grad_norm": 0.1142578125, + "learning_rate": 0.0019985411387823393, + "loss": 0.2246, + "step": 2723 + }, + { + "epoch": 0.023645628076145172, + "grad_norm": 0.08837890625, + "learning_rate": 0.0019985394443688817, + "loss": 0.2393, + "step": 2724 + }, + { + "epoch": 0.023654308556349337, + "grad_norm": 0.158203125, + "learning_rate": 0.0019985377489727947, + "loss": 0.25, + "step": 2725 + }, + { + "epoch": 0.023662989036553502, + "grad_norm": 0.09033203125, + "learning_rate": 0.0019985360525940805, + "loss": 0.2246, + "step": 2726 + }, + { + "epoch": 0.023671669516757667, + "grad_norm": 0.1025390625, + "learning_rate": 0.0019985343552327405, + "loss": 0.2402, + "step": 2727 + }, + { + "epoch": 0.023680349996961832, + "grad_norm": 0.07470703125, + "learning_rate": 0.0019985326568887772, + "loss": 0.1738, + "step": 2728 + }, + { + "epoch": 0.023689030477165997, + "grad_norm": 0.1044921875, + "learning_rate": 0.001998530957562192, + "loss": 0.2344, + "step": 2729 + }, + { + "epoch": 0.023697710957370163, + "grad_norm": 0.08837890625, + "learning_rate": 0.001998529257252987, + "loss": 0.2461, + "step": 2730 + }, + { + "epoch": 0.023706391437574328, + "grad_norm": 0.14453125, + "learning_rate": 0.0019985275559611633, + "loss": 0.2168, + "step": 2731 + }, + { + "epoch": 0.023715071917778493, + "grad_norm": 0.0849609375, + "learning_rate": 0.001998525853686724, + "loss": 0.2656, + "step": 2732 + }, + { + "epoch": 0.023723752397982658, + "grad_norm": 0.0859375, + "learning_rate": 0.00199852415042967, + "loss": 0.2109, + "step": 2733 + }, + { + "epoch": 0.02373243287818682, + "grad_norm": 0.158203125, + "learning_rate": 0.0019985224461900033, + "loss": 0.2871, + "step": 2734 + }, + { + "epoch": 0.023741113358390985, + "grad_norm": 0.10498046875, + "learning_rate": 0.0019985207409677266, + "loss": 0.2949, + "step": 2735 + }, + { + "epoch": 0.02374979383859515, + "grad_norm": 0.068359375, + "learning_rate": 0.0019985190347628404, + "loss": 0.1895, + "step": 2736 + }, + { + "epoch": 0.023758474318799315, + "grad_norm": 0.1689453125, + "learning_rate": 0.0019985173275753475, + "loss": 0.2598, + "step": 2737 + }, + { + "epoch": 0.02376715479900348, + "grad_norm": 0.059814453125, + "learning_rate": 0.0019985156194052495, + "loss": 0.2012, + "step": 2738 + }, + { + "epoch": 0.023775835279207645, + "grad_norm": 0.154296875, + "learning_rate": 0.0019985139102525486, + "loss": 0.2246, + "step": 2739 + }, 
+ { + "epoch": 0.02378451575941181, + "grad_norm": 0.0732421875, + "learning_rate": 0.001998512200117246, + "loss": 0.1914, + "step": 2740 + }, + { + "epoch": 0.023793196239615975, + "grad_norm": 0.1025390625, + "learning_rate": 0.0019985104889993443, + "loss": 0.2207, + "step": 2741 + }, + { + "epoch": 0.02380187671982014, + "grad_norm": 0.12890625, + "learning_rate": 0.001998508776898845, + "loss": 0.2969, + "step": 2742 + }, + { + "epoch": 0.023810557200024306, + "grad_norm": 0.130859375, + "learning_rate": 0.00199850706381575, + "loss": 0.2275, + "step": 2743 + }, + { + "epoch": 0.02381923768022847, + "grad_norm": 0.06884765625, + "learning_rate": 0.001998505349750061, + "loss": 0.2275, + "step": 2744 + }, + { + "epoch": 0.023827918160432636, + "grad_norm": 0.12109375, + "learning_rate": 0.0019985036347017803, + "loss": 0.1523, + "step": 2745 + }, + { + "epoch": 0.0238365986406368, + "grad_norm": 0.181640625, + "learning_rate": 0.001998501918670909, + "loss": 0.2197, + "step": 2746 + }, + { + "epoch": 0.023845279120840966, + "grad_norm": 0.0859375, + "learning_rate": 0.00199850020165745, + "loss": 0.209, + "step": 2747 + }, + { + "epoch": 0.02385395960104513, + "grad_norm": 0.05859375, + "learning_rate": 0.001998498483661405, + "loss": 0.1914, + "step": 2748 + }, + { + "epoch": 0.023862640081249296, + "grad_norm": 0.2451171875, + "learning_rate": 0.0019984967646827748, + "loss": 0.2754, + "step": 2749 + }, + { + "epoch": 0.023871320561453458, + "grad_norm": 0.0927734375, + "learning_rate": 0.0019984950447215627, + "loss": 0.1992, + "step": 2750 + }, + { + "epoch": 0.023880001041657623, + "grad_norm": 0.0966796875, + "learning_rate": 0.0019984933237777694, + "loss": 0.2051, + "step": 2751 + }, + { + "epoch": 0.02388868152186179, + "grad_norm": 0.1708984375, + "learning_rate": 0.0019984916018513975, + "loss": 0.2773, + "step": 2752 + }, + { + "epoch": 0.023897362002065953, + "grad_norm": 0.0634765625, + "learning_rate": 0.0019984898789424488, + "loss": 0.1494, + "step": 2753 + }, + { + "epoch": 0.02390604248227012, + "grad_norm": 0.08154296875, + "learning_rate": 0.0019984881550509244, + "loss": 0.1758, + "step": 2754 + }, + { + "epoch": 0.023914722962474284, + "grad_norm": 0.08203125, + "learning_rate": 0.001998486430176828, + "loss": 0.167, + "step": 2755 + }, + { + "epoch": 0.02392340344267845, + "grad_norm": 0.1103515625, + "learning_rate": 0.0019984847043201595, + "loss": 0.2217, + "step": 2756 + }, + { + "epoch": 0.023932083922882614, + "grad_norm": 0.173828125, + "learning_rate": 0.001998482977480922, + "loss": 0.2119, + "step": 2757 + }, + { + "epoch": 0.02394076440308678, + "grad_norm": 0.0751953125, + "learning_rate": 0.0019984812496591166, + "loss": 0.2021, + "step": 2758 + }, + { + "epoch": 0.023949444883290944, + "grad_norm": 0.1171875, + "learning_rate": 0.001998479520854746, + "loss": 0.2363, + "step": 2759 + }, + { + "epoch": 0.02395812536349511, + "grad_norm": 0.068359375, + "learning_rate": 0.0019984777910678118, + "loss": 0.2432, + "step": 2760 + }, + { + "epoch": 0.023966805843699274, + "grad_norm": 0.2451171875, + "learning_rate": 0.0019984760602983153, + "loss": 0.1758, + "step": 2761 + }, + { + "epoch": 0.02397548632390344, + "grad_norm": 0.0751953125, + "learning_rate": 0.001998474328546259, + "loss": 0.21, + "step": 2762 + }, + { + "epoch": 0.023984166804107605, + "grad_norm": 0.0732421875, + "learning_rate": 0.0019984725958116446, + "loss": 0.2148, + "step": 2763 + }, + { + "epoch": 0.02399284728431177, + "grad_norm": 0.06103515625, + "learning_rate": 
0.001998470862094475, + "loss": 0.2246, + "step": 2764 + }, + { + "epoch": 0.02400152776451593, + "grad_norm": 0.1142578125, + "learning_rate": 0.00199846912739475, + "loss": 0.2031, + "step": 2765 + }, + { + "epoch": 0.024010208244720097, + "grad_norm": 0.07470703125, + "learning_rate": 0.001998467391712473, + "loss": 0.1953, + "step": 2766 + }, + { + "epoch": 0.02401888872492426, + "grad_norm": 0.1357421875, + "learning_rate": 0.0019984656550476455, + "loss": 0.2559, + "step": 2767 + }, + { + "epoch": 0.024027569205128427, + "grad_norm": 0.1787109375, + "learning_rate": 0.00199846391740027, + "loss": 0.3066, + "step": 2768 + }, + { + "epoch": 0.024036249685332592, + "grad_norm": 0.083984375, + "learning_rate": 0.0019984621787703474, + "loss": 0.2041, + "step": 2769 + }, + { + "epoch": 0.024044930165536757, + "grad_norm": 0.099609375, + "learning_rate": 0.0019984604391578803, + "loss": 0.2324, + "step": 2770 + }, + { + "epoch": 0.024053610645740922, + "grad_norm": 0.10302734375, + "learning_rate": 0.00199845869856287, + "loss": 0.2559, + "step": 2771 + }, + { + "epoch": 0.024062291125945087, + "grad_norm": 0.1279296875, + "learning_rate": 0.001998456956985319, + "loss": 0.2109, + "step": 2772 + }, + { + "epoch": 0.024070971606149252, + "grad_norm": 0.177734375, + "learning_rate": 0.001998455214425229, + "loss": 0.2656, + "step": 2773 + }, + { + "epoch": 0.024079652086353417, + "grad_norm": 0.173828125, + "learning_rate": 0.001998453470882602, + "loss": 0.3633, + "step": 2774 + }, + { + "epoch": 0.024088332566557583, + "grad_norm": 0.234375, + "learning_rate": 0.0019984517263574395, + "loss": 0.2559, + "step": 2775 + }, + { + "epoch": 0.024097013046761748, + "grad_norm": 0.111328125, + "learning_rate": 0.0019984499808497437, + "loss": 0.2363, + "step": 2776 + }, + { + "epoch": 0.024105693526965913, + "grad_norm": 0.1142578125, + "learning_rate": 0.001998448234359517, + "loss": 0.2871, + "step": 2777 + }, + { + "epoch": 0.024114374007170078, + "grad_norm": 0.146484375, + "learning_rate": 0.00199844648688676, + "loss": 0.2188, + "step": 2778 + }, + { + "epoch": 0.024123054487374243, + "grad_norm": 0.0712890625, + "learning_rate": 0.001998444738431476, + "loss": 0.2158, + "step": 2779 + }, + { + "epoch": 0.024131734967578405, + "grad_norm": 0.10009765625, + "learning_rate": 0.001998442988993666, + "loss": 0.248, + "step": 2780 + }, + { + "epoch": 0.02414041544778257, + "grad_norm": 0.1328125, + "learning_rate": 0.0019984412385733326, + "loss": 0.2324, + "step": 2781 + }, + { + "epoch": 0.024149095927986735, + "grad_norm": 0.08642578125, + "learning_rate": 0.001998439487170477, + "loss": 0.1807, + "step": 2782 + }, + { + "epoch": 0.0241577764081909, + "grad_norm": 0.271484375, + "learning_rate": 0.0019984377347851017, + "loss": 0.2051, + "step": 2783 + }, + { + "epoch": 0.024166456888395065, + "grad_norm": 0.068359375, + "learning_rate": 0.001998435981417208, + "loss": 0.1943, + "step": 2784 + }, + { + "epoch": 0.02417513736859923, + "grad_norm": 0.1044921875, + "learning_rate": 0.001998434227066799, + "loss": 0.3086, + "step": 2785 + }, + { + "epoch": 0.024183817848803395, + "grad_norm": 0.1982421875, + "learning_rate": 0.001998432471733875, + "loss": 0.2559, + "step": 2786 + }, + { + "epoch": 0.02419249832900756, + "grad_norm": 0.0849609375, + "learning_rate": 0.001998430715418439, + "loss": 0.2383, + "step": 2787 + }, + { + "epoch": 0.024201178809211726, + "grad_norm": 0.1083984375, + "learning_rate": 0.001998428958120493, + "loss": 0.2207, + "step": 2788 + }, + { + "epoch": 
0.02420985928941589, + "grad_norm": 0.08349609375, + "learning_rate": 0.0019984271998400387, + "loss": 0.1689, + "step": 2789 + }, + { + "epoch": 0.024218539769620056, + "grad_norm": 0.07861328125, + "learning_rate": 0.0019984254405770773, + "loss": 0.2246, + "step": 2790 + }, + { + "epoch": 0.02422722024982422, + "grad_norm": 0.158203125, + "learning_rate": 0.001998423680331612, + "loss": 0.1982, + "step": 2791 + }, + { + "epoch": 0.024235900730028386, + "grad_norm": 0.10546875, + "learning_rate": 0.0019984219191036432, + "loss": 0.2324, + "step": 2792 + }, + { + "epoch": 0.02424458121023255, + "grad_norm": 0.0693359375, + "learning_rate": 0.0019984201568931746, + "loss": 0.1924, + "step": 2793 + }, + { + "epoch": 0.024253261690436716, + "grad_norm": 0.078125, + "learning_rate": 0.0019984183937002066, + "loss": 0.2246, + "step": 2794 + }, + { + "epoch": 0.02426194217064088, + "grad_norm": 0.08056640625, + "learning_rate": 0.001998416629524742, + "loss": 0.1826, + "step": 2795 + }, + { + "epoch": 0.024270622650845043, + "grad_norm": 0.068359375, + "learning_rate": 0.001998414864366782, + "loss": 0.2188, + "step": 2796 + }, + { + "epoch": 0.02427930313104921, + "grad_norm": 0.07666015625, + "learning_rate": 0.00199841309822633, + "loss": 0.2021, + "step": 2797 + }, + { + "epoch": 0.024287983611253373, + "grad_norm": 0.08447265625, + "learning_rate": 0.001998411331103386, + "loss": 0.2637, + "step": 2798 + }, + { + "epoch": 0.02429666409145754, + "grad_norm": 0.1787109375, + "learning_rate": 0.0019984095629979534, + "loss": 0.1973, + "step": 2799 + }, + { + "epoch": 0.024305344571661704, + "grad_norm": 0.1064453125, + "learning_rate": 0.0019984077939100334, + "loss": 0.1992, + "step": 2800 + }, + { + "epoch": 0.02431402505186587, + "grad_norm": 0.1435546875, + "learning_rate": 0.001998406023839628, + "loss": 0.1934, + "step": 2801 + }, + { + "epoch": 0.024322705532070034, + "grad_norm": 0.0947265625, + "learning_rate": 0.0019984042527867395, + "loss": 0.1943, + "step": 2802 + }, + { + "epoch": 0.0243313860122742, + "grad_norm": 0.09228515625, + "learning_rate": 0.0019984024807513695, + "loss": 0.2363, + "step": 2803 + }, + { + "epoch": 0.024340066492478364, + "grad_norm": 0.10302734375, + "learning_rate": 0.0019984007077335202, + "loss": 0.2383, + "step": 2804 + }, + { + "epoch": 0.02434874697268253, + "grad_norm": 0.07470703125, + "learning_rate": 0.001998398933733193, + "loss": 0.2236, + "step": 2805 + }, + { + "epoch": 0.024357427452886694, + "grad_norm": 0.0634765625, + "learning_rate": 0.0019983971587503907, + "loss": 0.1865, + "step": 2806 + }, + { + "epoch": 0.02436610793309086, + "grad_norm": 0.08154296875, + "learning_rate": 0.0019983953827851144, + "loss": 0.2402, + "step": 2807 + }, + { + "epoch": 0.024374788413295025, + "grad_norm": 0.1083984375, + "learning_rate": 0.0019983936058373666, + "loss": 0.2832, + "step": 2808 + }, + { + "epoch": 0.02438346889349919, + "grad_norm": 0.1357421875, + "learning_rate": 0.001998391827907149, + "loss": 0.2578, + "step": 2809 + }, + { + "epoch": 0.024392149373703355, + "grad_norm": 0.07568359375, + "learning_rate": 0.0019983900489944635, + "loss": 0.2139, + "step": 2810 + }, + { + "epoch": 0.024400829853907516, + "grad_norm": 0.07666015625, + "learning_rate": 0.001998388269099312, + "loss": 0.2383, + "step": 2811 + }, + { + "epoch": 0.02440951033411168, + "grad_norm": 0.146484375, + "learning_rate": 0.0019983864882216974, + "loss": 0.2412, + "step": 2812 + }, + { + "epoch": 0.024418190814315847, + "grad_norm": 0.10205078125, + "learning_rate": 
0.0019983847063616204, + "loss": 0.1748, + "step": 2813 + }, + { + "epoch": 0.024426871294520012, + "grad_norm": 0.10888671875, + "learning_rate": 0.001998382923519083, + "loss": 0.2266, + "step": 2814 + }, + { + "epoch": 0.024435551774724177, + "grad_norm": 0.10107421875, + "learning_rate": 0.0019983811396940875, + "loss": 0.1875, + "step": 2815 + }, + { + "epoch": 0.024444232254928342, + "grad_norm": 0.1669921875, + "learning_rate": 0.0019983793548866364, + "loss": 0.1807, + "step": 2816 + }, + { + "epoch": 0.024452912735132507, + "grad_norm": 0.115234375, + "learning_rate": 0.0019983775690967306, + "loss": 0.2129, + "step": 2817 + }, + { + "epoch": 0.024461593215336672, + "grad_norm": 0.0810546875, + "learning_rate": 0.001998375782324373, + "loss": 0.2031, + "step": 2818 + }, + { + "epoch": 0.024470273695540837, + "grad_norm": 0.0927734375, + "learning_rate": 0.0019983739945695655, + "loss": 0.2148, + "step": 2819 + }, + { + "epoch": 0.024478954175745003, + "grad_norm": 0.08544921875, + "learning_rate": 0.001998372205832309, + "loss": 0.291, + "step": 2820 + }, + { + "epoch": 0.024487634655949168, + "grad_norm": 0.173828125, + "learning_rate": 0.0019983704161126064, + "loss": 0.25, + "step": 2821 + }, + { + "epoch": 0.024496315136153333, + "grad_norm": 0.08447265625, + "learning_rate": 0.0019983686254104595, + "loss": 0.25, + "step": 2822 + }, + { + "epoch": 0.024504995616357498, + "grad_norm": 0.10888671875, + "learning_rate": 0.0019983668337258697, + "loss": 0.2148, + "step": 2823 + }, + { + "epoch": 0.024513676096561663, + "grad_norm": 0.078125, + "learning_rate": 0.00199836504105884, + "loss": 0.2383, + "step": 2824 + }, + { + "epoch": 0.024522356576765828, + "grad_norm": 0.07763671875, + "learning_rate": 0.0019983632474093716, + "loss": 0.1787, + "step": 2825 + }, + { + "epoch": 0.024531037056969993, + "grad_norm": 0.10986328125, + "learning_rate": 0.0019983614527774667, + "loss": 0.1934, + "step": 2826 + }, + { + "epoch": 0.024539717537174155, + "grad_norm": 0.1884765625, + "learning_rate": 0.001998359657163127, + "loss": 0.2148, + "step": 2827 + }, + { + "epoch": 0.02454839801737832, + "grad_norm": 0.123046875, + "learning_rate": 0.001998357860566355, + "loss": 0.2637, + "step": 2828 + }, + { + "epoch": 0.024557078497582485, + "grad_norm": 0.11669921875, + "learning_rate": 0.001998356062987152, + "loss": 0.2793, + "step": 2829 + }, + { + "epoch": 0.02456575897778665, + "grad_norm": 0.072265625, + "learning_rate": 0.0019983542644255205, + "loss": 0.2344, + "step": 2830 + }, + { + "epoch": 0.024574439457990815, + "grad_norm": 0.10107421875, + "learning_rate": 0.001998352464881462, + "loss": 0.2227, + "step": 2831 + }, + { + "epoch": 0.02458311993819498, + "grad_norm": 0.1806640625, + "learning_rate": 0.0019983506643549793, + "loss": 0.2041, + "step": 2832 + }, + { + "epoch": 0.024591800418399146, + "grad_norm": 0.0810546875, + "learning_rate": 0.0019983488628460733, + "loss": 0.208, + "step": 2833 + }, + { + "epoch": 0.02460048089860331, + "grad_norm": 0.1328125, + "learning_rate": 0.001998347060354747, + "loss": 0.249, + "step": 2834 + }, + { + "epoch": 0.024609161378807476, + "grad_norm": 0.0859375, + "learning_rate": 0.001998345256881001, + "loss": 0.1973, + "step": 2835 + }, + { + "epoch": 0.02461784185901164, + "grad_norm": 0.0732421875, + "learning_rate": 0.001998343452424839, + "loss": 0.2285, + "step": 2836 + }, + { + "epoch": 0.024626522339215806, + "grad_norm": 0.10009765625, + "learning_rate": 0.0019983416469862617, + "loss": 0.1758, + "step": 2837 + }, + { + "epoch": 
0.02463520281941997, + "grad_norm": 0.10009765625, + "learning_rate": 0.0019983398405652715, + "loss": 0.2266, + "step": 2838 + }, + { + "epoch": 0.024643883299624136, + "grad_norm": 0.12451171875, + "learning_rate": 0.0019983380331618705, + "loss": 0.252, + "step": 2839 + }, + { + "epoch": 0.0246525637798283, + "grad_norm": 0.07177734375, + "learning_rate": 0.0019983362247760605, + "loss": 0.1973, + "step": 2840 + }, + { + "epoch": 0.024661244260032467, + "grad_norm": 0.083984375, + "learning_rate": 0.001998334415407843, + "loss": 0.1562, + "step": 2841 + }, + { + "epoch": 0.024669924740236628, + "grad_norm": 0.07275390625, + "learning_rate": 0.001998332605057221, + "loss": 0.1953, + "step": 2842 + }, + { + "epoch": 0.024678605220440793, + "grad_norm": 0.11328125, + "learning_rate": 0.0019983307937241957, + "loss": 0.2617, + "step": 2843 + }, + { + "epoch": 0.02468728570064496, + "grad_norm": 0.05908203125, + "learning_rate": 0.0019983289814087695, + "loss": 0.1523, + "step": 2844 + }, + { + "epoch": 0.024695966180849124, + "grad_norm": 0.263671875, + "learning_rate": 0.0019983271681109447, + "loss": 0.1855, + "step": 2845 + }, + { + "epoch": 0.02470464666105329, + "grad_norm": 0.0791015625, + "learning_rate": 0.001998325353830722, + "loss": 0.1553, + "step": 2846 + }, + { + "epoch": 0.024713327141257454, + "grad_norm": 0.205078125, + "learning_rate": 0.0019983235385681044, + "loss": 0.2148, + "step": 2847 + }, + { + "epoch": 0.02472200762146162, + "grad_norm": 0.3671875, + "learning_rate": 0.001998321722323094, + "loss": 0.2148, + "step": 2848 + }, + { + "epoch": 0.024730688101665784, + "grad_norm": 0.0908203125, + "learning_rate": 0.001998319905095692, + "loss": 0.2344, + "step": 2849 + }, + { + "epoch": 0.02473936858186995, + "grad_norm": 0.3125, + "learning_rate": 0.001998318086885901, + "loss": 0.2598, + "step": 2850 + }, + { + "epoch": 0.024748049062074114, + "grad_norm": 0.1845703125, + "learning_rate": 0.001998316267693723, + "loss": 0.25, + "step": 2851 + }, + { + "epoch": 0.02475672954227828, + "grad_norm": 0.0791015625, + "learning_rate": 0.00199831444751916, + "loss": 0.1719, + "step": 2852 + }, + { + "epoch": 0.024765410022482445, + "grad_norm": 0.11865234375, + "learning_rate": 0.0019983126263622138, + "loss": 0.3223, + "step": 2853 + }, + { + "epoch": 0.02477409050268661, + "grad_norm": 0.12060546875, + "learning_rate": 0.001998310804222886, + "loss": 0.1758, + "step": 2854 + }, + { + "epoch": 0.024782770982890775, + "grad_norm": 0.197265625, + "learning_rate": 0.001998308981101179, + "loss": 0.2363, + "step": 2855 + }, + { + "epoch": 0.02479145146309494, + "grad_norm": 0.21875, + "learning_rate": 0.001998307156997095, + "loss": 0.2148, + "step": 2856 + }, + { + "epoch": 0.0248001319432991, + "grad_norm": 0.1435546875, + "learning_rate": 0.001998305331910636, + "loss": 0.1699, + "step": 2857 + }, + { + "epoch": 0.024808812423503267, + "grad_norm": 0.1376953125, + "learning_rate": 0.0019983035058418032, + "loss": 0.1963, + "step": 2858 + }, + { + "epoch": 0.024817492903707432, + "grad_norm": 0.1064453125, + "learning_rate": 0.0019983016787905998, + "loss": 0.2734, + "step": 2859 + }, + { + "epoch": 0.024826173383911597, + "grad_norm": 0.1162109375, + "learning_rate": 0.0019982998507570267, + "loss": 0.1289, + "step": 2860 + }, + { + "epoch": 0.024834853864115762, + "grad_norm": 0.279296875, + "learning_rate": 0.0019982980217410867, + "loss": 0.2139, + "step": 2861 + }, + { + "epoch": 0.024843534344319927, + "grad_norm": 0.1005859375, + "learning_rate": 0.0019982961917427815, 
+ "loss": 0.209, + "step": 2862 + }, + { + "epoch": 0.024852214824524092, + "grad_norm": 0.126953125, + "learning_rate": 0.0019982943607621127, + "loss": 0.1885, + "step": 2863 + }, + { + "epoch": 0.024860895304728257, + "grad_norm": 0.177734375, + "learning_rate": 0.001998292528799083, + "loss": 0.2246, + "step": 2864 + }, + { + "epoch": 0.024869575784932423, + "grad_norm": 0.1767578125, + "learning_rate": 0.001998290695853694, + "loss": 0.2031, + "step": 2865 + }, + { + "epoch": 0.024878256265136588, + "grad_norm": 0.419921875, + "learning_rate": 0.0019982888619259477, + "loss": 0.1895, + "step": 2866 + }, + { + "epoch": 0.024886936745340753, + "grad_norm": 0.1962890625, + "learning_rate": 0.0019982870270158462, + "loss": 0.2227, + "step": 2867 + }, + { + "epoch": 0.024895617225544918, + "grad_norm": 0.0712890625, + "learning_rate": 0.001998285191123392, + "loss": 0.2129, + "step": 2868 + }, + { + "epoch": 0.024904297705749083, + "grad_norm": 0.2265625, + "learning_rate": 0.001998283354248586, + "loss": 0.2354, + "step": 2869 + }, + { + "epoch": 0.024912978185953248, + "grad_norm": 0.1015625, + "learning_rate": 0.001998281516391431, + "loss": 0.207, + "step": 2870 + }, + { + "epoch": 0.024921658666157413, + "grad_norm": 0.087890625, + "learning_rate": 0.001998279677551929, + "loss": 0.2637, + "step": 2871 + }, + { + "epoch": 0.02493033914636158, + "grad_norm": 0.11376953125, + "learning_rate": 0.0019982778377300816, + "loss": 0.2656, + "step": 2872 + }, + { + "epoch": 0.02493901962656574, + "grad_norm": 0.07763671875, + "learning_rate": 0.0019982759969258915, + "loss": 0.2266, + "step": 2873 + }, + { + "epoch": 0.024947700106769905, + "grad_norm": 0.0703125, + "learning_rate": 0.0019982741551393597, + "loss": 0.1846, + "step": 2874 + }, + { + "epoch": 0.02495638058697407, + "grad_norm": 0.1689453125, + "learning_rate": 0.001998272312370489, + "loss": 0.2266, + "step": 2875 + }, + { + "epoch": 0.024965061067178235, + "grad_norm": 0.2138671875, + "learning_rate": 0.0019982704686192813, + "loss": 0.1787, + "step": 2876 + }, + { + "epoch": 0.0249737415473824, + "grad_norm": 0.08203125, + "learning_rate": 0.0019982686238857387, + "loss": 0.2227, + "step": 2877 + }, + { + "epoch": 0.024982422027586566, + "grad_norm": 0.185546875, + "learning_rate": 0.0019982667781698626, + "loss": 0.2197, + "step": 2878 + }, + { + "epoch": 0.02499110250779073, + "grad_norm": 0.08837890625, + "learning_rate": 0.0019982649314716555, + "loss": 0.2227, + "step": 2879 + }, + { + "epoch": 0.024999782987994896, + "grad_norm": 0.2392578125, + "learning_rate": 0.0019982630837911196, + "loss": 0.3633, + "step": 2880 + }, + { + "epoch": 0.02500846346819906, + "grad_norm": 0.1337890625, + "learning_rate": 0.0019982612351282566, + "loss": 0.2285, + "step": 2881 + }, + { + "epoch": 0.025017143948403226, + "grad_norm": 0.08642578125, + "learning_rate": 0.0019982593854830683, + "loss": 0.2168, + "step": 2882 + }, + { + "epoch": 0.02502582442860739, + "grad_norm": 0.1015625, + "learning_rate": 0.0019982575348555572, + "loss": 0.2832, + "step": 2883 + }, + { + "epoch": 0.025034504908811556, + "grad_norm": 0.1416015625, + "learning_rate": 0.0019982556832457256, + "loss": 0.2178, + "step": 2884 + }, + { + "epoch": 0.02504318538901572, + "grad_norm": 0.0732421875, + "learning_rate": 0.0019982538306535748, + "loss": 0.1826, + "step": 2885 + }, + { + "epoch": 0.025051865869219887, + "grad_norm": 0.0849609375, + "learning_rate": 0.001998251977079107, + "loss": 0.2012, + "step": 2886 + }, + { + "epoch": 0.02506054634942405, + 
"grad_norm": 0.2353515625, + "learning_rate": 0.0019982501225223243, + "loss": 0.2422, + "step": 2887 + }, + { + "epoch": 0.025069226829628213, + "grad_norm": 0.08154296875, + "learning_rate": 0.001998248266983229, + "loss": 0.2578, + "step": 2888 + }, + { + "epoch": 0.02507790730983238, + "grad_norm": 0.08984375, + "learning_rate": 0.001998246410461823, + "loss": 0.1514, + "step": 2889 + }, + { + "epoch": 0.025086587790036544, + "grad_norm": 0.1953125, + "learning_rate": 0.0019982445529581074, + "loss": 0.2266, + "step": 2890 + }, + { + "epoch": 0.02509526827024071, + "grad_norm": 0.1669921875, + "learning_rate": 0.0019982426944720856, + "loss": 0.2227, + "step": 2891 + }, + { + "epoch": 0.025103948750444874, + "grad_norm": 0.06884765625, + "learning_rate": 0.0019982408350037594, + "loss": 0.1748, + "step": 2892 + }, + { + "epoch": 0.02511262923064904, + "grad_norm": 0.1875, + "learning_rate": 0.00199823897455313, + "loss": 0.1689, + "step": 2893 + }, + { + "epoch": 0.025121309710853204, + "grad_norm": 0.1123046875, + "learning_rate": 0.0019982371131202, + "loss": 0.1709, + "step": 2894 + }, + { + "epoch": 0.02512999019105737, + "grad_norm": 0.08447265625, + "learning_rate": 0.0019982352507049717, + "loss": 0.1875, + "step": 2895 + }, + { + "epoch": 0.025138670671261534, + "grad_norm": 0.11767578125, + "learning_rate": 0.001998233387307447, + "loss": 0.2207, + "step": 2896 + }, + { + "epoch": 0.0251473511514657, + "grad_norm": 0.080078125, + "learning_rate": 0.0019982315229276275, + "loss": 0.2324, + "step": 2897 + }, + { + "epoch": 0.025156031631669865, + "grad_norm": 0.11767578125, + "learning_rate": 0.0019982296575655153, + "loss": 0.2344, + "step": 2898 + }, + { + "epoch": 0.02516471211187403, + "grad_norm": 0.1484375, + "learning_rate": 0.0019982277912211125, + "loss": 0.1992, + "step": 2899 + }, + { + "epoch": 0.025173392592078195, + "grad_norm": 0.10009765625, + "learning_rate": 0.0019982259238944216, + "loss": 0.2266, + "step": 2900 + }, + { + "epoch": 0.02518207307228236, + "grad_norm": 0.1259765625, + "learning_rate": 0.0019982240555854445, + "loss": 0.1924, + "step": 2901 + }, + { + "epoch": 0.025190753552486525, + "grad_norm": 0.150390625, + "learning_rate": 0.001998222186294183, + "loss": 0.2119, + "step": 2902 + }, + { + "epoch": 0.02519943403269069, + "grad_norm": 0.0732421875, + "learning_rate": 0.001998220316020639, + "loss": 0.2324, + "step": 2903 + }, + { + "epoch": 0.025208114512894852, + "grad_norm": 0.11669921875, + "learning_rate": 0.0019982184447648148, + "loss": 0.1973, + "step": 2904 + }, + { + "epoch": 0.025216794993099017, + "grad_norm": 0.1455078125, + "learning_rate": 0.0019982165725267124, + "loss": 0.1895, + "step": 2905 + }, + { + "epoch": 0.025225475473303182, + "grad_norm": 0.220703125, + "learning_rate": 0.001998214699306334, + "loss": 0.1426, + "step": 2906 + }, + { + "epoch": 0.025234155953507347, + "grad_norm": 0.228515625, + "learning_rate": 0.001998212825103681, + "loss": 0.1875, + "step": 2907 + }, + { + "epoch": 0.025242836433711512, + "grad_norm": 0.11865234375, + "learning_rate": 0.0019982109499187568, + "loss": 0.2119, + "step": 2908 + }, + { + "epoch": 0.025251516913915677, + "grad_norm": 0.08251953125, + "learning_rate": 0.001998209073751562, + "loss": 0.1943, + "step": 2909 + }, + { + "epoch": 0.025260197394119843, + "grad_norm": 0.10009765625, + "learning_rate": 0.0019982071966020993, + "loss": 0.2383, + "step": 2910 + }, + { + "epoch": 0.025268877874324008, + "grad_norm": 0.2294921875, + "learning_rate": 0.0019982053184703706, + "loss": 
0.25, + "step": 2911 + }, + { + "epoch": 0.025277558354528173, + "grad_norm": 0.13671875, + "learning_rate": 0.0019982034393563786, + "loss": 0.2402, + "step": 2912 + }, + { + "epoch": 0.025286238834732338, + "grad_norm": 0.1376953125, + "learning_rate": 0.0019982015592601246, + "loss": 0.2539, + "step": 2913 + }, + { + "epoch": 0.025294919314936503, + "grad_norm": 0.123046875, + "learning_rate": 0.0019981996781816107, + "loss": 0.2266, + "step": 2914 + }, + { + "epoch": 0.025303599795140668, + "grad_norm": 0.06494140625, + "learning_rate": 0.001998197796120839, + "loss": 0.2266, + "step": 2915 + }, + { + "epoch": 0.025312280275344833, + "grad_norm": 0.111328125, + "learning_rate": 0.001998195913077812, + "loss": 0.2676, + "step": 2916 + }, + { + "epoch": 0.025320960755549, + "grad_norm": 0.0986328125, + "learning_rate": 0.0019981940290525312, + "loss": 0.1758, + "step": 2917 + }, + { + "epoch": 0.025329641235753163, + "grad_norm": 0.1484375, + "learning_rate": 0.0019981921440449988, + "loss": 0.2148, + "step": 2918 + }, + { + "epoch": 0.025338321715957325, + "grad_norm": 0.1669921875, + "learning_rate": 0.0019981902580552173, + "loss": 0.2363, + "step": 2919 + }, + { + "epoch": 0.02534700219616149, + "grad_norm": 0.10546875, + "learning_rate": 0.001998188371083188, + "loss": 0.1895, + "step": 2920 + }, + { + "epoch": 0.025355682676365655, + "grad_norm": 0.1298828125, + "learning_rate": 0.0019981864831289144, + "loss": 0.1787, + "step": 2921 + }, + { + "epoch": 0.02536436315656982, + "grad_norm": 0.158203125, + "learning_rate": 0.0019981845941923967, + "loss": 0.2471, + "step": 2922 + }, + { + "epoch": 0.025373043636773986, + "grad_norm": 0.08447265625, + "learning_rate": 0.001998182704273638, + "loss": 0.2109, + "step": 2923 + }, + { + "epoch": 0.02538172411697815, + "grad_norm": 0.078125, + "learning_rate": 0.00199818081337264, + "loss": 0.1895, + "step": 2924 + }, + { + "epoch": 0.025390404597182316, + "grad_norm": 0.2353515625, + "learning_rate": 0.0019981789214894054, + "loss": 0.1602, + "step": 2925 + }, + { + "epoch": 0.02539908507738648, + "grad_norm": 0.2216796875, + "learning_rate": 0.0019981770286239355, + "loss": 0.2637, + "step": 2926 + }, + { + "epoch": 0.025407765557590646, + "grad_norm": 0.185546875, + "learning_rate": 0.0019981751347762327, + "loss": 0.2266, + "step": 2927 + }, + { + "epoch": 0.02541644603779481, + "grad_norm": 0.2021484375, + "learning_rate": 0.0019981732399462987, + "loss": 0.2119, + "step": 2928 + }, + { + "epoch": 0.025425126517998976, + "grad_norm": 0.2099609375, + "learning_rate": 0.0019981713441341365, + "loss": 0.2578, + "step": 2929 + }, + { + "epoch": 0.02543380699820314, + "grad_norm": 0.1787109375, + "learning_rate": 0.0019981694473397474, + "loss": 0.1826, + "step": 2930 + }, + { + "epoch": 0.025442487478407307, + "grad_norm": 0.068359375, + "learning_rate": 0.0019981675495631336, + "loss": 0.1553, + "step": 2931 + }, + { + "epoch": 0.02545116795861147, + "grad_norm": 0.10498046875, + "learning_rate": 0.0019981656508042977, + "loss": 0.2324, + "step": 2932 + }, + { + "epoch": 0.025459848438815637, + "grad_norm": 0.08740234375, + "learning_rate": 0.001998163751063241, + "loss": 0.21, + "step": 2933 + }, + { + "epoch": 0.0254685289190198, + "grad_norm": 0.083984375, + "learning_rate": 0.001998161850339966, + "loss": 0.209, + "step": 2934 + }, + { + "epoch": 0.025477209399223964, + "grad_norm": 0.11767578125, + "learning_rate": 0.001998159948634475, + "loss": 0.1865, + "step": 2935 + }, + { + "epoch": 0.02548588987942813, + "grad_norm": 
0.0771484375, + "learning_rate": 0.0019981580459467693, + "loss": 0.1689, + "step": 2936 + }, + { + "epoch": 0.025494570359632294, + "grad_norm": 0.1728515625, + "learning_rate": 0.0019981561422768514, + "loss": 0.2227, + "step": 2937 + }, + { + "epoch": 0.02550325083983646, + "grad_norm": 0.09326171875, + "learning_rate": 0.001998154237624724, + "loss": 0.2363, + "step": 2938 + }, + { + "epoch": 0.025511931320040624, + "grad_norm": 0.3359375, + "learning_rate": 0.001998152331990388, + "loss": 0.1934, + "step": 2939 + }, + { + "epoch": 0.02552061180024479, + "grad_norm": 0.302734375, + "learning_rate": 0.0019981504253738466, + "loss": 0.2188, + "step": 2940 + }, + { + "epoch": 0.025529292280448954, + "grad_norm": 0.46484375, + "learning_rate": 0.001998148517775101, + "loss": 0.1982, + "step": 2941 + }, + { + "epoch": 0.02553797276065312, + "grad_norm": 0.4765625, + "learning_rate": 0.001998146609194154, + "loss": 0.2383, + "step": 2942 + }, + { + "epoch": 0.025546653240857285, + "grad_norm": 0.1474609375, + "learning_rate": 0.001998144699631007, + "loss": 0.1973, + "step": 2943 + }, + { + "epoch": 0.02555533372106145, + "grad_norm": 0.0966796875, + "learning_rate": 0.0019981427890856628, + "loss": 0.1992, + "step": 2944 + }, + { + "epoch": 0.025564014201265615, + "grad_norm": 0.25, + "learning_rate": 0.0019981408775581228, + "loss": 0.1396, + "step": 2945 + }, + { + "epoch": 0.02557269468146978, + "grad_norm": 0.35546875, + "learning_rate": 0.0019981389650483897, + "loss": 0.207, + "step": 2946 + }, + { + "epoch": 0.025581375161673945, + "grad_norm": 0.5703125, + "learning_rate": 0.0019981370515564654, + "loss": 0.2988, + "step": 2947 + }, + { + "epoch": 0.02559005564187811, + "grad_norm": 0.283203125, + "learning_rate": 0.0019981351370823514, + "loss": 0.1934, + "step": 2948 + }, + { + "epoch": 0.025598736122082275, + "grad_norm": 0.337890625, + "learning_rate": 0.0019981332216260504, + "loss": 0.2061, + "step": 2949 + }, + { + "epoch": 0.025607416602286437, + "grad_norm": 0.09765625, + "learning_rate": 0.001998131305187565, + "loss": 0.1904, + "step": 2950 + }, + { + "epoch": 0.025616097082490602, + "grad_norm": 0.271484375, + "learning_rate": 0.0019981293877668962, + "loss": 0.1719, + "step": 2951 + }, + { + "epoch": 0.025624777562694767, + "grad_norm": 0.251953125, + "learning_rate": 0.001998127469364047, + "loss": 0.2266, + "step": 2952 + }, + { + "epoch": 0.025633458042898932, + "grad_norm": 0.0908203125, + "learning_rate": 0.001998125549979019, + "loss": 0.2129, + "step": 2953 + }, + { + "epoch": 0.025642138523103097, + "grad_norm": 0.17578125, + "learning_rate": 0.0019981236296118137, + "loss": 0.3711, + "step": 2954 + }, + { + "epoch": 0.025650819003307263, + "grad_norm": 0.0654296875, + "learning_rate": 0.0019981217082624347, + "loss": 0.2148, + "step": 2955 + }, + { + "epoch": 0.025659499483511428, + "grad_norm": 0.2734375, + "learning_rate": 0.001998119785930883, + "loss": 0.1963, + "step": 2956 + }, + { + "epoch": 0.025668179963715593, + "grad_norm": 0.1337890625, + "learning_rate": 0.001998117862617161, + "loss": 0.1826, + "step": 2957 + }, + { + "epoch": 0.025676860443919758, + "grad_norm": 0.1376953125, + "learning_rate": 0.001998115938321271, + "loss": 0.2451, + "step": 2958 + }, + { + "epoch": 0.025685540924123923, + "grad_norm": 0.296875, + "learning_rate": 0.0019981140130432146, + "loss": 0.2471, + "step": 2959 + }, + { + "epoch": 0.025694221404328088, + "grad_norm": 0.1513671875, + "learning_rate": 0.0019981120867829947, + "loss": 0.1826, + "step": 2960 + }, + { + 
"epoch": 0.025702901884532253, + "grad_norm": 0.296875, + "learning_rate": 0.0019981101595406125, + "loss": 0.2139, + "step": 2961 + }, + { + "epoch": 0.02571158236473642, + "grad_norm": 0.08544921875, + "learning_rate": 0.00199810823131607, + "loss": 0.2061, + "step": 2962 + }, + { + "epoch": 0.025720262844940583, + "grad_norm": 0.0830078125, + "learning_rate": 0.001998106302109371, + "loss": 0.2119, + "step": 2963 + }, + { + "epoch": 0.02572894332514475, + "grad_norm": 0.185546875, + "learning_rate": 0.0019981043719205158, + "loss": 0.1816, + "step": 2964 + }, + { + "epoch": 0.02573762380534891, + "grad_norm": 0.15625, + "learning_rate": 0.001998102440749507, + "loss": 0.2363, + "step": 2965 + }, + { + "epoch": 0.025746304285553075, + "grad_norm": 0.0791015625, + "learning_rate": 0.001998100508596347, + "loss": 0.2324, + "step": 2966 + }, + { + "epoch": 0.02575498476575724, + "grad_norm": 0.1884765625, + "learning_rate": 0.001998098575461038, + "loss": 0.2363, + "step": 2967 + }, + { + "epoch": 0.025763665245961406, + "grad_norm": 0.255859375, + "learning_rate": 0.0019980966413435815, + "loss": 0.2188, + "step": 2968 + }, + { + "epoch": 0.02577234572616557, + "grad_norm": 0.10791015625, + "learning_rate": 0.0019980947062439806, + "loss": 0.2266, + "step": 2969 + }, + { + "epoch": 0.025781026206369736, + "grad_norm": 0.134765625, + "learning_rate": 0.0019980927701622364, + "loss": 0.248, + "step": 2970 + }, + { + "epoch": 0.0257897066865739, + "grad_norm": 0.2734375, + "learning_rate": 0.0019980908330983513, + "loss": 0.2041, + "step": 2971 + }, + { + "epoch": 0.025798387166778066, + "grad_norm": 0.083984375, + "learning_rate": 0.001998088895052328, + "loss": 0.2305, + "step": 2972 + }, + { + "epoch": 0.02580706764698223, + "grad_norm": 0.1044921875, + "learning_rate": 0.001998086956024168, + "loss": 0.2422, + "step": 2973 + }, + { + "epoch": 0.025815748127186396, + "grad_norm": 0.15234375, + "learning_rate": 0.001998085016013873, + "loss": 0.2539, + "step": 2974 + }, + { + "epoch": 0.02582442860739056, + "grad_norm": 0.19140625, + "learning_rate": 0.0019980830750214464, + "loss": 0.209, + "step": 2975 + }, + { + "epoch": 0.025833109087594727, + "grad_norm": 0.07861328125, + "learning_rate": 0.0019980811330468896, + "loss": 0.1992, + "step": 2976 + }, + { + "epoch": 0.02584178956779889, + "grad_norm": 0.09326171875, + "learning_rate": 0.0019980791900902047, + "loss": 0.2188, + "step": 2977 + }, + { + "epoch": 0.025850470048003057, + "grad_norm": 0.1376953125, + "learning_rate": 0.001998077246151394, + "loss": 0.1963, + "step": 2978 + }, + { + "epoch": 0.025859150528207222, + "grad_norm": 0.091796875, + "learning_rate": 0.001998075301230459, + "loss": 0.2383, + "step": 2979 + }, + { + "epoch": 0.025867831008411387, + "grad_norm": 0.07275390625, + "learning_rate": 0.0019980733553274033, + "loss": 0.1953, + "step": 2980 + }, + { + "epoch": 0.02587651148861555, + "grad_norm": 0.2353515625, + "learning_rate": 0.0019980714084422272, + "loss": 0.2363, + "step": 2981 + }, + { + "epoch": 0.025885191968819714, + "grad_norm": 0.2255859375, + "learning_rate": 0.001998069460574934, + "loss": 0.1855, + "step": 2982 + }, + { + "epoch": 0.02589387244902388, + "grad_norm": 0.2236328125, + "learning_rate": 0.0019980675117255257, + "loss": 0.248, + "step": 2983 + }, + { + "epoch": 0.025902552929228044, + "grad_norm": 0.08984375, + "learning_rate": 0.001998065561894004, + "loss": 0.208, + "step": 2984 + }, + { + "epoch": 0.02591123340943221, + "grad_norm": 0.236328125, + "learning_rate": 
0.001998063611080371, + "loss": 0.2031, + "step": 2985 + }, + { + "epoch": 0.025919913889636374, + "grad_norm": 0.05908203125, + "learning_rate": 0.00199806165928463, + "loss": 0.168, + "step": 2986 + }, + { + "epoch": 0.02592859436984054, + "grad_norm": 0.0859375, + "learning_rate": 0.001998059706506782, + "loss": 0.2246, + "step": 2987 + }, + { + "epoch": 0.025937274850044705, + "grad_norm": 0.0927734375, + "learning_rate": 0.001998057752746829, + "loss": 0.1826, + "step": 2988 + }, + { + "epoch": 0.02594595533024887, + "grad_norm": 0.11376953125, + "learning_rate": 0.0019980557980047737, + "loss": 0.1592, + "step": 2989 + }, + { + "epoch": 0.025954635810453035, + "grad_norm": 0.083984375, + "learning_rate": 0.001998053842280618, + "loss": 0.1836, + "step": 2990 + }, + { + "epoch": 0.0259633162906572, + "grad_norm": 0.2197265625, + "learning_rate": 0.0019980518855743645, + "loss": 0.209, + "step": 2991 + }, + { + "epoch": 0.025971996770861365, + "grad_norm": 0.1689453125, + "learning_rate": 0.0019980499278860146, + "loss": 0.2168, + "step": 2992 + }, + { + "epoch": 0.02598067725106553, + "grad_norm": 1.046875, + "learning_rate": 0.001998047969215571, + "loss": 0.2324, + "step": 2993 + }, + { + "epoch": 0.025989357731269695, + "grad_norm": 0.11376953125, + "learning_rate": 0.001998046009563035, + "loss": 0.2197, + "step": 2994 + }, + { + "epoch": 0.02599803821147386, + "grad_norm": 0.330078125, + "learning_rate": 0.00199804404892841, + "loss": 0.2109, + "step": 2995 + }, + { + "epoch": 0.026006718691678022, + "grad_norm": 0.2578125, + "learning_rate": 0.0019980420873116976, + "loss": 0.1963, + "step": 2996 + }, + { + "epoch": 0.026015399171882187, + "grad_norm": 0.482421875, + "learning_rate": 0.0019980401247128993, + "loss": 0.2305, + "step": 2997 + }, + { + "epoch": 0.026024079652086352, + "grad_norm": 0.185546875, + "learning_rate": 0.001998038161132018, + "loss": 0.1777, + "step": 2998 + }, + { + "epoch": 0.026032760132290517, + "grad_norm": 0.2216796875, + "learning_rate": 0.001998036196569056, + "loss": 0.2021, + "step": 2999 + }, + { + "epoch": 0.026041440612494682, + "grad_norm": 0.08935546875, + "learning_rate": 0.001998034231024015, + "loss": 0.2012, + "step": 3000 + }, + { + "epoch": 0.026050121092698848, + "grad_norm": 0.4765625, + "learning_rate": 0.0019980322644968973, + "loss": 0.2773, + "step": 3001 + }, + { + "epoch": 0.026058801572903013, + "grad_norm": 0.59765625, + "learning_rate": 0.001998030296987705, + "loss": 0.2471, + "step": 3002 + }, + { + "epoch": 0.026067482053107178, + "grad_norm": 0.119140625, + "learning_rate": 0.00199802832849644, + "loss": 0.2334, + "step": 3003 + }, + { + "epoch": 0.026076162533311343, + "grad_norm": 0.2236328125, + "learning_rate": 0.0019980263590231046, + "loss": 0.1885, + "step": 3004 + }, + { + "epoch": 0.026084843013515508, + "grad_norm": 0.1240234375, + "learning_rate": 0.0019980243885677016, + "loss": 0.2188, + "step": 3005 + }, + { + "epoch": 0.026093523493719673, + "grad_norm": 0.087890625, + "learning_rate": 0.001998022417130232, + "loss": 0.2051, + "step": 3006 + }, + { + "epoch": 0.02610220397392384, + "grad_norm": 0.2060546875, + "learning_rate": 0.001998020444710699, + "loss": 0.2539, + "step": 3007 + }, + { + "epoch": 0.026110884454128003, + "grad_norm": 0.07568359375, + "learning_rate": 0.0019980184713091044, + "loss": 0.1709, + "step": 3008 + }, + { + "epoch": 0.02611956493433217, + "grad_norm": 0.44140625, + "learning_rate": 0.00199801649692545, + "loss": 0.2354, + "step": 3009 + }, + { + "epoch": 0.026128245414536334, 
+ "grad_norm": 0.11083984375, + "learning_rate": 0.0019980145215597383, + "loss": 0.2109, + "step": 3010 + }, + { + "epoch": 0.026136925894740495, + "grad_norm": 0.8203125, + "learning_rate": 0.001998012545211972, + "loss": 0.2617, + "step": 3011 + }, + { + "epoch": 0.02614560637494466, + "grad_norm": 0.1025390625, + "learning_rate": 0.001998010567882152, + "loss": 0.1914, + "step": 3012 + }, + { + "epoch": 0.026154286855148826, + "grad_norm": 0.1484375, + "learning_rate": 0.001998008589570281, + "loss": 0.2363, + "step": 3013 + }, + { + "epoch": 0.02616296733535299, + "grad_norm": 0.1845703125, + "learning_rate": 0.001998006610276362, + "loss": 0.1699, + "step": 3014 + }, + { + "epoch": 0.026171647815557156, + "grad_norm": 0.1669921875, + "learning_rate": 0.001998004630000396, + "loss": 0.1719, + "step": 3015 + }, + { + "epoch": 0.02618032829576132, + "grad_norm": 0.453125, + "learning_rate": 0.001998002648742386, + "loss": 0.2461, + "step": 3016 + }, + { + "epoch": 0.026189008775965486, + "grad_norm": 0.1708984375, + "learning_rate": 0.001998000666502333, + "loss": 0.2539, + "step": 3017 + }, + { + "epoch": 0.02619768925616965, + "grad_norm": 0.07861328125, + "learning_rate": 0.0019979986832802405, + "loss": 0.2109, + "step": 3018 + }, + { + "epoch": 0.026206369736373816, + "grad_norm": 0.1005859375, + "learning_rate": 0.00199799669907611, + "loss": 0.252, + "step": 3019 + }, + { + "epoch": 0.02621505021657798, + "grad_norm": 0.302734375, + "learning_rate": 0.001997994713889944, + "loss": 0.2461, + "step": 3020 + }, + { + "epoch": 0.026223730696782147, + "grad_norm": 0.216796875, + "learning_rate": 0.0019979927277217445, + "loss": 0.1895, + "step": 3021 + }, + { + "epoch": 0.02623241117698631, + "grad_norm": 0.1318359375, + "learning_rate": 0.0019979907405715137, + "loss": 0.2188, + "step": 3022 + }, + { + "epoch": 0.026241091657190477, + "grad_norm": 0.14453125, + "learning_rate": 0.0019979887524392533, + "loss": 0.1807, + "step": 3023 + }, + { + "epoch": 0.026249772137394642, + "grad_norm": 0.07470703125, + "learning_rate": 0.0019979867633249664, + "loss": 0.1992, + "step": 3024 + }, + { + "epoch": 0.026258452617598807, + "grad_norm": 0.2001953125, + "learning_rate": 0.0019979847732286542, + "loss": 0.2207, + "step": 3025 + }, + { + "epoch": 0.026267133097802972, + "grad_norm": 0.296875, + "learning_rate": 0.00199798278215032, + "loss": 0.1924, + "step": 3026 + }, + { + "epoch": 0.026275813578007134, + "grad_norm": 0.0888671875, + "learning_rate": 0.0019979807900899647, + "loss": 0.1953, + "step": 3027 + }, + { + "epoch": 0.0262844940582113, + "grad_norm": 0.09912109375, + "learning_rate": 0.001997978797047591, + "loss": 0.1982, + "step": 3028 + }, + { + "epoch": 0.026293174538415464, + "grad_norm": 0.197265625, + "learning_rate": 0.0019979768030232016, + "loss": 0.2441, + "step": 3029 + }, + { + "epoch": 0.02630185501861963, + "grad_norm": 0.1962890625, + "learning_rate": 0.001997974808016798, + "loss": 0.1455, + "step": 3030 + }, + { + "epoch": 0.026310535498823794, + "grad_norm": 0.10498046875, + "learning_rate": 0.0019979728120283827, + "loss": 0.2168, + "step": 3031 + }, + { + "epoch": 0.02631921597902796, + "grad_norm": 0.1640625, + "learning_rate": 0.0019979708150579577, + "loss": 0.1914, + "step": 3032 + }, + { + "epoch": 0.026327896459232124, + "grad_norm": 0.53125, + "learning_rate": 0.0019979688171055257, + "loss": 0.2119, + "step": 3033 + }, + { + "epoch": 0.02633657693943629, + "grad_norm": 0.1748046875, + "learning_rate": 0.0019979668181710885, + "loss": 0.2148, + "step": 
3034 + }, + { + "epoch": 0.026345257419640455, + "grad_norm": 0.10546875, + "learning_rate": 0.001997964818254648, + "loss": 0.2852, + "step": 3035 + }, + { + "epoch": 0.02635393789984462, + "grad_norm": 0.0810546875, + "learning_rate": 0.0019979628173562064, + "loss": 0.2383, + "step": 3036 + }, + { + "epoch": 0.026362618380048785, + "grad_norm": 0.1865234375, + "learning_rate": 0.001997960815475767, + "loss": 0.2051, + "step": 3037 + }, + { + "epoch": 0.02637129886025295, + "grad_norm": 0.0947265625, + "learning_rate": 0.0019979588126133306, + "loss": 0.2109, + "step": 3038 + }, + { + "epoch": 0.026379979340457115, + "grad_norm": 0.296875, + "learning_rate": 0.0019979568087689, + "loss": 0.1953, + "step": 3039 + }, + { + "epoch": 0.02638865982066128, + "grad_norm": 0.498046875, + "learning_rate": 0.001997954803942477, + "loss": 0.2246, + "step": 3040 + }, + { + "epoch": 0.026397340300865445, + "grad_norm": 0.1953125, + "learning_rate": 0.0019979527981340644, + "loss": 0.1875, + "step": 3041 + }, + { + "epoch": 0.026406020781069607, + "grad_norm": 0.373046875, + "learning_rate": 0.0019979507913436643, + "loss": 0.2051, + "step": 3042 + }, + { + "epoch": 0.026414701261273772, + "grad_norm": 0.08056640625, + "learning_rate": 0.0019979487835712784, + "loss": 0.1816, + "step": 3043 + }, + { + "epoch": 0.026423381741477937, + "grad_norm": 0.12158203125, + "learning_rate": 0.001997946774816909, + "loss": 0.2197, + "step": 3044 + }, + { + "epoch": 0.026432062221682102, + "grad_norm": 0.392578125, + "learning_rate": 0.001997944765080559, + "loss": 0.2539, + "step": 3045 + }, + { + "epoch": 0.026440742701886268, + "grad_norm": 0.287109375, + "learning_rate": 0.0019979427543622297, + "loss": 0.1699, + "step": 3046 + }, + { + "epoch": 0.026449423182090433, + "grad_norm": 0.166015625, + "learning_rate": 0.001997940742661924, + "loss": 0.3574, + "step": 3047 + }, + { + "epoch": 0.026458103662294598, + "grad_norm": 0.185546875, + "learning_rate": 0.001997938729979644, + "loss": 0.2422, + "step": 3048 + }, + { + "epoch": 0.026466784142498763, + "grad_norm": 0.2119140625, + "learning_rate": 0.001997936716315391, + "loss": 0.2422, + "step": 3049 + }, + { + "epoch": 0.026475464622702928, + "grad_norm": 0.296875, + "learning_rate": 0.0019979347016691688, + "loss": 0.2119, + "step": 3050 + }, + { + "epoch": 0.026484145102907093, + "grad_norm": 0.0703125, + "learning_rate": 0.001997932686040978, + "loss": 0.168, + "step": 3051 + }, + { + "epoch": 0.02649282558311126, + "grad_norm": 0.1044921875, + "learning_rate": 0.0019979306694308217, + "loss": 0.2578, + "step": 3052 + }, + { + "epoch": 0.026501506063315423, + "grad_norm": 0.13671875, + "learning_rate": 0.001997928651838702, + "loss": 0.1875, + "step": 3053 + }, + { + "epoch": 0.02651018654351959, + "grad_norm": 0.384765625, + "learning_rate": 0.001997926633264621, + "loss": 0.1797, + "step": 3054 + }, + { + "epoch": 0.026518867023723754, + "grad_norm": 0.17578125, + "learning_rate": 0.001997924613708581, + "loss": 0.1797, + "step": 3055 + }, + { + "epoch": 0.02652754750392792, + "grad_norm": 0.369140625, + "learning_rate": 0.001997922593170584, + "loss": 0.1846, + "step": 3056 + }, + { + "epoch": 0.026536227984132084, + "grad_norm": 0.29296875, + "learning_rate": 0.001997920571650632, + "loss": 0.2051, + "step": 3057 + }, + { + "epoch": 0.026544908464336246, + "grad_norm": 0.08544921875, + "learning_rate": 0.0019979185491487282, + "loss": 0.2207, + "step": 3058 + }, + { + "epoch": 0.02655358894454041, + "grad_norm": 0.17578125, + "learning_rate": 
0.001997916525664874, + "loss": 0.2031, + "step": 3059 + }, + { + "epoch": 0.026562269424744576, + "grad_norm": 0.10595703125, + "learning_rate": 0.0019979145011990713, + "loss": 0.1768, + "step": 3060 + }, + { + "epoch": 0.02657094990494874, + "grad_norm": 0.265625, + "learning_rate": 0.001997912475751323, + "loss": 0.252, + "step": 3061 + }, + { + "epoch": 0.026579630385152906, + "grad_norm": 0.10595703125, + "learning_rate": 0.0019979104493216314, + "loss": 0.1807, + "step": 3062 + }, + { + "epoch": 0.02658831086535707, + "grad_norm": 0.0888671875, + "learning_rate": 0.0019979084219099983, + "loss": 0.209, + "step": 3063 + }, + { + "epoch": 0.026596991345561236, + "grad_norm": 0.1162109375, + "learning_rate": 0.001997906393516426, + "loss": 0.2266, + "step": 3064 + }, + { + "epoch": 0.0266056718257654, + "grad_norm": 0.11572265625, + "learning_rate": 0.0019979043641409166, + "loss": 0.2246, + "step": 3065 + }, + { + "epoch": 0.026614352305969566, + "grad_norm": 0.41015625, + "learning_rate": 0.001997902333783473, + "loss": 0.2324, + "step": 3066 + }, + { + "epoch": 0.02662303278617373, + "grad_norm": 0.2373046875, + "learning_rate": 0.0019979003024440966, + "loss": 0.2129, + "step": 3067 + }, + { + "epoch": 0.026631713266377897, + "grad_norm": 0.09423828125, + "learning_rate": 0.0019978982701227897, + "loss": 0.208, + "step": 3068 + }, + { + "epoch": 0.026640393746582062, + "grad_norm": 0.81640625, + "learning_rate": 0.0019978962368195547, + "loss": 0.8203, + "step": 3069 + }, + { + "epoch": 0.026649074226786227, + "grad_norm": 0.16796875, + "learning_rate": 0.0019978942025343943, + "loss": 0.2061, + "step": 3070 + }, + { + "epoch": 0.026657754706990392, + "grad_norm": 0.41796875, + "learning_rate": 0.0019978921672673096, + "loss": 0.167, + "step": 3071 + }, + { + "epoch": 0.026666435187194557, + "grad_norm": 0.408203125, + "learning_rate": 0.0019978901310183043, + "loss": 0.1836, + "step": 3072 + }, + { + "epoch": 0.02667511566739872, + "grad_norm": 0.1904296875, + "learning_rate": 0.0019978880937873795, + "loss": 0.2285, + "step": 3073 + }, + { + "epoch": 0.026683796147602884, + "grad_norm": 0.60546875, + "learning_rate": 0.001997886055574538, + "loss": 0.2734, + "step": 3074 + }, + { + "epoch": 0.02669247662780705, + "grad_norm": 0.27734375, + "learning_rate": 0.0019978840163797813, + "loss": 0.2891, + "step": 3075 + }, + { + "epoch": 0.026701157108011214, + "grad_norm": 0.1953125, + "learning_rate": 0.0019978819762031122, + "loss": 0.2373, + "step": 3076 + }, + { + "epoch": 0.02670983758821538, + "grad_norm": 0.09375, + "learning_rate": 0.0019978799350445337, + "loss": 0.2188, + "step": 3077 + }, + { + "epoch": 0.026718518068419544, + "grad_norm": 0.345703125, + "learning_rate": 0.0019978778929040466, + "loss": 0.2734, + "step": 3078 + }, + { + "epoch": 0.02672719854862371, + "grad_norm": 0.337890625, + "learning_rate": 0.0019978758497816535, + "loss": 0.2324, + "step": 3079 + }, + { + "epoch": 0.026735879028827875, + "grad_norm": 0.2294921875, + "learning_rate": 0.0019978738056773567, + "loss": 0.2422, + "step": 3080 + }, + { + "epoch": 0.02674455950903204, + "grad_norm": 0.1708984375, + "learning_rate": 0.0019978717605911595, + "loss": 0.1973, + "step": 3081 + }, + { + "epoch": 0.026753239989236205, + "grad_norm": 0.15625, + "learning_rate": 0.0019978697145230625, + "loss": 0.3438, + "step": 3082 + }, + { + "epoch": 0.02676192046944037, + "grad_norm": 0.1259765625, + "learning_rate": 0.001997867667473069, + "loss": 0.1914, + "step": 3083 + }, + { + "epoch": 0.026770600949644535, + 
"grad_norm": 0.185546875, + "learning_rate": 0.001997865619441181, + "loss": 0.2412, + "step": 3084 + }, + { + "epoch": 0.0267792814298487, + "grad_norm": 0.279296875, + "learning_rate": 0.0019978635704274, + "loss": 0.2197, + "step": 3085 + }, + { + "epoch": 0.026787961910052865, + "grad_norm": 0.2294921875, + "learning_rate": 0.0019978615204317295, + "loss": 0.2266, + "step": 3086 + }, + { + "epoch": 0.02679664239025703, + "grad_norm": 0.0712890625, + "learning_rate": 0.0019978594694541707, + "loss": 0.1797, + "step": 3087 + }, + { + "epoch": 0.026805322870461192, + "grad_norm": 0.11669921875, + "learning_rate": 0.0019978574174947267, + "loss": 0.2383, + "step": 3088 + }, + { + "epoch": 0.026814003350665357, + "grad_norm": 0.2255859375, + "learning_rate": 0.001997855364553399, + "loss": 0.2227, + "step": 3089 + }, + { + "epoch": 0.026822683830869522, + "grad_norm": 0.10986328125, + "learning_rate": 0.0019978533106301904, + "loss": 0.2285, + "step": 3090 + }, + { + "epoch": 0.026831364311073688, + "grad_norm": 0.1376953125, + "learning_rate": 0.0019978512557251027, + "loss": 0.2578, + "step": 3091 + }, + { + "epoch": 0.026840044791277853, + "grad_norm": 0.20703125, + "learning_rate": 0.001997849199838138, + "loss": 0.2695, + "step": 3092 + }, + { + "epoch": 0.026848725271482018, + "grad_norm": 0.1083984375, + "learning_rate": 0.0019978471429693, + "loss": 0.168, + "step": 3093 + }, + { + "epoch": 0.026857405751686183, + "grad_norm": 0.1962890625, + "learning_rate": 0.001997845085118589, + "loss": 0.1953, + "step": 3094 + }, + { + "epoch": 0.026866086231890348, + "grad_norm": 0.134765625, + "learning_rate": 0.0019978430262860085, + "loss": 0.3535, + "step": 3095 + }, + { + "epoch": 0.026874766712094513, + "grad_norm": 0.11962890625, + "learning_rate": 0.00199784096647156, + "loss": 0.2109, + "step": 3096 + }, + { + "epoch": 0.02688344719229868, + "grad_norm": 0.1015625, + "learning_rate": 0.001997838905675246, + "loss": 0.1875, + "step": 3097 + }, + { + "epoch": 0.026892127672502843, + "grad_norm": 0.142578125, + "learning_rate": 0.0019978368438970696, + "loss": 0.1914, + "step": 3098 + }, + { + "epoch": 0.02690080815270701, + "grad_norm": 0.119140625, + "learning_rate": 0.0019978347811370318, + "loss": 0.1973, + "step": 3099 + }, + { + "epoch": 0.026909488632911174, + "grad_norm": 0.30078125, + "learning_rate": 0.0019978327173951357, + "loss": 0.25, + "step": 3100 + }, + { + "epoch": 0.02691816911311534, + "grad_norm": 0.1708984375, + "learning_rate": 0.0019978306526713826, + "loss": 0.2129, + "step": 3101 + }, + { + "epoch": 0.026926849593319504, + "grad_norm": 0.1083984375, + "learning_rate": 0.001997828586965776, + "loss": 0.2061, + "step": 3102 + }, + { + "epoch": 0.02693553007352367, + "grad_norm": 0.09814453125, + "learning_rate": 0.0019978265202783172, + "loss": 0.1846, + "step": 3103 + }, + { + "epoch": 0.02694421055372783, + "grad_norm": 0.2314453125, + "learning_rate": 0.001997824452609009, + "loss": 0.2891, + "step": 3104 + }, + { + "epoch": 0.026952891033931996, + "grad_norm": 0.1318359375, + "learning_rate": 0.001997822383957853, + "loss": 0.1865, + "step": 3105 + }, + { + "epoch": 0.02696157151413616, + "grad_norm": 0.06396484375, + "learning_rate": 0.0019978203143248526, + "loss": 0.1494, + "step": 3106 + }, + { + "epoch": 0.026970251994340326, + "grad_norm": 0.1171875, + "learning_rate": 0.0019978182437100094, + "loss": 0.1797, + "step": 3107 + }, + { + "epoch": 0.02697893247454449, + "grad_norm": 0.369140625, + "learning_rate": 0.0019978161721133252, + "loss": 0.1855, + 
"step": 3108 + }, + { + "epoch": 0.026987612954748656, + "grad_norm": 0.1572265625, + "learning_rate": 0.001997814099534803, + "loss": 0.1543, + "step": 3109 + }, + { + "epoch": 0.02699629343495282, + "grad_norm": 0.10888671875, + "learning_rate": 0.0019978120259744447, + "loss": 0.2539, + "step": 3110 + }, + { + "epoch": 0.027004973915156986, + "grad_norm": 0.50390625, + "learning_rate": 0.0019978099514322526, + "loss": 0.1816, + "step": 3111 + }, + { + "epoch": 0.02701365439536115, + "grad_norm": 0.1640625, + "learning_rate": 0.001997807875908229, + "loss": 0.1924, + "step": 3112 + }, + { + "epoch": 0.027022334875565317, + "grad_norm": 0.33203125, + "learning_rate": 0.0019978057994023764, + "loss": 0.249, + "step": 3113 + }, + { + "epoch": 0.027031015355769482, + "grad_norm": 0.400390625, + "learning_rate": 0.0019978037219146967, + "loss": 0.2275, + "step": 3114 + }, + { + "epoch": 0.027039695835973647, + "grad_norm": 0.283203125, + "learning_rate": 0.0019978016434451925, + "loss": 0.1699, + "step": 3115 + }, + { + "epoch": 0.027048376316177812, + "grad_norm": 0.185546875, + "learning_rate": 0.0019977995639938657, + "loss": 0.1953, + "step": 3116 + }, + { + "epoch": 0.027057056796381977, + "grad_norm": 0.1572265625, + "learning_rate": 0.0019977974835607188, + "loss": 0.2236, + "step": 3117 + }, + { + "epoch": 0.027065737276586142, + "grad_norm": 0.0947265625, + "learning_rate": 0.001997795402145754, + "loss": 0.1846, + "step": 3118 + }, + { + "epoch": 0.027074417756790304, + "grad_norm": 0.1142578125, + "learning_rate": 0.001997793319748974, + "loss": 0.2363, + "step": 3119 + }, + { + "epoch": 0.02708309823699447, + "grad_norm": 0.1728515625, + "learning_rate": 0.0019977912363703804, + "loss": 0.1865, + "step": 3120 + }, + { + "epoch": 0.027091778717198634, + "grad_norm": 0.13671875, + "learning_rate": 0.0019977891520099756, + "loss": 0.2031, + "step": 3121 + }, + { + "epoch": 0.0271004591974028, + "grad_norm": 0.2470703125, + "learning_rate": 0.0019977870666677625, + "loss": 0.1836, + "step": 3122 + }, + { + "epoch": 0.027109139677606964, + "grad_norm": 0.119140625, + "learning_rate": 0.001997784980343743, + "loss": 0.1865, + "step": 3123 + }, + { + "epoch": 0.02711782015781113, + "grad_norm": 0.11474609375, + "learning_rate": 0.0019977828930379193, + "loss": 0.1719, + "step": 3124 + }, + { + "epoch": 0.027126500638015295, + "grad_norm": 0.0732421875, + "learning_rate": 0.0019977808047502935, + "loss": 0.1934, + "step": 3125 + }, + { + "epoch": 0.02713518111821946, + "grad_norm": 0.14453125, + "learning_rate": 0.001997778715480868, + "loss": 0.1611, + "step": 3126 + }, + { + "epoch": 0.027143861598423625, + "grad_norm": 0.11279296875, + "learning_rate": 0.001997776625229645, + "loss": 0.1836, + "step": 3127 + }, + { + "epoch": 0.02715254207862779, + "grad_norm": 0.1552734375, + "learning_rate": 0.0019977745339966276, + "loss": 0.2148, + "step": 3128 + }, + { + "epoch": 0.027161222558831955, + "grad_norm": 0.255859375, + "learning_rate": 0.0019977724417818174, + "loss": 0.2207, + "step": 3129 + }, + { + "epoch": 0.02716990303903612, + "grad_norm": 0.212890625, + "learning_rate": 0.0019977703485852165, + "loss": 0.1709, + "step": 3130 + }, + { + "epoch": 0.027178583519240285, + "grad_norm": 0.1474609375, + "learning_rate": 0.0019977682544068272, + "loss": 0.2012, + "step": 3131 + }, + { + "epoch": 0.02718726399944445, + "grad_norm": 0.099609375, + "learning_rate": 0.0019977661592466525, + "loss": 0.209, + "step": 3132 + }, + { + "epoch": 0.027195944479648616, + "grad_norm": 0.271484375, + 
"learning_rate": 0.001997764063104694, + "loss": 0.3164, + "step": 3133 + }, + { + "epoch": 0.02720462495985278, + "grad_norm": 0.09814453125, + "learning_rate": 0.0019977619659809543, + "loss": 0.2012, + "step": 3134 + }, + { + "epoch": 0.027213305440056942, + "grad_norm": 0.166015625, + "learning_rate": 0.001997759867875435, + "loss": 0.2305, + "step": 3135 + }, + { + "epoch": 0.027221985920261108, + "grad_norm": 0.3046875, + "learning_rate": 0.0019977577687881397, + "loss": 0.1934, + "step": 3136 + }, + { + "epoch": 0.027230666400465273, + "grad_norm": 0.07080078125, + "learning_rate": 0.00199775566871907, + "loss": 0.2041, + "step": 3137 + }, + { + "epoch": 0.027239346880669438, + "grad_norm": 0.19140625, + "learning_rate": 0.001997753567668228, + "loss": 0.2246, + "step": 3138 + }, + { + "epoch": 0.027248027360873603, + "grad_norm": 0.1591796875, + "learning_rate": 0.0019977514656356163, + "loss": 0.2178, + "step": 3139 + }, + { + "epoch": 0.027256707841077768, + "grad_norm": 0.123046875, + "learning_rate": 0.0019977493626212365, + "loss": 0.1943, + "step": 3140 + }, + { + "epoch": 0.027265388321281933, + "grad_norm": 0.88671875, + "learning_rate": 0.001997747258625092, + "loss": 0.2129, + "step": 3141 + }, + { + "epoch": 0.027274068801486098, + "grad_norm": 0.51953125, + "learning_rate": 0.001997745153647185, + "loss": 0.1885, + "step": 3142 + }, + { + "epoch": 0.027282749281690263, + "grad_norm": 0.099609375, + "learning_rate": 0.001997743047687517, + "loss": 0.2021, + "step": 3143 + }, + { + "epoch": 0.02729142976189443, + "grad_norm": 0.08056640625, + "learning_rate": 0.0019977409407460904, + "loss": 0.2266, + "step": 3144 + }, + { + "epoch": 0.027300110242098594, + "grad_norm": 0.1640625, + "learning_rate": 0.0019977388328229076, + "loss": 0.2432, + "step": 3145 + }, + { + "epoch": 0.02730879072230276, + "grad_norm": 0.0859375, + "learning_rate": 0.001997736723917972, + "loss": 0.1826, + "step": 3146 + }, + { + "epoch": 0.027317471202506924, + "grad_norm": 0.166015625, + "learning_rate": 0.0019977346140312843, + "loss": 0.2207, + "step": 3147 + }, + { + "epoch": 0.02732615168271109, + "grad_norm": 1.4921875, + "learning_rate": 0.0019977325031628477, + "loss": 0.375, + "step": 3148 + }, + { + "epoch": 0.027334832162915254, + "grad_norm": 0.19921875, + "learning_rate": 0.0019977303913126646, + "loss": 0.2539, + "step": 3149 + }, + { + "epoch": 0.027343512643119416, + "grad_norm": 0.58984375, + "learning_rate": 0.001997728278480737, + "loss": 0.1807, + "step": 3150 + }, + { + "epoch": 0.02735219312332358, + "grad_norm": 0.359375, + "learning_rate": 0.001997726164667067, + "loss": 0.2012, + "step": 3151 + }, + { + "epoch": 0.027360873603527746, + "grad_norm": 0.349609375, + "learning_rate": 0.001997724049871657, + "loss": 0.1797, + "step": 3152 + }, + { + "epoch": 0.02736955408373191, + "grad_norm": 0.337890625, + "learning_rate": 0.0019977219340945097, + "loss": 0.207, + "step": 3153 + }, + { + "epoch": 0.027378234563936076, + "grad_norm": 0.50390625, + "learning_rate": 0.001997719817335627, + "loss": 0.1836, + "step": 3154 + }, + { + "epoch": 0.02738691504414024, + "grad_norm": 0.486328125, + "learning_rate": 0.0019977176995950117, + "loss": 0.1953, + "step": 3155 + }, + { + "epoch": 0.027395595524344406, + "grad_norm": 0.09130859375, + "learning_rate": 0.0019977155808726657, + "loss": 0.1982, + "step": 3156 + }, + { + "epoch": 0.02740427600454857, + "grad_norm": 0.0947265625, + "learning_rate": 0.001997713461168591, + "loss": 0.2246, + "step": 3157 + }, + { + "epoch": 
0.027412956484752737, + "grad_norm": 0.255859375, + "learning_rate": 0.001997711340482791, + "loss": 0.207, + "step": 3158 + }, + { + "epoch": 0.027421636964956902, + "grad_norm": 0.08935546875, + "learning_rate": 0.001997709218815267, + "loss": 0.2109, + "step": 3159 + }, + { + "epoch": 0.027430317445161067, + "grad_norm": 0.2734375, + "learning_rate": 0.001997707096166022, + "loss": 0.1387, + "step": 3160 + }, + { + "epoch": 0.027438997925365232, + "grad_norm": 0.0986328125, + "learning_rate": 0.0019977049725350576, + "loss": 0.2559, + "step": 3161 + }, + { + "epoch": 0.027447678405569397, + "grad_norm": 0.64453125, + "learning_rate": 0.0019977028479223765, + "loss": 0.207, + "step": 3162 + }, + { + "epoch": 0.027456358885773562, + "grad_norm": 0.5078125, + "learning_rate": 0.001997700722327981, + "loss": 0.1836, + "step": 3163 + }, + { + "epoch": 0.027465039365977727, + "grad_norm": 0.11865234375, + "learning_rate": 0.001997698595751874, + "loss": 0.1729, + "step": 3164 + }, + { + "epoch": 0.02747371984618189, + "grad_norm": 0.0791015625, + "learning_rate": 0.001997696468194057, + "loss": 0.2021, + "step": 3165 + }, + { + "epoch": 0.027482400326386054, + "grad_norm": 0.109375, + "learning_rate": 0.001997694339654533, + "loss": 0.1885, + "step": 3166 + }, + { + "epoch": 0.02749108080659022, + "grad_norm": 0.1005859375, + "learning_rate": 0.0019976922101333037, + "loss": 0.1895, + "step": 3167 + }, + { + "epoch": 0.027499761286794384, + "grad_norm": 0.13671875, + "learning_rate": 0.0019976900796303716, + "loss": 0.2031, + "step": 3168 + }, + { + "epoch": 0.02750844176699855, + "grad_norm": 0.11376953125, + "learning_rate": 0.001997687948145739, + "loss": 0.2051, + "step": 3169 + }, + { + "epoch": 0.027517122247202715, + "grad_norm": 0.19140625, + "learning_rate": 0.0019976858156794085, + "loss": 0.1621, + "step": 3170 + }, + { + "epoch": 0.02752580272740688, + "grad_norm": 0.1708984375, + "learning_rate": 0.001997683682231382, + "loss": 0.207, + "step": 3171 + }, + { + "epoch": 0.027534483207611045, + "grad_norm": 0.26953125, + "learning_rate": 0.0019976815478016624, + "loss": 0.252, + "step": 3172 + }, + { + "epoch": 0.02754316368781521, + "grad_norm": 0.1328125, + "learning_rate": 0.0019976794123902518, + "loss": 0.2754, + "step": 3173 + }, + { + "epoch": 0.027551844168019375, + "grad_norm": 0.1396484375, + "learning_rate": 0.0019976772759971524, + "loss": 0.1816, + "step": 3174 + }, + { + "epoch": 0.02756052464822354, + "grad_norm": 0.125, + "learning_rate": 0.001997675138622367, + "loss": 0.249, + "step": 3175 + }, + { + "epoch": 0.027569205128427705, + "grad_norm": 0.125, + "learning_rate": 0.001997673000265897, + "loss": 0.209, + "step": 3176 + }, + { + "epoch": 0.02757788560863187, + "grad_norm": 0.291015625, + "learning_rate": 0.0019976708609277453, + "loss": 0.1602, + "step": 3177 + }, + { + "epoch": 0.027586566088836036, + "grad_norm": 0.390625, + "learning_rate": 0.0019976687206079147, + "loss": 0.1875, + "step": 3178 + }, + { + "epoch": 0.0275952465690402, + "grad_norm": 0.232421875, + "learning_rate": 0.0019976665793064066, + "loss": 0.1885, + "step": 3179 + }, + { + "epoch": 0.027603927049244366, + "grad_norm": 0.380859375, + "learning_rate": 0.001997664437023224, + "loss": 0.2207, + "step": 3180 + }, + { + "epoch": 0.027612607529448528, + "grad_norm": 0.13671875, + "learning_rate": 0.001997662293758369, + "loss": 0.2051, + "step": 3181 + }, + { + "epoch": 0.027621288009652693, + "grad_norm": 0.1728515625, + "learning_rate": 0.001997660149511844, + "loss": 0.2246, + "step": 
3182 + }, + { + "epoch": 0.027629968489856858, + "grad_norm": 0.10693359375, + "learning_rate": 0.0019976580042836514, + "loss": 0.2207, + "step": 3183 + }, + { + "epoch": 0.027638648970061023, + "grad_norm": 0.1328125, + "learning_rate": 0.0019976558580737935, + "loss": 0.2168, + "step": 3184 + }, + { + "epoch": 0.027647329450265188, + "grad_norm": 0.267578125, + "learning_rate": 0.001997653710882273, + "loss": 0.2334, + "step": 3185 + }, + { + "epoch": 0.027656009930469353, + "grad_norm": 0.46484375, + "learning_rate": 0.0019976515627090916, + "loss": 0.1797, + "step": 3186 + }, + { + "epoch": 0.027664690410673518, + "grad_norm": 0.2734375, + "learning_rate": 0.0019976494135542514, + "loss": 0.252, + "step": 3187 + }, + { + "epoch": 0.027673370890877683, + "grad_norm": 0.357421875, + "learning_rate": 0.001997647263417756, + "loss": 0.2656, + "step": 3188 + }, + { + "epoch": 0.02768205137108185, + "grad_norm": 0.10546875, + "learning_rate": 0.001997645112299607, + "loss": 0.1729, + "step": 3189 + }, + { + "epoch": 0.027690731851286014, + "grad_norm": 0.7421875, + "learning_rate": 0.0019976429601998064, + "loss": 0.2598, + "step": 3190 + }, + { + "epoch": 0.02769941233149018, + "grad_norm": 0.181640625, + "learning_rate": 0.001997640807118357, + "loss": 0.1943, + "step": 3191 + }, + { + "epoch": 0.027708092811694344, + "grad_norm": 0.1953125, + "learning_rate": 0.0019976386530552613, + "loss": 0.2305, + "step": 3192 + }, + { + "epoch": 0.02771677329189851, + "grad_norm": 0.1494140625, + "learning_rate": 0.0019976364980105214, + "loss": 0.1855, + "step": 3193 + }, + { + "epoch": 0.027725453772102674, + "grad_norm": 0.1064453125, + "learning_rate": 0.0019976343419841397, + "loss": 0.1699, + "step": 3194 + }, + { + "epoch": 0.02773413425230684, + "grad_norm": 0.1689453125, + "learning_rate": 0.0019976321849761187, + "loss": 0.1875, + "step": 3195 + }, + { + "epoch": 0.027742814732511, + "grad_norm": 0.15625, + "learning_rate": 0.0019976300269864606, + "loss": 0.2305, + "step": 3196 + }, + { + "epoch": 0.027751495212715166, + "grad_norm": 0.1201171875, + "learning_rate": 0.001997627868015167, + "loss": 0.2168, + "step": 3197 + }, + { + "epoch": 0.02776017569291933, + "grad_norm": 0.376953125, + "learning_rate": 0.0019976257080622424, + "loss": 0.1689, + "step": 3198 + }, + { + "epoch": 0.027768856173123496, + "grad_norm": 0.2138671875, + "learning_rate": 0.001997623547127687, + "loss": 0.1768, + "step": 3199 + }, + { + "epoch": 0.02777753665332766, + "grad_norm": 0.1318359375, + "learning_rate": 0.001997621385211504, + "loss": 0.207, + "step": 3200 + }, + { + "epoch": 0.027786217133531826, + "grad_norm": 0.134765625, + "learning_rate": 0.001997619222313696, + "loss": 0.2363, + "step": 3201 + }, + { + "epoch": 0.02779489761373599, + "grad_norm": 0.08251953125, + "learning_rate": 0.001997617058434265, + "loss": 0.1689, + "step": 3202 + }, + { + "epoch": 0.027803578093940157, + "grad_norm": 0.08984375, + "learning_rate": 0.0019976148935732137, + "loss": 0.1895, + "step": 3203 + }, + { + "epoch": 0.027812258574144322, + "grad_norm": 0.1533203125, + "learning_rate": 0.001997612727730544, + "loss": 0.1924, + "step": 3204 + }, + { + "epoch": 0.027820939054348487, + "grad_norm": 0.142578125, + "learning_rate": 0.0019976105609062584, + "loss": 0.1875, + "step": 3205 + }, + { + "epoch": 0.027829619534552652, + "grad_norm": 0.328125, + "learning_rate": 0.0019976083931003597, + "loss": 0.2236, + "step": 3206 + }, + { + "epoch": 0.027838300014756817, + "grad_norm": 0.10107421875, + "learning_rate": 
0.0019976062243128494, + "loss": 0.2637, + "step": 3207 + }, + { + "epoch": 0.027846980494960982, + "grad_norm": 0.0986328125, + "learning_rate": 0.0019976040545437307, + "loss": 0.1719, + "step": 3208 + }, + { + "epoch": 0.027855660975165147, + "grad_norm": 0.412109375, + "learning_rate": 0.0019976018837930057, + "loss": 0.1992, + "step": 3209 + }, + { + "epoch": 0.027864341455369313, + "grad_norm": 0.28515625, + "learning_rate": 0.001997599712060677, + "loss": 0.248, + "step": 3210 + }, + { + "epoch": 0.027873021935573478, + "grad_norm": 0.1748046875, + "learning_rate": 0.0019975975393467463, + "loss": 0.209, + "step": 3211 + }, + { + "epoch": 0.02788170241577764, + "grad_norm": 0.0751953125, + "learning_rate": 0.001997595365651217, + "loss": 0.1797, + "step": 3212 + }, + { + "epoch": 0.027890382895981804, + "grad_norm": 0.416015625, + "learning_rate": 0.0019975931909740905, + "loss": 0.2158, + "step": 3213 + }, + { + "epoch": 0.02789906337618597, + "grad_norm": 0.1806640625, + "learning_rate": 0.001997591015315369, + "loss": 0.2148, + "step": 3214 + }, + { + "epoch": 0.027907743856390135, + "grad_norm": 0.099609375, + "learning_rate": 0.0019975888386750563, + "loss": 0.207, + "step": 3215 + }, + { + "epoch": 0.0279164243365943, + "grad_norm": 0.26171875, + "learning_rate": 0.001997586661053154, + "loss": 0.2217, + "step": 3216 + }, + { + "epoch": 0.027925104816798465, + "grad_norm": 0.1494140625, + "learning_rate": 0.001997584482449664, + "loss": 0.2031, + "step": 3217 + }, + { + "epoch": 0.02793378529700263, + "grad_norm": 0.70703125, + "learning_rate": 0.0019975823028645892, + "loss": 0.2041, + "step": 3218 + }, + { + "epoch": 0.027942465777206795, + "grad_norm": 0.1357421875, + "learning_rate": 0.001997580122297932, + "loss": 0.167, + "step": 3219 + }, + { + "epoch": 0.02795114625741096, + "grad_norm": 0.1640625, + "learning_rate": 0.0019975779407496947, + "loss": 0.2129, + "step": 3220 + }, + { + "epoch": 0.027959826737615125, + "grad_norm": 0.10888671875, + "learning_rate": 0.0019975757582198794, + "loss": 0.1836, + "step": 3221 + }, + { + "epoch": 0.02796850721781929, + "grad_norm": 0.25, + "learning_rate": 0.0019975735747084886, + "loss": 0.1992, + "step": 3222 + }, + { + "epoch": 0.027977187698023456, + "grad_norm": 0.2001953125, + "learning_rate": 0.0019975713902155253, + "loss": 0.1768, + "step": 3223 + }, + { + "epoch": 0.02798586817822762, + "grad_norm": 0.177734375, + "learning_rate": 0.001997569204740991, + "loss": 0.2471, + "step": 3224 + }, + { + "epoch": 0.027994548658431786, + "grad_norm": 0.25390625, + "learning_rate": 0.001997567018284889, + "loss": 0.1914, + "step": 3225 + }, + { + "epoch": 0.02800322913863595, + "grad_norm": 0.2216796875, + "learning_rate": 0.0019975648308472207, + "loss": 0.1719, + "step": 3226 + }, + { + "epoch": 0.028011909618840113, + "grad_norm": 0.2412109375, + "learning_rate": 0.0019975626424279893, + "loss": 0.1963, + "step": 3227 + }, + { + "epoch": 0.028020590099044278, + "grad_norm": 0.3515625, + "learning_rate": 0.0019975604530271967, + "loss": 0.2031, + "step": 3228 + }, + { + "epoch": 0.028029270579248443, + "grad_norm": 0.2138671875, + "learning_rate": 0.0019975582626448455, + "loss": 0.1992, + "step": 3229 + }, + { + "epoch": 0.028037951059452608, + "grad_norm": 0.26953125, + "learning_rate": 0.001997556071280938, + "loss": 0.2031, + "step": 3230 + }, + { + "epoch": 0.028046631539656773, + "grad_norm": 0.0908203125, + "learning_rate": 0.001997553878935477, + "loss": 0.2227, + "step": 3231 + }, + { + "epoch": 0.028055312019860938, + 
"grad_norm": 0.1474609375, + "learning_rate": 0.0019975516856084643, + "loss": 0.1826, + "step": 3232 + }, + { + "epoch": 0.028063992500065103, + "grad_norm": 0.07958984375, + "learning_rate": 0.001997549491299902, + "loss": 0.1973, + "step": 3233 + }, + { + "epoch": 0.02807267298026927, + "grad_norm": 0.1044921875, + "learning_rate": 0.001997547296009794, + "loss": 0.1758, + "step": 3234 + }, + { + "epoch": 0.028081353460473434, + "grad_norm": 0.205078125, + "learning_rate": 0.0019975450997381417, + "loss": 0.2812, + "step": 3235 + }, + { + "epoch": 0.0280900339406776, + "grad_norm": 0.1328125, + "learning_rate": 0.001997542902484947, + "loss": 0.1875, + "step": 3236 + }, + { + "epoch": 0.028098714420881764, + "grad_norm": 0.1298828125, + "learning_rate": 0.001997540704250213, + "loss": 0.165, + "step": 3237 + }, + { + "epoch": 0.02810739490108593, + "grad_norm": 0.1201171875, + "learning_rate": 0.0019975385050339423, + "loss": 0.2041, + "step": 3238 + }, + { + "epoch": 0.028116075381290094, + "grad_norm": 0.396484375, + "learning_rate": 0.0019975363048361366, + "loss": 0.2168, + "step": 3239 + }, + { + "epoch": 0.02812475586149426, + "grad_norm": 0.140625, + "learning_rate": 0.0019975341036567993, + "loss": 0.1777, + "step": 3240 + }, + { + "epoch": 0.028133436341698424, + "grad_norm": 0.1943359375, + "learning_rate": 0.0019975319014959316, + "loss": 0.25, + "step": 3241 + }, + { + "epoch": 0.028142116821902586, + "grad_norm": 0.6953125, + "learning_rate": 0.0019975296983535365, + "loss": 0.2344, + "step": 3242 + }, + { + "epoch": 0.02815079730210675, + "grad_norm": 0.21875, + "learning_rate": 0.0019975274942296167, + "loss": 0.1836, + "step": 3243 + }, + { + "epoch": 0.028159477782310916, + "grad_norm": 0.08544921875, + "learning_rate": 0.0019975252891241742, + "loss": 0.1631, + "step": 3244 + }, + { + "epoch": 0.02816815826251508, + "grad_norm": 0.10009765625, + "learning_rate": 0.0019975230830372114, + "loss": 0.1992, + "step": 3245 + }, + { + "epoch": 0.028176838742719246, + "grad_norm": 0.259765625, + "learning_rate": 0.001997520875968731, + "loss": 0.1719, + "step": 3246 + }, + { + "epoch": 0.02818551922292341, + "grad_norm": 0.169921875, + "learning_rate": 0.0019975186679187353, + "loss": 0.1641, + "step": 3247 + }, + { + "epoch": 0.028194199703127577, + "grad_norm": 0.267578125, + "learning_rate": 0.0019975164588872264, + "loss": 0.2168, + "step": 3248 + }, + { + "epoch": 0.028202880183331742, + "grad_norm": 0.09765625, + "learning_rate": 0.0019975142488742074, + "loss": 0.209, + "step": 3249 + }, + { + "epoch": 0.028211560663535907, + "grad_norm": 0.16796875, + "learning_rate": 0.0019975120378796798, + "loss": 0.2168, + "step": 3250 + }, + { + "epoch": 0.028220241143740072, + "grad_norm": 0.5234375, + "learning_rate": 0.001997509825903647, + "loss": 0.1943, + "step": 3251 + }, + { + "epoch": 0.028228921623944237, + "grad_norm": 0.138671875, + "learning_rate": 0.001997507612946111, + "loss": 0.1807, + "step": 3252 + }, + { + "epoch": 0.028237602104148402, + "grad_norm": 0.2060546875, + "learning_rate": 0.0019975053990070736, + "loss": 0.166, + "step": 3253 + }, + { + "epoch": 0.028246282584352567, + "grad_norm": 0.10888671875, + "learning_rate": 0.001997503184086538, + "loss": 0.1924, + "step": 3254 + }, + { + "epoch": 0.028254963064556732, + "grad_norm": 0.6640625, + "learning_rate": 0.0019975009681845067, + "loss": 0.2021, + "step": 3255 + }, + { + "epoch": 0.028263643544760898, + "grad_norm": 0.5546875, + "learning_rate": 0.0019974987513009814, + "loss": 0.1738, + "step": 3256 
+ }, + { + "epoch": 0.028272324024965063, + "grad_norm": 0.287109375, + "learning_rate": 0.001997496533435965, + "loss": 0.207, + "step": 3257 + }, + { + "epoch": 0.028281004505169224, + "grad_norm": 0.11376953125, + "learning_rate": 0.00199749431458946, + "loss": 0.1924, + "step": 3258 + }, + { + "epoch": 0.02828968498537339, + "grad_norm": 0.4609375, + "learning_rate": 0.001997492094761469, + "loss": 0.1689, + "step": 3259 + }, + { + "epoch": 0.028298365465577555, + "grad_norm": 0.11328125, + "learning_rate": 0.001997489873951994, + "loss": 0.1875, + "step": 3260 + }, + { + "epoch": 0.02830704594578172, + "grad_norm": 0.08935546875, + "learning_rate": 0.001997487652161037, + "loss": 0.1953, + "step": 3261 + }, + { + "epoch": 0.028315726425985885, + "grad_norm": 0.279296875, + "learning_rate": 0.0019974854293886017, + "loss": 0.2129, + "step": 3262 + }, + { + "epoch": 0.02832440690619005, + "grad_norm": 0.1357421875, + "learning_rate": 0.00199748320563469, + "loss": 0.2539, + "step": 3263 + }, + { + "epoch": 0.028333087386394215, + "grad_norm": 0.134765625, + "learning_rate": 0.0019974809808993035, + "loss": 0.1582, + "step": 3264 + }, + { + "epoch": 0.02834176786659838, + "grad_norm": 0.255859375, + "learning_rate": 0.0019974787551824452, + "loss": 0.1719, + "step": 3265 + }, + { + "epoch": 0.028350448346802545, + "grad_norm": 1.1171875, + "learning_rate": 0.0019974765284841178, + "loss": 0.7461, + "step": 3266 + }, + { + "epoch": 0.02835912882700671, + "grad_norm": 0.439453125, + "learning_rate": 0.0019974743008043237, + "loss": 0.2002, + "step": 3267 + }, + { + "epoch": 0.028367809307210876, + "grad_norm": 0.78125, + "learning_rate": 0.001997472072143065, + "loss": 0.2227, + "step": 3268 + }, + { + "epoch": 0.02837648978741504, + "grad_norm": 0.341796875, + "learning_rate": 0.0019974698425003446, + "loss": 0.2246, + "step": 3269 + }, + { + "epoch": 0.028385170267619206, + "grad_norm": 0.259765625, + "learning_rate": 0.0019974676118761645, + "loss": 0.1758, + "step": 3270 + }, + { + "epoch": 0.02839385074782337, + "grad_norm": 0.259765625, + "learning_rate": 0.0019974653802705272, + "loss": 0.1641, + "step": 3271 + }, + { + "epoch": 0.028402531228027536, + "grad_norm": 0.2294921875, + "learning_rate": 0.0019974631476834355, + "loss": 0.166, + "step": 3272 + }, + { + "epoch": 0.028411211708231698, + "grad_norm": 0.2255859375, + "learning_rate": 0.0019974609141148914, + "loss": 0.1543, + "step": 3273 + }, + { + "epoch": 0.028419892188435863, + "grad_norm": 0.0771484375, + "learning_rate": 0.0019974586795648975, + "loss": 0.1553, + "step": 3274 + }, + { + "epoch": 0.028428572668640028, + "grad_norm": 0.267578125, + "learning_rate": 0.0019974564440334566, + "loss": 0.1904, + "step": 3275 + }, + { + "epoch": 0.028437253148844193, + "grad_norm": 0.1552734375, + "learning_rate": 0.0019974542075205702, + "loss": 0.1621, + "step": 3276 + }, + { + "epoch": 0.028445933629048358, + "grad_norm": 0.0859375, + "learning_rate": 0.001997451970026242, + "loss": 0.1553, + "step": 3277 + }, + { + "epoch": 0.028454614109252523, + "grad_norm": 0.15625, + "learning_rate": 0.0019974497315504735, + "loss": 0.1973, + "step": 3278 + }, + { + "epoch": 0.02846329458945669, + "grad_norm": 0.07763671875, + "learning_rate": 0.0019974474920932675, + "loss": 0.1729, + "step": 3279 + }, + { + "epoch": 0.028471975069660854, + "grad_norm": 0.0947265625, + "learning_rate": 0.0019974452516546264, + "loss": 0.2031, + "step": 3280 + }, + { + "epoch": 0.02848065554986502, + "grad_norm": 0.126953125, + "learning_rate": 
0.0019974430102345526, + "loss": 0.2656, + "step": 3281 + }, + { + "epoch": 0.028489336030069184, + "grad_norm": 0.2314453125, + "learning_rate": 0.0019974407678330485, + "loss": 0.1855, + "step": 3282 + }, + { + "epoch": 0.02849801651027335, + "grad_norm": 0.140625, + "learning_rate": 0.001997438524450117, + "loss": 0.2324, + "step": 3283 + }, + { + "epoch": 0.028506696990477514, + "grad_norm": 0.09619140625, + "learning_rate": 0.00199743628008576, + "loss": 0.2031, + "step": 3284 + }, + { + "epoch": 0.02851537747068168, + "grad_norm": 0.1337890625, + "learning_rate": 0.00199743403473998, + "loss": 0.1875, + "step": 3285 + }, + { + "epoch": 0.028524057950885844, + "grad_norm": 0.30078125, + "learning_rate": 0.00199743178841278, + "loss": 0.2246, + "step": 3286 + }, + { + "epoch": 0.02853273843109001, + "grad_norm": 0.09423828125, + "learning_rate": 0.0019974295411041617, + "loss": 0.1992, + "step": 3287 + }, + { + "epoch": 0.028541418911294174, + "grad_norm": 0.193359375, + "learning_rate": 0.001997427292814129, + "loss": 0.1855, + "step": 3288 + }, + { + "epoch": 0.028550099391498336, + "grad_norm": 0.10302734375, + "learning_rate": 0.0019974250435426818, + "loss": 0.1855, + "step": 3289 + }, + { + "epoch": 0.0285587798717025, + "grad_norm": 0.2451171875, + "learning_rate": 0.001997422793289825, + "loss": 0.1562, + "step": 3290 + }, + { + "epoch": 0.028567460351906666, + "grad_norm": 0.470703125, + "learning_rate": 0.0019974205420555595, + "loss": 0.1777, + "step": 3291 + }, + { + "epoch": 0.02857614083211083, + "grad_norm": 0.298828125, + "learning_rate": 0.0019974182898398886, + "loss": 0.1777, + "step": 3292 + }, + { + "epoch": 0.028584821312314997, + "grad_norm": 0.18359375, + "learning_rate": 0.0019974160366428148, + "loss": 0.2129, + "step": 3293 + }, + { + "epoch": 0.028593501792519162, + "grad_norm": 0.1552734375, + "learning_rate": 0.0019974137824643402, + "loss": 0.2051, + "step": 3294 + }, + { + "epoch": 0.028602182272723327, + "grad_norm": 0.158203125, + "learning_rate": 0.001997411527304467, + "loss": 0.1406, + "step": 3295 + }, + { + "epoch": 0.028610862752927492, + "grad_norm": 0.2099609375, + "learning_rate": 0.0019974092711631986, + "loss": 0.209, + "step": 3296 + }, + { + "epoch": 0.028619543233131657, + "grad_norm": 0.22265625, + "learning_rate": 0.0019974070140405366, + "loss": 0.207, + "step": 3297 + }, + { + "epoch": 0.028628223713335822, + "grad_norm": 0.08447265625, + "learning_rate": 0.001997404755936484, + "loss": 0.1797, + "step": 3298 + }, + { + "epoch": 0.028636904193539987, + "grad_norm": 0.13671875, + "learning_rate": 0.001997402496851043, + "loss": 0.1992, + "step": 3299 + }, + { + "epoch": 0.028645584673744152, + "grad_norm": 0.1904296875, + "learning_rate": 0.001997400236784216, + "loss": 0.1895, + "step": 3300 + }, + { + "epoch": 0.028654265153948318, + "grad_norm": 0.12109375, + "learning_rate": 0.0019973979757360056, + "loss": 0.1953, + "step": 3301 + }, + { + "epoch": 0.028662945634152483, + "grad_norm": 0.56640625, + "learning_rate": 0.0019973957137064137, + "loss": 0.1992, + "step": 3302 + }, + { + "epoch": 0.028671626114356648, + "grad_norm": 0.10791015625, + "learning_rate": 0.001997393450695444, + "loss": 0.2539, + "step": 3303 + }, + { + "epoch": 0.02868030659456081, + "grad_norm": 0.1015625, + "learning_rate": 0.001997391186703098, + "loss": 0.2256, + "step": 3304 + }, + { + "epoch": 0.028688987074764975, + "grad_norm": 0.11083984375, + "learning_rate": 0.001997388921729379, + "loss": 0.1426, + "step": 3305 + }, + { + "epoch": 
0.02869766755496914, + "grad_norm": 0.11474609375, + "learning_rate": 0.0019973866557742885, + "loss": 0.165, + "step": 3306 + }, + { + "epoch": 0.028706348035173305, + "grad_norm": 0.1025390625, + "learning_rate": 0.0019973843888378296, + "loss": 0.168, + "step": 3307 + }, + { + "epoch": 0.02871502851537747, + "grad_norm": 0.0703125, + "learning_rate": 0.0019973821209200043, + "loss": 0.1992, + "step": 3308 + }, + { + "epoch": 0.028723708995581635, + "grad_norm": 0.359375, + "learning_rate": 0.001997379852020816, + "loss": 0.1328, + "step": 3309 + }, + { + "epoch": 0.0287323894757858, + "grad_norm": 0.08642578125, + "learning_rate": 0.001997377582140266, + "loss": 0.21, + "step": 3310 + }, + { + "epoch": 0.028741069955989965, + "grad_norm": 0.10791015625, + "learning_rate": 0.0019973753112783577, + "loss": 0.1689, + "step": 3311 + }, + { + "epoch": 0.02874975043619413, + "grad_norm": 0.31640625, + "learning_rate": 0.0019973730394350934, + "loss": 0.2129, + "step": 3312 + }, + { + "epoch": 0.028758430916398296, + "grad_norm": 0.66796875, + "learning_rate": 0.001997370766610475, + "loss": 0.4082, + "step": 3313 + }, + { + "epoch": 0.02876711139660246, + "grad_norm": 0.08642578125, + "learning_rate": 0.0019973684928045053, + "loss": 0.1865, + "step": 3314 + }, + { + "epoch": 0.028775791876806626, + "grad_norm": 0.296875, + "learning_rate": 0.0019973662180171876, + "loss": 0.166, + "step": 3315 + }, + { + "epoch": 0.02878447235701079, + "grad_norm": 0.232421875, + "learning_rate": 0.001997363942248523, + "loss": 0.2363, + "step": 3316 + }, + { + "epoch": 0.028793152837214956, + "grad_norm": 0.16796875, + "learning_rate": 0.001997361665498515, + "loss": 0.1729, + "step": 3317 + }, + { + "epoch": 0.02880183331741912, + "grad_norm": 0.26171875, + "learning_rate": 0.0019973593877671654, + "loss": 0.2148, + "step": 3318 + }, + { + "epoch": 0.028810513797623283, + "grad_norm": 0.490234375, + "learning_rate": 0.0019973571090544776, + "loss": 0.2129, + "step": 3319 + }, + { + "epoch": 0.028819194277827448, + "grad_norm": 0.1298828125, + "learning_rate": 0.0019973548293604534, + "loss": 0.1973, + "step": 3320 + }, + { + "epoch": 0.028827874758031613, + "grad_norm": 0.52734375, + "learning_rate": 0.001997352548685095, + "loss": 0.2168, + "step": 3321 + }, + { + "epoch": 0.028836555238235778, + "grad_norm": 0.140625, + "learning_rate": 0.0019973502670284056, + "loss": 0.2021, + "step": 3322 + }, + { + "epoch": 0.028845235718439943, + "grad_norm": 0.265625, + "learning_rate": 0.001997347984390388, + "loss": 0.2461, + "step": 3323 + }, + { + "epoch": 0.02885391619864411, + "grad_norm": 0.57421875, + "learning_rate": 0.0019973457007710434, + "loss": 0.2314, + "step": 3324 + }, + { + "epoch": 0.028862596678848274, + "grad_norm": 0.228515625, + "learning_rate": 0.001997343416170375, + "loss": 0.2305, + "step": 3325 + }, + { + "epoch": 0.02887127715905244, + "grad_norm": 0.07421875, + "learning_rate": 0.001997341130588386, + "loss": 0.2012, + "step": 3326 + }, + { + "epoch": 0.028879957639256604, + "grad_norm": 0.1279296875, + "learning_rate": 0.0019973388440250777, + "loss": 0.25, + "step": 3327 + }, + { + "epoch": 0.02888863811946077, + "grad_norm": 0.080078125, + "learning_rate": 0.001997336556480453, + "loss": 0.1807, + "step": 3328 + }, + { + "epoch": 0.028897318599664934, + "grad_norm": 0.087890625, + "learning_rate": 0.001997334267954515, + "loss": 0.2002, + "step": 3329 + }, + { + "epoch": 0.0289059990798691, + "grad_norm": 0.392578125, + "learning_rate": 0.0019973319784472652, + "loss": 0.2441, + 
"step": 3330 + }, + { + "epoch": 0.028914679560073264, + "grad_norm": 0.224609375, + "learning_rate": 0.001997329687958707, + "loss": 0.2617, + "step": 3331 + }, + { + "epoch": 0.02892336004027743, + "grad_norm": 0.62109375, + "learning_rate": 0.0019973273964888428, + "loss": 0.2754, + "step": 3332 + }, + { + "epoch": 0.028932040520481594, + "grad_norm": 0.2109375, + "learning_rate": 0.001997325104037674, + "loss": 0.208, + "step": 3333 + }, + { + "epoch": 0.02894072100068576, + "grad_norm": 0.107421875, + "learning_rate": 0.0019973228106052046, + "loss": 0.1709, + "step": 3334 + }, + { + "epoch": 0.02894940148088992, + "grad_norm": 0.1640625, + "learning_rate": 0.0019973205161914363, + "loss": 0.1992, + "step": 3335 + }, + { + "epoch": 0.028958081961094086, + "grad_norm": 0.0810546875, + "learning_rate": 0.0019973182207963717, + "loss": 0.1836, + "step": 3336 + }, + { + "epoch": 0.02896676244129825, + "grad_norm": 0.0947265625, + "learning_rate": 0.0019973159244200136, + "loss": 0.1836, + "step": 3337 + }, + { + "epoch": 0.028975442921502417, + "grad_norm": 0.46484375, + "learning_rate": 0.001997313627062364, + "loss": 0.1807, + "step": 3338 + }, + { + "epoch": 0.02898412340170658, + "grad_norm": 0.07666015625, + "learning_rate": 0.001997311328723426, + "loss": 0.1924, + "step": 3339 + }, + { + "epoch": 0.028992803881910747, + "grad_norm": 0.1064453125, + "learning_rate": 0.0019973090294032013, + "loss": 0.2119, + "step": 3340 + }, + { + "epoch": 0.029001484362114912, + "grad_norm": 0.236328125, + "learning_rate": 0.0019973067291016934, + "loss": 0.1768, + "step": 3341 + }, + { + "epoch": 0.029010164842319077, + "grad_norm": 0.58984375, + "learning_rate": 0.0019973044278189045, + "loss": 0.2188, + "step": 3342 + }, + { + "epoch": 0.029018845322523242, + "grad_norm": 0.12060546875, + "learning_rate": 0.0019973021255548363, + "loss": 0.2344, + "step": 3343 + }, + { + "epoch": 0.029027525802727407, + "grad_norm": 0.244140625, + "learning_rate": 0.0019972998223094923, + "loss": 0.2207, + "step": 3344 + }, + { + "epoch": 0.029036206282931572, + "grad_norm": 0.12353515625, + "learning_rate": 0.001997297518082875, + "loss": 0.1855, + "step": 3345 + }, + { + "epoch": 0.029044886763135738, + "grad_norm": 0.166015625, + "learning_rate": 0.0019972952128749864, + "loss": 0.1406, + "step": 3346 + }, + { + "epoch": 0.029053567243339903, + "grad_norm": 0.41796875, + "learning_rate": 0.001997292906685829, + "loss": 0.3105, + "step": 3347 + }, + { + "epoch": 0.029062247723544068, + "grad_norm": 0.11669921875, + "learning_rate": 0.0019972905995154057, + "loss": 0.207, + "step": 3348 + }, + { + "epoch": 0.029070928203748233, + "grad_norm": 0.2392578125, + "learning_rate": 0.001997288291363719, + "loss": 0.1797, + "step": 3349 + }, + { + "epoch": 0.029079608683952395, + "grad_norm": 0.30859375, + "learning_rate": 0.0019972859822307712, + "loss": 0.2246, + "step": 3350 + }, + { + "epoch": 0.02908828916415656, + "grad_norm": 0.263671875, + "learning_rate": 0.001997283672116565, + "loss": 0.2344, + "step": 3351 + }, + { + "epoch": 0.029096969644360725, + "grad_norm": 0.50390625, + "learning_rate": 0.001997281361021103, + "loss": 0.2266, + "step": 3352 + }, + { + "epoch": 0.02910565012456489, + "grad_norm": 0.2314453125, + "learning_rate": 0.001997279048944387, + "loss": 0.2021, + "step": 3353 + }, + { + "epoch": 0.029114330604769055, + "grad_norm": 0.197265625, + "learning_rate": 0.001997276735886421, + "loss": 0.1895, + "step": 3354 + }, + { + "epoch": 0.02912301108497322, + "grad_norm": 0.1748046875, + 
"learning_rate": 0.001997274421847206, + "loss": 0.1729, + "step": 3355 + }, + { + "epoch": 0.029131691565177385, + "grad_norm": 0.1142578125, + "learning_rate": 0.0019972721068267454, + "loss": 0.207, + "step": 3356 + }, + { + "epoch": 0.02914037204538155, + "grad_norm": 0.0927734375, + "learning_rate": 0.0019972697908250416, + "loss": 0.1797, + "step": 3357 + }, + { + "epoch": 0.029149052525585716, + "grad_norm": 0.103515625, + "learning_rate": 0.0019972674738420967, + "loss": 0.1758, + "step": 3358 + }, + { + "epoch": 0.02915773300578988, + "grad_norm": 0.53125, + "learning_rate": 0.0019972651558779137, + "loss": 0.2266, + "step": 3359 + }, + { + "epoch": 0.029166413485994046, + "grad_norm": 0.31640625, + "learning_rate": 0.001997262836932495, + "loss": 0.2363, + "step": 3360 + }, + { + "epoch": 0.02917509396619821, + "grad_norm": 0.076171875, + "learning_rate": 0.0019972605170058434, + "loss": 0.1934, + "step": 3361 + }, + { + "epoch": 0.029183774446402376, + "grad_norm": 0.216796875, + "learning_rate": 0.001997258196097961, + "loss": 0.2734, + "step": 3362 + }, + { + "epoch": 0.02919245492660654, + "grad_norm": 0.14453125, + "learning_rate": 0.00199725587420885, + "loss": 0.2051, + "step": 3363 + }, + { + "epoch": 0.029201135406810706, + "grad_norm": 0.3359375, + "learning_rate": 0.001997253551338514, + "loss": 0.1816, + "step": 3364 + }, + { + "epoch": 0.02920981588701487, + "grad_norm": 0.12451171875, + "learning_rate": 0.0019972512274869553, + "loss": 0.2051, + "step": 3365 + }, + { + "epoch": 0.029218496367219033, + "grad_norm": 0.10009765625, + "learning_rate": 0.001997248902654176, + "loss": 0.2021, + "step": 3366 + }, + { + "epoch": 0.029227176847423198, + "grad_norm": 0.494140625, + "learning_rate": 0.0019972465768401783, + "loss": 0.1865, + "step": 3367 + }, + { + "epoch": 0.029235857327627363, + "grad_norm": 2.046875, + "learning_rate": 0.0019972442500449657, + "loss": 0.4219, + "step": 3368 + }, + { + "epoch": 0.02924453780783153, + "grad_norm": 0.314453125, + "learning_rate": 0.00199724192226854, + "loss": 0.2539, + "step": 3369 + }, + { + "epoch": 0.029253218288035693, + "grad_norm": 0.0859375, + "learning_rate": 0.001997239593510904, + "loss": 0.1973, + "step": 3370 + }, + { + "epoch": 0.02926189876823986, + "grad_norm": 0.08740234375, + "learning_rate": 0.0019972372637720604, + "loss": 0.1738, + "step": 3371 + }, + { + "epoch": 0.029270579248444024, + "grad_norm": 0.13671875, + "learning_rate": 0.0019972349330520116, + "loss": 0.2578, + "step": 3372 + }, + { + "epoch": 0.02927925972864819, + "grad_norm": 0.1318359375, + "learning_rate": 0.00199723260135076, + "loss": 0.2188, + "step": 3373 + }, + { + "epoch": 0.029287940208852354, + "grad_norm": 0.134765625, + "learning_rate": 0.0019972302686683085, + "loss": 0.2295, + "step": 3374 + }, + { + "epoch": 0.02929662068905652, + "grad_norm": 0.0771484375, + "learning_rate": 0.0019972279350046595, + "loss": 0.167, + "step": 3375 + }, + { + "epoch": 0.029305301169260684, + "grad_norm": 0.08935546875, + "learning_rate": 0.0019972256003598153, + "loss": 0.1836, + "step": 3376 + }, + { + "epoch": 0.02931398164946485, + "grad_norm": 0.44140625, + "learning_rate": 0.0019972232647337785, + "loss": 0.2246, + "step": 3377 + }, + { + "epoch": 0.029322662129669014, + "grad_norm": 0.2294921875, + "learning_rate": 0.0019972209281265522, + "loss": 0.1699, + "step": 3378 + }, + { + "epoch": 0.02933134260987318, + "grad_norm": 0.353515625, + "learning_rate": 0.0019972185905381386, + "loss": 0.1992, + "step": 3379 + }, + { + "epoch": 
0.029340023090077345, + "grad_norm": 0.310546875, + "learning_rate": 0.00199721625196854, + "loss": 0.2041, + "step": 3380 + }, + { + "epoch": 0.029348703570281506, + "grad_norm": 0.0654296875, + "learning_rate": 0.0019972139124177593, + "loss": 0.168, + "step": 3381 + }, + { + "epoch": 0.02935738405048567, + "grad_norm": 0.455078125, + "learning_rate": 0.0019972115718857987, + "loss": 0.2197, + "step": 3382 + }, + { + "epoch": 0.029366064530689837, + "grad_norm": 0.1220703125, + "learning_rate": 0.0019972092303726613, + "loss": 0.208, + "step": 3383 + }, + { + "epoch": 0.029374745010894, + "grad_norm": 0.09130859375, + "learning_rate": 0.0019972068878783495, + "loss": 0.1689, + "step": 3384 + }, + { + "epoch": 0.029383425491098167, + "grad_norm": 0.09716796875, + "learning_rate": 0.001997204544402865, + "loss": 0.165, + "step": 3385 + }, + { + "epoch": 0.029392105971302332, + "grad_norm": 0.11767578125, + "learning_rate": 0.001997202199946212, + "loss": 0.2012, + "step": 3386 + }, + { + "epoch": 0.029400786451506497, + "grad_norm": 0.1337890625, + "learning_rate": 0.001997199854508392, + "loss": 0.1953, + "step": 3387 + }, + { + "epoch": 0.029409466931710662, + "grad_norm": 0.083984375, + "learning_rate": 0.001997197508089407, + "loss": 0.1641, + "step": 3388 + }, + { + "epoch": 0.029418147411914827, + "grad_norm": 0.1103515625, + "learning_rate": 0.0019971951606892607, + "loss": 0.2031, + "step": 3389 + }, + { + "epoch": 0.029426827892118992, + "grad_norm": 0.11865234375, + "learning_rate": 0.0019971928123079558, + "loss": 0.1689, + "step": 3390 + }, + { + "epoch": 0.029435508372323158, + "grad_norm": 0.1435546875, + "learning_rate": 0.0019971904629454934, + "loss": 0.1855, + "step": 3391 + }, + { + "epoch": 0.029444188852527323, + "grad_norm": 0.1123046875, + "learning_rate": 0.0019971881126018775, + "loss": 0.1504, + "step": 3392 + }, + { + "epoch": 0.029452869332731488, + "grad_norm": 0.48828125, + "learning_rate": 0.00199718576127711, + "loss": 0.1611, + "step": 3393 + }, + { + "epoch": 0.029461549812935653, + "grad_norm": 0.353515625, + "learning_rate": 0.001997183408971194, + "loss": 0.2012, + "step": 3394 + }, + { + "epoch": 0.029470230293139818, + "grad_norm": 0.12451171875, + "learning_rate": 0.001997181055684131, + "loss": 0.1934, + "step": 3395 + }, + { + "epoch": 0.02947891077334398, + "grad_norm": 0.2578125, + "learning_rate": 0.001997178701415925, + "loss": 0.2188, + "step": 3396 + }, + { + "epoch": 0.029487591253548145, + "grad_norm": 0.19140625, + "learning_rate": 0.001997176346166578, + "loss": 0.1992, + "step": 3397 + }, + { + "epoch": 0.02949627173375231, + "grad_norm": 0.333984375, + "learning_rate": 0.0019971739899360915, + "loss": 0.1592, + "step": 3398 + }, + { + "epoch": 0.029504952213956475, + "grad_norm": 0.279296875, + "learning_rate": 0.0019971716327244694, + "loss": 0.168, + "step": 3399 + }, + { + "epoch": 0.02951363269416064, + "grad_norm": 0.30859375, + "learning_rate": 0.0019971692745317142, + "loss": 0.2266, + "step": 3400 + }, + { + "epoch": 0.029522313174364805, + "grad_norm": 0.09814453125, + "learning_rate": 0.0019971669153578276, + "loss": 0.1807, + "step": 3401 + }, + { + "epoch": 0.02953099365456897, + "grad_norm": 0.134765625, + "learning_rate": 0.0019971645552028135, + "loss": 0.1768, + "step": 3402 + }, + { + "epoch": 0.029539674134773135, + "grad_norm": 0.17578125, + "learning_rate": 0.001997162194066673, + "loss": 0.2227, + "step": 3403 + }, + { + "epoch": 0.0295483546149773, + "grad_norm": 0.22265625, + "learning_rate": 0.00199715983194941, 
+ "loss": 0.1982, + "step": 3404 + }, + { + "epoch": 0.029557035095181466, + "grad_norm": 0.16796875, + "learning_rate": 0.001997157468851026, + "loss": 0.1699, + "step": 3405 + }, + { + "epoch": 0.02956571557538563, + "grad_norm": 0.0869140625, + "learning_rate": 0.0019971551047715244, + "loss": 0.1895, + "step": 3406 + }, + { + "epoch": 0.029574396055589796, + "grad_norm": 0.1767578125, + "learning_rate": 0.001997152739710907, + "loss": 0.2324, + "step": 3407 + }, + { + "epoch": 0.02958307653579396, + "grad_norm": 0.10302734375, + "learning_rate": 0.0019971503736691773, + "loss": 0.2207, + "step": 3408 + }, + { + "epoch": 0.029591757015998126, + "grad_norm": 0.259765625, + "learning_rate": 0.0019971480066463374, + "loss": 0.1768, + "step": 3409 + }, + { + "epoch": 0.02960043749620229, + "grad_norm": 0.310546875, + "learning_rate": 0.0019971456386423895, + "loss": 0.2051, + "step": 3410 + }, + { + "epoch": 0.029609117976406456, + "grad_norm": 0.2021484375, + "learning_rate": 0.001997143269657337, + "loss": 0.1719, + "step": 3411 + }, + { + "epoch": 0.029617798456610618, + "grad_norm": 0.25, + "learning_rate": 0.001997140899691182, + "loss": 0.1973, + "step": 3412 + }, + { + "epoch": 0.029626478936814783, + "grad_norm": 0.1376953125, + "learning_rate": 0.0019971385287439274, + "loss": 0.207, + "step": 3413 + }, + { + "epoch": 0.02963515941701895, + "grad_norm": 0.1103515625, + "learning_rate": 0.0019971361568155753, + "loss": 0.1973, + "step": 3414 + }, + { + "epoch": 0.029643839897223113, + "grad_norm": 0.4765625, + "learning_rate": 0.0019971337839061287, + "loss": 0.207, + "step": 3415 + }, + { + "epoch": 0.02965252037742728, + "grad_norm": 0.400390625, + "learning_rate": 0.00199713141001559, + "loss": 0.2236, + "step": 3416 + }, + { + "epoch": 0.029661200857631444, + "grad_norm": 0.20703125, + "learning_rate": 0.001997129035143962, + "loss": 0.1689, + "step": 3417 + }, + { + "epoch": 0.02966988133783561, + "grad_norm": 0.35546875, + "learning_rate": 0.001997126659291247, + "loss": 0.1836, + "step": 3418 + }, + { + "epoch": 0.029678561818039774, + "grad_norm": 0.080078125, + "learning_rate": 0.001997124282457448, + "loss": 0.1562, + "step": 3419 + }, + { + "epoch": 0.02968724229824394, + "grad_norm": 0.1484375, + "learning_rate": 0.001997121904642567, + "loss": 0.1777, + "step": 3420 + }, + { + "epoch": 0.029695922778448104, + "grad_norm": 0.15234375, + "learning_rate": 0.0019971195258466075, + "loss": 0.1611, + "step": 3421 + }, + { + "epoch": 0.02970460325865227, + "grad_norm": 0.4609375, + "learning_rate": 0.001997117146069571, + "loss": 0.168, + "step": 3422 + }, + { + "epoch": 0.029713283738856434, + "grad_norm": 0.10693359375, + "learning_rate": 0.001997114765311461, + "loss": 0.2031, + "step": 3423 + }, + { + "epoch": 0.0297219642190606, + "grad_norm": 0.11572265625, + "learning_rate": 0.0019971123835722795, + "loss": 0.2275, + "step": 3424 + }, + { + "epoch": 0.029730644699264765, + "grad_norm": 0.18359375, + "learning_rate": 0.0019971100008520297, + "loss": 0.1895, + "step": 3425 + }, + { + "epoch": 0.02973932517946893, + "grad_norm": 0.306640625, + "learning_rate": 0.0019971076171507135, + "loss": 0.1895, + "step": 3426 + }, + { + "epoch": 0.02974800565967309, + "grad_norm": 0.08642578125, + "learning_rate": 0.0019971052324683344, + "loss": 0.1777, + "step": 3427 + }, + { + "epoch": 0.029756686139877257, + "grad_norm": 0.12109375, + "learning_rate": 0.001997102846804894, + "loss": 0.1748, + "step": 3428 + }, + { + "epoch": 0.02976536662008142, + "grad_norm": 0.41015625, + 
"learning_rate": 0.001997100460160396, + "loss": 0.1562, + "step": 3429 + }, + { + "epoch": 0.029774047100285587, + "grad_norm": 0.140625, + "learning_rate": 0.001997098072534842, + "loss": 0.207, + "step": 3430 + }, + { + "epoch": 0.029782727580489752, + "grad_norm": 0.33203125, + "learning_rate": 0.0019970956839282347, + "loss": 0.1699, + "step": 3431 + }, + { + "epoch": 0.029791408060693917, + "grad_norm": 0.09228515625, + "learning_rate": 0.0019970932943405777, + "loss": 0.124, + "step": 3432 + }, + { + "epoch": 0.029800088540898082, + "grad_norm": 0.1181640625, + "learning_rate": 0.0019970909037718724, + "loss": 0.1934, + "step": 3433 + }, + { + "epoch": 0.029808769021102247, + "grad_norm": 0.15234375, + "learning_rate": 0.0019970885122221225, + "loss": 0.1826, + "step": 3434 + }, + { + "epoch": 0.029817449501306412, + "grad_norm": 0.07275390625, + "learning_rate": 0.0019970861196913297, + "loss": 0.1494, + "step": 3435 + }, + { + "epoch": 0.029826129981510578, + "grad_norm": 0.1015625, + "learning_rate": 0.001997083726179497, + "loss": 0.168, + "step": 3436 + }, + { + "epoch": 0.029834810461714743, + "grad_norm": 0.1298828125, + "learning_rate": 0.001997081331686627, + "loss": 0.1631, + "step": 3437 + }, + { + "epoch": 0.029843490941918908, + "grad_norm": 0.125, + "learning_rate": 0.0019970789362127226, + "loss": 0.1895, + "step": 3438 + }, + { + "epoch": 0.029852171422123073, + "grad_norm": 0.396484375, + "learning_rate": 0.001997076539757786, + "loss": 0.1777, + "step": 3439 + }, + { + "epoch": 0.029860851902327238, + "grad_norm": 0.19921875, + "learning_rate": 0.00199707414232182, + "loss": 0.2119, + "step": 3440 + }, + { + "epoch": 0.029869532382531403, + "grad_norm": 0.3984375, + "learning_rate": 0.001997071743904827, + "loss": 0.2021, + "step": 3441 + }, + { + "epoch": 0.029878212862735568, + "grad_norm": 0.2734375, + "learning_rate": 0.0019970693445068104, + "loss": 0.1816, + "step": 3442 + }, + { + "epoch": 0.02988689334293973, + "grad_norm": 0.11083984375, + "learning_rate": 0.0019970669441277717, + "loss": 0.2217, + "step": 3443 + }, + { + "epoch": 0.029895573823143895, + "grad_norm": 0.14453125, + "learning_rate": 0.0019970645427677142, + "loss": 0.1885, + "step": 3444 + }, + { + "epoch": 0.02990425430334806, + "grad_norm": 0.173828125, + "learning_rate": 0.0019970621404266403, + "loss": 0.2178, + "step": 3445 + }, + { + "epoch": 0.029912934783552225, + "grad_norm": 0.1484375, + "learning_rate": 0.001997059737104553, + "loss": 0.1826, + "step": 3446 + }, + { + "epoch": 0.02992161526375639, + "grad_norm": 0.2021484375, + "learning_rate": 0.0019970573328014544, + "loss": 0.1709, + "step": 3447 + }, + { + "epoch": 0.029930295743960555, + "grad_norm": 0.1923828125, + "learning_rate": 0.0019970549275173475, + "loss": 0.209, + "step": 3448 + }, + { + "epoch": 0.02993897622416472, + "grad_norm": 0.08447265625, + "learning_rate": 0.0019970525212522345, + "loss": 0.1787, + "step": 3449 + }, + { + "epoch": 0.029947656704368886, + "grad_norm": 0.0830078125, + "learning_rate": 0.001997050114006119, + "loss": 0.1699, + "step": 3450 + }, + { + "epoch": 0.02995633718457305, + "grad_norm": 0.6796875, + "learning_rate": 0.0019970477057790026, + "loss": 0.2461, + "step": 3451 + }, + { + "epoch": 0.029965017664777216, + "grad_norm": 0.083984375, + "learning_rate": 0.001997045296570888, + "loss": 0.1787, + "step": 3452 + }, + { + "epoch": 0.02997369814498138, + "grad_norm": 0.1015625, + "learning_rate": 0.0019970428863817784, + "loss": 0.1836, + "step": 3453 + }, + { + "epoch": 
0.029982378625185546, + "grad_norm": 0.279296875, + "learning_rate": 0.0019970404752116763, + "loss": 0.1855, + "step": 3454 + }, + { + "epoch": 0.02999105910538971, + "grad_norm": 0.12060546875, + "learning_rate": 0.001997038063060584, + "loss": 0.2334, + "step": 3455 + }, + { + "epoch": 0.029999739585593876, + "grad_norm": 0.73828125, + "learning_rate": 0.0019970356499285045, + "loss": 0.2207, + "step": 3456 + }, + { + "epoch": 0.03000842006579804, + "grad_norm": 0.09716796875, + "learning_rate": 0.0019970332358154406, + "loss": 0.1885, + "step": 3457 + }, + { + "epoch": 0.030017100546002203, + "grad_norm": 0.07958984375, + "learning_rate": 0.001997030820721394, + "loss": 0.168, + "step": 3458 + }, + { + "epoch": 0.03002578102620637, + "grad_norm": 0.173828125, + "learning_rate": 0.001997028404646368, + "loss": 0.1777, + "step": 3459 + }, + { + "epoch": 0.030034461506410533, + "grad_norm": 0.1435546875, + "learning_rate": 0.0019970259875903658, + "loss": 0.2402, + "step": 3460 + }, + { + "epoch": 0.0300431419866147, + "grad_norm": 0.12890625, + "learning_rate": 0.001997023569553389, + "loss": 0.2598, + "step": 3461 + }, + { + "epoch": 0.030051822466818864, + "grad_norm": 0.0830078125, + "learning_rate": 0.001997021150535441, + "loss": 0.1855, + "step": 3462 + }, + { + "epoch": 0.03006050294702303, + "grad_norm": 0.3984375, + "learning_rate": 0.0019970187305365238, + "loss": 0.1934, + "step": 3463 + }, + { + "epoch": 0.030069183427227194, + "grad_norm": 0.349609375, + "learning_rate": 0.0019970163095566406, + "loss": 0.25, + "step": 3464 + }, + { + "epoch": 0.03007786390743136, + "grad_norm": 0.205078125, + "learning_rate": 0.001997013887595794, + "loss": 0.1855, + "step": 3465 + }, + { + "epoch": 0.030086544387635524, + "grad_norm": 0.0947265625, + "learning_rate": 0.0019970114646539862, + "loss": 0.2324, + "step": 3466 + }, + { + "epoch": 0.03009522486783969, + "grad_norm": 0.384765625, + "learning_rate": 0.0019970090407312206, + "loss": 0.1943, + "step": 3467 + }, + { + "epoch": 0.030103905348043854, + "grad_norm": 0.380859375, + "learning_rate": 0.0019970066158274988, + "loss": 0.1699, + "step": 3468 + }, + { + "epoch": 0.03011258582824802, + "grad_norm": 0.1669921875, + "learning_rate": 0.001997004189942824, + "loss": 0.2207, + "step": 3469 + }, + { + "epoch": 0.030121266308452185, + "grad_norm": 0.09716796875, + "learning_rate": 0.0019970017630771995, + "loss": 0.2422, + "step": 3470 + }, + { + "epoch": 0.03012994678865635, + "grad_norm": 0.12890625, + "learning_rate": 0.001996999335230627, + "loss": 0.207, + "step": 3471 + }, + { + "epoch": 0.030138627268860515, + "grad_norm": 0.71875, + "learning_rate": 0.0019969969064031097, + "loss": 0.1875, + "step": 3472 + }, + { + "epoch": 0.030147307749064677, + "grad_norm": 0.24609375, + "learning_rate": 0.00199699447659465, + "loss": 0.1758, + "step": 3473 + }, + { + "epoch": 0.03015598822926884, + "grad_norm": 0.388671875, + "learning_rate": 0.0019969920458052506, + "loss": 0.1807, + "step": 3474 + }, + { + "epoch": 0.030164668709473007, + "grad_norm": 0.279296875, + "learning_rate": 0.0019969896140349134, + "loss": 0.2051, + "step": 3475 + }, + { + "epoch": 0.030173349189677172, + "grad_norm": 0.212890625, + "learning_rate": 0.001996987181283643, + "loss": 0.1553, + "step": 3476 + }, + { + "epoch": 0.030182029669881337, + "grad_norm": 0.07275390625, + "learning_rate": 0.0019969847475514403, + "loss": 0.166, + "step": 3477 + }, + { + "epoch": 0.030190710150085502, + "grad_norm": 0.322265625, + "learning_rate": 0.0019969823128383087, + 
"loss": 0.1494, + "step": 3478 + }, + { + "epoch": 0.030199390630289667, + "grad_norm": 0.31640625, + "learning_rate": 0.0019969798771442508, + "loss": 0.1631, + "step": 3479 + }, + { + "epoch": 0.030208071110493832, + "grad_norm": 0.24609375, + "learning_rate": 0.0019969774404692687, + "loss": 0.1455, + "step": 3480 + }, + { + "epoch": 0.030216751590697997, + "grad_norm": 0.2333984375, + "learning_rate": 0.001996975002813366, + "loss": 0.1963, + "step": 3481 + }, + { + "epoch": 0.030225432070902163, + "grad_norm": 0.10400390625, + "learning_rate": 0.001996972564176545, + "loss": 0.1855, + "step": 3482 + }, + { + "epoch": 0.030234112551106328, + "grad_norm": 0.171875, + "learning_rate": 0.001996970124558808, + "loss": 0.1631, + "step": 3483 + }, + { + "epoch": 0.030242793031310493, + "grad_norm": 0.09619140625, + "learning_rate": 0.001996967683960158, + "loss": 0.2031, + "step": 3484 + }, + { + "epoch": 0.030251473511514658, + "grad_norm": 0.21875, + "learning_rate": 0.0019969652423805976, + "loss": 0.1914, + "step": 3485 + }, + { + "epoch": 0.030260153991718823, + "grad_norm": 0.330078125, + "learning_rate": 0.00199696279982013, + "loss": 0.2051, + "step": 3486 + }, + { + "epoch": 0.030268834471922988, + "grad_norm": 0.107421875, + "learning_rate": 0.001996960356278757, + "loss": 0.1826, + "step": 3487 + }, + { + "epoch": 0.030277514952127153, + "grad_norm": 0.08056640625, + "learning_rate": 0.0019969579117564812, + "loss": 0.1846, + "step": 3488 + }, + { + "epoch": 0.030286195432331315, + "grad_norm": 0.4140625, + "learning_rate": 0.001996955466253306, + "loss": 0.2422, + "step": 3489 + }, + { + "epoch": 0.03029487591253548, + "grad_norm": 0.1845703125, + "learning_rate": 0.0019969530197692337, + "loss": 0.1562, + "step": 3490 + }, + { + "epoch": 0.030303556392739645, + "grad_norm": 0.185546875, + "learning_rate": 0.0019969505723042677, + "loss": 0.1689, + "step": 3491 + }, + { + "epoch": 0.03031223687294381, + "grad_norm": 0.33984375, + "learning_rate": 0.0019969481238584093, + "loss": 0.1826, + "step": 3492 + }, + { + "epoch": 0.030320917353147975, + "grad_norm": 0.0859375, + "learning_rate": 0.001996945674431662, + "loss": 0.1338, + "step": 3493 + }, + { + "epoch": 0.03032959783335214, + "grad_norm": 0.353515625, + "learning_rate": 0.0019969432240240286, + "loss": 0.1885, + "step": 3494 + }, + { + "epoch": 0.030338278313556306, + "grad_norm": 0.1044921875, + "learning_rate": 0.0019969407726355116, + "loss": 0.207, + "step": 3495 + }, + { + "epoch": 0.03034695879376047, + "grad_norm": 0.26953125, + "learning_rate": 0.0019969383202661137, + "loss": 0.1826, + "step": 3496 + }, + { + "epoch": 0.030355639273964636, + "grad_norm": 0.86328125, + "learning_rate": 0.0019969358669158373, + "loss": 0.2031, + "step": 3497 + }, + { + "epoch": 0.0303643197541688, + "grad_norm": 0.80078125, + "learning_rate": 0.0019969334125846854, + "loss": 0.2344, + "step": 3498 + }, + { + "epoch": 0.030373000234372966, + "grad_norm": 0.2890625, + "learning_rate": 0.0019969309572726605, + "loss": 0.1699, + "step": 3499 + }, + { + "epoch": 0.03038168071457713, + "grad_norm": 0.08935546875, + "learning_rate": 0.0019969285009797657, + "loss": 0.1934, + "step": 3500 + }, + { + "epoch": 0.030390361194781296, + "grad_norm": 0.1435546875, + "learning_rate": 0.001996926043706003, + "loss": 0.1855, + "step": 3501 + }, + { + "epoch": 0.03039904167498546, + "grad_norm": 0.375, + "learning_rate": 0.0019969235854513756, + "loss": 0.1592, + "step": 3502 + }, + { + "epoch": 0.030407722155189627, + "grad_norm": 0.1494140625, + 
"learning_rate": 0.001996921126215886, + "loss": 0.2051, + "step": 3503 + }, + { + "epoch": 0.03041640263539379, + "grad_norm": 0.59765625, + "learning_rate": 0.001996918665999537, + "loss": 0.1855, + "step": 3504 + }, + { + "epoch": 0.030425083115597953, + "grad_norm": 0.29296875, + "learning_rate": 0.001996916204802331, + "loss": 0.1992, + "step": 3505 + }, + { + "epoch": 0.03043376359580212, + "grad_norm": 0.1884765625, + "learning_rate": 0.001996913742624271, + "loss": 0.1787, + "step": 3506 + }, + { + "epoch": 0.030442444076006284, + "grad_norm": 0.0859375, + "learning_rate": 0.00199691127946536, + "loss": 0.1973, + "step": 3507 + }, + { + "epoch": 0.03045112455621045, + "grad_norm": 0.419921875, + "learning_rate": 0.0019969088153256, + "loss": 0.1865, + "step": 3508 + }, + { + "epoch": 0.030459805036414614, + "grad_norm": 0.337890625, + "learning_rate": 0.001996906350204994, + "loss": 0.1582, + "step": 3509 + }, + { + "epoch": 0.03046848551661878, + "grad_norm": 0.375, + "learning_rate": 0.0019969038841035447, + "loss": 0.1602, + "step": 3510 + }, + { + "epoch": 0.030477165996822944, + "grad_norm": 0.1806640625, + "learning_rate": 0.001996901417021255, + "loss": 0.1543, + "step": 3511 + }, + { + "epoch": 0.03048584647702711, + "grad_norm": 0.1259765625, + "learning_rate": 0.001996898948958127, + "loss": 0.1992, + "step": 3512 + }, + { + "epoch": 0.030494526957231274, + "grad_norm": 0.640625, + "learning_rate": 0.0019968964799141637, + "loss": 0.3164, + "step": 3513 + }, + { + "epoch": 0.03050320743743544, + "grad_norm": 0.2109375, + "learning_rate": 0.0019968940098893683, + "loss": 0.1504, + "step": 3514 + }, + { + "epoch": 0.030511887917639605, + "grad_norm": 0.09375, + "learning_rate": 0.001996891538883743, + "loss": 0.1455, + "step": 3515 + }, + { + "epoch": 0.03052056839784377, + "grad_norm": 0.26171875, + "learning_rate": 0.00199688906689729, + "loss": 0.1846, + "step": 3516 + }, + { + "epoch": 0.030529248878047935, + "grad_norm": 0.09228515625, + "learning_rate": 0.0019968865939300135, + "loss": 0.168, + "step": 3517 + }, + { + "epoch": 0.0305379293582521, + "grad_norm": 0.86328125, + "learning_rate": 0.0019968841199819146, + "loss": 0.1885, + "step": 3518 + }, + { + "epoch": 0.030546609838456265, + "grad_norm": 0.173828125, + "learning_rate": 0.0019968816450529974, + "loss": 0.1484, + "step": 3519 + }, + { + "epoch": 0.030555290318660427, + "grad_norm": 0.2431640625, + "learning_rate": 0.001996879169143263, + "loss": 0.1621, + "step": 3520 + }, + { + "epoch": 0.030563970798864592, + "grad_norm": 0.1005859375, + "learning_rate": 0.001996876692252716, + "loss": 0.1973, + "step": 3521 + }, + { + "epoch": 0.030572651279068757, + "grad_norm": 0.45703125, + "learning_rate": 0.0019968742143813578, + "loss": 0.2539, + "step": 3522 + }, + { + "epoch": 0.030581331759272922, + "grad_norm": 0.158203125, + "learning_rate": 0.001996871735529191, + "loss": 0.1641, + "step": 3523 + }, + { + "epoch": 0.030590012239477087, + "grad_norm": 0.07275390625, + "learning_rate": 0.001996869255696219, + "loss": 0.1953, + "step": 3524 + }, + { + "epoch": 0.030598692719681252, + "grad_norm": 0.125, + "learning_rate": 0.0019968667748824446, + "loss": 0.1484, + "step": 3525 + }, + { + "epoch": 0.030607373199885417, + "grad_norm": 0.130859375, + "learning_rate": 0.0019968642930878696, + "loss": 0.1914, + "step": 3526 + }, + { + "epoch": 0.030616053680089583, + "grad_norm": 0.3359375, + "learning_rate": 0.0019968618103124976, + "loss": 0.1719, + "step": 3527 + }, + { + "epoch": 0.030624734160293748, + 
"grad_norm": 0.0927734375, + "learning_rate": 0.001996859326556331, + "loss": 0.1973, + "step": 3528 + }, + { + "epoch": 0.030633414640497913, + "grad_norm": 0.06982421875, + "learning_rate": 0.0019968568418193724, + "loss": 0.1738, + "step": 3529 + }, + { + "epoch": 0.030642095120702078, + "grad_norm": 0.30078125, + "learning_rate": 0.001996854356101625, + "loss": 0.2949, + "step": 3530 + }, + { + "epoch": 0.030650775600906243, + "grad_norm": 0.84765625, + "learning_rate": 0.0019968518694030908, + "loss": 0.2598, + "step": 3531 + }, + { + "epoch": 0.030659456081110408, + "grad_norm": 0.30859375, + "learning_rate": 0.0019968493817237726, + "loss": 0.1826, + "step": 3532 + }, + { + "epoch": 0.030668136561314573, + "grad_norm": 0.27734375, + "learning_rate": 0.001996846893063674, + "loss": 0.1279, + "step": 3533 + }, + { + "epoch": 0.03067681704151874, + "grad_norm": 0.474609375, + "learning_rate": 0.0019968444034227967, + "loss": 0.208, + "step": 3534 + }, + { + "epoch": 0.0306854975217229, + "grad_norm": 0.208984375, + "learning_rate": 0.0019968419128011438, + "loss": 0.1816, + "step": 3535 + }, + { + "epoch": 0.030694178001927065, + "grad_norm": 0.205078125, + "learning_rate": 0.0019968394211987185, + "loss": 0.1748, + "step": 3536 + }, + { + "epoch": 0.03070285848213123, + "grad_norm": 0.63671875, + "learning_rate": 0.0019968369286155227, + "loss": 0.2031, + "step": 3537 + }, + { + "epoch": 0.030711538962335395, + "grad_norm": 0.1630859375, + "learning_rate": 0.0019968344350515593, + "loss": 0.1787, + "step": 3538 + }, + { + "epoch": 0.03072021944253956, + "grad_norm": 0.072265625, + "learning_rate": 0.0019968319405068314, + "loss": 0.1729, + "step": 3539 + }, + { + "epoch": 0.030728899922743726, + "grad_norm": 0.1845703125, + "learning_rate": 0.001996829444981342, + "loss": 0.1553, + "step": 3540 + }, + { + "epoch": 0.03073758040294789, + "grad_norm": 0.58203125, + "learning_rate": 0.0019968269484750933, + "loss": 0.2041, + "step": 3541 + }, + { + "epoch": 0.030746260883152056, + "grad_norm": 0.1728515625, + "learning_rate": 0.001996824450988088, + "loss": 0.1426, + "step": 3542 + }, + { + "epoch": 0.03075494136335622, + "grad_norm": 0.283203125, + "learning_rate": 0.0019968219525203284, + "loss": 0.2051, + "step": 3543 + }, + { + "epoch": 0.030763621843560386, + "grad_norm": 0.158203125, + "learning_rate": 0.0019968194530718183, + "loss": 0.248, + "step": 3544 + }, + { + "epoch": 0.03077230232376455, + "grad_norm": 0.10009765625, + "learning_rate": 0.00199681695264256, + "loss": 0.1748, + "step": 3545 + }, + { + "epoch": 0.030780982803968716, + "grad_norm": 0.353515625, + "learning_rate": 0.001996814451232556, + "loss": 0.1689, + "step": 3546 + }, + { + "epoch": 0.03078966328417288, + "grad_norm": 0.28125, + "learning_rate": 0.0019968119488418096, + "loss": 0.2305, + "step": 3547 + }, + { + "epoch": 0.030798343764377047, + "grad_norm": 0.28125, + "learning_rate": 0.0019968094454703225, + "loss": 0.2812, + "step": 3548 + }, + { + "epoch": 0.030807024244581212, + "grad_norm": 0.310546875, + "learning_rate": 0.0019968069411180982, + "loss": 0.2266, + "step": 3549 + }, + { + "epoch": 0.030815704724785373, + "grad_norm": 0.08251953125, + "learning_rate": 0.0019968044357851398, + "loss": 0.1562, + "step": 3550 + }, + { + "epoch": 0.03082438520498954, + "grad_norm": 1.21875, + "learning_rate": 0.001996801929471449, + "loss": 0.2539, + "step": 3551 + }, + { + "epoch": 0.030833065685193704, + "grad_norm": 0.059326171875, + "learning_rate": 0.001996799422177029, + "loss": 0.1523, + "step": 3552 
+ }, + { + "epoch": 0.03084174616539787, + "grad_norm": 0.70703125, + "learning_rate": 0.0019967969139018833, + "loss": 0.2695, + "step": 3553 + }, + { + "epoch": 0.030850426645602034, + "grad_norm": 0.31640625, + "learning_rate": 0.0019967944046460134, + "loss": 0.1602, + "step": 3554 + }, + { + "epoch": 0.0308591071258062, + "grad_norm": 0.55078125, + "learning_rate": 0.0019967918944094228, + "loss": 0.1963, + "step": 3555 + }, + { + "epoch": 0.030867787606010364, + "grad_norm": 0.35546875, + "learning_rate": 0.001996789383192114, + "loss": 0.1777, + "step": 3556 + }, + { + "epoch": 0.03087646808621453, + "grad_norm": 0.1611328125, + "learning_rate": 0.0019967868709940897, + "loss": 0.1543, + "step": 3557 + }, + { + "epoch": 0.030885148566418694, + "grad_norm": 0.77734375, + "learning_rate": 0.001996784357815353, + "loss": 0.2324, + "step": 3558 + }, + { + "epoch": 0.03089382904662286, + "grad_norm": 0.189453125, + "learning_rate": 0.0019967818436559064, + "loss": 0.168, + "step": 3559 + }, + { + "epoch": 0.030902509526827025, + "grad_norm": 0.2353515625, + "learning_rate": 0.001996779328515752, + "loss": 0.1719, + "step": 3560 + }, + { + "epoch": 0.03091119000703119, + "grad_norm": 0.216796875, + "learning_rate": 0.001996776812394894, + "loss": 0.1777, + "step": 3561 + }, + { + "epoch": 0.030919870487235355, + "grad_norm": 0.10595703125, + "learning_rate": 0.0019967742952933343, + "loss": 0.2041, + "step": 3562 + }, + { + "epoch": 0.03092855096743952, + "grad_norm": 0.07861328125, + "learning_rate": 0.0019967717772110755, + "loss": 0.1787, + "step": 3563 + }, + { + "epoch": 0.030937231447643685, + "grad_norm": 0.423828125, + "learning_rate": 0.0019967692581481203, + "loss": 0.2461, + "step": 3564 + }, + { + "epoch": 0.03094591192784785, + "grad_norm": 0.06982421875, + "learning_rate": 0.001996766738104472, + "loss": 0.2129, + "step": 3565 + }, + { + "epoch": 0.030954592408052012, + "grad_norm": 0.265625, + "learning_rate": 0.001996764217080133, + "loss": 0.1729, + "step": 3566 + }, + { + "epoch": 0.030963272888256177, + "grad_norm": 0.1494140625, + "learning_rate": 0.001996761695075106, + "loss": 0.1797, + "step": 3567 + }, + { + "epoch": 0.030971953368460342, + "grad_norm": 0.2333984375, + "learning_rate": 0.0019967591720893942, + "loss": 0.1738, + "step": 3568 + }, + { + "epoch": 0.030980633848664507, + "grad_norm": 0.49609375, + "learning_rate": 0.001996756648123, + "loss": 0.1982, + "step": 3569 + }, + { + "epoch": 0.030989314328868672, + "grad_norm": 0.1494140625, + "learning_rate": 0.0019967541231759264, + "loss": 0.2227, + "step": 3570 + }, + { + "epoch": 0.030997994809072837, + "grad_norm": 0.28515625, + "learning_rate": 0.001996751597248175, + "loss": 0.166, + "step": 3571 + }, + { + "epoch": 0.031006675289277003, + "grad_norm": 1.796875, + "learning_rate": 0.001996749070339751, + "loss": 0.3945, + "step": 3572 + }, + { + "epoch": 0.031015355769481168, + "grad_norm": 0.30859375, + "learning_rate": 0.0019967465424506545, + "loss": 0.1719, + "step": 3573 + }, + { + "epoch": 0.031024036249685333, + "grad_norm": 0.2578125, + "learning_rate": 0.00199674401358089, + "loss": 0.207, + "step": 3574 + }, + { + "epoch": 0.031032716729889498, + "grad_norm": 0.328125, + "learning_rate": 0.0019967414837304596, + "loss": 0.2188, + "step": 3575 + }, + { + "epoch": 0.031041397210093663, + "grad_norm": 0.359375, + "learning_rate": 0.001996738952899366, + "loss": 0.2002, + "step": 3576 + }, + { + "epoch": 0.031050077690297828, + "grad_norm": 1.1484375, + "learning_rate": 0.001996736421087612, + 
"loss": 0.2207, + "step": 3577 + }, + { + "epoch": 0.031058758170501993, + "grad_norm": 0.08935546875, + "learning_rate": 0.001996733888295201, + "loss": 0.1924, + "step": 3578 + }, + { + "epoch": 0.03106743865070616, + "grad_norm": 0.27734375, + "learning_rate": 0.001996731354522135, + "loss": 0.2363, + "step": 3579 + }, + { + "epoch": 0.031076119130910324, + "grad_norm": 0.578125, + "learning_rate": 0.0019967288197684173, + "loss": 0.1953, + "step": 3580 + }, + { + "epoch": 0.031084799611114485, + "grad_norm": 0.1259765625, + "learning_rate": 0.0019967262840340505, + "loss": 0.2422, + "step": 3581 + }, + { + "epoch": 0.03109348009131865, + "grad_norm": 0.16015625, + "learning_rate": 0.0019967237473190367, + "loss": 0.1914, + "step": 3582 + }, + { + "epoch": 0.031102160571522815, + "grad_norm": 0.1083984375, + "learning_rate": 0.00199672120962338, + "loss": 0.2402, + "step": 3583 + }, + { + "epoch": 0.03111084105172698, + "grad_norm": 0.359375, + "learning_rate": 0.001996718670947082, + "loss": 0.3438, + "step": 3584 + }, + { + "epoch": 0.031119521531931146, + "grad_norm": 0.119140625, + "learning_rate": 0.0019967161312901462, + "loss": 0.1533, + "step": 3585 + }, + { + "epoch": 0.03112820201213531, + "grad_norm": 0.400390625, + "learning_rate": 0.001996713590652575, + "loss": 0.2617, + "step": 3586 + }, + { + "epoch": 0.031136882492339476, + "grad_norm": 0.328125, + "learning_rate": 0.001996711049034371, + "loss": 0.248, + "step": 3587 + }, + { + "epoch": 0.03114556297254364, + "grad_norm": 0.30078125, + "learning_rate": 0.001996708506435538, + "loss": 0.1787, + "step": 3588 + }, + { + "epoch": 0.031154243452747806, + "grad_norm": 0.09228515625, + "learning_rate": 0.0019967059628560775, + "loss": 0.1416, + "step": 3589 + }, + { + "epoch": 0.03116292393295197, + "grad_norm": 0.31640625, + "learning_rate": 0.0019967034182959927, + "loss": 0.167, + "step": 3590 + }, + { + "epoch": 0.031171604413156136, + "grad_norm": 0.29296875, + "learning_rate": 0.001996700872755287, + "loss": 0.167, + "step": 3591 + }, + { + "epoch": 0.0311802848933603, + "grad_norm": 0.162109375, + "learning_rate": 0.001996698326233962, + "loss": 0.1758, + "step": 3592 + }, + { + "epoch": 0.031188965373564467, + "grad_norm": 0.1728515625, + "learning_rate": 0.001996695778732022, + "loss": 0.1631, + "step": 3593 + }, + { + "epoch": 0.03119764585376863, + "grad_norm": 0.1748046875, + "learning_rate": 0.0019966932302494687, + "loss": 0.1914, + "step": 3594 + }, + { + "epoch": 0.031206326333972797, + "grad_norm": 0.59375, + "learning_rate": 0.0019966906807863047, + "loss": 0.1729, + "step": 3595 + }, + { + "epoch": 0.031215006814176962, + "grad_norm": 0.09130859375, + "learning_rate": 0.001996688130342534, + "loss": 0.1963, + "step": 3596 + }, + { + "epoch": 0.031223687294381124, + "grad_norm": 0.166015625, + "learning_rate": 0.001996685578918158, + "loss": 0.1699, + "step": 3597 + }, + { + "epoch": 0.03123236777458529, + "grad_norm": 0.296875, + "learning_rate": 0.0019966830265131805, + "loss": 0.209, + "step": 3598 + }, + { + "epoch": 0.031241048254789454, + "grad_norm": 0.12158203125, + "learning_rate": 0.0019966804731276037, + "loss": 0.1836, + "step": 3599 + }, + { + "epoch": 0.03124972873499362, + "grad_norm": 0.337890625, + "learning_rate": 0.0019966779187614307, + "loss": 0.2793, + "step": 3600 + }, + { + "epoch": 0.031258409215197784, + "grad_norm": 0.142578125, + "learning_rate": 0.001996675363414664, + "loss": 0.1797, + "step": 3601 + }, + { + "epoch": 0.03126708969540195, + "grad_norm": 0.267578125, + 
"learning_rate": 0.001996672807087307, + "loss": 0.1768, + "step": 3602 + }, + { + "epoch": 0.031275770175606114, + "grad_norm": 0.09716796875, + "learning_rate": 0.001996670249779362, + "loss": 0.1963, + "step": 3603 + }, + { + "epoch": 0.03128445065581028, + "grad_norm": 0.283203125, + "learning_rate": 0.0019966676914908313, + "loss": 0.1758, + "step": 3604 + }, + { + "epoch": 0.031293131136014445, + "grad_norm": 0.365234375, + "learning_rate": 0.0019966651322217187, + "loss": 0.2559, + "step": 3605 + }, + { + "epoch": 0.03130181161621861, + "grad_norm": 0.326171875, + "learning_rate": 0.001996662571972027, + "loss": 0.2207, + "step": 3606 + }, + { + "epoch": 0.031310492096422775, + "grad_norm": 0.2265625, + "learning_rate": 0.001996660010741758, + "loss": 0.1504, + "step": 3607 + }, + { + "epoch": 0.03131917257662694, + "grad_norm": 0.39453125, + "learning_rate": 0.0019966574485309153, + "loss": 0.1641, + "step": 3608 + }, + { + "epoch": 0.031327853056831105, + "grad_norm": 0.1279296875, + "learning_rate": 0.001996654885339502, + "loss": 0.1914, + "step": 3609 + }, + { + "epoch": 0.03133653353703527, + "grad_norm": 0.1796875, + "learning_rate": 0.0019966523211675195, + "loss": 0.1523, + "step": 3610 + }, + { + "epoch": 0.031345214017239435, + "grad_norm": 0.486328125, + "learning_rate": 0.001996649756014972, + "loss": 0.2031, + "step": 3611 + }, + { + "epoch": 0.0313538944974436, + "grad_norm": 0.11865234375, + "learning_rate": 0.0019966471898818614, + "loss": 0.1377, + "step": 3612 + }, + { + "epoch": 0.031362574977647766, + "grad_norm": 0.3359375, + "learning_rate": 0.0019966446227681913, + "loss": 0.2031, + "step": 3613 + }, + { + "epoch": 0.03137125545785193, + "grad_norm": 0.4453125, + "learning_rate": 0.001996642054673964, + "loss": 0.1777, + "step": 3614 + }, + { + "epoch": 0.031379935938056096, + "grad_norm": 0.380859375, + "learning_rate": 0.0019966394855991825, + "loss": 0.1885, + "step": 3615 + }, + { + "epoch": 0.03138861641826026, + "grad_norm": 0.427734375, + "learning_rate": 0.0019966369155438495, + "loss": 0.248, + "step": 3616 + }, + { + "epoch": 0.031397296898464426, + "grad_norm": 0.08203125, + "learning_rate": 0.001996634344507968, + "loss": 0.1543, + "step": 3617 + }, + { + "epoch": 0.03140597737866859, + "grad_norm": 0.4609375, + "learning_rate": 0.0019966317724915404, + "loss": 0.1797, + "step": 3618 + }, + { + "epoch": 0.031414657858872756, + "grad_norm": 0.11669921875, + "learning_rate": 0.0019966291994945695, + "loss": 0.1729, + "step": 3619 + }, + { + "epoch": 0.031423338339076914, + "grad_norm": 0.126953125, + "learning_rate": 0.0019966266255170588, + "loss": 0.1709, + "step": 3620 + }, + { + "epoch": 0.03143201881928108, + "grad_norm": 0.119140625, + "learning_rate": 0.0019966240505590103, + "loss": 0.1738, + "step": 3621 + }, + { + "epoch": 0.031440699299485245, + "grad_norm": 0.111328125, + "learning_rate": 0.0019966214746204277, + "loss": 0.1855, + "step": 3622 + }, + { + "epoch": 0.03144937977968941, + "grad_norm": 0.08837890625, + "learning_rate": 0.001996618897701313, + "loss": 0.2324, + "step": 3623 + }, + { + "epoch": 0.031458060259893575, + "grad_norm": 0.07958984375, + "learning_rate": 0.001996616319801669, + "loss": 0.1641, + "step": 3624 + }, + { + "epoch": 0.03146674074009774, + "grad_norm": 0.5078125, + "learning_rate": 0.0019966137409215, + "loss": 0.1797, + "step": 3625 + }, + { + "epoch": 0.031475421220301905, + "grad_norm": 0.546875, + "learning_rate": 0.0019966111610608068, + "loss": 0.2168, + "step": 3626 + }, + { + "epoch": 
0.03148410170050607, + "grad_norm": 0.4609375, + "learning_rate": 0.0019966085802195933, + "loss": 0.2773, + "step": 3627 + }, + { + "epoch": 0.031492782180710235, + "grad_norm": 0.46875, + "learning_rate": 0.001996605998397862, + "loss": 0.2148, + "step": 3628 + }, + { + "epoch": 0.0315014626609144, + "grad_norm": 0.2431640625, + "learning_rate": 0.001996603415595616, + "loss": 0.1816, + "step": 3629 + }, + { + "epoch": 0.031510143141118566, + "grad_norm": 0.232421875, + "learning_rate": 0.001996600831812858, + "loss": 0.1826, + "step": 3630 + }, + { + "epoch": 0.03151882362132273, + "grad_norm": 0.259765625, + "learning_rate": 0.001996598247049591, + "loss": 0.1465, + "step": 3631 + }, + { + "epoch": 0.031527504101526896, + "grad_norm": 0.25, + "learning_rate": 0.0019965956613058173, + "loss": 0.1719, + "step": 3632 + }, + { + "epoch": 0.03153618458173106, + "grad_norm": 0.404296875, + "learning_rate": 0.00199659307458154, + "loss": 0.1953, + "step": 3633 + }, + { + "epoch": 0.031544865061935226, + "grad_norm": 0.640625, + "learning_rate": 0.0019965904868767623, + "loss": 0.2324, + "step": 3634 + }, + { + "epoch": 0.03155354554213939, + "grad_norm": 0.068359375, + "learning_rate": 0.0019965878981914867, + "loss": 0.1484, + "step": 3635 + }, + { + "epoch": 0.031562226022343556, + "grad_norm": 0.16796875, + "learning_rate": 0.0019965853085257153, + "loss": 0.2002, + "step": 3636 + }, + { + "epoch": 0.03157090650254772, + "grad_norm": 0.5234375, + "learning_rate": 0.0019965827178794527, + "loss": 0.21, + "step": 3637 + }, + { + "epoch": 0.03157958698275189, + "grad_norm": 0.314453125, + "learning_rate": 0.0019965801262527005, + "loss": 0.2295, + "step": 3638 + }, + { + "epoch": 0.03158826746295605, + "grad_norm": 0.1142578125, + "learning_rate": 0.0019965775336454614, + "loss": 0.166, + "step": 3639 + }, + { + "epoch": 0.03159694794316022, + "grad_norm": 0.0673828125, + "learning_rate": 0.0019965749400577383, + "loss": 0.1445, + "step": 3640 + }, + { + "epoch": 0.03160562842336438, + "grad_norm": 0.197265625, + "learning_rate": 0.001996572345489535, + "loss": 0.1699, + "step": 3641 + }, + { + "epoch": 0.03161430890356855, + "grad_norm": 0.1494140625, + "learning_rate": 0.0019965697499408535, + "loss": 0.1846, + "step": 3642 + }, + { + "epoch": 0.03162298938377271, + "grad_norm": 0.44140625, + "learning_rate": 0.0019965671534116964, + "loss": 0.418, + "step": 3643 + }, + { + "epoch": 0.03163166986397688, + "grad_norm": 0.1669921875, + "learning_rate": 0.001996564555902067, + "loss": 0.1787, + "step": 3644 + }, + { + "epoch": 0.03164035034418104, + "grad_norm": 0.296875, + "learning_rate": 0.0019965619574119687, + "loss": 0.1924, + "step": 3645 + }, + { + "epoch": 0.03164903082438521, + "grad_norm": 0.1611328125, + "learning_rate": 0.0019965593579414033, + "loss": 0.1826, + "step": 3646 + }, + { + "epoch": 0.03165771130458937, + "grad_norm": 0.09814453125, + "learning_rate": 0.001996556757490374, + "loss": 0.1855, + "step": 3647 + }, + { + "epoch": 0.03166639178479354, + "grad_norm": 0.10400390625, + "learning_rate": 0.0019965541560588835, + "loss": 0.1729, + "step": 3648 + }, + { + "epoch": 0.0316750722649977, + "grad_norm": 0.13671875, + "learning_rate": 0.001996551553646935, + "loss": 0.1387, + "step": 3649 + }, + { + "epoch": 0.03168375274520187, + "grad_norm": 0.251953125, + "learning_rate": 0.0019965489502545316, + "loss": 0.1758, + "step": 3650 + }, + { + "epoch": 0.031692433225406026, + "grad_norm": 0.26953125, + "learning_rate": 0.0019965463458816754, + "loss": 0.1885, + "step": 3651 
+ }, + { + "epoch": 0.03170111370561019, + "grad_norm": 0.330078125, + "learning_rate": 0.0019965437405283695, + "loss": 0.1885, + "step": 3652 + }, + { + "epoch": 0.031709794185814356, + "grad_norm": 0.443359375, + "learning_rate": 0.001996541134194617, + "loss": 0.2031, + "step": 3653 + }, + { + "epoch": 0.03171847466601852, + "grad_norm": 0.162109375, + "learning_rate": 0.0019965385268804208, + "loss": 0.166, + "step": 3654 + }, + { + "epoch": 0.03172715514622269, + "grad_norm": 0.408203125, + "learning_rate": 0.001996535918585783, + "loss": 0.2207, + "step": 3655 + }, + { + "epoch": 0.03173583562642685, + "grad_norm": 0.376953125, + "learning_rate": 0.0019965333093107072, + "loss": 0.2461, + "step": 3656 + }, + { + "epoch": 0.03174451610663102, + "grad_norm": 0.1875, + "learning_rate": 0.001996530699055196, + "loss": 0.1719, + "step": 3657 + }, + { + "epoch": 0.03175319658683518, + "grad_norm": 0.1708984375, + "learning_rate": 0.0019965280878192523, + "loss": 0.1621, + "step": 3658 + }, + { + "epoch": 0.03176187706703935, + "grad_norm": 0.1923828125, + "learning_rate": 0.0019965254756028794, + "loss": 0.1689, + "step": 3659 + }, + { + "epoch": 0.03177055754724351, + "grad_norm": 0.1826171875, + "learning_rate": 0.001996522862406079, + "loss": 0.1924, + "step": 3660 + }, + { + "epoch": 0.03177923802744768, + "grad_norm": 0.1494140625, + "learning_rate": 0.0019965202482288553, + "loss": 0.1826, + "step": 3661 + }, + { + "epoch": 0.03178791850765184, + "grad_norm": 0.6015625, + "learning_rate": 0.00199651763307121, + "loss": 0.2217, + "step": 3662 + }, + { + "epoch": 0.03179659898785601, + "grad_norm": 0.1357421875, + "learning_rate": 0.001996515016933147, + "loss": 0.1689, + "step": 3663 + }, + { + "epoch": 0.03180527946806017, + "grad_norm": 0.1591796875, + "learning_rate": 0.0019965123998146686, + "loss": 0.1621, + "step": 3664 + }, + { + "epoch": 0.03181395994826434, + "grad_norm": 0.171875, + "learning_rate": 0.0019965097817157772, + "loss": 0.2051, + "step": 3665 + }, + { + "epoch": 0.0318226404284685, + "grad_norm": 0.08544921875, + "learning_rate": 0.0019965071626364766, + "loss": 0.1777, + "step": 3666 + }, + { + "epoch": 0.03183132090867267, + "grad_norm": 0.5546875, + "learning_rate": 0.0019965045425767687, + "loss": 0.1777, + "step": 3667 + }, + { + "epoch": 0.03184000138887683, + "grad_norm": 0.291015625, + "learning_rate": 0.0019965019215366577, + "loss": 0.2051, + "step": 3668 + }, + { + "epoch": 0.031848681869081, + "grad_norm": 0.2294921875, + "learning_rate": 0.001996499299516145, + "loss": 0.1895, + "step": 3669 + }, + { + "epoch": 0.031857362349285163, + "grad_norm": 0.1865234375, + "learning_rate": 0.0019964966765152344, + "loss": 0.1689, + "step": 3670 + }, + { + "epoch": 0.03186604282948933, + "grad_norm": 0.0849609375, + "learning_rate": 0.0019964940525339287, + "loss": 0.1582, + "step": 3671 + }, + { + "epoch": 0.031874723309693494, + "grad_norm": 0.34375, + "learning_rate": 0.00199649142757223, + "loss": 0.1689, + "step": 3672 + }, + { + "epoch": 0.03188340378989766, + "grad_norm": 0.1533203125, + "learning_rate": 0.0019964888016301427, + "loss": 0.1934, + "step": 3673 + }, + { + "epoch": 0.031892084270101824, + "grad_norm": 0.48046875, + "learning_rate": 0.001996486174707668, + "loss": 0.1855, + "step": 3674 + }, + { + "epoch": 0.03190076475030599, + "grad_norm": 0.361328125, + "learning_rate": 0.0019964835468048096, + "loss": 0.3066, + "step": 3675 + }, + { + "epoch": 0.031909445230510154, + "grad_norm": 0.1337890625, + "learning_rate": 0.0019964809179215705, + 
"loss": 0.2217, + "step": 3676 + }, + { + "epoch": 0.03191812571071432, + "grad_norm": 1.703125, + "learning_rate": 0.0019964782880579532, + "loss": 0.2539, + "step": 3677 + }, + { + "epoch": 0.031926806190918484, + "grad_norm": 0.13671875, + "learning_rate": 0.0019964756572139605, + "loss": 0.2383, + "step": 3678 + }, + { + "epoch": 0.03193548667112265, + "grad_norm": 0.50390625, + "learning_rate": 0.0019964730253895957, + "loss": 0.1895, + "step": 3679 + }, + { + "epoch": 0.031944167151326815, + "grad_norm": 0.65234375, + "learning_rate": 0.0019964703925848615, + "loss": 0.2227, + "step": 3680 + }, + { + "epoch": 0.03195284763153097, + "grad_norm": 0.41015625, + "learning_rate": 0.0019964677587997605, + "loss": 0.168, + "step": 3681 + }, + { + "epoch": 0.03196152811173514, + "grad_norm": 0.296875, + "learning_rate": 0.001996465124034296, + "loss": 0.1514, + "step": 3682 + }, + { + "epoch": 0.0319702085919393, + "grad_norm": 0.1806640625, + "learning_rate": 0.001996462488288471, + "loss": 0.1914, + "step": 3683 + }, + { + "epoch": 0.03197888907214347, + "grad_norm": 0.3203125, + "learning_rate": 0.0019964598515622876, + "loss": 0.1543, + "step": 3684 + }, + { + "epoch": 0.03198756955234763, + "grad_norm": 0.134765625, + "learning_rate": 0.001996457213855749, + "loss": 0.2041, + "step": 3685 + }, + { + "epoch": 0.0319962500325518, + "grad_norm": 0.154296875, + "learning_rate": 0.001996454575168859, + "loss": 0.209, + "step": 3686 + }, + { + "epoch": 0.032004930512755964, + "grad_norm": 0.095703125, + "learning_rate": 0.001996451935501619, + "loss": 0.1387, + "step": 3687 + }, + { + "epoch": 0.03201361099296013, + "grad_norm": 0.1572265625, + "learning_rate": 0.001996449294854033, + "loss": 0.1904, + "step": 3688 + }, + { + "epoch": 0.032022291473164294, + "grad_norm": 0.09033203125, + "learning_rate": 0.0019964466532261037, + "loss": 0.2012, + "step": 3689 + }, + { + "epoch": 0.03203097195336846, + "grad_norm": 0.2294921875, + "learning_rate": 0.001996444010617834, + "loss": 0.1885, + "step": 3690 + }, + { + "epoch": 0.032039652433572624, + "grad_norm": 0.64453125, + "learning_rate": 0.001996441367029226, + "loss": 0.1963, + "step": 3691 + }, + { + "epoch": 0.03204833291377679, + "grad_norm": 0.337890625, + "learning_rate": 0.0019964387224602836, + "loss": 0.1602, + "step": 3692 + }, + { + "epoch": 0.032057013393980954, + "grad_norm": 0.2236328125, + "learning_rate": 0.001996436076911009, + "loss": 0.2383, + "step": 3693 + }, + { + "epoch": 0.03206569387418512, + "grad_norm": 0.310546875, + "learning_rate": 0.001996433430381405, + "loss": 0.1797, + "step": 3694 + }, + { + "epoch": 0.032074374354389285, + "grad_norm": 0.0908203125, + "learning_rate": 0.001996430782871476, + "loss": 0.1758, + "step": 3695 + }, + { + "epoch": 0.03208305483459345, + "grad_norm": 0.2001953125, + "learning_rate": 0.001996428134381223, + "loss": 0.1807, + "step": 3696 + }, + { + "epoch": 0.032091735314797615, + "grad_norm": 0.16796875, + "learning_rate": 0.0019964254849106494, + "loss": 0.1572, + "step": 3697 + }, + { + "epoch": 0.03210041579500178, + "grad_norm": 0.341796875, + "learning_rate": 0.001996422834459759, + "loss": 0.1855, + "step": 3698 + }, + { + "epoch": 0.032109096275205945, + "grad_norm": 0.1748046875, + "learning_rate": 0.0019964201830285537, + "loss": 0.1758, + "step": 3699 + }, + { + "epoch": 0.03211777675541011, + "grad_norm": 0.126953125, + "learning_rate": 0.0019964175306170368, + "loss": 0.2002, + "step": 3700 + }, + { + "epoch": 0.032126457235614275, + "grad_norm": 0.32421875, + 
"learning_rate": 0.0019964148772252115, + "loss": 0.1738, + "step": 3701 + }, + { + "epoch": 0.03213513771581844, + "grad_norm": 0.5, + "learning_rate": 0.00199641222285308, + "loss": 0.1807, + "step": 3702 + }, + { + "epoch": 0.032143818196022605, + "grad_norm": 0.66015625, + "learning_rate": 0.0019964095675006456, + "loss": 0.1807, + "step": 3703 + }, + { + "epoch": 0.03215249867622677, + "grad_norm": 0.150390625, + "learning_rate": 0.001996406911167911, + "loss": 0.1963, + "step": 3704 + }, + { + "epoch": 0.032161179156430936, + "grad_norm": 0.1376953125, + "learning_rate": 0.0019964042538548796, + "loss": 0.2031, + "step": 3705 + }, + { + "epoch": 0.0321698596366351, + "grad_norm": 0.5, + "learning_rate": 0.0019964015955615537, + "loss": 0.2119, + "step": 3706 + }, + { + "epoch": 0.032178540116839266, + "grad_norm": 0.2080078125, + "learning_rate": 0.001996398936287937, + "loss": 0.1572, + "step": 3707 + }, + { + "epoch": 0.03218722059704343, + "grad_norm": 0.09033203125, + "learning_rate": 0.0019963962760340312, + "loss": 0.1572, + "step": 3708 + }, + { + "epoch": 0.032195901077247596, + "grad_norm": 0.095703125, + "learning_rate": 0.0019963936147998403, + "loss": 0.1699, + "step": 3709 + }, + { + "epoch": 0.03220458155745176, + "grad_norm": 0.07568359375, + "learning_rate": 0.0019963909525853667, + "loss": 0.1455, + "step": 3710 + }, + { + "epoch": 0.032213262037655926, + "grad_norm": 0.25390625, + "learning_rate": 0.0019963882893906135, + "loss": 0.2227, + "step": 3711 + }, + { + "epoch": 0.032221942517860085, + "grad_norm": 0.3359375, + "learning_rate": 0.0019963856252155836, + "loss": 0.1611, + "step": 3712 + }, + { + "epoch": 0.03223062299806425, + "grad_norm": 0.09326171875, + "learning_rate": 0.0019963829600602796, + "loss": 0.1797, + "step": 3713 + }, + { + "epoch": 0.032239303478268415, + "grad_norm": 0.55078125, + "learning_rate": 0.0019963802939247047, + "loss": 0.2012, + "step": 3714 + }, + { + "epoch": 0.03224798395847258, + "grad_norm": 0.34375, + "learning_rate": 0.001996377626808862, + "loss": 0.1611, + "step": 3715 + }, + { + "epoch": 0.032256664438676745, + "grad_norm": 0.1513671875, + "learning_rate": 0.001996374958712754, + "loss": 0.1816, + "step": 3716 + }, + { + "epoch": 0.03226534491888091, + "grad_norm": 0.337890625, + "learning_rate": 0.001996372289636384, + "loss": 0.209, + "step": 3717 + }, + { + "epoch": 0.032274025399085075, + "grad_norm": 0.2119140625, + "learning_rate": 0.0019963696195797547, + "loss": 0.2246, + "step": 3718 + }, + { + "epoch": 0.03228270587928924, + "grad_norm": 0.1279296875, + "learning_rate": 0.0019963669485428687, + "loss": 0.1533, + "step": 3719 + }, + { + "epoch": 0.032291386359493406, + "grad_norm": 0.5546875, + "learning_rate": 0.00199636427652573, + "loss": 0.2441, + "step": 3720 + }, + { + "epoch": 0.03230006683969757, + "grad_norm": 0.388671875, + "learning_rate": 0.00199636160352834, + "loss": 0.1914, + "step": 3721 + }, + { + "epoch": 0.032308747319901736, + "grad_norm": 0.203125, + "learning_rate": 0.0019963589295507026, + "loss": 0.1934, + "step": 3722 + }, + { + "epoch": 0.0323174278001059, + "grad_norm": 0.1728515625, + "learning_rate": 0.0019963562545928207, + "loss": 0.1641, + "step": 3723 + }, + { + "epoch": 0.032326108280310066, + "grad_norm": 0.11328125, + "learning_rate": 0.0019963535786546973, + "loss": 0.1973, + "step": 3724 + }, + { + "epoch": 0.03233478876051423, + "grad_norm": 0.22265625, + "learning_rate": 0.0019963509017363345, + "loss": 0.1562, + "step": 3725 + }, + { + "epoch": 0.032343469240718396, + 
"grad_norm": 0.205078125, + "learning_rate": 0.0019963482238377363, + "loss": 0.2031, + "step": 3726 + }, + { + "epoch": 0.03235214972092256, + "grad_norm": 0.318359375, + "learning_rate": 0.001996345544958905, + "loss": 0.1582, + "step": 3727 + }, + { + "epoch": 0.03236083020112673, + "grad_norm": 0.2392578125, + "learning_rate": 0.0019963428650998437, + "loss": 0.1738, + "step": 3728 + }, + { + "epoch": 0.03236951068133089, + "grad_norm": 0.22265625, + "learning_rate": 0.0019963401842605557, + "loss": 0.1719, + "step": 3729 + }, + { + "epoch": 0.03237819116153506, + "grad_norm": 0.78125, + "learning_rate": 0.0019963375024410427, + "loss": 0.2773, + "step": 3730 + }, + { + "epoch": 0.03238687164173922, + "grad_norm": 0.18359375, + "learning_rate": 0.001996334819641309, + "loss": 0.1885, + "step": 3731 + }, + { + "epoch": 0.03239555212194339, + "grad_norm": 0.42578125, + "learning_rate": 0.001996332135861357, + "loss": 0.1719, + "step": 3732 + }, + { + "epoch": 0.03240423260214755, + "grad_norm": 0.12060546875, + "learning_rate": 0.0019963294511011896, + "loss": 0.1934, + "step": 3733 + }, + { + "epoch": 0.03241291308235172, + "grad_norm": 0.1357421875, + "learning_rate": 0.00199632676536081, + "loss": 0.2051, + "step": 3734 + }, + { + "epoch": 0.03242159356255588, + "grad_norm": 0.3359375, + "learning_rate": 0.0019963240786402205, + "loss": 0.2207, + "step": 3735 + }, + { + "epoch": 0.03243027404276005, + "grad_norm": 0.08203125, + "learning_rate": 0.0019963213909394246, + "loss": 0.1807, + "step": 3736 + }, + { + "epoch": 0.03243895452296421, + "grad_norm": 0.193359375, + "learning_rate": 0.0019963187022584254, + "loss": 0.2227, + "step": 3737 + }, + { + "epoch": 0.03244763500316838, + "grad_norm": 0.08154296875, + "learning_rate": 0.001996316012597225, + "loss": 0.1689, + "step": 3738 + }, + { + "epoch": 0.03245631548337254, + "grad_norm": 0.1591796875, + "learning_rate": 0.001996313321955827, + "loss": 0.1738, + "step": 3739 + }, + { + "epoch": 0.03246499596357671, + "grad_norm": 0.275390625, + "learning_rate": 0.0019963106303342347, + "loss": 0.2119, + "step": 3740 + }, + { + "epoch": 0.03247367644378087, + "grad_norm": 0.13671875, + "learning_rate": 0.00199630793773245, + "loss": 0.2285, + "step": 3741 + }, + { + "epoch": 0.03248235692398504, + "grad_norm": 0.65234375, + "learning_rate": 0.0019963052441504766, + "loss": 0.2363, + "step": 3742 + }, + { + "epoch": 0.032491037404189196, + "grad_norm": 0.146484375, + "learning_rate": 0.0019963025495883173, + "loss": 0.1895, + "step": 3743 + }, + { + "epoch": 0.03249971788439336, + "grad_norm": 1.0546875, + "learning_rate": 0.001996299854045975, + "loss": 0.4746, + "step": 3744 + }, + { + "epoch": 0.03250839836459753, + "grad_norm": 0.185546875, + "learning_rate": 0.0019962971575234524, + "loss": 0.2285, + "step": 3745 + }, + { + "epoch": 0.03251707884480169, + "grad_norm": 0.130859375, + "learning_rate": 0.0019962944600207533, + "loss": 0.1973, + "step": 3746 + }, + { + "epoch": 0.03252575932500586, + "grad_norm": 0.166015625, + "learning_rate": 0.00199629176153788, + "loss": 0.1836, + "step": 3747 + }, + { + "epoch": 0.03253443980521002, + "grad_norm": 0.10009765625, + "learning_rate": 0.0019962890620748345, + "loss": 0.2002, + "step": 3748 + }, + { + "epoch": 0.03254312028541419, + "grad_norm": 0.345703125, + "learning_rate": 0.0019962863616316217, + "loss": 0.1533, + "step": 3749 + }, + { + "epoch": 0.03255180076561835, + "grad_norm": 0.1318359375, + "learning_rate": 0.0019962836602082433, + "loss": 0.1934, + "step": 3750 + }, + { + 
"epoch": 0.03256048124582252, + "grad_norm": 0.359375, + "learning_rate": 0.0019962809578047023, + "loss": 0.2285, + "step": 3751 + }, + { + "epoch": 0.03256916172602668, + "grad_norm": 0.11865234375, + "learning_rate": 0.0019962782544210023, + "loss": 0.1953, + "step": 3752 + }, + { + "epoch": 0.03257784220623085, + "grad_norm": 0.12890625, + "learning_rate": 0.0019962755500571457, + "loss": 0.1924, + "step": 3753 + }, + { + "epoch": 0.03258652268643501, + "grad_norm": 0.185546875, + "learning_rate": 0.0019962728447131357, + "loss": 0.166, + "step": 3754 + }, + { + "epoch": 0.03259520316663918, + "grad_norm": 0.1552734375, + "learning_rate": 0.0019962701383889753, + "loss": 0.1562, + "step": 3755 + }, + { + "epoch": 0.03260388364684334, + "grad_norm": 0.1015625, + "learning_rate": 0.001996267431084667, + "loss": 0.2207, + "step": 3756 + }, + { + "epoch": 0.03261256412704751, + "grad_norm": 0.9609375, + "learning_rate": 0.0019962647228002144, + "loss": 0.2324, + "step": 3757 + }, + { + "epoch": 0.03262124460725167, + "grad_norm": 0.09765625, + "learning_rate": 0.00199626201353562, + "loss": 0.1924, + "step": 3758 + }, + { + "epoch": 0.03262992508745584, + "grad_norm": 0.55859375, + "learning_rate": 0.001996259303290887, + "loss": 0.1797, + "step": 3759 + }, + { + "epoch": 0.03263860556766, + "grad_norm": 0.306640625, + "learning_rate": 0.0019962565920660187, + "loss": 0.1348, + "step": 3760 + }, + { + "epoch": 0.03264728604786417, + "grad_norm": 0.357421875, + "learning_rate": 0.001996253879861017, + "loss": 0.2344, + "step": 3761 + }, + { + "epoch": 0.032655966528068334, + "grad_norm": 0.095703125, + "learning_rate": 0.001996251166675886, + "loss": 0.2148, + "step": 3762 + }, + { + "epoch": 0.0326646470082725, + "grad_norm": 0.11328125, + "learning_rate": 0.001996248452510628, + "loss": 0.166, + "step": 3763 + }, + { + "epoch": 0.032673327488476664, + "grad_norm": 0.103515625, + "learning_rate": 0.001996245737365246, + "loss": 0.1943, + "step": 3764 + }, + { + "epoch": 0.03268200796868083, + "grad_norm": 0.361328125, + "learning_rate": 0.001996243021239743, + "loss": 0.1592, + "step": 3765 + }, + { + "epoch": 0.032690688448884994, + "grad_norm": 0.06640625, + "learning_rate": 0.0019962403041341222, + "loss": 0.1895, + "step": 3766 + }, + { + "epoch": 0.03269936892908916, + "grad_norm": 0.470703125, + "learning_rate": 0.001996237586048387, + "loss": 0.1836, + "step": 3767 + }, + { + "epoch": 0.032708049409293324, + "grad_norm": 0.2080078125, + "learning_rate": 0.001996234866982539, + "loss": 0.2002, + "step": 3768 + }, + { + "epoch": 0.03271672988949749, + "grad_norm": 0.166015625, + "learning_rate": 0.001996232146936583, + "loss": 0.209, + "step": 3769 + }, + { + "epoch": 0.032725410369701655, + "grad_norm": 0.22265625, + "learning_rate": 0.0019962294259105204, + "loss": 0.1855, + "step": 3770 + }, + { + "epoch": 0.03273409084990582, + "grad_norm": 0.328125, + "learning_rate": 0.0019962267039043543, + "loss": 0.1729, + "step": 3771 + }, + { + "epoch": 0.032742771330109985, + "grad_norm": 0.11181640625, + "learning_rate": 0.001996223980918089, + "loss": 0.1934, + "step": 3772 + }, + { + "epoch": 0.03275145181031415, + "grad_norm": 0.146484375, + "learning_rate": 0.0019962212569517262, + "loss": 0.2129, + "step": 3773 + }, + { + "epoch": 0.03276013229051831, + "grad_norm": 0.0751953125, + "learning_rate": 0.0019962185320052694, + "loss": 0.1602, + "step": 3774 + }, + { + "epoch": 0.03276881277072247, + "grad_norm": 0.3515625, + "learning_rate": 0.0019962158060787215, + "loss": 0.2285, + 
"step": 3775 + }, + { + "epoch": 0.03277749325092664, + "grad_norm": 0.111328125, + "learning_rate": 0.0019962130791720853, + "loss": 0.2227, + "step": 3776 + }, + { + "epoch": 0.032786173731130804, + "grad_norm": 0.0908203125, + "learning_rate": 0.001996210351285364, + "loss": 0.1699, + "step": 3777 + }, + { + "epoch": 0.03279485421133497, + "grad_norm": 0.08984375, + "learning_rate": 0.0019962076224185605, + "loss": 0.2119, + "step": 3778 + }, + { + "epoch": 0.032803534691539134, + "grad_norm": 0.2353515625, + "learning_rate": 0.001996204892571678, + "loss": 0.1738, + "step": 3779 + }, + { + "epoch": 0.0328122151717433, + "grad_norm": 0.126953125, + "learning_rate": 0.001996202161744719, + "loss": 0.1504, + "step": 3780 + }, + { + "epoch": 0.032820895651947464, + "grad_norm": 0.5234375, + "learning_rate": 0.001996199429937687, + "loss": 0.1445, + "step": 3781 + }, + { + "epoch": 0.03282957613215163, + "grad_norm": 0.60546875, + "learning_rate": 0.001996196697150585, + "loss": 0.207, + "step": 3782 + }, + { + "epoch": 0.032838256612355794, + "grad_norm": 0.408203125, + "learning_rate": 0.001996193963383416, + "loss": 0.209, + "step": 3783 + }, + { + "epoch": 0.03284693709255996, + "grad_norm": 0.0791015625, + "learning_rate": 0.001996191228636182, + "loss": 0.2227, + "step": 3784 + }, + { + "epoch": 0.032855617572764124, + "grad_norm": 0.396484375, + "learning_rate": 0.0019961884929088868, + "loss": 0.2324, + "step": 3785 + }, + { + "epoch": 0.03286429805296829, + "grad_norm": 0.283203125, + "learning_rate": 0.0019961857562015334, + "loss": 0.1846, + "step": 3786 + }, + { + "epoch": 0.032872978533172455, + "grad_norm": 0.33984375, + "learning_rate": 0.0019961830185141246, + "loss": 0.2246, + "step": 3787 + }, + { + "epoch": 0.03288165901337662, + "grad_norm": 0.0927734375, + "learning_rate": 0.001996180279846664, + "loss": 0.2031, + "step": 3788 + }, + { + "epoch": 0.032890339493580785, + "grad_norm": 0.375, + "learning_rate": 0.001996177540199154, + "loss": 0.1816, + "step": 3789 + }, + { + "epoch": 0.03289901997378495, + "grad_norm": 0.1650390625, + "learning_rate": 0.0019961747995715976, + "loss": 0.1953, + "step": 3790 + }, + { + "epoch": 0.032907700453989115, + "grad_norm": 0.29296875, + "learning_rate": 0.001996172057963998, + "loss": 0.2051, + "step": 3791 + }, + { + "epoch": 0.03291638093419328, + "grad_norm": 0.5234375, + "learning_rate": 0.001996169315376358, + "loss": 0.2051, + "step": 3792 + }, + { + "epoch": 0.032925061414397445, + "grad_norm": 0.375, + "learning_rate": 0.0019961665718086804, + "loss": 0.1973, + "step": 3793 + }, + { + "epoch": 0.03293374189460161, + "grad_norm": 0.0986328125, + "learning_rate": 0.001996163827260969, + "loss": 0.1963, + "step": 3794 + }, + { + "epoch": 0.032942422374805776, + "grad_norm": 0.30078125, + "learning_rate": 0.001996161081733226, + "loss": 0.168, + "step": 3795 + }, + { + "epoch": 0.03295110285500994, + "grad_norm": 0.08642578125, + "learning_rate": 0.0019961583352254546, + "loss": 0.1777, + "step": 3796 + }, + { + "epoch": 0.032959783335214106, + "grad_norm": 0.421875, + "learning_rate": 0.0019961555877376583, + "loss": 0.1807, + "step": 3797 + }, + { + "epoch": 0.03296846381541827, + "grad_norm": 0.111328125, + "learning_rate": 0.0019961528392698396, + "loss": 0.1533, + "step": 3798 + }, + { + "epoch": 0.032977144295622436, + "grad_norm": 0.296875, + "learning_rate": 0.0019961500898220013, + "loss": 0.2061, + "step": 3799 + }, + { + "epoch": 0.0329858247758266, + "grad_norm": 0.41015625, + "learning_rate": 0.001996147339394147, + 
"loss": 0.1885, + "step": 3800 + }, + { + "epoch": 0.032994505256030766, + "grad_norm": 0.62109375, + "learning_rate": 0.0019961445879862795, + "loss": 0.2285, + "step": 3801 + }, + { + "epoch": 0.03300318573623493, + "grad_norm": 0.625, + "learning_rate": 0.0019961418355984016, + "loss": 0.1973, + "step": 3802 + }, + { + "epoch": 0.0330118662164391, + "grad_norm": 0.259765625, + "learning_rate": 0.0019961390822305167, + "loss": 0.1611, + "step": 3803 + }, + { + "epoch": 0.03302054669664326, + "grad_norm": 0.10498046875, + "learning_rate": 0.001996136327882627, + "loss": 0.1777, + "step": 3804 + }, + { + "epoch": 0.03302922717684742, + "grad_norm": 0.51171875, + "learning_rate": 0.001996133572554737, + "loss": 0.1602, + "step": 3805 + }, + { + "epoch": 0.033037907657051585, + "grad_norm": 0.5078125, + "learning_rate": 0.0019961308162468475, + "loss": 0.3379, + "step": 3806 + }, + { + "epoch": 0.03304658813725575, + "grad_norm": 0.32421875, + "learning_rate": 0.001996128058958964, + "loss": 0.2559, + "step": 3807 + }, + { + "epoch": 0.033055268617459915, + "grad_norm": 0.79296875, + "learning_rate": 0.0019961253006910876, + "loss": 0.1621, + "step": 3808 + }, + { + "epoch": 0.03306394909766408, + "grad_norm": 0.205078125, + "learning_rate": 0.0019961225414432226, + "loss": 0.1621, + "step": 3809 + }, + { + "epoch": 0.033072629577868246, + "grad_norm": 0.119140625, + "learning_rate": 0.0019961197812153712, + "loss": 0.1758, + "step": 3810 + }, + { + "epoch": 0.03308131005807241, + "grad_norm": 0.48046875, + "learning_rate": 0.0019961170200075364, + "loss": 0.1729, + "step": 3811 + }, + { + "epoch": 0.033089990538276576, + "grad_norm": 0.65625, + "learning_rate": 0.0019961142578197215, + "loss": 0.2246, + "step": 3812 + }, + { + "epoch": 0.03309867101848074, + "grad_norm": 0.2373046875, + "learning_rate": 0.0019961114946519297, + "loss": 0.1543, + "step": 3813 + }, + { + "epoch": 0.033107351498684906, + "grad_norm": 0.2294921875, + "learning_rate": 0.001996108730504164, + "loss": 0.1641, + "step": 3814 + }, + { + "epoch": 0.03311603197888907, + "grad_norm": 0.275390625, + "learning_rate": 0.001996105965376427, + "loss": 0.1953, + "step": 3815 + }, + { + "epoch": 0.033124712459093236, + "grad_norm": 0.08935546875, + "learning_rate": 0.001996103199268722, + "loss": 0.1582, + "step": 3816 + }, + { + "epoch": 0.0331333929392974, + "grad_norm": 0.77734375, + "learning_rate": 0.0019961004321810524, + "loss": 0.1631, + "step": 3817 + }, + { + "epoch": 0.033142073419501566, + "grad_norm": 0.12353515625, + "learning_rate": 0.00199609766411342, + "loss": 0.1572, + "step": 3818 + }, + { + "epoch": 0.03315075389970573, + "grad_norm": 0.267578125, + "learning_rate": 0.0019960948950658295, + "loss": 0.1777, + "step": 3819 + }, + { + "epoch": 0.0331594343799099, + "grad_norm": 0.2578125, + "learning_rate": 0.0019960921250382828, + "loss": 0.168, + "step": 3820 + }, + { + "epoch": 0.03316811486011406, + "grad_norm": 0.216796875, + "learning_rate": 0.0019960893540307834, + "loss": 0.168, + "step": 3821 + }, + { + "epoch": 0.03317679534031823, + "grad_norm": 0.08740234375, + "learning_rate": 0.0019960865820433334, + "loss": 0.2129, + "step": 3822 + }, + { + "epoch": 0.03318547582052239, + "grad_norm": 0.25390625, + "learning_rate": 0.0019960838090759374, + "loss": 0.168, + "step": 3823 + }, + { + "epoch": 0.03319415630072656, + "grad_norm": 0.126953125, + "learning_rate": 0.0019960810351285973, + "loss": 0.2012, + "step": 3824 + }, + { + "epoch": 0.03320283678093072, + "grad_norm": 0.392578125, + 
"learning_rate": 0.0019960782602013163, + "loss": 0.1572, + "step": 3825 + }, + { + "epoch": 0.03321151726113489, + "grad_norm": 0.2001953125, + "learning_rate": 0.001996075484294098, + "loss": 0.1396, + "step": 3826 + }, + { + "epoch": 0.03322019774133905, + "grad_norm": 0.248046875, + "learning_rate": 0.0019960727074069444, + "loss": 0.1426, + "step": 3827 + }, + { + "epoch": 0.03322887822154322, + "grad_norm": 0.1669921875, + "learning_rate": 0.0019960699295398596, + "loss": 0.1768, + "step": 3828 + }, + { + "epoch": 0.03323755870174738, + "grad_norm": 0.341796875, + "learning_rate": 0.001996067150692846, + "loss": 0.2148, + "step": 3829 + }, + { + "epoch": 0.03324623918195155, + "grad_norm": 0.259765625, + "learning_rate": 0.001996064370865907, + "loss": 0.2539, + "step": 3830 + }, + { + "epoch": 0.03325491966215571, + "grad_norm": 0.12109375, + "learning_rate": 0.0019960615900590454, + "loss": 0.1699, + "step": 3831 + }, + { + "epoch": 0.03326360014235988, + "grad_norm": 0.255859375, + "learning_rate": 0.001996058808272264, + "loss": 0.1328, + "step": 3832 + }, + { + "epoch": 0.03327228062256404, + "grad_norm": 0.400390625, + "learning_rate": 0.001996056025505567, + "loss": 0.1406, + "step": 3833 + }, + { + "epoch": 0.03328096110276821, + "grad_norm": 0.29296875, + "learning_rate": 0.001996053241758956, + "loss": 0.1855, + "step": 3834 + }, + { + "epoch": 0.03328964158297237, + "grad_norm": 0.56640625, + "learning_rate": 0.0019960504570324345, + "loss": 0.1719, + "step": 3835 + }, + { + "epoch": 0.03329832206317653, + "grad_norm": 0.4609375, + "learning_rate": 0.0019960476713260056, + "loss": 0.1963, + "step": 3836 + }, + { + "epoch": 0.0333070025433807, + "grad_norm": 0.314453125, + "learning_rate": 0.001996044884639673, + "loss": 0.2305, + "step": 3837 + }, + { + "epoch": 0.03331568302358486, + "grad_norm": 0.1220703125, + "learning_rate": 0.0019960420969734383, + "loss": 0.1543, + "step": 3838 + }, + { + "epoch": 0.03332436350378903, + "grad_norm": 0.12890625, + "learning_rate": 0.001996039308327306, + "loss": 0.1816, + "step": 3839 + }, + { + "epoch": 0.03333304398399319, + "grad_norm": 0.24609375, + "learning_rate": 0.0019960365187012786, + "loss": 0.1738, + "step": 3840 + }, + { + "epoch": 0.03334172446419736, + "grad_norm": 0.12255859375, + "learning_rate": 0.0019960337280953595, + "loss": 0.1885, + "step": 3841 + }, + { + "epoch": 0.03335040494440152, + "grad_norm": 0.12060546875, + "learning_rate": 0.0019960309365095507, + "loss": 0.166, + "step": 3842 + }, + { + "epoch": 0.03335908542460569, + "grad_norm": 0.08203125, + "learning_rate": 0.0019960281439438564, + "loss": 0.1641, + "step": 3843 + }, + { + "epoch": 0.03336776590480985, + "grad_norm": 0.1396484375, + "learning_rate": 0.0019960253503982788, + "loss": 0.2324, + "step": 3844 + }, + { + "epoch": 0.03337644638501402, + "grad_norm": 0.58984375, + "learning_rate": 0.001996022555872822, + "loss": 0.2344, + "step": 3845 + }, + { + "epoch": 0.03338512686521818, + "grad_norm": 0.30859375, + "learning_rate": 0.0019960197603674876, + "loss": 0.1846, + "step": 3846 + }, + { + "epoch": 0.03339380734542235, + "grad_norm": 0.48828125, + "learning_rate": 0.0019960169638822797, + "loss": 0.1816, + "step": 3847 + }, + { + "epoch": 0.03340248782562651, + "grad_norm": 0.056640625, + "learning_rate": 0.001996014166417201, + "loss": 0.127, + "step": 3848 + }, + { + "epoch": 0.03341116830583068, + "grad_norm": 0.2119140625, + "learning_rate": 0.001996011367972255, + "loss": 0.1553, + "step": 3849 + }, + { + "epoch": 0.03341984878603484, 
+ "grad_norm": 0.453125, + "learning_rate": 0.0019960085685474444, + "loss": 0.1592, + "step": 3850 + }, + { + "epoch": 0.03342852926623901, + "grad_norm": 0.1884765625, + "learning_rate": 0.001996005768142772, + "loss": 0.1729, + "step": 3851 + }, + { + "epoch": 0.033437209746443174, + "grad_norm": 0.16015625, + "learning_rate": 0.0019960029667582414, + "loss": 0.1572, + "step": 3852 + }, + { + "epoch": 0.03344589022664734, + "grad_norm": 0.291015625, + "learning_rate": 0.0019960001643938552, + "loss": 0.3203, + "step": 3853 + }, + { + "epoch": 0.033454570706851504, + "grad_norm": 0.263671875, + "learning_rate": 0.0019959973610496166, + "loss": 0.1572, + "step": 3854 + }, + { + "epoch": 0.03346325118705567, + "grad_norm": 0.1748046875, + "learning_rate": 0.0019959945567255295, + "loss": 0.25, + "step": 3855 + }, + { + "epoch": 0.033471931667259834, + "grad_norm": 0.28125, + "learning_rate": 0.0019959917514215954, + "loss": 0.1523, + "step": 3856 + }, + { + "epoch": 0.033480612147464, + "grad_norm": 0.234375, + "learning_rate": 0.0019959889451378185, + "loss": 0.1377, + "step": 3857 + }, + { + "epoch": 0.033489292627668164, + "grad_norm": 0.109375, + "learning_rate": 0.0019959861378742016, + "loss": 0.1836, + "step": 3858 + }, + { + "epoch": 0.03349797310787233, + "grad_norm": 0.30859375, + "learning_rate": 0.001995983329630748, + "loss": 0.1914, + "step": 3859 + }, + { + "epoch": 0.033506653588076495, + "grad_norm": 0.216796875, + "learning_rate": 0.00199598052040746, + "loss": 0.1699, + "step": 3860 + }, + { + "epoch": 0.03351533406828066, + "grad_norm": 0.11083984375, + "learning_rate": 0.001995977710204341, + "loss": 0.1904, + "step": 3861 + }, + { + "epoch": 0.033524014548484825, + "grad_norm": 0.60546875, + "learning_rate": 0.0019959748990213944, + "loss": 0.1963, + "step": 3862 + }, + { + "epoch": 0.03353269502868899, + "grad_norm": 0.138671875, + "learning_rate": 0.0019959720868586235, + "loss": 0.1562, + "step": 3863 + }, + { + "epoch": 0.033541375508893155, + "grad_norm": 0.2236328125, + "learning_rate": 0.0019959692737160305, + "loss": 0.165, + "step": 3864 + }, + { + "epoch": 0.03355005598909732, + "grad_norm": 0.4765625, + "learning_rate": 0.0019959664595936193, + "loss": 0.1875, + "step": 3865 + }, + { + "epoch": 0.03355873646930148, + "grad_norm": 0.458984375, + "learning_rate": 0.001995963644491393, + "loss": 0.2402, + "step": 3866 + }, + { + "epoch": 0.033567416949505643, + "grad_norm": 0.1484375, + "learning_rate": 0.0019959608284093534, + "loss": 0.168, + "step": 3867 + }, + { + "epoch": 0.03357609742970981, + "grad_norm": 0.6484375, + "learning_rate": 0.001995958011347505, + "loss": 0.1797, + "step": 3868 + }, + { + "epoch": 0.033584777909913974, + "grad_norm": 0.24609375, + "learning_rate": 0.0019959551933058503, + "loss": 0.2168, + "step": 3869 + }, + { + "epoch": 0.03359345839011814, + "grad_norm": 0.08642578125, + "learning_rate": 0.0019959523742843926, + "loss": 0.1318, + "step": 3870 + }, + { + "epoch": 0.033602138870322304, + "grad_norm": 0.07080078125, + "learning_rate": 0.001995949554283135, + "loss": 0.1504, + "step": 3871 + }, + { + "epoch": 0.03361081935052647, + "grad_norm": 0.3203125, + "learning_rate": 0.00199594673330208, + "loss": 0.1719, + "step": 3872 + }, + { + "epoch": 0.033619499830730634, + "grad_norm": 0.349609375, + "learning_rate": 0.0019959439113412312, + "loss": 0.21, + "step": 3873 + }, + { + "epoch": 0.0336281803109348, + "grad_norm": 0.29296875, + "learning_rate": 0.0019959410884005917, + "loss": 0.1973, + "step": 3874 + }, + { + "epoch": 
0.033636860791138964, + "grad_norm": 0.2119140625, + "learning_rate": 0.0019959382644801644, + "loss": 0.1562, + "step": 3875 + }, + { + "epoch": 0.03364554127134313, + "grad_norm": 0.10693359375, + "learning_rate": 0.0019959354395799526, + "loss": 0.168, + "step": 3876 + }, + { + "epoch": 0.033654221751547295, + "grad_norm": 0.232421875, + "learning_rate": 0.001995932613699959, + "loss": 0.1855, + "step": 3877 + }, + { + "epoch": 0.03366290223175146, + "grad_norm": 0.3359375, + "learning_rate": 0.0019959297868401876, + "loss": 0.1875, + "step": 3878 + }, + { + "epoch": 0.033671582711955625, + "grad_norm": 0.35546875, + "learning_rate": 0.00199592695900064, + "loss": 0.2031, + "step": 3879 + }, + { + "epoch": 0.03368026319215979, + "grad_norm": 0.10546875, + "learning_rate": 0.001995924130181321, + "loss": 0.1836, + "step": 3880 + }, + { + "epoch": 0.033688943672363955, + "grad_norm": 0.30078125, + "learning_rate": 0.001995921300382232, + "loss": 0.167, + "step": 3881 + }, + { + "epoch": 0.03369762415256812, + "grad_norm": 0.60546875, + "learning_rate": 0.0019959184696033776, + "loss": 0.2539, + "step": 3882 + }, + { + "epoch": 0.033706304632772285, + "grad_norm": 0.37890625, + "learning_rate": 0.0019959156378447597, + "loss": 0.1719, + "step": 3883 + }, + { + "epoch": 0.03371498511297645, + "grad_norm": 0.1728515625, + "learning_rate": 0.0019959128051063825, + "loss": 0.1807, + "step": 3884 + }, + { + "epoch": 0.033723665593180616, + "grad_norm": 0.10888671875, + "learning_rate": 0.001995909971388248, + "loss": 0.1719, + "step": 3885 + }, + { + "epoch": 0.03373234607338478, + "grad_norm": 0.58203125, + "learning_rate": 0.00199590713669036, + "loss": 0.1367, + "step": 3886 + }, + { + "epoch": 0.033741026553588946, + "grad_norm": 0.396484375, + "learning_rate": 0.0019959043010127214, + "loss": 0.1836, + "step": 3887 + }, + { + "epoch": 0.03374970703379311, + "grad_norm": 0.408203125, + "learning_rate": 0.0019959014643553354, + "loss": 0.1602, + "step": 3888 + }, + { + "epoch": 0.033758387513997276, + "grad_norm": 0.171875, + "learning_rate": 0.001995898626718205, + "loss": 0.1641, + "step": 3889 + }, + { + "epoch": 0.03376706799420144, + "grad_norm": 0.1455078125, + "learning_rate": 0.001995895788101333, + "loss": 0.1855, + "step": 3890 + }, + { + "epoch": 0.033775748474405606, + "grad_norm": 0.57421875, + "learning_rate": 0.001995892948504723, + "loss": 0.1426, + "step": 3891 + }, + { + "epoch": 0.03378442895460977, + "grad_norm": 0.6328125, + "learning_rate": 0.001995890107928378, + "loss": 0.1934, + "step": 3892 + }, + { + "epoch": 0.03379310943481394, + "grad_norm": 0.53515625, + "learning_rate": 0.0019958872663723014, + "loss": 0.2031, + "step": 3893 + }, + { + "epoch": 0.0338017899150181, + "grad_norm": 0.1513671875, + "learning_rate": 0.0019958844238364957, + "loss": 0.1992, + "step": 3894 + }, + { + "epoch": 0.03381047039522227, + "grad_norm": 0.4296875, + "learning_rate": 0.001995881580320964, + "loss": 0.1738, + "step": 3895 + }, + { + "epoch": 0.03381915087542643, + "grad_norm": 0.09912109375, + "learning_rate": 0.00199587873582571, + "loss": 0.1904, + "step": 3896 + }, + { + "epoch": 0.03382783135563059, + "grad_norm": 0.5859375, + "learning_rate": 0.001995875890350736, + "loss": 0.1777, + "step": 3897 + }, + { + "epoch": 0.033836511835834755, + "grad_norm": 0.12451171875, + "learning_rate": 0.001995873043896046, + "loss": 0.1611, + "step": 3898 + }, + { + "epoch": 0.03384519231603892, + "grad_norm": 0.55078125, + "learning_rate": 0.001995870196461643, + "loss": 0.168, + "step": 
3899 + }, + { + "epoch": 0.033853872796243085, + "grad_norm": 0.462890625, + "learning_rate": 0.0019958673480475293, + "loss": 0.1846, + "step": 3900 + }, + { + "epoch": 0.03386255327644725, + "grad_norm": 0.12255859375, + "learning_rate": 0.0019958644986537086, + "loss": 0.1611, + "step": 3901 + }, + { + "epoch": 0.033871233756651416, + "grad_norm": 0.21875, + "learning_rate": 0.0019958616482801837, + "loss": 0.2266, + "step": 3902 + }, + { + "epoch": 0.03387991423685558, + "grad_norm": 0.1044921875, + "learning_rate": 0.0019958587969269585, + "loss": 0.1387, + "step": 3903 + }, + { + "epoch": 0.033888594717059746, + "grad_norm": 2.484375, + "learning_rate": 0.001995855944594035, + "loss": 0.457, + "step": 3904 + }, + { + "epoch": 0.03389727519726391, + "grad_norm": 0.07275390625, + "learning_rate": 0.0019958530912814174, + "loss": 0.1553, + "step": 3905 + }, + { + "epoch": 0.033905955677468076, + "grad_norm": 0.197265625, + "learning_rate": 0.001995850236989108, + "loss": 0.1504, + "step": 3906 + }, + { + "epoch": 0.03391463615767224, + "grad_norm": 0.09814453125, + "learning_rate": 0.0019958473817171104, + "loss": 0.1797, + "step": 3907 + }, + { + "epoch": 0.033923316637876406, + "grad_norm": 1.1484375, + "learning_rate": 0.0019958445254654278, + "loss": 0.5625, + "step": 3908 + }, + { + "epoch": 0.03393199711808057, + "grad_norm": 0.50390625, + "learning_rate": 0.0019958416682340626, + "loss": 0.1885, + "step": 3909 + }, + { + "epoch": 0.03394067759828474, + "grad_norm": 0.1962890625, + "learning_rate": 0.0019958388100230185, + "loss": 0.1641, + "step": 3910 + }, + { + "epoch": 0.0339493580784889, + "grad_norm": 0.25390625, + "learning_rate": 0.001995835950832299, + "loss": 0.1738, + "step": 3911 + }, + { + "epoch": 0.03395803855869307, + "grad_norm": 0.1611328125, + "learning_rate": 0.001995833090661906, + "loss": 0.1318, + "step": 3912 + }, + { + "epoch": 0.03396671903889723, + "grad_norm": 0.4375, + "learning_rate": 0.001995830229511844, + "loss": 0.2266, + "step": 3913 + }, + { + "epoch": 0.0339753995191014, + "grad_norm": 0.091796875, + "learning_rate": 0.0019958273673821155, + "loss": 0.2119, + "step": 3914 + }, + { + "epoch": 0.03398407999930556, + "grad_norm": 0.75390625, + "learning_rate": 0.0019958245042727236, + "loss": 0.2578, + "step": 3915 + }, + { + "epoch": 0.03399276047950973, + "grad_norm": 0.384765625, + "learning_rate": 0.0019958216401836713, + "loss": 0.3477, + "step": 3916 + }, + { + "epoch": 0.03400144095971389, + "grad_norm": 0.12060546875, + "learning_rate": 0.001995818775114962, + "loss": 0.166, + "step": 3917 + }, + { + "epoch": 0.03401012143991806, + "grad_norm": 0.359375, + "learning_rate": 0.0019958159090665986, + "loss": 0.2031, + "step": 3918 + }, + { + "epoch": 0.03401880192012222, + "grad_norm": 0.1083984375, + "learning_rate": 0.0019958130420385848, + "loss": 0.2461, + "step": 3919 + }, + { + "epoch": 0.03402748240032639, + "grad_norm": 0.10107421875, + "learning_rate": 0.001995810174030923, + "loss": 0.1689, + "step": 3920 + }, + { + "epoch": 0.03403616288053055, + "grad_norm": 0.7578125, + "learning_rate": 0.0019958073050436167, + "loss": 0.2402, + "step": 3921 + }, + { + "epoch": 0.03404484336073472, + "grad_norm": 0.2431640625, + "learning_rate": 0.001995804435076669, + "loss": 0.165, + "step": 3922 + }, + { + "epoch": 0.03405352384093888, + "grad_norm": 0.1904296875, + "learning_rate": 0.001995801564130083, + "loss": 0.1797, + "step": 3923 + }, + { + "epoch": 0.03406220432114305, + "grad_norm": 0.671875, + "learning_rate": 0.001995798692203862, + 
"loss": 0.2305, + "step": 3924 + }, + { + "epoch": 0.034070884801347213, + "grad_norm": 0.373046875, + "learning_rate": 0.0019957958192980086, + "loss": 0.1992, + "step": 3925 + }, + { + "epoch": 0.03407956528155138, + "grad_norm": 0.3203125, + "learning_rate": 0.001995792945412527, + "loss": 0.2129, + "step": 3926 + }, + { + "epoch": 0.034088245761755544, + "grad_norm": 0.50390625, + "learning_rate": 0.0019957900705474194, + "loss": 0.21, + "step": 3927 + }, + { + "epoch": 0.0340969262419597, + "grad_norm": 0.2734375, + "learning_rate": 0.001995787194702689, + "loss": 0.1553, + "step": 3928 + }, + { + "epoch": 0.03410560672216387, + "grad_norm": 0.92578125, + "learning_rate": 0.0019957843178783395, + "loss": 0.4922, + "step": 3929 + }, + { + "epoch": 0.03411428720236803, + "grad_norm": 0.439453125, + "learning_rate": 0.0019957814400743735, + "loss": 0.1816, + "step": 3930 + }, + { + "epoch": 0.0341229676825722, + "grad_norm": 0.2412109375, + "learning_rate": 0.001995778561290795, + "loss": 0.2246, + "step": 3931 + }, + { + "epoch": 0.03413164816277636, + "grad_norm": 0.1904296875, + "learning_rate": 0.0019957756815276056, + "loss": 0.1738, + "step": 3932 + }, + { + "epoch": 0.03414032864298053, + "grad_norm": 0.11083984375, + "learning_rate": 0.0019957728007848094, + "loss": 0.2012, + "step": 3933 + }, + { + "epoch": 0.03414900912318469, + "grad_norm": 0.16796875, + "learning_rate": 0.00199576991906241, + "loss": 0.1631, + "step": 3934 + }, + { + "epoch": 0.03415768960338886, + "grad_norm": 0.1435546875, + "learning_rate": 0.00199576703636041, + "loss": 0.1572, + "step": 3935 + }, + { + "epoch": 0.03416637008359302, + "grad_norm": 0.1875, + "learning_rate": 0.0019957641526788124, + "loss": 0.2207, + "step": 3936 + }, + { + "epoch": 0.03417505056379719, + "grad_norm": 0.08251953125, + "learning_rate": 0.001995761268017621, + "loss": 0.1953, + "step": 3937 + }, + { + "epoch": 0.03418373104400135, + "grad_norm": 0.140625, + "learning_rate": 0.0019957583823768383, + "loss": 0.25, + "step": 3938 + }, + { + "epoch": 0.03419241152420552, + "grad_norm": 0.431640625, + "learning_rate": 0.0019957554957564677, + "loss": 0.1641, + "step": 3939 + }, + { + "epoch": 0.03420109200440968, + "grad_norm": 0.49609375, + "learning_rate": 0.0019957526081565125, + "loss": 0.2168, + "step": 3940 + }, + { + "epoch": 0.03420977248461385, + "grad_norm": 0.1376953125, + "learning_rate": 0.0019957497195769755, + "loss": 0.1777, + "step": 3941 + }, + { + "epoch": 0.034218452964818014, + "grad_norm": 0.482421875, + "learning_rate": 0.0019957468300178605, + "loss": 0.2383, + "step": 3942 + }, + { + "epoch": 0.03422713344502218, + "grad_norm": 0.279296875, + "learning_rate": 0.0019957439394791693, + "loss": 0.1719, + "step": 3943 + }, + { + "epoch": 0.034235813925226344, + "grad_norm": 0.359375, + "learning_rate": 0.001995741047960907, + "loss": 0.1484, + "step": 3944 + }, + { + "epoch": 0.03424449440543051, + "grad_norm": 0.326171875, + "learning_rate": 0.001995738155463075, + "loss": 0.1729, + "step": 3945 + }, + { + "epoch": 0.034253174885634674, + "grad_norm": 0.232421875, + "learning_rate": 0.0019957352619856777, + "loss": 0.2422, + "step": 3946 + }, + { + "epoch": 0.03426185536583884, + "grad_norm": 0.08203125, + "learning_rate": 0.0019957323675287176, + "loss": 0.1504, + "step": 3947 + }, + { + "epoch": 0.034270535846043004, + "grad_norm": 0.150390625, + "learning_rate": 0.0019957294720921977, + "loss": 0.1621, + "step": 3948 + }, + { + "epoch": 0.03427921632624717, + "grad_norm": 0.66015625, + "learning_rate": 
0.001995726575676122, + "loss": 0.167, + "step": 3949 + }, + { + "epoch": 0.034287896806451335, + "grad_norm": 0.6875, + "learning_rate": 0.001995723678280493, + "loss": 0.1992, + "step": 3950 + }, + { + "epoch": 0.0342965772866555, + "grad_norm": 0.1875, + "learning_rate": 0.001995720779905314, + "loss": 0.1963, + "step": 3951 + }, + { + "epoch": 0.034305257766859665, + "grad_norm": 1.6171875, + "learning_rate": 0.0019957178805505883, + "loss": 0.2852, + "step": 3952 + }, + { + "epoch": 0.03431393824706383, + "grad_norm": 0.5234375, + "learning_rate": 0.0019957149802163187, + "loss": 0.2402, + "step": 3953 + }, + { + "epoch": 0.034322618727267995, + "grad_norm": 0.30078125, + "learning_rate": 0.001995712078902509, + "loss": 0.1709, + "step": 3954 + }, + { + "epoch": 0.03433129920747216, + "grad_norm": 0.1376953125, + "learning_rate": 0.0019957091766091618, + "loss": 0.1445, + "step": 3955 + }, + { + "epoch": 0.034339979687676325, + "grad_norm": 0.7421875, + "learning_rate": 0.0019957062733362806, + "loss": 0.1748, + "step": 3956 + }, + { + "epoch": 0.03434866016788049, + "grad_norm": 0.796875, + "learning_rate": 0.0019957033690838686, + "loss": 0.209, + "step": 3957 + }, + { + "epoch": 0.034357340648084655, + "grad_norm": 0.443359375, + "learning_rate": 0.001995700463851929, + "loss": 0.1914, + "step": 3958 + }, + { + "epoch": 0.034366021128288814, + "grad_norm": 0.404296875, + "learning_rate": 0.0019956975576404645, + "loss": 0.1816, + "step": 3959 + }, + { + "epoch": 0.03437470160849298, + "grad_norm": 0.09619140625, + "learning_rate": 0.0019956946504494787, + "loss": 0.1992, + "step": 3960 + }, + { + "epoch": 0.034383382088697144, + "grad_norm": 0.240234375, + "learning_rate": 0.001995691742278975, + "loss": 0.1963, + "step": 3961 + }, + { + "epoch": 0.03439206256890131, + "grad_norm": 0.10986328125, + "learning_rate": 0.0019956888331289554, + "loss": 0.1621, + "step": 3962 + }, + { + "epoch": 0.034400743049105474, + "grad_norm": 0.1533203125, + "learning_rate": 0.001995685922999425, + "loss": 0.165, + "step": 3963 + }, + { + "epoch": 0.03440942352930964, + "grad_norm": 0.1103515625, + "learning_rate": 0.001995683011890385, + "loss": 0.1699, + "step": 3964 + }, + { + "epoch": 0.034418104009513804, + "grad_norm": 0.38671875, + "learning_rate": 0.00199568009980184, + "loss": 0.207, + "step": 3965 + }, + { + "epoch": 0.03442678448971797, + "grad_norm": 0.197265625, + "learning_rate": 0.0019956771867337924, + "loss": 0.2012, + "step": 3966 + }, + { + "epoch": 0.034435464969922135, + "grad_norm": 0.1474609375, + "learning_rate": 0.001995674272686246, + "loss": 0.2246, + "step": 3967 + }, + { + "epoch": 0.0344441454501263, + "grad_norm": 0.5546875, + "learning_rate": 0.001995671357659204, + "loss": 0.1455, + "step": 3968 + }, + { + "epoch": 0.034452825930330465, + "grad_norm": 0.37890625, + "learning_rate": 0.001995668441652669, + "loss": 0.1797, + "step": 3969 + }, + { + "epoch": 0.03446150641053463, + "grad_norm": 0.337890625, + "learning_rate": 0.001995665524666644, + "loss": 0.1738, + "step": 3970 + }, + { + "epoch": 0.034470186890738795, + "grad_norm": 0.6171875, + "learning_rate": 0.001995662606701133, + "loss": 0.1982, + "step": 3971 + }, + { + "epoch": 0.03447886737094296, + "grad_norm": 0.64453125, + "learning_rate": 0.001995659687756139, + "loss": 0.248, + "step": 3972 + }, + { + "epoch": 0.034487547851147125, + "grad_norm": 0.267578125, + "learning_rate": 0.0019956567678316647, + "loss": 0.1992, + "step": 3973 + }, + { + "epoch": 0.03449622833135129, + "grad_norm": 0.08935546875, 
+ "learning_rate": 0.0019956538469277137, + "loss": 0.1475, + "step": 3974 + }, + { + "epoch": 0.034504908811555456, + "grad_norm": 0.4921875, + "learning_rate": 0.0019956509250442893, + "loss": 0.1543, + "step": 3975 + }, + { + "epoch": 0.03451358929175962, + "grad_norm": 0.16796875, + "learning_rate": 0.001995648002181394, + "loss": 0.1533, + "step": 3976 + }, + { + "epoch": 0.034522269771963786, + "grad_norm": 0.3359375, + "learning_rate": 0.0019956450783390318, + "loss": 0.1963, + "step": 3977 + }, + { + "epoch": 0.03453095025216795, + "grad_norm": 0.396484375, + "learning_rate": 0.0019956421535172056, + "loss": 0.1797, + "step": 3978 + }, + { + "epoch": 0.034539630732372116, + "grad_norm": 0.6875, + "learning_rate": 0.0019956392277159186, + "loss": 0.207, + "step": 3979 + }, + { + "epoch": 0.03454831121257628, + "grad_norm": 0.55859375, + "learning_rate": 0.001995636300935174, + "loss": 0.2109, + "step": 3980 + }, + { + "epoch": 0.034556991692780446, + "grad_norm": 0.12109375, + "learning_rate": 0.0019956333731749754, + "loss": 0.1621, + "step": 3981 + }, + { + "epoch": 0.03456567217298461, + "grad_norm": 0.09521484375, + "learning_rate": 0.001995630444435325, + "loss": 0.1484, + "step": 3982 + }, + { + "epoch": 0.03457435265318878, + "grad_norm": 0.234375, + "learning_rate": 0.001995627514716227, + "loss": 0.2227, + "step": 3983 + }, + { + "epoch": 0.03458303313339294, + "grad_norm": 0.1162109375, + "learning_rate": 0.001995624584017684, + "loss": 0.2197, + "step": 3984 + }, + { + "epoch": 0.03459171361359711, + "grad_norm": 0.287109375, + "learning_rate": 0.0019956216523396996, + "loss": 0.2012, + "step": 3985 + }, + { + "epoch": 0.03460039409380127, + "grad_norm": 0.07421875, + "learning_rate": 0.0019956187196822767, + "loss": 0.1562, + "step": 3986 + }, + { + "epoch": 0.03460907457400544, + "grad_norm": 0.1044921875, + "learning_rate": 0.0019956157860454185, + "loss": 0.1729, + "step": 3987 + }, + { + "epoch": 0.0346177550542096, + "grad_norm": 0.408203125, + "learning_rate": 0.0019956128514291285, + "loss": 0.1543, + "step": 3988 + }, + { + "epoch": 0.03462643553441376, + "grad_norm": 0.1630859375, + "learning_rate": 0.0019956099158334097, + "loss": 0.1387, + "step": 3989 + }, + { + "epoch": 0.034635116014617925, + "grad_norm": 0.1708984375, + "learning_rate": 0.0019956069792582652, + "loss": 0.1621, + "step": 3990 + }, + { + "epoch": 0.03464379649482209, + "grad_norm": 0.11865234375, + "learning_rate": 0.0019956040417036984, + "loss": 0.1455, + "step": 3991 + }, + { + "epoch": 0.034652476975026256, + "grad_norm": 0.33984375, + "learning_rate": 0.001995601103169713, + "loss": 0.2129, + "step": 3992 + }, + { + "epoch": 0.03466115745523042, + "grad_norm": 0.33984375, + "learning_rate": 0.001995598163656311, + "loss": 0.1885, + "step": 3993 + }, + { + "epoch": 0.034669837935434586, + "grad_norm": 1.7109375, + "learning_rate": 0.0019955952231634965, + "loss": 0.3242, + "step": 3994 + }, + { + "epoch": 0.03467851841563875, + "grad_norm": 0.37109375, + "learning_rate": 0.0019955922816912728, + "loss": 0.2441, + "step": 3995 + }, + { + "epoch": 0.034687198895842916, + "grad_norm": 0.1708984375, + "learning_rate": 0.0019955893392396423, + "loss": 0.2012, + "step": 3996 + }, + { + "epoch": 0.03469587937604708, + "grad_norm": 0.1611328125, + "learning_rate": 0.0019955863958086096, + "loss": 0.1641, + "step": 3997 + }, + { + "epoch": 0.034704559856251246, + "grad_norm": 0.13671875, + "learning_rate": 0.001995583451398176, + "loss": 0.1719, + "step": 3998 + }, + { + "epoch": 
0.03471324033645541, + "grad_norm": 0.38671875, + "learning_rate": 0.0019955805060083466, + "loss": 0.1748, + "step": 3999 + }, + { + "epoch": 0.03472192081665958, + "grad_norm": 0.73828125, + "learning_rate": 0.0019955775596391234, + "loss": 0.1484, + "step": 4000 + }, + { + "epoch": 0.03473060129686374, + "grad_norm": 1.1953125, + "learning_rate": 0.0019955746122905104, + "loss": 0.4453, + "step": 4001 + }, + { + "epoch": 0.03473928177706791, + "grad_norm": 0.255859375, + "learning_rate": 0.00199557166396251, + "loss": 0.2109, + "step": 4002 + }, + { + "epoch": 0.03474796225727207, + "grad_norm": 0.1923828125, + "learning_rate": 0.0019955687146551264, + "loss": 0.2441, + "step": 4003 + }, + { + "epoch": 0.03475664273747624, + "grad_norm": 0.62109375, + "learning_rate": 0.001995565764368362, + "loss": 0.2324, + "step": 4004 + }, + { + "epoch": 0.0347653232176804, + "grad_norm": 0.2265625, + "learning_rate": 0.0019955628131022203, + "loss": 0.1318, + "step": 4005 + }, + { + "epoch": 0.03477400369788457, + "grad_norm": 0.11962890625, + "learning_rate": 0.0019955598608567045, + "loss": 0.1758, + "step": 4006 + }, + { + "epoch": 0.03478268417808873, + "grad_norm": 0.08203125, + "learning_rate": 0.001995556907631818, + "loss": 0.1572, + "step": 4007 + }, + { + "epoch": 0.0347913646582929, + "grad_norm": 0.357421875, + "learning_rate": 0.001995553953427564, + "loss": 0.2021, + "step": 4008 + }, + { + "epoch": 0.03480004513849706, + "grad_norm": 0.142578125, + "learning_rate": 0.001995550998243946, + "loss": 0.2031, + "step": 4009 + }, + { + "epoch": 0.03480872561870123, + "grad_norm": 0.12060546875, + "learning_rate": 0.0019955480420809665, + "loss": 0.1729, + "step": 4010 + }, + { + "epoch": 0.03481740609890539, + "grad_norm": 0.294921875, + "learning_rate": 0.001995545084938629, + "loss": 0.1357, + "step": 4011 + }, + { + "epoch": 0.03482608657910956, + "grad_norm": 0.291015625, + "learning_rate": 0.001995542126816937, + "loss": 0.1582, + "step": 4012 + }, + { + "epoch": 0.03483476705931372, + "grad_norm": 0.4765625, + "learning_rate": 0.0019955391677158934, + "loss": 0.1553, + "step": 4013 + }, + { + "epoch": 0.03484344753951789, + "grad_norm": 0.1259765625, + "learning_rate": 0.0019955362076355017, + "loss": 0.1523, + "step": 4014 + }, + { + "epoch": 0.03485212801972205, + "grad_norm": 0.236328125, + "learning_rate": 0.0019955332465757653, + "loss": 0.1396, + "step": 4015 + }, + { + "epoch": 0.03486080849992622, + "grad_norm": 0.3125, + "learning_rate": 0.001995530284536687, + "loss": 0.1855, + "step": 4016 + }, + { + "epoch": 0.034869488980130384, + "grad_norm": 0.259765625, + "learning_rate": 0.0019955273215182703, + "loss": 0.1543, + "step": 4017 + }, + { + "epoch": 0.03487816946033455, + "grad_norm": 0.1044921875, + "learning_rate": 0.0019955243575205182, + "loss": 0.1436, + "step": 4018 + }, + { + "epoch": 0.034886849940538714, + "grad_norm": 0.1640625, + "learning_rate": 0.0019955213925434345, + "loss": 0.1553, + "step": 4019 + }, + { + "epoch": 0.03489553042074287, + "grad_norm": 0.58203125, + "learning_rate": 0.0019955184265870217, + "loss": 0.1895, + "step": 4020 + }, + { + "epoch": 0.03490421090094704, + "grad_norm": 0.31640625, + "learning_rate": 0.0019955154596512834, + "loss": 0.165, + "step": 4021 + }, + { + "epoch": 0.0349128913811512, + "grad_norm": 0.376953125, + "learning_rate": 0.001995512491736223, + "loss": 0.1533, + "step": 4022 + }, + { + "epoch": 0.03492157186135537, + "grad_norm": 0.330078125, + "learning_rate": 0.0019955095228418435, + "loss": 0.2246, + "step": 
4023 + }, + { + "epoch": 0.03493025234155953, + "grad_norm": 0.23828125, + "learning_rate": 0.001995506552968148, + "loss": 0.1299, + "step": 4024 + }, + { + "epoch": 0.0349389328217637, + "grad_norm": 0.09912109375, + "learning_rate": 0.0019955035821151404, + "loss": 0.1357, + "step": 4025 + }, + { + "epoch": 0.03494761330196786, + "grad_norm": 0.62890625, + "learning_rate": 0.0019955006102828237, + "loss": 0.1699, + "step": 4026 + }, + { + "epoch": 0.03495629378217203, + "grad_norm": 0.20703125, + "learning_rate": 0.001995497637471201, + "loss": 0.1846, + "step": 4027 + }, + { + "epoch": 0.03496497426237619, + "grad_norm": 0.2734375, + "learning_rate": 0.0019954946636802752, + "loss": 0.1494, + "step": 4028 + }, + { + "epoch": 0.03497365474258036, + "grad_norm": 0.4296875, + "learning_rate": 0.0019954916889100495, + "loss": 0.2129, + "step": 4029 + }, + { + "epoch": 0.03498233522278452, + "grad_norm": 0.1552734375, + "learning_rate": 0.0019954887131605286, + "loss": 0.1719, + "step": 4030 + }, + { + "epoch": 0.03499101570298869, + "grad_norm": 0.68359375, + "learning_rate": 0.0019954857364317138, + "loss": 0.1289, + "step": 4031 + }, + { + "epoch": 0.034999696183192854, + "grad_norm": 0.1083984375, + "learning_rate": 0.0019954827587236094, + "loss": 0.1953, + "step": 4032 + }, + { + "epoch": 0.03500837666339702, + "grad_norm": 0.59375, + "learning_rate": 0.001995479780036219, + "loss": 0.1641, + "step": 4033 + }, + { + "epoch": 0.035017057143601184, + "grad_norm": 0.458984375, + "learning_rate": 0.001995476800369545, + "loss": 0.1719, + "step": 4034 + }, + { + "epoch": 0.03502573762380535, + "grad_norm": 0.2255859375, + "learning_rate": 0.001995473819723591, + "loss": 0.1797, + "step": 4035 + }, + { + "epoch": 0.035034418104009514, + "grad_norm": 0.234375, + "learning_rate": 0.00199547083809836, + "loss": 0.1787, + "step": 4036 + }, + { + "epoch": 0.03504309858421368, + "grad_norm": 0.11083984375, + "learning_rate": 0.001995467855493856, + "loss": 0.2148, + "step": 4037 + }, + { + "epoch": 0.035051779064417844, + "grad_norm": 0.5625, + "learning_rate": 0.0019954648719100816, + "loss": 0.1582, + "step": 4038 + }, + { + "epoch": 0.03506045954462201, + "grad_norm": 0.177734375, + "learning_rate": 0.0019954618873470405, + "loss": 0.1689, + "step": 4039 + }, + { + "epoch": 0.035069140024826174, + "grad_norm": 0.365234375, + "learning_rate": 0.0019954589018047354, + "loss": 0.1602, + "step": 4040 + }, + { + "epoch": 0.03507782050503034, + "grad_norm": 0.5390625, + "learning_rate": 0.0019954559152831706, + "loss": 0.1895, + "step": 4041 + }, + { + "epoch": 0.035086500985234505, + "grad_norm": 0.5703125, + "learning_rate": 0.0019954529277823475, + "loss": 0.2148, + "step": 4042 + }, + { + "epoch": 0.03509518146543867, + "grad_norm": 0.400390625, + "learning_rate": 0.001995449939302271, + "loss": 0.2021, + "step": 4043 + }, + { + "epoch": 0.035103861945642835, + "grad_norm": 0.2333984375, + "learning_rate": 0.0019954469498429444, + "loss": 0.1719, + "step": 4044 + }, + { + "epoch": 0.035112542425847, + "grad_norm": 0.375, + "learning_rate": 0.00199544395940437, + "loss": 0.1924, + "step": 4045 + }, + { + "epoch": 0.035121222906051165, + "grad_norm": 0.515625, + "learning_rate": 0.001995440967986552, + "loss": 0.1895, + "step": 4046 + }, + { + "epoch": 0.03512990338625533, + "grad_norm": 0.11376953125, + "learning_rate": 0.001995437975589493, + "loss": 0.1494, + "step": 4047 + }, + { + "epoch": 0.035138583866459495, + "grad_norm": 0.1513671875, + "learning_rate": 0.001995434982213196, + "loss": 
0.1689, + "step": 4048 + }, + { + "epoch": 0.03514726434666366, + "grad_norm": 0.07470703125, + "learning_rate": 0.0019954319878576652, + "loss": 0.1875, + "step": 4049 + }, + { + "epoch": 0.035155944826867826, + "grad_norm": 0.0634765625, + "learning_rate": 0.0019954289925229033, + "loss": 0.1719, + "step": 4050 + }, + { + "epoch": 0.035164625307071984, + "grad_norm": 0.072265625, + "learning_rate": 0.001995425996208914, + "loss": 0.123, + "step": 4051 + }, + { + "epoch": 0.03517330578727615, + "grad_norm": 0.416015625, + "learning_rate": 0.0019954229989157003, + "loss": 0.1348, + "step": 4052 + }, + { + "epoch": 0.035181986267480314, + "grad_norm": 0.24609375, + "learning_rate": 0.001995420000643265, + "loss": 0.2061, + "step": 4053 + }, + { + "epoch": 0.03519066674768448, + "grad_norm": 0.1796875, + "learning_rate": 0.001995417001391612, + "loss": 0.1699, + "step": 4054 + }, + { + "epoch": 0.035199347227888644, + "grad_norm": 0.1396484375, + "learning_rate": 0.0019954140011607447, + "loss": 0.1816, + "step": 4055 + }, + { + "epoch": 0.03520802770809281, + "grad_norm": 0.275390625, + "learning_rate": 0.0019954109999506657, + "loss": 0.1934, + "step": 4056 + }, + { + "epoch": 0.035216708188296975, + "grad_norm": 0.166015625, + "learning_rate": 0.001995407997761379, + "loss": 0.1777, + "step": 4057 + }, + { + "epoch": 0.03522538866850114, + "grad_norm": 0.376953125, + "learning_rate": 0.0019954049945928878, + "loss": 0.2109, + "step": 4058 + }, + { + "epoch": 0.035234069148705305, + "grad_norm": 0.22265625, + "learning_rate": 0.0019954019904451946, + "loss": 0.2207, + "step": 4059 + }, + { + "epoch": 0.03524274962890947, + "grad_norm": 0.83984375, + "learning_rate": 0.0019953989853183033, + "loss": 0.1689, + "step": 4060 + }, + { + "epoch": 0.035251430109113635, + "grad_norm": 0.1630859375, + "learning_rate": 0.001995395979212217, + "loss": 0.208, + "step": 4061 + }, + { + "epoch": 0.0352601105893178, + "grad_norm": 0.26953125, + "learning_rate": 0.0019953929721269396, + "loss": 0.1641, + "step": 4062 + }, + { + "epoch": 0.035268791069521965, + "grad_norm": 0.58203125, + "learning_rate": 0.0019953899640624737, + "loss": 0.1602, + "step": 4063 + }, + { + "epoch": 0.03527747154972613, + "grad_norm": 0.365234375, + "learning_rate": 0.0019953869550188227, + "loss": 0.1836, + "step": 4064 + }, + { + "epoch": 0.035286152029930296, + "grad_norm": 0.333984375, + "learning_rate": 0.00199538394499599, + "loss": 0.1562, + "step": 4065 + }, + { + "epoch": 0.03529483251013446, + "grad_norm": 0.54296875, + "learning_rate": 0.001995380933993979, + "loss": 0.6406, + "step": 4066 + }, + { + "epoch": 0.035303512990338626, + "grad_norm": 0.55859375, + "learning_rate": 0.0019953779220127925, + "loss": 0.2227, + "step": 4067 + }, + { + "epoch": 0.03531219347054279, + "grad_norm": 0.0888671875, + "learning_rate": 0.0019953749090524347, + "loss": 0.1816, + "step": 4068 + }, + { + "epoch": 0.035320873950746956, + "grad_norm": 0.427734375, + "learning_rate": 0.001995371895112908, + "loss": 0.1719, + "step": 4069 + }, + { + "epoch": 0.03532955443095112, + "grad_norm": 0.1103515625, + "learning_rate": 0.001995368880194216, + "loss": 0.2061, + "step": 4070 + }, + { + "epoch": 0.035338234911155286, + "grad_norm": 0.7578125, + "learning_rate": 0.001995365864296362, + "loss": 0.1855, + "step": 4071 + }, + { + "epoch": 0.03534691539135945, + "grad_norm": 0.10693359375, + "learning_rate": 0.00199536284741935, + "loss": 0.166, + "step": 4072 + }, + { + "epoch": 0.035355595871563616, + "grad_norm": 0.15234375, + 
"learning_rate": 0.001995359829563182, + "loss": 0.1719, + "step": 4073 + }, + { + "epoch": 0.03536427635176778, + "grad_norm": 0.34765625, + "learning_rate": 0.0019953568107278622, + "loss": 0.1504, + "step": 4074 + }, + { + "epoch": 0.03537295683197195, + "grad_norm": 0.26171875, + "learning_rate": 0.0019953537909133934, + "loss": 0.1426, + "step": 4075 + }, + { + "epoch": 0.03538163731217611, + "grad_norm": 1.0234375, + "learning_rate": 0.0019953507701197795, + "loss": 0.2988, + "step": 4076 + }, + { + "epoch": 0.03539031779238028, + "grad_norm": 0.154296875, + "learning_rate": 0.001995347748347023, + "loss": 0.1846, + "step": 4077 + }, + { + "epoch": 0.03539899827258444, + "grad_norm": 0.1259765625, + "learning_rate": 0.001995344725595128, + "loss": 0.1758, + "step": 4078 + }, + { + "epoch": 0.03540767875278861, + "grad_norm": 0.357421875, + "learning_rate": 0.0019953417018640975, + "loss": 0.1846, + "step": 4079 + }, + { + "epoch": 0.03541635923299277, + "grad_norm": 0.275390625, + "learning_rate": 0.001995338677153934, + "loss": 0.2314, + "step": 4080 + }, + { + "epoch": 0.03542503971319694, + "grad_norm": 0.330078125, + "learning_rate": 0.0019953356514646426, + "loss": 0.1514, + "step": 4081 + }, + { + "epoch": 0.035433720193401096, + "grad_norm": 0.64453125, + "learning_rate": 0.0019953326247962254, + "loss": 0.1465, + "step": 4082 + }, + { + "epoch": 0.03544240067360526, + "grad_norm": 0.302734375, + "learning_rate": 0.0019953295971486856, + "loss": 0.168, + "step": 4083 + }, + { + "epoch": 0.035451081153809426, + "grad_norm": 0.373046875, + "learning_rate": 0.0019953265685220267, + "loss": 0.1553, + "step": 4084 + }, + { + "epoch": 0.03545976163401359, + "grad_norm": 0.3046875, + "learning_rate": 0.001995323538916252, + "loss": 0.168, + "step": 4085 + }, + { + "epoch": 0.035468442114217756, + "grad_norm": 0.1181640625, + "learning_rate": 0.0019953205083313654, + "loss": 0.1826, + "step": 4086 + }, + { + "epoch": 0.03547712259442192, + "grad_norm": 0.1953125, + "learning_rate": 0.0019953174767673694, + "loss": 0.1777, + "step": 4087 + }, + { + "epoch": 0.035485803074626086, + "grad_norm": 0.302734375, + "learning_rate": 0.0019953144442242682, + "loss": 0.1855, + "step": 4088 + }, + { + "epoch": 0.03549448355483025, + "grad_norm": 0.1181640625, + "learning_rate": 0.0019953114107020644, + "loss": 0.1797, + "step": 4089 + }, + { + "epoch": 0.03550316403503442, + "grad_norm": 0.1279296875, + "learning_rate": 0.001995308376200761, + "loss": 0.1787, + "step": 4090 + }, + { + "epoch": 0.03551184451523858, + "grad_norm": 0.16015625, + "learning_rate": 0.0019953053407203622, + "loss": 0.1836, + "step": 4091 + }, + { + "epoch": 0.03552052499544275, + "grad_norm": 0.275390625, + "learning_rate": 0.001995302304260871, + "loss": 0.2676, + "step": 4092 + }, + { + "epoch": 0.03552920547564691, + "grad_norm": 0.1005859375, + "learning_rate": 0.0019952992668222902, + "loss": 0.1797, + "step": 4093 + }, + { + "epoch": 0.03553788595585108, + "grad_norm": 0.2470703125, + "learning_rate": 0.001995296228404624, + "loss": 0.1504, + "step": 4094 + }, + { + "epoch": 0.03554656643605524, + "grad_norm": 0.45703125, + "learning_rate": 0.0019952931890078753, + "loss": 0.1582, + "step": 4095 + }, + { + "epoch": 0.03555524691625941, + "grad_norm": 0.494140625, + "learning_rate": 0.001995290148632047, + "loss": 0.2324, + "step": 4096 + }, + { + "epoch": 0.03556392739646357, + "grad_norm": 0.11181640625, + "learning_rate": 0.0019952871072771434, + "loss": 0.1807, + "step": 4097 + }, + { + "epoch": 
0.03557260787666774, + "grad_norm": 0.12353515625, + "learning_rate": 0.001995284064943167, + "loss": 0.2344, + "step": 4098 + }, + { + "epoch": 0.0355812883568719, + "grad_norm": 0.123046875, + "learning_rate": 0.0019952810216301215, + "loss": 0.25, + "step": 4099 + }, + { + "epoch": 0.03558996883707607, + "grad_norm": 0.39453125, + "learning_rate": 0.00199527797733801, + "loss": 0.249, + "step": 4100 + }, + { + "epoch": 0.03559864931728023, + "grad_norm": 0.1435546875, + "learning_rate": 0.001995274932066836, + "loss": 0.1865, + "step": 4101 + }, + { + "epoch": 0.0356073297974844, + "grad_norm": 0.13671875, + "learning_rate": 0.0019952718858166026, + "loss": 0.2012, + "step": 4102 + }, + { + "epoch": 0.03561601027768856, + "grad_norm": 0.2109375, + "learning_rate": 0.001995268838587314, + "loss": 0.1523, + "step": 4103 + }, + { + "epoch": 0.03562469075789273, + "grad_norm": 0.515625, + "learning_rate": 0.001995265790378972, + "loss": 0.1484, + "step": 4104 + }, + { + "epoch": 0.03563337123809689, + "grad_norm": 0.1279296875, + "learning_rate": 0.001995262741191581, + "loss": 0.1514, + "step": 4105 + }, + { + "epoch": 0.03564205171830106, + "grad_norm": 0.11865234375, + "learning_rate": 0.0019952596910251443, + "loss": 0.1641, + "step": 4106 + }, + { + "epoch": 0.035650732198505224, + "grad_norm": 0.8125, + "learning_rate": 0.001995256639879665, + "loss": 0.457, + "step": 4107 + }, + { + "epoch": 0.03565941267870939, + "grad_norm": 0.26171875, + "learning_rate": 0.0019952535877551464, + "loss": 0.1543, + "step": 4108 + }, + { + "epoch": 0.035668093158913554, + "grad_norm": 0.1826171875, + "learning_rate": 0.0019952505346515917, + "loss": 0.1953, + "step": 4109 + }, + { + "epoch": 0.03567677363911772, + "grad_norm": 0.39453125, + "learning_rate": 0.001995247480569005, + "loss": 0.2002, + "step": 4110 + }, + { + "epoch": 0.035685454119321884, + "grad_norm": 0.4140625, + "learning_rate": 0.001995244425507389, + "loss": 0.1826, + "step": 4111 + }, + { + "epoch": 0.03569413459952605, + "grad_norm": 0.28515625, + "learning_rate": 0.001995241369466747, + "loss": 0.2188, + "step": 4112 + }, + { + "epoch": 0.03570281507973021, + "grad_norm": 0.2265625, + "learning_rate": 0.0019952383124470824, + "loss": 0.1992, + "step": 4113 + }, + { + "epoch": 0.03571149555993437, + "grad_norm": 0.07568359375, + "learning_rate": 0.0019952352544483987, + "loss": 0.1807, + "step": 4114 + }, + { + "epoch": 0.03572017604013854, + "grad_norm": 0.5078125, + "learning_rate": 0.0019952321954706995, + "loss": 0.1875, + "step": 4115 + }, + { + "epoch": 0.0357288565203427, + "grad_norm": 0.359375, + "learning_rate": 0.0019952291355139875, + "loss": 0.1836, + "step": 4116 + }, + { + "epoch": 0.03573753700054687, + "grad_norm": 0.43359375, + "learning_rate": 0.0019952260745782664, + "loss": 0.2168, + "step": 4117 + }, + { + "epoch": 0.03574621748075103, + "grad_norm": 0.169921875, + "learning_rate": 0.0019952230126635394, + "loss": 0.1602, + "step": 4118 + }, + { + "epoch": 0.0357548979609552, + "grad_norm": 0.21484375, + "learning_rate": 0.0019952199497698104, + "loss": 0.2314, + "step": 4119 + }, + { + "epoch": 0.03576357844115936, + "grad_norm": 0.2734375, + "learning_rate": 0.001995216885897082, + "loss": 0.1738, + "step": 4120 + }, + { + "epoch": 0.03577225892136353, + "grad_norm": 0.4609375, + "learning_rate": 0.0019952138210453576, + "loss": 0.2227, + "step": 4121 + }, + { + "epoch": 0.035780939401567693, + "grad_norm": 0.134765625, + "learning_rate": 0.001995210755214641, + "loss": 0.1719, + "step": 4122 + }, + { + 
"epoch": 0.03578961988177186, + "grad_norm": 0.12158203125, + "learning_rate": 0.0019952076884049356, + "loss": 0.2148, + "step": 4123 + }, + { + "epoch": 0.035798300361976024, + "grad_norm": 0.1904296875, + "learning_rate": 0.0019952046206162446, + "loss": 0.1953, + "step": 4124 + }, + { + "epoch": 0.03580698084218019, + "grad_norm": 0.1455078125, + "learning_rate": 0.001995201551848571, + "loss": 0.1699, + "step": 4125 + }, + { + "epoch": 0.035815661322384354, + "grad_norm": 0.12890625, + "learning_rate": 0.0019951984821019184, + "loss": 0.1758, + "step": 4126 + }, + { + "epoch": 0.03582434180258852, + "grad_norm": 0.1669921875, + "learning_rate": 0.0019951954113762906, + "loss": 0.1445, + "step": 4127 + }, + { + "epoch": 0.035833022282792684, + "grad_norm": 0.62109375, + "learning_rate": 0.00199519233967169, + "loss": 0.1699, + "step": 4128 + }, + { + "epoch": 0.03584170276299685, + "grad_norm": 0.19921875, + "learning_rate": 0.0019951892669881205, + "loss": 0.1777, + "step": 4129 + }, + { + "epoch": 0.035850383243201014, + "grad_norm": 0.1123046875, + "learning_rate": 0.0019951861933255855, + "loss": 0.1709, + "step": 4130 + }, + { + "epoch": 0.03585906372340518, + "grad_norm": 0.443359375, + "learning_rate": 0.001995183118684089, + "loss": 0.2188, + "step": 4131 + }, + { + "epoch": 0.035867744203609345, + "grad_norm": 0.2041015625, + "learning_rate": 0.001995180043063633, + "loss": 0.1582, + "step": 4132 + }, + { + "epoch": 0.03587642468381351, + "grad_norm": 0.1201171875, + "learning_rate": 0.001995176966464222, + "loss": 0.2578, + "step": 4133 + }, + { + "epoch": 0.035885105164017675, + "grad_norm": 0.1611328125, + "learning_rate": 0.0019951738888858584, + "loss": 0.1953, + "step": 4134 + }, + { + "epoch": 0.03589378564422184, + "grad_norm": 0.376953125, + "learning_rate": 0.0019951708103285466, + "loss": 0.1973, + "step": 4135 + }, + { + "epoch": 0.035902466124426005, + "grad_norm": 0.0712890625, + "learning_rate": 0.001995167730792289, + "loss": 0.1631, + "step": 4136 + }, + { + "epoch": 0.03591114660463017, + "grad_norm": 0.37890625, + "learning_rate": 0.0019951646502770898, + "loss": 0.1953, + "step": 4137 + }, + { + "epoch": 0.035919827084834335, + "grad_norm": 0.197265625, + "learning_rate": 0.001995161568782952, + "loss": 0.1748, + "step": 4138 + }, + { + "epoch": 0.0359285075650385, + "grad_norm": 0.138671875, + "learning_rate": 0.0019951584863098786, + "loss": 0.1826, + "step": 4139 + }, + { + "epoch": 0.035937188045242666, + "grad_norm": 0.283203125, + "learning_rate": 0.001995155402857874, + "loss": 0.1338, + "step": 4140 + }, + { + "epoch": 0.03594586852544683, + "grad_norm": 0.310546875, + "learning_rate": 0.00199515231842694, + "loss": 0.1504, + "step": 4141 + }, + { + "epoch": 0.035954549005650996, + "grad_norm": 0.1171875, + "learning_rate": 0.0019951492330170816, + "loss": 0.1641, + "step": 4142 + }, + { + "epoch": 0.035963229485855154, + "grad_norm": 0.080078125, + "learning_rate": 0.001995146146628301, + "loss": 0.1699, + "step": 4143 + }, + { + "epoch": 0.03597190996605932, + "grad_norm": 0.7578125, + "learning_rate": 0.001995143059260602, + "loss": 0.1855, + "step": 4144 + }, + { + "epoch": 0.035980590446263484, + "grad_norm": 0.1416015625, + "learning_rate": 0.001995139970913988, + "loss": 0.1592, + "step": 4145 + }, + { + "epoch": 0.03598927092646765, + "grad_norm": 0.09521484375, + "learning_rate": 0.0019951368815884626, + "loss": 0.1328, + "step": 4146 + }, + { + "epoch": 0.035997951406671815, + "grad_norm": 0.1982421875, + "learning_rate": 
0.0019951337912840292, + "loss": 0.1699, + "step": 4147 + }, + { + "epoch": 0.03600663188687598, + "grad_norm": 0.1806640625, + "learning_rate": 0.0019951307000006906, + "loss": 0.1367, + "step": 4148 + }, + { + "epoch": 0.036015312367080145, + "grad_norm": 0.099609375, + "learning_rate": 0.0019951276077384505, + "loss": 0.1699, + "step": 4149 + }, + { + "epoch": 0.03602399284728431, + "grad_norm": 0.10302734375, + "learning_rate": 0.0019951245144973124, + "loss": 0.1621, + "step": 4150 + }, + { + "epoch": 0.036032673327488475, + "grad_norm": 0.287109375, + "learning_rate": 0.0019951214202772794, + "loss": 0.1689, + "step": 4151 + }, + { + "epoch": 0.03604135380769264, + "grad_norm": 0.55859375, + "learning_rate": 0.0019951183250783554, + "loss": 0.1807, + "step": 4152 + }, + { + "epoch": 0.036050034287896805, + "grad_norm": 0.2080078125, + "learning_rate": 0.001995115228900543, + "loss": 0.1904, + "step": 4153 + }, + { + "epoch": 0.03605871476810097, + "grad_norm": 0.2138671875, + "learning_rate": 0.001995112131743846, + "loss": 0.1064, + "step": 4154 + }, + { + "epoch": 0.036067395248305135, + "grad_norm": 0.09619140625, + "learning_rate": 0.0019951090336082686, + "loss": 0.2031, + "step": 4155 + }, + { + "epoch": 0.0360760757285093, + "grad_norm": 0.09912109375, + "learning_rate": 0.001995105934493813, + "loss": 0.2012, + "step": 4156 + }, + { + "epoch": 0.036084756208713466, + "grad_norm": 0.302734375, + "learning_rate": 0.0019951028344004826, + "loss": 0.2188, + "step": 4157 + }, + { + "epoch": 0.03609343668891763, + "grad_norm": 0.421875, + "learning_rate": 0.0019950997333282815, + "loss": 0.2031, + "step": 4158 + }, + { + "epoch": 0.036102117169121796, + "grad_norm": 0.173828125, + "learning_rate": 0.001995096631277213, + "loss": 0.1582, + "step": 4159 + }, + { + "epoch": 0.03611079764932596, + "grad_norm": 0.1435546875, + "learning_rate": 0.00199509352824728, + "loss": 0.2031, + "step": 4160 + }, + { + "epoch": 0.036119478129530126, + "grad_norm": 0.53125, + "learning_rate": 0.001995090424238486, + "loss": 0.5156, + "step": 4161 + }, + { + "epoch": 0.03612815860973429, + "grad_norm": 0.53125, + "learning_rate": 0.001995087319250835, + "loss": 0.1797, + "step": 4162 + }, + { + "epoch": 0.036136839089938456, + "grad_norm": 0.302734375, + "learning_rate": 0.0019950842132843297, + "loss": 0.1797, + "step": 4163 + }, + { + "epoch": 0.03614551957014262, + "grad_norm": 0.40234375, + "learning_rate": 0.0019950811063389738, + "loss": 0.1582, + "step": 4164 + }, + { + "epoch": 0.03615420005034679, + "grad_norm": 0.447265625, + "learning_rate": 0.0019950779984147706, + "loss": 0.1709, + "step": 4165 + }, + { + "epoch": 0.03616288053055095, + "grad_norm": 0.107421875, + "learning_rate": 0.0019950748895117237, + "loss": 0.1797, + "step": 4166 + }, + { + "epoch": 0.03617156101075512, + "grad_norm": 0.6796875, + "learning_rate": 0.001995071779629836, + "loss": 0.1973, + "step": 4167 + }, + { + "epoch": 0.03618024149095928, + "grad_norm": 0.53125, + "learning_rate": 0.0019950686687691117, + "loss": 0.2012, + "step": 4168 + }, + { + "epoch": 0.03618892197116345, + "grad_norm": 0.2421875, + "learning_rate": 0.0019950655569295535, + "loss": 0.1377, + "step": 4169 + }, + { + "epoch": 0.03619760245136761, + "grad_norm": 0.287109375, + "learning_rate": 0.001995062444111165, + "loss": 0.1914, + "step": 4170 + }, + { + "epoch": 0.03620628293157178, + "grad_norm": 0.373046875, + "learning_rate": 0.00199505933031395, + "loss": 0.1436, + "step": 4171 + }, + { + "epoch": 0.03621496341177594, + "grad_norm": 
0.1787109375, + "learning_rate": 0.001995056215537911, + "loss": 0.1836, + "step": 4172 + }, + { + "epoch": 0.03622364389198011, + "grad_norm": 0.162109375, + "learning_rate": 0.0019950530997830526, + "loss": 0.126, + "step": 4173 + }, + { + "epoch": 0.036232324372184266, + "grad_norm": 0.1689453125, + "learning_rate": 0.001995049983049377, + "loss": 0.1582, + "step": 4174 + }, + { + "epoch": 0.03624100485238843, + "grad_norm": 0.1435546875, + "learning_rate": 0.001995046865336889, + "loss": 0.1328, + "step": 4175 + }, + { + "epoch": 0.036249685332592596, + "grad_norm": 0.2265625, + "learning_rate": 0.0019950437466455906, + "loss": 0.1445, + "step": 4176 + }, + { + "epoch": 0.03625836581279676, + "grad_norm": 0.1015625, + "learning_rate": 0.001995040626975486, + "loss": 0.21, + "step": 4177 + }, + { + "epoch": 0.036267046293000926, + "grad_norm": 0.12890625, + "learning_rate": 0.0019950375063265777, + "loss": 0.2031, + "step": 4178 + }, + { + "epoch": 0.03627572677320509, + "grad_norm": 0.12451171875, + "learning_rate": 0.0019950343846988705, + "loss": 0.1787, + "step": 4179 + }, + { + "epoch": 0.03628440725340926, + "grad_norm": 0.2294921875, + "learning_rate": 0.0019950312620923673, + "loss": 0.1514, + "step": 4180 + }, + { + "epoch": 0.03629308773361342, + "grad_norm": 0.2578125, + "learning_rate": 0.0019950281385070706, + "loss": 0.1484, + "step": 4181 + }, + { + "epoch": 0.03630176821381759, + "grad_norm": 0.29296875, + "learning_rate": 0.0019950250139429853, + "loss": 0.209, + "step": 4182 + }, + { + "epoch": 0.03631044869402175, + "grad_norm": 0.57421875, + "learning_rate": 0.001995021888400114, + "loss": 0.2188, + "step": 4183 + }, + { + "epoch": 0.03631912917422592, + "grad_norm": 0.0859375, + "learning_rate": 0.00199501876187846, + "loss": 0.1748, + "step": 4184 + }, + { + "epoch": 0.03632780965443008, + "grad_norm": 0.28515625, + "learning_rate": 0.001995015634378027, + "loss": 0.1934, + "step": 4185 + }, + { + "epoch": 0.03633649013463425, + "grad_norm": 0.130859375, + "learning_rate": 0.0019950125058988185, + "loss": 0.1787, + "step": 4186 + }, + { + "epoch": 0.03634517061483841, + "grad_norm": 0.322265625, + "learning_rate": 0.001995009376440838, + "loss": 0.2461, + "step": 4187 + }, + { + "epoch": 0.03635385109504258, + "grad_norm": 0.1005859375, + "learning_rate": 0.001995006246004088, + "loss": 0.1709, + "step": 4188 + }, + { + "epoch": 0.03636253157524674, + "grad_norm": 0.271484375, + "learning_rate": 0.0019950031145885734, + "loss": 0.1895, + "step": 4189 + }, + { + "epoch": 0.03637121205545091, + "grad_norm": 0.271484375, + "learning_rate": 0.001994999982194296, + "loss": 0.1865, + "step": 4190 + }, + { + "epoch": 0.03637989253565507, + "grad_norm": 0.55078125, + "learning_rate": 0.0019949968488212604, + "loss": 0.1758, + "step": 4191 + }, + { + "epoch": 0.03638857301585924, + "grad_norm": 0.32421875, + "learning_rate": 0.0019949937144694697, + "loss": 0.1777, + "step": 4192 + }, + { + "epoch": 0.0363972534960634, + "grad_norm": 0.13671875, + "learning_rate": 0.0019949905791389277, + "loss": 0.1758, + "step": 4193 + }, + { + "epoch": 0.03640593397626757, + "grad_norm": 0.2080078125, + "learning_rate": 0.001994987442829637, + "loss": 0.1338, + "step": 4194 + }, + { + "epoch": 0.03641461445647173, + "grad_norm": 0.1484375, + "learning_rate": 0.0019949843055416016, + "loss": 0.2188, + "step": 4195 + }, + { + "epoch": 0.0364232949366759, + "grad_norm": 0.2109375, + "learning_rate": 0.001994981167274825, + "loss": 0.123, + "step": 4196 + }, + { + "epoch": 
0.036431975416880064, + "grad_norm": 0.54296875, + "learning_rate": 0.00199497802802931, + "loss": 0.1826, + "step": 4197 + }, + { + "epoch": 0.03644065589708423, + "grad_norm": 0.126953125, + "learning_rate": 0.001994974887805061, + "loss": 0.1572, + "step": 4198 + }, + { + "epoch": 0.036449336377288394, + "grad_norm": 0.58203125, + "learning_rate": 0.0019949717466020804, + "loss": 0.1973, + "step": 4199 + }, + { + "epoch": 0.03645801685749256, + "grad_norm": 0.3671875, + "learning_rate": 0.0019949686044203724, + "loss": 0.166, + "step": 4200 + }, + { + "epoch": 0.036466697337696724, + "grad_norm": 0.47265625, + "learning_rate": 0.00199496546125994, + "loss": 0.1934, + "step": 4201 + }, + { + "epoch": 0.03647537781790089, + "grad_norm": 0.25390625, + "learning_rate": 0.001994962317120787, + "loss": 0.1641, + "step": 4202 + }, + { + "epoch": 0.036484058298105054, + "grad_norm": 0.66796875, + "learning_rate": 0.001994959172002917, + "loss": 0.2314, + "step": 4203 + }, + { + "epoch": 0.03649273877830922, + "grad_norm": 0.17578125, + "learning_rate": 0.0019949560259063326, + "loss": 0.1895, + "step": 4204 + }, + { + "epoch": 0.03650141925851338, + "grad_norm": 0.142578125, + "learning_rate": 0.001994952878831038, + "loss": 0.1455, + "step": 4205 + }, + { + "epoch": 0.03651009973871754, + "grad_norm": 0.498046875, + "learning_rate": 0.0019949497307770363, + "loss": 0.2031, + "step": 4206 + }, + { + "epoch": 0.03651878021892171, + "grad_norm": 0.1962890625, + "learning_rate": 0.0019949465817443307, + "loss": 0.1484, + "step": 4207 + }, + { + "epoch": 0.03652746069912587, + "grad_norm": 0.2158203125, + "learning_rate": 0.0019949434317329253, + "loss": 0.1318, + "step": 4208 + }, + { + "epoch": 0.03653614117933004, + "grad_norm": 0.1591796875, + "learning_rate": 0.001994940280742823, + "loss": 0.1445, + "step": 4209 + }, + { + "epoch": 0.0365448216595342, + "grad_norm": 0.0927734375, + "learning_rate": 0.001994937128774028, + "loss": 0.1494, + "step": 4210 + }, + { + "epoch": 0.03655350213973837, + "grad_norm": 0.1669921875, + "learning_rate": 0.001994933975826543, + "loss": 0.1875, + "step": 4211 + }, + { + "epoch": 0.03656218261994253, + "grad_norm": 0.294921875, + "learning_rate": 0.001994930821900371, + "loss": 0.1572, + "step": 4212 + }, + { + "epoch": 0.0365708631001467, + "grad_norm": 0.462890625, + "learning_rate": 0.0019949276669955166, + "loss": 0.2061, + "step": 4213 + }, + { + "epoch": 0.036579543580350864, + "grad_norm": 0.3984375, + "learning_rate": 0.0019949245111119825, + "loss": 0.1436, + "step": 4214 + }, + { + "epoch": 0.03658822406055503, + "grad_norm": 0.392578125, + "learning_rate": 0.001994921354249773, + "loss": 0.1611, + "step": 4215 + }, + { + "epoch": 0.036596904540759194, + "grad_norm": 0.1455078125, + "learning_rate": 0.00199491819640889, + "loss": 0.1777, + "step": 4216 + }, + { + "epoch": 0.03660558502096336, + "grad_norm": 0.130859375, + "learning_rate": 0.0019949150375893386, + "loss": 0.1943, + "step": 4217 + }, + { + "epoch": 0.036614265501167524, + "grad_norm": 0.3203125, + "learning_rate": 0.0019949118777911217, + "loss": 0.1484, + "step": 4218 + }, + { + "epoch": 0.03662294598137169, + "grad_norm": 0.134765625, + "learning_rate": 0.0019949087170142423, + "loss": 0.1709, + "step": 4219 + }, + { + "epoch": 0.036631626461575854, + "grad_norm": 0.1123046875, + "learning_rate": 0.0019949055552587044, + "loss": 0.1943, + "step": 4220 + }, + { + "epoch": 0.03664030694178002, + "grad_norm": 0.298828125, + "learning_rate": 0.001994902392524511, + "loss": 0.1318, + 
"step": 4221 + }, + { + "epoch": 0.036648987421984185, + "grad_norm": 0.181640625, + "learning_rate": 0.001994899228811666, + "loss": 0.125, + "step": 4222 + }, + { + "epoch": 0.03665766790218835, + "grad_norm": 0.5703125, + "learning_rate": 0.001994896064120172, + "loss": 0.1953, + "step": 4223 + }, + { + "epoch": 0.036666348382392515, + "grad_norm": 0.16015625, + "learning_rate": 0.001994892898450034, + "loss": 0.1855, + "step": 4224 + }, + { + "epoch": 0.03667502886259668, + "grad_norm": 0.5546875, + "learning_rate": 0.0019948897318012543, + "loss": 0.1816, + "step": 4225 + }, + { + "epoch": 0.036683709342800845, + "grad_norm": 0.09326171875, + "learning_rate": 0.001994886564173836, + "loss": 0.248, + "step": 4226 + }, + { + "epoch": 0.03669238982300501, + "grad_norm": 0.216796875, + "learning_rate": 0.001994883395567784, + "loss": 0.1543, + "step": 4227 + }, + { + "epoch": 0.036701070303209175, + "grad_norm": 0.404296875, + "learning_rate": 0.001994880225983101, + "loss": 0.1953, + "step": 4228 + }, + { + "epoch": 0.03670975078341334, + "grad_norm": 0.41015625, + "learning_rate": 0.00199487705541979, + "loss": 0.2285, + "step": 4229 + }, + { + "epoch": 0.036718431263617506, + "grad_norm": 0.10302734375, + "learning_rate": 0.0019948738838778545, + "loss": 0.1206, + "step": 4230 + }, + { + "epoch": 0.03672711174382167, + "grad_norm": 0.12255859375, + "learning_rate": 0.001994870711357299, + "loss": 0.1934, + "step": 4231 + }, + { + "epoch": 0.036735792224025836, + "grad_norm": 0.66796875, + "learning_rate": 0.001994867537858126, + "loss": 0.1777, + "step": 4232 + }, + { + "epoch": 0.03674447270423, + "grad_norm": 0.70703125, + "learning_rate": 0.00199486436338034, + "loss": 0.168, + "step": 4233 + }, + { + "epoch": 0.036753153184434166, + "grad_norm": 0.1474609375, + "learning_rate": 0.001994861187923943, + "loss": 0.1699, + "step": 4234 + }, + { + "epoch": 0.03676183366463833, + "grad_norm": 0.2333984375, + "learning_rate": 0.0019948580114889395, + "loss": 0.1309, + "step": 4235 + }, + { + "epoch": 0.03677051414484249, + "grad_norm": 0.11962890625, + "learning_rate": 0.0019948548340753326, + "loss": 0.2109, + "step": 4236 + }, + { + "epoch": 0.036779194625046654, + "grad_norm": 0.29296875, + "learning_rate": 0.001994851655683126, + "loss": 0.1484, + "step": 4237 + }, + { + "epoch": 0.03678787510525082, + "grad_norm": 0.5, + "learning_rate": 0.0019948484763123234, + "loss": 0.1934, + "step": 4238 + }, + { + "epoch": 0.036796555585454985, + "grad_norm": 0.263671875, + "learning_rate": 0.0019948452959629276, + "loss": 0.1787, + "step": 4239 + }, + { + "epoch": 0.03680523606565915, + "grad_norm": 0.1689453125, + "learning_rate": 0.0019948421146349423, + "loss": 0.2041, + "step": 4240 + }, + { + "epoch": 0.036813916545863315, + "grad_norm": 0.416015625, + "learning_rate": 0.0019948389323283717, + "loss": 0.2334, + "step": 4241 + }, + { + "epoch": 0.03682259702606748, + "grad_norm": 0.10791015625, + "learning_rate": 0.0019948357490432184, + "loss": 0.252, + "step": 4242 + }, + { + "epoch": 0.036831277506271645, + "grad_norm": 0.287109375, + "learning_rate": 0.001994832564779486, + "loss": 0.1885, + "step": 4243 + }, + { + "epoch": 0.03683995798647581, + "grad_norm": 0.13671875, + "learning_rate": 0.001994829379537178, + "loss": 0.1426, + "step": 4244 + }, + { + "epoch": 0.036848638466679975, + "grad_norm": 0.2353515625, + "learning_rate": 0.001994826193316298, + "loss": 0.1328, + "step": 4245 + }, + { + "epoch": 0.03685731894688414, + "grad_norm": 0.365234375, + "learning_rate": 
0.0019948230061168503, + "loss": 0.1738, + "step": 4246 + }, + { + "epoch": 0.036865999427088306, + "grad_norm": 0.12890625, + "learning_rate": 0.001994819817938837, + "loss": 0.1504, + "step": 4247 + }, + { + "epoch": 0.03687467990729247, + "grad_norm": 0.326171875, + "learning_rate": 0.0019948166287822623, + "loss": 0.1465, + "step": 4248 + }, + { + "epoch": 0.036883360387496636, + "grad_norm": 0.103515625, + "learning_rate": 0.0019948134386471297, + "loss": 0.1338, + "step": 4249 + }, + { + "epoch": 0.0368920408677008, + "grad_norm": 0.4375, + "learning_rate": 0.001994810247533442, + "loss": 0.2188, + "step": 4250 + }, + { + "epoch": 0.036900721347904966, + "grad_norm": 0.34765625, + "learning_rate": 0.001994807055441204, + "loss": 0.1738, + "step": 4251 + }, + { + "epoch": 0.03690940182810913, + "grad_norm": 0.08740234375, + "learning_rate": 0.001994803862370418, + "loss": 0.167, + "step": 4252 + }, + { + "epoch": 0.036918082308313296, + "grad_norm": 0.240234375, + "learning_rate": 0.001994800668321088, + "loss": 0.1758, + "step": 4253 + }, + { + "epoch": 0.03692676278851746, + "grad_norm": 0.251953125, + "learning_rate": 0.0019947974732932175, + "loss": 0.1494, + "step": 4254 + }, + { + "epoch": 0.03693544326872163, + "grad_norm": 0.220703125, + "learning_rate": 0.00199479427728681, + "loss": 0.2266, + "step": 4255 + }, + { + "epoch": 0.03694412374892579, + "grad_norm": 0.1328125, + "learning_rate": 0.001994791080301869, + "loss": 0.2119, + "step": 4256 + }, + { + "epoch": 0.03695280422912996, + "grad_norm": 0.5390625, + "learning_rate": 0.0019947878823383977, + "loss": 0.1826, + "step": 4257 + }, + { + "epoch": 0.03696148470933412, + "grad_norm": 0.11376953125, + "learning_rate": 0.0019947846833963997, + "loss": 0.1602, + "step": 4258 + }, + { + "epoch": 0.03697016518953829, + "grad_norm": 0.2216796875, + "learning_rate": 0.001994781483475879, + "loss": 0.1133, + "step": 4259 + }, + { + "epoch": 0.03697884566974245, + "grad_norm": 0.4765625, + "learning_rate": 0.0019947782825768387, + "loss": 0.1426, + "step": 4260 + }, + { + "epoch": 0.03698752614994662, + "grad_norm": 0.443359375, + "learning_rate": 0.0019947750806992817, + "loss": 0.1641, + "step": 4261 + }, + { + "epoch": 0.03699620663015078, + "grad_norm": 0.279296875, + "learning_rate": 0.001994771877843213, + "loss": 0.166, + "step": 4262 + }, + { + "epoch": 0.03700488711035495, + "grad_norm": 0.283203125, + "learning_rate": 0.0019947686740086346, + "loss": 0.126, + "step": 4263 + }, + { + "epoch": 0.03701356759055911, + "grad_norm": 0.39453125, + "learning_rate": 0.001994765469195551, + "loss": 0.1611, + "step": 4264 + }, + { + "epoch": 0.03702224807076328, + "grad_norm": 0.5546875, + "learning_rate": 0.0019947622634039646, + "loss": 0.1934, + "step": 4265 + }, + { + "epoch": 0.03703092855096744, + "grad_norm": 0.09130859375, + "learning_rate": 0.0019947590566338803, + "loss": 0.1621, + "step": 4266 + }, + { + "epoch": 0.0370396090311716, + "grad_norm": 0.10009765625, + "learning_rate": 0.001994755848885301, + "loss": 0.1914, + "step": 4267 + }, + { + "epoch": 0.037048289511375766, + "grad_norm": 0.150390625, + "learning_rate": 0.00199475264015823, + "loss": 0.1289, + "step": 4268 + }, + { + "epoch": 0.03705696999157993, + "grad_norm": 0.126953125, + "learning_rate": 0.0019947494304526706, + "loss": 0.1992, + "step": 4269 + }, + { + "epoch": 0.037065650471784096, + "grad_norm": 0.2275390625, + "learning_rate": 0.001994746219768627, + "loss": 0.1914, + "step": 4270 + }, + { + "epoch": 0.03707433095198826, + "grad_norm": 
0.142578125, + "learning_rate": 0.001994743008106103, + "loss": 0.2012, + "step": 4271 + }, + { + "epoch": 0.03708301143219243, + "grad_norm": 0.2216796875, + "learning_rate": 0.0019947397954651, + "loss": 0.1758, + "step": 4272 + }, + { + "epoch": 0.03709169191239659, + "grad_norm": 0.11083984375, + "learning_rate": 0.001994736581845624, + "loss": 0.1758, + "step": 4273 + }, + { + "epoch": 0.03710037239260076, + "grad_norm": 0.3828125, + "learning_rate": 0.0019947333672476774, + "loss": 0.1494, + "step": 4274 + }, + { + "epoch": 0.03710905287280492, + "grad_norm": 0.0810546875, + "learning_rate": 0.0019947301516712638, + "loss": 0.1426, + "step": 4275 + }, + { + "epoch": 0.03711773335300909, + "grad_norm": 0.138671875, + "learning_rate": 0.0019947269351163867, + "loss": 0.1436, + "step": 4276 + }, + { + "epoch": 0.03712641383321325, + "grad_norm": 0.3359375, + "learning_rate": 0.0019947237175830497, + "loss": 0.1504, + "step": 4277 + }, + { + "epoch": 0.03713509431341742, + "grad_norm": 0.380859375, + "learning_rate": 0.0019947204990712563, + "loss": 0.208, + "step": 4278 + }, + { + "epoch": 0.03714377479362158, + "grad_norm": 0.3203125, + "learning_rate": 0.0019947172795810103, + "loss": 0.2412, + "step": 4279 + }, + { + "epoch": 0.03715245527382575, + "grad_norm": 0.1826171875, + "learning_rate": 0.0019947140591123147, + "loss": 0.1582, + "step": 4280 + }, + { + "epoch": 0.03716113575402991, + "grad_norm": 0.1298828125, + "learning_rate": 0.0019947108376651727, + "loss": 0.1865, + "step": 4281 + }, + { + "epoch": 0.03716981623423408, + "grad_norm": 0.12060546875, + "learning_rate": 0.001994707615239589, + "loss": 0.1582, + "step": 4282 + }, + { + "epoch": 0.03717849671443824, + "grad_norm": 0.2333984375, + "learning_rate": 0.001994704391835566, + "loss": 0.1318, + "step": 4283 + }, + { + "epoch": 0.03718717719464241, + "grad_norm": 0.330078125, + "learning_rate": 0.0019947011674531084, + "loss": 0.2539, + "step": 4284 + }, + { + "epoch": 0.03719585767484657, + "grad_norm": 0.1982421875, + "learning_rate": 0.0019946979420922186, + "loss": 0.207, + "step": 4285 + }, + { + "epoch": 0.03720453815505074, + "grad_norm": 0.07177734375, + "learning_rate": 0.0019946947157529005, + "loss": 0.1157, + "step": 4286 + }, + { + "epoch": 0.037213218635254904, + "grad_norm": 0.271484375, + "learning_rate": 0.001994691488435158, + "loss": 0.1904, + "step": 4287 + }, + { + "epoch": 0.03722189911545907, + "grad_norm": 0.56640625, + "learning_rate": 0.001994688260138994, + "loss": 0.1934, + "step": 4288 + }, + { + "epoch": 0.037230579595663234, + "grad_norm": 0.162109375, + "learning_rate": 0.001994685030864413, + "loss": 0.1855, + "step": 4289 + }, + { + "epoch": 0.0372392600758674, + "grad_norm": 0.1328125, + "learning_rate": 0.0019946818006114173, + "loss": 0.1572, + "step": 4290 + }, + { + "epoch": 0.037247940556071564, + "grad_norm": 0.181640625, + "learning_rate": 0.001994678569380011, + "loss": 0.1631, + "step": 4291 + }, + { + "epoch": 0.03725662103627573, + "grad_norm": 0.15234375, + "learning_rate": 0.001994675337170198, + "loss": 0.2012, + "step": 4292 + }, + { + "epoch": 0.037265301516479894, + "grad_norm": 0.10546875, + "learning_rate": 0.0019946721039819813, + "loss": 0.1836, + "step": 4293 + }, + { + "epoch": 0.03727398199668406, + "grad_norm": 0.248046875, + "learning_rate": 0.0019946688698153645, + "loss": 0.1523, + "step": 4294 + }, + { + "epoch": 0.037282662476888224, + "grad_norm": 0.25, + "learning_rate": 0.0019946656346703515, + "loss": 0.166, + "step": 4295 + }, + { + "epoch": 
0.03729134295709239, + "grad_norm": 0.2236328125, + "learning_rate": 0.001994662398546946, + "loss": 0.2188, + "step": 4296 + }, + { + "epoch": 0.03730002343729655, + "grad_norm": 0.52734375, + "learning_rate": 0.0019946591614451504, + "loss": 0.1621, + "step": 4297 + }, + { + "epoch": 0.03730870391750071, + "grad_norm": 1.125, + "learning_rate": 0.0019946559233649692, + "loss": 0.2129, + "step": 4298 + }, + { + "epoch": 0.03731738439770488, + "grad_norm": 0.486328125, + "learning_rate": 0.0019946526843064058, + "loss": 0.2148, + "step": 4299 + }, + { + "epoch": 0.03732606487790904, + "grad_norm": 0.65625, + "learning_rate": 0.0019946494442694635, + "loss": 0.1885, + "step": 4300 + }, + { + "epoch": 0.03733474535811321, + "grad_norm": 0.2470703125, + "learning_rate": 0.0019946462032541462, + "loss": 0.1992, + "step": 4301 + }, + { + "epoch": 0.03734342583831737, + "grad_norm": 0.1884765625, + "learning_rate": 0.001994642961260457, + "loss": 0.1572, + "step": 4302 + }, + { + "epoch": 0.03735210631852154, + "grad_norm": 0.2451171875, + "learning_rate": 0.0019946397182884, + "loss": 0.2578, + "step": 4303 + }, + { + "epoch": 0.037360786798725704, + "grad_norm": 0.095703125, + "learning_rate": 0.0019946364743379788, + "loss": 0.1387, + "step": 4304 + }, + { + "epoch": 0.03736946727892987, + "grad_norm": 0.51953125, + "learning_rate": 0.0019946332294091956, + "loss": 0.1455, + "step": 4305 + }, + { + "epoch": 0.037378147759134034, + "grad_norm": 0.14453125, + "learning_rate": 0.0019946299835020558, + "loss": 0.1445, + "step": 4306 + }, + { + "epoch": 0.0373868282393382, + "grad_norm": 0.130859375, + "learning_rate": 0.001994626736616562, + "loss": 0.1348, + "step": 4307 + }, + { + "epoch": 0.037395508719542364, + "grad_norm": 0.11767578125, + "learning_rate": 0.0019946234887527176, + "loss": 0.1553, + "step": 4308 + }, + { + "epoch": 0.03740418919974653, + "grad_norm": 0.0966796875, + "learning_rate": 0.0019946202399105262, + "loss": 0.1602, + "step": 4309 + }, + { + "epoch": 0.037412869679950694, + "grad_norm": 0.220703125, + "learning_rate": 0.0019946169900899916, + "loss": 0.1494, + "step": 4310 + }, + { + "epoch": 0.03742155016015486, + "grad_norm": 0.1845703125, + "learning_rate": 0.0019946137392911176, + "loss": 0.1338, + "step": 4311 + }, + { + "epoch": 0.037430230640359025, + "grad_norm": 0.41796875, + "learning_rate": 0.0019946104875139072, + "loss": 0.1602, + "step": 4312 + }, + { + "epoch": 0.03743891112056319, + "grad_norm": 0.0849609375, + "learning_rate": 0.0019946072347583645, + "loss": 0.1719, + "step": 4313 + }, + { + "epoch": 0.037447591600767355, + "grad_norm": 0.490234375, + "learning_rate": 0.0019946039810244927, + "loss": 0.1475, + "step": 4314 + }, + { + "epoch": 0.03745627208097152, + "grad_norm": 0.2333984375, + "learning_rate": 0.0019946007263122955, + "loss": 0.1533, + "step": 4315 + }, + { + "epoch": 0.037464952561175685, + "grad_norm": 0.4921875, + "learning_rate": 0.001994597470621776, + "loss": 0.2031, + "step": 4316 + }, + { + "epoch": 0.03747363304137985, + "grad_norm": 0.2255859375, + "learning_rate": 0.0019945942139529384, + "loss": 0.2305, + "step": 4317 + }, + { + "epoch": 0.037482313521584015, + "grad_norm": 0.107421875, + "learning_rate": 0.001994590956305786, + "loss": 0.1562, + "step": 4318 + }, + { + "epoch": 0.03749099400178818, + "grad_norm": 0.21875, + "learning_rate": 0.0019945876976803223, + "loss": 0.1455, + "step": 4319 + }, + { + "epoch": 0.037499674481992346, + "grad_norm": 0.40625, + "learning_rate": 0.001994584438076551, + "loss": 0.2207, + 
"step": 4320 + }, + { + "epoch": 0.03750835496219651, + "grad_norm": 0.2099609375, + "learning_rate": 0.0019945811774944753, + "loss": 0.1533, + "step": 4321 + }, + { + "epoch": 0.037517035442400676, + "grad_norm": 0.205078125, + "learning_rate": 0.0019945779159340994, + "loss": 0.1602, + "step": 4322 + }, + { + "epoch": 0.03752571592260484, + "grad_norm": 0.126953125, + "learning_rate": 0.0019945746533954266, + "loss": 0.1533, + "step": 4323 + }, + { + "epoch": 0.037534396402809006, + "grad_norm": 0.2890625, + "learning_rate": 0.0019945713898784603, + "loss": 0.1245, + "step": 4324 + }, + { + "epoch": 0.03754307688301317, + "grad_norm": 0.5, + "learning_rate": 0.001994568125383204, + "loss": 0.1758, + "step": 4325 + }, + { + "epoch": 0.037551757363217336, + "grad_norm": 0.2001953125, + "learning_rate": 0.001994564859909662, + "loss": 0.166, + "step": 4326 + }, + { + "epoch": 0.0375604378434215, + "grad_norm": 0.1533203125, + "learning_rate": 0.001994561593457837, + "loss": 0.2109, + "step": 4327 + }, + { + "epoch": 0.03756911832362566, + "grad_norm": 0.39453125, + "learning_rate": 0.0019945583260277326, + "loss": 0.1621, + "step": 4328 + }, + { + "epoch": 0.037577798803829825, + "grad_norm": 0.2373046875, + "learning_rate": 0.0019945550576193527, + "loss": 0.123, + "step": 4329 + }, + { + "epoch": 0.03758647928403399, + "grad_norm": 0.08056640625, + "learning_rate": 0.001994551788232701, + "loss": 0.1826, + "step": 4330 + }, + { + "epoch": 0.037595159764238155, + "grad_norm": 0.29296875, + "learning_rate": 0.001994548517867781, + "loss": 0.1797, + "step": 4331 + }, + { + "epoch": 0.03760384024444232, + "grad_norm": 0.2001953125, + "learning_rate": 0.001994545246524596, + "loss": 0.1904, + "step": 4332 + }, + { + "epoch": 0.037612520724646485, + "grad_norm": 0.1875, + "learning_rate": 0.00199454197420315, + "loss": 0.168, + "step": 4333 + }, + { + "epoch": 0.03762120120485065, + "grad_norm": 0.28515625, + "learning_rate": 0.001994538700903446, + "loss": 0.1836, + "step": 4334 + }, + { + "epoch": 0.037629881685054815, + "grad_norm": 0.396484375, + "learning_rate": 0.001994535426625488, + "loss": 0.1689, + "step": 4335 + }, + { + "epoch": 0.03763856216525898, + "grad_norm": 0.189453125, + "learning_rate": 0.0019945321513692798, + "loss": 0.1465, + "step": 4336 + }, + { + "epoch": 0.037647242645463146, + "grad_norm": 0.1396484375, + "learning_rate": 0.001994528875134824, + "loss": 0.1758, + "step": 4337 + }, + { + "epoch": 0.03765592312566731, + "grad_norm": 0.97265625, + "learning_rate": 0.0019945255979221255, + "loss": 0.3945, + "step": 4338 + }, + { + "epoch": 0.037664603605871476, + "grad_norm": 0.5390625, + "learning_rate": 0.0019945223197311867, + "loss": 0.1953, + "step": 4339 + }, + { + "epoch": 0.03767328408607564, + "grad_norm": 1.03125, + "learning_rate": 0.0019945190405620122, + "loss": 0.168, + "step": 4340 + }, + { + "epoch": 0.037681964566279806, + "grad_norm": 0.3046875, + "learning_rate": 0.001994515760414605, + "loss": 0.1914, + "step": 4341 + }, + { + "epoch": 0.03769064504648397, + "grad_norm": 0.3515625, + "learning_rate": 0.001994512479288969, + "loss": 0.1865, + "step": 4342 + }, + { + "epoch": 0.037699325526688136, + "grad_norm": 0.1201171875, + "learning_rate": 0.0019945091971851074, + "loss": 0.1602, + "step": 4343 + }, + { + "epoch": 0.0377080060068923, + "grad_norm": 0.115234375, + "learning_rate": 0.001994505914103024, + "loss": 0.1475, + "step": 4344 + }, + { + "epoch": 0.03771668648709647, + "grad_norm": 0.375, + "learning_rate": 0.0019945026300427224, + "loss": 
0.124, + "step": 4345 + }, + { + "epoch": 0.03772536696730063, + "grad_norm": 0.2392578125, + "learning_rate": 0.001994499345004206, + "loss": 0.2217, + "step": 4346 + }, + { + "epoch": 0.0377340474475048, + "grad_norm": 0.337890625, + "learning_rate": 0.0019944960589874786, + "loss": 0.1865, + "step": 4347 + }, + { + "epoch": 0.03774272792770896, + "grad_norm": 0.458984375, + "learning_rate": 0.0019944927719925437, + "loss": 0.1699, + "step": 4348 + }, + { + "epoch": 0.03775140840791313, + "grad_norm": 0.1630859375, + "learning_rate": 0.001994489484019405, + "loss": 0.1582, + "step": 4349 + }, + { + "epoch": 0.03776008888811729, + "grad_norm": 0.232421875, + "learning_rate": 0.0019944861950680665, + "loss": 0.1523, + "step": 4350 + }, + { + "epoch": 0.03776876936832146, + "grad_norm": 0.80078125, + "learning_rate": 0.0019944829051385306, + "loss": 0.1758, + "step": 4351 + }, + { + "epoch": 0.03777744984852562, + "grad_norm": 0.9921875, + "learning_rate": 0.001994479614230802, + "loss": 0.2305, + "step": 4352 + }, + { + "epoch": 0.03778613032872979, + "grad_norm": 0.470703125, + "learning_rate": 0.001994476322344884, + "loss": 0.2266, + "step": 4353 + }, + { + "epoch": 0.03779481080893395, + "grad_norm": 0.1396484375, + "learning_rate": 0.00199447302948078, + "loss": 0.1201, + "step": 4354 + }, + { + "epoch": 0.03780349128913812, + "grad_norm": 0.10400390625, + "learning_rate": 0.001994469735638494, + "loss": 0.2129, + "step": 4355 + }, + { + "epoch": 0.03781217176934228, + "grad_norm": 0.0830078125, + "learning_rate": 0.001994466440818029, + "loss": 0.1807, + "step": 4356 + }, + { + "epoch": 0.03782085224954645, + "grad_norm": 0.11474609375, + "learning_rate": 0.001994463145019389, + "loss": 0.1582, + "step": 4357 + }, + { + "epoch": 0.03782953272975061, + "grad_norm": 0.66796875, + "learning_rate": 0.001994459848242578, + "loss": 0.1924, + "step": 4358 + }, + { + "epoch": 0.03783821320995477, + "grad_norm": 0.1259765625, + "learning_rate": 0.0019944565504875986, + "loss": 0.2041, + "step": 4359 + }, + { + "epoch": 0.037846893690158936, + "grad_norm": 0.46484375, + "learning_rate": 0.0019944532517544547, + "loss": 0.1719, + "step": 4360 + }, + { + "epoch": 0.0378555741703631, + "grad_norm": 0.08642578125, + "learning_rate": 0.0019944499520431506, + "loss": 0.1514, + "step": 4361 + }, + { + "epoch": 0.03786425465056727, + "grad_norm": 0.2255859375, + "learning_rate": 0.0019944466513536897, + "loss": 0.1943, + "step": 4362 + }, + { + "epoch": 0.03787293513077143, + "grad_norm": 0.671875, + "learning_rate": 0.001994443349686075, + "loss": 0.1611, + "step": 4363 + }, + { + "epoch": 0.0378816156109756, + "grad_norm": 0.1572265625, + "learning_rate": 0.0019944400470403106, + "loss": 0.166, + "step": 4364 + }, + { + "epoch": 0.03789029609117976, + "grad_norm": 0.98828125, + "learning_rate": 0.0019944367434164, + "loss": 0.1445, + "step": 4365 + }, + { + "epoch": 0.03789897657138393, + "grad_norm": 0.79296875, + "learning_rate": 0.0019944334388143467, + "loss": 0.1387, + "step": 4366 + }, + { + "epoch": 0.03790765705158809, + "grad_norm": 0.7890625, + "learning_rate": 0.0019944301332341548, + "loss": 0.1768, + "step": 4367 + }, + { + "epoch": 0.03791633753179226, + "grad_norm": 0.1552734375, + "learning_rate": 0.0019944268266758273, + "loss": 0.1855, + "step": 4368 + }, + { + "epoch": 0.03792501801199642, + "grad_norm": 0.640625, + "learning_rate": 0.001994423519139368, + "loss": 0.1904, + "step": 4369 + }, + { + "epoch": 0.03793369849220059, + "grad_norm": 0.318359375, + "learning_rate": 
0.0019944202106247804, + "loss": 0.165, + "step": 4370 + }, + { + "epoch": 0.03794237897240475, + "grad_norm": 0.12451171875, + "learning_rate": 0.001994416901132069, + "loss": 0.1582, + "step": 4371 + }, + { + "epoch": 0.03795105945260892, + "grad_norm": 1.2265625, + "learning_rate": 0.001994413590661236, + "loss": 0.1943, + "step": 4372 + }, + { + "epoch": 0.03795973993281308, + "grad_norm": 0.61328125, + "learning_rate": 0.001994410279212286, + "loss": 0.1719, + "step": 4373 + }, + { + "epoch": 0.03796842041301725, + "grad_norm": 0.8671875, + "learning_rate": 0.0019944069667852224, + "loss": 0.1602, + "step": 4374 + }, + { + "epoch": 0.03797710089322141, + "grad_norm": 0.9921875, + "learning_rate": 0.001994403653380049, + "loss": 0.1953, + "step": 4375 + }, + { + "epoch": 0.03798578137342558, + "grad_norm": 0.5078125, + "learning_rate": 0.001994400338996769, + "loss": 0.1621, + "step": 4376 + }, + { + "epoch": 0.037994461853629743, + "grad_norm": 0.09716796875, + "learning_rate": 0.0019943970236353855, + "loss": 0.2031, + "step": 4377 + }, + { + "epoch": 0.03800314233383391, + "grad_norm": 0.96484375, + "learning_rate": 0.001994393707295904, + "loss": 0.1514, + "step": 4378 + }, + { + "epoch": 0.038011822814038074, + "grad_norm": 0.66015625, + "learning_rate": 0.001994390389978326, + "loss": 0.1631, + "step": 4379 + }, + { + "epoch": 0.03802050329424224, + "grad_norm": 0.7734375, + "learning_rate": 0.001994387071682657, + "loss": 0.1641, + "step": 4380 + }, + { + "epoch": 0.038029183774446404, + "grad_norm": 0.263671875, + "learning_rate": 0.0019943837524088993, + "loss": 0.1035, + "step": 4381 + }, + { + "epoch": 0.03803786425465057, + "grad_norm": 0.4453125, + "learning_rate": 0.0019943804321570567, + "loss": 0.1709, + "step": 4382 + }, + { + "epoch": 0.038046544734854734, + "grad_norm": 1.015625, + "learning_rate": 0.0019943771109271337, + "loss": 0.1934, + "step": 4383 + }, + { + "epoch": 0.0380552252150589, + "grad_norm": 0.2041015625, + "learning_rate": 0.0019943737887191328, + "loss": 0.1631, + "step": 4384 + }, + { + "epoch": 0.038063905695263064, + "grad_norm": 0.478515625, + "learning_rate": 0.0019943704655330584, + "loss": 0.1709, + "step": 4385 + }, + { + "epoch": 0.03807258617546723, + "grad_norm": 0.384765625, + "learning_rate": 0.0019943671413689143, + "loss": 0.1738, + "step": 4386 + }, + { + "epoch": 0.038081266655671395, + "grad_norm": 0.2490234375, + "learning_rate": 0.001994363816226703, + "loss": 0.1836, + "step": 4387 + }, + { + "epoch": 0.03808994713587556, + "grad_norm": 0.78515625, + "learning_rate": 0.0019943604901064295, + "loss": 0.2129, + "step": 4388 + }, + { + "epoch": 0.038098627616079725, + "grad_norm": 0.69921875, + "learning_rate": 0.001994357163008096, + "loss": 0.1738, + "step": 4389 + }, + { + "epoch": 0.03810730809628388, + "grad_norm": 0.3359375, + "learning_rate": 0.0019943538349317075, + "loss": 0.1895, + "step": 4390 + }, + { + "epoch": 0.03811598857648805, + "grad_norm": 0.58203125, + "learning_rate": 0.001994350505877267, + "loss": 0.1445, + "step": 4391 + }, + { + "epoch": 0.03812466905669221, + "grad_norm": 0.11865234375, + "learning_rate": 0.0019943471758447785, + "loss": 0.1758, + "step": 4392 + }, + { + "epoch": 0.03813334953689638, + "grad_norm": 0.267578125, + "learning_rate": 0.001994343844834245, + "loss": 0.166, + "step": 4393 + }, + { + "epoch": 0.038142030017100544, + "grad_norm": 0.142578125, + "learning_rate": 0.0019943405128456707, + "loss": 0.2129, + "step": 4394 + }, + { + "epoch": 0.03815071049730471, + "grad_norm": 
0.392578125, + "learning_rate": 0.0019943371798790592, + "loss": 0.2031, + "step": 4395 + }, + { + "epoch": 0.038159390977508874, + "grad_norm": 0.09033203125, + "learning_rate": 0.0019943338459344133, + "loss": 0.1465, + "step": 4396 + }, + { + "epoch": 0.03816807145771304, + "grad_norm": 0.87109375, + "learning_rate": 0.001994330511011738, + "loss": 0.1631, + "step": 4397 + }, + { + "epoch": 0.038176751937917204, + "grad_norm": 0.53515625, + "learning_rate": 0.001994327175111036, + "loss": 0.1504, + "step": 4398 + }, + { + "epoch": 0.03818543241812137, + "grad_norm": 0.5078125, + "learning_rate": 0.001994323838232312, + "loss": 0.1318, + "step": 4399 + }, + { + "epoch": 0.038194112898325534, + "grad_norm": 0.65234375, + "learning_rate": 0.0019943205003755677, + "loss": 0.2207, + "step": 4400 + }, + { + "epoch": 0.0382027933785297, + "grad_norm": 0.73046875, + "learning_rate": 0.0019943171615408083, + "loss": 0.1748, + "step": 4401 + }, + { + "epoch": 0.038211473858733865, + "grad_norm": 0.1982421875, + "learning_rate": 0.0019943138217280378, + "loss": 0.1279, + "step": 4402 + }, + { + "epoch": 0.03822015433893803, + "grad_norm": 0.50390625, + "learning_rate": 0.0019943104809372583, + "loss": 0.2139, + "step": 4403 + }, + { + "epoch": 0.038228834819142195, + "grad_norm": 0.5, + "learning_rate": 0.0019943071391684748, + "loss": 0.1533, + "step": 4404 + }, + { + "epoch": 0.03823751529934636, + "grad_norm": 0.4609375, + "learning_rate": 0.00199430379642169, + "loss": 0.2236, + "step": 4405 + }, + { + "epoch": 0.038246195779550525, + "grad_norm": 0.09521484375, + "learning_rate": 0.0019943004526969082, + "loss": 0.1709, + "step": 4406 + }, + { + "epoch": 0.03825487625975469, + "grad_norm": 1.46875, + "learning_rate": 0.0019942971079941326, + "loss": 0.2227, + "step": 4407 + }, + { + "epoch": 0.038263556739958855, + "grad_norm": 0.11669921875, + "learning_rate": 0.001994293762313368, + "loss": 0.1738, + "step": 4408 + }, + { + "epoch": 0.03827223722016302, + "grad_norm": 0.1484375, + "learning_rate": 0.0019942904156546163, + "loss": 0.1621, + "step": 4409 + }, + { + "epoch": 0.038280917700367185, + "grad_norm": 0.6015625, + "learning_rate": 0.0019942870680178825, + "loss": 0.2031, + "step": 4410 + }, + { + "epoch": 0.03828959818057135, + "grad_norm": 0.08447265625, + "learning_rate": 0.0019942837194031698, + "loss": 0.1504, + "step": 4411 + }, + { + "epoch": 0.038298278660775516, + "grad_norm": 0.09716796875, + "learning_rate": 0.0019942803698104815, + "loss": 0.1562, + "step": 4412 + }, + { + "epoch": 0.03830695914097968, + "grad_norm": 0.28515625, + "learning_rate": 0.001994277019239822, + "loss": 0.1826, + "step": 4413 + }, + { + "epoch": 0.038315639621183846, + "grad_norm": 0.1962890625, + "learning_rate": 0.001994273667691194, + "loss": 0.2129, + "step": 4414 + }, + { + "epoch": 0.03832432010138801, + "grad_norm": 0.1708984375, + "learning_rate": 0.0019942703151646026, + "loss": 0.1641, + "step": 4415 + }, + { + "epoch": 0.038333000581592176, + "grad_norm": 0.17578125, + "learning_rate": 0.00199426696166005, + "loss": 0.1631, + "step": 4416 + }, + { + "epoch": 0.03834168106179634, + "grad_norm": 0.392578125, + "learning_rate": 0.0019942636071775405, + "loss": 0.1836, + "step": 4417 + }, + { + "epoch": 0.038350361542000506, + "grad_norm": 0.283203125, + "learning_rate": 0.001994260251717078, + "loss": 0.1475, + "step": 4418 + }, + { + "epoch": 0.03835904202220467, + "grad_norm": 0.294921875, + "learning_rate": 0.001994256895278666, + "loss": 0.1699, + "step": 4419 + }, + { + "epoch": 
0.03836772250240884, + "grad_norm": 0.1611328125, + "learning_rate": 0.0019942535378623077, + "loss": 0.2207, + "step": 4420 + }, + { + "epoch": 0.038376402982612995, + "grad_norm": 0.19140625, + "learning_rate": 0.0019942501794680077, + "loss": 0.1465, + "step": 4421 + }, + { + "epoch": 0.03838508346281716, + "grad_norm": 0.21484375, + "learning_rate": 0.001994246820095769, + "loss": 0.1621, + "step": 4422 + }, + { + "epoch": 0.038393763943021325, + "grad_norm": 0.126953125, + "learning_rate": 0.001994243459745595, + "loss": 0.1699, + "step": 4423 + }, + { + "epoch": 0.03840244442322549, + "grad_norm": 0.76953125, + "learning_rate": 0.00199424009841749, + "loss": 0.1719, + "step": 4424 + }, + { + "epoch": 0.038411124903429655, + "grad_norm": 0.12109375, + "learning_rate": 0.0019942367361114577, + "loss": 0.1758, + "step": 4425 + }, + { + "epoch": 0.03841980538363382, + "grad_norm": 0.1103515625, + "learning_rate": 0.0019942333728275013, + "loss": 0.1562, + "step": 4426 + }, + { + "epoch": 0.038428485863837986, + "grad_norm": 0.0791015625, + "learning_rate": 0.001994230008565625, + "loss": 0.2012, + "step": 4427 + }, + { + "epoch": 0.03843716634404215, + "grad_norm": 0.173828125, + "learning_rate": 0.001994226643325832, + "loss": 0.1406, + "step": 4428 + }, + { + "epoch": 0.038445846824246316, + "grad_norm": 0.2470703125, + "learning_rate": 0.0019942232771081258, + "loss": 0.1738, + "step": 4429 + }, + { + "epoch": 0.03845452730445048, + "grad_norm": 0.2333984375, + "learning_rate": 0.001994219909912511, + "loss": 0.1553, + "step": 4430 + }, + { + "epoch": 0.038463207784654646, + "grad_norm": 0.326171875, + "learning_rate": 0.0019942165417389905, + "loss": 0.2227, + "step": 4431 + }, + { + "epoch": 0.03847188826485881, + "grad_norm": 0.376953125, + "learning_rate": 0.0019942131725875683, + "loss": 0.1924, + "step": 4432 + }, + { + "epoch": 0.038480568745062976, + "grad_norm": 0.365234375, + "learning_rate": 0.001994209802458248, + "loss": 0.1807, + "step": 4433 + }, + { + "epoch": 0.03848924922526714, + "grad_norm": 0.1806640625, + "learning_rate": 0.001994206431351033, + "loss": 0.1592, + "step": 4434 + }, + { + "epoch": 0.03849792970547131, + "grad_norm": 0.1044921875, + "learning_rate": 0.0019942030592659276, + "loss": 0.1738, + "step": 4435 + }, + { + "epoch": 0.03850661018567547, + "grad_norm": 0.185546875, + "learning_rate": 0.0019941996862029355, + "loss": 0.165, + "step": 4436 + }, + { + "epoch": 0.03851529066587964, + "grad_norm": 0.09228515625, + "learning_rate": 0.00199419631216206, + "loss": 0.1514, + "step": 4437 + }, + { + "epoch": 0.0385239711460838, + "grad_norm": 0.41796875, + "learning_rate": 0.001994192937143304, + "loss": 0.1533, + "step": 4438 + }, + { + "epoch": 0.03853265162628797, + "grad_norm": 0.296875, + "learning_rate": 0.001994189561146673, + "loss": 0.1367, + "step": 4439 + }, + { + "epoch": 0.03854133210649213, + "grad_norm": 0.318359375, + "learning_rate": 0.001994186184172169, + "loss": 0.1426, + "step": 4440 + }, + { + "epoch": 0.0385500125866963, + "grad_norm": 0.12451171875, + "learning_rate": 0.001994182806219797, + "loss": 0.1455, + "step": 4441 + }, + { + "epoch": 0.03855869306690046, + "grad_norm": 0.38671875, + "learning_rate": 0.0019941794272895596, + "loss": 0.1484, + "step": 4442 + }, + { + "epoch": 0.03856737354710463, + "grad_norm": 0.111328125, + "learning_rate": 0.0019941760473814614, + "loss": 0.1562, + "step": 4443 + }, + { + "epoch": 0.03857605402730879, + "grad_norm": 0.2265625, + "learning_rate": 0.0019941726664955057, + "loss": 0.1719, 
+ "step": 4444 + }, + { + "epoch": 0.03858473450751296, + "grad_norm": 0.10546875, + "learning_rate": 0.0019941692846316963, + "loss": 0.168, + "step": 4445 + }, + { + "epoch": 0.03859341498771712, + "grad_norm": 0.5703125, + "learning_rate": 0.0019941659017900363, + "loss": 0.1504, + "step": 4446 + }, + { + "epoch": 0.03860209546792129, + "grad_norm": 0.267578125, + "learning_rate": 0.0019941625179705305, + "loss": 0.1562, + "step": 4447 + }, + { + "epoch": 0.03861077594812545, + "grad_norm": 0.33203125, + "learning_rate": 0.0019941591331731814, + "loss": 0.1973, + "step": 4448 + }, + { + "epoch": 0.03861945642832962, + "grad_norm": 0.11328125, + "learning_rate": 0.001994155747397994, + "loss": 0.1621, + "step": 4449 + }, + { + "epoch": 0.03862813690853378, + "grad_norm": 0.2470703125, + "learning_rate": 0.0019941523606449704, + "loss": 0.166, + "step": 4450 + }, + { + "epoch": 0.03863681738873794, + "grad_norm": 0.146484375, + "learning_rate": 0.001994148972914116, + "loss": 0.1348, + "step": 4451 + }, + { + "epoch": 0.03864549786894211, + "grad_norm": 0.10302734375, + "learning_rate": 0.0019941455842054333, + "loss": 0.2051, + "step": 4452 + }, + { + "epoch": 0.03865417834914627, + "grad_norm": 0.212890625, + "learning_rate": 0.0019941421945189265, + "loss": 0.1504, + "step": 4453 + }, + { + "epoch": 0.03866285882935044, + "grad_norm": 0.43359375, + "learning_rate": 0.0019941388038545995, + "loss": 0.1729, + "step": 4454 + }, + { + "epoch": 0.0386715393095546, + "grad_norm": 0.10400390625, + "learning_rate": 0.0019941354122124553, + "loss": 0.1465, + "step": 4455 + }, + { + "epoch": 0.03868021978975877, + "grad_norm": 0.447265625, + "learning_rate": 0.0019941320195924982, + "loss": 0.1699, + "step": 4456 + }, + { + "epoch": 0.03868890026996293, + "grad_norm": 0.1962890625, + "learning_rate": 0.001994128625994732, + "loss": 0.1533, + "step": 4457 + }, + { + "epoch": 0.0386975807501671, + "grad_norm": 0.12255859375, + "learning_rate": 0.00199412523141916, + "loss": 0.1357, + "step": 4458 + }, + { + "epoch": 0.03870626123037126, + "grad_norm": 0.71484375, + "learning_rate": 0.001994121835865786, + "loss": 0.1738, + "step": 4459 + }, + { + "epoch": 0.03871494171057543, + "grad_norm": 0.0947265625, + "learning_rate": 0.0019941184393346143, + "loss": 0.1416, + "step": 4460 + }, + { + "epoch": 0.03872362219077959, + "grad_norm": 0.455078125, + "learning_rate": 0.0019941150418256474, + "loss": 0.1543, + "step": 4461 + }, + { + "epoch": 0.03873230267098376, + "grad_norm": 0.359375, + "learning_rate": 0.00199411164333889, + "loss": 0.1797, + "step": 4462 + }, + { + "epoch": 0.03874098315118792, + "grad_norm": 0.1494140625, + "learning_rate": 0.0019941082438743457, + "loss": 0.1748, + "step": 4463 + }, + { + "epoch": 0.03874966363139209, + "grad_norm": 0.353515625, + "learning_rate": 0.001994104843432018, + "loss": 0.1934, + "step": 4464 + }, + { + "epoch": 0.03875834411159625, + "grad_norm": 0.34765625, + "learning_rate": 0.0019941014420119104, + "loss": 0.2021, + "step": 4465 + }, + { + "epoch": 0.03876702459180042, + "grad_norm": 0.8046875, + "learning_rate": 0.0019940980396140275, + "loss": 0.1367, + "step": 4466 + }, + { + "epoch": 0.03877570507200458, + "grad_norm": 0.330078125, + "learning_rate": 0.001994094636238372, + "loss": 0.208, + "step": 4467 + }, + { + "epoch": 0.03878438555220875, + "grad_norm": 0.10009765625, + "learning_rate": 0.001994091231884948, + "loss": 0.1055, + "step": 4468 + }, + { + "epoch": 0.038793066032412914, + "grad_norm": 0.31640625, + "learning_rate": 
0.0019940878265537593, + "loss": 0.1719, + "step": 4469 + }, + { + "epoch": 0.03880174651261708, + "grad_norm": 0.216796875, + "learning_rate": 0.0019940844202448096, + "loss": 0.1934, + "step": 4470 + }, + { + "epoch": 0.038810426992821244, + "grad_norm": 0.1904296875, + "learning_rate": 0.0019940810129581025, + "loss": 0.1797, + "step": 4471 + }, + { + "epoch": 0.03881910747302541, + "grad_norm": 0.08447265625, + "learning_rate": 0.001994077604693642, + "loss": 0.1377, + "step": 4472 + }, + { + "epoch": 0.038827787953229574, + "grad_norm": 0.2216796875, + "learning_rate": 0.0019940741954514317, + "loss": 0.1875, + "step": 4473 + }, + { + "epoch": 0.03883646843343374, + "grad_norm": 0.09033203125, + "learning_rate": 0.001994070785231475, + "loss": 0.1309, + "step": 4474 + }, + { + "epoch": 0.038845148913637904, + "grad_norm": 0.193359375, + "learning_rate": 0.001994067374033776, + "loss": 0.1396, + "step": 4475 + }, + { + "epoch": 0.03885382939384207, + "grad_norm": 0.5078125, + "learning_rate": 0.001994063961858339, + "loss": 0.1602, + "step": 4476 + }, + { + "epoch": 0.038862509874046235, + "grad_norm": 0.162109375, + "learning_rate": 0.0019940605487051666, + "loss": 0.1904, + "step": 4477 + }, + { + "epoch": 0.0388711903542504, + "grad_norm": 0.08154296875, + "learning_rate": 0.001994057134574263, + "loss": 0.1406, + "step": 4478 + }, + { + "epoch": 0.038879870834454565, + "grad_norm": 0.20703125, + "learning_rate": 0.0019940537194656316, + "loss": 0.1514, + "step": 4479 + }, + { + "epoch": 0.03888855131465873, + "grad_norm": 0.5, + "learning_rate": 0.0019940503033792772, + "loss": 0.2051, + "step": 4480 + }, + { + "epoch": 0.038897231794862895, + "grad_norm": 0.263671875, + "learning_rate": 0.001994046886315202, + "loss": 0.2266, + "step": 4481 + }, + { + "epoch": 0.03890591227506705, + "grad_norm": 0.111328125, + "learning_rate": 0.0019940434682734114, + "loss": 0.1465, + "step": 4482 + }, + { + "epoch": 0.03891459275527122, + "grad_norm": 0.25, + "learning_rate": 0.001994040049253908, + "loss": 0.1572, + "step": 4483 + }, + { + "epoch": 0.038923273235475384, + "grad_norm": 0.333984375, + "learning_rate": 0.0019940366292566956, + "loss": 0.2012, + "step": 4484 + }, + { + "epoch": 0.03893195371567955, + "grad_norm": 0.470703125, + "learning_rate": 0.001994033208281778, + "loss": 0.1953, + "step": 4485 + }, + { + "epoch": 0.038940634195883714, + "grad_norm": 0.24609375, + "learning_rate": 0.00199402978632916, + "loss": 0.165, + "step": 4486 + }, + { + "epoch": 0.03894931467608788, + "grad_norm": 0.3359375, + "learning_rate": 0.0019940263633988434, + "loss": 0.1934, + "step": 4487 + }, + { + "epoch": 0.038957995156292044, + "grad_norm": 0.66015625, + "learning_rate": 0.0019940229394908335, + "loss": 0.1787, + "step": 4488 + }, + { + "epoch": 0.03896667563649621, + "grad_norm": 0.11279296875, + "learning_rate": 0.0019940195146051338, + "loss": 0.1553, + "step": 4489 + }, + { + "epoch": 0.038975356116700374, + "grad_norm": 0.091796875, + "learning_rate": 0.0019940160887417474, + "loss": 0.1455, + "step": 4490 + }, + { + "epoch": 0.03898403659690454, + "grad_norm": 0.07080078125, + "learning_rate": 0.0019940126619006787, + "loss": 0.1094, + "step": 4491 + }, + { + "epoch": 0.038992717077108704, + "grad_norm": 0.515625, + "learning_rate": 0.001994009234081931, + "loss": 0.1172, + "step": 4492 + }, + { + "epoch": 0.03900139755731287, + "grad_norm": 0.376953125, + "learning_rate": 0.001994005805285508, + "loss": 0.1768, + "step": 4493 + }, + { + "epoch": 0.039010078037517035, + "grad_norm": 
0.287109375, + "learning_rate": 0.0019940023755114144, + "loss": 0.1436, + "step": 4494 + }, + { + "epoch": 0.0390187585177212, + "grad_norm": 0.3125, + "learning_rate": 0.0019939989447596528, + "loss": 0.1553, + "step": 4495 + }, + { + "epoch": 0.039027438997925365, + "grad_norm": 0.6484375, + "learning_rate": 0.0019939955130302274, + "loss": 0.1484, + "step": 4496 + }, + { + "epoch": 0.03903611947812953, + "grad_norm": 0.49609375, + "learning_rate": 0.001993992080323142, + "loss": 0.1807, + "step": 4497 + }, + { + "epoch": 0.039044799958333695, + "grad_norm": 0.375, + "learning_rate": 0.0019939886466384, + "loss": 0.1221, + "step": 4498 + }, + { + "epoch": 0.03905348043853786, + "grad_norm": 0.443359375, + "learning_rate": 0.001993985211976006, + "loss": 0.1543, + "step": 4499 + }, + { + "epoch": 0.039062160918742025, + "grad_norm": 0.130859375, + "learning_rate": 0.0019939817763359622, + "loss": 0.1533, + "step": 4500 + }, + { + "epoch": 0.03907084139894619, + "grad_norm": 0.1884765625, + "learning_rate": 0.0019939783397182743, + "loss": 0.1328, + "step": 4501 + }, + { + "epoch": 0.039079521879150356, + "grad_norm": 0.326171875, + "learning_rate": 0.001993974902122945, + "loss": 0.1855, + "step": 4502 + }, + { + "epoch": 0.03908820235935452, + "grad_norm": 0.1591796875, + "learning_rate": 0.001993971463549978, + "loss": 0.1377, + "step": 4503 + }, + { + "epoch": 0.039096882839558686, + "grad_norm": 0.10205078125, + "learning_rate": 0.001993968023999377, + "loss": 0.1514, + "step": 4504 + }, + { + "epoch": 0.03910556331976285, + "grad_norm": 0.2001953125, + "learning_rate": 0.001993964583471147, + "loss": 0.1523, + "step": 4505 + }, + { + "epoch": 0.039114243799967016, + "grad_norm": 0.09375, + "learning_rate": 0.0019939611419652896, + "loss": 0.1602, + "step": 4506 + }, + { + "epoch": 0.03912292428017118, + "grad_norm": 0.1923828125, + "learning_rate": 0.00199395769948181, + "loss": 0.1494, + "step": 4507 + }, + { + "epoch": 0.039131604760375346, + "grad_norm": 0.0771484375, + "learning_rate": 0.001993954256020712, + "loss": 0.1406, + "step": 4508 + }, + { + "epoch": 0.03914028524057951, + "grad_norm": 0.30078125, + "learning_rate": 0.001993950811581999, + "loss": 0.1562, + "step": 4509 + }, + { + "epoch": 0.03914896572078368, + "grad_norm": 0.162109375, + "learning_rate": 0.0019939473661656744, + "loss": 0.2012, + "step": 4510 + }, + { + "epoch": 0.03915764620098784, + "grad_norm": 0.357421875, + "learning_rate": 0.001993943919771743, + "loss": 0.165, + "step": 4511 + }, + { + "epoch": 0.03916632668119201, + "grad_norm": 0.3515625, + "learning_rate": 0.0019939404724002075, + "loss": 0.1836, + "step": 4512 + }, + { + "epoch": 0.039175007161396165, + "grad_norm": 0.07421875, + "learning_rate": 0.0019939370240510726, + "loss": 0.168, + "step": 4513 + }, + { + "epoch": 0.03918368764160033, + "grad_norm": 0.10546875, + "learning_rate": 0.0019939335747243413, + "loss": 0.1895, + "step": 4514 + }, + { + "epoch": 0.039192368121804495, + "grad_norm": 0.0859375, + "learning_rate": 0.001993930124420018, + "loss": 0.1758, + "step": 4515 + }, + { + "epoch": 0.03920104860200866, + "grad_norm": 0.11767578125, + "learning_rate": 0.001993926673138105, + "loss": 0.1826, + "step": 4516 + }, + { + "epoch": 0.039209729082212826, + "grad_norm": 0.0849609375, + "learning_rate": 0.0019939232208786087, + "loss": 0.1172, + "step": 4517 + }, + { + "epoch": 0.03921840956241699, + "grad_norm": 0.26171875, + "learning_rate": 0.0019939197676415304, + "loss": 0.1562, + "step": 4518 + }, + { + "epoch": 
0.039227090042621156, + "grad_norm": 0.1337890625, + "learning_rate": 0.0019939163134268753, + "loss": 0.1592, + "step": 4519 + }, + { + "epoch": 0.03923577052282532, + "grad_norm": 0.09375, + "learning_rate": 0.001993912858234647, + "loss": 0.1758, + "step": 4520 + }, + { + "epoch": 0.039244451003029486, + "grad_norm": 0.296875, + "learning_rate": 0.0019939094020648487, + "loss": 0.1748, + "step": 4521 + }, + { + "epoch": 0.03925313148323365, + "grad_norm": 0.0908203125, + "learning_rate": 0.0019939059449174843, + "loss": 0.1377, + "step": 4522 + }, + { + "epoch": 0.039261811963437816, + "grad_norm": 0.23828125, + "learning_rate": 0.001993902486792558, + "loss": 0.2363, + "step": 4523 + }, + { + "epoch": 0.03927049244364198, + "grad_norm": 0.0947265625, + "learning_rate": 0.0019938990276900737, + "loss": 0.1797, + "step": 4524 + }, + { + "epoch": 0.039279172923846147, + "grad_norm": 0.640625, + "learning_rate": 0.0019938955676100344, + "loss": 0.1406, + "step": 4525 + }, + { + "epoch": 0.03928785340405031, + "grad_norm": 0.298828125, + "learning_rate": 0.0019938921065524445, + "loss": 0.2266, + "step": 4526 + }, + { + "epoch": 0.03929653388425448, + "grad_norm": 0.201171875, + "learning_rate": 0.0019938886445173077, + "loss": 0.1543, + "step": 4527 + }, + { + "epoch": 0.03930521436445864, + "grad_norm": 0.130859375, + "learning_rate": 0.0019938851815046277, + "loss": 0.1582, + "step": 4528 + }, + { + "epoch": 0.03931389484466281, + "grad_norm": 0.443359375, + "learning_rate": 0.001993881717514408, + "loss": 0.1514, + "step": 4529 + }, + { + "epoch": 0.03932257532486697, + "grad_norm": 0.49609375, + "learning_rate": 0.0019938782525466526, + "loss": 0.1904, + "step": 4530 + }, + { + "epoch": 0.03933125580507114, + "grad_norm": 0.375, + "learning_rate": 0.001993874786601366, + "loss": 0.1855, + "step": 4531 + }, + { + "epoch": 0.0393399362852753, + "grad_norm": 0.447265625, + "learning_rate": 0.0019938713196785514, + "loss": 0.1475, + "step": 4532 + }, + { + "epoch": 0.03934861676547947, + "grad_norm": 0.3359375, + "learning_rate": 0.0019938678517782116, + "loss": 0.2539, + "step": 4533 + }, + { + "epoch": 0.03935729724568363, + "grad_norm": 0.6171875, + "learning_rate": 0.001993864382900352, + "loss": 0.1318, + "step": 4534 + }, + { + "epoch": 0.0393659777258878, + "grad_norm": 0.390625, + "learning_rate": 0.0019938609130449756, + "loss": 0.1992, + "step": 4535 + }, + { + "epoch": 0.03937465820609196, + "grad_norm": 0.1806640625, + "learning_rate": 0.0019938574422120867, + "loss": 0.2148, + "step": 4536 + }, + { + "epoch": 0.03938333868629613, + "grad_norm": 0.294921875, + "learning_rate": 0.001993853970401688, + "loss": 0.105, + "step": 4537 + }, + { + "epoch": 0.03939201916650029, + "grad_norm": 0.1416015625, + "learning_rate": 0.001993850497613785, + "loss": 0.2227, + "step": 4538 + }, + { + "epoch": 0.03940069964670446, + "grad_norm": 0.13671875, + "learning_rate": 0.00199384702384838, + "loss": 0.1533, + "step": 4539 + }, + { + "epoch": 0.03940938012690862, + "grad_norm": 0.0732421875, + "learning_rate": 0.001993843549105477, + "loss": 0.1387, + "step": 4540 + }, + { + "epoch": 0.03941806060711279, + "grad_norm": 0.06396484375, + "learning_rate": 0.0019938400733850808, + "loss": 0.1348, + "step": 4541 + }, + { + "epoch": 0.039426741087316954, + "grad_norm": 0.404296875, + "learning_rate": 0.0019938365966871937, + "loss": 0.1738, + "step": 4542 + }, + { + "epoch": 0.03943542156752112, + "grad_norm": 0.19921875, + "learning_rate": 0.001993833119011821, + "loss": 0.1426, + "step": 4543 + 
}, + { + "epoch": 0.03944410204772528, + "grad_norm": 0.263671875, + "learning_rate": 0.0019938296403589654, + "loss": 0.1865, + "step": 4544 + }, + { + "epoch": 0.03945278252792944, + "grad_norm": 0.490234375, + "learning_rate": 0.0019938261607286316, + "loss": 0.2148, + "step": 4545 + }, + { + "epoch": 0.03946146300813361, + "grad_norm": 1.09375, + "learning_rate": 0.0019938226801208226, + "loss": 0.1582, + "step": 4546 + }, + { + "epoch": 0.03947014348833777, + "grad_norm": 0.09326171875, + "learning_rate": 0.0019938191985355426, + "loss": 0.1406, + "step": 4547 + }, + { + "epoch": 0.03947882396854194, + "grad_norm": 0.169921875, + "learning_rate": 0.0019938157159727953, + "loss": 0.168, + "step": 4548 + }, + { + "epoch": 0.0394875044487461, + "grad_norm": 0.392578125, + "learning_rate": 0.001993812232432585, + "loss": 0.1484, + "step": 4549 + }, + { + "epoch": 0.03949618492895027, + "grad_norm": 0.4921875, + "learning_rate": 0.0019938087479149146, + "loss": 0.1436, + "step": 4550 + }, + { + "epoch": 0.03950486540915443, + "grad_norm": 0.578125, + "learning_rate": 0.0019938052624197886, + "loss": 0.1221, + "step": 4551 + }, + { + "epoch": 0.0395135458893586, + "grad_norm": 0.79296875, + "learning_rate": 0.0019938017759472105, + "loss": 0.1826, + "step": 4552 + }, + { + "epoch": 0.03952222636956276, + "grad_norm": 0.09130859375, + "learning_rate": 0.0019937982884971842, + "loss": 0.1621, + "step": 4553 + }, + { + "epoch": 0.03953090684976693, + "grad_norm": 0.10205078125, + "learning_rate": 0.0019937948000697133, + "loss": 0.1992, + "step": 4554 + }, + { + "epoch": 0.03953958732997109, + "grad_norm": 0.1533203125, + "learning_rate": 0.0019937913106648024, + "loss": 0.1338, + "step": 4555 + }, + { + "epoch": 0.03954826781017526, + "grad_norm": 0.53125, + "learning_rate": 0.0019937878202824543, + "loss": 0.1699, + "step": 4556 + }, + { + "epoch": 0.03955694829037942, + "grad_norm": 0.5625, + "learning_rate": 0.0019937843289226736, + "loss": 0.2188, + "step": 4557 + }, + { + "epoch": 0.03956562877058359, + "grad_norm": 0.208984375, + "learning_rate": 0.0019937808365854633, + "loss": 0.167, + "step": 4558 + }, + { + "epoch": 0.039574309250787754, + "grad_norm": 0.068359375, + "learning_rate": 0.001993777343270828, + "loss": 0.1426, + "step": 4559 + }, + { + "epoch": 0.03958298973099192, + "grad_norm": 0.2734375, + "learning_rate": 0.0019937738489787713, + "loss": 0.168, + "step": 4560 + }, + { + "epoch": 0.039591670211196084, + "grad_norm": 0.251953125, + "learning_rate": 0.001993770353709297, + "loss": 0.1758, + "step": 4561 + }, + { + "epoch": 0.03960035069140025, + "grad_norm": 0.14453125, + "learning_rate": 0.0019937668574624085, + "loss": 0.1895, + "step": 4562 + }, + { + "epoch": 0.039609031171604414, + "grad_norm": 1.1953125, + "learning_rate": 0.0019937633602381106, + "loss": 0.3926, + "step": 4563 + }, + { + "epoch": 0.03961771165180858, + "grad_norm": 0.89453125, + "learning_rate": 0.001993759862036406, + "loss": 0.1543, + "step": 4564 + }, + { + "epoch": 0.039626392132012744, + "grad_norm": 0.765625, + "learning_rate": 0.001993756362857299, + "loss": 0.1494, + "step": 4565 + }, + { + "epoch": 0.03963507261221691, + "grad_norm": 0.2197265625, + "learning_rate": 0.0019937528627007937, + "loss": 0.1943, + "step": 4566 + }, + { + "epoch": 0.039643753092421075, + "grad_norm": 0.28125, + "learning_rate": 0.001993749361566894, + "loss": 0.1777, + "step": 4567 + }, + { + "epoch": 0.03965243357262524, + "grad_norm": 0.220703125, + "learning_rate": 0.001993745859455603, + "loss": 0.1523, + 
"step": 4568 + }, + { + "epoch": 0.039661114052829405, + "grad_norm": 0.35546875, + "learning_rate": 0.001993742356366925, + "loss": 0.1533, + "step": 4569 + }, + { + "epoch": 0.03966979453303357, + "grad_norm": 0.65234375, + "learning_rate": 0.0019937388523008637, + "loss": 0.1895, + "step": 4570 + }, + { + "epoch": 0.039678475013237735, + "grad_norm": 0.259765625, + "learning_rate": 0.0019937353472574233, + "loss": 0.1328, + "step": 4571 + }, + { + "epoch": 0.0396871554934419, + "grad_norm": 0.3984375, + "learning_rate": 0.001993731841236607, + "loss": 0.1631, + "step": 4572 + }, + { + "epoch": 0.039695835973646065, + "grad_norm": 0.220703125, + "learning_rate": 0.0019937283342384192, + "loss": 0.1348, + "step": 4573 + }, + { + "epoch": 0.03970451645385023, + "grad_norm": 0.123046875, + "learning_rate": 0.0019937248262628634, + "loss": 0.1484, + "step": 4574 + }, + { + "epoch": 0.03971319693405439, + "grad_norm": 0.0849609375, + "learning_rate": 0.001993721317309944, + "loss": 0.1621, + "step": 4575 + }, + { + "epoch": 0.039721877414258554, + "grad_norm": 0.2890625, + "learning_rate": 0.001993717807379664, + "loss": 0.1973, + "step": 4576 + }, + { + "epoch": 0.03973055789446272, + "grad_norm": 0.23046875, + "learning_rate": 0.0019937142964720276, + "loss": 0.1357, + "step": 4577 + }, + { + "epoch": 0.039739238374666884, + "grad_norm": 0.126953125, + "learning_rate": 0.0019937107845870387, + "loss": 0.1309, + "step": 4578 + }, + { + "epoch": 0.03974791885487105, + "grad_norm": 0.1904296875, + "learning_rate": 0.0019937072717247013, + "loss": 0.1777, + "step": 4579 + }, + { + "epoch": 0.039756599335075214, + "grad_norm": 0.462890625, + "learning_rate": 0.001993703757885019, + "loss": 0.2012, + "step": 4580 + }, + { + "epoch": 0.03976527981527938, + "grad_norm": 0.337890625, + "learning_rate": 0.0019937002430679956, + "loss": 0.1953, + "step": 4581 + }, + { + "epoch": 0.039773960295483544, + "grad_norm": 0.275390625, + "learning_rate": 0.001993696727273635, + "loss": 0.1533, + "step": 4582 + }, + { + "epoch": 0.03978264077568771, + "grad_norm": 0.099609375, + "learning_rate": 0.0019936932105019413, + "loss": 0.1758, + "step": 4583 + }, + { + "epoch": 0.039791321255891875, + "grad_norm": 0.15234375, + "learning_rate": 0.001993689692752918, + "loss": 0.1924, + "step": 4584 + }, + { + "epoch": 0.03980000173609604, + "grad_norm": 0.1943359375, + "learning_rate": 0.001993686174026569, + "loss": 0.1426, + "step": 4585 + }, + { + "epoch": 0.039808682216300205, + "grad_norm": 0.90234375, + "learning_rate": 0.0019936826543228985, + "loss": 0.1963, + "step": 4586 + }, + { + "epoch": 0.03981736269650437, + "grad_norm": 0.494140625, + "learning_rate": 0.00199367913364191, + "loss": 0.1475, + "step": 4587 + }, + { + "epoch": 0.039826043176708535, + "grad_norm": 0.361328125, + "learning_rate": 0.001993675611983607, + "loss": 0.1602, + "step": 4588 + }, + { + "epoch": 0.0398347236569127, + "grad_norm": 0.09130859375, + "learning_rate": 0.001993672089347994, + "loss": 0.1738, + "step": 4589 + }, + { + "epoch": 0.039843404137116865, + "grad_norm": 0.8359375, + "learning_rate": 0.0019936685657350748, + "loss": 0.209, + "step": 4590 + }, + { + "epoch": 0.03985208461732103, + "grad_norm": 0.51171875, + "learning_rate": 0.001993665041144853, + "loss": 0.1621, + "step": 4591 + }, + { + "epoch": 0.039860765097525196, + "grad_norm": 0.09033203125, + "learning_rate": 0.0019936615155773324, + "loss": 0.1582, + "step": 4592 + }, + { + "epoch": 0.03986944557772936, + "grad_norm": 0.33984375, + "learning_rate": 
0.001993657989032517, + "loss": 0.1641, + "step": 4593 + }, + { + "epoch": 0.039878126057933526, + "grad_norm": 0.11181640625, + "learning_rate": 0.001993654461510411, + "loss": 0.1416, + "step": 4594 + }, + { + "epoch": 0.03988680653813769, + "grad_norm": 0.1689453125, + "learning_rate": 0.001993650933011018, + "loss": 0.1562, + "step": 4595 + }, + { + "epoch": 0.039895487018341856, + "grad_norm": 0.283203125, + "learning_rate": 0.0019936474035343412, + "loss": 0.1514, + "step": 4596 + }, + { + "epoch": 0.03990416749854602, + "grad_norm": 0.095703125, + "learning_rate": 0.0019936438730803856, + "loss": 0.1826, + "step": 4597 + }, + { + "epoch": 0.039912847978750186, + "grad_norm": 0.69921875, + "learning_rate": 0.001993640341649154, + "loss": 0.1719, + "step": 4598 + }, + { + "epoch": 0.03992152845895435, + "grad_norm": 0.1318359375, + "learning_rate": 0.0019936368092406506, + "loss": 0.1895, + "step": 4599 + }, + { + "epoch": 0.03993020893915852, + "grad_norm": 0.1845703125, + "learning_rate": 0.00199363327585488, + "loss": 0.1699, + "step": 4600 + }, + { + "epoch": 0.03993888941936268, + "grad_norm": 0.09033203125, + "learning_rate": 0.001993629741491845, + "loss": 0.1641, + "step": 4601 + }, + { + "epoch": 0.03994756989956685, + "grad_norm": 0.283203125, + "learning_rate": 0.0019936262061515503, + "loss": 0.1543, + "step": 4602 + }, + { + "epoch": 0.03995625037977101, + "grad_norm": 0.77734375, + "learning_rate": 0.001993622669833999, + "loss": 0.1709, + "step": 4603 + }, + { + "epoch": 0.03996493085997518, + "grad_norm": 0.365234375, + "learning_rate": 0.001993619132539196, + "loss": 0.1602, + "step": 4604 + }, + { + "epoch": 0.039973611340179335, + "grad_norm": 0.5, + "learning_rate": 0.001993615594267144, + "loss": 0.1592, + "step": 4605 + }, + { + "epoch": 0.0399822918203835, + "grad_norm": 0.427734375, + "learning_rate": 0.0019936120550178476, + "loss": 0.1426, + "step": 4606 + }, + { + "epoch": 0.039990972300587665, + "grad_norm": 0.1962890625, + "learning_rate": 0.0019936085147913107, + "loss": 0.1895, + "step": 4607 + }, + { + "epoch": 0.03999965278079183, + "grad_norm": 0.08349609375, + "learning_rate": 0.001993604973587537, + "loss": 0.1387, + "step": 4608 + }, + { + "epoch": 0.040008333260995996, + "grad_norm": 0.31640625, + "learning_rate": 0.0019936014314065297, + "loss": 0.2656, + "step": 4609 + }, + { + "epoch": 0.04001701374120016, + "grad_norm": 0.197265625, + "learning_rate": 0.0019935978882482937, + "loss": 0.1562, + "step": 4610 + }, + { + "epoch": 0.040025694221404326, + "grad_norm": 0.87890625, + "learning_rate": 0.0019935943441128324, + "loss": 0.1748, + "step": 4611 + }, + { + "epoch": 0.04003437470160849, + "grad_norm": 0.11962890625, + "learning_rate": 0.0019935907990001494, + "loss": 0.1387, + "step": 4612 + }, + { + "epoch": 0.040043055181812656, + "grad_norm": 0.26171875, + "learning_rate": 0.0019935872529102494, + "loss": 0.166, + "step": 4613 + }, + { + "epoch": 0.04005173566201682, + "grad_norm": 0.416015625, + "learning_rate": 0.001993583705843136, + "loss": 0.1406, + "step": 4614 + }, + { + "epoch": 0.040060416142220986, + "grad_norm": 0.201171875, + "learning_rate": 0.0019935801577988126, + "loss": 0.125, + "step": 4615 + }, + { + "epoch": 0.04006909662242515, + "grad_norm": 0.373046875, + "learning_rate": 0.0019935766087772833, + "loss": 0.1904, + "step": 4616 + }, + { + "epoch": 0.04007777710262932, + "grad_norm": 0.29296875, + "learning_rate": 0.001993573058778552, + "loss": 0.1797, + "step": 4617 + }, + { + "epoch": 0.04008645758283348, + 
"grad_norm": 0.5859375, + "learning_rate": 0.001993569507802623, + "loss": 0.1641, + "step": 4618 + }, + { + "epoch": 0.04009513806303765, + "grad_norm": 0.9609375, + "learning_rate": 0.0019935659558494995, + "loss": 0.1602, + "step": 4619 + }, + { + "epoch": 0.04010381854324181, + "grad_norm": 0.099609375, + "learning_rate": 0.001993562402919186, + "loss": 0.1875, + "step": 4620 + }, + { + "epoch": 0.04011249902344598, + "grad_norm": 0.69921875, + "learning_rate": 0.0019935588490116855, + "loss": 0.168, + "step": 4621 + }, + { + "epoch": 0.04012117950365014, + "grad_norm": 0.248046875, + "learning_rate": 0.001993555294127003, + "loss": 0.1826, + "step": 4622 + }, + { + "epoch": 0.04012985998385431, + "grad_norm": 0.380859375, + "learning_rate": 0.0019935517382651414, + "loss": 0.1729, + "step": 4623 + }, + { + "epoch": 0.04013854046405847, + "grad_norm": 0.19140625, + "learning_rate": 0.0019935481814261054, + "loss": 0.1543, + "step": 4624 + }, + { + "epoch": 0.04014722094426264, + "grad_norm": 0.15234375, + "learning_rate": 0.0019935446236098984, + "loss": 0.1934, + "step": 4625 + }, + { + "epoch": 0.0401559014244668, + "grad_norm": 0.5, + "learning_rate": 0.0019935410648165247, + "loss": 0.1523, + "step": 4626 + }, + { + "epoch": 0.04016458190467097, + "grad_norm": 0.2353515625, + "learning_rate": 0.0019935375050459873, + "loss": 0.1572, + "step": 4627 + }, + { + "epoch": 0.04017326238487513, + "grad_norm": 0.66796875, + "learning_rate": 0.001993533944298291, + "loss": 0.1699, + "step": 4628 + }, + { + "epoch": 0.0401819428650793, + "grad_norm": 0.28125, + "learning_rate": 0.00199353038257344, + "loss": 0.1738, + "step": 4629 + }, + { + "epoch": 0.04019062334528346, + "grad_norm": 0.4453125, + "learning_rate": 0.0019935268198714366, + "loss": 0.2031, + "step": 4630 + }, + { + "epoch": 0.04019930382548763, + "grad_norm": 0.34375, + "learning_rate": 0.0019935232561922867, + "loss": 0.123, + "step": 4631 + }, + { + "epoch": 0.040207984305691793, + "grad_norm": 0.458984375, + "learning_rate": 0.001993519691535992, + "loss": 0.1406, + "step": 4632 + }, + { + "epoch": 0.04021666478589596, + "grad_norm": 0.10888671875, + "learning_rate": 0.0019935161259025586, + "loss": 0.1797, + "step": 4633 + }, + { + "epoch": 0.040225345266100124, + "grad_norm": 0.10498046875, + "learning_rate": 0.0019935125592919893, + "loss": 0.1367, + "step": 4634 + }, + { + "epoch": 0.04023402574630429, + "grad_norm": 0.09033203125, + "learning_rate": 0.001993508991704288, + "loss": 0.165, + "step": 4635 + }, + { + "epoch": 0.04024270622650845, + "grad_norm": 0.09521484375, + "learning_rate": 0.0019935054231394584, + "loss": 0.1797, + "step": 4636 + }, + { + "epoch": 0.04025138670671261, + "grad_norm": 0.43359375, + "learning_rate": 0.0019935018535975047, + "loss": 0.1475, + "step": 4637 + }, + { + "epoch": 0.04026006718691678, + "grad_norm": 0.1650390625, + "learning_rate": 0.001993498283078431, + "loss": 0.1934, + "step": 4638 + }, + { + "epoch": 0.04026874766712094, + "grad_norm": 0.11376953125, + "learning_rate": 0.0019934947115822408, + "loss": 0.1621, + "step": 4639 + }, + { + "epoch": 0.04027742814732511, + "grad_norm": 0.2060546875, + "learning_rate": 0.0019934911391089384, + "loss": 0.168, + "step": 4640 + }, + { + "epoch": 0.04028610862752927, + "grad_norm": 0.75390625, + "learning_rate": 0.0019934875656585273, + "loss": 0.2168, + "step": 4641 + }, + { + "epoch": 0.04029478910773344, + "grad_norm": 0.30078125, + "learning_rate": 0.001993483991231012, + "loss": 0.2188, + "step": 4642 + }, + { + "epoch": 
0.0403034695879376, + "grad_norm": 0.921875, + "learning_rate": 0.0019934804158263956, + "loss": 0.1855, + "step": 4643 + }, + { + "epoch": 0.04031215006814177, + "grad_norm": 0.42578125, + "learning_rate": 0.0019934768394446827, + "loss": 0.2334, + "step": 4644 + }, + { + "epoch": 0.04032083054834593, + "grad_norm": 0.83984375, + "learning_rate": 0.0019934732620858773, + "loss": 0.1328, + "step": 4645 + }, + { + "epoch": 0.0403295110285501, + "grad_norm": 0.29296875, + "learning_rate": 0.0019934696837499823, + "loss": 0.1699, + "step": 4646 + }, + { + "epoch": 0.04033819150875426, + "grad_norm": 0.283203125, + "learning_rate": 0.0019934661044370026, + "loss": 0.1982, + "step": 4647 + }, + { + "epoch": 0.04034687198895843, + "grad_norm": 0.212890625, + "learning_rate": 0.0019934625241469417, + "loss": 0.1602, + "step": 4648 + }, + { + "epoch": 0.040355552469162594, + "grad_norm": 0.265625, + "learning_rate": 0.0019934589428798038, + "loss": 0.1328, + "step": 4649 + }, + { + "epoch": 0.04036423294936676, + "grad_norm": 0.265625, + "learning_rate": 0.0019934553606355924, + "loss": 0.1523, + "step": 4650 + }, + { + "epoch": 0.040372913429570924, + "grad_norm": 0.232421875, + "learning_rate": 0.0019934517774143116, + "loss": 0.1475, + "step": 4651 + }, + { + "epoch": 0.04038159390977509, + "grad_norm": 0.10693359375, + "learning_rate": 0.0019934481932159655, + "loss": 0.1416, + "step": 4652 + }, + { + "epoch": 0.040390274389979254, + "grad_norm": 0.2294921875, + "learning_rate": 0.0019934446080405576, + "loss": 0.1914, + "step": 4653 + }, + { + "epoch": 0.04039895487018342, + "grad_norm": 0.353515625, + "learning_rate": 0.0019934410218880928, + "loss": 0.1602, + "step": 4654 + }, + { + "epoch": 0.040407635350387584, + "grad_norm": 0.451171875, + "learning_rate": 0.0019934374347585736, + "loss": 0.1699, + "step": 4655 + }, + { + "epoch": 0.04041631583059175, + "grad_norm": 0.5546875, + "learning_rate": 0.001993433846652005, + "loss": 0.1592, + "step": 4656 + }, + { + "epoch": 0.040424996310795915, + "grad_norm": 0.77734375, + "learning_rate": 0.0019934302575683903, + "loss": 0.1816, + "step": 4657 + }, + { + "epoch": 0.04043367679100008, + "grad_norm": 0.279296875, + "learning_rate": 0.001993426667507734, + "loss": 0.1504, + "step": 4658 + }, + { + "epoch": 0.040442357271204245, + "grad_norm": 0.359375, + "learning_rate": 0.0019934230764700397, + "loss": 0.1182, + "step": 4659 + }, + { + "epoch": 0.04045103775140841, + "grad_norm": 0.10498046875, + "learning_rate": 0.001993419484455311, + "loss": 0.1445, + "step": 4660 + }, + { + "epoch": 0.040459718231612575, + "grad_norm": 0.39453125, + "learning_rate": 0.001993415891463553, + "loss": 0.1484, + "step": 4661 + }, + { + "epoch": 0.04046839871181674, + "grad_norm": 0.236328125, + "learning_rate": 0.0019934122974947684, + "loss": 0.1396, + "step": 4662 + }, + { + "epoch": 0.040477079192020905, + "grad_norm": 0.271484375, + "learning_rate": 0.001993408702548961, + "loss": 0.1875, + "step": 4663 + }, + { + "epoch": 0.04048575967222507, + "grad_norm": 0.5703125, + "learning_rate": 0.0019934051066261356, + "loss": 0.2188, + "step": 4664 + }, + { + "epoch": 0.040494440152429236, + "grad_norm": 0.353515625, + "learning_rate": 0.001993401509726296, + "loss": 0.1592, + "step": 4665 + }, + { + "epoch": 0.0405031206326334, + "grad_norm": 0.74609375, + "learning_rate": 0.0019933979118494454, + "loss": 0.1426, + "step": 4666 + }, + { + "epoch": 0.04051180111283756, + "grad_norm": 0.314453125, + "learning_rate": 0.001993394312995589, + "loss": 0.1367, + 
"step": 4667 + }, + { + "epoch": 0.040520481593041724, + "grad_norm": 0.4140625, + "learning_rate": 0.0019933907131647294, + "loss": 0.1885, + "step": 4668 + }, + { + "epoch": 0.04052916207324589, + "grad_norm": 0.130859375, + "learning_rate": 0.0019933871123568713, + "loss": 0.1455, + "step": 4669 + }, + { + "epoch": 0.040537842553450054, + "grad_norm": 0.19140625, + "learning_rate": 0.0019933835105720187, + "loss": 0.1611, + "step": 4670 + }, + { + "epoch": 0.04054652303365422, + "grad_norm": 0.3671875, + "learning_rate": 0.001993379907810175, + "loss": 0.2148, + "step": 4671 + }, + { + "epoch": 0.040555203513858384, + "grad_norm": 0.271484375, + "learning_rate": 0.0019933763040713448, + "loss": 0.1768, + "step": 4672 + }, + { + "epoch": 0.04056388399406255, + "grad_norm": 0.423828125, + "learning_rate": 0.0019933726993555316, + "loss": 0.1069, + "step": 4673 + }, + { + "epoch": 0.040572564474266715, + "grad_norm": 0.12255859375, + "learning_rate": 0.0019933690936627395, + "loss": 0.2051, + "step": 4674 + }, + { + "epoch": 0.04058124495447088, + "grad_norm": 0.478515625, + "learning_rate": 0.001993365486992972, + "loss": 0.1787, + "step": 4675 + }, + { + "epoch": 0.040589925434675045, + "grad_norm": 0.490234375, + "learning_rate": 0.0019933618793462338, + "loss": 0.1758, + "step": 4676 + }, + { + "epoch": 0.04059860591487921, + "grad_norm": 0.193359375, + "learning_rate": 0.0019933582707225284, + "loss": 0.1436, + "step": 4677 + }, + { + "epoch": 0.040607286395083375, + "grad_norm": 0.4765625, + "learning_rate": 0.0019933546611218596, + "loss": 0.1426, + "step": 4678 + }, + { + "epoch": 0.04061596687528754, + "grad_norm": 0.19140625, + "learning_rate": 0.0019933510505442315, + "loss": 0.2168, + "step": 4679 + }, + { + "epoch": 0.040624647355491705, + "grad_norm": 0.095703125, + "learning_rate": 0.001993347438989649, + "loss": 0.165, + "step": 4680 + }, + { + "epoch": 0.04063332783569587, + "grad_norm": 0.22265625, + "learning_rate": 0.001993343826458114, + "loss": 0.1104, + "step": 4681 + }, + { + "epoch": 0.040642008315900036, + "grad_norm": 0.76953125, + "learning_rate": 0.0019933402129496324, + "loss": 0.1963, + "step": 4682 + }, + { + "epoch": 0.0406506887961042, + "grad_norm": 0.404296875, + "learning_rate": 0.001993336598464207, + "loss": 0.1777, + "step": 4683 + }, + { + "epoch": 0.040659369276308366, + "grad_norm": 0.30859375, + "learning_rate": 0.0019933329830018423, + "loss": 0.1992, + "step": 4684 + }, + { + "epoch": 0.04066804975651253, + "grad_norm": 0.44921875, + "learning_rate": 0.001993329366562542, + "loss": 0.1553, + "step": 4685 + }, + { + "epoch": 0.040676730236716696, + "grad_norm": 0.09814453125, + "learning_rate": 0.00199332574914631, + "loss": 0.165, + "step": 4686 + }, + { + "epoch": 0.04068541071692086, + "grad_norm": 0.82421875, + "learning_rate": 0.00199332213075315, + "loss": 0.2129, + "step": 4687 + }, + { + "epoch": 0.040694091197125026, + "grad_norm": 0.10205078125, + "learning_rate": 0.0019933185113830674, + "loss": 0.1543, + "step": 4688 + }, + { + "epoch": 0.04070277167732919, + "grad_norm": 0.63671875, + "learning_rate": 0.0019933148910360643, + "loss": 0.1465, + "step": 4689 + }, + { + "epoch": 0.04071145215753336, + "grad_norm": 0.0830078125, + "learning_rate": 0.0019933112697121456, + "loss": 0.1445, + "step": 4690 + }, + { + "epoch": 0.04072013263773752, + "grad_norm": 0.6171875, + "learning_rate": 0.0019933076474113152, + "loss": 0.1973, + "step": 4691 + }, + { + "epoch": 0.04072881311794169, + "grad_norm": 1.1640625, + "learning_rate": 
0.001993304024133577, + "loss": 0.1992, + "step": 4692 + }, + { + "epoch": 0.04073749359814585, + "grad_norm": 0.79296875, + "learning_rate": 0.001993300399878935, + "loss": 0.1553, + "step": 4693 + }, + { + "epoch": 0.04074617407835002, + "grad_norm": 0.59375, + "learning_rate": 0.001993296774647393, + "loss": 0.1699, + "step": 4694 + }, + { + "epoch": 0.04075485455855418, + "grad_norm": 0.578125, + "learning_rate": 0.001993293148438955, + "loss": 0.1602, + "step": 4695 + }, + { + "epoch": 0.04076353503875835, + "grad_norm": 0.0849609375, + "learning_rate": 0.001993289521253625, + "loss": 0.124, + "step": 4696 + }, + { + "epoch": 0.04077221551896251, + "grad_norm": 0.76171875, + "learning_rate": 0.0019932858930914073, + "loss": 0.1895, + "step": 4697 + }, + { + "epoch": 0.04078089599916667, + "grad_norm": 0.65234375, + "learning_rate": 0.001993282263952305, + "loss": 0.1572, + "step": 4698 + }, + { + "epoch": 0.040789576479370836, + "grad_norm": 1.109375, + "learning_rate": 0.0019932786338363235, + "loss": 0.1357, + "step": 4699 + }, + { + "epoch": 0.040798256959575, + "grad_norm": 1.1796875, + "learning_rate": 0.0019932750027434653, + "loss": 0.1758, + "step": 4700 + }, + { + "epoch": 0.040806937439779166, + "grad_norm": 0.578125, + "learning_rate": 0.001993271370673735, + "loss": 0.1445, + "step": 4701 + }, + { + "epoch": 0.04081561791998333, + "grad_norm": 0.62109375, + "learning_rate": 0.001993267737627137, + "loss": 0.1426, + "step": 4702 + }, + { + "epoch": 0.040824298400187496, + "grad_norm": 0.1962890625, + "learning_rate": 0.0019932641036036745, + "loss": 0.1494, + "step": 4703 + }, + { + "epoch": 0.04083297888039166, + "grad_norm": 0.197265625, + "learning_rate": 0.0019932604686033516, + "loss": 0.1777, + "step": 4704 + }, + { + "epoch": 0.040841659360595826, + "grad_norm": 0.474609375, + "learning_rate": 0.0019932568326261725, + "loss": 0.1582, + "step": 4705 + }, + { + "epoch": 0.04085033984079999, + "grad_norm": 0.345703125, + "learning_rate": 0.0019932531956721416, + "loss": 0.1768, + "step": 4706 + }, + { + "epoch": 0.04085902032100416, + "grad_norm": 0.1015625, + "learning_rate": 0.001993249557741262, + "loss": 0.2207, + "step": 4707 + }, + { + "epoch": 0.04086770080120832, + "grad_norm": 0.50390625, + "learning_rate": 0.001993245918833538, + "loss": 0.1387, + "step": 4708 + }, + { + "epoch": 0.04087638128141249, + "grad_norm": 0.11669921875, + "learning_rate": 0.0019932422789489743, + "loss": 0.1738, + "step": 4709 + }, + { + "epoch": 0.04088506176161665, + "grad_norm": 0.1123046875, + "learning_rate": 0.0019932386380875737, + "loss": 0.1562, + "step": 4710 + }, + { + "epoch": 0.04089374224182082, + "grad_norm": 0.1455078125, + "learning_rate": 0.001993234996249341, + "loss": 0.166, + "step": 4711 + }, + { + "epoch": 0.04090242272202498, + "grad_norm": 0.318359375, + "learning_rate": 0.00199323135343428, + "loss": 0.1484, + "step": 4712 + }, + { + "epoch": 0.04091110320222915, + "grad_norm": 0.099609375, + "learning_rate": 0.0019932277096423945, + "loss": 0.1641, + "step": 4713 + }, + { + "epoch": 0.04091978368243331, + "grad_norm": 0.3046875, + "learning_rate": 0.0019932240648736885, + "loss": 0.1641, + "step": 4714 + }, + { + "epoch": 0.04092846416263748, + "grad_norm": 0.111328125, + "learning_rate": 0.001993220419128166, + "loss": 0.1543, + "step": 4715 + }, + { + "epoch": 0.04093714464284164, + "grad_norm": 0.1845703125, + "learning_rate": 0.001993216772405831, + "loss": 0.1689, + "step": 4716 + }, + { + "epoch": 0.04094582512304581, + "grad_norm": 1.703125, + 
"learning_rate": 0.001993213124706688, + "loss": 0.1895, + "step": 4717 + }, + { + "epoch": 0.04095450560324997, + "grad_norm": 0.71484375, + "learning_rate": 0.0019932094760307406, + "loss": 0.1748, + "step": 4718 + }, + { + "epoch": 0.04096318608345414, + "grad_norm": 0.59765625, + "learning_rate": 0.0019932058263779926, + "loss": 0.1367, + "step": 4719 + }, + { + "epoch": 0.0409718665636583, + "grad_norm": 0.12060546875, + "learning_rate": 0.001993202175748448, + "loss": 0.167, + "step": 4720 + }, + { + "epoch": 0.04098054704386247, + "grad_norm": 0.337890625, + "learning_rate": 0.0019931985241421106, + "loss": 0.1562, + "step": 4721 + }, + { + "epoch": 0.04098922752406663, + "grad_norm": 0.310546875, + "learning_rate": 0.001993194871558985, + "loss": 0.1855, + "step": 4722 + }, + { + "epoch": 0.0409979080042708, + "grad_norm": 0.9921875, + "learning_rate": 0.001993191217999075, + "loss": 0.1562, + "step": 4723 + }, + { + "epoch": 0.041006588484474964, + "grad_norm": 0.6015625, + "learning_rate": 0.0019931875634623844, + "loss": 0.1475, + "step": 4724 + }, + { + "epoch": 0.04101526896467913, + "grad_norm": 0.447265625, + "learning_rate": 0.0019931839079489174, + "loss": 0.1504, + "step": 4725 + }, + { + "epoch": 0.041023949444883294, + "grad_norm": 0.06884765625, + "learning_rate": 0.0019931802514586774, + "loss": 0.1196, + "step": 4726 + }, + { + "epoch": 0.04103262992508746, + "grad_norm": 0.07470703125, + "learning_rate": 0.0019931765939916694, + "loss": 0.1475, + "step": 4727 + }, + { + "epoch": 0.041041310405291624, + "grad_norm": 0.404296875, + "learning_rate": 0.001993172935547897, + "loss": 0.1777, + "step": 4728 + }, + { + "epoch": 0.04104999088549578, + "grad_norm": 0.09130859375, + "learning_rate": 0.001993169276127364, + "loss": 0.1387, + "step": 4729 + }, + { + "epoch": 0.04105867136569995, + "grad_norm": 0.10205078125, + "learning_rate": 0.0019931656157300744, + "loss": 0.1455, + "step": 4730 + }, + { + "epoch": 0.04106735184590411, + "grad_norm": 0.3515625, + "learning_rate": 0.0019931619543560324, + "loss": 0.1475, + "step": 4731 + }, + { + "epoch": 0.04107603232610828, + "grad_norm": 0.35546875, + "learning_rate": 0.0019931582920052417, + "loss": 0.1436, + "step": 4732 + }, + { + "epoch": 0.04108471280631244, + "grad_norm": 0.3203125, + "learning_rate": 0.0019931546286777067, + "loss": 0.1475, + "step": 4733 + }, + { + "epoch": 0.04109339328651661, + "grad_norm": 0.40625, + "learning_rate": 0.0019931509643734313, + "loss": 0.1641, + "step": 4734 + }, + { + "epoch": 0.04110207376672077, + "grad_norm": 0.0830078125, + "learning_rate": 0.001993147299092419, + "loss": 0.1367, + "step": 4735 + }, + { + "epoch": 0.04111075424692494, + "grad_norm": 0.09326171875, + "learning_rate": 0.001993143632834675, + "loss": 0.1729, + "step": 4736 + }, + { + "epoch": 0.0411194347271291, + "grad_norm": 0.2265625, + "learning_rate": 0.001993139965600202, + "loss": 0.1875, + "step": 4737 + }, + { + "epoch": 0.04112811520733327, + "grad_norm": 0.34375, + "learning_rate": 0.0019931362973890044, + "loss": 0.1758, + "step": 4738 + }, + { + "epoch": 0.041136795687537434, + "grad_norm": 0.28125, + "learning_rate": 0.0019931326282010865, + "loss": 0.2266, + "step": 4739 + }, + { + "epoch": 0.0411454761677416, + "grad_norm": 0.58984375, + "learning_rate": 0.0019931289580364525, + "loss": 0.1875, + "step": 4740 + }, + { + "epoch": 0.041154156647945764, + "grad_norm": 0.197265625, + "learning_rate": 0.001993125286895106, + "loss": 0.1699, + "step": 4741 + }, + { + "epoch": 0.04116283712814993, + 
"grad_norm": 0.1728515625, + "learning_rate": 0.0019931216147770505, + "loss": 0.1475, + "step": 4742 + }, + { + "epoch": 0.041171517608354094, + "grad_norm": 0.57421875, + "learning_rate": 0.0019931179416822916, + "loss": 0.1631, + "step": 4743 + }, + { + "epoch": 0.04118019808855826, + "grad_norm": 0.3515625, + "learning_rate": 0.0019931142676108318, + "loss": 0.1689, + "step": 4744 + }, + { + "epoch": 0.041188878568762424, + "grad_norm": 0.09423828125, + "learning_rate": 0.0019931105925626753, + "loss": 0.1543, + "step": 4745 + }, + { + "epoch": 0.04119755904896659, + "grad_norm": 0.244140625, + "learning_rate": 0.0019931069165378267, + "loss": 0.1309, + "step": 4746 + }, + { + "epoch": 0.041206239529170754, + "grad_norm": 1.46875, + "learning_rate": 0.00199310323953629, + "loss": 0.3652, + "step": 4747 + }, + { + "epoch": 0.04121492000937492, + "grad_norm": 0.72265625, + "learning_rate": 0.001993099561558069, + "loss": 0.1533, + "step": 4748 + }, + { + "epoch": 0.041223600489579085, + "grad_norm": 0.46875, + "learning_rate": 0.001993095882603168, + "loss": 0.1562, + "step": 4749 + }, + { + "epoch": 0.04123228096978325, + "grad_norm": 0.0888671875, + "learning_rate": 0.00199309220267159, + "loss": 0.1572, + "step": 4750 + }, + { + "epoch": 0.041240961449987415, + "grad_norm": 0.380859375, + "learning_rate": 0.0019930885217633405, + "loss": 0.1826, + "step": 4751 + }, + { + "epoch": 0.04124964193019158, + "grad_norm": 0.515625, + "learning_rate": 0.001993084839878423, + "loss": 0.1426, + "step": 4752 + }, + { + "epoch": 0.041258322410395745, + "grad_norm": 0.828125, + "learning_rate": 0.0019930811570168403, + "loss": 0.1582, + "step": 4753 + }, + { + "epoch": 0.04126700289059991, + "grad_norm": 0.14453125, + "learning_rate": 0.0019930774731785985, + "loss": 0.1729, + "step": 4754 + }, + { + "epoch": 0.041275683370804075, + "grad_norm": 0.34765625, + "learning_rate": 0.0019930737883637, + "loss": 0.1387, + "step": 4755 + }, + { + "epoch": 0.04128436385100824, + "grad_norm": 1.015625, + "learning_rate": 0.0019930701025721496, + "loss": 0.1572, + "step": 4756 + }, + { + "epoch": 0.041293044331212406, + "grad_norm": 0.0888671875, + "learning_rate": 0.001993066415803951, + "loss": 0.1533, + "step": 4757 + }, + { + "epoch": 0.04130172481141657, + "grad_norm": 0.12353515625, + "learning_rate": 0.0019930627280591085, + "loss": 0.1934, + "step": 4758 + }, + { + "epoch": 0.04131040529162073, + "grad_norm": 0.09423828125, + "learning_rate": 0.001993059039337626, + "loss": 0.1826, + "step": 4759 + }, + { + "epoch": 0.041319085771824894, + "grad_norm": 0.173828125, + "learning_rate": 0.001993055349639508, + "loss": 0.1602, + "step": 4760 + }, + { + "epoch": 0.04132776625202906, + "grad_norm": 0.490234375, + "learning_rate": 0.0019930516589647574, + "loss": 0.1719, + "step": 4761 + }, + { + "epoch": 0.041336446732233224, + "grad_norm": 0.162109375, + "learning_rate": 0.001993047967313379, + "loss": 0.127, + "step": 4762 + }, + { + "epoch": 0.04134512721243739, + "grad_norm": 0.1943359375, + "learning_rate": 0.001993044274685377, + "loss": 0.1396, + "step": 4763 + }, + { + "epoch": 0.041353807692641555, + "grad_norm": 0.310546875, + "learning_rate": 0.0019930405810807553, + "loss": 0.1504, + "step": 4764 + }, + { + "epoch": 0.04136248817284572, + "grad_norm": 0.58984375, + "learning_rate": 0.001993036886499518, + "loss": 0.1348, + "step": 4765 + }, + { + "epoch": 0.041371168653049885, + "grad_norm": 0.259765625, + "learning_rate": 0.0019930331909416682, + "loss": 0.1611, + "step": 4766 + }, + { + 
"epoch": 0.04137984913325405, + "grad_norm": 0.453125, + "learning_rate": 0.0019930294944072117, + "loss": 0.1572, + "step": 4767 + }, + { + "epoch": 0.041388529613458215, + "grad_norm": 0.26171875, + "learning_rate": 0.001993025796896151, + "loss": 0.1611, + "step": 4768 + }, + { + "epoch": 0.04139721009366238, + "grad_norm": 0.3828125, + "learning_rate": 0.0019930220984084902, + "loss": 0.1465, + "step": 4769 + }, + { + "epoch": 0.041405890573866545, + "grad_norm": 0.7421875, + "learning_rate": 0.001993018398944235, + "loss": 0.1602, + "step": 4770 + }, + { + "epoch": 0.04141457105407071, + "grad_norm": 0.6953125, + "learning_rate": 0.0019930146985033875, + "loss": 0.1484, + "step": 4771 + }, + { + "epoch": 0.041423251534274876, + "grad_norm": 0.244140625, + "learning_rate": 0.001993010997085953, + "loss": 0.1436, + "step": 4772 + }, + { + "epoch": 0.04143193201447904, + "grad_norm": 0.3359375, + "learning_rate": 0.0019930072946919347, + "loss": 0.1377, + "step": 4773 + }, + { + "epoch": 0.041440612494683206, + "grad_norm": 0.3671875, + "learning_rate": 0.001993003591321337, + "loss": 0.1553, + "step": 4774 + }, + { + "epoch": 0.04144929297488737, + "grad_norm": 0.484375, + "learning_rate": 0.0019929998869741643, + "loss": 0.1572, + "step": 4775 + }, + { + "epoch": 0.041457973455091536, + "grad_norm": 0.09033203125, + "learning_rate": 0.0019929961816504203, + "loss": 0.1719, + "step": 4776 + }, + { + "epoch": 0.0414666539352957, + "grad_norm": 0.46875, + "learning_rate": 0.001992992475350109, + "loss": 0.1367, + "step": 4777 + }, + { + "epoch": 0.041475334415499866, + "grad_norm": 0.26953125, + "learning_rate": 0.0019929887680732346, + "loss": 0.1426, + "step": 4778 + }, + { + "epoch": 0.04148401489570403, + "grad_norm": 0.294921875, + "learning_rate": 0.001992985059819801, + "loss": 0.1338, + "step": 4779 + }, + { + "epoch": 0.041492695375908197, + "grad_norm": 0.23828125, + "learning_rate": 0.0019929813505898124, + "loss": 0.1475, + "step": 4780 + }, + { + "epoch": 0.04150137585611236, + "grad_norm": 0.376953125, + "learning_rate": 0.001992977640383273, + "loss": 0.1465, + "step": 4781 + }, + { + "epoch": 0.04151005633631653, + "grad_norm": 0.17578125, + "learning_rate": 0.0019929739292001863, + "loss": 0.1777, + "step": 4782 + }, + { + "epoch": 0.04151873681652069, + "grad_norm": 0.09912109375, + "learning_rate": 0.0019929702170405567, + "loss": 0.1318, + "step": 4783 + }, + { + "epoch": 0.04152741729672486, + "grad_norm": 0.08837890625, + "learning_rate": 0.001992966503904389, + "loss": 0.1504, + "step": 4784 + }, + { + "epoch": 0.04153609777692902, + "grad_norm": 0.2265625, + "learning_rate": 0.0019929627897916856, + "loss": 0.1855, + "step": 4785 + }, + { + "epoch": 0.04154477825713319, + "grad_norm": 0.244140625, + "learning_rate": 0.001992959074702452, + "loss": 0.1211, + "step": 4786 + }, + { + "epoch": 0.04155345873733735, + "grad_norm": 0.435546875, + "learning_rate": 0.0019929553586366918, + "loss": 0.1807, + "step": 4787 + }, + { + "epoch": 0.04156213921754152, + "grad_norm": 0.12890625, + "learning_rate": 0.0019929516415944093, + "loss": 0.209, + "step": 4788 + }, + { + "epoch": 0.04157081969774568, + "grad_norm": 0.29296875, + "learning_rate": 0.001992947923575608, + "loss": 0.1523, + "step": 4789 + }, + { + "epoch": 0.04157950017794984, + "grad_norm": 0.36328125, + "learning_rate": 0.0019929442045802923, + "loss": 0.1504, + "step": 4790 + }, + { + "epoch": 0.041588180658154006, + "grad_norm": 0.1845703125, + "learning_rate": 0.001992940484608466, + "loss": 0.1182, + 
"step": 4791 + }, + { + "epoch": 0.04159686113835817, + "grad_norm": 0.435546875, + "learning_rate": 0.0019929367636601332, + "loss": 0.2051, + "step": 4792 + }, + { + "epoch": 0.041605541618562336, + "grad_norm": 0.169921875, + "learning_rate": 0.001992933041735299, + "loss": 0.1167, + "step": 4793 + }, + { + "epoch": 0.0416142220987665, + "grad_norm": 0.158203125, + "learning_rate": 0.0019929293188339662, + "loss": 0.1338, + "step": 4794 + }, + { + "epoch": 0.041622902578970666, + "grad_norm": 0.125, + "learning_rate": 0.001992925594956139, + "loss": 0.1445, + "step": 4795 + }, + { + "epoch": 0.04163158305917483, + "grad_norm": 0.1884765625, + "learning_rate": 0.001992921870101822, + "loss": 0.1221, + "step": 4796 + }, + { + "epoch": 0.041640263539379, + "grad_norm": 0.52734375, + "learning_rate": 0.0019929181442710194, + "loss": 0.1758, + "step": 4797 + }, + { + "epoch": 0.04164894401958316, + "grad_norm": 0.39453125, + "learning_rate": 0.0019929144174637347, + "loss": 0.1758, + "step": 4798 + }, + { + "epoch": 0.04165762449978733, + "grad_norm": 0.1298828125, + "learning_rate": 0.001992910689679972, + "loss": 0.1436, + "step": 4799 + }, + { + "epoch": 0.04166630497999149, + "grad_norm": 0.330078125, + "learning_rate": 0.0019929069609197357, + "loss": 0.1855, + "step": 4800 + }, + { + "epoch": 0.04167498546019566, + "grad_norm": 0.12890625, + "learning_rate": 0.00199290323118303, + "loss": 0.1196, + "step": 4801 + }, + { + "epoch": 0.04168366594039982, + "grad_norm": 0.375, + "learning_rate": 0.0019928995004698585, + "loss": 0.1826, + "step": 4802 + }, + { + "epoch": 0.04169234642060399, + "grad_norm": 0.138671875, + "learning_rate": 0.001992895768780226, + "loss": 0.1387, + "step": 4803 + }, + { + "epoch": 0.04170102690080815, + "grad_norm": 0.453125, + "learning_rate": 0.0019928920361141356, + "loss": 0.1514, + "step": 4804 + }, + { + "epoch": 0.04170970738101232, + "grad_norm": 0.1123046875, + "learning_rate": 0.001992888302471592, + "loss": 0.1055, + "step": 4805 + }, + { + "epoch": 0.04171838786121648, + "grad_norm": 0.59765625, + "learning_rate": 0.001992884567852599, + "loss": 0.1543, + "step": 4806 + }, + { + "epoch": 0.04172706834142065, + "grad_norm": 0.1875, + "learning_rate": 0.001992880832257161, + "loss": 0.1826, + "step": 4807 + }, + { + "epoch": 0.04173574882162481, + "grad_norm": 0.32421875, + "learning_rate": 0.001992877095685282, + "loss": 0.1699, + "step": 4808 + }, + { + "epoch": 0.04174442930182898, + "grad_norm": 0.1123046875, + "learning_rate": 0.001992873358136966, + "loss": 0.126, + "step": 4809 + }, + { + "epoch": 0.04175310978203314, + "grad_norm": 0.96875, + "learning_rate": 0.001992869619612217, + "loss": 0.1582, + "step": 4810 + }, + { + "epoch": 0.04176179026223731, + "grad_norm": 0.1787109375, + "learning_rate": 0.0019928658801110395, + "loss": 0.1855, + "step": 4811 + }, + { + "epoch": 0.04177047074244147, + "grad_norm": 0.4140625, + "learning_rate": 0.001992862139633437, + "loss": 0.1465, + "step": 4812 + }, + { + "epoch": 0.04177915122264564, + "grad_norm": 0.33984375, + "learning_rate": 0.001992858398179414, + "loss": 0.1719, + "step": 4813 + }, + { + "epoch": 0.041787831702849804, + "grad_norm": 0.2177734375, + "learning_rate": 0.0019928546557489743, + "loss": 0.1484, + "step": 4814 + }, + { + "epoch": 0.04179651218305397, + "grad_norm": 2.34375, + "learning_rate": 0.0019928509123421224, + "loss": 0.3672, + "step": 4815 + }, + { + "epoch": 0.041805192663258134, + "grad_norm": 0.09326171875, + "learning_rate": 0.001992847167958862, + "loss": 0.1436, 
+ "step": 4816 + }, + { + "epoch": 0.0418138731434623, + "grad_norm": 0.51171875, + "learning_rate": 0.0019928434225991976, + "loss": 0.166, + "step": 4817 + }, + { + "epoch": 0.041822553623666464, + "grad_norm": 0.1708984375, + "learning_rate": 0.001992839676263133, + "loss": 0.2002, + "step": 4818 + }, + { + "epoch": 0.04183123410387063, + "grad_norm": 0.2177734375, + "learning_rate": 0.0019928359289506717, + "loss": 0.126, + "step": 4819 + }, + { + "epoch": 0.041839914584074794, + "grad_norm": 0.404296875, + "learning_rate": 0.001992832180661819, + "loss": 0.1523, + "step": 4820 + }, + { + "epoch": 0.04184859506427895, + "grad_norm": 0.83984375, + "learning_rate": 0.0019928284313965785, + "loss": 0.1895, + "step": 4821 + }, + { + "epoch": 0.04185727554448312, + "grad_norm": 0.2314453125, + "learning_rate": 0.0019928246811549543, + "loss": 0.1084, + "step": 4822 + }, + { + "epoch": 0.04186595602468728, + "grad_norm": 0.43359375, + "learning_rate": 0.00199282092993695, + "loss": 0.1992, + "step": 4823 + }, + { + "epoch": 0.04187463650489145, + "grad_norm": 0.0751953125, + "learning_rate": 0.0019928171777425703, + "loss": 0.1973, + "step": 4824 + }, + { + "epoch": 0.04188331698509561, + "grad_norm": 0.2060546875, + "learning_rate": 0.0019928134245718195, + "loss": 0.1582, + "step": 4825 + }, + { + "epoch": 0.04189199746529978, + "grad_norm": 0.251953125, + "learning_rate": 0.001992809670424701, + "loss": 0.1377, + "step": 4826 + }, + { + "epoch": 0.04190067794550394, + "grad_norm": 0.361328125, + "learning_rate": 0.001992805915301219, + "loss": 0.1367, + "step": 4827 + }, + { + "epoch": 0.04190935842570811, + "grad_norm": 0.54296875, + "learning_rate": 0.0019928021592013783, + "loss": 0.2891, + "step": 4828 + }, + { + "epoch": 0.041918038905912273, + "grad_norm": 0.40625, + "learning_rate": 0.001992798402125183, + "loss": 0.1641, + "step": 4829 + }, + { + "epoch": 0.04192671938611644, + "grad_norm": 0.45703125, + "learning_rate": 0.001992794644072636, + "loss": 0.1738, + "step": 4830 + }, + { + "epoch": 0.041935399866320604, + "grad_norm": 0.1787109375, + "learning_rate": 0.0019927908850437426, + "loss": 0.21, + "step": 4831 + }, + { + "epoch": 0.04194408034652477, + "grad_norm": 0.318359375, + "learning_rate": 0.0019927871250385062, + "loss": 0.1602, + "step": 4832 + }, + { + "epoch": 0.041952760826728934, + "grad_norm": 0.189453125, + "learning_rate": 0.001992783364056932, + "loss": 0.2168, + "step": 4833 + }, + { + "epoch": 0.0419614413069331, + "grad_norm": 0.205078125, + "learning_rate": 0.001992779602099022, + "loss": 0.168, + "step": 4834 + }, + { + "epoch": 0.041970121787137264, + "grad_norm": 0.1484375, + "learning_rate": 0.001992775839164783, + "loss": 0.1172, + "step": 4835 + }, + { + "epoch": 0.04197880226734143, + "grad_norm": 0.201171875, + "learning_rate": 0.0019927720752542168, + "loss": 0.1289, + "step": 4836 + }, + { + "epoch": 0.041987482747545594, + "grad_norm": 0.61328125, + "learning_rate": 0.001992768310367329, + "loss": 0.168, + "step": 4837 + }, + { + "epoch": 0.04199616322774976, + "grad_norm": 0.375, + "learning_rate": 0.0019927645445041225, + "loss": 0.2285, + "step": 4838 + }, + { + "epoch": 0.042004843707953925, + "grad_norm": 0.337890625, + "learning_rate": 0.001992760777664603, + "loss": 0.2041, + "step": 4839 + }, + { + "epoch": 0.04201352418815809, + "grad_norm": 0.146484375, + "learning_rate": 0.0019927570098487732, + "loss": 0.1572, + "step": 4840 + }, + { + "epoch": 0.042022204668362255, + "grad_norm": 0.1455078125, + "learning_rate": 
0.001992753241056638, + "loss": 0.1309, + "step": 4841 + }, + { + "epoch": 0.04203088514856642, + "grad_norm": 0.1318359375, + "learning_rate": 0.001992749471288201, + "loss": 0.1543, + "step": 4842 + }, + { + "epoch": 0.042039565628770585, + "grad_norm": 0.119140625, + "learning_rate": 0.0019927457005434667, + "loss": 0.1445, + "step": 4843 + }, + { + "epoch": 0.04204824610897475, + "grad_norm": 0.63671875, + "learning_rate": 0.0019927419288224392, + "loss": 0.1855, + "step": 4844 + }, + { + "epoch": 0.042056926589178915, + "grad_norm": 0.318359375, + "learning_rate": 0.0019927381561251224, + "loss": 0.2227, + "step": 4845 + }, + { + "epoch": 0.04206560706938308, + "grad_norm": 0.337890625, + "learning_rate": 0.001992734382451521, + "loss": 0.1406, + "step": 4846 + }, + { + "epoch": 0.042074287549587246, + "grad_norm": 0.56640625, + "learning_rate": 0.0019927306078016383, + "loss": 0.2188, + "step": 4847 + }, + { + "epoch": 0.04208296802979141, + "grad_norm": 0.416015625, + "learning_rate": 0.0019927268321754785, + "loss": 0.1523, + "step": 4848 + }, + { + "epoch": 0.042091648509995576, + "grad_norm": 0.341796875, + "learning_rate": 0.0019927230555730467, + "loss": 0.1338, + "step": 4849 + }, + { + "epoch": 0.04210032899019974, + "grad_norm": 0.07470703125, + "learning_rate": 0.001992719277994346, + "loss": 0.1455, + "step": 4850 + }, + { + "epoch": 0.042109009470403906, + "grad_norm": 0.45703125, + "learning_rate": 0.001992715499439381, + "loss": 0.1826, + "step": 4851 + }, + { + "epoch": 0.042117689950608064, + "grad_norm": 0.455078125, + "learning_rate": 0.0019927117199081555, + "loss": 0.2012, + "step": 4852 + }, + { + "epoch": 0.04212637043081223, + "grad_norm": 0.251953125, + "learning_rate": 0.0019927079394006742, + "loss": 0.1602, + "step": 4853 + }, + { + "epoch": 0.042135050911016395, + "grad_norm": 0.13671875, + "learning_rate": 0.001992704157916941, + "loss": 0.1943, + "step": 4854 + }, + { + "epoch": 0.04214373139122056, + "grad_norm": 0.671875, + "learning_rate": 0.00199270037545696, + "loss": 0.1875, + "step": 4855 + }, + { + "epoch": 0.042152411871424725, + "grad_norm": 0.076171875, + "learning_rate": 0.001992696592020735, + "loss": 0.1592, + "step": 4856 + }, + { + "epoch": 0.04216109235162889, + "grad_norm": 0.11767578125, + "learning_rate": 0.0019926928076082705, + "loss": 0.1855, + "step": 4857 + }, + { + "epoch": 0.042169772831833055, + "grad_norm": 0.11376953125, + "learning_rate": 0.0019926890222195705, + "loss": 0.1709, + "step": 4858 + }, + { + "epoch": 0.04217845331203722, + "grad_norm": 0.11962890625, + "learning_rate": 0.001992685235854639, + "loss": 0.168, + "step": 4859 + }, + { + "epoch": 0.042187133792241385, + "grad_norm": 0.25, + "learning_rate": 0.0019926814485134807, + "loss": 0.1396, + "step": 4860 + }, + { + "epoch": 0.04219581427244555, + "grad_norm": 0.53515625, + "learning_rate": 0.001992677660196099, + "loss": 0.1719, + "step": 4861 + }, + { + "epoch": 0.042204494752649716, + "grad_norm": 0.251953125, + "learning_rate": 0.001992673870902499, + "loss": 0.1592, + "step": 4862 + }, + { + "epoch": 0.04221317523285388, + "grad_norm": 0.158203125, + "learning_rate": 0.0019926700806326835, + "loss": 0.168, + "step": 4863 + }, + { + "epoch": 0.042221855713058046, + "grad_norm": 0.1240234375, + "learning_rate": 0.0019926662893866584, + "loss": 0.1328, + "step": 4864 + }, + { + "epoch": 0.04223053619326221, + "grad_norm": 0.1982421875, + "learning_rate": 0.001992662497164426, + "loss": 0.1406, + "step": 4865 + }, + { + "epoch": 0.042239216673466376, + 
"grad_norm": 0.4140625, + "learning_rate": 0.001992658703965992, + "loss": 0.1914, + "step": 4866 + }, + { + "epoch": 0.04224789715367054, + "grad_norm": 0.1455078125, + "learning_rate": 0.001992654909791359, + "loss": 0.1543, + "step": 4867 + }, + { + "epoch": 0.042256577633874706, + "grad_norm": 0.2431640625, + "learning_rate": 0.0019926511146405325, + "loss": 0.1709, + "step": 4868 + }, + { + "epoch": 0.04226525811407887, + "grad_norm": 0.498046875, + "learning_rate": 0.001992647318513516, + "loss": 0.1621, + "step": 4869 + }, + { + "epoch": 0.042273938594283036, + "grad_norm": 0.12109375, + "learning_rate": 0.0019926435214103143, + "loss": 0.127, + "step": 4870 + }, + { + "epoch": 0.0422826190744872, + "grad_norm": 0.328125, + "learning_rate": 0.001992639723330931, + "loss": 0.1299, + "step": 4871 + }, + { + "epoch": 0.04229129955469137, + "grad_norm": 0.546875, + "learning_rate": 0.0019926359242753698, + "loss": 0.1934, + "step": 4872 + }, + { + "epoch": 0.04229998003489553, + "grad_norm": 0.126953125, + "learning_rate": 0.001992632124243635, + "loss": 0.168, + "step": 4873 + }, + { + "epoch": 0.0423086605150997, + "grad_norm": 0.3984375, + "learning_rate": 0.0019926283232357318, + "loss": 0.1807, + "step": 4874 + }, + { + "epoch": 0.04231734099530386, + "grad_norm": 0.1640625, + "learning_rate": 0.001992624521251664, + "loss": 0.1504, + "step": 4875 + }, + { + "epoch": 0.04232602147550803, + "grad_norm": 0.17578125, + "learning_rate": 0.001992620718291435, + "loss": 0.1875, + "step": 4876 + }, + { + "epoch": 0.04233470195571219, + "grad_norm": 0.5546875, + "learning_rate": 0.001992616914355049, + "loss": 0.1245, + "step": 4877 + }, + { + "epoch": 0.04234338243591636, + "grad_norm": 0.212890625, + "learning_rate": 0.001992613109442511, + "loss": 0.168, + "step": 4878 + }, + { + "epoch": 0.04235206291612052, + "grad_norm": 0.46875, + "learning_rate": 0.0019926093035538247, + "loss": 0.2168, + "step": 4879 + }, + { + "epoch": 0.04236074339632469, + "grad_norm": 0.318359375, + "learning_rate": 0.0019926054966889943, + "loss": 0.166, + "step": 4880 + }, + { + "epoch": 0.04236942387652885, + "grad_norm": 0.1845703125, + "learning_rate": 0.001992601688848024, + "loss": 0.1934, + "step": 4881 + }, + { + "epoch": 0.04237810435673302, + "grad_norm": 0.08837890625, + "learning_rate": 0.0019925978800309175, + "loss": 0.1582, + "step": 4882 + }, + { + "epoch": 0.042386784836937176, + "grad_norm": 0.52734375, + "learning_rate": 0.0019925940702376797, + "loss": 0.168, + "step": 4883 + }, + { + "epoch": 0.04239546531714134, + "grad_norm": 0.16015625, + "learning_rate": 0.0019925902594683147, + "loss": 0.166, + "step": 4884 + }, + { + "epoch": 0.042404145797345506, + "grad_norm": 0.5390625, + "learning_rate": 0.001992586447722826, + "loss": 0.1338, + "step": 4885 + }, + { + "epoch": 0.04241282627754967, + "grad_norm": 0.478515625, + "learning_rate": 0.001992582635001218, + "loss": 0.2695, + "step": 4886 + }, + { + "epoch": 0.04242150675775384, + "grad_norm": 0.32421875, + "learning_rate": 0.0019925788213034953, + "loss": 0.1543, + "step": 4887 + }, + { + "epoch": 0.042430187237958, + "grad_norm": 0.232421875, + "learning_rate": 0.001992575006629662, + "loss": 0.1348, + "step": 4888 + }, + { + "epoch": 0.04243886771816217, + "grad_norm": 0.158203125, + "learning_rate": 0.001992571190979722, + "loss": 0.1689, + "step": 4889 + }, + { + "epoch": 0.04244754819836633, + "grad_norm": 2.359375, + "learning_rate": 0.0019925673743536793, + "loss": 0.3301, + "step": 4890 + }, + { + "epoch": 0.0424562286785705, 
+ "grad_norm": 0.1689453125, + "learning_rate": 0.0019925635567515387, + "loss": 0.165, + "step": 4891 + }, + { + "epoch": 0.04246490915877466, + "grad_norm": 0.302734375, + "learning_rate": 0.001992559738173304, + "loss": 0.1348, + "step": 4892 + }, + { + "epoch": 0.04247358963897883, + "grad_norm": 0.474609375, + "learning_rate": 0.001992555918618979, + "loss": 0.124, + "step": 4893 + }, + { + "epoch": 0.04248227011918299, + "grad_norm": 0.53125, + "learning_rate": 0.0019925520980885684, + "loss": 0.1787, + "step": 4894 + }, + { + "epoch": 0.04249095059938716, + "grad_norm": 0.380859375, + "learning_rate": 0.001992548276582076, + "loss": 0.1387, + "step": 4895 + }, + { + "epoch": 0.04249963107959132, + "grad_norm": 0.11767578125, + "learning_rate": 0.001992544454099507, + "loss": 0.1631, + "step": 4896 + }, + { + "epoch": 0.04250831155979549, + "grad_norm": 0.291015625, + "learning_rate": 0.001992540630640864, + "loss": 0.1807, + "step": 4897 + }, + { + "epoch": 0.04251699203999965, + "grad_norm": 0.2265625, + "learning_rate": 0.0019925368062061522, + "loss": 0.1318, + "step": 4898 + }, + { + "epoch": 0.04252567252020382, + "grad_norm": 0.1650390625, + "learning_rate": 0.0019925329807953755, + "loss": 0.1523, + "step": 4899 + }, + { + "epoch": 0.04253435300040798, + "grad_norm": 1.03125, + "learning_rate": 0.0019925291544085383, + "loss": 0.1748, + "step": 4900 + }, + { + "epoch": 0.04254303348061215, + "grad_norm": 0.189453125, + "learning_rate": 0.0019925253270456447, + "loss": 0.1465, + "step": 4901 + }, + { + "epoch": 0.04255171396081631, + "grad_norm": 0.1396484375, + "learning_rate": 0.0019925214987066985, + "loss": 0.2324, + "step": 4902 + }, + { + "epoch": 0.04256039444102048, + "grad_norm": 0.73828125, + "learning_rate": 0.0019925176693917045, + "loss": 0.2012, + "step": 4903 + }, + { + "epoch": 0.042569074921224644, + "grad_norm": 0.205078125, + "learning_rate": 0.0019925138391006666, + "loss": 0.1914, + "step": 4904 + }, + { + "epoch": 0.04257775540142881, + "grad_norm": 0.08984375, + "learning_rate": 0.0019925100078335887, + "loss": 0.1641, + "step": 4905 + }, + { + "epoch": 0.042586435881632974, + "grad_norm": 0.205078125, + "learning_rate": 0.0019925061755904755, + "loss": 0.1309, + "step": 4906 + }, + { + "epoch": 0.04259511636183714, + "grad_norm": 0.39453125, + "learning_rate": 0.0019925023423713307, + "loss": 0.1328, + "step": 4907 + }, + { + "epoch": 0.042603796842041304, + "grad_norm": 1.140625, + "learning_rate": 0.0019924985081761592, + "loss": 0.165, + "step": 4908 + }, + { + "epoch": 0.04261247732224547, + "grad_norm": 0.07470703125, + "learning_rate": 0.0019924946730049643, + "loss": 0.1221, + "step": 4909 + }, + { + "epoch": 0.042621157802449634, + "grad_norm": 0.201171875, + "learning_rate": 0.0019924908368577506, + "loss": 0.1768, + "step": 4910 + }, + { + "epoch": 0.0426298382826538, + "grad_norm": 0.61328125, + "learning_rate": 0.001992486999734523, + "loss": 0.1895, + "step": 4911 + }, + { + "epoch": 0.042638518762857965, + "grad_norm": 0.328125, + "learning_rate": 0.0019924831616352843, + "loss": 0.1436, + "step": 4912 + }, + { + "epoch": 0.04264719924306212, + "grad_norm": 0.130859375, + "learning_rate": 0.0019924793225600396, + "loss": 0.1475, + "step": 4913 + }, + { + "epoch": 0.04265587972326629, + "grad_norm": 0.328125, + "learning_rate": 0.001992475482508793, + "loss": 0.1641, + "step": 4914 + }, + { + "epoch": 0.04266456020347045, + "grad_norm": 0.421875, + "learning_rate": 0.001992471641481549, + "loss": 0.1699, + "step": 4915 + }, + { + "epoch": 
0.04267324068367462, + "grad_norm": 0.7421875, + "learning_rate": 0.0019924677994783107, + "loss": 0.1523, + "step": 4916 + }, + { + "epoch": 0.04268192116387878, + "grad_norm": 0.1845703125, + "learning_rate": 0.0019924639564990834, + "loss": 0.1533, + "step": 4917 + }, + { + "epoch": 0.04269060164408295, + "grad_norm": 0.1416015625, + "learning_rate": 0.0019924601125438706, + "loss": 0.1709, + "step": 4918 + }, + { + "epoch": 0.04269928212428711, + "grad_norm": 0.138671875, + "learning_rate": 0.0019924562676126773, + "loss": 0.1348, + "step": 4919 + }, + { + "epoch": 0.04270796260449128, + "grad_norm": 0.1923828125, + "learning_rate": 0.0019924524217055073, + "loss": 0.1719, + "step": 4920 + }, + { + "epoch": 0.042716643084695444, + "grad_norm": 0.74609375, + "learning_rate": 0.0019924485748223646, + "loss": 0.1504, + "step": 4921 + }, + { + "epoch": 0.04272532356489961, + "grad_norm": 0.451171875, + "learning_rate": 0.0019924447269632534, + "loss": 0.1089, + "step": 4922 + }, + { + "epoch": 0.042734004045103774, + "grad_norm": 0.0703125, + "learning_rate": 0.001992440878128178, + "loss": 0.1157, + "step": 4923 + }, + { + "epoch": 0.04274268452530794, + "grad_norm": 0.462890625, + "learning_rate": 0.0019924370283171426, + "loss": 0.1309, + "step": 4924 + }, + { + "epoch": 0.042751365005512104, + "grad_norm": 0.1416015625, + "learning_rate": 0.0019924331775301517, + "loss": 0.1191, + "step": 4925 + }, + { + "epoch": 0.04276004548571627, + "grad_norm": 0.1318359375, + "learning_rate": 0.0019924293257672096, + "loss": 0.2002, + "step": 4926 + }, + { + "epoch": 0.042768725965920434, + "grad_norm": 0.1357421875, + "learning_rate": 0.0019924254730283196, + "loss": 0.1689, + "step": 4927 + }, + { + "epoch": 0.0427774064461246, + "grad_norm": 0.296875, + "learning_rate": 0.0019924216193134866, + "loss": 0.2051, + "step": 4928 + }, + { + "epoch": 0.042786086926328765, + "grad_norm": 0.10205078125, + "learning_rate": 0.001992417764622715, + "loss": 0.1465, + "step": 4929 + }, + { + "epoch": 0.04279476740653293, + "grad_norm": 0.2294921875, + "learning_rate": 0.0019924139089560087, + "loss": 0.1201, + "step": 4930 + }, + { + "epoch": 0.042803447886737095, + "grad_norm": 0.205078125, + "learning_rate": 0.001992410052313372, + "loss": 0.1797, + "step": 4931 + }, + { + "epoch": 0.04281212836694126, + "grad_norm": 0.5, + "learning_rate": 0.001992406194694809, + "loss": 0.1045, + "step": 4932 + }, + { + "epoch": 0.042820808847145425, + "grad_norm": 0.388671875, + "learning_rate": 0.0019924023361003237, + "loss": 0.1206, + "step": 4933 + }, + { + "epoch": 0.04282948932734959, + "grad_norm": 0.10595703125, + "learning_rate": 0.001992398476529921, + "loss": 0.1357, + "step": 4934 + }, + { + "epoch": 0.042838169807553755, + "grad_norm": 0.35546875, + "learning_rate": 0.0019923946159836046, + "loss": 0.1416, + "step": 4935 + }, + { + "epoch": 0.04284685028775792, + "grad_norm": 0.09423828125, + "learning_rate": 0.0019923907544613785, + "loss": 0.168, + "step": 4936 + }, + { + "epoch": 0.042855530767962086, + "grad_norm": 0.5390625, + "learning_rate": 0.0019923868919632477, + "loss": 0.1885, + "step": 4937 + }, + { + "epoch": 0.04286421124816625, + "grad_norm": 0.703125, + "learning_rate": 0.001992383028489216, + "loss": 0.1709, + "step": 4938 + }, + { + "epoch": 0.042872891728370416, + "grad_norm": 0.2216796875, + "learning_rate": 0.001992379164039287, + "loss": 0.1348, + "step": 4939 + }, + { + "epoch": 0.04288157220857458, + "grad_norm": 0.1650390625, + "learning_rate": 0.0019923752986134666, + "loss": 
0.1621, + "step": 4940 + }, + { + "epoch": 0.042890252688778746, + "grad_norm": 0.099609375, + "learning_rate": 0.001992371432211758, + "loss": 0.1787, + "step": 4941 + }, + { + "epoch": 0.04289893316898291, + "grad_norm": 0.3984375, + "learning_rate": 0.0019923675648341643, + "loss": 0.1396, + "step": 4942 + }, + { + "epoch": 0.042907613649187076, + "grad_norm": 0.4375, + "learning_rate": 0.0019923636964806913, + "loss": 0.1416, + "step": 4943 + }, + { + "epoch": 0.042916294129391235, + "grad_norm": 0.220703125, + "learning_rate": 0.001992359827151343, + "loss": 0.1797, + "step": 4944 + }, + { + "epoch": 0.0429249746095954, + "grad_norm": 0.1767578125, + "learning_rate": 0.0019923559568461235, + "loss": 0.1621, + "step": 4945 + }, + { + "epoch": 0.042933655089799565, + "grad_norm": 0.515625, + "learning_rate": 0.0019923520855650366, + "loss": 0.1484, + "step": 4946 + }, + { + "epoch": 0.04294233557000373, + "grad_norm": 0.75, + "learning_rate": 0.001992348213308087, + "loss": 0.1816, + "step": 4947 + }, + { + "epoch": 0.042951016050207895, + "grad_norm": 0.294921875, + "learning_rate": 0.001992344340075279, + "loss": 0.1328, + "step": 4948 + }, + { + "epoch": 0.04295969653041206, + "grad_norm": 0.123046875, + "learning_rate": 0.001992340465866616, + "loss": 0.1357, + "step": 4949 + }, + { + "epoch": 0.042968377010616225, + "grad_norm": 0.095703125, + "learning_rate": 0.001992336590682104, + "loss": 0.1641, + "step": 4950 + }, + { + "epoch": 0.04297705749082039, + "grad_norm": 0.333984375, + "learning_rate": 0.001992332714521745, + "loss": 0.1348, + "step": 4951 + }, + { + "epoch": 0.042985737971024555, + "grad_norm": 0.25390625, + "learning_rate": 0.001992328837385545, + "loss": 0.1719, + "step": 4952 + }, + { + "epoch": 0.04299441845122872, + "grad_norm": 0.30859375, + "learning_rate": 0.001992324959273507, + "loss": 0.1914, + "step": 4953 + }, + { + "epoch": 0.043003098931432886, + "grad_norm": 0.19921875, + "learning_rate": 0.0019923210801856364, + "loss": 0.1553, + "step": 4954 + }, + { + "epoch": 0.04301177941163705, + "grad_norm": 0.609375, + "learning_rate": 0.0019923172001219368, + "loss": 0.1768, + "step": 4955 + }, + { + "epoch": 0.043020459891841216, + "grad_norm": 0.3828125, + "learning_rate": 0.0019923133190824123, + "loss": 0.1953, + "step": 4956 + }, + { + "epoch": 0.04302914037204538, + "grad_norm": 0.40234375, + "learning_rate": 0.0019923094370670673, + "loss": 0.1074, + "step": 4957 + }, + { + "epoch": 0.043037820852249546, + "grad_norm": 0.115234375, + "learning_rate": 0.0019923055540759065, + "loss": 0.1689, + "step": 4958 + }, + { + "epoch": 0.04304650133245371, + "grad_norm": 0.30078125, + "learning_rate": 0.0019923016701089338, + "loss": 0.1758, + "step": 4959 + }, + { + "epoch": 0.043055181812657876, + "grad_norm": 0.10595703125, + "learning_rate": 0.0019922977851661527, + "loss": 0.1514, + "step": 4960 + }, + { + "epoch": 0.04306386229286204, + "grad_norm": 1.203125, + "learning_rate": 0.001992293899247569, + "loss": 0.1445, + "step": 4961 + }, + { + "epoch": 0.04307254277306621, + "grad_norm": 0.8359375, + "learning_rate": 0.001992290012353185, + "loss": 0.2559, + "step": 4962 + }, + { + "epoch": 0.04308122325327037, + "grad_norm": 0.2578125, + "learning_rate": 0.001992286124483007, + "loss": 0.166, + "step": 4963 + }, + { + "epoch": 0.04308990373347454, + "grad_norm": 0.236328125, + "learning_rate": 0.001992282235637038, + "loss": 0.1562, + "step": 4964 + }, + { + "epoch": 0.0430985842136787, + "grad_norm": 0.0908203125, + "learning_rate": 0.0019922783458152828, 
+ "loss": 0.1641, + "step": 4965 + }, + { + "epoch": 0.04310726469388287, + "grad_norm": 0.1953125, + "learning_rate": 0.001992274455017745, + "loss": 0.2217, + "step": 4966 + }, + { + "epoch": 0.04311594517408703, + "grad_norm": 0.28125, + "learning_rate": 0.0019922705632444294, + "loss": 0.1177, + "step": 4967 + }, + { + "epoch": 0.0431246256542912, + "grad_norm": 0.48828125, + "learning_rate": 0.00199226667049534, + "loss": 0.168, + "step": 4968 + }, + { + "epoch": 0.04313330613449536, + "grad_norm": 0.484375, + "learning_rate": 0.0019922627767704816, + "loss": 0.1543, + "step": 4969 + }, + { + "epoch": 0.04314198661469953, + "grad_norm": 0.228515625, + "learning_rate": 0.001992258882069858, + "loss": 0.124, + "step": 4970 + }, + { + "epoch": 0.04315066709490369, + "grad_norm": 0.703125, + "learning_rate": 0.0019922549863934727, + "loss": 0.1348, + "step": 4971 + }, + { + "epoch": 0.04315934757510786, + "grad_norm": 0.640625, + "learning_rate": 0.0019922510897413316, + "loss": 0.166, + "step": 4972 + }, + { + "epoch": 0.04316802805531202, + "grad_norm": 0.41796875, + "learning_rate": 0.0019922471921134375, + "loss": 0.1953, + "step": 4973 + }, + { + "epoch": 0.04317670853551619, + "grad_norm": 0.3671875, + "learning_rate": 0.0019922432935097958, + "loss": 0.1348, + "step": 4974 + }, + { + "epoch": 0.043185389015720346, + "grad_norm": 0.404296875, + "learning_rate": 0.00199223939393041, + "loss": 0.1611, + "step": 4975 + }, + { + "epoch": 0.04319406949592451, + "grad_norm": 1.0625, + "learning_rate": 0.001992235493375285, + "loss": 0.1572, + "step": 4976 + }, + { + "epoch": 0.043202749976128677, + "grad_norm": 0.69921875, + "learning_rate": 0.001992231591844424, + "loss": 0.1445, + "step": 4977 + }, + { + "epoch": 0.04321143045633284, + "grad_norm": 0.494140625, + "learning_rate": 0.0019922276893378325, + "loss": 0.1523, + "step": 4978 + }, + { + "epoch": 0.04322011093653701, + "grad_norm": 0.69140625, + "learning_rate": 0.0019922237858555135, + "loss": 0.1377, + "step": 4979 + }, + { + "epoch": 0.04322879141674117, + "grad_norm": 0.251953125, + "learning_rate": 0.0019922198813974723, + "loss": 0.1611, + "step": 4980 + }, + { + "epoch": 0.04323747189694534, + "grad_norm": 0.71484375, + "learning_rate": 0.0019922159759637134, + "loss": 0.1953, + "step": 4981 + }, + { + "epoch": 0.0432461523771495, + "grad_norm": 0.33203125, + "learning_rate": 0.0019922120695542397, + "loss": 0.1465, + "step": 4982 + }, + { + "epoch": 0.04325483285735367, + "grad_norm": 0.546875, + "learning_rate": 0.001992208162169057, + "loss": 0.1602, + "step": 4983 + }, + { + "epoch": 0.04326351333755783, + "grad_norm": 0.84765625, + "learning_rate": 0.0019922042538081685, + "loss": 0.1914, + "step": 4984 + }, + { + "epoch": 0.043272193817762, + "grad_norm": 0.10888671875, + "learning_rate": 0.001992200344471579, + "loss": 0.1816, + "step": 4985 + }, + { + "epoch": 0.04328087429796616, + "grad_norm": 0.2294921875, + "learning_rate": 0.001992196434159292, + "loss": 0.168, + "step": 4986 + }, + { + "epoch": 0.04328955477817033, + "grad_norm": 0.51953125, + "learning_rate": 0.0019921925228713134, + "loss": 0.168, + "step": 4987 + }, + { + "epoch": 0.04329823525837449, + "grad_norm": 0.25, + "learning_rate": 0.001992188610607646, + "loss": 0.1523, + "step": 4988 + }, + { + "epoch": 0.04330691573857866, + "grad_norm": 0.322265625, + "learning_rate": 0.001992184697368294, + "loss": 0.1299, + "step": 4989 + }, + { + "epoch": 0.04331559621878282, + "grad_norm": 0.376953125, + "learning_rate": 0.001992180783153263, + "loss": 
0.1895, + "step": 4990 + }, + { + "epoch": 0.04332427669898699, + "grad_norm": 0.609375, + "learning_rate": 0.001992176867962556, + "loss": 0.1514, + "step": 4991 + }, + { + "epoch": 0.04333295717919115, + "grad_norm": 0.087890625, + "learning_rate": 0.001992172951796178, + "loss": 0.1416, + "step": 4992 + }, + { + "epoch": 0.04334163765939532, + "grad_norm": 0.40234375, + "learning_rate": 0.001992169034654133, + "loss": 0.1582, + "step": 4993 + }, + { + "epoch": 0.043350318139599484, + "grad_norm": 0.2373046875, + "learning_rate": 0.001992165116536425, + "loss": 0.2031, + "step": 4994 + }, + { + "epoch": 0.04335899861980365, + "grad_norm": 0.267578125, + "learning_rate": 0.0019921611974430594, + "loss": 0.127, + "step": 4995 + }, + { + "epoch": 0.043367679100007814, + "grad_norm": 0.53125, + "learning_rate": 0.001992157277374039, + "loss": 0.1475, + "step": 4996 + }, + { + "epoch": 0.04337635958021198, + "grad_norm": 0.8828125, + "learning_rate": 0.0019921533563293696, + "loss": 0.1484, + "step": 4997 + }, + { + "epoch": 0.043385040060416144, + "grad_norm": 0.3359375, + "learning_rate": 0.001992149434309054, + "loss": 0.1797, + "step": 4998 + }, + { + "epoch": 0.04339372054062031, + "grad_norm": 0.345703125, + "learning_rate": 0.001992145511313097, + "loss": 0.1514, + "step": 4999 + }, + { + "epoch": 0.043402401020824474, + "grad_norm": 0.09619140625, + "learning_rate": 0.0019921415873415038, + "loss": 0.1387, + "step": 5000 + }, + { + "epoch": 0.04341108150102864, + "grad_norm": 0.17578125, + "learning_rate": 0.0019921376623942776, + "loss": 0.1396, + "step": 5001 + }, + { + "epoch": 0.043419761981232805, + "grad_norm": 0.2001953125, + "learning_rate": 0.001992133736471423, + "loss": 0.2402, + "step": 5002 + }, + { + "epoch": 0.04342844246143697, + "grad_norm": 0.27734375, + "learning_rate": 0.0019921298095729447, + "loss": 0.1162, + "step": 5003 + }, + { + "epoch": 0.043437122941641135, + "grad_norm": 0.439453125, + "learning_rate": 0.0019921258816988463, + "loss": 0.1338, + "step": 5004 + }, + { + "epoch": 0.0434458034218453, + "grad_norm": 0.34375, + "learning_rate": 0.001992121952849133, + "loss": 0.1289, + "step": 5005 + }, + { + "epoch": 0.04345448390204946, + "grad_norm": 0.68359375, + "learning_rate": 0.0019921180230238074, + "loss": 0.1465, + "step": 5006 + }, + { + "epoch": 0.04346316438225362, + "grad_norm": 0.357421875, + "learning_rate": 0.0019921140922228757, + "loss": 0.1611, + "step": 5007 + }, + { + "epoch": 0.04347184486245779, + "grad_norm": 0.41796875, + "learning_rate": 0.0019921101604463416, + "loss": 0.1162, + "step": 5008 + }, + { + "epoch": 0.04348052534266195, + "grad_norm": 0.62890625, + "learning_rate": 0.001992106227694209, + "loss": 0.1445, + "step": 5009 + }, + { + "epoch": 0.04348920582286612, + "grad_norm": 0.12060546875, + "learning_rate": 0.001992102293966482, + "loss": 0.1289, + "step": 5010 + }, + { + "epoch": 0.043497886303070284, + "grad_norm": 0.2353515625, + "learning_rate": 0.001992098359263166, + "loss": 0.1377, + "step": 5011 + }, + { + "epoch": 0.04350656678327445, + "grad_norm": 0.482421875, + "learning_rate": 0.0019920944235842643, + "loss": 0.1445, + "step": 5012 + }, + { + "epoch": 0.043515247263478614, + "grad_norm": 0.384765625, + "learning_rate": 0.001992090486929782, + "loss": 0.1699, + "step": 5013 + }, + { + "epoch": 0.04352392774368278, + "grad_norm": 0.1259765625, + "learning_rate": 0.001992086549299722, + "loss": 0.1719, + "step": 5014 + }, + { + "epoch": 0.043532608223886944, + "grad_norm": 0.93359375, + "learning_rate": 
0.0019920826106940904, + "loss": 0.1816, + "step": 5015 + }, + { + "epoch": 0.04354128870409111, + "grad_norm": 0.92578125, + "learning_rate": 0.0019920786711128905, + "loss": 0.166, + "step": 5016 + }, + { + "epoch": 0.043549969184295274, + "grad_norm": 0.490234375, + "learning_rate": 0.0019920747305561264, + "loss": 0.2051, + "step": 5017 + }, + { + "epoch": 0.04355864966449944, + "grad_norm": 0.46875, + "learning_rate": 0.001992070789023803, + "loss": 0.2227, + "step": 5018 + }, + { + "epoch": 0.043567330144703605, + "grad_norm": 0.2412109375, + "learning_rate": 0.0019920668465159244, + "loss": 0.1592, + "step": 5019 + }, + { + "epoch": 0.04357601062490777, + "grad_norm": 0.1171875, + "learning_rate": 0.0019920629030324953, + "loss": 0.1406, + "step": 5020 + }, + { + "epoch": 0.043584691105111935, + "grad_norm": 0.375, + "learning_rate": 0.0019920589585735193, + "loss": 0.1904, + "step": 5021 + }, + { + "epoch": 0.0435933715853161, + "grad_norm": 0.2041015625, + "learning_rate": 0.0019920550131390007, + "loss": 0.1309, + "step": 5022 + }, + { + "epoch": 0.043602052065520265, + "grad_norm": 0.68359375, + "learning_rate": 0.0019920510667289443, + "loss": 0.1738, + "step": 5023 + }, + { + "epoch": 0.04361073254572443, + "grad_norm": 0.42578125, + "learning_rate": 0.0019920471193433545, + "loss": 0.1318, + "step": 5024 + }, + { + "epoch": 0.043619413025928595, + "grad_norm": 0.2470703125, + "learning_rate": 0.0019920431709822355, + "loss": 0.1162, + "step": 5025 + }, + { + "epoch": 0.04362809350613276, + "grad_norm": 0.205078125, + "learning_rate": 0.001992039221645591, + "loss": 0.1738, + "step": 5026 + }, + { + "epoch": 0.043636773986336926, + "grad_norm": 0.5234375, + "learning_rate": 0.001992035271333426, + "loss": 0.1953, + "step": 5027 + }, + { + "epoch": 0.04364545446654109, + "grad_norm": 0.7734375, + "learning_rate": 0.0019920313200457447, + "loss": 0.1973, + "step": 5028 + }, + { + "epoch": 0.043654134946745256, + "grad_norm": 0.1669921875, + "learning_rate": 0.0019920273677825513, + "loss": 0.1641, + "step": 5029 + }, + { + "epoch": 0.04366281542694942, + "grad_norm": 0.25390625, + "learning_rate": 0.00199202341454385, + "loss": 0.1113, + "step": 5030 + }, + { + "epoch": 0.043671495907153586, + "grad_norm": 0.40625, + "learning_rate": 0.0019920194603296458, + "loss": 0.1748, + "step": 5031 + }, + { + "epoch": 0.04368017638735775, + "grad_norm": 0.21484375, + "learning_rate": 0.0019920155051399422, + "loss": 0.1523, + "step": 5032 + }, + { + "epoch": 0.043688856867561916, + "grad_norm": 0.1376953125, + "learning_rate": 0.001992011548974744, + "loss": 0.1289, + "step": 5033 + }, + { + "epoch": 0.04369753734776608, + "grad_norm": 0.47265625, + "learning_rate": 0.001992007591834055, + "loss": 0.2656, + "step": 5034 + }, + { + "epoch": 0.043706217827970247, + "grad_norm": 1.015625, + "learning_rate": 0.00199200363371788, + "loss": 0.1631, + "step": 5035 + }, + { + "epoch": 0.04371489830817441, + "grad_norm": 0.240234375, + "learning_rate": 0.0019919996746262233, + "loss": 0.1289, + "step": 5036 + }, + { + "epoch": 0.04372357878837857, + "grad_norm": 0.80859375, + "learning_rate": 0.0019919957145590893, + "loss": 0.1357, + "step": 5037 + }, + { + "epoch": 0.043732259268582735, + "grad_norm": 0.10302734375, + "learning_rate": 0.001991991753516482, + "loss": 0.1973, + "step": 5038 + }, + { + "epoch": 0.0437409397487869, + "grad_norm": 0.66015625, + "learning_rate": 0.001991987791498406, + "loss": 0.1748, + "step": 5039 + }, + { + "epoch": 0.043749620228991065, + "grad_norm": 0.392578125, 
+ "learning_rate": 0.0019919838285048655, + "loss": 0.166, + "step": 5040 + }, + { + "epoch": 0.04375830070919523, + "grad_norm": 0.1689453125, + "learning_rate": 0.0019919798645358653, + "loss": 0.1699, + "step": 5041 + }, + { + "epoch": 0.043766981189399395, + "grad_norm": 0.068359375, + "learning_rate": 0.0019919758995914087, + "loss": 0.1099, + "step": 5042 + }, + { + "epoch": 0.04377566166960356, + "grad_norm": 0.1640625, + "learning_rate": 0.001991971933671501, + "loss": 0.1445, + "step": 5043 + }, + { + "epoch": 0.043784342149807726, + "grad_norm": 0.2734375, + "learning_rate": 0.001991967966776146, + "loss": 0.1387, + "step": 5044 + }, + { + "epoch": 0.04379302263001189, + "grad_norm": 0.46875, + "learning_rate": 0.0019919639989053484, + "loss": 0.1562, + "step": 5045 + }, + { + "epoch": 0.043801703110216056, + "grad_norm": 0.41796875, + "learning_rate": 0.0019919600300591124, + "loss": 0.1309, + "step": 5046 + }, + { + "epoch": 0.04381038359042022, + "grad_norm": 0.85546875, + "learning_rate": 0.0019919560602374422, + "loss": 0.1484, + "step": 5047 + }, + { + "epoch": 0.043819064070624386, + "grad_norm": 0.1884765625, + "learning_rate": 0.0019919520894403422, + "loss": 0.1436, + "step": 5048 + }, + { + "epoch": 0.04382774455082855, + "grad_norm": 0.12158203125, + "learning_rate": 0.0019919481176678172, + "loss": 0.1328, + "step": 5049 + }, + { + "epoch": 0.043836425031032716, + "grad_norm": 0.13671875, + "learning_rate": 0.0019919441449198706, + "loss": 0.1611, + "step": 5050 + }, + { + "epoch": 0.04384510551123688, + "grad_norm": 0.1689453125, + "learning_rate": 0.0019919401711965077, + "loss": 0.1445, + "step": 5051 + }, + { + "epoch": 0.04385378599144105, + "grad_norm": 0.349609375, + "learning_rate": 0.001991936196497732, + "loss": 0.1816, + "step": 5052 + }, + { + "epoch": 0.04386246647164521, + "grad_norm": 0.53515625, + "learning_rate": 0.0019919322208235488, + "loss": 0.1641, + "step": 5053 + }, + { + "epoch": 0.04387114695184938, + "grad_norm": 0.63671875, + "learning_rate": 0.0019919282441739614, + "loss": 0.2432, + "step": 5054 + }, + { + "epoch": 0.04387982743205354, + "grad_norm": 0.173828125, + "learning_rate": 0.001991924266548975, + "loss": 0.1836, + "step": 5055 + }, + { + "epoch": 0.04388850791225771, + "grad_norm": 0.1513671875, + "learning_rate": 0.0019919202879485937, + "loss": 0.2949, + "step": 5056 + }, + { + "epoch": 0.04389718839246187, + "grad_norm": 0.26171875, + "learning_rate": 0.0019919163083728215, + "loss": 0.1641, + "step": 5057 + }, + { + "epoch": 0.04390586887266604, + "grad_norm": 0.107421875, + "learning_rate": 0.0019919123278216632, + "loss": 0.1621, + "step": 5058 + }, + { + "epoch": 0.0439145493528702, + "grad_norm": 0.51171875, + "learning_rate": 0.001991908346295123, + "loss": 0.21, + "step": 5059 + }, + { + "epoch": 0.04392322983307437, + "grad_norm": 0.5078125, + "learning_rate": 0.001991904363793205, + "loss": 0.1465, + "step": 5060 + }, + { + "epoch": 0.04393191031327853, + "grad_norm": 0.81640625, + "learning_rate": 0.001991900380315914, + "loss": 0.208, + "step": 5061 + }, + { + "epoch": 0.0439405907934827, + "grad_norm": 0.08837890625, + "learning_rate": 0.001991896395863254, + "loss": 0.1484, + "step": 5062 + }, + { + "epoch": 0.04394927127368686, + "grad_norm": 0.1875, + "learning_rate": 0.0019918924104352295, + "loss": 0.1768, + "step": 5063 + }, + { + "epoch": 0.04395795175389103, + "grad_norm": 0.11474609375, + "learning_rate": 0.0019918884240318455, + "loss": 0.1514, + "step": 5064 + }, + { + "epoch": 0.04396663223409519, + 
"grad_norm": 0.3671875, + "learning_rate": 0.001991884436653105, + "loss": 0.1992, + "step": 5065 + }, + { + "epoch": 0.04397531271429936, + "grad_norm": 0.52734375, + "learning_rate": 0.001991880448299013, + "loss": 0.168, + "step": 5066 + }, + { + "epoch": 0.043983993194503516, + "grad_norm": 0.8203125, + "learning_rate": 0.001991876458969574, + "loss": 0.1621, + "step": 5067 + }, + { + "epoch": 0.04399267367470768, + "grad_norm": 0.435546875, + "learning_rate": 0.0019918724686647927, + "loss": 0.1865, + "step": 5068 + }, + { + "epoch": 0.04400135415491185, + "grad_norm": 0.462890625, + "learning_rate": 0.001991868477384673, + "loss": 0.1494, + "step": 5069 + }, + { + "epoch": 0.04401003463511601, + "grad_norm": 0.1552734375, + "learning_rate": 0.0019918644851292192, + "loss": 0.1465, + "step": 5070 + }, + { + "epoch": 0.04401871511532018, + "grad_norm": 0.30078125, + "learning_rate": 0.001991860491898436, + "loss": 0.167, + "step": 5071 + }, + { + "epoch": 0.04402739559552434, + "grad_norm": 0.07958984375, + "learning_rate": 0.001991856497692327, + "loss": 0.1416, + "step": 5072 + }, + { + "epoch": 0.04403607607572851, + "grad_norm": 0.0830078125, + "learning_rate": 0.0019918525025108976, + "loss": 0.1426, + "step": 5073 + }, + { + "epoch": 0.04404475655593267, + "grad_norm": 0.251953125, + "learning_rate": 0.001991848506354152, + "loss": 0.1758, + "step": 5074 + }, + { + "epoch": 0.04405343703613684, + "grad_norm": 0.220703125, + "learning_rate": 0.0019918445092220933, + "loss": 0.1689, + "step": 5075 + }, + { + "epoch": 0.044062117516341, + "grad_norm": 0.3046875, + "learning_rate": 0.0019918405111147277, + "loss": 0.1582, + "step": 5076 + }, + { + "epoch": 0.04407079799654517, + "grad_norm": 0.45703125, + "learning_rate": 0.0019918365120320585, + "loss": 0.1895, + "step": 5077 + }, + { + "epoch": 0.04407947847674933, + "grad_norm": 0.181640625, + "learning_rate": 0.00199183251197409, + "loss": 0.1338, + "step": 5078 + }, + { + "epoch": 0.0440881589569535, + "grad_norm": 0.71875, + "learning_rate": 0.0019918285109408276, + "loss": 0.1436, + "step": 5079 + }, + { + "epoch": 0.04409683943715766, + "grad_norm": 0.5390625, + "learning_rate": 0.001991824508932274, + "loss": 0.1777, + "step": 5080 + }, + { + "epoch": 0.04410551991736183, + "grad_norm": 0.1123046875, + "learning_rate": 0.001991820505948435, + "loss": 0.1738, + "step": 5081 + }, + { + "epoch": 0.04411420039756599, + "grad_norm": 0.08837890625, + "learning_rate": 0.0019918165019893143, + "loss": 0.1504, + "step": 5082 + }, + { + "epoch": 0.04412288087777016, + "grad_norm": 0.2451171875, + "learning_rate": 0.0019918124970549167, + "loss": 0.1621, + "step": 5083 + }, + { + "epoch": 0.044131561357974324, + "grad_norm": 0.10400390625, + "learning_rate": 0.0019918084911452463, + "loss": 0.1426, + "step": 5084 + }, + { + "epoch": 0.04414024183817849, + "grad_norm": 0.59375, + "learning_rate": 0.001991804484260308, + "loss": 0.1328, + "step": 5085 + }, + { + "epoch": 0.044148922318382654, + "grad_norm": 0.46875, + "learning_rate": 0.001991800476400105, + "loss": 0.1387, + "step": 5086 + }, + { + "epoch": 0.04415760279858682, + "grad_norm": 0.30859375, + "learning_rate": 0.0019917964675646425, + "loss": 0.126, + "step": 5087 + }, + { + "epoch": 0.044166283278790984, + "grad_norm": 0.15625, + "learning_rate": 0.001991792457753925, + "loss": 0.207, + "step": 5088 + }, + { + "epoch": 0.04417496375899515, + "grad_norm": 0.234375, + "learning_rate": 0.0019917884469679565, + "loss": 0.1797, + "step": 5089 + }, + { + "epoch": 
0.044183644239199314, + "grad_norm": 0.72265625, + "learning_rate": 0.0019917844352067417, + "loss": 0.1572, + "step": 5090 + }, + { + "epoch": 0.04419232471940348, + "grad_norm": 0.625, + "learning_rate": 0.0019917804224702847, + "loss": 0.2012, + "step": 5091 + }, + { + "epoch": 0.044201005199607644, + "grad_norm": 0.30078125, + "learning_rate": 0.00199177640875859, + "loss": 0.1299, + "step": 5092 + }, + { + "epoch": 0.04420968567981181, + "grad_norm": 0.20703125, + "learning_rate": 0.001991772394071662, + "loss": 0.1738, + "step": 5093 + }, + { + "epoch": 0.044218366160015975, + "grad_norm": 0.0849609375, + "learning_rate": 0.0019917683784095052, + "loss": 0.1445, + "step": 5094 + }, + { + "epoch": 0.04422704664022014, + "grad_norm": 0.103515625, + "learning_rate": 0.001991764361772124, + "loss": 0.1836, + "step": 5095 + }, + { + "epoch": 0.044235727120424305, + "grad_norm": 0.23046875, + "learning_rate": 0.0019917603441595225, + "loss": 0.1416, + "step": 5096 + }, + { + "epoch": 0.04424440760062847, + "grad_norm": 0.2890625, + "learning_rate": 0.0019917563255717054, + "loss": 0.1396, + "step": 5097 + }, + { + "epoch": 0.04425308808083263, + "grad_norm": 0.3984375, + "learning_rate": 0.001991752306008677, + "loss": 0.1523, + "step": 5098 + }, + { + "epoch": 0.04426176856103679, + "grad_norm": 0.1298828125, + "learning_rate": 0.001991748285470442, + "loss": 0.1816, + "step": 5099 + }, + { + "epoch": 0.04427044904124096, + "grad_norm": 0.0810546875, + "learning_rate": 0.0019917442639570044, + "loss": 0.126, + "step": 5100 + }, + { + "epoch": 0.044279129521445124, + "grad_norm": 0.2080078125, + "learning_rate": 0.001991740241468368, + "loss": 0.1631, + "step": 5101 + }, + { + "epoch": 0.04428781000164929, + "grad_norm": 0.400390625, + "learning_rate": 0.001991736218004538, + "loss": 0.1338, + "step": 5102 + }, + { + "epoch": 0.044296490481853454, + "grad_norm": 0.408203125, + "learning_rate": 0.0019917321935655195, + "loss": 0.1514, + "step": 5103 + }, + { + "epoch": 0.04430517096205762, + "grad_norm": 0.111328125, + "learning_rate": 0.0019917281681513153, + "loss": 0.1338, + "step": 5104 + }, + { + "epoch": 0.044313851442261784, + "grad_norm": 0.10302734375, + "learning_rate": 0.0019917241417619306, + "loss": 0.1562, + "step": 5105 + }, + { + "epoch": 0.04432253192246595, + "grad_norm": 0.35546875, + "learning_rate": 0.0019917201143973703, + "loss": 0.1562, + "step": 5106 + }, + { + "epoch": 0.044331212402670114, + "grad_norm": 0.306640625, + "learning_rate": 0.0019917160860576377, + "loss": 0.1436, + "step": 5107 + }, + { + "epoch": 0.04433989288287428, + "grad_norm": 0.4375, + "learning_rate": 0.001991712056742738, + "loss": 0.1729, + "step": 5108 + }, + { + "epoch": 0.044348573363078445, + "grad_norm": 0.484375, + "learning_rate": 0.0019917080264526756, + "loss": 0.1621, + "step": 5109 + }, + { + "epoch": 0.04435725384328261, + "grad_norm": 0.45703125, + "learning_rate": 0.0019917039951874546, + "loss": 0.1309, + "step": 5110 + }, + { + "epoch": 0.044365934323486775, + "grad_norm": 0.2021484375, + "learning_rate": 0.0019916999629470793, + "loss": 0.1465, + "step": 5111 + }, + { + "epoch": 0.04437461480369094, + "grad_norm": 0.201171875, + "learning_rate": 0.0019916959297315543, + "loss": 0.1777, + "step": 5112 + }, + { + "epoch": 0.044383295283895105, + "grad_norm": 0.19140625, + "learning_rate": 0.0019916918955408844, + "loss": 0.1924, + "step": 5113 + }, + { + "epoch": 0.04439197576409927, + "grad_norm": 0.375, + "learning_rate": 0.001991687860375073, + "loss": 0.2148, + "step": 
5114 + }, + { + "epoch": 0.044400656244303435, + "grad_norm": 1.0703125, + "learning_rate": 0.001991683824234126, + "loss": 0.1738, + "step": 5115 + }, + { + "epoch": 0.0444093367245076, + "grad_norm": 0.3984375, + "learning_rate": 0.001991679787118046, + "loss": 0.123, + "step": 5116 + }, + { + "epoch": 0.044418017204711766, + "grad_norm": 0.330078125, + "learning_rate": 0.0019916757490268393, + "loss": 0.1504, + "step": 5117 + }, + { + "epoch": 0.04442669768491593, + "grad_norm": 0.1396484375, + "learning_rate": 0.001991671709960509, + "loss": 0.1396, + "step": 5118 + }, + { + "epoch": 0.044435378165120096, + "grad_norm": 0.18359375, + "learning_rate": 0.0019916676699190602, + "loss": 0.2012, + "step": 5119 + }, + { + "epoch": 0.04444405864532426, + "grad_norm": 0.43359375, + "learning_rate": 0.0019916636289024968, + "loss": 0.1543, + "step": 5120 + }, + { + "epoch": 0.044452739125528426, + "grad_norm": 0.087890625, + "learning_rate": 0.0019916595869108236, + "loss": 0.1357, + "step": 5121 + }, + { + "epoch": 0.04446141960573259, + "grad_norm": 0.296875, + "learning_rate": 0.001991655543944044, + "loss": 0.1514, + "step": 5122 + }, + { + "epoch": 0.044470100085936756, + "grad_norm": 0.25390625, + "learning_rate": 0.0019916515000021643, + "loss": 0.1094, + "step": 5123 + }, + { + "epoch": 0.04447878056614092, + "grad_norm": 0.59765625, + "learning_rate": 0.0019916474550851875, + "loss": 0.1494, + "step": 5124 + }, + { + "epoch": 0.044487461046345086, + "grad_norm": 0.263671875, + "learning_rate": 0.0019916434091931186, + "loss": 0.1543, + "step": 5125 + }, + { + "epoch": 0.04449614152654925, + "grad_norm": 1.0078125, + "learning_rate": 0.001991639362325962, + "loss": 0.1953, + "step": 5126 + }, + { + "epoch": 0.04450482200675342, + "grad_norm": 0.076171875, + "learning_rate": 0.001991635314483722, + "loss": 0.1191, + "step": 5127 + }, + { + "epoch": 0.04451350248695758, + "grad_norm": 0.369140625, + "learning_rate": 0.0019916312656664023, + "loss": 0.1367, + "step": 5128 + }, + { + "epoch": 0.04452218296716174, + "grad_norm": 0.349609375, + "learning_rate": 0.001991627215874009, + "loss": 0.1562, + "step": 5129 + }, + { + "epoch": 0.044530863447365905, + "grad_norm": 0.51953125, + "learning_rate": 0.001991623165106545, + "loss": 0.1768, + "step": 5130 + }, + { + "epoch": 0.04453954392757007, + "grad_norm": 0.46484375, + "learning_rate": 0.0019916191133640153, + "loss": 0.1504, + "step": 5131 + }, + { + "epoch": 0.044548224407774235, + "grad_norm": 0.248046875, + "learning_rate": 0.0019916150606464245, + "loss": 0.1289, + "step": 5132 + }, + { + "epoch": 0.0445569048879784, + "grad_norm": 0.62109375, + "learning_rate": 0.001991611006953777, + "loss": 0.1465, + "step": 5133 + }, + { + "epoch": 0.044565585368182566, + "grad_norm": 0.09130859375, + "learning_rate": 0.0019916069522860767, + "loss": 0.1475, + "step": 5134 + }, + { + "epoch": 0.04457426584838673, + "grad_norm": 0.1982421875, + "learning_rate": 0.001991602896643329, + "loss": 0.1748, + "step": 5135 + }, + { + "epoch": 0.044582946328590896, + "grad_norm": 0.1865234375, + "learning_rate": 0.0019915988400255372, + "loss": 0.1602, + "step": 5136 + }, + { + "epoch": 0.04459162680879506, + "grad_norm": 0.318359375, + "learning_rate": 0.001991594782432707, + "loss": 0.1631, + "step": 5137 + }, + { + "epoch": 0.044600307288999226, + "grad_norm": 0.61328125, + "learning_rate": 0.0019915907238648414, + "loss": 0.1162, + "step": 5138 + }, + { + "epoch": 0.04460898776920339, + "grad_norm": 0.1162109375, + "learning_rate": 
0.0019915866643219456, + "loss": 0.1348, + "step": 5139 + }, + { + "epoch": 0.044617668249407556, + "grad_norm": 0.10986328125, + "learning_rate": 0.001991582603804024, + "loss": 0.1846, + "step": 5140 + }, + { + "epoch": 0.04462634872961172, + "grad_norm": 0.353515625, + "learning_rate": 0.001991578542311082, + "loss": 0.1426, + "step": 5141 + }, + { + "epoch": 0.04463502920981589, + "grad_norm": 1.171875, + "learning_rate": 0.001991574479843122, + "loss": 0.1807, + "step": 5142 + }, + { + "epoch": 0.04464370969002005, + "grad_norm": 0.5703125, + "learning_rate": 0.0019915704164001503, + "loss": 0.1289, + "step": 5143 + }, + { + "epoch": 0.04465239017022422, + "grad_norm": 0.1806640625, + "learning_rate": 0.00199156635198217, + "loss": 0.1328, + "step": 5144 + }, + { + "epoch": 0.04466107065042838, + "grad_norm": 0.91796875, + "learning_rate": 0.0019915622865891865, + "loss": 0.1777, + "step": 5145 + }, + { + "epoch": 0.04466975113063255, + "grad_norm": 0.126953125, + "learning_rate": 0.0019915582202212037, + "loss": 0.1138, + "step": 5146 + }, + { + "epoch": 0.04467843161083671, + "grad_norm": 0.76953125, + "learning_rate": 0.0019915541528782266, + "loss": 0.1582, + "step": 5147 + }, + { + "epoch": 0.04468711209104088, + "grad_norm": 1.03125, + "learning_rate": 0.001991550084560259, + "loss": 0.1797, + "step": 5148 + }, + { + "epoch": 0.04469579257124504, + "grad_norm": 0.2470703125, + "learning_rate": 0.001991546015267306, + "loss": 0.1543, + "step": 5149 + }, + { + "epoch": 0.04470447305144921, + "grad_norm": 0.38671875, + "learning_rate": 0.001991541944999371, + "loss": 0.168, + "step": 5150 + }, + { + "epoch": 0.04471315353165337, + "grad_norm": 0.2890625, + "learning_rate": 0.0019915378737564594, + "loss": 0.1396, + "step": 5151 + }, + { + "epoch": 0.04472183401185754, + "grad_norm": 0.1083984375, + "learning_rate": 0.0019915338015385753, + "loss": 0.1514, + "step": 5152 + }, + { + "epoch": 0.0447305144920617, + "grad_norm": 0.78125, + "learning_rate": 0.0019915297283457233, + "loss": 0.2363, + "step": 5153 + }, + { + "epoch": 0.04473919497226587, + "grad_norm": 0.86328125, + "learning_rate": 0.001991525654177908, + "loss": 0.2012, + "step": 5154 + }, + { + "epoch": 0.04474787545247003, + "grad_norm": 1.109375, + "learning_rate": 0.001991521579035133, + "loss": 0.126, + "step": 5155 + }, + { + "epoch": 0.0447565559326742, + "grad_norm": 0.337890625, + "learning_rate": 0.0019915175029174043, + "loss": 0.1348, + "step": 5156 + }, + { + "epoch": 0.04476523641287836, + "grad_norm": 0.26953125, + "learning_rate": 0.0019915134258247248, + "loss": 0.1191, + "step": 5157 + }, + { + "epoch": 0.04477391689308253, + "grad_norm": 0.08837890625, + "learning_rate": 0.0019915093477571, + "loss": 0.1611, + "step": 5158 + }, + { + "epoch": 0.044782597373286694, + "grad_norm": 0.337890625, + "learning_rate": 0.001991505268714534, + "loss": 0.1367, + "step": 5159 + }, + { + "epoch": 0.04479127785349085, + "grad_norm": 0.400390625, + "learning_rate": 0.0019915011886970308, + "loss": 0.1758, + "step": 5160 + }, + { + "epoch": 0.04479995833369502, + "grad_norm": 0.1279296875, + "learning_rate": 0.0019914971077045953, + "loss": 0.1357, + "step": 5161 + }, + { + "epoch": 0.04480863881389918, + "grad_norm": 0.189453125, + "learning_rate": 0.0019914930257372322, + "loss": 0.1621, + "step": 5162 + }, + { + "epoch": 0.04481731929410335, + "grad_norm": 0.1904296875, + "learning_rate": 0.0019914889427949454, + "loss": 0.1523, + "step": 5163 + }, + { + "epoch": 0.04482599977430751, + "grad_norm": 0.26171875, + 
"learning_rate": 0.00199148485887774, + "loss": 0.1387, + "step": 5164 + }, + { + "epoch": 0.04483468025451168, + "grad_norm": 0.40234375, + "learning_rate": 0.0019914807739856203, + "loss": 0.1826, + "step": 5165 + }, + { + "epoch": 0.04484336073471584, + "grad_norm": 0.859375, + "learning_rate": 0.0019914766881185906, + "loss": 0.1797, + "step": 5166 + }, + { + "epoch": 0.04485204121492001, + "grad_norm": 0.640625, + "learning_rate": 0.001991472601276655, + "loss": 0.1689, + "step": 5167 + }, + { + "epoch": 0.04486072169512417, + "grad_norm": 0.1201171875, + "learning_rate": 0.0019914685134598186, + "loss": 0.123, + "step": 5168 + }, + { + "epoch": 0.04486940217532834, + "grad_norm": 0.1259765625, + "learning_rate": 0.001991464424668086, + "loss": 0.166, + "step": 5169 + }, + { + "epoch": 0.0448780826555325, + "grad_norm": 0.173828125, + "learning_rate": 0.0019914603349014607, + "loss": 0.1104, + "step": 5170 + }, + { + "epoch": 0.04488676313573667, + "grad_norm": 0.294921875, + "learning_rate": 0.001991456244159948, + "loss": 0.1855, + "step": 5171 + }, + { + "epoch": 0.04489544361594083, + "grad_norm": 1.9375, + "learning_rate": 0.0019914521524435525, + "loss": 0.1357, + "step": 5172 + }, + { + "epoch": 0.044904124096145, + "grad_norm": 0.23828125, + "learning_rate": 0.0019914480597522776, + "loss": 0.1602, + "step": 5173 + }, + { + "epoch": 0.04491280457634916, + "grad_norm": 0.1357421875, + "learning_rate": 0.001991443966086129, + "loss": 0.1455, + "step": 5174 + }, + { + "epoch": 0.04492148505655333, + "grad_norm": 0.306640625, + "learning_rate": 0.0019914398714451103, + "loss": 0.1738, + "step": 5175 + }, + { + "epoch": 0.044930165536757494, + "grad_norm": 0.57421875, + "learning_rate": 0.0019914357758292266, + "loss": 0.1533, + "step": 5176 + }, + { + "epoch": 0.04493884601696166, + "grad_norm": 0.3359375, + "learning_rate": 0.001991431679238482, + "loss": 0.166, + "step": 5177 + }, + { + "epoch": 0.044947526497165824, + "grad_norm": 0.0810546875, + "learning_rate": 0.0019914275816728813, + "loss": 0.126, + "step": 5178 + }, + { + "epoch": 0.04495620697736999, + "grad_norm": 0.2451171875, + "learning_rate": 0.001991423483132429, + "loss": 0.1709, + "step": 5179 + }, + { + "epoch": 0.044964887457574154, + "grad_norm": 0.83984375, + "learning_rate": 0.001991419383617129, + "loss": 0.1504, + "step": 5180 + }, + { + "epoch": 0.04497356793777832, + "grad_norm": 0.318359375, + "learning_rate": 0.001991415283126986, + "loss": 0.1328, + "step": 5181 + }, + { + "epoch": 0.044982248417982484, + "grad_norm": 0.20703125, + "learning_rate": 0.0019914111816620047, + "loss": 0.1021, + "step": 5182 + }, + { + "epoch": 0.04499092889818665, + "grad_norm": 0.3046875, + "learning_rate": 0.00199140707922219, + "loss": 0.1514, + "step": 5183 + }, + { + "epoch": 0.044999609378390815, + "grad_norm": 0.271484375, + "learning_rate": 0.0019914029758075456, + "loss": 0.1387, + "step": 5184 + }, + { + "epoch": 0.04500828985859498, + "grad_norm": 0.28515625, + "learning_rate": 0.0019913988714180763, + "loss": 0.1211, + "step": 5185 + }, + { + "epoch": 0.045016970338799145, + "grad_norm": 1.328125, + "learning_rate": 0.0019913947660537864, + "loss": 0.1514, + "step": 5186 + }, + { + "epoch": 0.04502565081900331, + "grad_norm": 0.2158203125, + "learning_rate": 0.001991390659714681, + "loss": 0.1602, + "step": 5187 + }, + { + "epoch": 0.045034331299207475, + "grad_norm": 0.5234375, + "learning_rate": 0.001991386552400764, + "loss": 0.1855, + "step": 5188 + }, + { + "epoch": 0.04504301177941164, + "grad_norm": 
0.7578125, + "learning_rate": 0.00199138244411204, + "loss": 0.1406, + "step": 5189 + }, + { + "epoch": 0.045051692259615805, + "grad_norm": 0.1513671875, + "learning_rate": 0.0019913783348485137, + "loss": 0.1406, + "step": 5190 + }, + { + "epoch": 0.045060372739819964, + "grad_norm": 0.318359375, + "learning_rate": 0.001991374224610189, + "loss": 0.2051, + "step": 5191 + }, + { + "epoch": 0.04506905322002413, + "grad_norm": 0.1669921875, + "learning_rate": 0.0019913701133970714, + "loss": 0.1416, + "step": 5192 + }, + { + "epoch": 0.045077733700228294, + "grad_norm": 0.51953125, + "learning_rate": 0.0019913660012091643, + "loss": 0.1445, + "step": 5193 + }, + { + "epoch": 0.04508641418043246, + "grad_norm": 0.2353515625, + "learning_rate": 0.0019913618880464732, + "loss": 0.1973, + "step": 5194 + }, + { + "epoch": 0.045095094660636624, + "grad_norm": 0.349609375, + "learning_rate": 0.001991357773909002, + "loss": 0.1836, + "step": 5195 + }, + { + "epoch": 0.04510377514084079, + "grad_norm": 2.40625, + "learning_rate": 0.0019913536587967554, + "loss": 0.1738, + "step": 5196 + }, + { + "epoch": 0.045112455621044954, + "grad_norm": 0.46875, + "learning_rate": 0.0019913495427097373, + "loss": 0.1895, + "step": 5197 + }, + { + "epoch": 0.04512113610124912, + "grad_norm": 0.59765625, + "learning_rate": 0.0019913454256479533, + "loss": 0.1396, + "step": 5198 + }, + { + "epoch": 0.045129816581453285, + "grad_norm": 0.1865234375, + "learning_rate": 0.0019913413076114075, + "loss": 0.1426, + "step": 5199 + }, + { + "epoch": 0.04513849706165745, + "grad_norm": 0.18359375, + "learning_rate": 0.0019913371886001036, + "loss": 0.1377, + "step": 5200 + }, + { + "epoch": 0.045147177541861615, + "grad_norm": 0.203125, + "learning_rate": 0.001991333068614047, + "loss": 0.1602, + "step": 5201 + }, + { + "epoch": 0.04515585802206578, + "grad_norm": 0.427734375, + "learning_rate": 0.001991328947653242, + "loss": 0.1748, + "step": 5202 + }, + { + "epoch": 0.045164538502269945, + "grad_norm": 0.3828125, + "learning_rate": 0.001991324825717693, + "loss": 0.1494, + "step": 5203 + }, + { + "epoch": 0.04517321898247411, + "grad_norm": 0.107421875, + "learning_rate": 0.0019913207028074047, + "loss": 0.1504, + "step": 5204 + }, + { + "epoch": 0.045181899462678275, + "grad_norm": 0.314453125, + "learning_rate": 0.0019913165789223813, + "loss": 0.1543, + "step": 5205 + }, + { + "epoch": 0.04519057994288244, + "grad_norm": 0.51953125, + "learning_rate": 0.0019913124540626276, + "loss": 0.1914, + "step": 5206 + }, + { + "epoch": 0.045199260423086605, + "grad_norm": 0.2294921875, + "learning_rate": 0.001991308328228148, + "loss": 0.1572, + "step": 5207 + }, + { + "epoch": 0.04520794090329077, + "grad_norm": 0.21875, + "learning_rate": 0.001991304201418947, + "loss": 0.2324, + "step": 5208 + }, + { + "epoch": 0.045216621383494936, + "grad_norm": 0.34765625, + "learning_rate": 0.001991300073635029, + "loss": 0.1562, + "step": 5209 + }, + { + "epoch": 0.0452253018636991, + "grad_norm": 1.109375, + "learning_rate": 0.001991295944876399, + "loss": 0.1973, + "step": 5210 + }, + { + "epoch": 0.045233982343903266, + "grad_norm": 0.11669921875, + "learning_rate": 0.0019912918151430608, + "loss": 0.1738, + "step": 5211 + }, + { + "epoch": 0.04524266282410743, + "grad_norm": 0.578125, + "learning_rate": 0.001991287684435019, + "loss": 0.1846, + "step": 5212 + }, + { + "epoch": 0.045251343304311596, + "grad_norm": 0.392578125, + "learning_rate": 0.001991283552752279, + "loss": 0.1455, + "step": 5213 + }, + { + "epoch": 
0.04526002378451576, + "grad_norm": 0.283203125, + "learning_rate": 0.001991279420094844, + "loss": 0.1025, + "step": 5214 + }, + { + "epoch": 0.045268704264719926, + "grad_norm": 0.416015625, + "learning_rate": 0.00199127528646272, + "loss": 0.1592, + "step": 5215 + }, + { + "epoch": 0.04527738474492409, + "grad_norm": 0.1328125, + "learning_rate": 0.0019912711518559104, + "loss": 0.1641, + "step": 5216 + }, + { + "epoch": 0.04528606522512826, + "grad_norm": 0.53515625, + "learning_rate": 0.00199126701627442, + "loss": 0.1406, + "step": 5217 + }, + { + "epoch": 0.04529474570533242, + "grad_norm": 0.29296875, + "learning_rate": 0.0019912628797182537, + "loss": 0.165, + "step": 5218 + }, + { + "epoch": 0.04530342618553659, + "grad_norm": 0.3203125, + "learning_rate": 0.001991258742187415, + "loss": 0.1113, + "step": 5219 + }, + { + "epoch": 0.04531210666574075, + "grad_norm": 0.98828125, + "learning_rate": 0.0019912546036819096, + "loss": 0.1807, + "step": 5220 + }, + { + "epoch": 0.04532078714594492, + "grad_norm": 0.16015625, + "learning_rate": 0.0019912504642017412, + "loss": 0.1602, + "step": 5221 + }, + { + "epoch": 0.045329467626149075, + "grad_norm": 0.1640625, + "learning_rate": 0.001991246323746915, + "loss": 0.1455, + "step": 5222 + }, + { + "epoch": 0.04533814810635324, + "grad_norm": 0.4140625, + "learning_rate": 0.0019912421823174353, + "loss": 0.207, + "step": 5223 + }, + { + "epoch": 0.045346828586557406, + "grad_norm": 0.38671875, + "learning_rate": 0.0019912380399133064, + "loss": 0.1426, + "step": 5224 + }, + { + "epoch": 0.04535550906676157, + "grad_norm": 0.953125, + "learning_rate": 0.001991233896534533, + "loss": 0.1396, + "step": 5225 + }, + { + "epoch": 0.045364189546965736, + "grad_norm": 0.08203125, + "learning_rate": 0.0019912297521811196, + "loss": 0.1348, + "step": 5226 + }, + { + "epoch": 0.0453728700271699, + "grad_norm": 0.65625, + "learning_rate": 0.001991225606853071, + "loss": 0.1475, + "step": 5227 + }, + { + "epoch": 0.045381550507374066, + "grad_norm": 0.162109375, + "learning_rate": 0.0019912214605503913, + "loss": 0.1562, + "step": 5228 + }, + { + "epoch": 0.04539023098757823, + "grad_norm": 0.61328125, + "learning_rate": 0.001991217313273085, + "loss": 0.1602, + "step": 5229 + }, + { + "epoch": 0.045398911467782396, + "grad_norm": 0.74609375, + "learning_rate": 0.001991213165021157, + "loss": 0.1094, + "step": 5230 + }, + { + "epoch": 0.04540759194798656, + "grad_norm": 0.162109375, + "learning_rate": 0.0019912090157946116, + "loss": 0.207, + "step": 5231 + }, + { + "epoch": 0.045416272428190727, + "grad_norm": 0.89453125, + "learning_rate": 0.0019912048655934536, + "loss": 0.1582, + "step": 5232 + }, + { + "epoch": 0.04542495290839489, + "grad_norm": 0.50390625, + "learning_rate": 0.0019912007144176867, + "loss": 0.1543, + "step": 5233 + }, + { + "epoch": 0.04543363338859906, + "grad_norm": 0.53125, + "learning_rate": 0.0019911965622673167, + "loss": 0.3008, + "step": 5234 + }, + { + "epoch": 0.04544231386880322, + "grad_norm": 0.4296875, + "learning_rate": 0.0019911924091423473, + "loss": 0.248, + "step": 5235 + }, + { + "epoch": 0.04545099434900739, + "grad_norm": 0.201171875, + "learning_rate": 0.0019911882550427834, + "loss": 0.168, + "step": 5236 + }, + { + "epoch": 0.04545967482921155, + "grad_norm": 0.234375, + "learning_rate": 0.0019911840999686293, + "loss": 0.165, + "step": 5237 + }, + { + "epoch": 0.04546835530941572, + "grad_norm": 0.12353515625, + "learning_rate": 0.0019911799439198898, + "loss": 0.2012, + "step": 5238 + }, + { + 
"epoch": 0.04547703578961988, + "grad_norm": 0.291015625, + "learning_rate": 0.001991175786896569, + "loss": 0.1689, + "step": 5239 + }, + { + "epoch": 0.04548571626982405, + "grad_norm": 0.169921875, + "learning_rate": 0.001991171628898672, + "loss": 0.1504, + "step": 5240 + }, + { + "epoch": 0.04549439675002821, + "grad_norm": 0.4296875, + "learning_rate": 0.0019911674699262033, + "loss": 0.1367, + "step": 5241 + }, + { + "epoch": 0.04550307723023238, + "grad_norm": 0.1357421875, + "learning_rate": 0.001991163309979167, + "loss": 0.1187, + "step": 5242 + }, + { + "epoch": 0.04551175771043654, + "grad_norm": 0.2451171875, + "learning_rate": 0.001991159149057568, + "loss": 0.1309, + "step": 5243 + }, + { + "epoch": 0.04552043819064071, + "grad_norm": 0.3046875, + "learning_rate": 0.001991154987161411, + "loss": 0.1807, + "step": 5244 + }, + { + "epoch": 0.04552911867084487, + "grad_norm": 0.4375, + "learning_rate": 0.0019911508242906996, + "loss": 0.1719, + "step": 5245 + }, + { + "epoch": 0.04553779915104904, + "grad_norm": 0.2021484375, + "learning_rate": 0.0019911466604454397, + "loss": 0.1641, + "step": 5246 + }, + { + "epoch": 0.0455464796312532, + "grad_norm": 0.40625, + "learning_rate": 0.0019911424956256347, + "loss": 0.1465, + "step": 5247 + }, + { + "epoch": 0.04555516011145737, + "grad_norm": 0.13671875, + "learning_rate": 0.00199113832983129, + "loss": 0.1445, + "step": 5248 + }, + { + "epoch": 0.045563840591661534, + "grad_norm": 0.33984375, + "learning_rate": 0.0019911341630624094, + "loss": 0.1777, + "step": 5249 + }, + { + "epoch": 0.0455725210718657, + "grad_norm": 1.1328125, + "learning_rate": 0.001991129995318998, + "loss": 0.1992, + "step": 5250 + }, + { + "epoch": 0.045581201552069864, + "grad_norm": 0.337890625, + "learning_rate": 0.0019911258266010604, + "loss": 0.1826, + "step": 5251 + }, + { + "epoch": 0.04558988203227402, + "grad_norm": 0.1748046875, + "learning_rate": 0.001991121656908601, + "loss": 0.1553, + "step": 5252 + }, + { + "epoch": 0.04559856251247819, + "grad_norm": 0.271484375, + "learning_rate": 0.0019911174862416244, + "loss": 0.1553, + "step": 5253 + }, + { + "epoch": 0.04560724299268235, + "grad_norm": 0.2890625, + "learning_rate": 0.001991113314600135, + "loss": 0.1484, + "step": 5254 + }, + { + "epoch": 0.04561592347288652, + "grad_norm": 0.353515625, + "learning_rate": 0.0019911091419841376, + "loss": 0.1445, + "step": 5255 + }, + { + "epoch": 0.04562460395309068, + "grad_norm": 0.1357421875, + "learning_rate": 0.0019911049683936366, + "loss": 0.1387, + "step": 5256 + }, + { + "epoch": 0.04563328443329485, + "grad_norm": 0.294921875, + "learning_rate": 0.0019911007938286365, + "loss": 0.1299, + "step": 5257 + }, + { + "epoch": 0.04564196491349901, + "grad_norm": 0.255859375, + "learning_rate": 0.001991096618289142, + "loss": 0.1367, + "step": 5258 + }, + { + "epoch": 0.04565064539370318, + "grad_norm": 0.5234375, + "learning_rate": 0.0019910924417751576, + "loss": 0.1445, + "step": 5259 + }, + { + "epoch": 0.04565932587390734, + "grad_norm": 1.0703125, + "learning_rate": 0.001991088264286688, + "loss": 0.3281, + "step": 5260 + }, + { + "epoch": 0.04566800635411151, + "grad_norm": 0.16796875, + "learning_rate": 0.001991084085823737, + "loss": 0.1543, + "step": 5261 + }, + { + "epoch": 0.04567668683431567, + "grad_norm": 0.359375, + "learning_rate": 0.001991079906386311, + "loss": 0.126, + "step": 5262 + }, + { + "epoch": 0.04568536731451984, + "grad_norm": 0.427734375, + "learning_rate": 0.0019910757259744127, + "loss": 0.1641, + "step": 5263 + 
}, + { + "epoch": 0.045694047794724, + "grad_norm": 0.2119140625, + "learning_rate": 0.0019910715445880475, + "loss": 0.125, + "step": 5264 + }, + { + "epoch": 0.04570272827492817, + "grad_norm": 0.1923828125, + "learning_rate": 0.0019910673622272196, + "loss": 0.1543, + "step": 5265 + }, + { + "epoch": 0.045711408755132334, + "grad_norm": 0.2060546875, + "learning_rate": 0.001991063178891934, + "loss": 0.1377, + "step": 5266 + }, + { + "epoch": 0.0457200892353365, + "grad_norm": 0.212890625, + "learning_rate": 0.0019910589945821952, + "loss": 0.1367, + "step": 5267 + }, + { + "epoch": 0.045728769715540664, + "grad_norm": 0.5078125, + "learning_rate": 0.0019910548092980074, + "loss": 0.2217, + "step": 5268 + }, + { + "epoch": 0.04573745019574483, + "grad_norm": 0.7890625, + "learning_rate": 0.001991050623039376, + "loss": 0.1289, + "step": 5269 + }, + { + "epoch": 0.045746130675948994, + "grad_norm": 0.08740234375, + "learning_rate": 0.0019910464358063045, + "loss": 0.2227, + "step": 5270 + }, + { + "epoch": 0.04575481115615316, + "grad_norm": 0.447265625, + "learning_rate": 0.001991042247598798, + "loss": 0.1797, + "step": 5271 + }, + { + "epoch": 0.045763491636357324, + "grad_norm": 0.12890625, + "learning_rate": 0.0019910380584168613, + "loss": 0.1348, + "step": 5272 + }, + { + "epoch": 0.04577217211656149, + "grad_norm": 0.453125, + "learning_rate": 0.0019910338682604988, + "loss": 0.1406, + "step": 5273 + }, + { + "epoch": 0.045780852596765655, + "grad_norm": 0.265625, + "learning_rate": 0.0019910296771297148, + "loss": 0.1289, + "step": 5274 + }, + { + "epoch": 0.04578953307696982, + "grad_norm": 0.1591796875, + "learning_rate": 0.0019910254850245145, + "loss": 0.126, + "step": 5275 + }, + { + "epoch": 0.045798213557173985, + "grad_norm": 0.1884765625, + "learning_rate": 0.001991021291944902, + "loss": 0.1245, + "step": 5276 + }, + { + "epoch": 0.04580689403737815, + "grad_norm": 0.4375, + "learning_rate": 0.001991017097890882, + "loss": 0.207, + "step": 5277 + }, + { + "epoch": 0.045815574517582315, + "grad_norm": 0.154296875, + "learning_rate": 0.0019910129028624587, + "loss": 0.2041, + "step": 5278 + }, + { + "epoch": 0.04582425499778648, + "grad_norm": 0.3125, + "learning_rate": 0.0019910087068596375, + "loss": 0.1797, + "step": 5279 + }, + { + "epoch": 0.045832935477990645, + "grad_norm": 0.2001953125, + "learning_rate": 0.0019910045098824225, + "loss": 0.1562, + "step": 5280 + }, + { + "epoch": 0.04584161595819481, + "grad_norm": 0.2216796875, + "learning_rate": 0.001991000311930818, + "loss": 0.2148, + "step": 5281 + }, + { + "epoch": 0.045850296438398976, + "grad_norm": 0.13671875, + "learning_rate": 0.0019909961130048295, + "loss": 0.166, + "step": 5282 + }, + { + "epoch": 0.045858976918603134, + "grad_norm": 0.07958984375, + "learning_rate": 0.0019909919131044606, + "loss": 0.1426, + "step": 5283 + }, + { + "epoch": 0.0458676573988073, + "grad_norm": 0.25390625, + "learning_rate": 0.0019909877122297167, + "loss": 0.1523, + "step": 5284 + }, + { + "epoch": 0.045876337879011464, + "grad_norm": 0.0791015625, + "learning_rate": 0.001990983510380602, + "loss": 0.1504, + "step": 5285 + }, + { + "epoch": 0.04588501835921563, + "grad_norm": 0.328125, + "learning_rate": 0.0019909793075571208, + "loss": 0.1768, + "step": 5286 + }, + { + "epoch": 0.045893698839419794, + "grad_norm": 0.11083984375, + "learning_rate": 0.0019909751037592784, + "loss": 0.1084, + "step": 5287 + }, + { + "epoch": 0.04590237931962396, + "grad_norm": 0.251953125, + "learning_rate": 0.0019909708989870787, + 
"loss": 0.1885, + "step": 5288 + }, + { + "epoch": 0.045911059799828124, + "grad_norm": 0.08251953125, + "learning_rate": 0.0019909666932405264, + "loss": 0.1582, + "step": 5289 + }, + { + "epoch": 0.04591974028003229, + "grad_norm": 0.59765625, + "learning_rate": 0.001990962486519627, + "loss": 0.1875, + "step": 5290 + }, + { + "epoch": 0.045928420760236455, + "grad_norm": 0.56640625, + "learning_rate": 0.001990958278824384, + "loss": 0.1309, + "step": 5291 + }, + { + "epoch": 0.04593710124044062, + "grad_norm": 1.2265625, + "learning_rate": 0.001990954070154802, + "loss": 0.1562, + "step": 5292 + }, + { + "epoch": 0.045945781720644785, + "grad_norm": 0.453125, + "learning_rate": 0.001990949860510887, + "loss": 0.1895, + "step": 5293 + }, + { + "epoch": 0.04595446220084895, + "grad_norm": 0.1142578125, + "learning_rate": 0.001990945649892642, + "loss": 0.1797, + "step": 5294 + }, + { + "epoch": 0.045963142681053115, + "grad_norm": 0.08642578125, + "learning_rate": 0.0019909414383000724, + "loss": 0.1396, + "step": 5295 + }, + { + "epoch": 0.04597182316125728, + "grad_norm": 0.10595703125, + "learning_rate": 0.0019909372257331825, + "loss": 0.1729, + "step": 5296 + }, + { + "epoch": 0.045980503641461445, + "grad_norm": 0.453125, + "learning_rate": 0.001990933012191977, + "loss": 0.1777, + "step": 5297 + }, + { + "epoch": 0.04598918412166561, + "grad_norm": 0.80078125, + "learning_rate": 0.001990928797676461, + "loss": 0.1543, + "step": 5298 + }, + { + "epoch": 0.045997864601869776, + "grad_norm": 0.181640625, + "learning_rate": 0.0019909245821866377, + "loss": 0.1582, + "step": 5299 + }, + { + "epoch": 0.04600654508207394, + "grad_norm": 0.283203125, + "learning_rate": 0.0019909203657225137, + "loss": 0.1338, + "step": 5300 + }, + { + "epoch": 0.046015225562278106, + "grad_norm": 0.2373046875, + "learning_rate": 0.0019909161482840917, + "loss": 0.1387, + "step": 5301 + }, + { + "epoch": 0.04602390604248227, + "grad_norm": 0.126953125, + "learning_rate": 0.001990911929871378, + "loss": 0.2197, + "step": 5302 + }, + { + "epoch": 0.046032586522686436, + "grad_norm": 0.09130859375, + "learning_rate": 0.001990907710484376, + "loss": 0.1572, + "step": 5303 + }, + { + "epoch": 0.0460412670028906, + "grad_norm": 0.333984375, + "learning_rate": 0.0019909034901230906, + "loss": 0.1445, + "step": 5304 + }, + { + "epoch": 0.046049947483094766, + "grad_norm": 0.84375, + "learning_rate": 0.0019908992687875265, + "loss": 0.1738, + "step": 5305 + }, + { + "epoch": 0.04605862796329893, + "grad_norm": 0.2451171875, + "learning_rate": 0.0019908950464776884, + "loss": 0.1641, + "step": 5306 + }, + { + "epoch": 0.0460673084435031, + "grad_norm": 0.4375, + "learning_rate": 0.001990890823193581, + "loss": 0.1445, + "step": 5307 + }, + { + "epoch": 0.04607598892370726, + "grad_norm": 0.2275390625, + "learning_rate": 0.0019908865989352086, + "loss": 0.2021, + "step": 5308 + }, + { + "epoch": 0.04608466940391143, + "grad_norm": 0.67578125, + "learning_rate": 0.0019908823737025765, + "loss": 0.2285, + "step": 5309 + }, + { + "epoch": 0.04609334988411559, + "grad_norm": 0.3203125, + "learning_rate": 0.001990878147495688, + "loss": 0.1289, + "step": 5310 + }, + { + "epoch": 0.04610203036431976, + "grad_norm": 0.8046875, + "learning_rate": 0.001990873920314549, + "loss": 0.1572, + "step": 5311 + }, + { + "epoch": 0.04611071084452392, + "grad_norm": 0.2734375, + "learning_rate": 0.001990869692159164, + "loss": 0.2012, + "step": 5312 + }, + { + "epoch": 0.04611939132472809, + "grad_norm": 0.1796875, + "learning_rate": 
0.0019908654630295367, + "loss": 0.2109, + "step": 5313 + }, + { + "epoch": 0.046128071804932246, + "grad_norm": 0.263671875, + "learning_rate": 0.0019908612329256727, + "loss": 0.1973, + "step": 5314 + }, + { + "epoch": 0.04613675228513641, + "grad_norm": 0.3515625, + "learning_rate": 0.0019908570018475763, + "loss": 0.127, + "step": 5315 + }, + { + "epoch": 0.046145432765340576, + "grad_norm": 0.447265625, + "learning_rate": 0.001990852769795252, + "loss": 0.165, + "step": 5316 + }, + { + "epoch": 0.04615411324554474, + "grad_norm": 0.314453125, + "learning_rate": 0.001990848536768704, + "loss": 0.1226, + "step": 5317 + }, + { + "epoch": 0.046162793725748906, + "grad_norm": 0.1337890625, + "learning_rate": 0.0019908443027679378, + "loss": 0.1807, + "step": 5318 + }, + { + "epoch": 0.04617147420595307, + "grad_norm": 0.130859375, + "learning_rate": 0.0019908400677929577, + "loss": 0.1504, + "step": 5319 + }, + { + "epoch": 0.046180154686157236, + "grad_norm": 0.1728515625, + "learning_rate": 0.001990835831843768, + "loss": 0.1426, + "step": 5320 + }, + { + "epoch": 0.0461888351663614, + "grad_norm": 0.251953125, + "learning_rate": 0.001990831594920374, + "loss": 0.1318, + "step": 5321 + }, + { + "epoch": 0.046197515646565566, + "grad_norm": 0.45703125, + "learning_rate": 0.0019908273570227795, + "loss": 0.1406, + "step": 5322 + }, + { + "epoch": 0.04620619612676973, + "grad_norm": 0.2001953125, + "learning_rate": 0.0019908231181509904, + "loss": 0.1953, + "step": 5323 + }, + { + "epoch": 0.0462148766069739, + "grad_norm": 0.60546875, + "learning_rate": 0.00199081887830501, + "loss": 0.1787, + "step": 5324 + }, + { + "epoch": 0.04622355708717806, + "grad_norm": 0.52734375, + "learning_rate": 0.001990814637484843, + "loss": 0.1187, + "step": 5325 + }, + { + "epoch": 0.04623223756738223, + "grad_norm": 0.2158203125, + "learning_rate": 0.001990810395690495, + "loss": 0.168, + "step": 5326 + }, + { + "epoch": 0.04624091804758639, + "grad_norm": 0.2001953125, + "learning_rate": 0.0019908061529219706, + "loss": 0.1729, + "step": 5327 + }, + { + "epoch": 0.04624959852779056, + "grad_norm": 0.25, + "learning_rate": 0.0019908019091792735, + "loss": 0.2041, + "step": 5328 + }, + { + "epoch": 0.04625827900799472, + "grad_norm": 0.0869140625, + "learning_rate": 0.0019907976644624086, + "loss": 0.1187, + "step": 5329 + }, + { + "epoch": 0.04626695948819889, + "grad_norm": 0.275390625, + "learning_rate": 0.0019907934187713807, + "loss": 0.1787, + "step": 5330 + }, + { + "epoch": 0.04627563996840305, + "grad_norm": 0.2578125, + "learning_rate": 0.001990789172106195, + "loss": 0.1426, + "step": 5331 + }, + { + "epoch": 0.04628432044860722, + "grad_norm": 0.55078125, + "learning_rate": 0.0019907849244668553, + "loss": 0.1816, + "step": 5332 + }, + { + "epoch": 0.04629300092881138, + "grad_norm": 0.271484375, + "learning_rate": 0.001990780675853367, + "loss": 0.1523, + "step": 5333 + }, + { + "epoch": 0.04630168140901555, + "grad_norm": 0.458984375, + "learning_rate": 0.001990776426265734, + "loss": 0.1914, + "step": 5334 + }, + { + "epoch": 0.04631036188921971, + "grad_norm": 0.2490234375, + "learning_rate": 0.0019907721757039614, + "loss": 0.1582, + "step": 5335 + }, + { + "epoch": 0.04631904236942388, + "grad_norm": 0.6171875, + "learning_rate": 0.0019907679241680538, + "loss": 0.1543, + "step": 5336 + }, + { + "epoch": 0.04632772284962804, + "grad_norm": 0.1787109375, + "learning_rate": 0.0019907636716580152, + "loss": 0.1143, + "step": 5337 + }, + { + "epoch": 0.04633640332983221, + "grad_norm": 
0.275390625, + "learning_rate": 0.0019907594181738514, + "loss": 0.1348, + "step": 5338 + }, + { + "epoch": 0.046345083810036374, + "grad_norm": 0.2578125, + "learning_rate": 0.001990755163715566, + "loss": 0.126, + "step": 5339 + }, + { + "epoch": 0.04635376429024054, + "grad_norm": 0.11376953125, + "learning_rate": 0.001990750908283165, + "loss": 0.1719, + "step": 5340 + }, + { + "epoch": 0.046362444770444704, + "grad_norm": 0.78125, + "learning_rate": 0.0019907466518766515, + "loss": 0.1543, + "step": 5341 + }, + { + "epoch": 0.04637112525064887, + "grad_norm": 0.2734375, + "learning_rate": 0.001990742394496031, + "loss": 0.105, + "step": 5342 + }, + { + "epoch": 0.046379805730853034, + "grad_norm": 0.30078125, + "learning_rate": 0.001990738136141308, + "loss": 0.1338, + "step": 5343 + }, + { + "epoch": 0.0463884862110572, + "grad_norm": 0.279296875, + "learning_rate": 0.0019907338768124874, + "loss": 0.1533, + "step": 5344 + }, + { + "epoch": 0.04639716669126136, + "grad_norm": 0.3203125, + "learning_rate": 0.001990729616509573, + "loss": 0.1455, + "step": 5345 + }, + { + "epoch": 0.04640584717146552, + "grad_norm": 0.1728515625, + "learning_rate": 0.0019907253552325704, + "loss": 0.1182, + "step": 5346 + }, + { + "epoch": 0.04641452765166969, + "grad_norm": 0.7109375, + "learning_rate": 0.001990721092981484, + "loss": 0.1758, + "step": 5347 + }, + { + "epoch": 0.04642320813187385, + "grad_norm": 0.38671875, + "learning_rate": 0.0019907168297563184, + "loss": 0.1699, + "step": 5348 + }, + { + "epoch": 0.04643188861207802, + "grad_norm": 0.921875, + "learning_rate": 0.0019907125655570785, + "loss": 0.1699, + "step": 5349 + }, + { + "epoch": 0.04644056909228218, + "grad_norm": 0.396484375, + "learning_rate": 0.0019907083003837685, + "loss": 0.1396, + "step": 5350 + }, + { + "epoch": 0.04644924957248635, + "grad_norm": 0.337890625, + "learning_rate": 0.001990704034236393, + "loss": 0.1406, + "step": 5351 + }, + { + "epoch": 0.04645793005269051, + "grad_norm": 1.09375, + "learning_rate": 0.0019906997671149573, + "loss": 0.1602, + "step": 5352 + }, + { + "epoch": 0.04646661053289468, + "grad_norm": 0.1982421875, + "learning_rate": 0.0019906954990194657, + "loss": 0.1494, + "step": 5353 + }, + { + "epoch": 0.04647529101309884, + "grad_norm": 0.1005859375, + "learning_rate": 0.001990691229949923, + "loss": 0.1816, + "step": 5354 + }, + { + "epoch": 0.04648397149330301, + "grad_norm": 0.0771484375, + "learning_rate": 0.0019906869599063334, + "loss": 0.1357, + "step": 5355 + }, + { + "epoch": 0.046492651973507174, + "grad_norm": 0.1494140625, + "learning_rate": 0.0019906826888887023, + "loss": 0.127, + "step": 5356 + }, + { + "epoch": 0.04650133245371134, + "grad_norm": 0.5234375, + "learning_rate": 0.0019906784168970335, + "loss": 0.1699, + "step": 5357 + }, + { + "epoch": 0.046510012933915504, + "grad_norm": 0.66796875, + "learning_rate": 0.0019906741439313326, + "loss": 0.1641, + "step": 5358 + }, + { + "epoch": 0.04651869341411967, + "grad_norm": 0.44140625, + "learning_rate": 0.0019906698699916035, + "loss": 0.1553, + "step": 5359 + }, + { + "epoch": 0.046527373894323834, + "grad_norm": 0.150390625, + "learning_rate": 0.0019906655950778515, + "loss": 0.1514, + "step": 5360 + }, + { + "epoch": 0.046536054374528, + "grad_norm": 0.1728515625, + "learning_rate": 0.001990661319190081, + "loss": 0.166, + "step": 5361 + }, + { + "epoch": 0.046544734854732164, + "grad_norm": 0.166015625, + "learning_rate": 0.0019906570423282965, + "loss": 0.1328, + "step": 5362 + }, + { + "epoch": 
0.04655341533493633, + "grad_norm": 0.37890625, + "learning_rate": 0.0019906527644925026, + "loss": 0.0996, + "step": 5363 + }, + { + "epoch": 0.046562095815140495, + "grad_norm": 0.41796875, + "learning_rate": 0.001990648485682705, + "loss": 0.1592, + "step": 5364 + }, + { + "epoch": 0.04657077629534466, + "grad_norm": 0.2265625, + "learning_rate": 0.001990644205898907, + "loss": 0.1533, + "step": 5365 + }, + { + "epoch": 0.046579456775548825, + "grad_norm": 0.236328125, + "learning_rate": 0.001990639925141114, + "loss": 0.1387, + "step": 5366 + }, + { + "epoch": 0.04658813725575299, + "grad_norm": 0.07080078125, + "learning_rate": 0.0019906356434093303, + "loss": 0.1118, + "step": 5367 + }, + { + "epoch": 0.046596817735957155, + "grad_norm": 0.123046875, + "learning_rate": 0.0019906313607035613, + "loss": 0.1875, + "step": 5368 + }, + { + "epoch": 0.04660549821616132, + "grad_norm": 0.0771484375, + "learning_rate": 0.001990627077023811, + "loss": 0.127, + "step": 5369 + }, + { + "epoch": 0.046614178696365485, + "grad_norm": 0.1806640625, + "learning_rate": 0.0019906227923700845, + "loss": 0.1504, + "step": 5370 + }, + { + "epoch": 0.04662285917656965, + "grad_norm": 0.076171875, + "learning_rate": 0.001990618506742386, + "loss": 0.1387, + "step": 5371 + }, + { + "epoch": 0.046631539656773816, + "grad_norm": 0.27734375, + "learning_rate": 0.0019906142201407207, + "loss": 0.1719, + "step": 5372 + }, + { + "epoch": 0.04664022013697798, + "grad_norm": 0.08740234375, + "learning_rate": 0.001990609932565093, + "loss": 0.1719, + "step": 5373 + }, + { + "epoch": 0.046648900617182146, + "grad_norm": 0.1435546875, + "learning_rate": 0.0019906056440155075, + "loss": 0.1152, + "step": 5374 + }, + { + "epoch": 0.04665758109738631, + "grad_norm": 0.1962890625, + "learning_rate": 0.001990601354491969, + "loss": 0.1201, + "step": 5375 + }, + { + "epoch": 0.04666626157759047, + "grad_norm": 0.134765625, + "learning_rate": 0.0019905970639944827, + "loss": 0.1602, + "step": 5376 + }, + { + "epoch": 0.046674942057794634, + "grad_norm": 0.3203125, + "learning_rate": 0.0019905927725230523, + "loss": 0.207, + "step": 5377 + }, + { + "epoch": 0.0466836225379988, + "grad_norm": 0.322265625, + "learning_rate": 0.001990588480077683, + "loss": 0.127, + "step": 5378 + }, + { + "epoch": 0.046692303018202964, + "grad_norm": 0.283203125, + "learning_rate": 0.0019905841866583803, + "loss": 0.1582, + "step": 5379 + }, + { + "epoch": 0.04670098349840713, + "grad_norm": 0.283203125, + "learning_rate": 0.0019905798922651473, + "loss": 0.1807, + "step": 5380 + }, + { + "epoch": 0.046709663978611295, + "grad_norm": 0.12890625, + "learning_rate": 0.0019905755968979903, + "loss": 0.1699, + "step": 5381 + }, + { + "epoch": 0.04671834445881546, + "grad_norm": 0.0908203125, + "learning_rate": 0.0019905713005569123, + "loss": 0.1426, + "step": 5382 + }, + { + "epoch": 0.046727024939019625, + "grad_norm": 0.25, + "learning_rate": 0.0019905670032419193, + "loss": 0.2109, + "step": 5383 + }, + { + "epoch": 0.04673570541922379, + "grad_norm": 0.703125, + "learning_rate": 0.0019905627049530156, + "loss": 0.1191, + "step": 5384 + }, + { + "epoch": 0.046744385899427955, + "grad_norm": 0.10009765625, + "learning_rate": 0.001990558405690206, + "loss": 0.1621, + "step": 5385 + }, + { + "epoch": 0.04675306637963212, + "grad_norm": 0.1806640625, + "learning_rate": 0.001990554105453495, + "loss": 0.1709, + "step": 5386 + }, + { + "epoch": 0.046761746859836285, + "grad_norm": 0.2373046875, + "learning_rate": 0.0019905498042428874, + "loss": 
0.1309, + "step": 5387 + }, + { + "epoch": 0.04677042734004045, + "grad_norm": 0.125, + "learning_rate": 0.001990545502058388, + "loss": 0.126, + "step": 5388 + }, + { + "epoch": 0.046779107820244616, + "grad_norm": 0.2578125, + "learning_rate": 0.0019905411989000017, + "loss": 0.1104, + "step": 5389 + }, + { + "epoch": 0.04678778830044878, + "grad_norm": 0.1171875, + "learning_rate": 0.0019905368947677328, + "loss": 0.1953, + "step": 5390 + }, + { + "epoch": 0.046796468780652946, + "grad_norm": 0.244140625, + "learning_rate": 0.001990532589661586, + "loss": 0.1611, + "step": 5391 + }, + { + "epoch": 0.04680514926085711, + "grad_norm": 0.765625, + "learning_rate": 0.001990528283581566, + "loss": 0.1533, + "step": 5392 + }, + { + "epoch": 0.046813829741061276, + "grad_norm": 0.13671875, + "learning_rate": 0.001990523976527678, + "loss": 0.1514, + "step": 5393 + }, + { + "epoch": 0.04682251022126544, + "grad_norm": 0.142578125, + "learning_rate": 0.0019905196684999256, + "loss": 0.1094, + "step": 5394 + }, + { + "epoch": 0.046831190701469606, + "grad_norm": 0.166015625, + "learning_rate": 0.001990515359498315, + "loss": 0.1602, + "step": 5395 + }, + { + "epoch": 0.04683987118167377, + "grad_norm": 0.1474609375, + "learning_rate": 0.0019905110495228497, + "loss": 0.1777, + "step": 5396 + }, + { + "epoch": 0.04684855166187794, + "grad_norm": 0.1298828125, + "learning_rate": 0.001990506738573535, + "loss": 0.1992, + "step": 5397 + }, + { + "epoch": 0.0468572321420821, + "grad_norm": 0.08203125, + "learning_rate": 0.001990502426650376, + "loss": 0.1484, + "step": 5398 + }, + { + "epoch": 0.04686591262228627, + "grad_norm": 0.3671875, + "learning_rate": 0.0019904981137533763, + "loss": 0.1543, + "step": 5399 + }, + { + "epoch": 0.04687459310249043, + "grad_norm": 0.2177734375, + "learning_rate": 0.001990493799882542, + "loss": 0.1621, + "step": 5400 + }, + { + "epoch": 0.0468832735826946, + "grad_norm": 1.28125, + "learning_rate": 0.0019904894850378764, + "loss": 0.2236, + "step": 5401 + }, + { + "epoch": 0.04689195406289876, + "grad_norm": 0.53515625, + "learning_rate": 0.001990485169219385, + "loss": 0.165, + "step": 5402 + }, + { + "epoch": 0.04690063454310293, + "grad_norm": 0.296875, + "learning_rate": 0.0019904808524270727, + "loss": 0.1621, + "step": 5403 + }, + { + "epoch": 0.04690931502330709, + "grad_norm": 0.609375, + "learning_rate": 0.0019904765346609433, + "loss": 0.1348, + "step": 5404 + }, + { + "epoch": 0.04691799550351126, + "grad_norm": 2.546875, + "learning_rate": 0.001990472215921003, + "loss": 0.5156, + "step": 5405 + }, + { + "epoch": 0.046926675983715416, + "grad_norm": 0.171875, + "learning_rate": 0.001990467896207255, + "loss": 0.1162, + "step": 5406 + }, + { + "epoch": 0.04693535646391958, + "grad_norm": 0.30078125, + "learning_rate": 0.001990463575519705, + "loss": 0.1777, + "step": 5407 + }, + { + "epoch": 0.046944036944123746, + "grad_norm": 0.109375, + "learning_rate": 0.001990459253858357, + "loss": 0.1641, + "step": 5408 + }, + { + "epoch": 0.04695271742432791, + "grad_norm": 0.396484375, + "learning_rate": 0.0019904549312232166, + "loss": 0.1709, + "step": 5409 + }, + { + "epoch": 0.046961397904532076, + "grad_norm": 0.62890625, + "learning_rate": 0.001990450607614288, + "loss": 0.1523, + "step": 5410 + }, + { + "epoch": 0.04697007838473624, + "grad_norm": 0.384765625, + "learning_rate": 0.001990446283031576, + "loss": 0.166, + "step": 5411 + }, + { + "epoch": 0.046978758864940406, + "grad_norm": 0.34765625, + "learning_rate": 0.001990441957475085, + "loss": 
0.1133, + "step": 5412 + }, + { + "epoch": 0.04698743934514457, + "grad_norm": 1.0625, + "learning_rate": 0.00199043763094482, + "loss": 0.1533, + "step": 5413 + }, + { + "epoch": 0.04699611982534874, + "grad_norm": 0.294921875, + "learning_rate": 0.0019904333034407866, + "loss": 0.126, + "step": 5414 + }, + { + "epoch": 0.0470048003055529, + "grad_norm": 0.28125, + "learning_rate": 0.001990428974962988, + "loss": 0.1338, + "step": 5415 + }, + { + "epoch": 0.04701348078575707, + "grad_norm": 0.07470703125, + "learning_rate": 0.00199042464551143, + "loss": 0.1611, + "step": 5416 + }, + { + "epoch": 0.04702216126596123, + "grad_norm": 0.283203125, + "learning_rate": 0.0019904203150861166, + "loss": 0.1357, + "step": 5417 + }, + { + "epoch": 0.0470308417461654, + "grad_norm": 0.34375, + "learning_rate": 0.0019904159836870534, + "loss": 0.1387, + "step": 5418 + }, + { + "epoch": 0.04703952222636956, + "grad_norm": 0.091796875, + "learning_rate": 0.001990411651314244, + "loss": 0.1289, + "step": 5419 + }, + { + "epoch": 0.04704820270657373, + "grad_norm": 0.7109375, + "learning_rate": 0.0019904073179676943, + "loss": 0.1289, + "step": 5420 + }, + { + "epoch": 0.04705688318677789, + "grad_norm": 0.3984375, + "learning_rate": 0.001990402983647409, + "loss": 0.1006, + "step": 5421 + }, + { + "epoch": 0.04706556366698206, + "grad_norm": 0.1337890625, + "learning_rate": 0.0019903986483533914, + "loss": 0.1309, + "step": 5422 + }, + { + "epoch": 0.04707424414718622, + "grad_norm": 0.107421875, + "learning_rate": 0.0019903943120856476, + "loss": 0.127, + "step": 5423 + }, + { + "epoch": 0.04708292462739039, + "grad_norm": 0.271484375, + "learning_rate": 0.001990389974844182, + "loss": 0.1406, + "step": 5424 + }, + { + "epoch": 0.04709160510759455, + "grad_norm": 0.494140625, + "learning_rate": 0.0019903856366289994, + "loss": 0.1089, + "step": 5425 + }, + { + "epoch": 0.04710028558779872, + "grad_norm": 0.380859375, + "learning_rate": 0.001990381297440104, + "loss": 0.1875, + "step": 5426 + }, + { + "epoch": 0.04710896606800288, + "grad_norm": 0.11376953125, + "learning_rate": 0.0019903769572775015, + "loss": 0.1602, + "step": 5427 + }, + { + "epoch": 0.04711764654820705, + "grad_norm": 0.4921875, + "learning_rate": 0.0019903726161411956, + "loss": 0.166, + "step": 5428 + }, + { + "epoch": 0.04712632702841121, + "grad_norm": 0.259765625, + "learning_rate": 0.001990368274031192, + "loss": 0.1455, + "step": 5429 + }, + { + "epoch": 0.04713500750861538, + "grad_norm": 0.18359375, + "learning_rate": 0.001990363930947495, + "loss": 0.1621, + "step": 5430 + }, + { + "epoch": 0.047143687988819544, + "grad_norm": 0.259765625, + "learning_rate": 0.0019903595868901096, + "loss": 0.1699, + "step": 5431 + }, + { + "epoch": 0.04715236846902371, + "grad_norm": 0.09619140625, + "learning_rate": 0.0019903552418590402, + "loss": 0.1328, + "step": 5432 + }, + { + "epoch": 0.047161048949227874, + "grad_norm": 0.09814453125, + "learning_rate": 0.0019903508958542915, + "loss": 0.1299, + "step": 5433 + }, + { + "epoch": 0.04716972942943204, + "grad_norm": 0.23828125, + "learning_rate": 0.0019903465488758684, + "loss": 0.1699, + "step": 5434 + }, + { + "epoch": 0.047178409909636204, + "grad_norm": 0.09130859375, + "learning_rate": 0.001990342200923776, + "loss": 0.1758, + "step": 5435 + }, + { + "epoch": 0.04718709038984037, + "grad_norm": 0.5078125, + "learning_rate": 0.0019903378519980186, + "loss": 0.1729, + "step": 5436 + }, + { + "epoch": 0.04719577087004453, + "grad_norm": 0.1533203125, + "learning_rate": 
0.001990333502098601, + "loss": 0.1738, + "step": 5437 + }, + { + "epoch": 0.04720445135024869, + "grad_norm": 0.3671875, + "learning_rate": 0.001990329151225528, + "loss": 0.2246, + "step": 5438 + }, + { + "epoch": 0.04721313183045286, + "grad_norm": 0.314453125, + "learning_rate": 0.0019903247993788044, + "loss": 0.1895, + "step": 5439 + }, + { + "epoch": 0.04722181231065702, + "grad_norm": 0.177734375, + "learning_rate": 0.001990320446558435, + "loss": 0.1426, + "step": 5440 + }, + { + "epoch": 0.04723049279086119, + "grad_norm": 0.216796875, + "learning_rate": 0.001990316092764425, + "loss": 0.2334, + "step": 5441 + }, + { + "epoch": 0.04723917327106535, + "grad_norm": 0.146484375, + "learning_rate": 0.001990311737996778, + "loss": 0.1533, + "step": 5442 + }, + { + "epoch": 0.04724785375126952, + "grad_norm": 0.33203125, + "learning_rate": 0.0019903073822555004, + "loss": 0.1216, + "step": 5443 + }, + { + "epoch": 0.04725653423147368, + "grad_norm": 0.341796875, + "learning_rate": 0.001990303025540595, + "loss": 0.1152, + "step": 5444 + }, + { + "epoch": 0.04726521471167785, + "grad_norm": 0.0908203125, + "learning_rate": 0.001990298667852068, + "loss": 0.1562, + "step": 5445 + }, + { + "epoch": 0.047273895191882014, + "grad_norm": 0.63671875, + "learning_rate": 0.0019902943091899238, + "loss": 0.1631, + "step": 5446 + }, + { + "epoch": 0.04728257567208618, + "grad_norm": 0.70703125, + "learning_rate": 0.001990289949554167, + "loss": 0.1406, + "step": 5447 + }, + { + "epoch": 0.047291256152290344, + "grad_norm": 0.0703125, + "learning_rate": 0.0019902855889448027, + "loss": 0.1367, + "step": 5448 + }, + { + "epoch": 0.04729993663249451, + "grad_norm": 0.462890625, + "learning_rate": 0.0019902812273618355, + "loss": 0.1973, + "step": 5449 + }, + { + "epoch": 0.047308617112698674, + "grad_norm": 1.9375, + "learning_rate": 0.0019902768648052695, + "loss": 0.2598, + "step": 5450 + }, + { + "epoch": 0.04731729759290284, + "grad_norm": 0.44921875, + "learning_rate": 0.001990272501275111, + "loss": 0.1592, + "step": 5451 + }, + { + "epoch": 0.047325978073107004, + "grad_norm": 0.171875, + "learning_rate": 0.0019902681367713632, + "loss": 0.2188, + "step": 5452 + }, + { + "epoch": 0.04733465855331117, + "grad_norm": 0.1669921875, + "learning_rate": 0.001990263771294032, + "loss": 0.1416, + "step": 5453 + }, + { + "epoch": 0.047343339033515335, + "grad_norm": 0.275390625, + "learning_rate": 0.001990259404843121, + "loss": 0.1582, + "step": 5454 + }, + { + "epoch": 0.0473520195137195, + "grad_norm": 0.609375, + "learning_rate": 0.001990255037418636, + "loss": 0.1885, + "step": 5455 + }, + { + "epoch": 0.047360699993923665, + "grad_norm": 0.08544921875, + "learning_rate": 0.001990250669020582, + "loss": 0.1045, + "step": 5456 + }, + { + "epoch": 0.04736938047412783, + "grad_norm": 0.77734375, + "learning_rate": 0.001990246299648963, + "loss": 0.166, + "step": 5457 + }, + { + "epoch": 0.047378060954331995, + "grad_norm": 0.58203125, + "learning_rate": 0.001990241929303784, + "loss": 0.1338, + "step": 5458 + }, + { + "epoch": 0.04738674143453616, + "grad_norm": 0.478515625, + "learning_rate": 0.0019902375579850494, + "loss": 0.1436, + "step": 5459 + }, + { + "epoch": 0.047395421914740325, + "grad_norm": 0.125, + "learning_rate": 0.0019902331856927647, + "loss": 0.1562, + "step": 5460 + }, + { + "epoch": 0.04740410239494449, + "grad_norm": 0.09619140625, + "learning_rate": 0.0019902288124269345, + "loss": 0.2129, + "step": 5461 + }, + { + "epoch": 0.047412782875148655, + "grad_norm": 0.087890625, + 
"learning_rate": 0.0019902244381875636, + "loss": 0.1426, + "step": 5462 + }, + { + "epoch": 0.04742146335535282, + "grad_norm": 0.287109375, + "learning_rate": 0.001990220062974656, + "loss": 0.1543, + "step": 5463 + }, + { + "epoch": 0.047430143835556986, + "grad_norm": 0.5703125, + "learning_rate": 0.0019902156867882175, + "loss": 0.1426, + "step": 5464 + }, + { + "epoch": 0.04743882431576115, + "grad_norm": 0.44921875, + "learning_rate": 0.0019902113096282527, + "loss": 0.1426, + "step": 5465 + }, + { + "epoch": 0.047447504795965316, + "grad_norm": 0.11181640625, + "learning_rate": 0.0019902069314947654, + "loss": 0.1582, + "step": 5466 + }, + { + "epoch": 0.04745618527616948, + "grad_norm": 0.1669921875, + "learning_rate": 0.0019902025523877618, + "loss": 0.1406, + "step": 5467 + }, + { + "epoch": 0.04746486575637364, + "grad_norm": 0.2353515625, + "learning_rate": 0.001990198172307246, + "loss": 0.1729, + "step": 5468 + }, + { + "epoch": 0.047473546236577804, + "grad_norm": 0.1806640625, + "learning_rate": 0.001990193791253223, + "loss": 0.1279, + "step": 5469 + }, + { + "epoch": 0.04748222671678197, + "grad_norm": 0.314453125, + "learning_rate": 0.001990189409225697, + "loss": 0.1611, + "step": 5470 + }, + { + "epoch": 0.047490907196986135, + "grad_norm": 0.1376953125, + "learning_rate": 0.0019901850262246737, + "loss": 0.124, + "step": 5471 + }, + { + "epoch": 0.0474995876771903, + "grad_norm": 0.4453125, + "learning_rate": 0.0019901806422501574, + "loss": 0.1348, + "step": 5472 + }, + { + "epoch": 0.047508268157394465, + "grad_norm": 0.08935546875, + "learning_rate": 0.001990176257302153, + "loss": 0.1318, + "step": 5473 + }, + { + "epoch": 0.04751694863759863, + "grad_norm": 0.359375, + "learning_rate": 0.001990171871380665, + "loss": 0.1816, + "step": 5474 + }, + { + "epoch": 0.047525629117802795, + "grad_norm": 0.359375, + "learning_rate": 0.001990167484485698, + "loss": 0.1562, + "step": 5475 + }, + { + "epoch": 0.04753430959800696, + "grad_norm": 0.1337890625, + "learning_rate": 0.0019901630966172577, + "loss": 0.1416, + "step": 5476 + }, + { + "epoch": 0.047542990078211125, + "grad_norm": 0.76171875, + "learning_rate": 0.0019901587077753484, + "loss": 0.165, + "step": 5477 + }, + { + "epoch": 0.04755167055841529, + "grad_norm": 0.421875, + "learning_rate": 0.0019901543179599744, + "loss": 0.1641, + "step": 5478 + }, + { + "epoch": 0.047560351038619456, + "grad_norm": 0.703125, + "learning_rate": 0.0019901499271711416, + "loss": 0.1738, + "step": 5479 + }, + { + "epoch": 0.04756903151882362, + "grad_norm": 0.095703125, + "learning_rate": 0.0019901455354088544, + "loss": 0.1572, + "step": 5480 + }, + { + "epoch": 0.047577711999027786, + "grad_norm": 0.177734375, + "learning_rate": 0.0019901411426731168, + "loss": 0.1631, + "step": 5481 + }, + { + "epoch": 0.04758639247923195, + "grad_norm": 0.2177734375, + "learning_rate": 0.0019901367489639347, + "loss": 0.166, + "step": 5482 + }, + { + "epoch": 0.047595072959436116, + "grad_norm": 0.1728515625, + "learning_rate": 0.001990132354281312, + "loss": 0.1855, + "step": 5483 + }, + { + "epoch": 0.04760375343964028, + "grad_norm": 0.23046875, + "learning_rate": 0.0019901279586252542, + "loss": 0.1738, + "step": 5484 + }, + { + "epoch": 0.047612433919844446, + "grad_norm": 0.416015625, + "learning_rate": 0.001990123561995766, + "loss": 0.1543, + "step": 5485 + }, + { + "epoch": 0.04762111440004861, + "grad_norm": 0.68359375, + "learning_rate": 0.0019901191643928516, + "loss": 0.1904, + "step": 5486 + }, + { + "epoch": 
0.047629794880252777, + "grad_norm": 0.64453125, + "learning_rate": 0.001990114765816517, + "loss": 0.1562, + "step": 5487 + }, + { + "epoch": 0.04763847536045694, + "grad_norm": 0.34375, + "learning_rate": 0.0019901103662667654, + "loss": 0.1289, + "step": 5488 + }, + { + "epoch": 0.04764715584066111, + "grad_norm": 0.12158203125, + "learning_rate": 0.0019901059657436025, + "loss": 0.1904, + "step": 5489 + }, + { + "epoch": 0.04765583632086527, + "grad_norm": 0.10986328125, + "learning_rate": 0.001990101564247034, + "loss": 0.1787, + "step": 5490 + }, + { + "epoch": 0.04766451680106944, + "grad_norm": 0.4140625, + "learning_rate": 0.001990097161777063, + "loss": 0.1758, + "step": 5491 + }, + { + "epoch": 0.0476731972812736, + "grad_norm": 0.8125, + "learning_rate": 0.0019900927583336955, + "loss": 0.1396, + "step": 5492 + }, + { + "epoch": 0.04768187776147777, + "grad_norm": 0.1875, + "learning_rate": 0.001990088353916936, + "loss": 0.165, + "step": 5493 + }, + { + "epoch": 0.04769055824168193, + "grad_norm": 0.244140625, + "learning_rate": 0.0019900839485267885, + "loss": 0.127, + "step": 5494 + }, + { + "epoch": 0.0476992387218861, + "grad_norm": 0.09423828125, + "learning_rate": 0.0019900795421632592, + "loss": 0.1777, + "step": 5495 + }, + { + "epoch": 0.04770791920209026, + "grad_norm": 0.44140625, + "learning_rate": 0.0019900751348263523, + "loss": 0.1406, + "step": 5496 + }, + { + "epoch": 0.04771659968229443, + "grad_norm": 0.318359375, + "learning_rate": 0.0019900707265160726, + "loss": 0.1152, + "step": 5497 + }, + { + "epoch": 0.04772528016249859, + "grad_norm": 0.083984375, + "learning_rate": 0.0019900663172324248, + "loss": 0.1357, + "step": 5498 + }, + { + "epoch": 0.04773396064270275, + "grad_norm": 0.28515625, + "learning_rate": 0.0019900619069754136, + "loss": 0.1748, + "step": 5499 + }, + { + "epoch": 0.047742641122906916, + "grad_norm": 0.11865234375, + "learning_rate": 0.0019900574957450443, + "loss": 0.1226, + "step": 5500 + }, + { + "epoch": 0.04775132160311108, + "grad_norm": 0.3359375, + "learning_rate": 0.0019900530835413217, + "loss": 0.1416, + "step": 5501 + }, + { + "epoch": 0.047760002083315246, + "grad_norm": 0.451171875, + "learning_rate": 0.00199004867036425, + "loss": 0.1309, + "step": 5502 + }, + { + "epoch": 0.04776868256351941, + "grad_norm": 0.15625, + "learning_rate": 0.001990044256213835, + "loss": 0.1367, + "step": 5503 + }, + { + "epoch": 0.04777736304372358, + "grad_norm": 0.60546875, + "learning_rate": 0.0019900398410900807, + "loss": 0.166, + "step": 5504 + }, + { + "epoch": 0.04778604352392774, + "grad_norm": 0.34375, + "learning_rate": 0.001990035424992992, + "loss": 0.1641, + "step": 5505 + }, + { + "epoch": 0.04779472400413191, + "grad_norm": 0.08935546875, + "learning_rate": 0.0019900310079225742, + "loss": 0.1797, + "step": 5506 + }, + { + "epoch": 0.04780340448433607, + "grad_norm": 0.36328125, + "learning_rate": 0.0019900265898788318, + "loss": 0.127, + "step": 5507 + }, + { + "epoch": 0.04781208496454024, + "grad_norm": 0.1865234375, + "learning_rate": 0.0019900221708617698, + "loss": 0.1206, + "step": 5508 + }, + { + "epoch": 0.0478207654447444, + "grad_norm": 0.328125, + "learning_rate": 0.0019900177508713926, + "loss": 0.1504, + "step": 5509 + }, + { + "epoch": 0.04782944592494857, + "grad_norm": 0.7734375, + "learning_rate": 0.001990013329907706, + "loss": 0.1973, + "step": 5510 + }, + { + "epoch": 0.04783812640515273, + "grad_norm": 0.56640625, + "learning_rate": 0.0019900089079707135, + "loss": 0.1465, + "step": 5511 + }, + { + 
"epoch": 0.0478468068853569, + "grad_norm": 0.275390625, + "learning_rate": 0.0019900044850604207, + "loss": 0.1582, + "step": 5512 + }, + { + "epoch": 0.04785548736556106, + "grad_norm": 0.609375, + "learning_rate": 0.0019900000611768327, + "loss": 0.1367, + "step": 5513 + }, + { + "epoch": 0.04786416784576523, + "grad_norm": 0.21875, + "learning_rate": 0.001989995636319954, + "loss": 0.1099, + "step": 5514 + }, + { + "epoch": 0.04787284832596939, + "grad_norm": 0.31640625, + "learning_rate": 0.001989991210489789, + "loss": 0.1426, + "step": 5515 + }, + { + "epoch": 0.04788152880617356, + "grad_norm": 0.26171875, + "learning_rate": 0.0019899867836863433, + "loss": 0.1475, + "step": 5516 + }, + { + "epoch": 0.04789020928637772, + "grad_norm": 0.138671875, + "learning_rate": 0.001989982355909621, + "loss": 0.1543, + "step": 5517 + }, + { + "epoch": 0.04789888976658189, + "grad_norm": 0.6484375, + "learning_rate": 0.001989977927159628, + "loss": 0.1572, + "step": 5518 + }, + { + "epoch": 0.04790757024678605, + "grad_norm": 0.1591796875, + "learning_rate": 0.0019899734974363685, + "loss": 0.125, + "step": 5519 + }, + { + "epoch": 0.04791625072699022, + "grad_norm": 0.51953125, + "learning_rate": 0.001989969066739847, + "loss": 0.1367, + "step": 5520 + }, + { + "epoch": 0.047924931207194384, + "grad_norm": 0.130859375, + "learning_rate": 0.001989964635070069, + "loss": 0.1582, + "step": 5521 + }, + { + "epoch": 0.04793361168739855, + "grad_norm": 0.5859375, + "learning_rate": 0.001989960202427039, + "loss": 0.1533, + "step": 5522 + }, + { + "epoch": 0.047942292167602714, + "grad_norm": 0.1748046875, + "learning_rate": 0.001989955768810762, + "loss": 0.106, + "step": 5523 + }, + { + "epoch": 0.04795097264780688, + "grad_norm": 0.7265625, + "learning_rate": 0.001989951334221242, + "loss": 0.1211, + "step": 5524 + }, + { + "epoch": 0.047959653128011044, + "grad_norm": 0.4453125, + "learning_rate": 0.0019899468986584855, + "loss": 0.1582, + "step": 5525 + }, + { + "epoch": 0.04796833360821521, + "grad_norm": 0.0751953125, + "learning_rate": 0.001989942462122496, + "loss": 0.1387, + "step": 5526 + }, + { + "epoch": 0.047977014088419374, + "grad_norm": 0.126953125, + "learning_rate": 0.001989938024613279, + "loss": 0.1465, + "step": 5527 + }, + { + "epoch": 0.04798569456862354, + "grad_norm": 0.126953125, + "learning_rate": 0.001989933586130839, + "loss": 0.1748, + "step": 5528 + }, + { + "epoch": 0.047994375048827705, + "grad_norm": 0.212890625, + "learning_rate": 0.0019899291466751803, + "loss": 0.1318, + "step": 5529 + }, + { + "epoch": 0.04800305552903186, + "grad_norm": 0.380859375, + "learning_rate": 0.0019899247062463094, + "loss": 0.1455, + "step": 5530 + }, + { + "epoch": 0.04801173600923603, + "grad_norm": 0.416015625, + "learning_rate": 0.00198992026484423, + "loss": 0.1328, + "step": 5531 + }, + { + "epoch": 0.04802041648944019, + "grad_norm": 0.294921875, + "learning_rate": 0.001989915822468947, + "loss": 0.1514, + "step": 5532 + }, + { + "epoch": 0.04802909696964436, + "grad_norm": 0.19921875, + "learning_rate": 0.0019899113791204655, + "loss": 0.1348, + "step": 5533 + }, + { + "epoch": 0.04803777744984852, + "grad_norm": 0.376953125, + "learning_rate": 0.0019899069347987905, + "loss": 0.1533, + "step": 5534 + }, + { + "epoch": 0.04804645793005269, + "grad_norm": 0.09326171875, + "learning_rate": 0.0019899024895039265, + "loss": 0.1279, + "step": 5535 + }, + { + "epoch": 0.048055138410256854, + "grad_norm": 0.1767578125, + "learning_rate": 0.0019898980432358788, + "loss": 0.1602, + 
"step": 5536 + }, + { + "epoch": 0.04806381889046102, + "grad_norm": 0.2060546875, + "learning_rate": 0.001989893595994651, + "loss": 0.165, + "step": 5537 + }, + { + "epoch": 0.048072499370665184, + "grad_norm": 0.69921875, + "learning_rate": 0.00198988914778025, + "loss": 0.1943, + "step": 5538 + }, + { + "epoch": 0.04808117985086935, + "grad_norm": 0.1572265625, + "learning_rate": 0.0019898846985926797, + "loss": 0.127, + "step": 5539 + }, + { + "epoch": 0.048089860331073514, + "grad_norm": 0.166015625, + "learning_rate": 0.0019898802484319437, + "loss": 0.1426, + "step": 5540 + }, + { + "epoch": 0.04809854081127768, + "grad_norm": 0.1650390625, + "learning_rate": 0.001989875797298049, + "loss": 0.1211, + "step": 5541 + }, + { + "epoch": 0.048107221291481844, + "grad_norm": 0.09619140625, + "learning_rate": 0.0019898713451909993, + "loss": 0.1562, + "step": 5542 + }, + { + "epoch": 0.04811590177168601, + "grad_norm": 0.6484375, + "learning_rate": 0.0019898668921107996, + "loss": 0.1719, + "step": 5543 + }, + { + "epoch": 0.048124582251890174, + "grad_norm": 0.33984375, + "learning_rate": 0.0019898624380574547, + "loss": 0.1396, + "step": 5544 + }, + { + "epoch": 0.04813326273209434, + "grad_norm": 0.11181640625, + "learning_rate": 0.00198985798303097, + "loss": 0.1904, + "step": 5545 + }, + { + "epoch": 0.048141943212298505, + "grad_norm": 0.6484375, + "learning_rate": 0.0019898535270313493, + "loss": 0.1484, + "step": 5546 + }, + { + "epoch": 0.04815062369250267, + "grad_norm": 0.1748046875, + "learning_rate": 0.001989849070058599, + "loss": 0.1787, + "step": 5547 + }, + { + "epoch": 0.048159304172706835, + "grad_norm": 0.349609375, + "learning_rate": 0.001989844612112722, + "loss": 0.1709, + "step": 5548 + }, + { + "epoch": 0.048167984652911, + "grad_norm": 0.08251953125, + "learning_rate": 0.001989840153193725, + "loss": 0.1602, + "step": 5549 + }, + { + "epoch": 0.048176665133115165, + "grad_norm": 0.455078125, + "learning_rate": 0.0019898356933016124, + "loss": 0.1533, + "step": 5550 + }, + { + "epoch": 0.04818534561331933, + "grad_norm": 0.5859375, + "learning_rate": 0.0019898312324363888, + "loss": 0.1582, + "step": 5551 + }, + { + "epoch": 0.048194026093523495, + "grad_norm": 0.1572265625, + "learning_rate": 0.001989826770598059, + "loss": 0.1631, + "step": 5552 + }, + { + "epoch": 0.04820270657372766, + "grad_norm": 0.197265625, + "learning_rate": 0.001989822307786628, + "loss": 0.127, + "step": 5553 + }, + { + "epoch": 0.048211387053931826, + "grad_norm": 0.365234375, + "learning_rate": 0.0019898178440021005, + "loss": 0.1514, + "step": 5554 + }, + { + "epoch": 0.04822006753413599, + "grad_norm": 0.45703125, + "learning_rate": 0.0019898133792444817, + "loss": 0.1582, + "step": 5555 + }, + { + "epoch": 0.048228748014340156, + "grad_norm": 0.328125, + "learning_rate": 0.0019898089135137764, + "loss": 0.1426, + "step": 5556 + }, + { + "epoch": 0.04823742849454432, + "grad_norm": 0.169921875, + "learning_rate": 0.0019898044468099896, + "loss": 0.1465, + "step": 5557 + }, + { + "epoch": 0.048246108974748486, + "grad_norm": 0.125, + "learning_rate": 0.0019897999791331257, + "loss": 0.2275, + "step": 5558 + }, + { + "epoch": 0.04825478945495265, + "grad_norm": 0.130859375, + "learning_rate": 0.00198979551048319, + "loss": 0.1187, + "step": 5559 + }, + { + "epoch": 0.04826346993515681, + "grad_norm": 0.15234375, + "learning_rate": 0.0019897910408601875, + "loss": 0.1602, + "step": 5560 + }, + { + "epoch": 0.048272150415360975, + "grad_norm": 0.396484375, + "learning_rate": 
0.0019897865702641227, + "loss": 0.1108, + "step": 5561 + }, + { + "epoch": 0.04828083089556514, + "grad_norm": 0.6953125, + "learning_rate": 0.001989782098695001, + "loss": 0.1201, + "step": 5562 + }, + { + "epoch": 0.048289511375769305, + "grad_norm": 0.455078125, + "learning_rate": 0.0019897776261528265, + "loss": 0.1143, + "step": 5563 + }, + { + "epoch": 0.04829819185597347, + "grad_norm": 1.03125, + "learning_rate": 0.001989773152637605, + "loss": 0.1982, + "step": 5564 + }, + { + "epoch": 0.048306872336177635, + "grad_norm": 0.2431640625, + "learning_rate": 0.0019897686781493403, + "loss": 0.1895, + "step": 5565 + }, + { + "epoch": 0.0483155528163818, + "grad_norm": 0.14453125, + "learning_rate": 0.0019897642026880384, + "loss": 0.1904, + "step": 5566 + }, + { + "epoch": 0.048324233296585965, + "grad_norm": 0.287109375, + "learning_rate": 0.0019897597262537037, + "loss": 0.1309, + "step": 5567 + }, + { + "epoch": 0.04833291377679013, + "grad_norm": 0.33984375, + "learning_rate": 0.0019897552488463413, + "loss": 0.1543, + "step": 5568 + }, + { + "epoch": 0.048341594256994296, + "grad_norm": 0.25390625, + "learning_rate": 0.0019897507704659556, + "loss": 0.1494, + "step": 5569 + }, + { + "epoch": 0.04835027473719846, + "grad_norm": 0.310546875, + "learning_rate": 0.0019897462911125517, + "loss": 0.1914, + "step": 5570 + }, + { + "epoch": 0.048358955217402626, + "grad_norm": 0.41796875, + "learning_rate": 0.001989741810786135, + "loss": 0.1328, + "step": 5571 + }, + { + "epoch": 0.04836763569760679, + "grad_norm": 0.259765625, + "learning_rate": 0.0019897373294867097, + "loss": 0.123, + "step": 5572 + }, + { + "epoch": 0.048376316177810956, + "grad_norm": 0.259765625, + "learning_rate": 0.0019897328472142815, + "loss": 0.1621, + "step": 5573 + }, + { + "epoch": 0.04838499665801512, + "grad_norm": 0.380859375, + "learning_rate": 0.0019897283639688547, + "loss": 0.1777, + "step": 5574 + }, + { + "epoch": 0.048393677138219286, + "grad_norm": 0.1328125, + "learning_rate": 0.001989723879750434, + "loss": 0.1426, + "step": 5575 + }, + { + "epoch": 0.04840235761842345, + "grad_norm": 0.56640625, + "learning_rate": 0.001989719394559025, + "loss": 0.2031, + "step": 5576 + }, + { + "epoch": 0.048411038098627616, + "grad_norm": 0.1435546875, + "learning_rate": 0.001989714908394632, + "loss": 0.1143, + "step": 5577 + }, + { + "epoch": 0.04841971857883178, + "grad_norm": 0.98828125, + "learning_rate": 0.00198971042125726, + "loss": 0.1328, + "step": 5578 + }, + { + "epoch": 0.04842839905903595, + "grad_norm": 0.62109375, + "learning_rate": 0.0019897059331469144, + "loss": 0.1396, + "step": 5579 + }, + { + "epoch": 0.04843707953924011, + "grad_norm": 0.515625, + "learning_rate": 0.0019897014440635997, + "loss": 0.1426, + "step": 5580 + }, + { + "epoch": 0.04844576001944428, + "grad_norm": 0.08349609375, + "learning_rate": 0.001989696954007321, + "loss": 0.126, + "step": 5581 + }, + { + "epoch": 0.04845444049964844, + "grad_norm": 0.0908203125, + "learning_rate": 0.0019896924629780825, + "loss": 0.1211, + "step": 5582 + }, + { + "epoch": 0.04846312097985261, + "grad_norm": 0.2001953125, + "learning_rate": 0.0019896879709758904, + "loss": 0.1602, + "step": 5583 + }, + { + "epoch": 0.04847180146005677, + "grad_norm": 0.46875, + "learning_rate": 0.0019896834780007483, + "loss": 0.1621, + "step": 5584 + }, + { + "epoch": 0.04848048194026094, + "grad_norm": 0.5390625, + "learning_rate": 0.0019896789840526618, + "loss": 0.1426, + "step": 5585 + }, + { + "epoch": 0.0484891624204651, + "grad_norm": 
0.1708984375, + "learning_rate": 0.0019896744891316357, + "loss": 0.1445, + "step": 5586 + }, + { + "epoch": 0.04849784290066927, + "grad_norm": 0.37890625, + "learning_rate": 0.0019896699932376756, + "loss": 0.1641, + "step": 5587 + }, + { + "epoch": 0.04850652338087343, + "grad_norm": 0.263671875, + "learning_rate": 0.001989665496370785, + "loss": 0.1602, + "step": 5588 + }, + { + "epoch": 0.0485152038610776, + "grad_norm": 0.244140625, + "learning_rate": 0.00198966099853097, + "loss": 0.1152, + "step": 5589 + }, + { + "epoch": 0.04852388434128176, + "grad_norm": 0.494140625, + "learning_rate": 0.001989656499718235, + "loss": 0.167, + "step": 5590 + }, + { + "epoch": 0.04853256482148592, + "grad_norm": 0.310546875, + "learning_rate": 0.001989651999932585, + "loss": 0.1611, + "step": 5591 + }, + { + "epoch": 0.048541245301690086, + "grad_norm": 0.4140625, + "learning_rate": 0.001989647499174025, + "loss": 0.1543, + "step": 5592 + }, + { + "epoch": 0.04854992578189425, + "grad_norm": 0.15234375, + "learning_rate": 0.0019896429974425598, + "loss": 0.1318, + "step": 5593 + }, + { + "epoch": 0.04855860626209842, + "grad_norm": 0.1953125, + "learning_rate": 0.0019896384947381946, + "loss": 0.1289, + "step": 5594 + }, + { + "epoch": 0.04856728674230258, + "grad_norm": 0.1455078125, + "learning_rate": 0.0019896339910609336, + "loss": 0.1426, + "step": 5595 + }, + { + "epoch": 0.04857596722250675, + "grad_norm": 0.1591796875, + "learning_rate": 0.001989629486410783, + "loss": 0.1621, + "step": 5596 + }, + { + "epoch": 0.04858464770271091, + "grad_norm": 0.5390625, + "learning_rate": 0.0019896249807877463, + "loss": 0.1396, + "step": 5597 + }, + { + "epoch": 0.04859332818291508, + "grad_norm": 0.359375, + "learning_rate": 0.001989620474191829, + "loss": 0.1436, + "step": 5598 + }, + { + "epoch": 0.04860200866311924, + "grad_norm": 0.15234375, + "learning_rate": 0.0019896159666230365, + "loss": 0.1631, + "step": 5599 + }, + { + "epoch": 0.04861068914332341, + "grad_norm": 0.07568359375, + "learning_rate": 0.0019896114580813733, + "loss": 0.1436, + "step": 5600 + }, + { + "epoch": 0.04861936962352757, + "grad_norm": 0.08984375, + "learning_rate": 0.0019896069485668445, + "loss": 0.1602, + "step": 5601 + }, + { + "epoch": 0.04862805010373174, + "grad_norm": 0.10009765625, + "learning_rate": 0.0019896024380794547, + "loss": 0.1426, + "step": 5602 + }, + { + "epoch": 0.0486367305839359, + "grad_norm": 0.34765625, + "learning_rate": 0.001989597926619209, + "loss": 0.1289, + "step": 5603 + }, + { + "epoch": 0.04864541106414007, + "grad_norm": 0.271484375, + "learning_rate": 0.0019895934141861125, + "loss": 0.1152, + "step": 5604 + }, + { + "epoch": 0.04865409154434423, + "grad_norm": 0.1328125, + "learning_rate": 0.0019895889007801697, + "loss": 0.1143, + "step": 5605 + }, + { + "epoch": 0.0486627720245484, + "grad_norm": 0.2421875, + "learning_rate": 0.001989584386401386, + "loss": 0.1484, + "step": 5606 + }, + { + "epoch": 0.04867145250475256, + "grad_norm": 0.302734375, + "learning_rate": 0.0019895798710497666, + "loss": 0.1885, + "step": 5607 + }, + { + "epoch": 0.04868013298495673, + "grad_norm": 0.07421875, + "learning_rate": 0.001989575354725316, + "loss": 0.1299, + "step": 5608 + }, + { + "epoch": 0.04868881346516089, + "grad_norm": 0.2119140625, + "learning_rate": 0.0019895708374280388, + "loss": 0.1543, + "step": 5609 + }, + { + "epoch": 0.04869749394536506, + "grad_norm": 0.373046875, + "learning_rate": 0.00198956631915794, + "loss": 0.2031, + "step": 5610 + }, + { + "epoch": 
0.048706174425569224, + "grad_norm": 0.150390625, + "learning_rate": 0.0019895617999150256, + "loss": 0.1128, + "step": 5611 + }, + { + "epoch": 0.04871485490577339, + "grad_norm": 0.201171875, + "learning_rate": 0.001989557279699299, + "loss": 0.1328, + "step": 5612 + }, + { + "epoch": 0.048723535385977554, + "grad_norm": 0.80078125, + "learning_rate": 0.0019895527585107662, + "loss": 0.1406, + "step": 5613 + }, + { + "epoch": 0.04873221586618172, + "grad_norm": 0.283203125, + "learning_rate": 0.001989548236349432, + "loss": 0.1592, + "step": 5614 + }, + { + "epoch": 0.048740896346385884, + "grad_norm": 0.1728515625, + "learning_rate": 0.0019895437132153014, + "loss": 0.1387, + "step": 5615 + }, + { + "epoch": 0.04874957682659005, + "grad_norm": 0.49609375, + "learning_rate": 0.001989539189108379, + "loss": 0.1455, + "step": 5616 + }, + { + "epoch": 0.048758257306794214, + "grad_norm": 0.67578125, + "learning_rate": 0.0019895346640286696, + "loss": 0.1641, + "step": 5617 + }, + { + "epoch": 0.04876693778699838, + "grad_norm": 0.3046875, + "learning_rate": 0.001989530137976179, + "loss": 0.1328, + "step": 5618 + }, + { + "epoch": 0.048775618267202545, + "grad_norm": 0.482421875, + "learning_rate": 0.001989525610950911, + "loss": 0.1318, + "step": 5619 + }, + { + "epoch": 0.04878429874740671, + "grad_norm": 0.193359375, + "learning_rate": 0.0019895210829528717, + "loss": 0.1689, + "step": 5620 + }, + { + "epoch": 0.048792979227610875, + "grad_norm": 0.1650390625, + "learning_rate": 0.0019895165539820653, + "loss": 0.1455, + "step": 5621 + }, + { + "epoch": 0.04880165970781503, + "grad_norm": 0.0810546875, + "learning_rate": 0.001989512024038497, + "loss": 0.1445, + "step": 5622 + }, + { + "epoch": 0.0488103401880192, + "grad_norm": 0.146484375, + "learning_rate": 0.0019895074931221713, + "loss": 0.1484, + "step": 5623 + }, + { + "epoch": 0.04881902066822336, + "grad_norm": 0.287109375, + "learning_rate": 0.001989502961233094, + "loss": 0.1533, + "step": 5624 + }, + { + "epoch": 0.04882770114842753, + "grad_norm": 0.1435546875, + "learning_rate": 0.0019894984283712696, + "loss": 0.1777, + "step": 5625 + }, + { + "epoch": 0.04883638162863169, + "grad_norm": 0.322265625, + "learning_rate": 0.001989493894536703, + "loss": 0.1445, + "step": 5626 + }, + { + "epoch": 0.04884506210883586, + "grad_norm": 0.2431640625, + "learning_rate": 0.0019894893597293996, + "loss": 0.1318, + "step": 5627 + }, + { + "epoch": 0.048853742589040024, + "grad_norm": 0.294921875, + "learning_rate": 0.0019894848239493633, + "loss": 0.1855, + "step": 5628 + }, + { + "epoch": 0.04886242306924419, + "grad_norm": 0.33984375, + "learning_rate": 0.0019894802871966003, + "loss": 0.1611, + "step": 5629 + }, + { + "epoch": 0.048871103549448354, + "grad_norm": 0.90234375, + "learning_rate": 0.001989475749471115, + "loss": 0.1602, + "step": 5630 + }, + { + "epoch": 0.04887978402965252, + "grad_norm": 0.52734375, + "learning_rate": 0.001989471210772912, + "loss": 0.1777, + "step": 5631 + }, + { + "epoch": 0.048888464509856684, + "grad_norm": 0.30078125, + "learning_rate": 0.001989466671101997, + "loss": 0.166, + "step": 5632 + }, + { + "epoch": 0.04889714499006085, + "grad_norm": 0.66015625, + "learning_rate": 0.0019894621304583746, + "loss": 0.1738, + "step": 5633 + }, + { + "epoch": 0.048905825470265014, + "grad_norm": 0.42578125, + "learning_rate": 0.00198945758884205, + "loss": 0.1318, + "step": 5634 + }, + { + "epoch": 0.04891450595046918, + "grad_norm": 0.1435546875, + "learning_rate": 0.001989453046253027, + "loss": 0.1113, 
+ "step": 5635 + }, + { + "epoch": 0.048923186430673345, + "grad_norm": 0.91796875, + "learning_rate": 0.001989448502691312, + "loss": 0.3262, + "step": 5636 + }, + { + "epoch": 0.04893186691087751, + "grad_norm": 0.201171875, + "learning_rate": 0.0019894439581569097, + "loss": 0.1855, + "step": 5637 + }, + { + "epoch": 0.048940547391081675, + "grad_norm": 0.1650390625, + "learning_rate": 0.0019894394126498253, + "loss": 0.1011, + "step": 5638 + }, + { + "epoch": 0.04894922787128584, + "grad_norm": 0.1298828125, + "learning_rate": 0.0019894348661700623, + "loss": 0.125, + "step": 5639 + }, + { + "epoch": 0.048957908351490005, + "grad_norm": 0.67578125, + "learning_rate": 0.0019894303187176273, + "loss": 0.2129, + "step": 5640 + }, + { + "epoch": 0.04896658883169417, + "grad_norm": 0.134765625, + "learning_rate": 0.0019894257702925246, + "loss": 0.1338, + "step": 5641 + }, + { + "epoch": 0.048975269311898335, + "grad_norm": 0.45703125, + "learning_rate": 0.001989421220894759, + "loss": 0.1426, + "step": 5642 + }, + { + "epoch": 0.0489839497921025, + "grad_norm": 0.412109375, + "learning_rate": 0.0019894166705243365, + "loss": 0.1621, + "step": 5643 + }, + { + "epoch": 0.048992630272306666, + "grad_norm": 0.1953125, + "learning_rate": 0.00198941211918126, + "loss": 0.1816, + "step": 5644 + }, + { + "epoch": 0.04900131075251083, + "grad_norm": 0.1337890625, + "learning_rate": 0.001989407566865537, + "loss": 0.1621, + "step": 5645 + }, + { + "epoch": 0.049009991232714996, + "grad_norm": 0.478515625, + "learning_rate": 0.0019894030135771705, + "loss": 0.2031, + "step": 5646 + }, + { + "epoch": 0.04901867171291916, + "grad_norm": 0.107421875, + "learning_rate": 0.0019893984593161666, + "loss": 0.1201, + "step": 5647 + }, + { + "epoch": 0.049027352193123326, + "grad_norm": 0.091796875, + "learning_rate": 0.0019893939040825297, + "loss": 0.1582, + "step": 5648 + }, + { + "epoch": 0.04903603267332749, + "grad_norm": 0.337890625, + "learning_rate": 0.001989389347876265, + "loss": 0.2324, + "step": 5649 + }, + { + "epoch": 0.049044713153531656, + "grad_norm": 0.298828125, + "learning_rate": 0.0019893847906973777, + "loss": 0.1504, + "step": 5650 + }, + { + "epoch": 0.04905339363373582, + "grad_norm": 0.193359375, + "learning_rate": 0.001989380232545872, + "loss": 0.1406, + "step": 5651 + }, + { + "epoch": 0.04906207411393999, + "grad_norm": 0.287109375, + "learning_rate": 0.001989375673421754, + "loss": 0.1475, + "step": 5652 + }, + { + "epoch": 0.049070754594144145, + "grad_norm": 0.083984375, + "learning_rate": 0.001989371113325028, + "loss": 0.1582, + "step": 5653 + }, + { + "epoch": 0.04907943507434831, + "grad_norm": 0.08203125, + "learning_rate": 0.0019893665522556993, + "loss": 0.1123, + "step": 5654 + }, + { + "epoch": 0.049088115554552475, + "grad_norm": 0.203125, + "learning_rate": 0.0019893619902137723, + "loss": 0.124, + "step": 5655 + }, + { + "epoch": 0.04909679603475664, + "grad_norm": 0.08203125, + "learning_rate": 0.0019893574271992527, + "loss": 0.1377, + "step": 5656 + }, + { + "epoch": 0.049105476514960805, + "grad_norm": 0.09228515625, + "learning_rate": 0.0019893528632121448, + "loss": 0.1143, + "step": 5657 + }, + { + "epoch": 0.04911415699516497, + "grad_norm": 0.1513671875, + "learning_rate": 0.001989348298252454, + "loss": 0.1416, + "step": 5658 + }, + { + "epoch": 0.049122837475369135, + "grad_norm": 0.197265625, + "learning_rate": 0.0019893437323201856, + "loss": 0.1436, + "step": 5659 + }, + { + "epoch": 0.0491315179555733, + "grad_norm": 0.07958984375, + "learning_rate": 
0.0019893391654153443, + "loss": 0.1533, + "step": 5660 + }, + { + "epoch": 0.049140198435777466, + "grad_norm": 0.1015625, + "learning_rate": 0.001989334597537935, + "loss": 0.1465, + "step": 5661 + }, + { + "epoch": 0.04914887891598163, + "grad_norm": 0.1796875, + "learning_rate": 0.0019893300286879627, + "loss": 0.1973, + "step": 5662 + }, + { + "epoch": 0.049157559396185796, + "grad_norm": 0.65234375, + "learning_rate": 0.0019893254588654323, + "loss": 0.1758, + "step": 5663 + }, + { + "epoch": 0.04916623987638996, + "grad_norm": 0.39453125, + "learning_rate": 0.0019893208880703487, + "loss": 0.1973, + "step": 5664 + }, + { + "epoch": 0.049174920356594126, + "grad_norm": 0.0986328125, + "learning_rate": 0.0019893163163027176, + "loss": 0.1553, + "step": 5665 + }, + { + "epoch": 0.04918360083679829, + "grad_norm": 0.2490234375, + "learning_rate": 0.0019893117435625433, + "loss": 0.1396, + "step": 5666 + }, + { + "epoch": 0.049192281317002456, + "grad_norm": 0.2431640625, + "learning_rate": 0.001989307169849831, + "loss": 0.1494, + "step": 5667 + }, + { + "epoch": 0.04920096179720662, + "grad_norm": 0.68359375, + "learning_rate": 0.001989302595164586, + "loss": 0.21, + "step": 5668 + }, + { + "epoch": 0.04920964227741079, + "grad_norm": 0.130859375, + "learning_rate": 0.001989298019506813, + "loss": 0.1709, + "step": 5669 + }, + { + "epoch": 0.04921832275761495, + "grad_norm": 0.12255859375, + "learning_rate": 0.0019892934428765168, + "loss": 0.1348, + "step": 5670 + }, + { + "epoch": 0.04922700323781912, + "grad_norm": 0.0849609375, + "learning_rate": 0.001989288865273703, + "loss": 0.1279, + "step": 5671 + }, + { + "epoch": 0.04923568371802328, + "grad_norm": 0.2431640625, + "learning_rate": 0.001989284286698376, + "loss": 0.1533, + "step": 5672 + }, + { + "epoch": 0.04924436419822745, + "grad_norm": 0.7265625, + "learning_rate": 0.001989279707150541, + "loss": 0.1475, + "step": 5673 + }, + { + "epoch": 0.04925304467843161, + "grad_norm": 0.1875, + "learning_rate": 0.0019892751266302033, + "loss": 0.1475, + "step": 5674 + }, + { + "epoch": 0.04926172515863578, + "grad_norm": 0.4921875, + "learning_rate": 0.0019892705451373676, + "loss": 0.1709, + "step": 5675 + }, + { + "epoch": 0.04927040563883994, + "grad_norm": 0.146484375, + "learning_rate": 0.001989265962672039, + "loss": 0.1338, + "step": 5676 + }, + { + "epoch": 0.04927908611904411, + "grad_norm": 0.171875, + "learning_rate": 0.0019892613792342226, + "loss": 0.1895, + "step": 5677 + }, + { + "epoch": 0.04928776659924827, + "grad_norm": 0.53125, + "learning_rate": 0.0019892567948239233, + "loss": 0.1455, + "step": 5678 + }, + { + "epoch": 0.04929644707945244, + "grad_norm": 0.1083984375, + "learning_rate": 0.001989252209441146, + "loss": 0.1758, + "step": 5679 + }, + { + "epoch": 0.0493051275596566, + "grad_norm": 0.08984375, + "learning_rate": 0.001989247623085896, + "loss": 0.1914, + "step": 5680 + }, + { + "epoch": 0.04931380803986077, + "grad_norm": 0.52734375, + "learning_rate": 0.0019892430357581776, + "loss": 0.2012, + "step": 5681 + }, + { + "epoch": 0.04932248852006493, + "grad_norm": 0.255859375, + "learning_rate": 0.001989238447457997, + "loss": 0.1953, + "step": 5682 + }, + { + "epoch": 0.0493311690002691, + "grad_norm": 0.1669921875, + "learning_rate": 0.0019892338581853583, + "loss": 0.208, + "step": 5683 + }, + { + "epoch": 0.049339849480473257, + "grad_norm": 0.11474609375, + "learning_rate": 0.0019892292679402667, + "loss": 0.2305, + "step": 5684 + }, + { + "epoch": 0.04934852996067742, + "grad_norm": 0.265625, + 
"learning_rate": 0.0019892246767227277, + "loss": 0.1699, + "step": 5685 + }, + { + "epoch": 0.04935721044088159, + "grad_norm": 0.09814453125, + "learning_rate": 0.0019892200845327457, + "loss": 0.166, + "step": 5686 + }, + { + "epoch": 0.04936589092108575, + "grad_norm": 0.19921875, + "learning_rate": 0.0019892154913703263, + "loss": 0.1172, + "step": 5687 + }, + { + "epoch": 0.04937457140128992, + "grad_norm": 0.234375, + "learning_rate": 0.001989210897235474, + "loss": 0.1416, + "step": 5688 + }, + { + "epoch": 0.04938325188149408, + "grad_norm": 0.1328125, + "learning_rate": 0.0019892063021281934, + "loss": 0.1406, + "step": 5689 + }, + { + "epoch": 0.04939193236169825, + "grad_norm": 0.0986328125, + "learning_rate": 0.001989201706048491, + "loss": 0.1367, + "step": 5690 + }, + { + "epoch": 0.04940061284190241, + "grad_norm": 0.205078125, + "learning_rate": 0.001989197108996371, + "loss": 0.1475, + "step": 5691 + }, + { + "epoch": 0.04940929332210658, + "grad_norm": 0.142578125, + "learning_rate": 0.001989192510971838, + "loss": 0.1445, + "step": 5692 + }, + { + "epoch": 0.04941797380231074, + "grad_norm": 0.26171875, + "learning_rate": 0.0019891879119748974, + "loss": 0.123, + "step": 5693 + }, + { + "epoch": 0.04942665428251491, + "grad_norm": 0.10791015625, + "learning_rate": 0.0019891833120055545, + "loss": 0.1406, + "step": 5694 + }, + { + "epoch": 0.04943533476271907, + "grad_norm": 0.3359375, + "learning_rate": 0.001989178711063814, + "loss": 0.1611, + "step": 5695 + }, + { + "epoch": 0.04944401524292324, + "grad_norm": 0.083984375, + "learning_rate": 0.0019891741091496813, + "loss": 0.1611, + "step": 5696 + }, + { + "epoch": 0.0494526957231274, + "grad_norm": 0.10546875, + "learning_rate": 0.0019891695062631606, + "loss": 0.127, + "step": 5697 + }, + { + "epoch": 0.04946137620333157, + "grad_norm": 0.09033203125, + "learning_rate": 0.001989164902404258, + "loss": 0.1875, + "step": 5698 + }, + { + "epoch": 0.04947005668353573, + "grad_norm": 0.291015625, + "learning_rate": 0.0019891602975729778, + "loss": 0.1406, + "step": 5699 + }, + { + "epoch": 0.0494787371637399, + "grad_norm": 0.234375, + "learning_rate": 0.0019891556917693252, + "loss": 0.1562, + "step": 5700 + }, + { + "epoch": 0.049487417643944064, + "grad_norm": 0.326171875, + "learning_rate": 0.001989151084993305, + "loss": 0.1758, + "step": 5701 + }, + { + "epoch": 0.04949609812414823, + "grad_norm": 0.11474609375, + "learning_rate": 0.001989146477244923, + "loss": 0.1123, + "step": 5702 + }, + { + "epoch": 0.049504778604352394, + "grad_norm": 0.16796875, + "learning_rate": 0.001989141868524184, + "loss": 0.1523, + "step": 5703 + }, + { + "epoch": 0.04951345908455656, + "grad_norm": 0.1357421875, + "learning_rate": 0.001989137258831092, + "loss": 0.1748, + "step": 5704 + }, + { + "epoch": 0.049522139564760724, + "grad_norm": 0.4765625, + "learning_rate": 0.0019891326481656533, + "loss": 0.1816, + "step": 5705 + }, + { + "epoch": 0.04953082004496489, + "grad_norm": 0.482421875, + "learning_rate": 0.001989128036527873, + "loss": 0.1504, + "step": 5706 + }, + { + "epoch": 0.049539500525169054, + "grad_norm": 0.212890625, + "learning_rate": 0.001989123423917755, + "loss": 0.1641, + "step": 5707 + }, + { + "epoch": 0.04954818100537322, + "grad_norm": 0.275390625, + "learning_rate": 0.001989118810335305, + "loss": 0.1445, + "step": 5708 + }, + { + "epoch": 0.049556861485577385, + "grad_norm": 0.169921875, + "learning_rate": 0.0019891141957805282, + "loss": 0.1426, + "step": 5709 + }, + { + "epoch": 0.04956554196578155, + 
"grad_norm": 0.1044921875, + "learning_rate": 0.0019891095802534297, + "loss": 0.1289, + "step": 5710 + }, + { + "epoch": 0.049574222445985715, + "grad_norm": 0.09619140625, + "learning_rate": 0.001989104963754014, + "loss": 0.1367, + "step": 5711 + }, + { + "epoch": 0.04958290292618988, + "grad_norm": 0.212890625, + "learning_rate": 0.0019891003462822864, + "loss": 0.1348, + "step": 5712 + }, + { + "epoch": 0.049591583406394045, + "grad_norm": 0.50390625, + "learning_rate": 0.001989095727838252, + "loss": 0.1182, + "step": 5713 + }, + { + "epoch": 0.0496002638865982, + "grad_norm": 0.07421875, + "learning_rate": 0.0019890911084219157, + "loss": 0.1504, + "step": 5714 + }, + { + "epoch": 0.04960894436680237, + "grad_norm": 0.2255859375, + "learning_rate": 0.0019890864880332836, + "loss": 0.1504, + "step": 5715 + }, + { + "epoch": 0.04961762484700653, + "grad_norm": 0.1376953125, + "learning_rate": 0.001989081866672359, + "loss": 0.1191, + "step": 5716 + }, + { + "epoch": 0.0496263053272107, + "grad_norm": 0.23828125, + "learning_rate": 0.001989077244339148, + "loss": 0.1357, + "step": 5717 + }, + { + "epoch": 0.049634985807414864, + "grad_norm": 0.20703125, + "learning_rate": 0.0019890726210336556, + "loss": 0.1523, + "step": 5718 + }, + { + "epoch": 0.04964366628761903, + "grad_norm": 0.24609375, + "learning_rate": 0.0019890679967558868, + "loss": 0.166, + "step": 5719 + }, + { + "epoch": 0.049652346767823194, + "grad_norm": 0.150390625, + "learning_rate": 0.001989063371505847, + "loss": 0.1406, + "step": 5720 + }, + { + "epoch": 0.04966102724802736, + "grad_norm": 0.1328125, + "learning_rate": 0.0019890587452835403, + "loss": 0.1123, + "step": 5721 + }, + { + "epoch": 0.049669707728231524, + "grad_norm": 0.1953125, + "learning_rate": 0.0019890541180889727, + "loss": 0.1445, + "step": 5722 + }, + { + "epoch": 0.04967838820843569, + "grad_norm": 0.76953125, + "learning_rate": 0.001989049489922148, + "loss": 0.1108, + "step": 5723 + }, + { + "epoch": 0.049687068688639854, + "grad_norm": 0.1953125, + "learning_rate": 0.001989044860783073, + "loss": 0.1523, + "step": 5724 + }, + { + "epoch": 0.04969574916884402, + "grad_norm": 0.224609375, + "learning_rate": 0.0019890402306717513, + "loss": 0.1621, + "step": 5725 + }, + { + "epoch": 0.049704429649048185, + "grad_norm": 0.59765625, + "learning_rate": 0.001989035599588189, + "loss": 0.1318, + "step": 5726 + }, + { + "epoch": 0.04971311012925235, + "grad_norm": 0.361328125, + "learning_rate": 0.0019890309675323907, + "loss": 0.1582, + "step": 5727 + }, + { + "epoch": 0.049721790609456515, + "grad_norm": 0.2177734375, + "learning_rate": 0.0019890263345043614, + "loss": 0.1602, + "step": 5728 + }, + { + "epoch": 0.04973047108966068, + "grad_norm": 0.4765625, + "learning_rate": 0.0019890217005041066, + "loss": 0.1816, + "step": 5729 + }, + { + "epoch": 0.049739151569864845, + "grad_norm": 0.1689453125, + "learning_rate": 0.0019890170655316305, + "loss": 0.1475, + "step": 5730 + }, + { + "epoch": 0.04974783205006901, + "grad_norm": 0.12255859375, + "learning_rate": 0.0019890124295869386, + "loss": 0.1465, + "step": 5731 + }, + { + "epoch": 0.049756512530273175, + "grad_norm": 0.0947265625, + "learning_rate": 0.001989007792670037, + "loss": 0.168, + "step": 5732 + }, + { + "epoch": 0.04976519301047734, + "grad_norm": 0.1279296875, + "learning_rate": 0.001989003154780929, + "loss": 0.1494, + "step": 5733 + }, + { + "epoch": 0.049773873490681506, + "grad_norm": 0.1279296875, + "learning_rate": 0.0019889985159196205, + "loss": 0.1445, + "step": 5734 + 
}, + { + "epoch": 0.04978255397088567, + "grad_norm": 0.48828125, + "learning_rate": 0.001988993876086117, + "loss": 0.1709, + "step": 5735 + }, + { + "epoch": 0.049791234451089836, + "grad_norm": 0.2265625, + "learning_rate": 0.001988989235280423, + "loss": 0.1934, + "step": 5736 + }, + { + "epoch": 0.049799914931294, + "grad_norm": 0.53125, + "learning_rate": 0.0019889845935025436, + "loss": 0.1836, + "step": 5737 + }, + { + "epoch": 0.049808595411498166, + "grad_norm": 0.1171875, + "learning_rate": 0.0019889799507524838, + "loss": 0.1494, + "step": 5738 + }, + { + "epoch": 0.04981727589170233, + "grad_norm": 0.384765625, + "learning_rate": 0.0019889753070302493, + "loss": 0.1797, + "step": 5739 + }, + { + "epoch": 0.049825956371906496, + "grad_norm": 0.2021484375, + "learning_rate": 0.0019889706623358447, + "loss": 0.1504, + "step": 5740 + }, + { + "epoch": 0.04983463685211066, + "grad_norm": 0.314453125, + "learning_rate": 0.001988966016669275, + "loss": 0.1299, + "step": 5741 + }, + { + "epoch": 0.049843317332314827, + "grad_norm": 0.341796875, + "learning_rate": 0.0019889613700305457, + "loss": 0.1475, + "step": 5742 + }, + { + "epoch": 0.04985199781251899, + "grad_norm": 0.1611328125, + "learning_rate": 0.001988956722419661, + "loss": 0.1855, + "step": 5743 + }, + { + "epoch": 0.04986067829272316, + "grad_norm": 0.08203125, + "learning_rate": 0.0019889520738366273, + "loss": 0.1182, + "step": 5744 + }, + { + "epoch": 0.049869358772927315, + "grad_norm": 0.375, + "learning_rate": 0.001988947424281448, + "loss": 0.1729, + "step": 5745 + }, + { + "epoch": 0.04987803925313148, + "grad_norm": 0.1005859375, + "learning_rate": 0.0019889427737541303, + "loss": 0.1494, + "step": 5746 + }, + { + "epoch": 0.049886719733335645, + "grad_norm": 0.09228515625, + "learning_rate": 0.0019889381222546773, + "loss": 0.1553, + "step": 5747 + }, + { + "epoch": 0.04989540021353981, + "grad_norm": 0.310546875, + "learning_rate": 0.0019889334697830953, + "loss": 0.1494, + "step": 5748 + }, + { + "epoch": 0.049904080693743975, + "grad_norm": 0.19140625, + "learning_rate": 0.001988928816339389, + "loss": 0.1396, + "step": 5749 + }, + { + "epoch": 0.04991276117394814, + "grad_norm": 0.0830078125, + "learning_rate": 0.0019889241619235635, + "loss": 0.1348, + "step": 5750 + }, + { + "epoch": 0.049921441654152306, + "grad_norm": 0.0849609375, + "learning_rate": 0.0019889195065356242, + "loss": 0.1699, + "step": 5751 + }, + { + "epoch": 0.04993012213435647, + "grad_norm": 0.306640625, + "learning_rate": 0.0019889148501755754, + "loss": 0.1533, + "step": 5752 + }, + { + "epoch": 0.049938802614560636, + "grad_norm": 0.11083984375, + "learning_rate": 0.0019889101928434226, + "loss": 0.1387, + "step": 5753 + }, + { + "epoch": 0.0499474830947648, + "grad_norm": 0.6328125, + "learning_rate": 0.001988905534539171, + "loss": 0.1357, + "step": 5754 + }, + { + "epoch": 0.049956163574968966, + "grad_norm": 0.578125, + "learning_rate": 0.0019889008752628262, + "loss": 0.1484, + "step": 5755 + }, + { + "epoch": 0.04996484405517313, + "grad_norm": 0.1865234375, + "learning_rate": 0.001988896215014392, + "loss": 0.1357, + "step": 5756 + }, + { + "epoch": 0.049973524535377296, + "grad_norm": 0.455078125, + "learning_rate": 0.0019888915537938754, + "loss": 0.1631, + "step": 5757 + }, + { + "epoch": 0.04998220501558146, + "grad_norm": 0.08837890625, + "learning_rate": 0.0019888868916012794, + "loss": 0.1309, + "step": 5758 + }, + { + "epoch": 0.04999088549578563, + "grad_norm": 0.1484375, + "learning_rate": 0.0019888822284366103, + 
"loss": 0.1816, + "step": 5759 + }, + { + "epoch": 0.04999956597598979, + "grad_norm": 0.224609375, + "learning_rate": 0.0019888775642998725, + "loss": 0.1475, + "step": 5760 + }, + { + "epoch": 0.05000824645619396, + "grad_norm": 0.490234375, + "learning_rate": 0.0019888728991910723, + "loss": 0.1387, + "step": 5761 + }, + { + "epoch": 0.05001692693639812, + "grad_norm": 0.8125, + "learning_rate": 0.001988868233110214, + "loss": 0.1494, + "step": 5762 + }, + { + "epoch": 0.05002560741660229, + "grad_norm": 0.486328125, + "learning_rate": 0.001988863566057302, + "loss": 0.1631, + "step": 5763 + }, + { + "epoch": 0.05003428789680645, + "grad_norm": 0.6796875, + "learning_rate": 0.001988858898032343, + "loss": 0.1738, + "step": 5764 + }, + { + "epoch": 0.05004296837701062, + "grad_norm": 0.326171875, + "learning_rate": 0.0019888542290353406, + "loss": 0.1738, + "step": 5765 + }, + { + "epoch": 0.05005164885721478, + "grad_norm": 0.2197265625, + "learning_rate": 0.001988849559066301, + "loss": 0.1143, + "step": 5766 + }, + { + "epoch": 0.05006032933741895, + "grad_norm": 0.52734375, + "learning_rate": 0.0019888448881252288, + "loss": 0.1406, + "step": 5767 + }, + { + "epoch": 0.05006900981762311, + "grad_norm": 0.224609375, + "learning_rate": 0.001988840216212129, + "loss": 0.1582, + "step": 5768 + }, + { + "epoch": 0.05007769029782728, + "grad_norm": 0.283203125, + "learning_rate": 0.001988835543327007, + "loss": 0.1138, + "step": 5769 + }, + { + "epoch": 0.05008637077803144, + "grad_norm": 0.123046875, + "learning_rate": 0.0019888308694698677, + "loss": 0.1553, + "step": 5770 + }, + { + "epoch": 0.05009505125823561, + "grad_norm": 0.73046875, + "learning_rate": 0.0019888261946407167, + "loss": 0.1416, + "step": 5771 + }, + { + "epoch": 0.05010373173843977, + "grad_norm": 0.177734375, + "learning_rate": 0.0019888215188395585, + "loss": 0.1641, + "step": 5772 + }, + { + "epoch": 0.05011241221864394, + "grad_norm": 0.796875, + "learning_rate": 0.0019888168420663984, + "loss": 0.1875, + "step": 5773 + }, + { + "epoch": 0.0501210926988481, + "grad_norm": 0.08447265625, + "learning_rate": 0.0019888121643212415, + "loss": 0.166, + "step": 5774 + }, + { + "epoch": 0.05012977317905227, + "grad_norm": 0.1650390625, + "learning_rate": 0.001988807485604093, + "loss": 0.1875, + "step": 5775 + }, + { + "epoch": 0.05013845365925643, + "grad_norm": 0.37109375, + "learning_rate": 0.001988802805914958, + "loss": 0.1221, + "step": 5776 + }, + { + "epoch": 0.05014713413946059, + "grad_norm": 0.1982421875, + "learning_rate": 0.0019887981252538415, + "loss": 0.1816, + "step": 5777 + }, + { + "epoch": 0.05015581461966476, + "grad_norm": 0.205078125, + "learning_rate": 0.001988793443620749, + "loss": 0.1328, + "step": 5778 + }, + { + "epoch": 0.05016449509986892, + "grad_norm": 0.1171875, + "learning_rate": 0.0019887887610156854, + "loss": 0.1631, + "step": 5779 + }, + { + "epoch": 0.05017317558007309, + "grad_norm": 0.412109375, + "learning_rate": 0.0019887840774386553, + "loss": 0.1367, + "step": 5780 + }, + { + "epoch": 0.05018185606027725, + "grad_norm": 0.3125, + "learning_rate": 0.001988779392889665, + "loss": 0.167, + "step": 5781 + }, + { + "epoch": 0.05019053654048142, + "grad_norm": 0.1298828125, + "learning_rate": 0.0019887747073687183, + "loss": 0.1074, + "step": 5782 + }, + { + "epoch": 0.05019921702068558, + "grad_norm": 0.21875, + "learning_rate": 0.001988770020875821, + "loss": 0.1875, + "step": 5783 + }, + { + "epoch": 0.05020789750088975, + "grad_norm": 0.310546875, + "learning_rate": 
0.001988765333410978, + "loss": 0.1895, + "step": 5784 + }, + { + "epoch": 0.05021657798109391, + "grad_norm": 0.2197265625, + "learning_rate": 0.001988760644974195, + "loss": 0.1641, + "step": 5785 + }, + { + "epoch": 0.05022525846129808, + "grad_norm": 0.7265625, + "learning_rate": 0.0019887559555654766, + "loss": 0.1689, + "step": 5786 + }, + { + "epoch": 0.05023393894150224, + "grad_norm": 0.091796875, + "learning_rate": 0.001988751265184828, + "loss": 0.1465, + "step": 5787 + }, + { + "epoch": 0.05024261942170641, + "grad_norm": 0.212890625, + "learning_rate": 0.0019887465738322543, + "loss": 0.1348, + "step": 5788 + }, + { + "epoch": 0.05025129990191057, + "grad_norm": 0.203125, + "learning_rate": 0.0019887418815077605, + "loss": 0.1123, + "step": 5789 + }, + { + "epoch": 0.05025998038211474, + "grad_norm": 0.23828125, + "learning_rate": 0.0019887371882113527, + "loss": 0.1689, + "step": 5790 + }, + { + "epoch": 0.050268660862318904, + "grad_norm": 0.09716796875, + "learning_rate": 0.0019887324939430345, + "loss": 0.1562, + "step": 5791 + }, + { + "epoch": 0.05027734134252307, + "grad_norm": 0.283203125, + "learning_rate": 0.0019887277987028123, + "loss": 0.1572, + "step": 5792 + }, + { + "epoch": 0.050286021822727234, + "grad_norm": 0.33203125, + "learning_rate": 0.0019887231024906904, + "loss": 0.1338, + "step": 5793 + }, + { + "epoch": 0.0502947023029314, + "grad_norm": 0.7578125, + "learning_rate": 0.001988718405306674, + "loss": 0.1689, + "step": 5794 + }, + { + "epoch": 0.050303382783135564, + "grad_norm": 0.48828125, + "learning_rate": 0.0019887137071507694, + "loss": 0.1387, + "step": 5795 + }, + { + "epoch": 0.05031206326333973, + "grad_norm": 0.09619140625, + "learning_rate": 0.0019887090080229802, + "loss": 0.1533, + "step": 5796 + }, + { + "epoch": 0.050320743743543894, + "grad_norm": 0.0771484375, + "learning_rate": 0.0019887043079233123, + "loss": 0.1699, + "step": 5797 + }, + { + "epoch": 0.05032942422374806, + "grad_norm": 0.408203125, + "learning_rate": 0.0019886996068517707, + "loss": 0.1221, + "step": 5798 + }, + { + "epoch": 0.050338104703952224, + "grad_norm": 0.087890625, + "learning_rate": 0.0019886949048083607, + "loss": 0.1621, + "step": 5799 + }, + { + "epoch": 0.05034678518415639, + "grad_norm": 0.1650390625, + "learning_rate": 0.001988690201793087, + "loss": 0.1338, + "step": 5800 + }, + { + "epoch": 0.050355465664360555, + "grad_norm": 0.0947265625, + "learning_rate": 0.0019886854978059553, + "loss": 0.1875, + "step": 5801 + }, + { + "epoch": 0.05036414614456472, + "grad_norm": 0.40234375, + "learning_rate": 0.0019886807928469704, + "loss": 0.1631, + "step": 5802 + }, + { + "epoch": 0.050372826624768885, + "grad_norm": 0.42578125, + "learning_rate": 0.0019886760869161375, + "loss": 0.1445, + "step": 5803 + }, + { + "epoch": 0.05038150710497305, + "grad_norm": 0.2734375, + "learning_rate": 0.0019886713800134617, + "loss": 0.1191, + "step": 5804 + }, + { + "epoch": 0.050390187585177215, + "grad_norm": 0.1982421875, + "learning_rate": 0.0019886666721389483, + "loss": 0.1318, + "step": 5805 + }, + { + "epoch": 0.05039886806538138, + "grad_norm": 0.296875, + "learning_rate": 0.0019886619632926024, + "loss": 0.1562, + "step": 5806 + }, + { + "epoch": 0.05040754854558554, + "grad_norm": 0.365234375, + "learning_rate": 0.0019886572534744294, + "loss": 0.1611, + "step": 5807 + }, + { + "epoch": 0.050416229025789704, + "grad_norm": 0.193359375, + "learning_rate": 0.001988652542684434, + "loss": 0.1543, + "step": 5808 + }, + { + "epoch": 0.05042490950599387, + 
"grad_norm": 0.671875, + "learning_rate": 0.0019886478309226216, + "loss": 0.1562, + "step": 5809 + }, + { + "epoch": 0.050433589986198034, + "grad_norm": 0.10400390625, + "learning_rate": 0.0019886431181889972, + "loss": 0.1289, + "step": 5810 + }, + { + "epoch": 0.0504422704664022, + "grad_norm": 0.478515625, + "learning_rate": 0.0019886384044835656, + "loss": 0.1738, + "step": 5811 + }, + { + "epoch": 0.050450950946606364, + "grad_norm": 0.22265625, + "learning_rate": 0.001988633689806333, + "loss": 0.1289, + "step": 5812 + }, + { + "epoch": 0.05045963142681053, + "grad_norm": 0.220703125, + "learning_rate": 0.001988628974157304, + "loss": 0.1494, + "step": 5813 + }, + { + "epoch": 0.050468311907014694, + "grad_norm": 0.1171875, + "learning_rate": 0.001988624257536483, + "loss": 0.1748, + "step": 5814 + }, + { + "epoch": 0.05047699238721886, + "grad_norm": 0.6796875, + "learning_rate": 0.0019886195399438766, + "loss": 0.1123, + "step": 5815 + }, + { + "epoch": 0.050485672867423025, + "grad_norm": 1.7265625, + "learning_rate": 0.001988614821379489, + "loss": 0.3867, + "step": 5816 + }, + { + "epoch": 0.05049435334762719, + "grad_norm": 0.0869140625, + "learning_rate": 0.0019886101018433255, + "loss": 0.0957, + "step": 5817 + }, + { + "epoch": 0.050503033827831355, + "grad_norm": 0.1796875, + "learning_rate": 0.0019886053813353913, + "loss": 0.1953, + "step": 5818 + }, + { + "epoch": 0.05051171430803552, + "grad_norm": 0.48828125, + "learning_rate": 0.001988600659855692, + "loss": 0.1426, + "step": 5819 + }, + { + "epoch": 0.050520394788239685, + "grad_norm": 0.181640625, + "learning_rate": 0.0019885959374042316, + "loss": 0.2285, + "step": 5820 + }, + { + "epoch": 0.05052907526844385, + "grad_norm": 0.36328125, + "learning_rate": 0.0019885912139810166, + "loss": 0.1562, + "step": 5821 + }, + { + "epoch": 0.050537755748648015, + "grad_norm": 0.15234375, + "learning_rate": 0.0019885864895860515, + "loss": 0.1523, + "step": 5822 + }, + { + "epoch": 0.05054643622885218, + "grad_norm": 0.48828125, + "learning_rate": 0.0019885817642193416, + "loss": 0.1387, + "step": 5823 + }, + { + "epoch": 0.050555116709056346, + "grad_norm": 0.39453125, + "learning_rate": 0.001988577037880892, + "loss": 0.124, + "step": 5824 + }, + { + "epoch": 0.05056379718926051, + "grad_norm": 0.15234375, + "learning_rate": 0.0019885723105707077, + "loss": 0.1855, + "step": 5825 + }, + { + "epoch": 0.050572477669464676, + "grad_norm": 0.52734375, + "learning_rate": 0.001988567582288794, + "loss": 0.1123, + "step": 5826 + }, + { + "epoch": 0.05058115814966884, + "grad_norm": 0.400390625, + "learning_rate": 0.0019885628530351565, + "loss": 0.1523, + "step": 5827 + }, + { + "epoch": 0.050589838629873006, + "grad_norm": 0.6875, + "learning_rate": 0.0019885581228098, + "loss": 0.1641, + "step": 5828 + }, + { + "epoch": 0.05059851911007717, + "grad_norm": 0.173828125, + "learning_rate": 0.0019885533916127298, + "loss": 0.165, + "step": 5829 + }, + { + "epoch": 0.050607199590281336, + "grad_norm": 0.30859375, + "learning_rate": 0.001988548659443951, + "loss": 0.1396, + "step": 5830 + }, + { + "epoch": 0.0506158800704855, + "grad_norm": 0.44921875, + "learning_rate": 0.001988543926303468, + "loss": 0.1289, + "step": 5831 + }, + { + "epoch": 0.050624560550689666, + "grad_norm": 0.13671875, + "learning_rate": 0.0019885391921912873, + "loss": 0.1328, + "step": 5832 + }, + { + "epoch": 0.05063324103089383, + "grad_norm": 0.74609375, + "learning_rate": 0.0019885344571074137, + "loss": 0.1641, + "step": 5833 + }, + { + "epoch": 
0.050641921511098, + "grad_norm": 0.9453125, + "learning_rate": 0.0019885297210518516, + "loss": 0.1602, + "step": 5834 + }, + { + "epoch": 0.05065060199130216, + "grad_norm": 0.369140625, + "learning_rate": 0.001988524984024607, + "loss": 0.1709, + "step": 5835 + }, + { + "epoch": 0.05065928247150633, + "grad_norm": 0.12060546875, + "learning_rate": 0.001988520246025685, + "loss": 0.1758, + "step": 5836 + }, + { + "epoch": 0.05066796295171049, + "grad_norm": 0.1591796875, + "learning_rate": 0.0019885155070550904, + "loss": 0.1621, + "step": 5837 + }, + { + "epoch": 0.05067664343191465, + "grad_norm": 0.3515625, + "learning_rate": 0.0019885107671128287, + "loss": 0.1465, + "step": 5838 + }, + { + "epoch": 0.050685323912118815, + "grad_norm": 0.419921875, + "learning_rate": 0.001988506026198905, + "loss": 0.127, + "step": 5839 + }, + { + "epoch": 0.05069400439232298, + "grad_norm": 0.427734375, + "learning_rate": 0.0019885012843133243, + "loss": 0.1299, + "step": 5840 + }, + { + "epoch": 0.050702684872527146, + "grad_norm": 0.466796875, + "learning_rate": 0.001988496541456092, + "loss": 0.1279, + "step": 5841 + }, + { + "epoch": 0.05071136535273131, + "grad_norm": 0.09912109375, + "learning_rate": 0.001988491797627213, + "loss": 0.127, + "step": 5842 + }, + { + "epoch": 0.050720045832935476, + "grad_norm": 0.09619140625, + "learning_rate": 0.0019884870528266933, + "loss": 0.1328, + "step": 5843 + }, + { + "epoch": 0.05072872631313964, + "grad_norm": 1.453125, + "learning_rate": 0.001988482307054537, + "loss": 0.3008, + "step": 5844 + }, + { + "epoch": 0.050737406793343806, + "grad_norm": 0.671875, + "learning_rate": 0.0019884775603107503, + "loss": 0.1836, + "step": 5845 + }, + { + "epoch": 0.05074608727354797, + "grad_norm": 0.2109375, + "learning_rate": 0.001988472812595337, + "loss": 0.1416, + "step": 5846 + }, + { + "epoch": 0.050754767753752136, + "grad_norm": 0.3046875, + "learning_rate": 0.001988468063908304, + "loss": 0.1445, + "step": 5847 + }, + { + "epoch": 0.0507634482339563, + "grad_norm": 1.1875, + "learning_rate": 0.0019884633142496556, + "loss": 0.1406, + "step": 5848 + }, + { + "epoch": 0.05077212871416047, + "grad_norm": 0.36328125, + "learning_rate": 0.0019884585636193966, + "loss": 0.1484, + "step": 5849 + }, + { + "epoch": 0.05078080919436463, + "grad_norm": 0.30078125, + "learning_rate": 0.001988453812017533, + "loss": 0.1357, + "step": 5850 + }, + { + "epoch": 0.0507894896745688, + "grad_norm": 0.2109375, + "learning_rate": 0.0019884490594440697, + "loss": 0.1406, + "step": 5851 + }, + { + "epoch": 0.05079817015477296, + "grad_norm": 0.73828125, + "learning_rate": 0.0019884443058990117, + "loss": 0.1465, + "step": 5852 + }, + { + "epoch": 0.05080685063497713, + "grad_norm": 0.08154296875, + "learning_rate": 0.0019884395513823645, + "loss": 0.1631, + "step": 5853 + }, + { + "epoch": 0.05081553111518129, + "grad_norm": 0.2578125, + "learning_rate": 0.001988434795894133, + "loss": 0.168, + "step": 5854 + }, + { + "epoch": 0.05082421159538546, + "grad_norm": 1.8828125, + "learning_rate": 0.001988430039434323, + "loss": 0.1758, + "step": 5855 + }, + { + "epoch": 0.05083289207558962, + "grad_norm": 0.84375, + "learning_rate": 0.0019884252820029386, + "loss": 0.1836, + "step": 5856 + }, + { + "epoch": 0.05084157255579379, + "grad_norm": 0.41796875, + "learning_rate": 0.001988420523599986, + "loss": 0.2012, + "step": 5857 + }, + { + "epoch": 0.05085025303599795, + "grad_norm": 0.75390625, + "learning_rate": 0.00198841576422547, + "loss": 0.1953, + "step": 5858 + }, + { + 
"epoch": 0.05085893351620212, + "grad_norm": 0.1787109375, + "learning_rate": 0.001988411003879396, + "loss": 0.1201, + "step": 5859 + }, + { + "epoch": 0.05086761399640628, + "grad_norm": 0.09326171875, + "learning_rate": 0.0019884062425617687, + "loss": 0.165, + "step": 5860 + }, + { + "epoch": 0.05087629447661045, + "grad_norm": 0.33984375, + "learning_rate": 0.001988401480272594, + "loss": 0.1113, + "step": 5861 + }, + { + "epoch": 0.05088497495681461, + "grad_norm": 0.44140625, + "learning_rate": 0.001988396717011877, + "loss": 0.1348, + "step": 5862 + }, + { + "epoch": 0.05089365543701878, + "grad_norm": 0.07421875, + "learning_rate": 0.0019883919527796225, + "loss": 0.1475, + "step": 5863 + }, + { + "epoch": 0.05090233591722294, + "grad_norm": 0.2265625, + "learning_rate": 0.0019883871875758354, + "loss": 0.2188, + "step": 5864 + }, + { + "epoch": 0.05091101639742711, + "grad_norm": 0.10595703125, + "learning_rate": 0.001988382421400522, + "loss": 0.1309, + "step": 5865 + }, + { + "epoch": 0.050919696877631274, + "grad_norm": 0.625, + "learning_rate": 0.001988377654253687, + "loss": 0.1855, + "step": 5866 + }, + { + "epoch": 0.05092837735783544, + "grad_norm": 0.0771484375, + "learning_rate": 0.0019883728861353352, + "loss": 0.1211, + "step": 5867 + }, + { + "epoch": 0.0509370578380396, + "grad_norm": 0.10986328125, + "learning_rate": 0.001988368117045472, + "loss": 0.1855, + "step": 5868 + }, + { + "epoch": 0.05094573831824376, + "grad_norm": 1.03125, + "learning_rate": 0.0019883633469841037, + "loss": 0.1235, + "step": 5869 + }, + { + "epoch": 0.05095441879844793, + "grad_norm": 0.10400390625, + "learning_rate": 0.0019883585759512337, + "loss": 0.1357, + "step": 5870 + }, + { + "epoch": 0.05096309927865209, + "grad_norm": 0.2734375, + "learning_rate": 0.0019883538039468683, + "loss": 0.1572, + "step": 5871 + }, + { + "epoch": 0.05097177975885626, + "grad_norm": 0.1875, + "learning_rate": 0.0019883490309710126, + "loss": 0.209, + "step": 5872 + }, + { + "epoch": 0.05098046023906042, + "grad_norm": 0.2001953125, + "learning_rate": 0.001988344257023672, + "loss": 0.1494, + "step": 5873 + }, + { + "epoch": 0.05098914071926459, + "grad_norm": 0.60546875, + "learning_rate": 0.0019883394821048513, + "loss": 0.1846, + "step": 5874 + }, + { + "epoch": 0.05099782119946875, + "grad_norm": 0.3046875, + "learning_rate": 0.0019883347062145556, + "loss": 0.1025, + "step": 5875 + }, + { + "epoch": 0.05100650167967292, + "grad_norm": 0.259765625, + "learning_rate": 0.0019883299293527905, + "loss": 0.1602, + "step": 5876 + }, + { + "epoch": 0.05101518215987708, + "grad_norm": 0.275390625, + "learning_rate": 0.001988325151519561, + "loss": 0.1592, + "step": 5877 + }, + { + "epoch": 0.05102386264008125, + "grad_norm": 0.20703125, + "learning_rate": 0.0019883203727148733, + "loss": 0.1387, + "step": 5878 + }, + { + "epoch": 0.05103254312028541, + "grad_norm": 0.0908203125, + "learning_rate": 0.0019883155929387307, + "loss": 0.1465, + "step": 5879 + }, + { + "epoch": 0.05104122360048958, + "grad_norm": 0.478515625, + "learning_rate": 0.0019883108121911404, + "loss": 0.1436, + "step": 5880 + }, + { + "epoch": 0.05104990408069374, + "grad_norm": 0.52734375, + "learning_rate": 0.001988306030472106, + "loss": 0.1445, + "step": 5881 + }, + { + "epoch": 0.05105858456089791, + "grad_norm": 0.466796875, + "learning_rate": 0.0019883012477816342, + "loss": 0.1621, + "step": 5882 + }, + { + "epoch": 0.051067265041102074, + "grad_norm": 0.298828125, + "learning_rate": 0.001988296464119729, + "loss": 0.1406, + 
"step": 5883 + }, + { + "epoch": 0.05107594552130624, + "grad_norm": 0.62109375, + "learning_rate": 0.001988291679486396, + "loss": 0.1338, + "step": 5884 + }, + { + "epoch": 0.051084626001510404, + "grad_norm": 0.09326171875, + "learning_rate": 0.001988286893881641, + "loss": 0.1738, + "step": 5885 + }, + { + "epoch": 0.05109330648171457, + "grad_norm": 0.07373046875, + "learning_rate": 0.001988282107305469, + "loss": 0.1514, + "step": 5886 + }, + { + "epoch": 0.051101986961918734, + "grad_norm": 0.11474609375, + "learning_rate": 0.0019882773197578843, + "loss": 0.1436, + "step": 5887 + }, + { + "epoch": 0.0511106674421229, + "grad_norm": 0.6484375, + "learning_rate": 0.0019882725312388933, + "loss": 0.1455, + "step": 5888 + }, + { + "epoch": 0.051119347922327064, + "grad_norm": 0.443359375, + "learning_rate": 0.0019882677417485005, + "loss": 0.1348, + "step": 5889 + }, + { + "epoch": 0.05112802840253123, + "grad_norm": 0.3359375, + "learning_rate": 0.001988262951286712, + "loss": 0.1758, + "step": 5890 + }, + { + "epoch": 0.051136708882735395, + "grad_norm": 0.216796875, + "learning_rate": 0.0019882581598535324, + "loss": 0.1553, + "step": 5891 + }, + { + "epoch": 0.05114538936293956, + "grad_norm": 0.2294921875, + "learning_rate": 0.0019882533674489664, + "loss": 0.2832, + "step": 5892 + }, + { + "epoch": 0.051154069843143725, + "grad_norm": 0.92578125, + "learning_rate": 0.0019882485740730206, + "loss": 0.1436, + "step": 5893 + }, + { + "epoch": 0.05116275032334789, + "grad_norm": 0.2109375, + "learning_rate": 0.0019882437797256987, + "loss": 0.1338, + "step": 5894 + }, + { + "epoch": 0.051171430803552055, + "grad_norm": 0.357421875, + "learning_rate": 0.0019882389844070075, + "loss": 0.1621, + "step": 5895 + }, + { + "epoch": 0.05118011128375622, + "grad_norm": 0.0849609375, + "learning_rate": 0.001988234188116951, + "loss": 0.1289, + "step": 5896 + }, + { + "epoch": 0.051188791763960385, + "grad_norm": 0.130859375, + "learning_rate": 0.001988229390855535, + "loss": 0.1758, + "step": 5897 + }, + { + "epoch": 0.05119747224416455, + "grad_norm": 0.11767578125, + "learning_rate": 0.001988224592622765, + "loss": 0.1338, + "step": 5898 + }, + { + "epoch": 0.05120615272436871, + "grad_norm": 0.12109375, + "learning_rate": 0.0019882197934186457, + "loss": 0.125, + "step": 5899 + }, + { + "epoch": 0.051214833204572874, + "grad_norm": 0.6171875, + "learning_rate": 0.001988214993243183, + "loss": 0.167, + "step": 5900 + }, + { + "epoch": 0.05122351368477704, + "grad_norm": 0.1337890625, + "learning_rate": 0.001988210192096381, + "loss": 0.1465, + "step": 5901 + }, + { + "epoch": 0.051232194164981204, + "grad_norm": 0.365234375, + "learning_rate": 0.001988205389978246, + "loss": 0.1289, + "step": 5902 + }, + { + "epoch": 0.05124087464518537, + "grad_norm": 0.1513671875, + "learning_rate": 0.001988200586888783, + "loss": 0.1445, + "step": 5903 + }, + { + "epoch": 0.051249555125389534, + "grad_norm": 0.1474609375, + "learning_rate": 0.001988195782827997, + "loss": 0.1289, + "step": 5904 + }, + { + "epoch": 0.0512582356055937, + "grad_norm": 0.21875, + "learning_rate": 0.0019881909777958937, + "loss": 0.1064, + "step": 5905 + }, + { + "epoch": 0.051266916085797865, + "grad_norm": 0.8125, + "learning_rate": 0.001988186171792478, + "loss": 0.1709, + "step": 5906 + }, + { + "epoch": 0.05127559656600203, + "grad_norm": 0.51171875, + "learning_rate": 0.0019881813648177553, + "loss": 0.1309, + "step": 5907 + }, + { + "epoch": 0.051284277046206195, + "grad_norm": 0.71484375, + "learning_rate": 
0.001988176556871731, + "loss": 0.1367, + "step": 5908 + }, + { + "epoch": 0.05129295752641036, + "grad_norm": 0.3984375, + "learning_rate": 0.0019881717479544097, + "loss": 0.1348, + "step": 5909 + }, + { + "epoch": 0.051301638006614525, + "grad_norm": 0.7265625, + "learning_rate": 0.0019881669380657973, + "loss": 0.1455, + "step": 5910 + }, + { + "epoch": 0.05131031848681869, + "grad_norm": 0.490234375, + "learning_rate": 0.001988162127205899, + "loss": 0.1426, + "step": 5911 + }, + { + "epoch": 0.051318998967022855, + "grad_norm": 0.08251953125, + "learning_rate": 0.0019881573153747198, + "loss": 0.125, + "step": 5912 + }, + { + "epoch": 0.05132767944722702, + "grad_norm": 0.3046875, + "learning_rate": 0.001988152502572265, + "loss": 0.1436, + "step": 5913 + }, + { + "epoch": 0.051336359927431185, + "grad_norm": 0.10595703125, + "learning_rate": 0.0019881476887985403, + "loss": 0.1465, + "step": 5914 + }, + { + "epoch": 0.05134504040763535, + "grad_norm": 0.5078125, + "learning_rate": 0.0019881428740535505, + "loss": 0.1216, + "step": 5915 + }, + { + "epoch": 0.051353720887839516, + "grad_norm": 0.2421875, + "learning_rate": 0.0019881380583373007, + "loss": 0.1011, + "step": 5916 + }, + { + "epoch": 0.05136240136804368, + "grad_norm": 0.1298828125, + "learning_rate": 0.001988133241649797, + "loss": 0.1445, + "step": 5917 + }, + { + "epoch": 0.051371081848247846, + "grad_norm": 0.5078125, + "learning_rate": 0.0019881284239910437, + "loss": 0.2754, + "step": 5918 + }, + { + "epoch": 0.05137976232845201, + "grad_norm": 0.07470703125, + "learning_rate": 0.001988123605361047, + "loss": 0.1436, + "step": 5919 + }, + { + "epoch": 0.051388442808656176, + "grad_norm": 0.3671875, + "learning_rate": 0.001988118785759811, + "loss": 0.2734, + "step": 5920 + }, + { + "epoch": 0.05139712328886034, + "grad_norm": 0.482421875, + "learning_rate": 0.0019881139651873416, + "loss": 0.1367, + "step": 5921 + }, + { + "epoch": 0.051405803769064506, + "grad_norm": 0.390625, + "learning_rate": 0.0019881091436436448, + "loss": 0.1221, + "step": 5922 + }, + { + "epoch": 0.05141448424926867, + "grad_norm": 0.384765625, + "learning_rate": 0.001988104321128725, + "loss": 0.1758, + "step": 5923 + }, + { + "epoch": 0.05142316472947284, + "grad_norm": 0.32421875, + "learning_rate": 0.0019880994976425874, + "loss": 0.2158, + "step": 5924 + }, + { + "epoch": 0.051431845209677, + "grad_norm": 2.3125, + "learning_rate": 0.0019880946731852377, + "loss": 0.2773, + "step": 5925 + }, + { + "epoch": 0.05144052568988117, + "grad_norm": 0.37109375, + "learning_rate": 0.001988089847756681, + "loss": 0.2314, + "step": 5926 + }, + { + "epoch": 0.05144920617008533, + "grad_norm": 0.1220703125, + "learning_rate": 0.001988085021356922, + "loss": 0.1777, + "step": 5927 + }, + { + "epoch": 0.0514578866502895, + "grad_norm": 0.09521484375, + "learning_rate": 0.001988080193985967, + "loss": 0.1455, + "step": 5928 + }, + { + "epoch": 0.05146656713049366, + "grad_norm": 0.26953125, + "learning_rate": 0.0019880753656438214, + "loss": 0.1523, + "step": 5929 + }, + { + "epoch": 0.05147524761069782, + "grad_norm": 0.3046875, + "learning_rate": 0.0019880705363304893, + "loss": 0.1484, + "step": 5930 + }, + { + "epoch": 0.051483928090901986, + "grad_norm": 0.20703125, + "learning_rate": 0.0019880657060459767, + "loss": 0.1475, + "step": 5931 + }, + { + "epoch": 0.05149260857110615, + "grad_norm": 0.26171875, + "learning_rate": 0.0019880608747902886, + "loss": 0.1689, + "step": 5932 + }, + { + "epoch": 0.051501289051310316, + "grad_norm": 0.1328125, 
+ "learning_rate": 0.0019880560425634313, + "loss": 0.1289, + "step": 5933 + }, + { + "epoch": 0.05150996953151448, + "grad_norm": 0.16796875, + "learning_rate": 0.0019880512093654085, + "loss": 0.1533, + "step": 5934 + }, + { + "epoch": 0.051518650011718646, + "grad_norm": 0.146484375, + "learning_rate": 0.0019880463751962263, + "loss": 0.1367, + "step": 5935 + }, + { + "epoch": 0.05152733049192281, + "grad_norm": 0.1552734375, + "learning_rate": 0.00198804154005589, + "loss": 0.1504, + "step": 5936 + }, + { + "epoch": 0.051536010972126976, + "grad_norm": 0.1279296875, + "learning_rate": 0.001988036703944405, + "loss": 0.1602, + "step": 5937 + }, + { + "epoch": 0.05154469145233114, + "grad_norm": 0.37109375, + "learning_rate": 0.0019880318668617763, + "loss": 0.1069, + "step": 5938 + }, + { + "epoch": 0.05155337193253531, + "grad_norm": 0.36328125, + "learning_rate": 0.001988027028808009, + "loss": 0.1533, + "step": 5939 + }, + { + "epoch": 0.05156205241273947, + "grad_norm": 0.31640625, + "learning_rate": 0.0019880221897831092, + "loss": 0.1699, + "step": 5940 + }, + { + "epoch": 0.05157073289294364, + "grad_norm": 0.390625, + "learning_rate": 0.001988017349787081, + "loss": 0.1602, + "step": 5941 + }, + { + "epoch": 0.0515794133731478, + "grad_norm": 0.23046875, + "learning_rate": 0.0019880125088199314, + "loss": 0.1973, + "step": 5942 + }, + { + "epoch": 0.05158809385335197, + "grad_norm": 0.08935546875, + "learning_rate": 0.0019880076668816636, + "loss": 0.1406, + "step": 5943 + }, + { + "epoch": 0.05159677433355613, + "grad_norm": 0.25390625, + "learning_rate": 0.0019880028239722845, + "loss": 0.1719, + "step": 5944 + }, + { + "epoch": 0.0516054548137603, + "grad_norm": 0.26953125, + "learning_rate": 0.0019879979800917985, + "loss": 0.1543, + "step": 5945 + }, + { + "epoch": 0.05161413529396446, + "grad_norm": 0.0634765625, + "learning_rate": 0.001987993135240212, + "loss": 0.127, + "step": 5946 + }, + { + "epoch": 0.05162281577416863, + "grad_norm": 0.2041015625, + "learning_rate": 0.0019879882894175286, + "loss": 0.1201, + "step": 5947 + }, + { + "epoch": 0.05163149625437279, + "grad_norm": 0.09033203125, + "learning_rate": 0.0019879834426237554, + "loss": 0.126, + "step": 5948 + }, + { + "epoch": 0.05164017673457696, + "grad_norm": 0.3203125, + "learning_rate": 0.001987978594858896, + "loss": 0.166, + "step": 5949 + }, + { + "epoch": 0.05164885721478112, + "grad_norm": 0.84375, + "learning_rate": 0.0019879737461229573, + "loss": 0.1875, + "step": 5950 + }, + { + "epoch": 0.05165753769498529, + "grad_norm": 0.47265625, + "learning_rate": 0.0019879688964159433, + "loss": 0.1758, + "step": 5951 + }, + { + "epoch": 0.05166621817518945, + "grad_norm": 0.140625, + "learning_rate": 0.00198796404573786, + "loss": 0.1445, + "step": 5952 + }, + { + "epoch": 0.05167489865539362, + "grad_norm": 0.423828125, + "learning_rate": 0.001987959194088713, + "loss": 0.1484, + "step": 5953 + }, + { + "epoch": 0.05168357913559778, + "grad_norm": 0.15234375, + "learning_rate": 0.001987954341468507, + "loss": 0.1787, + "step": 5954 + }, + { + "epoch": 0.05169225961580195, + "grad_norm": 0.291015625, + "learning_rate": 0.0019879494878772474, + "loss": 0.168, + "step": 5955 + }, + { + "epoch": 0.051700940096006114, + "grad_norm": 0.115234375, + "learning_rate": 0.0019879446333149393, + "loss": 0.123, + "step": 5956 + }, + { + "epoch": 0.05170962057621028, + "grad_norm": 0.193359375, + "learning_rate": 0.0019879397777815885, + "loss": 0.1182, + "step": 5957 + }, + { + "epoch": 0.051718301056414444, + 
"grad_norm": 0.1953125, + "learning_rate": 0.0019879349212772, + "loss": 0.1484, + "step": 5958 + }, + { + "epoch": 0.05172698153661861, + "grad_norm": 0.73046875, + "learning_rate": 0.0019879300638017796, + "loss": 0.1611, + "step": 5959 + }, + { + "epoch": 0.051735662016822774, + "grad_norm": 0.2373046875, + "learning_rate": 0.0019879252053553315, + "loss": 0.1484, + "step": 5960 + }, + { + "epoch": 0.05174434249702693, + "grad_norm": 0.07470703125, + "learning_rate": 0.0019879203459378623, + "loss": 0.1108, + "step": 5961 + }, + { + "epoch": 0.0517530229772311, + "grad_norm": 0.55859375, + "learning_rate": 0.0019879154855493766, + "loss": 0.1367, + "step": 5962 + }, + { + "epoch": 0.05176170345743526, + "grad_norm": 0.46875, + "learning_rate": 0.0019879106241898798, + "loss": 0.168, + "step": 5963 + }, + { + "epoch": 0.05177038393763943, + "grad_norm": 0.38671875, + "learning_rate": 0.0019879057618593775, + "loss": 0.1201, + "step": 5964 + }, + { + "epoch": 0.05177906441784359, + "grad_norm": 0.087890625, + "learning_rate": 0.0019879008985578747, + "loss": 0.1416, + "step": 5965 + }, + { + "epoch": 0.05178774489804776, + "grad_norm": 0.1171875, + "learning_rate": 0.001987896034285377, + "loss": 0.1279, + "step": 5966 + }, + { + "epoch": 0.05179642537825192, + "grad_norm": 0.29296875, + "learning_rate": 0.001987891169041889, + "loss": 0.2227, + "step": 5967 + }, + { + "epoch": 0.05180510585845609, + "grad_norm": 0.12060546875, + "learning_rate": 0.001987886302827417, + "loss": 0.1592, + "step": 5968 + }, + { + "epoch": 0.05181378633866025, + "grad_norm": 0.59375, + "learning_rate": 0.0019878814356419656, + "loss": 0.1196, + "step": 5969 + }, + { + "epoch": 0.05182246681886442, + "grad_norm": 0.87890625, + "learning_rate": 0.0019878765674855407, + "loss": 0.1138, + "step": 5970 + }, + { + "epoch": 0.05183114729906858, + "grad_norm": 0.455078125, + "learning_rate": 0.001987871698358147, + "loss": 0.1289, + "step": 5971 + }, + { + "epoch": 0.05183982777927275, + "grad_norm": 0.470703125, + "learning_rate": 0.0019878668282597906, + "loss": 0.1387, + "step": 5972 + }, + { + "epoch": 0.051848508259476914, + "grad_norm": 0.2265625, + "learning_rate": 0.001987861957190476, + "loss": 0.1514, + "step": 5973 + }, + { + "epoch": 0.05185718873968108, + "grad_norm": 0.10400390625, + "learning_rate": 0.001987857085150209, + "loss": 0.125, + "step": 5974 + }, + { + "epoch": 0.051865869219885244, + "grad_norm": 0.2578125, + "learning_rate": 0.0019878522121389948, + "loss": 0.1777, + "step": 5975 + }, + { + "epoch": 0.05187454970008941, + "grad_norm": 0.625, + "learning_rate": 0.0019878473381568385, + "loss": 0.1699, + "step": 5976 + }, + { + "epoch": 0.051883230180293574, + "grad_norm": 0.1220703125, + "learning_rate": 0.0019878424632037463, + "loss": 0.125, + "step": 5977 + }, + { + "epoch": 0.05189191066049774, + "grad_norm": 0.302734375, + "learning_rate": 0.0019878375872797224, + "loss": 0.1494, + "step": 5978 + }, + { + "epoch": 0.051900591140701904, + "grad_norm": 0.404296875, + "learning_rate": 0.001987832710384773, + "loss": 0.1484, + "step": 5979 + }, + { + "epoch": 0.05190927162090607, + "grad_norm": 0.4296875, + "learning_rate": 0.001987827832518903, + "loss": 0.1338, + "step": 5980 + }, + { + "epoch": 0.051917952101110235, + "grad_norm": 0.87890625, + "learning_rate": 0.001987822953682118, + "loss": 0.1055, + "step": 5981 + }, + { + "epoch": 0.0519266325813144, + "grad_norm": 0.19140625, + "learning_rate": 0.0019878180738744228, + "loss": 0.1357, + "step": 5982 + }, + { + "epoch": 
0.051935313061518565, + "grad_norm": 0.96484375, + "learning_rate": 0.001987813193095823, + "loss": 0.1621, + "step": 5983 + }, + { + "epoch": 0.05194399354172273, + "grad_norm": 0.3046875, + "learning_rate": 0.0019878083113463244, + "loss": 0.165, + "step": 5984 + }, + { + "epoch": 0.051952674021926895, + "grad_norm": 0.76171875, + "learning_rate": 0.001987803428625932, + "loss": 0.1484, + "step": 5985 + }, + { + "epoch": 0.05196135450213106, + "grad_norm": 0.27734375, + "learning_rate": 0.0019877985449346506, + "loss": 0.1455, + "step": 5986 + }, + { + "epoch": 0.051970034982335225, + "grad_norm": 0.462890625, + "learning_rate": 0.0019877936602724863, + "loss": 0.1084, + "step": 5987 + }, + { + "epoch": 0.05197871546253939, + "grad_norm": 0.4921875, + "learning_rate": 0.0019877887746394445, + "loss": 0.1836, + "step": 5988 + }, + { + "epoch": 0.051987395942743556, + "grad_norm": 0.267578125, + "learning_rate": 0.00198778388803553, + "loss": 0.1221, + "step": 5989 + }, + { + "epoch": 0.05199607642294772, + "grad_norm": 0.1474609375, + "learning_rate": 0.0019877790004607485, + "loss": 0.1299, + "step": 5990 + }, + { + "epoch": 0.052004756903151886, + "grad_norm": 0.40234375, + "learning_rate": 0.001987774111915105, + "loss": 0.1426, + "step": 5991 + }, + { + "epoch": 0.052013437383356044, + "grad_norm": 0.57421875, + "learning_rate": 0.001987769222398605, + "loss": 0.1621, + "step": 5992 + }, + { + "epoch": 0.05202211786356021, + "grad_norm": 0.12890625, + "learning_rate": 0.0019877643319112545, + "loss": 0.1582, + "step": 5993 + }, + { + "epoch": 0.052030798343764374, + "grad_norm": 0.193359375, + "learning_rate": 0.001987759440453058, + "loss": 0.1187, + "step": 5994 + }, + { + "epoch": 0.05203947882396854, + "grad_norm": 0.11328125, + "learning_rate": 0.0019877545480240207, + "loss": 0.1377, + "step": 5995 + }, + { + "epoch": 0.052048159304172704, + "grad_norm": 0.31640625, + "learning_rate": 0.0019877496546241488, + "loss": 0.1406, + "step": 5996 + }, + { + "epoch": 0.05205683978437687, + "grad_norm": 0.63671875, + "learning_rate": 0.001987744760253447, + "loss": 0.1484, + "step": 5997 + }, + { + "epoch": 0.052065520264581035, + "grad_norm": 0.53125, + "learning_rate": 0.001987739864911921, + "loss": 0.1602, + "step": 5998 + }, + { + "epoch": 0.0520742007447852, + "grad_norm": 0.69921875, + "learning_rate": 0.0019877349685995757, + "loss": 0.1416, + "step": 5999 + }, + { + "epoch": 0.052082881224989365, + "grad_norm": 0.400390625, + "learning_rate": 0.0019877300713164172, + "loss": 0.1348, + "step": 6000 + }, + { + "epoch": 0.05209156170519353, + "grad_norm": 0.1435546875, + "learning_rate": 0.0019877251730624503, + "loss": 0.1602, + "step": 6001 + }, + { + "epoch": 0.052100242185397695, + "grad_norm": 0.181640625, + "learning_rate": 0.0019877202738376805, + "loss": 0.1816, + "step": 6002 + }, + { + "epoch": 0.05210892266560186, + "grad_norm": 0.271484375, + "learning_rate": 0.001987715373642113, + "loss": 0.1201, + "step": 6003 + }, + { + "epoch": 0.052117603145806025, + "grad_norm": 0.10595703125, + "learning_rate": 0.001987710472475753, + "loss": 0.1514, + "step": 6004 + }, + { + "epoch": 0.05212628362601019, + "grad_norm": 0.267578125, + "learning_rate": 0.001987705570338607, + "loss": 0.1641, + "step": 6005 + }, + { + "epoch": 0.052134964106214356, + "grad_norm": 0.220703125, + "learning_rate": 0.001987700667230679, + "loss": 0.1543, + "step": 6006 + }, + { + "epoch": 0.05214364458641852, + "grad_norm": 0.53125, + "learning_rate": 0.001987695763151975, + "loss": 0.166, + "step": 
6007 + }, + { + "epoch": 0.052152325066622686, + "grad_norm": 0.6484375, + "learning_rate": 0.0019876908581025004, + "loss": 0.1357, + "step": 6008 + }, + { + "epoch": 0.05216100554682685, + "grad_norm": 0.236328125, + "learning_rate": 0.0019876859520822597, + "loss": 0.1895, + "step": 6009 + }, + { + "epoch": 0.052169686027031016, + "grad_norm": 0.091796875, + "learning_rate": 0.00198768104509126, + "loss": 0.1338, + "step": 6010 + }, + { + "epoch": 0.05217836650723518, + "grad_norm": 0.57421875, + "learning_rate": 0.001987676137129505, + "loss": 0.2109, + "step": 6011 + }, + { + "epoch": 0.052187046987439346, + "grad_norm": 0.2236328125, + "learning_rate": 0.0019876712281970007, + "loss": 0.1572, + "step": 6012 + }, + { + "epoch": 0.05219572746764351, + "grad_norm": 0.482421875, + "learning_rate": 0.001987666318293753, + "loss": 0.1025, + "step": 6013 + }, + { + "epoch": 0.05220440794784768, + "grad_norm": 0.62109375, + "learning_rate": 0.0019876614074197663, + "loss": 0.1914, + "step": 6014 + }, + { + "epoch": 0.05221308842805184, + "grad_norm": 0.2119140625, + "learning_rate": 0.0019876564955750462, + "loss": 0.1533, + "step": 6015 + }, + { + "epoch": 0.05222176890825601, + "grad_norm": 0.1298828125, + "learning_rate": 0.0019876515827595993, + "loss": 0.1641, + "step": 6016 + }, + { + "epoch": 0.05223044938846017, + "grad_norm": 0.255859375, + "learning_rate": 0.001987646668973429, + "loss": 0.165, + "step": 6017 + }, + { + "epoch": 0.05223912986866434, + "grad_norm": 0.1005859375, + "learning_rate": 0.0019876417542165417, + "loss": 0.1611, + "step": 6018 + }, + { + "epoch": 0.0522478103488685, + "grad_norm": 0.07861328125, + "learning_rate": 0.0019876368384889433, + "loss": 0.1768, + "step": 6019 + }, + { + "epoch": 0.05225649082907267, + "grad_norm": 0.380859375, + "learning_rate": 0.0019876319217906383, + "loss": 0.1377, + "step": 6020 + }, + { + "epoch": 0.05226517130927683, + "grad_norm": 0.345703125, + "learning_rate": 0.0019876270041216323, + "loss": 0.1338, + "step": 6021 + }, + { + "epoch": 0.05227385178948099, + "grad_norm": 0.306640625, + "learning_rate": 0.001987622085481931, + "loss": 0.1338, + "step": 6022 + }, + { + "epoch": 0.052282532269685156, + "grad_norm": 0.2001953125, + "learning_rate": 0.001987617165871539, + "loss": 0.1514, + "step": 6023 + }, + { + "epoch": 0.05229121274988932, + "grad_norm": 0.6015625, + "learning_rate": 0.0019876122452904626, + "loss": 0.1436, + "step": 6024 + }, + { + "epoch": 0.052299893230093486, + "grad_norm": 0.1337890625, + "learning_rate": 0.0019876073237387066, + "loss": 0.1245, + "step": 6025 + }, + { + "epoch": 0.05230857371029765, + "grad_norm": 0.6015625, + "learning_rate": 0.0019876024012162766, + "loss": 0.1924, + "step": 6026 + }, + { + "epoch": 0.052317254190501816, + "grad_norm": 0.31640625, + "learning_rate": 0.001987597477723178, + "loss": 0.1543, + "step": 6027 + }, + { + "epoch": 0.05232593467070598, + "grad_norm": 0.828125, + "learning_rate": 0.001987592553259416, + "loss": 0.1279, + "step": 6028 + }, + { + "epoch": 0.052334615150910146, + "grad_norm": 0.162109375, + "learning_rate": 0.0019875876278249963, + "loss": 0.1543, + "step": 6029 + }, + { + "epoch": 0.05234329563111431, + "grad_norm": 0.408203125, + "learning_rate": 0.0019875827014199238, + "loss": 0.1387, + "step": 6030 + }, + { + "epoch": 0.05235197611131848, + "grad_norm": 0.75, + "learning_rate": 0.0019875777740442045, + "loss": 0.1953, + "step": 6031 + }, + { + "epoch": 0.05236065659152264, + "grad_norm": 0.08642578125, + "learning_rate": 
0.0019875728456978437, + "loss": 0.1406, + "step": 6032 + }, + { + "epoch": 0.05236933707172681, + "grad_norm": 0.212890625, + "learning_rate": 0.001987567916380846, + "loss": 0.1196, + "step": 6033 + }, + { + "epoch": 0.05237801755193097, + "grad_norm": 0.83984375, + "learning_rate": 0.001987562986093218, + "loss": 0.1465, + "step": 6034 + }, + { + "epoch": 0.05238669803213514, + "grad_norm": 0.083984375, + "learning_rate": 0.001987558054834964, + "loss": 0.0977, + "step": 6035 + }, + { + "epoch": 0.0523953785123393, + "grad_norm": 0.333984375, + "learning_rate": 0.00198755312260609, + "loss": 0.1426, + "step": 6036 + }, + { + "epoch": 0.05240405899254347, + "grad_norm": 0.51953125, + "learning_rate": 0.001987548189406601, + "loss": 0.1475, + "step": 6037 + }, + { + "epoch": 0.05241273947274763, + "grad_norm": 0.1337890625, + "learning_rate": 0.001987543255236503, + "loss": 0.1396, + "step": 6038 + }, + { + "epoch": 0.0524214199529518, + "grad_norm": 0.173828125, + "learning_rate": 0.001987538320095801, + "loss": 0.1299, + "step": 6039 + }, + { + "epoch": 0.05243010043315596, + "grad_norm": 0.291015625, + "learning_rate": 0.0019875333839845, + "loss": 0.1846, + "step": 6040 + }, + { + "epoch": 0.05243878091336013, + "grad_norm": 0.6640625, + "learning_rate": 0.0019875284469026066, + "loss": 0.1357, + "step": 6041 + }, + { + "epoch": 0.05244746139356429, + "grad_norm": 0.107421875, + "learning_rate": 0.0019875235088501245, + "loss": 0.1592, + "step": 6042 + }, + { + "epoch": 0.05245614187376846, + "grad_norm": 0.078125, + "learning_rate": 0.0019875185698270607, + "loss": 0.1641, + "step": 6043 + }, + { + "epoch": 0.05246482235397262, + "grad_norm": 1.4765625, + "learning_rate": 0.0019875136298334196, + "loss": 0.1387, + "step": 6044 + }, + { + "epoch": 0.05247350283417679, + "grad_norm": 0.234375, + "learning_rate": 0.001987508688869207, + "loss": 0.106, + "step": 6045 + }, + { + "epoch": 0.052482183314380954, + "grad_norm": 0.08740234375, + "learning_rate": 0.001987503746934428, + "loss": 0.1426, + "step": 6046 + }, + { + "epoch": 0.05249086379458512, + "grad_norm": 0.228515625, + "learning_rate": 0.001987498804029089, + "loss": 0.1914, + "step": 6047 + }, + { + "epoch": 0.052499544274789284, + "grad_norm": 0.11865234375, + "learning_rate": 0.001987493860153194, + "loss": 0.125, + "step": 6048 + }, + { + "epoch": 0.05250822475499345, + "grad_norm": 0.76953125, + "learning_rate": 0.001987488915306749, + "loss": 0.1328, + "step": 6049 + }, + { + "epoch": 0.052516905235197614, + "grad_norm": 0.21875, + "learning_rate": 0.0019874839694897597, + "loss": 0.1367, + "step": 6050 + }, + { + "epoch": 0.05252558571540178, + "grad_norm": 0.875, + "learning_rate": 0.001987479022702231, + "loss": 0.166, + "step": 6051 + }, + { + "epoch": 0.052534266195605944, + "grad_norm": 0.08837890625, + "learning_rate": 0.0019874740749441693, + "loss": 0.1328, + "step": 6052 + }, + { + "epoch": 0.0525429466758101, + "grad_norm": 0.51171875, + "learning_rate": 0.0019874691262155787, + "loss": 0.167, + "step": 6053 + }, + { + "epoch": 0.05255162715601427, + "grad_norm": 0.150390625, + "learning_rate": 0.0019874641765164655, + "loss": 0.127, + "step": 6054 + }, + { + "epoch": 0.05256030763621843, + "grad_norm": 0.208984375, + "learning_rate": 0.0019874592258468343, + "loss": 0.0942, + "step": 6055 + }, + { + "epoch": 0.0525689881164226, + "grad_norm": 0.1650390625, + "learning_rate": 0.0019874542742066913, + "loss": 0.1309, + "step": 6056 + }, + { + "epoch": 0.05257766859662676, + "grad_norm": 0.1630859375, + 
"learning_rate": 0.0019874493215960415, + "loss": 0.2227, + "step": 6057 + }, + { + "epoch": 0.05258634907683093, + "grad_norm": 0.3671875, + "learning_rate": 0.001987444368014891, + "loss": 0.1396, + "step": 6058 + }, + { + "epoch": 0.05259502955703509, + "grad_norm": 0.38671875, + "learning_rate": 0.0019874394134632442, + "loss": 0.127, + "step": 6059 + }, + { + "epoch": 0.05260371003723926, + "grad_norm": 0.1669921875, + "learning_rate": 0.0019874344579411066, + "loss": 0.2148, + "step": 6060 + }, + { + "epoch": 0.05261239051744342, + "grad_norm": 0.1533203125, + "learning_rate": 0.0019874295014484845, + "loss": 0.124, + "step": 6061 + }, + { + "epoch": 0.05262107099764759, + "grad_norm": 0.427734375, + "learning_rate": 0.001987424543985383, + "loss": 0.1416, + "step": 6062 + }, + { + "epoch": 0.052629751477851754, + "grad_norm": 0.26953125, + "learning_rate": 0.001987419585551807, + "loss": 0.168, + "step": 6063 + }, + { + "epoch": 0.05263843195805592, + "grad_norm": 0.1484375, + "learning_rate": 0.001987414626147763, + "loss": 0.1221, + "step": 6064 + }, + { + "epoch": 0.052647112438260084, + "grad_norm": 0.31640625, + "learning_rate": 0.001987409665773255, + "loss": 0.168, + "step": 6065 + }, + { + "epoch": 0.05265579291846425, + "grad_norm": 0.2412109375, + "learning_rate": 0.001987404704428289, + "loss": 0.1338, + "step": 6066 + }, + { + "epoch": 0.052664473398668414, + "grad_norm": 0.2138671875, + "learning_rate": 0.001987399742112871, + "loss": 0.126, + "step": 6067 + }, + { + "epoch": 0.05267315387887258, + "grad_norm": 0.07421875, + "learning_rate": 0.0019873947788270055, + "loss": 0.1104, + "step": 6068 + }, + { + "epoch": 0.052681834359076744, + "grad_norm": 0.1474609375, + "learning_rate": 0.0019873898145706986, + "loss": 0.1387, + "step": 6069 + }, + { + "epoch": 0.05269051483928091, + "grad_norm": 0.20703125, + "learning_rate": 0.001987384849343956, + "loss": 0.126, + "step": 6070 + }, + { + "epoch": 0.052699195319485075, + "grad_norm": 0.115234375, + "learning_rate": 0.0019873798831467823, + "loss": 0.125, + "step": 6071 + }, + { + "epoch": 0.05270787579968924, + "grad_norm": 0.162109375, + "learning_rate": 0.0019873749159791828, + "loss": 0.1475, + "step": 6072 + }, + { + "epoch": 0.052716556279893405, + "grad_norm": 0.3125, + "learning_rate": 0.001987369947841164, + "loss": 0.168, + "step": 6073 + }, + { + "epoch": 0.05272523676009757, + "grad_norm": 0.345703125, + "learning_rate": 0.001987364978732731, + "loss": 0.1445, + "step": 6074 + }, + { + "epoch": 0.052733917240301735, + "grad_norm": 0.31640625, + "learning_rate": 0.0019873600086538885, + "loss": 0.1387, + "step": 6075 + }, + { + "epoch": 0.0527425977205059, + "grad_norm": 0.26171875, + "learning_rate": 0.0019873550376046426, + "loss": 0.2383, + "step": 6076 + }, + { + "epoch": 0.052751278200710065, + "grad_norm": 0.1103515625, + "learning_rate": 0.0019873500655849985, + "loss": 0.1592, + "step": 6077 + }, + { + "epoch": 0.05275995868091423, + "grad_norm": 0.234375, + "learning_rate": 0.001987345092594962, + "loss": 0.2207, + "step": 6078 + }, + { + "epoch": 0.052768639161118396, + "grad_norm": 0.103515625, + "learning_rate": 0.0019873401186345377, + "loss": 0.1348, + "step": 6079 + }, + { + "epoch": 0.05277731964132256, + "grad_norm": 0.10791015625, + "learning_rate": 0.001987335143703732, + "loss": 0.1416, + "step": 6080 + }, + { + "epoch": 0.052786000121526726, + "grad_norm": 0.2119140625, + "learning_rate": 0.00198733016780255, + "loss": 0.1719, + "step": 6081 + }, + { + "epoch": 0.05279468060173089, + 
"grad_norm": 0.37109375, + "learning_rate": 0.0019873251909309966, + "loss": 0.1641, + "step": 6082 + }, + { + "epoch": 0.052803361081935056, + "grad_norm": 0.12353515625, + "learning_rate": 0.0019873202130890775, + "loss": 0.1055, + "step": 6083 + }, + { + "epoch": 0.052812041562139214, + "grad_norm": 0.2333984375, + "learning_rate": 0.0019873152342767993, + "loss": 0.1348, + "step": 6084 + }, + { + "epoch": 0.05282072204234338, + "grad_norm": 0.095703125, + "learning_rate": 0.0019873102544941657, + "loss": 0.1953, + "step": 6085 + }, + { + "epoch": 0.052829402522547544, + "grad_norm": 0.361328125, + "learning_rate": 0.0019873052737411833, + "loss": 0.1172, + "step": 6086 + }, + { + "epoch": 0.05283808300275171, + "grad_norm": 0.78515625, + "learning_rate": 0.001987300292017857, + "loss": 0.1738, + "step": 6087 + }, + { + "epoch": 0.052846763482955875, + "grad_norm": 0.134765625, + "learning_rate": 0.0019872953093241926, + "loss": 0.1768, + "step": 6088 + }, + { + "epoch": 0.05285544396316004, + "grad_norm": 0.2734375, + "learning_rate": 0.001987290325660195, + "loss": 0.1396, + "step": 6089 + }, + { + "epoch": 0.052864124443364205, + "grad_norm": 0.09130859375, + "learning_rate": 0.001987285341025871, + "loss": 0.1143, + "step": 6090 + }, + { + "epoch": 0.05287280492356837, + "grad_norm": 0.255859375, + "learning_rate": 0.001987280355421224, + "loss": 0.1826, + "step": 6091 + }, + { + "epoch": 0.052881485403772535, + "grad_norm": 0.224609375, + "learning_rate": 0.001987275368846261, + "loss": 0.1562, + "step": 6092 + }, + { + "epoch": 0.0528901658839767, + "grad_norm": 0.796875, + "learning_rate": 0.001987270381300987, + "loss": 0.1641, + "step": 6093 + }, + { + "epoch": 0.052898846364180865, + "grad_norm": 0.26171875, + "learning_rate": 0.0019872653927854073, + "loss": 0.1699, + "step": 6094 + }, + { + "epoch": 0.05290752684438503, + "grad_norm": 0.0986328125, + "learning_rate": 0.0019872604032995273, + "loss": 0.1377, + "step": 6095 + }, + { + "epoch": 0.052916207324589196, + "grad_norm": 0.5859375, + "learning_rate": 0.001987255412843353, + "loss": 0.127, + "step": 6096 + }, + { + "epoch": 0.05292488780479336, + "grad_norm": 0.1865234375, + "learning_rate": 0.0019872504214168895, + "loss": 0.1172, + "step": 6097 + }, + { + "epoch": 0.052933568284997526, + "grad_norm": 0.396484375, + "learning_rate": 0.0019872454290201422, + "loss": 0.1621, + "step": 6098 + }, + { + "epoch": 0.05294224876520169, + "grad_norm": 0.1884765625, + "learning_rate": 0.001987240435653117, + "loss": 0.1152, + "step": 6099 + }, + { + "epoch": 0.052950929245405856, + "grad_norm": 0.15234375, + "learning_rate": 0.0019872354413158185, + "loss": 0.2197, + "step": 6100 + }, + { + "epoch": 0.05295960972561002, + "grad_norm": 0.515625, + "learning_rate": 0.0019872304460082525, + "loss": 0.1533, + "step": 6101 + }, + { + "epoch": 0.052968290205814186, + "grad_norm": 0.1298828125, + "learning_rate": 0.0019872254497304252, + "loss": 0.1465, + "step": 6102 + }, + { + "epoch": 0.05297697068601835, + "grad_norm": 1.28125, + "learning_rate": 0.001987220452482341, + "loss": 0.2383, + "step": 6103 + }, + { + "epoch": 0.05298565116622252, + "grad_norm": 0.083984375, + "learning_rate": 0.0019872154542640063, + "loss": 0.1318, + "step": 6104 + }, + { + "epoch": 0.05299433164642668, + "grad_norm": 0.349609375, + "learning_rate": 0.001987210455075426, + "loss": 0.1523, + "step": 6105 + }, + { + "epoch": 0.05300301212663085, + "grad_norm": 0.431640625, + "learning_rate": 0.0019872054549166054, + "loss": 0.1465, + "step": 6106 + }, + 
{ + "epoch": 0.05301169260683501, + "grad_norm": 0.26953125, + "learning_rate": 0.0019872004537875502, + "loss": 0.1533, + "step": 6107 + }, + { + "epoch": 0.05302037308703918, + "grad_norm": 0.58203125, + "learning_rate": 0.001987195451688266, + "loss": 0.1357, + "step": 6108 + }, + { + "epoch": 0.05302905356724334, + "grad_norm": 0.099609375, + "learning_rate": 0.0019871904486187583, + "loss": 0.1709, + "step": 6109 + }, + { + "epoch": 0.05303773404744751, + "grad_norm": 0.921875, + "learning_rate": 0.0019871854445790327, + "loss": 0.166, + "step": 6110 + }, + { + "epoch": 0.05304641452765167, + "grad_norm": 0.103515625, + "learning_rate": 0.001987180439569094, + "loss": 0.1099, + "step": 6111 + }, + { + "epoch": 0.05305509500785584, + "grad_norm": 0.73828125, + "learning_rate": 0.001987175433588948, + "loss": 0.1016, + "step": 6112 + }, + { + "epoch": 0.05306377548806, + "grad_norm": 0.373046875, + "learning_rate": 0.0019871704266386007, + "loss": 0.1582, + "step": 6113 + }, + { + "epoch": 0.05307245596826417, + "grad_norm": 0.44921875, + "learning_rate": 0.001987165418718057, + "loss": 0.2266, + "step": 6114 + }, + { + "epoch": 0.053081136448468326, + "grad_norm": 0.12890625, + "learning_rate": 0.0019871604098273222, + "loss": 0.1416, + "step": 6115 + }, + { + "epoch": 0.05308981692867249, + "grad_norm": 0.265625, + "learning_rate": 0.001987155399966403, + "loss": 0.125, + "step": 6116 + }, + { + "epoch": 0.053098497408876656, + "grad_norm": 0.640625, + "learning_rate": 0.001987150389135303, + "loss": 0.1406, + "step": 6117 + }, + { + "epoch": 0.05310717788908082, + "grad_norm": 0.2412109375, + "learning_rate": 0.001987145377334029, + "loss": 0.1738, + "step": 6118 + }, + { + "epoch": 0.053115858369284986, + "grad_norm": 3.203125, + "learning_rate": 0.0019871403645625864, + "loss": 0.6641, + "step": 6119 + }, + { + "epoch": 0.05312453884948915, + "grad_norm": 0.283203125, + "learning_rate": 0.00198713535082098, + "loss": 0.1406, + "step": 6120 + }, + { + "epoch": 0.05313321932969332, + "grad_norm": 3.703125, + "learning_rate": 0.0019871303361092157, + "loss": 0.1406, + "step": 6121 + }, + { + "epoch": 0.05314189980989748, + "grad_norm": 0.59765625, + "learning_rate": 0.0019871253204272995, + "loss": 0.2637, + "step": 6122 + }, + { + "epoch": 0.05315058029010165, + "grad_norm": 0.66015625, + "learning_rate": 0.001987120303775236, + "loss": 0.1279, + "step": 6123 + }, + { + "epoch": 0.05315926077030581, + "grad_norm": 0.1181640625, + "learning_rate": 0.001987115286153031, + "loss": 0.1455, + "step": 6124 + }, + { + "epoch": 0.05316794125050998, + "grad_norm": 0.1259765625, + "learning_rate": 0.0019871102675606904, + "loss": 0.1582, + "step": 6125 + }, + { + "epoch": 0.05317662173071414, + "grad_norm": 0.765625, + "learning_rate": 0.001987105247998219, + "loss": 0.1289, + "step": 6126 + }, + { + "epoch": 0.05318530221091831, + "grad_norm": 0.7734375, + "learning_rate": 0.001987100227465623, + "loss": 0.1738, + "step": 6127 + }, + { + "epoch": 0.05319398269112247, + "grad_norm": 0.09619140625, + "learning_rate": 0.0019870952059629072, + "loss": 0.126, + "step": 6128 + }, + { + "epoch": 0.05320266317132664, + "grad_norm": 0.201171875, + "learning_rate": 0.0019870901834900777, + "loss": 0.1318, + "step": 6129 + }, + { + "epoch": 0.0532113436515308, + "grad_norm": 0.1220703125, + "learning_rate": 0.001987085160047139, + "loss": 0.1191, + "step": 6130 + }, + { + "epoch": 0.05322002413173497, + "grad_norm": 0.2109375, + "learning_rate": 0.001987080135634098, + "loss": 0.2021, + "step": 6131 + 
}, + { + "epoch": 0.05322870461193913, + "grad_norm": 0.10107421875, + "learning_rate": 0.00198707511025096, + "loss": 0.1514, + "step": 6132 + }, + { + "epoch": 0.0532373850921433, + "grad_norm": 0.51171875, + "learning_rate": 0.001987070083897729, + "loss": 0.1318, + "step": 6133 + }, + { + "epoch": 0.05324606557234746, + "grad_norm": 0.83984375, + "learning_rate": 0.0019870650565744115, + "loss": 0.1631, + "step": 6134 + }, + { + "epoch": 0.05325474605255163, + "grad_norm": 0.578125, + "learning_rate": 0.0019870600282810135, + "loss": 0.1113, + "step": 6135 + }, + { + "epoch": 0.053263426532755793, + "grad_norm": 0.40625, + "learning_rate": 0.0019870549990175395, + "loss": 0.2275, + "step": 6136 + }, + { + "epoch": 0.05327210701295996, + "grad_norm": 0.0869140625, + "learning_rate": 0.001987049968783996, + "loss": 0.1006, + "step": 6137 + }, + { + "epoch": 0.053280787493164124, + "grad_norm": 0.10400390625, + "learning_rate": 0.0019870449375803878, + "loss": 0.1055, + "step": 6138 + }, + { + "epoch": 0.05328946797336829, + "grad_norm": 0.2119140625, + "learning_rate": 0.0019870399054067204, + "loss": 0.1475, + "step": 6139 + }, + { + "epoch": 0.053298148453572454, + "grad_norm": 0.4453125, + "learning_rate": 0.001987034872262999, + "loss": 0.1348, + "step": 6140 + }, + { + "epoch": 0.05330682893377662, + "grad_norm": 0.30859375, + "learning_rate": 0.0019870298381492305, + "loss": 0.125, + "step": 6141 + }, + { + "epoch": 0.053315509413980784, + "grad_norm": 0.1806640625, + "learning_rate": 0.0019870248030654193, + "loss": 0.166, + "step": 6142 + }, + { + "epoch": 0.05332418989418495, + "grad_norm": 0.2265625, + "learning_rate": 0.001987019767011571, + "loss": 0.1533, + "step": 6143 + }, + { + "epoch": 0.053332870374389114, + "grad_norm": 0.4921875, + "learning_rate": 0.0019870147299876914, + "loss": 0.1406, + "step": 6144 + }, + { + "epoch": 0.05334155085459328, + "grad_norm": 0.197265625, + "learning_rate": 0.0019870096919937853, + "loss": 0.1226, + "step": 6145 + }, + { + "epoch": 0.05335023133479744, + "grad_norm": 0.52734375, + "learning_rate": 0.001987004653029859, + "loss": 0.1157, + "step": 6146 + }, + { + "epoch": 0.0533589118150016, + "grad_norm": 0.6328125, + "learning_rate": 0.001986999613095918, + "loss": 0.1504, + "step": 6147 + }, + { + "epoch": 0.05336759229520577, + "grad_norm": 0.41015625, + "learning_rate": 0.001986994572191967, + "loss": 0.1465, + "step": 6148 + }, + { + "epoch": 0.05337627277540993, + "grad_norm": 0.4140625, + "learning_rate": 0.0019869895303180124, + "loss": 0.1367, + "step": 6149 + }, + { + "epoch": 0.0533849532556141, + "grad_norm": 0.68359375, + "learning_rate": 0.0019869844874740596, + "loss": 0.1572, + "step": 6150 + }, + { + "epoch": 0.05339363373581826, + "grad_norm": 0.3828125, + "learning_rate": 0.0019869794436601137, + "loss": 0.1465, + "step": 6151 + }, + { + "epoch": 0.05340231421602243, + "grad_norm": 0.263671875, + "learning_rate": 0.00198697439887618, + "loss": 0.1611, + "step": 6152 + }, + { + "epoch": 0.053410994696226594, + "grad_norm": 0.07666015625, + "learning_rate": 0.001986969353122265, + "loss": 0.1001, + "step": 6153 + }, + { + "epoch": 0.05341967517643076, + "grad_norm": 0.64453125, + "learning_rate": 0.001986964306398373, + "loss": 0.1465, + "step": 6154 + }, + { + "epoch": 0.053428355656634924, + "grad_norm": 0.0947265625, + "learning_rate": 0.0019869592587045113, + "loss": 0.1543, + "step": 6155 + }, + { + "epoch": 0.05343703613683909, + "grad_norm": 0.244140625, + "learning_rate": 0.0019869542100406836, + "loss": 
0.1543, + "step": 6156 + }, + { + "epoch": 0.053445716617043254, + "grad_norm": 0.1103515625, + "learning_rate": 0.0019869491604068957, + "loss": 0.1406, + "step": 6157 + }, + { + "epoch": 0.05345439709724742, + "grad_norm": 0.076171875, + "learning_rate": 0.0019869441098031536, + "loss": 0.127, + "step": 6158 + }, + { + "epoch": 0.053463077577451584, + "grad_norm": 0.546875, + "learning_rate": 0.0019869390582294634, + "loss": 0.1143, + "step": 6159 + }, + { + "epoch": 0.05347175805765575, + "grad_norm": 0.1142578125, + "learning_rate": 0.0019869340056858295, + "loss": 0.1572, + "step": 6160 + }, + { + "epoch": 0.053480438537859915, + "grad_norm": 0.36328125, + "learning_rate": 0.0019869289521722584, + "loss": 0.123, + "step": 6161 + }, + { + "epoch": 0.05348911901806408, + "grad_norm": 0.212890625, + "learning_rate": 0.0019869238976887543, + "loss": 0.1377, + "step": 6162 + }, + { + "epoch": 0.053497799498268245, + "grad_norm": 1.2109375, + "learning_rate": 0.0019869188422353243, + "loss": 0.1318, + "step": 6163 + }, + { + "epoch": 0.05350647997847241, + "grad_norm": 0.73046875, + "learning_rate": 0.0019869137858119727, + "loss": 0.1182, + "step": 6164 + }, + { + "epoch": 0.053515160458676575, + "grad_norm": 0.23046875, + "learning_rate": 0.0019869087284187055, + "loss": 0.1543, + "step": 6165 + }, + { + "epoch": 0.05352384093888074, + "grad_norm": 0.609375, + "learning_rate": 0.0019869036700555284, + "loss": 0.1582, + "step": 6166 + }, + { + "epoch": 0.053532521419084905, + "grad_norm": 0.2236328125, + "learning_rate": 0.001986898610722447, + "loss": 0.1426, + "step": 6167 + }, + { + "epoch": 0.05354120189928907, + "grad_norm": 0.25, + "learning_rate": 0.001986893550419466, + "loss": 0.1064, + "step": 6168 + }, + { + "epoch": 0.053549882379493235, + "grad_norm": 0.39453125, + "learning_rate": 0.001986888489146592, + "loss": 0.1064, + "step": 6169 + }, + { + "epoch": 0.0535585628596974, + "grad_norm": 0.2451171875, + "learning_rate": 0.0019868834269038305, + "loss": 0.1523, + "step": 6170 + }, + { + "epoch": 0.053567243339901566, + "grad_norm": 0.62890625, + "learning_rate": 0.001986878363691186, + "loss": 0.1768, + "step": 6171 + }, + { + "epoch": 0.05357592382010573, + "grad_norm": 0.1708984375, + "learning_rate": 0.0019868732995086645, + "loss": 0.1406, + "step": 6172 + }, + { + "epoch": 0.053584604300309896, + "grad_norm": 0.58203125, + "learning_rate": 0.001986868234356272, + "loss": 0.1348, + "step": 6173 + }, + { + "epoch": 0.05359328478051406, + "grad_norm": 0.453125, + "learning_rate": 0.001986863168234014, + "loss": 0.1416, + "step": 6174 + }, + { + "epoch": 0.053601965260718226, + "grad_norm": 0.244140625, + "learning_rate": 0.001986858101141895, + "loss": 0.1235, + "step": 6175 + }, + { + "epoch": 0.053610645740922384, + "grad_norm": 0.462890625, + "learning_rate": 0.001986853033079922, + "loss": 0.1719, + "step": 6176 + }, + { + "epoch": 0.05361932622112655, + "grad_norm": 0.2578125, + "learning_rate": 0.0019868479640480995, + "loss": 0.1416, + "step": 6177 + }, + { + "epoch": 0.053628006701330715, + "grad_norm": 0.302734375, + "learning_rate": 0.001986842894046433, + "loss": 0.125, + "step": 6178 + }, + { + "epoch": 0.05363668718153488, + "grad_norm": 0.1240234375, + "learning_rate": 0.0019868378230749294, + "loss": 0.1689, + "step": 6179 + }, + { + "epoch": 0.053645367661739045, + "grad_norm": 0.1748046875, + "learning_rate": 0.001986832751133593, + "loss": 0.1504, + "step": 6180 + }, + { + "epoch": 0.05365404814194321, + "grad_norm": 0.166015625, + "learning_rate": 
0.001986827678222429, + "loss": 0.1797, + "step": 6181 + }, + { + "epoch": 0.053662728622147375, + "grad_norm": 0.08837890625, + "learning_rate": 0.001986822604341444, + "loss": 0.1494, + "step": 6182 + }, + { + "epoch": 0.05367140910235154, + "grad_norm": 0.1884765625, + "learning_rate": 0.0019868175294906433, + "loss": 0.1172, + "step": 6183 + }, + { + "epoch": 0.053680089582555705, + "grad_norm": 0.92578125, + "learning_rate": 0.0019868124536700322, + "loss": 0.2168, + "step": 6184 + }, + { + "epoch": 0.05368877006275987, + "grad_norm": 0.14453125, + "learning_rate": 0.001986807376879616, + "loss": 0.1631, + "step": 6185 + }, + { + "epoch": 0.053697450542964036, + "grad_norm": 0.1083984375, + "learning_rate": 0.001986802299119401, + "loss": 0.1221, + "step": 6186 + }, + { + "epoch": 0.0537061310231682, + "grad_norm": 0.08251953125, + "learning_rate": 0.0019867972203893923, + "loss": 0.1289, + "step": 6187 + }, + { + "epoch": 0.053714811503372366, + "grad_norm": 0.201171875, + "learning_rate": 0.001986792140689595, + "loss": 0.1387, + "step": 6188 + }, + { + "epoch": 0.05372349198357653, + "grad_norm": 0.08251953125, + "learning_rate": 0.0019867870600200157, + "loss": 0.1245, + "step": 6189 + }, + { + "epoch": 0.053732172463780696, + "grad_norm": 0.515625, + "learning_rate": 0.001986781978380659, + "loss": 0.1553, + "step": 6190 + }, + { + "epoch": 0.05374085294398486, + "grad_norm": 0.52734375, + "learning_rate": 0.001986776895771531, + "loss": 0.123, + "step": 6191 + }, + { + "epoch": 0.053749533424189026, + "grad_norm": 0.63671875, + "learning_rate": 0.001986771812192637, + "loss": 0.1211, + "step": 6192 + }, + { + "epoch": 0.05375821390439319, + "grad_norm": 0.08544921875, + "learning_rate": 0.001986766727643983, + "loss": 0.166, + "step": 6193 + }, + { + "epoch": 0.05376689438459736, + "grad_norm": 0.5390625, + "learning_rate": 0.0019867616421255737, + "loss": 0.1855, + "step": 6194 + }, + { + "epoch": 0.05377557486480152, + "grad_norm": 0.390625, + "learning_rate": 0.0019867565556374155, + "loss": 0.1396, + "step": 6195 + }, + { + "epoch": 0.05378425534500569, + "grad_norm": 1.5234375, + "learning_rate": 0.001986751468179514, + "loss": 0.1582, + "step": 6196 + }, + { + "epoch": 0.05379293582520985, + "grad_norm": 0.11181640625, + "learning_rate": 0.0019867463797518738, + "loss": 0.1143, + "step": 6197 + }, + { + "epoch": 0.05380161630541402, + "grad_norm": 0.09375, + "learning_rate": 0.0019867412903545015, + "loss": 0.1245, + "step": 6198 + }, + { + "epoch": 0.05381029678561818, + "grad_norm": 0.1591796875, + "learning_rate": 0.001986736199987402, + "loss": 0.1387, + "step": 6199 + }, + { + "epoch": 0.05381897726582235, + "grad_norm": 0.30078125, + "learning_rate": 0.0019867311086505814, + "loss": 0.1396, + "step": 6200 + }, + { + "epoch": 0.05382765774602651, + "grad_norm": 0.40234375, + "learning_rate": 0.001986726016344045, + "loss": 0.1021, + "step": 6201 + }, + { + "epoch": 0.05383633822623068, + "grad_norm": 0.08154296875, + "learning_rate": 0.001986720923067798, + "loss": 0.1221, + "step": 6202 + }, + { + "epoch": 0.05384501870643484, + "grad_norm": 0.30078125, + "learning_rate": 0.0019867158288218462, + "loss": 0.1777, + "step": 6203 + }, + { + "epoch": 0.05385369918663901, + "grad_norm": 0.1943359375, + "learning_rate": 0.001986710733606196, + "loss": 0.1436, + "step": 6204 + }, + { + "epoch": 0.05386237966684317, + "grad_norm": 0.08544921875, + "learning_rate": 0.0019867056374208514, + "loss": 0.1445, + "step": 6205 + }, + { + "epoch": 0.05387106014704734, + "grad_norm": 
0.33984375, + "learning_rate": 0.001986700540265819, + "loss": 0.1348, + "step": 6206 + }, + { + "epoch": 0.053879740627251496, + "grad_norm": 0.376953125, + "learning_rate": 0.001986695442141105, + "loss": 0.1602, + "step": 6207 + }, + { + "epoch": 0.05388842110745566, + "grad_norm": 0.5078125, + "learning_rate": 0.001986690343046713, + "loss": 0.1553, + "step": 6208 + }, + { + "epoch": 0.053897101587659826, + "grad_norm": 0.2578125, + "learning_rate": 0.001986685242982651, + "loss": 0.1924, + "step": 6209 + }, + { + "epoch": 0.05390578206786399, + "grad_norm": 0.52734375, + "learning_rate": 0.0019866801419489225, + "loss": 0.127, + "step": 6210 + }, + { + "epoch": 0.05391446254806816, + "grad_norm": 0.083984375, + "learning_rate": 0.0019866750399455345, + "loss": 0.1328, + "step": 6211 + }, + { + "epoch": 0.05392314302827232, + "grad_norm": 0.2578125, + "learning_rate": 0.0019866699369724913, + "loss": 0.1562, + "step": 6212 + }, + { + "epoch": 0.05393182350847649, + "grad_norm": 0.181640625, + "learning_rate": 0.0019866648330297996, + "loss": 0.1787, + "step": 6213 + }, + { + "epoch": 0.05394050398868065, + "grad_norm": 0.400390625, + "learning_rate": 0.001986659728117465, + "loss": 0.1836, + "step": 6214 + }, + { + "epoch": 0.05394918446888482, + "grad_norm": 0.1396484375, + "learning_rate": 0.001986654622235492, + "loss": 0.1533, + "step": 6215 + }, + { + "epoch": 0.05395786494908898, + "grad_norm": 0.451171875, + "learning_rate": 0.0019866495153838867, + "loss": 0.127, + "step": 6216 + }, + { + "epoch": 0.05396654542929315, + "grad_norm": 0.255859375, + "learning_rate": 0.001986644407562655, + "loss": 0.1533, + "step": 6217 + }, + { + "epoch": 0.05397522590949731, + "grad_norm": 0.30859375, + "learning_rate": 0.0019866392987718026, + "loss": 0.165, + "step": 6218 + }, + { + "epoch": 0.05398390638970148, + "grad_norm": 0.201171875, + "learning_rate": 0.0019866341890113345, + "loss": 0.1123, + "step": 6219 + }, + { + "epoch": 0.05399258686990564, + "grad_norm": 0.64453125, + "learning_rate": 0.001986629078281257, + "loss": 0.1641, + "step": 6220 + }, + { + "epoch": 0.05400126735010981, + "grad_norm": 0.3828125, + "learning_rate": 0.0019866239665815746, + "loss": 0.1719, + "step": 6221 + }, + { + "epoch": 0.05400994783031397, + "grad_norm": 0.2431640625, + "learning_rate": 0.001986618853912294, + "loss": 0.1367, + "step": 6222 + }, + { + "epoch": 0.05401862831051814, + "grad_norm": 0.48828125, + "learning_rate": 0.00198661374027342, + "loss": 0.1108, + "step": 6223 + }, + { + "epoch": 0.0540273087907223, + "grad_norm": 0.1201171875, + "learning_rate": 0.001986608625664959, + "loss": 0.1289, + "step": 6224 + }, + { + "epoch": 0.05403598927092647, + "grad_norm": 0.14453125, + "learning_rate": 0.0019866035100869163, + "loss": 0.1279, + "step": 6225 + }, + { + "epoch": 0.05404466975113063, + "grad_norm": 0.412109375, + "learning_rate": 0.0019865983935392966, + "loss": 0.208, + "step": 6226 + }, + { + "epoch": 0.0540533502313348, + "grad_norm": 0.12353515625, + "learning_rate": 0.0019865932760221067, + "loss": 0.1289, + "step": 6227 + }, + { + "epoch": 0.054062030711538964, + "grad_norm": 0.275390625, + "learning_rate": 0.001986588157535351, + "loss": 0.1279, + "step": 6228 + }, + { + "epoch": 0.05407071119174313, + "grad_norm": 0.107421875, + "learning_rate": 0.001986583038079037, + "loss": 0.127, + "step": 6229 + }, + { + "epoch": 0.054079391671947294, + "grad_norm": 0.2333984375, + "learning_rate": 0.001986577917653168, + "loss": 0.1299, + "step": 6230 + }, + { + "epoch": 
0.05408807215215146, + "grad_norm": 0.333984375, + "learning_rate": 0.0019865727962577514, + "loss": 0.1543, + "step": 6231 + }, + { + "epoch": 0.054096752632355624, + "grad_norm": 0.296875, + "learning_rate": 0.001986567673892792, + "loss": 0.1484, + "step": 6232 + }, + { + "epoch": 0.05410543311255979, + "grad_norm": 0.4921875, + "learning_rate": 0.0019865625505582954, + "loss": 0.1416, + "step": 6233 + }, + { + "epoch": 0.054114113592763954, + "grad_norm": 1.0234375, + "learning_rate": 0.001986557426254267, + "loss": 0.1406, + "step": 6234 + }, + { + "epoch": 0.05412279407296812, + "grad_norm": 0.69921875, + "learning_rate": 0.001986552300980713, + "loss": 0.1787, + "step": 6235 + }, + { + "epoch": 0.054131474553172285, + "grad_norm": 0.1904296875, + "learning_rate": 0.001986547174737639, + "loss": 0.1777, + "step": 6236 + }, + { + "epoch": 0.05414015503337645, + "grad_norm": 0.6328125, + "learning_rate": 0.0019865420475250503, + "loss": 0.1768, + "step": 6237 + }, + { + "epoch": 0.05414883551358061, + "grad_norm": 0.13671875, + "learning_rate": 0.0019865369193429517, + "loss": 0.1602, + "step": 6238 + }, + { + "epoch": 0.05415751599378477, + "grad_norm": 0.20703125, + "learning_rate": 0.0019865317901913504, + "loss": 0.123, + "step": 6239 + }, + { + "epoch": 0.05416619647398894, + "grad_norm": 0.322265625, + "learning_rate": 0.0019865266600702512, + "loss": 0.1602, + "step": 6240 + }, + { + "epoch": 0.0541748769541931, + "grad_norm": 0.333984375, + "learning_rate": 0.0019865215289796598, + "loss": 0.1445, + "step": 6241 + }, + { + "epoch": 0.05418355743439727, + "grad_norm": 0.50390625, + "learning_rate": 0.0019865163969195816, + "loss": 0.127, + "step": 6242 + }, + { + "epoch": 0.054192237914601434, + "grad_norm": 0.1591796875, + "learning_rate": 0.0019865112638900225, + "loss": 0.1445, + "step": 6243 + }, + { + "epoch": 0.0542009183948056, + "grad_norm": 0.162109375, + "learning_rate": 0.001986506129890988, + "loss": 0.1279, + "step": 6244 + }, + { + "epoch": 0.054209598875009764, + "grad_norm": 0.94140625, + "learning_rate": 0.0019865009949224834, + "loss": 0.1094, + "step": 6245 + }, + { + "epoch": 0.05421827935521393, + "grad_norm": 0.11865234375, + "learning_rate": 0.001986495858984515, + "loss": 0.1455, + "step": 6246 + }, + { + "epoch": 0.054226959835418094, + "grad_norm": 0.6171875, + "learning_rate": 0.001986490722077088, + "loss": 0.1465, + "step": 6247 + }, + { + "epoch": 0.05423564031562226, + "grad_norm": 0.0908203125, + "learning_rate": 0.0019864855842002082, + "loss": 0.1387, + "step": 6248 + }, + { + "epoch": 0.054244320795826424, + "grad_norm": 0.0888671875, + "learning_rate": 0.0019864804453538806, + "loss": 0.1436, + "step": 6249 + }, + { + "epoch": 0.05425300127603059, + "grad_norm": 0.11962890625, + "learning_rate": 0.001986475305538112, + "loss": 0.1631, + "step": 6250 + }, + { + "epoch": 0.054261681756234754, + "grad_norm": 0.404296875, + "learning_rate": 0.001986470164752907, + "loss": 0.1279, + "step": 6251 + }, + { + "epoch": 0.05427036223643892, + "grad_norm": 0.451171875, + "learning_rate": 0.001986465022998271, + "loss": 0.1484, + "step": 6252 + }, + { + "epoch": 0.054279042716643085, + "grad_norm": 0.373046875, + "learning_rate": 0.001986459880274211, + "loss": 0.1865, + "step": 6253 + }, + { + "epoch": 0.05428772319684725, + "grad_norm": 0.31640625, + "learning_rate": 0.001986454736580731, + "loss": 0.1543, + "step": 6254 + }, + { + "epoch": 0.054296403677051415, + "grad_norm": 0.11376953125, + "learning_rate": 0.001986449591917838, + "loss": 0.1611, + 
"step": 6255 + }, + { + "epoch": 0.05430508415725558, + "grad_norm": 0.095703125, + "learning_rate": 0.0019864444462855367, + "loss": 0.1387, + "step": 6256 + }, + { + "epoch": 0.054313764637459745, + "grad_norm": 0.427734375, + "learning_rate": 0.0019864392996838333, + "loss": 0.1494, + "step": 6257 + }, + { + "epoch": 0.05432244511766391, + "grad_norm": 0.396484375, + "learning_rate": 0.0019864341521127335, + "loss": 0.1572, + "step": 6258 + }, + { + "epoch": 0.054331125597868075, + "grad_norm": 0.58984375, + "learning_rate": 0.001986429003572242, + "loss": 0.1768, + "step": 6259 + }, + { + "epoch": 0.05433980607807224, + "grad_norm": 0.140625, + "learning_rate": 0.0019864238540623653, + "loss": 0.1797, + "step": 6260 + }, + { + "epoch": 0.054348486558276406, + "grad_norm": 0.11279296875, + "learning_rate": 0.001986418703583109, + "loss": 0.1172, + "step": 6261 + }, + { + "epoch": 0.05435716703848057, + "grad_norm": 0.55859375, + "learning_rate": 0.0019864135521344786, + "loss": 0.1074, + "step": 6262 + }, + { + "epoch": 0.054365847518684736, + "grad_norm": 0.298828125, + "learning_rate": 0.0019864083997164794, + "loss": 0.209, + "step": 6263 + }, + { + "epoch": 0.0543745279988889, + "grad_norm": 0.294921875, + "learning_rate": 0.001986403246329117, + "loss": 0.1455, + "step": 6264 + }, + { + "epoch": 0.054383208479093066, + "grad_norm": 0.10205078125, + "learning_rate": 0.001986398091972398, + "loss": 0.1924, + "step": 6265 + }, + { + "epoch": 0.05439188895929723, + "grad_norm": 0.1240234375, + "learning_rate": 0.001986392936646327, + "loss": 0.0977, + "step": 6266 + }, + { + "epoch": 0.054400569439501396, + "grad_norm": 0.09765625, + "learning_rate": 0.00198638778035091, + "loss": 0.1719, + "step": 6267 + }, + { + "epoch": 0.05440924991970556, + "grad_norm": 1.1953125, + "learning_rate": 0.0019863826230861526, + "loss": 0.1562, + "step": 6268 + }, + { + "epoch": 0.05441793039990972, + "grad_norm": 0.056884765625, + "learning_rate": 0.00198637746485206, + "loss": 0.123, + "step": 6269 + }, + { + "epoch": 0.054426610880113885, + "grad_norm": 0.41796875, + "learning_rate": 0.0019863723056486398, + "loss": 0.1436, + "step": 6270 + }, + { + "epoch": 0.05443529136031805, + "grad_norm": 0.2431640625, + "learning_rate": 0.0019863671454758945, + "loss": 0.1582, + "step": 6271 + }, + { + "epoch": 0.054443971840522215, + "grad_norm": 0.07568359375, + "learning_rate": 0.0019863619843338326, + "loss": 0.1108, + "step": 6272 + }, + { + "epoch": 0.05445265232072638, + "grad_norm": 0.2109375, + "learning_rate": 0.001986356822222458, + "loss": 0.104, + "step": 6273 + }, + { + "epoch": 0.054461332800930545, + "grad_norm": 0.3671875, + "learning_rate": 0.001986351659141777, + "loss": 0.1533, + "step": 6274 + }, + { + "epoch": 0.05447001328113471, + "grad_norm": 0.1298828125, + "learning_rate": 0.001986346495091795, + "loss": 0.0986, + "step": 6275 + }, + { + "epoch": 0.054478693761338876, + "grad_norm": 0.2373046875, + "learning_rate": 0.001986341330072518, + "loss": 0.1089, + "step": 6276 + }, + { + "epoch": 0.05448737424154304, + "grad_norm": 0.1865234375, + "learning_rate": 0.0019863361640839513, + "loss": 0.1426, + "step": 6277 + }, + { + "epoch": 0.054496054721747206, + "grad_norm": 0.2578125, + "learning_rate": 0.001986330997126101, + "loss": 0.1025, + "step": 6278 + }, + { + "epoch": 0.05450473520195137, + "grad_norm": 0.1337890625, + "learning_rate": 0.001986325829198972, + "loss": 0.1445, + "step": 6279 + }, + { + "epoch": 0.054513415682155536, + "grad_norm": 0.1015625, + "learning_rate": 
0.0019863206603025706, + "loss": 0.1367, + "step": 6280 + }, + { + "epoch": 0.0545220961623597, + "grad_norm": 0.45703125, + "learning_rate": 0.0019863154904369022, + "loss": 0.1787, + "step": 6281 + }, + { + "epoch": 0.054530776642563866, + "grad_norm": 0.26953125, + "learning_rate": 0.0019863103196019723, + "loss": 0.1406, + "step": 6282 + }, + { + "epoch": 0.05453945712276803, + "grad_norm": 0.33984375, + "learning_rate": 0.0019863051477977873, + "loss": 0.126, + "step": 6283 + }, + { + "epoch": 0.054548137602972196, + "grad_norm": 0.466796875, + "learning_rate": 0.0019862999750243516, + "loss": 0.124, + "step": 6284 + }, + { + "epoch": 0.05455681808317636, + "grad_norm": 0.5625, + "learning_rate": 0.001986294801281672, + "loss": 0.1484, + "step": 6285 + }, + { + "epoch": 0.05456549856338053, + "grad_norm": 0.359375, + "learning_rate": 0.001986289626569754, + "loss": 0.1279, + "step": 6286 + }, + { + "epoch": 0.05457417904358469, + "grad_norm": 0.52734375, + "learning_rate": 0.0019862844508886025, + "loss": 0.1592, + "step": 6287 + }, + { + "epoch": 0.05458285952378886, + "grad_norm": 0.1875, + "learning_rate": 0.001986279274238224, + "loss": 0.1143, + "step": 6288 + }, + { + "epoch": 0.05459154000399302, + "grad_norm": 0.10400390625, + "learning_rate": 0.0019862740966186234, + "loss": 0.124, + "step": 6289 + }, + { + "epoch": 0.05460022048419719, + "grad_norm": 0.1572265625, + "learning_rate": 0.001986268918029807, + "loss": 0.1338, + "step": 6290 + }, + { + "epoch": 0.05460890096440135, + "grad_norm": 0.2578125, + "learning_rate": 0.00198626373847178, + "loss": 0.1221, + "step": 6291 + }, + { + "epoch": 0.05461758144460552, + "grad_norm": 0.3359375, + "learning_rate": 0.0019862585579445485, + "loss": 0.1592, + "step": 6292 + }, + { + "epoch": 0.05462626192480968, + "grad_norm": 0.1796875, + "learning_rate": 0.001986253376448118, + "loss": 0.1118, + "step": 6293 + }, + { + "epoch": 0.05463494240501385, + "grad_norm": 0.365234375, + "learning_rate": 0.001986248193982494, + "loss": 0.1523, + "step": 6294 + }, + { + "epoch": 0.05464362288521801, + "grad_norm": 0.1220703125, + "learning_rate": 0.001986243010547682, + "loss": 0.1748, + "step": 6295 + }, + { + "epoch": 0.05465230336542218, + "grad_norm": 0.53125, + "learning_rate": 0.0019862378261436883, + "loss": 0.1318, + "step": 6296 + }, + { + "epoch": 0.05466098384562634, + "grad_norm": 0.51953125, + "learning_rate": 0.0019862326407705183, + "loss": 0.1289, + "step": 6297 + }, + { + "epoch": 0.05466966432583051, + "grad_norm": 0.59765625, + "learning_rate": 0.0019862274544281775, + "loss": 0.1089, + "step": 6298 + }, + { + "epoch": 0.05467834480603467, + "grad_norm": 0.1650390625, + "learning_rate": 0.001986222267116672, + "loss": 0.1582, + "step": 6299 + }, + { + "epoch": 0.05468702528623883, + "grad_norm": 0.08251953125, + "learning_rate": 0.001986217078836007, + "loss": 0.1738, + "step": 6300 + }, + { + "epoch": 0.054695705766443, + "grad_norm": 0.19921875, + "learning_rate": 0.001986211889586188, + "loss": 0.1465, + "step": 6301 + }, + { + "epoch": 0.05470438624664716, + "grad_norm": 0.08154296875, + "learning_rate": 0.001986206699367221, + "loss": 0.1357, + "step": 6302 + }, + { + "epoch": 0.05471306672685133, + "grad_norm": 0.50390625, + "learning_rate": 0.001986201508179112, + "loss": 0.1016, + "step": 6303 + }, + { + "epoch": 0.05472174720705549, + "grad_norm": 0.130859375, + "learning_rate": 0.0019861963160218658, + "loss": 0.126, + "step": 6304 + }, + { + "epoch": 0.05473042768725966, + "grad_norm": 0.1748046875, + 
"learning_rate": 0.0019861911228954894, + "loss": 0.1221, + "step": 6305 + }, + { + "epoch": 0.05473910816746382, + "grad_norm": 0.1455078125, + "learning_rate": 0.001986185928799987, + "loss": 0.1602, + "step": 6306 + }, + { + "epoch": 0.05474778864766799, + "grad_norm": 0.62109375, + "learning_rate": 0.0019861807337353656, + "loss": 0.1396, + "step": 6307 + }, + { + "epoch": 0.05475646912787215, + "grad_norm": 0.349609375, + "learning_rate": 0.00198617553770163, + "loss": 0.166, + "step": 6308 + }, + { + "epoch": 0.05476514960807632, + "grad_norm": 0.3515625, + "learning_rate": 0.001986170340698786, + "loss": 0.1768, + "step": 6309 + }, + { + "epoch": 0.05477383008828048, + "grad_norm": 0.306640625, + "learning_rate": 0.0019861651427268393, + "loss": 0.1855, + "step": 6310 + }, + { + "epoch": 0.05478251056848465, + "grad_norm": 0.486328125, + "learning_rate": 0.001986159943785796, + "loss": 0.1162, + "step": 6311 + }, + { + "epoch": 0.05479119104868881, + "grad_norm": 0.10498046875, + "learning_rate": 0.0019861547438756615, + "loss": 0.127, + "step": 6312 + }, + { + "epoch": 0.05479987152889298, + "grad_norm": 0.2001953125, + "learning_rate": 0.001986149542996441, + "loss": 0.1885, + "step": 6313 + }, + { + "epoch": 0.05480855200909714, + "grad_norm": 0.1083984375, + "learning_rate": 0.0019861443411481414, + "loss": 0.1787, + "step": 6314 + }, + { + "epoch": 0.05481723248930131, + "grad_norm": 0.103515625, + "learning_rate": 0.001986139138330767, + "loss": 0.1318, + "step": 6315 + }, + { + "epoch": 0.05482591296950547, + "grad_norm": 0.234375, + "learning_rate": 0.001986133934544325, + "loss": 0.166, + "step": 6316 + }, + { + "epoch": 0.05483459344970964, + "grad_norm": 0.357421875, + "learning_rate": 0.00198612872978882, + "loss": 0.1357, + "step": 6317 + }, + { + "epoch": 0.054843273929913804, + "grad_norm": 0.0888671875, + "learning_rate": 0.001986123524064257, + "loss": 0.1299, + "step": 6318 + }, + { + "epoch": 0.05485195441011797, + "grad_norm": 0.19140625, + "learning_rate": 0.0019861183173706435, + "loss": 0.1226, + "step": 6319 + }, + { + "epoch": 0.054860634890322134, + "grad_norm": 0.318359375, + "learning_rate": 0.0019861131097079843, + "loss": 0.1621, + "step": 6320 + }, + { + "epoch": 0.0548693153705263, + "grad_norm": 0.2490234375, + "learning_rate": 0.001986107901076285, + "loss": 0.1475, + "step": 6321 + }, + { + "epoch": 0.054877995850730464, + "grad_norm": 0.1083984375, + "learning_rate": 0.0019861026914755513, + "loss": 0.1816, + "step": 6322 + }, + { + "epoch": 0.05488667633093463, + "grad_norm": 0.1650390625, + "learning_rate": 0.0019860974809057888, + "loss": 0.1465, + "step": 6323 + }, + { + "epoch": 0.054895356811138794, + "grad_norm": 0.193359375, + "learning_rate": 0.001986092269367004, + "loss": 0.1475, + "step": 6324 + }, + { + "epoch": 0.05490403729134296, + "grad_norm": 0.1875, + "learning_rate": 0.0019860870568592015, + "loss": 0.1533, + "step": 6325 + }, + { + "epoch": 0.054912717771547125, + "grad_norm": 0.150390625, + "learning_rate": 0.001986081843382388, + "loss": 0.1533, + "step": 6326 + }, + { + "epoch": 0.05492139825175129, + "grad_norm": 0.1962890625, + "learning_rate": 0.001986076628936568, + "loss": 0.1338, + "step": 6327 + }, + { + "epoch": 0.054930078731955455, + "grad_norm": 0.1142578125, + "learning_rate": 0.0019860714135217484, + "loss": 0.1396, + "step": 6328 + }, + { + "epoch": 0.05493875921215962, + "grad_norm": 0.1728515625, + "learning_rate": 0.0019860661971379343, + "loss": 0.1147, + "step": 6329 + }, + { + "epoch": 
0.05494743969236378, + "grad_norm": 0.189453125, + "learning_rate": 0.0019860609797851315, + "loss": 0.126, + "step": 6330 + }, + { + "epoch": 0.05495612017256794, + "grad_norm": 0.080078125, + "learning_rate": 0.001986055761463346, + "loss": 0.1436, + "step": 6331 + }, + { + "epoch": 0.05496480065277211, + "grad_norm": 0.458984375, + "learning_rate": 0.001986050542172583, + "loss": 0.1309, + "step": 6332 + }, + { + "epoch": 0.054973481132976273, + "grad_norm": 0.271484375, + "learning_rate": 0.0019860453219128485, + "loss": 0.1494, + "step": 6333 + }, + { + "epoch": 0.05498216161318044, + "grad_norm": 0.384765625, + "learning_rate": 0.0019860401006841478, + "loss": 0.1797, + "step": 6334 + }, + { + "epoch": 0.054990842093384604, + "grad_norm": 0.2421875, + "learning_rate": 0.001986034878486488, + "loss": 0.1416, + "step": 6335 + }, + { + "epoch": 0.05499952257358877, + "grad_norm": 0.1328125, + "learning_rate": 0.0019860296553198724, + "loss": 0.1157, + "step": 6336 + }, + { + "epoch": 0.055008203053792934, + "grad_norm": 0.265625, + "learning_rate": 0.0019860244311843086, + "loss": 0.1289, + "step": 6337 + }, + { + "epoch": 0.0550168835339971, + "grad_norm": 0.2099609375, + "learning_rate": 0.001986019206079802, + "loss": 0.1406, + "step": 6338 + }, + { + "epoch": 0.055025564014201264, + "grad_norm": 0.1044921875, + "learning_rate": 0.001986013980006358, + "loss": 0.126, + "step": 6339 + }, + { + "epoch": 0.05503424449440543, + "grad_norm": 0.62109375, + "learning_rate": 0.001986008752963983, + "loss": 0.124, + "step": 6340 + }, + { + "epoch": 0.055042924974609594, + "grad_norm": 0.1533203125, + "learning_rate": 0.0019860035249526812, + "loss": 0.1167, + "step": 6341 + }, + { + "epoch": 0.05505160545481376, + "grad_norm": 0.58203125, + "learning_rate": 0.0019859982959724598, + "loss": 0.1025, + "step": 6342 + }, + { + "epoch": 0.055060285935017925, + "grad_norm": 0.220703125, + "learning_rate": 0.0019859930660233237, + "loss": 0.127, + "step": 6343 + }, + { + "epoch": 0.05506896641522209, + "grad_norm": 0.70703125, + "learning_rate": 0.001985987835105279, + "loss": 0.1172, + "step": 6344 + }, + { + "epoch": 0.055077646895426255, + "grad_norm": 0.1591796875, + "learning_rate": 0.001985982603218331, + "loss": 0.1289, + "step": 6345 + }, + { + "epoch": 0.05508632737563042, + "grad_norm": 0.388671875, + "learning_rate": 0.0019859773703624862, + "loss": 0.1377, + "step": 6346 + }, + { + "epoch": 0.055095007855834585, + "grad_norm": 0.11962890625, + "learning_rate": 0.0019859721365377498, + "loss": 0.1572, + "step": 6347 + }, + { + "epoch": 0.05510368833603875, + "grad_norm": 0.0947265625, + "learning_rate": 0.0019859669017441273, + "loss": 0.126, + "step": 6348 + }, + { + "epoch": 0.055112368816242915, + "grad_norm": 0.173828125, + "learning_rate": 0.001985961665981625, + "loss": 0.1758, + "step": 6349 + }, + { + "epoch": 0.05512104929644708, + "grad_norm": 0.279296875, + "learning_rate": 0.0019859564292502487, + "loss": 0.1514, + "step": 6350 + }, + { + "epoch": 0.055129729776651246, + "grad_norm": 0.5, + "learning_rate": 0.001985951191550003, + "loss": 0.1367, + "step": 6351 + }, + { + "epoch": 0.05513841025685541, + "grad_norm": 0.345703125, + "learning_rate": 0.0019859459528808947, + "loss": 0.1426, + "step": 6352 + }, + { + "epoch": 0.055147090737059576, + "grad_norm": 0.41015625, + "learning_rate": 0.0019859407132429294, + "loss": 0.1396, + "step": 6353 + }, + { + "epoch": 0.05515577121726374, + "grad_norm": 0.52734375, + "learning_rate": 0.0019859354726361124, + "loss": 0.1885, + 
"step": 6354 + }, + { + "epoch": 0.055164451697467906, + "grad_norm": 0.33203125, + "learning_rate": 0.0019859302310604493, + "loss": 0.1348, + "step": 6355 + }, + { + "epoch": 0.05517313217767207, + "grad_norm": 0.5859375, + "learning_rate": 0.001985924988515947, + "loss": 0.2031, + "step": 6356 + }, + { + "epoch": 0.055181812657876236, + "grad_norm": 0.333984375, + "learning_rate": 0.00198591974500261, + "loss": 0.1016, + "step": 6357 + }, + { + "epoch": 0.0551904931380804, + "grad_norm": 0.208984375, + "learning_rate": 0.0019859145005204446, + "loss": 0.1221, + "step": 6358 + }, + { + "epoch": 0.05519917361828457, + "grad_norm": 0.1640625, + "learning_rate": 0.0019859092550694566, + "loss": 0.1021, + "step": 6359 + }, + { + "epoch": 0.05520785409848873, + "grad_norm": 0.8203125, + "learning_rate": 0.001985904008649651, + "loss": 0.1328, + "step": 6360 + }, + { + "epoch": 0.05521653457869289, + "grad_norm": 0.1083984375, + "learning_rate": 0.0019858987612610345, + "loss": 0.1738, + "step": 6361 + }, + { + "epoch": 0.055225215058897055, + "grad_norm": 0.9296875, + "learning_rate": 0.0019858935129036122, + "loss": 0.1172, + "step": 6362 + }, + { + "epoch": 0.05523389553910122, + "grad_norm": 0.54296875, + "learning_rate": 0.0019858882635773903, + "loss": 0.1387, + "step": 6363 + }, + { + "epoch": 0.055242576019305385, + "grad_norm": 0.16796875, + "learning_rate": 0.0019858830132823743, + "loss": 0.2051, + "step": 6364 + }, + { + "epoch": 0.05525125649950955, + "grad_norm": 0.1806640625, + "learning_rate": 0.00198587776201857, + "loss": 0.1152, + "step": 6365 + }, + { + "epoch": 0.055259936979713715, + "grad_norm": 0.09716796875, + "learning_rate": 0.001985872509785983, + "loss": 0.1602, + "step": 6366 + }, + { + "epoch": 0.05526861745991788, + "grad_norm": 0.39453125, + "learning_rate": 0.001985867256584619, + "loss": 0.1621, + "step": 6367 + }, + { + "epoch": 0.055277297940122046, + "grad_norm": 0.07470703125, + "learning_rate": 0.0019858620024144837, + "loss": 0.1328, + "step": 6368 + }, + { + "epoch": 0.05528597842032621, + "grad_norm": 0.220703125, + "learning_rate": 0.0019858567472755832, + "loss": 0.123, + "step": 6369 + }, + { + "epoch": 0.055294658900530376, + "grad_norm": 0.181640625, + "learning_rate": 0.0019858514911679235, + "loss": 0.168, + "step": 6370 + }, + { + "epoch": 0.05530333938073454, + "grad_norm": 0.15625, + "learning_rate": 0.0019858462340915095, + "loss": 0.1416, + "step": 6371 + }, + { + "epoch": 0.055312019860938706, + "grad_norm": 0.0771484375, + "learning_rate": 0.0019858409760463475, + "loss": 0.1235, + "step": 6372 + }, + { + "epoch": 0.05532070034114287, + "grad_norm": 0.1259765625, + "learning_rate": 0.001985835717032443, + "loss": 0.1221, + "step": 6373 + }, + { + "epoch": 0.055329380821347036, + "grad_norm": 0.322265625, + "learning_rate": 0.001985830457049802, + "loss": 0.124, + "step": 6374 + }, + { + "epoch": 0.0553380613015512, + "grad_norm": 0.474609375, + "learning_rate": 0.0019858251960984297, + "loss": 0.1484, + "step": 6375 + }, + { + "epoch": 0.05534674178175537, + "grad_norm": 0.5078125, + "learning_rate": 0.0019858199341783326, + "loss": 0.1816, + "step": 6376 + }, + { + "epoch": 0.05535542226195953, + "grad_norm": 0.28515625, + "learning_rate": 0.0019858146712895164, + "loss": 0.126, + "step": 6377 + }, + { + "epoch": 0.0553641027421637, + "grad_norm": 0.181640625, + "learning_rate": 0.0019858094074319864, + "loss": 0.1602, + "step": 6378 + }, + { + "epoch": 0.05537278322236786, + "grad_norm": 0.490234375, + "learning_rate": 
0.0019858041426057483, + "loss": 0.1387, + "step": 6379 + }, + { + "epoch": 0.05538146370257203, + "grad_norm": 0.166015625, + "learning_rate": 0.001985798876810808, + "loss": 0.1201, + "step": 6380 + }, + { + "epoch": 0.05539014418277619, + "grad_norm": 0.2451171875, + "learning_rate": 0.001985793610047172, + "loss": 0.1367, + "step": 6381 + }, + { + "epoch": 0.05539882466298036, + "grad_norm": 0.22265625, + "learning_rate": 0.001985788342314845, + "loss": 0.1309, + "step": 6382 + }, + { + "epoch": 0.05540750514318452, + "grad_norm": 0.201171875, + "learning_rate": 0.001985783073613833, + "loss": 0.1621, + "step": 6383 + }, + { + "epoch": 0.05541618562338869, + "grad_norm": 0.08154296875, + "learning_rate": 0.0019857778039441424, + "loss": 0.1104, + "step": 6384 + }, + { + "epoch": 0.05542486610359285, + "grad_norm": 0.0810546875, + "learning_rate": 0.0019857725333057782, + "loss": 0.1562, + "step": 6385 + }, + { + "epoch": 0.05543354658379702, + "grad_norm": 0.10107421875, + "learning_rate": 0.0019857672616987467, + "loss": 0.1689, + "step": 6386 + }, + { + "epoch": 0.05544222706400118, + "grad_norm": 0.255859375, + "learning_rate": 0.0019857619891230534, + "loss": 0.1729, + "step": 6387 + }, + { + "epoch": 0.05545090754420535, + "grad_norm": 0.453125, + "learning_rate": 0.001985756715578704, + "loss": 0.1641, + "step": 6388 + }, + { + "epoch": 0.05545958802440951, + "grad_norm": 0.171875, + "learning_rate": 0.0019857514410657044, + "loss": 0.1543, + "step": 6389 + }, + { + "epoch": 0.05546826850461368, + "grad_norm": 0.25, + "learning_rate": 0.00198574616558406, + "loss": 0.1338, + "step": 6390 + }, + { + "epoch": 0.055476948984817843, + "grad_norm": 0.68359375, + "learning_rate": 0.001985740889133777, + "loss": 0.1611, + "step": 6391 + }, + { + "epoch": 0.055485629465022, + "grad_norm": 0.5390625, + "learning_rate": 0.0019857356117148612, + "loss": 0.1416, + "step": 6392 + }, + { + "epoch": 0.05549430994522617, + "grad_norm": 0.302734375, + "learning_rate": 0.0019857303333273187, + "loss": 0.1387, + "step": 6393 + }, + { + "epoch": 0.05550299042543033, + "grad_norm": 0.61328125, + "learning_rate": 0.001985725053971154, + "loss": 0.1484, + "step": 6394 + }, + { + "epoch": 0.0555116709056345, + "grad_norm": 0.703125, + "learning_rate": 0.0019857197736463743, + "loss": 0.1484, + "step": 6395 + }, + { + "epoch": 0.05552035138583866, + "grad_norm": 0.310546875, + "learning_rate": 0.0019857144923529843, + "loss": 0.1406, + "step": 6396 + }, + { + "epoch": 0.05552903186604283, + "grad_norm": 0.412109375, + "learning_rate": 0.0019857092100909906, + "loss": 0.1191, + "step": 6397 + }, + { + "epoch": 0.05553771234624699, + "grad_norm": 0.1162109375, + "learning_rate": 0.0019857039268603984, + "loss": 0.1504, + "step": 6398 + }, + { + "epoch": 0.05554639282645116, + "grad_norm": 0.2890625, + "learning_rate": 0.001985698642661214, + "loss": 0.1738, + "step": 6399 + }, + { + "epoch": 0.05555507330665532, + "grad_norm": 0.0791015625, + "learning_rate": 0.0019856933574934426, + "loss": 0.1357, + "step": 6400 + }, + { + "epoch": 0.05556375378685949, + "grad_norm": 0.1416015625, + "learning_rate": 0.0019856880713570906, + "loss": 0.1631, + "step": 6401 + }, + { + "epoch": 0.05557243426706365, + "grad_norm": 0.13671875, + "learning_rate": 0.001985682784252163, + "loss": 0.1543, + "step": 6402 + }, + { + "epoch": 0.05558111474726782, + "grad_norm": 0.482421875, + "learning_rate": 0.0019856774961786662, + "loss": 0.1582, + "step": 6403 + }, + { + "epoch": 0.05558979522747198, + "grad_norm": 0.83984375, + 
"learning_rate": 0.0019856722071366056, + "loss": 0.1504, + "step": 6404 + }, + { + "epoch": 0.05559847570767615, + "grad_norm": 0.17578125, + "learning_rate": 0.001985666917125987, + "loss": 0.1309, + "step": 6405 + }, + { + "epoch": 0.05560715618788031, + "grad_norm": 0.205078125, + "learning_rate": 0.001985661626146817, + "loss": 0.1113, + "step": 6406 + }, + { + "epoch": 0.05561583666808448, + "grad_norm": 0.359375, + "learning_rate": 0.0019856563341991004, + "loss": 0.1289, + "step": 6407 + }, + { + "epoch": 0.055624517148288644, + "grad_norm": 0.7109375, + "learning_rate": 0.0019856510412828436, + "loss": 0.1797, + "step": 6408 + }, + { + "epoch": 0.05563319762849281, + "grad_norm": 0.2041015625, + "learning_rate": 0.0019856457473980517, + "loss": 0.1621, + "step": 6409 + }, + { + "epoch": 0.055641878108696974, + "grad_norm": 0.33203125, + "learning_rate": 0.001985640452544731, + "loss": 0.1504, + "step": 6410 + }, + { + "epoch": 0.05565055858890114, + "grad_norm": 0.107421875, + "learning_rate": 0.0019856351567228876, + "loss": 0.1367, + "step": 6411 + }, + { + "epoch": 0.055659239069105304, + "grad_norm": 0.25, + "learning_rate": 0.0019856298599325267, + "loss": 0.1143, + "step": 6412 + }, + { + "epoch": 0.05566791954930947, + "grad_norm": 0.4375, + "learning_rate": 0.001985624562173654, + "loss": 0.127, + "step": 6413 + }, + { + "epoch": 0.055676600029513634, + "grad_norm": 1.0703125, + "learning_rate": 0.001985619263446276, + "loss": 0.3711, + "step": 6414 + }, + { + "epoch": 0.0556852805097178, + "grad_norm": 0.40234375, + "learning_rate": 0.001985613963750398, + "loss": 0.1289, + "step": 6415 + }, + { + "epoch": 0.055693960989921965, + "grad_norm": 0.185546875, + "learning_rate": 0.0019856086630860256, + "loss": 0.126, + "step": 6416 + }, + { + "epoch": 0.05570264147012613, + "grad_norm": 0.1865234375, + "learning_rate": 0.0019856033614531654, + "loss": 0.1211, + "step": 6417 + }, + { + "epoch": 0.055711321950330295, + "grad_norm": 0.1357421875, + "learning_rate": 0.001985598058851822, + "loss": 0.1621, + "step": 6418 + }, + { + "epoch": 0.05572000243053446, + "grad_norm": 0.28125, + "learning_rate": 0.0019855927552820027, + "loss": 0.1738, + "step": 6419 + }, + { + "epoch": 0.055728682910738625, + "grad_norm": 0.291015625, + "learning_rate": 0.001985587450743712, + "loss": 0.1885, + "step": 6420 + }, + { + "epoch": 0.05573736339094279, + "grad_norm": 0.380859375, + "learning_rate": 0.0019855821452369558, + "loss": 0.1338, + "step": 6421 + }, + { + "epoch": 0.055746043871146955, + "grad_norm": 0.419921875, + "learning_rate": 0.001985576838761741, + "loss": 0.1816, + "step": 6422 + }, + { + "epoch": 0.05575472435135111, + "grad_norm": 0.201171875, + "learning_rate": 0.0019855715313180718, + "loss": 0.1709, + "step": 6423 + }, + { + "epoch": 0.05576340483155528, + "grad_norm": 0.11376953125, + "learning_rate": 0.0019855662229059557, + "loss": 0.1289, + "step": 6424 + }, + { + "epoch": 0.055772085311759444, + "grad_norm": 0.08837890625, + "learning_rate": 0.001985560913525397, + "loss": 0.1387, + "step": 6425 + }, + { + "epoch": 0.05578076579196361, + "grad_norm": 0.10791015625, + "learning_rate": 0.0019855556031764024, + "loss": 0.2002, + "step": 6426 + }, + { + "epoch": 0.055789446272167774, + "grad_norm": 0.0849609375, + "learning_rate": 0.001985550291858978, + "loss": 0.125, + "step": 6427 + }, + { + "epoch": 0.05579812675237194, + "grad_norm": 0.1630859375, + "learning_rate": 0.0019855449795731285, + "loss": 0.1719, + "step": 6428 + }, + { + "epoch": 0.055806807232576104, + 
"grad_norm": 0.2275390625, + "learning_rate": 0.001985539666318861, + "loss": 0.1777, + "step": 6429 + }, + { + "epoch": 0.05581548771278027, + "grad_norm": 0.51953125, + "learning_rate": 0.0019855343520961796, + "loss": 0.1367, + "step": 6430 + }, + { + "epoch": 0.055824168192984434, + "grad_norm": 0.21484375, + "learning_rate": 0.001985529036905092, + "loss": 0.2051, + "step": 6431 + }, + { + "epoch": 0.0558328486731886, + "grad_norm": 0.2314453125, + "learning_rate": 0.001985523720745603, + "loss": 0.1338, + "step": 6432 + }, + { + "epoch": 0.055841529153392765, + "grad_norm": 0.11767578125, + "learning_rate": 0.001985518403617718, + "loss": 0.1797, + "step": 6433 + }, + { + "epoch": 0.05585020963359693, + "grad_norm": 0.34375, + "learning_rate": 0.0019855130855214445, + "loss": 0.1396, + "step": 6434 + }, + { + "epoch": 0.055858890113801095, + "grad_norm": 0.1962890625, + "learning_rate": 0.001985507766456786, + "loss": 0.1348, + "step": 6435 + }, + { + "epoch": 0.05586757059400526, + "grad_norm": 1.1875, + "learning_rate": 0.00198550244642375, + "loss": 0.1279, + "step": 6436 + }, + { + "epoch": 0.055876251074209425, + "grad_norm": 0.08447265625, + "learning_rate": 0.001985497125422342, + "loss": 0.1289, + "step": 6437 + }, + { + "epoch": 0.05588493155441359, + "grad_norm": 0.59375, + "learning_rate": 0.0019854918034525673, + "loss": 0.1465, + "step": 6438 + }, + { + "epoch": 0.055893612034617755, + "grad_norm": 0.2734375, + "learning_rate": 0.0019854864805144323, + "loss": 0.1289, + "step": 6439 + }, + { + "epoch": 0.05590229251482192, + "grad_norm": 0.275390625, + "learning_rate": 0.0019854811566079426, + "loss": 0.1582, + "step": 6440 + }, + { + "epoch": 0.055910972995026086, + "grad_norm": 0.291015625, + "learning_rate": 0.0019854758317331043, + "loss": 0.1465, + "step": 6441 + }, + { + "epoch": 0.05591965347523025, + "grad_norm": 0.2080078125, + "learning_rate": 0.0019854705058899224, + "loss": 0.1475, + "step": 6442 + }, + { + "epoch": 0.055928333955434416, + "grad_norm": 0.1201171875, + "learning_rate": 0.0019854651790784036, + "loss": 0.1523, + "step": 6443 + }, + { + "epoch": 0.05593701443563858, + "grad_norm": 0.34765625, + "learning_rate": 0.001985459851298553, + "loss": 0.1426, + "step": 6444 + }, + { + "epoch": 0.055945694915842746, + "grad_norm": 0.1953125, + "learning_rate": 0.0019854545225503773, + "loss": 0.1279, + "step": 6445 + }, + { + "epoch": 0.05595437539604691, + "grad_norm": 0.1875, + "learning_rate": 0.0019854491928338815, + "loss": 0.1162, + "step": 6446 + }, + { + "epoch": 0.055963055876251076, + "grad_norm": 0.51953125, + "learning_rate": 0.001985443862149072, + "loss": 0.1709, + "step": 6447 + }, + { + "epoch": 0.05597173635645524, + "grad_norm": 0.1689453125, + "learning_rate": 0.001985438530495954, + "loss": 0.1279, + "step": 6448 + }, + { + "epoch": 0.05598041683665941, + "grad_norm": 0.5703125, + "learning_rate": 0.001985433197874534, + "loss": 0.1787, + "step": 6449 + }, + { + "epoch": 0.05598909731686357, + "grad_norm": 0.083984375, + "learning_rate": 0.0019854278642848174, + "loss": 0.1245, + "step": 6450 + }, + { + "epoch": 0.05599777779706774, + "grad_norm": 0.41796875, + "learning_rate": 0.0019854225297268103, + "loss": 0.1182, + "step": 6451 + }, + { + "epoch": 0.0560064582772719, + "grad_norm": 0.29296875, + "learning_rate": 0.0019854171942005187, + "loss": 0.1816, + "step": 6452 + }, + { + "epoch": 0.05601513875747607, + "grad_norm": 0.3046875, + "learning_rate": 0.001985411857705948, + "loss": 0.1621, + "step": 6453 + }, + { + "epoch": 
0.056023819237680225, + "grad_norm": 0.10498046875, + "learning_rate": 0.001985406520243104, + "loss": 0.1787, + "step": 6454 + }, + { + "epoch": 0.05603249971788439, + "grad_norm": 1.03125, + "learning_rate": 0.001985401181811993, + "loss": 0.1348, + "step": 6455 + }, + { + "epoch": 0.056041180198088555, + "grad_norm": 0.310546875, + "learning_rate": 0.0019853958424126206, + "loss": 0.1201, + "step": 6456 + }, + { + "epoch": 0.05604986067829272, + "grad_norm": 0.87890625, + "learning_rate": 0.001985390502044992, + "loss": 0.1562, + "step": 6457 + }, + { + "epoch": 0.056058541158496886, + "grad_norm": 0.1875, + "learning_rate": 0.001985385160709114, + "loss": 0.1455, + "step": 6458 + }, + { + "epoch": 0.05606722163870105, + "grad_norm": 0.5390625, + "learning_rate": 0.0019853798184049924, + "loss": 0.1094, + "step": 6459 + }, + { + "epoch": 0.056075902118905216, + "grad_norm": 0.189453125, + "learning_rate": 0.0019853744751326323, + "loss": 0.1465, + "step": 6460 + }, + { + "epoch": 0.05608458259910938, + "grad_norm": 0.1162109375, + "learning_rate": 0.00198536913089204, + "loss": 0.1191, + "step": 6461 + }, + { + "epoch": 0.056093263079313546, + "grad_norm": 0.314453125, + "learning_rate": 0.001985363785683222, + "loss": 0.1553, + "step": 6462 + }, + { + "epoch": 0.05610194355951771, + "grad_norm": 0.3828125, + "learning_rate": 0.0019853584395061828, + "loss": 0.165, + "step": 6463 + }, + { + "epoch": 0.056110624039721876, + "grad_norm": 0.1650390625, + "learning_rate": 0.001985353092360929, + "loss": 0.1758, + "step": 6464 + }, + { + "epoch": 0.05611930451992604, + "grad_norm": 0.37109375, + "learning_rate": 0.0019853477442474663, + "loss": 0.1748, + "step": 6465 + }, + { + "epoch": 0.05612798500013021, + "grad_norm": 0.11669921875, + "learning_rate": 0.001985342395165801, + "loss": 0.1523, + "step": 6466 + }, + { + "epoch": 0.05613666548033437, + "grad_norm": 0.267578125, + "learning_rate": 0.001985337045115938, + "loss": 0.1777, + "step": 6467 + }, + { + "epoch": 0.05614534596053854, + "grad_norm": 0.474609375, + "learning_rate": 0.001985331694097884, + "loss": 0.1167, + "step": 6468 + }, + { + "epoch": 0.0561540264407427, + "grad_norm": 0.08447265625, + "learning_rate": 0.001985326342111645, + "loss": 0.1152, + "step": 6469 + }, + { + "epoch": 0.05616270692094687, + "grad_norm": 0.50390625, + "learning_rate": 0.001985320989157226, + "loss": 0.1377, + "step": 6470 + }, + { + "epoch": 0.05617138740115103, + "grad_norm": 0.515625, + "learning_rate": 0.001985315635234633, + "loss": 0.106, + "step": 6471 + }, + { + "epoch": 0.0561800678813552, + "grad_norm": 0.13671875, + "learning_rate": 0.0019853102803438724, + "loss": 0.1582, + "step": 6472 + }, + { + "epoch": 0.05618874836155936, + "grad_norm": 0.1298828125, + "learning_rate": 0.0019853049244849498, + "loss": 0.1357, + "step": 6473 + }, + { + "epoch": 0.05619742884176353, + "grad_norm": 0.236328125, + "learning_rate": 0.001985299567657871, + "loss": 0.1445, + "step": 6474 + }, + { + "epoch": 0.05620610932196769, + "grad_norm": 0.181640625, + "learning_rate": 0.001985294209862642, + "loss": 0.1006, + "step": 6475 + }, + { + "epoch": 0.05621478980217186, + "grad_norm": 0.14453125, + "learning_rate": 0.0019852888510992683, + "loss": 0.1475, + "step": 6476 + }, + { + "epoch": 0.05622347028237602, + "grad_norm": 0.4921875, + "learning_rate": 0.001985283491367756, + "loss": 0.166, + "step": 6477 + }, + { + "epoch": 0.05623215076258019, + "grad_norm": 0.56640625, + "learning_rate": 0.0019852781306681114, + "loss": 0.1055, + "step": 6478 + }, 
+ { + "epoch": 0.05624083124278435, + "grad_norm": 1.015625, + "learning_rate": 0.0019852727690003397, + "loss": 0.1641, + "step": 6479 + }, + { + "epoch": 0.05624951172298852, + "grad_norm": 0.28515625, + "learning_rate": 0.0019852674063644466, + "loss": 0.1406, + "step": 6480 + }, + { + "epoch": 0.05625819220319268, + "grad_norm": 0.201171875, + "learning_rate": 0.001985262042760439, + "loss": 0.127, + "step": 6481 + }, + { + "epoch": 0.05626687268339685, + "grad_norm": 0.27734375, + "learning_rate": 0.001985256678188322, + "loss": 0.1167, + "step": 6482 + }, + { + "epoch": 0.056275553163601014, + "grad_norm": 0.16796875, + "learning_rate": 0.0019852513126481014, + "loss": 0.1367, + "step": 6483 + }, + { + "epoch": 0.05628423364380517, + "grad_norm": 0.58984375, + "learning_rate": 0.0019852459461397833, + "loss": 0.1357, + "step": 6484 + }, + { + "epoch": 0.05629291412400934, + "grad_norm": 0.419921875, + "learning_rate": 0.0019852405786633734, + "loss": 0.1797, + "step": 6485 + }, + { + "epoch": 0.0563015946042135, + "grad_norm": 0.353515625, + "learning_rate": 0.001985235210218878, + "loss": 0.1377, + "step": 6486 + }, + { + "epoch": 0.05631027508441767, + "grad_norm": 0.13671875, + "learning_rate": 0.0019852298408063024, + "loss": 0.1338, + "step": 6487 + }, + { + "epoch": 0.05631895556462183, + "grad_norm": 0.3046875, + "learning_rate": 0.001985224470425653, + "loss": 0.1582, + "step": 6488 + }, + { + "epoch": 0.056327636044826, + "grad_norm": 0.279296875, + "learning_rate": 0.0019852190990769353, + "loss": 0.1309, + "step": 6489 + }, + { + "epoch": 0.05633631652503016, + "grad_norm": 0.07373046875, + "learning_rate": 0.0019852137267601555, + "loss": 0.0898, + "step": 6490 + }, + { + "epoch": 0.05634499700523433, + "grad_norm": 0.06103515625, + "learning_rate": 0.001985208353475319, + "loss": 0.1055, + "step": 6491 + }, + { + "epoch": 0.05635367748543849, + "grad_norm": 0.41015625, + "learning_rate": 0.001985202979222432, + "loss": 0.1162, + "step": 6492 + }, + { + "epoch": 0.05636235796564266, + "grad_norm": 1.0859375, + "learning_rate": 0.0019851976040015005, + "loss": 0.1777, + "step": 6493 + }, + { + "epoch": 0.05637103844584682, + "grad_norm": 0.1025390625, + "learning_rate": 0.00198519222781253, + "loss": 0.1406, + "step": 6494 + }, + { + "epoch": 0.05637971892605099, + "grad_norm": 0.263671875, + "learning_rate": 0.001985186850655527, + "loss": 0.1289, + "step": 6495 + }, + { + "epoch": 0.05638839940625515, + "grad_norm": 0.412109375, + "learning_rate": 0.001985181472530496, + "loss": 0.1553, + "step": 6496 + }, + { + "epoch": 0.05639707988645932, + "grad_norm": 0.1279296875, + "learning_rate": 0.0019851760934374444, + "loss": 0.1318, + "step": 6497 + }, + { + "epoch": 0.056405760366663484, + "grad_norm": 0.2021484375, + "learning_rate": 0.0019851707133763776, + "loss": 0.1904, + "step": 6498 + }, + { + "epoch": 0.05641444084686765, + "grad_norm": 0.216796875, + "learning_rate": 0.0019851653323473013, + "loss": 0.1523, + "step": 6499 + }, + { + "epoch": 0.056423121327071814, + "grad_norm": 0.08056640625, + "learning_rate": 0.0019851599503502215, + "loss": 0.1465, + "step": 6500 + }, + { + "epoch": 0.05643180180727598, + "grad_norm": 0.1650390625, + "learning_rate": 0.001985154567385144, + "loss": 0.1152, + "step": 6501 + }, + { + "epoch": 0.056440482287480144, + "grad_norm": 0.40625, + "learning_rate": 0.001985149183452075, + "loss": 0.1377, + "step": 6502 + }, + { + "epoch": 0.05644916276768431, + "grad_norm": 0.283203125, + "learning_rate": 0.00198514379855102, + "loss": 
0.1484, + "step": 6503 + }, + { + "epoch": 0.056457843247888474, + "grad_norm": 0.09033203125, + "learning_rate": 0.001985138412681985, + "loss": 0.1445, + "step": 6504 + }, + { + "epoch": 0.05646652372809264, + "grad_norm": 0.18359375, + "learning_rate": 0.0019851330258449764, + "loss": 0.1377, + "step": 6505 + }, + { + "epoch": 0.056475204208296804, + "grad_norm": 0.1640625, + "learning_rate": 0.001985127638039999, + "loss": 0.1562, + "step": 6506 + }, + { + "epoch": 0.05648388468850097, + "grad_norm": 0.263671875, + "learning_rate": 0.0019851222492670594, + "loss": 0.1406, + "step": 6507 + }, + { + "epoch": 0.056492565168705135, + "grad_norm": 0.197265625, + "learning_rate": 0.0019851168595261637, + "loss": 0.1289, + "step": 6508 + }, + { + "epoch": 0.0565012456489093, + "grad_norm": 0.353515625, + "learning_rate": 0.0019851114688173173, + "loss": 0.1143, + "step": 6509 + }, + { + "epoch": 0.056509926129113465, + "grad_norm": 0.07666015625, + "learning_rate": 0.001985106077140526, + "loss": 0.1143, + "step": 6510 + }, + { + "epoch": 0.05651860660931763, + "grad_norm": 0.470703125, + "learning_rate": 0.001985100684495797, + "loss": 0.1016, + "step": 6511 + }, + { + "epoch": 0.056527287089521795, + "grad_norm": 0.380859375, + "learning_rate": 0.0019850952908831345, + "loss": 0.125, + "step": 6512 + }, + { + "epoch": 0.05653596756972596, + "grad_norm": 0.45703125, + "learning_rate": 0.001985089896302545, + "loss": 0.1279, + "step": 6513 + }, + { + "epoch": 0.056544648049930125, + "grad_norm": 0.416015625, + "learning_rate": 0.0019850845007540344, + "loss": 0.1045, + "step": 6514 + }, + { + "epoch": 0.056553328530134284, + "grad_norm": 0.076171875, + "learning_rate": 0.0019850791042376093, + "loss": 0.1221, + "step": 6515 + }, + { + "epoch": 0.05656200901033845, + "grad_norm": 0.52734375, + "learning_rate": 0.0019850737067532744, + "loss": 0.1465, + "step": 6516 + }, + { + "epoch": 0.056570689490542614, + "grad_norm": 0.0654296875, + "learning_rate": 0.001985068308301036, + "loss": 0.1064, + "step": 6517 + }, + { + "epoch": 0.05657936997074678, + "grad_norm": 0.1640625, + "learning_rate": 0.0019850629088809008, + "loss": 0.1196, + "step": 6518 + }, + { + "epoch": 0.056588050450950944, + "grad_norm": 0.89453125, + "learning_rate": 0.001985057508492874, + "loss": 0.1338, + "step": 6519 + }, + { + "epoch": 0.05659673093115511, + "grad_norm": 0.39453125, + "learning_rate": 0.0019850521071369614, + "loss": 0.1113, + "step": 6520 + }, + { + "epoch": 0.056605411411359274, + "grad_norm": 0.08251953125, + "learning_rate": 0.0019850467048131695, + "loss": 0.1289, + "step": 6521 + }, + { + "epoch": 0.05661409189156344, + "grad_norm": 0.53125, + "learning_rate": 0.0019850413015215034, + "loss": 0.1631, + "step": 6522 + }, + { + "epoch": 0.056622772371767605, + "grad_norm": 0.146484375, + "learning_rate": 0.0019850358972619696, + "loss": 0.1089, + "step": 6523 + }, + { + "epoch": 0.05663145285197177, + "grad_norm": 0.86328125, + "learning_rate": 0.0019850304920345737, + "loss": 0.1504, + "step": 6524 + }, + { + "epoch": 0.056640133332175935, + "grad_norm": 0.6171875, + "learning_rate": 0.001985025085839322, + "loss": 0.1309, + "step": 6525 + }, + { + "epoch": 0.0566488138123801, + "grad_norm": 0.423828125, + "learning_rate": 0.0019850196786762196, + "loss": 0.1719, + "step": 6526 + }, + { + "epoch": 0.056657494292584265, + "grad_norm": 0.2412109375, + "learning_rate": 0.0019850142705452735, + "loss": 0.165, + "step": 6527 + }, + { + "epoch": 0.05666617477278843, + "grad_norm": 0.0869140625, + 
"learning_rate": 0.001985008861446489, + "loss": 0.1436, + "step": 6528 + }, + { + "epoch": 0.056674855252992595, + "grad_norm": 0.1806640625, + "learning_rate": 0.0019850034513798722, + "loss": 0.1504, + "step": 6529 + }, + { + "epoch": 0.05668353573319676, + "grad_norm": 0.228515625, + "learning_rate": 0.001984998040345429, + "loss": 0.1406, + "step": 6530 + }, + { + "epoch": 0.056692216213400926, + "grad_norm": 0.1474609375, + "learning_rate": 0.001984992628343165, + "loss": 0.1523, + "step": 6531 + }, + { + "epoch": 0.05670089669360509, + "grad_norm": 0.166015625, + "learning_rate": 0.0019849872153730862, + "loss": 0.0991, + "step": 6532 + }, + { + "epoch": 0.056709577173809256, + "grad_norm": 0.3203125, + "learning_rate": 0.001984981801435199, + "loss": 0.1377, + "step": 6533 + }, + { + "epoch": 0.05671825765401342, + "grad_norm": 0.42578125, + "learning_rate": 0.0019849763865295088, + "loss": 0.1602, + "step": 6534 + }, + { + "epoch": 0.056726938134217586, + "grad_norm": 0.298828125, + "learning_rate": 0.0019849709706560217, + "loss": 0.1455, + "step": 6535 + }, + { + "epoch": 0.05673561861442175, + "grad_norm": 0.15625, + "learning_rate": 0.0019849655538147436, + "loss": 0.1357, + "step": 6536 + }, + { + "epoch": 0.056744299094625916, + "grad_norm": 0.08984375, + "learning_rate": 0.0019849601360056807, + "loss": 0.1641, + "step": 6537 + }, + { + "epoch": 0.05675297957483008, + "grad_norm": 0.30859375, + "learning_rate": 0.0019849547172288385, + "loss": 0.123, + "step": 6538 + }, + { + "epoch": 0.056761660055034246, + "grad_norm": 0.7265625, + "learning_rate": 0.001984949297484223, + "loss": 0.1543, + "step": 6539 + }, + { + "epoch": 0.05677034053523841, + "grad_norm": 0.609375, + "learning_rate": 0.0019849438767718408, + "loss": 0.127, + "step": 6540 + }, + { + "epoch": 0.05677902101544258, + "grad_norm": 0.373046875, + "learning_rate": 0.001984938455091697, + "loss": 0.1709, + "step": 6541 + }, + { + "epoch": 0.05678770149564674, + "grad_norm": 0.12890625, + "learning_rate": 0.0019849330324437976, + "loss": 0.1348, + "step": 6542 + }, + { + "epoch": 0.05679638197585091, + "grad_norm": 0.1396484375, + "learning_rate": 0.001984927608828149, + "loss": 0.1436, + "step": 6543 + }, + { + "epoch": 0.05680506245605507, + "grad_norm": 0.27734375, + "learning_rate": 0.0019849221842447567, + "loss": 0.1592, + "step": 6544 + }, + { + "epoch": 0.05681374293625924, + "grad_norm": 0.478515625, + "learning_rate": 0.001984916758693627, + "loss": 0.1328, + "step": 6545 + }, + { + "epoch": 0.056822423416463395, + "grad_norm": 0.06982421875, + "learning_rate": 0.0019849113321747654, + "loss": 0.0942, + "step": 6546 + }, + { + "epoch": 0.05683110389666756, + "grad_norm": 0.458984375, + "learning_rate": 0.001984905904688178, + "loss": 0.1133, + "step": 6547 + }, + { + "epoch": 0.056839784376871726, + "grad_norm": 0.18359375, + "learning_rate": 0.001984900476233871, + "loss": 0.1543, + "step": 6548 + }, + { + "epoch": 0.05684846485707589, + "grad_norm": 0.1875, + "learning_rate": 0.00198489504681185, + "loss": 0.1279, + "step": 6549 + }, + { + "epoch": 0.056857145337280056, + "grad_norm": 0.189453125, + "learning_rate": 0.001984889616422121, + "loss": 0.0879, + "step": 6550 + }, + { + "epoch": 0.05686582581748422, + "grad_norm": 0.6640625, + "learning_rate": 0.0019848841850646903, + "loss": 0.1719, + "step": 6551 + }, + { + "epoch": 0.056874506297688386, + "grad_norm": 0.314453125, + "learning_rate": 0.001984878752739563, + "loss": 0.1895, + "step": 6552 + }, + { + "epoch": 0.05688318677789255, + 
"grad_norm": 0.404296875, + "learning_rate": 0.0019848733194467462, + "loss": 0.1299, + "step": 6553 + }, + { + "epoch": 0.056891867258096716, + "grad_norm": 0.45703125, + "learning_rate": 0.001984867885186245, + "loss": 0.1543, + "step": 6554 + }, + { + "epoch": 0.05690054773830088, + "grad_norm": 0.18359375, + "learning_rate": 0.0019848624499580656, + "loss": 0.1504, + "step": 6555 + }, + { + "epoch": 0.05690922821850505, + "grad_norm": 0.49609375, + "learning_rate": 0.001984857013762214, + "loss": 0.127, + "step": 6556 + }, + { + "epoch": 0.05691790869870921, + "grad_norm": 0.28125, + "learning_rate": 0.001984851576598696, + "loss": 0.125, + "step": 6557 + }, + { + "epoch": 0.05692658917891338, + "grad_norm": 0.5703125, + "learning_rate": 0.0019848461384675176, + "loss": 0.1299, + "step": 6558 + }, + { + "epoch": 0.05693526965911754, + "grad_norm": 0.21875, + "learning_rate": 0.0019848406993686844, + "loss": 0.1035, + "step": 6559 + }, + { + "epoch": 0.05694395013932171, + "grad_norm": 0.7734375, + "learning_rate": 0.0019848352593022033, + "loss": 0.1758, + "step": 6560 + }, + { + "epoch": 0.05695263061952587, + "grad_norm": 0.1259765625, + "learning_rate": 0.001984829818268079, + "loss": 0.1338, + "step": 6561 + }, + { + "epoch": 0.05696131109973004, + "grad_norm": 0.224609375, + "learning_rate": 0.001984824376266319, + "loss": 0.105, + "step": 6562 + }, + { + "epoch": 0.0569699915799342, + "grad_norm": 0.1796875, + "learning_rate": 0.0019848189332969273, + "loss": 0.1836, + "step": 6563 + }, + { + "epoch": 0.05697867206013837, + "grad_norm": 0.3515625, + "learning_rate": 0.0019848134893599117, + "loss": 0.1348, + "step": 6564 + }, + { + "epoch": 0.05698735254034253, + "grad_norm": 0.099609375, + "learning_rate": 0.0019848080444552774, + "loss": 0.1309, + "step": 6565 + }, + { + "epoch": 0.0569960330205467, + "grad_norm": 0.322265625, + "learning_rate": 0.00198480259858303, + "loss": 0.1553, + "step": 6566 + }, + { + "epoch": 0.05700471350075086, + "grad_norm": 0.1689453125, + "learning_rate": 0.0019847971517431757, + "loss": 0.0957, + "step": 6567 + }, + { + "epoch": 0.05701339398095503, + "grad_norm": 0.2373046875, + "learning_rate": 0.0019847917039357206, + "loss": 0.2021, + "step": 6568 + }, + { + "epoch": 0.05702207446115919, + "grad_norm": 0.72265625, + "learning_rate": 0.0019847862551606705, + "loss": 0.1445, + "step": 6569 + }, + { + "epoch": 0.05703075494136336, + "grad_norm": 0.146484375, + "learning_rate": 0.0019847808054180315, + "loss": 0.1406, + "step": 6570 + }, + { + "epoch": 0.05703943542156752, + "grad_norm": 0.3046875, + "learning_rate": 0.00198477535470781, + "loss": 0.126, + "step": 6571 + }, + { + "epoch": 0.05704811590177169, + "grad_norm": 0.2275390625, + "learning_rate": 0.001984769903030011, + "loss": 0.1777, + "step": 6572 + }, + { + "epoch": 0.057056796381975854, + "grad_norm": 0.1708984375, + "learning_rate": 0.001984764450384641, + "loss": 0.127, + "step": 6573 + }, + { + "epoch": 0.05706547686218002, + "grad_norm": 0.23046875, + "learning_rate": 0.0019847589967717056, + "loss": 0.1436, + "step": 6574 + }, + { + "epoch": 0.057074157342384184, + "grad_norm": 0.2158203125, + "learning_rate": 0.0019847535421912113, + "loss": 0.127, + "step": 6575 + }, + { + "epoch": 0.05708283782258835, + "grad_norm": 0.1904296875, + "learning_rate": 0.001984748086643164, + "loss": 0.1689, + "step": 6576 + }, + { + "epoch": 0.05709151830279251, + "grad_norm": 0.06982421875, + "learning_rate": 0.0019847426301275693, + "loss": 0.105, + "step": 6577 + }, + { + "epoch": 
0.05710019878299667, + "grad_norm": 0.11669921875, + "learning_rate": 0.0019847371726444333, + "loss": 0.1777, + "step": 6578 + }, + { + "epoch": 0.05710887926320084, + "grad_norm": 0.64453125, + "learning_rate": 0.0019847317141937623, + "loss": 0.1245, + "step": 6579 + }, + { + "epoch": 0.057117559743405, + "grad_norm": 0.357421875, + "learning_rate": 0.001984726254775562, + "loss": 0.1738, + "step": 6580 + }, + { + "epoch": 0.05712624022360917, + "grad_norm": 0.53515625, + "learning_rate": 0.001984720794389838, + "loss": 0.1416, + "step": 6581 + }, + { + "epoch": 0.05713492070381333, + "grad_norm": 0.142578125, + "learning_rate": 0.0019847153330365967, + "loss": 0.1338, + "step": 6582 + }, + { + "epoch": 0.0571436011840175, + "grad_norm": 0.15234375, + "learning_rate": 0.001984709870715844, + "loss": 0.166, + "step": 6583 + }, + { + "epoch": 0.05715228166422166, + "grad_norm": 0.177734375, + "learning_rate": 0.001984704407427586, + "loss": 0.1201, + "step": 6584 + }, + { + "epoch": 0.05716096214442583, + "grad_norm": 0.7109375, + "learning_rate": 0.0019846989431718284, + "loss": 0.1621, + "step": 6585 + }, + { + "epoch": 0.05716964262462999, + "grad_norm": 0.37109375, + "learning_rate": 0.001984693477948578, + "loss": 0.1523, + "step": 6586 + }, + { + "epoch": 0.05717832310483416, + "grad_norm": 0.51953125, + "learning_rate": 0.001984688011757839, + "loss": 0.1592, + "step": 6587 + }, + { + "epoch": 0.057187003585038323, + "grad_norm": 0.6015625, + "learning_rate": 0.001984682544599619, + "loss": 0.1787, + "step": 6588 + }, + { + "epoch": 0.05719568406524249, + "grad_norm": 0.1669921875, + "learning_rate": 0.0019846770764739233, + "loss": 0.1182, + "step": 6589 + }, + { + "epoch": 0.057204364545446654, + "grad_norm": 0.58203125, + "learning_rate": 0.001984671607380759, + "loss": 0.1699, + "step": 6590 + }, + { + "epoch": 0.05721304502565082, + "grad_norm": 0.2451171875, + "learning_rate": 0.00198466613732013, + "loss": 0.1504, + "step": 6591 + }, + { + "epoch": 0.057221725505854984, + "grad_norm": 0.083984375, + "learning_rate": 0.0019846606662920437, + "loss": 0.1475, + "step": 6592 + }, + { + "epoch": 0.05723040598605915, + "grad_norm": 0.10302734375, + "learning_rate": 0.001984655194296506, + "loss": 0.1465, + "step": 6593 + }, + { + "epoch": 0.057239086466263314, + "grad_norm": 0.3359375, + "learning_rate": 0.001984649721333522, + "loss": 0.1484, + "step": 6594 + }, + { + "epoch": 0.05724776694646748, + "grad_norm": 0.8203125, + "learning_rate": 0.0019846442474030996, + "loss": 0.1396, + "step": 6595 + }, + { + "epoch": 0.057256447426671644, + "grad_norm": 0.12451171875, + "learning_rate": 0.0019846387725052424, + "loss": 0.1533, + "step": 6596 + }, + { + "epoch": 0.05726512790687581, + "grad_norm": 0.25390625, + "learning_rate": 0.0019846332966399582, + "loss": 0.127, + "step": 6597 + }, + { + "epoch": 0.057273808387079975, + "grad_norm": 0.08837890625, + "learning_rate": 0.0019846278198072517, + "loss": 0.1318, + "step": 6598 + }, + { + "epoch": 0.05728248886728414, + "grad_norm": 0.384765625, + "learning_rate": 0.00198462234200713, + "loss": 0.1309, + "step": 6599 + }, + { + "epoch": 0.057291169347488305, + "grad_norm": 0.326171875, + "learning_rate": 0.0019846168632395985, + "loss": 0.1572, + "step": 6600 + }, + { + "epoch": 0.05729984982769247, + "grad_norm": 0.1064453125, + "learning_rate": 0.001984611383504663, + "loss": 0.123, + "step": 6601 + }, + { + "epoch": 0.057308530307896635, + "grad_norm": 1.390625, + "learning_rate": 0.0019846059028023303, + "loss": 0.1582, + "step": 
6602 + }, + { + "epoch": 0.0573172107881008, + "grad_norm": 0.287109375, + "learning_rate": 0.0019846004211326058, + "loss": 0.1182, + "step": 6603 + }, + { + "epoch": 0.057325891268304965, + "grad_norm": 0.2314453125, + "learning_rate": 0.0019845949384954954, + "loss": 0.1289, + "step": 6604 + }, + { + "epoch": 0.05733457174850913, + "grad_norm": 0.10693359375, + "learning_rate": 0.0019845894548910052, + "loss": 0.1699, + "step": 6605 + }, + { + "epoch": 0.057343252228713296, + "grad_norm": 0.189453125, + "learning_rate": 0.0019845839703191413, + "loss": 0.1377, + "step": 6606 + }, + { + "epoch": 0.05735193270891746, + "grad_norm": 0.2001953125, + "learning_rate": 0.0019845784847799097, + "loss": 0.1465, + "step": 6607 + }, + { + "epoch": 0.05736061318912162, + "grad_norm": 0.291015625, + "learning_rate": 0.0019845729982733164, + "loss": 0.124, + "step": 6608 + }, + { + "epoch": 0.057369293669325784, + "grad_norm": 0.1923828125, + "learning_rate": 0.0019845675107993672, + "loss": 0.1348, + "step": 6609 + }, + { + "epoch": 0.05737797414952995, + "grad_norm": 0.255859375, + "learning_rate": 0.0019845620223580686, + "loss": 0.207, + "step": 6610 + }, + { + "epoch": 0.057386654629734114, + "grad_norm": 0.0703125, + "learning_rate": 0.0019845565329494256, + "loss": 0.1523, + "step": 6611 + }, + { + "epoch": 0.05739533510993828, + "grad_norm": 0.24609375, + "learning_rate": 0.0019845510425734454, + "loss": 0.1416, + "step": 6612 + }, + { + "epoch": 0.057404015590142445, + "grad_norm": 0.0810546875, + "learning_rate": 0.0019845455512301335, + "loss": 0.1484, + "step": 6613 + }, + { + "epoch": 0.05741269607034661, + "grad_norm": 0.431640625, + "learning_rate": 0.001984540058919496, + "loss": 0.124, + "step": 6614 + }, + { + "epoch": 0.057421376550550775, + "grad_norm": 0.12158203125, + "learning_rate": 0.0019845345656415384, + "loss": 0.1338, + "step": 6615 + }, + { + "epoch": 0.05743005703075494, + "grad_norm": 0.3125, + "learning_rate": 0.001984529071396267, + "loss": 0.1699, + "step": 6616 + }, + { + "epoch": 0.057438737510959105, + "grad_norm": 0.2392578125, + "learning_rate": 0.001984523576183688, + "loss": 0.1123, + "step": 6617 + }, + { + "epoch": 0.05744741799116327, + "grad_norm": 0.150390625, + "learning_rate": 0.0019845180800038076, + "loss": 0.1279, + "step": 6618 + }, + { + "epoch": 0.057456098471367435, + "grad_norm": 0.474609375, + "learning_rate": 0.0019845125828566314, + "loss": 0.1079, + "step": 6619 + }, + { + "epoch": 0.0574647789515716, + "grad_norm": 0.11669921875, + "learning_rate": 0.0019845070847421655, + "loss": 0.1221, + "step": 6620 + }, + { + "epoch": 0.057473459431775765, + "grad_norm": 0.09423828125, + "learning_rate": 0.0019845015856604156, + "loss": 0.1748, + "step": 6621 + }, + { + "epoch": 0.05748213991197993, + "grad_norm": 0.130859375, + "learning_rate": 0.0019844960856113883, + "loss": 0.1504, + "step": 6622 + }, + { + "epoch": 0.057490820392184096, + "grad_norm": 0.56640625, + "learning_rate": 0.0019844905845950896, + "loss": 0.1328, + "step": 6623 + }, + { + "epoch": 0.05749950087238826, + "grad_norm": 0.6640625, + "learning_rate": 0.001984485082611525, + "loss": 0.1572, + "step": 6624 + }, + { + "epoch": 0.057508181352592426, + "grad_norm": 0.326171875, + "learning_rate": 0.0019844795796607013, + "loss": 0.1328, + "step": 6625 + }, + { + "epoch": 0.05751686183279659, + "grad_norm": 0.12890625, + "learning_rate": 0.0019844740757426235, + "loss": 0.1533, + "step": 6626 + }, + { + "epoch": 0.057525542313000756, + "grad_norm": 0.2021484375, + "learning_rate": 
0.001984468570857298, + "loss": 0.126, + "step": 6627 + }, + { + "epoch": 0.05753422279320492, + "grad_norm": 0.1484375, + "learning_rate": 0.0019844630650047313, + "loss": 0.1235, + "step": 6628 + }, + { + "epoch": 0.057542903273409086, + "grad_norm": 0.31640625, + "learning_rate": 0.001984457558184929, + "loss": 0.1309, + "step": 6629 + }, + { + "epoch": 0.05755158375361325, + "grad_norm": 0.8359375, + "learning_rate": 0.0019844520503978975, + "loss": 0.209, + "step": 6630 + }, + { + "epoch": 0.05756026423381742, + "grad_norm": 0.46875, + "learning_rate": 0.0019844465416436422, + "loss": 0.1328, + "step": 6631 + }, + { + "epoch": 0.05756894471402158, + "grad_norm": 0.1669921875, + "learning_rate": 0.0019844410319221693, + "loss": 0.124, + "step": 6632 + }, + { + "epoch": 0.05757762519422575, + "grad_norm": 0.765625, + "learning_rate": 0.0019844355212334853, + "loss": 0.2031, + "step": 6633 + }, + { + "epoch": 0.05758630567442991, + "grad_norm": 0.5859375, + "learning_rate": 0.001984430009577596, + "loss": 0.1221, + "step": 6634 + }, + { + "epoch": 0.05759498615463408, + "grad_norm": 0.244140625, + "learning_rate": 0.0019844244969545073, + "loss": 0.1494, + "step": 6635 + }, + { + "epoch": 0.05760366663483824, + "grad_norm": 0.0859375, + "learning_rate": 0.0019844189833642246, + "loss": 0.1494, + "step": 6636 + }, + { + "epoch": 0.05761234711504241, + "grad_norm": 0.2490234375, + "learning_rate": 0.001984413468806755, + "loss": 0.1465, + "step": 6637 + }, + { + "epoch": 0.057621027595246566, + "grad_norm": 0.1376953125, + "learning_rate": 0.0019844079532821044, + "loss": 0.1494, + "step": 6638 + }, + { + "epoch": 0.05762970807545073, + "grad_norm": 0.08154296875, + "learning_rate": 0.001984402436790278, + "loss": 0.1143, + "step": 6639 + }, + { + "epoch": 0.057638388555654896, + "grad_norm": 0.5859375, + "learning_rate": 0.001984396919331283, + "loss": 0.1152, + "step": 6640 + }, + { + "epoch": 0.05764706903585906, + "grad_norm": 0.09716796875, + "learning_rate": 0.0019843914009051245, + "loss": 0.1387, + "step": 6641 + }, + { + "epoch": 0.057655749516063226, + "grad_norm": 0.1005859375, + "learning_rate": 0.001984385881511809, + "loss": 0.1152, + "step": 6642 + }, + { + "epoch": 0.05766442999626739, + "grad_norm": 0.26953125, + "learning_rate": 0.001984380361151342, + "loss": 0.1572, + "step": 6643 + }, + { + "epoch": 0.057673110476471556, + "grad_norm": 0.392578125, + "learning_rate": 0.00198437483982373, + "loss": 0.1689, + "step": 6644 + }, + { + "epoch": 0.05768179095667572, + "grad_norm": 0.212890625, + "learning_rate": 0.0019843693175289794, + "loss": 0.1211, + "step": 6645 + }, + { + "epoch": 0.05769047143687989, + "grad_norm": 0.65234375, + "learning_rate": 0.0019843637942670955, + "loss": 0.2051, + "step": 6646 + }, + { + "epoch": 0.05769915191708405, + "grad_norm": 0.37109375, + "learning_rate": 0.001984358270038085, + "loss": 0.1602, + "step": 6647 + }, + { + "epoch": 0.05770783239728822, + "grad_norm": 0.291015625, + "learning_rate": 0.0019843527448419533, + "loss": 0.1455, + "step": 6648 + }, + { + "epoch": 0.05771651287749238, + "grad_norm": 0.1982421875, + "learning_rate": 0.0019843472186787067, + "loss": 0.1465, + "step": 6649 + }, + { + "epoch": 0.05772519335769655, + "grad_norm": 0.294921875, + "learning_rate": 0.0019843416915483513, + "loss": 0.1318, + "step": 6650 + }, + { + "epoch": 0.05773387383790071, + "grad_norm": 0.408203125, + "learning_rate": 0.0019843361634508928, + "loss": 0.1157, + "step": 6651 + }, + { + "epoch": 0.05774255431810488, + "grad_norm": 
0.56640625, + "learning_rate": 0.001984330634386338, + "loss": 0.1416, + "step": 6652 + }, + { + "epoch": 0.05775123479830904, + "grad_norm": 0.265625, + "learning_rate": 0.0019843251043546923, + "loss": 0.1445, + "step": 6653 + }, + { + "epoch": 0.05775991527851321, + "grad_norm": 0.296875, + "learning_rate": 0.001984319573355962, + "loss": 0.1504, + "step": 6654 + }, + { + "epoch": 0.05776859575871737, + "grad_norm": 0.58984375, + "learning_rate": 0.001984314041390153, + "loss": 0.1504, + "step": 6655 + }, + { + "epoch": 0.05777727623892154, + "grad_norm": 0.1416015625, + "learning_rate": 0.0019843085084572717, + "loss": 0.1592, + "step": 6656 + }, + { + "epoch": 0.0577859567191257, + "grad_norm": 0.67578125, + "learning_rate": 0.0019843029745573236, + "loss": 0.1846, + "step": 6657 + }, + { + "epoch": 0.05779463719932987, + "grad_norm": 0.7265625, + "learning_rate": 0.0019842974396903153, + "loss": 0.2246, + "step": 6658 + }, + { + "epoch": 0.05780331767953403, + "grad_norm": 0.296875, + "learning_rate": 0.0019842919038562524, + "loss": 0.124, + "step": 6659 + }, + { + "epoch": 0.0578119981597382, + "grad_norm": 0.216796875, + "learning_rate": 0.0019842863670551415, + "loss": 0.1162, + "step": 6660 + }, + { + "epoch": 0.05782067863994236, + "grad_norm": 0.875, + "learning_rate": 0.001984280829286988, + "loss": 0.1465, + "step": 6661 + }, + { + "epoch": 0.05782935912014653, + "grad_norm": 0.3515625, + "learning_rate": 0.0019842752905517985, + "loss": 0.1104, + "step": 6662 + }, + { + "epoch": 0.057838039600350694, + "grad_norm": 0.345703125, + "learning_rate": 0.0019842697508495786, + "loss": 0.125, + "step": 6663 + }, + { + "epoch": 0.05784672008055486, + "grad_norm": 0.61328125, + "learning_rate": 0.001984264210180335, + "loss": 0.1436, + "step": 6664 + }, + { + "epoch": 0.057855400560759024, + "grad_norm": 0.6796875, + "learning_rate": 0.0019842586685440726, + "loss": 0.2334, + "step": 6665 + }, + { + "epoch": 0.05786408104096319, + "grad_norm": 0.138671875, + "learning_rate": 0.0019842531259407986, + "loss": 0.1138, + "step": 6666 + }, + { + "epoch": 0.057872761521167354, + "grad_norm": 0.388671875, + "learning_rate": 0.001984247582370519, + "loss": 0.1211, + "step": 6667 + }, + { + "epoch": 0.05788144200137152, + "grad_norm": 0.1630859375, + "learning_rate": 0.001984242037833239, + "loss": 0.1279, + "step": 6668 + }, + { + "epoch": 0.05789012248157568, + "grad_norm": 0.7578125, + "learning_rate": 0.0019842364923289657, + "loss": 0.165, + "step": 6669 + }, + { + "epoch": 0.05789880296177984, + "grad_norm": 0.328125, + "learning_rate": 0.0019842309458577042, + "loss": 0.1396, + "step": 6670 + }, + { + "epoch": 0.05790748344198401, + "grad_norm": 0.0966796875, + "learning_rate": 0.001984225398419461, + "loss": 0.1221, + "step": 6671 + }, + { + "epoch": 0.05791616392218817, + "grad_norm": 0.3671875, + "learning_rate": 0.0019842198500142428, + "loss": 0.1768, + "step": 6672 + }, + { + "epoch": 0.05792484440239234, + "grad_norm": 0.11328125, + "learning_rate": 0.0019842143006420544, + "loss": 0.1318, + "step": 6673 + }, + { + "epoch": 0.0579335248825965, + "grad_norm": 0.2001953125, + "learning_rate": 0.0019842087503029026, + "loss": 0.166, + "step": 6674 + }, + { + "epoch": 0.05794220536280067, + "grad_norm": 0.2158203125, + "learning_rate": 0.001984203198996794, + "loss": 0.126, + "step": 6675 + }, + { + "epoch": 0.05795088584300483, + "grad_norm": 0.51953125, + "learning_rate": 0.0019841976467237336, + "loss": 0.1396, + "step": 6676 + }, + { + "epoch": 0.057959566323209, + "grad_norm": 
0.103515625, + "learning_rate": 0.0019841920934837278, + "loss": 0.1533, + "step": 6677 + }, + { + "epoch": 0.05796824680341316, + "grad_norm": 0.2890625, + "learning_rate": 0.001984186539276783, + "loss": 0.1475, + "step": 6678 + }, + { + "epoch": 0.05797692728361733, + "grad_norm": 0.51171875, + "learning_rate": 0.001984180984102905, + "loss": 0.1699, + "step": 6679 + }, + { + "epoch": 0.057985607763821494, + "grad_norm": 0.06298828125, + "learning_rate": 0.0019841754279621, + "loss": 0.125, + "step": 6680 + }, + { + "epoch": 0.05799428824402566, + "grad_norm": 0.0986328125, + "learning_rate": 0.001984169870854374, + "loss": 0.1357, + "step": 6681 + }, + { + "epoch": 0.058002968724229824, + "grad_norm": 0.60546875, + "learning_rate": 0.001984164312779733, + "loss": 0.1367, + "step": 6682 + }, + { + "epoch": 0.05801164920443399, + "grad_norm": 0.490234375, + "learning_rate": 0.0019841587537381836, + "loss": 0.1855, + "step": 6683 + }, + { + "epoch": 0.058020329684638154, + "grad_norm": 0.306640625, + "learning_rate": 0.0019841531937297313, + "loss": 0.1289, + "step": 6684 + }, + { + "epoch": 0.05802901016484232, + "grad_norm": 0.2412109375, + "learning_rate": 0.0019841476327543822, + "loss": 0.1621, + "step": 6685 + }, + { + "epoch": 0.058037690645046484, + "grad_norm": 0.75, + "learning_rate": 0.0019841420708121425, + "loss": 0.1309, + "step": 6686 + }, + { + "epoch": 0.05804637112525065, + "grad_norm": 0.158203125, + "learning_rate": 0.0019841365079030187, + "loss": 0.124, + "step": 6687 + }, + { + "epoch": 0.058055051605454815, + "grad_norm": 0.2255859375, + "learning_rate": 0.001984130944027016, + "loss": 0.1494, + "step": 6688 + }, + { + "epoch": 0.05806373208565898, + "grad_norm": 0.5859375, + "learning_rate": 0.0019841253791841418, + "loss": 0.1387, + "step": 6689 + }, + { + "epoch": 0.058072412565863145, + "grad_norm": 0.384765625, + "learning_rate": 0.0019841198133744007, + "loss": 0.126, + "step": 6690 + }, + { + "epoch": 0.05808109304606731, + "grad_norm": 0.4140625, + "learning_rate": 0.0019841142465977994, + "loss": 0.1719, + "step": 6691 + }, + { + "epoch": 0.058089773526271475, + "grad_norm": 0.38671875, + "learning_rate": 0.001984108678854344, + "loss": 0.1055, + "step": 6692 + }, + { + "epoch": 0.05809845400647564, + "grad_norm": 0.11279296875, + "learning_rate": 0.001984103110144041, + "loss": 0.123, + "step": 6693 + }, + { + "epoch": 0.058107134486679805, + "grad_norm": 0.390625, + "learning_rate": 0.001984097540466896, + "loss": 0.1377, + "step": 6694 + }, + { + "epoch": 0.05811581496688397, + "grad_norm": 0.06640625, + "learning_rate": 0.001984091969822915, + "loss": 0.1191, + "step": 6695 + }, + { + "epoch": 0.058124495447088136, + "grad_norm": 0.181640625, + "learning_rate": 0.0019840863982121047, + "loss": 0.1357, + "step": 6696 + }, + { + "epoch": 0.0581331759272923, + "grad_norm": 0.439453125, + "learning_rate": 0.0019840808256344706, + "loss": 0.1172, + "step": 6697 + }, + { + "epoch": 0.058141856407496466, + "grad_norm": 0.44140625, + "learning_rate": 0.001984075252090019, + "loss": 0.1172, + "step": 6698 + }, + { + "epoch": 0.05815053688770063, + "grad_norm": 0.671875, + "learning_rate": 0.001984069677578756, + "loss": 0.1357, + "step": 6699 + }, + { + "epoch": 0.05815921736790479, + "grad_norm": 0.484375, + "learning_rate": 0.0019840641021006875, + "loss": 0.1465, + "step": 6700 + }, + { + "epoch": 0.058167897848108954, + "grad_norm": 0.55078125, + "learning_rate": 0.00198405852565582, + "loss": 0.0942, + "step": 6701 + }, + { + "epoch": 0.05817657832831312, 
+ "grad_norm": 0.50390625, + "learning_rate": 0.001984052948244159, + "loss": 0.2178, + "step": 6702 + }, + { + "epoch": 0.058185258808517284, + "grad_norm": 0.1044921875, + "learning_rate": 0.0019840473698657115, + "loss": 0.1709, + "step": 6703 + }, + { + "epoch": 0.05819393928872145, + "grad_norm": 0.859375, + "learning_rate": 0.001984041790520483, + "loss": 0.1416, + "step": 6704 + }, + { + "epoch": 0.058202619768925615, + "grad_norm": 0.671875, + "learning_rate": 0.0019840362102084793, + "loss": 0.1484, + "step": 6705 + }, + { + "epoch": 0.05821130024912978, + "grad_norm": 0.79296875, + "learning_rate": 0.0019840306289297073, + "loss": 0.124, + "step": 6706 + }, + { + "epoch": 0.058219980729333945, + "grad_norm": 0.275390625, + "learning_rate": 0.001984025046684172, + "loss": 0.1357, + "step": 6707 + }, + { + "epoch": 0.05822866120953811, + "grad_norm": 0.365234375, + "learning_rate": 0.0019840194634718807, + "loss": 0.1982, + "step": 6708 + }, + { + "epoch": 0.058237341689742275, + "grad_norm": 0.08203125, + "learning_rate": 0.0019840138792928392, + "loss": 0.1445, + "step": 6709 + }, + { + "epoch": 0.05824602216994644, + "grad_norm": 0.404296875, + "learning_rate": 0.0019840082941470533, + "loss": 0.1592, + "step": 6710 + }, + { + "epoch": 0.058254702650150605, + "grad_norm": 0.1513671875, + "learning_rate": 0.001984002708034529, + "loss": 0.1348, + "step": 6711 + }, + { + "epoch": 0.05826338313035477, + "grad_norm": 0.796875, + "learning_rate": 0.001983997120955273, + "loss": 0.1348, + "step": 6712 + }, + { + "epoch": 0.058272063610558936, + "grad_norm": 0.59375, + "learning_rate": 0.0019839915329092905, + "loss": 0.1172, + "step": 6713 + }, + { + "epoch": 0.0582807440907631, + "grad_norm": 0.859375, + "learning_rate": 0.0019839859438965887, + "loss": 0.1465, + "step": 6714 + }, + { + "epoch": 0.058289424570967266, + "grad_norm": 0.396484375, + "learning_rate": 0.0019839803539171727, + "loss": 0.1299, + "step": 6715 + }, + { + "epoch": 0.05829810505117143, + "grad_norm": 0.2255859375, + "learning_rate": 0.0019839747629710495, + "loss": 0.124, + "step": 6716 + }, + { + "epoch": 0.058306785531375596, + "grad_norm": 1.0, + "learning_rate": 0.0019839691710582244, + "loss": 0.1592, + "step": 6717 + }, + { + "epoch": 0.05831546601157976, + "grad_norm": 0.466796875, + "learning_rate": 0.0019839635781787037, + "loss": 0.1523, + "step": 6718 + }, + { + "epoch": 0.058324146491783926, + "grad_norm": 0.984375, + "learning_rate": 0.001983957984332494, + "loss": 0.1328, + "step": 6719 + }, + { + "epoch": 0.05833282697198809, + "grad_norm": 0.19921875, + "learning_rate": 0.0019839523895196012, + "loss": 0.1162, + "step": 6720 + }, + { + "epoch": 0.05834150745219226, + "grad_norm": 0.82421875, + "learning_rate": 0.0019839467937400315, + "loss": 0.1719, + "step": 6721 + }, + { + "epoch": 0.05835018793239642, + "grad_norm": 0.4609375, + "learning_rate": 0.0019839411969937906, + "loss": 0.1348, + "step": 6722 + }, + { + "epoch": 0.05835886841260059, + "grad_norm": 0.486328125, + "learning_rate": 0.001983935599280885, + "loss": 0.1602, + "step": 6723 + }, + { + "epoch": 0.05836754889280475, + "grad_norm": 0.1630859375, + "learning_rate": 0.0019839300006013207, + "loss": 0.1816, + "step": 6724 + }, + { + "epoch": 0.05837622937300892, + "grad_norm": 0.2314453125, + "learning_rate": 0.0019839244009551035, + "loss": 0.1807, + "step": 6725 + }, + { + "epoch": 0.05838490985321308, + "grad_norm": 0.15234375, + "learning_rate": 0.0019839188003422403, + "loss": 0.1152, + "step": 6726 + }, + { + "epoch": 
0.05839359033341725, + "grad_norm": 1.484375, + "learning_rate": 0.001983913198762736, + "loss": 0.1934, + "step": 6727 + }, + { + "epoch": 0.05840227081362141, + "grad_norm": 0.33203125, + "learning_rate": 0.0019839075962165987, + "loss": 0.1328, + "step": 6728 + }, + { + "epoch": 0.05841095129382558, + "grad_norm": 0.45703125, + "learning_rate": 0.001983901992703833, + "loss": 0.124, + "step": 6729 + }, + { + "epoch": 0.05841963177402974, + "grad_norm": 0.78515625, + "learning_rate": 0.0019838963882244445, + "loss": 0.1279, + "step": 6730 + }, + { + "epoch": 0.0584283122542339, + "grad_norm": 0.10205078125, + "learning_rate": 0.0019838907827784413, + "loss": 0.126, + "step": 6731 + }, + { + "epoch": 0.058436992734438066, + "grad_norm": 0.5625, + "learning_rate": 0.0019838851763658277, + "loss": 0.1748, + "step": 6732 + }, + { + "epoch": 0.05844567321464223, + "grad_norm": 0.369140625, + "learning_rate": 0.001983879568986611, + "loss": 0.1699, + "step": 6733 + }, + { + "epoch": 0.058454353694846396, + "grad_norm": 0.302734375, + "learning_rate": 0.0019838739606407967, + "loss": 0.1104, + "step": 6734 + }, + { + "epoch": 0.05846303417505056, + "grad_norm": 0.2119140625, + "learning_rate": 0.001983868351328391, + "loss": 0.1523, + "step": 6735 + }, + { + "epoch": 0.058471714655254727, + "grad_norm": 0.337890625, + "learning_rate": 0.0019838627410494, + "loss": 0.123, + "step": 6736 + }, + { + "epoch": 0.05848039513545889, + "grad_norm": 0.09228515625, + "learning_rate": 0.0019838571298038303, + "loss": 0.1318, + "step": 6737 + }, + { + "epoch": 0.05848907561566306, + "grad_norm": 0.134765625, + "learning_rate": 0.0019838515175916874, + "loss": 0.1245, + "step": 6738 + }, + { + "epoch": 0.05849775609586722, + "grad_norm": 0.1279296875, + "learning_rate": 0.0019838459044129782, + "loss": 0.1152, + "step": 6739 + }, + { + "epoch": 0.05850643657607139, + "grad_norm": 0.6328125, + "learning_rate": 0.0019838402902677085, + "loss": 0.1377, + "step": 6740 + }, + { + "epoch": 0.05851511705627555, + "grad_norm": 0.1650390625, + "learning_rate": 0.0019838346751558837, + "loss": 0.1895, + "step": 6741 + }, + { + "epoch": 0.05852379753647972, + "grad_norm": 0.087890625, + "learning_rate": 0.001983829059077511, + "loss": 0.1099, + "step": 6742 + }, + { + "epoch": 0.05853247801668388, + "grad_norm": 0.11572265625, + "learning_rate": 0.001983823442032596, + "loss": 0.1211, + "step": 6743 + }, + { + "epoch": 0.05854115849688805, + "grad_norm": 0.373046875, + "learning_rate": 0.0019838178240211453, + "loss": 0.1611, + "step": 6744 + }, + { + "epoch": 0.05854983897709221, + "grad_norm": 0.26953125, + "learning_rate": 0.0019838122050431645, + "loss": 0.1699, + "step": 6745 + }, + { + "epoch": 0.05855851945729638, + "grad_norm": 0.36328125, + "learning_rate": 0.0019838065850986595, + "loss": 0.127, + "step": 6746 + }, + { + "epoch": 0.05856719993750054, + "grad_norm": 0.365234375, + "learning_rate": 0.0019838009641876373, + "loss": 0.1406, + "step": 6747 + }, + { + "epoch": 0.05857588041770471, + "grad_norm": 0.1455078125, + "learning_rate": 0.0019837953423101034, + "loss": 0.127, + "step": 6748 + }, + { + "epoch": 0.05858456089790887, + "grad_norm": 0.10595703125, + "learning_rate": 0.0019837897194660645, + "loss": 0.1504, + "step": 6749 + }, + { + "epoch": 0.05859324137811304, + "grad_norm": 1.1328125, + "learning_rate": 0.0019837840956555266, + "loss": 0.1377, + "step": 6750 + }, + { + "epoch": 0.0586019218583172, + "grad_norm": 0.22265625, + "learning_rate": 0.0019837784708784953, + "loss": 0.1377, + 
"step": 6751 + }, + { + "epoch": 0.05861060233852137, + "grad_norm": 0.76171875, + "learning_rate": 0.001983772845134977, + "loss": 0.1123, + "step": 6752 + }, + { + "epoch": 0.058619282818725534, + "grad_norm": 0.1494140625, + "learning_rate": 0.0019837672184249786, + "loss": 0.0991, + "step": 6753 + }, + { + "epoch": 0.0586279632989297, + "grad_norm": 0.734375, + "learning_rate": 0.0019837615907485054, + "loss": 0.1445, + "step": 6754 + }, + { + "epoch": 0.058636643779133864, + "grad_norm": 0.30078125, + "learning_rate": 0.0019837559621055634, + "loss": 0.1562, + "step": 6755 + }, + { + "epoch": 0.05864532425933803, + "grad_norm": 0.2216796875, + "learning_rate": 0.0019837503324961598, + "loss": 0.1699, + "step": 6756 + }, + { + "epoch": 0.058654004739542194, + "grad_norm": 0.25, + "learning_rate": 0.0019837447019202996, + "loss": 0.1201, + "step": 6757 + }, + { + "epoch": 0.05866268521974636, + "grad_norm": 0.69140625, + "learning_rate": 0.0019837390703779894, + "loss": 0.1758, + "step": 6758 + }, + { + "epoch": 0.058671365699950524, + "grad_norm": 0.220703125, + "learning_rate": 0.001983733437869236, + "loss": 0.1201, + "step": 6759 + }, + { + "epoch": 0.05868004618015469, + "grad_norm": 0.10986328125, + "learning_rate": 0.0019837278043940447, + "loss": 0.1357, + "step": 6760 + }, + { + "epoch": 0.058688726660358854, + "grad_norm": 0.2080078125, + "learning_rate": 0.0019837221699524218, + "loss": 0.1572, + "step": 6761 + }, + { + "epoch": 0.05869740714056301, + "grad_norm": 0.4921875, + "learning_rate": 0.001983716534544374, + "loss": 0.1562, + "step": 6762 + }, + { + "epoch": 0.05870608762076718, + "grad_norm": 0.33984375, + "learning_rate": 0.001983710898169907, + "loss": 0.1172, + "step": 6763 + }, + { + "epoch": 0.05871476810097134, + "grad_norm": 0.8125, + "learning_rate": 0.001983705260829027, + "loss": 0.1602, + "step": 6764 + }, + { + "epoch": 0.05872344858117551, + "grad_norm": 0.158203125, + "learning_rate": 0.0019836996225217397, + "loss": 0.1147, + "step": 6765 + }, + { + "epoch": 0.05873212906137967, + "grad_norm": 0.11376953125, + "learning_rate": 0.001983693983248052, + "loss": 0.1641, + "step": 6766 + }, + { + "epoch": 0.05874080954158384, + "grad_norm": 0.84765625, + "learning_rate": 0.00198368834300797, + "loss": 0.1504, + "step": 6767 + }, + { + "epoch": 0.058749490021788, + "grad_norm": 0.55859375, + "learning_rate": 0.0019836827018015, + "loss": 0.1553, + "step": 6768 + }, + { + "epoch": 0.05875817050199217, + "grad_norm": 0.07666015625, + "learning_rate": 0.0019836770596286476, + "loss": 0.1133, + "step": 6769 + }, + { + "epoch": 0.058766850982196334, + "grad_norm": 0.21484375, + "learning_rate": 0.001983671416489419, + "loss": 0.1465, + "step": 6770 + }, + { + "epoch": 0.0587755314624005, + "grad_norm": 0.384765625, + "learning_rate": 0.001983665772383821, + "loss": 0.1094, + "step": 6771 + }, + { + "epoch": 0.058784211942604664, + "grad_norm": 0.0693359375, + "learning_rate": 0.001983660127311859, + "loss": 0.1157, + "step": 6772 + }, + { + "epoch": 0.05879289242280883, + "grad_norm": 3.609375, + "learning_rate": 0.00198365448127354, + "loss": 0.2578, + "step": 6773 + }, + { + "epoch": 0.058801572903012994, + "grad_norm": 0.09326171875, + "learning_rate": 0.001983648834268869, + "loss": 0.1289, + "step": 6774 + }, + { + "epoch": 0.05881025338321716, + "grad_norm": 0.078125, + "learning_rate": 0.0019836431862978537, + "loss": 0.126, + "step": 6775 + }, + { + "epoch": 0.058818933863421324, + "grad_norm": 0.58203125, + "learning_rate": 0.0019836375373604993, + 
"loss": 0.1309, + "step": 6776 + }, + { + "epoch": 0.05882761434362549, + "grad_norm": 0.9453125, + "learning_rate": 0.001983631887456812, + "loss": 0.167, + "step": 6777 + }, + { + "epoch": 0.058836294823829655, + "grad_norm": 0.09326171875, + "learning_rate": 0.001983626236586798, + "loss": 0.1523, + "step": 6778 + }, + { + "epoch": 0.05884497530403382, + "grad_norm": 0.091796875, + "learning_rate": 0.001983620584750464, + "loss": 0.127, + "step": 6779 + }, + { + "epoch": 0.058853655784237985, + "grad_norm": 0.275390625, + "learning_rate": 0.0019836149319478157, + "loss": 0.1406, + "step": 6780 + }, + { + "epoch": 0.05886233626444215, + "grad_norm": 0.322265625, + "learning_rate": 0.001983609278178859, + "loss": 0.125, + "step": 6781 + }, + { + "epoch": 0.058871016744646315, + "grad_norm": 0.12353515625, + "learning_rate": 0.001983603623443601, + "loss": 0.1475, + "step": 6782 + }, + { + "epoch": 0.05887969722485048, + "grad_norm": 0.66015625, + "learning_rate": 0.0019835979677420477, + "loss": 0.1426, + "step": 6783 + }, + { + "epoch": 0.058888377705054645, + "grad_norm": 0.16015625, + "learning_rate": 0.001983592311074204, + "loss": 0.1377, + "step": 6784 + }, + { + "epoch": 0.05889705818525881, + "grad_norm": 0.19140625, + "learning_rate": 0.001983586653440077, + "loss": 0.1875, + "step": 6785 + }, + { + "epoch": 0.058905738665462976, + "grad_norm": 0.19921875, + "learning_rate": 0.0019835809948396737, + "loss": 0.1289, + "step": 6786 + }, + { + "epoch": 0.05891441914566714, + "grad_norm": 0.08740234375, + "learning_rate": 0.001983575335272999, + "loss": 0.1045, + "step": 6787 + }, + { + "epoch": 0.058923099625871306, + "grad_norm": 0.28515625, + "learning_rate": 0.00198356967474006, + "loss": 0.1426, + "step": 6788 + }, + { + "epoch": 0.05893178010607547, + "grad_norm": 0.142578125, + "learning_rate": 0.0019835640132408617, + "loss": 0.1572, + "step": 6789 + }, + { + "epoch": 0.058940460586279636, + "grad_norm": 0.1201171875, + "learning_rate": 0.0019835583507754116, + "loss": 0.1338, + "step": 6790 + }, + { + "epoch": 0.0589491410664838, + "grad_norm": 0.462890625, + "learning_rate": 0.0019835526873437158, + "loss": 0.1865, + "step": 6791 + }, + { + "epoch": 0.05895782154668796, + "grad_norm": 0.3046875, + "learning_rate": 0.0019835470229457794, + "loss": 0.1758, + "step": 6792 + }, + { + "epoch": 0.058966502026892124, + "grad_norm": 0.2314453125, + "learning_rate": 0.0019835413575816094, + "loss": 0.1191, + "step": 6793 + }, + { + "epoch": 0.05897518250709629, + "grad_norm": 0.123046875, + "learning_rate": 0.001983535691251212, + "loss": 0.1299, + "step": 6794 + }, + { + "epoch": 0.058983862987300455, + "grad_norm": 0.81640625, + "learning_rate": 0.0019835300239545934, + "loss": 0.1338, + "step": 6795 + }, + { + "epoch": 0.05899254346750462, + "grad_norm": 0.1162109375, + "learning_rate": 0.001983524355691759, + "loss": 0.126, + "step": 6796 + }, + { + "epoch": 0.059001223947708785, + "grad_norm": 0.1533203125, + "learning_rate": 0.0019835186864627158, + "loss": 0.2129, + "step": 6797 + }, + { + "epoch": 0.05900990442791295, + "grad_norm": 0.12255859375, + "learning_rate": 0.0019835130162674706, + "loss": 0.1377, + "step": 6798 + }, + { + "epoch": 0.059018584908117115, + "grad_norm": 0.94921875, + "learning_rate": 0.001983507345106028, + "loss": 0.1201, + "step": 6799 + }, + { + "epoch": 0.05902726538832128, + "grad_norm": 3.203125, + "learning_rate": 0.001983501672978395, + "loss": 0.1309, + "step": 6800 + }, + { + "epoch": 0.059035945868525445, + "grad_norm": 0.70703125, + 
"learning_rate": 0.0019834959998845784, + "loss": 0.1445, + "step": 6801 + }, + { + "epoch": 0.05904462634872961, + "grad_norm": 0.08837890625, + "learning_rate": 0.001983490325824584, + "loss": 0.125, + "step": 6802 + }, + { + "epoch": 0.059053306828933776, + "grad_norm": 0.515625, + "learning_rate": 0.001983484650798417, + "loss": 0.1611, + "step": 6803 + }, + { + "epoch": 0.05906198730913794, + "grad_norm": 1.421875, + "learning_rate": 0.001983478974806085, + "loss": 0.2285, + "step": 6804 + }, + { + "epoch": 0.059070667789342106, + "grad_norm": 0.318359375, + "learning_rate": 0.0019834732978475937, + "loss": 0.1455, + "step": 6805 + }, + { + "epoch": 0.05907934826954627, + "grad_norm": 0.408203125, + "learning_rate": 0.001983467619922949, + "loss": 0.1465, + "step": 6806 + }, + { + "epoch": 0.059088028749750436, + "grad_norm": 0.5, + "learning_rate": 0.0019834619410321577, + "loss": 0.2422, + "step": 6807 + }, + { + "epoch": 0.0590967092299546, + "grad_norm": 0.333984375, + "learning_rate": 0.0019834562611752257, + "loss": 0.1182, + "step": 6808 + }, + { + "epoch": 0.059105389710158766, + "grad_norm": 0.5390625, + "learning_rate": 0.001983450580352159, + "loss": 0.1396, + "step": 6809 + }, + { + "epoch": 0.05911407019036293, + "grad_norm": 0.55078125, + "learning_rate": 0.0019834448985629636, + "loss": 0.1406, + "step": 6810 + }, + { + "epoch": 0.0591227506705671, + "grad_norm": 0.87109375, + "learning_rate": 0.0019834392158076474, + "loss": 0.2041, + "step": 6811 + }, + { + "epoch": 0.05913143115077126, + "grad_norm": 0.38671875, + "learning_rate": 0.0019834335320862142, + "loss": 0.1367, + "step": 6812 + }, + { + "epoch": 0.05914011163097543, + "grad_norm": 0.365234375, + "learning_rate": 0.001983427847398672, + "loss": 0.1226, + "step": 6813 + }, + { + "epoch": 0.05914879211117959, + "grad_norm": 0.1953125, + "learning_rate": 0.0019834221617450257, + "loss": 0.1465, + "step": 6814 + }, + { + "epoch": 0.05915747259138376, + "grad_norm": 0.55859375, + "learning_rate": 0.001983416475125283, + "loss": 0.1465, + "step": 6815 + }, + { + "epoch": 0.05916615307158792, + "grad_norm": 0.345703125, + "learning_rate": 0.0019834107875394487, + "loss": 0.1162, + "step": 6816 + }, + { + "epoch": 0.05917483355179209, + "grad_norm": 0.62890625, + "learning_rate": 0.0019834050989875297, + "loss": 0.1602, + "step": 6817 + }, + { + "epoch": 0.05918351403199625, + "grad_norm": 0.1025390625, + "learning_rate": 0.0019833994094695323, + "loss": 0.127, + "step": 6818 + }, + { + "epoch": 0.05919219451220042, + "grad_norm": 0.10107421875, + "learning_rate": 0.0019833937189854627, + "loss": 0.1504, + "step": 6819 + }, + { + "epoch": 0.05920087499240458, + "grad_norm": 0.423828125, + "learning_rate": 0.001983388027535327, + "loss": 0.1641, + "step": 6820 + }, + { + "epoch": 0.05920955547260875, + "grad_norm": 0.41015625, + "learning_rate": 0.0019833823351191313, + "loss": 0.1533, + "step": 6821 + }, + { + "epoch": 0.05921823595281291, + "grad_norm": 0.58984375, + "learning_rate": 0.0019833766417368818, + "loss": 0.166, + "step": 6822 + }, + { + "epoch": 0.05922691643301707, + "grad_norm": 0.267578125, + "learning_rate": 0.001983370947388585, + "loss": 0.168, + "step": 6823 + }, + { + "epoch": 0.059235596913221236, + "grad_norm": 0.1181640625, + "learning_rate": 0.0019833652520742473, + "loss": 0.1182, + "step": 6824 + }, + { + "epoch": 0.0592442773934254, + "grad_norm": 0.193359375, + "learning_rate": 0.001983359555793874, + "loss": 0.1465, + "step": 6825 + }, + { + "epoch": 0.059252957873629566, + "grad_norm": 
0.98046875, + "learning_rate": 0.001983353858547473, + "loss": 0.1133, + "step": 6826 + }, + { + "epoch": 0.05926163835383373, + "grad_norm": 0.400390625, + "learning_rate": 0.0019833481603350484, + "loss": 0.1094, + "step": 6827 + }, + { + "epoch": 0.0592703188340379, + "grad_norm": 0.640625, + "learning_rate": 0.001983342461156608, + "loss": 0.1348, + "step": 6828 + }, + { + "epoch": 0.05927899931424206, + "grad_norm": 0.353515625, + "learning_rate": 0.001983336761012158, + "loss": 0.1738, + "step": 6829 + }, + { + "epoch": 0.05928767979444623, + "grad_norm": 0.20703125, + "learning_rate": 0.0019833310599017035, + "loss": 0.1279, + "step": 6830 + }, + { + "epoch": 0.05929636027465039, + "grad_norm": 0.1669921875, + "learning_rate": 0.0019833253578252514, + "loss": 0.1299, + "step": 6831 + }, + { + "epoch": 0.05930504075485456, + "grad_norm": 0.1044921875, + "learning_rate": 0.001983319654782808, + "loss": 0.125, + "step": 6832 + }, + { + "epoch": 0.05931372123505872, + "grad_norm": 0.341796875, + "learning_rate": 0.00198331395077438, + "loss": 0.1123, + "step": 6833 + }, + { + "epoch": 0.05932240171526289, + "grad_norm": 0.52734375, + "learning_rate": 0.001983308245799973, + "loss": 0.1328, + "step": 6834 + }, + { + "epoch": 0.05933108219546705, + "grad_norm": 0.447265625, + "learning_rate": 0.001983302539859593, + "loss": 0.2012, + "step": 6835 + }, + { + "epoch": 0.05933976267567122, + "grad_norm": 0.6796875, + "learning_rate": 0.001983296832953247, + "loss": 0.1387, + "step": 6836 + }, + { + "epoch": 0.05934844315587538, + "grad_norm": 1.0078125, + "learning_rate": 0.0019832911250809403, + "loss": 0.1406, + "step": 6837 + }, + { + "epoch": 0.05935712363607955, + "grad_norm": 0.373046875, + "learning_rate": 0.0019832854162426805, + "loss": 0.1426, + "step": 6838 + }, + { + "epoch": 0.05936580411628371, + "grad_norm": 0.341796875, + "learning_rate": 0.001983279706438472, + "loss": 0.106, + "step": 6839 + }, + { + "epoch": 0.05937448459648788, + "grad_norm": 0.330078125, + "learning_rate": 0.001983273995668323, + "loss": 0.1113, + "step": 6840 + }, + { + "epoch": 0.05938316507669204, + "grad_norm": 1.7421875, + "learning_rate": 0.0019832682839322387, + "loss": 0.1406, + "step": 6841 + }, + { + "epoch": 0.05939184555689621, + "grad_norm": 0.1455078125, + "learning_rate": 0.001983262571230225, + "loss": 0.1309, + "step": 6842 + }, + { + "epoch": 0.059400526037100373, + "grad_norm": 0.53515625, + "learning_rate": 0.001983256857562289, + "loss": 0.1895, + "step": 6843 + }, + { + "epoch": 0.05940920651730454, + "grad_norm": 0.578125, + "learning_rate": 0.0019832511429284366, + "loss": 0.1084, + "step": 6844 + }, + { + "epoch": 0.059417886997508704, + "grad_norm": 0.212890625, + "learning_rate": 0.0019832454273286735, + "loss": 0.1504, + "step": 6845 + }, + { + "epoch": 0.05942656747771287, + "grad_norm": 0.119140625, + "learning_rate": 0.0019832397107630074, + "loss": 0.1445, + "step": 6846 + }, + { + "epoch": 0.059435247957917034, + "grad_norm": 0.146484375, + "learning_rate": 0.001983233993231443, + "loss": 0.1699, + "step": 6847 + }, + { + "epoch": 0.0594439284381212, + "grad_norm": 0.203125, + "learning_rate": 0.001983228274733987, + "loss": 0.1289, + "step": 6848 + }, + { + "epoch": 0.059452608918325364, + "grad_norm": 0.84375, + "learning_rate": 0.001983222555270646, + "loss": 0.1855, + "step": 6849 + }, + { + "epoch": 0.05946128939852953, + "grad_norm": 0.20703125, + "learning_rate": 0.0019832168348414268, + "loss": 0.1982, + "step": 6850 + }, + { + "epoch": 0.059469969878733694, + 
"grad_norm": 1.0859375, + "learning_rate": 0.001983211113446334, + "loss": 0.1797, + "step": 6851 + }, + { + "epoch": 0.05947865035893786, + "grad_norm": 0.890625, + "learning_rate": 0.001983205391085375, + "loss": 0.1494, + "step": 6852 + }, + { + "epoch": 0.059487330839142025, + "grad_norm": 0.2080078125, + "learning_rate": 0.0019831996677585565, + "loss": 0.1582, + "step": 6853 + }, + { + "epoch": 0.05949601131934618, + "grad_norm": 0.228515625, + "learning_rate": 0.0019831939434658834, + "loss": 0.1104, + "step": 6854 + }, + { + "epoch": 0.05950469179955035, + "grad_norm": 0.6484375, + "learning_rate": 0.0019831882182073626, + "loss": 0.1289, + "step": 6855 + }, + { + "epoch": 0.05951337227975451, + "grad_norm": 0.361328125, + "learning_rate": 0.0019831824919830008, + "loss": 0.1699, + "step": 6856 + }, + { + "epoch": 0.05952205275995868, + "grad_norm": 0.443359375, + "learning_rate": 0.001983176764792804, + "loss": 0.1689, + "step": 6857 + }, + { + "epoch": 0.05953073324016284, + "grad_norm": 0.5703125, + "learning_rate": 0.001983171036636778, + "loss": 0.1089, + "step": 6858 + }, + { + "epoch": 0.05953941372036701, + "grad_norm": 0.279296875, + "learning_rate": 0.0019831653075149296, + "loss": 0.1436, + "step": 6859 + }, + { + "epoch": 0.059548094200571174, + "grad_norm": 0.408203125, + "learning_rate": 0.001983159577427265, + "loss": 0.1211, + "step": 6860 + }, + { + "epoch": 0.05955677468077534, + "grad_norm": 0.3359375, + "learning_rate": 0.00198315384637379, + "loss": 0.1641, + "step": 6861 + }, + { + "epoch": 0.059565455160979504, + "grad_norm": 0.2080078125, + "learning_rate": 0.0019831481143545117, + "loss": 0.1289, + "step": 6862 + }, + { + "epoch": 0.05957413564118367, + "grad_norm": 0.380859375, + "learning_rate": 0.0019831423813694354, + "loss": 0.166, + "step": 6863 + }, + { + "epoch": 0.059582816121387834, + "grad_norm": 0.3359375, + "learning_rate": 0.0019831366474185683, + "loss": 0.166, + "step": 6864 + }, + { + "epoch": 0.059591496601592, + "grad_norm": 0.3828125, + "learning_rate": 0.001983130912501916, + "loss": 0.166, + "step": 6865 + }, + { + "epoch": 0.059600177081796164, + "grad_norm": 0.349609375, + "learning_rate": 0.001983125176619485, + "loss": 0.1484, + "step": 6866 + }, + { + "epoch": 0.05960885756200033, + "grad_norm": 0.376953125, + "learning_rate": 0.0019831194397712817, + "loss": 0.1455, + "step": 6867 + }, + { + "epoch": 0.059617538042204495, + "grad_norm": 0.69140625, + "learning_rate": 0.001983113701957312, + "loss": 0.1807, + "step": 6868 + }, + { + "epoch": 0.05962621852240866, + "grad_norm": 0.91015625, + "learning_rate": 0.001983107963177583, + "loss": 0.1426, + "step": 6869 + }, + { + "epoch": 0.059634899002612825, + "grad_norm": 0.271484375, + "learning_rate": 0.0019831022234320997, + "loss": 0.1289, + "step": 6870 + }, + { + "epoch": 0.05964357948281699, + "grad_norm": 0.64453125, + "learning_rate": 0.001983096482720869, + "loss": 0.1221, + "step": 6871 + }, + { + "epoch": 0.059652259963021155, + "grad_norm": 0.28515625, + "learning_rate": 0.001983090741043898, + "loss": 0.1455, + "step": 6872 + }, + { + "epoch": 0.05966094044322532, + "grad_norm": 0.296875, + "learning_rate": 0.001983084998401192, + "loss": 0.0996, + "step": 6873 + }, + { + "epoch": 0.059669620923429485, + "grad_norm": 0.443359375, + "learning_rate": 0.001983079254792757, + "loss": 0.0903, + "step": 6874 + }, + { + "epoch": 0.05967830140363365, + "grad_norm": 0.263671875, + "learning_rate": 0.0019830735102186006, + "loss": 0.126, + "step": 6875 + }, + { + "epoch": 
0.059686981883837815, + "grad_norm": 0.12109375, + "learning_rate": 0.0019830677646787277, + "loss": 0.0962, + "step": 6876 + }, + { + "epoch": 0.05969566236404198, + "grad_norm": 0.44140625, + "learning_rate": 0.001983062018173145, + "loss": 0.1699, + "step": 6877 + }, + { + "epoch": 0.059704342844246146, + "grad_norm": 0.1455078125, + "learning_rate": 0.0019830562707018598, + "loss": 0.1455, + "step": 6878 + }, + { + "epoch": 0.05971302332445031, + "grad_norm": 0.26953125, + "learning_rate": 0.001983050522264877, + "loss": 0.1465, + "step": 6879 + }, + { + "epoch": 0.059721703804654476, + "grad_norm": 0.58203125, + "learning_rate": 0.0019830447728622033, + "loss": 0.1245, + "step": 6880 + }, + { + "epoch": 0.05973038428485864, + "grad_norm": 0.125, + "learning_rate": 0.001983039022493845, + "loss": 0.1128, + "step": 6881 + }, + { + "epoch": 0.059739064765062806, + "grad_norm": 0.1650390625, + "learning_rate": 0.0019830332711598085, + "loss": 0.0996, + "step": 6882 + }, + { + "epoch": 0.05974774524526697, + "grad_norm": 1.0625, + "learning_rate": 0.0019830275188601003, + "loss": 0.1123, + "step": 6883 + }, + { + "epoch": 0.059756425725471136, + "grad_norm": 0.2255859375, + "learning_rate": 0.0019830217655947267, + "loss": 0.1133, + "step": 6884 + }, + { + "epoch": 0.059765106205675295, + "grad_norm": 0.3828125, + "learning_rate": 0.0019830160113636936, + "loss": 0.1309, + "step": 6885 + }, + { + "epoch": 0.05977378668587946, + "grad_norm": 1.4375, + "learning_rate": 0.001983010256167007, + "loss": 0.1523, + "step": 6886 + }, + { + "epoch": 0.059782467166083625, + "grad_norm": 0.11376953125, + "learning_rate": 0.0019830045000046744, + "loss": 0.1426, + "step": 6887 + }, + { + "epoch": 0.05979114764628779, + "grad_norm": 0.470703125, + "learning_rate": 0.001982998742876701, + "loss": 0.1104, + "step": 6888 + }, + { + "epoch": 0.059799828126491955, + "grad_norm": 0.09423828125, + "learning_rate": 0.001982992984783093, + "loss": 0.1533, + "step": 6889 + }, + { + "epoch": 0.05980850860669612, + "grad_norm": 0.05908203125, + "learning_rate": 0.001982987225723858, + "loss": 0.1113, + "step": 6890 + }, + { + "epoch": 0.059817189086900285, + "grad_norm": 0.12890625, + "learning_rate": 0.0019829814656990006, + "loss": 0.1592, + "step": 6891 + }, + { + "epoch": 0.05982586956710445, + "grad_norm": 0.4375, + "learning_rate": 0.001982975704708528, + "loss": 0.1777, + "step": 6892 + }, + { + "epoch": 0.059834550047308616, + "grad_norm": 0.08349609375, + "learning_rate": 0.001982969942752447, + "loss": 0.1406, + "step": 6893 + }, + { + "epoch": 0.05984323052751278, + "grad_norm": 1.6875, + "learning_rate": 0.0019829641798307634, + "loss": 0.1895, + "step": 6894 + }, + { + "epoch": 0.059851911007716946, + "grad_norm": 0.58203125, + "learning_rate": 0.001982958415943483, + "loss": 0.1328, + "step": 6895 + }, + { + "epoch": 0.05986059148792111, + "grad_norm": 0.1376953125, + "learning_rate": 0.0019829526510906125, + "loss": 0.1416, + "step": 6896 + }, + { + "epoch": 0.059869271968125276, + "grad_norm": 0.4609375, + "learning_rate": 0.0019829468852721587, + "loss": 0.1641, + "step": 6897 + }, + { + "epoch": 0.05987795244832944, + "grad_norm": 0.60546875, + "learning_rate": 0.001982941118488127, + "loss": 0.126, + "step": 6898 + }, + { + "epoch": 0.059886632928533606, + "grad_norm": 0.26171875, + "learning_rate": 0.0019829353507385245, + "loss": 0.1816, + "step": 6899 + }, + { + "epoch": 0.05989531340873777, + "grad_norm": 0.1669921875, + "learning_rate": 0.0019829295820233566, + "loss": 0.1377, + "step": 
6900 + }, + { + "epoch": 0.05990399388894194, + "grad_norm": 0.84765625, + "learning_rate": 0.001982923812342631, + "loss": 0.1758, + "step": 6901 + }, + { + "epoch": 0.0599126743691461, + "grad_norm": 0.68359375, + "learning_rate": 0.0019829180416963527, + "loss": 0.2656, + "step": 6902 + }, + { + "epoch": 0.05992135484935027, + "grad_norm": 0.2158203125, + "learning_rate": 0.0019829122700845284, + "loss": 0.1416, + "step": 6903 + }, + { + "epoch": 0.05993003532955443, + "grad_norm": 0.283203125, + "learning_rate": 0.001982906497507165, + "loss": 0.1162, + "step": 6904 + }, + { + "epoch": 0.0599387158097586, + "grad_norm": 0.5, + "learning_rate": 0.0019829007239642676, + "loss": 0.1865, + "step": 6905 + }, + { + "epoch": 0.05994739628996276, + "grad_norm": 0.27734375, + "learning_rate": 0.001982894949455844, + "loss": 0.1045, + "step": 6906 + }, + { + "epoch": 0.05995607677016693, + "grad_norm": 0.306640625, + "learning_rate": 0.001982889173981899, + "loss": 0.1104, + "step": 6907 + }, + { + "epoch": 0.05996475725037109, + "grad_norm": 0.1787109375, + "learning_rate": 0.00198288339754244, + "loss": 0.1455, + "step": 6908 + }, + { + "epoch": 0.05997343773057526, + "grad_norm": 0.7890625, + "learning_rate": 0.001982877620137473, + "loss": 0.1309, + "step": 6909 + }, + { + "epoch": 0.05998211821077942, + "grad_norm": 0.08837890625, + "learning_rate": 0.001982871841767005, + "loss": 0.1309, + "step": 6910 + }, + { + "epoch": 0.05999079869098359, + "grad_norm": 0.1884765625, + "learning_rate": 0.0019828660624310406, + "loss": 0.1406, + "step": 6911 + }, + { + "epoch": 0.05999947917118775, + "grad_norm": 0.322265625, + "learning_rate": 0.0019828602821295874, + "loss": 0.1465, + "step": 6912 + }, + { + "epoch": 0.06000815965139192, + "grad_norm": 0.09619140625, + "learning_rate": 0.0019828545008626513, + "loss": 0.1299, + "step": 6913 + }, + { + "epoch": 0.06001684013159608, + "grad_norm": 0.29296875, + "learning_rate": 0.0019828487186302392, + "loss": 0.1475, + "step": 6914 + }, + { + "epoch": 0.06002552061180025, + "grad_norm": 0.44921875, + "learning_rate": 0.001982842935432357, + "loss": 0.1182, + "step": 6915 + }, + { + "epoch": 0.060034201092004406, + "grad_norm": 0.11181640625, + "learning_rate": 0.0019828371512690107, + "loss": 0.1631, + "step": 6916 + }, + { + "epoch": 0.06004288157220857, + "grad_norm": 0.703125, + "learning_rate": 0.001982831366140207, + "loss": 0.127, + "step": 6917 + }, + { + "epoch": 0.06005156205241274, + "grad_norm": 0.49609375, + "learning_rate": 0.0019828255800459524, + "loss": 0.5977, + "step": 6918 + }, + { + "epoch": 0.0600602425326169, + "grad_norm": 0.298828125, + "learning_rate": 0.001982819792986253, + "loss": 0.1348, + "step": 6919 + }, + { + "epoch": 0.06006892301282107, + "grad_norm": 0.076171875, + "learning_rate": 0.001982814004961115, + "loss": 0.124, + "step": 6920 + }, + { + "epoch": 0.06007760349302523, + "grad_norm": 0.38671875, + "learning_rate": 0.001982808215970545, + "loss": 0.1611, + "step": 6921 + }, + { + "epoch": 0.0600862839732294, + "grad_norm": 0.154296875, + "learning_rate": 0.001982802426014549, + "loss": 0.1758, + "step": 6922 + }, + { + "epoch": 0.06009496445343356, + "grad_norm": 0.1884765625, + "learning_rate": 0.0019827966350931332, + "loss": 0.124, + "step": 6923 + }, + { + "epoch": 0.06010364493363773, + "grad_norm": 0.59375, + "learning_rate": 0.001982790843206305, + "loss": 0.1602, + "step": 6924 + }, + { + "epoch": 0.06011232541384189, + "grad_norm": 0.69921875, + "learning_rate": 0.00198278505035407, + "loss": 0.1602, + 
"step": 6925 + }, + { + "epoch": 0.06012100589404606, + "grad_norm": 0.76171875, + "learning_rate": 0.0019827792565364336, + "loss": 0.1738, + "step": 6926 + }, + { + "epoch": 0.06012968637425022, + "grad_norm": 0.474609375, + "learning_rate": 0.001982773461753404, + "loss": 0.1865, + "step": 6927 + }, + { + "epoch": 0.06013836685445439, + "grad_norm": 0.1826171875, + "learning_rate": 0.0019827676660049863, + "loss": 0.1592, + "step": 6928 + }, + { + "epoch": 0.06014704733465855, + "grad_norm": 1.125, + "learning_rate": 0.001982761869291187, + "loss": 0.1133, + "step": 6929 + }, + { + "epoch": 0.06015572781486272, + "grad_norm": 0.65625, + "learning_rate": 0.0019827560716120128, + "loss": 0.1465, + "step": 6930 + }, + { + "epoch": 0.06016440829506688, + "grad_norm": 0.0888671875, + "learning_rate": 0.00198275027296747, + "loss": 0.127, + "step": 6931 + }, + { + "epoch": 0.06017308877527105, + "grad_norm": 0.2890625, + "learning_rate": 0.0019827444733575643, + "loss": 0.1543, + "step": 6932 + }, + { + "epoch": 0.06018176925547521, + "grad_norm": 0.50390625, + "learning_rate": 0.0019827386727823025, + "loss": 0.1172, + "step": 6933 + }, + { + "epoch": 0.06019044973567938, + "grad_norm": 0.0810546875, + "learning_rate": 0.0019827328712416913, + "loss": 0.1289, + "step": 6934 + }, + { + "epoch": 0.060199130215883544, + "grad_norm": 0.1611328125, + "learning_rate": 0.001982727068735737, + "loss": 0.1094, + "step": 6935 + }, + { + "epoch": 0.06020781069608771, + "grad_norm": 0.322265625, + "learning_rate": 0.0019827212652644447, + "loss": 0.1279, + "step": 6936 + }, + { + "epoch": 0.060216491176291874, + "grad_norm": 0.50390625, + "learning_rate": 0.0019827154608278224, + "loss": 0.1221, + "step": 6937 + }, + { + "epoch": 0.06022517165649604, + "grad_norm": 0.11474609375, + "learning_rate": 0.0019827096554258755, + "loss": 0.1367, + "step": 6938 + }, + { + "epoch": 0.060233852136700204, + "grad_norm": 0.515625, + "learning_rate": 0.0019827038490586105, + "loss": 0.1147, + "step": 6939 + }, + { + "epoch": 0.06024253261690437, + "grad_norm": 0.140625, + "learning_rate": 0.0019826980417260343, + "loss": 0.1406, + "step": 6940 + }, + { + "epoch": 0.060251213097108534, + "grad_norm": 0.6015625, + "learning_rate": 0.001982692233428152, + "loss": 0.1357, + "step": 6941 + }, + { + "epoch": 0.0602598935773127, + "grad_norm": 0.2119140625, + "learning_rate": 0.0019826864241649714, + "loss": 0.1211, + "step": 6942 + }, + { + "epoch": 0.060268574057516865, + "grad_norm": 0.345703125, + "learning_rate": 0.001982680613936498, + "loss": 0.1143, + "step": 6943 + }, + { + "epoch": 0.06027725453772103, + "grad_norm": 0.216796875, + "learning_rate": 0.0019826748027427384, + "loss": 0.1738, + "step": 6944 + }, + { + "epoch": 0.060285935017925195, + "grad_norm": 0.462890625, + "learning_rate": 0.0019826689905836988, + "loss": 0.1055, + "step": 6945 + }, + { + "epoch": 0.06029461549812935, + "grad_norm": 0.166015625, + "learning_rate": 0.0019826631774593857, + "loss": 0.1582, + "step": 6946 + }, + { + "epoch": 0.06030329597833352, + "grad_norm": 0.8515625, + "learning_rate": 0.0019826573633698052, + "loss": 0.1084, + "step": 6947 + }, + { + "epoch": 0.06031197645853768, + "grad_norm": 0.205078125, + "learning_rate": 0.001982651548314964, + "loss": 0.1572, + "step": 6948 + }, + { + "epoch": 0.06032065693874185, + "grad_norm": 0.240234375, + "learning_rate": 0.0019826457322948683, + "loss": 0.2051, + "step": 6949 + }, + { + "epoch": 0.060329337418946014, + "grad_norm": 0.80078125, + "learning_rate": 
0.0019826399153095244, + "loss": 0.1143, + "step": 6950 + }, + { + "epoch": 0.06033801789915018, + "grad_norm": 0.58984375, + "learning_rate": 0.001982634097358939, + "loss": 0.1758, + "step": 6951 + }, + { + "epoch": 0.060346698379354344, + "grad_norm": 1.2109375, + "learning_rate": 0.001982628278443118, + "loss": 0.1045, + "step": 6952 + }, + { + "epoch": 0.06035537885955851, + "grad_norm": 0.291015625, + "learning_rate": 0.0019826224585620684, + "loss": 0.1387, + "step": 6953 + }, + { + "epoch": 0.060364059339762674, + "grad_norm": 0.142578125, + "learning_rate": 0.001982616637715796, + "loss": 0.1191, + "step": 6954 + }, + { + "epoch": 0.06037273981996684, + "grad_norm": 0.07177734375, + "learning_rate": 0.001982610815904307, + "loss": 0.1162, + "step": 6955 + }, + { + "epoch": 0.060381420300171004, + "grad_norm": 0.150390625, + "learning_rate": 0.001982604993127608, + "loss": 0.1289, + "step": 6956 + }, + { + "epoch": 0.06039010078037517, + "grad_norm": 0.3203125, + "learning_rate": 0.001982599169385706, + "loss": 0.165, + "step": 6957 + }, + { + "epoch": 0.060398781260579334, + "grad_norm": 0.150390625, + "learning_rate": 0.0019825933446786063, + "loss": 0.1699, + "step": 6958 + }, + { + "epoch": 0.0604074617407835, + "grad_norm": 0.07958984375, + "learning_rate": 0.001982587519006316, + "loss": 0.1523, + "step": 6959 + }, + { + "epoch": 0.060416142220987665, + "grad_norm": 0.1357421875, + "learning_rate": 0.001982581692368841, + "loss": 0.1543, + "step": 6960 + }, + { + "epoch": 0.06042482270119183, + "grad_norm": 0.279296875, + "learning_rate": 0.0019825758647661882, + "loss": 0.1689, + "step": 6961 + }, + { + "epoch": 0.060433503181395995, + "grad_norm": 0.9921875, + "learning_rate": 0.0019825700361983637, + "loss": 0.1465, + "step": 6962 + }, + { + "epoch": 0.06044218366160016, + "grad_norm": 0.921875, + "learning_rate": 0.0019825642066653736, + "loss": 0.1152, + "step": 6963 + }, + { + "epoch": 0.060450864141804325, + "grad_norm": 0.302734375, + "learning_rate": 0.0019825583761672247, + "loss": 0.1504, + "step": 6964 + }, + { + "epoch": 0.06045954462200849, + "grad_norm": 0.234375, + "learning_rate": 0.0019825525447039233, + "loss": 0.0996, + "step": 6965 + }, + { + "epoch": 0.060468225102212655, + "grad_norm": 0.16796875, + "learning_rate": 0.0019825467122754755, + "loss": 0.1069, + "step": 6966 + }, + { + "epoch": 0.06047690558241682, + "grad_norm": 0.353515625, + "learning_rate": 0.001982540878881888, + "loss": 0.1094, + "step": 6967 + }, + { + "epoch": 0.060485586062620986, + "grad_norm": 0.2578125, + "learning_rate": 0.001982535044523167, + "loss": 0.1328, + "step": 6968 + }, + { + "epoch": 0.06049426654282515, + "grad_norm": 0.78125, + "learning_rate": 0.001982529209199319, + "loss": 0.1934, + "step": 6969 + }, + { + "epoch": 0.060502947023029316, + "grad_norm": 0.8046875, + "learning_rate": 0.0019825233729103505, + "loss": 0.1206, + "step": 6970 + }, + { + "epoch": 0.06051162750323348, + "grad_norm": 0.53515625, + "learning_rate": 0.0019825175356562677, + "loss": 0.1309, + "step": 6971 + }, + { + "epoch": 0.060520307983437646, + "grad_norm": 0.1279296875, + "learning_rate": 0.0019825116974370766, + "loss": 0.168, + "step": 6972 + }, + { + "epoch": 0.06052898846364181, + "grad_norm": 0.0751953125, + "learning_rate": 0.0019825058582527842, + "loss": 0.1523, + "step": 6973 + }, + { + "epoch": 0.060537668943845976, + "grad_norm": 0.283203125, + "learning_rate": 0.0019825000181033967, + "loss": 0.1445, + "step": 6974 + }, + { + "epoch": 0.06054634942405014, + "grad_norm": 
0.296875, + "learning_rate": 0.0019824941769889205, + "loss": 0.1416, + "step": 6975 + }, + { + "epoch": 0.06055502990425431, + "grad_norm": 0.79296875, + "learning_rate": 0.0019824883349093612, + "loss": 0.1201, + "step": 6976 + }, + { + "epoch": 0.060563710384458465, + "grad_norm": 0.62890625, + "learning_rate": 0.0019824824918647267, + "loss": 0.1504, + "step": 6977 + }, + { + "epoch": 0.06057239086466263, + "grad_norm": 0.10986328125, + "learning_rate": 0.0019824766478550226, + "loss": 0.1172, + "step": 6978 + }, + { + "epoch": 0.060581071344866795, + "grad_norm": 0.1416015625, + "learning_rate": 0.001982470802880255, + "loss": 0.1426, + "step": 6979 + }, + { + "epoch": 0.06058975182507096, + "grad_norm": 0.1591796875, + "learning_rate": 0.0019824649569404307, + "loss": 0.1177, + "step": 6980 + }, + { + "epoch": 0.060598432305275125, + "grad_norm": 0.1181640625, + "learning_rate": 0.001982459110035556, + "loss": 0.1406, + "step": 6981 + }, + { + "epoch": 0.06060711278547929, + "grad_norm": 0.98828125, + "learning_rate": 0.0019824532621656376, + "loss": 0.1455, + "step": 6982 + }, + { + "epoch": 0.060615793265683456, + "grad_norm": 0.115234375, + "learning_rate": 0.0019824474133306813, + "loss": 0.1377, + "step": 6983 + }, + { + "epoch": 0.06062447374588762, + "grad_norm": 0.482421875, + "learning_rate": 0.001982441563530693, + "loss": 0.1514, + "step": 6984 + }, + { + "epoch": 0.060633154226091786, + "grad_norm": 0.1572265625, + "learning_rate": 0.001982435712765681, + "loss": 0.1147, + "step": 6985 + }, + { + "epoch": 0.06064183470629595, + "grad_norm": 0.37109375, + "learning_rate": 0.00198242986103565, + "loss": 0.1562, + "step": 6986 + }, + { + "epoch": 0.060650515186500116, + "grad_norm": 0.302734375, + "learning_rate": 0.0019824240083406073, + "loss": 0.1699, + "step": 6987 + }, + { + "epoch": 0.06065919566670428, + "grad_norm": 0.146484375, + "learning_rate": 0.0019824181546805584, + "loss": 0.123, + "step": 6988 + }, + { + "epoch": 0.060667876146908446, + "grad_norm": 0.0947265625, + "learning_rate": 0.0019824123000555106, + "loss": 0.1387, + "step": 6989 + }, + { + "epoch": 0.06067655662711261, + "grad_norm": 0.80859375, + "learning_rate": 0.00198240644446547, + "loss": 0.1553, + "step": 6990 + }, + { + "epoch": 0.060685237107316777, + "grad_norm": 0.74609375, + "learning_rate": 0.001982400587910443, + "loss": 0.1191, + "step": 6991 + }, + { + "epoch": 0.06069391758752094, + "grad_norm": 0.578125, + "learning_rate": 0.001982394730390436, + "loss": 0.1445, + "step": 6992 + }, + { + "epoch": 0.06070259806772511, + "grad_norm": 0.50390625, + "learning_rate": 0.0019823888719054552, + "loss": 0.1387, + "step": 6993 + }, + { + "epoch": 0.06071127854792927, + "grad_norm": 0.408203125, + "learning_rate": 0.0019823830124555074, + "loss": 0.1357, + "step": 6994 + }, + { + "epoch": 0.06071995902813344, + "grad_norm": 0.248046875, + "learning_rate": 0.0019823771520405984, + "loss": 0.1289, + "step": 6995 + }, + { + "epoch": 0.0607286395083376, + "grad_norm": 0.50390625, + "learning_rate": 0.0019823712906607356, + "loss": 0.1582, + "step": 6996 + }, + { + "epoch": 0.06073731998854177, + "grad_norm": 0.09375, + "learning_rate": 0.001982365428315924, + "loss": 0.1128, + "step": 6997 + }, + { + "epoch": 0.06074600046874593, + "grad_norm": 0.0703125, + "learning_rate": 0.001982359565006172, + "loss": 0.1445, + "step": 6998 + }, + { + "epoch": 0.0607546809489501, + "grad_norm": 0.1484375, + "learning_rate": 0.001982353700731484, + "loss": 0.1377, + "step": 6999 + }, + { + "epoch": 
0.06076336142915426, + "grad_norm": 0.1328125, + "learning_rate": 0.0019823478354918673, + "loss": 0.1504, + "step": 7000 + }, + { + "epoch": 0.06077204190935843, + "grad_norm": 0.10595703125, + "learning_rate": 0.0019823419692873282, + "loss": 0.1611, + "step": 7001 + }, + { + "epoch": 0.06078072238956259, + "grad_norm": 0.5859375, + "learning_rate": 0.0019823361021178735, + "loss": 0.1426, + "step": 7002 + }, + { + "epoch": 0.06078940286976676, + "grad_norm": 0.09423828125, + "learning_rate": 0.0019823302339835092, + "loss": 0.1084, + "step": 7003 + }, + { + "epoch": 0.06079808334997092, + "grad_norm": 0.75, + "learning_rate": 0.001982324364884242, + "loss": 0.1338, + "step": 7004 + }, + { + "epoch": 0.06080676383017509, + "grad_norm": 0.1357421875, + "learning_rate": 0.001982318494820078, + "loss": 0.1221, + "step": 7005 + }, + { + "epoch": 0.06081544431037925, + "grad_norm": 0.09130859375, + "learning_rate": 0.0019823126237910236, + "loss": 0.166, + "step": 7006 + }, + { + "epoch": 0.06082412479058342, + "grad_norm": 0.083984375, + "learning_rate": 0.001982306751797085, + "loss": 0.1836, + "step": 7007 + }, + { + "epoch": 0.06083280527078758, + "grad_norm": 0.341796875, + "learning_rate": 0.0019823008788382697, + "loss": 0.127, + "step": 7008 + }, + { + "epoch": 0.06084148575099174, + "grad_norm": 0.232421875, + "learning_rate": 0.0019822950049145833, + "loss": 0.1504, + "step": 7009 + }, + { + "epoch": 0.06085016623119591, + "grad_norm": 0.2275390625, + "learning_rate": 0.001982289130026032, + "loss": 0.1221, + "step": 7010 + }, + { + "epoch": 0.06085884671140007, + "grad_norm": 1.4921875, + "learning_rate": 0.0019822832541726227, + "loss": 0.168, + "step": 7011 + }, + { + "epoch": 0.06086752719160424, + "grad_norm": 0.1279296875, + "learning_rate": 0.0019822773773543615, + "loss": 0.1104, + "step": 7012 + }, + { + "epoch": 0.0608762076718084, + "grad_norm": 0.82421875, + "learning_rate": 0.0019822714995712553, + "loss": 0.127, + "step": 7013 + }, + { + "epoch": 0.06088488815201257, + "grad_norm": 0.48046875, + "learning_rate": 0.0019822656208233103, + "loss": 0.1514, + "step": 7014 + }, + { + "epoch": 0.06089356863221673, + "grad_norm": 0.1826171875, + "learning_rate": 0.001982259741110533, + "loss": 0.1387, + "step": 7015 + }, + { + "epoch": 0.0609022491124209, + "grad_norm": 0.291015625, + "learning_rate": 0.001982253860432929, + "loss": 0.1689, + "step": 7016 + }, + { + "epoch": 0.06091092959262506, + "grad_norm": 0.177734375, + "learning_rate": 0.001982247978790506, + "loss": 0.1738, + "step": 7017 + }, + { + "epoch": 0.06091961007282923, + "grad_norm": 0.232421875, + "learning_rate": 0.0019822420961832696, + "loss": 0.1689, + "step": 7018 + }, + { + "epoch": 0.06092829055303339, + "grad_norm": 0.11572265625, + "learning_rate": 0.001982236212611227, + "loss": 0.1523, + "step": 7019 + }, + { + "epoch": 0.06093697103323756, + "grad_norm": 0.119140625, + "learning_rate": 0.001982230328074384, + "loss": 0.1836, + "step": 7020 + }, + { + "epoch": 0.06094565151344172, + "grad_norm": 0.3828125, + "learning_rate": 0.0019822244425727466, + "loss": 0.1416, + "step": 7021 + }, + { + "epoch": 0.06095433199364589, + "grad_norm": 0.1611328125, + "learning_rate": 0.001982218556106322, + "loss": 0.1553, + "step": 7022 + }, + { + "epoch": 0.06096301247385005, + "grad_norm": 0.345703125, + "learning_rate": 0.001982212668675117, + "loss": 0.1377, + "step": 7023 + }, + { + "epoch": 0.06097169295405422, + "grad_norm": 0.484375, + "learning_rate": 0.001982206780279137, + "loss": 0.1455, + "step": 7024 
+ }, + { + "epoch": 0.060980373434258384, + "grad_norm": 0.64453125, + "learning_rate": 0.001982200890918389, + "loss": 0.1348, + "step": 7025 + }, + { + "epoch": 0.06098905391446255, + "grad_norm": 0.11083984375, + "learning_rate": 0.0019821950005928794, + "loss": 0.166, + "step": 7026 + }, + { + "epoch": 0.060997734394666714, + "grad_norm": 0.18359375, + "learning_rate": 0.0019821891093026144, + "loss": 0.0874, + "step": 7027 + }, + { + "epoch": 0.06100641487487088, + "grad_norm": 0.255859375, + "learning_rate": 0.001982183217047601, + "loss": 0.1426, + "step": 7028 + }, + { + "epoch": 0.061015095355075044, + "grad_norm": 0.6484375, + "learning_rate": 0.001982177323827845, + "loss": 0.1133, + "step": 7029 + }, + { + "epoch": 0.06102377583527921, + "grad_norm": 0.154296875, + "learning_rate": 0.0019821714296433535, + "loss": 0.1699, + "step": 7030 + }, + { + "epoch": 0.061032456315483374, + "grad_norm": 0.31640625, + "learning_rate": 0.0019821655344941323, + "loss": 0.1523, + "step": 7031 + }, + { + "epoch": 0.06104113679568754, + "grad_norm": 0.1640625, + "learning_rate": 0.001982159638380188, + "loss": 0.1367, + "step": 7032 + }, + { + "epoch": 0.061049817275891705, + "grad_norm": 0.197265625, + "learning_rate": 0.001982153741301527, + "loss": 0.1582, + "step": 7033 + }, + { + "epoch": 0.06105849775609587, + "grad_norm": 0.091796875, + "learning_rate": 0.0019821478432581565, + "loss": 0.1777, + "step": 7034 + }, + { + "epoch": 0.061067178236300035, + "grad_norm": 0.57421875, + "learning_rate": 0.001982141944250082, + "loss": 0.1465, + "step": 7035 + }, + { + "epoch": 0.0610758587165042, + "grad_norm": 0.0869140625, + "learning_rate": 0.0019821360442773103, + "loss": 0.1318, + "step": 7036 + }, + { + "epoch": 0.061084539196708365, + "grad_norm": 0.09765625, + "learning_rate": 0.001982130143339848, + "loss": 0.1182, + "step": 7037 + }, + { + "epoch": 0.06109321967691253, + "grad_norm": 0.255859375, + "learning_rate": 0.0019821242414377015, + "loss": 0.1191, + "step": 7038 + }, + { + "epoch": 0.06110190015711669, + "grad_norm": 0.392578125, + "learning_rate": 0.0019821183385708772, + "loss": 0.1592, + "step": 7039 + }, + { + "epoch": 0.061110580637320853, + "grad_norm": 0.103515625, + "learning_rate": 0.001982112434739382, + "loss": 0.1318, + "step": 7040 + }, + { + "epoch": 0.06111926111752502, + "grad_norm": 0.439453125, + "learning_rate": 0.001982106529943221, + "loss": 0.208, + "step": 7041 + }, + { + "epoch": 0.061127941597729184, + "grad_norm": 0.07763671875, + "learning_rate": 0.0019821006241824016, + "loss": 0.1011, + "step": 7042 + }, + { + "epoch": 0.06113662207793335, + "grad_norm": 0.1943359375, + "learning_rate": 0.001982094717456931, + "loss": 0.1348, + "step": 7043 + }, + { + "epoch": 0.061145302558137514, + "grad_norm": 0.302734375, + "learning_rate": 0.0019820888097668143, + "loss": 0.168, + "step": 7044 + }, + { + "epoch": 0.06115398303834168, + "grad_norm": 0.1201171875, + "learning_rate": 0.0019820829011120583, + "loss": 0.1108, + "step": 7045 + }, + { + "epoch": 0.061162663518545844, + "grad_norm": 0.43359375, + "learning_rate": 0.0019820769914926702, + "loss": 0.126, + "step": 7046 + }, + { + "epoch": 0.06117134399875001, + "grad_norm": 0.28515625, + "learning_rate": 0.001982071080908656, + "loss": 0.1299, + "step": 7047 + }, + { + "epoch": 0.061180024478954174, + "grad_norm": 0.27734375, + "learning_rate": 0.0019820651693600215, + "loss": 0.1523, + "step": 7048 + }, + { + "epoch": 0.06118870495915834, + "grad_norm": 0.275390625, + "learning_rate": 
0.0019820592568467743, + "loss": 0.1719, + "step": 7049 + }, + { + "epoch": 0.061197385439362505, + "grad_norm": 0.171875, + "learning_rate": 0.00198205334336892, + "loss": 0.1777, + "step": 7050 + }, + { + "epoch": 0.06120606591956667, + "grad_norm": 0.671875, + "learning_rate": 0.0019820474289264658, + "loss": 0.2012, + "step": 7051 + }, + { + "epoch": 0.061214746399770835, + "grad_norm": 0.2392578125, + "learning_rate": 0.0019820415135194175, + "loss": 0.1279, + "step": 7052 + }, + { + "epoch": 0.061223426879975, + "grad_norm": 0.318359375, + "learning_rate": 0.0019820355971477822, + "loss": 0.1191, + "step": 7053 + }, + { + "epoch": 0.061232107360179165, + "grad_norm": 0.3984375, + "learning_rate": 0.0019820296798115657, + "loss": 0.1074, + "step": 7054 + }, + { + "epoch": 0.06124078784038333, + "grad_norm": 0.5625, + "learning_rate": 0.001982023761510775, + "loss": 0.2227, + "step": 7055 + }, + { + "epoch": 0.061249468320587495, + "grad_norm": 0.0859375, + "learning_rate": 0.001982017842245416, + "loss": 0.126, + "step": 7056 + }, + { + "epoch": 0.06125814880079166, + "grad_norm": 0.1337890625, + "learning_rate": 0.001982011922015496, + "loss": 0.1074, + "step": 7057 + }, + { + "epoch": 0.061266829280995826, + "grad_norm": 0.26953125, + "learning_rate": 0.001982006000821021, + "loss": 0.1279, + "step": 7058 + }, + { + "epoch": 0.06127550976119999, + "grad_norm": 0.328125, + "learning_rate": 0.0019820000786619973, + "loss": 0.1494, + "step": 7059 + }, + { + "epoch": 0.061284190241404156, + "grad_norm": 0.11083984375, + "learning_rate": 0.0019819941555384315, + "loss": 0.1602, + "step": 7060 + }, + { + "epoch": 0.06129287072160832, + "grad_norm": 0.2470703125, + "learning_rate": 0.0019819882314503303, + "loss": 0.1211, + "step": 7061 + }, + { + "epoch": 0.061301551201812486, + "grad_norm": 0.2734375, + "learning_rate": 0.0019819823063977, + "loss": 0.1641, + "step": 7062 + }, + { + "epoch": 0.06131023168201665, + "grad_norm": 0.1630859375, + "learning_rate": 0.001981976380380547, + "loss": 0.1045, + "step": 7063 + }, + { + "epoch": 0.061318912162220816, + "grad_norm": 0.30859375, + "learning_rate": 0.001981970453398878, + "loss": 0.1494, + "step": 7064 + }, + { + "epoch": 0.06132759264242498, + "grad_norm": 0.5390625, + "learning_rate": 0.0019819645254526993, + "loss": 0.1484, + "step": 7065 + }, + { + "epoch": 0.06133627312262915, + "grad_norm": 0.5, + "learning_rate": 0.001981958596542017, + "loss": 0.1445, + "step": 7066 + }, + { + "epoch": 0.06134495360283331, + "grad_norm": 0.1005859375, + "learning_rate": 0.0019819526666668384, + "loss": 0.127, + "step": 7067 + }, + { + "epoch": 0.06135363408303748, + "grad_norm": 0.23828125, + "learning_rate": 0.00198194673582717, + "loss": 0.1553, + "step": 7068 + }, + { + "epoch": 0.06136231456324164, + "grad_norm": 0.171875, + "learning_rate": 0.001981940804023017, + "loss": 0.1387, + "step": 7069 + }, + { + "epoch": 0.0613709950434458, + "grad_norm": 0.396484375, + "learning_rate": 0.001981934871254388, + "loss": 0.125, + "step": 7070 + }, + { + "epoch": 0.061379675523649965, + "grad_norm": 0.1875, + "learning_rate": 0.0019819289375212876, + "loss": 0.0996, + "step": 7071 + }, + { + "epoch": 0.06138835600385413, + "grad_norm": 0.12255859375, + "learning_rate": 0.001981923002823723, + "loss": 0.1445, + "step": 7072 + }, + { + "epoch": 0.061397036484058296, + "grad_norm": 1.03125, + "learning_rate": 0.0019819170671617004, + "loss": 0.1484, + "step": 7073 + }, + { + "epoch": 0.06140571696426246, + "grad_norm": 0.27734375, + "learning_rate": 
0.001981911130535227, + "loss": 0.1689, + "step": 7074 + }, + { + "epoch": 0.061414397444466626, + "grad_norm": 0.146484375, + "learning_rate": 0.001981905192944309, + "loss": 0.1289, + "step": 7075 + }, + { + "epoch": 0.06142307792467079, + "grad_norm": 0.30078125, + "learning_rate": 0.001981899254388952, + "loss": 0.1396, + "step": 7076 + }, + { + "epoch": 0.061431758404874956, + "grad_norm": 0.0712890625, + "learning_rate": 0.0019818933148691635, + "loss": 0.1289, + "step": 7077 + }, + { + "epoch": 0.06144043888507912, + "grad_norm": 0.1513671875, + "learning_rate": 0.00198188737438495, + "loss": 0.1348, + "step": 7078 + }, + { + "epoch": 0.061449119365283286, + "grad_norm": 0.2421875, + "learning_rate": 0.001981881432936318, + "loss": 0.1602, + "step": 7079 + }, + { + "epoch": 0.06145779984548745, + "grad_norm": 0.341796875, + "learning_rate": 0.0019818754905232727, + "loss": 0.1084, + "step": 7080 + }, + { + "epoch": 0.061466480325691616, + "grad_norm": 0.2275390625, + "learning_rate": 0.001981869547145822, + "loss": 0.1133, + "step": 7081 + }, + { + "epoch": 0.06147516080589578, + "grad_norm": 0.1123046875, + "learning_rate": 0.0019818636028039725, + "loss": 0.1582, + "step": 7082 + }, + { + "epoch": 0.06148384128609995, + "grad_norm": 0.244140625, + "learning_rate": 0.00198185765749773, + "loss": 0.1191, + "step": 7083 + }, + { + "epoch": 0.06149252176630411, + "grad_norm": 0.73046875, + "learning_rate": 0.0019818517112271013, + "loss": 0.1206, + "step": 7084 + }, + { + "epoch": 0.06150120224650828, + "grad_norm": 0.1884765625, + "learning_rate": 0.001981845763992092, + "loss": 0.1309, + "step": 7085 + }, + { + "epoch": 0.06150988272671244, + "grad_norm": 0.412109375, + "learning_rate": 0.0019818398157927103, + "loss": 0.1514, + "step": 7086 + }, + { + "epoch": 0.06151856320691661, + "grad_norm": 0.287109375, + "learning_rate": 0.0019818338666289614, + "loss": 0.1211, + "step": 7087 + }, + { + "epoch": 0.06152724368712077, + "grad_norm": 0.6796875, + "learning_rate": 0.0019818279165008524, + "loss": 0.1514, + "step": 7088 + }, + { + "epoch": 0.06153592416732494, + "grad_norm": 0.33984375, + "learning_rate": 0.00198182196540839, + "loss": 0.125, + "step": 7089 + }, + { + "epoch": 0.0615446046475291, + "grad_norm": 0.2734375, + "learning_rate": 0.00198181601335158, + "loss": 0.0913, + "step": 7090 + }, + { + "epoch": 0.06155328512773327, + "grad_norm": 0.1201171875, + "learning_rate": 0.001981810060330429, + "loss": 0.1318, + "step": 7091 + }, + { + "epoch": 0.06156196560793743, + "grad_norm": 0.09130859375, + "learning_rate": 0.001981804106344944, + "loss": 0.127, + "step": 7092 + }, + { + "epoch": 0.0615706460881416, + "grad_norm": 0.375, + "learning_rate": 0.0019817981513951316, + "loss": 0.1172, + "step": 7093 + }, + { + "epoch": 0.06157932656834576, + "grad_norm": 0.13671875, + "learning_rate": 0.0019817921954809977, + "loss": 0.1416, + "step": 7094 + }, + { + "epoch": 0.06158800704854993, + "grad_norm": 0.60546875, + "learning_rate": 0.001981786238602549, + "loss": 0.1562, + "step": 7095 + }, + { + "epoch": 0.06159668752875409, + "grad_norm": 0.50390625, + "learning_rate": 0.001981780280759792, + "loss": 0.1147, + "step": 7096 + }, + { + "epoch": 0.06160536800895826, + "grad_norm": 0.126953125, + "learning_rate": 0.001981774321952734, + "loss": 0.1406, + "step": 7097 + }, + { + "epoch": 0.061614048489162423, + "grad_norm": 0.76953125, + "learning_rate": 0.0019817683621813804, + "loss": 0.1543, + "step": 7098 + }, + { + "epoch": 0.06162272896936659, + "grad_norm": 0.2041015625, + 
"learning_rate": 0.001981762401445738, + "loss": 0.1504, + "step": 7099 + }, + { + "epoch": 0.06163140944957075, + "grad_norm": 0.6328125, + "learning_rate": 0.0019817564397458134, + "loss": 0.1289, + "step": 7100 + }, + { + "epoch": 0.06164008992977491, + "grad_norm": 0.357421875, + "learning_rate": 0.0019817504770816134, + "loss": 0.1162, + "step": 7101 + }, + { + "epoch": 0.06164877040997908, + "grad_norm": 0.294921875, + "learning_rate": 0.0019817445134531443, + "loss": 0.123, + "step": 7102 + }, + { + "epoch": 0.06165745089018324, + "grad_norm": 0.67578125, + "learning_rate": 0.001981738548860413, + "loss": 0.1377, + "step": 7103 + }, + { + "epoch": 0.06166613137038741, + "grad_norm": 0.421875, + "learning_rate": 0.0019817325833034255, + "loss": 0.1367, + "step": 7104 + }, + { + "epoch": 0.06167481185059157, + "grad_norm": 0.1748046875, + "learning_rate": 0.0019817266167821883, + "loss": 0.1426, + "step": 7105 + }, + { + "epoch": 0.06168349233079574, + "grad_norm": 0.259765625, + "learning_rate": 0.001981720649296708, + "loss": 0.1582, + "step": 7106 + }, + { + "epoch": 0.0616921728109999, + "grad_norm": 0.474609375, + "learning_rate": 0.0019817146808469915, + "loss": 0.1279, + "step": 7107 + }, + { + "epoch": 0.06170085329120407, + "grad_norm": 0.181640625, + "learning_rate": 0.0019817087114330453, + "loss": 0.1797, + "step": 7108 + }, + { + "epoch": 0.06170953377140823, + "grad_norm": 0.8125, + "learning_rate": 0.0019817027410548756, + "loss": 0.2168, + "step": 7109 + }, + { + "epoch": 0.0617182142516124, + "grad_norm": 0.119140625, + "learning_rate": 0.0019816967697124884, + "loss": 0.0991, + "step": 7110 + }, + { + "epoch": 0.06172689473181656, + "grad_norm": 0.07275390625, + "learning_rate": 0.0019816907974058917, + "loss": 0.1094, + "step": 7111 + }, + { + "epoch": 0.06173557521202073, + "grad_norm": 1.0, + "learning_rate": 0.0019816848241350905, + "loss": 0.1758, + "step": 7112 + }, + { + "epoch": 0.06174425569222489, + "grad_norm": 1.4921875, + "learning_rate": 0.0019816788499000927, + "loss": 0.1621, + "step": 7113 + }, + { + "epoch": 0.06175293617242906, + "grad_norm": 0.298828125, + "learning_rate": 0.0019816728747009036, + "loss": 0.1187, + "step": 7114 + }, + { + "epoch": 0.061761616652633224, + "grad_norm": 0.33203125, + "learning_rate": 0.0019816668985375304, + "loss": 0.1514, + "step": 7115 + }, + { + "epoch": 0.06177029713283739, + "grad_norm": 0.279296875, + "learning_rate": 0.0019816609214099797, + "loss": 0.1089, + "step": 7116 + }, + { + "epoch": 0.061778977613041554, + "grad_norm": 0.64453125, + "learning_rate": 0.001981654943318258, + "loss": 0.1426, + "step": 7117 + }, + { + "epoch": 0.06178765809324572, + "grad_norm": 0.421875, + "learning_rate": 0.0019816489642623713, + "loss": 0.1348, + "step": 7118 + }, + { + "epoch": 0.061796338573449884, + "grad_norm": 0.1416015625, + "learning_rate": 0.0019816429842423266, + "loss": 0.167, + "step": 7119 + }, + { + "epoch": 0.06180501905365405, + "grad_norm": 0.1357421875, + "learning_rate": 0.0019816370032581304, + "loss": 0.1084, + "step": 7120 + }, + { + "epoch": 0.061813699533858214, + "grad_norm": 0.12158203125, + "learning_rate": 0.0019816310213097896, + "loss": 0.1064, + "step": 7121 + }, + { + "epoch": 0.06182238001406238, + "grad_norm": 0.087890625, + "learning_rate": 0.00198162503839731, + "loss": 0.1426, + "step": 7122 + }, + { + "epoch": 0.061831060494266545, + "grad_norm": 0.12158203125, + "learning_rate": 0.0019816190545206987, + "loss": 0.1299, + "step": 7123 + }, + { + "epoch": 0.06183974097447071, + 
"grad_norm": 0.12353515625, + "learning_rate": 0.001981613069679962, + "loss": 0.1206, + "step": 7124 + }, + { + "epoch": 0.061848421454674875, + "grad_norm": 0.1357421875, + "learning_rate": 0.0019816070838751063, + "loss": 0.1777, + "step": 7125 + }, + { + "epoch": 0.06185710193487904, + "grad_norm": 0.421875, + "learning_rate": 0.0019816010971061386, + "loss": 0.1089, + "step": 7126 + }, + { + "epoch": 0.061865782415083205, + "grad_norm": 0.74609375, + "learning_rate": 0.001981595109373065, + "loss": 0.1348, + "step": 7127 + }, + { + "epoch": 0.06187446289528737, + "grad_norm": 0.67578125, + "learning_rate": 0.001981589120675892, + "loss": 0.1011, + "step": 7128 + }, + { + "epoch": 0.061883143375491535, + "grad_norm": 0.224609375, + "learning_rate": 0.001981583131014627, + "loss": 0.1309, + "step": 7129 + }, + { + "epoch": 0.0618918238556957, + "grad_norm": 0.515625, + "learning_rate": 0.0019815771403892757, + "loss": 0.1445, + "step": 7130 + }, + { + "epoch": 0.06190050433589986, + "grad_norm": 0.0927734375, + "learning_rate": 0.0019815711487998445, + "loss": 0.1357, + "step": 7131 + }, + { + "epoch": 0.061909184816104024, + "grad_norm": 0.1064453125, + "learning_rate": 0.0019815651562463408, + "loss": 0.168, + "step": 7132 + }, + { + "epoch": 0.06191786529630819, + "grad_norm": 0.310546875, + "learning_rate": 0.0019815591627287705, + "loss": 0.1572, + "step": 7133 + }, + { + "epoch": 0.061926545776512354, + "grad_norm": 0.63671875, + "learning_rate": 0.0019815531682471402, + "loss": 0.1758, + "step": 7134 + }, + { + "epoch": 0.06193522625671652, + "grad_norm": 0.2177734375, + "learning_rate": 0.0019815471728014565, + "loss": 0.1221, + "step": 7135 + }, + { + "epoch": 0.061943906736920684, + "grad_norm": 0.248046875, + "learning_rate": 0.0019815411763917266, + "loss": 0.1787, + "step": 7136 + }, + { + "epoch": 0.06195258721712485, + "grad_norm": 0.2373046875, + "learning_rate": 0.0019815351790179563, + "loss": 0.125, + "step": 7137 + }, + { + "epoch": 0.061961267697329014, + "grad_norm": 0.416015625, + "learning_rate": 0.001981529180680152, + "loss": 0.1475, + "step": 7138 + }, + { + "epoch": 0.06196994817753318, + "grad_norm": 0.08544921875, + "learning_rate": 0.001981523181378321, + "loss": 0.1338, + "step": 7139 + }, + { + "epoch": 0.061978628657737345, + "grad_norm": 0.62890625, + "learning_rate": 0.0019815171811124696, + "loss": 0.1553, + "step": 7140 + }, + { + "epoch": 0.06198730913794151, + "grad_norm": 0.1494140625, + "learning_rate": 0.001981511179882604, + "loss": 0.1465, + "step": 7141 + }, + { + "epoch": 0.061995989618145675, + "grad_norm": 0.453125, + "learning_rate": 0.0019815051776887315, + "loss": 0.1396, + "step": 7142 + }, + { + "epoch": 0.06200467009834984, + "grad_norm": 0.0859375, + "learning_rate": 0.0019814991745308575, + "loss": 0.1045, + "step": 7143 + }, + { + "epoch": 0.062013350578554005, + "grad_norm": 0.3203125, + "learning_rate": 0.0019814931704089894, + "loss": 0.1689, + "step": 7144 + }, + { + "epoch": 0.06202203105875817, + "grad_norm": 0.287109375, + "learning_rate": 0.0019814871653231337, + "loss": 0.1226, + "step": 7145 + }, + { + "epoch": 0.062030711538962335, + "grad_norm": 0.90234375, + "learning_rate": 0.001981481159273297, + "loss": 0.1484, + "step": 7146 + }, + { + "epoch": 0.0620393920191665, + "grad_norm": 0.341796875, + "learning_rate": 0.001981475152259486, + "loss": 0.1016, + "step": 7147 + }, + { + "epoch": 0.062048072499370666, + "grad_norm": 0.82421875, + "learning_rate": 0.0019814691442817064, + "loss": 0.1484, + "step": 7148 + }, + 
{ + "epoch": 0.06205675297957483, + "grad_norm": 0.255859375, + "learning_rate": 0.0019814631353399655, + "loss": 0.1562, + "step": 7149 + }, + { + "epoch": 0.062065433459778996, + "grad_norm": 0.09228515625, + "learning_rate": 0.00198145712543427, + "loss": 0.1357, + "step": 7150 + }, + { + "epoch": 0.06207411393998316, + "grad_norm": 0.26171875, + "learning_rate": 0.0019814511145646264, + "loss": 0.1514, + "step": 7151 + }, + { + "epoch": 0.062082794420187326, + "grad_norm": 0.1318359375, + "learning_rate": 0.0019814451027310408, + "loss": 0.1992, + "step": 7152 + }, + { + "epoch": 0.06209147490039149, + "grad_norm": 0.1533203125, + "learning_rate": 0.00198143908993352, + "loss": 0.1309, + "step": 7153 + }, + { + "epoch": 0.062100155380595656, + "grad_norm": 0.10693359375, + "learning_rate": 0.001981433076172071, + "loss": 0.1123, + "step": 7154 + }, + { + "epoch": 0.06210883586079982, + "grad_norm": 0.1455078125, + "learning_rate": 0.0019814270614467, + "loss": 0.1855, + "step": 7155 + }, + { + "epoch": 0.06211751634100399, + "grad_norm": 0.7734375, + "learning_rate": 0.001981421045757413, + "loss": 0.1367, + "step": 7156 + }, + { + "epoch": 0.06212619682120815, + "grad_norm": 0.341796875, + "learning_rate": 0.001981415029104218, + "loss": 0.1406, + "step": 7157 + }, + { + "epoch": 0.06213487730141232, + "grad_norm": 0.1826171875, + "learning_rate": 0.0019814090114871207, + "loss": 0.1396, + "step": 7158 + }, + { + "epoch": 0.06214355778161648, + "grad_norm": 0.1806640625, + "learning_rate": 0.0019814029929061273, + "loss": 0.125, + "step": 7159 + }, + { + "epoch": 0.06215223826182065, + "grad_norm": 0.3203125, + "learning_rate": 0.0019813969733612448, + "loss": 0.1484, + "step": 7160 + }, + { + "epoch": 0.06216091874202481, + "grad_norm": 0.890625, + "learning_rate": 0.00198139095285248, + "loss": 0.1426, + "step": 7161 + }, + { + "epoch": 0.06216959922222897, + "grad_norm": 0.10498046875, + "learning_rate": 0.0019813849313798392, + "loss": 0.1582, + "step": 7162 + }, + { + "epoch": 0.062178279702433135, + "grad_norm": 0.361328125, + "learning_rate": 0.0019813789089433292, + "loss": 0.1719, + "step": 7163 + }, + { + "epoch": 0.0621869601826373, + "grad_norm": 0.236328125, + "learning_rate": 0.0019813728855429566, + "loss": 0.1309, + "step": 7164 + }, + { + "epoch": 0.062195640662841466, + "grad_norm": 0.341796875, + "learning_rate": 0.0019813668611787278, + "loss": 0.1226, + "step": 7165 + }, + { + "epoch": 0.06220432114304563, + "grad_norm": 0.095703125, + "learning_rate": 0.0019813608358506493, + "loss": 0.1348, + "step": 7166 + }, + { + "epoch": 0.062213001623249796, + "grad_norm": 0.078125, + "learning_rate": 0.001981354809558728, + "loss": 0.1553, + "step": 7167 + }, + { + "epoch": 0.06222168210345396, + "grad_norm": 0.25390625, + "learning_rate": 0.00198134878230297, + "loss": 0.1621, + "step": 7168 + }, + { + "epoch": 0.062230362583658126, + "grad_norm": 0.26953125, + "learning_rate": 0.001981342754083382, + "loss": 0.1055, + "step": 7169 + }, + { + "epoch": 0.06223904306386229, + "grad_norm": 0.189453125, + "learning_rate": 0.0019813367248999714, + "loss": 0.1875, + "step": 7170 + }, + { + "epoch": 0.062247723544066456, + "grad_norm": 0.125, + "learning_rate": 0.001981330694752744, + "loss": 0.1445, + "step": 7171 + }, + { + "epoch": 0.06225640402427062, + "grad_norm": 0.068359375, + "learning_rate": 0.0019813246636417067, + "loss": 0.127, + "step": 7172 + }, + { + "epoch": 0.06226508450447479, + "grad_norm": 0.458984375, + "learning_rate": 0.0019813186315668662, + "loss": 
0.1484, + "step": 7173 + }, + { + "epoch": 0.06227376498467895, + "grad_norm": 0.173828125, + "learning_rate": 0.0019813125985282285, + "loss": 0.1514, + "step": 7174 + }, + { + "epoch": 0.06228244546488312, + "grad_norm": 0.2119140625, + "learning_rate": 0.0019813065645258006, + "loss": 0.1279, + "step": 7175 + }, + { + "epoch": 0.06229112594508728, + "grad_norm": 0.11474609375, + "learning_rate": 0.001981300529559589, + "loss": 0.1196, + "step": 7176 + }, + { + "epoch": 0.06229980642529145, + "grad_norm": 0.197265625, + "learning_rate": 0.0019812944936296005, + "loss": 0.1494, + "step": 7177 + }, + { + "epoch": 0.06230848690549561, + "grad_norm": 0.265625, + "learning_rate": 0.0019812884567358417, + "loss": 0.1475, + "step": 7178 + }, + { + "epoch": 0.06231716738569978, + "grad_norm": 0.44921875, + "learning_rate": 0.001981282418878319, + "loss": 0.1055, + "step": 7179 + }, + { + "epoch": 0.06232584786590394, + "grad_norm": 1.0546875, + "learning_rate": 0.0019812763800570394, + "loss": 0.1094, + "step": 7180 + }, + { + "epoch": 0.06233452834610811, + "grad_norm": 0.2734375, + "learning_rate": 0.0019812703402720087, + "loss": 0.1143, + "step": 7181 + }, + { + "epoch": 0.06234320882631227, + "grad_norm": 0.361328125, + "learning_rate": 0.0019812642995232343, + "loss": 0.1182, + "step": 7182 + }, + { + "epoch": 0.06235188930651644, + "grad_norm": 0.09423828125, + "learning_rate": 0.0019812582578107225, + "loss": 0.166, + "step": 7183 + }, + { + "epoch": 0.0623605697867206, + "grad_norm": 0.34375, + "learning_rate": 0.00198125221513448, + "loss": 0.1211, + "step": 7184 + }, + { + "epoch": 0.06236925026692477, + "grad_norm": 0.2265625, + "learning_rate": 0.001981246171494513, + "loss": 0.1191, + "step": 7185 + }, + { + "epoch": 0.06237793074712893, + "grad_norm": 0.140625, + "learning_rate": 0.001981240126890828, + "loss": 0.1445, + "step": 7186 + }, + { + "epoch": 0.0623866112273331, + "grad_norm": 0.84375, + "learning_rate": 0.001981234081323433, + "loss": 0.1484, + "step": 7187 + }, + { + "epoch": 0.06239529170753726, + "grad_norm": 0.4375, + "learning_rate": 0.0019812280347923333, + "loss": 0.1113, + "step": 7188 + }, + { + "epoch": 0.06240397218774143, + "grad_norm": 0.25390625, + "learning_rate": 0.001981221987297536, + "loss": 0.1367, + "step": 7189 + }, + { + "epoch": 0.062412652667945594, + "grad_norm": 0.283203125, + "learning_rate": 0.001981215938839047, + "loss": 0.127, + "step": 7190 + }, + { + "epoch": 0.06242133314814976, + "grad_norm": 0.20703125, + "learning_rate": 0.0019812098894168738, + "loss": 0.1523, + "step": 7191 + }, + { + "epoch": 0.062430013628353924, + "grad_norm": 0.2265625, + "learning_rate": 0.0019812038390310226, + "loss": 0.1562, + "step": 7192 + }, + { + "epoch": 0.06243869410855808, + "grad_norm": 0.82421875, + "learning_rate": 0.0019811977876815004, + "loss": 0.1758, + "step": 7193 + }, + { + "epoch": 0.06244737458876225, + "grad_norm": 0.30078125, + "learning_rate": 0.0019811917353683133, + "loss": 0.1641, + "step": 7194 + }, + { + "epoch": 0.06245605506896641, + "grad_norm": 0.5, + "learning_rate": 0.0019811856820914683, + "loss": 0.1777, + "step": 7195 + }, + { + "epoch": 0.06246473554917058, + "grad_norm": 0.55078125, + "learning_rate": 0.0019811796278509717, + "loss": 0.1299, + "step": 7196 + }, + { + "epoch": 0.06247341602937474, + "grad_norm": 0.25390625, + "learning_rate": 0.0019811735726468307, + "loss": 0.1289, + "step": 7197 + }, + { + "epoch": 0.06248209650957891, + "grad_norm": 0.484375, + "learning_rate": 0.001981167516479051, + "loss": 
0.1475, + "step": 7198 + }, + { + "epoch": 0.06249077698978307, + "grad_norm": 0.462890625, + "learning_rate": 0.00198116145934764, + "loss": 0.1348, + "step": 7199 + }, + { + "epoch": 0.06249945746998724, + "grad_norm": 0.80078125, + "learning_rate": 0.0019811554012526046, + "loss": 0.1113, + "step": 7200 + }, + { + "epoch": 0.0625081379501914, + "grad_norm": 0.46875, + "learning_rate": 0.00198114934219395, + "loss": 0.1465, + "step": 7201 + }, + { + "epoch": 0.06251681843039557, + "grad_norm": 0.234375, + "learning_rate": 0.0019811432821716843, + "loss": 0.1387, + "step": 7202 + }, + { + "epoch": 0.06252549891059973, + "grad_norm": 0.11279296875, + "learning_rate": 0.0019811372211858136, + "loss": 0.1377, + "step": 7203 + }, + { + "epoch": 0.0625341793908039, + "grad_norm": 0.1787109375, + "learning_rate": 0.0019811311592363443, + "loss": 0.1387, + "step": 7204 + }, + { + "epoch": 0.06254285987100806, + "grad_norm": 0.3828125, + "learning_rate": 0.001981125096323283, + "loss": 0.1318, + "step": 7205 + }, + { + "epoch": 0.06255154035121223, + "grad_norm": 0.30859375, + "learning_rate": 0.0019811190324466365, + "loss": 0.1157, + "step": 7206 + }, + { + "epoch": 0.0625602208314164, + "grad_norm": 0.73046875, + "learning_rate": 0.001981112967606412, + "loss": 0.1045, + "step": 7207 + }, + { + "epoch": 0.06256890131162056, + "grad_norm": 0.37890625, + "learning_rate": 0.0019811069018026146, + "loss": 0.1582, + "step": 7208 + }, + { + "epoch": 0.06257758179182472, + "grad_norm": 0.26953125, + "learning_rate": 0.0019811008350352527, + "loss": 0.1328, + "step": 7209 + }, + { + "epoch": 0.06258626227202889, + "grad_norm": 0.1806640625, + "learning_rate": 0.001981094767304332, + "loss": 0.1416, + "step": 7210 + }, + { + "epoch": 0.06259494275223305, + "grad_norm": 0.328125, + "learning_rate": 0.0019810886986098594, + "loss": 0.1289, + "step": 7211 + }, + { + "epoch": 0.06260362323243722, + "grad_norm": 0.11572265625, + "learning_rate": 0.0019810826289518415, + "loss": 0.1592, + "step": 7212 + }, + { + "epoch": 0.06261230371264138, + "grad_norm": 0.95703125, + "learning_rate": 0.0019810765583302844, + "loss": 0.1621, + "step": 7213 + }, + { + "epoch": 0.06262098419284555, + "grad_norm": 0.65625, + "learning_rate": 0.0019810704867451955, + "loss": 0.1221, + "step": 7214 + }, + { + "epoch": 0.06262966467304971, + "grad_norm": 0.5390625, + "learning_rate": 0.0019810644141965814, + "loss": 0.1465, + "step": 7215 + }, + { + "epoch": 0.06263834515325388, + "grad_norm": 0.08984375, + "learning_rate": 0.001981058340684448, + "loss": 0.1455, + "step": 7216 + }, + { + "epoch": 0.06264702563345804, + "grad_norm": 0.34375, + "learning_rate": 0.001981052266208803, + "loss": 0.1465, + "step": 7217 + }, + { + "epoch": 0.06265570611366221, + "grad_norm": 0.345703125, + "learning_rate": 0.001981046190769652, + "loss": 0.1406, + "step": 7218 + }, + { + "epoch": 0.06266438659386638, + "grad_norm": 3.609375, + "learning_rate": 0.0019810401143670023, + "loss": 0.3086, + "step": 7219 + }, + { + "epoch": 0.06267306707407054, + "grad_norm": 0.81640625, + "learning_rate": 0.0019810340370008607, + "loss": 0.1455, + "step": 7220 + }, + { + "epoch": 0.0626817475542747, + "grad_norm": 0.4375, + "learning_rate": 0.0019810279586712333, + "loss": 0.123, + "step": 7221 + }, + { + "epoch": 0.06269042803447887, + "grad_norm": 0.408203125, + "learning_rate": 0.0019810218793781267, + "loss": 0.1367, + "step": 7222 + }, + { + "epoch": 0.06269910851468304, + "grad_norm": 0.7578125, + "learning_rate": 0.001981015799121548, + "loss": 
0.1309, + "step": 7223 + }, + { + "epoch": 0.0627077889948872, + "grad_norm": 0.232421875, + "learning_rate": 0.001981009717901504, + "loss": 0.1406, + "step": 7224 + }, + { + "epoch": 0.06271646947509137, + "grad_norm": 0.28515625, + "learning_rate": 0.0019810036357180002, + "loss": 0.1514, + "step": 7225 + }, + { + "epoch": 0.06272514995529553, + "grad_norm": 0.314453125, + "learning_rate": 0.0019809975525710444, + "loss": 0.1592, + "step": 7226 + }, + { + "epoch": 0.0627338304354997, + "grad_norm": 0.15625, + "learning_rate": 0.001980991468460643, + "loss": 0.1118, + "step": 7227 + }, + { + "epoch": 0.06274251091570386, + "grad_norm": 0.3046875, + "learning_rate": 0.001980985383386803, + "loss": 0.1113, + "step": 7228 + }, + { + "epoch": 0.06275119139590803, + "grad_norm": 0.443359375, + "learning_rate": 0.00198097929734953, + "loss": 0.1387, + "step": 7229 + }, + { + "epoch": 0.06275987187611219, + "grad_norm": 0.37109375, + "learning_rate": 0.0019809732103488317, + "loss": 0.1455, + "step": 7230 + }, + { + "epoch": 0.06276855235631636, + "grad_norm": 0.5546875, + "learning_rate": 0.001980967122384714, + "loss": 0.1299, + "step": 7231 + }, + { + "epoch": 0.06277723283652052, + "grad_norm": 0.1396484375, + "learning_rate": 0.001980961033457184, + "loss": 0.0967, + "step": 7232 + }, + { + "epoch": 0.06278591331672469, + "grad_norm": 0.08447265625, + "learning_rate": 0.001980954943566248, + "loss": 0.1201, + "step": 7233 + }, + { + "epoch": 0.06279459379692885, + "grad_norm": 0.2734375, + "learning_rate": 0.0019809488527119134, + "loss": 0.1406, + "step": 7234 + }, + { + "epoch": 0.06280327427713302, + "grad_norm": 0.3125, + "learning_rate": 0.001980942760894186, + "loss": 0.1953, + "step": 7235 + }, + { + "epoch": 0.06281195475733718, + "grad_norm": 0.23828125, + "learning_rate": 0.001980936668113073, + "loss": 0.105, + "step": 7236 + }, + { + "epoch": 0.06282063523754135, + "grad_norm": 0.263671875, + "learning_rate": 0.001980930574368581, + "loss": 0.1387, + "step": 7237 + }, + { + "epoch": 0.06282931571774551, + "grad_norm": 0.337890625, + "learning_rate": 0.0019809244796607163, + "loss": 0.1602, + "step": 7238 + }, + { + "epoch": 0.06283799619794966, + "grad_norm": 0.353515625, + "learning_rate": 0.0019809183839894864, + "loss": 0.2109, + "step": 7239 + }, + { + "epoch": 0.06284667667815383, + "grad_norm": 0.349609375, + "learning_rate": 0.001980912287354897, + "loss": 0.1309, + "step": 7240 + }, + { + "epoch": 0.062855357158358, + "grad_norm": 0.12060546875, + "learning_rate": 0.0019809061897569547, + "loss": 0.1055, + "step": 7241 + }, + { + "epoch": 0.06286403763856216, + "grad_norm": 0.1533203125, + "learning_rate": 0.0019809000911956668, + "loss": 0.1768, + "step": 7242 + }, + { + "epoch": 0.06287271811876632, + "grad_norm": 0.1591796875, + "learning_rate": 0.0019808939916710405, + "loss": 0.1162, + "step": 7243 + }, + { + "epoch": 0.06288139859897049, + "grad_norm": 0.431640625, + "learning_rate": 0.001980887891183081, + "loss": 0.1182, + "step": 7244 + }, + { + "epoch": 0.06289007907917465, + "grad_norm": 0.1689453125, + "learning_rate": 0.0019808817897317963, + "loss": 0.1445, + "step": 7245 + }, + { + "epoch": 0.06289875955937882, + "grad_norm": 0.0986328125, + "learning_rate": 0.0019808756873171923, + "loss": 0.1221, + "step": 7246 + }, + { + "epoch": 0.06290744003958298, + "grad_norm": 0.205078125, + "learning_rate": 0.0019808695839392755, + "loss": 0.1152, + "step": 7247 + }, + { + "epoch": 0.06291612051978715, + "grad_norm": 0.2373046875, + "learning_rate": 
0.0019808634795980534, + "loss": 0.1562, + "step": 7248 + }, + { + "epoch": 0.06292480099999131, + "grad_norm": 0.2255859375, + "learning_rate": 0.001980857374293532, + "loss": 0.1387, + "step": 7249 + }, + { + "epoch": 0.06293348148019548, + "grad_norm": 0.345703125, + "learning_rate": 0.0019808512680257184, + "loss": 0.1055, + "step": 7250 + }, + { + "epoch": 0.06294216196039965, + "grad_norm": 0.56640625, + "learning_rate": 0.001980845160794619, + "loss": 0.1914, + "step": 7251 + }, + { + "epoch": 0.06295084244060381, + "grad_norm": 0.1123046875, + "learning_rate": 0.001980839052600241, + "loss": 0.1216, + "step": 7252 + }, + { + "epoch": 0.06295952292080798, + "grad_norm": 0.2138671875, + "learning_rate": 0.00198083294344259, + "loss": 0.1328, + "step": 7253 + }, + { + "epoch": 0.06296820340101214, + "grad_norm": 0.451171875, + "learning_rate": 0.0019808268333216736, + "loss": 0.1113, + "step": 7254 + }, + { + "epoch": 0.0629768838812163, + "grad_norm": 0.52734375, + "learning_rate": 0.001980820722237498, + "loss": 0.0933, + "step": 7255 + }, + { + "epoch": 0.06298556436142047, + "grad_norm": 0.875, + "learning_rate": 0.0019808146101900704, + "loss": 0.1367, + "step": 7256 + }, + { + "epoch": 0.06299424484162464, + "grad_norm": 0.35546875, + "learning_rate": 0.001980808497179397, + "loss": 0.1465, + "step": 7257 + }, + { + "epoch": 0.0630029253218288, + "grad_norm": 0.310546875, + "learning_rate": 0.0019808023832054845, + "loss": 0.1279, + "step": 7258 + }, + { + "epoch": 0.06301160580203297, + "grad_norm": 0.443359375, + "learning_rate": 0.00198079626826834, + "loss": 0.1465, + "step": 7259 + }, + { + "epoch": 0.06302028628223713, + "grad_norm": 0.13671875, + "learning_rate": 0.00198079015236797, + "loss": 0.1016, + "step": 7260 + }, + { + "epoch": 0.0630289667624413, + "grad_norm": 0.490234375, + "learning_rate": 0.0019807840355043813, + "loss": 0.1299, + "step": 7261 + }, + { + "epoch": 0.06303764724264546, + "grad_norm": 0.400390625, + "learning_rate": 0.0019807779176775797, + "loss": 0.1641, + "step": 7262 + }, + { + "epoch": 0.06304632772284963, + "grad_norm": 0.51953125, + "learning_rate": 0.001980771798887573, + "loss": 0.1523, + "step": 7263 + }, + { + "epoch": 0.06305500820305379, + "grad_norm": 0.34375, + "learning_rate": 0.0019807656791343674, + "loss": 0.1406, + "step": 7264 + }, + { + "epoch": 0.06306368868325796, + "grad_norm": 0.408203125, + "learning_rate": 0.0019807595584179698, + "loss": 0.1416, + "step": 7265 + }, + { + "epoch": 0.06307236916346212, + "grad_norm": 0.75390625, + "learning_rate": 0.001980753436738387, + "loss": 0.1895, + "step": 7266 + }, + { + "epoch": 0.06308104964366629, + "grad_norm": 0.09228515625, + "learning_rate": 0.0019807473140956255, + "loss": 0.1377, + "step": 7267 + }, + { + "epoch": 0.06308973012387045, + "grad_norm": 0.27734375, + "learning_rate": 0.0019807411904896916, + "loss": 0.1416, + "step": 7268 + }, + { + "epoch": 0.06309841060407462, + "grad_norm": 0.41015625, + "learning_rate": 0.0019807350659205923, + "loss": 0.165, + "step": 7269 + }, + { + "epoch": 0.06310709108427878, + "grad_norm": 0.150390625, + "learning_rate": 0.0019807289403883342, + "loss": 0.1895, + "step": 7270 + }, + { + "epoch": 0.06311577156448295, + "grad_norm": 0.69140625, + "learning_rate": 0.001980722813892925, + "loss": 0.1582, + "step": 7271 + }, + { + "epoch": 0.06312445204468711, + "grad_norm": 0.34765625, + "learning_rate": 0.0019807166864343703, + "loss": 0.1338, + "step": 7272 + }, + { + "epoch": 0.06313313252489128, + "grad_norm": 0.255859375, + 
"learning_rate": 0.0019807105580126765, + "loss": 0.1445, + "step": 7273 + }, + { + "epoch": 0.06314181300509544, + "grad_norm": 0.08056640625, + "learning_rate": 0.001980704428627851, + "loss": 0.1318, + "step": 7274 + }, + { + "epoch": 0.06315049348529961, + "grad_norm": 0.380859375, + "learning_rate": 0.0019806982982799007, + "loss": 0.1621, + "step": 7275 + }, + { + "epoch": 0.06315917396550377, + "grad_norm": 0.1416015625, + "learning_rate": 0.001980692166968832, + "loss": 0.1426, + "step": 7276 + }, + { + "epoch": 0.06316785444570794, + "grad_norm": 0.0703125, + "learning_rate": 0.0019806860346946514, + "loss": 0.1074, + "step": 7277 + }, + { + "epoch": 0.0631765349259121, + "grad_norm": 0.34765625, + "learning_rate": 0.001980679901457366, + "loss": 0.1426, + "step": 7278 + }, + { + "epoch": 0.06318521540611627, + "grad_norm": 0.11767578125, + "learning_rate": 0.001980673767256982, + "loss": 0.123, + "step": 7279 + }, + { + "epoch": 0.06319389588632043, + "grad_norm": 0.10107421875, + "learning_rate": 0.0019806676320935066, + "loss": 0.1211, + "step": 7280 + }, + { + "epoch": 0.0632025763665246, + "grad_norm": 0.11767578125, + "learning_rate": 0.001980661495966946, + "loss": 0.1875, + "step": 7281 + }, + { + "epoch": 0.06321125684672876, + "grad_norm": 0.1689453125, + "learning_rate": 0.0019806553588773076, + "loss": 0.1235, + "step": 7282 + }, + { + "epoch": 0.06321993732693293, + "grad_norm": 0.3515625, + "learning_rate": 0.0019806492208245974, + "loss": 0.1348, + "step": 7283 + }, + { + "epoch": 0.0632286178071371, + "grad_norm": 0.28125, + "learning_rate": 0.001980643081808823, + "loss": 0.1738, + "step": 7284 + }, + { + "epoch": 0.06323729828734126, + "grad_norm": 0.3671875, + "learning_rate": 0.00198063694182999, + "loss": 0.1455, + "step": 7285 + }, + { + "epoch": 0.06324597876754542, + "grad_norm": 0.478515625, + "learning_rate": 0.0019806308008881056, + "loss": 0.166, + "step": 7286 + }, + { + "epoch": 0.06325465924774959, + "grad_norm": 0.10888671875, + "learning_rate": 0.0019806246589831773, + "loss": 0.1216, + "step": 7287 + }, + { + "epoch": 0.06326333972795375, + "grad_norm": 0.2041015625, + "learning_rate": 0.0019806185161152104, + "loss": 0.1172, + "step": 7288 + }, + { + "epoch": 0.06327202020815792, + "grad_norm": 0.255859375, + "learning_rate": 0.0019806123722842127, + "loss": 0.2266, + "step": 7289 + }, + { + "epoch": 0.06328070068836208, + "grad_norm": 0.3515625, + "learning_rate": 0.0019806062274901905, + "loss": 0.1143, + "step": 7290 + }, + { + "epoch": 0.06328938116856625, + "grad_norm": 0.1025390625, + "learning_rate": 0.00198060008173315, + "loss": 0.1465, + "step": 7291 + }, + { + "epoch": 0.06329806164877042, + "grad_norm": 0.29296875, + "learning_rate": 0.001980593935013099, + "loss": 0.1299, + "step": 7292 + }, + { + "epoch": 0.06330674212897458, + "grad_norm": 0.09423828125, + "learning_rate": 0.001980587787330044, + "loss": 0.1465, + "step": 7293 + }, + { + "epoch": 0.06331542260917875, + "grad_norm": 0.12451171875, + "learning_rate": 0.0019805816386839906, + "loss": 0.1377, + "step": 7294 + }, + { + "epoch": 0.06332410308938291, + "grad_norm": 0.84765625, + "learning_rate": 0.0019805754890749473, + "loss": 0.1436, + "step": 7295 + }, + { + "epoch": 0.06333278356958708, + "grad_norm": 0.455078125, + "learning_rate": 0.001980569338502919, + "loss": 0.166, + "step": 7296 + }, + { + "epoch": 0.06334146404979124, + "grad_norm": 0.48046875, + "learning_rate": 0.0019805631869679137, + "loss": 0.1426, + "step": 7297 + }, + { + "epoch": 0.0633501445299954, + 
"grad_norm": 0.0947265625, + "learning_rate": 0.001980557034469938, + "loss": 0.1318, + "step": 7298 + }, + { + "epoch": 0.06335882501019957, + "grad_norm": 0.09326171875, + "learning_rate": 0.0019805508810089977, + "loss": 0.1523, + "step": 7299 + }, + { + "epoch": 0.06336750549040374, + "grad_norm": 0.181640625, + "learning_rate": 0.001980544726585101, + "loss": 0.1113, + "step": 7300 + }, + { + "epoch": 0.06337618597060789, + "grad_norm": 0.439453125, + "learning_rate": 0.001980538571198253, + "loss": 0.1484, + "step": 7301 + }, + { + "epoch": 0.06338486645081205, + "grad_norm": 0.220703125, + "learning_rate": 0.001980532414848462, + "loss": 0.1504, + "step": 7302 + }, + { + "epoch": 0.06339354693101622, + "grad_norm": 0.09326171875, + "learning_rate": 0.0019805262575357334, + "loss": 0.125, + "step": 7303 + }, + { + "epoch": 0.06340222741122038, + "grad_norm": 0.158203125, + "learning_rate": 0.001980520099260075, + "loss": 0.1074, + "step": 7304 + }, + { + "epoch": 0.06341090789142455, + "grad_norm": 0.26953125, + "learning_rate": 0.0019805139400214927, + "loss": 0.1621, + "step": 7305 + }, + { + "epoch": 0.06341958837162871, + "grad_norm": 0.11328125, + "learning_rate": 0.0019805077798199936, + "loss": 0.1445, + "step": 7306 + }, + { + "epoch": 0.06342826885183288, + "grad_norm": 0.546875, + "learning_rate": 0.0019805016186555846, + "loss": 0.1182, + "step": 7307 + }, + { + "epoch": 0.06343694933203704, + "grad_norm": 0.357421875, + "learning_rate": 0.0019804954565282723, + "loss": 0.1553, + "step": 7308 + }, + { + "epoch": 0.06344562981224121, + "grad_norm": 0.875, + "learning_rate": 0.001980489293438063, + "loss": 0.127, + "step": 7309 + }, + { + "epoch": 0.06345431029244537, + "grad_norm": 0.37890625, + "learning_rate": 0.001980483129384964, + "loss": 0.1592, + "step": 7310 + }, + { + "epoch": 0.06346299077264954, + "grad_norm": 0.08154296875, + "learning_rate": 0.0019804769643689824, + "loss": 0.1562, + "step": 7311 + }, + { + "epoch": 0.0634716712528537, + "grad_norm": 0.27734375, + "learning_rate": 0.0019804707983901238, + "loss": 0.1299, + "step": 7312 + }, + { + "epoch": 0.06348035173305787, + "grad_norm": 1.0703125, + "learning_rate": 0.0019804646314483957, + "loss": 0.1426, + "step": 7313 + }, + { + "epoch": 0.06348903221326203, + "grad_norm": 0.890625, + "learning_rate": 0.0019804584635438045, + "loss": 0.125, + "step": 7314 + }, + { + "epoch": 0.0634977126934662, + "grad_norm": 0.61328125, + "learning_rate": 0.0019804522946763577, + "loss": 0.1436, + "step": 7315 + }, + { + "epoch": 0.06350639317367036, + "grad_norm": 1.0, + "learning_rate": 0.001980446124846061, + "loss": 0.1279, + "step": 7316 + }, + { + "epoch": 0.06351507365387453, + "grad_norm": 0.52734375, + "learning_rate": 0.0019804399540529222, + "loss": 0.1631, + "step": 7317 + }, + { + "epoch": 0.0635237541340787, + "grad_norm": 0.244140625, + "learning_rate": 0.001980433782296947, + "loss": 0.1377, + "step": 7318 + }, + { + "epoch": 0.06353243461428286, + "grad_norm": 0.42578125, + "learning_rate": 0.0019804276095781426, + "loss": 0.1465, + "step": 7319 + }, + { + "epoch": 0.06354111509448702, + "grad_norm": 0.6875, + "learning_rate": 0.0019804214358965164, + "loss": 0.1279, + "step": 7320 + }, + { + "epoch": 0.06354979557469119, + "grad_norm": 0.51171875, + "learning_rate": 0.001980415261252074, + "loss": 0.1289, + "step": 7321 + }, + { + "epoch": 0.06355847605489535, + "grad_norm": 0.66796875, + "learning_rate": 0.001980409085644823, + "loss": 0.1064, + "step": 7322 + }, + { + "epoch": 0.06356715653509952, + 
"grad_norm": 0.578125, + "learning_rate": 0.0019804029090747695, + "loss": 0.1299, + "step": 7323 + }, + { + "epoch": 0.06357583701530369, + "grad_norm": 0.34375, + "learning_rate": 0.0019803967315419205, + "loss": 0.2012, + "step": 7324 + }, + { + "epoch": 0.06358451749550785, + "grad_norm": 0.29296875, + "learning_rate": 0.0019803905530462835, + "loss": 0.1523, + "step": 7325 + }, + { + "epoch": 0.06359319797571202, + "grad_norm": 0.2265625, + "learning_rate": 0.001980384373587864, + "loss": 0.1069, + "step": 7326 + }, + { + "epoch": 0.06360187845591618, + "grad_norm": 0.412109375, + "learning_rate": 0.0019803781931666694, + "loss": 0.1182, + "step": 7327 + }, + { + "epoch": 0.06361055893612035, + "grad_norm": 0.390625, + "learning_rate": 0.001980372011782707, + "loss": 0.2002, + "step": 7328 + }, + { + "epoch": 0.06361923941632451, + "grad_norm": 0.380859375, + "learning_rate": 0.0019803658294359826, + "loss": 0.1348, + "step": 7329 + }, + { + "epoch": 0.06362791989652868, + "grad_norm": 0.197265625, + "learning_rate": 0.0019803596461265035, + "loss": 0.1807, + "step": 7330 + }, + { + "epoch": 0.06363660037673284, + "grad_norm": 0.66796875, + "learning_rate": 0.0019803534618542764, + "loss": 0.1543, + "step": 7331 + }, + { + "epoch": 0.063645280856937, + "grad_norm": 0.07666015625, + "learning_rate": 0.0019803472766193077, + "loss": 0.126, + "step": 7332 + }, + { + "epoch": 0.06365396133714117, + "grad_norm": 0.51171875, + "learning_rate": 0.0019803410904216047, + "loss": 0.1328, + "step": 7333 + }, + { + "epoch": 0.06366264181734534, + "grad_norm": 0.4296875, + "learning_rate": 0.0019803349032611736, + "loss": 0.1484, + "step": 7334 + }, + { + "epoch": 0.0636713222975495, + "grad_norm": 0.51953125, + "learning_rate": 0.0019803287151380217, + "loss": 0.1367, + "step": 7335 + }, + { + "epoch": 0.06368000277775367, + "grad_norm": 0.79296875, + "learning_rate": 0.0019803225260521555, + "loss": 0.2344, + "step": 7336 + }, + { + "epoch": 0.06368868325795783, + "grad_norm": 0.79296875, + "learning_rate": 0.001980316336003582, + "loss": 0.1299, + "step": 7337 + }, + { + "epoch": 0.063697363738162, + "grad_norm": 0.3359375, + "learning_rate": 0.0019803101449923075, + "loss": 0.1465, + "step": 7338 + }, + { + "epoch": 0.06370604421836616, + "grad_norm": 0.1875, + "learning_rate": 0.0019803039530183393, + "loss": 0.125, + "step": 7339 + }, + { + "epoch": 0.06371472469857033, + "grad_norm": 0.21484375, + "learning_rate": 0.0019802977600816836, + "loss": 0.1484, + "step": 7340 + }, + { + "epoch": 0.06372340517877449, + "grad_norm": 0.1259765625, + "learning_rate": 0.001980291566182348, + "loss": 0.124, + "step": 7341 + }, + { + "epoch": 0.06373208565897866, + "grad_norm": 0.48828125, + "learning_rate": 0.0019802853713203382, + "loss": 0.166, + "step": 7342 + }, + { + "epoch": 0.06374076613918282, + "grad_norm": 0.390625, + "learning_rate": 0.001980279175495662, + "loss": 0.1562, + "step": 7343 + }, + { + "epoch": 0.06374944661938699, + "grad_norm": 0.09326171875, + "learning_rate": 0.0019802729787083257, + "loss": 0.1172, + "step": 7344 + }, + { + "epoch": 0.06375812709959115, + "grad_norm": 0.1484375, + "learning_rate": 0.0019802667809583354, + "loss": 0.1221, + "step": 7345 + }, + { + "epoch": 0.06376680757979532, + "grad_norm": 0.671875, + "learning_rate": 0.0019802605822456993, + "loss": 0.1328, + "step": 7346 + }, + { + "epoch": 0.06377548805999948, + "grad_norm": 1.3671875, + "learning_rate": 0.001980254382570423, + "loss": 0.1309, + "step": 7347 + }, + { + "epoch": 0.06378416854020365, + 
"grad_norm": 0.322265625, + "learning_rate": 0.001980248181932514, + "loss": 0.1367, + "step": 7348 + }, + { + "epoch": 0.06379284902040781, + "grad_norm": 0.2001953125, + "learning_rate": 0.001980241980331979, + "loss": 0.1118, + "step": 7349 + }, + { + "epoch": 0.06380152950061198, + "grad_norm": 0.63671875, + "learning_rate": 0.0019802357777688245, + "loss": 0.1328, + "step": 7350 + }, + { + "epoch": 0.06381020998081614, + "grad_norm": 0.07421875, + "learning_rate": 0.001980229574243057, + "loss": 0.1016, + "step": 7351 + }, + { + "epoch": 0.06381889046102031, + "grad_norm": 0.2275390625, + "learning_rate": 0.001980223369754684, + "loss": 0.1182, + "step": 7352 + }, + { + "epoch": 0.06382757094122447, + "grad_norm": 0.453125, + "learning_rate": 0.0019802171643037115, + "loss": 0.1118, + "step": 7353 + }, + { + "epoch": 0.06383625142142864, + "grad_norm": 0.42578125, + "learning_rate": 0.0019802109578901475, + "loss": 0.123, + "step": 7354 + }, + { + "epoch": 0.0638449319016328, + "grad_norm": 0.478515625, + "learning_rate": 0.0019802047505139975, + "loss": 0.1113, + "step": 7355 + }, + { + "epoch": 0.06385361238183697, + "grad_norm": 0.251953125, + "learning_rate": 0.001980198542175269, + "loss": 0.1562, + "step": 7356 + }, + { + "epoch": 0.06386229286204113, + "grad_norm": 1.0703125, + "learning_rate": 0.001980192332873969, + "loss": 0.1348, + "step": 7357 + }, + { + "epoch": 0.0638709733422453, + "grad_norm": 0.205078125, + "learning_rate": 0.001980186122610103, + "loss": 0.1699, + "step": 7358 + }, + { + "epoch": 0.06387965382244946, + "grad_norm": 0.27734375, + "learning_rate": 0.001980179911383679, + "loss": 0.1289, + "step": 7359 + }, + { + "epoch": 0.06388833430265363, + "grad_norm": 0.5625, + "learning_rate": 0.001980173699194704, + "loss": 0.1162, + "step": 7360 + }, + { + "epoch": 0.0638970147828578, + "grad_norm": 0.271484375, + "learning_rate": 0.0019801674860431837, + "loss": 0.1592, + "step": 7361 + }, + { + "epoch": 0.06390569526306195, + "grad_norm": 0.1513671875, + "learning_rate": 0.0019801612719291257, + "loss": 0.1504, + "step": 7362 + }, + { + "epoch": 0.06391437574326611, + "grad_norm": 0.353515625, + "learning_rate": 0.001980155056852536, + "loss": 0.1504, + "step": 7363 + }, + { + "epoch": 0.06392305622347028, + "grad_norm": 1.0703125, + "learning_rate": 0.0019801488408134225, + "loss": 0.1719, + "step": 7364 + }, + { + "epoch": 0.06393173670367444, + "grad_norm": 0.0927734375, + "learning_rate": 0.0019801426238117916, + "loss": 0.1279, + "step": 7365 + }, + { + "epoch": 0.0639404171838786, + "grad_norm": 0.55078125, + "learning_rate": 0.0019801364058476497, + "loss": 0.1328, + "step": 7366 + }, + { + "epoch": 0.06394909766408277, + "grad_norm": 0.09228515625, + "learning_rate": 0.001980130186921004, + "loss": 0.1641, + "step": 7367 + }, + { + "epoch": 0.06395777814428694, + "grad_norm": 0.333984375, + "learning_rate": 0.001980123967031861, + "loss": 0.0938, + "step": 7368 + }, + { + "epoch": 0.0639664586244911, + "grad_norm": 0.330078125, + "learning_rate": 0.001980117746180228, + "loss": 0.1133, + "step": 7369 + }, + { + "epoch": 0.06397513910469527, + "grad_norm": 0.3359375, + "learning_rate": 0.0019801115243661105, + "loss": 0.168, + "step": 7370 + }, + { + "epoch": 0.06398381958489943, + "grad_norm": 0.875, + "learning_rate": 0.001980105301589517, + "loss": 0.1367, + "step": 7371 + }, + { + "epoch": 0.0639925000651036, + "grad_norm": 0.82421875, + "learning_rate": 0.001980099077850454, + "loss": 0.1436, + "step": 7372 + }, + { + "epoch": 0.06400118054530776, 
+ "grad_norm": 0.26953125, + "learning_rate": 0.001980092853148927, + "loss": 0.1279, + "step": 7373 + }, + { + "epoch": 0.06400986102551193, + "grad_norm": 0.373046875, + "learning_rate": 0.001980086627484944, + "loss": 0.1631, + "step": 7374 + }, + { + "epoch": 0.06401854150571609, + "grad_norm": 0.2890625, + "learning_rate": 0.0019800804008585113, + "loss": 0.1357, + "step": 7375 + }, + { + "epoch": 0.06402722198592026, + "grad_norm": 0.55078125, + "learning_rate": 0.001980074173269636, + "loss": 0.1504, + "step": 7376 + }, + { + "epoch": 0.06403590246612442, + "grad_norm": 0.5234375, + "learning_rate": 0.001980067944718325, + "loss": 0.0928, + "step": 7377 + }, + { + "epoch": 0.06404458294632859, + "grad_norm": 0.1064453125, + "learning_rate": 0.0019800617152045848, + "loss": 0.1309, + "step": 7378 + }, + { + "epoch": 0.06405326342653275, + "grad_norm": 0.1240234375, + "learning_rate": 0.0019800554847284224, + "loss": 0.1157, + "step": 7379 + }, + { + "epoch": 0.06406194390673692, + "grad_norm": 0.197265625, + "learning_rate": 0.0019800492532898446, + "loss": 0.1211, + "step": 7380 + }, + { + "epoch": 0.06407062438694108, + "grad_norm": 0.328125, + "learning_rate": 0.001980043020888858, + "loss": 0.127, + "step": 7381 + }, + { + "epoch": 0.06407930486714525, + "grad_norm": 0.30078125, + "learning_rate": 0.00198003678752547, + "loss": 0.1523, + "step": 7382 + }, + { + "epoch": 0.06408798534734941, + "grad_norm": 0.1318359375, + "learning_rate": 0.0019800305531996864, + "loss": 0.1504, + "step": 7383 + }, + { + "epoch": 0.06409666582755358, + "grad_norm": 0.177734375, + "learning_rate": 0.0019800243179115143, + "loss": 0.1143, + "step": 7384 + }, + { + "epoch": 0.06410534630775774, + "grad_norm": 0.3046875, + "learning_rate": 0.001980018081660962, + "loss": 0.1465, + "step": 7385 + }, + { + "epoch": 0.06411402678796191, + "grad_norm": 0.1611328125, + "learning_rate": 0.0019800118444480343, + "loss": 0.1064, + "step": 7386 + }, + { + "epoch": 0.06412270726816607, + "grad_norm": 0.41015625, + "learning_rate": 0.001980005606272739, + "loss": 0.1318, + "step": 7387 + }, + { + "epoch": 0.06413138774837024, + "grad_norm": 0.74609375, + "learning_rate": 0.001979999367135083, + "loss": 0.1465, + "step": 7388 + }, + { + "epoch": 0.0641400682285744, + "grad_norm": 0.36328125, + "learning_rate": 0.0019799931270350728, + "loss": 0.124, + "step": 7389 + }, + { + "epoch": 0.06414874870877857, + "grad_norm": 0.384765625, + "learning_rate": 0.0019799868859727153, + "loss": 0.1328, + "step": 7390 + }, + { + "epoch": 0.06415742918898273, + "grad_norm": 0.1142578125, + "learning_rate": 0.0019799806439480174, + "loss": 0.1201, + "step": 7391 + }, + { + "epoch": 0.0641661096691869, + "grad_norm": 0.1005859375, + "learning_rate": 0.001979974400960986, + "loss": 0.1855, + "step": 7392 + }, + { + "epoch": 0.06417479014939106, + "grad_norm": 0.58203125, + "learning_rate": 0.0019799681570116276, + "loss": 0.1133, + "step": 7393 + }, + { + "epoch": 0.06418347062959523, + "grad_norm": 0.56640625, + "learning_rate": 0.001979961912099949, + "loss": 0.0977, + "step": 7394 + }, + { + "epoch": 0.0641921511097994, + "grad_norm": 0.158203125, + "learning_rate": 0.001979955666225958, + "loss": 0.2617, + "step": 7395 + }, + { + "epoch": 0.06420083159000356, + "grad_norm": 0.23046875, + "learning_rate": 0.00197994941938966, + "loss": 0.168, + "step": 7396 + }, + { + "epoch": 0.06420951207020772, + "grad_norm": 0.2392578125, + "learning_rate": 0.001979943171591063, + "loss": 0.1133, + "step": 7397 + }, + { + "epoch": 
0.06421819255041189, + "grad_norm": 0.6171875, + "learning_rate": 0.0019799369228301732, + "loss": 0.1211, + "step": 7398 + }, + { + "epoch": 0.06422687303061606, + "grad_norm": 0.439453125, + "learning_rate": 0.0019799306731069976, + "loss": 0.1104, + "step": 7399 + }, + { + "epoch": 0.06423555351082022, + "grad_norm": 0.18359375, + "learning_rate": 0.001979924422421543, + "loss": 0.1533, + "step": 7400 + }, + { + "epoch": 0.06424423399102439, + "grad_norm": 0.2353515625, + "learning_rate": 0.0019799181707738167, + "loss": 0.1367, + "step": 7401 + }, + { + "epoch": 0.06425291447122855, + "grad_norm": 0.1826171875, + "learning_rate": 0.0019799119181638245, + "loss": 0.1416, + "step": 7402 + }, + { + "epoch": 0.06426159495143272, + "grad_norm": 0.1162109375, + "learning_rate": 0.001979905664591574, + "loss": 0.1436, + "step": 7403 + }, + { + "epoch": 0.06427027543163688, + "grad_norm": 0.73828125, + "learning_rate": 0.0019798994100570723, + "loss": 0.166, + "step": 7404 + }, + { + "epoch": 0.06427895591184105, + "grad_norm": 0.24609375, + "learning_rate": 0.001979893154560325, + "loss": 0.1562, + "step": 7405 + }, + { + "epoch": 0.06428763639204521, + "grad_norm": 0.287109375, + "learning_rate": 0.001979886898101341, + "loss": 0.1426, + "step": 7406 + }, + { + "epoch": 0.06429631687224938, + "grad_norm": 0.275390625, + "learning_rate": 0.0019798806406801247, + "loss": 0.1289, + "step": 7407 + }, + { + "epoch": 0.06430499735245354, + "grad_norm": 0.09716796875, + "learning_rate": 0.0019798743822966843, + "loss": 0.127, + "step": 7408 + }, + { + "epoch": 0.0643136778326577, + "grad_norm": 0.73046875, + "learning_rate": 0.001979868122951027, + "loss": 0.1396, + "step": 7409 + }, + { + "epoch": 0.06432235831286187, + "grad_norm": 0.1884765625, + "learning_rate": 0.001979861862643159, + "loss": 0.1484, + "step": 7410 + }, + { + "epoch": 0.06433103879306604, + "grad_norm": 0.78515625, + "learning_rate": 0.001979855601373087, + "loss": 0.1611, + "step": 7411 + }, + { + "epoch": 0.0643397192732702, + "grad_norm": 0.2421875, + "learning_rate": 0.001979849339140819, + "loss": 0.1147, + "step": 7412 + }, + { + "epoch": 0.06434839975347437, + "grad_norm": 0.1630859375, + "learning_rate": 0.00197984307594636, + "loss": 0.1543, + "step": 7413 + }, + { + "epoch": 0.06435708023367853, + "grad_norm": 0.2197265625, + "learning_rate": 0.0019798368117897184, + "loss": 0.165, + "step": 7414 + }, + { + "epoch": 0.0643657607138827, + "grad_norm": 0.10205078125, + "learning_rate": 0.0019798305466709, + "loss": 0.1426, + "step": 7415 + }, + { + "epoch": 0.06437444119408686, + "grad_norm": 0.2353515625, + "learning_rate": 0.0019798242805899127, + "loss": 0.1465, + "step": 7416 + }, + { + "epoch": 0.06438312167429103, + "grad_norm": 0.73828125, + "learning_rate": 0.001979818013546762, + "loss": 0.124, + "step": 7417 + }, + { + "epoch": 0.06439180215449519, + "grad_norm": 1.109375, + "learning_rate": 0.0019798117455414564, + "loss": 0.106, + "step": 7418 + }, + { + "epoch": 0.06440048263469936, + "grad_norm": 0.271484375, + "learning_rate": 0.0019798054765740015, + "loss": 0.1758, + "step": 7419 + }, + { + "epoch": 0.06440916311490352, + "grad_norm": 0.099609375, + "learning_rate": 0.0019797992066444047, + "loss": 0.1182, + "step": 7420 + }, + { + "epoch": 0.06441784359510769, + "grad_norm": 0.5234375, + "learning_rate": 0.0019797929357526726, + "loss": 0.1025, + "step": 7421 + }, + { + "epoch": 0.06442652407531185, + "grad_norm": 0.89453125, + "learning_rate": 0.001979786663898812, + "loss": 0.1885, + "step": 7422 + 
}, + { + "epoch": 0.06443520455551602, + "grad_norm": 0.255859375, + "learning_rate": 0.00197978039108283, + "loss": 0.1787, + "step": 7423 + }, + { + "epoch": 0.06444388503572017, + "grad_norm": 0.34375, + "learning_rate": 0.0019797741173047338, + "loss": 0.1128, + "step": 7424 + }, + { + "epoch": 0.06445256551592433, + "grad_norm": 0.65234375, + "learning_rate": 0.0019797678425645293, + "loss": 0.127, + "step": 7425 + }, + { + "epoch": 0.0644612459961285, + "grad_norm": 0.1064453125, + "learning_rate": 0.0019797615668622243, + "loss": 0.0952, + "step": 7426 + }, + { + "epoch": 0.06446992647633266, + "grad_norm": 0.265625, + "learning_rate": 0.0019797552901978247, + "loss": 0.1367, + "step": 7427 + }, + { + "epoch": 0.06447860695653683, + "grad_norm": 0.10693359375, + "learning_rate": 0.0019797490125713383, + "loss": 0.1514, + "step": 7428 + }, + { + "epoch": 0.064487287436741, + "grad_norm": 0.2431640625, + "learning_rate": 0.0019797427339827717, + "loss": 0.1426, + "step": 7429 + }, + { + "epoch": 0.06449596791694516, + "grad_norm": 0.2080078125, + "learning_rate": 0.001979736454432132, + "loss": 0.1465, + "step": 7430 + }, + { + "epoch": 0.06450464839714933, + "grad_norm": 0.059814453125, + "learning_rate": 0.0019797301739194247, + "loss": 0.1206, + "step": 7431 + }, + { + "epoch": 0.06451332887735349, + "grad_norm": 0.357421875, + "learning_rate": 0.0019797238924446586, + "loss": 0.1328, + "step": 7432 + }, + { + "epoch": 0.06452200935755766, + "grad_norm": 0.625, + "learning_rate": 0.001979717610007839, + "loss": 0.1543, + "step": 7433 + }, + { + "epoch": 0.06453068983776182, + "grad_norm": 0.06787109375, + "learning_rate": 0.001979711326608974, + "loss": 0.0938, + "step": 7434 + }, + { + "epoch": 0.06453937031796599, + "grad_norm": 0.5859375, + "learning_rate": 0.00197970504224807, + "loss": 0.0874, + "step": 7435 + }, + { + "epoch": 0.06454805079817015, + "grad_norm": 0.64453125, + "learning_rate": 0.0019796987569251335, + "loss": 0.2012, + "step": 7436 + }, + { + "epoch": 0.06455673127837432, + "grad_norm": 0.10888671875, + "learning_rate": 0.0019796924706401717, + "loss": 0.125, + "step": 7437 + }, + { + "epoch": 0.06456541175857848, + "grad_norm": 0.345703125, + "learning_rate": 0.0019796861833931913, + "loss": 0.1245, + "step": 7438 + }, + { + "epoch": 0.06457409223878265, + "grad_norm": 0.1103515625, + "learning_rate": 0.0019796798951841996, + "loss": 0.1602, + "step": 7439 + }, + { + "epoch": 0.06458277271898681, + "grad_norm": 0.447265625, + "learning_rate": 0.0019796736060132027, + "loss": 0.1299, + "step": 7440 + }, + { + "epoch": 0.06459145319919098, + "grad_norm": 0.46875, + "learning_rate": 0.0019796673158802083, + "loss": 0.1133, + "step": 7441 + }, + { + "epoch": 0.06460013367939514, + "grad_norm": 0.578125, + "learning_rate": 0.001979661024785223, + "loss": 0.1641, + "step": 7442 + }, + { + "epoch": 0.0646088141595993, + "grad_norm": 0.6484375, + "learning_rate": 0.0019796547327282535, + "loss": 0.1201, + "step": 7443 + }, + { + "epoch": 0.06461749463980347, + "grad_norm": 0.78515625, + "learning_rate": 0.001979648439709307, + "loss": 0.1602, + "step": 7444 + }, + { + "epoch": 0.06462617512000764, + "grad_norm": 0.98828125, + "learning_rate": 0.0019796421457283896, + "loss": 0.125, + "step": 7445 + }, + { + "epoch": 0.0646348556002118, + "grad_norm": 0.216796875, + "learning_rate": 0.0019796358507855093, + "loss": 0.1289, + "step": 7446 + }, + { + "epoch": 0.06464353608041597, + "grad_norm": 0.478515625, + "learning_rate": 0.0019796295548806723, + "loss": 0.1465, + 
"step": 7447 + }, + { + "epoch": 0.06465221656062013, + "grad_norm": 0.58203125, + "learning_rate": 0.0019796232580138857, + "loss": 0.1113, + "step": 7448 + }, + { + "epoch": 0.0646608970408243, + "grad_norm": 0.1396484375, + "learning_rate": 0.0019796169601851562, + "loss": 0.104, + "step": 7449 + }, + { + "epoch": 0.06466957752102846, + "grad_norm": 0.185546875, + "learning_rate": 0.0019796106613944905, + "loss": 0.1289, + "step": 7450 + }, + { + "epoch": 0.06467825800123263, + "grad_norm": 0.32421875, + "learning_rate": 0.0019796043616418964, + "loss": 0.1299, + "step": 7451 + }, + { + "epoch": 0.06468693848143679, + "grad_norm": 0.2373046875, + "learning_rate": 0.00197959806092738, + "loss": 0.1602, + "step": 7452 + }, + { + "epoch": 0.06469561896164096, + "grad_norm": 0.11669921875, + "learning_rate": 0.0019795917592509483, + "loss": 0.1094, + "step": 7453 + }, + { + "epoch": 0.06470429944184512, + "grad_norm": 0.5625, + "learning_rate": 0.0019795854566126083, + "loss": 0.123, + "step": 7454 + }, + { + "epoch": 0.06471297992204929, + "grad_norm": 0.7109375, + "learning_rate": 0.001979579153012367, + "loss": 0.1846, + "step": 7455 + }, + { + "epoch": 0.06472166040225345, + "grad_norm": 0.66015625, + "learning_rate": 0.001979572848450231, + "loss": 0.1621, + "step": 7456 + }, + { + "epoch": 0.06473034088245762, + "grad_norm": 0.1396484375, + "learning_rate": 0.0019795665429262073, + "loss": 0.166, + "step": 7457 + }, + { + "epoch": 0.06473902136266178, + "grad_norm": 0.09619140625, + "learning_rate": 0.001979560236440303, + "loss": 0.1416, + "step": 7458 + }, + { + "epoch": 0.06474770184286595, + "grad_norm": 0.22265625, + "learning_rate": 0.0019795539289925246, + "loss": 0.1309, + "step": 7459 + }, + { + "epoch": 0.06475638232307011, + "grad_norm": 0.3046875, + "learning_rate": 0.0019795476205828797, + "loss": 0.167, + "step": 7460 + }, + { + "epoch": 0.06476506280327428, + "grad_norm": 0.2314453125, + "learning_rate": 0.0019795413112113744, + "loss": 0.1289, + "step": 7461 + }, + { + "epoch": 0.06477374328347844, + "grad_norm": 0.36328125, + "learning_rate": 0.0019795350008780157, + "loss": 0.127, + "step": 7462 + }, + { + "epoch": 0.06478242376368261, + "grad_norm": 0.1357421875, + "learning_rate": 0.001979528689582811, + "loss": 0.1309, + "step": 7463 + }, + { + "epoch": 0.06479110424388677, + "grad_norm": 0.48046875, + "learning_rate": 0.001979522377325767, + "loss": 0.1562, + "step": 7464 + }, + { + "epoch": 0.06479978472409094, + "grad_norm": 0.6484375, + "learning_rate": 0.0019795160641068905, + "loss": 0.1211, + "step": 7465 + }, + { + "epoch": 0.0648084652042951, + "grad_norm": 0.171875, + "learning_rate": 0.0019795097499261884, + "loss": 0.1289, + "step": 7466 + }, + { + "epoch": 0.06481714568449927, + "grad_norm": 0.1640625, + "learning_rate": 0.0019795034347836675, + "loss": 0.127, + "step": 7467 + }, + { + "epoch": 0.06482582616470343, + "grad_norm": 0.08984375, + "learning_rate": 0.0019794971186793352, + "loss": 0.1348, + "step": 7468 + }, + { + "epoch": 0.0648345066449076, + "grad_norm": 0.205078125, + "learning_rate": 0.0019794908016131977, + "loss": 0.1582, + "step": 7469 + }, + { + "epoch": 0.06484318712511176, + "grad_norm": 0.1171875, + "learning_rate": 0.0019794844835852627, + "loss": 0.1738, + "step": 7470 + }, + { + "epoch": 0.06485186760531593, + "grad_norm": 0.10791015625, + "learning_rate": 0.001979478164595536, + "loss": 0.1318, + "step": 7471 + }, + { + "epoch": 0.0648605480855201, + "grad_norm": 0.142578125, + "learning_rate": 0.001979471844644026, + 
"loss": 0.1572, + "step": 7472 + }, + { + "epoch": 0.06486922856572426, + "grad_norm": 0.275390625, + "learning_rate": 0.0019794655237307386, + "loss": 0.1523, + "step": 7473 + }, + { + "epoch": 0.06487790904592843, + "grad_norm": 0.349609375, + "learning_rate": 0.001979459201855681, + "loss": 0.1328, + "step": 7474 + }, + { + "epoch": 0.06488658952613259, + "grad_norm": 0.26171875, + "learning_rate": 0.0019794528790188595, + "loss": 0.1445, + "step": 7475 + }, + { + "epoch": 0.06489527000633676, + "grad_norm": 0.111328125, + "learning_rate": 0.001979446555220282, + "loss": 0.1309, + "step": 7476 + }, + { + "epoch": 0.06490395048654092, + "grad_norm": 0.1845703125, + "learning_rate": 0.001979440230459955, + "loss": 0.1016, + "step": 7477 + }, + { + "epoch": 0.06491263096674509, + "grad_norm": 0.130859375, + "learning_rate": 0.001979433904737885, + "loss": 0.1426, + "step": 7478 + }, + { + "epoch": 0.06492131144694925, + "grad_norm": 0.6484375, + "learning_rate": 0.001979427578054079, + "loss": 0.1357, + "step": 7479 + }, + { + "epoch": 0.06492999192715342, + "grad_norm": 0.69140625, + "learning_rate": 0.001979421250408545, + "loss": 0.1504, + "step": 7480 + }, + { + "epoch": 0.06493867240735758, + "grad_norm": 0.357421875, + "learning_rate": 0.0019794149218012886, + "loss": 0.1118, + "step": 7481 + }, + { + "epoch": 0.06494735288756175, + "grad_norm": 0.296875, + "learning_rate": 0.0019794085922323176, + "loss": 0.1279, + "step": 7482 + }, + { + "epoch": 0.06495603336776591, + "grad_norm": 0.5390625, + "learning_rate": 0.0019794022617016386, + "loss": 0.1719, + "step": 7483 + }, + { + "epoch": 0.06496471384797008, + "grad_norm": 0.7265625, + "learning_rate": 0.0019793959302092583, + "loss": 0.1309, + "step": 7484 + }, + { + "epoch": 0.06497339432817423, + "grad_norm": 0.322265625, + "learning_rate": 0.0019793895977551837, + "loss": 0.1289, + "step": 7485 + }, + { + "epoch": 0.06498207480837839, + "grad_norm": 2.40625, + "learning_rate": 0.0019793832643394227, + "loss": 0.2021, + "step": 7486 + }, + { + "epoch": 0.06499075528858256, + "grad_norm": 0.09423828125, + "learning_rate": 0.00197937692996198, + "loss": 0.1484, + "step": 7487 + }, + { + "epoch": 0.06499943576878672, + "grad_norm": 0.1533203125, + "learning_rate": 0.0019793705946228647, + "loss": 0.1396, + "step": 7488 + }, + { + "epoch": 0.06500811624899089, + "grad_norm": 0.462890625, + "learning_rate": 0.001979364258322083, + "loss": 0.1309, + "step": 7489 + }, + { + "epoch": 0.06501679672919505, + "grad_norm": 0.1103515625, + "learning_rate": 0.001979357921059642, + "loss": 0.1641, + "step": 7490 + }, + { + "epoch": 0.06502547720939922, + "grad_norm": 0.255859375, + "learning_rate": 0.001979351582835548, + "loss": 0.0869, + "step": 7491 + }, + { + "epoch": 0.06503415768960338, + "grad_norm": 0.4140625, + "learning_rate": 0.0019793452436498078, + "loss": 0.1465, + "step": 7492 + }, + { + "epoch": 0.06504283816980755, + "grad_norm": 0.40234375, + "learning_rate": 0.0019793389035024293, + "loss": 0.1299, + "step": 7493 + }, + { + "epoch": 0.06505151865001171, + "grad_norm": 0.51953125, + "learning_rate": 0.001979332562393419, + "loss": 0.1758, + "step": 7494 + }, + { + "epoch": 0.06506019913021588, + "grad_norm": 0.474609375, + "learning_rate": 0.0019793262203227843, + "loss": 0.127, + "step": 7495 + }, + { + "epoch": 0.06506887961042004, + "grad_norm": 0.58984375, + "learning_rate": 0.001979319877290531, + "loss": 0.0996, + "step": 7496 + }, + { + "epoch": 0.06507756009062421, + "grad_norm": 0.48046875, + "learning_rate": 
0.001979313533296667, + "loss": 0.1426, + "step": 7497 + }, + { + "epoch": 0.06508624057082837, + "grad_norm": 0.2392578125, + "learning_rate": 0.001979307188341199, + "loss": 0.1543, + "step": 7498 + }, + { + "epoch": 0.06509492105103254, + "grad_norm": 0.068359375, + "learning_rate": 0.0019793008424241337, + "loss": 0.0962, + "step": 7499 + }, + { + "epoch": 0.0651036015312367, + "grad_norm": 0.1962890625, + "learning_rate": 0.0019792944955454783, + "loss": 0.1211, + "step": 7500 + }, + { + "epoch": 0.06511228201144087, + "grad_norm": 0.11376953125, + "learning_rate": 0.0019792881477052396, + "loss": 0.1465, + "step": 7501 + }, + { + "epoch": 0.06512096249164503, + "grad_norm": 0.10009765625, + "learning_rate": 0.0019792817989034246, + "loss": 0.1748, + "step": 7502 + }, + { + "epoch": 0.0651296429718492, + "grad_norm": 1.0078125, + "learning_rate": 0.0019792754491400402, + "loss": 0.1318, + "step": 7503 + }, + { + "epoch": 0.06513832345205336, + "grad_norm": 0.33984375, + "learning_rate": 0.001979269098415093, + "loss": 0.1045, + "step": 7504 + }, + { + "epoch": 0.06514700393225753, + "grad_norm": 0.66015625, + "learning_rate": 0.001979262746728591, + "loss": 0.1416, + "step": 7505 + }, + { + "epoch": 0.0651556844124617, + "grad_norm": 0.1201171875, + "learning_rate": 0.0019792563940805403, + "loss": 0.1348, + "step": 7506 + }, + { + "epoch": 0.06516436489266586, + "grad_norm": 0.07177734375, + "learning_rate": 0.001979250040470948, + "loss": 0.1621, + "step": 7507 + }, + { + "epoch": 0.06517304537287003, + "grad_norm": 0.5390625, + "learning_rate": 0.0019792436858998208, + "loss": 0.1406, + "step": 7508 + }, + { + "epoch": 0.06518172585307419, + "grad_norm": 0.201171875, + "learning_rate": 0.001979237330367166, + "loss": 0.1152, + "step": 7509 + }, + { + "epoch": 0.06519040633327836, + "grad_norm": 0.09716796875, + "learning_rate": 0.0019792309738729907, + "loss": 0.1191, + "step": 7510 + }, + { + "epoch": 0.06519908681348252, + "grad_norm": 0.380859375, + "learning_rate": 0.0019792246164173012, + "loss": 0.1377, + "step": 7511 + }, + { + "epoch": 0.06520776729368669, + "grad_norm": 0.3828125, + "learning_rate": 0.0019792182580001053, + "loss": 0.1494, + "step": 7512 + }, + { + "epoch": 0.06521644777389085, + "grad_norm": 0.0732421875, + "learning_rate": 0.0019792118986214093, + "loss": 0.165, + "step": 7513 + }, + { + "epoch": 0.06522512825409502, + "grad_norm": 0.2265625, + "learning_rate": 0.0019792055382812203, + "loss": 0.1377, + "step": 7514 + }, + { + "epoch": 0.06523380873429918, + "grad_norm": 0.07568359375, + "learning_rate": 0.0019791991769795457, + "loss": 0.1455, + "step": 7515 + }, + { + "epoch": 0.06524248921450335, + "grad_norm": 0.73828125, + "learning_rate": 0.001979192814716392, + "loss": 0.1465, + "step": 7516 + }, + { + "epoch": 0.06525116969470751, + "grad_norm": 0.1455078125, + "learning_rate": 0.001979186451491766, + "loss": 0.1357, + "step": 7517 + }, + { + "epoch": 0.06525985017491168, + "grad_norm": 0.09765625, + "learning_rate": 0.001979180087305675, + "loss": 0.1602, + "step": 7518 + }, + { + "epoch": 0.06526853065511584, + "grad_norm": 0.12451171875, + "learning_rate": 0.001979173722158126, + "loss": 0.1201, + "step": 7519 + }, + { + "epoch": 0.06527721113532, + "grad_norm": 0.80078125, + "learning_rate": 0.0019791673560491255, + "loss": 0.1367, + "step": 7520 + }, + { + "epoch": 0.06528589161552417, + "grad_norm": 0.640625, + "learning_rate": 0.0019791609889786813, + "loss": 0.126, + "step": 7521 + }, + { + "epoch": 0.06529457209572834, + "grad_norm": 
0.283203125, + "learning_rate": 0.0019791546209467995, + "loss": 0.1079, + "step": 7522 + }, + { + "epoch": 0.0653032525759325, + "grad_norm": 0.283203125, + "learning_rate": 0.001979148251953487, + "loss": 0.1089, + "step": 7523 + }, + { + "epoch": 0.06531193305613667, + "grad_norm": 0.6796875, + "learning_rate": 0.001979141881998752, + "loss": 0.1602, + "step": 7524 + }, + { + "epoch": 0.06532061353634083, + "grad_norm": 0.08935546875, + "learning_rate": 0.0019791355110826, + "loss": 0.1445, + "step": 7525 + }, + { + "epoch": 0.065329294016545, + "grad_norm": 0.37890625, + "learning_rate": 0.001979129139205039, + "loss": 0.2383, + "step": 7526 + }, + { + "epoch": 0.06533797449674916, + "grad_norm": 0.12890625, + "learning_rate": 0.001979122766366076, + "loss": 0.1602, + "step": 7527 + }, + { + "epoch": 0.06534665497695333, + "grad_norm": 1.015625, + "learning_rate": 0.001979116392565717, + "loss": 0.1201, + "step": 7528 + }, + { + "epoch": 0.06535533545715749, + "grad_norm": 0.10498046875, + "learning_rate": 0.0019791100178039693, + "loss": 0.1387, + "step": 7529 + }, + { + "epoch": 0.06536401593736166, + "grad_norm": 0.65625, + "learning_rate": 0.00197910364208084, + "loss": 0.1494, + "step": 7530 + }, + { + "epoch": 0.06537269641756582, + "grad_norm": 0.083984375, + "learning_rate": 0.001979097265396337, + "loss": 0.1006, + "step": 7531 + }, + { + "epoch": 0.06538137689776999, + "grad_norm": 0.19921875, + "learning_rate": 0.0019790908877504658, + "loss": 0.1367, + "step": 7532 + }, + { + "epoch": 0.06539005737797415, + "grad_norm": 0.232421875, + "learning_rate": 0.0019790845091432344, + "loss": 0.1279, + "step": 7533 + }, + { + "epoch": 0.06539873785817832, + "grad_norm": 0.255859375, + "learning_rate": 0.001979078129574649, + "loss": 0.0898, + "step": 7534 + }, + { + "epoch": 0.06540741833838248, + "grad_norm": 0.1953125, + "learning_rate": 0.001979071749044717, + "loss": 0.1445, + "step": 7535 + }, + { + "epoch": 0.06541609881858665, + "grad_norm": 0.25390625, + "learning_rate": 0.0019790653675534455, + "loss": 0.2109, + "step": 7536 + }, + { + "epoch": 0.06542477929879081, + "grad_norm": 0.55078125, + "learning_rate": 0.0019790589851008413, + "loss": 0.1426, + "step": 7537 + }, + { + "epoch": 0.06543345977899498, + "grad_norm": 0.291015625, + "learning_rate": 0.0019790526016869115, + "loss": 0.1406, + "step": 7538 + }, + { + "epoch": 0.06544214025919914, + "grad_norm": 0.400390625, + "learning_rate": 0.001979046217311663, + "loss": 0.1064, + "step": 7539 + }, + { + "epoch": 0.06545082073940331, + "grad_norm": 0.15234375, + "learning_rate": 0.0019790398319751027, + "loss": 0.1484, + "step": 7540 + }, + { + "epoch": 0.06545950121960747, + "grad_norm": 0.1845703125, + "learning_rate": 0.0019790334456772376, + "loss": 0.1562, + "step": 7541 + }, + { + "epoch": 0.06546818169981164, + "grad_norm": 0.1103515625, + "learning_rate": 0.0019790270584180746, + "loss": 0.1611, + "step": 7542 + }, + { + "epoch": 0.0654768621800158, + "grad_norm": 0.1533203125, + "learning_rate": 0.0019790206701976207, + "loss": 0.1299, + "step": 7543 + }, + { + "epoch": 0.06548554266021997, + "grad_norm": 0.275390625, + "learning_rate": 0.001979014281015883, + "loss": 0.1152, + "step": 7544 + }, + { + "epoch": 0.06549422314042413, + "grad_norm": 0.765625, + "learning_rate": 0.001979007890872869, + "loss": 0.125, + "step": 7545 + }, + { + "epoch": 0.0655029036206283, + "grad_norm": 0.10986328125, + "learning_rate": 0.0019790014997685852, + "loss": 0.1318, + "step": 7546 + }, + { + "epoch": 0.06551158410083245, + 
"grad_norm": 0.1396484375, + "learning_rate": 0.001978995107703038, + "loss": 0.1533, + "step": 7547 + }, + { + "epoch": 0.06552026458103662, + "grad_norm": 0.7890625, + "learning_rate": 0.0019789887146762354, + "loss": 0.1182, + "step": 7548 + }, + { + "epoch": 0.06552894506124078, + "grad_norm": 0.28515625, + "learning_rate": 0.0019789823206881837, + "loss": 0.1553, + "step": 7549 + }, + { + "epoch": 0.06553762554144495, + "grad_norm": 0.083984375, + "learning_rate": 0.0019789759257388905, + "loss": 0.1299, + "step": 7550 + }, + { + "epoch": 0.06554630602164911, + "grad_norm": 0.11083984375, + "learning_rate": 0.0019789695298283623, + "loss": 0.1416, + "step": 7551 + }, + { + "epoch": 0.06555498650185328, + "grad_norm": 0.1337890625, + "learning_rate": 0.0019789631329566056, + "loss": 0.1465, + "step": 7552 + }, + { + "epoch": 0.06556366698205744, + "grad_norm": 0.494140625, + "learning_rate": 0.0019789567351236287, + "loss": 0.1064, + "step": 7553 + }, + { + "epoch": 0.06557234746226161, + "grad_norm": 0.21484375, + "learning_rate": 0.0019789503363294375, + "loss": 0.123, + "step": 7554 + }, + { + "epoch": 0.06558102794246577, + "grad_norm": 0.451171875, + "learning_rate": 0.0019789439365740396, + "loss": 0.1182, + "step": 7555 + }, + { + "epoch": 0.06558970842266994, + "grad_norm": 0.1376953125, + "learning_rate": 0.0019789375358574422, + "loss": 0.1729, + "step": 7556 + }, + { + "epoch": 0.0655983889028741, + "grad_norm": 0.4140625, + "learning_rate": 0.0019789311341796515, + "loss": 0.1162, + "step": 7557 + }, + { + "epoch": 0.06560706938307827, + "grad_norm": 0.10205078125, + "learning_rate": 0.0019789247315406752, + "loss": 0.1289, + "step": 7558 + }, + { + "epoch": 0.06561574986328243, + "grad_norm": 0.60546875, + "learning_rate": 0.00197891832794052, + "loss": 0.1167, + "step": 7559 + }, + { + "epoch": 0.0656244303434866, + "grad_norm": 0.2060546875, + "learning_rate": 0.0019789119233791924, + "loss": 0.1084, + "step": 7560 + }, + { + "epoch": 0.06563311082369076, + "grad_norm": 0.275390625, + "learning_rate": 0.0019789055178567002, + "loss": 0.1504, + "step": 7561 + }, + { + "epoch": 0.06564179130389493, + "grad_norm": 0.2578125, + "learning_rate": 0.00197889911137305, + "loss": 0.1133, + "step": 7562 + }, + { + "epoch": 0.0656504717840991, + "grad_norm": 0.380859375, + "learning_rate": 0.0019788927039282493, + "loss": 0.1143, + "step": 7563 + }, + { + "epoch": 0.06565915226430326, + "grad_norm": 0.2392578125, + "learning_rate": 0.0019788862955223046, + "loss": 0.1035, + "step": 7564 + }, + { + "epoch": 0.06566783274450742, + "grad_norm": 0.1806640625, + "learning_rate": 0.001978879886155223, + "loss": 0.1348, + "step": 7565 + }, + { + "epoch": 0.06567651322471159, + "grad_norm": 0.11181640625, + "learning_rate": 0.001978873475827012, + "loss": 0.1738, + "step": 7566 + }, + { + "epoch": 0.06568519370491575, + "grad_norm": 0.201171875, + "learning_rate": 0.0019788670645376773, + "loss": 0.1377, + "step": 7567 + }, + { + "epoch": 0.06569387418511992, + "grad_norm": 0.306640625, + "learning_rate": 0.0019788606522872275, + "loss": 0.125, + "step": 7568 + }, + { + "epoch": 0.06570255466532408, + "grad_norm": 0.0869140625, + "learning_rate": 0.001978854239075669, + "loss": 0.1133, + "step": 7569 + }, + { + "epoch": 0.06571123514552825, + "grad_norm": 0.3046875, + "learning_rate": 0.001978847824903008, + "loss": 0.1445, + "step": 7570 + }, + { + "epoch": 0.06571991562573241, + "grad_norm": 0.396484375, + "learning_rate": 0.0019788414097692527, + "loss": 0.126, + "step": 7571 + }, + { + 
"epoch": 0.06572859610593658, + "grad_norm": 0.1484375, + "learning_rate": 0.0019788349936744093, + "loss": 0.1221, + "step": 7572 + }, + { + "epoch": 0.06573727658614074, + "grad_norm": 0.09765625, + "learning_rate": 0.0019788285766184853, + "loss": 0.1367, + "step": 7573 + }, + { + "epoch": 0.06574595706634491, + "grad_norm": 0.142578125, + "learning_rate": 0.0019788221586014877, + "loss": 0.1729, + "step": 7574 + }, + { + "epoch": 0.06575463754654907, + "grad_norm": 0.255859375, + "learning_rate": 0.0019788157396234234, + "loss": 0.1592, + "step": 7575 + }, + { + "epoch": 0.06576331802675324, + "grad_norm": 0.41015625, + "learning_rate": 0.001978809319684299, + "loss": 0.2061, + "step": 7576 + }, + { + "epoch": 0.0657719985069574, + "grad_norm": 0.283203125, + "learning_rate": 0.0019788028987841226, + "loss": 0.1699, + "step": 7577 + }, + { + "epoch": 0.06578067898716157, + "grad_norm": 0.083984375, + "learning_rate": 0.0019787964769229, + "loss": 0.1533, + "step": 7578 + }, + { + "epoch": 0.06578935946736574, + "grad_norm": 0.068359375, + "learning_rate": 0.0019787900541006396, + "loss": 0.1064, + "step": 7579 + }, + { + "epoch": 0.0657980399475699, + "grad_norm": 0.3671875, + "learning_rate": 0.001978783630317347, + "loss": 0.1123, + "step": 7580 + }, + { + "epoch": 0.06580672042777407, + "grad_norm": 0.21484375, + "learning_rate": 0.0019787772055730297, + "loss": 0.1172, + "step": 7581 + }, + { + "epoch": 0.06581540090797823, + "grad_norm": 0.30078125, + "learning_rate": 0.0019787707798676946, + "loss": 0.1279, + "step": 7582 + }, + { + "epoch": 0.0658240813881824, + "grad_norm": 0.154296875, + "learning_rate": 0.0019787643532013497, + "loss": 0.1582, + "step": 7583 + }, + { + "epoch": 0.06583276186838656, + "grad_norm": 0.06298828125, + "learning_rate": 0.001978757925574001, + "loss": 0.1172, + "step": 7584 + }, + { + "epoch": 0.06584144234859073, + "grad_norm": 0.6875, + "learning_rate": 0.0019787514969856557, + "loss": 0.1328, + "step": 7585 + }, + { + "epoch": 0.06585012282879489, + "grad_norm": 0.09716796875, + "learning_rate": 0.0019787450674363214, + "loss": 0.1309, + "step": 7586 + }, + { + "epoch": 0.06585880330899906, + "grad_norm": 0.1982421875, + "learning_rate": 0.001978738636926004, + "loss": 0.1133, + "step": 7587 + }, + { + "epoch": 0.06586748378920322, + "grad_norm": 0.275390625, + "learning_rate": 0.001978732205454712, + "loss": 0.1162, + "step": 7588 + }, + { + "epoch": 0.06587616426940739, + "grad_norm": 0.2236328125, + "learning_rate": 0.001978725773022451, + "loss": 0.126, + "step": 7589 + }, + { + "epoch": 0.06588484474961155, + "grad_norm": 0.11181640625, + "learning_rate": 0.0019787193396292287, + "loss": 0.207, + "step": 7590 + }, + { + "epoch": 0.06589352522981572, + "grad_norm": 0.11474609375, + "learning_rate": 0.0019787129052750525, + "loss": 0.1445, + "step": 7591 + }, + { + "epoch": 0.06590220571001988, + "grad_norm": 0.40625, + "learning_rate": 0.0019787064699599293, + "loss": 0.1465, + "step": 7592 + }, + { + "epoch": 0.06591088619022405, + "grad_norm": 0.1318359375, + "learning_rate": 0.001978700033683865, + "loss": 0.1357, + "step": 7593 + }, + { + "epoch": 0.06591956667042821, + "grad_norm": 0.08935546875, + "learning_rate": 0.0019786935964468686, + "loss": 0.1211, + "step": 7594 + }, + { + "epoch": 0.06592824715063238, + "grad_norm": 0.126953125, + "learning_rate": 0.0019786871582489454, + "loss": 0.1318, + "step": 7595 + }, + { + "epoch": 0.06593692763083654, + "grad_norm": 0.2080078125, + "learning_rate": 0.0019786807190901035, + "loss": 
0.1602, + "step": 7596 + }, + { + "epoch": 0.06594560811104071, + "grad_norm": 1.328125, + "learning_rate": 0.001978674278970349, + "loss": 0.1523, + "step": 7597 + }, + { + "epoch": 0.06595428859124487, + "grad_norm": 0.31640625, + "learning_rate": 0.0019786678378896903, + "loss": 0.1064, + "step": 7598 + }, + { + "epoch": 0.06596296907144904, + "grad_norm": 0.08349609375, + "learning_rate": 0.0019786613958481334, + "loss": 0.1328, + "step": 7599 + }, + { + "epoch": 0.0659716495516532, + "grad_norm": 0.11376953125, + "learning_rate": 0.001978654952845685, + "loss": 0.1514, + "step": 7600 + }, + { + "epoch": 0.06598033003185737, + "grad_norm": 0.103515625, + "learning_rate": 0.0019786485088823537, + "loss": 0.1069, + "step": 7601 + }, + { + "epoch": 0.06598901051206153, + "grad_norm": 0.10791015625, + "learning_rate": 0.001978642063958145, + "loss": 0.1113, + "step": 7602 + }, + { + "epoch": 0.0659976909922657, + "grad_norm": 0.4375, + "learning_rate": 0.0019786356180730665, + "loss": 0.0977, + "step": 7603 + }, + { + "epoch": 0.06600637147246986, + "grad_norm": 0.56640625, + "learning_rate": 0.0019786291712271253, + "loss": 0.127, + "step": 7604 + }, + { + "epoch": 0.06601505195267403, + "grad_norm": 1.8046875, + "learning_rate": 0.0019786227234203286, + "loss": 0.1465, + "step": 7605 + }, + { + "epoch": 0.0660237324328782, + "grad_norm": 0.2734375, + "learning_rate": 0.0019786162746526833, + "loss": 0.1348, + "step": 7606 + }, + { + "epoch": 0.06603241291308236, + "grad_norm": 0.08447265625, + "learning_rate": 0.0019786098249241963, + "loss": 0.1396, + "step": 7607 + }, + { + "epoch": 0.06604109339328652, + "grad_norm": 0.5234375, + "learning_rate": 0.001978603374234875, + "loss": 0.1006, + "step": 7608 + }, + { + "epoch": 0.06604977387349067, + "grad_norm": 0.28515625, + "learning_rate": 0.001978596922584726, + "loss": 0.1631, + "step": 7609 + }, + { + "epoch": 0.06605845435369484, + "grad_norm": 0.2197265625, + "learning_rate": 0.001978590469973757, + "loss": 0.1191, + "step": 7610 + }, + { + "epoch": 0.066067134833899, + "grad_norm": 0.41015625, + "learning_rate": 0.001978584016401974, + "loss": 0.166, + "step": 7611 + }, + { + "epoch": 0.06607581531410317, + "grad_norm": 1.34375, + "learning_rate": 0.0019785775618693852, + "loss": 0.1523, + "step": 7612 + }, + { + "epoch": 0.06608449579430734, + "grad_norm": 0.1904296875, + "learning_rate": 0.0019785711063759976, + "loss": 0.1543, + "step": 7613 + }, + { + "epoch": 0.0660931762745115, + "grad_norm": 0.0966796875, + "learning_rate": 0.0019785646499218172, + "loss": 0.123, + "step": 7614 + }, + { + "epoch": 0.06610185675471567, + "grad_norm": 0.5078125, + "learning_rate": 0.0019785581925068517, + "loss": 0.1523, + "step": 7615 + }, + { + "epoch": 0.06611053723491983, + "grad_norm": 0.23046875, + "learning_rate": 0.0019785517341311086, + "loss": 0.1143, + "step": 7616 + }, + { + "epoch": 0.066119217715124, + "grad_norm": 0.0830078125, + "learning_rate": 0.001978545274794594, + "loss": 0.1328, + "step": 7617 + }, + { + "epoch": 0.06612789819532816, + "grad_norm": 0.58984375, + "learning_rate": 0.0019785388144973157, + "loss": 0.1211, + "step": 7618 + }, + { + "epoch": 0.06613657867553233, + "grad_norm": 0.380859375, + "learning_rate": 0.0019785323532392806, + "loss": 0.1128, + "step": 7619 + }, + { + "epoch": 0.06614525915573649, + "grad_norm": 0.2578125, + "learning_rate": 0.0019785258910204957, + "loss": 0.105, + "step": 7620 + }, + { + "epoch": 0.06615393963594066, + "grad_norm": 0.419921875, + "learning_rate": 0.001978519427840968, + 
"loss": 0.125, + "step": 7621 + }, + { + "epoch": 0.06616262011614482, + "grad_norm": 0.43359375, + "learning_rate": 0.0019785129637007047, + "loss": 0.1309, + "step": 7622 + }, + { + "epoch": 0.06617130059634899, + "grad_norm": 0.275390625, + "learning_rate": 0.001978506498599713, + "loss": 0.1514, + "step": 7623 + }, + { + "epoch": 0.06617998107655315, + "grad_norm": 0.255859375, + "learning_rate": 0.001978500032538, + "loss": 0.0938, + "step": 7624 + }, + { + "epoch": 0.06618866155675732, + "grad_norm": 0.55859375, + "learning_rate": 0.001978493565515572, + "loss": 0.1426, + "step": 7625 + }, + { + "epoch": 0.06619734203696148, + "grad_norm": 0.2470703125, + "learning_rate": 0.0019784870975324367, + "loss": 0.1172, + "step": 7626 + }, + { + "epoch": 0.06620602251716565, + "grad_norm": 0.2734375, + "learning_rate": 0.0019784806285886012, + "loss": 0.125, + "step": 7627 + }, + { + "epoch": 0.06621470299736981, + "grad_norm": 0.2333984375, + "learning_rate": 0.001978474158684073, + "loss": 0.1436, + "step": 7628 + }, + { + "epoch": 0.06622338347757398, + "grad_norm": 0.3828125, + "learning_rate": 0.0019784676878188577, + "loss": 0.1123, + "step": 7629 + }, + { + "epoch": 0.06623206395777814, + "grad_norm": 0.60546875, + "learning_rate": 0.001978461215992964, + "loss": 0.1445, + "step": 7630 + }, + { + "epoch": 0.06624074443798231, + "grad_norm": 0.2216796875, + "learning_rate": 0.001978454743206398, + "loss": 0.1406, + "step": 7631 + }, + { + "epoch": 0.06624942491818647, + "grad_norm": 0.345703125, + "learning_rate": 0.0019784482694591675, + "loss": 0.1523, + "step": 7632 + }, + { + "epoch": 0.06625810539839064, + "grad_norm": 0.388671875, + "learning_rate": 0.0019784417947512787, + "loss": 0.0986, + "step": 7633 + }, + { + "epoch": 0.0662667858785948, + "grad_norm": 0.27734375, + "learning_rate": 0.0019784353190827396, + "loss": 0.1895, + "step": 7634 + }, + { + "epoch": 0.06627546635879897, + "grad_norm": 0.369140625, + "learning_rate": 0.001978428842453557, + "loss": 0.1289, + "step": 7635 + }, + { + "epoch": 0.06628414683900313, + "grad_norm": 0.1171875, + "learning_rate": 0.001978422364863737, + "loss": 0.1445, + "step": 7636 + }, + { + "epoch": 0.0662928273192073, + "grad_norm": 0.078125, + "learning_rate": 0.001978415886313288, + "loss": 0.1099, + "step": 7637 + }, + { + "epoch": 0.06630150779941146, + "grad_norm": 0.255859375, + "learning_rate": 0.0019784094068022165, + "loss": 0.1006, + "step": 7638 + }, + { + "epoch": 0.06631018827961563, + "grad_norm": 0.07275390625, + "learning_rate": 0.0019784029263305294, + "loss": 0.1221, + "step": 7639 + }, + { + "epoch": 0.0663188687598198, + "grad_norm": 0.09375, + "learning_rate": 0.0019783964448982345, + "loss": 0.1143, + "step": 7640 + }, + { + "epoch": 0.06632754924002396, + "grad_norm": 0.33984375, + "learning_rate": 0.001978389962505338, + "loss": 0.1631, + "step": 7641 + }, + { + "epoch": 0.06633622972022812, + "grad_norm": 0.08349609375, + "learning_rate": 0.001978383479151848, + "loss": 0.1318, + "step": 7642 + }, + { + "epoch": 0.06634491020043229, + "grad_norm": 0.32421875, + "learning_rate": 0.0019783769948377708, + "loss": 0.1299, + "step": 7643 + }, + { + "epoch": 0.06635359068063645, + "grad_norm": 0.4921875, + "learning_rate": 0.0019783705095631136, + "loss": 0.1621, + "step": 7644 + }, + { + "epoch": 0.06636227116084062, + "grad_norm": 0.345703125, + "learning_rate": 0.0019783640233278837, + "loss": 0.1416, + "step": 7645 + }, + { + "epoch": 0.06637095164104478, + "grad_norm": 0.1591796875, + "learning_rate": 
0.001978357536132088, + "loss": 0.0996, + "step": 7646 + }, + { + "epoch": 0.06637963212124895, + "grad_norm": 1.1484375, + "learning_rate": 0.001978351047975734, + "loss": 0.1836, + "step": 7647 + }, + { + "epoch": 0.06638831260145311, + "grad_norm": 0.10791015625, + "learning_rate": 0.001978344558858828, + "loss": 0.1084, + "step": 7648 + }, + { + "epoch": 0.06639699308165728, + "grad_norm": 0.11767578125, + "learning_rate": 0.001978338068781378, + "loss": 0.1445, + "step": 7649 + }, + { + "epoch": 0.06640567356186144, + "grad_norm": 0.72265625, + "learning_rate": 0.0019783315777433903, + "loss": 0.0952, + "step": 7650 + }, + { + "epoch": 0.06641435404206561, + "grad_norm": 0.1884765625, + "learning_rate": 0.0019783250857448725, + "loss": 0.125, + "step": 7651 + }, + { + "epoch": 0.06642303452226977, + "grad_norm": 0.42578125, + "learning_rate": 0.001978318592785832, + "loss": 0.104, + "step": 7652 + }, + { + "epoch": 0.06643171500247394, + "grad_norm": 0.412109375, + "learning_rate": 0.001978312098866275, + "loss": 0.2061, + "step": 7653 + }, + { + "epoch": 0.0664403954826781, + "grad_norm": 0.28515625, + "learning_rate": 0.0019783056039862092, + "loss": 0.1514, + "step": 7654 + }, + { + "epoch": 0.06644907596288227, + "grad_norm": 0.14453125, + "learning_rate": 0.001978299108145642, + "loss": 0.1445, + "step": 7655 + }, + { + "epoch": 0.06645775644308644, + "grad_norm": 0.34375, + "learning_rate": 0.0019782926113445795, + "loss": 0.124, + "step": 7656 + }, + { + "epoch": 0.0664664369232906, + "grad_norm": 0.1318359375, + "learning_rate": 0.00197828611358303, + "loss": 0.1543, + "step": 7657 + }, + { + "epoch": 0.06647511740349477, + "grad_norm": 0.12255859375, + "learning_rate": 0.0019782796148609992, + "loss": 0.1592, + "step": 7658 + }, + { + "epoch": 0.06648379788369893, + "grad_norm": 0.83984375, + "learning_rate": 0.0019782731151784957, + "loss": 0.1309, + "step": 7659 + }, + { + "epoch": 0.0664924783639031, + "grad_norm": 0.251953125, + "learning_rate": 0.0019782666145355256, + "loss": 0.1592, + "step": 7660 + }, + { + "epoch": 0.06650115884410726, + "grad_norm": 0.1298828125, + "learning_rate": 0.0019782601129320963, + "loss": 0.1729, + "step": 7661 + }, + { + "epoch": 0.06650983932431143, + "grad_norm": 0.66015625, + "learning_rate": 0.001978253610368215, + "loss": 0.1475, + "step": 7662 + }, + { + "epoch": 0.06651851980451559, + "grad_norm": 0.6328125, + "learning_rate": 0.0019782471068438886, + "loss": 0.123, + "step": 7663 + }, + { + "epoch": 0.06652720028471976, + "grad_norm": 0.283203125, + "learning_rate": 0.001978240602359125, + "loss": 0.1025, + "step": 7664 + }, + { + "epoch": 0.06653588076492392, + "grad_norm": 0.93359375, + "learning_rate": 0.0019782340969139297, + "loss": 0.1914, + "step": 7665 + }, + { + "epoch": 0.06654456124512809, + "grad_norm": 0.46875, + "learning_rate": 0.0019782275905083115, + "loss": 0.1338, + "step": 7666 + }, + { + "epoch": 0.06655324172533225, + "grad_norm": 0.212890625, + "learning_rate": 0.0019782210831422765, + "loss": 0.127, + "step": 7667 + }, + { + "epoch": 0.06656192220553642, + "grad_norm": 1.09375, + "learning_rate": 0.0019782145748158324, + "loss": 0.209, + "step": 7668 + }, + { + "epoch": 0.06657060268574058, + "grad_norm": 1.1328125, + "learning_rate": 0.001978208065528986, + "loss": 0.1494, + "step": 7669 + }, + { + "epoch": 0.06657928316594473, + "grad_norm": 0.8828125, + "learning_rate": 0.001978201555281744, + "loss": 0.1758, + "step": 7670 + }, + { + "epoch": 0.0665879636461489, + "grad_norm": 1.4140625, + 
"learning_rate": 0.0019781950440741145, + "loss": 0.1494, + "step": 7671 + }, + { + "epoch": 0.06659664412635306, + "grad_norm": 0.50390625, + "learning_rate": 0.001978188531906104, + "loss": 0.1416, + "step": 7672 + }, + { + "epoch": 0.06660532460655723, + "grad_norm": 2.859375, + "learning_rate": 0.001978182018777719, + "loss": 0.1719, + "step": 7673 + }, + { + "epoch": 0.0666140050867614, + "grad_norm": 0.06640625, + "learning_rate": 0.001978175504688968, + "loss": 0.127, + "step": 7674 + }, + { + "epoch": 0.06662268556696556, + "grad_norm": 0.58984375, + "learning_rate": 0.0019781689896398577, + "loss": 0.1982, + "step": 7675 + }, + { + "epoch": 0.06663136604716972, + "grad_norm": 0.20703125, + "learning_rate": 0.0019781624736303943, + "loss": 0.1123, + "step": 7676 + }, + { + "epoch": 0.06664004652737389, + "grad_norm": 0.87890625, + "learning_rate": 0.0019781559566605864, + "loss": 0.1328, + "step": 7677 + }, + { + "epoch": 0.06664872700757805, + "grad_norm": 0.48046875, + "learning_rate": 0.00197814943873044, + "loss": 0.1562, + "step": 7678 + }, + { + "epoch": 0.06665740748778222, + "grad_norm": 0.1884765625, + "learning_rate": 0.0019781429198399625, + "loss": 0.1387, + "step": 7679 + }, + { + "epoch": 0.06666608796798638, + "grad_norm": 0.76171875, + "learning_rate": 0.001978136399989161, + "loss": 0.1504, + "step": 7680 + }, + { + "epoch": 0.06667476844819055, + "grad_norm": 0.171875, + "learning_rate": 0.0019781298791780428, + "loss": 0.1318, + "step": 7681 + }, + { + "epoch": 0.06668344892839471, + "grad_norm": 0.09375, + "learning_rate": 0.0019781233574066148, + "loss": 0.168, + "step": 7682 + }, + { + "epoch": 0.06669212940859888, + "grad_norm": 0.10595703125, + "learning_rate": 0.0019781168346748846, + "loss": 0.0918, + "step": 7683 + }, + { + "epoch": 0.06670080988880304, + "grad_norm": 0.10693359375, + "learning_rate": 0.0019781103109828588, + "loss": 0.126, + "step": 7684 + }, + { + "epoch": 0.06670949036900721, + "grad_norm": 0.251953125, + "learning_rate": 0.0019781037863305447, + "loss": 0.1074, + "step": 7685 + }, + { + "epoch": 0.06671817084921138, + "grad_norm": 0.09423828125, + "learning_rate": 0.00197809726071795, + "loss": 0.1143, + "step": 7686 + }, + { + "epoch": 0.06672685132941554, + "grad_norm": 0.2578125, + "learning_rate": 0.0019780907341450807, + "loss": 0.1045, + "step": 7687 + }, + { + "epoch": 0.0667355318096197, + "grad_norm": 0.3125, + "learning_rate": 0.0019780842066119447, + "loss": 0.1123, + "step": 7688 + }, + { + "epoch": 0.06674421228982387, + "grad_norm": 0.8671875, + "learning_rate": 0.0019780776781185496, + "loss": 0.1924, + "step": 7689 + }, + { + "epoch": 0.06675289277002804, + "grad_norm": 0.71875, + "learning_rate": 0.0019780711486649012, + "loss": 0.252, + "step": 7690 + }, + { + "epoch": 0.0667615732502322, + "grad_norm": 0.208984375, + "learning_rate": 0.001978064618251008, + "loss": 0.1426, + "step": 7691 + }, + { + "epoch": 0.06677025373043637, + "grad_norm": 0.236328125, + "learning_rate": 0.001978058086876876, + "loss": 0.1396, + "step": 7692 + }, + { + "epoch": 0.06677893421064053, + "grad_norm": 0.1142578125, + "learning_rate": 0.0019780515545425132, + "loss": 0.125, + "step": 7693 + }, + { + "epoch": 0.0667876146908447, + "grad_norm": 0.0986328125, + "learning_rate": 0.001978045021247926, + "loss": 0.1621, + "step": 7694 + }, + { + "epoch": 0.06679629517104886, + "grad_norm": 0.07568359375, + "learning_rate": 0.001978038486993122, + "loss": 0.1504, + "step": 7695 + }, + { + "epoch": 0.06680497565125303, + "grad_norm": 
0.6484375, + "learning_rate": 0.0019780319517781085, + "loss": 0.1377, + "step": 7696 + }, + { + "epoch": 0.06681365613145719, + "grad_norm": 0.28515625, + "learning_rate": 0.001978025415602893, + "loss": 0.1465, + "step": 7697 + }, + { + "epoch": 0.06682233661166136, + "grad_norm": 0.08984375, + "learning_rate": 0.0019780188784674817, + "loss": 0.1621, + "step": 7698 + }, + { + "epoch": 0.06683101709186552, + "grad_norm": 0.40625, + "learning_rate": 0.0019780123403718816, + "loss": 0.1387, + "step": 7699 + }, + { + "epoch": 0.06683969757206969, + "grad_norm": 0.69921875, + "learning_rate": 0.001978005801316101, + "loss": 0.1484, + "step": 7700 + }, + { + "epoch": 0.06684837805227385, + "grad_norm": 0.6328125, + "learning_rate": 0.0019779992613001463, + "loss": 0.1279, + "step": 7701 + }, + { + "epoch": 0.06685705853247802, + "grad_norm": 2.65625, + "learning_rate": 0.001977992720324025, + "loss": 0.3203, + "step": 7702 + }, + { + "epoch": 0.06686573901268218, + "grad_norm": 0.35546875, + "learning_rate": 0.0019779861783877435, + "loss": 0.1611, + "step": 7703 + }, + { + "epoch": 0.06687441949288635, + "grad_norm": 0.11083984375, + "learning_rate": 0.00197797963549131, + "loss": 0.0977, + "step": 7704 + }, + { + "epoch": 0.06688309997309051, + "grad_norm": 0.66015625, + "learning_rate": 0.001977973091634731, + "loss": 0.125, + "step": 7705 + }, + { + "epoch": 0.06689178045329468, + "grad_norm": 0.52734375, + "learning_rate": 0.001977966546818014, + "loss": 0.1377, + "step": 7706 + }, + { + "epoch": 0.06690046093349884, + "grad_norm": 0.8671875, + "learning_rate": 0.0019779600010411654, + "loss": 0.2148, + "step": 7707 + }, + { + "epoch": 0.06690914141370301, + "grad_norm": 0.12890625, + "learning_rate": 0.0019779534543041934, + "loss": 0.1699, + "step": 7708 + }, + { + "epoch": 0.06691782189390717, + "grad_norm": 1.28125, + "learning_rate": 0.001977946906607105, + "loss": 0.1299, + "step": 7709 + }, + { + "epoch": 0.06692650237411134, + "grad_norm": 1.140625, + "learning_rate": 0.0019779403579499063, + "loss": 0.168, + "step": 7710 + }, + { + "epoch": 0.0669351828543155, + "grad_norm": 0.267578125, + "learning_rate": 0.0019779338083326056, + "loss": 0.1816, + "step": 7711 + }, + { + "epoch": 0.06694386333451967, + "grad_norm": 0.87890625, + "learning_rate": 0.0019779272577552095, + "loss": 0.1289, + "step": 7712 + }, + { + "epoch": 0.06695254381472383, + "grad_norm": 0.1826171875, + "learning_rate": 0.001977920706217726, + "loss": 0.1279, + "step": 7713 + }, + { + "epoch": 0.066961224294928, + "grad_norm": 0.439453125, + "learning_rate": 0.0019779141537201606, + "loss": 0.1709, + "step": 7714 + }, + { + "epoch": 0.06696990477513216, + "grad_norm": 0.58984375, + "learning_rate": 0.001977907600262522, + "loss": 0.1982, + "step": 7715 + }, + { + "epoch": 0.06697858525533633, + "grad_norm": 0.33984375, + "learning_rate": 0.001977901045844817, + "loss": 0.1445, + "step": 7716 + }, + { + "epoch": 0.0669872657355405, + "grad_norm": 0.09033203125, + "learning_rate": 0.0019778944904670524, + "loss": 0.1143, + "step": 7717 + }, + { + "epoch": 0.06699594621574466, + "grad_norm": 0.07666015625, + "learning_rate": 0.0019778879341292356, + "loss": 0.1235, + "step": 7718 + }, + { + "epoch": 0.06700462669594882, + "grad_norm": 0.201171875, + "learning_rate": 0.0019778813768313737, + "loss": 0.1475, + "step": 7719 + }, + { + "epoch": 0.06701330717615299, + "grad_norm": 0.439453125, + "learning_rate": 0.001977874818573474, + "loss": 0.1328, + "step": 7720 + }, + { + "epoch": 0.06702198765635715, + 
"grad_norm": 0.111328125, + "learning_rate": 0.001977868259355544, + "loss": 0.1104, + "step": 7721 + }, + { + "epoch": 0.06703066813656132, + "grad_norm": 0.486328125, + "learning_rate": 0.0019778616991775896, + "loss": 0.1162, + "step": 7722 + }, + { + "epoch": 0.06703934861676548, + "grad_norm": 0.2890625, + "learning_rate": 0.0019778551380396194, + "loss": 0.1182, + "step": 7723 + }, + { + "epoch": 0.06704802909696965, + "grad_norm": 0.2490234375, + "learning_rate": 0.0019778485759416402, + "loss": 0.2051, + "step": 7724 + }, + { + "epoch": 0.06705670957717381, + "grad_norm": 0.099609375, + "learning_rate": 0.001977842012883658, + "loss": 0.1299, + "step": 7725 + }, + { + "epoch": 0.06706539005737798, + "grad_norm": 1.015625, + "learning_rate": 0.001977835448865682, + "loss": 0.2285, + "step": 7726 + }, + { + "epoch": 0.06707407053758214, + "grad_norm": 0.197265625, + "learning_rate": 0.001977828883887718, + "loss": 0.126, + "step": 7727 + }, + { + "epoch": 0.06708275101778631, + "grad_norm": 0.24609375, + "learning_rate": 0.0019778223179497735, + "loss": 0.1455, + "step": 7728 + }, + { + "epoch": 0.06709143149799048, + "grad_norm": 0.2392578125, + "learning_rate": 0.0019778157510518556, + "loss": 0.125, + "step": 7729 + }, + { + "epoch": 0.06710011197819464, + "grad_norm": 0.142578125, + "learning_rate": 0.001977809183193972, + "loss": 0.1235, + "step": 7730 + }, + { + "epoch": 0.0671087924583988, + "grad_norm": 0.2470703125, + "learning_rate": 0.001977802614376129, + "loss": 0.1699, + "step": 7731 + }, + { + "epoch": 0.06711747293860296, + "grad_norm": 0.35546875, + "learning_rate": 0.001977796044598334, + "loss": 0.1348, + "step": 7732 + }, + { + "epoch": 0.06712615341880712, + "grad_norm": 0.21875, + "learning_rate": 0.001977789473860595, + "loss": 0.1025, + "step": 7733 + }, + { + "epoch": 0.06713483389901129, + "grad_norm": 0.3203125, + "learning_rate": 0.001977782902162918, + "loss": 0.1387, + "step": 7734 + }, + { + "epoch": 0.06714351437921545, + "grad_norm": 0.40625, + "learning_rate": 0.0019777763295053116, + "loss": 0.1001, + "step": 7735 + }, + { + "epoch": 0.06715219485941962, + "grad_norm": 0.205078125, + "learning_rate": 0.001977769755887782, + "loss": 0.1104, + "step": 7736 + }, + { + "epoch": 0.06716087533962378, + "grad_norm": 0.42578125, + "learning_rate": 0.001977763181310336, + "loss": 0.1582, + "step": 7737 + }, + { + "epoch": 0.06716955581982795, + "grad_norm": 0.21484375, + "learning_rate": 0.001977756605772982, + "loss": 0.103, + "step": 7738 + }, + { + "epoch": 0.06717823630003211, + "grad_norm": 0.134765625, + "learning_rate": 0.0019777500292757263, + "loss": 0.1289, + "step": 7739 + }, + { + "epoch": 0.06718691678023628, + "grad_norm": 0.47265625, + "learning_rate": 0.0019777434518185766, + "loss": 0.1504, + "step": 7740 + }, + { + "epoch": 0.06719559726044044, + "grad_norm": 0.0791015625, + "learning_rate": 0.0019777368734015394, + "loss": 0.1387, + "step": 7741 + }, + { + "epoch": 0.06720427774064461, + "grad_norm": 0.12158203125, + "learning_rate": 0.001977730294024623, + "loss": 0.1582, + "step": 7742 + }, + { + "epoch": 0.06721295822084877, + "grad_norm": 0.1259765625, + "learning_rate": 0.0019777237136878335, + "loss": 0.1504, + "step": 7743 + }, + { + "epoch": 0.06722163870105294, + "grad_norm": 0.0693359375, + "learning_rate": 0.0019777171323911782, + "loss": 0.1069, + "step": 7744 + }, + { + "epoch": 0.0672303191812571, + "grad_norm": 0.431640625, + "learning_rate": 0.0019777105501346654, + "loss": 0.1191, + "step": 7745 + }, + { + "epoch": 
0.06723899966146127, + "grad_norm": 0.412109375, + "learning_rate": 0.001977703966918301, + "loss": 0.1108, + "step": 7746 + }, + { + "epoch": 0.06724768014166543, + "grad_norm": 0.123046875, + "learning_rate": 0.001977697382742093, + "loss": 0.1738, + "step": 7747 + }, + { + "epoch": 0.0672563606218696, + "grad_norm": 0.486328125, + "learning_rate": 0.001977690797606048, + "loss": 0.1514, + "step": 7748 + }, + { + "epoch": 0.06726504110207376, + "grad_norm": 0.08203125, + "learning_rate": 0.001977684211510174, + "loss": 0.1504, + "step": 7749 + }, + { + "epoch": 0.06727372158227793, + "grad_norm": 0.23046875, + "learning_rate": 0.0019776776244544774, + "loss": 0.168, + "step": 7750 + }, + { + "epoch": 0.0672824020624821, + "grad_norm": 0.435546875, + "learning_rate": 0.001977671036438966, + "loss": 0.1445, + "step": 7751 + }, + { + "epoch": 0.06729108254268626, + "grad_norm": 0.56640625, + "learning_rate": 0.001977664447463646, + "loss": 0.1177, + "step": 7752 + }, + { + "epoch": 0.06729976302289042, + "grad_norm": 0.3671875, + "learning_rate": 0.0019776578575285263, + "loss": 0.1729, + "step": 7753 + }, + { + "epoch": 0.06730844350309459, + "grad_norm": 0.4765625, + "learning_rate": 0.0019776512666336127, + "loss": 0.1426, + "step": 7754 + }, + { + "epoch": 0.06731712398329875, + "grad_norm": 0.0966796875, + "learning_rate": 0.001977644674778913, + "loss": 0.1895, + "step": 7755 + }, + { + "epoch": 0.06732580446350292, + "grad_norm": 0.322265625, + "learning_rate": 0.001977638081964434, + "loss": 0.1641, + "step": 7756 + }, + { + "epoch": 0.06733448494370708, + "grad_norm": 0.10693359375, + "learning_rate": 0.0019776314881901833, + "loss": 0.1553, + "step": 7757 + }, + { + "epoch": 0.06734316542391125, + "grad_norm": 0.087890625, + "learning_rate": 0.0019776248934561683, + "loss": 0.1396, + "step": 7758 + }, + { + "epoch": 0.06735184590411541, + "grad_norm": 0.1728515625, + "learning_rate": 0.0019776182977623955, + "loss": 0.1758, + "step": 7759 + }, + { + "epoch": 0.06736052638431958, + "grad_norm": 0.73046875, + "learning_rate": 0.0019776117011088726, + "loss": 0.1602, + "step": 7760 + }, + { + "epoch": 0.06736920686452375, + "grad_norm": 0.52734375, + "learning_rate": 0.001977605103495607, + "loss": 0.0996, + "step": 7761 + }, + { + "epoch": 0.06737788734472791, + "grad_norm": 0.294921875, + "learning_rate": 0.001977598504922605, + "loss": 0.1123, + "step": 7762 + }, + { + "epoch": 0.06738656782493208, + "grad_norm": 0.193359375, + "learning_rate": 0.0019775919053898754, + "loss": 0.1162, + "step": 7763 + }, + { + "epoch": 0.06739524830513624, + "grad_norm": 0.29296875, + "learning_rate": 0.0019775853048974237, + "loss": 0.1123, + "step": 7764 + }, + { + "epoch": 0.0674039287853404, + "grad_norm": 0.380859375, + "learning_rate": 0.0019775787034452584, + "loss": 0.127, + "step": 7765 + }, + { + "epoch": 0.06741260926554457, + "grad_norm": 0.07666015625, + "learning_rate": 0.001977572101033386, + "loss": 0.1348, + "step": 7766 + }, + { + "epoch": 0.06742128974574874, + "grad_norm": 0.36328125, + "learning_rate": 0.001977565497661814, + "loss": 0.1035, + "step": 7767 + }, + { + "epoch": 0.0674299702259529, + "grad_norm": 0.0751953125, + "learning_rate": 0.001977558893330549, + "loss": 0.1201, + "step": 7768 + }, + { + "epoch": 0.06743865070615707, + "grad_norm": 0.490234375, + "learning_rate": 0.0019775522880395996, + "loss": 0.1104, + "step": 7769 + }, + { + "epoch": 0.06744733118636123, + "grad_norm": 0.345703125, + "learning_rate": 0.001977545681788972, + "loss": 0.1543, + "step": 
7770 + }, + { + "epoch": 0.0674560116665654, + "grad_norm": 0.1728515625, + "learning_rate": 0.0019775390745786735, + "loss": 0.1777, + "step": 7771 + }, + { + "epoch": 0.06746469214676956, + "grad_norm": 0.0927734375, + "learning_rate": 0.0019775324664087113, + "loss": 0.1201, + "step": 7772 + }, + { + "epoch": 0.06747337262697373, + "grad_norm": 0.474609375, + "learning_rate": 0.001977525857279093, + "loss": 0.1187, + "step": 7773 + }, + { + "epoch": 0.06748205310717789, + "grad_norm": 0.11376953125, + "learning_rate": 0.0019775192471898255, + "loss": 0.1738, + "step": 7774 + }, + { + "epoch": 0.06749073358738206, + "grad_norm": 0.09130859375, + "learning_rate": 0.0019775126361409163, + "loss": 0.1113, + "step": 7775 + }, + { + "epoch": 0.06749941406758622, + "grad_norm": 0.11572265625, + "learning_rate": 0.0019775060241323723, + "loss": 0.1729, + "step": 7776 + }, + { + "epoch": 0.06750809454779039, + "grad_norm": 0.08447265625, + "learning_rate": 0.001977499411164201, + "loss": 0.1475, + "step": 7777 + }, + { + "epoch": 0.06751677502799455, + "grad_norm": 0.6484375, + "learning_rate": 0.0019774927972364092, + "loss": 0.1396, + "step": 7778 + }, + { + "epoch": 0.06752545550819872, + "grad_norm": 0.15625, + "learning_rate": 0.0019774861823490045, + "loss": 0.123, + "step": 7779 + }, + { + "epoch": 0.06753413598840288, + "grad_norm": 0.1064453125, + "learning_rate": 0.0019774795665019945, + "loss": 0.1367, + "step": 7780 + }, + { + "epoch": 0.06754281646860705, + "grad_norm": 0.232421875, + "learning_rate": 0.001977472949695386, + "loss": 0.1387, + "step": 7781 + }, + { + "epoch": 0.06755149694881121, + "grad_norm": 0.25390625, + "learning_rate": 0.001977466331929186, + "loss": 0.1387, + "step": 7782 + }, + { + "epoch": 0.06756017742901538, + "grad_norm": 0.2490234375, + "learning_rate": 0.001977459713203402, + "loss": 0.1738, + "step": 7783 + }, + { + "epoch": 0.06756885790921954, + "grad_norm": 0.75, + "learning_rate": 0.0019774530935180415, + "loss": 0.1885, + "step": 7784 + }, + { + "epoch": 0.06757753838942371, + "grad_norm": 0.60546875, + "learning_rate": 0.0019774464728731112, + "loss": 0.1455, + "step": 7785 + }, + { + "epoch": 0.06758621886962787, + "grad_norm": 0.33984375, + "learning_rate": 0.001977439851268619, + "loss": 0.1406, + "step": 7786 + }, + { + "epoch": 0.06759489934983204, + "grad_norm": 0.353515625, + "learning_rate": 0.0019774332287045716, + "loss": 0.1387, + "step": 7787 + }, + { + "epoch": 0.0676035798300362, + "grad_norm": 0.13671875, + "learning_rate": 0.001977426605180976, + "loss": 0.1152, + "step": 7788 + }, + { + "epoch": 0.06761226031024037, + "grad_norm": 0.287109375, + "learning_rate": 0.0019774199806978404, + "loss": 0.1045, + "step": 7789 + }, + { + "epoch": 0.06762094079044453, + "grad_norm": 0.2333984375, + "learning_rate": 0.001977413355255171, + "loss": 0.1299, + "step": 7790 + }, + { + "epoch": 0.0676296212706487, + "grad_norm": 0.10302734375, + "learning_rate": 0.001977406728852976, + "loss": 0.1348, + "step": 7791 + }, + { + "epoch": 0.06763830175085286, + "grad_norm": 0.34765625, + "learning_rate": 0.0019774001014912623, + "loss": 0.1621, + "step": 7792 + }, + { + "epoch": 0.06764698223105703, + "grad_norm": 0.98046875, + "learning_rate": 0.0019773934731700366, + "loss": 0.125, + "step": 7793 + }, + { + "epoch": 0.06765566271126118, + "grad_norm": 0.248046875, + "learning_rate": 0.0019773868438893068, + "loss": 0.1475, + "step": 7794 + }, + { + "epoch": 0.06766434319146535, + "grad_norm": 0.06396484375, + "learning_rate": 
0.0019773802136490797, + "loss": 0.1021, + "step": 7795 + }, + { + "epoch": 0.06767302367166951, + "grad_norm": 0.330078125, + "learning_rate": 0.0019773735824493628, + "loss": 0.1211, + "step": 7796 + }, + { + "epoch": 0.06768170415187368, + "grad_norm": 0.42578125, + "learning_rate": 0.0019773669502901633, + "loss": 0.1406, + "step": 7797 + }, + { + "epoch": 0.06769038463207784, + "grad_norm": 0.6640625, + "learning_rate": 0.0019773603171714888, + "loss": 0.1299, + "step": 7798 + }, + { + "epoch": 0.067699065112282, + "grad_norm": 0.392578125, + "learning_rate": 0.0019773536830933465, + "loss": 0.1895, + "step": 7799 + }, + { + "epoch": 0.06770774559248617, + "grad_norm": 0.25390625, + "learning_rate": 0.0019773470480557425, + "loss": 0.1426, + "step": 7800 + }, + { + "epoch": 0.06771642607269034, + "grad_norm": 0.69921875, + "learning_rate": 0.0019773404120586855, + "loss": 0.1504, + "step": 7801 + }, + { + "epoch": 0.0677251065528945, + "grad_norm": 0.337890625, + "learning_rate": 0.001977333775102182, + "loss": 0.125, + "step": 7802 + }, + { + "epoch": 0.06773378703309867, + "grad_norm": 0.48046875, + "learning_rate": 0.00197732713718624, + "loss": 0.1348, + "step": 7803 + }, + { + "epoch": 0.06774246751330283, + "grad_norm": 0.365234375, + "learning_rate": 0.0019773204983108655, + "loss": 0.1279, + "step": 7804 + }, + { + "epoch": 0.067751147993507, + "grad_norm": 0.11279296875, + "learning_rate": 0.001977313858476067, + "loss": 0.1289, + "step": 7805 + }, + { + "epoch": 0.06775982847371116, + "grad_norm": 0.130859375, + "learning_rate": 0.0019773072176818506, + "loss": 0.124, + "step": 7806 + }, + { + "epoch": 0.06776850895391533, + "grad_norm": 0.875, + "learning_rate": 0.001977300575928225, + "loss": 0.3457, + "step": 7807 + }, + { + "epoch": 0.06777718943411949, + "grad_norm": 0.306640625, + "learning_rate": 0.0019772939332151963, + "loss": 0.1279, + "step": 7808 + }, + { + "epoch": 0.06778586991432366, + "grad_norm": 0.435546875, + "learning_rate": 0.001977287289542772, + "loss": 0.1348, + "step": 7809 + }, + { + "epoch": 0.06779455039452782, + "grad_norm": 0.126953125, + "learning_rate": 0.00197728064491096, + "loss": 0.124, + "step": 7810 + }, + { + "epoch": 0.06780323087473199, + "grad_norm": 0.1630859375, + "learning_rate": 0.0019772739993197666, + "loss": 0.1455, + "step": 7811 + }, + { + "epoch": 0.06781191135493615, + "grad_norm": 0.484375, + "learning_rate": 0.0019772673527691997, + "loss": 0.1426, + "step": 7812 + }, + { + "epoch": 0.06782059183514032, + "grad_norm": 0.83984375, + "learning_rate": 0.0019772607052592666, + "loss": 0.1504, + "step": 7813 + }, + { + "epoch": 0.06782927231534448, + "grad_norm": 0.08984375, + "learning_rate": 0.0019772540567899737, + "loss": 0.1094, + "step": 7814 + }, + { + "epoch": 0.06783795279554865, + "grad_norm": 0.94140625, + "learning_rate": 0.0019772474073613297, + "loss": 0.1816, + "step": 7815 + }, + { + "epoch": 0.06784663327575281, + "grad_norm": 1.328125, + "learning_rate": 0.0019772407569733402, + "loss": 0.1738, + "step": 7816 + }, + { + "epoch": 0.06785531375595698, + "grad_norm": 0.326171875, + "learning_rate": 0.0019772341056260145, + "loss": 0.1348, + "step": 7817 + }, + { + "epoch": 0.06786399423616114, + "grad_norm": 0.71484375, + "learning_rate": 0.001977227453319358, + "loss": 0.5586, + "step": 7818 + }, + { + "epoch": 0.06787267471636531, + "grad_norm": 0.1728515625, + "learning_rate": 0.001977220800053379, + "loss": 0.1504, + "step": 7819 + }, + { + "epoch": 0.06788135519656947, + "grad_norm": 1.1484375, + 
"learning_rate": 0.0019772141458280846, + "loss": 0.1318, + "step": 7820 + }, + { + "epoch": 0.06789003567677364, + "grad_norm": 0.265625, + "learning_rate": 0.001977207490643482, + "loss": 0.1533, + "step": 7821 + }, + { + "epoch": 0.0678987161569778, + "grad_norm": 0.1181640625, + "learning_rate": 0.001977200834499578, + "loss": 0.1084, + "step": 7822 + }, + { + "epoch": 0.06790739663718197, + "grad_norm": 0.4921875, + "learning_rate": 0.001977194177396381, + "loss": 0.1318, + "step": 7823 + }, + { + "epoch": 0.06791607711738613, + "grad_norm": 0.2158203125, + "learning_rate": 0.001977187519333897, + "loss": 0.1406, + "step": 7824 + }, + { + "epoch": 0.0679247575975903, + "grad_norm": 0.2470703125, + "learning_rate": 0.0019771808603121347, + "loss": 0.1416, + "step": 7825 + }, + { + "epoch": 0.06793343807779446, + "grad_norm": 0.142578125, + "learning_rate": 0.0019771742003311, + "loss": 0.1465, + "step": 7826 + }, + { + "epoch": 0.06794211855799863, + "grad_norm": 0.13671875, + "learning_rate": 0.001977167539390801, + "loss": 0.1396, + "step": 7827 + }, + { + "epoch": 0.0679507990382028, + "grad_norm": 0.1669921875, + "learning_rate": 0.0019771608774912445, + "loss": 0.1167, + "step": 7828 + }, + { + "epoch": 0.06795947951840696, + "grad_norm": 0.408203125, + "learning_rate": 0.001977154214632438, + "loss": 0.166, + "step": 7829 + }, + { + "epoch": 0.06796815999861112, + "grad_norm": 0.18359375, + "learning_rate": 0.001977147550814389, + "loss": 0.1094, + "step": 7830 + }, + { + "epoch": 0.06797684047881529, + "grad_norm": 0.443359375, + "learning_rate": 0.001977140886037105, + "loss": 0.1602, + "step": 7831 + }, + { + "epoch": 0.06798552095901945, + "grad_norm": 0.279296875, + "learning_rate": 0.0019771342203005926, + "loss": 0.1289, + "step": 7832 + }, + { + "epoch": 0.06799420143922362, + "grad_norm": 0.306640625, + "learning_rate": 0.0019771275536048594, + "loss": 0.168, + "step": 7833 + }, + { + "epoch": 0.06800288191942779, + "grad_norm": 0.10888671875, + "learning_rate": 0.001977120885949912, + "loss": 0.168, + "step": 7834 + }, + { + "epoch": 0.06801156239963195, + "grad_norm": 0.59375, + "learning_rate": 0.0019771142173357595, + "loss": 0.1445, + "step": 7835 + }, + { + "epoch": 0.06802024287983612, + "grad_norm": 0.5234375, + "learning_rate": 0.0019771075477624076, + "loss": 0.1299, + "step": 7836 + }, + { + "epoch": 0.06802892336004028, + "grad_norm": 0.13671875, + "learning_rate": 0.001977100877229864, + "loss": 0.1299, + "step": 7837 + }, + { + "epoch": 0.06803760384024445, + "grad_norm": 0.08349609375, + "learning_rate": 0.001977094205738136, + "loss": 0.1123, + "step": 7838 + }, + { + "epoch": 0.06804628432044861, + "grad_norm": 0.06494140625, + "learning_rate": 0.0019770875332872314, + "loss": 0.104, + "step": 7839 + }, + { + "epoch": 0.06805496480065278, + "grad_norm": 0.5390625, + "learning_rate": 0.0019770808598771564, + "loss": 0.1592, + "step": 7840 + }, + { + "epoch": 0.06806364528085694, + "grad_norm": 0.86328125, + "learning_rate": 0.0019770741855079195, + "loss": 0.1113, + "step": 7841 + }, + { + "epoch": 0.0680723257610611, + "grad_norm": 0.18359375, + "learning_rate": 0.001977067510179527, + "loss": 0.1445, + "step": 7842 + }, + { + "epoch": 0.06808100624126527, + "grad_norm": 0.09619140625, + "learning_rate": 0.0019770608338919872, + "loss": 0.1475, + "step": 7843 + }, + { + "epoch": 0.06808968672146944, + "grad_norm": 0.158203125, + "learning_rate": 0.0019770541566453063, + "loss": 0.1514, + "step": 7844 + }, + { + "epoch": 0.0680983672016736, + "grad_norm": 
0.6796875, + "learning_rate": 0.0019770474784394925, + "loss": 0.126, + "step": 7845 + }, + { + "epoch": 0.06810704768187777, + "grad_norm": 0.140625, + "learning_rate": 0.0019770407992745526, + "loss": 0.168, + "step": 7846 + }, + { + "epoch": 0.06811572816208193, + "grad_norm": 0.08544921875, + "learning_rate": 0.001977034119150494, + "loss": 0.1187, + "step": 7847 + }, + { + "epoch": 0.0681244086422861, + "grad_norm": 0.5546875, + "learning_rate": 0.0019770274380673243, + "loss": 0.1011, + "step": 7848 + }, + { + "epoch": 0.06813308912249026, + "grad_norm": 0.73046875, + "learning_rate": 0.00197702075602505, + "loss": 0.1602, + "step": 7849 + }, + { + "epoch": 0.06814176960269443, + "grad_norm": 0.45703125, + "learning_rate": 0.0019770140730236796, + "loss": 0.123, + "step": 7850 + }, + { + "epoch": 0.06815045008289859, + "grad_norm": 0.07861328125, + "learning_rate": 0.00197700738906322, + "loss": 0.1206, + "step": 7851 + }, + { + "epoch": 0.06815913056310276, + "grad_norm": 0.203125, + "learning_rate": 0.0019770007041436774, + "loss": 0.1406, + "step": 7852 + }, + { + "epoch": 0.06816781104330692, + "grad_norm": 0.1123046875, + "learning_rate": 0.0019769940182650606, + "loss": 0.1006, + "step": 7853 + }, + { + "epoch": 0.06817649152351109, + "grad_norm": 0.0859375, + "learning_rate": 0.0019769873314273764, + "loss": 0.1201, + "step": 7854 + }, + { + "epoch": 0.06818517200371524, + "grad_norm": 0.384765625, + "learning_rate": 0.001976980643630632, + "loss": 0.3047, + "step": 7855 + }, + { + "epoch": 0.0681938524839194, + "grad_norm": 0.5390625, + "learning_rate": 0.001976973954874834, + "loss": 0.124, + "step": 7856 + }, + { + "epoch": 0.06820253296412357, + "grad_norm": 0.375, + "learning_rate": 0.0019769672651599917, + "loss": 0.1123, + "step": 7857 + }, + { + "epoch": 0.06821121344432773, + "grad_norm": 0.1279296875, + "learning_rate": 0.0019769605744861104, + "loss": 0.1367, + "step": 7858 + }, + { + "epoch": 0.0682198939245319, + "grad_norm": 0.1533203125, + "learning_rate": 0.001976953882853198, + "loss": 0.1035, + "step": 7859 + }, + { + "epoch": 0.06822857440473606, + "grad_norm": 0.66015625, + "learning_rate": 0.001976947190261262, + "loss": 0.1182, + "step": 7860 + }, + { + "epoch": 0.06823725488494023, + "grad_norm": 0.21875, + "learning_rate": 0.0019769404967103104, + "loss": 0.1621, + "step": 7861 + }, + { + "epoch": 0.0682459353651444, + "grad_norm": 0.640625, + "learning_rate": 0.0019769338022003497, + "loss": 0.1885, + "step": 7862 + }, + { + "epoch": 0.06825461584534856, + "grad_norm": 0.640625, + "learning_rate": 0.001976927106731387, + "loss": 0.1592, + "step": 7863 + }, + { + "epoch": 0.06826329632555272, + "grad_norm": 0.1669921875, + "learning_rate": 0.00197692041030343, + "loss": 0.1797, + "step": 7864 + }, + { + "epoch": 0.06827197680575689, + "grad_norm": 0.314453125, + "learning_rate": 0.0019769137129164865, + "loss": 0.1279, + "step": 7865 + }, + { + "epoch": 0.06828065728596105, + "grad_norm": 0.53515625, + "learning_rate": 0.001976907014570563, + "loss": 0.1436, + "step": 7866 + }, + { + "epoch": 0.06828933776616522, + "grad_norm": 0.3125, + "learning_rate": 0.001976900315265667, + "loss": 0.1338, + "step": 7867 + }, + { + "epoch": 0.06829801824636939, + "grad_norm": 0.435546875, + "learning_rate": 0.0019768936150018064, + "loss": 0.1523, + "step": 7868 + }, + { + "epoch": 0.06830669872657355, + "grad_norm": 0.6015625, + "learning_rate": 0.001976886913778988, + "loss": 0.1318, + "step": 7869 + }, + { + "epoch": 0.06831537920677772, + "grad_norm": 
0.478515625, + "learning_rate": 0.001976880211597219, + "loss": 0.1157, + "step": 7870 + }, + { + "epoch": 0.06832405968698188, + "grad_norm": 0.40625, + "learning_rate": 0.001976873508456507, + "loss": 0.1396, + "step": 7871 + }, + { + "epoch": 0.06833274016718605, + "grad_norm": 0.390625, + "learning_rate": 0.001976866804356859, + "loss": 0.1455, + "step": 7872 + }, + { + "epoch": 0.06834142064739021, + "grad_norm": 0.64453125, + "learning_rate": 0.0019768600992982836, + "loss": 0.1035, + "step": 7873 + }, + { + "epoch": 0.06835010112759438, + "grad_norm": 0.2021484375, + "learning_rate": 0.0019768533932807866, + "loss": 0.1689, + "step": 7874 + }, + { + "epoch": 0.06835878160779854, + "grad_norm": 0.2138671875, + "learning_rate": 0.0019768466863043757, + "loss": 0.1318, + "step": 7875 + }, + { + "epoch": 0.0683674620880027, + "grad_norm": 0.0947265625, + "learning_rate": 0.0019768399783690587, + "loss": 0.1357, + "step": 7876 + }, + { + "epoch": 0.06837614256820687, + "grad_norm": 0.197265625, + "learning_rate": 0.001976833269474843, + "loss": 0.0991, + "step": 7877 + }, + { + "epoch": 0.06838482304841104, + "grad_norm": 0.244140625, + "learning_rate": 0.001976826559621735, + "loss": 0.1562, + "step": 7878 + }, + { + "epoch": 0.0683935035286152, + "grad_norm": 0.087890625, + "learning_rate": 0.001976819848809743, + "loss": 0.1504, + "step": 7879 + }, + { + "epoch": 0.06840218400881937, + "grad_norm": 0.6328125, + "learning_rate": 0.0019768131370388737, + "loss": 0.1104, + "step": 7880 + }, + { + "epoch": 0.06841086448902353, + "grad_norm": 0.2294921875, + "learning_rate": 0.001976806424309135, + "loss": 0.1182, + "step": 7881 + }, + { + "epoch": 0.0684195449692277, + "grad_norm": 0.7578125, + "learning_rate": 0.0019767997106205338, + "loss": 0.1504, + "step": 7882 + }, + { + "epoch": 0.06842822544943186, + "grad_norm": 0.34765625, + "learning_rate": 0.001976792995973078, + "loss": 0.1484, + "step": 7883 + }, + { + "epoch": 0.06843690592963603, + "grad_norm": 0.16015625, + "learning_rate": 0.001976786280366774, + "loss": 0.1328, + "step": 7884 + }, + { + "epoch": 0.06844558640984019, + "grad_norm": 0.11328125, + "learning_rate": 0.00197677956380163, + "loss": 0.1533, + "step": 7885 + }, + { + "epoch": 0.06845426689004436, + "grad_norm": 0.287109375, + "learning_rate": 0.001976772846277653, + "loss": 0.1182, + "step": 7886 + }, + { + "epoch": 0.06846294737024852, + "grad_norm": 0.19921875, + "learning_rate": 0.0019767661277948502, + "loss": 0.1514, + "step": 7887 + }, + { + "epoch": 0.06847162785045269, + "grad_norm": 1.0390625, + "learning_rate": 0.001976759408353229, + "loss": 0.166, + "step": 7888 + }, + { + "epoch": 0.06848030833065685, + "grad_norm": 0.1357421875, + "learning_rate": 0.0019767526879527976, + "loss": 0.1406, + "step": 7889 + }, + { + "epoch": 0.06848898881086102, + "grad_norm": 0.375, + "learning_rate": 0.0019767459665935624, + "loss": 0.1377, + "step": 7890 + }, + { + "epoch": 0.06849766929106518, + "grad_norm": 0.076171875, + "learning_rate": 0.0019767392442755307, + "loss": 0.1123, + "step": 7891 + }, + { + "epoch": 0.06850634977126935, + "grad_norm": 0.203125, + "learning_rate": 0.00197673252099871, + "loss": 0.1426, + "step": 7892 + }, + { + "epoch": 0.06851503025147351, + "grad_norm": 0.3203125, + "learning_rate": 0.001976725796763108, + "loss": 0.124, + "step": 7893 + }, + { + "epoch": 0.06852371073167768, + "grad_norm": 0.349609375, + "learning_rate": 0.0019767190715687323, + "loss": 0.1416, + "step": 7894 + }, + { + "epoch": 0.06853239121188184, + "grad_norm": 
0.267578125, + "learning_rate": 0.001976712345415589, + "loss": 0.127, + "step": 7895 + }, + { + "epoch": 0.06854107169208601, + "grad_norm": 0.314453125, + "learning_rate": 0.0019767056183036865, + "loss": 0.0918, + "step": 7896 + }, + { + "epoch": 0.06854975217229017, + "grad_norm": 0.134765625, + "learning_rate": 0.001976698890233032, + "loss": 0.105, + "step": 7897 + }, + { + "epoch": 0.06855843265249434, + "grad_norm": 0.16796875, + "learning_rate": 0.0019766921612036333, + "loss": 0.166, + "step": 7898 + }, + { + "epoch": 0.0685671131326985, + "grad_norm": 0.18359375, + "learning_rate": 0.0019766854312154966, + "loss": 0.1011, + "step": 7899 + }, + { + "epoch": 0.06857579361290267, + "grad_norm": 0.416015625, + "learning_rate": 0.00197667870026863, + "loss": 0.1016, + "step": 7900 + }, + { + "epoch": 0.06858447409310683, + "grad_norm": 0.1884765625, + "learning_rate": 0.001976671968363041, + "loss": 0.1758, + "step": 7901 + }, + { + "epoch": 0.068593154573311, + "grad_norm": 0.1396484375, + "learning_rate": 0.0019766652354987362, + "loss": 0.1523, + "step": 7902 + }, + { + "epoch": 0.06860183505351516, + "grad_norm": 0.470703125, + "learning_rate": 0.0019766585016757236, + "loss": 0.1187, + "step": 7903 + }, + { + "epoch": 0.06861051553371933, + "grad_norm": 0.28515625, + "learning_rate": 0.001976651766894011, + "loss": 0.1377, + "step": 7904 + }, + { + "epoch": 0.0686191960139235, + "grad_norm": 0.5625, + "learning_rate": 0.0019766450311536044, + "loss": 0.1543, + "step": 7905 + }, + { + "epoch": 0.06862787649412766, + "grad_norm": 0.13671875, + "learning_rate": 0.0019766382944545126, + "loss": 0.1318, + "step": 7906 + }, + { + "epoch": 0.06863655697433182, + "grad_norm": 0.1650390625, + "learning_rate": 0.001976631556796742, + "loss": 0.1523, + "step": 7907 + }, + { + "epoch": 0.06864523745453599, + "grad_norm": 0.142578125, + "learning_rate": 0.0019766248181803, + "loss": 0.1484, + "step": 7908 + }, + { + "epoch": 0.06865391793474016, + "grad_norm": 0.09033203125, + "learning_rate": 0.0019766180786051947, + "loss": 0.1113, + "step": 7909 + }, + { + "epoch": 0.06866259841494432, + "grad_norm": 0.328125, + "learning_rate": 0.0019766113380714333, + "loss": 0.1514, + "step": 7910 + }, + { + "epoch": 0.06867127889514849, + "grad_norm": 0.3046875, + "learning_rate": 0.0019766045965790226, + "loss": 0.1123, + "step": 7911 + }, + { + "epoch": 0.06867995937535265, + "grad_norm": 0.326171875, + "learning_rate": 0.0019765978541279703, + "loss": 0.1143, + "step": 7912 + }, + { + "epoch": 0.06868863985555682, + "grad_norm": 0.6875, + "learning_rate": 0.0019765911107182833, + "loss": 0.1475, + "step": 7913 + }, + { + "epoch": 0.06869732033576098, + "grad_norm": 0.55859375, + "learning_rate": 0.00197658436634997, + "loss": 0.1484, + "step": 7914 + }, + { + "epoch": 0.06870600081596515, + "grad_norm": 0.57421875, + "learning_rate": 0.001976577621023037, + "loss": 0.1338, + "step": 7915 + }, + { + "epoch": 0.06871468129616931, + "grad_norm": 0.251953125, + "learning_rate": 0.001976570874737492, + "loss": 0.1279, + "step": 7916 + }, + { + "epoch": 0.06872336177637346, + "grad_norm": 0.4375, + "learning_rate": 0.0019765641274933424, + "loss": 0.1514, + "step": 7917 + }, + { + "epoch": 0.06873204225657763, + "grad_norm": 0.2099609375, + "learning_rate": 0.0019765573792905953, + "loss": 0.1611, + "step": 7918 + }, + { + "epoch": 0.06874072273678179, + "grad_norm": 0.212890625, + "learning_rate": 0.001976550630129258, + "loss": 0.1689, + "step": 7919 + }, + { + "epoch": 0.06874940321698596, + 
"grad_norm": 0.427734375, + "learning_rate": 0.0019765438800093383, + "loss": 0.1299, + "step": 7920 + }, + { + "epoch": 0.06875808369719012, + "grad_norm": 0.416015625, + "learning_rate": 0.0019765371289308434, + "loss": 0.1338, + "step": 7921 + }, + { + "epoch": 0.06876676417739429, + "grad_norm": 0.10546875, + "learning_rate": 0.0019765303768937807, + "loss": 0.1191, + "step": 7922 + }, + { + "epoch": 0.06877544465759845, + "grad_norm": 0.16015625, + "learning_rate": 0.001976523623898157, + "loss": 0.1191, + "step": 7923 + }, + { + "epoch": 0.06878412513780262, + "grad_norm": 0.515625, + "learning_rate": 0.001976516869943981, + "loss": 0.1484, + "step": 7924 + }, + { + "epoch": 0.06879280561800678, + "grad_norm": 0.765625, + "learning_rate": 0.001976510115031259, + "loss": 0.1035, + "step": 7925 + }, + { + "epoch": 0.06880148609821095, + "grad_norm": 0.2734375, + "learning_rate": 0.001976503359159999, + "loss": 0.127, + "step": 7926 + }, + { + "epoch": 0.06881016657841511, + "grad_norm": 0.2216796875, + "learning_rate": 0.0019764966023302077, + "loss": 0.1309, + "step": 7927 + }, + { + "epoch": 0.06881884705861928, + "grad_norm": 0.103515625, + "learning_rate": 0.001976489844541893, + "loss": 0.1396, + "step": 7928 + }, + { + "epoch": 0.06882752753882344, + "grad_norm": 0.09814453125, + "learning_rate": 0.0019764830857950623, + "loss": 0.1328, + "step": 7929 + }, + { + "epoch": 0.06883620801902761, + "grad_norm": 0.373046875, + "learning_rate": 0.0019764763260897225, + "loss": 0.1504, + "step": 7930 + }, + { + "epoch": 0.06884488849923177, + "grad_norm": 0.271484375, + "learning_rate": 0.001976469565425882, + "loss": 0.1494, + "step": 7931 + }, + { + "epoch": 0.06885356897943594, + "grad_norm": 0.3359375, + "learning_rate": 0.001976462803803547, + "loss": 0.1235, + "step": 7932 + }, + { + "epoch": 0.0688622494596401, + "grad_norm": 0.1171875, + "learning_rate": 0.0019764560412227255, + "loss": 0.1211, + "step": 7933 + }, + { + "epoch": 0.06887092993984427, + "grad_norm": 0.185546875, + "learning_rate": 0.001976449277683425, + "loss": 0.084, + "step": 7934 + }, + { + "epoch": 0.06887961042004843, + "grad_norm": 0.1767578125, + "learning_rate": 0.0019764425131856525, + "loss": 0.1719, + "step": 7935 + }, + { + "epoch": 0.0688882909002526, + "grad_norm": 0.56640625, + "learning_rate": 0.001976435747729416, + "loss": 0.1191, + "step": 7936 + }, + { + "epoch": 0.06889697138045676, + "grad_norm": 0.1787109375, + "learning_rate": 0.0019764289813147223, + "loss": 0.1426, + "step": 7937 + }, + { + "epoch": 0.06890565186066093, + "grad_norm": 0.08837890625, + "learning_rate": 0.001976422213941579, + "loss": 0.1523, + "step": 7938 + }, + { + "epoch": 0.0689143323408651, + "grad_norm": 0.11376953125, + "learning_rate": 0.001976415445609994, + "loss": 0.1348, + "step": 7939 + }, + { + "epoch": 0.06892301282106926, + "grad_norm": 0.439453125, + "learning_rate": 0.001976408676319974, + "loss": 0.1055, + "step": 7940 + }, + { + "epoch": 0.06893169330127343, + "grad_norm": 0.27734375, + "learning_rate": 0.0019764019060715264, + "loss": 0.1162, + "step": 7941 + }, + { + "epoch": 0.06894037378147759, + "grad_norm": 0.3984375, + "learning_rate": 0.0019763951348646587, + "loss": 0.1006, + "step": 7942 + }, + { + "epoch": 0.06894905426168176, + "grad_norm": 0.62109375, + "learning_rate": 0.0019763883626993787, + "loss": 0.1084, + "step": 7943 + }, + { + "epoch": 0.06895773474188592, + "grad_norm": 0.1396484375, + "learning_rate": 0.0019763815895756935, + "loss": 0.1079, + "step": 7944 + }, + { + "epoch": 
0.06896641522209009, + "grad_norm": 0.30859375, + "learning_rate": 0.001976374815493611, + "loss": 0.1094, + "step": 7945 + }, + { + "epoch": 0.06897509570229425, + "grad_norm": 0.111328125, + "learning_rate": 0.0019763680404531374, + "loss": 0.1182, + "step": 7946 + }, + { + "epoch": 0.06898377618249842, + "grad_norm": 0.427734375, + "learning_rate": 0.0019763612644542817, + "loss": 0.0957, + "step": 7947 + }, + { + "epoch": 0.06899245666270258, + "grad_norm": 0.20703125, + "learning_rate": 0.0019763544874970497, + "loss": 0.1138, + "step": 7948 + }, + { + "epoch": 0.06900113714290675, + "grad_norm": 0.46875, + "learning_rate": 0.0019763477095814503, + "loss": 0.1147, + "step": 7949 + }, + { + "epoch": 0.06900981762311091, + "grad_norm": 0.1982421875, + "learning_rate": 0.0019763409307074894, + "loss": 0.1387, + "step": 7950 + }, + { + "epoch": 0.06901849810331508, + "grad_norm": 0.09619140625, + "learning_rate": 0.001976334150875176, + "loss": 0.1377, + "step": 7951 + }, + { + "epoch": 0.06902717858351924, + "grad_norm": 0.255859375, + "learning_rate": 0.0019763273700845164, + "loss": 0.1162, + "step": 7952 + }, + { + "epoch": 0.0690358590637234, + "grad_norm": 0.0751953125, + "learning_rate": 0.001976320588335518, + "loss": 0.1543, + "step": 7953 + }, + { + "epoch": 0.06904453954392757, + "grad_norm": 0.07861328125, + "learning_rate": 0.001976313805628189, + "loss": 0.1167, + "step": 7954 + }, + { + "epoch": 0.06905322002413174, + "grad_norm": 0.294921875, + "learning_rate": 0.0019763070219625365, + "loss": 0.124, + "step": 7955 + }, + { + "epoch": 0.0690619005043359, + "grad_norm": 0.380859375, + "learning_rate": 0.001976300237338568, + "loss": 0.0947, + "step": 7956 + }, + { + "epoch": 0.06907058098454007, + "grad_norm": 0.359375, + "learning_rate": 0.0019762934517562904, + "loss": 0.2656, + "step": 7957 + }, + { + "epoch": 0.06907926146474423, + "grad_norm": 0.47265625, + "learning_rate": 0.001976286665215711, + "loss": 0.125, + "step": 7958 + }, + { + "epoch": 0.0690879419449484, + "grad_norm": 0.10107421875, + "learning_rate": 0.001976279877716838, + "loss": 0.1699, + "step": 7959 + }, + { + "epoch": 0.06909662242515256, + "grad_norm": 0.265625, + "learning_rate": 0.001976273089259679, + "loss": 0.1289, + "step": 7960 + }, + { + "epoch": 0.06910530290535673, + "grad_norm": 0.10107421875, + "learning_rate": 0.00197626629984424, + "loss": 0.1157, + "step": 7961 + }, + { + "epoch": 0.06911398338556089, + "grad_norm": 0.11767578125, + "learning_rate": 0.00197625950947053, + "loss": 0.1182, + "step": 7962 + }, + { + "epoch": 0.06912266386576506, + "grad_norm": 0.0859375, + "learning_rate": 0.0019762527181385555, + "loss": 0.1348, + "step": 7963 + }, + { + "epoch": 0.06913134434596922, + "grad_norm": 0.291015625, + "learning_rate": 0.001976245925848325, + "loss": 0.1206, + "step": 7964 + }, + { + "epoch": 0.06914002482617339, + "grad_norm": 0.80078125, + "learning_rate": 0.001976239132599844, + "loss": 0.1494, + "step": 7965 + }, + { + "epoch": 0.06914870530637755, + "grad_norm": 0.34375, + "learning_rate": 0.0019762323383931218, + "loss": 0.1113, + "step": 7966 + }, + { + "epoch": 0.06915738578658172, + "grad_norm": 0.1240234375, + "learning_rate": 0.0019762255432281646, + "loss": 0.1504, + "step": 7967 + }, + { + "epoch": 0.06916606626678588, + "grad_norm": 0.380859375, + "learning_rate": 0.0019762187471049804, + "loss": 0.1099, + "step": 7968 + }, + { + "epoch": 0.06917474674699005, + "grad_norm": 0.1845703125, + "learning_rate": 0.0019762119500235766, + "loss": 0.1084, + "step": 7969 
+ }, + { + "epoch": 0.06918342722719421, + "grad_norm": 0.62890625, + "learning_rate": 0.0019762051519839605, + "loss": 0.2109, + "step": 7970 + }, + { + "epoch": 0.06919210770739838, + "grad_norm": 0.7265625, + "learning_rate": 0.0019761983529861396, + "loss": 0.1455, + "step": 7971 + }, + { + "epoch": 0.06920078818760254, + "grad_norm": 0.3515625, + "learning_rate": 0.0019761915530301217, + "loss": 0.1055, + "step": 7972 + }, + { + "epoch": 0.06920946866780671, + "grad_norm": 0.318359375, + "learning_rate": 0.0019761847521159136, + "loss": 0.1328, + "step": 7973 + }, + { + "epoch": 0.06921814914801087, + "grad_norm": 0.267578125, + "learning_rate": 0.0019761779502435227, + "loss": 0.1445, + "step": 7974 + }, + { + "epoch": 0.06922682962821504, + "grad_norm": 0.376953125, + "learning_rate": 0.0019761711474129574, + "loss": 0.1777, + "step": 7975 + }, + { + "epoch": 0.0692355101084192, + "grad_norm": 0.138671875, + "learning_rate": 0.001976164343624224, + "loss": 0.1523, + "step": 7976 + }, + { + "epoch": 0.06924419058862337, + "grad_norm": 0.419921875, + "learning_rate": 0.001976157538877331, + "loss": 0.1309, + "step": 7977 + }, + { + "epoch": 0.06925287106882752, + "grad_norm": 0.34765625, + "learning_rate": 0.001976150733172285, + "loss": 0.1406, + "step": 7978 + }, + { + "epoch": 0.06926155154903169, + "grad_norm": 0.2421875, + "learning_rate": 0.0019761439265090935, + "loss": 0.1816, + "step": 7979 + }, + { + "epoch": 0.06927023202923585, + "grad_norm": 0.1494140625, + "learning_rate": 0.001976137118887764, + "loss": 0.0903, + "step": 7980 + }, + { + "epoch": 0.06927891250944002, + "grad_norm": 0.263671875, + "learning_rate": 0.0019761303103083046, + "loss": 0.0903, + "step": 7981 + }, + { + "epoch": 0.06928759298964418, + "grad_norm": 0.546875, + "learning_rate": 0.001976123500770722, + "loss": 0.1396, + "step": 7982 + }, + { + "epoch": 0.06929627346984835, + "grad_norm": 0.283203125, + "learning_rate": 0.001976116690275024, + "loss": 0.123, + "step": 7983 + }, + { + "epoch": 0.06930495395005251, + "grad_norm": 0.2490234375, + "learning_rate": 0.001976109878821218, + "loss": 0.1406, + "step": 7984 + }, + { + "epoch": 0.06931363443025668, + "grad_norm": 0.1279296875, + "learning_rate": 0.0019761030664093115, + "loss": 0.1084, + "step": 7985 + }, + { + "epoch": 0.06932231491046084, + "grad_norm": 0.48828125, + "learning_rate": 0.0019760962530393114, + "loss": 0.1445, + "step": 7986 + }, + { + "epoch": 0.069330995390665, + "grad_norm": 0.5078125, + "learning_rate": 0.0019760894387112257, + "loss": 0.1504, + "step": 7987 + }, + { + "epoch": 0.06933967587086917, + "grad_norm": 0.18359375, + "learning_rate": 0.001976082623425062, + "loss": 0.1216, + "step": 7988 + }, + { + "epoch": 0.06934835635107334, + "grad_norm": 0.72265625, + "learning_rate": 0.0019760758071808277, + "loss": 0.1191, + "step": 7989 + }, + { + "epoch": 0.0693570368312775, + "grad_norm": 0.326171875, + "learning_rate": 0.0019760689899785294, + "loss": 0.126, + "step": 7990 + }, + { + "epoch": 0.06936571731148167, + "grad_norm": 0.095703125, + "learning_rate": 0.001976062171818176, + "loss": 0.1299, + "step": 7991 + }, + { + "epoch": 0.06937439779168583, + "grad_norm": 0.28125, + "learning_rate": 0.0019760553526997735, + "loss": 0.1099, + "step": 7992 + }, + { + "epoch": 0.06938307827189, + "grad_norm": 0.091796875, + "learning_rate": 0.00197604853262333, + "loss": 0.123, + "step": 7993 + }, + { + "epoch": 0.06939175875209416, + "grad_norm": 0.361328125, + "learning_rate": 0.001976041711588854, + "loss": 0.1182, + 
"step": 7994 + }, + { + "epoch": 0.06940043923229833, + "grad_norm": 0.09912109375, + "learning_rate": 0.0019760348895963512, + "loss": 0.1621, + "step": 7995 + }, + { + "epoch": 0.06940911971250249, + "grad_norm": 0.64453125, + "learning_rate": 0.00197602806664583, + "loss": 0.1543, + "step": 7996 + }, + { + "epoch": 0.06941780019270666, + "grad_norm": 0.3828125, + "learning_rate": 0.0019760212427372974, + "loss": 0.1445, + "step": 7997 + }, + { + "epoch": 0.06942648067291082, + "grad_norm": 0.7578125, + "learning_rate": 0.001976014417870761, + "loss": 0.0977, + "step": 7998 + }, + { + "epoch": 0.06943516115311499, + "grad_norm": 0.12158203125, + "learning_rate": 0.001976007592046229, + "loss": 0.124, + "step": 7999 + }, + { + "epoch": 0.06944384163331915, + "grad_norm": 0.94921875, + "learning_rate": 0.0019760007652637078, + "loss": 0.2266, + "step": 8000 + }, + { + "epoch": 0.06945252211352332, + "grad_norm": 0.107421875, + "learning_rate": 0.001975993937523205, + "loss": 0.1445, + "step": 8001 + }, + { + "epoch": 0.06946120259372748, + "grad_norm": 0.1572265625, + "learning_rate": 0.001975987108824729, + "loss": 0.1357, + "step": 8002 + }, + { + "epoch": 0.06946988307393165, + "grad_norm": 0.21875, + "learning_rate": 0.0019759802791682866, + "loss": 0.1602, + "step": 8003 + }, + { + "epoch": 0.06947856355413581, + "grad_norm": 0.2158203125, + "learning_rate": 0.0019759734485538852, + "loss": 0.1021, + "step": 8004 + }, + { + "epoch": 0.06948724403433998, + "grad_norm": 0.33984375, + "learning_rate": 0.0019759666169815323, + "loss": 0.1182, + "step": 8005 + }, + { + "epoch": 0.06949592451454414, + "grad_norm": 0.2314453125, + "learning_rate": 0.0019759597844512355, + "loss": 0.1016, + "step": 8006 + }, + { + "epoch": 0.06950460499474831, + "grad_norm": 0.3359375, + "learning_rate": 0.0019759529509630024, + "loss": 0.1387, + "step": 8007 + }, + { + "epoch": 0.06951328547495247, + "grad_norm": 1.0078125, + "learning_rate": 0.00197594611651684, + "loss": 0.2246, + "step": 8008 + }, + { + "epoch": 0.06952196595515664, + "grad_norm": 0.12158203125, + "learning_rate": 0.0019759392811127567, + "loss": 0.1328, + "step": 8009 + }, + { + "epoch": 0.0695306464353608, + "grad_norm": 0.328125, + "learning_rate": 0.001975932444750759, + "loss": 0.1602, + "step": 8010 + }, + { + "epoch": 0.06953932691556497, + "grad_norm": 0.87890625, + "learning_rate": 0.0019759256074308547, + "loss": 0.1104, + "step": 8011 + }, + { + "epoch": 0.06954800739576913, + "grad_norm": 0.50390625, + "learning_rate": 0.0019759187691530513, + "loss": 0.1235, + "step": 8012 + }, + { + "epoch": 0.0695566878759733, + "grad_norm": 0.416015625, + "learning_rate": 0.0019759119299173566, + "loss": 0.124, + "step": 8013 + }, + { + "epoch": 0.06956536835617746, + "grad_norm": 0.41015625, + "learning_rate": 0.0019759050897237775, + "loss": 0.1689, + "step": 8014 + }, + { + "epoch": 0.06957404883638163, + "grad_norm": 0.734375, + "learning_rate": 0.0019758982485723214, + "loss": 0.1406, + "step": 8015 + }, + { + "epoch": 0.0695827293165858, + "grad_norm": 0.111328125, + "learning_rate": 0.0019758914064629965, + "loss": 0.125, + "step": 8016 + }, + { + "epoch": 0.06959140979678996, + "grad_norm": 0.1611328125, + "learning_rate": 0.00197588456339581, + "loss": 0.123, + "step": 8017 + }, + { + "epoch": 0.06960009027699413, + "grad_norm": 0.15625, + "learning_rate": 0.001975877719370769, + "loss": 0.1367, + "step": 8018 + }, + { + "epoch": 0.06960877075719829, + "grad_norm": 0.21875, + "learning_rate": 0.0019758708743878818, + "loss": 
0.1279, + "step": 8019 + }, + { + "epoch": 0.06961745123740246, + "grad_norm": 1.203125, + "learning_rate": 0.001975864028447155, + "loss": 0.1602, + "step": 8020 + }, + { + "epoch": 0.06962613171760662, + "grad_norm": 0.13671875, + "learning_rate": 0.0019758571815485966, + "loss": 0.1426, + "step": 8021 + }, + { + "epoch": 0.06963481219781079, + "grad_norm": 0.60546875, + "learning_rate": 0.001975850333692214, + "loss": 0.1162, + "step": 8022 + }, + { + "epoch": 0.06964349267801495, + "grad_norm": 0.96875, + "learning_rate": 0.0019758434848780146, + "loss": 0.1055, + "step": 8023 + }, + { + "epoch": 0.06965217315821912, + "grad_norm": 0.3359375, + "learning_rate": 0.0019758366351060055, + "loss": 0.1328, + "step": 8024 + }, + { + "epoch": 0.06966085363842328, + "grad_norm": 0.1142578125, + "learning_rate": 0.001975829784376195, + "loss": 0.1279, + "step": 8025 + }, + { + "epoch": 0.06966953411862745, + "grad_norm": 0.173828125, + "learning_rate": 0.0019758229326885903, + "loss": 0.1426, + "step": 8026 + }, + { + "epoch": 0.06967821459883161, + "grad_norm": 0.5859375, + "learning_rate": 0.0019758160800431987, + "loss": 0.1426, + "step": 8027 + }, + { + "epoch": 0.06968689507903578, + "grad_norm": 0.142578125, + "learning_rate": 0.001975809226440028, + "loss": 0.1309, + "step": 8028 + }, + { + "epoch": 0.06969557555923994, + "grad_norm": 0.275390625, + "learning_rate": 0.001975802371879085, + "loss": 0.1562, + "step": 8029 + }, + { + "epoch": 0.0697042560394441, + "grad_norm": 0.1376953125, + "learning_rate": 0.001975795516360378, + "loss": 0.1523, + "step": 8030 + }, + { + "epoch": 0.06971293651964827, + "grad_norm": 0.328125, + "learning_rate": 0.001975788659883914, + "loss": 0.1069, + "step": 8031 + }, + { + "epoch": 0.06972161699985244, + "grad_norm": 0.244140625, + "learning_rate": 0.0019757818024497007, + "loss": 0.1367, + "step": 8032 + }, + { + "epoch": 0.0697302974800566, + "grad_norm": 0.1904296875, + "learning_rate": 0.0019757749440577456, + "loss": 0.1592, + "step": 8033 + }, + { + "epoch": 0.06973897796026077, + "grad_norm": 0.193359375, + "learning_rate": 0.0019757680847080562, + "loss": 0.1699, + "step": 8034 + }, + { + "epoch": 0.06974765844046493, + "grad_norm": 0.208984375, + "learning_rate": 0.0019757612244006404, + "loss": 0.1445, + "step": 8035 + }, + { + "epoch": 0.0697563389206691, + "grad_norm": 0.1845703125, + "learning_rate": 0.001975754363135505, + "loss": 0.0977, + "step": 8036 + }, + { + "epoch": 0.06976501940087326, + "grad_norm": 0.30078125, + "learning_rate": 0.001975747500912657, + "loss": 0.1816, + "step": 8037 + }, + { + "epoch": 0.06977369988107743, + "grad_norm": 0.1943359375, + "learning_rate": 0.0019757406377321055, + "loss": 0.127, + "step": 8038 + }, + { + "epoch": 0.06978238036128159, + "grad_norm": 0.369140625, + "learning_rate": 0.0019757337735938573, + "loss": 0.0996, + "step": 8039 + }, + { + "epoch": 0.06979106084148574, + "grad_norm": 0.59375, + "learning_rate": 0.00197572690849792, + "loss": 0.1807, + "step": 8040 + }, + { + "epoch": 0.06979974132168991, + "grad_norm": 0.08251953125, + "learning_rate": 0.0019757200424443, + "loss": 0.1309, + "step": 8041 + }, + { + "epoch": 0.06980842180189407, + "grad_norm": 0.5078125, + "learning_rate": 0.0019757131754330063, + "loss": 0.1836, + "step": 8042 + }, + { + "epoch": 0.06981710228209824, + "grad_norm": 0.3359375, + "learning_rate": 0.001975706307464046, + "loss": 0.1396, + "step": 8043 + }, + { + "epoch": 0.0698257827623024, + "grad_norm": 0.150390625, + "learning_rate": 0.001975699438537426, + 
"loss": 0.1748, + "step": 8044 + }, + { + "epoch": 0.06983446324250657, + "grad_norm": 0.07373046875, + "learning_rate": 0.0019756925686531543, + "loss": 0.1367, + "step": 8045 + }, + { + "epoch": 0.06984314372271073, + "grad_norm": 0.12255859375, + "learning_rate": 0.0019756856978112385, + "loss": 0.1406, + "step": 8046 + }, + { + "epoch": 0.0698518242029149, + "grad_norm": 0.1484375, + "learning_rate": 0.0019756788260116864, + "loss": 0.1465, + "step": 8047 + }, + { + "epoch": 0.06986050468311907, + "grad_norm": 0.40234375, + "learning_rate": 0.0019756719532545044, + "loss": 0.1211, + "step": 8048 + }, + { + "epoch": 0.06986918516332323, + "grad_norm": 0.166015625, + "learning_rate": 0.001975665079539701, + "loss": 0.1494, + "step": 8049 + }, + { + "epoch": 0.0698778656435274, + "grad_norm": 0.21484375, + "learning_rate": 0.0019756582048672837, + "loss": 0.1426, + "step": 8050 + }, + { + "epoch": 0.06988654612373156, + "grad_norm": 1.09375, + "learning_rate": 0.001975651329237259, + "loss": 0.125, + "step": 8051 + }, + { + "epoch": 0.06989522660393573, + "grad_norm": 0.072265625, + "learning_rate": 0.0019756444526496356, + "loss": 0.0835, + "step": 8052 + }, + { + "epoch": 0.06990390708413989, + "grad_norm": 0.578125, + "learning_rate": 0.001975637575104421, + "loss": 0.1367, + "step": 8053 + }, + { + "epoch": 0.06991258756434406, + "grad_norm": 0.36328125, + "learning_rate": 0.001975630696601622, + "loss": 0.1543, + "step": 8054 + }, + { + "epoch": 0.06992126804454822, + "grad_norm": 0.44921875, + "learning_rate": 0.0019756238171412463, + "loss": 0.125, + "step": 8055 + }, + { + "epoch": 0.06992994852475239, + "grad_norm": 0.1435546875, + "learning_rate": 0.0019756169367233013, + "loss": 0.1592, + "step": 8056 + }, + { + "epoch": 0.06993862900495655, + "grad_norm": 0.2333984375, + "learning_rate": 0.001975610055347795, + "loss": 0.1143, + "step": 8057 + }, + { + "epoch": 0.06994730948516072, + "grad_norm": 0.1337890625, + "learning_rate": 0.001975603173014735, + "loss": 0.1758, + "step": 8058 + }, + { + "epoch": 0.06995598996536488, + "grad_norm": 0.7265625, + "learning_rate": 0.0019755962897241284, + "loss": 0.123, + "step": 8059 + }, + { + "epoch": 0.06996467044556905, + "grad_norm": 0.318359375, + "learning_rate": 0.0019755894054759825, + "loss": 0.1406, + "step": 8060 + }, + { + "epoch": 0.06997335092577321, + "grad_norm": 0.6328125, + "learning_rate": 0.001975582520270305, + "loss": 0.1426, + "step": 8061 + }, + { + "epoch": 0.06998203140597738, + "grad_norm": 0.5625, + "learning_rate": 0.0019755756341071043, + "loss": 0.125, + "step": 8062 + }, + { + "epoch": 0.06999071188618154, + "grad_norm": 0.451171875, + "learning_rate": 0.001975568746986387, + "loss": 0.127, + "step": 8063 + }, + { + "epoch": 0.06999939236638571, + "grad_norm": 0.06494140625, + "learning_rate": 0.0019755618589081606, + "loss": 0.1221, + "step": 8064 + }, + { + "epoch": 0.07000807284658987, + "grad_norm": 0.671875, + "learning_rate": 0.0019755549698724337, + "loss": 0.167, + "step": 8065 + }, + { + "epoch": 0.07001675332679404, + "grad_norm": 0.337890625, + "learning_rate": 0.0019755480798792123, + "loss": 0.1924, + "step": 8066 + }, + { + "epoch": 0.0700254338069982, + "grad_norm": 0.83203125, + "learning_rate": 0.0019755411889285047, + "loss": 0.1484, + "step": 8067 + }, + { + "epoch": 0.07003411428720237, + "grad_norm": 0.2490234375, + "learning_rate": 0.0019755342970203186, + "loss": 0.1484, + "step": 8068 + }, + { + "epoch": 0.07004279476740653, + "grad_norm": 0.1689453125, + "learning_rate": 
0.0019755274041546615, + "loss": 0.1338, + "step": 8069 + }, + { + "epoch": 0.0700514752476107, + "grad_norm": 0.1162109375, + "learning_rate": 0.0019755205103315406, + "loss": 0.1523, + "step": 8070 + }, + { + "epoch": 0.07006015572781486, + "grad_norm": 0.80859375, + "learning_rate": 0.001975513615550964, + "loss": 0.2021, + "step": 8071 + }, + { + "epoch": 0.07006883620801903, + "grad_norm": 0.306640625, + "learning_rate": 0.001975506719812938, + "loss": 0.1797, + "step": 8072 + }, + { + "epoch": 0.0700775166882232, + "grad_norm": 0.8203125, + "learning_rate": 0.0019754998231174717, + "loss": 0.1191, + "step": 8073 + }, + { + "epoch": 0.07008619716842736, + "grad_norm": 0.3046875, + "learning_rate": 0.0019754929254645714, + "loss": 0.0986, + "step": 8074 + }, + { + "epoch": 0.07009487764863152, + "grad_norm": 0.09326171875, + "learning_rate": 0.0019754860268542456, + "loss": 0.1226, + "step": 8075 + }, + { + "epoch": 0.07010355812883569, + "grad_norm": 0.361328125, + "learning_rate": 0.0019754791272865017, + "loss": 0.1279, + "step": 8076 + }, + { + "epoch": 0.07011223860903985, + "grad_norm": 0.2119140625, + "learning_rate": 0.0019754722267613465, + "loss": 0.1025, + "step": 8077 + }, + { + "epoch": 0.07012091908924402, + "grad_norm": 0.2216796875, + "learning_rate": 0.0019754653252787883, + "loss": 0.0977, + "step": 8078 + }, + { + "epoch": 0.07012959956944818, + "grad_norm": 0.10205078125, + "learning_rate": 0.0019754584228388345, + "loss": 0.1279, + "step": 8079 + }, + { + "epoch": 0.07013828004965235, + "grad_norm": 0.2470703125, + "learning_rate": 0.001975451519441492, + "loss": 0.1348, + "step": 8080 + }, + { + "epoch": 0.07014696052985651, + "grad_norm": 0.51171875, + "learning_rate": 0.0019754446150867687, + "loss": 0.1064, + "step": 8081 + }, + { + "epoch": 0.07015564101006068, + "grad_norm": 0.1796875, + "learning_rate": 0.001975437709774673, + "loss": 0.1289, + "step": 8082 + }, + { + "epoch": 0.07016432149026484, + "grad_norm": 0.56640625, + "learning_rate": 0.001975430803505212, + "loss": 0.1484, + "step": 8083 + }, + { + "epoch": 0.07017300197046901, + "grad_norm": 0.5546875, + "learning_rate": 0.0019754238962783922, + "loss": 0.0957, + "step": 8084 + }, + { + "epoch": 0.07018168245067317, + "grad_norm": 0.2255859375, + "learning_rate": 0.0019754169880942226, + "loss": 0.1338, + "step": 8085 + }, + { + "epoch": 0.07019036293087734, + "grad_norm": 0.486328125, + "learning_rate": 0.0019754100789527097, + "loss": 0.1377, + "step": 8086 + }, + { + "epoch": 0.0701990434110815, + "grad_norm": 0.130859375, + "learning_rate": 0.0019754031688538616, + "loss": 0.1035, + "step": 8087 + }, + { + "epoch": 0.07020772389128567, + "grad_norm": 1.3203125, + "learning_rate": 0.001975396257797686, + "loss": 0.1611, + "step": 8088 + }, + { + "epoch": 0.07021640437148984, + "grad_norm": 0.490234375, + "learning_rate": 0.00197538934578419, + "loss": 0.1367, + "step": 8089 + }, + { + "epoch": 0.070225084851694, + "grad_norm": 0.1455078125, + "learning_rate": 0.0019753824328133814, + "loss": 0.1318, + "step": 8090 + }, + { + "epoch": 0.07023376533189817, + "grad_norm": 0.10400390625, + "learning_rate": 0.001975375518885268, + "loss": 0.1621, + "step": 8091 + }, + { + "epoch": 0.07024244581210233, + "grad_norm": 0.6953125, + "learning_rate": 0.0019753686039998566, + "loss": 0.1641, + "step": 8092 + }, + { + "epoch": 0.0702511262923065, + "grad_norm": 0.07861328125, + "learning_rate": 0.0019753616881571555, + "loss": 0.0928, + "step": 8093 + }, + { + "epoch": 0.07025980677251066, + "grad_norm": 
0.48828125, + "learning_rate": 0.0019753547713571716, + "loss": 0.125, + "step": 8094 + }, + { + "epoch": 0.07026848725271483, + "grad_norm": 0.29296875, + "learning_rate": 0.0019753478535999135, + "loss": 0.1426, + "step": 8095 + }, + { + "epoch": 0.07027716773291899, + "grad_norm": 0.91796875, + "learning_rate": 0.0019753409348853878, + "loss": 0.1279, + "step": 8096 + }, + { + "epoch": 0.07028584821312316, + "grad_norm": 0.2470703125, + "learning_rate": 0.0019753340152136025, + "loss": 0.1338, + "step": 8097 + }, + { + "epoch": 0.07029452869332732, + "grad_norm": 0.4453125, + "learning_rate": 0.001975327094584565, + "loss": 0.1631, + "step": 8098 + }, + { + "epoch": 0.07030320917353149, + "grad_norm": 0.296875, + "learning_rate": 0.001975320172998283, + "loss": 0.1963, + "step": 8099 + }, + { + "epoch": 0.07031188965373565, + "grad_norm": 0.27734375, + "learning_rate": 0.0019753132504547644, + "loss": 0.1719, + "step": 8100 + }, + { + "epoch": 0.07032057013393982, + "grad_norm": 0.1298828125, + "learning_rate": 0.0019753063269540155, + "loss": 0.0923, + "step": 8101 + }, + { + "epoch": 0.07032925061414397, + "grad_norm": 0.10107421875, + "learning_rate": 0.0019752994024960454, + "loss": 0.1143, + "step": 8102 + }, + { + "epoch": 0.07033793109434813, + "grad_norm": 0.58984375, + "learning_rate": 0.001975292477080861, + "loss": 0.1196, + "step": 8103 + }, + { + "epoch": 0.0703466115745523, + "grad_norm": 0.28515625, + "learning_rate": 0.0019752855507084695, + "loss": 0.1348, + "step": 8104 + }, + { + "epoch": 0.07035529205475646, + "grad_norm": 0.439453125, + "learning_rate": 0.0019752786233788793, + "loss": 0.1299, + "step": 8105 + }, + { + "epoch": 0.07036397253496063, + "grad_norm": 0.10595703125, + "learning_rate": 0.0019752716950920974, + "loss": 0.1592, + "step": 8106 + }, + { + "epoch": 0.0703726530151648, + "grad_norm": 0.515625, + "learning_rate": 0.0019752647658481315, + "loss": 0.127, + "step": 8107 + }, + { + "epoch": 0.07038133349536896, + "grad_norm": 0.25390625, + "learning_rate": 0.0019752578356469893, + "loss": 0.1152, + "step": 8108 + }, + { + "epoch": 0.07039001397557312, + "grad_norm": 0.203125, + "learning_rate": 0.001975250904488678, + "loss": 0.1025, + "step": 8109 + }, + { + "epoch": 0.07039869445577729, + "grad_norm": 0.26953125, + "learning_rate": 0.0019752439723732056, + "loss": 0.125, + "step": 8110 + }, + { + "epoch": 0.07040737493598145, + "grad_norm": 1.65625, + "learning_rate": 0.0019752370393005796, + "loss": 0.1641, + "step": 8111 + }, + { + "epoch": 0.07041605541618562, + "grad_norm": 0.16796875, + "learning_rate": 0.001975230105270808, + "loss": 0.1387, + "step": 8112 + }, + { + "epoch": 0.07042473589638978, + "grad_norm": 0.142578125, + "learning_rate": 0.001975223170283897, + "loss": 0.1289, + "step": 8113 + }, + { + "epoch": 0.07043341637659395, + "grad_norm": 0.0673828125, + "learning_rate": 0.001975216234339856, + "loss": 0.0996, + "step": 8114 + }, + { + "epoch": 0.07044209685679811, + "grad_norm": 0.353515625, + "learning_rate": 0.0019752092974386908, + "loss": 0.0908, + "step": 8115 + }, + { + "epoch": 0.07045077733700228, + "grad_norm": 0.53515625, + "learning_rate": 0.0019752023595804103, + "loss": 0.1133, + "step": 8116 + }, + { + "epoch": 0.07045945781720644, + "grad_norm": 0.62890625, + "learning_rate": 0.0019751954207650217, + "loss": 0.1797, + "step": 8117 + }, + { + "epoch": 0.07046813829741061, + "grad_norm": 0.92578125, + "learning_rate": 0.0019751884809925324, + "loss": 0.1445, + "step": 8118 + }, + { + "epoch": 0.07047681877761477, + 
"grad_norm": 0.080078125, + "learning_rate": 0.00197518154026295, + "loss": 0.125, + "step": 8119 + }, + { + "epoch": 0.07048549925781894, + "grad_norm": 0.6015625, + "learning_rate": 0.0019751745985762824, + "loss": 0.1357, + "step": 8120 + }, + { + "epoch": 0.0704941797380231, + "grad_norm": 0.2119140625, + "learning_rate": 0.001975167655932537, + "loss": 0.1338, + "step": 8121 + }, + { + "epoch": 0.07050286021822727, + "grad_norm": 0.265625, + "learning_rate": 0.0019751607123317215, + "loss": 0.1377, + "step": 8122 + }, + { + "epoch": 0.07051154069843144, + "grad_norm": 0.1923828125, + "learning_rate": 0.001975153767773843, + "loss": 0.1787, + "step": 8123 + }, + { + "epoch": 0.0705202211786356, + "grad_norm": 0.2470703125, + "learning_rate": 0.00197514682225891, + "loss": 0.124, + "step": 8124 + }, + { + "epoch": 0.07052890165883977, + "grad_norm": 0.080078125, + "learning_rate": 0.0019751398757869294, + "loss": 0.1455, + "step": 8125 + }, + { + "epoch": 0.07053758213904393, + "grad_norm": 0.16015625, + "learning_rate": 0.001975132928357909, + "loss": 0.1035, + "step": 8126 + }, + { + "epoch": 0.0705462626192481, + "grad_norm": 0.60546875, + "learning_rate": 0.001975125979971856, + "loss": 0.1152, + "step": 8127 + }, + { + "epoch": 0.07055494309945226, + "grad_norm": 1.3671875, + "learning_rate": 0.001975119030628779, + "loss": 0.1533, + "step": 8128 + }, + { + "epoch": 0.07056362357965643, + "grad_norm": 0.353515625, + "learning_rate": 0.0019751120803286845, + "loss": 0.1523, + "step": 8129 + }, + { + "epoch": 0.07057230405986059, + "grad_norm": 0.458984375, + "learning_rate": 0.001975105129071581, + "loss": 0.1465, + "step": 8130 + }, + { + "epoch": 0.07058098454006476, + "grad_norm": 0.13671875, + "learning_rate": 0.0019750981768574755, + "loss": 0.1475, + "step": 8131 + }, + { + "epoch": 0.07058966502026892, + "grad_norm": 0.1982421875, + "learning_rate": 0.001975091223686376, + "loss": 0.1162, + "step": 8132 + }, + { + "epoch": 0.07059834550047309, + "grad_norm": 0.44921875, + "learning_rate": 0.001975084269558289, + "loss": 0.1494, + "step": 8133 + }, + { + "epoch": 0.07060702598067725, + "grad_norm": 0.1845703125, + "learning_rate": 0.0019750773144732235, + "loss": 0.1279, + "step": 8134 + }, + { + "epoch": 0.07061570646088142, + "grad_norm": 0.48046875, + "learning_rate": 0.001975070358431187, + "loss": 0.126, + "step": 8135 + }, + { + "epoch": 0.07062438694108558, + "grad_norm": 0.52734375, + "learning_rate": 0.0019750634014321865, + "loss": 0.126, + "step": 8136 + }, + { + "epoch": 0.07063306742128975, + "grad_norm": 0.37890625, + "learning_rate": 0.00197505644347623, + "loss": 0.0933, + "step": 8137 + }, + { + "epoch": 0.07064174790149391, + "grad_norm": 0.90234375, + "learning_rate": 0.0019750494845633244, + "loss": 0.1211, + "step": 8138 + }, + { + "epoch": 0.07065042838169808, + "grad_norm": 0.2294921875, + "learning_rate": 0.0019750425246934777, + "loss": 0.1357, + "step": 8139 + }, + { + "epoch": 0.07065910886190224, + "grad_norm": 0.09765625, + "learning_rate": 0.001975035563866698, + "loss": 0.1201, + "step": 8140 + }, + { + "epoch": 0.07066778934210641, + "grad_norm": 0.375, + "learning_rate": 0.001975028602082993, + "loss": 0.1514, + "step": 8141 + }, + { + "epoch": 0.07067646982231057, + "grad_norm": 0.2734375, + "learning_rate": 0.001975021639342369, + "loss": 0.1079, + "step": 8142 + }, + { + "epoch": 0.07068515030251474, + "grad_norm": 0.162109375, + "learning_rate": 0.001975014675644835, + "loss": 0.1172, + "step": 8143 + }, + { + "epoch": 0.0706938307827189, + 
"grad_norm": 0.33203125, + "learning_rate": 0.001975007710990398, + "loss": 0.1602, + "step": 8144 + }, + { + "epoch": 0.07070251126292307, + "grad_norm": 0.314453125, + "learning_rate": 0.0019750007453790655, + "loss": 0.1572, + "step": 8145 + }, + { + "epoch": 0.07071119174312723, + "grad_norm": 0.0908203125, + "learning_rate": 0.0019749937788108456, + "loss": 0.1182, + "step": 8146 + }, + { + "epoch": 0.0707198722233314, + "grad_norm": 0.330078125, + "learning_rate": 0.001974986811285746, + "loss": 0.1504, + "step": 8147 + }, + { + "epoch": 0.07072855270353556, + "grad_norm": 0.1630859375, + "learning_rate": 0.0019749798428037734, + "loss": 0.1123, + "step": 8148 + }, + { + "epoch": 0.07073723318373973, + "grad_norm": 0.455078125, + "learning_rate": 0.0019749728733649365, + "loss": 0.1299, + "step": 8149 + }, + { + "epoch": 0.0707459136639439, + "grad_norm": 0.1201171875, + "learning_rate": 0.001974965902969242, + "loss": 0.0986, + "step": 8150 + }, + { + "epoch": 0.07075459414414806, + "grad_norm": 0.177734375, + "learning_rate": 0.0019749589316166976, + "loss": 0.1328, + "step": 8151 + }, + { + "epoch": 0.07076327462435222, + "grad_norm": 0.34375, + "learning_rate": 0.001974951959307312, + "loss": 0.0913, + "step": 8152 + }, + { + "epoch": 0.07077195510455639, + "grad_norm": 0.494140625, + "learning_rate": 0.0019749449860410914, + "loss": 0.1318, + "step": 8153 + }, + { + "epoch": 0.07078063558476055, + "grad_norm": 0.58984375, + "learning_rate": 0.0019749380118180447, + "loss": 0.1182, + "step": 8154 + }, + { + "epoch": 0.07078931606496472, + "grad_norm": 0.1435546875, + "learning_rate": 0.0019749310366381787, + "loss": 0.1084, + "step": 8155 + }, + { + "epoch": 0.07079799654516888, + "grad_norm": 0.1904296875, + "learning_rate": 0.0019749240605015012, + "loss": 0.125, + "step": 8156 + }, + { + "epoch": 0.07080667702537305, + "grad_norm": 0.52734375, + "learning_rate": 0.00197491708340802, + "loss": 0.1543, + "step": 8157 + }, + { + "epoch": 0.07081535750557721, + "grad_norm": 0.15234375, + "learning_rate": 0.0019749101053577424, + "loss": 0.1465, + "step": 8158 + }, + { + "epoch": 0.07082403798578138, + "grad_norm": 0.091796875, + "learning_rate": 0.0019749031263506765, + "loss": 0.1387, + "step": 8159 + }, + { + "epoch": 0.07083271846598554, + "grad_norm": 0.150390625, + "learning_rate": 0.0019748961463868296, + "loss": 0.1406, + "step": 8160 + }, + { + "epoch": 0.07084139894618971, + "grad_norm": 0.3671875, + "learning_rate": 0.0019748891654662094, + "loss": 0.0967, + "step": 8161 + }, + { + "epoch": 0.07085007942639387, + "grad_norm": 0.625, + "learning_rate": 0.0019748821835888237, + "loss": 0.1318, + "step": 8162 + }, + { + "epoch": 0.07085875990659803, + "grad_norm": 0.37890625, + "learning_rate": 0.00197487520075468, + "loss": 0.1406, + "step": 8163 + }, + { + "epoch": 0.07086744038680219, + "grad_norm": 0.640625, + "learning_rate": 0.0019748682169637858, + "loss": 0.1914, + "step": 8164 + }, + { + "epoch": 0.07087612086700636, + "grad_norm": 0.13671875, + "learning_rate": 0.0019748612322161487, + "loss": 0.0835, + "step": 8165 + }, + { + "epoch": 0.07088480134721052, + "grad_norm": 0.43359375, + "learning_rate": 0.0019748542465117765, + "loss": 0.1123, + "step": 8166 + }, + { + "epoch": 0.07089348182741469, + "grad_norm": 0.09912109375, + "learning_rate": 0.001974847259850677, + "loss": 0.1084, + "step": 8167 + }, + { + "epoch": 0.07090216230761885, + "grad_norm": 0.072265625, + "learning_rate": 0.001974840272232858, + "loss": 0.1094, + "step": 8168 + }, + { + "epoch": 
0.07091084278782302, + "grad_norm": 0.1455078125, + "learning_rate": 0.001974833283658326, + "loss": 0.1709, + "step": 8169 + }, + { + "epoch": 0.07091952326802718, + "grad_norm": 0.625, + "learning_rate": 0.0019748262941270906, + "loss": 0.1523, + "step": 8170 + }, + { + "epoch": 0.07092820374823135, + "grad_norm": 0.138671875, + "learning_rate": 0.0019748193036391577, + "loss": 0.126, + "step": 8171 + }, + { + "epoch": 0.07093688422843551, + "grad_norm": 0.1669921875, + "learning_rate": 0.0019748123121945352, + "loss": 0.1133, + "step": 8172 + }, + { + "epoch": 0.07094556470863968, + "grad_norm": 0.30859375, + "learning_rate": 0.0019748053197932314, + "loss": 0.1113, + "step": 8173 + }, + { + "epoch": 0.07095424518884384, + "grad_norm": 0.69921875, + "learning_rate": 0.0019747983264352537, + "loss": 0.1006, + "step": 8174 + }, + { + "epoch": 0.07096292566904801, + "grad_norm": 0.32421875, + "learning_rate": 0.0019747913321206097, + "loss": 0.2168, + "step": 8175 + }, + { + "epoch": 0.07097160614925217, + "grad_norm": 0.25, + "learning_rate": 0.0019747843368493074, + "loss": 0.1201, + "step": 8176 + }, + { + "epoch": 0.07098028662945634, + "grad_norm": 0.302734375, + "learning_rate": 0.001974777340621353, + "loss": 0.1348, + "step": 8177 + }, + { + "epoch": 0.0709889671096605, + "grad_norm": 0.353515625, + "learning_rate": 0.0019747703434367567, + "loss": 0.123, + "step": 8178 + }, + { + "epoch": 0.07099764758986467, + "grad_norm": 0.474609375, + "learning_rate": 0.0019747633452955235, + "loss": 0.1396, + "step": 8179 + }, + { + "epoch": 0.07100632807006883, + "grad_norm": 0.10986328125, + "learning_rate": 0.001974756346197663, + "loss": 0.1191, + "step": 8180 + }, + { + "epoch": 0.071015008550273, + "grad_norm": 0.90625, + "learning_rate": 0.001974749346143182, + "loss": 0.1729, + "step": 8181 + }, + { + "epoch": 0.07102368903047716, + "grad_norm": 0.12890625, + "learning_rate": 0.0019747423451320877, + "loss": 0.1338, + "step": 8182 + }, + { + "epoch": 0.07103236951068133, + "grad_norm": 0.8984375, + "learning_rate": 0.0019747353431643884, + "loss": 0.1416, + "step": 8183 + }, + { + "epoch": 0.0710410499908855, + "grad_norm": 0.27734375, + "learning_rate": 0.001974728340240092, + "loss": 0.0986, + "step": 8184 + }, + { + "epoch": 0.07104973047108966, + "grad_norm": 0.66796875, + "learning_rate": 0.001974721336359206, + "loss": 0.1523, + "step": 8185 + }, + { + "epoch": 0.07105841095129382, + "grad_norm": 0.125, + "learning_rate": 0.001974714331521738, + "loss": 0.2012, + "step": 8186 + }, + { + "epoch": 0.07106709143149799, + "grad_norm": 0.388671875, + "learning_rate": 0.0019747073257276953, + "loss": 0.125, + "step": 8187 + }, + { + "epoch": 0.07107577191170215, + "grad_norm": 0.640625, + "learning_rate": 0.0019747003189770854, + "loss": 0.1162, + "step": 8188 + }, + { + "epoch": 0.07108445239190632, + "grad_norm": 0.1396484375, + "learning_rate": 0.001974693311269917, + "loss": 0.1387, + "step": 8189 + }, + { + "epoch": 0.07109313287211048, + "grad_norm": 0.7578125, + "learning_rate": 0.0019746863026061964, + "loss": 0.1123, + "step": 8190 + }, + { + "epoch": 0.07110181335231465, + "grad_norm": 0.08935546875, + "learning_rate": 0.0019746792929859324, + "loss": 0.1279, + "step": 8191 + }, + { + "epoch": 0.07111049383251881, + "grad_norm": 0.400390625, + "learning_rate": 0.0019746722824091323, + "loss": 0.1426, + "step": 8192 + }, + { + "epoch": 0.07111917431272298, + "grad_norm": 0.1572265625, + "learning_rate": 0.001974665270875804, + "loss": 0.1621, + "step": 8193 + }, + { + 
"epoch": 0.07112785479292714, + "grad_norm": 0.734375, + "learning_rate": 0.0019746582583859544, + "loss": 0.1084, + "step": 8194 + }, + { + "epoch": 0.07113653527313131, + "grad_norm": 0.10107421875, + "learning_rate": 0.001974651244939592, + "loss": 0.168, + "step": 8195 + }, + { + "epoch": 0.07114521575333548, + "grad_norm": 1.03125, + "learning_rate": 0.001974644230536724, + "loss": 0.1436, + "step": 8196 + }, + { + "epoch": 0.07115389623353964, + "grad_norm": 0.11865234375, + "learning_rate": 0.0019746372151773586, + "loss": 0.1157, + "step": 8197 + }, + { + "epoch": 0.0711625767137438, + "grad_norm": 0.314453125, + "learning_rate": 0.0019746301988615026, + "loss": 0.1221, + "step": 8198 + }, + { + "epoch": 0.07117125719394797, + "grad_norm": 0.85546875, + "learning_rate": 0.0019746231815891643, + "loss": 0.1025, + "step": 8199 + }, + { + "epoch": 0.07117993767415214, + "grad_norm": 0.2353515625, + "learning_rate": 0.001974616163360351, + "loss": 0.1826, + "step": 8200 + }, + { + "epoch": 0.0711886181543563, + "grad_norm": 0.09521484375, + "learning_rate": 0.001974609144175071, + "loss": 0.1201, + "step": 8201 + }, + { + "epoch": 0.07119729863456047, + "grad_norm": 0.97265625, + "learning_rate": 0.0019746021240333316, + "loss": 0.1348, + "step": 8202 + }, + { + "epoch": 0.07120597911476463, + "grad_norm": 0.1630859375, + "learning_rate": 0.00197459510293514, + "loss": 0.1133, + "step": 8203 + }, + { + "epoch": 0.0712146595949688, + "grad_norm": 0.10986328125, + "learning_rate": 0.0019745880808805048, + "loss": 0.125, + "step": 8204 + }, + { + "epoch": 0.07122334007517296, + "grad_norm": 0.1630859375, + "learning_rate": 0.001974581057869433, + "loss": 0.1416, + "step": 8205 + }, + { + "epoch": 0.07123202055537713, + "grad_norm": 0.314453125, + "learning_rate": 0.0019745740339019327, + "loss": 0.1113, + "step": 8206 + }, + { + "epoch": 0.07124070103558129, + "grad_norm": 0.6640625, + "learning_rate": 0.0019745670089780106, + "loss": 0.1523, + "step": 8207 + }, + { + "epoch": 0.07124938151578546, + "grad_norm": 0.171875, + "learning_rate": 0.001974559983097676, + "loss": 0.1436, + "step": 8208 + }, + { + "epoch": 0.07125806199598962, + "grad_norm": 0.578125, + "learning_rate": 0.0019745529562609355, + "loss": 0.0967, + "step": 8209 + }, + { + "epoch": 0.07126674247619379, + "grad_norm": 0.6171875, + "learning_rate": 0.0019745459284677973, + "loss": 0.1113, + "step": 8210 + }, + { + "epoch": 0.07127542295639795, + "grad_norm": 0.7421875, + "learning_rate": 0.0019745388997182683, + "loss": 0.1562, + "step": 8211 + }, + { + "epoch": 0.07128410343660212, + "grad_norm": 0.2451171875, + "learning_rate": 0.0019745318700123567, + "loss": 0.0933, + "step": 8212 + }, + { + "epoch": 0.07129278391680628, + "grad_norm": 0.333984375, + "learning_rate": 0.001974524839350071, + "loss": 0.1118, + "step": 8213 + }, + { + "epoch": 0.07130146439701045, + "grad_norm": 0.251953125, + "learning_rate": 0.001974517807731417, + "loss": 0.125, + "step": 8214 + }, + { + "epoch": 0.07131014487721461, + "grad_norm": 0.431640625, + "learning_rate": 0.0019745107751564044, + "loss": 0.1396, + "step": 8215 + }, + { + "epoch": 0.07131882535741878, + "grad_norm": 0.11279296875, + "learning_rate": 0.0019745037416250394, + "loss": 0.1406, + "step": 8216 + }, + { + "epoch": 0.07132750583762294, + "grad_norm": 0.12158203125, + "learning_rate": 0.0019744967071373305, + "loss": 0.1572, + "step": 8217 + }, + { + "epoch": 0.07133618631782711, + "grad_norm": 0.390625, + "learning_rate": 0.001974489671693285, + "loss": 0.1504, + 
"step": 8218 + }, + { + "epoch": 0.07134486679803127, + "grad_norm": 0.37109375, + "learning_rate": 0.001974482635292911, + "loss": 0.1138, + "step": 8219 + }, + { + "epoch": 0.07135354727823544, + "grad_norm": 0.2119140625, + "learning_rate": 0.001974475597936215, + "loss": 0.123, + "step": 8220 + }, + { + "epoch": 0.0713622277584396, + "grad_norm": 0.70703125, + "learning_rate": 0.0019744685596232068, + "loss": 0.1523, + "step": 8221 + }, + { + "epoch": 0.07137090823864377, + "grad_norm": 0.3515625, + "learning_rate": 0.001974461520353892, + "loss": 0.1934, + "step": 8222 + }, + { + "epoch": 0.07137958871884793, + "grad_norm": 0.796875, + "learning_rate": 0.0019744544801282796, + "loss": 0.1201, + "step": 8223 + }, + { + "epoch": 0.0713882691990521, + "grad_norm": 0.2275390625, + "learning_rate": 0.0019744474389463773, + "loss": 0.1553, + "step": 8224 + }, + { + "epoch": 0.07139694967925625, + "grad_norm": 0.14453125, + "learning_rate": 0.001974440396808192, + "loss": 0.1611, + "step": 8225 + }, + { + "epoch": 0.07140563015946041, + "grad_norm": 1.046875, + "learning_rate": 0.001974433353713732, + "loss": 0.1543, + "step": 8226 + }, + { + "epoch": 0.07141431063966458, + "grad_norm": 0.38671875, + "learning_rate": 0.0019744263096630045, + "loss": 0.1133, + "step": 8227 + }, + { + "epoch": 0.07142299111986875, + "grad_norm": 0.341796875, + "learning_rate": 0.001974419264656018, + "loss": 0.1816, + "step": 8228 + }, + { + "epoch": 0.07143167160007291, + "grad_norm": 0.50390625, + "learning_rate": 0.001974412218692779, + "loss": 0.1055, + "step": 8229 + }, + { + "epoch": 0.07144035208027708, + "grad_norm": 0.216796875, + "learning_rate": 0.001974405171773297, + "loss": 0.1016, + "step": 8230 + }, + { + "epoch": 0.07144903256048124, + "grad_norm": 0.267578125, + "learning_rate": 0.001974398123897578, + "loss": 0.1318, + "step": 8231 + }, + { + "epoch": 0.0714577130406854, + "grad_norm": 0.640625, + "learning_rate": 0.00197439107506563, + "loss": 0.1396, + "step": 8232 + }, + { + "epoch": 0.07146639352088957, + "grad_norm": 0.20703125, + "learning_rate": 0.001974384025277462, + "loss": 0.1064, + "step": 8233 + }, + { + "epoch": 0.07147507400109374, + "grad_norm": 0.4609375, + "learning_rate": 0.00197437697453308, + "loss": 0.1455, + "step": 8234 + }, + { + "epoch": 0.0714837544812979, + "grad_norm": 1.1015625, + "learning_rate": 0.0019743699228324924, + "loss": 0.1523, + "step": 8235 + }, + { + "epoch": 0.07149243496150207, + "grad_norm": 0.74609375, + "learning_rate": 0.0019743628701757074, + "loss": 0.1089, + "step": 8236 + }, + { + "epoch": 0.07150111544170623, + "grad_norm": 0.72265625, + "learning_rate": 0.001974355816562732, + "loss": 0.1533, + "step": 8237 + }, + { + "epoch": 0.0715097959219104, + "grad_norm": 0.498046875, + "learning_rate": 0.001974348761993575, + "loss": 0.1377, + "step": 8238 + }, + { + "epoch": 0.07151847640211456, + "grad_norm": 0.1357421875, + "learning_rate": 0.0019743417064682425, + "loss": 0.1357, + "step": 8239 + }, + { + "epoch": 0.07152715688231873, + "grad_norm": 0.1328125, + "learning_rate": 0.0019743346499867436, + "loss": 0.1465, + "step": 8240 + }, + { + "epoch": 0.07153583736252289, + "grad_norm": 0.2578125, + "learning_rate": 0.001974327592549085, + "loss": 0.1455, + "step": 8241 + }, + { + "epoch": 0.07154451784272706, + "grad_norm": 0.62109375, + "learning_rate": 0.0019743205341552746, + "loss": 0.1211, + "step": 8242 + }, + { + "epoch": 0.07155319832293122, + "grad_norm": 0.37109375, + "learning_rate": 0.001974313474805321, + "loss": 0.1074, + 
"step": 8243 + }, + { + "epoch": 0.07156187880313539, + "grad_norm": 0.462890625, + "learning_rate": 0.0019743064144992313, + "loss": 0.1299, + "step": 8244 + }, + { + "epoch": 0.07157055928333955, + "grad_norm": 0.14453125, + "learning_rate": 0.001974299353237013, + "loss": 0.1318, + "step": 8245 + }, + { + "epoch": 0.07157923976354372, + "grad_norm": 0.337890625, + "learning_rate": 0.0019742922910186743, + "loss": 0.1216, + "step": 8246 + }, + { + "epoch": 0.07158792024374788, + "grad_norm": 0.07763671875, + "learning_rate": 0.0019742852278442223, + "loss": 0.1045, + "step": 8247 + }, + { + "epoch": 0.07159660072395205, + "grad_norm": 0.076171875, + "learning_rate": 0.0019742781637136653, + "loss": 0.1211, + "step": 8248 + }, + { + "epoch": 0.07160528120415621, + "grad_norm": 0.421875, + "learning_rate": 0.001974271098627011, + "loss": 0.1582, + "step": 8249 + }, + { + "epoch": 0.07161396168436038, + "grad_norm": 0.25390625, + "learning_rate": 0.0019742640325842668, + "loss": 0.167, + "step": 8250 + }, + { + "epoch": 0.07162264216456454, + "grad_norm": 0.333984375, + "learning_rate": 0.001974256965585441, + "loss": 0.1689, + "step": 8251 + }, + { + "epoch": 0.07163132264476871, + "grad_norm": 0.302734375, + "learning_rate": 0.0019742498976305406, + "loss": 0.1133, + "step": 8252 + }, + { + "epoch": 0.07164000312497287, + "grad_norm": 0.1513671875, + "learning_rate": 0.0019742428287195736, + "loss": 0.1143, + "step": 8253 + }, + { + "epoch": 0.07164868360517704, + "grad_norm": 0.38671875, + "learning_rate": 0.0019742357588525476, + "loss": 0.104, + "step": 8254 + }, + { + "epoch": 0.0716573640853812, + "grad_norm": 0.1455078125, + "learning_rate": 0.001974228688029471, + "loss": 0.127, + "step": 8255 + }, + { + "epoch": 0.07166604456558537, + "grad_norm": 0.173828125, + "learning_rate": 0.0019742216162503506, + "loss": 0.1758, + "step": 8256 + }, + { + "epoch": 0.07167472504578953, + "grad_norm": 0.0751953125, + "learning_rate": 0.0019742145435151944, + "loss": 0.1299, + "step": 8257 + }, + { + "epoch": 0.0716834055259937, + "grad_norm": 0.1259765625, + "learning_rate": 0.0019742074698240113, + "loss": 0.1309, + "step": 8258 + }, + { + "epoch": 0.07169208600619786, + "grad_norm": 0.314453125, + "learning_rate": 0.0019742003951768072, + "loss": 0.0986, + "step": 8259 + }, + { + "epoch": 0.07170076648640203, + "grad_norm": 0.283203125, + "learning_rate": 0.0019741933195735906, + "loss": 0.1348, + "step": 8260 + }, + { + "epoch": 0.0717094469666062, + "grad_norm": 0.75390625, + "learning_rate": 0.0019741862430143696, + "loss": 0.1191, + "step": 8261 + }, + { + "epoch": 0.07171812744681036, + "grad_norm": 0.130859375, + "learning_rate": 0.0019741791654991517, + "loss": 0.1172, + "step": 8262 + }, + { + "epoch": 0.07172680792701452, + "grad_norm": 0.1025390625, + "learning_rate": 0.0019741720870279445, + "loss": 0.0942, + "step": 8263 + }, + { + "epoch": 0.07173548840721869, + "grad_norm": 0.40234375, + "learning_rate": 0.001974165007600756, + "loss": 0.1348, + "step": 8264 + }, + { + "epoch": 0.07174416888742285, + "grad_norm": 0.236328125, + "learning_rate": 0.0019741579272175938, + "loss": 0.1553, + "step": 8265 + }, + { + "epoch": 0.07175284936762702, + "grad_norm": 0.2470703125, + "learning_rate": 0.0019741508458784654, + "loss": 0.1104, + "step": 8266 + }, + { + "epoch": 0.07176152984783118, + "grad_norm": 0.66015625, + "learning_rate": 0.0019741437635833786, + "loss": 0.1406, + "step": 8267 + }, + { + "epoch": 0.07177021032803535, + "grad_norm": 0.36328125, + "learning_rate": 
0.0019741366803323417, + "loss": 0.1348, + "step": 8268 + }, + { + "epoch": 0.07177889080823951, + "grad_norm": 0.1669921875, + "learning_rate": 0.001974129596125362, + "loss": 0.1406, + "step": 8269 + }, + { + "epoch": 0.07178757128844368, + "grad_norm": 0.2099609375, + "learning_rate": 0.0019741225109624473, + "loss": 0.1074, + "step": 8270 + }, + { + "epoch": 0.07179625176864785, + "grad_norm": 0.1220703125, + "learning_rate": 0.0019741154248436054, + "loss": 0.105, + "step": 8271 + }, + { + "epoch": 0.07180493224885201, + "grad_norm": 0.1953125, + "learning_rate": 0.0019741083377688437, + "loss": 0.1523, + "step": 8272 + }, + { + "epoch": 0.07181361272905618, + "grad_norm": 0.11181640625, + "learning_rate": 0.0019741012497381705, + "loss": 0.1504, + "step": 8273 + }, + { + "epoch": 0.07182229320926034, + "grad_norm": 0.294921875, + "learning_rate": 0.001974094160751593, + "loss": 0.1289, + "step": 8274 + }, + { + "epoch": 0.0718309736894645, + "grad_norm": 0.09033203125, + "learning_rate": 0.0019740870708091197, + "loss": 0.1235, + "step": 8275 + }, + { + "epoch": 0.07183965416966867, + "grad_norm": 0.91796875, + "learning_rate": 0.0019740799799107577, + "loss": 0.1309, + "step": 8276 + }, + { + "epoch": 0.07184833464987284, + "grad_norm": 0.134765625, + "learning_rate": 0.001974072888056515, + "loss": 0.1079, + "step": 8277 + }, + { + "epoch": 0.071857015130077, + "grad_norm": 0.66015625, + "learning_rate": 0.0019740657952463992, + "loss": 0.1338, + "step": 8278 + }, + { + "epoch": 0.07186569561028117, + "grad_norm": 0.578125, + "learning_rate": 0.0019740587014804183, + "loss": 0.125, + "step": 8279 + }, + { + "epoch": 0.07187437609048533, + "grad_norm": 0.12255859375, + "learning_rate": 0.00197405160675858, + "loss": 0.1172, + "step": 8280 + }, + { + "epoch": 0.0718830565706895, + "grad_norm": 0.265625, + "learning_rate": 0.001974044511080892, + "loss": 0.1309, + "step": 8281 + }, + { + "epoch": 0.07189173705089366, + "grad_norm": 0.291015625, + "learning_rate": 0.001974037414447362, + "loss": 0.1562, + "step": 8282 + }, + { + "epoch": 0.07190041753109783, + "grad_norm": 0.158203125, + "learning_rate": 0.0019740303168579976, + "loss": 0.1318, + "step": 8283 + }, + { + "epoch": 0.07190909801130199, + "grad_norm": 0.10888671875, + "learning_rate": 0.001974023218312807, + "loss": 0.1084, + "step": 8284 + }, + { + "epoch": 0.07191777849150616, + "grad_norm": 0.1865234375, + "learning_rate": 0.001974016118811798, + "loss": 0.1226, + "step": 8285 + }, + { + "epoch": 0.07192645897171031, + "grad_norm": 0.28515625, + "learning_rate": 0.001974009018354978, + "loss": 0.1592, + "step": 8286 + }, + { + "epoch": 0.07193513945191447, + "grad_norm": 0.1083984375, + "learning_rate": 0.0019740019169423545, + "loss": 0.1445, + "step": 8287 + }, + { + "epoch": 0.07194381993211864, + "grad_norm": 0.310546875, + "learning_rate": 0.001973994814573936, + "loss": 0.127, + "step": 8288 + }, + { + "epoch": 0.0719525004123228, + "grad_norm": 1.1796875, + "learning_rate": 0.001973987711249729, + "loss": 0.1104, + "step": 8289 + }, + { + "epoch": 0.07196118089252697, + "grad_norm": 0.6328125, + "learning_rate": 0.0019739806069697433, + "loss": 0.1357, + "step": 8290 + }, + { + "epoch": 0.07196986137273113, + "grad_norm": 0.5390625, + "learning_rate": 0.0019739735017339855, + "loss": 0.1133, + "step": 8291 + }, + { + "epoch": 0.0719785418529353, + "grad_norm": 0.2412109375, + "learning_rate": 0.001973966395542463, + "loss": 0.0801, + "step": 8292 + }, + { + "epoch": 0.07198722233313946, + "grad_norm": 
0.189453125, + "learning_rate": 0.001973959288395184, + "loss": 0.0986, + "step": 8293 + }, + { + "epoch": 0.07199590281334363, + "grad_norm": 0.2158203125, + "learning_rate": 0.001973952180292156, + "loss": 0.1055, + "step": 8294 + }, + { + "epoch": 0.0720045832935478, + "grad_norm": 0.40625, + "learning_rate": 0.0019739450712333876, + "loss": 0.1011, + "step": 8295 + }, + { + "epoch": 0.07201326377375196, + "grad_norm": 0.474609375, + "learning_rate": 0.0019739379612188856, + "loss": 0.1484, + "step": 8296 + }, + { + "epoch": 0.07202194425395612, + "grad_norm": 0.453125, + "learning_rate": 0.001973930850248658, + "loss": 0.082, + "step": 8297 + }, + { + "epoch": 0.07203062473416029, + "grad_norm": 0.78125, + "learning_rate": 0.001973923738322713, + "loss": 0.1045, + "step": 8298 + }, + { + "epoch": 0.07203930521436445, + "grad_norm": 0.37109375, + "learning_rate": 0.0019739166254410584, + "loss": 0.1406, + "step": 8299 + }, + { + "epoch": 0.07204798569456862, + "grad_norm": 0.54296875, + "learning_rate": 0.0019739095116037015, + "loss": 0.1299, + "step": 8300 + }, + { + "epoch": 0.07205666617477278, + "grad_norm": 0.640625, + "learning_rate": 0.0019739023968106503, + "loss": 0.0957, + "step": 8301 + }, + { + "epoch": 0.07206534665497695, + "grad_norm": 0.9921875, + "learning_rate": 0.0019738952810619127, + "loss": 0.1484, + "step": 8302 + }, + { + "epoch": 0.07207402713518112, + "grad_norm": 0.07470703125, + "learning_rate": 0.001973888164357496, + "loss": 0.125, + "step": 8303 + }, + { + "epoch": 0.07208270761538528, + "grad_norm": 0.451171875, + "learning_rate": 0.0019738810466974088, + "loss": 0.1152, + "step": 8304 + }, + { + "epoch": 0.07209138809558945, + "grad_norm": 0.1728515625, + "learning_rate": 0.0019738739280816577, + "loss": 0.1182, + "step": 8305 + }, + { + "epoch": 0.07210006857579361, + "grad_norm": 0.134765625, + "learning_rate": 0.0019738668085102517, + "loss": 0.126, + "step": 8306 + }, + { + "epoch": 0.07210874905599778, + "grad_norm": 0.2197265625, + "learning_rate": 0.0019738596879831983, + "loss": 0.1377, + "step": 8307 + }, + { + "epoch": 0.07211742953620194, + "grad_norm": 0.189453125, + "learning_rate": 0.001973852566500505, + "loss": 0.1289, + "step": 8308 + }, + { + "epoch": 0.0721261100164061, + "grad_norm": 0.109375, + "learning_rate": 0.001973845444062179, + "loss": 0.1016, + "step": 8309 + }, + { + "epoch": 0.07213479049661027, + "grad_norm": 0.1787109375, + "learning_rate": 0.0019738383206682297, + "loss": 0.1523, + "step": 8310 + }, + { + "epoch": 0.07214347097681444, + "grad_norm": 0.255859375, + "learning_rate": 0.0019738311963186637, + "loss": 0.1289, + "step": 8311 + }, + { + "epoch": 0.0721521514570186, + "grad_norm": 0.1552734375, + "learning_rate": 0.0019738240710134884, + "loss": 0.1514, + "step": 8312 + }, + { + "epoch": 0.07216083193722277, + "grad_norm": 0.058837890625, + "learning_rate": 0.0019738169447527125, + "loss": 0.1001, + "step": 8313 + }, + { + "epoch": 0.07216951241742693, + "grad_norm": 0.31640625, + "learning_rate": 0.0019738098175363437, + "loss": 0.1484, + "step": 8314 + }, + { + "epoch": 0.0721781928976311, + "grad_norm": 0.2275390625, + "learning_rate": 0.0019738026893643896, + "loss": 0.1001, + "step": 8315 + }, + { + "epoch": 0.07218687337783526, + "grad_norm": 0.490234375, + "learning_rate": 0.001973795560236858, + "loss": 0.1484, + "step": 8316 + }, + { + "epoch": 0.07219555385803943, + "grad_norm": 0.2236328125, + "learning_rate": 0.0019737884301537566, + "loss": 0.1191, + "step": 8317 + }, + { + "epoch": 
0.07220423433824359, + "grad_norm": 0.1279296875, + "learning_rate": 0.0019737812991150933, + "loss": 0.1357, + "step": 8318 + }, + { + "epoch": 0.07221291481844776, + "grad_norm": 0.275390625, + "learning_rate": 0.001973774167120876, + "loss": 0.1143, + "step": 8319 + }, + { + "epoch": 0.07222159529865192, + "grad_norm": 0.216796875, + "learning_rate": 0.0019737670341711125, + "loss": 0.1187, + "step": 8320 + }, + { + "epoch": 0.07223027577885609, + "grad_norm": 0.55078125, + "learning_rate": 0.00197375990026581, + "loss": 0.127, + "step": 8321 + }, + { + "epoch": 0.07223895625906025, + "grad_norm": 0.255859375, + "learning_rate": 0.0019737527654049773, + "loss": 0.127, + "step": 8322 + }, + { + "epoch": 0.07224763673926442, + "grad_norm": 0.640625, + "learning_rate": 0.001973745629588622, + "loss": 0.1123, + "step": 8323 + }, + { + "epoch": 0.07225631721946858, + "grad_norm": 0.1240234375, + "learning_rate": 0.0019737384928167506, + "loss": 0.1172, + "step": 8324 + }, + { + "epoch": 0.07226499769967275, + "grad_norm": 0.2431640625, + "learning_rate": 0.001973731355089373, + "loss": 0.1484, + "step": 8325 + }, + { + "epoch": 0.07227367817987691, + "grad_norm": 0.259765625, + "learning_rate": 0.001973724216406495, + "loss": 0.1133, + "step": 8326 + }, + { + "epoch": 0.07228235866008108, + "grad_norm": 0.1328125, + "learning_rate": 0.001973717076768126, + "loss": 0.1416, + "step": 8327 + }, + { + "epoch": 0.07229103914028524, + "grad_norm": 0.283203125, + "learning_rate": 0.001973709936174273, + "loss": 0.1504, + "step": 8328 + }, + { + "epoch": 0.07229971962048941, + "grad_norm": 0.470703125, + "learning_rate": 0.0019737027946249435, + "loss": 0.1045, + "step": 8329 + }, + { + "epoch": 0.07230840010069357, + "grad_norm": 0.1611328125, + "learning_rate": 0.001973695652120146, + "loss": 0.127, + "step": 8330 + }, + { + "epoch": 0.07231708058089774, + "grad_norm": 0.130859375, + "learning_rate": 0.0019736885086598884, + "loss": 0.1289, + "step": 8331 + }, + { + "epoch": 0.0723257610611019, + "grad_norm": 1.078125, + "learning_rate": 0.001973681364244178, + "loss": 0.1816, + "step": 8332 + }, + { + "epoch": 0.07233444154130607, + "grad_norm": 0.8203125, + "learning_rate": 0.0019736742188730225, + "loss": 0.1221, + "step": 8333 + }, + { + "epoch": 0.07234312202151023, + "grad_norm": 0.193359375, + "learning_rate": 0.0019736670725464304, + "loss": 0.1211, + "step": 8334 + }, + { + "epoch": 0.0723518025017144, + "grad_norm": 0.7265625, + "learning_rate": 0.0019736599252644085, + "loss": 0.1162, + "step": 8335 + }, + { + "epoch": 0.07236048298191856, + "grad_norm": 0.60546875, + "learning_rate": 0.0019736527770269656, + "loss": 0.1553, + "step": 8336 + }, + { + "epoch": 0.07236916346212273, + "grad_norm": 0.53125, + "learning_rate": 0.0019736456278341094, + "loss": 0.1523, + "step": 8337 + }, + { + "epoch": 0.0723778439423269, + "grad_norm": 1.2421875, + "learning_rate": 0.0019736384776858473, + "loss": 0.1367, + "step": 8338 + }, + { + "epoch": 0.07238652442253106, + "grad_norm": 0.1630859375, + "learning_rate": 0.001973631326582187, + "loss": 0.1235, + "step": 8339 + }, + { + "epoch": 0.07239520490273522, + "grad_norm": 0.5625, + "learning_rate": 0.001973624174523137, + "loss": 0.1406, + "step": 8340 + }, + { + "epoch": 0.07240388538293939, + "grad_norm": 0.7109375, + "learning_rate": 0.0019736170215087045, + "loss": 0.1387, + "step": 8341 + }, + { + "epoch": 0.07241256586314355, + "grad_norm": 0.4140625, + "learning_rate": 0.001973609867538898, + "loss": 0.1289, + "step": 8342 + }, + { + 
"epoch": 0.07242124634334772, + "grad_norm": 0.28515625, + "learning_rate": 0.001973602712613724, + "loss": 0.1289, + "step": 8343 + }, + { + "epoch": 0.07242992682355189, + "grad_norm": 0.283203125, + "learning_rate": 0.001973595556733192, + "loss": 0.1309, + "step": 8344 + }, + { + "epoch": 0.07243860730375605, + "grad_norm": 0.609375, + "learning_rate": 0.0019735883998973085, + "loss": 0.1377, + "step": 8345 + }, + { + "epoch": 0.07244728778396022, + "grad_norm": 0.134765625, + "learning_rate": 0.001973581242106082, + "loss": 0.1445, + "step": 8346 + }, + { + "epoch": 0.07245596826416438, + "grad_norm": 0.5625, + "learning_rate": 0.0019735740833595203, + "loss": 0.1436, + "step": 8347 + }, + { + "epoch": 0.07246464874436853, + "grad_norm": 0.390625, + "learning_rate": 0.0019735669236576312, + "loss": 0.1289, + "step": 8348 + }, + { + "epoch": 0.0724733292245727, + "grad_norm": 0.6328125, + "learning_rate": 0.001973559763000422, + "loss": 0.1514, + "step": 8349 + }, + { + "epoch": 0.07248200970477686, + "grad_norm": 0.330078125, + "learning_rate": 0.0019735526013879017, + "loss": 0.1689, + "step": 8350 + }, + { + "epoch": 0.07249069018498103, + "grad_norm": 0.232421875, + "learning_rate": 0.001973545438820077, + "loss": 0.0981, + "step": 8351 + }, + { + "epoch": 0.07249937066518519, + "grad_norm": 0.279296875, + "learning_rate": 0.0019735382752969557, + "loss": 0.1465, + "step": 8352 + }, + { + "epoch": 0.07250805114538936, + "grad_norm": 0.68359375, + "learning_rate": 0.0019735311108185468, + "loss": 0.1348, + "step": 8353 + }, + { + "epoch": 0.07251673162559352, + "grad_norm": 0.890625, + "learning_rate": 0.001973523945384857, + "loss": 0.1396, + "step": 8354 + }, + { + "epoch": 0.07252541210579769, + "grad_norm": 0.373046875, + "learning_rate": 0.0019735167789958945, + "loss": 0.1406, + "step": 8355 + }, + { + "epoch": 0.07253409258600185, + "grad_norm": 0.3671875, + "learning_rate": 0.0019735096116516673, + "loss": 0.1553, + "step": 8356 + }, + { + "epoch": 0.07254277306620602, + "grad_norm": 0.37109375, + "learning_rate": 0.0019735024433521833, + "loss": 0.1162, + "step": 8357 + }, + { + "epoch": 0.07255145354641018, + "grad_norm": 0.138671875, + "learning_rate": 0.00197349527409745, + "loss": 0.1201, + "step": 8358 + }, + { + "epoch": 0.07256013402661435, + "grad_norm": 0.625, + "learning_rate": 0.0019734881038874752, + "loss": 0.1113, + "step": 8359 + }, + { + "epoch": 0.07256881450681851, + "grad_norm": 0.0927734375, + "learning_rate": 0.0019734809327222667, + "loss": 0.1089, + "step": 8360 + }, + { + "epoch": 0.07257749498702268, + "grad_norm": 0.4453125, + "learning_rate": 0.001973473760601833, + "loss": 0.1318, + "step": 8361 + }, + { + "epoch": 0.07258617546722684, + "grad_norm": 0.359375, + "learning_rate": 0.001973466587526182, + "loss": 0.1289, + "step": 8362 + }, + { + "epoch": 0.07259485594743101, + "grad_norm": 0.255859375, + "learning_rate": 0.0019734594134953203, + "loss": 0.0884, + "step": 8363 + }, + { + "epoch": 0.07260353642763517, + "grad_norm": 0.07470703125, + "learning_rate": 0.0019734522385092567, + "loss": 0.1338, + "step": 8364 + }, + { + "epoch": 0.07261221690783934, + "grad_norm": 0.1689453125, + "learning_rate": 0.001973445062567999, + "loss": 0.0898, + "step": 8365 + }, + { + "epoch": 0.0726208973880435, + "grad_norm": 0.337890625, + "learning_rate": 0.0019734378856715545, + "loss": 0.0957, + "step": 8366 + }, + { + "epoch": 0.07262957786824767, + "grad_norm": 0.10498046875, + "learning_rate": 0.001973430707819932, + "loss": 0.1914, + "step": 8367 + }, 
+ { + "epoch": 0.07263825834845183, + "grad_norm": 0.419921875, + "learning_rate": 0.001973423529013138, + "loss": 0.1079, + "step": 8368 + }, + { + "epoch": 0.072646938828656, + "grad_norm": 0.265625, + "learning_rate": 0.001973416349251182, + "loss": 0.1172, + "step": 8369 + }, + { + "epoch": 0.07265561930886016, + "grad_norm": 0.216796875, + "learning_rate": 0.0019734091685340706, + "loss": 0.1523, + "step": 8370 + }, + { + "epoch": 0.07266429978906433, + "grad_norm": 0.07373046875, + "learning_rate": 0.0019734019868618124, + "loss": 0.0977, + "step": 8371 + }, + { + "epoch": 0.0726729802692685, + "grad_norm": 0.3046875, + "learning_rate": 0.0019733948042344146, + "loss": 0.1289, + "step": 8372 + }, + { + "epoch": 0.07268166074947266, + "grad_norm": 0.07958984375, + "learning_rate": 0.001973387620651886, + "loss": 0.1338, + "step": 8373 + }, + { + "epoch": 0.07269034122967682, + "grad_norm": 0.19921875, + "learning_rate": 0.001973380436114233, + "loss": 0.1162, + "step": 8374 + }, + { + "epoch": 0.07269902170988099, + "grad_norm": 0.10107421875, + "learning_rate": 0.001973373250621464, + "loss": 0.1377, + "step": 8375 + }, + { + "epoch": 0.07270770219008516, + "grad_norm": 0.21484375, + "learning_rate": 0.001973366064173588, + "loss": 0.1138, + "step": 8376 + }, + { + "epoch": 0.07271638267028932, + "grad_norm": 0.63671875, + "learning_rate": 0.001973358876770612, + "loss": 0.0889, + "step": 8377 + }, + { + "epoch": 0.07272506315049349, + "grad_norm": 0.1318359375, + "learning_rate": 0.0019733516884125435, + "loss": 0.0771, + "step": 8378 + }, + { + "epoch": 0.07273374363069765, + "grad_norm": 0.4453125, + "learning_rate": 0.0019733444990993904, + "loss": 0.1309, + "step": 8379 + }, + { + "epoch": 0.07274242411090182, + "grad_norm": 0.38671875, + "learning_rate": 0.001973337308831161, + "loss": 0.1016, + "step": 8380 + }, + { + "epoch": 0.07275110459110598, + "grad_norm": 0.26171875, + "learning_rate": 0.0019733301176078637, + "loss": 0.1001, + "step": 8381 + }, + { + "epoch": 0.07275978507131015, + "grad_norm": 0.84765625, + "learning_rate": 0.001973322925429505, + "loss": 0.1543, + "step": 8382 + }, + { + "epoch": 0.07276846555151431, + "grad_norm": 0.205078125, + "learning_rate": 0.0019733157322960938, + "loss": 0.1523, + "step": 8383 + }, + { + "epoch": 0.07277714603171848, + "grad_norm": 0.34765625, + "learning_rate": 0.0019733085382076373, + "loss": 0.1377, + "step": 8384 + }, + { + "epoch": 0.07278582651192264, + "grad_norm": 0.6796875, + "learning_rate": 0.001973301343164144, + "loss": 0.1611, + "step": 8385 + }, + { + "epoch": 0.0727945069921268, + "grad_norm": 0.671875, + "learning_rate": 0.0019732941471656217, + "loss": 0.1201, + "step": 8386 + }, + { + "epoch": 0.07280318747233097, + "grad_norm": 1.9609375, + "learning_rate": 0.0019732869502120777, + "loss": 0.127, + "step": 8387 + }, + { + "epoch": 0.07281186795253514, + "grad_norm": 1.0703125, + "learning_rate": 0.0019732797523035204, + "loss": 0.1338, + "step": 8388 + }, + { + "epoch": 0.0728205484327393, + "grad_norm": 0.97265625, + "learning_rate": 0.0019732725534399574, + "loss": 0.4219, + "step": 8389 + }, + { + "epoch": 0.07282922891294347, + "grad_norm": 0.63671875, + "learning_rate": 0.0019732653536213966, + "loss": 0.123, + "step": 8390 + }, + { + "epoch": 0.07283790939314763, + "grad_norm": 0.35546875, + "learning_rate": 0.001973258152847846, + "loss": 0.1309, + "step": 8391 + }, + { + "epoch": 0.0728465898733518, + "grad_norm": 0.1455078125, + "learning_rate": 0.0019732509511193133, + "loss": 0.1631, + "step": 
8392 + }, + { + "epoch": 0.07285527035355596, + "grad_norm": 0.1611328125, + "learning_rate": 0.0019732437484358068, + "loss": 0.1387, + "step": 8393 + }, + { + "epoch": 0.07286395083376013, + "grad_norm": 0.30078125, + "learning_rate": 0.0019732365447973333, + "loss": 0.1152, + "step": 8394 + }, + { + "epoch": 0.07287263131396429, + "grad_norm": 0.1455078125, + "learning_rate": 0.0019732293402039023, + "loss": 0.1973, + "step": 8395 + }, + { + "epoch": 0.07288131179416846, + "grad_norm": 0.30859375, + "learning_rate": 0.0019732221346555204, + "loss": 0.1592, + "step": 8396 + }, + { + "epoch": 0.07288999227437262, + "grad_norm": 0.390625, + "learning_rate": 0.0019732149281521958, + "loss": 0.1162, + "step": 8397 + }, + { + "epoch": 0.07289867275457679, + "grad_norm": 0.5546875, + "learning_rate": 0.001973207720693937, + "loss": 0.1221, + "step": 8398 + }, + { + "epoch": 0.07290735323478095, + "grad_norm": 0.1982421875, + "learning_rate": 0.001973200512280751, + "loss": 0.166, + "step": 8399 + }, + { + "epoch": 0.07291603371498512, + "grad_norm": 0.37890625, + "learning_rate": 0.0019731933029126457, + "loss": 0.1396, + "step": 8400 + }, + { + "epoch": 0.07292471419518928, + "grad_norm": 0.392578125, + "learning_rate": 0.00197318609258963, + "loss": 0.0835, + "step": 8401 + }, + { + "epoch": 0.07293339467539345, + "grad_norm": 0.1591796875, + "learning_rate": 0.0019731788813117106, + "loss": 0.166, + "step": 8402 + }, + { + "epoch": 0.07294207515559761, + "grad_norm": 0.14453125, + "learning_rate": 0.001973171669078896, + "loss": 0.1299, + "step": 8403 + }, + { + "epoch": 0.07295075563580178, + "grad_norm": 0.51171875, + "learning_rate": 0.001973164455891194, + "loss": 0.1338, + "step": 8404 + }, + { + "epoch": 0.07295943611600594, + "grad_norm": 0.091796875, + "learning_rate": 0.0019731572417486126, + "loss": 0.1816, + "step": 8405 + }, + { + "epoch": 0.07296811659621011, + "grad_norm": 0.73828125, + "learning_rate": 0.0019731500266511595, + "loss": 0.1641, + "step": 8406 + }, + { + "epoch": 0.07297679707641427, + "grad_norm": 0.28125, + "learning_rate": 0.0019731428105988426, + "loss": 0.1221, + "step": 8407 + }, + { + "epoch": 0.07298547755661844, + "grad_norm": 0.4609375, + "learning_rate": 0.0019731355935916697, + "loss": 0.0947, + "step": 8408 + }, + { + "epoch": 0.0729941580368226, + "grad_norm": 0.1083984375, + "learning_rate": 0.001973128375629649, + "loss": 0.1338, + "step": 8409 + }, + { + "epoch": 0.07300283851702676, + "grad_norm": 0.10107421875, + "learning_rate": 0.001973121156712788, + "loss": 0.1641, + "step": 8410 + }, + { + "epoch": 0.07301151899723092, + "grad_norm": 0.12890625, + "learning_rate": 0.0019731139368410954, + "loss": 0.1123, + "step": 8411 + }, + { + "epoch": 0.07302019947743509, + "grad_norm": 0.08544921875, + "learning_rate": 0.001973106716014578, + "loss": 0.1484, + "step": 8412 + }, + { + "epoch": 0.07302887995763925, + "grad_norm": 0.1650390625, + "learning_rate": 0.0019730994942332445, + "loss": 0.1289, + "step": 8413 + }, + { + "epoch": 0.07303756043784342, + "grad_norm": 0.359375, + "learning_rate": 0.0019730922714971027, + "loss": 0.1426, + "step": 8414 + }, + { + "epoch": 0.07304624091804758, + "grad_norm": 0.73828125, + "learning_rate": 0.0019730850478061595, + "loss": 0.1768, + "step": 8415 + }, + { + "epoch": 0.07305492139825175, + "grad_norm": 0.1865234375, + "learning_rate": 0.001973077823160424, + "loss": 0.166, + "step": 8416 + }, + { + "epoch": 0.07306360187845591, + "grad_norm": 0.734375, + "learning_rate": 0.001973070597559904, + "loss": 
0.1533, + "step": 8417 + }, + { + "epoch": 0.07307228235866008, + "grad_norm": 0.8125, + "learning_rate": 0.001973063371004607, + "loss": 0.1089, + "step": 8418 + }, + { + "epoch": 0.07308096283886424, + "grad_norm": 0.1904296875, + "learning_rate": 0.001973056143494541, + "loss": 0.1074, + "step": 8419 + }, + { + "epoch": 0.0730896433190684, + "grad_norm": 0.384765625, + "learning_rate": 0.0019730489150297137, + "loss": 0.1357, + "step": 8420 + }, + { + "epoch": 0.07309832379927257, + "grad_norm": 0.341796875, + "learning_rate": 0.0019730416856101334, + "loss": 0.1357, + "step": 8421 + }, + { + "epoch": 0.07310700427947674, + "grad_norm": 0.1162109375, + "learning_rate": 0.001973034455235808, + "loss": 0.1387, + "step": 8422 + }, + { + "epoch": 0.0731156847596809, + "grad_norm": 0.259765625, + "learning_rate": 0.0019730272239067447, + "loss": 0.1699, + "step": 8423 + }, + { + "epoch": 0.07312436523988507, + "grad_norm": 0.326171875, + "learning_rate": 0.0019730199916229525, + "loss": 0.207, + "step": 8424 + }, + { + "epoch": 0.07313304572008923, + "grad_norm": 0.42578125, + "learning_rate": 0.0019730127583844383, + "loss": 0.1094, + "step": 8425 + }, + { + "epoch": 0.0731417262002934, + "grad_norm": 0.1552734375, + "learning_rate": 0.0019730055241912107, + "loss": 0.1445, + "step": 8426 + }, + { + "epoch": 0.07315040668049756, + "grad_norm": 0.6640625, + "learning_rate": 0.0019729982890432777, + "loss": 0.1504, + "step": 8427 + }, + { + "epoch": 0.07315908716070173, + "grad_norm": 0.09130859375, + "learning_rate": 0.0019729910529406466, + "loss": 0.1387, + "step": 8428 + }, + { + "epoch": 0.07316776764090589, + "grad_norm": 0.1708984375, + "learning_rate": 0.0019729838158833255, + "loss": 0.1855, + "step": 8429 + }, + { + "epoch": 0.07317644812111006, + "grad_norm": 0.228515625, + "learning_rate": 0.0019729765778713224, + "loss": 0.1221, + "step": 8430 + }, + { + "epoch": 0.07318512860131422, + "grad_norm": 0.7890625, + "learning_rate": 0.0019729693389046454, + "loss": 0.1758, + "step": 8431 + }, + { + "epoch": 0.07319380908151839, + "grad_norm": 0.12353515625, + "learning_rate": 0.001972962098983302, + "loss": 0.1289, + "step": 8432 + }, + { + "epoch": 0.07320248956172255, + "grad_norm": 0.26953125, + "learning_rate": 0.0019729548581073008, + "loss": 0.1152, + "step": 8433 + }, + { + "epoch": 0.07321117004192672, + "grad_norm": 0.111328125, + "learning_rate": 0.0019729476162766487, + "loss": 0.1484, + "step": 8434 + }, + { + "epoch": 0.07321985052213088, + "grad_norm": 0.1982421875, + "learning_rate": 0.001972940373491355, + "loss": 0.1465, + "step": 8435 + }, + { + "epoch": 0.07322853100233505, + "grad_norm": 0.640625, + "learning_rate": 0.001972933129751426, + "loss": 0.1465, + "step": 8436 + }, + { + "epoch": 0.07323721148253921, + "grad_norm": 0.6953125, + "learning_rate": 0.0019729258850568706, + "loss": 0.1309, + "step": 8437 + }, + { + "epoch": 0.07324589196274338, + "grad_norm": 0.228515625, + "learning_rate": 0.0019729186394076966, + "loss": 0.1523, + "step": 8438 + }, + { + "epoch": 0.07325457244294754, + "grad_norm": 0.3203125, + "learning_rate": 0.0019729113928039126, + "loss": 0.1309, + "step": 8439 + }, + { + "epoch": 0.07326325292315171, + "grad_norm": 0.4140625, + "learning_rate": 0.0019729041452455254, + "loss": 0.1641, + "step": 8440 + }, + { + "epoch": 0.07327193340335587, + "grad_norm": 0.203125, + "learning_rate": 0.001972896896732543, + "loss": 0.1953, + "step": 8441 + }, + { + "epoch": 0.07328061388356004, + "grad_norm": 1.0390625, + "learning_rate": 
0.0019728896472649736, + "loss": 0.2227, + "step": 8442 + }, + { + "epoch": 0.0732892943637642, + "grad_norm": 0.0859375, + "learning_rate": 0.0019728823968428255, + "loss": 0.1201, + "step": 8443 + }, + { + "epoch": 0.07329797484396837, + "grad_norm": 1.46875, + "learning_rate": 0.0019728751454661064, + "loss": 0.1113, + "step": 8444 + }, + { + "epoch": 0.07330665532417253, + "grad_norm": 0.154296875, + "learning_rate": 0.001972867893134824, + "loss": 0.1445, + "step": 8445 + }, + { + "epoch": 0.0733153358043767, + "grad_norm": 0.83984375, + "learning_rate": 0.0019728606398489866, + "loss": 0.1279, + "step": 8446 + }, + { + "epoch": 0.07332401628458086, + "grad_norm": 0.24609375, + "learning_rate": 0.0019728533856086016, + "loss": 0.1035, + "step": 8447 + }, + { + "epoch": 0.07333269676478503, + "grad_norm": 0.85546875, + "learning_rate": 0.0019728461304136776, + "loss": 0.1484, + "step": 8448 + }, + { + "epoch": 0.0733413772449892, + "grad_norm": 0.1796875, + "learning_rate": 0.001972838874264222, + "loss": 0.1152, + "step": 8449 + }, + { + "epoch": 0.07335005772519336, + "grad_norm": 0.0908203125, + "learning_rate": 0.0019728316171602426, + "loss": 0.1328, + "step": 8450 + }, + { + "epoch": 0.07335873820539753, + "grad_norm": 0.314453125, + "learning_rate": 0.001972824359101748, + "loss": 0.1807, + "step": 8451 + }, + { + "epoch": 0.07336741868560169, + "grad_norm": 0.224609375, + "learning_rate": 0.001972817100088746, + "loss": 0.1533, + "step": 8452 + }, + { + "epoch": 0.07337609916580586, + "grad_norm": 0.1708984375, + "learning_rate": 0.001972809840121244, + "loss": 0.1289, + "step": 8453 + }, + { + "epoch": 0.07338477964601002, + "grad_norm": 0.16015625, + "learning_rate": 0.0019728025791992503, + "loss": 0.1582, + "step": 8454 + }, + { + "epoch": 0.07339346012621419, + "grad_norm": 0.2734375, + "learning_rate": 0.001972795317322773, + "loss": 0.1182, + "step": 8455 + }, + { + "epoch": 0.07340214060641835, + "grad_norm": 0.63671875, + "learning_rate": 0.0019727880544918195, + "loss": 0.1475, + "step": 8456 + }, + { + "epoch": 0.07341082108662252, + "grad_norm": 0.30078125, + "learning_rate": 0.0019727807907063987, + "loss": 0.2051, + "step": 8457 + }, + { + "epoch": 0.07341950156682668, + "grad_norm": 0.189453125, + "learning_rate": 0.0019727735259665174, + "loss": 0.1357, + "step": 8458 + }, + { + "epoch": 0.07342818204703085, + "grad_norm": 0.1318359375, + "learning_rate": 0.0019727662602721844, + "loss": 0.1328, + "step": 8459 + }, + { + "epoch": 0.07343686252723501, + "grad_norm": 0.35546875, + "learning_rate": 0.001972758993623407, + "loss": 0.1523, + "step": 8460 + }, + { + "epoch": 0.07344554300743918, + "grad_norm": 0.255859375, + "learning_rate": 0.0019727517260201936, + "loss": 0.0938, + "step": 8461 + }, + { + "epoch": 0.07345422348764334, + "grad_norm": 0.16796875, + "learning_rate": 0.0019727444574625524, + "loss": 0.1426, + "step": 8462 + }, + { + "epoch": 0.0734629039678475, + "grad_norm": 0.546875, + "learning_rate": 0.0019727371879504907, + "loss": 0.1631, + "step": 8463 + }, + { + "epoch": 0.07347158444805167, + "grad_norm": 0.11328125, + "learning_rate": 0.0019727299174840166, + "loss": 0.1064, + "step": 8464 + }, + { + "epoch": 0.07348026492825584, + "grad_norm": 0.328125, + "learning_rate": 0.0019727226460631384, + "loss": 0.0869, + "step": 8465 + }, + { + "epoch": 0.07348894540846, + "grad_norm": 0.15625, + "learning_rate": 0.0019727153736878635, + "loss": 0.1426, + "step": 8466 + }, + { + "epoch": 0.07349762588866417, + "grad_norm": 0.279296875, + 
"learning_rate": 0.0019727081003582, + "loss": 0.1201, + "step": 8467 + }, + { + "epoch": 0.07350630636886833, + "grad_norm": 0.228515625, + "learning_rate": 0.001972700826074157, + "loss": 0.1562, + "step": 8468 + }, + { + "epoch": 0.0735149868490725, + "grad_norm": 0.244140625, + "learning_rate": 0.001972693550835741, + "loss": 0.1504, + "step": 8469 + }, + { + "epoch": 0.07352366732927666, + "grad_norm": 0.193359375, + "learning_rate": 0.0019726862746429603, + "loss": 0.125, + "step": 8470 + }, + { + "epoch": 0.07353234780948081, + "grad_norm": 1.0546875, + "learning_rate": 0.0019726789974958228, + "loss": 0.1621, + "step": 8471 + }, + { + "epoch": 0.07354102828968498, + "grad_norm": 0.06494140625, + "learning_rate": 0.0019726717193943367, + "loss": 0.1084, + "step": 8472 + }, + { + "epoch": 0.07354970876988914, + "grad_norm": 0.10888671875, + "learning_rate": 0.0019726644403385107, + "loss": 0.1221, + "step": 8473 + }, + { + "epoch": 0.07355838925009331, + "grad_norm": 0.1611328125, + "learning_rate": 0.001972657160328351, + "loss": 0.1201, + "step": 8474 + }, + { + "epoch": 0.07356706973029747, + "grad_norm": 0.314453125, + "learning_rate": 0.001972649879363867, + "loss": 0.2129, + "step": 8475 + }, + { + "epoch": 0.07357575021050164, + "grad_norm": 0.0859375, + "learning_rate": 0.0019726425974450662, + "loss": 0.0977, + "step": 8476 + }, + { + "epoch": 0.0735844306907058, + "grad_norm": 0.09130859375, + "learning_rate": 0.0019726353145719565, + "loss": 0.1387, + "step": 8477 + }, + { + "epoch": 0.07359311117090997, + "grad_norm": 0.37890625, + "learning_rate": 0.001972628030744546, + "loss": 0.1104, + "step": 8478 + }, + { + "epoch": 0.07360179165111413, + "grad_norm": 0.515625, + "learning_rate": 0.0019726207459628423, + "loss": 0.127, + "step": 8479 + }, + { + "epoch": 0.0736104721313183, + "grad_norm": 3.515625, + "learning_rate": 0.0019726134602268543, + "loss": 0.3281, + "step": 8480 + }, + { + "epoch": 0.07361915261152246, + "grad_norm": 0.73828125, + "learning_rate": 0.0019726061735365888, + "loss": 0.1396, + "step": 8481 + }, + { + "epoch": 0.07362783309172663, + "grad_norm": 0.921875, + "learning_rate": 0.0019725988858920545, + "loss": 0.1172, + "step": 8482 + }, + { + "epoch": 0.0736365135719308, + "grad_norm": 0.162109375, + "learning_rate": 0.001972591597293259, + "loss": 0.1289, + "step": 8483 + }, + { + "epoch": 0.07364519405213496, + "grad_norm": 0.42578125, + "learning_rate": 0.0019725843077402104, + "loss": 0.1104, + "step": 8484 + }, + { + "epoch": 0.07365387453233913, + "grad_norm": 0.1494140625, + "learning_rate": 0.001972577017232917, + "loss": 0.1143, + "step": 8485 + }, + { + "epoch": 0.07366255501254329, + "grad_norm": 0.087890625, + "learning_rate": 0.0019725697257713865, + "loss": 0.1191, + "step": 8486 + }, + { + "epoch": 0.07367123549274746, + "grad_norm": 0.2099609375, + "learning_rate": 0.0019725624333556267, + "loss": 0.1279, + "step": 8487 + }, + { + "epoch": 0.07367991597295162, + "grad_norm": 0.5078125, + "learning_rate": 0.0019725551399856454, + "loss": 0.1006, + "step": 8488 + }, + { + "epoch": 0.07368859645315579, + "grad_norm": 0.27734375, + "learning_rate": 0.0019725478456614513, + "loss": 0.1211, + "step": 8489 + }, + { + "epoch": 0.07369727693335995, + "grad_norm": 0.2197265625, + "learning_rate": 0.001972540550383052, + "loss": 0.1084, + "step": 8490 + }, + { + "epoch": 0.07370595741356412, + "grad_norm": 0.2353515625, + "learning_rate": 0.0019725332541504553, + "loss": 0.1436, + "step": 8491 + }, + { + "epoch": 0.07371463789376828, + 
"grad_norm": 0.291015625, + "learning_rate": 0.0019725259569636695, + "loss": 0.1553, + "step": 8492 + }, + { + "epoch": 0.07372331837397245, + "grad_norm": 0.248046875, + "learning_rate": 0.0019725186588227025, + "loss": 0.1426, + "step": 8493 + }, + { + "epoch": 0.07373199885417661, + "grad_norm": 0.3203125, + "learning_rate": 0.001972511359727562, + "loss": 0.1162, + "step": 8494 + }, + { + "epoch": 0.07374067933438078, + "grad_norm": 0.318359375, + "learning_rate": 0.001972504059678256, + "loss": 0.103, + "step": 8495 + }, + { + "epoch": 0.07374935981458494, + "grad_norm": 0.1044921875, + "learning_rate": 0.0019724967586747934, + "loss": 0.1201, + "step": 8496 + }, + { + "epoch": 0.0737580402947891, + "grad_norm": 0.5703125, + "learning_rate": 0.001972489456717181, + "loss": 0.1279, + "step": 8497 + }, + { + "epoch": 0.07376672077499327, + "grad_norm": 0.16015625, + "learning_rate": 0.001972482153805427, + "loss": 0.1631, + "step": 8498 + }, + { + "epoch": 0.07377540125519744, + "grad_norm": 0.53125, + "learning_rate": 0.00197247484993954, + "loss": 0.1416, + "step": 8499 + }, + { + "epoch": 0.0737840817354016, + "grad_norm": 1.0390625, + "learning_rate": 0.0019724675451195275, + "loss": 0.1123, + "step": 8500 + }, + { + "epoch": 0.07379276221560577, + "grad_norm": 0.1474609375, + "learning_rate": 0.0019724602393453973, + "loss": 0.0903, + "step": 8501 + }, + { + "epoch": 0.07380144269580993, + "grad_norm": 0.53125, + "learning_rate": 0.001972452932617158, + "loss": 0.1855, + "step": 8502 + }, + { + "epoch": 0.0738101231760141, + "grad_norm": 0.3125, + "learning_rate": 0.001972445624934817, + "loss": 0.1455, + "step": 8503 + }, + { + "epoch": 0.07381880365621826, + "grad_norm": 0.279296875, + "learning_rate": 0.0019724383162983833, + "loss": 0.127, + "step": 8504 + }, + { + "epoch": 0.07382748413642243, + "grad_norm": 0.2197265625, + "learning_rate": 0.0019724310067078636, + "loss": 0.1436, + "step": 8505 + }, + { + "epoch": 0.07383616461662659, + "grad_norm": 0.2275390625, + "learning_rate": 0.0019724236961632664, + "loss": 0.1426, + "step": 8506 + }, + { + "epoch": 0.07384484509683076, + "grad_norm": 0.39453125, + "learning_rate": 0.0019724163846646, + "loss": 0.207, + "step": 8507 + }, + { + "epoch": 0.07385352557703492, + "grad_norm": 0.126953125, + "learning_rate": 0.001972409072211872, + "loss": 0.1123, + "step": 8508 + }, + { + "epoch": 0.07386220605723909, + "grad_norm": 0.29296875, + "learning_rate": 0.0019724017588050903, + "loss": 0.1328, + "step": 8509 + }, + { + "epoch": 0.07387088653744325, + "grad_norm": 0.4609375, + "learning_rate": 0.0019723944444442637, + "loss": 0.1309, + "step": 8510 + }, + { + "epoch": 0.07387956701764742, + "grad_norm": 0.08740234375, + "learning_rate": 0.0019723871291293994, + "loss": 0.1543, + "step": 8511 + }, + { + "epoch": 0.07388824749785158, + "grad_norm": 0.12109375, + "learning_rate": 0.0019723798128605057, + "loss": 0.1357, + "step": 8512 + }, + { + "epoch": 0.07389692797805575, + "grad_norm": 0.1669921875, + "learning_rate": 0.0019723724956375904, + "loss": 0.1396, + "step": 8513 + }, + { + "epoch": 0.07390560845825991, + "grad_norm": 0.76171875, + "learning_rate": 0.0019723651774606614, + "loss": 0.1455, + "step": 8514 + }, + { + "epoch": 0.07391428893846408, + "grad_norm": 0.203125, + "learning_rate": 0.0019723578583297273, + "loss": 0.1113, + "step": 8515 + }, + { + "epoch": 0.07392296941866824, + "grad_norm": 0.361328125, + "learning_rate": 0.001972350538244796, + "loss": 0.1309, + "step": 8516 + }, + { + "epoch": 
0.07393164989887241, + "grad_norm": 0.162109375, + "learning_rate": 0.001972343217205875, + "loss": 0.1143, + "step": 8517 + }, + { + "epoch": 0.07394033037907657, + "grad_norm": 0.33984375, + "learning_rate": 0.0019723358952129723, + "loss": 0.123, + "step": 8518 + }, + { + "epoch": 0.07394901085928074, + "grad_norm": 0.443359375, + "learning_rate": 0.0019723285722660962, + "loss": 0.1602, + "step": 8519 + }, + { + "epoch": 0.0739576913394849, + "grad_norm": 0.412109375, + "learning_rate": 0.001972321248365255, + "loss": 0.1128, + "step": 8520 + }, + { + "epoch": 0.07396637181968907, + "grad_norm": 0.78515625, + "learning_rate": 0.0019723139235104563, + "loss": 0.1201, + "step": 8521 + }, + { + "epoch": 0.07397505229989323, + "grad_norm": 0.10888671875, + "learning_rate": 0.0019723065977017085, + "loss": 0.1621, + "step": 8522 + }, + { + "epoch": 0.0739837327800974, + "grad_norm": 0.0869140625, + "learning_rate": 0.0019722992709390185, + "loss": 0.1211, + "step": 8523 + }, + { + "epoch": 0.07399241326030156, + "grad_norm": 0.46875, + "learning_rate": 0.0019722919432223957, + "loss": 0.1357, + "step": 8524 + }, + { + "epoch": 0.07400109374050573, + "grad_norm": 1.3046875, + "learning_rate": 0.0019722846145518473, + "loss": 0.1348, + "step": 8525 + }, + { + "epoch": 0.0740097742207099, + "grad_norm": 0.76171875, + "learning_rate": 0.001972277284927382, + "loss": 0.1123, + "step": 8526 + }, + { + "epoch": 0.07401845470091406, + "grad_norm": 0.123046875, + "learning_rate": 0.001972269954349007, + "loss": 0.1621, + "step": 8527 + }, + { + "epoch": 0.07402713518111823, + "grad_norm": 0.09326171875, + "learning_rate": 0.0019722626228167307, + "loss": 0.1152, + "step": 8528 + }, + { + "epoch": 0.07403581566132239, + "grad_norm": 0.46875, + "learning_rate": 0.0019722552903305613, + "loss": 0.1152, + "step": 8529 + }, + { + "epoch": 0.07404449614152656, + "grad_norm": 0.1533203125, + "learning_rate": 0.001972247956890506, + "loss": 0.1523, + "step": 8530 + }, + { + "epoch": 0.07405317662173072, + "grad_norm": 0.2119140625, + "learning_rate": 0.001972240622496574, + "loss": 0.0986, + "step": 8531 + }, + { + "epoch": 0.07406185710193489, + "grad_norm": 0.275390625, + "learning_rate": 0.0019722332871487725, + "loss": 0.1289, + "step": 8532 + }, + { + "epoch": 0.07407053758213904, + "grad_norm": 0.08349609375, + "learning_rate": 0.00197222595084711, + "loss": 0.1089, + "step": 8533 + }, + { + "epoch": 0.0740792180623432, + "grad_norm": 0.1826171875, + "learning_rate": 0.0019722186135915946, + "loss": 0.124, + "step": 8534 + }, + { + "epoch": 0.07408789854254737, + "grad_norm": 0.78515625, + "learning_rate": 0.0019722112753822337, + "loss": 0.1396, + "step": 8535 + }, + { + "epoch": 0.07409657902275153, + "grad_norm": 0.51171875, + "learning_rate": 0.0019722039362190355, + "loss": 0.126, + "step": 8536 + }, + { + "epoch": 0.0741052595029557, + "grad_norm": 1.1796875, + "learning_rate": 0.0019721965961020083, + "loss": 0.1221, + "step": 8537 + }, + { + "epoch": 0.07411393998315986, + "grad_norm": 0.7265625, + "learning_rate": 0.00197218925503116, + "loss": 0.1436, + "step": 8538 + }, + { + "epoch": 0.07412262046336403, + "grad_norm": 0.99609375, + "learning_rate": 0.0019721819130064985, + "loss": 0.1396, + "step": 8539 + }, + { + "epoch": 0.07413130094356819, + "grad_norm": 0.197265625, + "learning_rate": 0.0019721745700280323, + "loss": 0.1055, + "step": 8540 + }, + { + "epoch": 0.07413998142377236, + "grad_norm": 0.451171875, + "learning_rate": 0.0019721672260957688, + "loss": 0.1875, + "step": 8541 + 
}, + { + "epoch": 0.07414866190397652, + "grad_norm": 0.1630859375, + "learning_rate": 0.001972159881209717, + "loss": 0.1074, + "step": 8542 + }, + { + "epoch": 0.07415734238418069, + "grad_norm": 0.423828125, + "learning_rate": 0.0019721525353698834, + "loss": 0.1436, + "step": 8543 + }, + { + "epoch": 0.07416602286438485, + "grad_norm": 0.92578125, + "learning_rate": 0.001972145188576277, + "loss": 0.1172, + "step": 8544 + }, + { + "epoch": 0.07417470334458902, + "grad_norm": 1.03125, + "learning_rate": 0.0019721378408289057, + "loss": 0.1758, + "step": 8545 + }, + { + "epoch": 0.07418338382479318, + "grad_norm": 0.72265625, + "learning_rate": 0.001972130492127778, + "loss": 0.1387, + "step": 8546 + }, + { + "epoch": 0.07419206430499735, + "grad_norm": 0.5859375, + "learning_rate": 0.001972123142472901, + "loss": 0.1201, + "step": 8547 + }, + { + "epoch": 0.07420074478520151, + "grad_norm": 0.45703125, + "learning_rate": 0.0019721157918642838, + "loss": 0.127, + "step": 8548 + }, + { + "epoch": 0.07420942526540568, + "grad_norm": 0.341796875, + "learning_rate": 0.0019721084403019336, + "loss": 0.1357, + "step": 8549 + }, + { + "epoch": 0.07421810574560984, + "grad_norm": 0.40625, + "learning_rate": 0.001972101087785859, + "loss": 0.1748, + "step": 8550 + }, + { + "epoch": 0.07422678622581401, + "grad_norm": 0.11767578125, + "learning_rate": 0.001972093734316067, + "loss": 0.1885, + "step": 8551 + }, + { + "epoch": 0.07423546670601817, + "grad_norm": 0.5390625, + "learning_rate": 0.0019720863798925667, + "loss": 0.082, + "step": 8552 + }, + { + "epoch": 0.07424414718622234, + "grad_norm": 0.52734375, + "learning_rate": 0.001972079024515366, + "loss": 0.1299, + "step": 8553 + }, + { + "epoch": 0.0742528276664265, + "grad_norm": 0.2421875, + "learning_rate": 0.001972071668184473, + "loss": 0.1279, + "step": 8554 + }, + { + "epoch": 0.07426150814663067, + "grad_norm": 0.361328125, + "learning_rate": 0.001972064310899895, + "loss": 0.0942, + "step": 8555 + }, + { + "epoch": 0.07427018862683483, + "grad_norm": 0.50390625, + "learning_rate": 0.0019720569526616412, + "loss": 0.1309, + "step": 8556 + }, + { + "epoch": 0.074278869107039, + "grad_norm": 0.130859375, + "learning_rate": 0.0019720495934697183, + "loss": 0.127, + "step": 8557 + }, + { + "epoch": 0.07428754958724317, + "grad_norm": 0.2265625, + "learning_rate": 0.001972042233324136, + "loss": 0.127, + "step": 8558 + }, + { + "epoch": 0.07429623006744733, + "grad_norm": 0.61328125, + "learning_rate": 0.0019720348722249005, + "loss": 0.1758, + "step": 8559 + }, + { + "epoch": 0.0743049105476515, + "grad_norm": 0.38671875, + "learning_rate": 0.0019720275101720213, + "loss": 0.1113, + "step": 8560 + }, + { + "epoch": 0.07431359102785566, + "grad_norm": 0.9140625, + "learning_rate": 0.0019720201471655056, + "loss": 0.1553, + "step": 8561 + }, + { + "epoch": 0.07432227150805983, + "grad_norm": 0.27734375, + "learning_rate": 0.001972012783205362, + "loss": 0.1484, + "step": 8562 + }, + { + "epoch": 0.07433095198826399, + "grad_norm": 0.19140625, + "learning_rate": 0.001972005418291598, + "loss": 0.1104, + "step": 8563 + }, + { + "epoch": 0.07433963246846816, + "grad_norm": 0.2333984375, + "learning_rate": 0.0019719980524242225, + "loss": 0.126, + "step": 8564 + }, + { + "epoch": 0.07434831294867232, + "grad_norm": 0.1416015625, + "learning_rate": 0.001971990685603243, + "loss": 0.1543, + "step": 8565 + }, + { + "epoch": 0.07435699342887649, + "grad_norm": 0.275390625, + "learning_rate": 0.001971983317828667, + "loss": 0.1357, + "step": 8566 + 
}, + { + "epoch": 0.07436567390908065, + "grad_norm": 0.63671875, + "learning_rate": 0.0019719759491005034, + "loss": 0.1045, + "step": 8567 + }, + { + "epoch": 0.07437435438928482, + "grad_norm": 0.228515625, + "learning_rate": 0.00197196857941876, + "loss": 0.1191, + "step": 8568 + }, + { + "epoch": 0.07438303486948898, + "grad_norm": 0.3359375, + "learning_rate": 0.001971961208783445, + "loss": 0.1348, + "step": 8569 + }, + { + "epoch": 0.07439171534969315, + "grad_norm": 0.06298828125, + "learning_rate": 0.0019719538371945665, + "loss": 0.1094, + "step": 8570 + }, + { + "epoch": 0.07440039582989731, + "grad_norm": 2.203125, + "learning_rate": 0.0019719464646521323, + "loss": 0.25, + "step": 8571 + }, + { + "epoch": 0.07440907631010148, + "grad_norm": 0.373046875, + "learning_rate": 0.00197193909115615, + "loss": 0.1533, + "step": 8572 + }, + { + "epoch": 0.07441775679030564, + "grad_norm": 0.095703125, + "learning_rate": 0.001971931716706629, + "loss": 0.0972, + "step": 8573 + }, + { + "epoch": 0.07442643727050981, + "grad_norm": 0.328125, + "learning_rate": 0.001971924341303576, + "loss": 0.1611, + "step": 8574 + }, + { + "epoch": 0.07443511775071397, + "grad_norm": 1.109375, + "learning_rate": 0.001971916964947, + "loss": 0.1133, + "step": 8575 + }, + { + "epoch": 0.07444379823091814, + "grad_norm": 0.1650390625, + "learning_rate": 0.0019719095876369083, + "loss": 0.1543, + "step": 8576 + }, + { + "epoch": 0.0744524787111223, + "grad_norm": 0.111328125, + "learning_rate": 0.0019719022093733097, + "loss": 0.1396, + "step": 8577 + }, + { + "epoch": 0.07446115919132647, + "grad_norm": 0.82421875, + "learning_rate": 0.0019718948301562116, + "loss": 0.1123, + "step": 8578 + }, + { + "epoch": 0.07446983967153063, + "grad_norm": 0.09326171875, + "learning_rate": 0.001971887449985623, + "loss": 0.1475, + "step": 8579 + }, + { + "epoch": 0.0744785201517348, + "grad_norm": 0.2373046875, + "learning_rate": 0.001971880068861551, + "loss": 0.1128, + "step": 8580 + }, + { + "epoch": 0.07448720063193896, + "grad_norm": 0.08837890625, + "learning_rate": 0.001971872686784004, + "loss": 0.168, + "step": 8581 + }, + { + "epoch": 0.07449588111214313, + "grad_norm": 0.40625, + "learning_rate": 0.0019718653037529904, + "loss": 0.1475, + "step": 8582 + }, + { + "epoch": 0.0745045615923473, + "grad_norm": 0.40625, + "learning_rate": 0.0019718579197685175, + "loss": 0.1279, + "step": 8583 + }, + { + "epoch": 0.07451324207255146, + "grad_norm": 0.181640625, + "learning_rate": 0.0019718505348305946, + "loss": 0.1436, + "step": 8584 + }, + { + "epoch": 0.07452192255275562, + "grad_norm": 0.416015625, + "learning_rate": 0.001971843148939228, + "loss": 0.0977, + "step": 8585 + }, + { + "epoch": 0.07453060303295979, + "grad_norm": 0.91796875, + "learning_rate": 0.0019718357620944283, + "loss": 0.127, + "step": 8586 + }, + { + "epoch": 0.07453928351316395, + "grad_norm": 0.279296875, + "learning_rate": 0.001971828374296201, + "loss": 0.124, + "step": 8587 + }, + { + "epoch": 0.07454796399336812, + "grad_norm": 0.62109375, + "learning_rate": 0.0019718209855445557, + "loss": 0.126, + "step": 8588 + }, + { + "epoch": 0.07455664447357228, + "grad_norm": 0.359375, + "learning_rate": 0.0019718135958395, + "loss": 0.1338, + "step": 8589 + }, + { + "epoch": 0.07456532495377645, + "grad_norm": 0.322265625, + "learning_rate": 0.001971806205181042, + "loss": 0.1089, + "step": 8590 + }, + { + "epoch": 0.07457400543398061, + "grad_norm": 0.12451171875, + "learning_rate": 0.00197179881356919, + "loss": 0.127, + "step": 8591 + }, 
+ { + "epoch": 0.07458268591418478, + "grad_norm": 0.1923828125, + "learning_rate": 0.0019717914210039514, + "loss": 0.1221, + "step": 8592 + }, + { + "epoch": 0.07459136639438894, + "grad_norm": 0.34765625, + "learning_rate": 0.0019717840274853355, + "loss": 0.1445, + "step": 8593 + }, + { + "epoch": 0.0746000468745931, + "grad_norm": 0.099609375, + "learning_rate": 0.001971776633013349, + "loss": 0.1152, + "step": 8594 + }, + { + "epoch": 0.07460872735479726, + "grad_norm": 0.458984375, + "learning_rate": 0.0019717692375880013, + "loss": 0.2344, + "step": 8595 + }, + { + "epoch": 0.07461740783500143, + "grad_norm": 0.279296875, + "learning_rate": 0.0019717618412092995, + "loss": 0.0967, + "step": 8596 + }, + { + "epoch": 0.07462608831520559, + "grad_norm": 0.07275390625, + "learning_rate": 0.0019717544438772523, + "loss": 0.1143, + "step": 8597 + }, + { + "epoch": 0.07463476879540976, + "grad_norm": 0.447265625, + "learning_rate": 0.0019717470455918673, + "loss": 0.1318, + "step": 8598 + }, + { + "epoch": 0.07464344927561392, + "grad_norm": 0.2265625, + "learning_rate": 0.001971739646353153, + "loss": 0.1025, + "step": 8599 + }, + { + "epoch": 0.07465212975581809, + "grad_norm": 0.16796875, + "learning_rate": 0.001971732246161117, + "loss": 0.126, + "step": 8600 + }, + { + "epoch": 0.07466081023602225, + "grad_norm": 0.130859375, + "learning_rate": 0.001971724845015768, + "loss": 0.1387, + "step": 8601 + }, + { + "epoch": 0.07466949071622642, + "grad_norm": 0.15234375, + "learning_rate": 0.001971717442917114, + "loss": 0.127, + "step": 8602 + }, + { + "epoch": 0.07467817119643058, + "grad_norm": 0.21875, + "learning_rate": 0.0019717100398651626, + "loss": 0.2207, + "step": 8603 + }, + { + "epoch": 0.07468685167663475, + "grad_norm": 0.07763671875, + "learning_rate": 0.001971702635859922, + "loss": 0.105, + "step": 8604 + }, + { + "epoch": 0.07469553215683891, + "grad_norm": 0.1474609375, + "learning_rate": 0.0019716952309014007, + "loss": 0.1621, + "step": 8605 + }, + { + "epoch": 0.07470421263704308, + "grad_norm": 0.1474609375, + "learning_rate": 0.0019716878249896068, + "loss": 0.1104, + "step": 8606 + }, + { + "epoch": 0.07471289311724724, + "grad_norm": 0.09814453125, + "learning_rate": 0.0019716804181245484, + "loss": 0.1123, + "step": 8607 + }, + { + "epoch": 0.07472157359745141, + "grad_norm": 0.373046875, + "learning_rate": 0.0019716730103062326, + "loss": 0.1025, + "step": 8608 + }, + { + "epoch": 0.07473025407765557, + "grad_norm": 0.06787109375, + "learning_rate": 0.001971665601534669, + "loss": 0.1123, + "step": 8609 + }, + { + "epoch": 0.07473893455785974, + "grad_norm": 0.1923828125, + "learning_rate": 0.0019716581918098643, + "loss": 0.1289, + "step": 8610 + }, + { + "epoch": 0.0747476150380639, + "grad_norm": 0.30078125, + "learning_rate": 0.0019716507811318278, + "loss": 0.1094, + "step": 8611 + }, + { + "epoch": 0.07475629551826807, + "grad_norm": 1.453125, + "learning_rate": 0.001971643369500567, + "loss": 0.3652, + "step": 8612 + }, + { + "epoch": 0.07476497599847223, + "grad_norm": 0.10595703125, + "learning_rate": 0.00197163595691609, + "loss": 0.0991, + "step": 8613 + }, + { + "epoch": 0.0747736564786764, + "grad_norm": 0.10498046875, + "learning_rate": 0.001971628543378405, + "loss": 0.1738, + "step": 8614 + }, + { + "epoch": 0.07478233695888056, + "grad_norm": 0.2578125, + "learning_rate": 0.0019716211288875207, + "loss": 0.1289, + "step": 8615 + }, + { + "epoch": 0.07479101743908473, + "grad_norm": 0.14453125, + "learning_rate": 0.001971613713443444, + "loss": 
0.1465, + "step": 8616 + }, + { + "epoch": 0.0747996979192889, + "grad_norm": 0.306640625, + "learning_rate": 0.001971606297046184, + "loss": 0.1006, + "step": 8617 + }, + { + "epoch": 0.07480837839949306, + "grad_norm": 0.455078125, + "learning_rate": 0.001971598879695748, + "loss": 0.1299, + "step": 8618 + }, + { + "epoch": 0.07481705887969722, + "grad_norm": 0.57421875, + "learning_rate": 0.001971591461392145, + "loss": 0.1396, + "step": 8619 + }, + { + "epoch": 0.07482573935990139, + "grad_norm": 0.46875, + "learning_rate": 0.0019715840421353826, + "loss": 0.1157, + "step": 8620 + }, + { + "epoch": 0.07483441984010555, + "grad_norm": 0.42578125, + "learning_rate": 0.0019715766219254686, + "loss": 0.168, + "step": 8621 + }, + { + "epoch": 0.07484310032030972, + "grad_norm": 0.53125, + "learning_rate": 0.001971569200762412, + "loss": 0.1177, + "step": 8622 + }, + { + "epoch": 0.07485178080051388, + "grad_norm": 0.1767578125, + "learning_rate": 0.00197156177864622, + "loss": 0.124, + "step": 8623 + }, + { + "epoch": 0.07486046128071805, + "grad_norm": 0.140625, + "learning_rate": 0.0019715543555769016, + "loss": 0.1406, + "step": 8624 + }, + { + "epoch": 0.07486914176092221, + "grad_norm": 0.6328125, + "learning_rate": 0.001971546931554464, + "loss": 0.1084, + "step": 8625 + }, + { + "epoch": 0.07487782224112638, + "grad_norm": 0.2412109375, + "learning_rate": 0.001971539506578916, + "loss": 0.1426, + "step": 8626 + }, + { + "epoch": 0.07488650272133054, + "grad_norm": 0.078125, + "learning_rate": 0.0019715320806502653, + "loss": 0.125, + "step": 8627 + }, + { + "epoch": 0.07489518320153471, + "grad_norm": 0.30859375, + "learning_rate": 0.0019715246537685208, + "loss": 0.1348, + "step": 8628 + }, + { + "epoch": 0.07490386368173887, + "grad_norm": 0.9375, + "learning_rate": 0.0019715172259336896, + "loss": 0.1309, + "step": 8629 + }, + { + "epoch": 0.07491254416194304, + "grad_norm": 0.1650390625, + "learning_rate": 0.00197150979714578, + "loss": 0.0771, + "step": 8630 + }, + { + "epoch": 0.0749212246421472, + "grad_norm": 0.1640625, + "learning_rate": 0.0019715023674048006, + "loss": 0.125, + "step": 8631 + }, + { + "epoch": 0.07492990512235137, + "grad_norm": 0.236328125, + "learning_rate": 0.001971494936710759, + "loss": 0.124, + "step": 8632 + }, + { + "epoch": 0.07493858560255554, + "grad_norm": 0.181640625, + "learning_rate": 0.0019714875050636646, + "loss": 0.1367, + "step": 8633 + }, + { + "epoch": 0.0749472660827597, + "grad_norm": 0.1318359375, + "learning_rate": 0.0019714800724635237, + "loss": 0.1699, + "step": 8634 + }, + { + "epoch": 0.07495594656296387, + "grad_norm": 0.703125, + "learning_rate": 0.001971472638910346, + "loss": 0.1021, + "step": 8635 + }, + { + "epoch": 0.07496462704316803, + "grad_norm": 0.283203125, + "learning_rate": 0.001971465204404138, + "loss": 0.1836, + "step": 8636 + }, + { + "epoch": 0.0749733075233722, + "grad_norm": 0.71484375, + "learning_rate": 0.0019714577689449093, + "loss": 0.1006, + "step": 8637 + }, + { + "epoch": 0.07498198800357636, + "grad_norm": 1.25, + "learning_rate": 0.0019714503325326672, + "loss": 0.1426, + "step": 8638 + }, + { + "epoch": 0.07499066848378053, + "grad_norm": 1.03125, + "learning_rate": 0.00197144289516742, + "loss": 0.1133, + "step": 8639 + }, + { + "epoch": 0.07499934896398469, + "grad_norm": 0.0908203125, + "learning_rate": 0.0019714354568491762, + "loss": 0.1328, + "step": 8640 + }, + { + "epoch": 0.07500802944418886, + "grad_norm": 0.130859375, + "learning_rate": 0.0019714280175779438, + "loss": 0.0991, + 
"step": 8641 + }, + { + "epoch": 0.07501670992439302, + "grad_norm": 0.37890625, + "learning_rate": 0.001971420577353731, + "loss": 0.0889, + "step": 8642 + }, + { + "epoch": 0.07502539040459719, + "grad_norm": 0.251953125, + "learning_rate": 0.0019714131361765453, + "loss": 0.1543, + "step": 8643 + }, + { + "epoch": 0.07503407088480135, + "grad_norm": 0.486328125, + "learning_rate": 0.0019714056940463953, + "loss": 0.1514, + "step": 8644 + }, + { + "epoch": 0.07504275136500552, + "grad_norm": 0.6953125, + "learning_rate": 0.0019713982509632897, + "loss": 0.167, + "step": 8645 + }, + { + "epoch": 0.07505143184520968, + "grad_norm": 0.828125, + "learning_rate": 0.0019713908069272355, + "loss": 0.106, + "step": 8646 + }, + { + "epoch": 0.07506011232541385, + "grad_norm": 0.796875, + "learning_rate": 0.0019713833619382413, + "loss": 0.1543, + "step": 8647 + }, + { + "epoch": 0.07506879280561801, + "grad_norm": 0.53515625, + "learning_rate": 0.0019713759159963157, + "loss": 0.1191, + "step": 8648 + }, + { + "epoch": 0.07507747328582218, + "grad_norm": 0.52734375, + "learning_rate": 0.0019713684691014663, + "loss": 0.127, + "step": 8649 + }, + { + "epoch": 0.07508615376602634, + "grad_norm": 0.326171875, + "learning_rate": 0.0019713610212537015, + "loss": 0.1592, + "step": 8650 + }, + { + "epoch": 0.07509483424623051, + "grad_norm": 0.1904296875, + "learning_rate": 0.0019713535724530293, + "loss": 0.0952, + "step": 8651 + }, + { + "epoch": 0.07510351472643467, + "grad_norm": 0.1943359375, + "learning_rate": 0.0019713461226994584, + "loss": 0.127, + "step": 8652 + }, + { + "epoch": 0.07511219520663884, + "grad_norm": 0.087890625, + "learning_rate": 0.001971338671992996, + "loss": 0.1387, + "step": 8653 + }, + { + "epoch": 0.075120875686843, + "grad_norm": 0.53515625, + "learning_rate": 0.001971331220333651, + "loss": 0.1211, + "step": 8654 + }, + { + "epoch": 0.07512955616704717, + "grad_norm": 0.18359375, + "learning_rate": 0.001971323767721431, + "loss": 0.1172, + "step": 8655 + }, + { + "epoch": 0.07513823664725132, + "grad_norm": 0.2734375, + "learning_rate": 0.001971316314156344, + "loss": 0.1602, + "step": 8656 + }, + { + "epoch": 0.07514691712745548, + "grad_norm": 0.1328125, + "learning_rate": 0.0019713088596383993, + "loss": 0.1289, + "step": 8657 + }, + { + "epoch": 0.07515559760765965, + "grad_norm": 0.12109375, + "learning_rate": 0.001971301404167604, + "loss": 0.1094, + "step": 8658 + }, + { + "epoch": 0.07516427808786381, + "grad_norm": 0.130859375, + "learning_rate": 0.0019712939477439665, + "loss": 0.0874, + "step": 8659 + }, + { + "epoch": 0.07517295856806798, + "grad_norm": 0.11572265625, + "learning_rate": 0.0019712864903674954, + "loss": 0.1143, + "step": 8660 + }, + { + "epoch": 0.07518163904827214, + "grad_norm": 0.06298828125, + "learning_rate": 0.0019712790320381983, + "loss": 0.1172, + "step": 8661 + }, + { + "epoch": 0.07519031952847631, + "grad_norm": 0.1513671875, + "learning_rate": 0.0019712715727560836, + "loss": 0.1523, + "step": 8662 + }, + { + "epoch": 0.07519900000868047, + "grad_norm": 0.1572265625, + "learning_rate": 0.0019712641125211594, + "loss": 0.0962, + "step": 8663 + }, + { + "epoch": 0.07520768048888464, + "grad_norm": 0.73046875, + "learning_rate": 0.0019712566513334336, + "loss": 0.1523, + "step": 8664 + }, + { + "epoch": 0.0752163609690888, + "grad_norm": 0.77734375, + "learning_rate": 0.001971249189192915, + "loss": 0.1289, + "step": 8665 + }, + { + "epoch": 0.07522504144929297, + "grad_norm": 0.318359375, + "learning_rate": 0.001971241726099611, + 
"loss": 0.1865, + "step": 8666 + }, + { + "epoch": 0.07523372192949714, + "grad_norm": 0.1982421875, + "learning_rate": 0.0019712342620535304, + "loss": 0.1226, + "step": 8667 + }, + { + "epoch": 0.0752424024097013, + "grad_norm": 0.61328125, + "learning_rate": 0.001971226797054681, + "loss": 0.1084, + "step": 8668 + }, + { + "epoch": 0.07525108288990547, + "grad_norm": 0.06396484375, + "learning_rate": 0.001971219331103071, + "loss": 0.1021, + "step": 8669 + }, + { + "epoch": 0.07525976337010963, + "grad_norm": 0.859375, + "learning_rate": 0.0019712118641987087, + "loss": 0.1348, + "step": 8670 + }, + { + "epoch": 0.0752684438503138, + "grad_norm": 0.275390625, + "learning_rate": 0.001971204396341602, + "loss": 0.124, + "step": 8671 + }, + { + "epoch": 0.07527712433051796, + "grad_norm": 0.13671875, + "learning_rate": 0.0019711969275317594, + "loss": 0.1396, + "step": 8672 + }, + { + "epoch": 0.07528580481072213, + "grad_norm": 0.26953125, + "learning_rate": 0.0019711894577691892, + "loss": 0.1289, + "step": 8673 + }, + { + "epoch": 0.07529448529092629, + "grad_norm": 0.1669921875, + "learning_rate": 0.001971181987053899, + "loss": 0.127, + "step": 8674 + }, + { + "epoch": 0.07530316577113046, + "grad_norm": 0.2734375, + "learning_rate": 0.001971174515385897, + "loss": 0.1455, + "step": 8675 + }, + { + "epoch": 0.07531184625133462, + "grad_norm": 0.1982421875, + "learning_rate": 0.0019711670427651922, + "loss": 0.0845, + "step": 8676 + }, + { + "epoch": 0.07532052673153879, + "grad_norm": 0.099609375, + "learning_rate": 0.0019711595691917923, + "loss": 0.1279, + "step": 8677 + }, + { + "epoch": 0.07532920721174295, + "grad_norm": 0.50390625, + "learning_rate": 0.001971152094665705, + "loss": 0.166, + "step": 8678 + }, + { + "epoch": 0.07533788769194712, + "grad_norm": 0.52734375, + "learning_rate": 0.001971144619186939, + "loss": 0.1387, + "step": 8679 + }, + { + "epoch": 0.07534656817215128, + "grad_norm": 0.85546875, + "learning_rate": 0.0019711371427555023, + "loss": 0.1064, + "step": 8680 + }, + { + "epoch": 0.07535524865235545, + "grad_norm": 0.349609375, + "learning_rate": 0.0019711296653714032, + "loss": 0.1611, + "step": 8681 + }, + { + "epoch": 0.07536392913255961, + "grad_norm": 0.1767578125, + "learning_rate": 0.0019711221870346496, + "loss": 0.1152, + "step": 8682 + }, + { + "epoch": 0.07537260961276378, + "grad_norm": 0.26953125, + "learning_rate": 0.00197111470774525, + "loss": 0.1289, + "step": 8683 + }, + { + "epoch": 0.07538129009296794, + "grad_norm": 0.326171875, + "learning_rate": 0.001971107227503212, + "loss": 0.1201, + "step": 8684 + }, + { + "epoch": 0.07538997057317211, + "grad_norm": 1.3203125, + "learning_rate": 0.0019710997463085445, + "loss": 0.123, + "step": 8685 + }, + { + "epoch": 0.07539865105337627, + "grad_norm": 0.515625, + "learning_rate": 0.001971092264161256, + "loss": 0.1006, + "step": 8686 + }, + { + "epoch": 0.07540733153358044, + "grad_norm": 0.37890625, + "learning_rate": 0.0019710847810613533, + "loss": 0.1187, + "step": 8687 + }, + { + "epoch": 0.0754160120137846, + "grad_norm": 0.51953125, + "learning_rate": 0.001971077297008846, + "loss": 0.1079, + "step": 8688 + }, + { + "epoch": 0.07542469249398877, + "grad_norm": 0.458984375, + "learning_rate": 0.001971069812003741, + "loss": 0.1406, + "step": 8689 + }, + { + "epoch": 0.07543337297419293, + "grad_norm": 0.2041015625, + "learning_rate": 0.001971062326046048, + "loss": 0.1289, + "step": 8690 + }, + { + "epoch": 0.0754420534543971, + "grad_norm": 0.10791015625, + "learning_rate": 
0.0019710548391357733, + "loss": 0.1191, + "step": 8691 + }, + { + "epoch": 0.07545073393460126, + "grad_norm": 0.06884765625, + "learning_rate": 0.0019710473512729266, + "loss": 0.1016, + "step": 8692 + }, + { + "epoch": 0.07545941441480543, + "grad_norm": 1.21875, + "learning_rate": 0.0019710398624575155, + "loss": 0.1543, + "step": 8693 + }, + { + "epoch": 0.0754680948950096, + "grad_norm": 0.11376953125, + "learning_rate": 0.0019710323726895487, + "loss": 0.1084, + "step": 8694 + }, + { + "epoch": 0.07547677537521376, + "grad_norm": 0.287109375, + "learning_rate": 0.001971024881969034, + "loss": 0.1455, + "step": 8695 + }, + { + "epoch": 0.07548545585541792, + "grad_norm": 0.4609375, + "learning_rate": 0.001971017390295979, + "loss": 0.1523, + "step": 8696 + }, + { + "epoch": 0.07549413633562209, + "grad_norm": 0.78515625, + "learning_rate": 0.0019710098976703926, + "loss": 0.1699, + "step": 8697 + }, + { + "epoch": 0.07550281681582625, + "grad_norm": 0.1318359375, + "learning_rate": 0.001971002404092283, + "loss": 0.1523, + "step": 8698 + }, + { + "epoch": 0.07551149729603042, + "grad_norm": 0.96484375, + "learning_rate": 0.001970994909561658, + "loss": 0.1367, + "step": 8699 + }, + { + "epoch": 0.07552017777623458, + "grad_norm": 0.6640625, + "learning_rate": 0.0019709874140785267, + "loss": 0.1289, + "step": 8700 + }, + { + "epoch": 0.07552885825643875, + "grad_norm": 0.75390625, + "learning_rate": 0.001970979917642896, + "loss": 0.168, + "step": 8701 + }, + { + "epoch": 0.07553753873664291, + "grad_norm": 0.0810546875, + "learning_rate": 0.001970972420254775, + "loss": 0.1211, + "step": 8702 + }, + { + "epoch": 0.07554621921684708, + "grad_norm": 0.57421875, + "learning_rate": 0.0019709649219141717, + "loss": 0.1133, + "step": 8703 + }, + { + "epoch": 0.07555489969705124, + "grad_norm": 0.439453125, + "learning_rate": 0.0019709574226210945, + "loss": 0.1582, + "step": 8704 + }, + { + "epoch": 0.07556358017725541, + "grad_norm": 1.1875, + "learning_rate": 0.001970949922375551, + "loss": 0.1328, + "step": 8705 + }, + { + "epoch": 0.07557226065745958, + "grad_norm": 0.1298828125, + "learning_rate": 0.00197094242117755, + "loss": 0.1211, + "step": 8706 + }, + { + "epoch": 0.07558094113766374, + "grad_norm": 0.30859375, + "learning_rate": 0.001970934919027099, + "loss": 0.0913, + "step": 8707 + }, + { + "epoch": 0.0755896216178679, + "grad_norm": 0.330078125, + "learning_rate": 0.001970927415924207, + "loss": 0.1143, + "step": 8708 + }, + { + "epoch": 0.07559830209807207, + "grad_norm": 0.71484375, + "learning_rate": 0.001970919911868882, + "loss": 0.123, + "step": 8709 + }, + { + "epoch": 0.07560698257827624, + "grad_norm": 0.8671875, + "learning_rate": 0.0019709124068611316, + "loss": 0.1338, + "step": 8710 + }, + { + "epoch": 0.0756156630584804, + "grad_norm": 0.146484375, + "learning_rate": 0.001970904900900965, + "loss": 0.1147, + "step": 8711 + }, + { + "epoch": 0.07562434353868457, + "grad_norm": 0.1982421875, + "learning_rate": 0.0019708973939883898, + "loss": 0.1182, + "step": 8712 + }, + { + "epoch": 0.07563302401888873, + "grad_norm": 0.330078125, + "learning_rate": 0.0019708898861234138, + "loss": 0.1289, + "step": 8713 + }, + { + "epoch": 0.0756417044990929, + "grad_norm": 0.23046875, + "learning_rate": 0.001970882377306046, + "loss": 0.168, + "step": 8714 + }, + { + "epoch": 0.07565038497929706, + "grad_norm": 0.451171875, + "learning_rate": 0.001970874867536294, + "loss": 0.1797, + "step": 8715 + }, + { + "epoch": 0.07565906545950123, + "grad_norm": 0.61328125, + 
"learning_rate": 0.001970867356814167, + "loss": 0.1221, + "step": 8716 + }, + { + "epoch": 0.07566774593970539, + "grad_norm": 0.98046875, + "learning_rate": 0.001970859845139672, + "loss": 0.1533, + "step": 8717 + }, + { + "epoch": 0.07567642641990954, + "grad_norm": 0.12060546875, + "learning_rate": 0.0019708523325128184, + "loss": 0.1309, + "step": 8718 + }, + { + "epoch": 0.07568510690011371, + "grad_norm": 0.0712890625, + "learning_rate": 0.001970844818933613, + "loss": 0.1387, + "step": 8719 + }, + { + "epoch": 0.07569378738031787, + "grad_norm": 0.84765625, + "learning_rate": 0.001970837304402065, + "loss": 0.1465, + "step": 8720 + }, + { + "epoch": 0.07570246786052204, + "grad_norm": 0.75, + "learning_rate": 0.001970829788918183, + "loss": 0.127, + "step": 8721 + }, + { + "epoch": 0.0757111483407262, + "grad_norm": 0.1826171875, + "learning_rate": 0.001970822272481974, + "loss": 0.1025, + "step": 8722 + }, + { + "epoch": 0.07571982882093037, + "grad_norm": 0.09765625, + "learning_rate": 0.001970814755093447, + "loss": 0.1299, + "step": 8723 + }, + { + "epoch": 0.07572850930113453, + "grad_norm": 0.345703125, + "learning_rate": 0.00197080723675261, + "loss": 0.1152, + "step": 8724 + }, + { + "epoch": 0.0757371897813387, + "grad_norm": 0.46875, + "learning_rate": 0.0019707997174594712, + "loss": 0.127, + "step": 8725 + }, + { + "epoch": 0.07574587026154286, + "grad_norm": 0.1318359375, + "learning_rate": 0.0019707921972140392, + "loss": 0.1416, + "step": 8726 + }, + { + "epoch": 0.07575455074174703, + "grad_norm": 0.19921875, + "learning_rate": 0.0019707846760163216, + "loss": 0.1348, + "step": 8727 + }, + { + "epoch": 0.0757632312219512, + "grad_norm": 0.6015625, + "learning_rate": 0.001970777153866327, + "loss": 0.1465, + "step": 8728 + }, + { + "epoch": 0.07577191170215536, + "grad_norm": 0.419921875, + "learning_rate": 0.001970769630764064, + "loss": 0.1338, + "step": 8729 + }, + { + "epoch": 0.07578059218235952, + "grad_norm": 0.384765625, + "learning_rate": 0.00197076210670954, + "loss": 0.1006, + "step": 8730 + }, + { + "epoch": 0.07578927266256369, + "grad_norm": 1.140625, + "learning_rate": 0.0019707545817027644, + "loss": 0.1162, + "step": 8731 + }, + { + "epoch": 0.07579795314276785, + "grad_norm": 0.33984375, + "learning_rate": 0.001970747055743744, + "loss": 0.0762, + "step": 8732 + }, + { + "epoch": 0.07580663362297202, + "grad_norm": 0.1796875, + "learning_rate": 0.0019707395288324878, + "loss": 0.1309, + "step": 8733 + }, + { + "epoch": 0.07581531410317618, + "grad_norm": 0.0947265625, + "learning_rate": 0.0019707320009690037, + "loss": 0.1299, + "step": 8734 + }, + { + "epoch": 0.07582399458338035, + "grad_norm": 0.251953125, + "learning_rate": 0.0019707244721533005, + "loss": 0.1221, + "step": 8735 + }, + { + "epoch": 0.07583267506358451, + "grad_norm": 0.16015625, + "learning_rate": 0.0019707169423853863, + "loss": 0.0977, + "step": 8736 + }, + { + "epoch": 0.07584135554378868, + "grad_norm": 0.4921875, + "learning_rate": 0.0019707094116652685, + "loss": 0.1367, + "step": 8737 + }, + { + "epoch": 0.07585003602399285, + "grad_norm": 0.173828125, + "learning_rate": 0.001970701879992956, + "loss": 0.1455, + "step": 8738 + }, + { + "epoch": 0.07585871650419701, + "grad_norm": 0.09326171875, + "learning_rate": 0.001970694347368458, + "loss": 0.1504, + "step": 8739 + }, + { + "epoch": 0.07586739698440118, + "grad_norm": 0.1279296875, + "learning_rate": 0.0019706868137917806, + "loss": 0.1113, + "step": 8740 + }, + { + "epoch": 0.07587607746460534, + "grad_norm": 
0.3359375, + "learning_rate": 0.001970679279262934, + "loss": 0.1377, + "step": 8741 + }, + { + "epoch": 0.0758847579448095, + "grad_norm": 0.47265625, + "learning_rate": 0.001970671743781925, + "loss": 0.1201, + "step": 8742 + }, + { + "epoch": 0.07589343842501367, + "grad_norm": 0.72265625, + "learning_rate": 0.0019706642073487627, + "loss": 0.124, + "step": 8743 + }, + { + "epoch": 0.07590211890521784, + "grad_norm": 0.1025390625, + "learning_rate": 0.0019706566699634553, + "loss": 0.1152, + "step": 8744 + }, + { + "epoch": 0.075910799385422, + "grad_norm": 0.33203125, + "learning_rate": 0.0019706491316260105, + "loss": 0.0908, + "step": 8745 + }, + { + "epoch": 0.07591947986562617, + "grad_norm": 0.07861328125, + "learning_rate": 0.0019706415923364373, + "loss": 0.1113, + "step": 8746 + }, + { + "epoch": 0.07592816034583033, + "grad_norm": 0.8125, + "learning_rate": 0.001970634052094743, + "loss": 0.1211, + "step": 8747 + }, + { + "epoch": 0.0759368408260345, + "grad_norm": 0.0888671875, + "learning_rate": 0.001970626510900937, + "loss": 0.1035, + "step": 8748 + }, + { + "epoch": 0.07594552130623866, + "grad_norm": 0.185546875, + "learning_rate": 0.0019706189687550267, + "loss": 0.1035, + "step": 8749 + }, + { + "epoch": 0.07595420178644283, + "grad_norm": 0.1943359375, + "learning_rate": 0.00197061142565702, + "loss": 0.1562, + "step": 8750 + }, + { + "epoch": 0.07596288226664699, + "grad_norm": 0.17578125, + "learning_rate": 0.0019706038816069264, + "loss": 0.167, + "step": 8751 + }, + { + "epoch": 0.07597156274685116, + "grad_norm": 0.310546875, + "learning_rate": 0.0019705963366047534, + "loss": 0.1191, + "step": 8752 + }, + { + "epoch": 0.07598024322705532, + "grad_norm": 0.107421875, + "learning_rate": 0.001970588790650509, + "loss": 0.1445, + "step": 8753 + }, + { + "epoch": 0.07598892370725949, + "grad_norm": 0.12451171875, + "learning_rate": 0.0019705812437442022, + "loss": 0.0957, + "step": 8754 + }, + { + "epoch": 0.07599760418746365, + "grad_norm": 0.361328125, + "learning_rate": 0.0019705736958858405, + "loss": 0.1055, + "step": 8755 + }, + { + "epoch": 0.07600628466766782, + "grad_norm": 0.212890625, + "learning_rate": 0.001970566147075433, + "loss": 0.1572, + "step": 8756 + }, + { + "epoch": 0.07601496514787198, + "grad_norm": 0.796875, + "learning_rate": 0.001970558597312987, + "loss": 0.1025, + "step": 8757 + }, + { + "epoch": 0.07602364562807615, + "grad_norm": 0.357421875, + "learning_rate": 0.001970551046598511, + "loss": 0.1494, + "step": 8758 + }, + { + "epoch": 0.07603232610828031, + "grad_norm": 0.7890625, + "learning_rate": 0.0019705434949320137, + "loss": 0.1387, + "step": 8759 + }, + { + "epoch": 0.07604100658848448, + "grad_norm": 0.388671875, + "learning_rate": 0.001970535942313503, + "loss": 0.1162, + "step": 8760 + }, + { + "epoch": 0.07604968706868864, + "grad_norm": 0.345703125, + "learning_rate": 0.001970528388742988, + "loss": 0.1445, + "step": 8761 + }, + { + "epoch": 0.07605836754889281, + "grad_norm": 0.412109375, + "learning_rate": 0.001970520834220475, + "loss": 0.168, + "step": 8762 + }, + { + "epoch": 0.07606704802909697, + "grad_norm": 0.384765625, + "learning_rate": 0.0019705132787459743, + "loss": 0.1055, + "step": 8763 + }, + { + "epoch": 0.07607572850930114, + "grad_norm": 0.38671875, + "learning_rate": 0.0019705057223194933, + "loss": 0.1357, + "step": 8764 + }, + { + "epoch": 0.0760844089895053, + "grad_norm": 0.328125, + "learning_rate": 0.00197049816494104, + "loss": 0.1182, + "step": 8765 + }, + { + "epoch": 0.07609308946970947, + 
"grad_norm": 0.5546875, + "learning_rate": 0.0019704906066106234, + "loss": 0.1094, + "step": 8766 + }, + { + "epoch": 0.07610176994991363, + "grad_norm": 0.29296875, + "learning_rate": 0.001970483047328251, + "loss": 0.1514, + "step": 8767 + }, + { + "epoch": 0.0761104504301178, + "grad_norm": 0.185546875, + "learning_rate": 0.0019704754870939318, + "loss": 0.1172, + "step": 8768 + }, + { + "epoch": 0.07611913091032196, + "grad_norm": 0.482421875, + "learning_rate": 0.0019704679259076735, + "loss": 0.1113, + "step": 8769 + }, + { + "epoch": 0.07612781139052613, + "grad_norm": 0.166015625, + "learning_rate": 0.0019704603637694844, + "loss": 0.1445, + "step": 8770 + }, + { + "epoch": 0.0761364918707303, + "grad_norm": 0.462890625, + "learning_rate": 0.001970452800679373, + "loss": 0.1797, + "step": 8771 + }, + { + "epoch": 0.07614517235093446, + "grad_norm": 0.6484375, + "learning_rate": 0.0019704452366373477, + "loss": 0.1094, + "step": 8772 + }, + { + "epoch": 0.07615385283113862, + "grad_norm": 0.40625, + "learning_rate": 0.0019704376716434165, + "loss": 0.1406, + "step": 8773 + }, + { + "epoch": 0.07616253331134279, + "grad_norm": 0.1474609375, + "learning_rate": 0.0019704301056975874, + "loss": 0.126, + "step": 8774 + }, + { + "epoch": 0.07617121379154695, + "grad_norm": 0.474609375, + "learning_rate": 0.001970422538799869, + "loss": 0.1201, + "step": 8775 + }, + { + "epoch": 0.07617989427175112, + "grad_norm": 0.48828125, + "learning_rate": 0.00197041497095027, + "loss": 0.1289, + "step": 8776 + }, + { + "epoch": 0.07618857475195528, + "grad_norm": 0.302734375, + "learning_rate": 0.001970407402148798, + "loss": 0.1201, + "step": 8777 + }, + { + "epoch": 0.07619725523215945, + "grad_norm": 0.3671875, + "learning_rate": 0.001970399832395462, + "loss": 0.1211, + "step": 8778 + }, + { + "epoch": 0.0762059357123636, + "grad_norm": 0.50390625, + "learning_rate": 0.0019703922616902693, + "loss": 0.127, + "step": 8779 + }, + { + "epoch": 0.07621461619256777, + "grad_norm": 1.109375, + "learning_rate": 0.001970384690033229, + "loss": 0.1592, + "step": 8780 + }, + { + "epoch": 0.07622329667277193, + "grad_norm": 0.1484375, + "learning_rate": 0.0019703771174243486, + "loss": 0.1367, + "step": 8781 + }, + { + "epoch": 0.0762319771529761, + "grad_norm": 0.2353515625, + "learning_rate": 0.0019703695438636377, + "loss": 0.125, + "step": 8782 + }, + { + "epoch": 0.07624065763318026, + "grad_norm": 0.322265625, + "learning_rate": 0.0019703619693511028, + "loss": 0.1021, + "step": 8783 + }, + { + "epoch": 0.07624933811338443, + "grad_norm": 0.1865234375, + "learning_rate": 0.0019703543938867536, + "loss": 0.1758, + "step": 8784 + }, + { + "epoch": 0.07625801859358859, + "grad_norm": 0.421875, + "learning_rate": 0.001970346817470598, + "loss": 0.1133, + "step": 8785 + }, + { + "epoch": 0.07626669907379276, + "grad_norm": 0.2119140625, + "learning_rate": 0.001970339240102644, + "loss": 0.0947, + "step": 8786 + }, + { + "epoch": 0.07627537955399692, + "grad_norm": 0.1357421875, + "learning_rate": 0.0019703316617829003, + "loss": 0.1318, + "step": 8787 + }, + { + "epoch": 0.07628406003420109, + "grad_norm": 0.1328125, + "learning_rate": 0.0019703240825113748, + "loss": 0.1089, + "step": 8788 + }, + { + "epoch": 0.07629274051440525, + "grad_norm": 0.349609375, + "learning_rate": 0.0019703165022880763, + "loss": 0.1289, + "step": 8789 + }, + { + "epoch": 0.07630142099460942, + "grad_norm": 1.890625, + "learning_rate": 0.0019703089211130123, + "loss": 0.1641, + "step": 8790 + }, + { + "epoch": 
0.07631010147481358, + "grad_norm": 0.328125, + "learning_rate": 0.001970301338986192, + "loss": 0.1328, + "step": 8791 + }, + { + "epoch": 0.07631878195501775, + "grad_norm": 0.61328125, + "learning_rate": 0.0019702937559076227, + "loss": 0.1426, + "step": 8792 + }, + { + "epoch": 0.07632746243522191, + "grad_norm": 1.125, + "learning_rate": 0.0019702861718773135, + "loss": 0.1182, + "step": 8793 + }, + { + "epoch": 0.07633614291542608, + "grad_norm": 0.53125, + "learning_rate": 0.001970278586895272, + "loss": 0.1641, + "step": 8794 + }, + { + "epoch": 0.07634482339563024, + "grad_norm": 0.53515625, + "learning_rate": 0.0019702710009615074, + "loss": 0.1162, + "step": 8795 + }, + { + "epoch": 0.07635350387583441, + "grad_norm": 0.453125, + "learning_rate": 0.0019702634140760272, + "loss": 0.1426, + "step": 8796 + }, + { + "epoch": 0.07636218435603857, + "grad_norm": 0.287109375, + "learning_rate": 0.0019702558262388405, + "loss": 0.1484, + "step": 8797 + }, + { + "epoch": 0.07637086483624274, + "grad_norm": 0.42578125, + "learning_rate": 0.0019702482374499546, + "loss": 0.127, + "step": 8798 + }, + { + "epoch": 0.0763795453164469, + "grad_norm": 0.390625, + "learning_rate": 0.0019702406477093786, + "loss": 0.1426, + "step": 8799 + }, + { + "epoch": 0.07638822579665107, + "grad_norm": 0.08642578125, + "learning_rate": 0.0019702330570171202, + "loss": 0.1289, + "step": 8800 + }, + { + "epoch": 0.07639690627685523, + "grad_norm": 0.1640625, + "learning_rate": 0.0019702254653731883, + "loss": 0.104, + "step": 8801 + }, + { + "epoch": 0.0764055867570594, + "grad_norm": 1.7265625, + "learning_rate": 0.001970217872777591, + "loss": 0.1582, + "step": 8802 + }, + { + "epoch": 0.07641426723726356, + "grad_norm": 1.125, + "learning_rate": 0.0019702102792303364, + "loss": 0.168, + "step": 8803 + }, + { + "epoch": 0.07642294771746773, + "grad_norm": 0.33203125, + "learning_rate": 0.001970202684731433, + "loss": 0.1152, + "step": 8804 + }, + { + "epoch": 0.0764316281976719, + "grad_norm": 0.1181640625, + "learning_rate": 0.0019701950892808883, + "loss": 0.123, + "step": 8805 + }, + { + "epoch": 0.07644030867787606, + "grad_norm": 0.392578125, + "learning_rate": 0.001970187492878712, + "loss": 0.127, + "step": 8806 + }, + { + "epoch": 0.07644898915808022, + "grad_norm": 0.365234375, + "learning_rate": 0.001970179895524912, + "loss": 0.1836, + "step": 8807 + }, + { + "epoch": 0.07645766963828439, + "grad_norm": 0.66796875, + "learning_rate": 0.001970172297219496, + "loss": 0.168, + "step": 8808 + }, + { + "epoch": 0.07646635011848855, + "grad_norm": 0.6328125, + "learning_rate": 0.0019701646979624725, + "loss": 0.0996, + "step": 8809 + }, + { + "epoch": 0.07647503059869272, + "grad_norm": 0.2021484375, + "learning_rate": 0.00197015709775385, + "loss": 0.0894, + "step": 8810 + }, + { + "epoch": 0.07648371107889688, + "grad_norm": 0.18359375, + "learning_rate": 0.0019701494965936365, + "loss": 0.1484, + "step": 8811 + }, + { + "epoch": 0.07649239155910105, + "grad_norm": 0.1357421875, + "learning_rate": 0.0019701418944818414, + "loss": 0.1113, + "step": 8812 + }, + { + "epoch": 0.07650107203930522, + "grad_norm": 0.28125, + "learning_rate": 0.0019701342914184715, + "loss": 0.1309, + "step": 8813 + }, + { + "epoch": 0.07650975251950938, + "grad_norm": 0.162109375, + "learning_rate": 0.001970126687403536, + "loss": 0.1484, + "step": 8814 + }, + { + "epoch": 0.07651843299971355, + "grad_norm": 0.80078125, + "learning_rate": 0.001970119082437043, + "loss": 0.1621, + "step": 8815 + }, + { + "epoch": 
0.07652711347991771, + "grad_norm": 0.283203125, + "learning_rate": 0.001970111476519001, + "loss": 0.085, + "step": 8816 + }, + { + "epoch": 0.07653579396012188, + "grad_norm": 0.07421875, + "learning_rate": 0.001970103869649418, + "loss": 0.1094, + "step": 8817 + }, + { + "epoch": 0.07654447444032604, + "grad_norm": 0.419921875, + "learning_rate": 0.0019700962618283027, + "loss": 0.0869, + "step": 8818 + }, + { + "epoch": 0.0765531549205302, + "grad_norm": 0.30078125, + "learning_rate": 0.001970088653055663, + "loss": 0.127, + "step": 8819 + }, + { + "epoch": 0.07656183540073437, + "grad_norm": 0.478515625, + "learning_rate": 0.0019700810433315075, + "loss": 0.1475, + "step": 8820 + }, + { + "epoch": 0.07657051588093854, + "grad_norm": 0.10205078125, + "learning_rate": 0.0019700734326558444, + "loss": 0.1338, + "step": 8821 + }, + { + "epoch": 0.0765791963611427, + "grad_norm": 0.10888671875, + "learning_rate": 0.001970065821028682, + "loss": 0.1064, + "step": 8822 + }, + { + "epoch": 0.07658787684134687, + "grad_norm": 0.189453125, + "learning_rate": 0.0019700582084500287, + "loss": 0.1157, + "step": 8823 + }, + { + "epoch": 0.07659655732155103, + "grad_norm": 0.2890625, + "learning_rate": 0.001970050594919893, + "loss": 0.0928, + "step": 8824 + }, + { + "epoch": 0.0766052378017552, + "grad_norm": 0.158203125, + "learning_rate": 0.001970042980438283, + "loss": 0.1416, + "step": 8825 + }, + { + "epoch": 0.07661391828195936, + "grad_norm": 0.11328125, + "learning_rate": 0.001970035365005207, + "loss": 0.166, + "step": 8826 + }, + { + "epoch": 0.07662259876216353, + "grad_norm": 0.318359375, + "learning_rate": 0.0019700277486206733, + "loss": 0.1553, + "step": 8827 + }, + { + "epoch": 0.07663127924236769, + "grad_norm": 0.09423828125, + "learning_rate": 0.0019700201312846904, + "loss": 0.1562, + "step": 8828 + }, + { + "epoch": 0.07663995972257186, + "grad_norm": 0.302734375, + "learning_rate": 0.0019700125129972668, + "loss": 0.1465, + "step": 8829 + }, + { + "epoch": 0.07664864020277602, + "grad_norm": 0.97265625, + "learning_rate": 0.00197000489375841, + "loss": 0.1182, + "step": 8830 + }, + { + "epoch": 0.07665732068298019, + "grad_norm": 0.435546875, + "learning_rate": 0.0019699972735681293, + "loss": 0.1396, + "step": 8831 + }, + { + "epoch": 0.07666600116318435, + "grad_norm": 0.50390625, + "learning_rate": 0.001969989652426433, + "loss": 0.123, + "step": 8832 + }, + { + "epoch": 0.07667468164338852, + "grad_norm": 0.2041015625, + "learning_rate": 0.0019699820303333286, + "loss": 0.1436, + "step": 8833 + }, + { + "epoch": 0.07668336212359268, + "grad_norm": 0.2353515625, + "learning_rate": 0.001969974407288825, + "loss": 0.1445, + "step": 8834 + }, + { + "epoch": 0.07669204260379685, + "grad_norm": 0.384765625, + "learning_rate": 0.0019699667832929306, + "loss": 0.1089, + "step": 8835 + }, + { + "epoch": 0.07670072308400101, + "grad_norm": 0.20703125, + "learning_rate": 0.0019699591583456537, + "loss": 0.1582, + "step": 8836 + }, + { + "epoch": 0.07670940356420518, + "grad_norm": 0.287109375, + "learning_rate": 0.001969951532447002, + "loss": 0.1113, + "step": 8837 + }, + { + "epoch": 0.07671808404440934, + "grad_norm": 0.578125, + "learning_rate": 0.0019699439055969846, + "loss": 0.1113, + "step": 8838 + }, + { + "epoch": 0.07672676452461351, + "grad_norm": 0.294921875, + "learning_rate": 0.00196993627779561, + "loss": 0.1318, + "step": 8839 + }, + { + "epoch": 0.07673544500481767, + "grad_norm": 0.130859375, + "learning_rate": 0.001969928649042886, + "loss": 0.1445, + "step": 8840 
+ }, + { + "epoch": 0.07674412548502182, + "grad_norm": 0.22265625, + "learning_rate": 0.0019699210193388213, + "loss": 0.1445, + "step": 8841 + }, + { + "epoch": 0.07675280596522599, + "grad_norm": 0.181640625, + "learning_rate": 0.0019699133886834233, + "loss": 0.0879, + "step": 8842 + }, + { + "epoch": 0.07676148644543015, + "grad_norm": 0.2451171875, + "learning_rate": 0.0019699057570767018, + "loss": 0.1543, + "step": 8843 + }, + { + "epoch": 0.07677016692563432, + "grad_norm": 0.55859375, + "learning_rate": 0.001969898124518664, + "loss": 0.1099, + "step": 8844 + }, + { + "epoch": 0.07677884740583849, + "grad_norm": 0.76171875, + "learning_rate": 0.001969890491009319, + "loss": 0.0957, + "step": 8845 + }, + { + "epoch": 0.07678752788604265, + "grad_norm": 0.578125, + "learning_rate": 0.0019698828565486745, + "loss": 0.1309, + "step": 8846 + }, + { + "epoch": 0.07679620836624682, + "grad_norm": 0.50390625, + "learning_rate": 0.0019698752211367394, + "loss": 0.1504, + "step": 8847 + }, + { + "epoch": 0.07680488884645098, + "grad_norm": 0.087890625, + "learning_rate": 0.001969867584773522, + "loss": 0.1182, + "step": 8848 + }, + { + "epoch": 0.07681356932665515, + "grad_norm": 0.134765625, + "learning_rate": 0.00196985994745903, + "loss": 0.123, + "step": 8849 + }, + { + "epoch": 0.07682224980685931, + "grad_norm": 0.251953125, + "learning_rate": 0.0019698523091932723, + "loss": 0.1416, + "step": 8850 + }, + { + "epoch": 0.07683093028706348, + "grad_norm": 0.65234375, + "learning_rate": 0.0019698446699762577, + "loss": 0.1602, + "step": 8851 + }, + { + "epoch": 0.07683961076726764, + "grad_norm": 0.09716796875, + "learning_rate": 0.0019698370298079936, + "loss": 0.1016, + "step": 8852 + }, + { + "epoch": 0.0768482912474718, + "grad_norm": 0.130859375, + "learning_rate": 0.0019698293886884887, + "loss": 0.1328, + "step": 8853 + }, + { + "epoch": 0.07685697172767597, + "grad_norm": 0.23046875, + "learning_rate": 0.0019698217466177516, + "loss": 0.1113, + "step": 8854 + }, + { + "epoch": 0.07686565220788014, + "grad_norm": 0.18359375, + "learning_rate": 0.0019698141035957907, + "loss": 0.1299, + "step": 8855 + }, + { + "epoch": 0.0768743326880843, + "grad_norm": 0.25, + "learning_rate": 0.001969806459622614, + "loss": 0.1133, + "step": 8856 + }, + { + "epoch": 0.07688301316828847, + "grad_norm": 0.26171875, + "learning_rate": 0.0019697988146982304, + "loss": 0.0977, + "step": 8857 + }, + { + "epoch": 0.07689169364849263, + "grad_norm": 0.349609375, + "learning_rate": 0.0019697911688226475, + "loss": 0.1582, + "step": 8858 + }, + { + "epoch": 0.0769003741286968, + "grad_norm": 0.1435546875, + "learning_rate": 0.0019697835219958737, + "loss": 0.1338, + "step": 8859 + }, + { + "epoch": 0.07690905460890096, + "grad_norm": 0.255859375, + "learning_rate": 0.0019697758742179185, + "loss": 0.1113, + "step": 8860 + }, + { + "epoch": 0.07691773508910513, + "grad_norm": 0.431640625, + "learning_rate": 0.001969768225488789, + "loss": 0.0928, + "step": 8861 + }, + { + "epoch": 0.07692641556930929, + "grad_norm": 0.50390625, + "learning_rate": 0.001969760575808494, + "loss": 0.1348, + "step": 8862 + }, + { + "epoch": 0.07693509604951346, + "grad_norm": 0.392578125, + "learning_rate": 0.0019697529251770417, + "loss": 0.124, + "step": 8863 + }, + { + "epoch": 0.07694377652971762, + "grad_norm": 0.80859375, + "learning_rate": 0.001969745273594441, + "loss": 0.1455, + "step": 8864 + }, + { + "epoch": 0.07695245700992179, + "grad_norm": 0.423828125, + "learning_rate": 0.0019697376210607, + "loss": 0.1226, + 
"step": 8865 + }, + { + "epoch": 0.07696113749012595, + "grad_norm": 1.5234375, + "learning_rate": 0.001969729967575827, + "loss": 0.1416, + "step": 8866 + }, + { + "epoch": 0.07696981797033012, + "grad_norm": 0.1435546875, + "learning_rate": 0.00196972231313983, + "loss": 0.1182, + "step": 8867 + }, + { + "epoch": 0.07697849845053428, + "grad_norm": 0.3046875, + "learning_rate": 0.001969714657752718, + "loss": 0.0986, + "step": 8868 + }, + { + "epoch": 0.07698717893073845, + "grad_norm": 0.322265625, + "learning_rate": 0.0019697070014144994, + "loss": 0.1377, + "step": 8869 + }, + { + "epoch": 0.07699585941094261, + "grad_norm": 0.08984375, + "learning_rate": 0.001969699344125182, + "loss": 0.1069, + "step": 8870 + }, + { + "epoch": 0.07700453989114678, + "grad_norm": 0.369140625, + "learning_rate": 0.0019696916858847747, + "loss": 0.1328, + "step": 8871 + }, + { + "epoch": 0.07701322037135094, + "grad_norm": 0.3203125, + "learning_rate": 0.0019696840266932854, + "loss": 0.125, + "step": 8872 + }, + { + "epoch": 0.07702190085155511, + "grad_norm": 1.1484375, + "learning_rate": 0.0019696763665507226, + "loss": 0.1758, + "step": 8873 + }, + { + "epoch": 0.07703058133175927, + "grad_norm": 0.27734375, + "learning_rate": 0.001969668705457095, + "loss": 0.1084, + "step": 8874 + }, + { + "epoch": 0.07703926181196344, + "grad_norm": 0.25390625, + "learning_rate": 0.0019696610434124105, + "loss": 0.1152, + "step": 8875 + }, + { + "epoch": 0.0770479422921676, + "grad_norm": 0.416015625, + "learning_rate": 0.0019696533804166782, + "loss": 0.1084, + "step": 8876 + }, + { + "epoch": 0.07705662277237177, + "grad_norm": 0.189453125, + "learning_rate": 0.001969645716469906, + "loss": 0.166, + "step": 8877 + }, + { + "epoch": 0.07706530325257593, + "grad_norm": 0.4296875, + "learning_rate": 0.001969638051572102, + "loss": 0.1191, + "step": 8878 + }, + { + "epoch": 0.0770739837327801, + "grad_norm": 0.1494140625, + "learning_rate": 0.001969630385723275, + "loss": 0.1992, + "step": 8879 + }, + { + "epoch": 0.07708266421298426, + "grad_norm": 0.0849609375, + "learning_rate": 0.0019696227189234332, + "loss": 0.1133, + "step": 8880 + }, + { + "epoch": 0.07709134469318843, + "grad_norm": 0.431640625, + "learning_rate": 0.0019696150511725853, + "loss": 0.1572, + "step": 8881 + }, + { + "epoch": 0.0771000251733926, + "grad_norm": 0.134765625, + "learning_rate": 0.001969607382470739, + "loss": 0.1089, + "step": 8882 + }, + { + "epoch": 0.07710870565359676, + "grad_norm": 0.44140625, + "learning_rate": 0.0019695997128179035, + "loss": 0.1328, + "step": 8883 + }, + { + "epoch": 0.07711738613380092, + "grad_norm": 0.26953125, + "learning_rate": 0.001969592042214087, + "loss": 0.1816, + "step": 8884 + }, + { + "epoch": 0.07712606661400509, + "grad_norm": 0.275390625, + "learning_rate": 0.0019695843706592974, + "loss": 0.124, + "step": 8885 + }, + { + "epoch": 0.07713474709420926, + "grad_norm": 0.146484375, + "learning_rate": 0.0019695766981535434, + "loss": 0.1621, + "step": 8886 + }, + { + "epoch": 0.07714342757441342, + "grad_norm": 0.34375, + "learning_rate": 0.001969569024696833, + "loss": 0.1201, + "step": 8887 + }, + { + "epoch": 0.07715210805461759, + "grad_norm": 0.4296875, + "learning_rate": 0.001969561350289176, + "loss": 0.1216, + "step": 8888 + }, + { + "epoch": 0.07716078853482175, + "grad_norm": 0.0888671875, + "learning_rate": 0.001969553674930579, + "loss": 0.1504, + "step": 8889 + }, + { + "epoch": 0.07716946901502592, + "grad_norm": 0.13671875, + "learning_rate": 0.0019695459986210514, + "loss": 
0.1631, + "step": 8890 + }, + { + "epoch": 0.07717814949523008, + "grad_norm": 0.07421875, + "learning_rate": 0.001969538321360601, + "loss": 0.127, + "step": 8891 + }, + { + "epoch": 0.07718682997543425, + "grad_norm": 0.12353515625, + "learning_rate": 0.001969530643149237, + "loss": 0.1553, + "step": 8892 + }, + { + "epoch": 0.07719551045563841, + "grad_norm": 0.36328125, + "learning_rate": 0.001969522963986967, + "loss": 0.1094, + "step": 8893 + }, + { + "epoch": 0.07720419093584258, + "grad_norm": 1.0, + "learning_rate": 0.0019695152838738, + "loss": 0.1377, + "step": 8894 + }, + { + "epoch": 0.07721287141604674, + "grad_norm": 0.09228515625, + "learning_rate": 0.0019695076028097445, + "loss": 0.1318, + "step": 8895 + }, + { + "epoch": 0.0772215518962509, + "grad_norm": 0.66796875, + "learning_rate": 0.001969499920794808, + "loss": 0.0796, + "step": 8896 + }, + { + "epoch": 0.07723023237645507, + "grad_norm": 0.18359375, + "learning_rate": 0.0019694922378289998, + "loss": 0.0933, + "step": 8897 + }, + { + "epoch": 0.07723891285665924, + "grad_norm": 0.1865234375, + "learning_rate": 0.0019694845539123275, + "loss": 0.1494, + "step": 8898 + }, + { + "epoch": 0.0772475933368634, + "grad_norm": 0.07421875, + "learning_rate": 0.0019694768690448005, + "loss": 0.1089, + "step": 8899 + }, + { + "epoch": 0.07725627381706757, + "grad_norm": 0.609375, + "learning_rate": 0.0019694691832264264, + "loss": 0.168, + "step": 8900 + }, + { + "epoch": 0.07726495429727173, + "grad_norm": 0.1845703125, + "learning_rate": 0.001969461496457214, + "loss": 0.1416, + "step": 8901 + }, + { + "epoch": 0.07727363477747588, + "grad_norm": 0.36328125, + "learning_rate": 0.0019694538087371713, + "loss": 0.1074, + "step": 8902 + }, + { + "epoch": 0.07728231525768005, + "grad_norm": 0.44921875, + "learning_rate": 0.001969446120066307, + "loss": 0.1221, + "step": 8903 + }, + { + "epoch": 0.07729099573788421, + "grad_norm": 0.89453125, + "learning_rate": 0.0019694384304446296, + "loss": 0.1924, + "step": 8904 + }, + { + "epoch": 0.07729967621808838, + "grad_norm": 0.640625, + "learning_rate": 0.0019694307398721474, + "loss": 0.1143, + "step": 8905 + }, + { + "epoch": 0.07730835669829254, + "grad_norm": 0.32421875, + "learning_rate": 0.001969423048348869, + "loss": 0.1406, + "step": 8906 + }, + { + "epoch": 0.07731703717849671, + "grad_norm": 0.3125, + "learning_rate": 0.001969415355874802, + "loss": 0.1426, + "step": 8907 + }, + { + "epoch": 0.07732571765870087, + "grad_norm": 0.51171875, + "learning_rate": 0.001969407662449956, + "loss": 0.1348, + "step": 8908 + }, + { + "epoch": 0.07733439813890504, + "grad_norm": 0.203125, + "learning_rate": 0.0019693999680743387, + "loss": 0.0972, + "step": 8909 + }, + { + "epoch": 0.0773430786191092, + "grad_norm": 0.76171875, + "learning_rate": 0.0019693922727479587, + "loss": 0.1069, + "step": 8910 + }, + { + "epoch": 0.07735175909931337, + "grad_norm": 0.08544921875, + "learning_rate": 0.0019693845764708244, + "loss": 0.1201, + "step": 8911 + }, + { + "epoch": 0.07736043957951753, + "grad_norm": 0.09912109375, + "learning_rate": 0.001969376879242944, + "loss": 0.1543, + "step": 8912 + }, + { + "epoch": 0.0773691200597217, + "grad_norm": 0.345703125, + "learning_rate": 0.001969369181064326, + "loss": 0.0981, + "step": 8913 + }, + { + "epoch": 0.07737780053992586, + "grad_norm": 0.72265625, + "learning_rate": 0.001969361481934979, + "loss": 0.127, + "step": 8914 + }, + { + "epoch": 0.07738648102013003, + "grad_norm": 0.0849609375, + "learning_rate": 0.001969353781854912, + "loss": 
0.1055, + "step": 8915 + }, + { + "epoch": 0.0773951615003342, + "grad_norm": 0.33984375, + "learning_rate": 0.001969346080824132, + "loss": 0.125, + "step": 8916 + }, + { + "epoch": 0.07740384198053836, + "grad_norm": 0.41015625, + "learning_rate": 0.001969338378842649, + "loss": 0.1523, + "step": 8917 + }, + { + "epoch": 0.07741252246074252, + "grad_norm": 0.3203125, + "learning_rate": 0.0019693306759104696, + "loss": 0.1104, + "step": 8918 + }, + { + "epoch": 0.07742120294094669, + "grad_norm": 0.458984375, + "learning_rate": 0.0019693229720276037, + "loss": 0.1133, + "step": 8919 + }, + { + "epoch": 0.07742988342115086, + "grad_norm": 0.353515625, + "learning_rate": 0.001969315267194059, + "loss": 0.1084, + "step": 8920 + }, + { + "epoch": 0.07743856390135502, + "grad_norm": 0.455078125, + "learning_rate": 0.0019693075614098444, + "loss": 0.1582, + "step": 8921 + }, + { + "epoch": 0.07744724438155919, + "grad_norm": 0.49609375, + "learning_rate": 0.0019692998546749683, + "loss": 0.1719, + "step": 8922 + }, + { + "epoch": 0.07745592486176335, + "grad_norm": 0.373046875, + "learning_rate": 0.001969292146989439, + "loss": 0.1309, + "step": 8923 + }, + { + "epoch": 0.07746460534196752, + "grad_norm": 0.2373046875, + "learning_rate": 0.0019692844383532642, + "loss": 0.1289, + "step": 8924 + }, + { + "epoch": 0.07747328582217168, + "grad_norm": 0.69140625, + "learning_rate": 0.0019692767287664537, + "loss": 0.1182, + "step": 8925 + }, + { + "epoch": 0.07748196630237585, + "grad_norm": 0.62890625, + "learning_rate": 0.001969269018229015, + "loss": 0.127, + "step": 8926 + }, + { + "epoch": 0.07749064678258001, + "grad_norm": 0.31640625, + "learning_rate": 0.0019692613067409568, + "loss": 0.1182, + "step": 8927 + }, + { + "epoch": 0.07749932726278418, + "grad_norm": 0.1337890625, + "learning_rate": 0.0019692535943022873, + "loss": 0.166, + "step": 8928 + }, + { + "epoch": 0.07750800774298834, + "grad_norm": 0.3828125, + "learning_rate": 0.0019692458809130154, + "loss": 0.1367, + "step": 8929 + }, + { + "epoch": 0.0775166882231925, + "grad_norm": 0.08642578125, + "learning_rate": 0.0019692381665731493, + "loss": 0.1328, + "step": 8930 + }, + { + "epoch": 0.07752536870339667, + "grad_norm": 0.26953125, + "learning_rate": 0.001969230451282697, + "loss": 0.1035, + "step": 8931 + }, + { + "epoch": 0.07753404918360084, + "grad_norm": 0.38671875, + "learning_rate": 0.001969222735041668, + "loss": 0.1494, + "step": 8932 + }, + { + "epoch": 0.077542729663805, + "grad_norm": 0.078125, + "learning_rate": 0.0019692150178500694, + "loss": 0.1021, + "step": 8933 + }, + { + "epoch": 0.07755141014400917, + "grad_norm": 0.34765625, + "learning_rate": 0.0019692072997079106, + "loss": 0.165, + "step": 8934 + }, + { + "epoch": 0.07756009062421333, + "grad_norm": 0.123046875, + "learning_rate": 0.0019691995806151996, + "loss": 0.1084, + "step": 8935 + }, + { + "epoch": 0.0775687711044175, + "grad_norm": 0.12451171875, + "learning_rate": 0.0019691918605719448, + "loss": 0.1475, + "step": 8936 + }, + { + "epoch": 0.07757745158462166, + "grad_norm": 0.47265625, + "learning_rate": 0.001969184139578155, + "loss": 0.1143, + "step": 8937 + }, + { + "epoch": 0.07758613206482583, + "grad_norm": 0.287109375, + "learning_rate": 0.001969176417633839, + "loss": 0.1338, + "step": 8938 + }, + { + "epoch": 0.07759481254502999, + "grad_norm": 0.2177734375, + "learning_rate": 0.0019691686947390045, + "loss": 0.1992, + "step": 8939 + }, + { + "epoch": 0.07760349302523416, + "grad_norm": 0.255859375, + "learning_rate": 
0.00196916097089366, + "loss": 0.1221, + "step": 8940 + }, + { + "epoch": 0.07761217350543832, + "grad_norm": 0.15234375, + "learning_rate": 0.001969153246097814, + "loss": 0.1455, + "step": 8941 + }, + { + "epoch": 0.07762085398564249, + "grad_norm": 0.435546875, + "learning_rate": 0.0019691455203514753, + "loss": 0.1387, + "step": 8942 + }, + { + "epoch": 0.07762953446584665, + "grad_norm": 0.6015625, + "learning_rate": 0.001969137793654652, + "loss": 0.1309, + "step": 8943 + }, + { + "epoch": 0.07763821494605082, + "grad_norm": 0.1611328125, + "learning_rate": 0.0019691300660073527, + "loss": 0.1895, + "step": 8944 + }, + { + "epoch": 0.07764689542625498, + "grad_norm": 0.69140625, + "learning_rate": 0.0019691223374095857, + "loss": 0.1035, + "step": 8945 + }, + { + "epoch": 0.07765557590645915, + "grad_norm": 0.1640625, + "learning_rate": 0.00196911460786136, + "loss": 0.1309, + "step": 8946 + }, + { + "epoch": 0.07766425638666331, + "grad_norm": 0.11376953125, + "learning_rate": 0.0019691068773626834, + "loss": 0.0981, + "step": 8947 + }, + { + "epoch": 0.07767293686686748, + "grad_norm": 0.142578125, + "learning_rate": 0.0019690991459135644, + "loss": 0.0874, + "step": 8948 + }, + { + "epoch": 0.07768161734707164, + "grad_norm": 0.19921875, + "learning_rate": 0.0019690914135140117, + "loss": 0.1973, + "step": 8949 + }, + { + "epoch": 0.07769029782727581, + "grad_norm": 0.083984375, + "learning_rate": 0.0019690836801640335, + "loss": 0.1104, + "step": 8950 + }, + { + "epoch": 0.07769897830747997, + "grad_norm": 0.51953125, + "learning_rate": 0.001969075945863639, + "loss": 0.127, + "step": 8951 + }, + { + "epoch": 0.07770765878768414, + "grad_norm": 0.15625, + "learning_rate": 0.001969068210612836, + "loss": 0.1113, + "step": 8952 + }, + { + "epoch": 0.0777163392678883, + "grad_norm": 0.83984375, + "learning_rate": 0.0019690604744116327, + "loss": 0.2695, + "step": 8953 + }, + { + "epoch": 0.07772501974809247, + "grad_norm": 0.494140625, + "learning_rate": 0.001969052737260038, + "loss": 0.1309, + "step": 8954 + }, + { + "epoch": 0.07773370022829663, + "grad_norm": 0.279296875, + "learning_rate": 0.00196904499915806, + "loss": 0.166, + "step": 8955 + }, + { + "epoch": 0.0777423807085008, + "grad_norm": 0.1005859375, + "learning_rate": 0.001969037260105708, + "loss": 0.165, + "step": 8956 + }, + { + "epoch": 0.07775106118870496, + "grad_norm": 0.4375, + "learning_rate": 0.00196902952010299, + "loss": 0.127, + "step": 8957 + }, + { + "epoch": 0.07775974166890913, + "grad_norm": 0.1220703125, + "learning_rate": 0.001969021779149914, + "loss": 0.1426, + "step": 8958 + }, + { + "epoch": 0.0777684221491133, + "grad_norm": 0.48828125, + "learning_rate": 0.0019690140372464887, + "loss": 0.1387, + "step": 8959 + }, + { + "epoch": 0.07777710262931746, + "grad_norm": 0.5234375, + "learning_rate": 0.001969006294392723, + "loss": 0.1104, + "step": 8960 + }, + { + "epoch": 0.07778578310952163, + "grad_norm": 0.81640625, + "learning_rate": 0.001968998550588625, + "loss": 0.123, + "step": 8961 + }, + { + "epoch": 0.07779446358972579, + "grad_norm": 0.67578125, + "learning_rate": 0.001968990805834203, + "loss": 0.124, + "step": 8962 + }, + { + "epoch": 0.07780314406992996, + "grad_norm": 0.5546875, + "learning_rate": 0.001968983060129466, + "loss": 0.1104, + "step": 8963 + }, + { + "epoch": 0.0778118245501341, + "grad_norm": 0.3515625, + "learning_rate": 0.001968975313474422, + "loss": 0.1572, + "step": 8964 + }, + { + "epoch": 0.07782050503033827, + "grad_norm": 0.279296875, + "learning_rate": 
0.00196896756586908, + "loss": 0.1953, + "step": 8965 + }, + { + "epoch": 0.07782918551054244, + "grad_norm": 0.115234375, + "learning_rate": 0.001968959817313448, + "loss": 0.1592, + "step": 8966 + }, + { + "epoch": 0.0778378659907466, + "grad_norm": 2.046875, + "learning_rate": 0.0019689520678075347, + "loss": 0.123, + "step": 8967 + }, + { + "epoch": 0.07784654647095077, + "grad_norm": 0.2177734375, + "learning_rate": 0.001968944317351348, + "loss": 0.1211, + "step": 8968 + }, + { + "epoch": 0.07785522695115493, + "grad_norm": 0.11279296875, + "learning_rate": 0.0019689365659448974, + "loss": 0.1426, + "step": 8969 + }, + { + "epoch": 0.0778639074313591, + "grad_norm": 0.08056640625, + "learning_rate": 0.0019689288135881903, + "loss": 0.1123, + "step": 8970 + }, + { + "epoch": 0.07787258791156326, + "grad_norm": 0.66796875, + "learning_rate": 0.0019689210602812364, + "loss": 0.1152, + "step": 8971 + }, + { + "epoch": 0.07788126839176743, + "grad_norm": 0.5859375, + "learning_rate": 0.0019689133060240426, + "loss": 0.105, + "step": 8972 + }, + { + "epoch": 0.07788994887197159, + "grad_norm": 0.12255859375, + "learning_rate": 0.001968905550816619, + "loss": 0.123, + "step": 8973 + }, + { + "epoch": 0.07789862935217576, + "grad_norm": 0.83984375, + "learning_rate": 0.0019688977946589734, + "loss": 0.1484, + "step": 8974 + }, + { + "epoch": 0.07790730983237992, + "grad_norm": 0.65234375, + "learning_rate": 0.0019688900375511133, + "loss": 0.1299, + "step": 8975 + }, + { + "epoch": 0.07791599031258409, + "grad_norm": 0.138671875, + "learning_rate": 0.001968882279493049, + "loss": 0.1045, + "step": 8976 + }, + { + "epoch": 0.07792467079278825, + "grad_norm": 0.4921875, + "learning_rate": 0.0019688745204847874, + "loss": 0.1226, + "step": 8977 + }, + { + "epoch": 0.07793335127299242, + "grad_norm": 0.1357421875, + "learning_rate": 0.001968866760526338, + "loss": 0.1504, + "step": 8978 + }, + { + "epoch": 0.07794203175319658, + "grad_norm": 0.1259765625, + "learning_rate": 0.0019688589996177093, + "loss": 0.1021, + "step": 8979 + }, + { + "epoch": 0.07795071223340075, + "grad_norm": 0.1328125, + "learning_rate": 0.001968851237758909, + "loss": 0.1758, + "step": 8980 + }, + { + "epoch": 0.07795939271360491, + "grad_norm": 0.341796875, + "learning_rate": 0.0019688434749499466, + "loss": 0.1611, + "step": 8981 + }, + { + "epoch": 0.07796807319380908, + "grad_norm": 0.310546875, + "learning_rate": 0.0019688357111908297, + "loss": 0.1172, + "step": 8982 + }, + { + "epoch": 0.07797675367401324, + "grad_norm": 0.2158203125, + "learning_rate": 0.001968827946481567, + "loss": 0.1523, + "step": 8983 + }, + { + "epoch": 0.07798543415421741, + "grad_norm": 0.578125, + "learning_rate": 0.001968820180822167, + "loss": 0.1133, + "step": 8984 + }, + { + "epoch": 0.07799411463442157, + "grad_norm": 0.1884765625, + "learning_rate": 0.0019688124142126385, + "loss": 0.0977, + "step": 8985 + }, + { + "epoch": 0.07800279511462574, + "grad_norm": 0.078125, + "learning_rate": 0.00196880464665299, + "loss": 0.0977, + "step": 8986 + }, + { + "epoch": 0.0780114755948299, + "grad_norm": 0.35546875, + "learning_rate": 0.0019687968781432297, + "loss": 0.1367, + "step": 8987 + }, + { + "epoch": 0.07802015607503407, + "grad_norm": 0.123046875, + "learning_rate": 0.001968789108683366, + "loss": 0.1045, + "step": 8988 + }, + { + "epoch": 0.07802883655523823, + "grad_norm": 0.66015625, + "learning_rate": 0.0019687813382734077, + "loss": 0.1064, + "step": 8989 + }, + { + "epoch": 0.0780375170354424, + "grad_norm": 0.283203125, + 
"learning_rate": 0.001968773566913363, + "loss": 0.1133, + "step": 8990 + }, + { + "epoch": 0.07804619751564656, + "grad_norm": 0.359375, + "learning_rate": 0.0019687657946032408, + "loss": 0.1533, + "step": 8991 + }, + { + "epoch": 0.07805487799585073, + "grad_norm": 0.72265625, + "learning_rate": 0.0019687580213430494, + "loss": 0.1709, + "step": 8992 + }, + { + "epoch": 0.0780635584760549, + "grad_norm": 0.2333984375, + "learning_rate": 0.001968750247132797, + "loss": 0.1289, + "step": 8993 + }, + { + "epoch": 0.07807223895625906, + "grad_norm": 0.16796875, + "learning_rate": 0.0019687424719724927, + "loss": 0.1387, + "step": 8994 + }, + { + "epoch": 0.07808091943646323, + "grad_norm": 0.427734375, + "learning_rate": 0.0019687346958621446, + "loss": 0.167, + "step": 8995 + }, + { + "epoch": 0.07808959991666739, + "grad_norm": 0.1611328125, + "learning_rate": 0.0019687269188017613, + "loss": 0.1318, + "step": 8996 + }, + { + "epoch": 0.07809828039687156, + "grad_norm": 0.2578125, + "learning_rate": 0.001968719140791351, + "loss": 0.1216, + "step": 8997 + }, + { + "epoch": 0.07810696087707572, + "grad_norm": 0.22265625, + "learning_rate": 0.001968711361830923, + "loss": 0.1074, + "step": 8998 + }, + { + "epoch": 0.07811564135727989, + "grad_norm": 0.56640625, + "learning_rate": 0.001968703581920485, + "loss": 0.1201, + "step": 8999 + }, + { + "epoch": 0.07812432183748405, + "grad_norm": 0.07666015625, + "learning_rate": 0.001968695801060046, + "loss": 0.1094, + "step": 9000 + }, + { + "epoch": 0.07813300231768822, + "grad_norm": 0.375, + "learning_rate": 0.0019686880192496137, + "loss": 0.1113, + "step": 9001 + }, + { + "epoch": 0.07814168279789238, + "grad_norm": 0.09130859375, + "learning_rate": 0.001968680236489198, + "loss": 0.1104, + "step": 9002 + }, + { + "epoch": 0.07815036327809655, + "grad_norm": 0.1064453125, + "learning_rate": 0.001968672452778806, + "loss": 0.1348, + "step": 9003 + }, + { + "epoch": 0.07815904375830071, + "grad_norm": 0.41015625, + "learning_rate": 0.0019686646681184475, + "loss": 0.1543, + "step": 9004 + }, + { + "epoch": 0.07816772423850488, + "grad_norm": 0.17578125, + "learning_rate": 0.00196865688250813, + "loss": 0.1367, + "step": 9005 + }, + { + "epoch": 0.07817640471870904, + "grad_norm": 0.1279296875, + "learning_rate": 0.0019686490959478623, + "loss": 0.1211, + "step": 9006 + }, + { + "epoch": 0.0781850851989132, + "grad_norm": 0.91796875, + "learning_rate": 0.0019686413084376536, + "loss": 0.1289, + "step": 9007 + }, + { + "epoch": 0.07819376567911737, + "grad_norm": 0.32421875, + "learning_rate": 0.001968633519977511, + "loss": 0.1279, + "step": 9008 + }, + { + "epoch": 0.07820244615932154, + "grad_norm": 0.95703125, + "learning_rate": 0.0019686257305674445, + "loss": 0.2031, + "step": 9009 + }, + { + "epoch": 0.0782111266395257, + "grad_norm": 0.33203125, + "learning_rate": 0.0019686179402074616, + "loss": 0.166, + "step": 9010 + }, + { + "epoch": 0.07821980711972987, + "grad_norm": 0.6484375, + "learning_rate": 0.0019686101488975713, + "loss": 0.1621, + "step": 9011 + }, + { + "epoch": 0.07822848759993403, + "grad_norm": 0.546875, + "learning_rate": 0.001968602356637782, + "loss": 0.1338, + "step": 9012 + }, + { + "epoch": 0.0782371680801382, + "grad_norm": 0.5234375, + "learning_rate": 0.001968594563428102, + "loss": 0.1211, + "step": 9013 + }, + { + "epoch": 0.07824584856034236, + "grad_norm": 0.73828125, + "learning_rate": 0.0019685867692685404, + "loss": 0.127, + "step": 9014 + }, + { + "epoch": 0.07825452904054653, + "grad_norm": 1.078125, 
+ "learning_rate": 0.001968578974159105, + "loss": 0.1465, + "step": 9015 + }, + { + "epoch": 0.07826320952075069, + "grad_norm": 0.1015625, + "learning_rate": 0.001968571178099805, + "loss": 0.1221, + "step": 9016 + }, + { + "epoch": 0.07827189000095486, + "grad_norm": 0.5, + "learning_rate": 0.001968563381090648, + "loss": 0.165, + "step": 9017 + }, + { + "epoch": 0.07828057048115902, + "grad_norm": 1.234375, + "learning_rate": 0.0019685555831316435, + "loss": 0.127, + "step": 9018 + }, + { + "epoch": 0.07828925096136319, + "grad_norm": 0.34375, + "learning_rate": 0.0019685477842228, + "loss": 0.1504, + "step": 9019 + }, + { + "epoch": 0.07829793144156735, + "grad_norm": 0.8046875, + "learning_rate": 0.0019685399843641254, + "loss": 0.1396, + "step": 9020 + }, + { + "epoch": 0.07830661192177152, + "grad_norm": 0.94921875, + "learning_rate": 0.001968532183555629, + "loss": 0.1348, + "step": 9021 + }, + { + "epoch": 0.07831529240197568, + "grad_norm": 1.796875, + "learning_rate": 0.001968524381797318, + "loss": 0.1641, + "step": 9022 + }, + { + "epoch": 0.07832397288217985, + "grad_norm": 0.1259765625, + "learning_rate": 0.001968516579089202, + "loss": 0.166, + "step": 9023 + }, + { + "epoch": 0.07833265336238401, + "grad_norm": 0.0791015625, + "learning_rate": 0.00196850877543129, + "loss": 0.1211, + "step": 9024 + }, + { + "epoch": 0.07834133384258818, + "grad_norm": 0.6484375, + "learning_rate": 0.001968500970823589, + "loss": 0.1729, + "step": 9025 + }, + { + "epoch": 0.07835001432279233, + "grad_norm": 0.45703125, + "learning_rate": 0.001968493165266109, + "loss": 0.1338, + "step": 9026 + }, + { + "epoch": 0.0783586948029965, + "grad_norm": 0.17578125, + "learning_rate": 0.0019684853587588575, + "loss": 0.125, + "step": 9027 + }, + { + "epoch": 0.07836737528320066, + "grad_norm": 0.68359375, + "learning_rate": 0.0019684775513018437, + "loss": 0.1426, + "step": 9028 + }, + { + "epoch": 0.07837605576340483, + "grad_norm": 0.146484375, + "learning_rate": 0.001968469742895076, + "loss": 0.1221, + "step": 9029 + }, + { + "epoch": 0.07838473624360899, + "grad_norm": 0.29296875, + "learning_rate": 0.0019684619335385623, + "loss": 0.1211, + "step": 9030 + }, + { + "epoch": 0.07839341672381316, + "grad_norm": 0.296875, + "learning_rate": 0.0019684541232323125, + "loss": 0.1064, + "step": 9031 + }, + { + "epoch": 0.07840209720401732, + "grad_norm": 0.404296875, + "learning_rate": 0.0019684463119763337, + "loss": 0.1367, + "step": 9032 + }, + { + "epoch": 0.07841077768422149, + "grad_norm": 0.439453125, + "learning_rate": 0.0019684384997706353, + "loss": 0.1455, + "step": 9033 + }, + { + "epoch": 0.07841945816442565, + "grad_norm": 0.2333984375, + "learning_rate": 0.0019684306866152254, + "loss": 0.1582, + "step": 9034 + }, + { + "epoch": 0.07842813864462982, + "grad_norm": 0.1650390625, + "learning_rate": 0.001968422872510113, + "loss": 0.1162, + "step": 9035 + }, + { + "epoch": 0.07843681912483398, + "grad_norm": 0.06787109375, + "learning_rate": 0.0019684150574553064, + "loss": 0.1055, + "step": 9036 + }, + { + "epoch": 0.07844549960503815, + "grad_norm": 0.404296875, + "learning_rate": 0.001968407241450814, + "loss": 0.127, + "step": 9037 + }, + { + "epoch": 0.07845418008524231, + "grad_norm": 0.515625, + "learning_rate": 0.0019683994244966444, + "loss": 0.2139, + "step": 9038 + }, + { + "epoch": 0.07846286056544648, + "grad_norm": 0.291015625, + "learning_rate": 0.0019683916065928063, + "loss": 0.1885, + "step": 9039 + }, + { + "epoch": 0.07847154104565064, + "grad_norm": 0.125, + 
"learning_rate": 0.001968383787739308, + "loss": 0.1191, + "step": 9040 + }, + { + "epoch": 0.0784802215258548, + "grad_norm": 0.255859375, + "learning_rate": 0.001968375967936159, + "loss": 0.1162, + "step": 9041 + }, + { + "epoch": 0.07848890200605897, + "grad_norm": 0.21484375, + "learning_rate": 0.001968368147183366, + "loss": 0.1289, + "step": 9042 + }, + { + "epoch": 0.07849758248626314, + "grad_norm": 0.197265625, + "learning_rate": 0.0019683603254809397, + "loss": 0.1055, + "step": 9043 + }, + { + "epoch": 0.0785062629664673, + "grad_norm": 0.2119140625, + "learning_rate": 0.001968352502828887, + "loss": 0.1172, + "step": 9044 + }, + { + "epoch": 0.07851494344667147, + "grad_norm": 0.1435546875, + "learning_rate": 0.001968344679227217, + "loss": 0.1387, + "step": 9045 + }, + { + "epoch": 0.07852362392687563, + "grad_norm": 0.0859375, + "learning_rate": 0.0019683368546759383, + "loss": 0.1123, + "step": 9046 + }, + { + "epoch": 0.0785323044070798, + "grad_norm": 0.34375, + "learning_rate": 0.00196832902917506, + "loss": 0.1182, + "step": 9047 + }, + { + "epoch": 0.07854098488728396, + "grad_norm": 0.76953125, + "learning_rate": 0.001968321202724589, + "loss": 0.1143, + "step": 9048 + }, + { + "epoch": 0.07854966536748813, + "grad_norm": 0.10986328125, + "learning_rate": 0.001968313375324536, + "loss": 0.1357, + "step": 9049 + }, + { + "epoch": 0.07855834584769229, + "grad_norm": 0.7265625, + "learning_rate": 0.0019683055469749086, + "loss": 0.1143, + "step": 9050 + }, + { + "epoch": 0.07856702632789646, + "grad_norm": 0.08935546875, + "learning_rate": 0.001968297717675715, + "loss": 0.0986, + "step": 9051 + }, + { + "epoch": 0.07857570680810062, + "grad_norm": 0.3046875, + "learning_rate": 0.0019682898874269637, + "loss": 0.1445, + "step": 9052 + }, + { + "epoch": 0.07858438728830479, + "grad_norm": 0.107421875, + "learning_rate": 0.001968282056228664, + "loss": 0.1514, + "step": 9053 + }, + { + "epoch": 0.07859306776850895, + "grad_norm": 0.515625, + "learning_rate": 0.001968274224080824, + "loss": 0.123, + "step": 9054 + }, + { + "epoch": 0.07860174824871312, + "grad_norm": 0.2001953125, + "learning_rate": 0.0019682663909834524, + "loss": 0.1201, + "step": 9055 + }, + { + "epoch": 0.07861042872891728, + "grad_norm": 0.373046875, + "learning_rate": 0.0019682585569365577, + "loss": 0.0957, + "step": 9056 + }, + { + "epoch": 0.07861910920912145, + "grad_norm": 0.2265625, + "learning_rate": 0.0019682507219401487, + "loss": 0.1602, + "step": 9057 + }, + { + "epoch": 0.07862778968932561, + "grad_norm": 0.11474609375, + "learning_rate": 0.0019682428859942335, + "loss": 0.1113, + "step": 9058 + }, + { + "epoch": 0.07863647016952978, + "grad_norm": 0.28125, + "learning_rate": 0.001968235049098821, + "loss": 0.1279, + "step": 9059 + }, + { + "epoch": 0.07864515064973394, + "grad_norm": 0.236328125, + "learning_rate": 0.0019682272112539196, + "loss": 0.0908, + "step": 9060 + }, + { + "epoch": 0.07865383112993811, + "grad_norm": 0.31640625, + "learning_rate": 0.0019682193724595383, + "loss": 0.1289, + "step": 9061 + }, + { + "epoch": 0.07866251161014227, + "grad_norm": 0.07666015625, + "learning_rate": 0.001968211532715685, + "loss": 0.1328, + "step": 9062 + }, + { + "epoch": 0.07867119209034644, + "grad_norm": 1.3828125, + "learning_rate": 0.0019682036920223686, + "loss": 0.1104, + "step": 9063 + }, + { + "epoch": 0.0786798725705506, + "grad_norm": 0.228515625, + "learning_rate": 0.0019681958503795977, + "loss": 0.1338, + "step": 9064 + }, + { + "epoch": 0.07868855305075477, + "grad_norm": 
0.33203125, + "learning_rate": 0.001968188007787381, + "loss": 0.1201, + "step": 9065 + }, + { + "epoch": 0.07869723353095893, + "grad_norm": 0.337890625, + "learning_rate": 0.0019681801642457268, + "loss": 0.1118, + "step": 9066 + }, + { + "epoch": 0.0787059140111631, + "grad_norm": 0.103515625, + "learning_rate": 0.001968172319754644, + "loss": 0.1187, + "step": 9067 + }, + { + "epoch": 0.07871459449136727, + "grad_norm": 0.12158203125, + "learning_rate": 0.001968164474314141, + "loss": 0.1104, + "step": 9068 + }, + { + "epoch": 0.07872327497157143, + "grad_norm": 0.58984375, + "learning_rate": 0.001968156627924226, + "loss": 0.1621, + "step": 9069 + }, + { + "epoch": 0.0787319554517756, + "grad_norm": 0.1220703125, + "learning_rate": 0.001968148780584908, + "loss": 0.1289, + "step": 9070 + }, + { + "epoch": 0.07874063593197976, + "grad_norm": 0.166015625, + "learning_rate": 0.001968140932296196, + "loss": 0.1738, + "step": 9071 + }, + { + "epoch": 0.07874931641218393, + "grad_norm": 0.421875, + "learning_rate": 0.001968133083058098, + "loss": 0.1074, + "step": 9072 + }, + { + "epoch": 0.07875799689238809, + "grad_norm": 0.640625, + "learning_rate": 0.0019681252328706228, + "loss": 0.1235, + "step": 9073 + }, + { + "epoch": 0.07876667737259226, + "grad_norm": 0.44921875, + "learning_rate": 0.0019681173817337784, + "loss": 0.1191, + "step": 9074 + }, + { + "epoch": 0.07877535785279642, + "grad_norm": 0.73046875, + "learning_rate": 0.001968109529647574, + "loss": 0.123, + "step": 9075 + }, + { + "epoch": 0.07878403833300059, + "grad_norm": 0.0849609375, + "learning_rate": 0.0019681016766120185, + "loss": 0.1328, + "step": 9076 + }, + { + "epoch": 0.07879271881320475, + "grad_norm": 0.1875, + "learning_rate": 0.00196809382262712, + "loss": 0.1572, + "step": 9077 + }, + { + "epoch": 0.07880139929340892, + "grad_norm": 0.0927734375, + "learning_rate": 0.001968085967692887, + "loss": 0.1318, + "step": 9078 + }, + { + "epoch": 0.07881007977361308, + "grad_norm": 0.2294921875, + "learning_rate": 0.0019680781118093277, + "loss": 0.166, + "step": 9079 + }, + { + "epoch": 0.07881876025381725, + "grad_norm": 0.470703125, + "learning_rate": 0.0019680702549764516, + "loss": 0.207, + "step": 9080 + }, + { + "epoch": 0.07882744073402141, + "grad_norm": 0.4453125, + "learning_rate": 0.0019680623971942666, + "loss": 0.1118, + "step": 9081 + }, + { + "epoch": 0.07883612121422558, + "grad_norm": 0.26953125, + "learning_rate": 0.001968054538462782, + "loss": 0.1357, + "step": 9082 + }, + { + "epoch": 0.07884480169442974, + "grad_norm": 0.67578125, + "learning_rate": 0.0019680466787820065, + "loss": 0.1357, + "step": 9083 + }, + { + "epoch": 0.07885348217463391, + "grad_norm": 0.6328125, + "learning_rate": 0.0019680388181519473, + "loss": 0.1895, + "step": 9084 + }, + { + "epoch": 0.07886216265483807, + "grad_norm": 0.65234375, + "learning_rate": 0.0019680309565726145, + "loss": 0.1079, + "step": 9085 + }, + { + "epoch": 0.07887084313504224, + "grad_norm": 0.1279296875, + "learning_rate": 0.001968023094044016, + "loss": 0.104, + "step": 9086 + }, + { + "epoch": 0.07887952361524639, + "grad_norm": 0.251953125, + "learning_rate": 0.00196801523056616, + "loss": 0.0859, + "step": 9087 + }, + { + "epoch": 0.07888820409545055, + "grad_norm": 0.416015625, + "learning_rate": 0.001968007366139056, + "loss": 0.1846, + "step": 9088 + }, + { + "epoch": 0.07889688457565472, + "grad_norm": 0.1396484375, + "learning_rate": 0.001967999500762712, + "loss": 0.1602, + "step": 9089 + }, + { + "epoch": 0.07890556505585888, + 
"grad_norm": 0.169921875, + "learning_rate": 0.001967991634437137, + "loss": 0.1426, + "step": 9090 + }, + { + "epoch": 0.07891424553606305, + "grad_norm": 1.3515625, + "learning_rate": 0.0019679837671623393, + "loss": 0.1885, + "step": 9091 + }, + { + "epoch": 0.07892292601626721, + "grad_norm": 0.2080078125, + "learning_rate": 0.0019679758989383273, + "loss": 0.085, + "step": 9092 + }, + { + "epoch": 0.07893160649647138, + "grad_norm": 0.5546875, + "learning_rate": 0.00196796802976511, + "loss": 0.1328, + "step": 9093 + }, + { + "epoch": 0.07894028697667554, + "grad_norm": 0.09033203125, + "learning_rate": 0.001967960159642696, + "loss": 0.1348, + "step": 9094 + }, + { + "epoch": 0.07894896745687971, + "grad_norm": 0.12109375, + "learning_rate": 0.001967952288571094, + "loss": 0.1201, + "step": 9095 + }, + { + "epoch": 0.07895764793708387, + "grad_norm": 0.58984375, + "learning_rate": 0.0019679444165503126, + "loss": 0.1406, + "step": 9096 + }, + { + "epoch": 0.07896632841728804, + "grad_norm": 0.65234375, + "learning_rate": 0.00196793654358036, + "loss": 0.1348, + "step": 9097 + }, + { + "epoch": 0.0789750088974922, + "grad_norm": 0.9765625, + "learning_rate": 0.0019679286696612447, + "loss": 0.1328, + "step": 9098 + }, + { + "epoch": 0.07898368937769637, + "grad_norm": 0.197265625, + "learning_rate": 0.001967920794792976, + "loss": 0.1143, + "step": 9099 + }, + { + "epoch": 0.07899236985790054, + "grad_norm": 0.7265625, + "learning_rate": 0.001967912918975562, + "loss": 0.1357, + "step": 9100 + }, + { + "epoch": 0.0790010503381047, + "grad_norm": 0.427734375, + "learning_rate": 0.0019679050422090113, + "loss": 0.1016, + "step": 9101 + }, + { + "epoch": 0.07900973081830887, + "grad_norm": 0.5625, + "learning_rate": 0.001967897164493333, + "loss": 0.1455, + "step": 9102 + }, + { + "epoch": 0.07901841129851303, + "grad_norm": 0.158203125, + "learning_rate": 0.0019678892858285358, + "loss": 0.1016, + "step": 9103 + }, + { + "epoch": 0.0790270917787172, + "grad_norm": 0.55078125, + "learning_rate": 0.001967881406214627, + "loss": 0.1143, + "step": 9104 + }, + { + "epoch": 0.07903577225892136, + "grad_norm": 0.10400390625, + "learning_rate": 0.001967873525651617, + "loss": 0.1377, + "step": 9105 + }, + { + "epoch": 0.07904445273912553, + "grad_norm": 0.197265625, + "learning_rate": 0.001967865644139513, + "loss": 0.0825, + "step": 9106 + }, + { + "epoch": 0.07905313321932969, + "grad_norm": 0.62109375, + "learning_rate": 0.001967857761678324, + "loss": 0.2158, + "step": 9107 + }, + { + "epoch": 0.07906181369953386, + "grad_norm": 0.3046875, + "learning_rate": 0.0019678498782680592, + "loss": 0.1123, + "step": 9108 + }, + { + "epoch": 0.07907049417973802, + "grad_norm": 0.349609375, + "learning_rate": 0.0019678419939087267, + "loss": 0.1113, + "step": 9109 + }, + { + "epoch": 0.07907917465994219, + "grad_norm": 0.181640625, + "learning_rate": 0.0019678341086003352, + "loss": 0.1484, + "step": 9110 + }, + { + "epoch": 0.07908785514014635, + "grad_norm": 0.26171875, + "learning_rate": 0.0019678262223428934, + "loss": 0.1406, + "step": 9111 + }, + { + "epoch": 0.07909653562035052, + "grad_norm": 1.796875, + "learning_rate": 0.00196781833513641, + "loss": 0.1357, + "step": 9112 + }, + { + "epoch": 0.07910521610055468, + "grad_norm": 0.4453125, + "learning_rate": 0.0019678104469808932, + "loss": 0.1143, + "step": 9113 + }, + { + "epoch": 0.07911389658075885, + "grad_norm": 0.609375, + "learning_rate": 0.001967802557876352, + "loss": 0.1191, + "step": 9114 + }, + { + "epoch": 0.07912257706096301, + 
"grad_norm": 0.369140625, + "learning_rate": 0.0019677946678227955, + "loss": 0.1079, + "step": 9115 + }, + { + "epoch": 0.07913125754116718, + "grad_norm": 0.76171875, + "learning_rate": 0.001967786776820231, + "loss": 0.1484, + "step": 9116 + }, + { + "epoch": 0.07913993802137134, + "grad_norm": 0.486328125, + "learning_rate": 0.0019677788848686682, + "loss": 0.123, + "step": 9117 + }, + { + "epoch": 0.07914861850157551, + "grad_norm": 0.1220703125, + "learning_rate": 0.001967770991968116, + "loss": 0.1235, + "step": 9118 + }, + { + "epoch": 0.07915729898177967, + "grad_norm": 0.1591796875, + "learning_rate": 0.001967763098118582, + "loss": 0.0972, + "step": 9119 + }, + { + "epoch": 0.07916597946198384, + "grad_norm": 0.1533203125, + "learning_rate": 0.001967755203320075, + "loss": 0.123, + "step": 9120 + }, + { + "epoch": 0.079174659942188, + "grad_norm": 0.1767578125, + "learning_rate": 0.0019677473075726043, + "loss": 0.123, + "step": 9121 + }, + { + "epoch": 0.07918334042239217, + "grad_norm": 0.33984375, + "learning_rate": 0.001967739410876178, + "loss": 0.1768, + "step": 9122 + }, + { + "epoch": 0.07919202090259633, + "grad_norm": 0.328125, + "learning_rate": 0.0019677315132308054, + "loss": 0.1318, + "step": 9123 + }, + { + "epoch": 0.0792007013828005, + "grad_norm": 0.15625, + "learning_rate": 0.0019677236146364943, + "loss": 0.1533, + "step": 9124 + }, + { + "epoch": 0.07920938186300466, + "grad_norm": 0.203125, + "learning_rate": 0.001967715715093254, + "loss": 0.1758, + "step": 9125 + }, + { + "epoch": 0.07921806234320883, + "grad_norm": 0.12353515625, + "learning_rate": 0.0019677078146010923, + "loss": 0.1074, + "step": 9126 + }, + { + "epoch": 0.079226742823413, + "grad_norm": 0.2177734375, + "learning_rate": 0.001967699913160019, + "loss": 0.1396, + "step": 9127 + }, + { + "epoch": 0.07923542330361716, + "grad_norm": 0.26171875, + "learning_rate": 0.0019676920107700416, + "loss": 0.0894, + "step": 9128 + }, + { + "epoch": 0.07924410378382132, + "grad_norm": 0.57421875, + "learning_rate": 0.001967684107431169, + "loss": 0.1045, + "step": 9129 + }, + { + "epoch": 0.07925278426402549, + "grad_norm": 0.1611328125, + "learning_rate": 0.0019676762031434107, + "loss": 0.1367, + "step": 9130 + }, + { + "epoch": 0.07926146474422965, + "grad_norm": 0.41796875, + "learning_rate": 0.0019676682979067744, + "loss": 0.1357, + "step": 9131 + }, + { + "epoch": 0.07927014522443382, + "grad_norm": 0.46875, + "learning_rate": 0.0019676603917212696, + "loss": 0.1094, + "step": 9132 + }, + { + "epoch": 0.07927882570463798, + "grad_norm": 0.0888671875, + "learning_rate": 0.001967652484586904, + "loss": 0.1045, + "step": 9133 + }, + { + "epoch": 0.07928750618484215, + "grad_norm": 0.091796875, + "learning_rate": 0.0019676445765036865, + "loss": 0.1152, + "step": 9134 + }, + { + "epoch": 0.07929618666504631, + "grad_norm": 0.546875, + "learning_rate": 0.001967636667471626, + "loss": 0.1338, + "step": 9135 + }, + { + "epoch": 0.07930486714525048, + "grad_norm": 0.09619140625, + "learning_rate": 0.0019676287574907315, + "loss": 0.1211, + "step": 9136 + }, + { + "epoch": 0.07931354762545464, + "grad_norm": 0.60546875, + "learning_rate": 0.0019676208465610107, + "loss": 0.1621, + "step": 9137 + }, + { + "epoch": 0.07932222810565881, + "grad_norm": 0.384765625, + "learning_rate": 0.001967612934682473, + "loss": 0.1523, + "step": 9138 + }, + { + "epoch": 0.07933090858586297, + "grad_norm": 0.427734375, + "learning_rate": 0.001967605021855127, + "loss": 0.1484, + "step": 9139 + }, + { + "epoch": 
0.07933958906606714, + "grad_norm": 0.359375, + "learning_rate": 0.0019675971080789813, + "loss": 0.1279, + "step": 9140 + }, + { + "epoch": 0.0793482695462713, + "grad_norm": 0.2451171875, + "learning_rate": 0.0019675891933540438, + "loss": 0.124, + "step": 9141 + }, + { + "epoch": 0.07935695002647547, + "grad_norm": 0.134765625, + "learning_rate": 0.0019675812776803244, + "loss": 0.1104, + "step": 9142 + }, + { + "epoch": 0.07936563050667964, + "grad_norm": 0.3203125, + "learning_rate": 0.001967573361057831, + "loss": 0.1108, + "step": 9143 + }, + { + "epoch": 0.0793743109868838, + "grad_norm": 0.59375, + "learning_rate": 0.0019675654434865722, + "loss": 0.1602, + "step": 9144 + }, + { + "epoch": 0.07938299146708797, + "grad_norm": 0.1416015625, + "learning_rate": 0.001967557524966557, + "loss": 0.1221, + "step": 9145 + }, + { + "epoch": 0.07939167194729213, + "grad_norm": 0.203125, + "learning_rate": 0.001967549605497794, + "loss": 0.1279, + "step": 9146 + }, + { + "epoch": 0.0794003524274963, + "grad_norm": 0.58203125, + "learning_rate": 0.0019675416850802917, + "loss": 0.1221, + "step": 9147 + }, + { + "epoch": 0.07940903290770046, + "grad_norm": 0.248046875, + "learning_rate": 0.001967533763714059, + "loss": 0.1348, + "step": 9148 + }, + { + "epoch": 0.07941771338790461, + "grad_norm": 0.279296875, + "learning_rate": 0.0019675258413991044, + "loss": 0.1162, + "step": 9149 + }, + { + "epoch": 0.07942639386810878, + "grad_norm": 0.10986328125, + "learning_rate": 0.0019675179181354364, + "loss": 0.1494, + "step": 9150 + }, + { + "epoch": 0.07943507434831294, + "grad_norm": 0.08984375, + "learning_rate": 0.0019675099939230637, + "loss": 0.1289, + "step": 9151 + }, + { + "epoch": 0.07944375482851711, + "grad_norm": 0.2099609375, + "learning_rate": 0.0019675020687619955, + "loss": 0.1025, + "step": 9152 + }, + { + "epoch": 0.07945243530872127, + "grad_norm": 1.0703125, + "learning_rate": 0.0019674941426522396, + "loss": 0.103, + "step": 9153 + }, + { + "epoch": 0.07946111578892544, + "grad_norm": 0.65625, + "learning_rate": 0.0019674862155938055, + "loss": 0.1069, + "step": 9154 + }, + { + "epoch": 0.0794697962691296, + "grad_norm": 0.32421875, + "learning_rate": 0.0019674782875867013, + "loss": 0.1221, + "step": 9155 + }, + { + "epoch": 0.07947847674933377, + "grad_norm": 0.38671875, + "learning_rate": 0.001967470358630936, + "loss": 0.127, + "step": 9156 + }, + { + "epoch": 0.07948715722953793, + "grad_norm": 0.2197265625, + "learning_rate": 0.001967462428726518, + "loss": 0.1602, + "step": 9157 + }, + { + "epoch": 0.0794958377097421, + "grad_norm": 0.0869140625, + "learning_rate": 0.001967454497873457, + "loss": 0.123, + "step": 9158 + }, + { + "epoch": 0.07950451818994626, + "grad_norm": 0.181640625, + "learning_rate": 0.0019674465660717596, + "loss": 0.1187, + "step": 9159 + }, + { + "epoch": 0.07951319867015043, + "grad_norm": 0.057861328125, + "learning_rate": 0.0019674386333214363, + "loss": 0.105, + "step": 9160 + }, + { + "epoch": 0.0795218791503546, + "grad_norm": 0.578125, + "learning_rate": 0.001967430699622495, + "loss": 0.1187, + "step": 9161 + }, + { + "epoch": 0.07953055963055876, + "grad_norm": 0.326171875, + "learning_rate": 0.0019674227649749444, + "loss": 0.1216, + "step": 9162 + }, + { + "epoch": 0.07953924011076292, + "grad_norm": 0.359375, + "learning_rate": 0.0019674148293787933, + "loss": 0.1465, + "step": 9163 + }, + { + "epoch": 0.07954792059096709, + "grad_norm": 0.4296875, + "learning_rate": 0.0019674068928340507, + "loss": 0.1348, + "step": 9164 + }, + { + 
"epoch": 0.07955660107117125, + "grad_norm": 0.1396484375, + "learning_rate": 0.0019673989553407245, + "loss": 0.1377, + "step": 9165 + }, + { + "epoch": 0.07956528155137542, + "grad_norm": 0.32421875, + "learning_rate": 0.001967391016898824, + "loss": 0.1045, + "step": 9166 + }, + { + "epoch": 0.07957396203157958, + "grad_norm": 0.11376953125, + "learning_rate": 0.001967383077508358, + "loss": 0.1377, + "step": 9167 + }, + { + "epoch": 0.07958264251178375, + "grad_norm": 0.298828125, + "learning_rate": 0.0019673751371693344, + "loss": 0.0889, + "step": 9168 + }, + { + "epoch": 0.07959132299198791, + "grad_norm": 0.404296875, + "learning_rate": 0.001967367195881763, + "loss": 0.1152, + "step": 9169 + }, + { + "epoch": 0.07960000347219208, + "grad_norm": 0.11962890625, + "learning_rate": 0.0019673592536456515, + "loss": 0.1328, + "step": 9170 + }, + { + "epoch": 0.07960868395239624, + "grad_norm": 0.0791015625, + "learning_rate": 0.001967351310461009, + "loss": 0.0747, + "step": 9171 + }, + { + "epoch": 0.07961736443260041, + "grad_norm": 0.2392578125, + "learning_rate": 0.001967343366327844, + "loss": 0.1221, + "step": 9172 + }, + { + "epoch": 0.07962604491280457, + "grad_norm": 0.1572265625, + "learning_rate": 0.0019673354212461653, + "loss": 0.0918, + "step": 9173 + }, + { + "epoch": 0.07963472539300874, + "grad_norm": 0.10986328125, + "learning_rate": 0.0019673274752159816, + "loss": 0.1416, + "step": 9174 + }, + { + "epoch": 0.0796434058732129, + "grad_norm": 1.3046875, + "learning_rate": 0.001967319528237302, + "loss": 0.5742, + "step": 9175 + }, + { + "epoch": 0.07965208635341707, + "grad_norm": 0.1376953125, + "learning_rate": 0.0019673115803101344, + "loss": 0.1523, + "step": 9176 + }, + { + "epoch": 0.07966076683362124, + "grad_norm": 0.4453125, + "learning_rate": 0.001967303631434488, + "loss": 0.1226, + "step": 9177 + }, + { + "epoch": 0.0796694473138254, + "grad_norm": 0.6015625, + "learning_rate": 0.0019672956816103714, + "loss": 0.1143, + "step": 9178 + }, + { + "epoch": 0.07967812779402957, + "grad_norm": 0.0751953125, + "learning_rate": 0.0019672877308377934, + "loss": 0.1221, + "step": 9179 + }, + { + "epoch": 0.07968680827423373, + "grad_norm": 0.1396484375, + "learning_rate": 0.0019672797791167626, + "loss": 0.1025, + "step": 9180 + }, + { + "epoch": 0.0796954887544379, + "grad_norm": 0.44921875, + "learning_rate": 0.0019672718264472874, + "loss": 0.2031, + "step": 9181 + }, + { + "epoch": 0.07970416923464206, + "grad_norm": 0.0966796875, + "learning_rate": 0.001967263872829377, + "loss": 0.126, + "step": 9182 + }, + { + "epoch": 0.07971284971484623, + "grad_norm": 0.1953125, + "learning_rate": 0.00196725591826304, + "loss": 0.1074, + "step": 9183 + }, + { + "epoch": 0.07972153019505039, + "grad_norm": 0.6875, + "learning_rate": 0.0019672479627482844, + "loss": 0.1133, + "step": 9184 + }, + { + "epoch": 0.07973021067525456, + "grad_norm": 0.55859375, + "learning_rate": 0.00196724000628512, + "loss": 0.1299, + "step": 9185 + }, + { + "epoch": 0.07973889115545872, + "grad_norm": 0.09033203125, + "learning_rate": 0.0019672320488735543, + "loss": 0.1367, + "step": 9186 + }, + { + "epoch": 0.07974757163566289, + "grad_norm": 0.42578125, + "learning_rate": 0.0019672240905135975, + "loss": 0.1338, + "step": 9187 + }, + { + "epoch": 0.07975625211586705, + "grad_norm": 0.2158203125, + "learning_rate": 0.001967216131205257, + "loss": 0.1113, + "step": 9188 + }, + { + "epoch": 0.07976493259607122, + "grad_norm": 0.49609375, + "learning_rate": 0.001967208170948542, + "loss": 0.1289, 
+ "step": 9189 + }, + { + "epoch": 0.07977361307627538, + "grad_norm": 0.376953125, + "learning_rate": 0.001967200209743461, + "loss": 0.1553, + "step": 9190 + }, + { + "epoch": 0.07978229355647955, + "grad_norm": 0.306640625, + "learning_rate": 0.001967192247590023, + "loss": 0.1309, + "step": 9191 + }, + { + "epoch": 0.07979097403668371, + "grad_norm": 0.41796875, + "learning_rate": 0.0019671842844882366, + "loss": 0.1855, + "step": 9192 + }, + { + "epoch": 0.07979965451688788, + "grad_norm": 0.09521484375, + "learning_rate": 0.0019671763204381107, + "loss": 0.1221, + "step": 9193 + }, + { + "epoch": 0.07980833499709204, + "grad_norm": 0.2119140625, + "learning_rate": 0.001967168355439654, + "loss": 0.123, + "step": 9194 + }, + { + "epoch": 0.07981701547729621, + "grad_norm": 0.11181640625, + "learning_rate": 0.001967160389492874, + "loss": 0.1426, + "step": 9195 + }, + { + "epoch": 0.07982569595750037, + "grad_norm": 0.57421875, + "learning_rate": 0.0019671524225977816, + "loss": 0.1309, + "step": 9196 + }, + { + "epoch": 0.07983437643770454, + "grad_norm": 0.16015625, + "learning_rate": 0.0019671444547543836, + "loss": 0.1104, + "step": 9197 + }, + { + "epoch": 0.0798430569179087, + "grad_norm": 0.67578125, + "learning_rate": 0.0019671364859626898, + "loss": 0.2012, + "step": 9198 + }, + { + "epoch": 0.07985173739811287, + "grad_norm": 0.5703125, + "learning_rate": 0.0019671285162227085, + "loss": 0.1167, + "step": 9199 + }, + { + "epoch": 0.07986041787831703, + "grad_norm": 0.1455078125, + "learning_rate": 0.0019671205455344485, + "loss": 0.1226, + "step": 9200 + }, + { + "epoch": 0.0798690983585212, + "grad_norm": 0.091796875, + "learning_rate": 0.0019671125738979184, + "loss": 0.1177, + "step": 9201 + }, + { + "epoch": 0.07987777883872536, + "grad_norm": 0.6328125, + "learning_rate": 0.0019671046013131272, + "loss": 0.168, + "step": 9202 + }, + { + "epoch": 0.07988645931892953, + "grad_norm": 0.2177734375, + "learning_rate": 0.001967096627780083, + "loss": 0.1309, + "step": 9203 + }, + { + "epoch": 0.0798951397991337, + "grad_norm": 0.20703125, + "learning_rate": 0.0019670886532987954, + "loss": 0.1289, + "step": 9204 + }, + { + "epoch": 0.07990382027933786, + "grad_norm": 0.197265625, + "learning_rate": 0.0019670806778692724, + "loss": 0.1416, + "step": 9205 + }, + { + "epoch": 0.07991250075954202, + "grad_norm": 0.59375, + "learning_rate": 0.001967072701491523, + "loss": 0.1738, + "step": 9206 + }, + { + "epoch": 0.07992118123974619, + "grad_norm": 0.359375, + "learning_rate": 0.0019670647241655562, + "loss": 0.1328, + "step": 9207 + }, + { + "epoch": 0.07992986171995035, + "grad_norm": 0.203125, + "learning_rate": 0.0019670567458913804, + "loss": 0.1475, + "step": 9208 + }, + { + "epoch": 0.07993854220015452, + "grad_norm": 0.107421875, + "learning_rate": 0.001967048766669004, + "loss": 0.1113, + "step": 9209 + }, + { + "epoch": 0.07994722268035867, + "grad_norm": 0.63671875, + "learning_rate": 0.001967040786498437, + "loss": 0.124, + "step": 9210 + }, + { + "epoch": 0.07995590316056284, + "grad_norm": 0.48046875, + "learning_rate": 0.001967032805379686, + "loss": 0.1357, + "step": 9211 + }, + { + "epoch": 0.079964583640767, + "grad_norm": 0.6015625, + "learning_rate": 0.001967024823312762, + "loss": 0.1055, + "step": 9212 + }, + { + "epoch": 0.07997326412097117, + "grad_norm": 0.2392578125, + "learning_rate": 0.001967016840297672, + "loss": 0.0977, + "step": 9213 + }, + { + "epoch": 0.07998194460117533, + "grad_norm": 0.205078125, + "learning_rate": 0.0019670088563344254, + 
"loss": 0.1045, + "step": 9214 + }, + { + "epoch": 0.0799906250813795, + "grad_norm": 0.146484375, + "learning_rate": 0.0019670008714230314, + "loss": 0.1572, + "step": 9215 + }, + { + "epoch": 0.07999930556158366, + "grad_norm": 0.11865234375, + "learning_rate": 0.0019669928855634983, + "loss": 0.0889, + "step": 9216 + }, + { + "epoch": 0.08000798604178783, + "grad_norm": 0.73046875, + "learning_rate": 0.0019669848987558343, + "loss": 0.1367, + "step": 9217 + }, + { + "epoch": 0.08001666652199199, + "grad_norm": 0.44140625, + "learning_rate": 0.001966976911000049, + "loss": 0.1699, + "step": 9218 + }, + { + "epoch": 0.08002534700219616, + "grad_norm": 0.46484375, + "learning_rate": 0.0019669689222961504, + "loss": 0.1221, + "step": 9219 + }, + { + "epoch": 0.08003402748240032, + "grad_norm": 0.26953125, + "learning_rate": 0.0019669609326441483, + "loss": 0.1592, + "step": 9220 + }, + { + "epoch": 0.08004270796260449, + "grad_norm": 0.5546875, + "learning_rate": 0.00196695294204405, + "loss": 0.124, + "step": 9221 + }, + { + "epoch": 0.08005138844280865, + "grad_norm": 0.138671875, + "learning_rate": 0.0019669449504958656, + "loss": 0.127, + "step": 9222 + }, + { + "epoch": 0.08006006892301282, + "grad_norm": 0.357421875, + "learning_rate": 0.0019669369579996027, + "loss": 0.1523, + "step": 9223 + }, + { + "epoch": 0.08006874940321698, + "grad_norm": 0.2451171875, + "learning_rate": 0.001966928964555271, + "loss": 0.1514, + "step": 9224 + }, + { + "epoch": 0.08007742988342115, + "grad_norm": 0.376953125, + "learning_rate": 0.0019669209701628785, + "loss": 0.1143, + "step": 9225 + }, + { + "epoch": 0.08008611036362531, + "grad_norm": 0.58203125, + "learning_rate": 0.0019669129748224345, + "loss": 0.1758, + "step": 9226 + }, + { + "epoch": 0.08009479084382948, + "grad_norm": 0.18359375, + "learning_rate": 0.0019669049785339472, + "loss": 0.0938, + "step": 9227 + }, + { + "epoch": 0.08010347132403364, + "grad_norm": 0.201171875, + "learning_rate": 0.001966896981297426, + "loss": 0.1699, + "step": 9228 + }, + { + "epoch": 0.08011215180423781, + "grad_norm": 0.146484375, + "learning_rate": 0.001966888983112879, + "loss": 0.1309, + "step": 9229 + }, + { + "epoch": 0.08012083228444197, + "grad_norm": 0.10107421875, + "learning_rate": 0.001966880983980315, + "loss": 0.1406, + "step": 9230 + }, + { + "epoch": 0.08012951276464614, + "grad_norm": 0.59375, + "learning_rate": 0.0019668729838997435, + "loss": 0.126, + "step": 9231 + }, + { + "epoch": 0.0801381932448503, + "grad_norm": 0.2265625, + "learning_rate": 0.0019668649828711725, + "loss": 0.0986, + "step": 9232 + }, + { + "epoch": 0.08014687372505447, + "grad_norm": 0.271484375, + "learning_rate": 0.001966856980894611, + "loss": 0.0967, + "step": 9233 + }, + { + "epoch": 0.08015555420525863, + "grad_norm": 0.1884765625, + "learning_rate": 0.001966848977970068, + "loss": 0.0898, + "step": 9234 + }, + { + "epoch": 0.0801642346854628, + "grad_norm": 0.15234375, + "learning_rate": 0.001966840974097552, + "loss": 0.1543, + "step": 9235 + }, + { + "epoch": 0.08017291516566696, + "grad_norm": 0.1904296875, + "learning_rate": 0.0019668329692770713, + "loss": 0.1099, + "step": 9236 + }, + { + "epoch": 0.08018159564587113, + "grad_norm": 0.56640625, + "learning_rate": 0.001966824963508635, + "loss": 0.1494, + "step": 9237 + }, + { + "epoch": 0.0801902761260753, + "grad_norm": 0.2197265625, + "learning_rate": 0.0019668169567922523, + "loss": 0.1641, + "step": 9238 + }, + { + "epoch": 0.08019895660627946, + "grad_norm": 0.1357421875, + "learning_rate": 
0.0019668089491279316, + "loss": 0.1367, + "step": 9239 + }, + { + "epoch": 0.08020763708648362, + "grad_norm": 0.244140625, + "learning_rate": 0.0019668009405156813, + "loss": 0.1211, + "step": 9240 + }, + { + "epoch": 0.08021631756668779, + "grad_norm": 0.330078125, + "learning_rate": 0.0019667929309555112, + "loss": 0.1338, + "step": 9241 + }, + { + "epoch": 0.08022499804689195, + "grad_norm": 0.33203125, + "learning_rate": 0.0019667849204474284, + "loss": 0.1436, + "step": 9242 + }, + { + "epoch": 0.08023367852709612, + "grad_norm": 0.244140625, + "learning_rate": 0.0019667769089914437, + "loss": 0.1475, + "step": 9243 + }, + { + "epoch": 0.08024235900730028, + "grad_norm": 0.470703125, + "learning_rate": 0.001966768896587564, + "loss": 0.1172, + "step": 9244 + }, + { + "epoch": 0.08025103948750445, + "grad_norm": 0.9140625, + "learning_rate": 0.0019667608832357993, + "loss": 0.1226, + "step": 9245 + }, + { + "epoch": 0.08025971996770861, + "grad_norm": 0.498046875, + "learning_rate": 0.0019667528689361574, + "loss": 0.1221, + "step": 9246 + }, + { + "epoch": 0.08026840044791278, + "grad_norm": 0.7890625, + "learning_rate": 0.0019667448536886483, + "loss": 0.1025, + "step": 9247 + }, + { + "epoch": 0.08027708092811695, + "grad_norm": 0.376953125, + "learning_rate": 0.0019667368374932793, + "loss": 0.1436, + "step": 9248 + }, + { + "epoch": 0.08028576140832111, + "grad_norm": 0.2275390625, + "learning_rate": 0.0019667288203500605, + "loss": 0.1279, + "step": 9249 + }, + { + "epoch": 0.08029444188852528, + "grad_norm": 0.353515625, + "learning_rate": 0.001966720802259, + "loss": 0.1553, + "step": 9250 + }, + { + "epoch": 0.08030312236872944, + "grad_norm": 0.078125, + "learning_rate": 0.0019667127832201066, + "loss": 0.0781, + "step": 9251 + }, + { + "epoch": 0.0803118028489336, + "grad_norm": 0.244140625, + "learning_rate": 0.001966704763233389, + "loss": 0.1328, + "step": 9252 + }, + { + "epoch": 0.08032048332913777, + "grad_norm": 0.3984375, + "learning_rate": 0.001966696742298856, + "loss": 0.1377, + "step": 9253 + }, + { + "epoch": 0.08032916380934194, + "grad_norm": 0.361328125, + "learning_rate": 0.001966688720416517, + "loss": 0.1621, + "step": 9254 + }, + { + "epoch": 0.0803378442895461, + "grad_norm": 0.26953125, + "learning_rate": 0.0019666806975863795, + "loss": 0.1465, + "step": 9255 + }, + { + "epoch": 0.08034652476975027, + "grad_norm": 0.2890625, + "learning_rate": 0.0019666726738084535, + "loss": 0.1279, + "step": 9256 + }, + { + "epoch": 0.08035520524995443, + "grad_norm": 0.24609375, + "learning_rate": 0.001966664649082747, + "loss": 0.1348, + "step": 9257 + }, + { + "epoch": 0.0803638857301586, + "grad_norm": 0.1689453125, + "learning_rate": 0.0019666566234092693, + "loss": 0.1128, + "step": 9258 + }, + { + "epoch": 0.08037256621036276, + "grad_norm": 0.72265625, + "learning_rate": 0.001966648596788029, + "loss": 0.1055, + "step": 9259 + }, + { + "epoch": 0.08038124669056693, + "grad_norm": 1.1171875, + "learning_rate": 0.0019666405692190347, + "loss": 0.1523, + "step": 9260 + }, + { + "epoch": 0.08038992717077109, + "grad_norm": 0.458984375, + "learning_rate": 0.0019666325407022953, + "loss": 0.1416, + "step": 9261 + }, + { + "epoch": 0.08039860765097526, + "grad_norm": 0.07958984375, + "learning_rate": 0.0019666245112378196, + "loss": 0.1279, + "step": 9262 + }, + { + "epoch": 0.08040728813117942, + "grad_norm": 0.251953125, + "learning_rate": 0.0019666164808256163, + "loss": 0.1328, + "step": 9263 + }, + { + "epoch": 0.08041596861138359, + "grad_norm": 1.03125, + 
"learning_rate": 0.001966608449465694, + "loss": 0.1523, + "step": 9264 + }, + { + "epoch": 0.08042464909158775, + "grad_norm": 0.640625, + "learning_rate": 0.001966600417158062, + "loss": 0.1196, + "step": 9265 + }, + { + "epoch": 0.08043332957179192, + "grad_norm": 0.07568359375, + "learning_rate": 0.001966592383902729, + "loss": 0.123, + "step": 9266 + }, + { + "epoch": 0.08044201005199608, + "grad_norm": 0.330078125, + "learning_rate": 0.001966584349699703, + "loss": 0.1201, + "step": 9267 + }, + { + "epoch": 0.08045069053220025, + "grad_norm": 0.1640625, + "learning_rate": 0.001966576314548994, + "loss": 0.1016, + "step": 9268 + }, + { + "epoch": 0.08045937101240441, + "grad_norm": 0.39453125, + "learning_rate": 0.00196656827845061, + "loss": 0.1367, + "step": 9269 + }, + { + "epoch": 0.08046805149260858, + "grad_norm": 0.64453125, + "learning_rate": 0.00196656024140456, + "loss": 0.1348, + "step": 9270 + }, + { + "epoch": 0.08047673197281274, + "grad_norm": 0.1171875, + "learning_rate": 0.0019665522034108523, + "loss": 0.1318, + "step": 9271 + }, + { + "epoch": 0.0804854124530169, + "grad_norm": 0.353515625, + "learning_rate": 0.0019665441644694964, + "loss": 0.124, + "step": 9272 + }, + { + "epoch": 0.08049409293322106, + "grad_norm": 1.140625, + "learning_rate": 0.001966536124580501, + "loss": 0.1738, + "step": 9273 + }, + { + "epoch": 0.08050277341342522, + "grad_norm": 0.90234375, + "learning_rate": 0.001966528083743874, + "loss": 0.1055, + "step": 9274 + }, + { + "epoch": 0.08051145389362939, + "grad_norm": 0.19921875, + "learning_rate": 0.0019665200419596257, + "loss": 0.1113, + "step": 9275 + }, + { + "epoch": 0.08052013437383355, + "grad_norm": 1.109375, + "learning_rate": 0.001966511999227764, + "loss": 0.124, + "step": 9276 + }, + { + "epoch": 0.08052881485403772, + "grad_norm": 0.5234375, + "learning_rate": 0.0019665039555482977, + "loss": 0.0991, + "step": 9277 + }, + { + "epoch": 0.08053749533424188, + "grad_norm": 0.42578125, + "learning_rate": 0.0019664959109212355, + "loss": 0.1836, + "step": 9278 + }, + { + "epoch": 0.08054617581444605, + "grad_norm": 0.68359375, + "learning_rate": 0.0019664878653465868, + "loss": 0.1172, + "step": 9279 + }, + { + "epoch": 0.08055485629465022, + "grad_norm": 0.2412109375, + "learning_rate": 0.0019664798188243596, + "loss": 0.1011, + "step": 9280 + }, + { + "epoch": 0.08056353677485438, + "grad_norm": 0.0732421875, + "learning_rate": 0.001966471771354563, + "loss": 0.1055, + "step": 9281 + }, + { + "epoch": 0.08057221725505855, + "grad_norm": 0.330078125, + "learning_rate": 0.001966463722937206, + "loss": 0.1162, + "step": 9282 + }, + { + "epoch": 0.08058089773526271, + "grad_norm": 0.119140625, + "learning_rate": 0.0019664556735722975, + "loss": 0.1445, + "step": 9283 + }, + { + "epoch": 0.08058957821546688, + "grad_norm": 0.4453125, + "learning_rate": 0.001966447623259846, + "loss": 0.1094, + "step": 9284 + }, + { + "epoch": 0.08059825869567104, + "grad_norm": 0.54296875, + "learning_rate": 0.0019664395719998606, + "loss": 0.1367, + "step": 9285 + }, + { + "epoch": 0.0806069391758752, + "grad_norm": 0.50390625, + "learning_rate": 0.0019664315197923496, + "loss": 0.0728, + "step": 9286 + }, + { + "epoch": 0.08061561965607937, + "grad_norm": 0.236328125, + "learning_rate": 0.0019664234666373222, + "loss": 0.0918, + "step": 9287 + }, + { + "epoch": 0.08062430013628354, + "grad_norm": 0.07080078125, + "learning_rate": 0.001966415412534787, + "loss": 0.103, + "step": 9288 + }, + { + "epoch": 0.0806329806164877, + "grad_norm": 
0.310546875, + "learning_rate": 0.001966407357484753, + "loss": 0.1641, + "step": 9289 + }, + { + "epoch": 0.08064166109669187, + "grad_norm": 0.60546875, + "learning_rate": 0.001966399301487229, + "loss": 0.1133, + "step": 9290 + }, + { + "epoch": 0.08065034157689603, + "grad_norm": 0.0908203125, + "learning_rate": 0.0019663912445422238, + "loss": 0.1211, + "step": 9291 + }, + { + "epoch": 0.0806590220571002, + "grad_norm": 0.482421875, + "learning_rate": 0.001966383186649746, + "loss": 0.125, + "step": 9292 + }, + { + "epoch": 0.08066770253730436, + "grad_norm": 1.15625, + "learning_rate": 0.0019663751278098046, + "loss": 0.1533, + "step": 9293 + }, + { + "epoch": 0.08067638301750853, + "grad_norm": 0.111328125, + "learning_rate": 0.0019663670680224086, + "loss": 0.0996, + "step": 9294 + }, + { + "epoch": 0.08068506349771269, + "grad_norm": 0.11767578125, + "learning_rate": 0.0019663590072875664, + "loss": 0.1445, + "step": 9295 + }, + { + "epoch": 0.08069374397791686, + "grad_norm": 0.796875, + "learning_rate": 0.001966350945605287, + "loss": 0.1348, + "step": 9296 + }, + { + "epoch": 0.08070242445812102, + "grad_norm": 0.2470703125, + "learning_rate": 0.001966342882975579, + "loss": 0.127, + "step": 9297 + }, + { + "epoch": 0.08071110493832519, + "grad_norm": 0.5, + "learning_rate": 0.001966334819398452, + "loss": 0.1074, + "step": 9298 + }, + { + "epoch": 0.08071978541852935, + "grad_norm": 0.64453125, + "learning_rate": 0.001966326754873914, + "loss": 0.1206, + "step": 9299 + }, + { + "epoch": 0.08072846589873352, + "grad_norm": 0.447265625, + "learning_rate": 0.001966318689401974, + "loss": 0.1074, + "step": 9300 + }, + { + "epoch": 0.08073714637893768, + "grad_norm": 0.55078125, + "learning_rate": 0.0019663106229826405, + "loss": 0.1348, + "step": 9301 + }, + { + "epoch": 0.08074582685914185, + "grad_norm": 0.1318359375, + "learning_rate": 0.001966302555615923, + "loss": 0.1074, + "step": 9302 + }, + { + "epoch": 0.08075450733934601, + "grad_norm": 0.30078125, + "learning_rate": 0.0019662944873018303, + "loss": 0.125, + "step": 9303 + }, + { + "epoch": 0.08076318781955018, + "grad_norm": 0.3046875, + "learning_rate": 0.001966286418040371, + "loss": 0.1104, + "step": 9304 + }, + { + "epoch": 0.08077186829975434, + "grad_norm": 0.12109375, + "learning_rate": 0.0019662783478315536, + "loss": 0.1299, + "step": 9305 + }, + { + "epoch": 0.08078054877995851, + "grad_norm": 0.19921875, + "learning_rate": 0.0019662702766753875, + "loss": 0.1465, + "step": 9306 + }, + { + "epoch": 0.08078922926016267, + "grad_norm": 0.234375, + "learning_rate": 0.001966262204571881, + "loss": 0.0957, + "step": 9307 + }, + { + "epoch": 0.08079790974036684, + "grad_norm": 0.4375, + "learning_rate": 0.0019662541315210434, + "loss": 0.125, + "step": 9308 + }, + { + "epoch": 0.080806590220571, + "grad_norm": 0.26953125, + "learning_rate": 0.0019662460575228828, + "loss": 0.1152, + "step": 9309 + }, + { + "epoch": 0.08081527070077517, + "grad_norm": 0.13671875, + "learning_rate": 0.001966237982577409, + "loss": 0.1826, + "step": 9310 + }, + { + "epoch": 0.08082395118097933, + "grad_norm": 0.55078125, + "learning_rate": 0.0019662299066846305, + "loss": 0.1016, + "step": 9311 + }, + { + "epoch": 0.0808326316611835, + "grad_norm": 0.2060546875, + "learning_rate": 0.0019662218298445553, + "loss": 0.1484, + "step": 9312 + }, + { + "epoch": 0.08084131214138766, + "grad_norm": 0.4140625, + "learning_rate": 0.0019662137520571935, + "loss": 0.1177, + "step": 9313 + }, + { + "epoch": 0.08084999262159183, + "grad_norm": 
0.69921875, + "learning_rate": 0.001966205673322553, + "loss": 0.123, + "step": 9314 + }, + { + "epoch": 0.080858673101796, + "grad_norm": 0.388671875, + "learning_rate": 0.0019661975936406434, + "loss": 0.1006, + "step": 9315 + }, + { + "epoch": 0.08086735358200016, + "grad_norm": 0.63671875, + "learning_rate": 0.001966189513011473, + "loss": 0.1748, + "step": 9316 + }, + { + "epoch": 0.08087603406220432, + "grad_norm": 0.87109375, + "learning_rate": 0.0019661814314350505, + "loss": 0.1201, + "step": 9317 + }, + { + "epoch": 0.08088471454240849, + "grad_norm": 0.1552734375, + "learning_rate": 0.0019661733489113853, + "loss": 0.1108, + "step": 9318 + }, + { + "epoch": 0.08089339502261265, + "grad_norm": 0.33984375, + "learning_rate": 0.0019661652654404858, + "loss": 0.168, + "step": 9319 + }, + { + "epoch": 0.08090207550281682, + "grad_norm": 0.99609375, + "learning_rate": 0.001966157181022361, + "loss": 0.1123, + "step": 9320 + }, + { + "epoch": 0.08091075598302098, + "grad_norm": 0.11328125, + "learning_rate": 0.0019661490956570196, + "loss": 0.0869, + "step": 9321 + }, + { + "epoch": 0.08091943646322515, + "grad_norm": 0.3671875, + "learning_rate": 0.0019661410093444708, + "loss": 0.1147, + "step": 9322 + }, + { + "epoch": 0.08092811694342932, + "grad_norm": 0.345703125, + "learning_rate": 0.001966132922084723, + "loss": 0.1328, + "step": 9323 + }, + { + "epoch": 0.08093679742363348, + "grad_norm": 0.06494140625, + "learning_rate": 0.001966124833877785, + "loss": 0.1172, + "step": 9324 + }, + { + "epoch": 0.08094547790383765, + "grad_norm": 0.578125, + "learning_rate": 0.0019661167447236664, + "loss": 0.1099, + "step": 9325 + }, + { + "epoch": 0.08095415838404181, + "grad_norm": 0.466796875, + "learning_rate": 0.0019661086546223752, + "loss": 0.1006, + "step": 9326 + }, + { + "epoch": 0.08096283886424598, + "grad_norm": 0.23046875, + "learning_rate": 0.0019661005635739207, + "loss": 0.1758, + "step": 9327 + }, + { + "epoch": 0.08097151934445014, + "grad_norm": 0.69921875, + "learning_rate": 0.001966092471578312, + "loss": 0.1738, + "step": 9328 + }, + { + "epoch": 0.0809801998246543, + "grad_norm": 0.134765625, + "learning_rate": 0.0019660843786355573, + "loss": 0.1426, + "step": 9329 + }, + { + "epoch": 0.08098888030485847, + "grad_norm": 0.11376953125, + "learning_rate": 0.0019660762847456656, + "loss": 0.1152, + "step": 9330 + }, + { + "epoch": 0.08099756078506264, + "grad_norm": 0.158203125, + "learning_rate": 0.0019660681899086457, + "loss": 0.1514, + "step": 9331 + }, + { + "epoch": 0.0810062412652668, + "grad_norm": 0.09326171875, + "learning_rate": 0.001966060094124507, + "loss": 0.0928, + "step": 9332 + }, + { + "epoch": 0.08101492174547097, + "grad_norm": 0.1708984375, + "learning_rate": 0.001966051997393258, + "loss": 0.126, + "step": 9333 + }, + { + "epoch": 0.08102360222567512, + "grad_norm": 0.5625, + "learning_rate": 0.0019660438997149074, + "loss": 0.1631, + "step": 9334 + }, + { + "epoch": 0.08103228270587928, + "grad_norm": 0.451171875, + "learning_rate": 0.0019660358010894644, + "loss": 0.1387, + "step": 9335 + }, + { + "epoch": 0.08104096318608345, + "grad_norm": 0.0927734375, + "learning_rate": 0.001966027701516938, + "loss": 0.1367, + "step": 9336 + }, + { + "epoch": 0.08104964366628761, + "grad_norm": 0.8515625, + "learning_rate": 0.001966019600997336, + "loss": 0.1172, + "step": 9337 + }, + { + "epoch": 0.08105832414649178, + "grad_norm": 0.359375, + "learning_rate": 0.0019660114995306683, + "loss": 0.1338, + "step": 9338 + }, + { + "epoch": 0.08106700462669594, 
+ "grad_norm": 0.37109375, + "learning_rate": 0.0019660033971169435, + "loss": 0.1196, + "step": 9339 + }, + { + "epoch": 0.08107568510690011, + "grad_norm": 0.671875, + "learning_rate": 0.00196599529375617, + "loss": 0.1318, + "step": 9340 + }, + { + "epoch": 0.08108436558710427, + "grad_norm": 0.5390625, + "learning_rate": 0.0019659871894483574, + "loss": 0.1177, + "step": 9341 + }, + { + "epoch": 0.08109304606730844, + "grad_norm": 0.5625, + "learning_rate": 0.001965979084193514, + "loss": 0.1309, + "step": 9342 + }, + { + "epoch": 0.0811017265475126, + "grad_norm": 0.4453125, + "learning_rate": 0.001965970977991649, + "loss": 0.1572, + "step": 9343 + }, + { + "epoch": 0.08111040702771677, + "grad_norm": 0.94921875, + "learning_rate": 0.001965962870842771, + "loss": 0.1387, + "step": 9344 + }, + { + "epoch": 0.08111908750792093, + "grad_norm": 0.1435546875, + "learning_rate": 0.0019659547627468897, + "loss": 0.1445, + "step": 9345 + }, + { + "epoch": 0.0811277679881251, + "grad_norm": 0.2412109375, + "learning_rate": 0.0019659466537040125, + "loss": 0.1416, + "step": 9346 + }, + { + "epoch": 0.08113644846832926, + "grad_norm": 0.349609375, + "learning_rate": 0.0019659385437141496, + "loss": 0.1099, + "step": 9347 + }, + { + "epoch": 0.08114512894853343, + "grad_norm": 0.115234375, + "learning_rate": 0.001965930432777309, + "loss": 0.1289, + "step": 9348 + }, + { + "epoch": 0.0811538094287376, + "grad_norm": 0.69921875, + "learning_rate": 0.0019659223208935, + "loss": 0.1055, + "step": 9349 + }, + { + "epoch": 0.08116248990894176, + "grad_norm": 0.265625, + "learning_rate": 0.001965914208062731, + "loss": 0.1406, + "step": 9350 + }, + { + "epoch": 0.08117117038914592, + "grad_norm": 0.3828125, + "learning_rate": 0.0019659060942850116, + "loss": 0.1045, + "step": 9351 + }, + { + "epoch": 0.08117985086935009, + "grad_norm": 0.71875, + "learning_rate": 0.0019658979795603502, + "loss": 0.1021, + "step": 9352 + }, + { + "epoch": 0.08118853134955425, + "grad_norm": 0.310546875, + "learning_rate": 0.001965889863888756, + "loss": 0.0908, + "step": 9353 + }, + { + "epoch": 0.08119721182975842, + "grad_norm": 0.1826171875, + "learning_rate": 0.0019658817472702375, + "loss": 0.168, + "step": 9354 + }, + { + "epoch": 0.08120589230996259, + "grad_norm": 0.263671875, + "learning_rate": 0.001965873629704804, + "loss": 0.1089, + "step": 9355 + }, + { + "epoch": 0.08121457279016675, + "grad_norm": 0.388671875, + "learning_rate": 0.0019658655111924636, + "loss": 0.1436, + "step": 9356 + }, + { + "epoch": 0.08122325327037092, + "grad_norm": 0.298828125, + "learning_rate": 0.001965857391733226, + "loss": 0.1006, + "step": 9357 + }, + { + "epoch": 0.08123193375057508, + "grad_norm": 0.470703125, + "learning_rate": 0.0019658492713270992, + "loss": 0.1079, + "step": 9358 + }, + { + "epoch": 0.08124061423077925, + "grad_norm": 0.2197265625, + "learning_rate": 0.0019658411499740933, + "loss": 0.1865, + "step": 9359 + }, + { + "epoch": 0.08124929471098341, + "grad_norm": 0.203125, + "learning_rate": 0.0019658330276742164, + "loss": 0.1006, + "step": 9360 + }, + { + "epoch": 0.08125797519118758, + "grad_norm": 0.361328125, + "learning_rate": 0.001965824904427477, + "loss": 0.126, + "step": 9361 + }, + { + "epoch": 0.08126665567139174, + "grad_norm": 0.1826171875, + "learning_rate": 0.0019658167802338853, + "loss": 0.127, + "step": 9362 + }, + { + "epoch": 0.0812753361515959, + "grad_norm": 0.875, + "learning_rate": 0.001965808655093449, + "loss": 0.1094, + "step": 9363 + }, + { + "epoch": 0.08128401663180007, + 
"grad_norm": 0.0751953125, + "learning_rate": 0.0019658005290061776, + "loss": 0.1426, + "step": 9364 + }, + { + "epoch": 0.08129269711200424, + "grad_norm": 0.162109375, + "learning_rate": 0.0019657924019720794, + "loss": 0.0986, + "step": 9365 + }, + { + "epoch": 0.0813013775922084, + "grad_norm": 0.302734375, + "learning_rate": 0.0019657842739911635, + "loss": 0.1079, + "step": 9366 + }, + { + "epoch": 0.08131005807241257, + "grad_norm": 0.10205078125, + "learning_rate": 0.0019657761450634394, + "loss": 0.1069, + "step": 9367 + }, + { + "epoch": 0.08131873855261673, + "grad_norm": 0.7109375, + "learning_rate": 0.0019657680151889153, + "loss": 0.1387, + "step": 9368 + }, + { + "epoch": 0.0813274190328209, + "grad_norm": 0.310546875, + "learning_rate": 0.0019657598843676005, + "loss": 0.1025, + "step": 9369 + }, + { + "epoch": 0.08133609951302506, + "grad_norm": 0.416015625, + "learning_rate": 0.0019657517525995035, + "loss": 0.1289, + "step": 9370 + }, + { + "epoch": 0.08134477999322923, + "grad_norm": 0.2373046875, + "learning_rate": 0.001965743619884633, + "loss": 0.2393, + "step": 9371 + }, + { + "epoch": 0.08135346047343339, + "grad_norm": 0.07421875, + "learning_rate": 0.0019657354862229986, + "loss": 0.0981, + "step": 9372 + }, + { + "epoch": 0.08136214095363756, + "grad_norm": 0.7109375, + "learning_rate": 0.0019657273516146094, + "loss": 0.1367, + "step": 9373 + }, + { + "epoch": 0.08137082143384172, + "grad_norm": 0.1474609375, + "learning_rate": 0.001965719216059473, + "loss": 0.0889, + "step": 9374 + }, + { + "epoch": 0.08137950191404589, + "grad_norm": 0.353515625, + "learning_rate": 0.0019657110795576, + "loss": 0.1074, + "step": 9375 + }, + { + "epoch": 0.08138818239425005, + "grad_norm": 0.79296875, + "learning_rate": 0.0019657029421089977, + "loss": 0.1426, + "step": 9376 + }, + { + "epoch": 0.08139686287445422, + "grad_norm": 0.5234375, + "learning_rate": 0.0019656948037136755, + "loss": 0.1699, + "step": 9377 + }, + { + "epoch": 0.08140554335465838, + "grad_norm": 0.2021484375, + "learning_rate": 0.0019656866643716427, + "loss": 0.127, + "step": 9378 + }, + { + "epoch": 0.08141422383486255, + "grad_norm": 0.412109375, + "learning_rate": 0.001965678524082908, + "loss": 0.1025, + "step": 9379 + }, + { + "epoch": 0.08142290431506671, + "grad_norm": 0.10009765625, + "learning_rate": 0.0019656703828474804, + "loss": 0.1226, + "step": 9380 + }, + { + "epoch": 0.08143158479527088, + "grad_norm": 0.26171875, + "learning_rate": 0.0019656622406653687, + "loss": 0.1064, + "step": 9381 + }, + { + "epoch": 0.08144026527547504, + "grad_norm": 0.1904296875, + "learning_rate": 0.0019656540975365815, + "loss": 0.1318, + "step": 9382 + }, + { + "epoch": 0.08144894575567921, + "grad_norm": 0.20703125, + "learning_rate": 0.001965645953461128, + "loss": 0.1504, + "step": 9383 + }, + { + "epoch": 0.08145762623588337, + "grad_norm": 0.8046875, + "learning_rate": 0.0019656378084390172, + "loss": 0.1934, + "step": 9384 + }, + { + "epoch": 0.08146630671608754, + "grad_norm": 0.478515625, + "learning_rate": 0.001965629662470258, + "loss": 0.1084, + "step": 9385 + }, + { + "epoch": 0.0814749871962917, + "grad_norm": 0.29296875, + "learning_rate": 0.001965621515554859, + "loss": 0.0938, + "step": 9386 + }, + { + "epoch": 0.08148366767649587, + "grad_norm": 0.201171875, + "learning_rate": 0.0019656133676928295, + "loss": 0.1167, + "step": 9387 + }, + { + "epoch": 0.08149234815670003, + "grad_norm": 0.142578125, + "learning_rate": 0.001965605218884178, + "loss": 0.1104, + "step": 9388 + }, + { + 
"epoch": 0.0815010286369042, + "grad_norm": 0.337890625, + "learning_rate": 0.001965597069128914, + "loss": 0.1182, + "step": 9389 + }, + { + "epoch": 0.08150970911710836, + "grad_norm": 1.234375, + "learning_rate": 0.0019655889184270457, + "loss": 0.1562, + "step": 9390 + }, + { + "epoch": 0.08151838959731253, + "grad_norm": 0.349609375, + "learning_rate": 0.0019655807667785827, + "loss": 0.1089, + "step": 9391 + }, + { + "epoch": 0.0815270700775167, + "grad_norm": 0.193359375, + "learning_rate": 0.001965572614183533, + "loss": 0.1367, + "step": 9392 + }, + { + "epoch": 0.08153575055772086, + "grad_norm": 0.4765625, + "learning_rate": 0.001965564460641907, + "loss": 0.1953, + "step": 9393 + }, + { + "epoch": 0.08154443103792502, + "grad_norm": 0.3984375, + "learning_rate": 0.001965556306153712, + "loss": 0.1123, + "step": 9394 + }, + { + "epoch": 0.08155311151812918, + "grad_norm": 0.3828125, + "learning_rate": 0.001965548150718958, + "loss": 0.1055, + "step": 9395 + }, + { + "epoch": 0.08156179199833334, + "grad_norm": 0.263671875, + "learning_rate": 0.001965539994337653, + "loss": 0.1299, + "step": 9396 + }, + { + "epoch": 0.0815704724785375, + "grad_norm": 0.30078125, + "learning_rate": 0.001965531837009807, + "loss": 0.1196, + "step": 9397 + }, + { + "epoch": 0.08157915295874167, + "grad_norm": 0.443359375, + "learning_rate": 0.0019655236787354284, + "loss": 0.1152, + "step": 9398 + }, + { + "epoch": 0.08158783343894584, + "grad_norm": 0.1943359375, + "learning_rate": 0.0019655155195145257, + "loss": 0.1191, + "step": 9399 + }, + { + "epoch": 0.08159651391915, + "grad_norm": 0.53515625, + "learning_rate": 0.0019655073593471085, + "loss": 0.1533, + "step": 9400 + }, + { + "epoch": 0.08160519439935417, + "grad_norm": 0.1376953125, + "learning_rate": 0.0019654991982331854, + "loss": 0.1133, + "step": 9401 + }, + { + "epoch": 0.08161387487955833, + "grad_norm": 0.546875, + "learning_rate": 0.0019654910361727655, + "loss": 0.127, + "step": 9402 + }, + { + "epoch": 0.0816225553597625, + "grad_norm": 0.4140625, + "learning_rate": 0.0019654828731658575, + "loss": 0.1973, + "step": 9403 + }, + { + "epoch": 0.08163123583996666, + "grad_norm": 0.53125, + "learning_rate": 0.0019654747092124705, + "loss": 0.1162, + "step": 9404 + }, + { + "epoch": 0.08163991632017083, + "grad_norm": 0.17578125, + "learning_rate": 0.001965466544312613, + "loss": 0.1177, + "step": 9405 + }, + { + "epoch": 0.08164859680037499, + "grad_norm": 0.74609375, + "learning_rate": 0.0019654583784662946, + "loss": 0.1182, + "step": 9406 + }, + { + "epoch": 0.08165727728057916, + "grad_norm": 1.6640625, + "learning_rate": 0.001965450211673524, + "loss": 0.1582, + "step": 9407 + }, + { + "epoch": 0.08166595776078332, + "grad_norm": 0.396484375, + "learning_rate": 0.00196544204393431, + "loss": 0.1133, + "step": 9408 + }, + { + "epoch": 0.08167463824098749, + "grad_norm": 0.314453125, + "learning_rate": 0.0019654338752486617, + "loss": 0.1113, + "step": 9409 + }, + { + "epoch": 0.08168331872119165, + "grad_norm": 0.66796875, + "learning_rate": 0.0019654257056165875, + "loss": 0.1143, + "step": 9410 + }, + { + "epoch": 0.08169199920139582, + "grad_norm": 0.07861328125, + "learning_rate": 0.0019654175350380973, + "loss": 0.126, + "step": 9411 + }, + { + "epoch": 0.08170067968159998, + "grad_norm": 0.0927734375, + "learning_rate": 0.0019654093635131987, + "loss": 0.1543, + "step": 9412 + }, + { + "epoch": 0.08170936016180415, + "grad_norm": 0.515625, + "learning_rate": 0.001965401191041902, + "loss": 0.1689, + "step": 9413 + }, + { 
+ "epoch": 0.08171804064200831, + "grad_norm": 0.76171875, + "learning_rate": 0.0019653930176242156, + "loss": 0.1562, + "step": 9414 + }, + { + "epoch": 0.08172672112221248, + "grad_norm": 0.0673828125, + "learning_rate": 0.0019653848432601485, + "loss": 0.0996, + "step": 9415 + }, + { + "epoch": 0.08173540160241664, + "grad_norm": 0.35546875, + "learning_rate": 0.0019653766679497094, + "loss": 0.1543, + "step": 9416 + }, + { + "epoch": 0.08174408208262081, + "grad_norm": 0.271484375, + "learning_rate": 0.001965368491692907, + "loss": 0.1406, + "step": 9417 + }, + { + "epoch": 0.08175276256282497, + "grad_norm": 0.62109375, + "learning_rate": 0.0019653603144897513, + "loss": 0.167, + "step": 9418 + }, + { + "epoch": 0.08176144304302914, + "grad_norm": 0.2080078125, + "learning_rate": 0.00196535213634025, + "loss": 0.1426, + "step": 9419 + }, + { + "epoch": 0.0817701235232333, + "grad_norm": 0.0634765625, + "learning_rate": 0.0019653439572444127, + "loss": 0.0962, + "step": 9420 + }, + { + "epoch": 0.08177880400343747, + "grad_norm": 0.71484375, + "learning_rate": 0.0019653357772022482, + "loss": 0.1641, + "step": 9421 + }, + { + "epoch": 0.08178748448364163, + "grad_norm": 0.158203125, + "learning_rate": 0.0019653275962137656, + "loss": 0.1035, + "step": 9422 + }, + { + "epoch": 0.0817961649638458, + "grad_norm": 0.2197265625, + "learning_rate": 0.001965319414278974, + "loss": 0.0889, + "step": 9423 + }, + { + "epoch": 0.08180484544404996, + "grad_norm": 0.5546875, + "learning_rate": 0.0019653112313978817, + "loss": 0.1582, + "step": 9424 + }, + { + "epoch": 0.08181352592425413, + "grad_norm": 0.43359375, + "learning_rate": 0.0019653030475704986, + "loss": 0.1162, + "step": 9425 + }, + { + "epoch": 0.0818222064044583, + "grad_norm": 0.28125, + "learning_rate": 0.0019652948627968325, + "loss": 0.0933, + "step": 9426 + }, + { + "epoch": 0.08183088688466246, + "grad_norm": 0.150390625, + "learning_rate": 0.001965286677076893, + "loss": 0.166, + "step": 9427 + }, + { + "epoch": 0.08183956736486662, + "grad_norm": 0.294921875, + "learning_rate": 0.0019652784904106895, + "loss": 0.1157, + "step": 9428 + }, + { + "epoch": 0.08184824784507079, + "grad_norm": 0.12890625, + "learning_rate": 0.00196527030279823, + "loss": 0.167, + "step": 9429 + }, + { + "epoch": 0.08185692832527496, + "grad_norm": 0.3984375, + "learning_rate": 0.001965262114239524, + "loss": 0.1279, + "step": 9430 + }, + { + "epoch": 0.08186560880547912, + "grad_norm": 0.25, + "learning_rate": 0.0019652539247345803, + "loss": 0.1216, + "step": 9431 + }, + { + "epoch": 0.08187428928568329, + "grad_norm": 0.40625, + "learning_rate": 0.001965245734283408, + "loss": 0.1016, + "step": 9432 + }, + { + "epoch": 0.08188296976588745, + "grad_norm": 0.4765625, + "learning_rate": 0.0019652375428860155, + "loss": 0.0957, + "step": 9433 + }, + { + "epoch": 0.08189165024609162, + "grad_norm": 0.2255859375, + "learning_rate": 0.001965229350542413, + "loss": 0.1128, + "step": 9434 + }, + { + "epoch": 0.08190033072629578, + "grad_norm": 0.25390625, + "learning_rate": 0.001965221157252608, + "loss": 0.1631, + "step": 9435 + }, + { + "epoch": 0.08190901120649995, + "grad_norm": 0.51171875, + "learning_rate": 0.0019652129630166105, + "loss": 0.1309, + "step": 9436 + }, + { + "epoch": 0.08191769168670411, + "grad_norm": 0.25390625, + "learning_rate": 0.001965204767834429, + "loss": 0.1514, + "step": 9437 + }, + { + "epoch": 0.08192637216690828, + "grad_norm": 0.2197265625, + "learning_rate": 0.001965196571706073, + "loss": 0.1895, + "step": 9438 + }, + 
{ + "epoch": 0.08193505264711244, + "grad_norm": 0.259765625, + "learning_rate": 0.0019651883746315503, + "loss": 0.1367, + "step": 9439 + }, + { + "epoch": 0.0819437331273166, + "grad_norm": 0.46875, + "learning_rate": 0.001965180176610871, + "loss": 0.1152, + "step": 9440 + }, + { + "epoch": 0.08195241360752077, + "grad_norm": 0.60546875, + "learning_rate": 0.0019651719776440433, + "loss": 0.1553, + "step": 9441 + }, + { + "epoch": 0.08196109408772494, + "grad_norm": 0.373046875, + "learning_rate": 0.001965163777731077, + "loss": 0.1367, + "step": 9442 + }, + { + "epoch": 0.0819697745679291, + "grad_norm": 0.85546875, + "learning_rate": 0.0019651555768719804, + "loss": 0.1143, + "step": 9443 + }, + { + "epoch": 0.08197845504813327, + "grad_norm": 0.68359375, + "learning_rate": 0.001965147375066763, + "loss": 0.1367, + "step": 9444 + }, + { + "epoch": 0.08198713552833743, + "grad_norm": 0.1611328125, + "learning_rate": 0.0019651391723154326, + "loss": 0.084, + "step": 9445 + }, + { + "epoch": 0.0819958160085416, + "grad_norm": 0.6640625, + "learning_rate": 0.001965130968618, + "loss": 0.1357, + "step": 9446 + }, + { + "epoch": 0.08200449648874576, + "grad_norm": 0.1591796875, + "learning_rate": 0.0019651227639744724, + "loss": 0.1221, + "step": 9447 + }, + { + "epoch": 0.08201317696894993, + "grad_norm": 0.0830078125, + "learning_rate": 0.0019651145583848596, + "loss": 0.1138, + "step": 9448 + }, + { + "epoch": 0.08202185744915409, + "grad_norm": 0.08349609375, + "learning_rate": 0.0019651063518491708, + "loss": 0.0938, + "step": 9449 + }, + { + "epoch": 0.08203053792935826, + "grad_norm": 0.431640625, + "learning_rate": 0.0019650981443674145, + "loss": 0.085, + "step": 9450 + }, + { + "epoch": 0.08203921840956242, + "grad_norm": 0.12353515625, + "learning_rate": 0.0019650899359396003, + "loss": 0.1973, + "step": 9451 + }, + { + "epoch": 0.08204789888976659, + "grad_norm": 0.6171875, + "learning_rate": 0.0019650817265657364, + "loss": 0.124, + "step": 9452 + }, + { + "epoch": 0.08205657936997075, + "grad_norm": 0.09130859375, + "learning_rate": 0.001965073516245832, + "loss": 0.1055, + "step": 9453 + }, + { + "epoch": 0.08206525985017492, + "grad_norm": 0.212890625, + "learning_rate": 0.001965065304979896, + "loss": 0.1299, + "step": 9454 + }, + { + "epoch": 0.08207394033037908, + "grad_norm": 0.6328125, + "learning_rate": 0.0019650570927679384, + "loss": 0.1357, + "step": 9455 + }, + { + "epoch": 0.08208262081058325, + "grad_norm": 0.2470703125, + "learning_rate": 0.001965048879609967, + "loss": 0.1367, + "step": 9456 + }, + { + "epoch": 0.0820913012907874, + "grad_norm": 0.1435546875, + "learning_rate": 0.0019650406655059905, + "loss": 0.1167, + "step": 9457 + }, + { + "epoch": 0.08209998177099156, + "grad_norm": 0.56640625, + "learning_rate": 0.0019650324504560195, + "loss": 0.1396, + "step": 9458 + }, + { + "epoch": 0.08210866225119573, + "grad_norm": 0.369140625, + "learning_rate": 0.0019650242344600613, + "loss": 0.104, + "step": 9459 + }, + { + "epoch": 0.0821173427313999, + "grad_norm": 0.142578125, + "learning_rate": 0.001965016017518126, + "loss": 0.125, + "step": 9460 + }, + { + "epoch": 0.08212602321160406, + "grad_norm": 0.18359375, + "learning_rate": 0.0019650077996302222, + "loss": 0.1016, + "step": 9461 + }, + { + "epoch": 0.08213470369180823, + "grad_norm": 0.236328125, + "learning_rate": 0.0019649995807963587, + "loss": 0.1338, + "step": 9462 + }, + { + "epoch": 0.08214338417201239, + "grad_norm": 0.9296875, + "learning_rate": 0.0019649913610165445, + "loss": 0.1445, + 
"step": 9463 + }, + { + "epoch": 0.08215206465221656, + "grad_norm": 0.84765625, + "learning_rate": 0.001964983140290789, + "loss": 0.1484, + "step": 9464 + }, + { + "epoch": 0.08216074513242072, + "grad_norm": 0.427734375, + "learning_rate": 0.001964974918619101, + "loss": 0.125, + "step": 9465 + }, + { + "epoch": 0.08216942561262489, + "grad_norm": 0.138671875, + "learning_rate": 0.001964966696001489, + "loss": 0.1455, + "step": 9466 + }, + { + "epoch": 0.08217810609282905, + "grad_norm": 0.3828125, + "learning_rate": 0.0019649584724379626, + "loss": 0.1216, + "step": 9467 + }, + { + "epoch": 0.08218678657303322, + "grad_norm": 0.146484375, + "learning_rate": 0.001964950247928531, + "loss": 0.1826, + "step": 9468 + }, + { + "epoch": 0.08219546705323738, + "grad_norm": 0.140625, + "learning_rate": 0.001964942022473202, + "loss": 0.0791, + "step": 9469 + }, + { + "epoch": 0.08220414753344155, + "grad_norm": 0.25390625, + "learning_rate": 0.001964933796071986, + "loss": 0.1338, + "step": 9470 + }, + { + "epoch": 0.08221282801364571, + "grad_norm": 0.0986328125, + "learning_rate": 0.0019649255687248916, + "loss": 0.1001, + "step": 9471 + }, + { + "epoch": 0.08222150849384988, + "grad_norm": 0.373046875, + "learning_rate": 0.001964917340431927, + "loss": 0.1035, + "step": 9472 + }, + { + "epoch": 0.08223018897405404, + "grad_norm": 0.88671875, + "learning_rate": 0.0019649091111931024, + "loss": 0.1426, + "step": 9473 + }, + { + "epoch": 0.0822388694542582, + "grad_norm": 0.470703125, + "learning_rate": 0.001964900881008426, + "loss": 0.1035, + "step": 9474 + }, + { + "epoch": 0.08224754993446237, + "grad_norm": 0.12890625, + "learning_rate": 0.0019648926498779065, + "loss": 0.1318, + "step": 9475 + }, + { + "epoch": 0.08225623041466654, + "grad_norm": 0.171875, + "learning_rate": 0.001964884417801554, + "loss": 0.1162, + "step": 9476 + }, + { + "epoch": 0.0822649108948707, + "grad_norm": 0.228515625, + "learning_rate": 0.001964876184779376, + "loss": 0.1104, + "step": 9477 + }, + { + "epoch": 0.08227359137507487, + "grad_norm": 0.3203125, + "learning_rate": 0.0019648679508113835, + "loss": 0.084, + "step": 9478 + }, + { + "epoch": 0.08228227185527903, + "grad_norm": 0.96875, + "learning_rate": 0.001964859715897584, + "loss": 0.1348, + "step": 9479 + }, + { + "epoch": 0.0822909523354832, + "grad_norm": 0.5078125, + "learning_rate": 0.001964851480037987, + "loss": 0.1182, + "step": 9480 + }, + { + "epoch": 0.08229963281568736, + "grad_norm": 0.12255859375, + "learning_rate": 0.001964843243232601, + "loss": 0.1494, + "step": 9481 + }, + { + "epoch": 0.08230831329589153, + "grad_norm": 1.328125, + "learning_rate": 0.001964835005481436, + "loss": 0.1484, + "step": 9482 + }, + { + "epoch": 0.08231699377609569, + "grad_norm": 0.388671875, + "learning_rate": 0.0019648267667845003, + "loss": 0.1094, + "step": 9483 + }, + { + "epoch": 0.08232567425629986, + "grad_norm": 0.12255859375, + "learning_rate": 0.0019648185271418026, + "loss": 0.0967, + "step": 9484 + }, + { + "epoch": 0.08233435473650402, + "grad_norm": 0.376953125, + "learning_rate": 0.0019648102865533527, + "loss": 0.1113, + "step": 9485 + }, + { + "epoch": 0.08234303521670819, + "grad_norm": 0.42578125, + "learning_rate": 0.001964802045019159, + "loss": 0.1245, + "step": 9486 + }, + { + "epoch": 0.08235171569691235, + "grad_norm": 0.11572265625, + "learning_rate": 0.001964793802539231, + "loss": 0.0977, + "step": 9487 + }, + { + "epoch": 0.08236039617711652, + "grad_norm": 0.400390625, + "learning_rate": 0.0019647855591135776, + "loss": 
0.1357, + "step": 9488 + }, + { + "epoch": 0.08236907665732068, + "grad_norm": 0.1552734375, + "learning_rate": 0.0019647773147422076, + "loss": 0.1201, + "step": 9489 + }, + { + "epoch": 0.08237775713752485, + "grad_norm": 0.2734375, + "learning_rate": 0.00196476906942513, + "loss": 0.1064, + "step": 9490 + }, + { + "epoch": 0.08238643761772901, + "grad_norm": 0.92578125, + "learning_rate": 0.001964760823162354, + "loss": 0.1035, + "step": 9491 + }, + { + "epoch": 0.08239511809793318, + "grad_norm": 0.125, + "learning_rate": 0.0019647525759538885, + "loss": 0.1445, + "step": 9492 + }, + { + "epoch": 0.08240379857813734, + "grad_norm": 0.2197265625, + "learning_rate": 0.0019647443277997427, + "loss": 0.0986, + "step": 9493 + }, + { + "epoch": 0.08241247905834151, + "grad_norm": 0.337890625, + "learning_rate": 0.0019647360786999254, + "loss": 0.1045, + "step": 9494 + }, + { + "epoch": 0.08242115953854567, + "grad_norm": 0.171875, + "learning_rate": 0.0019647278286544457, + "loss": 0.1465, + "step": 9495 + }, + { + "epoch": 0.08242984001874984, + "grad_norm": 0.330078125, + "learning_rate": 0.0019647195776633128, + "loss": 0.1279, + "step": 9496 + }, + { + "epoch": 0.082438520498954, + "grad_norm": 0.09765625, + "learning_rate": 0.0019647113257265356, + "loss": 0.1074, + "step": 9497 + }, + { + "epoch": 0.08244720097915817, + "grad_norm": 0.1572265625, + "learning_rate": 0.001964703072844123, + "loss": 0.1123, + "step": 9498 + }, + { + "epoch": 0.08245588145936233, + "grad_norm": 0.126953125, + "learning_rate": 0.001964694819016084, + "loss": 0.1348, + "step": 9499 + }, + { + "epoch": 0.0824645619395665, + "grad_norm": 0.0830078125, + "learning_rate": 0.0019646865642424277, + "loss": 0.1211, + "step": 9500 + }, + { + "epoch": 0.08247324241977066, + "grad_norm": 0.7265625, + "learning_rate": 0.0019646783085231633, + "loss": 0.1045, + "step": 9501 + }, + { + "epoch": 0.08248192289997483, + "grad_norm": 0.28125, + "learning_rate": 0.0019646700518582997, + "loss": 0.1191, + "step": 9502 + }, + { + "epoch": 0.082490603380179, + "grad_norm": 0.306640625, + "learning_rate": 0.001964661794247846, + "loss": 0.1191, + "step": 9503 + }, + { + "epoch": 0.08249928386038316, + "grad_norm": 0.345703125, + "learning_rate": 0.001964653535691811, + "loss": 0.1807, + "step": 9504 + }, + { + "epoch": 0.08250796434058733, + "grad_norm": 0.380859375, + "learning_rate": 0.0019646452761902036, + "loss": 0.1143, + "step": 9505 + }, + { + "epoch": 0.08251664482079149, + "grad_norm": 0.5234375, + "learning_rate": 0.0019646370157430336, + "loss": 0.1055, + "step": 9506 + }, + { + "epoch": 0.08252532530099566, + "grad_norm": 0.10107421875, + "learning_rate": 0.0019646287543503096, + "loss": 0.1416, + "step": 9507 + }, + { + "epoch": 0.08253400578119982, + "grad_norm": 0.546875, + "learning_rate": 0.0019646204920120403, + "loss": 0.1445, + "step": 9508 + }, + { + "epoch": 0.08254268626140399, + "grad_norm": 0.65234375, + "learning_rate": 0.0019646122287282353, + "loss": 0.1914, + "step": 9509 + }, + { + "epoch": 0.08255136674160815, + "grad_norm": 0.109375, + "learning_rate": 0.001964603964498903, + "loss": 0.1406, + "step": 9510 + }, + { + "epoch": 0.08256004722181232, + "grad_norm": 0.55078125, + "learning_rate": 0.001964595699324053, + "loss": 0.1572, + "step": 9511 + }, + { + "epoch": 0.08256872770201648, + "grad_norm": 0.51171875, + "learning_rate": 0.001964587433203694, + "loss": 0.1367, + "step": 9512 + }, + { + "epoch": 0.08257740818222065, + "grad_norm": 0.2373046875, + "learning_rate": 0.0019645791661378354, + 
"loss": 0.0815, + "step": 9513 + }, + { + "epoch": 0.08258608866242481, + "grad_norm": 0.482421875, + "learning_rate": 0.0019645708981264856, + "loss": 0.1001, + "step": 9514 + }, + { + "epoch": 0.08259476914262898, + "grad_norm": 0.85546875, + "learning_rate": 0.0019645626291696547, + "loss": 0.0938, + "step": 9515 + }, + { + "epoch": 0.08260344962283314, + "grad_norm": 2.859375, + "learning_rate": 0.0019645543592673505, + "loss": 0.293, + "step": 9516 + }, + { + "epoch": 0.0826121301030373, + "grad_norm": 0.103515625, + "learning_rate": 0.001964546088419583, + "loss": 0.1162, + "step": 9517 + }, + { + "epoch": 0.08262081058324146, + "grad_norm": 0.1337890625, + "learning_rate": 0.0019645378166263604, + "loss": 0.0928, + "step": 9518 + }, + { + "epoch": 0.08262949106344562, + "grad_norm": 0.12255859375, + "learning_rate": 0.0019645295438876927, + "loss": 0.124, + "step": 9519 + }, + { + "epoch": 0.08263817154364979, + "grad_norm": 0.251953125, + "learning_rate": 0.0019645212702035886, + "loss": 0.1406, + "step": 9520 + }, + { + "epoch": 0.08264685202385395, + "grad_norm": 0.490234375, + "learning_rate": 0.0019645129955740566, + "loss": 0.1289, + "step": 9521 + }, + { + "epoch": 0.08265553250405812, + "grad_norm": 0.490234375, + "learning_rate": 0.0019645047199991065, + "loss": 0.1348, + "step": 9522 + }, + { + "epoch": 0.08266421298426228, + "grad_norm": 0.2080078125, + "learning_rate": 0.001964496443478747, + "loss": 0.0972, + "step": 9523 + }, + { + "epoch": 0.08267289346446645, + "grad_norm": 0.255859375, + "learning_rate": 0.001964488166012987, + "loss": 0.1147, + "step": 9524 + }, + { + "epoch": 0.08268157394467061, + "grad_norm": 0.12255859375, + "learning_rate": 0.001964479887601836, + "loss": 0.1465, + "step": 9525 + }, + { + "epoch": 0.08269025442487478, + "grad_norm": 0.208984375, + "learning_rate": 0.0019644716082453024, + "loss": 0.125, + "step": 9526 + }, + { + "epoch": 0.08269893490507894, + "grad_norm": 0.1181640625, + "learning_rate": 0.001964463327943396, + "loss": 0.1621, + "step": 9527 + }, + { + "epoch": 0.08270761538528311, + "grad_norm": 0.474609375, + "learning_rate": 0.001964455046696125, + "loss": 0.1562, + "step": 9528 + }, + { + "epoch": 0.08271629586548727, + "grad_norm": 0.408203125, + "learning_rate": 0.0019644467645034995, + "loss": 0.1182, + "step": 9529 + }, + { + "epoch": 0.08272497634569144, + "grad_norm": 1.375, + "learning_rate": 0.001964438481365528, + "loss": 0.0962, + "step": 9530 + }, + { + "epoch": 0.0827336568258956, + "grad_norm": 0.10107421875, + "learning_rate": 0.0019644301972822193, + "loss": 0.1289, + "step": 9531 + }, + { + "epoch": 0.08274233730609977, + "grad_norm": 0.279296875, + "learning_rate": 0.0019644219122535826, + "loss": 0.1504, + "step": 9532 + }, + { + "epoch": 0.08275101778630393, + "grad_norm": 0.5546875, + "learning_rate": 0.0019644136262796275, + "loss": 0.1089, + "step": 9533 + }, + { + "epoch": 0.0827596982665081, + "grad_norm": 0.609375, + "learning_rate": 0.001964405339360362, + "loss": 0.1084, + "step": 9534 + }, + { + "epoch": 0.08276837874671227, + "grad_norm": 0.1240234375, + "learning_rate": 0.0019643970514957964, + "loss": 0.1074, + "step": 9535 + }, + { + "epoch": 0.08277705922691643, + "grad_norm": 0.416015625, + "learning_rate": 0.0019643887626859388, + "loss": 0.1328, + "step": 9536 + }, + { + "epoch": 0.0827857397071206, + "grad_norm": 0.546875, + "learning_rate": 0.001964380472930799, + "loss": 0.1455, + "step": 9537 + }, + { + "epoch": 0.08279442018732476, + "grad_norm": 0.39453125, + "learning_rate": 
0.0019643721822303855, + "loss": 0.1021, + "step": 9538 + }, + { + "epoch": 0.08280310066752893, + "grad_norm": 0.50390625, + "learning_rate": 0.0019643638905847078, + "loss": 0.1592, + "step": 9539 + }, + { + "epoch": 0.08281178114773309, + "grad_norm": 0.09716796875, + "learning_rate": 0.0019643555979937744, + "loss": 0.1021, + "step": 9540 + }, + { + "epoch": 0.08282046162793726, + "grad_norm": 0.29296875, + "learning_rate": 0.001964347304457595, + "loss": 0.127, + "step": 9541 + }, + { + "epoch": 0.08282914210814142, + "grad_norm": 0.09033203125, + "learning_rate": 0.001964339009976178, + "loss": 0.1289, + "step": 9542 + }, + { + "epoch": 0.08283782258834559, + "grad_norm": 0.291015625, + "learning_rate": 0.001964330714549533, + "loss": 0.127, + "step": 9543 + }, + { + "epoch": 0.08284650306854975, + "grad_norm": 0.40625, + "learning_rate": 0.0019643224181776693, + "loss": 0.1455, + "step": 9544 + }, + { + "epoch": 0.08285518354875392, + "grad_norm": 0.921875, + "learning_rate": 0.001964314120860595, + "loss": 0.0845, + "step": 9545 + }, + { + "epoch": 0.08286386402895808, + "grad_norm": 0.30859375, + "learning_rate": 0.00196430582259832, + "loss": 0.1201, + "step": 9546 + }, + { + "epoch": 0.08287254450916225, + "grad_norm": 0.11767578125, + "learning_rate": 0.001964297523390853, + "loss": 0.1289, + "step": 9547 + }, + { + "epoch": 0.08288122498936641, + "grad_norm": 1.03125, + "learning_rate": 0.0019642892232382037, + "loss": 0.1475, + "step": 9548 + }, + { + "epoch": 0.08288990546957058, + "grad_norm": 0.357421875, + "learning_rate": 0.0019642809221403802, + "loss": 0.1084, + "step": 9549 + }, + { + "epoch": 0.08289858594977474, + "grad_norm": 0.255859375, + "learning_rate": 0.0019642726200973923, + "loss": 0.127, + "step": 9550 + }, + { + "epoch": 0.08290726642997891, + "grad_norm": 0.51171875, + "learning_rate": 0.001964264317109249, + "loss": 0.1211, + "step": 9551 + }, + { + "epoch": 0.08291594691018307, + "grad_norm": 0.72265625, + "learning_rate": 0.0019642560131759587, + "loss": 0.1523, + "step": 9552 + }, + { + "epoch": 0.08292462739038724, + "grad_norm": 0.4140625, + "learning_rate": 0.0019642477082975314, + "loss": 0.1641, + "step": 9553 + }, + { + "epoch": 0.0829333078705914, + "grad_norm": 0.30859375, + "learning_rate": 0.001964239402473976, + "loss": 0.1426, + "step": 9554 + }, + { + "epoch": 0.08294198835079557, + "grad_norm": 0.275390625, + "learning_rate": 0.0019642310957053008, + "loss": 0.1309, + "step": 9555 + }, + { + "epoch": 0.08295066883099973, + "grad_norm": 1.1796875, + "learning_rate": 0.001964222787991516, + "loss": 0.1123, + "step": 9556 + }, + { + "epoch": 0.0829593493112039, + "grad_norm": 0.271484375, + "learning_rate": 0.0019642144793326297, + "loss": 0.1201, + "step": 9557 + }, + { + "epoch": 0.08296802979140806, + "grad_norm": 0.09716796875, + "learning_rate": 0.0019642061697286517, + "loss": 0.1289, + "step": 9558 + }, + { + "epoch": 0.08297671027161223, + "grad_norm": 0.80859375, + "learning_rate": 0.0019641978591795907, + "loss": 0.1143, + "step": 9559 + }, + { + "epoch": 0.08298539075181639, + "grad_norm": 0.2734375, + "learning_rate": 0.001964189547685456, + "loss": 0.106, + "step": 9560 + }, + { + "epoch": 0.08299407123202056, + "grad_norm": 0.2197265625, + "learning_rate": 0.0019641812352462567, + "loss": 0.1201, + "step": 9561 + }, + { + "epoch": 0.08300275171222472, + "grad_norm": 0.494140625, + "learning_rate": 0.0019641729218620014, + "loss": 0.123, + "step": 9562 + }, + { + "epoch": 0.08301143219242889, + "grad_norm": 0.3828125, + 
"learning_rate": 0.0019641646075327, + "loss": 0.1143, + "step": 9563 + }, + { + "epoch": 0.08302011267263305, + "grad_norm": 0.62890625, + "learning_rate": 0.0019641562922583606, + "loss": 0.1104, + "step": 9564 + }, + { + "epoch": 0.08302879315283722, + "grad_norm": 0.10302734375, + "learning_rate": 0.0019641479760389934, + "loss": 0.1201, + "step": 9565 + }, + { + "epoch": 0.08303747363304138, + "grad_norm": 0.212890625, + "learning_rate": 0.001964139658874607, + "loss": 0.1172, + "step": 9566 + }, + { + "epoch": 0.08304615411324555, + "grad_norm": 0.283203125, + "learning_rate": 0.0019641313407652104, + "loss": 0.1299, + "step": 9567 + }, + { + "epoch": 0.08305483459344971, + "grad_norm": 0.220703125, + "learning_rate": 0.0019641230217108123, + "loss": 0.1221, + "step": 9568 + }, + { + "epoch": 0.08306351507365388, + "grad_norm": 0.2578125, + "learning_rate": 0.0019641147017114224, + "loss": 0.0947, + "step": 9569 + }, + { + "epoch": 0.08307219555385804, + "grad_norm": 0.326171875, + "learning_rate": 0.00196410638076705, + "loss": 0.0991, + "step": 9570 + }, + { + "epoch": 0.08308087603406221, + "grad_norm": 0.1572265625, + "learning_rate": 0.001964098058877703, + "loss": 0.1016, + "step": 9571 + }, + { + "epoch": 0.08308955651426637, + "grad_norm": 0.314453125, + "learning_rate": 0.0019640897360433925, + "loss": 0.1162, + "step": 9572 + }, + { + "epoch": 0.08309823699447054, + "grad_norm": 0.46484375, + "learning_rate": 0.001964081412264126, + "loss": 0.1162, + "step": 9573 + }, + { + "epoch": 0.0831069174746747, + "grad_norm": 0.279296875, + "learning_rate": 0.001964073087539913, + "loss": 0.1523, + "step": 9574 + }, + { + "epoch": 0.08311559795487887, + "grad_norm": 0.48828125, + "learning_rate": 0.0019640647618707624, + "loss": 0.1309, + "step": 9575 + }, + { + "epoch": 0.08312427843508303, + "grad_norm": 0.2275390625, + "learning_rate": 0.001964056435256684, + "loss": 0.1387, + "step": 9576 + }, + { + "epoch": 0.0831329589152872, + "grad_norm": 0.326171875, + "learning_rate": 0.001964048107697686, + "loss": 0.1475, + "step": 9577 + }, + { + "epoch": 0.08314163939549137, + "grad_norm": 0.1142578125, + "learning_rate": 0.0019640397791937784, + "loss": 0.1699, + "step": 9578 + }, + { + "epoch": 0.08315031987569553, + "grad_norm": 0.5546875, + "learning_rate": 0.0019640314497449698, + "loss": 0.1245, + "step": 9579 + }, + { + "epoch": 0.08315900035589968, + "grad_norm": 0.259765625, + "learning_rate": 0.0019640231193512694, + "loss": 0.1318, + "step": 9580 + }, + { + "epoch": 0.08316768083610385, + "grad_norm": 0.0849609375, + "learning_rate": 0.001964014788012686, + "loss": 0.0996, + "step": 9581 + }, + { + "epoch": 0.08317636131630801, + "grad_norm": 0.451171875, + "learning_rate": 0.001964006455729229, + "loss": 0.1216, + "step": 9582 + }, + { + "epoch": 0.08318504179651218, + "grad_norm": 0.2294921875, + "learning_rate": 0.0019639981225009076, + "loss": 0.1299, + "step": 9583 + }, + { + "epoch": 0.08319372227671634, + "grad_norm": 0.5078125, + "learning_rate": 0.0019639897883277314, + "loss": 0.1289, + "step": 9584 + }, + { + "epoch": 0.08320240275692051, + "grad_norm": 0.337890625, + "learning_rate": 0.0019639814532097085, + "loss": 0.1094, + "step": 9585 + }, + { + "epoch": 0.08321108323712467, + "grad_norm": 1.0390625, + "learning_rate": 0.0019639731171468486, + "loss": 0.1045, + "step": 9586 + }, + { + "epoch": 0.08321976371732884, + "grad_norm": 0.44140625, + "learning_rate": 0.00196396478013916, + "loss": 0.1011, + "step": 9587 + }, + { + "epoch": 0.083228444197533, + 
"grad_norm": 0.38671875, + "learning_rate": 0.0019639564421866534, + "loss": 0.1484, + "step": 9588 + }, + { + "epoch": 0.08323712467773717, + "grad_norm": 0.671875, + "learning_rate": 0.001963948103289337, + "loss": 0.1064, + "step": 9589 + }, + { + "epoch": 0.08324580515794133, + "grad_norm": 0.345703125, + "learning_rate": 0.001963939763447219, + "loss": 0.1089, + "step": 9590 + }, + { + "epoch": 0.0832544856381455, + "grad_norm": 0.208984375, + "learning_rate": 0.0019639314226603103, + "loss": 0.1055, + "step": 9591 + }, + { + "epoch": 0.08326316611834966, + "grad_norm": 0.703125, + "learning_rate": 0.0019639230809286186, + "loss": 0.1416, + "step": 9592 + }, + { + "epoch": 0.08327184659855383, + "grad_norm": 0.267578125, + "learning_rate": 0.0019639147382521545, + "loss": 0.1113, + "step": 9593 + }, + { + "epoch": 0.083280527078758, + "grad_norm": 0.384765625, + "learning_rate": 0.0019639063946309252, + "loss": 0.1455, + "step": 9594 + }, + { + "epoch": 0.08328920755896216, + "grad_norm": 0.30078125, + "learning_rate": 0.0019638980500649413, + "loss": 0.0903, + "step": 9595 + }, + { + "epoch": 0.08329788803916632, + "grad_norm": 0.1904296875, + "learning_rate": 0.0019638897045542118, + "loss": 0.104, + "step": 9596 + }, + { + "epoch": 0.08330656851937049, + "grad_norm": 0.07470703125, + "learning_rate": 0.0019638813580987453, + "loss": 0.123, + "step": 9597 + }, + { + "epoch": 0.08331524899957465, + "grad_norm": 0.40625, + "learning_rate": 0.0019638730106985506, + "loss": 0.1147, + "step": 9598 + }, + { + "epoch": 0.08332392947977882, + "grad_norm": 0.251953125, + "learning_rate": 0.0019638646623536377, + "loss": 0.1396, + "step": 9599 + }, + { + "epoch": 0.08333260995998298, + "grad_norm": 0.421875, + "learning_rate": 0.0019638563130640156, + "loss": 0.1396, + "step": 9600 + }, + { + "epoch": 0.08334129044018715, + "grad_norm": 0.357421875, + "learning_rate": 0.001963847962829693, + "loss": 0.1104, + "step": 9601 + }, + { + "epoch": 0.08334997092039131, + "grad_norm": 0.09130859375, + "learning_rate": 0.0019638396116506795, + "loss": 0.1416, + "step": 9602 + }, + { + "epoch": 0.08335865140059548, + "grad_norm": 0.640625, + "learning_rate": 0.0019638312595269838, + "loss": 0.1611, + "step": 9603 + }, + { + "epoch": 0.08336733188079964, + "grad_norm": 0.373046875, + "learning_rate": 0.0019638229064586153, + "loss": 0.1465, + "step": 9604 + }, + { + "epoch": 0.08337601236100381, + "grad_norm": 0.64453125, + "learning_rate": 0.0019638145524455827, + "loss": 0.1211, + "step": 9605 + }, + { + "epoch": 0.08338469284120797, + "grad_norm": 0.07275390625, + "learning_rate": 0.0019638061974878956, + "loss": 0.1064, + "step": 9606 + }, + { + "epoch": 0.08339337332141214, + "grad_norm": 0.177734375, + "learning_rate": 0.001963797841585563, + "loss": 0.166, + "step": 9607 + }, + { + "epoch": 0.0834020538016163, + "grad_norm": 0.1181640625, + "learning_rate": 0.0019637894847385944, + "loss": 0.1162, + "step": 9608 + }, + { + "epoch": 0.08341073428182047, + "grad_norm": 0.271484375, + "learning_rate": 0.0019637811269469984, + "loss": 0.1621, + "step": 9609 + }, + { + "epoch": 0.08341941476202464, + "grad_norm": 0.2294921875, + "learning_rate": 0.0019637727682107844, + "loss": 0.1221, + "step": 9610 + }, + { + "epoch": 0.0834280952422288, + "grad_norm": 0.06591796875, + "learning_rate": 0.0019637644085299614, + "loss": 0.1074, + "step": 9611 + }, + { + "epoch": 0.08343677572243297, + "grad_norm": 2.09375, + "learning_rate": 0.0019637560479045385, + "loss": 0.293, + "step": 9612 + }, + { + "epoch": 
0.08344545620263713, + "grad_norm": 0.16796875, + "learning_rate": 0.0019637476863345253, + "loss": 0.1318, + "step": 9613 + }, + { + "epoch": 0.0834541366828413, + "grad_norm": 0.54296875, + "learning_rate": 0.0019637393238199304, + "loss": 0.1182, + "step": 9614 + }, + { + "epoch": 0.08346281716304546, + "grad_norm": 0.546875, + "learning_rate": 0.001963730960360763, + "loss": 0.1396, + "step": 9615 + }, + { + "epoch": 0.08347149764324963, + "grad_norm": 0.376953125, + "learning_rate": 0.0019637225959570326, + "loss": 0.1118, + "step": 9616 + }, + { + "epoch": 0.08348017812345379, + "grad_norm": 0.2734375, + "learning_rate": 0.0019637142306087483, + "loss": 0.1387, + "step": 9617 + }, + { + "epoch": 0.08348885860365796, + "grad_norm": 0.44140625, + "learning_rate": 0.001963705864315919, + "loss": 0.104, + "step": 9618 + }, + { + "epoch": 0.08349753908386212, + "grad_norm": 0.37109375, + "learning_rate": 0.0019636974970785536, + "loss": 0.1592, + "step": 9619 + }, + { + "epoch": 0.08350621956406629, + "grad_norm": 0.09521484375, + "learning_rate": 0.0019636891288966622, + "loss": 0.1016, + "step": 9620 + }, + { + "epoch": 0.08351490004427045, + "grad_norm": 0.1962890625, + "learning_rate": 0.001963680759770253, + "loss": 0.1069, + "step": 9621 + }, + { + "epoch": 0.08352358052447462, + "grad_norm": 0.80078125, + "learning_rate": 0.001963672389699335, + "loss": 0.1367, + "step": 9622 + }, + { + "epoch": 0.08353226100467878, + "grad_norm": 0.11376953125, + "learning_rate": 0.0019636640186839185, + "loss": 0.1562, + "step": 9623 + }, + { + "epoch": 0.08354094148488295, + "grad_norm": 0.474609375, + "learning_rate": 0.001963655646724012, + "loss": 0.0952, + "step": 9624 + }, + { + "epoch": 0.08354962196508711, + "grad_norm": 0.25, + "learning_rate": 0.0019636472738196243, + "loss": 0.0938, + "step": 9625 + }, + { + "epoch": 0.08355830244529128, + "grad_norm": 0.412109375, + "learning_rate": 0.001963638899970765, + "loss": 0.1206, + "step": 9626 + }, + { + "epoch": 0.08356698292549544, + "grad_norm": 0.119140625, + "learning_rate": 0.0019636305251774437, + "loss": 0.123, + "step": 9627 + }, + { + "epoch": 0.08357566340569961, + "grad_norm": 0.61328125, + "learning_rate": 0.0019636221494396685, + "loss": 0.126, + "step": 9628 + }, + { + "epoch": 0.08358434388590377, + "grad_norm": 0.349609375, + "learning_rate": 0.001963613772757449, + "loss": 0.1387, + "step": 9629 + }, + { + "epoch": 0.08359302436610794, + "grad_norm": 0.47265625, + "learning_rate": 0.001963605395130795, + "loss": 0.0991, + "step": 9630 + }, + { + "epoch": 0.0836017048463121, + "grad_norm": 0.2578125, + "learning_rate": 0.0019635970165597146, + "loss": 0.1025, + "step": 9631 + }, + { + "epoch": 0.08361038532651627, + "grad_norm": 0.1044921875, + "learning_rate": 0.0019635886370442178, + "loss": 0.127, + "step": 9632 + }, + { + "epoch": 0.08361906580672043, + "grad_norm": 0.275390625, + "learning_rate": 0.0019635802565843136, + "loss": 0.1162, + "step": 9633 + }, + { + "epoch": 0.0836277462869246, + "grad_norm": 0.1875, + "learning_rate": 0.0019635718751800104, + "loss": 0.1406, + "step": 9634 + }, + { + "epoch": 0.08363642676712876, + "grad_norm": 0.1611328125, + "learning_rate": 0.001963563492831318, + "loss": 0.1055, + "step": 9635 + }, + { + "epoch": 0.08364510724733293, + "grad_norm": 0.140625, + "learning_rate": 0.001963555109538246, + "loss": 0.1426, + "step": 9636 + }, + { + "epoch": 0.0836537877275371, + "grad_norm": 0.75, + "learning_rate": 0.0019635467253008034, + "loss": 0.1348, + "step": 9637 + }, + { + "epoch": 
0.08366246820774126, + "grad_norm": 0.46484375, + "learning_rate": 0.0019635383401189984, + "loss": 0.1064, + "step": 9638 + }, + { + "epoch": 0.08367114868794542, + "grad_norm": 0.287109375, + "learning_rate": 0.0019635299539928408, + "loss": 0.1309, + "step": 9639 + }, + { + "epoch": 0.08367982916814959, + "grad_norm": 0.28125, + "learning_rate": 0.0019635215669223404, + "loss": 0.1299, + "step": 9640 + }, + { + "epoch": 0.08368850964835375, + "grad_norm": 0.11767578125, + "learning_rate": 0.001963513178907505, + "loss": 0.1582, + "step": 9641 + }, + { + "epoch": 0.0836971901285579, + "grad_norm": 0.10302734375, + "learning_rate": 0.0019635047899483455, + "loss": 0.1309, + "step": 9642 + }, + { + "epoch": 0.08370587060876207, + "grad_norm": 0.2333984375, + "learning_rate": 0.0019634964000448695, + "loss": 0.125, + "step": 9643 + }, + { + "epoch": 0.08371455108896624, + "grad_norm": 1.34375, + "learning_rate": 0.001963488009197087, + "loss": 0.1562, + "step": 9644 + }, + { + "epoch": 0.0837232315691704, + "grad_norm": 0.142578125, + "learning_rate": 0.001963479617405007, + "loss": 0.1006, + "step": 9645 + }, + { + "epoch": 0.08373191204937457, + "grad_norm": 0.1572265625, + "learning_rate": 0.0019634712246686388, + "loss": 0.1094, + "step": 9646 + }, + { + "epoch": 0.08374059252957873, + "grad_norm": 1.1328125, + "learning_rate": 0.001963462830987991, + "loss": 0.1992, + "step": 9647 + }, + { + "epoch": 0.0837492730097829, + "grad_norm": 0.22265625, + "learning_rate": 0.0019634544363630736, + "loss": 0.1299, + "step": 9648 + }, + { + "epoch": 0.08375795348998706, + "grad_norm": 0.43359375, + "learning_rate": 0.0019634460407938954, + "loss": 0.1357, + "step": 9649 + }, + { + "epoch": 0.08376663397019123, + "grad_norm": 0.30859375, + "learning_rate": 0.0019634376442804655, + "loss": 0.1055, + "step": 9650 + }, + { + "epoch": 0.08377531445039539, + "grad_norm": 0.5078125, + "learning_rate": 0.0019634292468227927, + "loss": 0.1338, + "step": 9651 + }, + { + "epoch": 0.08378399493059956, + "grad_norm": 0.400390625, + "learning_rate": 0.0019634208484208874, + "loss": 0.1035, + "step": 9652 + }, + { + "epoch": 0.08379267541080372, + "grad_norm": 0.48828125, + "learning_rate": 0.0019634124490747577, + "loss": 0.1182, + "step": 9653 + }, + { + "epoch": 0.08380135589100789, + "grad_norm": 0.396484375, + "learning_rate": 0.001963404048784413, + "loss": 0.1152, + "step": 9654 + }, + { + "epoch": 0.08381003637121205, + "grad_norm": 0.369140625, + "learning_rate": 0.001963395647549863, + "loss": 0.1289, + "step": 9655 + }, + { + "epoch": 0.08381871685141622, + "grad_norm": 0.65625, + "learning_rate": 0.0019633872453711163, + "loss": 0.1406, + "step": 9656 + }, + { + "epoch": 0.08382739733162038, + "grad_norm": 0.333984375, + "learning_rate": 0.001963378842248182, + "loss": 0.1475, + "step": 9657 + }, + { + "epoch": 0.08383607781182455, + "grad_norm": 0.3828125, + "learning_rate": 0.00196337043818107, + "loss": 0.123, + "step": 9658 + }, + { + "epoch": 0.08384475829202871, + "grad_norm": 0.53515625, + "learning_rate": 0.0019633620331697887, + "loss": 0.1133, + "step": 9659 + }, + { + "epoch": 0.08385343877223288, + "grad_norm": 0.287109375, + "learning_rate": 0.001963353627214348, + "loss": 0.123, + "step": 9660 + }, + { + "epoch": 0.08386211925243704, + "grad_norm": 0.1611328125, + "learning_rate": 0.0019633452203147565, + "loss": 0.1328, + "step": 9661 + }, + { + "epoch": 0.08387079973264121, + "grad_norm": 0.19921875, + "learning_rate": 0.0019633368124710236, + "loss": 0.1289, + "step": 9662 + }, + { 
+ "epoch": 0.08387948021284537, + "grad_norm": 0.125, + "learning_rate": 0.0019633284036831584, + "loss": 0.1138, + "step": 9663 + }, + { + "epoch": 0.08388816069304954, + "grad_norm": 0.1982421875, + "learning_rate": 0.0019633199939511708, + "loss": 0.1611, + "step": 9664 + }, + { + "epoch": 0.0838968411732537, + "grad_norm": 0.55859375, + "learning_rate": 0.001963311583275069, + "loss": 0.1484, + "step": 9665 + }, + { + "epoch": 0.08390552165345787, + "grad_norm": 0.294921875, + "learning_rate": 0.0019633031716548626, + "loss": 0.1055, + "step": 9666 + }, + { + "epoch": 0.08391420213366203, + "grad_norm": 0.87109375, + "learning_rate": 0.001963294759090561, + "loss": 0.1309, + "step": 9667 + }, + { + "epoch": 0.0839228826138662, + "grad_norm": 0.23828125, + "learning_rate": 0.001963286345582173, + "loss": 0.1465, + "step": 9668 + }, + { + "epoch": 0.08393156309407036, + "grad_norm": 0.97265625, + "learning_rate": 0.0019632779311297083, + "loss": 0.1309, + "step": 9669 + }, + { + "epoch": 0.08394024357427453, + "grad_norm": 0.515625, + "learning_rate": 0.0019632695157331755, + "loss": 0.124, + "step": 9670 + }, + { + "epoch": 0.0839489240544787, + "grad_norm": 0.1015625, + "learning_rate": 0.0019632610993925844, + "loss": 0.1328, + "step": 9671 + }, + { + "epoch": 0.08395760453468286, + "grad_norm": 1.1640625, + "learning_rate": 0.001963252682107944, + "loss": 0.2383, + "step": 9672 + }, + { + "epoch": 0.08396628501488702, + "grad_norm": 0.62109375, + "learning_rate": 0.0019632442638792633, + "loss": 0.0972, + "step": 9673 + }, + { + "epoch": 0.08397496549509119, + "grad_norm": 0.1865234375, + "learning_rate": 0.0019632358447065515, + "loss": 0.1289, + "step": 9674 + }, + { + "epoch": 0.08398364597529535, + "grad_norm": 0.1201171875, + "learning_rate": 0.0019632274245898183, + "loss": 0.1113, + "step": 9675 + }, + { + "epoch": 0.08399232645549952, + "grad_norm": 0.1689453125, + "learning_rate": 0.001963219003529072, + "loss": 0.1084, + "step": 9676 + }, + { + "epoch": 0.08400100693570368, + "grad_norm": 0.076171875, + "learning_rate": 0.0019632105815243226, + "loss": 0.1035, + "step": 9677 + }, + { + "epoch": 0.08400968741590785, + "grad_norm": 0.435546875, + "learning_rate": 0.0019632021585755796, + "loss": 0.1074, + "step": 9678 + }, + { + "epoch": 0.08401836789611201, + "grad_norm": 0.16796875, + "learning_rate": 0.001963193734682851, + "loss": 0.1289, + "step": 9679 + }, + { + "epoch": 0.08402704837631618, + "grad_norm": 0.6953125, + "learning_rate": 0.001963185309846147, + "loss": 0.2363, + "step": 9680 + }, + { + "epoch": 0.08403572885652034, + "grad_norm": 0.11572265625, + "learning_rate": 0.0019631768840654767, + "loss": 0.1475, + "step": 9681 + }, + { + "epoch": 0.08404440933672451, + "grad_norm": 0.228515625, + "learning_rate": 0.0019631684573408487, + "loss": 0.1436, + "step": 9682 + }, + { + "epoch": 0.08405308981692867, + "grad_norm": 0.55078125, + "learning_rate": 0.0019631600296722727, + "loss": 0.1387, + "step": 9683 + }, + { + "epoch": 0.08406177029713284, + "grad_norm": 0.201171875, + "learning_rate": 0.001963151601059758, + "loss": 0.1221, + "step": 9684 + }, + { + "epoch": 0.084070450777337, + "grad_norm": 0.453125, + "learning_rate": 0.001963143171503314, + "loss": 0.2051, + "step": 9685 + }, + { + "epoch": 0.08407913125754117, + "grad_norm": 0.1416015625, + "learning_rate": 0.0019631347410029496, + "loss": 0.1543, + "step": 9686 + }, + { + "epoch": 0.08408781173774534, + "grad_norm": 0.318359375, + "learning_rate": 0.0019631263095586733, + "loss": 0.1709, + "step": 
9687 + }, + { + "epoch": 0.0840964922179495, + "grad_norm": 0.396484375, + "learning_rate": 0.0019631178771704954, + "loss": 0.127, + "step": 9688 + }, + { + "epoch": 0.08410517269815367, + "grad_norm": 0.81640625, + "learning_rate": 0.001963109443838425, + "loss": 0.1455, + "step": 9689 + }, + { + "epoch": 0.08411385317835783, + "grad_norm": 0.263671875, + "learning_rate": 0.0019631010095624707, + "loss": 0.1191, + "step": 9690 + }, + { + "epoch": 0.084122533658562, + "grad_norm": 0.98828125, + "learning_rate": 0.001963092574342642, + "loss": 0.1992, + "step": 9691 + }, + { + "epoch": 0.08413121413876616, + "grad_norm": 0.11474609375, + "learning_rate": 0.0019630841381789487, + "loss": 0.1123, + "step": 9692 + }, + { + "epoch": 0.08413989461897033, + "grad_norm": 0.5859375, + "learning_rate": 0.0019630757010713996, + "loss": 0.1084, + "step": 9693 + }, + { + "epoch": 0.08414857509917449, + "grad_norm": 0.1220703125, + "learning_rate": 0.0019630672630200035, + "loss": 0.1318, + "step": 9694 + }, + { + "epoch": 0.08415725557937866, + "grad_norm": 0.41015625, + "learning_rate": 0.00196305882402477, + "loss": 0.1895, + "step": 9695 + }, + { + "epoch": 0.08416593605958282, + "grad_norm": 0.244140625, + "learning_rate": 0.0019630503840857086, + "loss": 0.1104, + "step": 9696 + }, + { + "epoch": 0.08417461653978699, + "grad_norm": 0.35546875, + "learning_rate": 0.0019630419432028283, + "loss": 0.1709, + "step": 9697 + }, + { + "epoch": 0.08418329701999115, + "grad_norm": 0.2890625, + "learning_rate": 0.0019630335013761384, + "loss": 0.127, + "step": 9698 + }, + { + "epoch": 0.08419197750019532, + "grad_norm": 0.26953125, + "learning_rate": 0.0019630250586056474, + "loss": 0.1348, + "step": 9699 + }, + { + "epoch": 0.08420065798039948, + "grad_norm": 0.171875, + "learning_rate": 0.001963016614891366, + "loss": 0.1143, + "step": 9700 + }, + { + "epoch": 0.08420933846060365, + "grad_norm": 0.2275390625, + "learning_rate": 0.001963008170233302, + "loss": 0.1836, + "step": 9701 + }, + { + "epoch": 0.08421801894080781, + "grad_norm": 0.5078125, + "learning_rate": 0.0019629997246314655, + "loss": 0.1162, + "step": 9702 + }, + { + "epoch": 0.08422669942101196, + "grad_norm": 0.75390625, + "learning_rate": 0.0019629912780858654, + "loss": 0.0957, + "step": 9703 + }, + { + "epoch": 0.08423537990121613, + "grad_norm": 0.4609375, + "learning_rate": 0.001962982830596511, + "loss": 0.1406, + "step": 9704 + }, + { + "epoch": 0.0842440603814203, + "grad_norm": 0.32421875, + "learning_rate": 0.0019629743821634113, + "loss": 0.1553, + "step": 9705 + }, + { + "epoch": 0.08425274086162446, + "grad_norm": 1.5234375, + "learning_rate": 0.001962965932786576, + "loss": 0.2617, + "step": 9706 + }, + { + "epoch": 0.08426142134182862, + "grad_norm": 0.330078125, + "learning_rate": 0.001962957482466014, + "loss": 0.1377, + "step": 9707 + }, + { + "epoch": 0.08427010182203279, + "grad_norm": 0.2255859375, + "learning_rate": 0.0019629490312017346, + "loss": 0.1191, + "step": 9708 + }, + { + "epoch": 0.08427878230223695, + "grad_norm": 0.546875, + "learning_rate": 0.0019629405789937477, + "loss": 0.1475, + "step": 9709 + }, + { + "epoch": 0.08428746278244112, + "grad_norm": 0.23046875, + "learning_rate": 0.0019629321258420613, + "loss": 0.125, + "step": 9710 + }, + { + "epoch": 0.08429614326264528, + "grad_norm": 0.2314453125, + "learning_rate": 0.0019629236717466857, + "loss": 0.1108, + "step": 9711 + }, + { + "epoch": 0.08430482374284945, + "grad_norm": 0.953125, + "learning_rate": 0.0019629152167076297, + "loss": 0.1631, + 
"step": 9712 + }, + { + "epoch": 0.08431350422305361, + "grad_norm": 0.498046875, + "learning_rate": 0.0019629067607249023, + "loss": 0.125, + "step": 9713 + }, + { + "epoch": 0.08432218470325778, + "grad_norm": 0.220703125, + "learning_rate": 0.001962898303798513, + "loss": 0.1172, + "step": 9714 + }, + { + "epoch": 0.08433086518346194, + "grad_norm": 0.23828125, + "learning_rate": 0.001962889845928471, + "loss": 0.127, + "step": 9715 + }, + { + "epoch": 0.08433954566366611, + "grad_norm": 0.1748046875, + "learning_rate": 0.001962881387114786, + "loss": 0.1172, + "step": 9716 + }, + { + "epoch": 0.08434822614387028, + "grad_norm": 0.73046875, + "learning_rate": 0.001962872927357467, + "loss": 0.1592, + "step": 9717 + }, + { + "epoch": 0.08435690662407444, + "grad_norm": 1.171875, + "learning_rate": 0.001962864466656523, + "loss": 0.2695, + "step": 9718 + }, + { + "epoch": 0.0843655871042786, + "grad_norm": 0.146484375, + "learning_rate": 0.001962856005011963, + "loss": 0.1113, + "step": 9719 + }, + { + "epoch": 0.08437426758448277, + "grad_norm": 0.369140625, + "learning_rate": 0.001962847542423797, + "loss": 0.0962, + "step": 9720 + }, + { + "epoch": 0.08438294806468694, + "grad_norm": 0.75, + "learning_rate": 0.0019628390788920336, + "loss": 0.1748, + "step": 9721 + }, + { + "epoch": 0.0843916285448911, + "grad_norm": 0.1123046875, + "learning_rate": 0.001962830614416683, + "loss": 0.1113, + "step": 9722 + }, + { + "epoch": 0.08440030902509527, + "grad_norm": 1.3125, + "learning_rate": 0.0019628221489977533, + "loss": 0.168, + "step": 9723 + }, + { + "epoch": 0.08440898950529943, + "grad_norm": 0.1748046875, + "learning_rate": 0.001962813682635254, + "loss": 0.165, + "step": 9724 + }, + { + "epoch": 0.0844176699855036, + "grad_norm": 0.1298828125, + "learning_rate": 0.001962805215329195, + "loss": 0.1426, + "step": 9725 + }, + { + "epoch": 0.08442635046570776, + "grad_norm": 0.671875, + "learning_rate": 0.001962796747079585, + "loss": 0.0854, + "step": 9726 + }, + { + "epoch": 0.08443503094591193, + "grad_norm": 0.36328125, + "learning_rate": 0.0019627882778864335, + "loss": 0.1348, + "step": 9727 + }, + { + "epoch": 0.08444371142611609, + "grad_norm": 0.1806640625, + "learning_rate": 0.00196277980774975, + "loss": 0.1221, + "step": 9728 + }, + { + "epoch": 0.08445239190632026, + "grad_norm": 0.1142578125, + "learning_rate": 0.0019627713366695432, + "loss": 0.1128, + "step": 9729 + }, + { + "epoch": 0.08446107238652442, + "grad_norm": 0.1904296875, + "learning_rate": 0.0019627628646458227, + "loss": 0.1128, + "step": 9730 + }, + { + "epoch": 0.08446975286672859, + "grad_norm": 0.271484375, + "learning_rate": 0.0019627543916785977, + "loss": 0.1367, + "step": 9731 + }, + { + "epoch": 0.08447843334693275, + "grad_norm": 0.10107421875, + "learning_rate": 0.0019627459177678774, + "loss": 0.1118, + "step": 9732 + }, + { + "epoch": 0.08448711382713692, + "grad_norm": 0.26171875, + "learning_rate": 0.001962737442913671, + "loss": 0.1309, + "step": 9733 + }, + { + "epoch": 0.08449579430734108, + "grad_norm": 0.1005859375, + "learning_rate": 0.0019627289671159884, + "loss": 0.1484, + "step": 9734 + }, + { + "epoch": 0.08450447478754525, + "grad_norm": 0.1162109375, + "learning_rate": 0.001962720490374838, + "loss": 0.127, + "step": 9735 + }, + { + "epoch": 0.08451315526774941, + "grad_norm": 1.1328125, + "learning_rate": 0.001962712012690229, + "loss": 0.1523, + "step": 9736 + }, + { + "epoch": 0.08452183574795358, + "grad_norm": 0.5625, + "learning_rate": 0.0019627035340621713, + "loss": 
0.1582, + "step": 9737 + }, + { + "epoch": 0.08453051622815774, + "grad_norm": 0.306640625, + "learning_rate": 0.0019626950544906745, + "loss": 0.1338, + "step": 9738 + }, + { + "epoch": 0.08453919670836191, + "grad_norm": 0.6015625, + "learning_rate": 0.001962686573975747, + "loss": 0.1182, + "step": 9739 + }, + { + "epoch": 0.08454787718856607, + "grad_norm": 0.09716796875, + "learning_rate": 0.0019626780925173985, + "loss": 0.1338, + "step": 9740 + }, + { + "epoch": 0.08455655766877024, + "grad_norm": 0.173828125, + "learning_rate": 0.0019626696101156384, + "loss": 0.1001, + "step": 9741 + }, + { + "epoch": 0.0845652381489744, + "grad_norm": 0.1572265625, + "learning_rate": 0.0019626611267704753, + "loss": 0.0933, + "step": 9742 + }, + { + "epoch": 0.08457391862917857, + "grad_norm": 0.078125, + "learning_rate": 0.0019626526424819193, + "loss": 0.1187, + "step": 9743 + }, + { + "epoch": 0.08458259910938273, + "grad_norm": 0.421875, + "learning_rate": 0.0019626441572499795, + "loss": 0.1875, + "step": 9744 + }, + { + "epoch": 0.0845912795895869, + "grad_norm": 0.2158203125, + "learning_rate": 0.0019626356710746645, + "loss": 0.127, + "step": 9745 + }, + { + "epoch": 0.08459996006979106, + "grad_norm": 0.130859375, + "learning_rate": 0.0019626271839559843, + "loss": 0.1055, + "step": 9746 + }, + { + "epoch": 0.08460864054999523, + "grad_norm": 0.115234375, + "learning_rate": 0.001962618695893948, + "loss": 0.168, + "step": 9747 + }, + { + "epoch": 0.0846173210301994, + "grad_norm": 0.263671875, + "learning_rate": 0.0019626102068885654, + "loss": 0.1152, + "step": 9748 + }, + { + "epoch": 0.08462600151040356, + "grad_norm": 0.37890625, + "learning_rate": 0.0019626017169398443, + "loss": 0.125, + "step": 9749 + }, + { + "epoch": 0.08463468199060772, + "grad_norm": 0.2734375, + "learning_rate": 0.0019625932260477954, + "loss": 0.167, + "step": 9750 + }, + { + "epoch": 0.08464336247081189, + "grad_norm": 0.15625, + "learning_rate": 0.0019625847342124277, + "loss": 0.1396, + "step": 9751 + }, + { + "epoch": 0.08465204295101605, + "grad_norm": 0.09619140625, + "learning_rate": 0.00196257624143375, + "loss": 0.0933, + "step": 9752 + }, + { + "epoch": 0.08466072343122022, + "grad_norm": 0.6171875, + "learning_rate": 0.0019625677477117715, + "loss": 0.1299, + "step": 9753 + }, + { + "epoch": 0.08466940391142438, + "grad_norm": 0.25390625, + "learning_rate": 0.0019625592530465026, + "loss": 0.1133, + "step": 9754 + }, + { + "epoch": 0.08467808439162855, + "grad_norm": 0.466796875, + "learning_rate": 0.0019625507574379517, + "loss": 0.1738, + "step": 9755 + }, + { + "epoch": 0.08468676487183271, + "grad_norm": 0.56640625, + "learning_rate": 0.0019625422608861277, + "loss": 0.1689, + "step": 9756 + }, + { + "epoch": 0.08469544535203688, + "grad_norm": 0.09033203125, + "learning_rate": 0.001962533763391041, + "loss": 0.1582, + "step": 9757 + }, + { + "epoch": 0.08470412583224105, + "grad_norm": 0.37109375, + "learning_rate": 0.0019625252649527, + "loss": 0.1172, + "step": 9758 + }, + { + "epoch": 0.08471280631244521, + "grad_norm": 1.25, + "learning_rate": 0.001962516765571115, + "loss": 0.1484, + "step": 9759 + }, + { + "epoch": 0.08472148679264938, + "grad_norm": 0.150390625, + "learning_rate": 0.001962508265246294, + "loss": 0.1377, + "step": 9760 + }, + { + "epoch": 0.08473016727285354, + "grad_norm": 0.48828125, + "learning_rate": 0.001962499763978247, + "loss": 0.1367, + "step": 9761 + }, + { + "epoch": 0.0847388477530577, + "grad_norm": 0.95703125, + "learning_rate": 0.0019624912617669833, + 
"loss": 0.1377, + "step": 9762 + }, + { + "epoch": 0.08474752823326187, + "grad_norm": 0.0859375, + "learning_rate": 0.001962482758612512, + "loss": 0.1523, + "step": 9763 + }, + { + "epoch": 0.08475620871346604, + "grad_norm": 0.67578125, + "learning_rate": 0.0019624742545148424, + "loss": 0.1514, + "step": 9764 + }, + { + "epoch": 0.08476488919367019, + "grad_norm": 0.1728515625, + "learning_rate": 0.0019624657494739842, + "loss": 0.1357, + "step": 9765 + }, + { + "epoch": 0.08477356967387435, + "grad_norm": 0.169921875, + "learning_rate": 0.0019624572434899464, + "loss": 0.1143, + "step": 9766 + }, + { + "epoch": 0.08478225015407852, + "grad_norm": 0.07666015625, + "learning_rate": 0.001962448736562738, + "loss": 0.1582, + "step": 9767 + }, + { + "epoch": 0.08479093063428268, + "grad_norm": 0.09033203125, + "learning_rate": 0.001962440228692369, + "loss": 0.1138, + "step": 9768 + }, + { + "epoch": 0.08479961111448685, + "grad_norm": 0.416015625, + "learning_rate": 0.0019624317198788485, + "loss": 0.1084, + "step": 9769 + }, + { + "epoch": 0.08480829159469101, + "grad_norm": 0.142578125, + "learning_rate": 0.0019624232101221853, + "loss": 0.0938, + "step": 9770 + }, + { + "epoch": 0.08481697207489518, + "grad_norm": 0.130859375, + "learning_rate": 0.001962414699422389, + "loss": 0.1211, + "step": 9771 + }, + { + "epoch": 0.08482565255509934, + "grad_norm": 0.341796875, + "learning_rate": 0.001962406187779469, + "loss": 0.1162, + "step": 9772 + }, + { + "epoch": 0.08483433303530351, + "grad_norm": 0.2060546875, + "learning_rate": 0.001962397675193435, + "loss": 0.124, + "step": 9773 + }, + { + "epoch": 0.08484301351550767, + "grad_norm": 0.484375, + "learning_rate": 0.001962389161664295, + "loss": 0.1191, + "step": 9774 + }, + { + "epoch": 0.08485169399571184, + "grad_norm": 0.59375, + "learning_rate": 0.0019623806471920596, + "loss": 0.1328, + "step": 9775 + }, + { + "epoch": 0.084860374475916, + "grad_norm": 0.1962890625, + "learning_rate": 0.0019623721317767375, + "loss": 0.1045, + "step": 9776 + }, + { + "epoch": 0.08486905495612017, + "grad_norm": 0.26171875, + "learning_rate": 0.001962363615418338, + "loss": 0.1191, + "step": 9777 + }, + { + "epoch": 0.08487773543632433, + "grad_norm": 0.115234375, + "learning_rate": 0.0019623550981168712, + "loss": 0.1289, + "step": 9778 + }, + { + "epoch": 0.0848864159165285, + "grad_norm": 0.28125, + "learning_rate": 0.0019623465798723457, + "loss": 0.0962, + "step": 9779 + }, + { + "epoch": 0.08489509639673266, + "grad_norm": 0.255859375, + "learning_rate": 0.001962338060684771, + "loss": 0.1143, + "step": 9780 + }, + { + "epoch": 0.08490377687693683, + "grad_norm": 0.1171875, + "learning_rate": 0.001962329540554156, + "loss": 0.127, + "step": 9781 + }, + { + "epoch": 0.084912457357141, + "grad_norm": 0.2470703125, + "learning_rate": 0.0019623210194805105, + "loss": 0.1201, + "step": 9782 + }, + { + "epoch": 0.08492113783734516, + "grad_norm": 0.5546875, + "learning_rate": 0.001962312497463844, + "loss": 0.1621, + "step": 9783 + }, + { + "epoch": 0.08492981831754932, + "grad_norm": 0.1630859375, + "learning_rate": 0.001962303974504165, + "loss": 0.1367, + "step": 9784 + }, + { + "epoch": 0.08493849879775349, + "grad_norm": 0.453125, + "learning_rate": 0.001962295450601484, + "loss": 0.123, + "step": 9785 + }, + { + "epoch": 0.08494717927795765, + "grad_norm": 0.73828125, + "learning_rate": 0.001962286925755809, + "loss": 0.1416, + "step": 9786 + }, + { + "epoch": 0.08495585975816182, + "grad_norm": 0.3828125, + "learning_rate": 
0.00196227839996715, + "loss": 0.1484, + "step": 9787 + }, + { + "epoch": 0.08496454023836598, + "grad_norm": 0.1865234375, + "learning_rate": 0.0019622698732355164, + "loss": 0.1104, + "step": 9788 + }, + { + "epoch": 0.08497322071857015, + "grad_norm": 0.15625, + "learning_rate": 0.001962261345560918, + "loss": 0.1729, + "step": 9789 + }, + { + "epoch": 0.08498190119877432, + "grad_norm": 0.384765625, + "learning_rate": 0.0019622528169433626, + "loss": 0.1738, + "step": 9790 + }, + { + "epoch": 0.08499058167897848, + "grad_norm": 0.224609375, + "learning_rate": 0.001962244287382861, + "loss": 0.1201, + "step": 9791 + }, + { + "epoch": 0.08499926215918265, + "grad_norm": 0.328125, + "learning_rate": 0.001962235756879422, + "loss": 0.1187, + "step": 9792 + }, + { + "epoch": 0.08500794263938681, + "grad_norm": 0.1005859375, + "learning_rate": 0.001962227225433055, + "loss": 0.1157, + "step": 9793 + }, + { + "epoch": 0.08501662311959098, + "grad_norm": 0.33984375, + "learning_rate": 0.001962218693043769, + "loss": 0.1562, + "step": 9794 + }, + { + "epoch": 0.08502530359979514, + "grad_norm": 0.357421875, + "learning_rate": 0.0019622101597115736, + "loss": 0.0928, + "step": 9795 + }, + { + "epoch": 0.0850339840799993, + "grad_norm": 1.0234375, + "learning_rate": 0.0019622016254364784, + "loss": 0.1543, + "step": 9796 + }, + { + "epoch": 0.08504266456020347, + "grad_norm": 0.123046875, + "learning_rate": 0.0019621930902184925, + "loss": 0.127, + "step": 9797 + }, + { + "epoch": 0.08505134504040764, + "grad_norm": 0.396484375, + "learning_rate": 0.001962184554057625, + "loss": 0.1543, + "step": 9798 + }, + { + "epoch": 0.0850600255206118, + "grad_norm": 0.484375, + "learning_rate": 0.0019621760169538853, + "loss": 0.1426, + "step": 9799 + }, + { + "epoch": 0.08506870600081597, + "grad_norm": 0.1240234375, + "learning_rate": 0.001962167478907283, + "loss": 0.1221, + "step": 9800 + }, + { + "epoch": 0.08507738648102013, + "grad_norm": 0.314453125, + "learning_rate": 0.001962158939917827, + "loss": 0.1021, + "step": 9801 + }, + { + "epoch": 0.0850860669612243, + "grad_norm": 0.419921875, + "learning_rate": 0.0019621503999855275, + "loss": 0.1514, + "step": 9802 + }, + { + "epoch": 0.08509474744142846, + "grad_norm": 0.396484375, + "learning_rate": 0.001962141859110393, + "loss": 0.1182, + "step": 9803 + }, + { + "epoch": 0.08510342792163263, + "grad_norm": 0.08740234375, + "learning_rate": 0.0019621333172924332, + "loss": 0.0933, + "step": 9804 + }, + { + "epoch": 0.08511210840183679, + "grad_norm": 0.69140625, + "learning_rate": 0.001962124774531657, + "loss": 0.127, + "step": 9805 + }, + { + "epoch": 0.08512078888204096, + "grad_norm": 0.419921875, + "learning_rate": 0.001962116230828075, + "loss": 0.1187, + "step": 9806 + }, + { + "epoch": 0.08512946936224512, + "grad_norm": 0.263671875, + "learning_rate": 0.0019621076861816946, + "loss": 0.2168, + "step": 9807 + }, + { + "epoch": 0.08513814984244929, + "grad_norm": 0.2177734375, + "learning_rate": 0.001962099140592527, + "loss": 0.1338, + "step": 9808 + }, + { + "epoch": 0.08514683032265345, + "grad_norm": 0.50390625, + "learning_rate": 0.00196209059406058, + "loss": 0.125, + "step": 9809 + }, + { + "epoch": 0.08515551080285762, + "grad_norm": 0.494140625, + "learning_rate": 0.001962082046585864, + "loss": 0.1543, + "step": 9810 + }, + { + "epoch": 0.08516419128306178, + "grad_norm": 0.28515625, + "learning_rate": 0.001962073498168388, + "loss": 0.1299, + "step": 9811 + }, + { + "epoch": 0.08517287176326595, + "grad_norm": 0.26953125, + 
"learning_rate": 0.0019620649488081613, + "loss": 0.1455, + "step": 9812 + }, + { + "epoch": 0.08518155224347011, + "grad_norm": 0.1513671875, + "learning_rate": 0.001962056398505194, + "loss": 0.1201, + "step": 9813 + }, + { + "epoch": 0.08519023272367428, + "grad_norm": 0.6640625, + "learning_rate": 0.0019620478472594936, + "loss": 0.1602, + "step": 9814 + }, + { + "epoch": 0.08519891320387844, + "grad_norm": 0.91015625, + "learning_rate": 0.0019620392950710714, + "loss": 0.1836, + "step": 9815 + }, + { + "epoch": 0.08520759368408261, + "grad_norm": 0.32421875, + "learning_rate": 0.0019620307419399357, + "loss": 0.1079, + "step": 9816 + }, + { + "epoch": 0.08521627416428677, + "grad_norm": 0.11767578125, + "learning_rate": 0.0019620221878660963, + "loss": 0.1299, + "step": 9817 + }, + { + "epoch": 0.08522495464449094, + "grad_norm": 0.255859375, + "learning_rate": 0.0019620136328495625, + "loss": 0.1025, + "step": 9818 + }, + { + "epoch": 0.0852336351246951, + "grad_norm": 0.138671875, + "learning_rate": 0.0019620050768903437, + "loss": 0.1475, + "step": 9819 + }, + { + "epoch": 0.08524231560489927, + "grad_norm": 0.494140625, + "learning_rate": 0.0019619965199884483, + "loss": 0.0942, + "step": 9820 + }, + { + "epoch": 0.08525099608510343, + "grad_norm": 0.408203125, + "learning_rate": 0.0019619879621438872, + "loss": 0.1406, + "step": 9821 + }, + { + "epoch": 0.0852596765653076, + "grad_norm": 0.2275390625, + "learning_rate": 0.001961979403356669, + "loss": 0.106, + "step": 9822 + }, + { + "epoch": 0.08526835704551176, + "grad_norm": 0.435546875, + "learning_rate": 0.0019619708436268025, + "loss": 0.1855, + "step": 9823 + }, + { + "epoch": 0.08527703752571593, + "grad_norm": 0.1435546875, + "learning_rate": 0.001961962282954298, + "loss": 0.1191, + "step": 9824 + }, + { + "epoch": 0.0852857180059201, + "grad_norm": 0.10498046875, + "learning_rate": 0.0019619537213391644, + "loss": 0.124, + "step": 9825 + }, + { + "epoch": 0.08529439848612425, + "grad_norm": 0.470703125, + "learning_rate": 0.0019619451587814113, + "loss": 0.1309, + "step": 9826 + }, + { + "epoch": 0.08530307896632841, + "grad_norm": 0.490234375, + "learning_rate": 0.0019619365952810476, + "loss": 0.1143, + "step": 9827 + }, + { + "epoch": 0.08531175944653258, + "grad_norm": 0.201171875, + "learning_rate": 0.0019619280308380838, + "loss": 0.0967, + "step": 9828 + }, + { + "epoch": 0.08532043992673674, + "grad_norm": 0.318359375, + "learning_rate": 0.0019619194654525276, + "loss": 0.1562, + "step": 9829 + }, + { + "epoch": 0.0853291204069409, + "grad_norm": 0.87109375, + "learning_rate": 0.00196191089912439, + "loss": 0.1211, + "step": 9830 + }, + { + "epoch": 0.08533780088714507, + "grad_norm": 0.2890625, + "learning_rate": 0.001961902331853679, + "loss": 0.0898, + "step": 9831 + }, + { + "epoch": 0.08534648136734924, + "grad_norm": 0.41796875, + "learning_rate": 0.0019618937636404045, + "loss": 0.1113, + "step": 9832 + }, + { + "epoch": 0.0853551618475534, + "grad_norm": 0.2255859375, + "learning_rate": 0.001961885194484576, + "loss": 0.1035, + "step": 9833 + }, + { + "epoch": 0.08536384232775757, + "grad_norm": 0.123046875, + "learning_rate": 0.001961876624386203, + "loss": 0.106, + "step": 9834 + }, + { + "epoch": 0.08537252280796173, + "grad_norm": 0.11767578125, + "learning_rate": 0.001961868053345294, + "loss": 0.1162, + "step": 9835 + }, + { + "epoch": 0.0853812032881659, + "grad_norm": 0.1298828125, + "learning_rate": 0.00196185948136186, + "loss": 0.1118, + "step": 9836 + }, + { + "epoch": 0.08538988376837006, + 
"grad_norm": 0.2021484375, + "learning_rate": 0.001961850908435909, + "loss": 0.1387, + "step": 9837 + }, + { + "epoch": 0.08539856424857423, + "grad_norm": 0.36328125, + "learning_rate": 0.0019618423345674507, + "loss": 0.2539, + "step": 9838 + }, + { + "epoch": 0.08540724472877839, + "grad_norm": 0.27734375, + "learning_rate": 0.0019618337597564948, + "loss": 0.1172, + "step": 9839 + }, + { + "epoch": 0.08541592520898256, + "grad_norm": 1.296875, + "learning_rate": 0.0019618251840030506, + "loss": 0.1387, + "step": 9840 + }, + { + "epoch": 0.08542460568918672, + "grad_norm": 0.921875, + "learning_rate": 0.0019618166073071272, + "loss": 0.1309, + "step": 9841 + }, + { + "epoch": 0.08543328616939089, + "grad_norm": 0.37109375, + "learning_rate": 0.001961808029668734, + "loss": 0.1201, + "step": 9842 + }, + { + "epoch": 0.08544196664959505, + "grad_norm": 0.271484375, + "learning_rate": 0.0019617994510878803, + "loss": 0.1348, + "step": 9843 + }, + { + "epoch": 0.08545064712979922, + "grad_norm": 1.0078125, + "learning_rate": 0.001961790871564576, + "loss": 0.1055, + "step": 9844 + }, + { + "epoch": 0.08545932761000338, + "grad_norm": 0.05908203125, + "learning_rate": 0.00196178229109883, + "loss": 0.0942, + "step": 9845 + }, + { + "epoch": 0.08546800809020755, + "grad_norm": 0.11083984375, + "learning_rate": 0.0019617737096906517, + "loss": 0.0938, + "step": 9846 + }, + { + "epoch": 0.08547668857041171, + "grad_norm": 0.2216796875, + "learning_rate": 0.0019617651273400507, + "loss": 0.1201, + "step": 9847 + }, + { + "epoch": 0.08548536905061588, + "grad_norm": 0.6875, + "learning_rate": 0.0019617565440470364, + "loss": 0.1523, + "step": 9848 + }, + { + "epoch": 0.08549404953082004, + "grad_norm": 0.26953125, + "learning_rate": 0.001961747959811618, + "loss": 0.1201, + "step": 9849 + }, + { + "epoch": 0.08550273001102421, + "grad_norm": 0.2353515625, + "learning_rate": 0.0019617393746338054, + "loss": 0.1152, + "step": 9850 + }, + { + "epoch": 0.08551141049122837, + "grad_norm": 0.765625, + "learning_rate": 0.001961730788513607, + "loss": 0.1348, + "step": 9851 + }, + { + "epoch": 0.08552009097143254, + "grad_norm": 0.1396484375, + "learning_rate": 0.001961722201451033, + "loss": 0.1504, + "step": 9852 + }, + { + "epoch": 0.0855287714516367, + "grad_norm": 0.86328125, + "learning_rate": 0.0019617136134460924, + "loss": 0.1069, + "step": 9853 + }, + { + "epoch": 0.08553745193184087, + "grad_norm": 0.314453125, + "learning_rate": 0.0019617050244987947, + "loss": 0.0874, + "step": 9854 + }, + { + "epoch": 0.08554613241204503, + "grad_norm": 1.3515625, + "learning_rate": 0.0019616964346091496, + "loss": 0.1738, + "step": 9855 + }, + { + "epoch": 0.0855548128922492, + "grad_norm": 0.26171875, + "learning_rate": 0.0019616878437771663, + "loss": 0.1035, + "step": 9856 + }, + { + "epoch": 0.08556349337245336, + "grad_norm": 0.1728515625, + "learning_rate": 0.0019616792520028535, + "loss": 0.1318, + "step": 9857 + }, + { + "epoch": 0.08557217385265753, + "grad_norm": 0.11181640625, + "learning_rate": 0.001961670659286222, + "loss": 0.1138, + "step": 9858 + }, + { + "epoch": 0.0855808543328617, + "grad_norm": 0.2275390625, + "learning_rate": 0.00196166206562728, + "loss": 0.2227, + "step": 9859 + }, + { + "epoch": 0.08558953481306586, + "grad_norm": 0.416015625, + "learning_rate": 0.001961653471026037, + "loss": 0.1113, + "step": 9860 + }, + { + "epoch": 0.08559821529327002, + "grad_norm": 0.1435546875, + "learning_rate": 0.0019616448754825036, + "loss": 0.126, + "step": 9861 + }, + { + "epoch": 
0.08560689577347419, + "grad_norm": 0.94921875, + "learning_rate": 0.0019616362789966874, + "loss": 0.1104, + "step": 9862 + }, + { + "epoch": 0.08561557625367835, + "grad_norm": 0.1943359375, + "learning_rate": 0.0019616276815685994, + "loss": 0.1338, + "step": 9863 + }, + { + "epoch": 0.08562425673388252, + "grad_norm": 0.431640625, + "learning_rate": 0.0019616190831982477, + "loss": 0.1138, + "step": 9864 + }, + { + "epoch": 0.08563293721408669, + "grad_norm": 0.44140625, + "learning_rate": 0.001961610483885643, + "loss": 0.1196, + "step": 9865 + }, + { + "epoch": 0.08564161769429085, + "grad_norm": 0.447265625, + "learning_rate": 0.0019616018836307934, + "loss": 0.1074, + "step": 9866 + }, + { + "epoch": 0.08565029817449502, + "grad_norm": 0.125, + "learning_rate": 0.0019615932824337094, + "loss": 0.1069, + "step": 9867 + }, + { + "epoch": 0.08565897865469918, + "grad_norm": 0.326171875, + "learning_rate": 0.0019615846802943995, + "loss": 0.1055, + "step": 9868 + }, + { + "epoch": 0.08566765913490335, + "grad_norm": 0.5390625, + "learning_rate": 0.0019615760772128737, + "loss": 0.1035, + "step": 9869 + }, + { + "epoch": 0.08567633961510751, + "grad_norm": 0.08251953125, + "learning_rate": 0.0019615674731891415, + "loss": 0.1719, + "step": 9870 + }, + { + "epoch": 0.08568502009531168, + "grad_norm": 0.1474609375, + "learning_rate": 0.0019615588682232116, + "loss": 0.1299, + "step": 9871 + }, + { + "epoch": 0.08569370057551584, + "grad_norm": 0.90234375, + "learning_rate": 0.0019615502623150944, + "loss": 0.1562, + "step": 9872 + }, + { + "epoch": 0.08570238105572, + "grad_norm": 0.298828125, + "learning_rate": 0.001961541655464798, + "loss": 0.1201, + "step": 9873 + }, + { + "epoch": 0.08571106153592417, + "grad_norm": 0.8125, + "learning_rate": 0.0019615330476723336, + "loss": 0.1182, + "step": 9874 + }, + { + "epoch": 0.08571974201612834, + "grad_norm": 0.1689453125, + "learning_rate": 0.0019615244389377087, + "loss": 0.1318, + "step": 9875 + }, + { + "epoch": 0.0857284224963325, + "grad_norm": 0.30078125, + "learning_rate": 0.0019615158292609343, + "loss": 0.0923, + "step": 9876 + }, + { + "epoch": 0.08573710297653667, + "grad_norm": 0.326171875, + "learning_rate": 0.0019615072186420185, + "loss": 0.1138, + "step": 9877 + }, + { + "epoch": 0.08574578345674083, + "grad_norm": 0.5390625, + "learning_rate": 0.0019614986070809718, + "loss": 0.165, + "step": 9878 + }, + { + "epoch": 0.085754463936945, + "grad_norm": 0.470703125, + "learning_rate": 0.001961489994577803, + "loss": 0.125, + "step": 9879 + }, + { + "epoch": 0.08576314441714916, + "grad_norm": 0.61328125, + "learning_rate": 0.001961481381132522, + "loss": 0.1367, + "step": 9880 + }, + { + "epoch": 0.08577182489735333, + "grad_norm": 0.099609375, + "learning_rate": 0.0019614727667451376, + "loss": 0.1211, + "step": 9881 + }, + { + "epoch": 0.08578050537755749, + "grad_norm": 0.408203125, + "learning_rate": 0.0019614641514156595, + "loss": 0.1279, + "step": 9882 + }, + { + "epoch": 0.08578918585776166, + "grad_norm": 0.28515625, + "learning_rate": 0.0019614555351440974, + "loss": 0.1074, + "step": 9883 + }, + { + "epoch": 0.08579786633796582, + "grad_norm": 0.30078125, + "learning_rate": 0.0019614469179304603, + "loss": 0.1328, + "step": 9884 + }, + { + "epoch": 0.08580654681816999, + "grad_norm": 0.75390625, + "learning_rate": 0.0019614382997747573, + "loss": 0.1118, + "step": 9885 + }, + { + "epoch": 0.08581522729837415, + "grad_norm": 0.65625, + "learning_rate": 0.001961429680676999, + "loss": 0.1187, + "step": 9886 + }, + { 
+ "epoch": 0.08582390777857832, + "grad_norm": 0.6484375, + "learning_rate": 0.001961421060637194, + "loss": 0.1182, + "step": 9887 + }, + { + "epoch": 0.08583258825878247, + "grad_norm": 0.337890625, + "learning_rate": 0.0019614124396553517, + "loss": 0.1177, + "step": 9888 + }, + { + "epoch": 0.08584126873898663, + "grad_norm": 0.56640625, + "learning_rate": 0.001961403817731482, + "loss": 0.126, + "step": 9889 + }, + { + "epoch": 0.0858499492191908, + "grad_norm": 0.365234375, + "learning_rate": 0.0019613951948655936, + "loss": 0.1602, + "step": 9890 + }, + { + "epoch": 0.08585862969939496, + "grad_norm": 1.109375, + "learning_rate": 0.0019613865710576966, + "loss": 0.1484, + "step": 9891 + }, + { + "epoch": 0.08586731017959913, + "grad_norm": 0.287109375, + "learning_rate": 0.0019613779463078006, + "loss": 0.124, + "step": 9892 + }, + { + "epoch": 0.0858759906598033, + "grad_norm": 0.1015625, + "learning_rate": 0.001961369320615914, + "loss": 0.1172, + "step": 9893 + }, + { + "epoch": 0.08588467114000746, + "grad_norm": 0.1455078125, + "learning_rate": 0.0019613606939820473, + "loss": 0.1162, + "step": 9894 + }, + { + "epoch": 0.08589335162021162, + "grad_norm": 0.279296875, + "learning_rate": 0.001961352066406209, + "loss": 0.123, + "step": 9895 + }, + { + "epoch": 0.08590203210041579, + "grad_norm": 0.78125, + "learning_rate": 0.0019613434378884095, + "loss": 0.1074, + "step": 9896 + }, + { + "epoch": 0.08591071258061996, + "grad_norm": 1.2265625, + "learning_rate": 0.0019613348084286573, + "loss": 0.1699, + "step": 9897 + }, + { + "epoch": 0.08591939306082412, + "grad_norm": 0.40234375, + "learning_rate": 0.0019613261780269627, + "loss": 0.1152, + "step": 9898 + }, + { + "epoch": 0.08592807354102829, + "grad_norm": 0.1083984375, + "learning_rate": 0.001961317546683334, + "loss": 0.1152, + "step": 9899 + }, + { + "epoch": 0.08593675402123245, + "grad_norm": 0.5390625, + "learning_rate": 0.0019613089143977825, + "loss": 0.166, + "step": 9900 + }, + { + "epoch": 0.08594543450143662, + "grad_norm": 0.36328125, + "learning_rate": 0.001961300281170316, + "loss": 0.125, + "step": 9901 + }, + { + "epoch": 0.08595411498164078, + "grad_norm": 0.08642578125, + "learning_rate": 0.001961291647000944, + "loss": 0.1318, + "step": 9902 + }, + { + "epoch": 0.08596279546184495, + "grad_norm": 0.158203125, + "learning_rate": 0.001961283011889677, + "loss": 0.1055, + "step": 9903 + }, + { + "epoch": 0.08597147594204911, + "grad_norm": 0.115234375, + "learning_rate": 0.0019612743758365236, + "loss": 0.126, + "step": 9904 + }, + { + "epoch": 0.08598015642225328, + "grad_norm": 0.263671875, + "learning_rate": 0.0019612657388414934, + "loss": 0.1348, + "step": 9905 + }, + { + "epoch": 0.08598883690245744, + "grad_norm": 0.48046875, + "learning_rate": 0.0019612571009045964, + "loss": 0.1406, + "step": 9906 + }, + { + "epoch": 0.0859975173826616, + "grad_norm": 0.07666015625, + "learning_rate": 0.001961248462025841, + "loss": 0.083, + "step": 9907 + }, + { + "epoch": 0.08600619786286577, + "grad_norm": 0.08203125, + "learning_rate": 0.0019612398222052375, + "loss": 0.1396, + "step": 9908 + }, + { + "epoch": 0.08601487834306994, + "grad_norm": 0.71484375, + "learning_rate": 0.001961231181442795, + "loss": 0.1406, + "step": 9909 + }, + { + "epoch": 0.0860235588232741, + "grad_norm": 0.451171875, + "learning_rate": 0.001961222539738523, + "loss": 0.1836, + "step": 9910 + }, + { + "epoch": 0.08603223930347827, + "grad_norm": 0.34765625, + "learning_rate": 0.001961213897092431, + "loss": 0.1123, + "step": 9911 + 
}, + { + "epoch": 0.08604091978368243, + "grad_norm": 0.4765625, + "learning_rate": 0.0019612052535045283, + "loss": 0.1055, + "step": 9912 + }, + { + "epoch": 0.0860496002638866, + "grad_norm": 0.2412109375, + "learning_rate": 0.0019611966089748247, + "loss": 0.1318, + "step": 9913 + }, + { + "epoch": 0.08605828074409076, + "grad_norm": 0.333984375, + "learning_rate": 0.0019611879635033293, + "loss": 0.0981, + "step": 9914 + }, + { + "epoch": 0.08606696122429493, + "grad_norm": 0.291015625, + "learning_rate": 0.0019611793170900517, + "loss": 0.0918, + "step": 9915 + }, + { + "epoch": 0.08607564170449909, + "grad_norm": 0.49609375, + "learning_rate": 0.0019611706697350013, + "loss": 0.0928, + "step": 9916 + }, + { + "epoch": 0.08608432218470326, + "grad_norm": 1.1328125, + "learning_rate": 0.001961162021438188, + "loss": 0.1738, + "step": 9917 + }, + { + "epoch": 0.08609300266490742, + "grad_norm": 0.361328125, + "learning_rate": 0.00196115337219962, + "loss": 0.0938, + "step": 9918 + }, + { + "epoch": 0.08610168314511159, + "grad_norm": 0.1357421875, + "learning_rate": 0.0019611447220193082, + "loss": 0.1318, + "step": 9919 + }, + { + "epoch": 0.08611036362531575, + "grad_norm": 0.12451171875, + "learning_rate": 0.001961136070897261, + "loss": 0.1191, + "step": 9920 + }, + { + "epoch": 0.08611904410551992, + "grad_norm": 0.240234375, + "learning_rate": 0.001961127418833489, + "loss": 0.1309, + "step": 9921 + }, + { + "epoch": 0.08612772458572408, + "grad_norm": 0.330078125, + "learning_rate": 0.001961118765828001, + "loss": 0.1367, + "step": 9922 + }, + { + "epoch": 0.08613640506592825, + "grad_norm": 1.625, + "learning_rate": 0.001961110111880806, + "loss": 0.1357, + "step": 9923 + }, + { + "epoch": 0.08614508554613241, + "grad_norm": 0.2109375, + "learning_rate": 0.001961101456991914, + "loss": 0.1201, + "step": 9924 + }, + { + "epoch": 0.08615376602633658, + "grad_norm": 0.11181640625, + "learning_rate": 0.0019610928011613345, + "loss": 0.1387, + "step": 9925 + }, + { + "epoch": 0.08616244650654074, + "grad_norm": 0.294921875, + "learning_rate": 0.0019610841443890767, + "loss": 0.1328, + "step": 9926 + }, + { + "epoch": 0.08617112698674491, + "grad_norm": 0.3828125, + "learning_rate": 0.0019610754866751503, + "loss": 0.1348, + "step": 9927 + }, + { + "epoch": 0.08617980746694907, + "grad_norm": 0.3515625, + "learning_rate": 0.001961066828019565, + "loss": 0.0918, + "step": 9928 + }, + { + "epoch": 0.08618848794715324, + "grad_norm": 0.244140625, + "learning_rate": 0.00196105816842233, + "loss": 0.1245, + "step": 9929 + }, + { + "epoch": 0.0861971684273574, + "grad_norm": 1.3515625, + "learning_rate": 0.001961049507883454, + "loss": 0.1328, + "step": 9930 + }, + { + "epoch": 0.08620584890756157, + "grad_norm": 0.2578125, + "learning_rate": 0.0019610408464029476, + "loss": 0.1553, + "step": 9931 + }, + { + "epoch": 0.08621452938776573, + "grad_norm": 0.224609375, + "learning_rate": 0.0019610321839808194, + "loss": 0.1357, + "step": 9932 + }, + { + "epoch": 0.0862232098679699, + "grad_norm": 0.298828125, + "learning_rate": 0.00196102352061708, + "loss": 0.1035, + "step": 9933 + }, + { + "epoch": 0.08623189034817406, + "grad_norm": 0.2373046875, + "learning_rate": 0.001961014856311738, + "loss": 0.1318, + "step": 9934 + }, + { + "epoch": 0.08624057082837823, + "grad_norm": 0.06396484375, + "learning_rate": 0.0019610061910648032, + "loss": 0.0986, + "step": 9935 + }, + { + "epoch": 0.0862492513085824, + "grad_norm": 0.392578125, + "learning_rate": 0.0019609975248762847, + "loss": 0.1455, + 
"step": 9936 + }, + { + "epoch": 0.08625793178878656, + "grad_norm": 0.08837890625, + "learning_rate": 0.0019609888577461925, + "loss": 0.1113, + "step": 9937 + }, + { + "epoch": 0.08626661226899072, + "grad_norm": 0.08203125, + "learning_rate": 0.001960980189674536, + "loss": 0.0894, + "step": 9938 + }, + { + "epoch": 0.08627529274919489, + "grad_norm": 0.12890625, + "learning_rate": 0.001960971520661324, + "loss": 0.125, + "step": 9939 + }, + { + "epoch": 0.08628397322939906, + "grad_norm": 0.490234375, + "learning_rate": 0.001960962850706567, + "loss": 0.1436, + "step": 9940 + }, + { + "epoch": 0.08629265370960322, + "grad_norm": 0.33984375, + "learning_rate": 0.0019609541798102734, + "loss": 0.1543, + "step": 9941 + }, + { + "epoch": 0.08630133418980739, + "grad_norm": 0.2578125, + "learning_rate": 0.0019609455079724536, + "loss": 0.1177, + "step": 9942 + }, + { + "epoch": 0.08631001467001155, + "grad_norm": 0.3984375, + "learning_rate": 0.0019609368351931164, + "loss": 0.1104, + "step": 9943 + }, + { + "epoch": 0.08631869515021572, + "grad_norm": 0.158203125, + "learning_rate": 0.001960928161472272, + "loss": 0.1201, + "step": 9944 + }, + { + "epoch": 0.08632737563041988, + "grad_norm": 0.37890625, + "learning_rate": 0.0019609194868099294, + "loss": 0.1416, + "step": 9945 + }, + { + "epoch": 0.08633605611062405, + "grad_norm": 0.314453125, + "learning_rate": 0.001960910811206098, + "loss": 0.126, + "step": 9946 + }, + { + "epoch": 0.08634473659082821, + "grad_norm": 0.298828125, + "learning_rate": 0.0019609021346607878, + "loss": 0.1147, + "step": 9947 + }, + { + "epoch": 0.08635341707103238, + "grad_norm": 0.2158203125, + "learning_rate": 0.0019608934571740074, + "loss": 0.1377, + "step": 9948 + }, + { + "epoch": 0.08636209755123654, + "grad_norm": 0.328125, + "learning_rate": 0.0019608847787457675, + "loss": 0.1143, + "step": 9949 + }, + { + "epoch": 0.08637077803144069, + "grad_norm": 0.15625, + "learning_rate": 0.0019608760993760767, + "loss": 0.1016, + "step": 9950 + }, + { + "epoch": 0.08637945851164486, + "grad_norm": 0.8125, + "learning_rate": 0.001960867419064945, + "loss": 0.1123, + "step": 9951 + }, + { + "epoch": 0.08638813899184902, + "grad_norm": 0.46484375, + "learning_rate": 0.001960858737812381, + "loss": 0.1162, + "step": 9952 + }, + { + "epoch": 0.08639681947205319, + "grad_norm": 0.5234375, + "learning_rate": 0.0019608500556183954, + "loss": 0.124, + "step": 9953 + }, + { + "epoch": 0.08640549995225735, + "grad_norm": 0.94140625, + "learning_rate": 0.0019608413724829963, + "loss": 0.0986, + "step": 9954 + }, + { + "epoch": 0.08641418043246152, + "grad_norm": 0.1630859375, + "learning_rate": 0.001960832688406195, + "loss": 0.0957, + "step": 9955 + }, + { + "epoch": 0.08642286091266568, + "grad_norm": 0.2197265625, + "learning_rate": 0.0019608240033879994, + "loss": 0.124, + "step": 9956 + }, + { + "epoch": 0.08643154139286985, + "grad_norm": 0.4765625, + "learning_rate": 0.00196081531742842, + "loss": 0.1289, + "step": 9957 + }, + { + "epoch": 0.08644022187307401, + "grad_norm": 0.208984375, + "learning_rate": 0.001960806630527465, + "loss": 0.1191, + "step": 9958 + }, + { + "epoch": 0.08644890235327818, + "grad_norm": 0.267578125, + "learning_rate": 0.0019607979426851455, + "loss": 0.127, + "step": 9959 + }, + { + "epoch": 0.08645758283348234, + "grad_norm": 0.478515625, + "learning_rate": 0.0019607892539014707, + "loss": 0.1152, + "step": 9960 + }, + { + "epoch": 0.08646626331368651, + "grad_norm": 0.392578125, + "learning_rate": 0.0019607805641764496, + "loss": 
0.1064, + "step": 9961 + }, + { + "epoch": 0.08647494379389067, + "grad_norm": 0.2001953125, + "learning_rate": 0.001960771873510091, + "loss": 0.0996, + "step": 9962 + }, + { + "epoch": 0.08648362427409484, + "grad_norm": 0.50390625, + "learning_rate": 0.001960763181902406, + "loss": 0.125, + "step": 9963 + }, + { + "epoch": 0.086492304754299, + "grad_norm": 1.0234375, + "learning_rate": 0.0019607544893534027, + "loss": 0.1934, + "step": 9964 + }, + { + "epoch": 0.08650098523450317, + "grad_norm": 0.578125, + "learning_rate": 0.001960745795863092, + "loss": 0.1094, + "step": 9965 + }, + { + "epoch": 0.08650966571470733, + "grad_norm": 1.234375, + "learning_rate": 0.001960737101431482, + "loss": 0.1602, + "step": 9966 + }, + { + "epoch": 0.0865183461949115, + "grad_norm": 0.51953125, + "learning_rate": 0.0019607284060585827, + "loss": 0.085, + "step": 9967 + }, + { + "epoch": 0.08652702667511566, + "grad_norm": 0.67578125, + "learning_rate": 0.001960719709744404, + "loss": 0.1621, + "step": 9968 + }, + { + "epoch": 0.08653570715531983, + "grad_norm": 0.1455078125, + "learning_rate": 0.0019607110124889556, + "loss": 0.1133, + "step": 9969 + }, + { + "epoch": 0.086544387635524, + "grad_norm": 0.28515625, + "learning_rate": 0.0019607023142922462, + "loss": 0.1084, + "step": 9970 + }, + { + "epoch": 0.08655306811572816, + "grad_norm": 0.85546875, + "learning_rate": 0.0019606936151542855, + "loss": 0.1035, + "step": 9971 + }, + { + "epoch": 0.08656174859593233, + "grad_norm": 0.6015625, + "learning_rate": 0.0019606849150750833, + "loss": 0.1445, + "step": 9972 + }, + { + "epoch": 0.08657042907613649, + "grad_norm": 0.45703125, + "learning_rate": 0.001960676214054649, + "loss": 0.1289, + "step": 9973 + }, + { + "epoch": 0.08657910955634066, + "grad_norm": 0.2255859375, + "learning_rate": 0.0019606675120929923, + "loss": 0.127, + "step": 9974 + }, + { + "epoch": 0.08658779003654482, + "grad_norm": 0.78515625, + "learning_rate": 0.0019606588091901226, + "loss": 0.1504, + "step": 9975 + }, + { + "epoch": 0.08659647051674899, + "grad_norm": 0.2578125, + "learning_rate": 0.001960650105346049, + "loss": 0.1846, + "step": 9976 + }, + { + "epoch": 0.08660515099695315, + "grad_norm": 0.396484375, + "learning_rate": 0.001960641400560782, + "loss": 0.0884, + "step": 9977 + }, + { + "epoch": 0.08661383147715732, + "grad_norm": 0.357421875, + "learning_rate": 0.0019606326948343297, + "loss": 0.0986, + "step": 9978 + }, + { + "epoch": 0.08662251195736148, + "grad_norm": 0.5703125, + "learning_rate": 0.001960623988166703, + "loss": 0.1279, + "step": 9979 + }, + { + "epoch": 0.08663119243756565, + "grad_norm": 0.08544921875, + "learning_rate": 0.0019606152805579104, + "loss": 0.126, + "step": 9980 + }, + { + "epoch": 0.08663987291776981, + "grad_norm": 0.30078125, + "learning_rate": 0.001960606572007962, + "loss": 0.1357, + "step": 9981 + }, + { + "epoch": 0.08664855339797398, + "grad_norm": 0.234375, + "learning_rate": 0.001960597862516867, + "loss": 0.125, + "step": 9982 + }, + { + "epoch": 0.08665723387817814, + "grad_norm": 0.74609375, + "learning_rate": 0.0019605891520846357, + "loss": 0.1309, + "step": 9983 + }, + { + "epoch": 0.0866659143583823, + "grad_norm": 0.326171875, + "learning_rate": 0.0019605804407112765, + "loss": 0.1201, + "step": 9984 + }, + { + "epoch": 0.08667459483858647, + "grad_norm": 0.2080078125, + "learning_rate": 0.0019605717283968003, + "loss": 0.0752, + "step": 9985 + }, + { + "epoch": 0.08668327531879064, + "grad_norm": 0.2275390625, + "learning_rate": 0.0019605630151412145, + 
"loss": 0.1309, + "step": 9986 + }, + { + "epoch": 0.0866919557989948, + "grad_norm": 0.09375, + "learning_rate": 0.001960554300944531, + "loss": 0.1328, + "step": 9987 + }, + { + "epoch": 0.08670063627919897, + "grad_norm": 0.1484375, + "learning_rate": 0.0019605455858067574, + "loss": 0.1318, + "step": 9988 + }, + { + "epoch": 0.08670931675940313, + "grad_norm": 0.21484375, + "learning_rate": 0.001960536869727905, + "loss": 0.1074, + "step": 9989 + }, + { + "epoch": 0.0867179972396073, + "grad_norm": 0.61328125, + "learning_rate": 0.0019605281527079813, + "loss": 0.123, + "step": 9990 + }, + { + "epoch": 0.08672667771981146, + "grad_norm": 0.63671875, + "learning_rate": 0.0019605194347469975, + "loss": 0.1387, + "step": 9991 + }, + { + "epoch": 0.08673535820001563, + "grad_norm": 0.125, + "learning_rate": 0.0019605107158449627, + "loss": 0.1348, + "step": 9992 + }, + { + "epoch": 0.08674403868021979, + "grad_norm": 0.4765625, + "learning_rate": 0.0019605019960018863, + "loss": 0.1719, + "step": 9993 + }, + { + "epoch": 0.08675271916042396, + "grad_norm": 0.318359375, + "learning_rate": 0.001960493275217778, + "loss": 0.123, + "step": 9994 + }, + { + "epoch": 0.08676139964062812, + "grad_norm": 0.08447265625, + "learning_rate": 0.001960484553492647, + "loss": 0.1328, + "step": 9995 + }, + { + "epoch": 0.08677008012083229, + "grad_norm": 0.07177734375, + "learning_rate": 0.001960475830826503, + "loss": 0.1157, + "step": 9996 + }, + { + "epoch": 0.08677876060103645, + "grad_norm": 0.1533203125, + "learning_rate": 0.0019604671072193554, + "loss": 0.1123, + "step": 9997 + }, + { + "epoch": 0.08678744108124062, + "grad_norm": 0.77734375, + "learning_rate": 0.0019604583826712137, + "loss": 0.1118, + "step": 9998 + }, + { + "epoch": 0.08679612156144478, + "grad_norm": 0.1015625, + "learning_rate": 0.0019604496571820882, + "loss": 0.1357, + "step": 9999 + }, + { + "epoch": 0.08680480204164895, + "grad_norm": 0.333984375, + "learning_rate": 0.0019604409307519876, + "loss": 0.1104, + "step": 10000 + }, + { + "epoch": 0.08681348252185311, + "grad_norm": 0.625, + "learning_rate": 0.0019604322033809217, + "loss": 0.103, + "step": 10001 + }, + { + "epoch": 0.08682216300205728, + "grad_norm": 0.2734375, + "learning_rate": 0.0019604234750689002, + "loss": 0.1182, + "step": 10002 + }, + { + "epoch": 0.08683084348226144, + "grad_norm": 0.37109375, + "learning_rate": 0.0019604147458159326, + "loss": 0.1797, + "step": 10003 + }, + { + "epoch": 0.08683952396246561, + "grad_norm": 0.205078125, + "learning_rate": 0.0019604060156220285, + "loss": 0.1191, + "step": 10004 + }, + { + "epoch": 0.08684820444266977, + "grad_norm": 0.48828125, + "learning_rate": 0.001960397284487197, + "loss": 0.1187, + "step": 10005 + }, + { + "epoch": 0.08685688492287394, + "grad_norm": 0.08544921875, + "learning_rate": 0.0019603885524114477, + "loss": 0.1084, + "step": 10006 + }, + { + "epoch": 0.0868655654030781, + "grad_norm": 0.25, + "learning_rate": 0.0019603798193947907, + "loss": 0.1177, + "step": 10007 + }, + { + "epoch": 0.08687424588328227, + "grad_norm": 0.1513671875, + "learning_rate": 0.0019603710854372352, + "loss": 0.1689, + "step": 10008 + }, + { + "epoch": 0.08688292636348643, + "grad_norm": 0.921875, + "learning_rate": 0.001960362350538791, + "loss": 0.1426, + "step": 10009 + }, + { + "epoch": 0.0868916068436906, + "grad_norm": 0.3046875, + "learning_rate": 0.0019603536146994673, + "loss": 0.1211, + "step": 10010 + }, + { + "epoch": 0.08690028732389475, + "grad_norm": 2.4375, + "learning_rate": 0.001960344877919274, 
+ "loss": 0.1406, + "step": 10011 + }, + { + "epoch": 0.08690896780409892, + "grad_norm": 0.23046875, + "learning_rate": 0.0019603361401982202, + "loss": 0.0923, + "step": 10012 + }, + { + "epoch": 0.08691764828430308, + "grad_norm": 0.1435546875, + "learning_rate": 0.001960327401536316, + "loss": 0.1309, + "step": 10013 + }, + { + "epoch": 0.08692632876450725, + "grad_norm": 0.1572265625, + "learning_rate": 0.0019603186619335704, + "loss": 0.1523, + "step": 10014 + }, + { + "epoch": 0.08693500924471141, + "grad_norm": 0.1259765625, + "learning_rate": 0.0019603099213899933, + "loss": 0.127, + "step": 10015 + }, + { + "epoch": 0.08694368972491558, + "grad_norm": 0.267578125, + "learning_rate": 0.001960301179905594, + "loss": 0.1235, + "step": 10016 + }, + { + "epoch": 0.08695237020511974, + "grad_norm": 0.52734375, + "learning_rate": 0.001960292437480383, + "loss": 0.1113, + "step": 10017 + }, + { + "epoch": 0.0869610506853239, + "grad_norm": 0.306640625, + "learning_rate": 0.0019602836941143686, + "loss": 0.123, + "step": 10018 + }, + { + "epoch": 0.08696973116552807, + "grad_norm": 0.66015625, + "learning_rate": 0.0019602749498075604, + "loss": 0.1963, + "step": 10019 + }, + { + "epoch": 0.08697841164573224, + "grad_norm": 0.17578125, + "learning_rate": 0.0019602662045599692, + "loss": 0.1074, + "step": 10020 + }, + { + "epoch": 0.0869870921259364, + "grad_norm": 0.392578125, + "learning_rate": 0.0019602574583716037, + "loss": 0.1465, + "step": 10021 + }, + { + "epoch": 0.08699577260614057, + "grad_norm": 0.265625, + "learning_rate": 0.0019602487112424733, + "loss": 0.1338, + "step": 10022 + }, + { + "epoch": 0.08700445308634473, + "grad_norm": 0.1923828125, + "learning_rate": 0.0019602399631725876, + "loss": 0.1123, + "step": 10023 + }, + { + "epoch": 0.0870131335665489, + "grad_norm": 0.349609375, + "learning_rate": 0.001960231214161957, + "loss": 0.1084, + "step": 10024 + }, + { + "epoch": 0.08702181404675306, + "grad_norm": 0.2578125, + "learning_rate": 0.0019602224642105903, + "loss": 0.1069, + "step": 10025 + }, + { + "epoch": 0.08703049452695723, + "grad_norm": 0.451171875, + "learning_rate": 0.001960213713318497, + "loss": 0.1279, + "step": 10026 + }, + { + "epoch": 0.08703917500716139, + "grad_norm": 0.3828125, + "learning_rate": 0.001960204961485687, + "loss": 0.1152, + "step": 10027 + }, + { + "epoch": 0.08704785548736556, + "grad_norm": 0.291015625, + "learning_rate": 0.00196019620871217, + "loss": 0.1455, + "step": 10028 + }, + { + "epoch": 0.08705653596756972, + "grad_norm": 0.29296875, + "learning_rate": 0.001960187454997955, + "loss": 0.1221, + "step": 10029 + }, + { + "epoch": 0.08706521644777389, + "grad_norm": 0.298828125, + "learning_rate": 0.0019601787003430526, + "loss": 0.1504, + "step": 10030 + }, + { + "epoch": 0.08707389692797805, + "grad_norm": 0.216796875, + "learning_rate": 0.001960169944747471, + "loss": 0.1143, + "step": 10031 + }, + { + "epoch": 0.08708257740818222, + "grad_norm": 0.1962890625, + "learning_rate": 0.0019601611882112207, + "loss": 0.1309, + "step": 10032 + }, + { + "epoch": 0.08709125788838638, + "grad_norm": 0.275390625, + "learning_rate": 0.0019601524307343112, + "loss": 0.0859, + "step": 10033 + }, + { + "epoch": 0.08709993836859055, + "grad_norm": 0.357421875, + "learning_rate": 0.0019601436723167514, + "loss": 0.165, + "step": 10034 + }, + { + "epoch": 0.08710861884879471, + "grad_norm": 0.1220703125, + "learning_rate": 0.0019601349129585517, + "loss": 0.083, + "step": 10035 + }, + { + "epoch": 0.08711729932899888, + "grad_norm": 
0.09521484375, + "learning_rate": 0.0019601261526597216, + "loss": 0.1035, + "step": 10036 + }, + { + "epoch": 0.08712597980920304, + "grad_norm": 0.314453125, + "learning_rate": 0.0019601173914202703, + "loss": 0.1328, + "step": 10037 + }, + { + "epoch": 0.08713466028940721, + "grad_norm": 0.30859375, + "learning_rate": 0.0019601086292402076, + "loss": 0.1182, + "step": 10038 + }, + { + "epoch": 0.08714334076961137, + "grad_norm": 0.55859375, + "learning_rate": 0.0019600998661195427, + "loss": 0.1167, + "step": 10039 + }, + { + "epoch": 0.08715202124981554, + "grad_norm": 0.1328125, + "learning_rate": 0.001960091102058286, + "loss": 0.1426, + "step": 10040 + }, + { + "epoch": 0.0871607017300197, + "grad_norm": 0.55859375, + "learning_rate": 0.0019600823370564463, + "loss": 0.126, + "step": 10041 + }, + { + "epoch": 0.08716938221022387, + "grad_norm": 0.5625, + "learning_rate": 0.0019600735711140338, + "loss": 0.0938, + "step": 10042 + }, + { + "epoch": 0.08717806269042803, + "grad_norm": 0.263671875, + "learning_rate": 0.0019600648042310577, + "loss": 0.1187, + "step": 10043 + }, + { + "epoch": 0.0871867431706322, + "grad_norm": 0.0888671875, + "learning_rate": 0.001960056036407527, + "loss": 0.1094, + "step": 10044 + }, + { + "epoch": 0.08719542365083637, + "grad_norm": 0.5, + "learning_rate": 0.0019600472676434525, + "loss": 0.1177, + "step": 10045 + }, + { + "epoch": 0.08720410413104053, + "grad_norm": 0.41015625, + "learning_rate": 0.0019600384979388437, + "loss": 0.1426, + "step": 10046 + }, + { + "epoch": 0.0872127846112447, + "grad_norm": 0.1328125, + "learning_rate": 0.001960029727293709, + "loss": 0.0869, + "step": 10047 + }, + { + "epoch": 0.08722146509144886, + "grad_norm": 0.236328125, + "learning_rate": 0.0019600209557080585, + "loss": 0.1387, + "step": 10048 + }, + { + "epoch": 0.08723014557165303, + "grad_norm": 0.51953125, + "learning_rate": 0.0019600121831819025, + "loss": 0.1377, + "step": 10049 + }, + { + "epoch": 0.08723882605185719, + "grad_norm": 0.2236328125, + "learning_rate": 0.00196000340971525, + "loss": 0.1055, + "step": 10050 + }, + { + "epoch": 0.08724750653206136, + "grad_norm": 0.12109375, + "learning_rate": 0.0019599946353081104, + "loss": 0.1475, + "step": 10051 + }, + { + "epoch": 0.08725618701226552, + "grad_norm": 0.32421875, + "learning_rate": 0.001959985859960494, + "loss": 0.1035, + "step": 10052 + }, + { + "epoch": 0.08726486749246969, + "grad_norm": 0.59765625, + "learning_rate": 0.0019599770836724096, + "loss": 0.1133, + "step": 10053 + }, + { + "epoch": 0.08727354797267385, + "grad_norm": 0.345703125, + "learning_rate": 0.001959968306443868, + "loss": 0.1348, + "step": 10054 + }, + { + "epoch": 0.08728222845287802, + "grad_norm": 0.09375, + "learning_rate": 0.0019599595282748772, + "loss": 0.1963, + "step": 10055 + }, + { + "epoch": 0.08729090893308218, + "grad_norm": 0.10107421875, + "learning_rate": 0.0019599507491654482, + "loss": 0.1289, + "step": 10056 + }, + { + "epoch": 0.08729958941328635, + "grad_norm": 0.57421875, + "learning_rate": 0.001959941969115589, + "loss": 0.1689, + "step": 10057 + }, + { + "epoch": 0.08730826989349051, + "grad_norm": 0.6171875, + "learning_rate": 0.001959933188125311, + "loss": 0.2754, + "step": 10058 + }, + { + "epoch": 0.08731695037369468, + "grad_norm": 0.10546875, + "learning_rate": 0.0019599244061946227, + "loss": 0.1328, + "step": 10059 + }, + { + "epoch": 0.08732563085389884, + "grad_norm": 0.07763671875, + "learning_rate": 0.001959915623323534, + "loss": 0.1226, + "step": 10060 + }, + { + "epoch": 
0.08733431133410301, + "grad_norm": 0.26171875, + "learning_rate": 0.001959906839512055, + "loss": 0.1016, + "step": 10061 + }, + { + "epoch": 0.08734299181430717, + "grad_norm": 0.1513671875, + "learning_rate": 0.001959898054760194, + "loss": 0.1328, + "step": 10062 + }, + { + "epoch": 0.08735167229451134, + "grad_norm": 1.3828125, + "learning_rate": 0.0019598892690679613, + "loss": 0.1582, + "step": 10063 + }, + { + "epoch": 0.0873603527747155, + "grad_norm": 0.359375, + "learning_rate": 0.0019598804824353675, + "loss": 0.1079, + "step": 10064 + }, + { + "epoch": 0.08736903325491967, + "grad_norm": 0.267578125, + "learning_rate": 0.001959871694862421, + "loss": 0.1309, + "step": 10065 + }, + { + "epoch": 0.08737771373512383, + "grad_norm": 0.2734375, + "learning_rate": 0.0019598629063491313, + "loss": 0.1387, + "step": 10066 + }, + { + "epoch": 0.087386394215328, + "grad_norm": 0.56640625, + "learning_rate": 0.001959854116895509, + "loss": 0.1211, + "step": 10067 + }, + { + "epoch": 0.08739507469553216, + "grad_norm": 0.1435546875, + "learning_rate": 0.0019598453265015628, + "loss": 0.1064, + "step": 10068 + }, + { + "epoch": 0.08740375517573633, + "grad_norm": 0.470703125, + "learning_rate": 0.001959836535167303, + "loss": 0.1279, + "step": 10069 + }, + { + "epoch": 0.08741243565594049, + "grad_norm": 0.1689453125, + "learning_rate": 0.0019598277428927386, + "loss": 0.1138, + "step": 10070 + }, + { + "epoch": 0.08742111613614466, + "grad_norm": 0.1630859375, + "learning_rate": 0.001959818949677879, + "loss": 0.1357, + "step": 10071 + }, + { + "epoch": 0.08742979661634882, + "grad_norm": 0.228515625, + "learning_rate": 0.001959810155522735, + "loss": 0.0889, + "step": 10072 + }, + { + "epoch": 0.08743847709655297, + "grad_norm": 0.57421875, + "learning_rate": 0.0019598013604273158, + "loss": 0.1543, + "step": 10073 + }, + { + "epoch": 0.08744715757675714, + "grad_norm": 0.146484375, + "learning_rate": 0.0019597925643916304, + "loss": 0.1504, + "step": 10074 + }, + { + "epoch": 0.0874558380569613, + "grad_norm": 0.419921875, + "learning_rate": 0.001959783767415689, + "loss": 0.1084, + "step": 10075 + }, + { + "epoch": 0.08746451853716547, + "grad_norm": 0.12158203125, + "learning_rate": 0.0019597749694995003, + "loss": 0.1709, + "step": 10076 + }, + { + "epoch": 0.08747319901736963, + "grad_norm": 0.6875, + "learning_rate": 0.001959766170643075, + "loss": 0.1406, + "step": 10077 + }, + { + "epoch": 0.0874818794975738, + "grad_norm": 0.35546875, + "learning_rate": 0.001959757370846422, + "loss": 0.0933, + "step": 10078 + }, + { + "epoch": 0.08749055997777797, + "grad_norm": 0.10693359375, + "learning_rate": 0.001959748570109552, + "loss": 0.1113, + "step": 10079 + }, + { + "epoch": 0.08749924045798213, + "grad_norm": 0.11328125, + "learning_rate": 0.0019597397684324733, + "loss": 0.1191, + "step": 10080 + }, + { + "epoch": 0.0875079209381863, + "grad_norm": 0.44921875, + "learning_rate": 0.0019597309658151963, + "loss": 0.1094, + "step": 10081 + }, + { + "epoch": 0.08751660141839046, + "grad_norm": 0.2412109375, + "learning_rate": 0.001959722162257731, + "loss": 0.1328, + "step": 10082 + }, + { + "epoch": 0.08752528189859463, + "grad_norm": 0.4375, + "learning_rate": 0.0019597133577600855, + "loss": 0.082, + "step": 10083 + }, + { + "epoch": 0.08753396237879879, + "grad_norm": 0.255859375, + "learning_rate": 0.001959704552322271, + "loss": 0.1396, + "step": 10084 + }, + { + "epoch": 0.08754264285900296, + "grad_norm": 0.2734375, + "learning_rate": 0.001959695745944296, + "loss": 0.125, + 
"step": 10085 + }, + { + "epoch": 0.08755132333920712, + "grad_norm": 0.73828125, + "learning_rate": 0.001959686938626171, + "loss": 0.1738, + "step": 10086 + }, + { + "epoch": 0.08756000381941129, + "grad_norm": 0.1865234375, + "learning_rate": 0.0019596781303679056, + "loss": 0.1113, + "step": 10087 + }, + { + "epoch": 0.08756868429961545, + "grad_norm": 0.232421875, + "learning_rate": 0.0019596693211695088, + "loss": 0.0859, + "step": 10088 + }, + { + "epoch": 0.08757736477981962, + "grad_norm": 0.92578125, + "learning_rate": 0.0019596605110309903, + "loss": 0.1494, + "step": 10089 + }, + { + "epoch": 0.08758604526002378, + "grad_norm": 0.45703125, + "learning_rate": 0.0019596516999523597, + "loss": 0.127, + "step": 10090 + }, + { + "epoch": 0.08759472574022795, + "grad_norm": 0.423828125, + "learning_rate": 0.001959642887933628, + "loss": 0.1201, + "step": 10091 + }, + { + "epoch": 0.08760340622043211, + "grad_norm": 0.099609375, + "learning_rate": 0.001959634074974803, + "loss": 0.1426, + "step": 10092 + }, + { + "epoch": 0.08761208670063628, + "grad_norm": 0.37109375, + "learning_rate": 0.0019596252610758953, + "loss": 0.1172, + "step": 10093 + }, + { + "epoch": 0.08762076718084044, + "grad_norm": 0.09521484375, + "learning_rate": 0.001959616446236914, + "loss": 0.1309, + "step": 10094 + }, + { + "epoch": 0.08762944766104461, + "grad_norm": 0.1650390625, + "learning_rate": 0.0019596076304578694, + "loss": 0.1699, + "step": 10095 + }, + { + "epoch": 0.08763812814124877, + "grad_norm": 0.375, + "learning_rate": 0.0019595988137387707, + "loss": 0.1289, + "step": 10096 + }, + { + "epoch": 0.08764680862145294, + "grad_norm": 0.39453125, + "learning_rate": 0.001959589996079628, + "loss": 0.1377, + "step": 10097 + }, + { + "epoch": 0.0876554891016571, + "grad_norm": 0.2431640625, + "learning_rate": 0.00195958117748045, + "loss": 0.0957, + "step": 10098 + }, + { + "epoch": 0.08766416958186127, + "grad_norm": 0.07177734375, + "learning_rate": 0.0019595723579412475, + "loss": 0.1123, + "step": 10099 + }, + { + "epoch": 0.08767285006206543, + "grad_norm": 0.11328125, + "learning_rate": 0.001959563537462029, + "loss": 0.126, + "step": 10100 + }, + { + "epoch": 0.0876815305422696, + "grad_norm": 0.396484375, + "learning_rate": 0.0019595547160428054, + "loss": 0.1445, + "step": 10101 + }, + { + "epoch": 0.08769021102247376, + "grad_norm": 0.3984375, + "learning_rate": 0.001959545893683585, + "loss": 0.1406, + "step": 10102 + }, + { + "epoch": 0.08769889150267793, + "grad_norm": 0.0888671875, + "learning_rate": 0.001959537070384379, + "loss": 0.126, + "step": 10103 + }, + { + "epoch": 0.0877075719828821, + "grad_norm": 0.466796875, + "learning_rate": 0.0019595282461451952, + "loss": 0.1133, + "step": 10104 + }, + { + "epoch": 0.08771625246308626, + "grad_norm": 0.30859375, + "learning_rate": 0.001959519420966045, + "loss": 0.1562, + "step": 10105 + }, + { + "epoch": 0.08772493294329042, + "grad_norm": 0.138671875, + "learning_rate": 0.0019595105948469367, + "loss": 0.1309, + "step": 10106 + }, + { + "epoch": 0.08773361342349459, + "grad_norm": 0.0712890625, + "learning_rate": 0.001959501767787881, + "loss": 0.0898, + "step": 10107 + }, + { + "epoch": 0.08774229390369875, + "grad_norm": 0.1494140625, + "learning_rate": 0.0019594929397888866, + "loss": 0.1484, + "step": 10108 + }, + { + "epoch": 0.08775097438390292, + "grad_norm": 0.1435546875, + "learning_rate": 0.001959484110849964, + "loss": 0.1387, + "step": 10109 + }, + { + "epoch": 0.08775965486410708, + "grad_norm": 0.306640625, + 
"learning_rate": 0.0019594752809711223, + "loss": 0.1289, + "step": 10110 + }, + { + "epoch": 0.08776833534431125, + "grad_norm": 0.44921875, + "learning_rate": 0.0019594664501523715, + "loss": 0.2012, + "step": 10111 + }, + { + "epoch": 0.08777701582451541, + "grad_norm": 0.1171875, + "learning_rate": 0.0019594576183937207, + "loss": 0.0972, + "step": 10112 + }, + { + "epoch": 0.08778569630471958, + "grad_norm": 0.1396484375, + "learning_rate": 0.0019594487856951803, + "loss": 0.0967, + "step": 10113 + }, + { + "epoch": 0.08779437678492374, + "grad_norm": 0.2109375, + "learning_rate": 0.0019594399520567597, + "loss": 0.123, + "step": 10114 + }, + { + "epoch": 0.08780305726512791, + "grad_norm": 1.5078125, + "learning_rate": 0.0019594311174784686, + "loss": 0.1621, + "step": 10115 + }, + { + "epoch": 0.08781173774533207, + "grad_norm": 0.166015625, + "learning_rate": 0.0019594222819603164, + "loss": 0.0815, + "step": 10116 + }, + { + "epoch": 0.08782041822553624, + "grad_norm": 0.38671875, + "learning_rate": 0.0019594134455023127, + "loss": 0.1582, + "step": 10117 + }, + { + "epoch": 0.0878290987057404, + "grad_norm": 0.2060546875, + "learning_rate": 0.0019594046081044675, + "loss": 0.0952, + "step": 10118 + }, + { + "epoch": 0.08783777918594457, + "grad_norm": 0.36328125, + "learning_rate": 0.0019593957697667903, + "loss": 0.1201, + "step": 10119 + }, + { + "epoch": 0.08784645966614874, + "grad_norm": 0.2578125, + "learning_rate": 0.0019593869304892907, + "loss": 0.1309, + "step": 10120 + }, + { + "epoch": 0.0878551401463529, + "grad_norm": 0.4609375, + "learning_rate": 0.001959378090271979, + "loss": 0.1426, + "step": 10121 + }, + { + "epoch": 0.08786382062655707, + "grad_norm": 0.359375, + "learning_rate": 0.0019593692491148636, + "loss": 0.0942, + "step": 10122 + }, + { + "epoch": 0.08787250110676123, + "grad_norm": 0.07958984375, + "learning_rate": 0.001959360407017955, + "loss": 0.1025, + "step": 10123 + }, + { + "epoch": 0.0878811815869654, + "grad_norm": 0.208984375, + "learning_rate": 0.001959351563981263, + "loss": 0.0938, + "step": 10124 + }, + { + "epoch": 0.08788986206716956, + "grad_norm": 0.32421875, + "learning_rate": 0.0019593427200047973, + "loss": 0.123, + "step": 10125 + }, + { + "epoch": 0.08789854254737373, + "grad_norm": 0.34765625, + "learning_rate": 0.001959333875088567, + "loss": 0.1602, + "step": 10126 + }, + { + "epoch": 0.08790722302757789, + "grad_norm": 0.08203125, + "learning_rate": 0.001959325029232582, + "loss": 0.1416, + "step": 10127 + }, + { + "epoch": 0.08791590350778206, + "grad_norm": 0.388671875, + "learning_rate": 0.0019593161824368525, + "loss": 0.1553, + "step": 10128 + }, + { + "epoch": 0.08792458398798622, + "grad_norm": 0.322265625, + "learning_rate": 0.0019593073347013872, + "loss": 0.1436, + "step": 10129 + }, + { + "epoch": 0.08793326446819039, + "grad_norm": 0.32421875, + "learning_rate": 0.0019592984860261967, + "loss": 0.0923, + "step": 10130 + }, + { + "epoch": 0.08794194494839455, + "grad_norm": 0.65625, + "learning_rate": 0.00195928963641129, + "loss": 0.1689, + "step": 10131 + }, + { + "epoch": 0.08795062542859872, + "grad_norm": 0.609375, + "learning_rate": 0.0019592807858566774, + "loss": 0.1104, + "step": 10132 + }, + { + "epoch": 0.08795930590880288, + "grad_norm": 0.1953125, + "learning_rate": 0.001959271934362368, + "loss": 0.1133, + "step": 10133 + }, + { + "epoch": 0.08796798638900703, + "grad_norm": 0.123046875, + "learning_rate": 0.001959263081928372, + "loss": 0.1455, + "step": 10134 + }, + { + "epoch": 0.0879766668692112, 
+ "grad_norm": 0.6015625, + "learning_rate": 0.0019592542285546983, + "loss": 0.1138, + "step": 10135 + }, + { + "epoch": 0.08798534734941536, + "grad_norm": 0.16796875, + "learning_rate": 0.0019592453742413576, + "loss": 0.127, + "step": 10136 + }, + { + "epoch": 0.08799402782961953, + "grad_norm": 0.1181640625, + "learning_rate": 0.001959236518988359, + "loss": 0.1357, + "step": 10137 + }, + { + "epoch": 0.0880027083098237, + "grad_norm": 0.115234375, + "learning_rate": 0.0019592276627957116, + "loss": 0.1133, + "step": 10138 + }, + { + "epoch": 0.08801138879002786, + "grad_norm": 0.10400390625, + "learning_rate": 0.001959218805663426, + "loss": 0.123, + "step": 10139 + }, + { + "epoch": 0.08802006927023202, + "grad_norm": 0.06982421875, + "learning_rate": 0.001959209947591512, + "loss": 0.0889, + "step": 10140 + }, + { + "epoch": 0.08802874975043619, + "grad_norm": 0.2119140625, + "learning_rate": 0.0019592010885799787, + "loss": 0.0947, + "step": 10141 + }, + { + "epoch": 0.08803743023064035, + "grad_norm": 0.2490234375, + "learning_rate": 0.001959192228628836, + "loss": 0.1357, + "step": 10142 + }, + { + "epoch": 0.08804611071084452, + "grad_norm": 0.1650390625, + "learning_rate": 0.0019591833677380936, + "loss": 0.1445, + "step": 10143 + }, + { + "epoch": 0.08805479119104868, + "grad_norm": 0.09912109375, + "learning_rate": 0.0019591745059077616, + "loss": 0.0967, + "step": 10144 + }, + { + "epoch": 0.08806347167125285, + "grad_norm": 0.498046875, + "learning_rate": 0.0019591656431378487, + "loss": 0.1396, + "step": 10145 + }, + { + "epoch": 0.08807215215145701, + "grad_norm": 0.1083984375, + "learning_rate": 0.0019591567794283653, + "loss": 0.1118, + "step": 10146 + }, + { + "epoch": 0.08808083263166118, + "grad_norm": 0.2890625, + "learning_rate": 0.001959147914779321, + "loss": 0.1641, + "step": 10147 + }, + { + "epoch": 0.08808951311186534, + "grad_norm": 0.08984375, + "learning_rate": 0.0019591390491907256, + "loss": 0.1172, + "step": 10148 + }, + { + "epoch": 0.08809819359206951, + "grad_norm": 0.578125, + "learning_rate": 0.001959130182662588, + "loss": 0.1104, + "step": 10149 + }, + { + "epoch": 0.08810687407227367, + "grad_norm": 0.236328125, + "learning_rate": 0.0019591213151949195, + "loss": 0.1172, + "step": 10150 + }, + { + "epoch": 0.08811555455247784, + "grad_norm": 0.466796875, + "learning_rate": 0.001959112446787728, + "loss": 0.1309, + "step": 10151 + }, + { + "epoch": 0.088124235032682, + "grad_norm": 0.1298828125, + "learning_rate": 0.0019591035774410245, + "loss": 0.1064, + "step": 10152 + }, + { + "epoch": 0.08813291551288617, + "grad_norm": 0.140625, + "learning_rate": 0.0019590947071548184, + "loss": 0.1143, + "step": 10153 + }, + { + "epoch": 0.08814159599309034, + "grad_norm": 0.478515625, + "learning_rate": 0.0019590858359291186, + "loss": 0.1357, + "step": 10154 + }, + { + "epoch": 0.0881502764732945, + "grad_norm": 0.0810546875, + "learning_rate": 0.001959076963763936, + "loss": 0.126, + "step": 10155 + }, + { + "epoch": 0.08815895695349867, + "grad_norm": 0.8359375, + "learning_rate": 0.0019590680906592796, + "loss": 0.1123, + "step": 10156 + }, + { + "epoch": 0.08816763743370283, + "grad_norm": 0.5625, + "learning_rate": 0.0019590592166151594, + "loss": 0.1348, + "step": 10157 + }, + { + "epoch": 0.088176317913907, + "grad_norm": 0.4375, + "learning_rate": 0.0019590503416315845, + "loss": 0.1396, + "step": 10158 + }, + { + "epoch": 0.08818499839411116, + "grad_norm": 0.458984375, + "learning_rate": 0.0019590414657085654, + "loss": 0.1113, + "step": 
10159 + }, + { + "epoch": 0.08819367887431533, + "grad_norm": 0.08935546875, + "learning_rate": 0.0019590325888461115, + "loss": 0.0957, + "step": 10160 + }, + { + "epoch": 0.08820235935451949, + "grad_norm": 0.796875, + "learning_rate": 0.001959023711044232, + "loss": 0.1006, + "step": 10161 + }, + { + "epoch": 0.08821103983472366, + "grad_norm": 0.6015625, + "learning_rate": 0.0019590148323029377, + "loss": 0.0918, + "step": 10162 + }, + { + "epoch": 0.08821972031492782, + "grad_norm": 0.20703125, + "learning_rate": 0.0019590059526222374, + "loss": 0.1309, + "step": 10163 + }, + { + "epoch": 0.08822840079513199, + "grad_norm": 0.21875, + "learning_rate": 0.001958997072002141, + "loss": 0.0771, + "step": 10164 + }, + { + "epoch": 0.08823708127533615, + "grad_norm": 0.365234375, + "learning_rate": 0.0019589881904426582, + "loss": 0.0928, + "step": 10165 + }, + { + "epoch": 0.08824576175554032, + "grad_norm": 0.08935546875, + "learning_rate": 0.0019589793079437994, + "loss": 0.0918, + "step": 10166 + }, + { + "epoch": 0.08825444223574448, + "grad_norm": 0.330078125, + "learning_rate": 0.001958970424505573, + "loss": 0.1152, + "step": 10167 + }, + { + "epoch": 0.08826312271594865, + "grad_norm": 1.09375, + "learning_rate": 0.00195896154012799, + "loss": 0.1465, + "step": 10168 + }, + { + "epoch": 0.08827180319615281, + "grad_norm": 0.09765625, + "learning_rate": 0.001958952654811059, + "loss": 0.1211, + "step": 10169 + }, + { + "epoch": 0.08828048367635698, + "grad_norm": 0.1787109375, + "learning_rate": 0.001958943768554791, + "loss": 0.1416, + "step": 10170 + }, + { + "epoch": 0.08828916415656114, + "grad_norm": 0.46484375, + "learning_rate": 0.0019589348813591947, + "loss": 0.1318, + "step": 10171 + }, + { + "epoch": 0.08829784463676531, + "grad_norm": 0.373046875, + "learning_rate": 0.00195892599322428, + "loss": 0.1201, + "step": 10172 + }, + { + "epoch": 0.08830652511696947, + "grad_norm": 0.1650390625, + "learning_rate": 0.0019589171041500568, + "loss": 0.1064, + "step": 10173 + }, + { + "epoch": 0.08831520559717364, + "grad_norm": 1.0546875, + "learning_rate": 0.0019589082141365345, + "loss": 0.1445, + "step": 10174 + }, + { + "epoch": 0.0883238860773778, + "grad_norm": 0.142578125, + "learning_rate": 0.0019588993231837238, + "loss": 0.1436, + "step": 10175 + }, + { + "epoch": 0.08833256655758197, + "grad_norm": 0.427734375, + "learning_rate": 0.001958890431291633, + "loss": 0.1123, + "step": 10176 + }, + { + "epoch": 0.08834124703778613, + "grad_norm": 0.49609375, + "learning_rate": 0.001958881538460273, + "loss": 0.1016, + "step": 10177 + }, + { + "epoch": 0.0883499275179903, + "grad_norm": 0.08056640625, + "learning_rate": 0.0019588726446896527, + "loss": 0.1553, + "step": 10178 + }, + { + "epoch": 0.08835860799819446, + "grad_norm": 0.09375, + "learning_rate": 0.001958863749979782, + "loss": 0.1328, + "step": 10179 + }, + { + "epoch": 0.08836728847839863, + "grad_norm": 0.298828125, + "learning_rate": 0.001958854854330671, + "loss": 0.1221, + "step": 10180 + }, + { + "epoch": 0.0883759689586028, + "grad_norm": 0.51171875, + "learning_rate": 0.0019588459577423297, + "loss": 0.1875, + "step": 10181 + }, + { + "epoch": 0.08838464943880696, + "grad_norm": 0.095703125, + "learning_rate": 0.001958837060214767, + "loss": 0.1089, + "step": 10182 + }, + { + "epoch": 0.08839332991901112, + "grad_norm": 0.56640625, + "learning_rate": 0.001958828161747993, + "loss": 0.1436, + "step": 10183 + }, + { + "epoch": 0.08840201039921529, + "grad_norm": 0.1357421875, + "learning_rate": 
0.0019588192623420176, + "loss": 0.125, + "step": 10184 + }, + { + "epoch": 0.08841069087941945, + "grad_norm": 0.154296875, + "learning_rate": 0.00195881036199685, + "loss": 0.1494, + "step": 10185 + }, + { + "epoch": 0.08841937135962362, + "grad_norm": 0.5390625, + "learning_rate": 0.0019588014607125007, + "loss": 0.1465, + "step": 10186 + }, + { + "epoch": 0.08842805183982778, + "grad_norm": 0.29296875, + "learning_rate": 0.0019587925584889786, + "loss": 0.1992, + "step": 10187 + }, + { + "epoch": 0.08843673232003195, + "grad_norm": 0.46875, + "learning_rate": 0.0019587836553262945, + "loss": 0.1553, + "step": 10188 + }, + { + "epoch": 0.08844541280023611, + "grad_norm": 0.1572265625, + "learning_rate": 0.0019587747512244567, + "loss": 0.1309, + "step": 10189 + }, + { + "epoch": 0.08845409328044028, + "grad_norm": 0.1953125, + "learning_rate": 0.001958765846183476, + "loss": 0.0986, + "step": 10190 + }, + { + "epoch": 0.08846277376064444, + "grad_norm": 0.09130859375, + "learning_rate": 0.001958756940203362, + "loss": 0.1094, + "step": 10191 + }, + { + "epoch": 0.08847145424084861, + "grad_norm": 1.5078125, + "learning_rate": 0.0019587480332841243, + "loss": 0.1855, + "step": 10192 + }, + { + "epoch": 0.08848013472105277, + "grad_norm": 0.115234375, + "learning_rate": 0.0019587391254257728, + "loss": 0.1289, + "step": 10193 + }, + { + "epoch": 0.08848881520125694, + "grad_norm": 0.1611328125, + "learning_rate": 0.0019587302166283174, + "loss": 0.1094, + "step": 10194 + }, + { + "epoch": 0.0884974956814611, + "grad_norm": 0.216796875, + "learning_rate": 0.001958721306891767, + "loss": 0.103, + "step": 10195 + }, + { + "epoch": 0.08850617616166526, + "grad_norm": 0.66015625, + "learning_rate": 0.0019587123962161316, + "loss": 0.2305, + "step": 10196 + }, + { + "epoch": 0.08851485664186942, + "grad_norm": 0.23046875, + "learning_rate": 0.0019587034846014215, + "loss": 0.1338, + "step": 10197 + }, + { + "epoch": 0.08852353712207359, + "grad_norm": 1.34375, + "learning_rate": 0.0019586945720476466, + "loss": 0.1138, + "step": 10198 + }, + { + "epoch": 0.08853221760227775, + "grad_norm": 0.28515625, + "learning_rate": 0.0019586856585548157, + "loss": 0.1172, + "step": 10199 + }, + { + "epoch": 0.08854089808248192, + "grad_norm": 0.59765625, + "learning_rate": 0.001958676744122939, + "loss": 0.1123, + "step": 10200 + }, + { + "epoch": 0.08854957856268608, + "grad_norm": 0.392578125, + "learning_rate": 0.001958667828752027, + "loss": 0.1523, + "step": 10201 + }, + { + "epoch": 0.08855825904289025, + "grad_norm": 0.2578125, + "learning_rate": 0.0019586589124420883, + "loss": 0.1377, + "step": 10202 + }, + { + "epoch": 0.08856693952309441, + "grad_norm": 0.41796875, + "learning_rate": 0.001958649995193133, + "loss": 0.1064, + "step": 10203 + }, + { + "epoch": 0.08857562000329858, + "grad_norm": 0.158203125, + "learning_rate": 0.001958641077005171, + "loss": 0.1406, + "step": 10204 + }, + { + "epoch": 0.08858430048350274, + "grad_norm": 0.8046875, + "learning_rate": 0.001958632157878212, + "loss": 0.1016, + "step": 10205 + }, + { + "epoch": 0.08859298096370691, + "grad_norm": 0.6015625, + "learning_rate": 0.001958623237812266, + "loss": 0.1245, + "step": 10206 + }, + { + "epoch": 0.08860166144391107, + "grad_norm": 0.671875, + "learning_rate": 0.0019586143168073425, + "loss": 0.1055, + "step": 10207 + }, + { + "epoch": 0.08861034192411524, + "grad_norm": 0.33203125, + "learning_rate": 0.0019586053948634514, + "loss": 0.1592, + "step": 10208 + }, + { + "epoch": 0.0886190224043194, + "grad_norm": 
0.07666015625, + "learning_rate": 0.001958596471980602, + "loss": 0.1289, + "step": 10209 + }, + { + "epoch": 0.08862770288452357, + "grad_norm": 0.2890625, + "learning_rate": 0.0019585875481588043, + "loss": 0.1602, + "step": 10210 + }, + { + "epoch": 0.08863638336472773, + "grad_norm": 0.17578125, + "learning_rate": 0.0019585786233980687, + "loss": 0.125, + "step": 10211 + }, + { + "epoch": 0.0886450638449319, + "grad_norm": 0.2021484375, + "learning_rate": 0.001958569697698404, + "loss": 0.0903, + "step": 10212 + }, + { + "epoch": 0.08865374432513606, + "grad_norm": 0.08203125, + "learning_rate": 0.001958560771059821, + "loss": 0.1025, + "step": 10213 + }, + { + "epoch": 0.08866242480534023, + "grad_norm": 0.2080078125, + "learning_rate": 0.001958551843482328, + "loss": 0.1582, + "step": 10214 + }, + { + "epoch": 0.0886711052855444, + "grad_norm": 0.48828125, + "learning_rate": 0.001958542914965936, + "loss": 0.1143, + "step": 10215 + }, + { + "epoch": 0.08867978576574856, + "grad_norm": 0.50390625, + "learning_rate": 0.001958533985510654, + "loss": 0.105, + "step": 10216 + }, + { + "epoch": 0.08868846624595272, + "grad_norm": 0.431640625, + "learning_rate": 0.0019585250551164926, + "loss": 0.1328, + "step": 10217 + }, + { + "epoch": 0.08869714672615689, + "grad_norm": 0.294921875, + "learning_rate": 0.0019585161237834613, + "loss": 0.1719, + "step": 10218 + }, + { + "epoch": 0.08870582720636105, + "grad_norm": 0.2021484375, + "learning_rate": 0.001958507191511569, + "loss": 0.1191, + "step": 10219 + }, + { + "epoch": 0.08871450768656522, + "grad_norm": 0.10498046875, + "learning_rate": 0.001958498258300827, + "loss": 0.1406, + "step": 10220 + }, + { + "epoch": 0.08872318816676938, + "grad_norm": 0.185546875, + "learning_rate": 0.001958489324151243, + "loss": 0.127, + "step": 10221 + }, + { + "epoch": 0.08873186864697355, + "grad_norm": 0.37890625, + "learning_rate": 0.0019584803890628287, + "loss": 0.1406, + "step": 10222 + }, + { + "epoch": 0.08874054912717771, + "grad_norm": 0.30859375, + "learning_rate": 0.0019584714530355932, + "loss": 0.1475, + "step": 10223 + }, + { + "epoch": 0.08874922960738188, + "grad_norm": 0.099609375, + "learning_rate": 0.001958462516069546, + "loss": 0.1191, + "step": 10224 + }, + { + "epoch": 0.08875791008758604, + "grad_norm": 0.8203125, + "learning_rate": 0.0019584535781646973, + "loss": 0.125, + "step": 10225 + }, + { + "epoch": 0.08876659056779021, + "grad_norm": 0.1171875, + "learning_rate": 0.0019584446393210563, + "loss": 0.106, + "step": 10226 + }, + { + "epoch": 0.08877527104799438, + "grad_norm": 0.2138671875, + "learning_rate": 0.0019584356995386333, + "loss": 0.166, + "step": 10227 + }, + { + "epoch": 0.08878395152819854, + "grad_norm": 0.1806640625, + "learning_rate": 0.0019584267588174382, + "loss": 0.0879, + "step": 10228 + }, + { + "epoch": 0.0887926320084027, + "grad_norm": 0.279296875, + "learning_rate": 0.00195841781715748, + "loss": 0.1211, + "step": 10229 + }, + { + "epoch": 0.08880131248860687, + "grad_norm": 0.4140625, + "learning_rate": 0.0019584088745587696, + "loss": 0.1157, + "step": 10230 + }, + { + "epoch": 0.08880999296881104, + "grad_norm": 0.87890625, + "learning_rate": 0.0019583999310213156, + "loss": 0.1553, + "step": 10231 + }, + { + "epoch": 0.0888186734490152, + "grad_norm": 0.1923828125, + "learning_rate": 0.001958390986545129, + "loss": 0.1016, + "step": 10232 + }, + { + "epoch": 0.08882735392921937, + "grad_norm": 1.5859375, + "learning_rate": 0.0019583820411302184, + "loss": 0.1543, + "step": 10233 + }, + { + 
"epoch": 0.08883603440942353, + "grad_norm": 0.09912109375, + "learning_rate": 0.001958373094776594, + "loss": 0.1201, + "step": 10234 + }, + { + "epoch": 0.0888447148896277, + "grad_norm": 0.345703125, + "learning_rate": 0.001958364147484266, + "loss": 0.1406, + "step": 10235 + }, + { + "epoch": 0.08885339536983186, + "grad_norm": 0.2265625, + "learning_rate": 0.001958355199253244, + "loss": 0.1689, + "step": 10236 + }, + { + "epoch": 0.08886207585003603, + "grad_norm": 0.1572265625, + "learning_rate": 0.001958346250083537, + "loss": 0.1582, + "step": 10237 + }, + { + "epoch": 0.08887075633024019, + "grad_norm": 0.279296875, + "learning_rate": 0.001958337299975156, + "loss": 0.0908, + "step": 10238 + }, + { + "epoch": 0.08887943681044436, + "grad_norm": 0.287109375, + "learning_rate": 0.00195832834892811, + "loss": 0.0889, + "step": 10239 + }, + { + "epoch": 0.08888811729064852, + "grad_norm": 0.07568359375, + "learning_rate": 0.001958319396942409, + "loss": 0.1089, + "step": 10240 + }, + { + "epoch": 0.08889679777085269, + "grad_norm": 0.267578125, + "learning_rate": 0.0019583104440180628, + "loss": 0.0889, + "step": 10241 + }, + { + "epoch": 0.08890547825105685, + "grad_norm": 0.65234375, + "learning_rate": 0.001958301490155081, + "loss": 0.1816, + "step": 10242 + }, + { + "epoch": 0.08891415873126102, + "grad_norm": 0.60546875, + "learning_rate": 0.001958292535353474, + "loss": 0.1172, + "step": 10243 + }, + { + "epoch": 0.08892283921146518, + "grad_norm": 0.11279296875, + "learning_rate": 0.001958283579613251, + "loss": 0.124, + "step": 10244 + }, + { + "epoch": 0.08893151969166935, + "grad_norm": 0.1259765625, + "learning_rate": 0.001958274622934422, + "loss": 0.124, + "step": 10245 + }, + { + "epoch": 0.08894020017187351, + "grad_norm": 0.2470703125, + "learning_rate": 0.0019582656653169966, + "loss": 0.1152, + "step": 10246 + }, + { + "epoch": 0.08894888065207768, + "grad_norm": 0.376953125, + "learning_rate": 0.001958256706760985, + "loss": 0.1094, + "step": 10247 + }, + { + "epoch": 0.08895756113228184, + "grad_norm": 1.1171875, + "learning_rate": 0.0019582477472663965, + "loss": 0.125, + "step": 10248 + }, + { + "epoch": 0.08896624161248601, + "grad_norm": 0.1357421875, + "learning_rate": 0.0019582387868332415, + "loss": 0.0869, + "step": 10249 + }, + { + "epoch": 0.08897492209269017, + "grad_norm": 0.2431640625, + "learning_rate": 0.0019582298254615296, + "loss": 0.1426, + "step": 10250 + }, + { + "epoch": 0.08898360257289434, + "grad_norm": 0.2490234375, + "learning_rate": 0.0019582208631512697, + "loss": 0.1396, + "step": 10251 + }, + { + "epoch": 0.0889922830530985, + "grad_norm": 0.2060546875, + "learning_rate": 0.001958211899902473, + "loss": 0.1123, + "step": 10252 + }, + { + "epoch": 0.08900096353330267, + "grad_norm": 0.314453125, + "learning_rate": 0.001958202935715148, + "loss": 0.1914, + "step": 10253 + }, + { + "epoch": 0.08900964401350683, + "grad_norm": 0.9609375, + "learning_rate": 0.001958193970589306, + "loss": 0.106, + "step": 10254 + }, + { + "epoch": 0.089018324493711, + "grad_norm": 0.15625, + "learning_rate": 0.0019581850045249554, + "loss": 0.124, + "step": 10255 + }, + { + "epoch": 0.08902700497391516, + "grad_norm": 0.248046875, + "learning_rate": 0.001958176037522107, + "loss": 0.1211, + "step": 10256 + }, + { + "epoch": 0.08903568545411933, + "grad_norm": 0.0654296875, + "learning_rate": 0.0019581670695807698, + "loss": 0.0889, + "step": 10257 + }, + { + "epoch": 0.08904436593432348, + "grad_norm": 0.140625, + "learning_rate": 0.0019581581007009543, + 
"loss": 0.0996, + "step": 10258 + }, + { + "epoch": 0.08905304641452765, + "grad_norm": 0.109375, + "learning_rate": 0.00195814913088267, + "loss": 0.0947, + "step": 10259 + }, + { + "epoch": 0.08906172689473181, + "grad_norm": 0.14453125, + "learning_rate": 0.001958140160125926, + "loss": 0.1084, + "step": 10260 + }, + { + "epoch": 0.08907040737493598, + "grad_norm": 0.671875, + "learning_rate": 0.001958131188430734, + "loss": 0.127, + "step": 10261 + }, + { + "epoch": 0.08907908785514014, + "grad_norm": 0.40625, + "learning_rate": 0.0019581222157971016, + "loss": 0.1201, + "step": 10262 + }, + { + "epoch": 0.0890877683353443, + "grad_norm": 0.6640625, + "learning_rate": 0.00195811324222504, + "loss": 0.1426, + "step": 10263 + }, + { + "epoch": 0.08909644881554847, + "grad_norm": 0.6875, + "learning_rate": 0.0019581042677145584, + "loss": 0.1016, + "step": 10264 + }, + { + "epoch": 0.08910512929575264, + "grad_norm": 0.263671875, + "learning_rate": 0.0019580952922656674, + "loss": 0.1299, + "step": 10265 + }, + { + "epoch": 0.0891138097759568, + "grad_norm": 0.416015625, + "learning_rate": 0.001958086315878376, + "loss": 0.0957, + "step": 10266 + }, + { + "epoch": 0.08912249025616097, + "grad_norm": 0.130859375, + "learning_rate": 0.001958077338552694, + "loss": 0.1201, + "step": 10267 + }, + { + "epoch": 0.08913117073636513, + "grad_norm": 0.703125, + "learning_rate": 0.001958068360288632, + "loss": 0.1445, + "step": 10268 + }, + { + "epoch": 0.0891398512165693, + "grad_norm": 1.09375, + "learning_rate": 0.0019580593810861993, + "loss": 0.1436, + "step": 10269 + }, + { + "epoch": 0.08914853169677346, + "grad_norm": 0.55859375, + "learning_rate": 0.0019580504009454056, + "loss": 0.1064, + "step": 10270 + }, + { + "epoch": 0.08915721217697763, + "grad_norm": 0.1318359375, + "learning_rate": 0.0019580414198662605, + "loss": 0.0947, + "step": 10271 + }, + { + "epoch": 0.08916589265718179, + "grad_norm": 0.29296875, + "learning_rate": 0.0019580324378487748, + "loss": 0.1348, + "step": 10272 + }, + { + "epoch": 0.08917457313738596, + "grad_norm": 0.34765625, + "learning_rate": 0.0019580234548929576, + "loss": 0.0947, + "step": 10273 + }, + { + "epoch": 0.08918325361759012, + "grad_norm": 0.80078125, + "learning_rate": 0.0019580144709988186, + "loss": 0.1367, + "step": 10274 + }, + { + "epoch": 0.08919193409779429, + "grad_norm": 1.5859375, + "learning_rate": 0.0019580054861663685, + "loss": 0.1484, + "step": 10275 + }, + { + "epoch": 0.08920061457799845, + "grad_norm": 0.248046875, + "learning_rate": 0.0019579965003956156, + "loss": 0.1592, + "step": 10276 + }, + { + "epoch": 0.08920929505820262, + "grad_norm": 0.43359375, + "learning_rate": 0.001957987513686571, + "loss": 0.1367, + "step": 10277 + }, + { + "epoch": 0.08921797553840678, + "grad_norm": 0.431640625, + "learning_rate": 0.0019579785260392443, + "loss": 0.1416, + "step": 10278 + }, + { + "epoch": 0.08922665601861095, + "grad_norm": 0.4140625, + "learning_rate": 0.001957969537453645, + "loss": 0.125, + "step": 10279 + }, + { + "epoch": 0.08923533649881511, + "grad_norm": 0.111328125, + "learning_rate": 0.001957960547929783, + "loss": 0.1064, + "step": 10280 + }, + { + "epoch": 0.08924401697901928, + "grad_norm": 0.3671875, + "learning_rate": 0.0019579515574676683, + "loss": 0.1416, + "step": 10281 + }, + { + "epoch": 0.08925269745922344, + "grad_norm": 0.09619140625, + "learning_rate": 0.001957942566067311, + "loss": 0.1543, + "step": 10282 + }, + { + "epoch": 0.08926137793942761, + "grad_norm": 0.54296875, + "learning_rate": 
0.00195793357372872, + "loss": 0.1729, + "step": 10283 + }, + { + "epoch": 0.08927005841963177, + "grad_norm": 0.08154296875, + "learning_rate": 0.0019579245804519064, + "loss": 0.1064, + "step": 10284 + }, + { + "epoch": 0.08927873889983594, + "grad_norm": 1.3515625, + "learning_rate": 0.001957915586236879, + "loss": 0.1475, + "step": 10285 + }, + { + "epoch": 0.0892874193800401, + "grad_norm": 0.09814453125, + "learning_rate": 0.001957906591083648, + "loss": 0.168, + "step": 10286 + }, + { + "epoch": 0.08929609986024427, + "grad_norm": 0.3671875, + "learning_rate": 0.001957897594992223, + "loss": 0.1001, + "step": 10287 + }, + { + "epoch": 0.08930478034044843, + "grad_norm": 0.08203125, + "learning_rate": 0.001957888597962615, + "loss": 0.1147, + "step": 10288 + }, + { + "epoch": 0.0893134608206526, + "grad_norm": 0.1298828125, + "learning_rate": 0.001957879599994832, + "loss": 0.125, + "step": 10289 + }, + { + "epoch": 0.08932214130085676, + "grad_norm": 0.10986328125, + "learning_rate": 0.001957870601088885, + "loss": 0.1143, + "step": 10290 + }, + { + "epoch": 0.08933082178106093, + "grad_norm": 0.25390625, + "learning_rate": 0.0019578616012447835, + "loss": 0.1494, + "step": 10291 + }, + { + "epoch": 0.0893395022612651, + "grad_norm": 0.578125, + "learning_rate": 0.0019578526004625377, + "loss": 0.126, + "step": 10292 + }, + { + "epoch": 0.08934818274146926, + "grad_norm": 0.22265625, + "learning_rate": 0.001957843598742157, + "loss": 0.0947, + "step": 10293 + }, + { + "epoch": 0.08935686322167342, + "grad_norm": 0.1884765625, + "learning_rate": 0.0019578345960836515, + "loss": 0.1206, + "step": 10294 + }, + { + "epoch": 0.08936554370187759, + "grad_norm": 0.18359375, + "learning_rate": 0.001957825592487031, + "loss": 0.0903, + "step": 10295 + }, + { + "epoch": 0.08937422418208175, + "grad_norm": 0.44140625, + "learning_rate": 0.0019578165879523056, + "loss": 0.1006, + "step": 10296 + }, + { + "epoch": 0.08938290466228592, + "grad_norm": 1.0078125, + "learning_rate": 0.0019578075824794846, + "loss": 0.126, + "step": 10297 + }, + { + "epoch": 0.08939158514249008, + "grad_norm": 1.6015625, + "learning_rate": 0.0019577985760685776, + "loss": 0.2266, + "step": 10298 + }, + { + "epoch": 0.08940026562269425, + "grad_norm": 0.423828125, + "learning_rate": 0.001957789568719596, + "loss": 0.1387, + "step": 10299 + }, + { + "epoch": 0.08940894610289842, + "grad_norm": 0.57421875, + "learning_rate": 0.0019577805604325477, + "loss": 0.1035, + "step": 10300 + }, + { + "epoch": 0.08941762658310258, + "grad_norm": 0.1279296875, + "learning_rate": 0.001957771551207444, + "loss": 0.1895, + "step": 10301 + }, + { + "epoch": 0.08942630706330675, + "grad_norm": 0.87890625, + "learning_rate": 0.001957762541044294, + "loss": 0.1143, + "step": 10302 + }, + { + "epoch": 0.08943498754351091, + "grad_norm": 0.453125, + "learning_rate": 0.0019577535299431076, + "loss": 0.1016, + "step": 10303 + }, + { + "epoch": 0.08944366802371508, + "grad_norm": 0.66015625, + "learning_rate": 0.001957744517903895, + "loss": 0.127, + "step": 10304 + }, + { + "epoch": 0.08945234850391924, + "grad_norm": 0.78515625, + "learning_rate": 0.0019577355049266664, + "loss": 0.1191, + "step": 10305 + }, + { + "epoch": 0.0894610289841234, + "grad_norm": 0.166015625, + "learning_rate": 0.00195772649101143, + "loss": 0.1406, + "step": 10306 + }, + { + "epoch": 0.08946970946432757, + "grad_norm": 0.310546875, + "learning_rate": 0.001957717476158198, + "loss": 0.1074, + "step": 10307 + }, + { + "epoch": 0.08947838994453174, + "grad_norm": 
0.474609375, + "learning_rate": 0.0019577084603669788, + "loss": 0.1318, + "step": 10308 + }, + { + "epoch": 0.0894870704247359, + "grad_norm": 0.373046875, + "learning_rate": 0.001957699443637782, + "loss": 0.1719, + "step": 10309 + }, + { + "epoch": 0.08949575090494007, + "grad_norm": 0.373046875, + "learning_rate": 0.0019576904259706184, + "loss": 0.1367, + "step": 10310 + }, + { + "epoch": 0.08950443138514423, + "grad_norm": 0.578125, + "learning_rate": 0.001957681407365497, + "loss": 0.1465, + "step": 10311 + }, + { + "epoch": 0.0895131118653484, + "grad_norm": 0.5234375, + "learning_rate": 0.0019576723878224284, + "loss": 0.1074, + "step": 10312 + }, + { + "epoch": 0.08952179234555256, + "grad_norm": 0.291015625, + "learning_rate": 0.001957663367341422, + "loss": 0.1318, + "step": 10313 + }, + { + "epoch": 0.08953047282575673, + "grad_norm": 0.3125, + "learning_rate": 0.0019576543459224878, + "loss": 0.1445, + "step": 10314 + }, + { + "epoch": 0.08953915330596089, + "grad_norm": 0.2890625, + "learning_rate": 0.001957645323565636, + "loss": 0.125, + "step": 10315 + }, + { + "epoch": 0.08954783378616506, + "grad_norm": 0.10986328125, + "learning_rate": 0.0019576363002708763, + "loss": 0.0972, + "step": 10316 + }, + { + "epoch": 0.08955651426636922, + "grad_norm": 0.1533203125, + "learning_rate": 0.001957627276038218, + "loss": 0.0879, + "step": 10317 + }, + { + "epoch": 0.08956519474657339, + "grad_norm": 0.63671875, + "learning_rate": 0.0019576182508676716, + "loss": 0.1582, + "step": 10318 + }, + { + "epoch": 0.08957387522677754, + "grad_norm": 0.09423828125, + "learning_rate": 0.0019576092247592467, + "loss": 0.1465, + "step": 10319 + }, + { + "epoch": 0.0895825557069817, + "grad_norm": 0.220703125, + "learning_rate": 0.001957600197712953, + "loss": 0.1562, + "step": 10320 + }, + { + "epoch": 0.08959123618718587, + "grad_norm": 0.369140625, + "learning_rate": 0.0019575911697288007, + "loss": 0.1143, + "step": 10321 + }, + { + "epoch": 0.08959991666739003, + "grad_norm": 0.08544921875, + "learning_rate": 0.0019575821408067996, + "loss": 0.1172, + "step": 10322 + }, + { + "epoch": 0.0896085971475942, + "grad_norm": 0.275390625, + "learning_rate": 0.0019575731109469597, + "loss": 0.1484, + "step": 10323 + }, + { + "epoch": 0.08961727762779836, + "grad_norm": 0.70703125, + "learning_rate": 0.001957564080149291, + "loss": 0.1182, + "step": 10324 + }, + { + "epoch": 0.08962595810800253, + "grad_norm": 0.5078125, + "learning_rate": 0.0019575550484138026, + "loss": 0.1328, + "step": 10325 + }, + { + "epoch": 0.0896346385882067, + "grad_norm": 0.59375, + "learning_rate": 0.0019575460157405054, + "loss": 0.1416, + "step": 10326 + }, + { + "epoch": 0.08964331906841086, + "grad_norm": 0.10693359375, + "learning_rate": 0.0019575369821294084, + "loss": 0.1069, + "step": 10327 + }, + { + "epoch": 0.08965199954861502, + "grad_norm": 0.451171875, + "learning_rate": 0.001957527947580522, + "loss": 0.124, + "step": 10328 + }, + { + "epoch": 0.08966068002881919, + "grad_norm": 0.326171875, + "learning_rate": 0.0019575189120938554, + "loss": 0.1035, + "step": 10329 + }, + { + "epoch": 0.08966936050902335, + "grad_norm": 0.76953125, + "learning_rate": 0.0019575098756694194, + "loss": 0.1406, + "step": 10330 + }, + { + "epoch": 0.08967804098922752, + "grad_norm": 0.11474609375, + "learning_rate": 0.001957500838307224, + "loss": 0.1836, + "step": 10331 + }, + { + "epoch": 0.08968672146943168, + "grad_norm": 0.2177734375, + "learning_rate": 0.001957491800007278, + "loss": 0.1328, + "step": 10332 + }, + { + 
"epoch": 0.08969540194963585, + "grad_norm": 0.734375, + "learning_rate": 0.001957482760769592, + "loss": 0.0908, + "step": 10333 + }, + { + "epoch": 0.08970408242984002, + "grad_norm": 0.25390625, + "learning_rate": 0.0019574737205941754, + "loss": 0.165, + "step": 10334 + }, + { + "epoch": 0.08971276291004418, + "grad_norm": 0.57421875, + "learning_rate": 0.0019574646794810386, + "loss": 0.1445, + "step": 10335 + }, + { + "epoch": 0.08972144339024835, + "grad_norm": 0.091796875, + "learning_rate": 0.0019574556374301914, + "loss": 0.1533, + "step": 10336 + }, + { + "epoch": 0.08973012387045251, + "grad_norm": 0.09912109375, + "learning_rate": 0.0019574465944416438, + "loss": 0.1221, + "step": 10337 + }, + { + "epoch": 0.08973880435065668, + "grad_norm": 0.65625, + "learning_rate": 0.0019574375505154048, + "loss": 0.1094, + "step": 10338 + }, + { + "epoch": 0.08974748483086084, + "grad_norm": 0.81640625, + "learning_rate": 0.0019574285056514857, + "loss": 0.1328, + "step": 10339 + }, + { + "epoch": 0.089756165311065, + "grad_norm": 0.2109375, + "learning_rate": 0.0019574194598498953, + "loss": 0.1455, + "step": 10340 + }, + { + "epoch": 0.08976484579126917, + "grad_norm": 0.08056640625, + "learning_rate": 0.001957410413110644, + "loss": 0.0933, + "step": 10341 + }, + { + "epoch": 0.08977352627147334, + "grad_norm": 0.1796875, + "learning_rate": 0.0019574013654337415, + "loss": 0.1348, + "step": 10342 + }, + { + "epoch": 0.0897822067516775, + "grad_norm": 0.15234375, + "learning_rate": 0.001957392316819198, + "loss": 0.1084, + "step": 10343 + }, + { + "epoch": 0.08979088723188167, + "grad_norm": 0.1875, + "learning_rate": 0.001957383267267023, + "loss": 0.0869, + "step": 10344 + }, + { + "epoch": 0.08979956771208583, + "grad_norm": 0.16796875, + "learning_rate": 0.0019573742167772264, + "loss": 0.1338, + "step": 10345 + }, + { + "epoch": 0.08980824819229, + "grad_norm": 0.345703125, + "learning_rate": 0.0019573651653498184, + "loss": 0.1963, + "step": 10346 + }, + { + "epoch": 0.08981692867249416, + "grad_norm": 0.57421875, + "learning_rate": 0.001957356112984809, + "loss": 0.1147, + "step": 10347 + }, + { + "epoch": 0.08982560915269833, + "grad_norm": 0.3828125, + "learning_rate": 0.001957347059682207, + "loss": 0.1533, + "step": 10348 + }, + { + "epoch": 0.08983428963290249, + "grad_norm": 0.3828125, + "learning_rate": 0.001957338005442024, + "loss": 0.1113, + "step": 10349 + }, + { + "epoch": 0.08984297011310666, + "grad_norm": 0.1337890625, + "learning_rate": 0.0019573289502642682, + "loss": 0.0962, + "step": 10350 + }, + { + "epoch": 0.08985165059331082, + "grad_norm": 0.10888671875, + "learning_rate": 0.001957319894148951, + "loss": 0.1572, + "step": 10351 + }, + { + "epoch": 0.08986033107351499, + "grad_norm": 0.353515625, + "learning_rate": 0.0019573108370960816, + "loss": 0.1445, + "step": 10352 + }, + { + "epoch": 0.08986901155371915, + "grad_norm": 0.1416015625, + "learning_rate": 0.0019573017791056696, + "loss": 0.0615, + "step": 10353 + }, + { + "epoch": 0.08987769203392332, + "grad_norm": 0.10205078125, + "learning_rate": 0.0019572927201777254, + "loss": 0.123, + "step": 10354 + }, + { + "epoch": 0.08988637251412748, + "grad_norm": 0.4453125, + "learning_rate": 0.0019572836603122586, + "loss": 0.0972, + "step": 10355 + }, + { + "epoch": 0.08989505299433165, + "grad_norm": 0.40625, + "learning_rate": 0.0019572745995092796, + "loss": 0.1021, + "step": 10356 + }, + { + "epoch": 0.08990373347453581, + "grad_norm": 0.55859375, + "learning_rate": 0.0019572655377687984, + "loss": 
0.1484, + "step": 10357 + }, + { + "epoch": 0.08991241395473998, + "grad_norm": 0.21875, + "learning_rate": 0.0019572564750908236, + "loss": 0.1123, + "step": 10358 + }, + { + "epoch": 0.08992109443494414, + "grad_norm": 0.08447265625, + "learning_rate": 0.0019572474114753664, + "loss": 0.1182, + "step": 10359 + }, + { + "epoch": 0.08992977491514831, + "grad_norm": 0.318359375, + "learning_rate": 0.001957238346922436, + "loss": 0.0952, + "step": 10360 + }, + { + "epoch": 0.08993845539535247, + "grad_norm": 0.2041015625, + "learning_rate": 0.001957229281432043, + "loss": 0.1543, + "step": 10361 + }, + { + "epoch": 0.08994713587555664, + "grad_norm": 0.2060546875, + "learning_rate": 0.0019572202150041967, + "loss": 0.1582, + "step": 10362 + }, + { + "epoch": 0.0899558163557608, + "grad_norm": 0.1484375, + "learning_rate": 0.0019572111476389075, + "loss": 0.1206, + "step": 10363 + }, + { + "epoch": 0.08996449683596497, + "grad_norm": 0.224609375, + "learning_rate": 0.001957202079336185, + "loss": 0.1235, + "step": 10364 + }, + { + "epoch": 0.08997317731616913, + "grad_norm": 0.3671875, + "learning_rate": 0.0019571930100960393, + "loss": 0.1025, + "step": 10365 + }, + { + "epoch": 0.0899818577963733, + "grad_norm": 0.1171875, + "learning_rate": 0.0019571839399184798, + "loss": 0.1514, + "step": 10366 + }, + { + "epoch": 0.08999053827657746, + "grad_norm": 0.4609375, + "learning_rate": 0.0019571748688035173, + "loss": 0.1387, + "step": 10367 + }, + { + "epoch": 0.08999921875678163, + "grad_norm": 0.6875, + "learning_rate": 0.001957165796751161, + "loss": 0.1377, + "step": 10368 + }, + { + "epoch": 0.0900078992369858, + "grad_norm": 0.3203125, + "learning_rate": 0.001957156723761421, + "loss": 0.1147, + "step": 10369 + }, + { + "epoch": 0.09001657971718996, + "grad_norm": 0.1552734375, + "learning_rate": 0.0019571476498343073, + "loss": 0.1006, + "step": 10370 + }, + { + "epoch": 0.09002526019739412, + "grad_norm": 0.1259765625, + "learning_rate": 0.00195713857496983, + "loss": 0.1855, + "step": 10371 + }, + { + "epoch": 0.09003394067759829, + "grad_norm": 0.357421875, + "learning_rate": 0.0019571294991679983, + "loss": 0.1484, + "step": 10372 + }, + { + "epoch": 0.09004262115780245, + "grad_norm": 0.376953125, + "learning_rate": 0.0019571204224288234, + "loss": 0.0977, + "step": 10373 + }, + { + "epoch": 0.09005130163800662, + "grad_norm": 0.98828125, + "learning_rate": 0.001957111344752314, + "loss": 0.1055, + "step": 10374 + }, + { + "epoch": 0.09005998211821079, + "grad_norm": 0.42578125, + "learning_rate": 0.0019571022661384807, + "loss": 0.1235, + "step": 10375 + }, + { + "epoch": 0.09006866259841495, + "grad_norm": 0.20703125, + "learning_rate": 0.0019570931865873333, + "loss": 0.1289, + "step": 10376 + }, + { + "epoch": 0.09007734307861912, + "grad_norm": 0.66796875, + "learning_rate": 0.0019570841060988814, + "loss": 0.1094, + "step": 10377 + }, + { + "epoch": 0.09008602355882328, + "grad_norm": 0.28125, + "learning_rate": 0.0019570750246731355, + "loss": 0.1475, + "step": 10378 + }, + { + "epoch": 0.09009470403902745, + "grad_norm": 0.734375, + "learning_rate": 0.001957065942310105, + "loss": 0.1377, + "step": 10379 + }, + { + "epoch": 0.09010338451923161, + "grad_norm": 0.66796875, + "learning_rate": 0.0019570568590098003, + "loss": 0.1279, + "step": 10380 + }, + { + "epoch": 0.09011206499943576, + "grad_norm": 0.451171875, + "learning_rate": 0.0019570477747722307, + "loss": 0.1084, + "step": 10381 + }, + { + "epoch": 0.09012074547963993, + "grad_norm": 0.19921875, + "learning_rate": 
0.001957038689597407, + "loss": 0.1104, + "step": 10382 + }, + { + "epoch": 0.09012942595984409, + "grad_norm": 0.80078125, + "learning_rate": 0.001957029603485338, + "loss": 0.1055, + "step": 10383 + }, + { + "epoch": 0.09013810644004826, + "grad_norm": 0.359375, + "learning_rate": 0.0019570205164360353, + "loss": 0.1011, + "step": 10384 + }, + { + "epoch": 0.09014678692025242, + "grad_norm": 0.341796875, + "learning_rate": 0.001957011428449507, + "loss": 0.207, + "step": 10385 + }, + { + "epoch": 0.09015546740045659, + "grad_norm": 0.10302734375, + "learning_rate": 0.001957002339525764, + "loss": 0.1045, + "step": 10386 + }, + { + "epoch": 0.09016414788066075, + "grad_norm": 0.88671875, + "learning_rate": 0.0019569932496648164, + "loss": 0.1387, + "step": 10387 + }, + { + "epoch": 0.09017282836086492, + "grad_norm": 1.0703125, + "learning_rate": 0.001956984158866674, + "loss": 0.1035, + "step": 10388 + }, + { + "epoch": 0.09018150884106908, + "grad_norm": 0.7578125, + "learning_rate": 0.001956975067131346, + "loss": 0.1143, + "step": 10389 + }, + { + "epoch": 0.09019018932127325, + "grad_norm": 0.2333984375, + "learning_rate": 0.0019569659744588432, + "loss": 0.1289, + "step": 10390 + }, + { + "epoch": 0.09019886980147741, + "grad_norm": 0.474609375, + "learning_rate": 0.0019569568808491754, + "loss": 0.1143, + "step": 10391 + }, + { + "epoch": 0.09020755028168158, + "grad_norm": 0.0869140625, + "learning_rate": 0.0019569477863023523, + "loss": 0.124, + "step": 10392 + }, + { + "epoch": 0.09021623076188574, + "grad_norm": 0.458984375, + "learning_rate": 0.0019569386908183843, + "loss": 0.1172, + "step": 10393 + }, + { + "epoch": 0.09022491124208991, + "grad_norm": 0.5078125, + "learning_rate": 0.0019569295943972806, + "loss": 0.1523, + "step": 10394 + }, + { + "epoch": 0.09023359172229407, + "grad_norm": 0.10302734375, + "learning_rate": 0.001956920497039052, + "loss": 0.1021, + "step": 10395 + }, + { + "epoch": 0.09024227220249824, + "grad_norm": 0.33984375, + "learning_rate": 0.0019569113987437073, + "loss": 0.1006, + "step": 10396 + }, + { + "epoch": 0.0902509526827024, + "grad_norm": 0.11962890625, + "learning_rate": 0.001956902299511258, + "loss": 0.1123, + "step": 10397 + }, + { + "epoch": 0.09025963316290657, + "grad_norm": 0.126953125, + "learning_rate": 0.0019568931993417128, + "loss": 0.2012, + "step": 10398 + }, + { + "epoch": 0.09026831364311073, + "grad_norm": 0.75, + "learning_rate": 0.0019568840982350823, + "loss": 0.1016, + "step": 10399 + }, + { + "epoch": 0.0902769941233149, + "grad_norm": 0.23046875, + "learning_rate": 0.001956874996191376, + "loss": 0.1357, + "step": 10400 + }, + { + "epoch": 0.09028567460351906, + "grad_norm": 0.6953125, + "learning_rate": 0.001956865893210604, + "loss": 0.127, + "step": 10401 + }, + { + "epoch": 0.09029435508372323, + "grad_norm": 0.73828125, + "learning_rate": 0.001956856789292777, + "loss": 0.1338, + "step": 10402 + }, + { + "epoch": 0.0903030355639274, + "grad_norm": 0.384765625, + "learning_rate": 0.0019568476844379033, + "loss": 0.126, + "step": 10403 + }, + { + "epoch": 0.09031171604413156, + "grad_norm": 0.4609375, + "learning_rate": 0.001956838578645994, + "loss": 0.1348, + "step": 10404 + }, + { + "epoch": 0.09032039652433572, + "grad_norm": 0.189453125, + "learning_rate": 0.0019568294719170595, + "loss": 0.1357, + "step": 10405 + }, + { + "epoch": 0.09032907700453989, + "grad_norm": 0.71875, + "learning_rate": 0.001956820364251109, + "loss": 0.1445, + "step": 10406 + }, + { + "epoch": 0.09033775748474406, + "grad_norm": 
0.67578125, + "learning_rate": 0.0019568112556481525, + "loss": 0.1201, + "step": 10407 + }, + { + "epoch": 0.09034643796494822, + "grad_norm": 0.546875, + "learning_rate": 0.0019568021461082, + "loss": 0.0991, + "step": 10408 + }, + { + "epoch": 0.09035511844515239, + "grad_norm": 0.4921875, + "learning_rate": 0.001956793035631262, + "loss": 0.0923, + "step": 10409 + }, + { + "epoch": 0.09036379892535655, + "grad_norm": 0.53515625, + "learning_rate": 0.0019567839242173473, + "loss": 0.127, + "step": 10410 + }, + { + "epoch": 0.09037247940556072, + "grad_norm": 1.015625, + "learning_rate": 0.001956774811866467, + "loss": 0.1211, + "step": 10411 + }, + { + "epoch": 0.09038115988576488, + "grad_norm": 0.0947265625, + "learning_rate": 0.0019567656985786308, + "loss": 0.1201, + "step": 10412 + }, + { + "epoch": 0.09038984036596905, + "grad_norm": 0.70703125, + "learning_rate": 0.001956756584353848, + "loss": 0.1069, + "step": 10413 + }, + { + "epoch": 0.09039852084617321, + "grad_norm": 0.4140625, + "learning_rate": 0.0019567474691921296, + "loss": 0.0996, + "step": 10414 + }, + { + "epoch": 0.09040720132637738, + "grad_norm": 0.1845703125, + "learning_rate": 0.0019567383530934845, + "loss": 0.1328, + "step": 10415 + }, + { + "epoch": 0.09041588180658154, + "grad_norm": 0.54296875, + "learning_rate": 0.0019567292360579236, + "loss": 0.1367, + "step": 10416 + }, + { + "epoch": 0.0904245622867857, + "grad_norm": 0.2109375, + "learning_rate": 0.0019567201180854562, + "loss": 0.1211, + "step": 10417 + }, + { + "epoch": 0.09043324276698987, + "grad_norm": 0.07861328125, + "learning_rate": 0.001956710999176093, + "loss": 0.1172, + "step": 10418 + }, + { + "epoch": 0.09044192324719404, + "grad_norm": 0.234375, + "learning_rate": 0.001956701879329843, + "loss": 0.1299, + "step": 10419 + }, + { + "epoch": 0.0904506037273982, + "grad_norm": 0.294921875, + "learning_rate": 0.001956692758546717, + "loss": 0.1641, + "step": 10420 + }, + { + "epoch": 0.09045928420760237, + "grad_norm": 0.1611328125, + "learning_rate": 0.001956683636826725, + "loss": 0.1562, + "step": 10421 + }, + { + "epoch": 0.09046796468780653, + "grad_norm": 0.890625, + "learning_rate": 0.0019566745141698758, + "loss": 0.1582, + "step": 10422 + }, + { + "epoch": 0.0904766451680107, + "grad_norm": 0.10302734375, + "learning_rate": 0.0019566653905761806, + "loss": 0.0928, + "step": 10423 + }, + { + "epoch": 0.09048532564821486, + "grad_norm": 0.54296875, + "learning_rate": 0.0019566562660456492, + "loss": 0.1367, + "step": 10424 + }, + { + "epoch": 0.09049400612841903, + "grad_norm": 0.828125, + "learning_rate": 0.0019566471405782914, + "loss": 0.1377, + "step": 10425 + }, + { + "epoch": 0.09050268660862319, + "grad_norm": 0.30078125, + "learning_rate": 0.001956638014174117, + "loss": 0.1777, + "step": 10426 + }, + { + "epoch": 0.09051136708882736, + "grad_norm": 0.15234375, + "learning_rate": 0.0019566288868331358, + "loss": 0.1138, + "step": 10427 + }, + { + "epoch": 0.09052004756903152, + "grad_norm": 0.50390625, + "learning_rate": 0.001956619758555359, + "loss": 0.127, + "step": 10428 + }, + { + "epoch": 0.09052872804923569, + "grad_norm": 0.25390625, + "learning_rate": 0.001956610629340795, + "loss": 0.2334, + "step": 10429 + }, + { + "epoch": 0.09053740852943985, + "grad_norm": 0.1787109375, + "learning_rate": 0.0019566014991894546, + "loss": 0.1143, + "step": 10430 + }, + { + "epoch": 0.09054608900964402, + "grad_norm": 0.2734375, + "learning_rate": 0.0019565923681013476, + "loss": 0.125, + "step": 10431 + }, + { + "epoch": 
0.09055476948984818, + "grad_norm": 0.400390625, + "learning_rate": 0.0019565832360764843, + "loss": 0.1089, + "step": 10432 + }, + { + "epoch": 0.09056344997005235, + "grad_norm": 0.166015625, + "learning_rate": 0.0019565741031148743, + "loss": 0.1494, + "step": 10433 + }, + { + "epoch": 0.09057213045025651, + "grad_norm": 0.11474609375, + "learning_rate": 0.001956564969216528, + "loss": 0.1504, + "step": 10434 + }, + { + "epoch": 0.09058081093046068, + "grad_norm": 0.2470703125, + "learning_rate": 0.0019565558343814546, + "loss": 0.1191, + "step": 10435 + }, + { + "epoch": 0.09058949141066484, + "grad_norm": 0.375, + "learning_rate": 0.001956546698609665, + "loss": 0.1162, + "step": 10436 + }, + { + "epoch": 0.09059817189086901, + "grad_norm": 0.255859375, + "learning_rate": 0.0019565375619011688, + "loss": 0.1226, + "step": 10437 + }, + { + "epoch": 0.09060685237107317, + "grad_norm": 0.373046875, + "learning_rate": 0.0019565284242559757, + "loss": 0.1562, + "step": 10438 + }, + { + "epoch": 0.09061553285127734, + "grad_norm": 0.08349609375, + "learning_rate": 0.0019565192856740962, + "loss": 0.1465, + "step": 10439 + }, + { + "epoch": 0.0906242133314815, + "grad_norm": 0.416015625, + "learning_rate": 0.00195651014615554, + "loss": 0.1187, + "step": 10440 + }, + { + "epoch": 0.09063289381168567, + "grad_norm": 0.7265625, + "learning_rate": 0.001956501005700317, + "loss": 0.1357, + "step": 10441 + }, + { + "epoch": 0.09064157429188983, + "grad_norm": 0.359375, + "learning_rate": 0.001956491864308438, + "loss": 0.1182, + "step": 10442 + }, + { + "epoch": 0.09065025477209399, + "grad_norm": 0.423828125, + "learning_rate": 0.001956482721979912, + "loss": 0.1143, + "step": 10443 + }, + { + "epoch": 0.09065893525229815, + "grad_norm": 1.078125, + "learning_rate": 0.001956473578714749, + "loss": 0.2637, + "step": 10444 + }, + { + "epoch": 0.09066761573250232, + "grad_norm": 0.306640625, + "learning_rate": 0.0019564644345129597, + "loss": 0.166, + "step": 10445 + }, + { + "epoch": 0.09067629621270648, + "grad_norm": 0.07763671875, + "learning_rate": 0.0019564552893745536, + "loss": 0.0874, + "step": 10446 + }, + { + "epoch": 0.09068497669291065, + "grad_norm": 0.169921875, + "learning_rate": 0.001956446143299541, + "loss": 0.0918, + "step": 10447 + }, + { + "epoch": 0.09069365717311481, + "grad_norm": 0.0869140625, + "learning_rate": 0.001956436996287932, + "loss": 0.1079, + "step": 10448 + }, + { + "epoch": 0.09070233765331898, + "grad_norm": 1.421875, + "learning_rate": 0.0019564278483397363, + "loss": 0.1445, + "step": 10449 + }, + { + "epoch": 0.09071101813352314, + "grad_norm": 1.515625, + "learning_rate": 0.0019564186994549637, + "loss": 0.209, + "step": 10450 + }, + { + "epoch": 0.0907196986137273, + "grad_norm": 0.2734375, + "learning_rate": 0.0019564095496336243, + "loss": 0.1084, + "step": 10451 + }, + { + "epoch": 0.09072837909393147, + "grad_norm": 0.1845703125, + "learning_rate": 0.001956400398875729, + "loss": 0.1572, + "step": 10452 + }, + { + "epoch": 0.09073705957413564, + "grad_norm": 0.12451171875, + "learning_rate": 0.0019563912471812868, + "loss": 0.1328, + "step": 10453 + }, + { + "epoch": 0.0907457400543398, + "grad_norm": 0.189453125, + "learning_rate": 0.0019563820945503077, + "loss": 0.125, + "step": 10454 + }, + { + "epoch": 0.09075442053454397, + "grad_norm": 0.099609375, + "learning_rate": 0.001956372940982802, + "loss": 0.1533, + "step": 10455 + }, + { + "epoch": 0.09076310101474813, + "grad_norm": 0.2373046875, + "learning_rate": 0.00195636378647878, + "loss": 
0.082, + "step": 10456 + }, + { + "epoch": 0.0907717814949523, + "grad_norm": 0.15234375, + "learning_rate": 0.0019563546310382516, + "loss": 0.1406, + "step": 10457 + }, + { + "epoch": 0.09078046197515646, + "grad_norm": 0.326171875, + "learning_rate": 0.001956345474661226, + "loss": 0.1177, + "step": 10458 + }, + { + "epoch": 0.09078914245536063, + "grad_norm": 0.53515625, + "learning_rate": 0.0019563363173477144, + "loss": 0.0977, + "step": 10459 + }, + { + "epoch": 0.09079782293556479, + "grad_norm": 0.08984375, + "learning_rate": 0.0019563271590977258, + "loss": 0.1484, + "step": 10460 + }, + { + "epoch": 0.09080650341576896, + "grad_norm": 0.10546875, + "learning_rate": 0.001956317999911271, + "loss": 0.1484, + "step": 10461 + }, + { + "epoch": 0.09081518389597312, + "grad_norm": 0.423828125, + "learning_rate": 0.00195630883978836, + "loss": 0.166, + "step": 10462 + }, + { + "epoch": 0.09082386437617729, + "grad_norm": 0.482421875, + "learning_rate": 0.001956299678729002, + "loss": 0.0942, + "step": 10463 + }, + { + "epoch": 0.09083254485638145, + "grad_norm": 0.46484375, + "learning_rate": 0.0019562905167332077, + "loss": 0.1064, + "step": 10464 + }, + { + "epoch": 0.09084122533658562, + "grad_norm": 0.1533203125, + "learning_rate": 0.001956281353800987, + "loss": 0.1387, + "step": 10465 + }, + { + "epoch": 0.09084990581678978, + "grad_norm": 0.71875, + "learning_rate": 0.00195627218993235, + "loss": 0.1504, + "step": 10466 + }, + { + "epoch": 0.09085858629699395, + "grad_norm": 0.2109375, + "learning_rate": 0.0019562630251273062, + "loss": 0.1455, + "step": 10467 + }, + { + "epoch": 0.09086726677719811, + "grad_norm": 0.267578125, + "learning_rate": 0.0019562538593858665, + "loss": 0.1689, + "step": 10468 + }, + { + "epoch": 0.09087594725740228, + "grad_norm": 0.11328125, + "learning_rate": 0.00195624469270804, + "loss": 0.1562, + "step": 10469 + }, + { + "epoch": 0.09088462773760644, + "grad_norm": 0.349609375, + "learning_rate": 0.0019562355250938375, + "loss": 0.123, + "step": 10470 + }, + { + "epoch": 0.09089330821781061, + "grad_norm": 0.11865234375, + "learning_rate": 0.0019562263565432686, + "loss": 0.1079, + "step": 10471 + }, + { + "epoch": 0.09090198869801477, + "grad_norm": 0.640625, + "learning_rate": 0.0019562171870563436, + "loss": 0.1289, + "step": 10472 + }, + { + "epoch": 0.09091066917821894, + "grad_norm": 0.6484375, + "learning_rate": 0.001956208016633072, + "loss": 0.1152, + "step": 10473 + }, + { + "epoch": 0.0909193496584231, + "grad_norm": 0.10498046875, + "learning_rate": 0.0019561988452734646, + "loss": 0.1084, + "step": 10474 + }, + { + "epoch": 0.09092803013862727, + "grad_norm": 0.1474609375, + "learning_rate": 0.0019561896729775306, + "loss": 0.0986, + "step": 10475 + }, + { + "epoch": 0.09093671061883143, + "grad_norm": 0.69921875, + "learning_rate": 0.001956180499745281, + "loss": 0.1172, + "step": 10476 + }, + { + "epoch": 0.0909453910990356, + "grad_norm": 0.1474609375, + "learning_rate": 0.0019561713255767245, + "loss": 0.1904, + "step": 10477 + }, + { + "epoch": 0.09095407157923976, + "grad_norm": 0.22265625, + "learning_rate": 0.001956162150471872, + "loss": 0.1436, + "step": 10478 + }, + { + "epoch": 0.09096275205944393, + "grad_norm": 0.5234375, + "learning_rate": 0.0019561529744307336, + "loss": 0.1221, + "step": 10479 + }, + { + "epoch": 0.0909714325396481, + "grad_norm": 0.26953125, + "learning_rate": 0.0019561437974533196, + "loss": 0.1475, + "step": 10480 + }, + { + "epoch": 0.09098011301985226, + "grad_norm": 0.21875, + "learning_rate": 
0.001956134619539639, + "loss": 0.0962, + "step": 10481 + }, + { + "epoch": 0.09098879350005643, + "grad_norm": 0.447265625, + "learning_rate": 0.001956125440689703, + "loss": 0.1162, + "step": 10482 + }, + { + "epoch": 0.09099747398026059, + "grad_norm": 0.2080078125, + "learning_rate": 0.001956116260903521, + "loss": 0.1445, + "step": 10483 + }, + { + "epoch": 0.09100615446046476, + "grad_norm": 0.83984375, + "learning_rate": 0.0019561070801811025, + "loss": 0.1387, + "step": 10484 + }, + { + "epoch": 0.09101483494066892, + "grad_norm": 0.11865234375, + "learning_rate": 0.0019560978985224587, + "loss": 0.1572, + "step": 10485 + }, + { + "epoch": 0.09102351542087309, + "grad_norm": 0.333984375, + "learning_rate": 0.001956088715927599, + "loss": 0.0952, + "step": 10486 + }, + { + "epoch": 0.09103219590107725, + "grad_norm": 0.474609375, + "learning_rate": 0.0019560795323965334, + "loss": 0.1475, + "step": 10487 + }, + { + "epoch": 0.09104087638128142, + "grad_norm": 0.61328125, + "learning_rate": 0.0019560703479292725, + "loss": 0.1221, + "step": 10488 + }, + { + "epoch": 0.09104955686148558, + "grad_norm": 0.357421875, + "learning_rate": 0.0019560611625258255, + "loss": 0.1191, + "step": 10489 + }, + { + "epoch": 0.09105823734168975, + "grad_norm": 0.35546875, + "learning_rate": 0.001956051976186203, + "loss": 0.1235, + "step": 10490 + }, + { + "epoch": 0.09106691782189391, + "grad_norm": 0.67578125, + "learning_rate": 0.0019560427889104147, + "loss": 0.1582, + "step": 10491 + }, + { + "epoch": 0.09107559830209808, + "grad_norm": 0.38671875, + "learning_rate": 0.0019560336006984713, + "loss": 0.0947, + "step": 10492 + }, + { + "epoch": 0.09108427878230224, + "grad_norm": 0.30859375, + "learning_rate": 0.001956024411550382, + "loss": 0.127, + "step": 10493 + }, + { + "epoch": 0.0910929592625064, + "grad_norm": 0.1435546875, + "learning_rate": 0.0019560152214661577, + "loss": 0.125, + "step": 10494 + }, + { + "epoch": 0.09110163974271057, + "grad_norm": 0.10595703125, + "learning_rate": 0.001956006030445808, + "loss": 0.1055, + "step": 10495 + }, + { + "epoch": 0.09111032022291474, + "grad_norm": 0.76953125, + "learning_rate": 0.001955996838489343, + "loss": 0.1377, + "step": 10496 + }, + { + "epoch": 0.0911190007031189, + "grad_norm": 0.322265625, + "learning_rate": 0.001955987645596772, + "loss": 0.1089, + "step": 10497 + }, + { + "epoch": 0.09112768118332307, + "grad_norm": 0.166015625, + "learning_rate": 0.001955978451768106, + "loss": 0.1133, + "step": 10498 + }, + { + "epoch": 0.09113636166352723, + "grad_norm": 0.2490234375, + "learning_rate": 0.0019559692570033553, + "loss": 0.1006, + "step": 10499 + }, + { + "epoch": 0.0911450421437314, + "grad_norm": 0.296875, + "learning_rate": 0.001955960061302529, + "loss": 0.1641, + "step": 10500 + }, + { + "epoch": 0.09115372262393556, + "grad_norm": 0.09619140625, + "learning_rate": 0.0019559508646656383, + "loss": 0.1025, + "step": 10501 + }, + { + "epoch": 0.09116240310413973, + "grad_norm": 0.7421875, + "learning_rate": 0.001955941667092692, + "loss": 0.1465, + "step": 10502 + }, + { + "epoch": 0.09117108358434389, + "grad_norm": 0.0654296875, + "learning_rate": 0.0019559324685837014, + "loss": 0.1006, + "step": 10503 + }, + { + "epoch": 0.09117976406454804, + "grad_norm": 0.80078125, + "learning_rate": 0.0019559232691386754, + "loss": 0.1216, + "step": 10504 + }, + { + "epoch": 0.09118844454475221, + "grad_norm": 0.412109375, + "learning_rate": 0.0019559140687576246, + "loss": 0.1011, + "step": 10505 + }, + { + "epoch": 
0.09119712502495637, + "grad_norm": 0.482421875, + "learning_rate": 0.0019559048674405587, + "loss": 0.0869, + "step": 10506 + }, + { + "epoch": 0.09120580550516054, + "grad_norm": 0.373046875, + "learning_rate": 0.001955895665187489, + "loss": 0.1025, + "step": 10507 + }, + { + "epoch": 0.0912144859853647, + "grad_norm": 0.1435546875, + "learning_rate": 0.001955886461998424, + "loss": 0.1035, + "step": 10508 + }, + { + "epoch": 0.09122316646556887, + "grad_norm": 0.318359375, + "learning_rate": 0.001955877257873374, + "loss": 0.1011, + "step": 10509 + }, + { + "epoch": 0.09123184694577303, + "grad_norm": 0.4921875, + "learning_rate": 0.0019558680528123503, + "loss": 0.167, + "step": 10510 + }, + { + "epoch": 0.0912405274259772, + "grad_norm": 0.14453125, + "learning_rate": 0.001955858846815362, + "loss": 0.1289, + "step": 10511 + }, + { + "epoch": 0.09124920790618136, + "grad_norm": 0.3671875, + "learning_rate": 0.001955849639882419, + "loss": 0.1426, + "step": 10512 + }, + { + "epoch": 0.09125788838638553, + "grad_norm": 0.412109375, + "learning_rate": 0.001955840432013532, + "loss": 0.0796, + "step": 10513 + }, + { + "epoch": 0.0912665688665897, + "grad_norm": 0.5625, + "learning_rate": 0.001955831223208711, + "loss": 0.1152, + "step": 10514 + }, + { + "epoch": 0.09127524934679386, + "grad_norm": 0.138671875, + "learning_rate": 0.0019558220134679653, + "loss": 0.1245, + "step": 10515 + }, + { + "epoch": 0.09128392982699803, + "grad_norm": 0.52734375, + "learning_rate": 0.001955812802791306, + "loss": 0.1162, + "step": 10516 + }, + { + "epoch": 0.09129261030720219, + "grad_norm": 0.62109375, + "learning_rate": 0.001955803591178742, + "loss": 0.1318, + "step": 10517 + }, + { + "epoch": 0.09130129078740636, + "grad_norm": 1.2265625, + "learning_rate": 0.0019557943786302845, + "loss": 0.1328, + "step": 10518 + }, + { + "epoch": 0.09130997126761052, + "grad_norm": 0.1552734375, + "learning_rate": 0.001955785165145943, + "loss": 0.0918, + "step": 10519 + }, + { + "epoch": 0.09131865174781469, + "grad_norm": 0.08447265625, + "learning_rate": 0.0019557759507257276, + "loss": 0.1357, + "step": 10520 + }, + { + "epoch": 0.09132733222801885, + "grad_norm": 0.1875, + "learning_rate": 0.0019557667353696487, + "loss": 0.1152, + "step": 10521 + }, + { + "epoch": 0.09133601270822302, + "grad_norm": 0.1357421875, + "learning_rate": 0.0019557575190777164, + "loss": 0.1328, + "step": 10522 + }, + { + "epoch": 0.09134469318842718, + "grad_norm": 0.6015625, + "learning_rate": 0.0019557483018499402, + "loss": 0.0972, + "step": 10523 + }, + { + "epoch": 0.09135337366863135, + "grad_norm": 0.1552734375, + "learning_rate": 0.0019557390836863306, + "loss": 0.1279, + "step": 10524 + }, + { + "epoch": 0.09136205414883551, + "grad_norm": 0.58203125, + "learning_rate": 0.0019557298645868975, + "loss": 0.1123, + "step": 10525 + }, + { + "epoch": 0.09137073462903968, + "grad_norm": 1.2578125, + "learning_rate": 0.001955720644551651, + "loss": 0.2393, + "step": 10526 + }, + { + "epoch": 0.09137941510924384, + "grad_norm": 0.33984375, + "learning_rate": 0.0019557114235806013, + "loss": 0.1016, + "step": 10527 + }, + { + "epoch": 0.091388095589448, + "grad_norm": 0.474609375, + "learning_rate": 0.0019557022016737586, + "loss": 0.1357, + "step": 10528 + }, + { + "epoch": 0.09139677606965217, + "grad_norm": 0.2314453125, + "learning_rate": 0.0019556929788311327, + "loss": 0.1504, + "step": 10529 + }, + { + "epoch": 0.09140545654985634, + "grad_norm": 0.54296875, + "learning_rate": 0.001955683755052734, + "loss": 0.1514, + 
"step": 10530 + }, + { + "epoch": 0.0914141370300605, + "grad_norm": 0.458984375, + "learning_rate": 0.0019556745303385727, + "loss": 0.0962, + "step": 10531 + }, + { + "epoch": 0.09142281751026467, + "grad_norm": 0.69140625, + "learning_rate": 0.0019556653046886576, + "loss": 0.1211, + "step": 10532 + }, + { + "epoch": 0.09143149799046883, + "grad_norm": 0.703125, + "learning_rate": 0.0019556560781030007, + "loss": 0.1406, + "step": 10533 + }, + { + "epoch": 0.091440178470673, + "grad_norm": 0.427734375, + "learning_rate": 0.001955646850581611, + "loss": 0.1201, + "step": 10534 + }, + { + "epoch": 0.09144885895087716, + "grad_norm": 0.62890625, + "learning_rate": 0.0019556376221244983, + "loss": 0.1084, + "step": 10535 + }, + { + "epoch": 0.09145753943108133, + "grad_norm": 0.2490234375, + "learning_rate": 0.0019556283927316736, + "loss": 0.106, + "step": 10536 + }, + { + "epoch": 0.09146621991128549, + "grad_norm": 0.0849609375, + "learning_rate": 0.001955619162403146, + "loss": 0.0898, + "step": 10537 + }, + { + "epoch": 0.09147490039148966, + "grad_norm": 0.3046875, + "learning_rate": 0.0019556099311389265, + "loss": 0.1201, + "step": 10538 + }, + { + "epoch": 0.09148358087169382, + "grad_norm": 0.1640625, + "learning_rate": 0.0019556006989390248, + "loss": 0.1064, + "step": 10539 + }, + { + "epoch": 0.09149226135189799, + "grad_norm": 0.216796875, + "learning_rate": 0.0019555914658034514, + "loss": 0.1074, + "step": 10540 + }, + { + "epoch": 0.09150094183210215, + "grad_norm": 0.1142578125, + "learning_rate": 0.0019555822317322154, + "loss": 0.1445, + "step": 10541 + }, + { + "epoch": 0.09150962231230632, + "grad_norm": 0.25390625, + "learning_rate": 0.0019555729967253277, + "loss": 0.1182, + "step": 10542 + }, + { + "epoch": 0.09151830279251048, + "grad_norm": 0.0830078125, + "learning_rate": 0.0019555637607827987, + "loss": 0.1309, + "step": 10543 + }, + { + "epoch": 0.09152698327271465, + "grad_norm": 0.482421875, + "learning_rate": 0.0019555545239046375, + "loss": 0.1416, + "step": 10544 + }, + { + "epoch": 0.09153566375291881, + "grad_norm": 0.625, + "learning_rate": 0.001955545286090855, + "loss": 0.1484, + "step": 10545 + }, + { + "epoch": 0.09154434423312298, + "grad_norm": 0.20703125, + "learning_rate": 0.001955536047341461, + "loss": 0.1001, + "step": 10546 + }, + { + "epoch": 0.09155302471332714, + "grad_norm": 0.1611328125, + "learning_rate": 0.0019555268076564655, + "loss": 0.1118, + "step": 10547 + }, + { + "epoch": 0.09156170519353131, + "grad_norm": 0.162109375, + "learning_rate": 0.001955517567035879, + "loss": 0.1572, + "step": 10548 + }, + { + "epoch": 0.09157038567373547, + "grad_norm": 0.17578125, + "learning_rate": 0.0019555083254797112, + "loss": 0.1348, + "step": 10549 + }, + { + "epoch": 0.09157906615393964, + "grad_norm": 0.462890625, + "learning_rate": 0.0019554990829879723, + "loss": 0.1104, + "step": 10550 + }, + { + "epoch": 0.0915877466341438, + "grad_norm": 0.375, + "learning_rate": 0.001955489839560672, + "loss": 0.1074, + "step": 10551 + }, + { + "epoch": 0.09159642711434797, + "grad_norm": 0.1474609375, + "learning_rate": 0.0019554805951978215, + "loss": 0.1104, + "step": 10552 + }, + { + "epoch": 0.09160510759455213, + "grad_norm": 0.150390625, + "learning_rate": 0.0019554713498994305, + "loss": 0.1309, + "step": 10553 + }, + { + "epoch": 0.0916137880747563, + "grad_norm": 0.126953125, + "learning_rate": 0.0019554621036655083, + "loss": 0.0996, + "step": 10554 + }, + { + "epoch": 0.09162246855496047, + "grad_norm": 0.06689453125, + "learning_rate": 
0.001955452856496066, + "loss": 0.123, + "step": 10555 + }, + { + "epoch": 0.09163114903516463, + "grad_norm": 0.349609375, + "learning_rate": 0.0019554436083911125, + "loss": 0.1191, + "step": 10556 + }, + { + "epoch": 0.0916398295153688, + "grad_norm": 0.150390625, + "learning_rate": 0.0019554343593506597, + "loss": 0.1338, + "step": 10557 + }, + { + "epoch": 0.09164850999557296, + "grad_norm": 0.60546875, + "learning_rate": 0.001955425109374716, + "loss": 0.0977, + "step": 10558 + }, + { + "epoch": 0.09165719047577713, + "grad_norm": 0.5703125, + "learning_rate": 0.001955415858463293, + "loss": 0.1182, + "step": 10559 + }, + { + "epoch": 0.09166587095598129, + "grad_norm": 0.265625, + "learning_rate": 0.0019554066066163997, + "loss": 0.1387, + "step": 10560 + }, + { + "epoch": 0.09167455143618546, + "grad_norm": 0.171875, + "learning_rate": 0.0019553973538340465, + "loss": 0.1167, + "step": 10561 + }, + { + "epoch": 0.09168323191638962, + "grad_norm": 0.103515625, + "learning_rate": 0.0019553881001162436, + "loss": 0.0859, + "step": 10562 + }, + { + "epoch": 0.09169191239659379, + "grad_norm": 0.1669921875, + "learning_rate": 0.001955378845463001, + "loss": 0.1152, + "step": 10563 + }, + { + "epoch": 0.09170059287679795, + "grad_norm": 0.326171875, + "learning_rate": 0.0019553695898743293, + "loss": 0.1201, + "step": 10564 + }, + { + "epoch": 0.09170927335700212, + "grad_norm": 0.51953125, + "learning_rate": 0.001955360333350238, + "loss": 0.1211, + "step": 10565 + }, + { + "epoch": 0.09171795383720627, + "grad_norm": 0.490234375, + "learning_rate": 0.001955351075890738, + "loss": 0.1318, + "step": 10566 + }, + { + "epoch": 0.09172663431741043, + "grad_norm": 1.171875, + "learning_rate": 0.001955341817495838, + "loss": 0.1797, + "step": 10567 + }, + { + "epoch": 0.0917353147976146, + "grad_norm": 0.103515625, + "learning_rate": 0.0019553325581655495, + "loss": 0.0918, + "step": 10568 + }, + { + "epoch": 0.09174399527781876, + "grad_norm": 0.169921875, + "learning_rate": 0.001955323297899883, + "loss": 0.1016, + "step": 10569 + }, + { + "epoch": 0.09175267575802293, + "grad_norm": 0.0693359375, + "learning_rate": 0.0019553140366988467, + "loss": 0.1011, + "step": 10570 + }, + { + "epoch": 0.0917613562382271, + "grad_norm": 0.69921875, + "learning_rate": 0.001955304774562452, + "loss": 0.0889, + "step": 10571 + }, + { + "epoch": 0.09177003671843126, + "grad_norm": 0.150390625, + "learning_rate": 0.0019552955114907093, + "loss": 0.1416, + "step": 10572 + }, + { + "epoch": 0.09177871719863542, + "grad_norm": 0.095703125, + "learning_rate": 0.001955286247483628, + "loss": 0.1406, + "step": 10573 + }, + { + "epoch": 0.09178739767883959, + "grad_norm": 0.2236328125, + "learning_rate": 0.001955276982541218, + "loss": 0.1147, + "step": 10574 + }, + { + "epoch": 0.09179607815904375, + "grad_norm": 0.1748046875, + "learning_rate": 0.0019552677166634904, + "loss": 0.1436, + "step": 10575 + }, + { + "epoch": 0.09180475863924792, + "grad_norm": 0.1513671875, + "learning_rate": 0.0019552584498504547, + "loss": 0.1934, + "step": 10576 + }, + { + "epoch": 0.09181343911945208, + "grad_norm": 0.486328125, + "learning_rate": 0.0019552491821021216, + "loss": 0.1631, + "step": 10577 + }, + { + "epoch": 0.09182211959965625, + "grad_norm": 0.171875, + "learning_rate": 0.0019552399134185007, + "loss": 0.1406, + "step": 10578 + }, + { + "epoch": 0.09183080007986041, + "grad_norm": 0.3046875, + "learning_rate": 0.0019552306437996022, + "loss": 0.1426, + "step": 10579 + }, + { + "epoch": 0.09183948056006458, + 
"grad_norm": 0.205078125, + "learning_rate": 0.0019552213732454363, + "loss": 0.0933, + "step": 10580 + }, + { + "epoch": 0.09184816104026874, + "grad_norm": 0.7578125, + "learning_rate": 0.0019552121017560133, + "loss": 0.1182, + "step": 10581 + }, + { + "epoch": 0.09185684152047291, + "grad_norm": 0.1962890625, + "learning_rate": 0.0019552028293313433, + "loss": 0.123, + "step": 10582 + }, + { + "epoch": 0.09186552200067707, + "grad_norm": 1.2734375, + "learning_rate": 0.001955193555971436, + "loss": 0.1562, + "step": 10583 + }, + { + "epoch": 0.09187420248088124, + "grad_norm": 0.3359375, + "learning_rate": 0.001955184281676302, + "loss": 0.2246, + "step": 10584 + }, + { + "epoch": 0.0918828829610854, + "grad_norm": 0.1533203125, + "learning_rate": 0.0019551750064459514, + "loss": 0.0952, + "step": 10585 + }, + { + "epoch": 0.09189156344128957, + "grad_norm": 0.302734375, + "learning_rate": 0.001955165730280394, + "loss": 0.1289, + "step": 10586 + }, + { + "epoch": 0.09190024392149373, + "grad_norm": 0.97265625, + "learning_rate": 0.0019551564531796405, + "loss": 0.1699, + "step": 10587 + }, + { + "epoch": 0.0919089244016979, + "grad_norm": 0.166015625, + "learning_rate": 0.0019551471751437007, + "loss": 0.1738, + "step": 10588 + }, + { + "epoch": 0.09191760488190207, + "grad_norm": 0.1767578125, + "learning_rate": 0.0019551378961725845, + "loss": 0.1055, + "step": 10589 + }, + { + "epoch": 0.09192628536210623, + "grad_norm": 0.408203125, + "learning_rate": 0.0019551286162663027, + "loss": 0.1299, + "step": 10590 + }, + { + "epoch": 0.0919349658423104, + "grad_norm": 0.12353515625, + "learning_rate": 0.001955119335424865, + "loss": 0.1069, + "step": 10591 + }, + { + "epoch": 0.09194364632251456, + "grad_norm": 0.212890625, + "learning_rate": 0.0019551100536482815, + "loss": 0.1016, + "step": 10592 + }, + { + "epoch": 0.09195232680271873, + "grad_norm": 0.546875, + "learning_rate": 0.001955100770936563, + "loss": 0.1138, + "step": 10593 + }, + { + "epoch": 0.09196100728292289, + "grad_norm": 2.453125, + "learning_rate": 0.0019550914872897185, + "loss": 0.1943, + "step": 10594 + }, + { + "epoch": 0.09196968776312706, + "grad_norm": 0.83203125, + "learning_rate": 0.001955082202707759, + "loss": 0.1602, + "step": 10595 + }, + { + "epoch": 0.09197836824333122, + "grad_norm": 0.44140625, + "learning_rate": 0.0019550729171906945, + "loss": 0.1138, + "step": 10596 + }, + { + "epoch": 0.09198704872353539, + "grad_norm": 0.61328125, + "learning_rate": 0.001955063630738535, + "loss": 0.1582, + "step": 10597 + }, + { + "epoch": 0.09199572920373955, + "grad_norm": 0.408203125, + "learning_rate": 0.001955054343351291, + "loss": 0.1211, + "step": 10598 + }, + { + "epoch": 0.09200440968394372, + "grad_norm": 0.55078125, + "learning_rate": 0.001955045055028972, + "loss": 0.1465, + "step": 10599 + }, + { + "epoch": 0.09201309016414788, + "grad_norm": 0.1337890625, + "learning_rate": 0.0019550357657715893, + "loss": 0.1011, + "step": 10600 + }, + { + "epoch": 0.09202177064435205, + "grad_norm": 0.333984375, + "learning_rate": 0.001955026475579152, + "loss": 0.1182, + "step": 10601 + }, + { + "epoch": 0.09203045112455621, + "grad_norm": 0.0703125, + "learning_rate": 0.0019550171844516704, + "loss": 0.1191, + "step": 10602 + }, + { + "epoch": 0.09203913160476038, + "grad_norm": 0.3984375, + "learning_rate": 0.0019550078923891547, + "loss": 0.123, + "step": 10603 + }, + { + "epoch": 0.09204781208496454, + "grad_norm": 0.2353515625, + "learning_rate": 0.0019549985993916157, + "loss": 0.1162, + "step": 10604 + 
}, + { + "epoch": 0.09205649256516871, + "grad_norm": 0.765625, + "learning_rate": 0.001954989305459063, + "loss": 0.1328, + "step": 10605 + }, + { + "epoch": 0.09206517304537287, + "grad_norm": 0.357421875, + "learning_rate": 0.0019549800105915067, + "loss": 0.125, + "step": 10606 + }, + { + "epoch": 0.09207385352557704, + "grad_norm": 0.18359375, + "learning_rate": 0.0019549707147889566, + "loss": 0.1621, + "step": 10607 + }, + { + "epoch": 0.0920825340057812, + "grad_norm": 0.078125, + "learning_rate": 0.001954961418051424, + "loss": 0.0767, + "step": 10608 + }, + { + "epoch": 0.09209121448598537, + "grad_norm": 0.85546875, + "learning_rate": 0.001954952120378918, + "loss": 0.1074, + "step": 10609 + }, + { + "epoch": 0.09209989496618953, + "grad_norm": 0.203125, + "learning_rate": 0.00195494282177145, + "loss": 0.1084, + "step": 10610 + }, + { + "epoch": 0.0921085754463937, + "grad_norm": 0.224609375, + "learning_rate": 0.001954933522229029, + "loss": 0.1064, + "step": 10611 + }, + { + "epoch": 0.09211725592659786, + "grad_norm": 1.375, + "learning_rate": 0.001954924221751665, + "loss": 0.1387, + "step": 10612 + }, + { + "epoch": 0.09212593640680203, + "grad_norm": 0.376953125, + "learning_rate": 0.0019549149203393693, + "loss": 0.1758, + "step": 10613 + }, + { + "epoch": 0.0921346168870062, + "grad_norm": 0.263671875, + "learning_rate": 0.001954905617992151, + "loss": 0.1206, + "step": 10614 + }, + { + "epoch": 0.09214329736721036, + "grad_norm": 0.146484375, + "learning_rate": 0.0019548963147100217, + "loss": 0.106, + "step": 10615 + }, + { + "epoch": 0.09215197784741452, + "grad_norm": 0.2158203125, + "learning_rate": 0.00195488701049299, + "loss": 0.1221, + "step": 10616 + }, + { + "epoch": 0.09216065832761869, + "grad_norm": 0.375, + "learning_rate": 0.0019548777053410666, + "loss": 0.1094, + "step": 10617 + }, + { + "epoch": 0.09216933880782285, + "grad_norm": 0.23828125, + "learning_rate": 0.001954868399254262, + "loss": 0.127, + "step": 10618 + }, + { + "epoch": 0.09217801928802702, + "grad_norm": 0.5546875, + "learning_rate": 0.001954859092232586, + "loss": 0.1104, + "step": 10619 + }, + { + "epoch": 0.09218669976823118, + "grad_norm": 0.34375, + "learning_rate": 0.001954849784276049, + "loss": 0.0869, + "step": 10620 + }, + { + "epoch": 0.09219538024843535, + "grad_norm": 0.625, + "learning_rate": 0.001954840475384661, + "loss": 0.1387, + "step": 10621 + }, + { + "epoch": 0.09220406072863951, + "grad_norm": 0.12158203125, + "learning_rate": 0.0019548311655584326, + "loss": 0.1533, + "step": 10622 + }, + { + "epoch": 0.09221274120884368, + "grad_norm": 0.296875, + "learning_rate": 0.0019548218547973733, + "loss": 0.1289, + "step": 10623 + }, + { + "epoch": 0.09222142168904784, + "grad_norm": 0.06591796875, + "learning_rate": 0.001954812543101494, + "loss": 0.1045, + "step": 10624 + }, + { + "epoch": 0.09223010216925201, + "grad_norm": 0.2177734375, + "learning_rate": 0.0019548032304708045, + "loss": 0.1064, + "step": 10625 + }, + { + "epoch": 0.09223878264945617, + "grad_norm": 0.150390625, + "learning_rate": 0.001954793916905315, + "loss": 0.1357, + "step": 10626 + }, + { + "epoch": 0.09224746312966033, + "grad_norm": 0.7109375, + "learning_rate": 0.001954784602405036, + "loss": 0.1406, + "step": 10627 + }, + { + "epoch": 0.09225614360986449, + "grad_norm": 0.09130859375, + "learning_rate": 0.001954775286969977, + "loss": 0.1069, + "step": 10628 + }, + { + "epoch": 0.09226482409006866, + "grad_norm": 0.640625, + "learning_rate": 0.0019547659706001487, + "loss": 0.1543, + 
"step": 10629 + }, + { + "epoch": 0.09227350457027282, + "grad_norm": 0.5859375, + "learning_rate": 0.001954756653295561, + "loss": 0.1045, + "step": 10630 + }, + { + "epoch": 0.09228218505047699, + "grad_norm": 0.734375, + "learning_rate": 0.0019547473350562243, + "loss": 0.1611, + "step": 10631 + }, + { + "epoch": 0.09229086553068115, + "grad_norm": 0.1640625, + "learning_rate": 0.001954738015882149, + "loss": 0.1055, + "step": 10632 + }, + { + "epoch": 0.09229954601088532, + "grad_norm": 0.59765625, + "learning_rate": 0.001954728695773345, + "loss": 0.1406, + "step": 10633 + }, + { + "epoch": 0.09230822649108948, + "grad_norm": 0.390625, + "learning_rate": 0.0019547193747298224, + "loss": 0.1001, + "step": 10634 + }, + { + "epoch": 0.09231690697129365, + "grad_norm": 0.6796875, + "learning_rate": 0.0019547100527515914, + "loss": 0.1055, + "step": 10635 + }, + { + "epoch": 0.09232558745149781, + "grad_norm": 0.6328125, + "learning_rate": 0.0019547007298386623, + "loss": 0.1191, + "step": 10636 + }, + { + "epoch": 0.09233426793170198, + "grad_norm": 0.57421875, + "learning_rate": 0.0019546914059910456, + "loss": 0.1084, + "step": 10637 + }, + { + "epoch": 0.09234294841190614, + "grad_norm": 0.490234375, + "learning_rate": 0.001954682081208751, + "loss": 0.1182, + "step": 10638 + }, + { + "epoch": 0.09235162889211031, + "grad_norm": 0.1259765625, + "learning_rate": 0.0019546727554917895, + "loss": 0.1099, + "step": 10639 + }, + { + "epoch": 0.09236030937231447, + "grad_norm": 0.1357421875, + "learning_rate": 0.0019546634288401706, + "loss": 0.1152, + "step": 10640 + }, + { + "epoch": 0.09236898985251864, + "grad_norm": 0.365234375, + "learning_rate": 0.0019546541012539042, + "loss": 0.1445, + "step": 10641 + }, + { + "epoch": 0.0923776703327228, + "grad_norm": 0.25, + "learning_rate": 0.001954644772733001, + "loss": 0.0977, + "step": 10642 + }, + { + "epoch": 0.09238635081292697, + "grad_norm": 0.392578125, + "learning_rate": 0.001954635443277471, + "loss": 0.1001, + "step": 10643 + }, + { + "epoch": 0.09239503129313113, + "grad_norm": 1.8828125, + "learning_rate": 0.001954626112887325, + "loss": 0.2021, + "step": 10644 + }, + { + "epoch": 0.0924037117733353, + "grad_norm": 0.1015625, + "learning_rate": 0.001954616781562572, + "loss": 0.1289, + "step": 10645 + }, + { + "epoch": 0.09241239225353946, + "grad_norm": 0.154296875, + "learning_rate": 0.0019546074493032237, + "loss": 0.1309, + "step": 10646 + }, + { + "epoch": 0.09242107273374363, + "grad_norm": 0.90234375, + "learning_rate": 0.001954598116109289, + "loss": 0.124, + "step": 10647 + }, + { + "epoch": 0.0924297532139478, + "grad_norm": 0.296875, + "learning_rate": 0.001954588781980779, + "loss": 0.1016, + "step": 10648 + }, + { + "epoch": 0.09243843369415196, + "grad_norm": 0.1923828125, + "learning_rate": 0.001954579446917703, + "loss": 0.1152, + "step": 10649 + }, + { + "epoch": 0.09244711417435612, + "grad_norm": 0.330078125, + "learning_rate": 0.0019545701109200723, + "loss": 0.1377, + "step": 10650 + }, + { + "epoch": 0.09245579465456029, + "grad_norm": 0.08544921875, + "learning_rate": 0.0019545607739878965, + "loss": 0.1191, + "step": 10651 + }, + { + "epoch": 0.09246447513476445, + "grad_norm": 0.1611328125, + "learning_rate": 0.0019545514361211857, + "loss": 0.1426, + "step": 10652 + }, + { + "epoch": 0.09247315561496862, + "grad_norm": 0.5390625, + "learning_rate": 0.00195454209731995, + "loss": 0.1157, + "step": 10653 + }, + { + "epoch": 0.09248183609517278, + "grad_norm": 0.1875, + "learning_rate": 
0.0019545327575842004, + "loss": 0.103, + "step": 10654 + }, + { + "epoch": 0.09249051657537695, + "grad_norm": 0.3046875, + "learning_rate": 0.0019545234169139466, + "loss": 0.1641, + "step": 10655 + }, + { + "epoch": 0.09249919705558111, + "grad_norm": 0.2275390625, + "learning_rate": 0.0019545140753091986, + "loss": 0.103, + "step": 10656 + }, + { + "epoch": 0.09250787753578528, + "grad_norm": 0.6015625, + "learning_rate": 0.001954504732769967, + "loss": 0.127, + "step": 10657 + }, + { + "epoch": 0.09251655801598944, + "grad_norm": 0.08203125, + "learning_rate": 0.001954495389296262, + "loss": 0.1191, + "step": 10658 + }, + { + "epoch": 0.09252523849619361, + "grad_norm": 0.404296875, + "learning_rate": 0.001954486044888093, + "loss": 0.082, + "step": 10659 + }, + { + "epoch": 0.09253391897639777, + "grad_norm": 0.12158203125, + "learning_rate": 0.0019544766995454716, + "loss": 0.1094, + "step": 10660 + }, + { + "epoch": 0.09254259945660194, + "grad_norm": 0.150390625, + "learning_rate": 0.001954467353268407, + "loss": 0.1201, + "step": 10661 + }, + { + "epoch": 0.0925512799368061, + "grad_norm": 0.1923828125, + "learning_rate": 0.00195445800605691, + "loss": 0.1504, + "step": 10662 + }, + { + "epoch": 0.09255996041701027, + "grad_norm": 1.8125, + "learning_rate": 0.0019544486579109906, + "loss": 0.127, + "step": 10663 + }, + { + "epoch": 0.09256864089721444, + "grad_norm": 0.283203125, + "learning_rate": 0.0019544393088306586, + "loss": 0.1523, + "step": 10664 + }, + { + "epoch": 0.0925773213774186, + "grad_norm": 0.2578125, + "learning_rate": 0.001954429958815925, + "loss": 0.1494, + "step": 10665 + }, + { + "epoch": 0.09258600185762277, + "grad_norm": 0.061767578125, + "learning_rate": 0.001954420607866799, + "loss": 0.1104, + "step": 10666 + }, + { + "epoch": 0.09259468233782693, + "grad_norm": 0.10400390625, + "learning_rate": 0.001954411255983292, + "loss": 0.0996, + "step": 10667 + }, + { + "epoch": 0.0926033628180311, + "grad_norm": 0.25390625, + "learning_rate": 0.0019544019031654133, + "loss": 0.1309, + "step": 10668 + }, + { + "epoch": 0.09261204329823526, + "grad_norm": 0.5390625, + "learning_rate": 0.001954392549413174, + "loss": 0.1201, + "step": 10669 + }, + { + "epoch": 0.09262072377843943, + "grad_norm": 0.73828125, + "learning_rate": 0.0019543831947265835, + "loss": 0.1104, + "step": 10670 + }, + { + "epoch": 0.09262940425864359, + "grad_norm": 0.81640625, + "learning_rate": 0.0019543738391056527, + "loss": 0.1104, + "step": 10671 + }, + { + "epoch": 0.09263808473884776, + "grad_norm": 0.09326171875, + "learning_rate": 0.0019543644825503912, + "loss": 0.1104, + "step": 10672 + }, + { + "epoch": 0.09264676521905192, + "grad_norm": 0.1083984375, + "learning_rate": 0.0019543551250608094, + "loss": 0.125, + "step": 10673 + }, + { + "epoch": 0.09265544569925609, + "grad_norm": 0.203125, + "learning_rate": 0.001954345766636918, + "loss": 0.1445, + "step": 10674 + }, + { + "epoch": 0.09266412617946025, + "grad_norm": 0.53515625, + "learning_rate": 0.001954336407278727, + "loss": 0.1318, + "step": 10675 + }, + { + "epoch": 0.09267280665966442, + "grad_norm": 0.1005859375, + "learning_rate": 0.0019543270469862466, + "loss": 0.1348, + "step": 10676 + }, + { + "epoch": 0.09268148713986858, + "grad_norm": 0.37109375, + "learning_rate": 0.0019543176857594866, + "loss": 0.1543, + "step": 10677 + }, + { + "epoch": 0.09269016762007275, + "grad_norm": 0.400390625, + "learning_rate": 0.0019543083235984576, + "loss": 0.1514, + "step": 10678 + }, + { + "epoch": 0.09269884810027691, + 
"grad_norm": 0.07958984375, + "learning_rate": 0.00195429896050317, + "loss": 0.1436, + "step": 10679 + }, + { + "epoch": 0.09270752858048108, + "grad_norm": 0.2431640625, + "learning_rate": 0.001954289596473634, + "loss": 0.0967, + "step": 10680 + }, + { + "epoch": 0.09271620906068524, + "grad_norm": 0.189453125, + "learning_rate": 0.00195428023150986, + "loss": 0.1182, + "step": 10681 + }, + { + "epoch": 0.09272488954088941, + "grad_norm": 1.359375, + "learning_rate": 0.0019542708656118575, + "loss": 0.1523, + "step": 10682 + }, + { + "epoch": 0.09273357002109357, + "grad_norm": 0.130859375, + "learning_rate": 0.0019542614987796375, + "loss": 0.1201, + "step": 10683 + }, + { + "epoch": 0.09274225050129774, + "grad_norm": 0.08203125, + "learning_rate": 0.0019542521310132095, + "loss": 0.0972, + "step": 10684 + }, + { + "epoch": 0.0927509309815019, + "grad_norm": 0.08935546875, + "learning_rate": 0.0019542427623125843, + "loss": 0.0889, + "step": 10685 + }, + { + "epoch": 0.09275961146170607, + "grad_norm": 0.2099609375, + "learning_rate": 0.0019542333926777723, + "loss": 0.1289, + "step": 10686 + }, + { + "epoch": 0.09276829194191023, + "grad_norm": 0.55078125, + "learning_rate": 0.0019542240221087836, + "loss": 0.0869, + "step": 10687 + }, + { + "epoch": 0.0927769724221144, + "grad_norm": 0.05078125, + "learning_rate": 0.001954214650605628, + "loss": 0.1001, + "step": 10688 + }, + { + "epoch": 0.09278565290231855, + "grad_norm": 0.42578125, + "learning_rate": 0.0019542052781683166, + "loss": 0.1416, + "step": 10689 + }, + { + "epoch": 0.09279433338252271, + "grad_norm": 0.7890625, + "learning_rate": 0.0019541959047968587, + "loss": 0.1182, + "step": 10690 + }, + { + "epoch": 0.09280301386272688, + "grad_norm": 0.3515625, + "learning_rate": 0.001954186530491265, + "loss": 0.1045, + "step": 10691 + }, + { + "epoch": 0.09281169434293104, + "grad_norm": 0.1787109375, + "learning_rate": 0.001954177155251546, + "loss": 0.1426, + "step": 10692 + }, + { + "epoch": 0.09282037482313521, + "grad_norm": 0.11376953125, + "learning_rate": 0.0019541677790777113, + "loss": 0.1475, + "step": 10693 + }, + { + "epoch": 0.09282905530333938, + "grad_norm": 0.1513671875, + "learning_rate": 0.0019541584019697722, + "loss": 0.125, + "step": 10694 + }, + { + "epoch": 0.09283773578354354, + "grad_norm": 0.1279296875, + "learning_rate": 0.001954149023927738, + "loss": 0.1025, + "step": 10695 + }, + { + "epoch": 0.0928464162637477, + "grad_norm": 0.177734375, + "learning_rate": 0.001954139644951619, + "loss": 0.1289, + "step": 10696 + }, + { + "epoch": 0.09285509674395187, + "grad_norm": 0.2890625, + "learning_rate": 0.001954130265041426, + "loss": 0.1328, + "step": 10697 + }, + { + "epoch": 0.09286377722415604, + "grad_norm": 0.2373046875, + "learning_rate": 0.0019541208841971688, + "loss": 0.1455, + "step": 10698 + }, + { + "epoch": 0.0928724577043602, + "grad_norm": 0.150390625, + "learning_rate": 0.001954111502418858, + "loss": 0.1406, + "step": 10699 + }, + { + "epoch": 0.09288113818456437, + "grad_norm": 0.86328125, + "learning_rate": 0.0019541021197065035, + "loss": 0.1387, + "step": 10700 + }, + { + "epoch": 0.09288981866476853, + "grad_norm": 0.390625, + "learning_rate": 0.001954092736060116, + "loss": 0.1416, + "step": 10701 + }, + { + "epoch": 0.0928984991449727, + "grad_norm": 0.64453125, + "learning_rate": 0.0019540833514797053, + "loss": 0.0942, + "step": 10702 + }, + { + "epoch": 0.09290717962517686, + "grad_norm": 0.373046875, + "learning_rate": 0.001954073965965282, + "loss": 0.0859, + "step": 
10703 + }, + { + "epoch": 0.09291586010538103, + "grad_norm": 0.232421875, + "learning_rate": 0.001954064579516856, + "loss": 0.1011, + "step": 10704 + }, + { + "epoch": 0.09292454058558519, + "grad_norm": 0.13671875, + "learning_rate": 0.0019540551921344382, + "loss": 0.1338, + "step": 10705 + }, + { + "epoch": 0.09293322106578936, + "grad_norm": 0.12890625, + "learning_rate": 0.001954045803818038, + "loss": 0.1279, + "step": 10706 + }, + { + "epoch": 0.09294190154599352, + "grad_norm": 0.27734375, + "learning_rate": 0.0019540364145676664, + "loss": 0.082, + "step": 10707 + }, + { + "epoch": 0.09295058202619769, + "grad_norm": 0.10498046875, + "learning_rate": 0.0019540270243833336, + "loss": 0.127, + "step": 10708 + }, + { + "epoch": 0.09295926250640185, + "grad_norm": 0.07958984375, + "learning_rate": 0.0019540176332650495, + "loss": 0.1162, + "step": 10709 + }, + { + "epoch": 0.09296794298660602, + "grad_norm": 0.1279296875, + "learning_rate": 0.0019540082412128246, + "loss": 0.1484, + "step": 10710 + }, + { + "epoch": 0.09297662346681018, + "grad_norm": 1.21875, + "learning_rate": 0.001953998848226669, + "loss": 0.1152, + "step": 10711 + }, + { + "epoch": 0.09298530394701435, + "grad_norm": 0.1455078125, + "learning_rate": 0.001953989454306593, + "loss": 0.1758, + "step": 10712 + }, + { + "epoch": 0.09299398442721851, + "grad_norm": 0.1630859375, + "learning_rate": 0.0019539800594526075, + "loss": 0.1123, + "step": 10713 + }, + { + "epoch": 0.09300266490742268, + "grad_norm": 0.2470703125, + "learning_rate": 0.001953970663664722, + "loss": 0.1377, + "step": 10714 + }, + { + "epoch": 0.09301134538762684, + "grad_norm": 0.1572265625, + "learning_rate": 0.0019539612669429466, + "loss": 0.1758, + "step": 10715 + }, + { + "epoch": 0.09302002586783101, + "grad_norm": 0.37109375, + "learning_rate": 0.001953951869287292, + "loss": 0.1172, + "step": 10716 + }, + { + "epoch": 0.09302870634803517, + "grad_norm": 0.189453125, + "learning_rate": 0.0019539424706977687, + "loss": 0.1206, + "step": 10717 + }, + { + "epoch": 0.09303738682823934, + "grad_norm": 0.28125, + "learning_rate": 0.0019539330711743873, + "loss": 0.1118, + "step": 10718 + }, + { + "epoch": 0.0930460673084435, + "grad_norm": 0.1396484375, + "learning_rate": 0.001953923670717157, + "loss": 0.127, + "step": 10719 + }, + { + "epoch": 0.09305474778864767, + "grad_norm": 0.2255859375, + "learning_rate": 0.001953914269326088, + "loss": 0.1172, + "step": 10720 + }, + { + "epoch": 0.09306342826885183, + "grad_norm": 0.4140625, + "learning_rate": 0.0019539048670011924, + "loss": 0.1211, + "step": 10721 + }, + { + "epoch": 0.093072108749056, + "grad_norm": 0.3359375, + "learning_rate": 0.001953895463742478, + "loss": 0.1768, + "step": 10722 + }, + { + "epoch": 0.09308078922926016, + "grad_norm": 0.1953125, + "learning_rate": 0.0019538860595499576, + "loss": 0.125, + "step": 10723 + }, + { + "epoch": 0.09308946970946433, + "grad_norm": 0.09326171875, + "learning_rate": 0.0019538766544236396, + "loss": 0.1602, + "step": 10724 + }, + { + "epoch": 0.0930981501896685, + "grad_norm": 0.443359375, + "learning_rate": 0.001953867248363535, + "loss": 0.0693, + "step": 10725 + }, + { + "epoch": 0.09310683066987266, + "grad_norm": 0.07666015625, + "learning_rate": 0.001953857841369654, + "loss": 0.1172, + "step": 10726 + }, + { + "epoch": 0.09311551115007682, + "grad_norm": 0.33203125, + "learning_rate": 0.0019538484334420065, + "loss": 0.103, + "step": 10727 + }, + { + "epoch": 0.09312419163028099, + "grad_norm": 0.10498046875, + "learning_rate": 
0.001953839024580604, + "loss": 0.1348, + "step": 10728 + }, + { + "epoch": 0.09313287211048515, + "grad_norm": 0.09521484375, + "learning_rate": 0.0019538296147854554, + "loss": 0.1426, + "step": 10729 + }, + { + "epoch": 0.09314155259068932, + "grad_norm": 0.388671875, + "learning_rate": 0.0019538202040565718, + "loss": 0.0996, + "step": 10730 + }, + { + "epoch": 0.09315023307089348, + "grad_norm": 2.015625, + "learning_rate": 0.0019538107923939628, + "loss": 0.1338, + "step": 10731 + }, + { + "epoch": 0.09315891355109765, + "grad_norm": 0.37890625, + "learning_rate": 0.0019538013797976393, + "loss": 0.0879, + "step": 10732 + }, + { + "epoch": 0.09316759403130181, + "grad_norm": 0.2265625, + "learning_rate": 0.001953791966267612, + "loss": 0.1172, + "step": 10733 + }, + { + "epoch": 0.09317627451150598, + "grad_norm": 0.251953125, + "learning_rate": 0.00195378255180389, + "loss": 0.1069, + "step": 10734 + }, + { + "epoch": 0.09318495499171014, + "grad_norm": 0.388671875, + "learning_rate": 0.0019537731364064844, + "loss": 0.1187, + "step": 10735 + }, + { + "epoch": 0.09319363547191431, + "grad_norm": 0.330078125, + "learning_rate": 0.0019537637200754054, + "loss": 0.0947, + "step": 10736 + }, + { + "epoch": 0.09320231595211848, + "grad_norm": 0.275390625, + "learning_rate": 0.0019537543028106635, + "loss": 0.1289, + "step": 10737 + }, + { + "epoch": 0.09321099643232264, + "grad_norm": 0.2578125, + "learning_rate": 0.0019537448846122683, + "loss": 0.1309, + "step": 10738 + }, + { + "epoch": 0.0932196769125268, + "grad_norm": 0.1396484375, + "learning_rate": 0.0019537354654802306, + "loss": 0.0928, + "step": 10739 + }, + { + "epoch": 0.09322835739273097, + "grad_norm": 0.7265625, + "learning_rate": 0.001953726045414561, + "loss": 0.1021, + "step": 10740 + }, + { + "epoch": 0.09323703787293514, + "grad_norm": 0.25, + "learning_rate": 0.001953716624415269, + "loss": 0.1465, + "step": 10741 + }, + { + "epoch": 0.0932457183531393, + "grad_norm": 0.302734375, + "learning_rate": 0.0019537072024823653, + "loss": 0.1128, + "step": 10742 + }, + { + "epoch": 0.09325439883334347, + "grad_norm": 0.5234375, + "learning_rate": 0.0019536977796158605, + "loss": 0.1543, + "step": 10743 + }, + { + "epoch": 0.09326307931354763, + "grad_norm": 0.1162109375, + "learning_rate": 0.0019536883558157647, + "loss": 0.1113, + "step": 10744 + }, + { + "epoch": 0.0932717597937518, + "grad_norm": 0.11572265625, + "learning_rate": 0.0019536789310820877, + "loss": 0.1699, + "step": 10745 + }, + { + "epoch": 0.09328044027395596, + "grad_norm": 0.369140625, + "learning_rate": 0.0019536695054148405, + "loss": 0.1099, + "step": 10746 + }, + { + "epoch": 0.09328912075416013, + "grad_norm": 0.3046875, + "learning_rate": 0.0019536600788140333, + "loss": 0.1162, + "step": 10747 + }, + { + "epoch": 0.09329780123436429, + "grad_norm": 0.1962890625, + "learning_rate": 0.001953650651279676, + "loss": 0.1299, + "step": 10748 + }, + { + "epoch": 0.09330648171456846, + "grad_norm": 0.29296875, + "learning_rate": 0.0019536412228117793, + "loss": 0.1357, + "step": 10749 + }, + { + "epoch": 0.09331516219477262, + "grad_norm": 0.162109375, + "learning_rate": 0.0019536317934103534, + "loss": 0.1069, + "step": 10750 + }, + { + "epoch": 0.09332384267497677, + "grad_norm": 0.390625, + "learning_rate": 0.0019536223630754086, + "loss": 0.1289, + "step": 10751 + }, + { + "epoch": 0.09333252315518094, + "grad_norm": 0.68359375, + "learning_rate": 0.001953612931806955, + "loss": 0.1504, + "step": 10752 + }, + { + "epoch": 0.0933412036353851, + 
"grad_norm": 0.1572265625, + "learning_rate": 0.0019536034996050036, + "loss": 0.1699, + "step": 10753 + }, + { + "epoch": 0.09334988411558927, + "grad_norm": 0.404296875, + "learning_rate": 0.0019535940664695637, + "loss": 0.1855, + "step": 10754 + }, + { + "epoch": 0.09335856459579343, + "grad_norm": 0.404296875, + "learning_rate": 0.001953584632400646, + "loss": 0.2051, + "step": 10755 + }, + { + "epoch": 0.0933672450759976, + "grad_norm": 0.96484375, + "learning_rate": 0.0019535751973982617, + "loss": 0.1377, + "step": 10756 + }, + { + "epoch": 0.09337592555620176, + "grad_norm": 0.71484375, + "learning_rate": 0.0019535657614624205, + "loss": 0.1484, + "step": 10757 + }, + { + "epoch": 0.09338460603640593, + "grad_norm": 0.2421875, + "learning_rate": 0.001953556324593132, + "loss": 0.1357, + "step": 10758 + }, + { + "epoch": 0.0933932865166101, + "grad_norm": 0.15234375, + "learning_rate": 0.001953546886790407, + "loss": 0.105, + "step": 10759 + }, + { + "epoch": 0.09340196699681426, + "grad_norm": 0.287109375, + "learning_rate": 0.0019535374480542564, + "loss": 0.1416, + "step": 10760 + }, + { + "epoch": 0.09341064747701842, + "grad_norm": 0.068359375, + "learning_rate": 0.0019535280083846896, + "loss": 0.124, + "step": 10761 + }, + { + "epoch": 0.09341932795722259, + "grad_norm": 0.953125, + "learning_rate": 0.0019535185677817184, + "loss": 0.1641, + "step": 10762 + }, + { + "epoch": 0.09342800843742675, + "grad_norm": 0.220703125, + "learning_rate": 0.001953509126245351, + "loss": 0.1602, + "step": 10763 + }, + { + "epoch": 0.09343668891763092, + "grad_norm": 0.88671875, + "learning_rate": 0.0019534996837755995, + "loss": 0.1416, + "step": 10764 + }, + { + "epoch": 0.09344536939783508, + "grad_norm": 0.3828125, + "learning_rate": 0.0019534902403724732, + "loss": 0.1367, + "step": 10765 + }, + { + "epoch": 0.09345404987803925, + "grad_norm": 0.52734375, + "learning_rate": 0.001953480796035983, + "loss": 0.1016, + "step": 10766 + }, + { + "epoch": 0.09346273035824341, + "grad_norm": 0.423828125, + "learning_rate": 0.001953471350766139, + "loss": 0.1201, + "step": 10767 + }, + { + "epoch": 0.09347141083844758, + "grad_norm": 0.1611328125, + "learning_rate": 0.0019534619045629515, + "loss": 0.126, + "step": 10768 + }, + { + "epoch": 0.09348009131865175, + "grad_norm": 0.2890625, + "learning_rate": 0.0019534524574264306, + "loss": 0.1377, + "step": 10769 + }, + { + "epoch": 0.09348877179885591, + "grad_norm": 0.62109375, + "learning_rate": 0.0019534430093565876, + "loss": 0.084, + "step": 10770 + }, + { + "epoch": 0.09349745227906008, + "grad_norm": 0.48046875, + "learning_rate": 0.0019534335603534317, + "loss": 0.1465, + "step": 10771 + }, + { + "epoch": 0.09350613275926424, + "grad_norm": 0.66015625, + "learning_rate": 0.001953424110416974, + "loss": 0.1182, + "step": 10772 + }, + { + "epoch": 0.0935148132394684, + "grad_norm": 0.15625, + "learning_rate": 0.001953414659547224, + "loss": 0.1885, + "step": 10773 + }, + { + "epoch": 0.09352349371967257, + "grad_norm": 0.44140625, + "learning_rate": 0.001953405207744193, + "loss": 0.127, + "step": 10774 + }, + { + "epoch": 0.09353217419987674, + "grad_norm": 0.306640625, + "learning_rate": 0.00195339575500789, + "loss": 0.1465, + "step": 10775 + }, + { + "epoch": 0.0935408546800809, + "grad_norm": 0.43359375, + "learning_rate": 0.0019533863013383272, + "loss": 0.1279, + "step": 10776 + }, + { + "epoch": 0.09354953516028507, + "grad_norm": 0.322265625, + "learning_rate": 0.001953376846735514, + "loss": 0.1113, + "step": 10777 + }, + { + 
"epoch": 0.09355821564048923, + "grad_norm": 0.478515625, + "learning_rate": 0.00195336739119946, + "loss": 0.105, + "step": 10778 + }, + { + "epoch": 0.0935668961206934, + "grad_norm": 0.28515625, + "learning_rate": 0.0019533579347301767, + "loss": 0.0898, + "step": 10779 + }, + { + "epoch": 0.09357557660089756, + "grad_norm": 0.38671875, + "learning_rate": 0.001953348477327674, + "loss": 0.1543, + "step": 10780 + }, + { + "epoch": 0.09358425708110173, + "grad_norm": 0.1650390625, + "learning_rate": 0.001953339018991962, + "loss": 0.125, + "step": 10781 + }, + { + "epoch": 0.09359293756130589, + "grad_norm": 0.39453125, + "learning_rate": 0.0019533295597230515, + "loss": 0.1436, + "step": 10782 + }, + { + "epoch": 0.09360161804151006, + "grad_norm": 0.1787109375, + "learning_rate": 0.0019533200995209524, + "loss": 0.1113, + "step": 10783 + }, + { + "epoch": 0.09361029852171422, + "grad_norm": 0.3125, + "learning_rate": 0.0019533106383856757, + "loss": 0.1084, + "step": 10784 + }, + { + "epoch": 0.09361897900191839, + "grad_norm": 0.1416015625, + "learning_rate": 0.001953301176317231, + "loss": 0.1611, + "step": 10785 + }, + { + "epoch": 0.09362765948212255, + "grad_norm": 0.1904296875, + "learning_rate": 0.0019532917133156286, + "loss": 0.1074, + "step": 10786 + }, + { + "epoch": 0.09363633996232672, + "grad_norm": 0.44140625, + "learning_rate": 0.00195328224938088, + "loss": 0.1162, + "step": 10787 + }, + { + "epoch": 0.09364502044253088, + "grad_norm": 0.6328125, + "learning_rate": 0.0019532727845129943, + "loss": 0.1211, + "step": 10788 + }, + { + "epoch": 0.09365370092273505, + "grad_norm": 0.2294921875, + "learning_rate": 0.001953263318711982, + "loss": 0.1064, + "step": 10789 + }, + { + "epoch": 0.09366238140293921, + "grad_norm": 0.267578125, + "learning_rate": 0.001953253851977854, + "loss": 0.1416, + "step": 10790 + }, + { + "epoch": 0.09367106188314338, + "grad_norm": 0.416015625, + "learning_rate": 0.0019532443843106205, + "loss": 0.1367, + "step": 10791 + }, + { + "epoch": 0.09367974236334754, + "grad_norm": 0.142578125, + "learning_rate": 0.0019532349157102918, + "loss": 0.1377, + "step": 10792 + }, + { + "epoch": 0.09368842284355171, + "grad_norm": 0.4921875, + "learning_rate": 0.0019532254461768783, + "loss": 0.1328, + "step": 10793 + }, + { + "epoch": 0.09369710332375587, + "grad_norm": 0.82421875, + "learning_rate": 0.0019532159757103904, + "loss": 0.1279, + "step": 10794 + }, + { + "epoch": 0.09370578380396004, + "grad_norm": 0.08203125, + "learning_rate": 0.0019532065043108377, + "loss": 0.0898, + "step": 10795 + }, + { + "epoch": 0.0937144642841642, + "grad_norm": 0.408203125, + "learning_rate": 0.0019531970319782315, + "loss": 0.1465, + "step": 10796 + }, + { + "epoch": 0.09372314476436837, + "grad_norm": 0.203125, + "learning_rate": 0.0019531875587125817, + "loss": 0.1221, + "step": 10797 + }, + { + "epoch": 0.09373182524457253, + "grad_norm": 0.1787109375, + "learning_rate": 0.001953178084513899, + "loss": 0.1299, + "step": 10798 + }, + { + "epoch": 0.0937405057247767, + "grad_norm": 0.212890625, + "learning_rate": 0.0019531686093821935, + "loss": 0.1377, + "step": 10799 + }, + { + "epoch": 0.09374918620498086, + "grad_norm": 0.81640625, + "learning_rate": 0.001953159133317476, + "loss": 0.1172, + "step": 10800 + }, + { + "epoch": 0.09375786668518503, + "grad_norm": 0.2236328125, + "learning_rate": 0.0019531496563197563, + "loss": 0.1201, + "step": 10801 + }, + { + "epoch": 0.0937665471653892, + "grad_norm": 0.1728515625, + "learning_rate": 0.0019531401783890448, + 
"loss": 0.1016, + "step": 10802 + }, + { + "epoch": 0.09377522764559336, + "grad_norm": 0.21875, + "learning_rate": 0.001953130699525352, + "loss": 0.1182, + "step": 10803 + }, + { + "epoch": 0.09378390812579752, + "grad_norm": 0.11083984375, + "learning_rate": 0.001953121219728688, + "loss": 0.1475, + "step": 10804 + }, + { + "epoch": 0.09379258860600169, + "grad_norm": 0.77734375, + "learning_rate": 0.001953111738999064, + "loss": 0.1416, + "step": 10805 + }, + { + "epoch": 0.09380126908620585, + "grad_norm": 0.10595703125, + "learning_rate": 0.0019531022573364893, + "loss": 0.1172, + "step": 10806 + }, + { + "epoch": 0.09380994956641002, + "grad_norm": 0.291015625, + "learning_rate": 0.0019530927747409754, + "loss": 0.1084, + "step": 10807 + }, + { + "epoch": 0.09381863004661418, + "grad_norm": 0.095703125, + "learning_rate": 0.0019530832912125317, + "loss": 0.1055, + "step": 10808 + }, + { + "epoch": 0.09382731052681835, + "grad_norm": 0.283203125, + "learning_rate": 0.0019530738067511692, + "loss": 0.1016, + "step": 10809 + }, + { + "epoch": 0.09383599100702252, + "grad_norm": 0.1201171875, + "learning_rate": 0.0019530643213568975, + "loss": 0.1172, + "step": 10810 + }, + { + "epoch": 0.09384467148722668, + "grad_norm": 0.306640625, + "learning_rate": 0.0019530548350297277, + "loss": 0.1094, + "step": 10811 + }, + { + "epoch": 0.09385335196743083, + "grad_norm": 0.126953125, + "learning_rate": 0.00195304534776967, + "loss": 0.1758, + "step": 10812 + }, + { + "epoch": 0.093862032447635, + "grad_norm": 0.40625, + "learning_rate": 0.0019530358595767348, + "loss": 0.1367, + "step": 10813 + }, + { + "epoch": 0.09387071292783916, + "grad_norm": 0.087890625, + "learning_rate": 0.0019530263704509324, + "loss": 0.0801, + "step": 10814 + }, + { + "epoch": 0.09387939340804333, + "grad_norm": 0.30859375, + "learning_rate": 0.0019530168803922729, + "loss": 0.1328, + "step": 10815 + }, + { + "epoch": 0.09388807388824749, + "grad_norm": 0.474609375, + "learning_rate": 0.0019530073894007672, + "loss": 0.0957, + "step": 10816 + }, + { + "epoch": 0.09389675436845166, + "grad_norm": 0.2177734375, + "learning_rate": 0.0019529978974764256, + "loss": 0.124, + "step": 10817 + }, + { + "epoch": 0.09390543484865582, + "grad_norm": 0.2490234375, + "learning_rate": 0.001952988404619258, + "loss": 0.0869, + "step": 10818 + }, + { + "epoch": 0.09391411532885999, + "grad_norm": 0.455078125, + "learning_rate": 0.0019529789108292752, + "loss": 0.1465, + "step": 10819 + }, + { + "epoch": 0.09392279580906415, + "grad_norm": 0.474609375, + "learning_rate": 0.0019529694161064875, + "loss": 0.1074, + "step": 10820 + }, + { + "epoch": 0.09393147628926832, + "grad_norm": 0.404296875, + "learning_rate": 0.0019529599204509052, + "loss": 0.1465, + "step": 10821 + }, + { + "epoch": 0.09394015676947248, + "grad_norm": 0.29296875, + "learning_rate": 0.0019529504238625388, + "loss": 0.0938, + "step": 10822 + }, + { + "epoch": 0.09394883724967665, + "grad_norm": 0.26171875, + "learning_rate": 0.0019529409263413986, + "loss": 0.1299, + "step": 10823 + }, + { + "epoch": 0.09395751772988081, + "grad_norm": 0.255859375, + "learning_rate": 0.0019529314278874952, + "loss": 0.1201, + "step": 10824 + }, + { + "epoch": 0.09396619821008498, + "grad_norm": 0.267578125, + "learning_rate": 0.0019529219285008385, + "loss": 0.1416, + "step": 10825 + }, + { + "epoch": 0.09397487869028914, + "grad_norm": 0.48828125, + "learning_rate": 0.0019529124281814395, + "loss": 0.0938, + "step": 10826 + }, + { + "epoch": 0.09398355917049331, + "grad_norm": 
0.2578125, + "learning_rate": 0.0019529029269293081, + "loss": 0.1123, + "step": 10827 + }, + { + "epoch": 0.09399223965069747, + "grad_norm": 0.5078125, + "learning_rate": 0.0019528934247444547, + "loss": 0.1318, + "step": 10828 + }, + { + "epoch": 0.09400092013090164, + "grad_norm": 0.0830078125, + "learning_rate": 0.0019528839216268903, + "loss": 0.1406, + "step": 10829 + }, + { + "epoch": 0.0940096006111058, + "grad_norm": 0.890625, + "learning_rate": 0.0019528744175766244, + "loss": 0.1128, + "step": 10830 + }, + { + "epoch": 0.09401828109130997, + "grad_norm": 0.08642578125, + "learning_rate": 0.001952864912593668, + "loss": 0.1128, + "step": 10831 + }, + { + "epoch": 0.09402696157151413, + "grad_norm": 0.2021484375, + "learning_rate": 0.0019528554066780318, + "loss": 0.105, + "step": 10832 + }, + { + "epoch": 0.0940356420517183, + "grad_norm": 0.5078125, + "learning_rate": 0.0019528458998297252, + "loss": 0.1406, + "step": 10833 + }, + { + "epoch": 0.09404432253192246, + "grad_norm": 0.251953125, + "learning_rate": 0.0019528363920487595, + "loss": 0.1367, + "step": 10834 + }, + { + "epoch": 0.09405300301212663, + "grad_norm": 0.267578125, + "learning_rate": 0.0019528268833351444, + "loss": 0.1436, + "step": 10835 + }, + { + "epoch": 0.0940616834923308, + "grad_norm": 0.4140625, + "learning_rate": 0.001952817373688891, + "loss": 0.1377, + "step": 10836 + }, + { + "epoch": 0.09407036397253496, + "grad_norm": 0.4296875, + "learning_rate": 0.0019528078631100088, + "loss": 0.1416, + "step": 10837 + }, + { + "epoch": 0.09407904445273912, + "grad_norm": 0.9921875, + "learning_rate": 0.0019527983515985094, + "loss": 0.1543, + "step": 10838 + }, + { + "epoch": 0.09408772493294329, + "grad_norm": 0.20703125, + "learning_rate": 0.001952788839154402, + "loss": 0.1338, + "step": 10839 + }, + { + "epoch": 0.09409640541314745, + "grad_norm": 0.255859375, + "learning_rate": 0.0019527793257776975, + "loss": 0.1201, + "step": 10840 + }, + { + "epoch": 0.09410508589335162, + "grad_norm": 2.265625, + "learning_rate": 0.0019527698114684068, + "loss": 0.1406, + "step": 10841 + }, + { + "epoch": 0.09411376637355579, + "grad_norm": 0.12353515625, + "learning_rate": 0.0019527602962265396, + "loss": 0.1143, + "step": 10842 + }, + { + "epoch": 0.09412244685375995, + "grad_norm": 0.1181640625, + "learning_rate": 0.0019527507800521063, + "loss": 0.1055, + "step": 10843 + }, + { + "epoch": 0.09413112733396412, + "grad_norm": 0.314453125, + "learning_rate": 0.0019527412629451177, + "loss": 0.1514, + "step": 10844 + }, + { + "epoch": 0.09413980781416828, + "grad_norm": 0.107421875, + "learning_rate": 0.0019527317449055846, + "loss": 0.1084, + "step": 10845 + }, + { + "epoch": 0.09414848829437245, + "grad_norm": 0.291015625, + "learning_rate": 0.0019527222259335165, + "loss": 0.0918, + "step": 10846 + }, + { + "epoch": 0.09415716877457661, + "grad_norm": 0.359375, + "learning_rate": 0.0019527127060289238, + "loss": 0.1045, + "step": 10847 + }, + { + "epoch": 0.09416584925478078, + "grad_norm": 0.16796875, + "learning_rate": 0.0019527031851918177, + "loss": 0.1021, + "step": 10848 + }, + { + "epoch": 0.09417452973498494, + "grad_norm": 0.16015625, + "learning_rate": 0.0019526936634222084, + "loss": 0.1904, + "step": 10849 + }, + { + "epoch": 0.0941832102151891, + "grad_norm": 0.259765625, + "learning_rate": 0.0019526841407201056, + "loss": 0.1055, + "step": 10850 + }, + { + "epoch": 0.09419189069539327, + "grad_norm": 0.287109375, + "learning_rate": 0.0019526746170855204, + "loss": 0.1143, + "step": 10851 + }, + { + 
"epoch": 0.09420057117559744, + "grad_norm": 0.1376953125, + "learning_rate": 0.0019526650925184635, + "loss": 0.1216, + "step": 10852 + }, + { + "epoch": 0.0942092516558016, + "grad_norm": 0.0869140625, + "learning_rate": 0.0019526555670189444, + "loss": 0.1079, + "step": 10853 + }, + { + "epoch": 0.09421793213600577, + "grad_norm": 0.7734375, + "learning_rate": 0.001952646040586974, + "loss": 0.1377, + "step": 10854 + }, + { + "epoch": 0.09422661261620993, + "grad_norm": 0.09033203125, + "learning_rate": 0.001952636513222563, + "loss": 0.1348, + "step": 10855 + }, + { + "epoch": 0.0942352930964141, + "grad_norm": 0.443359375, + "learning_rate": 0.0019526269849257211, + "loss": 0.1523, + "step": 10856 + }, + { + "epoch": 0.09424397357661826, + "grad_norm": 0.146484375, + "learning_rate": 0.0019526174556964595, + "loss": 0.1328, + "step": 10857 + }, + { + "epoch": 0.09425265405682243, + "grad_norm": 0.294921875, + "learning_rate": 0.0019526079255347882, + "loss": 0.1133, + "step": 10858 + }, + { + "epoch": 0.09426133453702659, + "grad_norm": 0.365234375, + "learning_rate": 0.0019525983944407178, + "loss": 0.124, + "step": 10859 + }, + { + "epoch": 0.09427001501723076, + "grad_norm": 0.427734375, + "learning_rate": 0.0019525888624142582, + "loss": 0.1514, + "step": 10860 + }, + { + "epoch": 0.09427869549743492, + "grad_norm": 0.59765625, + "learning_rate": 0.0019525793294554205, + "loss": 0.1572, + "step": 10861 + }, + { + "epoch": 0.09428737597763909, + "grad_norm": 0.1220703125, + "learning_rate": 0.0019525697955642147, + "loss": 0.1484, + "step": 10862 + }, + { + "epoch": 0.09429605645784325, + "grad_norm": 0.224609375, + "learning_rate": 0.0019525602607406516, + "loss": 0.0815, + "step": 10863 + }, + { + "epoch": 0.09430473693804742, + "grad_norm": 0.1328125, + "learning_rate": 0.0019525507249847412, + "loss": 0.1523, + "step": 10864 + }, + { + "epoch": 0.09431341741825158, + "grad_norm": 0.251953125, + "learning_rate": 0.0019525411882964942, + "loss": 0.0947, + "step": 10865 + }, + { + "epoch": 0.09432209789845575, + "grad_norm": 0.10400390625, + "learning_rate": 0.001952531650675921, + "loss": 0.1201, + "step": 10866 + }, + { + "epoch": 0.09433077837865991, + "grad_norm": 0.08837890625, + "learning_rate": 0.0019525221121230322, + "loss": 0.1074, + "step": 10867 + }, + { + "epoch": 0.09433945885886408, + "grad_norm": 0.76953125, + "learning_rate": 0.001952512572637838, + "loss": 0.123, + "step": 10868 + }, + { + "epoch": 0.09434813933906824, + "grad_norm": 0.3828125, + "learning_rate": 0.0019525030322203483, + "loss": 0.1045, + "step": 10869 + }, + { + "epoch": 0.09435681981927241, + "grad_norm": 0.1591796875, + "learning_rate": 0.0019524934908705745, + "loss": 0.1445, + "step": 10870 + }, + { + "epoch": 0.09436550029947657, + "grad_norm": 0.388671875, + "learning_rate": 0.0019524839485885266, + "loss": 0.1309, + "step": 10871 + }, + { + "epoch": 0.09437418077968074, + "grad_norm": 0.5703125, + "learning_rate": 0.001952474405374215, + "loss": 0.1465, + "step": 10872 + }, + { + "epoch": 0.0943828612598849, + "grad_norm": 0.1103515625, + "learning_rate": 0.0019524648612276503, + "loss": 0.1279, + "step": 10873 + }, + { + "epoch": 0.09439154174008905, + "grad_norm": 0.2431640625, + "learning_rate": 0.0019524553161488429, + "loss": 0.1299, + "step": 10874 + }, + { + "epoch": 0.09440022222029322, + "grad_norm": 0.1787109375, + "learning_rate": 0.0019524457701378028, + "loss": 0.166, + "step": 10875 + }, + { + "epoch": 0.09440890270049739, + "grad_norm": 0.23046875, + "learning_rate": 
0.0019524362231945413, + "loss": 0.0928, + "step": 10876 + }, + { + "epoch": 0.09441758318070155, + "grad_norm": 0.07421875, + "learning_rate": 0.0019524266753190682, + "loss": 0.0933, + "step": 10877 + }, + { + "epoch": 0.09442626366090572, + "grad_norm": 0.373046875, + "learning_rate": 0.001952417126511394, + "loss": 0.1338, + "step": 10878 + }, + { + "epoch": 0.09443494414110988, + "grad_norm": 0.1650390625, + "learning_rate": 0.0019524075767715292, + "loss": 0.1108, + "step": 10879 + }, + { + "epoch": 0.09444362462131405, + "grad_norm": 0.353515625, + "learning_rate": 0.0019523980260994845, + "loss": 0.1543, + "step": 10880 + }, + { + "epoch": 0.09445230510151821, + "grad_norm": 0.07958984375, + "learning_rate": 0.0019523884744952701, + "loss": 0.1094, + "step": 10881 + }, + { + "epoch": 0.09446098558172238, + "grad_norm": 0.349609375, + "learning_rate": 0.0019523789219588962, + "loss": 0.126, + "step": 10882 + }, + { + "epoch": 0.09446966606192654, + "grad_norm": 0.345703125, + "learning_rate": 0.0019523693684903737, + "loss": 0.1191, + "step": 10883 + }, + { + "epoch": 0.0944783465421307, + "grad_norm": 0.2236328125, + "learning_rate": 0.0019523598140897128, + "loss": 0.1582, + "step": 10884 + }, + { + "epoch": 0.09448702702233487, + "grad_norm": 0.326171875, + "learning_rate": 0.001952350258756924, + "loss": 0.1011, + "step": 10885 + }, + { + "epoch": 0.09449570750253904, + "grad_norm": 0.10009765625, + "learning_rate": 0.0019523407024920176, + "loss": 0.0879, + "step": 10886 + }, + { + "epoch": 0.0945043879827432, + "grad_norm": 0.408203125, + "learning_rate": 0.0019523311452950048, + "loss": 0.1348, + "step": 10887 + }, + { + "epoch": 0.09451306846294737, + "grad_norm": 0.099609375, + "learning_rate": 0.001952321587165895, + "loss": 0.1279, + "step": 10888 + }, + { + "epoch": 0.09452174894315153, + "grad_norm": 0.46875, + "learning_rate": 0.001952312028104699, + "loss": 0.127, + "step": 10889 + }, + { + "epoch": 0.0945304294233557, + "grad_norm": 0.1923828125, + "learning_rate": 0.0019523024681114276, + "loss": 0.1328, + "step": 10890 + }, + { + "epoch": 0.09453910990355986, + "grad_norm": 0.2734375, + "learning_rate": 0.001952292907186091, + "loss": 0.0942, + "step": 10891 + }, + { + "epoch": 0.09454779038376403, + "grad_norm": 0.09423828125, + "learning_rate": 0.0019522833453286999, + "loss": 0.1279, + "step": 10892 + }, + { + "epoch": 0.09455647086396819, + "grad_norm": 0.1904296875, + "learning_rate": 0.001952273782539264, + "loss": 0.1064, + "step": 10893 + }, + { + "epoch": 0.09456515134417236, + "grad_norm": 0.126953125, + "learning_rate": 0.001952264218817795, + "loss": 0.1191, + "step": 10894 + }, + { + "epoch": 0.09457383182437652, + "grad_norm": 0.1806640625, + "learning_rate": 0.001952254654164302, + "loss": 0.1328, + "step": 10895 + }, + { + "epoch": 0.09458251230458069, + "grad_norm": 0.76171875, + "learning_rate": 0.0019522450885787964, + "loss": 0.1357, + "step": 10896 + }, + { + "epoch": 0.09459119278478485, + "grad_norm": 0.10546875, + "learning_rate": 0.0019522355220612886, + "loss": 0.1191, + "step": 10897 + }, + { + "epoch": 0.09459987326498902, + "grad_norm": 0.1416015625, + "learning_rate": 0.0019522259546117884, + "loss": 0.1133, + "step": 10898 + }, + { + "epoch": 0.09460855374519318, + "grad_norm": 0.0947265625, + "learning_rate": 0.001952216386230307, + "loss": 0.1367, + "step": 10899 + }, + { + "epoch": 0.09461723422539735, + "grad_norm": 0.095703125, + "learning_rate": 0.0019522068169168544, + "loss": 0.0957, + "step": 10900 + }, + { + "epoch": 
0.09462591470560151, + "grad_norm": 0.181640625, + "learning_rate": 0.0019521972466714414, + "loss": 0.1211, + "step": 10901 + }, + { + "epoch": 0.09463459518580568, + "grad_norm": 0.279296875, + "learning_rate": 0.0019521876754940782, + "loss": 0.0938, + "step": 10902 + }, + { + "epoch": 0.09464327566600984, + "grad_norm": 0.345703125, + "learning_rate": 0.0019521781033847753, + "loss": 0.1055, + "step": 10903 + }, + { + "epoch": 0.09465195614621401, + "grad_norm": 0.4765625, + "learning_rate": 0.0019521685303435434, + "loss": 0.1191, + "step": 10904 + }, + { + "epoch": 0.09466063662641817, + "grad_norm": 0.08984375, + "learning_rate": 0.0019521589563703924, + "loss": 0.1128, + "step": 10905 + }, + { + "epoch": 0.09466931710662234, + "grad_norm": 0.4375, + "learning_rate": 0.0019521493814653337, + "loss": 0.1299, + "step": 10906 + }, + { + "epoch": 0.0946779975868265, + "grad_norm": 0.51171875, + "learning_rate": 0.001952139805628377, + "loss": 0.1016, + "step": 10907 + }, + { + "epoch": 0.09468667806703067, + "grad_norm": 0.16796875, + "learning_rate": 0.0019521302288595332, + "loss": 0.103, + "step": 10908 + }, + { + "epoch": 0.09469535854723483, + "grad_norm": 0.0849609375, + "learning_rate": 0.001952120651158812, + "loss": 0.1104, + "step": 10909 + }, + { + "epoch": 0.094704039027439, + "grad_norm": 0.10205078125, + "learning_rate": 0.001952111072526225, + "loss": 0.1021, + "step": 10910 + }, + { + "epoch": 0.09471271950764316, + "grad_norm": 0.212890625, + "learning_rate": 0.0019521014929617822, + "loss": 0.1367, + "step": 10911 + }, + { + "epoch": 0.09472139998784733, + "grad_norm": 0.30078125, + "learning_rate": 0.0019520919124654938, + "loss": 0.1309, + "step": 10912 + }, + { + "epoch": 0.0947300804680515, + "grad_norm": 0.3515625, + "learning_rate": 0.0019520823310373704, + "loss": 0.1455, + "step": 10913 + }, + { + "epoch": 0.09473876094825566, + "grad_norm": 0.4140625, + "learning_rate": 0.0019520727486774226, + "loss": 0.1455, + "step": 10914 + }, + { + "epoch": 0.09474744142845982, + "grad_norm": 0.85546875, + "learning_rate": 0.0019520631653856609, + "loss": 0.1582, + "step": 10915 + }, + { + "epoch": 0.09475612190866399, + "grad_norm": 0.11181640625, + "learning_rate": 0.0019520535811620957, + "loss": 0.1484, + "step": 10916 + }, + { + "epoch": 0.09476480238886816, + "grad_norm": 0.2578125, + "learning_rate": 0.0019520439960067375, + "loss": 0.1523, + "step": 10917 + }, + { + "epoch": 0.09477348286907232, + "grad_norm": 0.0927734375, + "learning_rate": 0.001952034409919597, + "loss": 0.1011, + "step": 10918 + }, + { + "epoch": 0.09478216334927649, + "grad_norm": 0.228515625, + "learning_rate": 0.0019520248229006843, + "loss": 0.1641, + "step": 10919 + }, + { + "epoch": 0.09479084382948065, + "grad_norm": 0.1298828125, + "learning_rate": 0.0019520152349500098, + "loss": 0.1465, + "step": 10920 + }, + { + "epoch": 0.09479952430968482, + "grad_norm": 0.53125, + "learning_rate": 0.0019520056460675846, + "loss": 0.1211, + "step": 10921 + }, + { + "epoch": 0.09480820478988898, + "grad_norm": 0.15625, + "learning_rate": 0.001951996056253419, + "loss": 0.0796, + "step": 10922 + }, + { + "epoch": 0.09481688527009315, + "grad_norm": 0.4140625, + "learning_rate": 0.001951986465507523, + "loss": 0.0732, + "step": 10923 + }, + { + "epoch": 0.09482556575029731, + "grad_norm": 0.3671875, + "learning_rate": 0.0019519768738299073, + "loss": 0.1182, + "step": 10924 + }, + { + "epoch": 0.09483424623050148, + "grad_norm": 0.22265625, + "learning_rate": 0.0019519672812205824, + "loss": 0.1445, 
+ "step": 10925 + }, + { + "epoch": 0.09484292671070564, + "grad_norm": 0.13671875, + "learning_rate": 0.001951957687679559, + "loss": 0.0977, + "step": 10926 + }, + { + "epoch": 0.0948516071909098, + "grad_norm": 0.5078125, + "learning_rate": 0.0019519480932068476, + "loss": 0.0957, + "step": 10927 + }, + { + "epoch": 0.09486028767111397, + "grad_norm": 0.345703125, + "learning_rate": 0.0019519384978024586, + "loss": 0.1113, + "step": 10928 + }, + { + "epoch": 0.09486896815131814, + "grad_norm": 0.142578125, + "learning_rate": 0.0019519289014664023, + "loss": 0.1611, + "step": 10929 + }, + { + "epoch": 0.0948776486315223, + "grad_norm": 0.376953125, + "learning_rate": 0.0019519193041986896, + "loss": 0.0908, + "step": 10930 + }, + { + "epoch": 0.09488632911172647, + "grad_norm": 0.39453125, + "learning_rate": 0.0019519097059993305, + "loss": 0.0967, + "step": 10931 + }, + { + "epoch": 0.09489500959193063, + "grad_norm": 0.80859375, + "learning_rate": 0.001951900106868336, + "loss": 0.1406, + "step": 10932 + }, + { + "epoch": 0.0949036900721348, + "grad_norm": 0.2392578125, + "learning_rate": 0.0019518905068057163, + "loss": 0.1143, + "step": 10933 + }, + { + "epoch": 0.09491237055233896, + "grad_norm": 0.275390625, + "learning_rate": 0.0019518809058114814, + "loss": 0.1016, + "step": 10934 + }, + { + "epoch": 0.09492105103254311, + "grad_norm": 0.0673828125, + "learning_rate": 0.001951871303885643, + "loss": 0.0967, + "step": 10935 + }, + { + "epoch": 0.09492973151274728, + "grad_norm": 0.09619140625, + "learning_rate": 0.0019518617010282108, + "loss": 0.1318, + "step": 10936 + }, + { + "epoch": 0.09493841199295144, + "grad_norm": 0.427734375, + "learning_rate": 0.0019518520972391954, + "loss": 0.1533, + "step": 10937 + }, + { + "epoch": 0.09494709247315561, + "grad_norm": 0.08203125, + "learning_rate": 0.0019518424925186071, + "loss": 0.1094, + "step": 10938 + }, + { + "epoch": 0.09495577295335977, + "grad_norm": 0.75, + "learning_rate": 0.001951832886866457, + "loss": 0.1367, + "step": 10939 + }, + { + "epoch": 0.09496445343356394, + "grad_norm": 0.279296875, + "learning_rate": 0.0019518232802827552, + "loss": 0.127, + "step": 10940 + }, + { + "epoch": 0.0949731339137681, + "grad_norm": 0.318359375, + "learning_rate": 0.001951813672767512, + "loss": 0.1426, + "step": 10941 + }, + { + "epoch": 0.09498181439397227, + "grad_norm": 0.59765625, + "learning_rate": 0.0019518040643207385, + "loss": 0.2715, + "step": 10942 + }, + { + "epoch": 0.09499049487417643, + "grad_norm": 0.416015625, + "learning_rate": 0.0019517944549424444, + "loss": 0.1318, + "step": 10943 + }, + { + "epoch": 0.0949991753543806, + "grad_norm": 0.31640625, + "learning_rate": 0.0019517848446326412, + "loss": 0.1133, + "step": 10944 + }, + { + "epoch": 0.09500785583458476, + "grad_norm": 0.37109375, + "learning_rate": 0.0019517752333913386, + "loss": 0.1172, + "step": 10945 + }, + { + "epoch": 0.09501653631478893, + "grad_norm": 0.474609375, + "learning_rate": 0.0019517656212185476, + "loss": 0.1104, + "step": 10946 + }, + { + "epoch": 0.0950252167949931, + "grad_norm": 0.107421875, + "learning_rate": 0.0019517560081142782, + "loss": 0.1289, + "step": 10947 + }, + { + "epoch": 0.09503389727519726, + "grad_norm": 0.4921875, + "learning_rate": 0.0019517463940785414, + "loss": 0.0996, + "step": 10948 + }, + { + "epoch": 0.09504257775540143, + "grad_norm": 0.53125, + "learning_rate": 0.0019517367791113476, + "loss": 0.1104, + "step": 10949 + }, + { + "epoch": 0.09505125823560559, + "grad_norm": 0.0830078125, + "learning_rate": 
0.0019517271632127073, + "loss": 0.1167, + "step": 10950 + }, + { + "epoch": 0.09505993871580976, + "grad_norm": 0.47265625, + "learning_rate": 0.0019517175463826311, + "loss": 0.1123, + "step": 10951 + }, + { + "epoch": 0.09506861919601392, + "grad_norm": 0.10107421875, + "learning_rate": 0.001951707928621129, + "loss": 0.1289, + "step": 10952 + }, + { + "epoch": 0.09507729967621809, + "grad_norm": 0.177734375, + "learning_rate": 0.001951698309928212, + "loss": 0.1279, + "step": 10953 + }, + { + "epoch": 0.09508598015642225, + "grad_norm": 0.21875, + "learning_rate": 0.0019516886903038907, + "loss": 0.1201, + "step": 10954 + }, + { + "epoch": 0.09509466063662642, + "grad_norm": 0.2412109375, + "learning_rate": 0.0019516790697481751, + "loss": 0.1035, + "step": 10955 + }, + { + "epoch": 0.09510334111683058, + "grad_norm": 0.515625, + "learning_rate": 0.0019516694482610765, + "loss": 0.0977, + "step": 10956 + }, + { + "epoch": 0.09511202159703475, + "grad_norm": 0.69921875, + "learning_rate": 0.001951659825842605, + "loss": 0.125, + "step": 10957 + }, + { + "epoch": 0.09512070207723891, + "grad_norm": 0.240234375, + "learning_rate": 0.0019516502024927707, + "loss": 0.1143, + "step": 10958 + }, + { + "epoch": 0.09512938255744308, + "grad_norm": 0.314453125, + "learning_rate": 0.0019516405782115847, + "loss": 0.1279, + "step": 10959 + }, + { + "epoch": 0.09513806303764724, + "grad_norm": 0.484375, + "learning_rate": 0.0019516309529990575, + "loss": 0.127, + "step": 10960 + }, + { + "epoch": 0.0951467435178514, + "grad_norm": 0.1826171875, + "learning_rate": 0.0019516213268551995, + "loss": 0.1602, + "step": 10961 + }, + { + "epoch": 0.09515542399805557, + "grad_norm": 0.263671875, + "learning_rate": 0.001951611699780021, + "loss": 0.1309, + "step": 10962 + }, + { + "epoch": 0.09516410447825974, + "grad_norm": 0.81640625, + "learning_rate": 0.0019516020717735328, + "loss": 0.166, + "step": 10963 + }, + { + "epoch": 0.0951727849584639, + "grad_norm": 0.396484375, + "learning_rate": 0.0019515924428357455, + "loss": 0.127, + "step": 10964 + }, + { + "epoch": 0.09518146543866807, + "grad_norm": 0.25390625, + "learning_rate": 0.0019515828129666693, + "loss": 0.1748, + "step": 10965 + }, + { + "epoch": 0.09519014591887223, + "grad_norm": 0.126953125, + "learning_rate": 0.0019515731821663149, + "loss": 0.1758, + "step": 10966 + }, + { + "epoch": 0.0951988263990764, + "grad_norm": 0.2578125, + "learning_rate": 0.0019515635504346932, + "loss": 0.1602, + "step": 10967 + }, + { + "epoch": 0.09520750687928056, + "grad_norm": 0.130859375, + "learning_rate": 0.0019515539177718142, + "loss": 0.1514, + "step": 10968 + }, + { + "epoch": 0.09521618735948473, + "grad_norm": 0.3984375, + "learning_rate": 0.0019515442841776885, + "loss": 0.1035, + "step": 10969 + }, + { + "epoch": 0.09522486783968889, + "grad_norm": 0.12890625, + "learning_rate": 0.001951534649652327, + "loss": 0.1572, + "step": 10970 + }, + { + "epoch": 0.09523354831989306, + "grad_norm": 0.37109375, + "learning_rate": 0.0019515250141957398, + "loss": 0.0952, + "step": 10971 + }, + { + "epoch": 0.09524222880009722, + "grad_norm": 0.35546875, + "learning_rate": 0.001951515377807938, + "loss": 0.167, + "step": 10972 + }, + { + "epoch": 0.09525090928030139, + "grad_norm": 0.37890625, + "learning_rate": 0.0019515057404889313, + "loss": 0.1367, + "step": 10973 + }, + { + "epoch": 0.09525958976050555, + "grad_norm": 0.7578125, + "learning_rate": 0.0019514961022387312, + "loss": 0.1445, + "step": 10974 + }, + { + "epoch": 0.09526827024070972, + 
"grad_norm": 0.78515625, + "learning_rate": 0.0019514864630573471, + "loss": 0.1045, + "step": 10975 + }, + { + "epoch": 0.09527695072091388, + "grad_norm": 0.1669921875, + "learning_rate": 0.0019514768229447908, + "loss": 0.104, + "step": 10976 + }, + { + "epoch": 0.09528563120111805, + "grad_norm": 0.1962890625, + "learning_rate": 0.001951467181901072, + "loss": 0.125, + "step": 10977 + }, + { + "epoch": 0.09529431168132221, + "grad_norm": 0.474609375, + "learning_rate": 0.0019514575399262016, + "loss": 0.1045, + "step": 10978 + }, + { + "epoch": 0.09530299216152638, + "grad_norm": 0.33203125, + "learning_rate": 0.00195144789702019, + "loss": 0.1182, + "step": 10979 + }, + { + "epoch": 0.09531167264173054, + "grad_norm": 0.34375, + "learning_rate": 0.0019514382531830474, + "loss": 0.1118, + "step": 10980 + }, + { + "epoch": 0.09532035312193471, + "grad_norm": 0.58984375, + "learning_rate": 0.0019514286084147854, + "loss": 0.1025, + "step": 10981 + }, + { + "epoch": 0.09532903360213887, + "grad_norm": 0.51953125, + "learning_rate": 0.0019514189627154134, + "loss": 0.1426, + "step": 10982 + }, + { + "epoch": 0.09533771408234304, + "grad_norm": 0.83984375, + "learning_rate": 0.0019514093160849424, + "loss": 0.1064, + "step": 10983 + }, + { + "epoch": 0.0953463945625472, + "grad_norm": 0.2060546875, + "learning_rate": 0.0019513996685233833, + "loss": 0.1396, + "step": 10984 + }, + { + "epoch": 0.09535507504275137, + "grad_norm": 0.171875, + "learning_rate": 0.001951390020030746, + "loss": 0.1172, + "step": 10985 + }, + { + "epoch": 0.09536375552295553, + "grad_norm": 0.85546875, + "learning_rate": 0.0019513803706070415, + "loss": 0.1367, + "step": 10986 + }, + { + "epoch": 0.0953724360031597, + "grad_norm": 0.1015625, + "learning_rate": 0.00195137072025228, + "loss": 0.1211, + "step": 10987 + }, + { + "epoch": 0.09538111648336386, + "grad_norm": 0.30859375, + "learning_rate": 0.0019513610689664725, + "loss": 0.0947, + "step": 10988 + }, + { + "epoch": 0.09538979696356803, + "grad_norm": 1.171875, + "learning_rate": 0.0019513514167496294, + "loss": 0.1079, + "step": 10989 + }, + { + "epoch": 0.0953984774437722, + "grad_norm": 0.451171875, + "learning_rate": 0.0019513417636017607, + "loss": 0.1045, + "step": 10990 + }, + { + "epoch": 0.09540715792397636, + "grad_norm": 0.326171875, + "learning_rate": 0.0019513321095228782, + "loss": 0.1455, + "step": 10991 + }, + { + "epoch": 0.09541583840418053, + "grad_norm": 0.357421875, + "learning_rate": 0.001951322454512991, + "loss": 0.1025, + "step": 10992 + }, + { + "epoch": 0.09542451888438469, + "grad_norm": 0.1474609375, + "learning_rate": 0.0019513127985721108, + "loss": 0.1465, + "step": 10993 + }, + { + "epoch": 0.09543319936458886, + "grad_norm": 0.12353515625, + "learning_rate": 0.0019513031417002476, + "loss": 0.124, + "step": 10994 + }, + { + "epoch": 0.09544187984479302, + "grad_norm": 0.061767578125, + "learning_rate": 0.0019512934838974122, + "loss": 0.0874, + "step": 10995 + }, + { + "epoch": 0.09545056032499719, + "grad_norm": 0.14453125, + "learning_rate": 0.0019512838251636146, + "loss": 0.1084, + "step": 10996 + }, + { + "epoch": 0.09545924080520134, + "grad_norm": 0.173828125, + "learning_rate": 0.001951274165498866, + "loss": 0.1206, + "step": 10997 + }, + { + "epoch": 0.0954679212854055, + "grad_norm": 1.0078125, + "learning_rate": 0.001951264504903177, + "loss": 0.1182, + "step": 10998 + }, + { + "epoch": 0.09547660176560967, + "grad_norm": 0.1298828125, + "learning_rate": 0.0019512548433765575, + "loss": 0.1201, + "step": 10999 
+ }, + { + "epoch": 0.09548528224581383, + "grad_norm": 0.0927734375, + "learning_rate": 0.001951245180919019, + "loss": 0.123, + "step": 11000 + }, + { + "epoch": 0.095493962726018, + "grad_norm": 0.703125, + "learning_rate": 0.001951235517530571, + "loss": 0.127, + "step": 11001 + }, + { + "epoch": 0.09550264320622216, + "grad_norm": 0.1005859375, + "learning_rate": 0.001951225853211225, + "loss": 0.1191, + "step": 11002 + }, + { + "epoch": 0.09551132368642633, + "grad_norm": 0.412109375, + "learning_rate": 0.0019512161879609914, + "loss": 0.1387, + "step": 11003 + }, + { + "epoch": 0.09552000416663049, + "grad_norm": 0.74609375, + "learning_rate": 0.0019512065217798802, + "loss": 0.126, + "step": 11004 + }, + { + "epoch": 0.09552868464683466, + "grad_norm": 0.310546875, + "learning_rate": 0.0019511968546679025, + "loss": 0.1035, + "step": 11005 + }, + { + "epoch": 0.09553736512703882, + "grad_norm": 0.56640625, + "learning_rate": 0.0019511871866250685, + "loss": 0.1338, + "step": 11006 + }, + { + "epoch": 0.09554604560724299, + "grad_norm": 0.119140625, + "learning_rate": 0.0019511775176513893, + "loss": 0.1113, + "step": 11007 + }, + { + "epoch": 0.09555472608744715, + "grad_norm": 0.1533203125, + "learning_rate": 0.001951167847746875, + "loss": 0.1147, + "step": 11008 + }, + { + "epoch": 0.09556340656765132, + "grad_norm": 0.18359375, + "learning_rate": 0.0019511581769115363, + "loss": 0.1113, + "step": 11009 + }, + { + "epoch": 0.09557208704785548, + "grad_norm": 0.490234375, + "learning_rate": 0.0019511485051453836, + "loss": 0.1094, + "step": 11010 + }, + { + "epoch": 0.09558076752805965, + "grad_norm": 0.337890625, + "learning_rate": 0.001951138832448428, + "loss": 0.1138, + "step": 11011 + }, + { + "epoch": 0.09558944800826381, + "grad_norm": 0.2353515625, + "learning_rate": 0.00195112915882068, + "loss": 0.1055, + "step": 11012 + }, + { + "epoch": 0.09559812848846798, + "grad_norm": 0.6875, + "learning_rate": 0.0019511194842621493, + "loss": 0.1562, + "step": 11013 + }, + { + "epoch": 0.09560680896867214, + "grad_norm": 0.2890625, + "learning_rate": 0.0019511098087728474, + "loss": 0.0928, + "step": 11014 + }, + { + "epoch": 0.09561548944887631, + "grad_norm": 0.125, + "learning_rate": 0.0019511001323527849, + "loss": 0.0977, + "step": 11015 + }, + { + "epoch": 0.09562416992908047, + "grad_norm": 0.12353515625, + "learning_rate": 0.0019510904550019716, + "loss": 0.1602, + "step": 11016 + }, + { + "epoch": 0.09563285040928464, + "grad_norm": 0.640625, + "learning_rate": 0.0019510807767204189, + "loss": 0.1289, + "step": 11017 + }, + { + "epoch": 0.0956415308894888, + "grad_norm": 0.111328125, + "learning_rate": 0.0019510710975081367, + "loss": 0.0967, + "step": 11018 + }, + { + "epoch": 0.09565021136969297, + "grad_norm": 0.359375, + "learning_rate": 0.0019510614173651364, + "loss": 0.1279, + "step": 11019 + }, + { + "epoch": 0.09565889184989713, + "grad_norm": 1.0, + "learning_rate": 0.0019510517362914277, + "loss": 0.1035, + "step": 11020 + }, + { + "epoch": 0.0956675723301013, + "grad_norm": 0.25390625, + "learning_rate": 0.001951042054287022, + "loss": 0.0928, + "step": 11021 + }, + { + "epoch": 0.09567625281030546, + "grad_norm": 0.1328125, + "learning_rate": 0.0019510323713519291, + "loss": 0.1406, + "step": 11022 + }, + { + "epoch": 0.09568493329050963, + "grad_norm": 0.1337890625, + "learning_rate": 0.0019510226874861604, + "loss": 0.1484, + "step": 11023 + }, + { + "epoch": 0.0956936137707138, + "grad_norm": 0.203125, + "learning_rate": 0.0019510130026897258, + "loss": 
0.125, + "step": 11024 + }, + { + "epoch": 0.09570229425091796, + "grad_norm": 0.1494140625, + "learning_rate": 0.0019510033169626362, + "loss": 0.1211, + "step": 11025 + }, + { + "epoch": 0.09571097473112213, + "grad_norm": 0.30078125, + "learning_rate": 0.0019509936303049023, + "loss": 0.1191, + "step": 11026 + }, + { + "epoch": 0.09571965521132629, + "grad_norm": 0.1298828125, + "learning_rate": 0.0019509839427165343, + "loss": 0.1562, + "step": 11027 + }, + { + "epoch": 0.09572833569153046, + "grad_norm": 0.169921875, + "learning_rate": 0.0019509742541975433, + "loss": 0.1328, + "step": 11028 + }, + { + "epoch": 0.09573701617173462, + "grad_norm": 0.0751953125, + "learning_rate": 0.0019509645647479393, + "loss": 0.1221, + "step": 11029 + }, + { + "epoch": 0.09574569665193879, + "grad_norm": 0.240234375, + "learning_rate": 0.0019509548743677334, + "loss": 0.1162, + "step": 11030 + }, + { + "epoch": 0.09575437713214295, + "grad_norm": 0.2236328125, + "learning_rate": 0.0019509451830569363, + "loss": 0.1504, + "step": 11031 + }, + { + "epoch": 0.09576305761234712, + "grad_norm": 0.080078125, + "learning_rate": 0.0019509354908155581, + "loss": 0.1084, + "step": 11032 + }, + { + "epoch": 0.09577173809255128, + "grad_norm": 0.326171875, + "learning_rate": 0.0019509257976436094, + "loss": 0.0962, + "step": 11033 + }, + { + "epoch": 0.09578041857275545, + "grad_norm": 0.1455078125, + "learning_rate": 0.0019509161035411017, + "loss": 0.1094, + "step": 11034 + }, + { + "epoch": 0.09578909905295961, + "grad_norm": 0.10986328125, + "learning_rate": 0.0019509064085080443, + "loss": 0.0938, + "step": 11035 + }, + { + "epoch": 0.09579777953316378, + "grad_norm": 0.181640625, + "learning_rate": 0.0019508967125444487, + "loss": 0.1543, + "step": 11036 + }, + { + "epoch": 0.09580646001336794, + "grad_norm": 0.10107421875, + "learning_rate": 0.0019508870156503251, + "loss": 0.1172, + "step": 11037 + }, + { + "epoch": 0.0958151404935721, + "grad_norm": 0.1533203125, + "learning_rate": 0.0019508773178256841, + "loss": 0.0894, + "step": 11038 + }, + { + "epoch": 0.09582382097377627, + "grad_norm": 0.1533203125, + "learning_rate": 0.001950867619070537, + "loss": 0.1025, + "step": 11039 + }, + { + "epoch": 0.09583250145398044, + "grad_norm": 0.099609375, + "learning_rate": 0.0019508579193848934, + "loss": 0.1367, + "step": 11040 + }, + { + "epoch": 0.0958411819341846, + "grad_norm": 0.2470703125, + "learning_rate": 0.0019508482187687644, + "loss": 0.1357, + "step": 11041 + }, + { + "epoch": 0.09584986241438877, + "grad_norm": 0.482421875, + "learning_rate": 0.0019508385172221608, + "loss": 0.1196, + "step": 11042 + }, + { + "epoch": 0.09585854289459293, + "grad_norm": 0.453125, + "learning_rate": 0.001950828814745093, + "loss": 0.0962, + "step": 11043 + }, + { + "epoch": 0.0958672233747971, + "grad_norm": 0.330078125, + "learning_rate": 0.0019508191113375714, + "loss": 0.1025, + "step": 11044 + }, + { + "epoch": 0.09587590385500126, + "grad_norm": 0.2333984375, + "learning_rate": 0.0019508094069996067, + "loss": 0.1191, + "step": 11045 + }, + { + "epoch": 0.09588458433520543, + "grad_norm": 0.232421875, + "learning_rate": 0.0019507997017312098, + "loss": 0.1191, + "step": 11046 + }, + { + "epoch": 0.09589326481540959, + "grad_norm": 0.369140625, + "learning_rate": 0.0019507899955323912, + "loss": 0.1016, + "step": 11047 + }, + { + "epoch": 0.09590194529561376, + "grad_norm": 0.24609375, + "learning_rate": 0.0019507802884031616, + "loss": 0.1289, + "step": 11048 + }, + { + "epoch": 0.09591062577581792, + 
"grad_norm": 0.1875, + "learning_rate": 0.001950770580343531, + "loss": 0.1177, + "step": 11049 + }, + { + "epoch": 0.09591930625602209, + "grad_norm": 0.384765625, + "learning_rate": 0.0019507608713535106, + "loss": 0.1436, + "step": 11050 + }, + { + "epoch": 0.09592798673622625, + "grad_norm": 0.18359375, + "learning_rate": 0.001950751161433111, + "loss": 0.0967, + "step": 11051 + }, + { + "epoch": 0.09593666721643042, + "grad_norm": 0.06982421875, + "learning_rate": 0.0019507414505823427, + "loss": 0.1016, + "step": 11052 + }, + { + "epoch": 0.09594534769663458, + "grad_norm": 0.0927734375, + "learning_rate": 0.0019507317388012164, + "loss": 0.124, + "step": 11053 + }, + { + "epoch": 0.09595402817683875, + "grad_norm": 0.158203125, + "learning_rate": 0.0019507220260897426, + "loss": 0.1504, + "step": 11054 + }, + { + "epoch": 0.09596270865704291, + "grad_norm": 0.1552734375, + "learning_rate": 0.001950712312447932, + "loss": 0.0967, + "step": 11055 + }, + { + "epoch": 0.09597138913724708, + "grad_norm": 0.3671875, + "learning_rate": 0.0019507025978757948, + "loss": 0.1641, + "step": 11056 + }, + { + "epoch": 0.09598006961745124, + "grad_norm": 0.2080078125, + "learning_rate": 0.0019506928823733428, + "loss": 0.1152, + "step": 11057 + }, + { + "epoch": 0.09598875009765541, + "grad_norm": 0.453125, + "learning_rate": 0.0019506831659405852, + "loss": 0.1162, + "step": 11058 + }, + { + "epoch": 0.09599743057785956, + "grad_norm": 0.2734375, + "learning_rate": 0.0019506734485775332, + "loss": 0.1138, + "step": 11059 + }, + { + "epoch": 0.09600611105806373, + "grad_norm": 0.470703125, + "learning_rate": 0.0019506637302841982, + "loss": 0.1348, + "step": 11060 + }, + { + "epoch": 0.09601479153826789, + "grad_norm": 0.150390625, + "learning_rate": 0.0019506540110605895, + "loss": 0.1367, + "step": 11061 + }, + { + "epoch": 0.09602347201847206, + "grad_norm": 0.2158203125, + "learning_rate": 0.0019506442909067187, + "loss": 0.1348, + "step": 11062 + }, + { + "epoch": 0.09603215249867622, + "grad_norm": 0.09814453125, + "learning_rate": 0.0019506345698225957, + "loss": 0.1064, + "step": 11063 + }, + { + "epoch": 0.09604083297888039, + "grad_norm": 0.59765625, + "learning_rate": 0.001950624847808232, + "loss": 0.1123, + "step": 11064 + }, + { + "epoch": 0.09604951345908455, + "grad_norm": 0.384765625, + "learning_rate": 0.0019506151248636374, + "loss": 0.123, + "step": 11065 + }, + { + "epoch": 0.09605819393928872, + "grad_norm": 0.1552734375, + "learning_rate": 0.0019506054009888233, + "loss": 0.0957, + "step": 11066 + }, + { + "epoch": 0.09606687441949288, + "grad_norm": 0.26953125, + "learning_rate": 0.0019505956761837996, + "loss": 0.0869, + "step": 11067 + }, + { + "epoch": 0.09607555489969705, + "grad_norm": 0.5625, + "learning_rate": 0.001950585950448577, + "loss": 0.1279, + "step": 11068 + }, + { + "epoch": 0.09608423537990121, + "grad_norm": 0.12353515625, + "learning_rate": 0.0019505762237831666, + "loss": 0.1543, + "step": 11069 + }, + { + "epoch": 0.09609291586010538, + "grad_norm": 0.306640625, + "learning_rate": 0.0019505664961875788, + "loss": 0.1445, + "step": 11070 + }, + { + "epoch": 0.09610159634030954, + "grad_norm": 0.68359375, + "learning_rate": 0.0019505567676618243, + "loss": 0.1543, + "step": 11071 + }, + { + "epoch": 0.09611027682051371, + "grad_norm": 0.28515625, + "learning_rate": 0.0019505470382059139, + "loss": 0.126, + "step": 11072 + }, + { + "epoch": 0.09611895730071787, + "grad_norm": 0.326171875, + "learning_rate": 0.0019505373078198577, + "loss": 0.1357, + 
"step": 11073 + }, + { + "epoch": 0.09612763778092204, + "grad_norm": 0.11328125, + "learning_rate": 0.001950527576503667, + "loss": 0.1396, + "step": 11074 + }, + { + "epoch": 0.0961363182611262, + "grad_norm": 0.16796875, + "learning_rate": 0.0019505178442573519, + "loss": 0.1289, + "step": 11075 + }, + { + "epoch": 0.09614499874133037, + "grad_norm": 0.07568359375, + "learning_rate": 0.0019505081110809232, + "loss": 0.1006, + "step": 11076 + }, + { + "epoch": 0.09615367922153453, + "grad_norm": 0.47265625, + "learning_rate": 0.0019504983769743918, + "loss": 0.1299, + "step": 11077 + }, + { + "epoch": 0.0961623597017387, + "grad_norm": 0.10546875, + "learning_rate": 0.001950488641937768, + "loss": 0.1348, + "step": 11078 + }, + { + "epoch": 0.09617104018194286, + "grad_norm": 0.6015625, + "learning_rate": 0.0019504789059710628, + "loss": 0.1367, + "step": 11079 + }, + { + "epoch": 0.09617972066214703, + "grad_norm": 0.10009765625, + "learning_rate": 0.0019504691690742865, + "loss": 0.1592, + "step": 11080 + }, + { + "epoch": 0.0961884011423512, + "grad_norm": 0.1796875, + "learning_rate": 0.0019504594312474497, + "loss": 0.1523, + "step": 11081 + }, + { + "epoch": 0.09619708162255536, + "grad_norm": 0.2255859375, + "learning_rate": 0.0019504496924905637, + "loss": 0.1172, + "step": 11082 + }, + { + "epoch": 0.09620576210275952, + "grad_norm": 0.296875, + "learning_rate": 0.0019504399528036383, + "loss": 0.1357, + "step": 11083 + }, + { + "epoch": 0.09621444258296369, + "grad_norm": 0.07861328125, + "learning_rate": 0.0019504302121866848, + "loss": 0.0928, + "step": 11084 + }, + { + "epoch": 0.09622312306316785, + "grad_norm": 0.51953125, + "learning_rate": 0.0019504204706397132, + "loss": 0.0967, + "step": 11085 + }, + { + "epoch": 0.09623180354337202, + "grad_norm": 0.134765625, + "learning_rate": 0.0019504107281627353, + "loss": 0.126, + "step": 11086 + }, + { + "epoch": 0.09624048402357618, + "grad_norm": 0.62890625, + "learning_rate": 0.0019504009847557604, + "loss": 0.1309, + "step": 11087 + }, + { + "epoch": 0.09624916450378035, + "grad_norm": 0.271484375, + "learning_rate": 0.0019503912404188, + "loss": 0.1533, + "step": 11088 + }, + { + "epoch": 0.09625784498398451, + "grad_norm": 0.30078125, + "learning_rate": 0.0019503814951518643, + "loss": 0.1318, + "step": 11089 + }, + { + "epoch": 0.09626652546418868, + "grad_norm": 0.2265625, + "learning_rate": 0.0019503717489549645, + "loss": 0.0986, + "step": 11090 + }, + { + "epoch": 0.09627520594439284, + "grad_norm": 0.3125, + "learning_rate": 0.0019503620018281108, + "loss": 0.1138, + "step": 11091 + }, + { + "epoch": 0.09628388642459701, + "grad_norm": 0.298828125, + "learning_rate": 0.0019503522537713137, + "loss": 0.1162, + "step": 11092 + }, + { + "epoch": 0.09629256690480117, + "grad_norm": 0.33984375, + "learning_rate": 0.0019503425047845847, + "loss": 0.1191, + "step": 11093 + }, + { + "epoch": 0.09630124738500534, + "grad_norm": 0.41796875, + "learning_rate": 0.0019503327548679333, + "loss": 0.0962, + "step": 11094 + }, + { + "epoch": 0.0963099278652095, + "grad_norm": 0.3828125, + "learning_rate": 0.001950323004021371, + "loss": 0.1416, + "step": 11095 + }, + { + "epoch": 0.09631860834541367, + "grad_norm": 0.1640625, + "learning_rate": 0.0019503132522449085, + "loss": 0.1143, + "step": 11096 + }, + { + "epoch": 0.09632728882561784, + "grad_norm": 0.23828125, + "learning_rate": 0.001950303499538556, + "loss": 0.1543, + "step": 11097 + }, + { + "epoch": 0.096335969305822, + "grad_norm": 0.224609375, + "learning_rate": 
0.0019502937459023244, + "loss": 0.1084, + "step": 11098 + }, + { + "epoch": 0.09634464978602617, + "grad_norm": 0.10888671875, + "learning_rate": 0.0019502839913362244, + "loss": 0.1348, + "step": 11099 + }, + { + "epoch": 0.09635333026623033, + "grad_norm": 0.314453125, + "learning_rate": 0.0019502742358402663, + "loss": 0.1436, + "step": 11100 + }, + { + "epoch": 0.0963620107464345, + "grad_norm": 0.40234375, + "learning_rate": 0.0019502644794144612, + "loss": 0.1113, + "step": 11101 + }, + { + "epoch": 0.09637069122663866, + "grad_norm": 1.1015625, + "learning_rate": 0.0019502547220588199, + "loss": 0.1187, + "step": 11102 + }, + { + "epoch": 0.09637937170684283, + "grad_norm": 0.1015625, + "learning_rate": 0.0019502449637733526, + "loss": 0.1182, + "step": 11103 + }, + { + "epoch": 0.09638805218704699, + "grad_norm": 0.083984375, + "learning_rate": 0.0019502352045580702, + "loss": 0.127, + "step": 11104 + }, + { + "epoch": 0.09639673266725116, + "grad_norm": 0.07080078125, + "learning_rate": 0.0019502254444129833, + "loss": 0.124, + "step": 11105 + }, + { + "epoch": 0.09640541314745532, + "grad_norm": 0.1044921875, + "learning_rate": 0.0019502156833381028, + "loss": 0.1465, + "step": 11106 + }, + { + "epoch": 0.09641409362765949, + "grad_norm": 0.130859375, + "learning_rate": 0.001950205921333439, + "loss": 0.124, + "step": 11107 + }, + { + "epoch": 0.09642277410786365, + "grad_norm": 0.353515625, + "learning_rate": 0.001950196158399003, + "loss": 0.1475, + "step": 11108 + }, + { + "epoch": 0.09643145458806782, + "grad_norm": 0.453125, + "learning_rate": 0.001950186394534805, + "loss": 0.0972, + "step": 11109 + }, + { + "epoch": 0.09644013506827198, + "grad_norm": 0.283203125, + "learning_rate": 0.0019501766297408563, + "loss": 0.1182, + "step": 11110 + }, + { + "epoch": 0.09644881554847615, + "grad_norm": 0.50390625, + "learning_rate": 0.0019501668640171668, + "loss": 0.0894, + "step": 11111 + }, + { + "epoch": 0.09645749602868031, + "grad_norm": 0.12255859375, + "learning_rate": 0.0019501570973637478, + "loss": 0.1455, + "step": 11112 + }, + { + "epoch": 0.09646617650888448, + "grad_norm": 0.53515625, + "learning_rate": 0.00195014732978061, + "loss": 0.1016, + "step": 11113 + }, + { + "epoch": 0.09647485698908864, + "grad_norm": 0.53515625, + "learning_rate": 0.0019501375612677634, + "loss": 0.0986, + "step": 11114 + }, + { + "epoch": 0.09648353746929281, + "grad_norm": 1.7265625, + "learning_rate": 0.0019501277918252194, + "loss": 0.1592, + "step": 11115 + }, + { + "epoch": 0.09649221794949697, + "grad_norm": 0.08349609375, + "learning_rate": 0.0019501180214529883, + "loss": 0.126, + "step": 11116 + }, + { + "epoch": 0.09650089842970114, + "grad_norm": 0.412109375, + "learning_rate": 0.0019501082501510811, + "loss": 0.1309, + "step": 11117 + }, + { + "epoch": 0.0965095789099053, + "grad_norm": 0.5703125, + "learning_rate": 0.0019500984779195082, + "loss": 0.105, + "step": 11118 + }, + { + "epoch": 0.09651825939010947, + "grad_norm": 0.20703125, + "learning_rate": 0.00195008870475828, + "loss": 0.1084, + "step": 11119 + }, + { + "epoch": 0.09652693987031362, + "grad_norm": 0.26953125, + "learning_rate": 0.0019500789306674082, + "loss": 0.1309, + "step": 11120 + }, + { + "epoch": 0.09653562035051778, + "grad_norm": 0.275390625, + "learning_rate": 0.0019500691556469023, + "loss": 0.125, + "step": 11121 + }, + { + "epoch": 0.09654430083072195, + "grad_norm": 0.1748046875, + "learning_rate": 0.001950059379696774, + "loss": 0.0957, + "step": 11122 + }, + { + "epoch": 0.09655298131092611, 
+ "grad_norm": 0.45703125, + "learning_rate": 0.0019500496028170332, + "loss": 0.1172, + "step": 11123 + }, + { + "epoch": 0.09656166179113028, + "grad_norm": 0.46875, + "learning_rate": 0.0019500398250076913, + "loss": 0.1211, + "step": 11124 + }, + { + "epoch": 0.09657034227133444, + "grad_norm": 0.75390625, + "learning_rate": 0.0019500300462687583, + "loss": 0.1426, + "step": 11125 + }, + { + "epoch": 0.09657902275153861, + "grad_norm": 0.2080078125, + "learning_rate": 0.0019500202666002454, + "loss": 0.1025, + "step": 11126 + }, + { + "epoch": 0.09658770323174277, + "grad_norm": 0.4140625, + "learning_rate": 0.0019500104860021628, + "loss": 0.0991, + "step": 11127 + }, + { + "epoch": 0.09659638371194694, + "grad_norm": 0.48046875, + "learning_rate": 0.0019500007044745218, + "loss": 0.1279, + "step": 11128 + }, + { + "epoch": 0.0966050641921511, + "grad_norm": 0.283203125, + "learning_rate": 0.0019499909220173327, + "loss": 0.0708, + "step": 11129 + }, + { + "epoch": 0.09661374467235527, + "grad_norm": 0.77734375, + "learning_rate": 0.0019499811386306063, + "loss": 0.1157, + "step": 11130 + }, + { + "epoch": 0.09662242515255944, + "grad_norm": 0.185546875, + "learning_rate": 0.0019499713543143532, + "loss": 0.1011, + "step": 11131 + }, + { + "epoch": 0.0966311056327636, + "grad_norm": 0.1435546875, + "learning_rate": 0.0019499615690685844, + "loss": 0.1523, + "step": 11132 + }, + { + "epoch": 0.09663978611296777, + "grad_norm": 0.2890625, + "learning_rate": 0.0019499517828933102, + "loss": 0.125, + "step": 11133 + }, + { + "epoch": 0.09664846659317193, + "grad_norm": 0.3359375, + "learning_rate": 0.0019499419957885416, + "loss": 0.1416, + "step": 11134 + }, + { + "epoch": 0.0966571470733761, + "grad_norm": 1.2265625, + "learning_rate": 0.0019499322077542892, + "loss": 0.1562, + "step": 11135 + }, + { + "epoch": 0.09666582755358026, + "grad_norm": 0.349609375, + "learning_rate": 0.001949922418790564, + "loss": 0.1006, + "step": 11136 + }, + { + "epoch": 0.09667450803378443, + "grad_norm": 0.1962890625, + "learning_rate": 0.0019499126288973758, + "loss": 0.0859, + "step": 11137 + }, + { + "epoch": 0.09668318851398859, + "grad_norm": 0.376953125, + "learning_rate": 0.0019499028380747364, + "loss": 0.0962, + "step": 11138 + }, + { + "epoch": 0.09669186899419276, + "grad_norm": 0.23828125, + "learning_rate": 0.0019498930463226558, + "loss": 0.1006, + "step": 11139 + }, + { + "epoch": 0.09670054947439692, + "grad_norm": 0.1083984375, + "learning_rate": 0.001949883253641145, + "loss": 0.083, + "step": 11140 + }, + { + "epoch": 0.09670922995460109, + "grad_norm": 1.6015625, + "learning_rate": 0.0019498734600302146, + "loss": 0.1035, + "step": 11141 + }, + { + "epoch": 0.09671791043480525, + "grad_norm": 0.078125, + "learning_rate": 0.0019498636654898755, + "loss": 0.1191, + "step": 11142 + }, + { + "epoch": 0.09672659091500942, + "grad_norm": 0.58203125, + "learning_rate": 0.001949853870020138, + "loss": 0.0747, + "step": 11143 + }, + { + "epoch": 0.09673527139521358, + "grad_norm": 0.482421875, + "learning_rate": 0.0019498440736210132, + "loss": 0.1074, + "step": 11144 + }, + { + "epoch": 0.09674395187541775, + "grad_norm": 0.255859375, + "learning_rate": 0.001949834276292512, + "loss": 0.1016, + "step": 11145 + }, + { + "epoch": 0.09675263235562191, + "grad_norm": 0.10791015625, + "learning_rate": 0.0019498244780346446, + "loss": 0.0957, + "step": 11146 + }, + { + "epoch": 0.09676131283582608, + "grad_norm": 1.09375, + "learning_rate": 0.0019498146788474217, + "loss": 0.1406, + "step": 11147 
+ }, + { + "epoch": 0.09676999331603024, + "grad_norm": 0.5078125, + "learning_rate": 0.0019498048787308542, + "loss": 0.1611, + "step": 11148 + }, + { + "epoch": 0.09677867379623441, + "grad_norm": 0.33984375, + "learning_rate": 0.0019497950776849533, + "loss": 0.1406, + "step": 11149 + }, + { + "epoch": 0.09678735427643857, + "grad_norm": 0.447265625, + "learning_rate": 0.0019497852757097288, + "loss": 0.1328, + "step": 11150 + }, + { + "epoch": 0.09679603475664274, + "grad_norm": 0.255859375, + "learning_rate": 0.001949775472805192, + "loss": 0.126, + "step": 11151 + }, + { + "epoch": 0.0968047152368469, + "grad_norm": 0.44140625, + "learning_rate": 0.0019497656689713538, + "loss": 0.1235, + "step": 11152 + }, + { + "epoch": 0.09681339571705107, + "grad_norm": 0.89453125, + "learning_rate": 0.0019497558642082246, + "loss": 0.1348, + "step": 11153 + }, + { + "epoch": 0.09682207619725523, + "grad_norm": 0.341796875, + "learning_rate": 0.0019497460585158146, + "loss": 0.1357, + "step": 11154 + }, + { + "epoch": 0.0968307566774594, + "grad_norm": 0.28125, + "learning_rate": 0.0019497362518941356, + "loss": 0.1167, + "step": 11155 + }, + { + "epoch": 0.09683943715766356, + "grad_norm": 0.185546875, + "learning_rate": 0.0019497264443431978, + "loss": 0.0977, + "step": 11156 + }, + { + "epoch": 0.09684811763786773, + "grad_norm": 0.248046875, + "learning_rate": 0.0019497166358630118, + "loss": 0.125, + "step": 11157 + }, + { + "epoch": 0.0968567981180719, + "grad_norm": 0.34375, + "learning_rate": 0.0019497068264535883, + "loss": 0.1074, + "step": 11158 + }, + { + "epoch": 0.09686547859827606, + "grad_norm": 0.96875, + "learning_rate": 0.0019496970161149384, + "loss": 0.1553, + "step": 11159 + }, + { + "epoch": 0.09687415907848022, + "grad_norm": 0.1904296875, + "learning_rate": 0.0019496872048470725, + "loss": 0.1167, + "step": 11160 + }, + { + "epoch": 0.09688283955868439, + "grad_norm": 0.158203125, + "learning_rate": 0.0019496773926500016, + "loss": 0.1406, + "step": 11161 + }, + { + "epoch": 0.09689152003888855, + "grad_norm": 0.0986328125, + "learning_rate": 0.0019496675795237363, + "loss": 0.1206, + "step": 11162 + }, + { + "epoch": 0.09690020051909272, + "grad_norm": 0.7421875, + "learning_rate": 0.0019496577654682873, + "loss": 0.1387, + "step": 11163 + }, + { + "epoch": 0.09690888099929688, + "grad_norm": 0.10302734375, + "learning_rate": 0.0019496479504836647, + "loss": 0.0942, + "step": 11164 + }, + { + "epoch": 0.09691756147950105, + "grad_norm": 0.35546875, + "learning_rate": 0.0019496381345698807, + "loss": 0.1025, + "step": 11165 + }, + { + "epoch": 0.09692624195970521, + "grad_norm": 0.0986328125, + "learning_rate": 0.001949628317726945, + "loss": 0.125, + "step": 11166 + }, + { + "epoch": 0.09693492243990938, + "grad_norm": 0.7265625, + "learning_rate": 0.0019496184999548683, + "loss": 0.1187, + "step": 11167 + }, + { + "epoch": 0.09694360292011354, + "grad_norm": 0.77734375, + "learning_rate": 0.0019496086812536616, + "loss": 0.125, + "step": 11168 + }, + { + "epoch": 0.09695228340031771, + "grad_norm": 0.7578125, + "learning_rate": 0.0019495988616233355, + "loss": 0.1484, + "step": 11169 + }, + { + "epoch": 0.09696096388052187, + "grad_norm": 0.158203125, + "learning_rate": 0.0019495890410639012, + "loss": 0.1001, + "step": 11170 + }, + { + "epoch": 0.09696964436072604, + "grad_norm": 0.85546875, + "learning_rate": 0.001949579219575369, + "loss": 0.1348, + "step": 11171 + }, + { + "epoch": 0.0969783248409302, + "grad_norm": 0.546875, + "learning_rate": 
0.0019495693971577498, + "loss": 0.1162, + "step": 11172 + }, + { + "epoch": 0.09698700532113437, + "grad_norm": 0.13671875, + "learning_rate": 0.001949559573811054, + "loss": 0.0942, + "step": 11173 + }, + { + "epoch": 0.09699568580133854, + "grad_norm": 0.310546875, + "learning_rate": 0.0019495497495352926, + "loss": 0.1416, + "step": 11174 + }, + { + "epoch": 0.0970043662815427, + "grad_norm": 0.205078125, + "learning_rate": 0.0019495399243304767, + "loss": 0.1445, + "step": 11175 + }, + { + "epoch": 0.09701304676174687, + "grad_norm": 0.55078125, + "learning_rate": 0.0019495300981966165, + "loss": 0.1094, + "step": 11176 + }, + { + "epoch": 0.09702172724195103, + "grad_norm": 0.84765625, + "learning_rate": 0.001949520271133723, + "loss": 0.1006, + "step": 11177 + }, + { + "epoch": 0.0970304077221552, + "grad_norm": 0.2119140625, + "learning_rate": 0.0019495104431418067, + "loss": 0.1328, + "step": 11178 + }, + { + "epoch": 0.09703908820235936, + "grad_norm": 0.84375, + "learning_rate": 0.0019495006142208786, + "loss": 0.1777, + "step": 11179 + }, + { + "epoch": 0.09704776868256353, + "grad_norm": 0.26953125, + "learning_rate": 0.0019494907843709496, + "loss": 0.1172, + "step": 11180 + }, + { + "epoch": 0.09705644916276769, + "grad_norm": 0.1025390625, + "learning_rate": 0.0019494809535920302, + "loss": 0.1094, + "step": 11181 + }, + { + "epoch": 0.09706512964297184, + "grad_norm": 0.0859375, + "learning_rate": 0.001949471121884131, + "loss": 0.1143, + "step": 11182 + }, + { + "epoch": 0.09707381012317601, + "grad_norm": 1.0546875, + "learning_rate": 0.001949461289247263, + "loss": 0.1533, + "step": 11183 + }, + { + "epoch": 0.09708249060338017, + "grad_norm": 1.21875, + "learning_rate": 0.0019494514556814369, + "loss": 0.1133, + "step": 11184 + }, + { + "epoch": 0.09709117108358434, + "grad_norm": 0.16015625, + "learning_rate": 0.0019494416211866636, + "loss": 0.1396, + "step": 11185 + }, + { + "epoch": 0.0970998515637885, + "grad_norm": 0.0869140625, + "learning_rate": 0.0019494317857629536, + "loss": 0.1689, + "step": 11186 + }, + { + "epoch": 0.09710853204399267, + "grad_norm": 0.29296875, + "learning_rate": 0.0019494219494103177, + "loss": 0.1855, + "step": 11187 + }, + { + "epoch": 0.09711721252419683, + "grad_norm": 0.3671875, + "learning_rate": 0.0019494121121287672, + "loss": 0.126, + "step": 11188 + }, + { + "epoch": 0.097125893004401, + "grad_norm": 0.2265625, + "learning_rate": 0.0019494022739183117, + "loss": 0.0801, + "step": 11189 + }, + { + "epoch": 0.09713457348460516, + "grad_norm": 0.36328125, + "learning_rate": 0.001949392434778963, + "loss": 0.1055, + "step": 11190 + }, + { + "epoch": 0.09714325396480933, + "grad_norm": 0.177734375, + "learning_rate": 0.0019493825947107313, + "loss": 0.1069, + "step": 11191 + }, + { + "epoch": 0.0971519344450135, + "grad_norm": 0.361328125, + "learning_rate": 0.0019493727537136277, + "loss": 0.0781, + "step": 11192 + }, + { + "epoch": 0.09716061492521766, + "grad_norm": 0.16796875, + "learning_rate": 0.0019493629117876626, + "loss": 0.0825, + "step": 11193 + }, + { + "epoch": 0.09716929540542182, + "grad_norm": 0.82421875, + "learning_rate": 0.0019493530689328473, + "loss": 0.1338, + "step": 11194 + }, + { + "epoch": 0.09717797588562599, + "grad_norm": 0.349609375, + "learning_rate": 0.0019493432251491921, + "loss": 0.1113, + "step": 11195 + }, + { + "epoch": 0.09718665636583015, + "grad_norm": 0.150390625, + "learning_rate": 0.0019493333804367079, + "loss": 0.1104, + "step": 11196 + }, + { + "epoch": 0.09719533684603432, + 
"grad_norm": 0.337890625, + "learning_rate": 0.0019493235347954054, + "loss": 0.127, + "step": 11197 + }, + { + "epoch": 0.09720401732623848, + "grad_norm": 0.392578125, + "learning_rate": 0.0019493136882252958, + "loss": 0.1426, + "step": 11198 + }, + { + "epoch": 0.09721269780644265, + "grad_norm": 0.41796875, + "learning_rate": 0.001949303840726389, + "loss": 0.1172, + "step": 11199 + }, + { + "epoch": 0.09722137828664681, + "grad_norm": 0.388671875, + "learning_rate": 0.0019492939922986965, + "loss": 0.0986, + "step": 11200 + }, + { + "epoch": 0.09723005876685098, + "grad_norm": 0.0869140625, + "learning_rate": 0.001949284142942229, + "loss": 0.125, + "step": 11201 + }, + { + "epoch": 0.09723873924705514, + "grad_norm": 0.103515625, + "learning_rate": 0.001949274292656997, + "loss": 0.1738, + "step": 11202 + }, + { + "epoch": 0.09724741972725931, + "grad_norm": 0.365234375, + "learning_rate": 0.0019492644414430117, + "loss": 0.1211, + "step": 11203 + }, + { + "epoch": 0.09725610020746348, + "grad_norm": 0.12060546875, + "learning_rate": 0.0019492545893002831, + "loss": 0.1387, + "step": 11204 + }, + { + "epoch": 0.09726478068766764, + "grad_norm": 0.30078125, + "learning_rate": 0.0019492447362288224, + "loss": 0.1455, + "step": 11205 + }, + { + "epoch": 0.0972734611678718, + "grad_norm": 0.55859375, + "learning_rate": 0.0019492348822286406, + "loss": 0.1045, + "step": 11206 + }, + { + "epoch": 0.09728214164807597, + "grad_norm": 0.66015625, + "learning_rate": 0.0019492250272997486, + "loss": 0.1465, + "step": 11207 + }, + { + "epoch": 0.09729082212828014, + "grad_norm": 0.380859375, + "learning_rate": 0.0019492151714421565, + "loss": 0.1367, + "step": 11208 + }, + { + "epoch": 0.0972995026084843, + "grad_norm": 0.9609375, + "learning_rate": 0.0019492053146558752, + "loss": 0.0884, + "step": 11209 + }, + { + "epoch": 0.09730818308868847, + "grad_norm": 0.1396484375, + "learning_rate": 0.0019491954569409162, + "loss": 0.1406, + "step": 11210 + }, + { + "epoch": 0.09731686356889263, + "grad_norm": 0.43359375, + "learning_rate": 0.0019491855982972897, + "loss": 0.168, + "step": 11211 + }, + { + "epoch": 0.0973255440490968, + "grad_norm": 0.26953125, + "learning_rate": 0.0019491757387250066, + "loss": 0.1318, + "step": 11212 + }, + { + "epoch": 0.09733422452930096, + "grad_norm": 0.421875, + "learning_rate": 0.0019491658782240775, + "loss": 0.1289, + "step": 11213 + }, + { + "epoch": 0.09734290500950513, + "grad_norm": 0.1328125, + "learning_rate": 0.0019491560167945138, + "loss": 0.0869, + "step": 11214 + }, + { + "epoch": 0.09735158548970929, + "grad_norm": 0.63671875, + "learning_rate": 0.0019491461544363252, + "loss": 0.123, + "step": 11215 + }, + { + "epoch": 0.09736026596991346, + "grad_norm": 0.54296875, + "learning_rate": 0.0019491362911495233, + "loss": 0.1572, + "step": 11216 + }, + { + "epoch": 0.09736894645011762, + "grad_norm": 0.34375, + "learning_rate": 0.001949126426934119, + "loss": 0.0967, + "step": 11217 + }, + { + "epoch": 0.09737762693032179, + "grad_norm": 0.1748046875, + "learning_rate": 0.0019491165617901228, + "loss": 0.0889, + "step": 11218 + }, + { + "epoch": 0.09738630741052595, + "grad_norm": 0.55859375, + "learning_rate": 0.0019491066957175451, + "loss": 0.1211, + "step": 11219 + }, + { + "epoch": 0.09739498789073012, + "grad_norm": 0.474609375, + "learning_rate": 0.0019490968287163976, + "loss": 0.1143, + "step": 11220 + }, + { + "epoch": 0.09740366837093428, + "grad_norm": 0.54296875, + "learning_rate": 0.00194908696078669, + "loss": 0.1074, + "step": 11221 + 
}, + { + "epoch": 0.09741234885113845, + "grad_norm": 0.34765625, + "learning_rate": 0.001949077091928434, + "loss": 0.1777, + "step": 11222 + }, + { + "epoch": 0.09742102933134261, + "grad_norm": 0.10400390625, + "learning_rate": 0.0019490672221416398, + "loss": 0.1445, + "step": 11223 + }, + { + "epoch": 0.09742970981154678, + "grad_norm": 0.13671875, + "learning_rate": 0.0019490573514263187, + "loss": 0.1357, + "step": 11224 + }, + { + "epoch": 0.09743839029175094, + "grad_norm": 0.63671875, + "learning_rate": 0.0019490474797824813, + "loss": 0.1299, + "step": 11225 + }, + { + "epoch": 0.09744707077195511, + "grad_norm": 0.244140625, + "learning_rate": 0.001949037607210138, + "loss": 0.1025, + "step": 11226 + }, + { + "epoch": 0.09745575125215927, + "grad_norm": 0.3125, + "learning_rate": 0.0019490277337093002, + "loss": 0.1016, + "step": 11227 + }, + { + "epoch": 0.09746443173236344, + "grad_norm": 0.6484375, + "learning_rate": 0.0019490178592799785, + "loss": 0.1367, + "step": 11228 + }, + { + "epoch": 0.0974731122125676, + "grad_norm": 0.546875, + "learning_rate": 0.0019490079839221835, + "loss": 0.1816, + "step": 11229 + }, + { + "epoch": 0.09748179269277177, + "grad_norm": 0.10498046875, + "learning_rate": 0.001948998107635926, + "loss": 0.1089, + "step": 11230 + }, + { + "epoch": 0.09749047317297593, + "grad_norm": 1.6171875, + "learning_rate": 0.001948988230421217, + "loss": 0.1133, + "step": 11231 + }, + { + "epoch": 0.0974991536531801, + "grad_norm": 0.19140625, + "learning_rate": 0.0019489783522780675, + "loss": 0.0967, + "step": 11232 + }, + { + "epoch": 0.09750783413338426, + "grad_norm": 0.82421875, + "learning_rate": 0.0019489684732064876, + "loss": 0.1377, + "step": 11233 + }, + { + "epoch": 0.09751651461358843, + "grad_norm": 0.302734375, + "learning_rate": 0.0019489585932064887, + "loss": 0.1602, + "step": 11234 + }, + { + "epoch": 0.0975251950937926, + "grad_norm": 0.1884765625, + "learning_rate": 0.0019489487122780814, + "loss": 0.1064, + "step": 11235 + }, + { + "epoch": 0.09753387557399676, + "grad_norm": 0.400390625, + "learning_rate": 0.0019489388304212766, + "loss": 0.1187, + "step": 11236 + }, + { + "epoch": 0.09754255605420092, + "grad_norm": 0.408203125, + "learning_rate": 0.0019489289476360852, + "loss": 0.1191, + "step": 11237 + }, + { + "epoch": 0.09755123653440509, + "grad_norm": 0.66015625, + "learning_rate": 0.0019489190639225176, + "loss": 0.1426, + "step": 11238 + }, + { + "epoch": 0.09755991701460925, + "grad_norm": 0.451171875, + "learning_rate": 0.0019489091792805851, + "loss": 0.1377, + "step": 11239 + }, + { + "epoch": 0.09756859749481342, + "grad_norm": 0.462890625, + "learning_rate": 0.001948899293710298, + "loss": 0.1387, + "step": 11240 + }, + { + "epoch": 0.09757727797501758, + "grad_norm": 1.265625, + "learning_rate": 0.0019488894072116677, + "loss": 0.1582, + "step": 11241 + }, + { + "epoch": 0.09758595845522175, + "grad_norm": 0.8515625, + "learning_rate": 0.0019488795197847044, + "loss": 0.1484, + "step": 11242 + }, + { + "epoch": 0.0975946389354259, + "grad_norm": 0.197265625, + "learning_rate": 0.0019488696314294194, + "loss": 0.1084, + "step": 11243 + }, + { + "epoch": 0.09760331941563007, + "grad_norm": 0.49609375, + "learning_rate": 0.0019488597421458231, + "loss": 0.125, + "step": 11244 + }, + { + "epoch": 0.09761199989583423, + "grad_norm": 0.263671875, + "learning_rate": 0.0019488498519339267, + "loss": 0.1387, + "step": 11245 + }, + { + "epoch": 0.0976206803760384, + "grad_norm": 0.107421875, + "learning_rate": 
0.001948839960793741, + "loss": 0.1592, + "step": 11246 + }, + { + "epoch": 0.09762936085624256, + "grad_norm": 0.25390625, + "learning_rate": 0.0019488300687252765, + "loss": 0.1611, + "step": 11247 + }, + { + "epoch": 0.09763804133644673, + "grad_norm": 0.1806640625, + "learning_rate": 0.0019488201757285441, + "loss": 0.1191, + "step": 11248 + }, + { + "epoch": 0.09764672181665089, + "grad_norm": 0.52734375, + "learning_rate": 0.001948810281803555, + "loss": 0.1641, + "step": 11249 + }, + { + "epoch": 0.09765540229685506, + "grad_norm": 0.1923828125, + "learning_rate": 0.0019488003869503198, + "loss": 0.105, + "step": 11250 + }, + { + "epoch": 0.09766408277705922, + "grad_norm": 0.50390625, + "learning_rate": 0.0019487904911688488, + "loss": 0.0972, + "step": 11251 + }, + { + "epoch": 0.09767276325726339, + "grad_norm": 0.486328125, + "learning_rate": 0.0019487805944591536, + "loss": 0.0903, + "step": 11252 + }, + { + "epoch": 0.09768144373746755, + "grad_norm": 0.236328125, + "learning_rate": 0.0019487706968212444, + "loss": 0.1348, + "step": 11253 + }, + { + "epoch": 0.09769012421767172, + "grad_norm": 0.19140625, + "learning_rate": 0.0019487607982551325, + "loss": 0.0996, + "step": 11254 + }, + { + "epoch": 0.09769880469787588, + "grad_norm": 0.34765625, + "learning_rate": 0.0019487508987608287, + "loss": 0.1504, + "step": 11255 + }, + { + "epoch": 0.09770748517808005, + "grad_norm": 0.275390625, + "learning_rate": 0.0019487409983383434, + "loss": 0.1445, + "step": 11256 + }, + { + "epoch": 0.09771616565828421, + "grad_norm": 0.2294921875, + "learning_rate": 0.0019487310969876876, + "loss": 0.1211, + "step": 11257 + }, + { + "epoch": 0.09772484613848838, + "grad_norm": 0.875, + "learning_rate": 0.0019487211947088728, + "loss": 0.127, + "step": 11258 + }, + { + "epoch": 0.09773352661869254, + "grad_norm": 0.484375, + "learning_rate": 0.0019487112915019087, + "loss": 0.1074, + "step": 11259 + }, + { + "epoch": 0.09774220709889671, + "grad_norm": 0.4375, + "learning_rate": 0.001948701387366807, + "loss": 0.1445, + "step": 11260 + }, + { + "epoch": 0.09775088757910087, + "grad_norm": 0.083984375, + "learning_rate": 0.0019486914823035782, + "loss": 0.1455, + "step": 11261 + }, + { + "epoch": 0.09775956805930504, + "grad_norm": 0.10205078125, + "learning_rate": 0.001948681576312233, + "loss": 0.1064, + "step": 11262 + }, + { + "epoch": 0.0977682485395092, + "grad_norm": 0.671875, + "learning_rate": 0.0019486716693927824, + "loss": 0.0859, + "step": 11263 + }, + { + "epoch": 0.09777692901971337, + "grad_norm": 0.365234375, + "learning_rate": 0.0019486617615452372, + "loss": 0.1196, + "step": 11264 + }, + { + "epoch": 0.09778560949991753, + "grad_norm": 0.380859375, + "learning_rate": 0.0019486518527696082, + "loss": 0.1045, + "step": 11265 + }, + { + "epoch": 0.0977942899801217, + "grad_norm": 0.73046875, + "learning_rate": 0.0019486419430659063, + "loss": 0.1475, + "step": 11266 + }, + { + "epoch": 0.09780297046032586, + "grad_norm": 0.91015625, + "learning_rate": 0.0019486320324341427, + "loss": 0.1147, + "step": 11267 + }, + { + "epoch": 0.09781165094053003, + "grad_norm": 0.5703125, + "learning_rate": 0.0019486221208743274, + "loss": 0.1641, + "step": 11268 + }, + { + "epoch": 0.0978203314207342, + "grad_norm": 0.126953125, + "learning_rate": 0.0019486122083864717, + "loss": 0.1475, + "step": 11269 + }, + { + "epoch": 0.09782901190093836, + "grad_norm": 0.103515625, + "learning_rate": 0.0019486022949705866, + "loss": 0.1504, + "step": 11270 + }, + { + "epoch": 0.09783769238114252, + 
"grad_norm": 0.25, + "learning_rate": 0.0019485923806266828, + "loss": 0.126, + "step": 11271 + }, + { + "epoch": 0.09784637286134669, + "grad_norm": 0.11181640625, + "learning_rate": 0.001948582465354771, + "loss": 0.1289, + "step": 11272 + }, + { + "epoch": 0.09785505334155085, + "grad_norm": 0.478515625, + "learning_rate": 0.0019485725491548624, + "loss": 0.1162, + "step": 11273 + }, + { + "epoch": 0.09786373382175502, + "grad_norm": 0.2890625, + "learning_rate": 0.0019485626320269675, + "loss": 0.1084, + "step": 11274 + }, + { + "epoch": 0.09787241430195918, + "grad_norm": 0.17578125, + "learning_rate": 0.001948552713971097, + "loss": 0.1289, + "step": 11275 + }, + { + "epoch": 0.09788109478216335, + "grad_norm": 0.2001953125, + "learning_rate": 0.0019485427949872624, + "loss": 0.1328, + "step": 11276 + }, + { + "epoch": 0.09788977526236751, + "grad_norm": 0.412109375, + "learning_rate": 0.0019485328750754739, + "loss": 0.1133, + "step": 11277 + }, + { + "epoch": 0.09789845574257168, + "grad_norm": 0.466796875, + "learning_rate": 0.0019485229542357426, + "loss": 0.1123, + "step": 11278 + }, + { + "epoch": 0.09790713622277585, + "grad_norm": 0.0849609375, + "learning_rate": 0.0019485130324680795, + "loss": 0.1162, + "step": 11279 + }, + { + "epoch": 0.09791581670298001, + "grad_norm": 0.09033203125, + "learning_rate": 0.001948503109772495, + "loss": 0.1045, + "step": 11280 + }, + { + "epoch": 0.09792449718318418, + "grad_norm": 0.09130859375, + "learning_rate": 0.0019484931861490004, + "loss": 0.0967, + "step": 11281 + }, + { + "epoch": 0.09793317766338834, + "grad_norm": 0.5859375, + "learning_rate": 0.0019484832615976066, + "loss": 0.125, + "step": 11282 + }, + { + "epoch": 0.0979418581435925, + "grad_norm": 0.263671875, + "learning_rate": 0.0019484733361183242, + "loss": 0.1069, + "step": 11283 + }, + { + "epoch": 0.09795053862379667, + "grad_norm": 0.2099609375, + "learning_rate": 0.001948463409711164, + "loss": 0.1299, + "step": 11284 + }, + { + "epoch": 0.09795921910400084, + "grad_norm": 0.8828125, + "learning_rate": 0.001948453482376137, + "loss": 0.125, + "step": 11285 + }, + { + "epoch": 0.097967899584205, + "grad_norm": 0.197265625, + "learning_rate": 0.0019484435541132542, + "loss": 0.1113, + "step": 11286 + }, + { + "epoch": 0.09797658006440917, + "grad_norm": 0.11328125, + "learning_rate": 0.001948433624922526, + "loss": 0.1055, + "step": 11287 + }, + { + "epoch": 0.09798526054461333, + "grad_norm": 0.318359375, + "learning_rate": 0.0019484236948039636, + "loss": 0.125, + "step": 11288 + }, + { + "epoch": 0.0979939410248175, + "grad_norm": 0.61328125, + "learning_rate": 0.0019484137637575778, + "loss": 0.1221, + "step": 11289 + }, + { + "epoch": 0.09800262150502166, + "grad_norm": 0.5078125, + "learning_rate": 0.0019484038317833792, + "loss": 0.1152, + "step": 11290 + }, + { + "epoch": 0.09801130198522583, + "grad_norm": 0.126953125, + "learning_rate": 0.0019483938988813794, + "loss": 0.1147, + "step": 11291 + }, + { + "epoch": 0.09801998246542999, + "grad_norm": 0.41015625, + "learning_rate": 0.0019483839650515885, + "loss": 0.0947, + "step": 11292 + }, + { + "epoch": 0.09802866294563416, + "grad_norm": 0.1025390625, + "learning_rate": 0.0019483740302940176, + "loss": 0.1011, + "step": 11293 + }, + { + "epoch": 0.09803734342583832, + "grad_norm": 0.2080078125, + "learning_rate": 0.0019483640946086778, + "loss": 0.1377, + "step": 11294 + }, + { + "epoch": 0.09804602390604249, + "grad_norm": 0.390625, + "learning_rate": 0.0019483541579955797, + "loss": 0.1201, + "step": 
11295 + }, + { + "epoch": 0.09805470438624665, + "grad_norm": 0.1630859375, + "learning_rate": 0.0019483442204547342, + "loss": 0.168, + "step": 11296 + }, + { + "epoch": 0.09806338486645082, + "grad_norm": 0.6484375, + "learning_rate": 0.001948334281986152, + "loss": 0.1191, + "step": 11297 + }, + { + "epoch": 0.09807206534665498, + "grad_norm": 0.189453125, + "learning_rate": 0.0019483243425898445, + "loss": 0.1152, + "step": 11298 + }, + { + "epoch": 0.09808074582685915, + "grad_norm": 0.08984375, + "learning_rate": 0.001948314402265822, + "loss": 0.1221, + "step": 11299 + }, + { + "epoch": 0.09808942630706331, + "grad_norm": 0.1630859375, + "learning_rate": 0.0019483044610140961, + "loss": 0.1289, + "step": 11300 + }, + { + "epoch": 0.09809810678726748, + "grad_norm": 0.2060546875, + "learning_rate": 0.0019482945188346765, + "loss": 0.1406, + "step": 11301 + }, + { + "epoch": 0.09810678726747164, + "grad_norm": 0.546875, + "learning_rate": 0.0019482845757275754, + "loss": 0.1338, + "step": 11302 + }, + { + "epoch": 0.09811546774767581, + "grad_norm": 0.26953125, + "learning_rate": 0.0019482746316928025, + "loss": 0.0747, + "step": 11303 + }, + { + "epoch": 0.09812414822787997, + "grad_norm": 0.64453125, + "learning_rate": 0.0019482646867303697, + "loss": 0.1162, + "step": 11304 + }, + { + "epoch": 0.09813282870808412, + "grad_norm": 0.81640625, + "learning_rate": 0.001948254740840287, + "loss": 0.1128, + "step": 11305 + }, + { + "epoch": 0.09814150918828829, + "grad_norm": 0.455078125, + "learning_rate": 0.001948244794022566, + "loss": 0.0854, + "step": 11306 + }, + { + "epoch": 0.09815018966849245, + "grad_norm": 0.435546875, + "learning_rate": 0.001948234846277217, + "loss": 0.1289, + "step": 11307 + }, + { + "epoch": 0.09815887014869662, + "grad_norm": 0.11474609375, + "learning_rate": 0.001948224897604251, + "loss": 0.126, + "step": 11308 + }, + { + "epoch": 0.09816755062890078, + "grad_norm": 0.62109375, + "learning_rate": 0.0019482149480036792, + "loss": 0.0962, + "step": 11309 + }, + { + "epoch": 0.09817623110910495, + "grad_norm": 0.78515625, + "learning_rate": 0.0019482049974755124, + "loss": 0.1553, + "step": 11310 + }, + { + "epoch": 0.09818491158930912, + "grad_norm": 0.45703125, + "learning_rate": 0.0019481950460197611, + "loss": 0.123, + "step": 11311 + }, + { + "epoch": 0.09819359206951328, + "grad_norm": 0.125, + "learning_rate": 0.0019481850936364366, + "loss": 0.1123, + "step": 11312 + }, + { + "epoch": 0.09820227254971745, + "grad_norm": 0.390625, + "learning_rate": 0.0019481751403255495, + "loss": 0.125, + "step": 11313 + }, + { + "epoch": 0.09821095302992161, + "grad_norm": 0.75, + "learning_rate": 0.0019481651860871107, + "loss": 0.1196, + "step": 11314 + }, + { + "epoch": 0.09821963351012578, + "grad_norm": 0.09130859375, + "learning_rate": 0.0019481552309211314, + "loss": 0.1426, + "step": 11315 + }, + { + "epoch": 0.09822831399032994, + "grad_norm": 1.0, + "learning_rate": 0.0019481452748276223, + "loss": 0.1377, + "step": 11316 + }, + { + "epoch": 0.0982369944705341, + "grad_norm": 0.087890625, + "learning_rate": 0.0019481353178065941, + "loss": 0.1162, + "step": 11317 + }, + { + "epoch": 0.09824567495073827, + "grad_norm": 0.86328125, + "learning_rate": 0.0019481253598580583, + "loss": 0.1064, + "step": 11318 + }, + { + "epoch": 0.09825435543094244, + "grad_norm": 1.1796875, + "learning_rate": 0.0019481154009820245, + "loss": 0.125, + "step": 11319 + }, + { + "epoch": 0.0982630359111466, + "grad_norm": 0.361328125, + "learning_rate": 0.001948105441178505, + 
"loss": 0.1445, + "step": 11320 + }, + { + "epoch": 0.09827171639135077, + "grad_norm": 0.181640625, + "learning_rate": 0.0019480954804475105, + "loss": 0.1104, + "step": 11321 + }, + { + "epoch": 0.09828039687155493, + "grad_norm": 0.76953125, + "learning_rate": 0.001948085518789051, + "loss": 0.1582, + "step": 11322 + }, + { + "epoch": 0.0982890773517591, + "grad_norm": 0.51953125, + "learning_rate": 0.001948075556203138, + "loss": 0.1123, + "step": 11323 + }, + { + "epoch": 0.09829775783196326, + "grad_norm": 1.640625, + "learning_rate": 0.0019480655926897824, + "loss": 0.1172, + "step": 11324 + }, + { + "epoch": 0.09830643831216743, + "grad_norm": 0.1416015625, + "learning_rate": 0.0019480556282489947, + "loss": 0.1016, + "step": 11325 + }, + { + "epoch": 0.09831511879237159, + "grad_norm": 0.21484375, + "learning_rate": 0.0019480456628807865, + "loss": 0.0684, + "step": 11326 + }, + { + "epoch": 0.09832379927257576, + "grad_norm": 0.78515625, + "learning_rate": 0.0019480356965851679, + "loss": 0.1123, + "step": 11327 + }, + { + "epoch": 0.09833247975277992, + "grad_norm": 0.5546875, + "learning_rate": 0.001948025729362151, + "loss": 0.1445, + "step": 11328 + }, + { + "epoch": 0.09834116023298409, + "grad_norm": 0.44921875, + "learning_rate": 0.0019480157612117453, + "loss": 0.1221, + "step": 11329 + }, + { + "epoch": 0.09834984071318825, + "grad_norm": 0.78125, + "learning_rate": 0.0019480057921339622, + "loss": 0.0962, + "step": 11330 + }, + { + "epoch": 0.09835852119339242, + "grad_norm": 1.140625, + "learning_rate": 0.001947995822128813, + "loss": 0.1172, + "step": 11331 + }, + { + "epoch": 0.09836720167359658, + "grad_norm": 0.330078125, + "learning_rate": 0.001947985851196308, + "loss": 0.1357, + "step": 11332 + }, + { + "epoch": 0.09837588215380075, + "grad_norm": 0.1494140625, + "learning_rate": 0.0019479758793364587, + "loss": 0.0962, + "step": 11333 + }, + { + "epoch": 0.09838456263400491, + "grad_norm": 0.10498046875, + "learning_rate": 0.0019479659065492756, + "loss": 0.1699, + "step": 11334 + }, + { + "epoch": 0.09839324311420908, + "grad_norm": 0.06591796875, + "learning_rate": 0.0019479559328347698, + "loss": 0.0864, + "step": 11335 + }, + { + "epoch": 0.09840192359441324, + "grad_norm": 0.1044921875, + "learning_rate": 0.0019479459581929523, + "loss": 0.1084, + "step": 11336 + }, + { + "epoch": 0.09841060407461741, + "grad_norm": 0.1943359375, + "learning_rate": 0.0019479359826238335, + "loss": 0.0996, + "step": 11337 + }, + { + "epoch": 0.09841928455482157, + "grad_norm": 0.06982421875, + "learning_rate": 0.001947926006127425, + "loss": 0.0781, + "step": 11338 + }, + { + "epoch": 0.09842796503502574, + "grad_norm": 0.345703125, + "learning_rate": 0.0019479160287037367, + "loss": 0.1543, + "step": 11339 + }, + { + "epoch": 0.0984366455152299, + "grad_norm": 0.1357421875, + "learning_rate": 0.0019479060503527809, + "loss": 0.1123, + "step": 11340 + }, + { + "epoch": 0.09844532599543407, + "grad_norm": 0.27734375, + "learning_rate": 0.0019478960710745674, + "loss": 0.0752, + "step": 11341 + }, + { + "epoch": 0.09845400647563823, + "grad_norm": 0.271484375, + "learning_rate": 0.0019478860908691074, + "loss": 0.1118, + "step": 11342 + }, + { + "epoch": 0.0984626869558424, + "grad_norm": 0.08642578125, + "learning_rate": 0.001947876109736412, + "loss": 0.0938, + "step": 11343 + }, + { + "epoch": 0.09847136743604656, + "grad_norm": 0.3671875, + "learning_rate": 0.0019478661276764923, + "loss": 0.1094, + "step": 11344 + }, + { + "epoch": 0.09848004791625073, + "grad_norm": 
0.298828125, + "learning_rate": 0.0019478561446893588, + "loss": 0.0938, + "step": 11345 + }, + { + "epoch": 0.0984887283964549, + "grad_norm": 0.216796875, + "learning_rate": 0.0019478461607750227, + "loss": 0.1367, + "step": 11346 + }, + { + "epoch": 0.09849740887665906, + "grad_norm": 0.78125, + "learning_rate": 0.0019478361759334947, + "loss": 0.1206, + "step": 11347 + }, + { + "epoch": 0.09850608935686322, + "grad_norm": 0.87890625, + "learning_rate": 0.0019478261901647856, + "loss": 0.1455, + "step": 11348 + }, + { + "epoch": 0.09851476983706739, + "grad_norm": 0.2060546875, + "learning_rate": 0.0019478162034689064, + "loss": 0.123, + "step": 11349 + }, + { + "epoch": 0.09852345031727155, + "grad_norm": 0.314453125, + "learning_rate": 0.0019478062158458686, + "loss": 0.1377, + "step": 11350 + }, + { + "epoch": 0.09853213079747572, + "grad_norm": 0.1650390625, + "learning_rate": 0.0019477962272956822, + "loss": 0.1436, + "step": 11351 + }, + { + "epoch": 0.09854081127767989, + "grad_norm": 0.59375, + "learning_rate": 0.001947786237818359, + "loss": 0.0977, + "step": 11352 + }, + { + "epoch": 0.09854949175788405, + "grad_norm": 0.2060546875, + "learning_rate": 0.001947776247413909, + "loss": 0.0898, + "step": 11353 + }, + { + "epoch": 0.09855817223808822, + "grad_norm": 0.1474609375, + "learning_rate": 0.0019477662560823442, + "loss": 0.124, + "step": 11354 + }, + { + "epoch": 0.09856685271829238, + "grad_norm": 0.6484375, + "learning_rate": 0.0019477562638236746, + "loss": 0.0986, + "step": 11355 + }, + { + "epoch": 0.09857553319849655, + "grad_norm": 0.6171875, + "learning_rate": 0.0019477462706379114, + "loss": 0.0918, + "step": 11356 + }, + { + "epoch": 0.09858421367870071, + "grad_norm": 0.224609375, + "learning_rate": 0.001947736276525066, + "loss": 0.1367, + "step": 11357 + }, + { + "epoch": 0.09859289415890488, + "grad_norm": 0.212890625, + "learning_rate": 0.0019477262814851489, + "loss": 0.1396, + "step": 11358 + }, + { + "epoch": 0.09860157463910904, + "grad_norm": 0.10302734375, + "learning_rate": 0.0019477162855181707, + "loss": 0.1221, + "step": 11359 + }, + { + "epoch": 0.0986102551193132, + "grad_norm": 0.8359375, + "learning_rate": 0.001947706288624143, + "loss": 0.1064, + "step": 11360 + }, + { + "epoch": 0.09861893559951737, + "grad_norm": 0.470703125, + "learning_rate": 0.0019476962908030763, + "loss": 0.0981, + "step": 11361 + }, + { + "epoch": 0.09862761607972154, + "grad_norm": 0.546875, + "learning_rate": 0.0019476862920549819, + "loss": 0.1377, + "step": 11362 + }, + { + "epoch": 0.0986362965599257, + "grad_norm": 0.416015625, + "learning_rate": 0.0019476762923798703, + "loss": 0.0889, + "step": 11363 + }, + { + "epoch": 0.09864497704012987, + "grad_norm": 0.91796875, + "learning_rate": 0.0019476662917777526, + "loss": 0.1094, + "step": 11364 + }, + { + "epoch": 0.09865365752033403, + "grad_norm": 0.5234375, + "learning_rate": 0.0019476562902486399, + "loss": 0.1602, + "step": 11365 + }, + { + "epoch": 0.0986623380005382, + "grad_norm": 0.0830078125, + "learning_rate": 0.001947646287792543, + "loss": 0.1172, + "step": 11366 + }, + { + "epoch": 0.09867101848074235, + "grad_norm": 2.21875, + "learning_rate": 0.0019476362844094727, + "loss": 0.2695, + "step": 11367 + }, + { + "epoch": 0.09867969896094651, + "grad_norm": 0.248046875, + "learning_rate": 0.0019476262800994404, + "loss": 0.1367, + "step": 11368 + }, + { + "epoch": 0.09868837944115068, + "grad_norm": 0.451171875, + "learning_rate": 0.0019476162748624567, + "loss": 0.1367, + "step": 11369 + }, + { + 
"epoch": 0.09869705992135484, + "grad_norm": 0.10546875, + "learning_rate": 0.0019476062686985323, + "loss": 0.1348, + "step": 11370 + }, + { + "epoch": 0.09870574040155901, + "grad_norm": 0.453125, + "learning_rate": 0.0019475962616076787, + "loss": 0.1221, + "step": 11371 + }, + { + "epoch": 0.09871442088176317, + "grad_norm": 0.302734375, + "learning_rate": 0.0019475862535899063, + "loss": 0.1406, + "step": 11372 + }, + { + "epoch": 0.09872310136196734, + "grad_norm": 0.279296875, + "learning_rate": 0.0019475762446452263, + "loss": 0.1533, + "step": 11373 + }, + { + "epoch": 0.0987317818421715, + "grad_norm": 0.169921875, + "learning_rate": 0.00194756623477365, + "loss": 0.1084, + "step": 11374 + }, + { + "epoch": 0.09874046232237567, + "grad_norm": 0.421875, + "learning_rate": 0.0019475562239751878, + "loss": 0.1523, + "step": 11375 + }, + { + "epoch": 0.09874914280257983, + "grad_norm": 0.78515625, + "learning_rate": 0.001947546212249851, + "loss": 0.1416, + "step": 11376 + }, + { + "epoch": 0.098757823282784, + "grad_norm": 0.6171875, + "learning_rate": 0.0019475361995976502, + "loss": 0.1406, + "step": 11377 + }, + { + "epoch": 0.09876650376298816, + "grad_norm": 0.291015625, + "learning_rate": 0.0019475261860185963, + "loss": 0.1138, + "step": 11378 + }, + { + "epoch": 0.09877518424319233, + "grad_norm": 0.11767578125, + "learning_rate": 0.0019475161715127012, + "loss": 0.1084, + "step": 11379 + }, + { + "epoch": 0.0987838647233965, + "grad_norm": 0.2021484375, + "learning_rate": 0.0019475061560799746, + "loss": 0.1035, + "step": 11380 + }, + { + "epoch": 0.09879254520360066, + "grad_norm": 0.59765625, + "learning_rate": 0.0019474961397204283, + "loss": 0.1309, + "step": 11381 + }, + { + "epoch": 0.09880122568380482, + "grad_norm": 0.6796875, + "learning_rate": 0.0019474861224340724, + "loss": 0.1104, + "step": 11382 + }, + { + "epoch": 0.09880990616400899, + "grad_norm": 0.61328125, + "learning_rate": 0.001947476104220919, + "loss": 0.2246, + "step": 11383 + }, + { + "epoch": 0.09881858664421315, + "grad_norm": 0.328125, + "learning_rate": 0.0019474660850809782, + "loss": 0.0972, + "step": 11384 + }, + { + "epoch": 0.09882726712441732, + "grad_norm": 0.80859375, + "learning_rate": 0.0019474560650142613, + "loss": 0.1172, + "step": 11385 + }, + { + "epoch": 0.09883594760462149, + "grad_norm": 0.203125, + "learning_rate": 0.0019474460440207792, + "loss": 0.126, + "step": 11386 + }, + { + "epoch": 0.09884462808482565, + "grad_norm": 0.0732421875, + "learning_rate": 0.0019474360221005426, + "loss": 0.1328, + "step": 11387 + }, + { + "epoch": 0.09885330856502982, + "grad_norm": 0.150390625, + "learning_rate": 0.0019474259992535632, + "loss": 0.125, + "step": 11388 + }, + { + "epoch": 0.09886198904523398, + "grad_norm": 0.107421875, + "learning_rate": 0.001947415975479851, + "loss": 0.1123, + "step": 11389 + }, + { + "epoch": 0.09887066952543815, + "grad_norm": 0.30859375, + "learning_rate": 0.0019474059507794176, + "loss": 0.2207, + "step": 11390 + }, + { + "epoch": 0.09887935000564231, + "grad_norm": 0.546875, + "learning_rate": 0.001947395925152274, + "loss": 0.1128, + "step": 11391 + }, + { + "epoch": 0.09888803048584648, + "grad_norm": 0.123046875, + "learning_rate": 0.0019473858985984307, + "loss": 0.1045, + "step": 11392 + }, + { + "epoch": 0.09889671096605064, + "grad_norm": 0.166015625, + "learning_rate": 0.0019473758711178988, + "loss": 0.1064, + "step": 11393 + }, + { + "epoch": 0.0989053914462548, + "grad_norm": 0.31640625, + "learning_rate": 0.0019473658427106896, + "loss": 
0.123, + "step": 11394 + }, + { + "epoch": 0.09891407192645897, + "grad_norm": 0.251953125, + "learning_rate": 0.0019473558133768136, + "loss": 0.1172, + "step": 11395 + }, + { + "epoch": 0.09892275240666314, + "grad_norm": 0.06591796875, + "learning_rate": 0.0019473457831162822, + "loss": 0.0996, + "step": 11396 + }, + { + "epoch": 0.0989314328868673, + "grad_norm": 0.6953125, + "learning_rate": 0.0019473357519291061, + "loss": 0.0996, + "step": 11397 + }, + { + "epoch": 0.09894011336707147, + "grad_norm": 0.1982421875, + "learning_rate": 0.0019473257198152965, + "loss": 0.1416, + "step": 11398 + }, + { + "epoch": 0.09894879384727563, + "grad_norm": 0.1982421875, + "learning_rate": 0.0019473156867748642, + "loss": 0.1152, + "step": 11399 + }, + { + "epoch": 0.0989574743274798, + "grad_norm": 0.150390625, + "learning_rate": 0.00194730565280782, + "loss": 0.123, + "step": 11400 + }, + { + "epoch": 0.09896615480768396, + "grad_norm": 0.298828125, + "learning_rate": 0.0019472956179141753, + "loss": 0.1045, + "step": 11401 + }, + { + "epoch": 0.09897483528788813, + "grad_norm": 0.515625, + "learning_rate": 0.0019472855820939405, + "loss": 0.1182, + "step": 11402 + }, + { + "epoch": 0.09898351576809229, + "grad_norm": 0.2177734375, + "learning_rate": 0.0019472755453471273, + "loss": 0.1006, + "step": 11403 + }, + { + "epoch": 0.09899219624829646, + "grad_norm": 0.16796875, + "learning_rate": 0.0019472655076737462, + "loss": 0.0894, + "step": 11404 + }, + { + "epoch": 0.09900087672850062, + "grad_norm": 0.43359375, + "learning_rate": 0.0019472554690738084, + "loss": 0.1016, + "step": 11405 + }, + { + "epoch": 0.09900955720870479, + "grad_norm": 0.158203125, + "learning_rate": 0.0019472454295473246, + "loss": 0.1406, + "step": 11406 + }, + { + "epoch": 0.09901823768890895, + "grad_norm": 0.408203125, + "learning_rate": 0.0019472353890943057, + "loss": 0.1182, + "step": 11407 + }, + { + "epoch": 0.09902691816911312, + "grad_norm": 0.1923828125, + "learning_rate": 0.0019472253477147631, + "loss": 0.1279, + "step": 11408 + }, + { + "epoch": 0.09903559864931728, + "grad_norm": 0.2412109375, + "learning_rate": 0.0019472153054087076, + "loss": 0.1338, + "step": 11409 + }, + { + "epoch": 0.09904427912952145, + "grad_norm": 0.294921875, + "learning_rate": 0.0019472052621761502, + "loss": 0.1553, + "step": 11410 + }, + { + "epoch": 0.09905295960972561, + "grad_norm": 0.25390625, + "learning_rate": 0.0019471952180171019, + "loss": 0.1553, + "step": 11411 + }, + { + "epoch": 0.09906164008992978, + "grad_norm": 0.1064453125, + "learning_rate": 0.0019471851729315737, + "loss": 0.1055, + "step": 11412 + }, + { + "epoch": 0.09907032057013394, + "grad_norm": 0.369140625, + "learning_rate": 0.0019471751269195762, + "loss": 0.0986, + "step": 11413 + }, + { + "epoch": 0.09907900105033811, + "grad_norm": 0.3203125, + "learning_rate": 0.001947165079981121, + "loss": 0.1299, + "step": 11414 + }, + { + "epoch": 0.09908768153054227, + "grad_norm": 0.400390625, + "learning_rate": 0.0019471550321162186, + "loss": 0.1118, + "step": 11415 + }, + { + "epoch": 0.09909636201074644, + "grad_norm": 0.09765625, + "learning_rate": 0.0019471449833248801, + "loss": 0.1201, + "step": 11416 + }, + { + "epoch": 0.0991050424909506, + "grad_norm": 0.26171875, + "learning_rate": 0.001947134933607117, + "loss": 0.1201, + "step": 11417 + }, + { + "epoch": 0.09911372297115477, + "grad_norm": 0.515625, + "learning_rate": 0.0019471248829629397, + "loss": 0.1406, + "step": 11418 + }, + { + "epoch": 0.09912240345135893, + "grad_norm": 
0.1044921875, + "learning_rate": 0.0019471148313923593, + "loss": 0.0933, + "step": 11419 + }, + { + "epoch": 0.0991310839315631, + "grad_norm": 0.421875, + "learning_rate": 0.0019471047788953866, + "loss": 0.1045, + "step": 11420 + }, + { + "epoch": 0.09913976441176726, + "grad_norm": 0.1982421875, + "learning_rate": 0.0019470947254720333, + "loss": 0.1123, + "step": 11421 + }, + { + "epoch": 0.09914844489197143, + "grad_norm": 0.142578125, + "learning_rate": 0.0019470846711223096, + "loss": 0.1377, + "step": 11422 + }, + { + "epoch": 0.0991571253721756, + "grad_norm": 0.1943359375, + "learning_rate": 0.0019470746158462271, + "loss": 0.1211, + "step": 11423 + }, + { + "epoch": 0.09916580585237976, + "grad_norm": 0.44921875, + "learning_rate": 0.0019470645596437965, + "loss": 0.0938, + "step": 11424 + }, + { + "epoch": 0.09917448633258392, + "grad_norm": 0.2109375, + "learning_rate": 0.0019470545025150286, + "loss": 0.0879, + "step": 11425 + }, + { + "epoch": 0.09918316681278809, + "grad_norm": 0.115234375, + "learning_rate": 0.0019470444444599346, + "loss": 0.1416, + "step": 11426 + }, + { + "epoch": 0.09919184729299226, + "grad_norm": 0.45703125, + "learning_rate": 0.0019470343854785254, + "loss": 0.0776, + "step": 11427 + }, + { + "epoch": 0.0992005277731964, + "grad_norm": 0.46875, + "learning_rate": 0.0019470243255708124, + "loss": 0.0977, + "step": 11428 + }, + { + "epoch": 0.09920920825340057, + "grad_norm": 0.154296875, + "learning_rate": 0.0019470142647368062, + "loss": 0.1211, + "step": 11429 + }, + { + "epoch": 0.09921788873360474, + "grad_norm": 0.93359375, + "learning_rate": 0.0019470042029765182, + "loss": 0.1455, + "step": 11430 + }, + { + "epoch": 0.0992265692138089, + "grad_norm": 0.76171875, + "learning_rate": 0.001946994140289959, + "loss": 0.1387, + "step": 11431 + }, + { + "epoch": 0.09923524969401307, + "grad_norm": 0.625, + "learning_rate": 0.0019469840766771394, + "loss": 0.1406, + "step": 11432 + }, + { + "epoch": 0.09924393017421723, + "grad_norm": 0.1337890625, + "learning_rate": 0.0019469740121380711, + "loss": 0.1143, + "step": 11433 + }, + { + "epoch": 0.0992526106544214, + "grad_norm": 0.2734375, + "learning_rate": 0.0019469639466727646, + "loss": 0.124, + "step": 11434 + }, + { + "epoch": 0.09926129113462556, + "grad_norm": 0.494140625, + "learning_rate": 0.0019469538802812313, + "loss": 0.1299, + "step": 11435 + }, + { + "epoch": 0.09926997161482973, + "grad_norm": 0.271484375, + "learning_rate": 0.0019469438129634817, + "loss": 0.1318, + "step": 11436 + }, + { + "epoch": 0.09927865209503389, + "grad_norm": 0.625, + "learning_rate": 0.0019469337447195272, + "loss": 0.0898, + "step": 11437 + }, + { + "epoch": 0.09928733257523806, + "grad_norm": 0.5546875, + "learning_rate": 0.0019469236755493785, + "loss": 0.1484, + "step": 11438 + }, + { + "epoch": 0.09929601305544222, + "grad_norm": 0.2099609375, + "learning_rate": 0.001946913605453047, + "loss": 0.1299, + "step": 11439 + }, + { + "epoch": 0.09930469353564639, + "grad_norm": 0.1826171875, + "learning_rate": 0.0019469035344305433, + "loss": 0.127, + "step": 11440 + }, + { + "epoch": 0.09931337401585055, + "grad_norm": 0.796875, + "learning_rate": 0.0019468934624818789, + "loss": 0.1201, + "step": 11441 + }, + { + "epoch": 0.09932205449605472, + "grad_norm": 0.412109375, + "learning_rate": 0.0019468833896070644, + "loss": 0.1318, + "step": 11442 + }, + { + "epoch": 0.09933073497625888, + "grad_norm": 0.3984375, + "learning_rate": 0.0019468733158061107, + "loss": 0.1543, + "step": 11443 + }, + { + "epoch": 
0.09933941545646305, + "grad_norm": 0.68359375, + "learning_rate": 0.0019468632410790294, + "loss": 0.0991, + "step": 11444 + }, + { + "epoch": 0.09934809593666721, + "grad_norm": 0.1142578125, + "learning_rate": 0.001946853165425831, + "loss": 0.1426, + "step": 11445 + }, + { + "epoch": 0.09935677641687138, + "grad_norm": 0.298828125, + "learning_rate": 0.0019468430888465269, + "loss": 0.1143, + "step": 11446 + }, + { + "epoch": 0.09936545689707554, + "grad_norm": 0.625, + "learning_rate": 0.0019468330113411277, + "loss": 0.1455, + "step": 11447 + }, + { + "epoch": 0.09937413737727971, + "grad_norm": 1.265625, + "learning_rate": 0.001946822932909645, + "loss": 0.1143, + "step": 11448 + }, + { + "epoch": 0.09938281785748387, + "grad_norm": 0.12353515625, + "learning_rate": 0.001946812853552089, + "loss": 0.1162, + "step": 11449 + }, + { + "epoch": 0.09939149833768804, + "grad_norm": 0.35546875, + "learning_rate": 0.0019468027732684713, + "loss": 0.1582, + "step": 11450 + }, + { + "epoch": 0.0994001788178922, + "grad_norm": 0.51171875, + "learning_rate": 0.001946792692058803, + "loss": 0.0991, + "step": 11451 + }, + { + "epoch": 0.09940885929809637, + "grad_norm": 0.11767578125, + "learning_rate": 0.0019467826099230947, + "loss": 0.125, + "step": 11452 + }, + { + "epoch": 0.09941753977830053, + "grad_norm": 0.2080078125, + "learning_rate": 0.0019467725268613582, + "loss": 0.103, + "step": 11453 + }, + { + "epoch": 0.0994262202585047, + "grad_norm": 0.19921875, + "learning_rate": 0.0019467624428736033, + "loss": 0.1328, + "step": 11454 + }, + { + "epoch": 0.09943490073870886, + "grad_norm": 0.30859375, + "learning_rate": 0.0019467523579598418, + "loss": 0.1445, + "step": 11455 + }, + { + "epoch": 0.09944358121891303, + "grad_norm": 0.259765625, + "learning_rate": 0.001946742272120085, + "loss": 0.166, + "step": 11456 + }, + { + "epoch": 0.0994522616991172, + "grad_norm": 0.75, + "learning_rate": 0.0019467321853543435, + "loss": 0.1582, + "step": 11457 + }, + { + "epoch": 0.09946094217932136, + "grad_norm": 0.197265625, + "learning_rate": 0.0019467220976626283, + "loss": 0.106, + "step": 11458 + }, + { + "epoch": 0.09946962265952553, + "grad_norm": 1.3828125, + "learning_rate": 0.0019467120090449505, + "loss": 0.1055, + "step": 11459 + }, + { + "epoch": 0.09947830313972969, + "grad_norm": 0.216796875, + "learning_rate": 0.0019467019195013213, + "loss": 0.1191, + "step": 11460 + }, + { + "epoch": 0.09948698361993386, + "grad_norm": 0.35546875, + "learning_rate": 0.0019466918290317514, + "loss": 0.1162, + "step": 11461 + }, + { + "epoch": 0.09949566410013802, + "grad_norm": 1.1796875, + "learning_rate": 0.0019466817376362522, + "loss": 0.1494, + "step": 11462 + }, + { + "epoch": 0.09950434458034219, + "grad_norm": 0.95703125, + "learning_rate": 0.0019466716453148346, + "loss": 0.1426, + "step": 11463 + }, + { + "epoch": 0.09951302506054635, + "grad_norm": 0.267578125, + "learning_rate": 0.0019466615520675097, + "loss": 0.1025, + "step": 11464 + }, + { + "epoch": 0.09952170554075052, + "grad_norm": 0.2021484375, + "learning_rate": 0.0019466514578942882, + "loss": 0.1016, + "step": 11465 + }, + { + "epoch": 0.09953038602095468, + "grad_norm": 0.08349609375, + "learning_rate": 0.0019466413627951813, + "loss": 0.0967, + "step": 11466 + }, + { + "epoch": 0.09953906650115885, + "grad_norm": 0.2177734375, + "learning_rate": 0.0019466312667702004, + "loss": 0.1245, + "step": 11467 + }, + { + "epoch": 0.09954774698136301, + "grad_norm": 0.58203125, + "learning_rate": 0.0019466211698193564, + "loss": 
0.1289, + "step": 11468 + }, + { + "epoch": 0.09955642746156718, + "grad_norm": 0.84765625, + "learning_rate": 0.0019466110719426597, + "loss": 0.1221, + "step": 11469 + }, + { + "epoch": 0.09956510794177134, + "grad_norm": 0.5078125, + "learning_rate": 0.0019466009731401222, + "loss": 0.1104, + "step": 11470 + }, + { + "epoch": 0.0995737884219755, + "grad_norm": 0.328125, + "learning_rate": 0.0019465908734117544, + "loss": 0.1611, + "step": 11471 + }, + { + "epoch": 0.09958246890217967, + "grad_norm": 0.306640625, + "learning_rate": 0.0019465807727575676, + "loss": 0.1455, + "step": 11472 + }, + { + "epoch": 0.09959114938238384, + "grad_norm": 0.71875, + "learning_rate": 0.001946570671177573, + "loss": 0.1738, + "step": 11473 + }, + { + "epoch": 0.099599829862588, + "grad_norm": 0.4921875, + "learning_rate": 0.0019465605686717813, + "loss": 0.1377, + "step": 11474 + }, + { + "epoch": 0.09960851034279217, + "grad_norm": 0.0830078125, + "learning_rate": 0.0019465504652402038, + "loss": 0.1162, + "step": 11475 + }, + { + "epoch": 0.09961719082299633, + "grad_norm": 0.171875, + "learning_rate": 0.0019465403608828512, + "loss": 0.0898, + "step": 11476 + }, + { + "epoch": 0.0996258713032005, + "grad_norm": 1.1953125, + "learning_rate": 0.0019465302555997349, + "loss": 0.127, + "step": 11477 + }, + { + "epoch": 0.09963455178340466, + "grad_norm": 0.62890625, + "learning_rate": 0.001946520149390866, + "loss": 0.0996, + "step": 11478 + }, + { + "epoch": 0.09964323226360883, + "grad_norm": 0.796875, + "learning_rate": 0.001946510042256255, + "loss": 0.126, + "step": 11479 + }, + { + "epoch": 0.09965191274381299, + "grad_norm": 0.2236328125, + "learning_rate": 0.0019464999341959135, + "loss": 0.1426, + "step": 11480 + }, + { + "epoch": 0.09966059322401716, + "grad_norm": 0.51171875, + "learning_rate": 0.0019464898252098523, + "loss": 0.1387, + "step": 11481 + }, + { + "epoch": 0.09966927370422132, + "grad_norm": 0.064453125, + "learning_rate": 0.0019464797152980828, + "loss": 0.1113, + "step": 11482 + }, + { + "epoch": 0.09967795418442549, + "grad_norm": 0.50390625, + "learning_rate": 0.0019464696044606158, + "loss": 0.1055, + "step": 11483 + }, + { + "epoch": 0.09968663466462965, + "grad_norm": 0.474609375, + "learning_rate": 0.0019464594926974622, + "loss": 0.1172, + "step": 11484 + }, + { + "epoch": 0.09969531514483382, + "grad_norm": 0.2236328125, + "learning_rate": 0.0019464493800086331, + "loss": 0.127, + "step": 11485 + }, + { + "epoch": 0.09970399562503798, + "grad_norm": 0.306640625, + "learning_rate": 0.00194643926639414, + "loss": 0.1309, + "step": 11486 + }, + { + "epoch": 0.09971267610524215, + "grad_norm": 0.45703125, + "learning_rate": 0.0019464291518539933, + "loss": 0.0762, + "step": 11487 + }, + { + "epoch": 0.09972135658544631, + "grad_norm": 0.318359375, + "learning_rate": 0.0019464190363882047, + "loss": 0.1123, + "step": 11488 + }, + { + "epoch": 0.09973003706565048, + "grad_norm": 0.55078125, + "learning_rate": 0.0019464089199967846, + "loss": 0.0981, + "step": 11489 + }, + { + "epoch": 0.09973871754585463, + "grad_norm": 0.416015625, + "learning_rate": 0.0019463988026797447, + "loss": 0.1143, + "step": 11490 + }, + { + "epoch": 0.0997473980260588, + "grad_norm": 0.734375, + "learning_rate": 0.001946388684437096, + "loss": 0.1553, + "step": 11491 + }, + { + "epoch": 0.09975607850626296, + "grad_norm": 0.267578125, + "learning_rate": 0.001946378565268849, + "loss": 0.1211, + "step": 11492 + }, + { + "epoch": 0.09976475898646713, + "grad_norm": 0.255859375, + "learning_rate": 
0.001946368445175015, + "loss": 0.1602, + "step": 11493 + }, + { + "epoch": 0.09977343946667129, + "grad_norm": 0.59375, + "learning_rate": 0.0019463583241556056, + "loss": 0.1396, + "step": 11494 + }, + { + "epoch": 0.09978211994687546, + "grad_norm": 0.5859375, + "learning_rate": 0.0019463482022106311, + "loss": 0.1289, + "step": 11495 + }, + { + "epoch": 0.09979080042707962, + "grad_norm": 0.287109375, + "learning_rate": 0.001946338079340103, + "loss": 0.106, + "step": 11496 + }, + { + "epoch": 0.09979948090728379, + "grad_norm": 0.66796875, + "learning_rate": 0.0019463279555440324, + "loss": 0.1074, + "step": 11497 + }, + { + "epoch": 0.09980816138748795, + "grad_norm": 0.2421875, + "learning_rate": 0.00194631783082243, + "loss": 0.1436, + "step": 11498 + }, + { + "epoch": 0.09981684186769212, + "grad_norm": 0.53125, + "learning_rate": 0.0019463077051753077, + "loss": 0.1631, + "step": 11499 + }, + { + "epoch": 0.09982552234789628, + "grad_norm": 0.166015625, + "learning_rate": 0.0019462975786026756, + "loss": 0.1719, + "step": 11500 + }, + { + "epoch": 0.09983420282810045, + "grad_norm": 0.11962890625, + "learning_rate": 0.001946287451104545, + "loss": 0.0859, + "step": 11501 + }, + { + "epoch": 0.09984288330830461, + "grad_norm": 0.09619140625, + "learning_rate": 0.0019462773226809277, + "loss": 0.0981, + "step": 11502 + }, + { + "epoch": 0.09985156378850878, + "grad_norm": 0.6875, + "learning_rate": 0.0019462671933318338, + "loss": 0.1123, + "step": 11503 + }, + { + "epoch": 0.09986024426871294, + "grad_norm": 0.484375, + "learning_rate": 0.0019462570630572752, + "loss": 0.0952, + "step": 11504 + }, + { + "epoch": 0.0998689247489171, + "grad_norm": 0.208984375, + "learning_rate": 0.0019462469318572621, + "loss": 0.0835, + "step": 11505 + }, + { + "epoch": 0.09987760522912127, + "grad_norm": 0.216796875, + "learning_rate": 0.0019462367997318062, + "loss": 0.1162, + "step": 11506 + }, + { + "epoch": 0.09988628570932544, + "grad_norm": 0.126953125, + "learning_rate": 0.0019462266666809187, + "loss": 0.1289, + "step": 11507 + }, + { + "epoch": 0.0998949661895296, + "grad_norm": 0.76171875, + "learning_rate": 0.0019462165327046102, + "loss": 0.1113, + "step": 11508 + }, + { + "epoch": 0.09990364666973377, + "grad_norm": 0.0966796875, + "learning_rate": 0.0019462063978028922, + "loss": 0.1064, + "step": 11509 + }, + { + "epoch": 0.09991232714993793, + "grad_norm": 0.36328125, + "learning_rate": 0.0019461962619757756, + "loss": 0.1191, + "step": 11510 + }, + { + "epoch": 0.0999210076301421, + "grad_norm": 0.18359375, + "learning_rate": 0.0019461861252232713, + "loss": 0.248, + "step": 11511 + }, + { + "epoch": 0.09992968811034626, + "grad_norm": 0.1875, + "learning_rate": 0.0019461759875453906, + "loss": 0.0977, + "step": 11512 + }, + { + "epoch": 0.09993836859055043, + "grad_norm": 0.10400390625, + "learning_rate": 0.0019461658489421446, + "loss": 0.084, + "step": 11513 + }, + { + "epoch": 0.09994704907075459, + "grad_norm": 0.35546875, + "learning_rate": 0.0019461557094135444, + "loss": 0.166, + "step": 11514 + }, + { + "epoch": 0.09995572955095876, + "grad_norm": 0.58203125, + "learning_rate": 0.001946145568959601, + "loss": 0.1338, + "step": 11515 + }, + { + "epoch": 0.09996441003116292, + "grad_norm": 0.1435546875, + "learning_rate": 0.0019461354275803254, + "loss": 0.1064, + "step": 11516 + }, + { + "epoch": 0.09997309051136709, + "grad_norm": 0.859375, + "learning_rate": 0.001946125285275729, + "loss": 0.1455, + "step": 11517 + }, + { + "epoch": 0.09998177099157125, + "grad_norm": 
1.078125, + "learning_rate": 0.0019461151420458228, + "loss": 0.1006, + "step": 11518 + }, + { + "epoch": 0.09999045147177542, + "grad_norm": 0.14453125, + "learning_rate": 0.001946104997890617, + "loss": 0.1089, + "step": 11519 + }, + { + "epoch": 0.09999913195197958, + "grad_norm": 0.2001953125, + "learning_rate": 0.0019460948528101244, + "loss": 0.1187, + "step": 11520 + }, + { + "epoch": 0.10000781243218375, + "grad_norm": 0.1943359375, + "learning_rate": 0.0019460847068043547, + "loss": 0.1309, + "step": 11521 + }, + { + "epoch": 0.10001649291238791, + "grad_norm": 0.482421875, + "learning_rate": 0.00194607455987332, + "loss": 0.1328, + "step": 11522 + }, + { + "epoch": 0.10002517339259208, + "grad_norm": 0.18359375, + "learning_rate": 0.0019460644120170305, + "loss": 0.1475, + "step": 11523 + }, + { + "epoch": 0.10003385387279624, + "grad_norm": 0.400390625, + "learning_rate": 0.0019460542632354976, + "loss": 0.0957, + "step": 11524 + }, + { + "epoch": 0.10004253435300041, + "grad_norm": 0.08837890625, + "learning_rate": 0.0019460441135287325, + "loss": 0.1328, + "step": 11525 + }, + { + "epoch": 0.10005121483320457, + "grad_norm": 0.58203125, + "learning_rate": 0.0019460339628967463, + "loss": 0.1064, + "step": 11526 + }, + { + "epoch": 0.10005989531340874, + "grad_norm": 0.10693359375, + "learning_rate": 0.00194602381133955, + "loss": 0.1299, + "step": 11527 + }, + { + "epoch": 0.1000685757936129, + "grad_norm": 0.7734375, + "learning_rate": 0.001946013658857155, + "loss": 0.0967, + "step": 11528 + }, + { + "epoch": 0.10007725627381707, + "grad_norm": 0.296875, + "learning_rate": 0.0019460035054495721, + "loss": 0.084, + "step": 11529 + }, + { + "epoch": 0.10008593675402123, + "grad_norm": 0.6015625, + "learning_rate": 0.0019459933511168125, + "loss": 0.124, + "step": 11530 + }, + { + "epoch": 0.1000946172342254, + "grad_norm": 1.1171875, + "learning_rate": 0.001945983195858887, + "loss": 0.2207, + "step": 11531 + }, + { + "epoch": 0.10010329771442956, + "grad_norm": 0.83984375, + "learning_rate": 0.0019459730396758073, + "loss": 0.1191, + "step": 11532 + }, + { + "epoch": 0.10011197819463373, + "grad_norm": 0.39453125, + "learning_rate": 0.001945962882567584, + "loss": 0.0986, + "step": 11533 + }, + { + "epoch": 0.1001206586748379, + "grad_norm": 0.09326171875, + "learning_rate": 0.0019459527245342288, + "loss": 0.1064, + "step": 11534 + }, + { + "epoch": 0.10012933915504206, + "grad_norm": 0.1337890625, + "learning_rate": 0.0019459425655757518, + "loss": 0.1562, + "step": 11535 + }, + { + "epoch": 0.10013801963524623, + "grad_norm": 0.7578125, + "learning_rate": 0.001945932405692165, + "loss": 0.1367, + "step": 11536 + }, + { + "epoch": 0.10014670011545039, + "grad_norm": 0.14453125, + "learning_rate": 0.0019459222448834795, + "loss": 0.1074, + "step": 11537 + }, + { + "epoch": 0.10015538059565456, + "grad_norm": 0.40625, + "learning_rate": 0.0019459120831497056, + "loss": 0.1172, + "step": 11538 + }, + { + "epoch": 0.10016406107585872, + "grad_norm": 0.408203125, + "learning_rate": 0.0019459019204908553, + "loss": 0.1113, + "step": 11539 + }, + { + "epoch": 0.10017274155606289, + "grad_norm": 5.46875, + "learning_rate": 0.0019458917569069394, + "loss": 0.2012, + "step": 11540 + }, + { + "epoch": 0.10018142203626705, + "grad_norm": 0.1005859375, + "learning_rate": 0.001945881592397969, + "loss": 0.1787, + "step": 11541 + }, + { + "epoch": 0.10019010251647122, + "grad_norm": 0.703125, + "learning_rate": 0.001945871426963955, + "loss": 0.0967, + "step": 11542 + }, + { + "epoch": 
0.10019878299667538, + "grad_norm": 0.287109375, + "learning_rate": 0.0019458612606049086, + "loss": 0.1406, + "step": 11543 + }, + { + "epoch": 0.10020746347687955, + "grad_norm": 0.703125, + "learning_rate": 0.0019458510933208415, + "loss": 0.1348, + "step": 11544 + }, + { + "epoch": 0.10021614395708371, + "grad_norm": 0.12060546875, + "learning_rate": 0.0019458409251117642, + "loss": 0.123, + "step": 11545 + }, + { + "epoch": 0.10022482443728788, + "grad_norm": 0.345703125, + "learning_rate": 0.0019458307559776877, + "loss": 0.1143, + "step": 11546 + }, + { + "epoch": 0.10023350491749204, + "grad_norm": 0.2490234375, + "learning_rate": 0.0019458205859186235, + "loss": 0.0947, + "step": 11547 + }, + { + "epoch": 0.1002421853976962, + "grad_norm": 0.474609375, + "learning_rate": 0.0019458104149345829, + "loss": 0.1602, + "step": 11548 + }, + { + "epoch": 0.10025086587790037, + "grad_norm": 0.54296875, + "learning_rate": 0.0019458002430255764, + "loss": 0.165, + "step": 11549 + }, + { + "epoch": 0.10025954635810454, + "grad_norm": 0.1259765625, + "learning_rate": 0.0019457900701916155, + "loss": 0.1719, + "step": 11550 + }, + { + "epoch": 0.10026822683830869, + "grad_norm": 0.1083984375, + "learning_rate": 0.0019457798964327114, + "loss": 0.1396, + "step": 11551 + }, + { + "epoch": 0.10027690731851285, + "grad_norm": 0.66796875, + "learning_rate": 0.0019457697217488753, + "loss": 0.0918, + "step": 11552 + }, + { + "epoch": 0.10028558779871702, + "grad_norm": 0.1083984375, + "learning_rate": 0.0019457595461401178, + "loss": 0.1094, + "step": 11553 + }, + { + "epoch": 0.10029426827892118, + "grad_norm": 1.15625, + "learning_rate": 0.0019457493696064506, + "loss": 0.1187, + "step": 11554 + }, + { + "epoch": 0.10030294875912535, + "grad_norm": 0.38671875, + "learning_rate": 0.0019457391921478841, + "loss": 0.0918, + "step": 11555 + }, + { + "epoch": 0.10031162923932951, + "grad_norm": 0.3046875, + "learning_rate": 0.0019457290137644306, + "loss": 0.1455, + "step": 11556 + }, + { + "epoch": 0.10032030971953368, + "grad_norm": 0.388671875, + "learning_rate": 0.0019457188344561003, + "loss": 0.1143, + "step": 11557 + }, + { + "epoch": 0.10032899019973784, + "grad_norm": 0.7734375, + "learning_rate": 0.0019457086542229048, + "loss": 0.1245, + "step": 11558 + }, + { + "epoch": 0.10033767067994201, + "grad_norm": 0.44140625, + "learning_rate": 0.0019456984730648547, + "loss": 0.1328, + "step": 11559 + }, + { + "epoch": 0.10034635116014617, + "grad_norm": 0.24609375, + "learning_rate": 0.0019456882909819613, + "loss": 0.1133, + "step": 11560 + }, + { + "epoch": 0.10035503164035034, + "grad_norm": 0.37890625, + "learning_rate": 0.0019456781079742363, + "loss": 0.0898, + "step": 11561 + }, + { + "epoch": 0.1003637121205545, + "grad_norm": 0.5234375, + "learning_rate": 0.00194566792404169, + "loss": 0.1289, + "step": 11562 + }, + { + "epoch": 0.10037239260075867, + "grad_norm": 0.72265625, + "learning_rate": 0.0019456577391843343, + "loss": 0.1348, + "step": 11563 + }, + { + "epoch": 0.10038107308096283, + "grad_norm": 0.1845703125, + "learning_rate": 0.0019456475534021802, + "loss": 0.1719, + "step": 11564 + }, + { + "epoch": 0.100389753561167, + "grad_norm": 0.66015625, + "learning_rate": 0.0019456373666952383, + "loss": 0.0859, + "step": 11565 + }, + { + "epoch": 0.10039843404137117, + "grad_norm": 0.10595703125, + "learning_rate": 0.00194562717906352, + "loss": 0.125, + "step": 11566 + }, + { + "epoch": 0.10040711452157533, + "grad_norm": 0.9296875, + "learning_rate": 0.0019456169905070367, + 
"loss": 0.1143, + "step": 11567 + }, + { + "epoch": 0.1004157950017795, + "grad_norm": 0.1328125, + "learning_rate": 0.0019456068010257994, + "loss": 0.1279, + "step": 11568 + }, + { + "epoch": 0.10042447548198366, + "grad_norm": 0.451171875, + "learning_rate": 0.0019455966106198192, + "loss": 0.0957, + "step": 11569 + }, + { + "epoch": 0.10043315596218783, + "grad_norm": 0.2294921875, + "learning_rate": 0.0019455864192891071, + "loss": 0.125, + "step": 11570 + }, + { + "epoch": 0.10044183644239199, + "grad_norm": 0.703125, + "learning_rate": 0.0019455762270336747, + "loss": 0.1465, + "step": 11571 + }, + { + "epoch": 0.10045051692259616, + "grad_norm": 0.984375, + "learning_rate": 0.0019455660338535325, + "loss": 0.1094, + "step": 11572 + }, + { + "epoch": 0.10045919740280032, + "grad_norm": 0.10791015625, + "learning_rate": 0.001945555839748692, + "loss": 0.1006, + "step": 11573 + }, + { + "epoch": 0.10046787788300449, + "grad_norm": 0.65625, + "learning_rate": 0.0019455456447191647, + "loss": 0.1572, + "step": 11574 + }, + { + "epoch": 0.10047655836320865, + "grad_norm": 0.10595703125, + "learning_rate": 0.001945535448764961, + "loss": 0.0986, + "step": 11575 + }, + { + "epoch": 0.10048523884341282, + "grad_norm": 0.080078125, + "learning_rate": 0.0019455252518860927, + "loss": 0.084, + "step": 11576 + }, + { + "epoch": 0.10049391932361698, + "grad_norm": 0.09130859375, + "learning_rate": 0.0019455150540825703, + "loss": 0.1611, + "step": 11577 + }, + { + "epoch": 0.10050259980382115, + "grad_norm": 0.28125, + "learning_rate": 0.0019455048553544058, + "loss": 0.1201, + "step": 11578 + }, + { + "epoch": 0.10051128028402531, + "grad_norm": 0.1806640625, + "learning_rate": 0.0019454946557016098, + "loss": 0.1045, + "step": 11579 + }, + { + "epoch": 0.10051996076422948, + "grad_norm": 0.2373046875, + "learning_rate": 0.0019454844551241933, + "loss": 0.0942, + "step": 11580 + }, + { + "epoch": 0.10052864124443364, + "grad_norm": 0.201171875, + "learning_rate": 0.001945474253622168, + "loss": 0.1465, + "step": 11581 + }, + { + "epoch": 0.10053732172463781, + "grad_norm": 0.0859375, + "learning_rate": 0.0019454640511955446, + "loss": 0.1123, + "step": 11582 + }, + { + "epoch": 0.10054600220484197, + "grad_norm": 0.7421875, + "learning_rate": 0.0019454538478443344, + "loss": 0.1182, + "step": 11583 + }, + { + "epoch": 0.10055468268504614, + "grad_norm": 0.390625, + "learning_rate": 0.0019454436435685488, + "loss": 0.1709, + "step": 11584 + }, + { + "epoch": 0.1005633631652503, + "grad_norm": 0.1728515625, + "learning_rate": 0.0019454334383681984, + "loss": 0.124, + "step": 11585 + }, + { + "epoch": 0.10057204364545447, + "grad_norm": 0.46875, + "learning_rate": 0.0019454232322432949, + "loss": 0.0986, + "step": 11586 + }, + { + "epoch": 0.10058072412565863, + "grad_norm": 0.216796875, + "learning_rate": 0.0019454130251938492, + "loss": 0.1147, + "step": 11587 + }, + { + "epoch": 0.1005894046058628, + "grad_norm": 0.26953125, + "learning_rate": 0.0019454028172198727, + "loss": 0.1104, + "step": 11588 + }, + { + "epoch": 0.10059808508606696, + "grad_norm": 0.82421875, + "learning_rate": 0.001945392608321376, + "loss": 0.2051, + "step": 11589 + }, + { + "epoch": 0.10060676556627113, + "grad_norm": 0.2119140625, + "learning_rate": 0.0019453823984983706, + "loss": 0.1289, + "step": 11590 + }, + { + "epoch": 0.1006154460464753, + "grad_norm": 0.275390625, + "learning_rate": 0.0019453721877508684, + "loss": 0.1143, + "step": 11591 + }, + { + "epoch": 0.10062412652667946, + "grad_norm": 0.1484375, + 
"learning_rate": 0.0019453619760788792, + "loss": 0.1143, + "step": 11592 + }, + { + "epoch": 0.10063280700688362, + "grad_norm": 0.0927734375, + "learning_rate": 0.001945351763482415, + "loss": 0.1162, + "step": 11593 + }, + { + "epoch": 0.10064148748708779, + "grad_norm": 0.796875, + "learning_rate": 0.0019453415499614871, + "loss": 0.1118, + "step": 11594 + }, + { + "epoch": 0.10065016796729195, + "grad_norm": 0.376953125, + "learning_rate": 0.0019453313355161062, + "loss": 0.1211, + "step": 11595 + }, + { + "epoch": 0.10065884844749612, + "grad_norm": 0.314453125, + "learning_rate": 0.0019453211201462836, + "loss": 0.1318, + "step": 11596 + }, + { + "epoch": 0.10066752892770028, + "grad_norm": 0.36328125, + "learning_rate": 0.0019453109038520306, + "loss": 0.1118, + "step": 11597 + }, + { + "epoch": 0.10067620940790445, + "grad_norm": 0.1640625, + "learning_rate": 0.0019453006866333583, + "loss": 0.0908, + "step": 11598 + }, + { + "epoch": 0.10068488988810861, + "grad_norm": 0.54296875, + "learning_rate": 0.0019452904684902775, + "loss": 0.1455, + "step": 11599 + }, + { + "epoch": 0.10069357036831278, + "grad_norm": 0.10205078125, + "learning_rate": 0.0019452802494228, + "loss": 0.1172, + "step": 11600 + }, + { + "epoch": 0.10070225084851694, + "grad_norm": 0.6875, + "learning_rate": 0.0019452700294309369, + "loss": 0.1621, + "step": 11601 + }, + { + "epoch": 0.10071093132872111, + "grad_norm": 0.0810546875, + "learning_rate": 0.0019452598085146992, + "loss": 0.1216, + "step": 11602 + }, + { + "epoch": 0.10071961180892527, + "grad_norm": 0.36328125, + "learning_rate": 0.001945249586674098, + "loss": 0.1123, + "step": 11603 + }, + { + "epoch": 0.10072829228912944, + "grad_norm": 0.279296875, + "learning_rate": 0.0019452393639091444, + "loss": 0.1221, + "step": 11604 + }, + { + "epoch": 0.1007369727693336, + "grad_norm": 0.271484375, + "learning_rate": 0.0019452291402198498, + "loss": 0.1221, + "step": 11605 + }, + { + "epoch": 0.10074565324953777, + "grad_norm": 0.08203125, + "learning_rate": 0.0019452189156062252, + "loss": 0.1191, + "step": 11606 + }, + { + "epoch": 0.10075433372974194, + "grad_norm": 0.1787109375, + "learning_rate": 0.0019452086900682822, + "loss": 0.1094, + "step": 11607 + }, + { + "epoch": 0.1007630142099461, + "grad_norm": 0.384765625, + "learning_rate": 0.0019451984636060316, + "loss": 0.082, + "step": 11608 + }, + { + "epoch": 0.10077169469015027, + "grad_norm": 0.73828125, + "learning_rate": 0.0019451882362194845, + "loss": 0.2031, + "step": 11609 + }, + { + "epoch": 0.10078037517035443, + "grad_norm": 0.10791015625, + "learning_rate": 0.0019451780079086523, + "loss": 0.1289, + "step": 11610 + }, + { + "epoch": 0.1007890556505586, + "grad_norm": 0.408203125, + "learning_rate": 0.0019451677786735462, + "loss": 0.125, + "step": 11611 + }, + { + "epoch": 0.10079773613076276, + "grad_norm": 0.734375, + "learning_rate": 0.0019451575485141773, + "loss": 0.1338, + "step": 11612 + }, + { + "epoch": 0.10080641661096691, + "grad_norm": 0.51171875, + "learning_rate": 0.0019451473174305568, + "loss": 0.0869, + "step": 11613 + }, + { + "epoch": 0.10081509709117108, + "grad_norm": 0.271484375, + "learning_rate": 0.0019451370854226958, + "loss": 0.1035, + "step": 11614 + }, + { + "epoch": 0.10082377757137524, + "grad_norm": 0.1337890625, + "learning_rate": 0.0019451268524906056, + "loss": 0.1143, + "step": 11615 + }, + { + "epoch": 0.10083245805157941, + "grad_norm": 0.609375, + "learning_rate": 0.0019451166186342974, + "loss": 0.1445, + "step": 11616 + }, + { + "epoch": 
0.10084113853178357, + "grad_norm": 0.1123046875, + "learning_rate": 0.0019451063838537826, + "loss": 0.1221, + "step": 11617 + }, + { + "epoch": 0.10084981901198774, + "grad_norm": 0.56640625, + "learning_rate": 0.0019450961481490717, + "loss": 0.1143, + "step": 11618 + }, + { + "epoch": 0.1008584994921919, + "grad_norm": 0.1044921875, + "learning_rate": 0.0019450859115201764, + "loss": 0.1484, + "step": 11619 + }, + { + "epoch": 0.10086717997239607, + "grad_norm": 0.146484375, + "learning_rate": 0.0019450756739671084, + "loss": 0.0913, + "step": 11620 + }, + { + "epoch": 0.10087586045260023, + "grad_norm": 0.4765625, + "learning_rate": 0.0019450654354898778, + "loss": 0.0835, + "step": 11621 + }, + { + "epoch": 0.1008845409328044, + "grad_norm": 0.1279296875, + "learning_rate": 0.0019450551960884964, + "loss": 0.1309, + "step": 11622 + }, + { + "epoch": 0.10089322141300856, + "grad_norm": 0.08056640625, + "learning_rate": 0.0019450449557629754, + "loss": 0.1436, + "step": 11623 + }, + { + "epoch": 0.10090190189321273, + "grad_norm": 0.35546875, + "learning_rate": 0.0019450347145133263, + "loss": 0.0825, + "step": 11624 + }, + { + "epoch": 0.1009105823734169, + "grad_norm": 0.11572265625, + "learning_rate": 0.0019450244723395595, + "loss": 0.1367, + "step": 11625 + }, + { + "epoch": 0.10091926285362106, + "grad_norm": 0.4453125, + "learning_rate": 0.0019450142292416867, + "loss": 0.0957, + "step": 11626 + }, + { + "epoch": 0.10092794333382522, + "grad_norm": 0.3671875, + "learning_rate": 0.0019450039852197188, + "loss": 0.1309, + "step": 11627 + }, + { + "epoch": 0.10093662381402939, + "grad_norm": 0.158203125, + "learning_rate": 0.0019449937402736678, + "loss": 0.1377, + "step": 11628 + }, + { + "epoch": 0.10094530429423355, + "grad_norm": 0.27734375, + "learning_rate": 0.0019449834944035439, + "loss": 0.123, + "step": 11629 + }, + { + "epoch": 0.10095398477443772, + "grad_norm": 0.283203125, + "learning_rate": 0.001944973247609359, + "loss": 0.1445, + "step": 11630 + }, + { + "epoch": 0.10096266525464188, + "grad_norm": 0.6171875, + "learning_rate": 0.0019449629998911238, + "loss": 0.2344, + "step": 11631 + }, + { + "epoch": 0.10097134573484605, + "grad_norm": 0.07958984375, + "learning_rate": 0.0019449527512488498, + "loss": 0.1113, + "step": 11632 + }, + { + "epoch": 0.10098002621505021, + "grad_norm": 0.31640625, + "learning_rate": 0.0019449425016825487, + "loss": 0.1631, + "step": 11633 + }, + { + "epoch": 0.10098870669525438, + "grad_norm": 0.2041015625, + "learning_rate": 0.0019449322511922306, + "loss": 0.1348, + "step": 11634 + }, + { + "epoch": 0.10099738717545854, + "grad_norm": 0.26953125, + "learning_rate": 0.0019449219997779072, + "loss": 0.1245, + "step": 11635 + }, + { + "epoch": 0.10100606765566271, + "grad_norm": 0.5, + "learning_rate": 0.0019449117474395904, + "loss": 0.0952, + "step": 11636 + }, + { + "epoch": 0.10101474813586687, + "grad_norm": 0.1259765625, + "learning_rate": 0.0019449014941772902, + "loss": 0.1094, + "step": 11637 + }, + { + "epoch": 0.10102342861607104, + "grad_norm": 0.7578125, + "learning_rate": 0.0019448912399910186, + "loss": 0.0981, + "step": 11638 + }, + { + "epoch": 0.1010321090962752, + "grad_norm": 0.5078125, + "learning_rate": 0.0019448809848807867, + "loss": 0.123, + "step": 11639 + }, + { + "epoch": 0.10104078957647937, + "grad_norm": 0.294921875, + "learning_rate": 0.0019448707288466057, + "loss": 0.1074, + "step": 11640 + }, + { + "epoch": 0.10104947005668354, + "grad_norm": 0.65234375, + "learning_rate": 0.0019448604718884867, + 
"loss": 0.1328, + "step": 11641 + }, + { + "epoch": 0.1010581505368877, + "grad_norm": 0.34375, + "learning_rate": 0.001944850214006441, + "loss": 0.1533, + "step": 11642 + }, + { + "epoch": 0.10106683101709187, + "grad_norm": 0.42578125, + "learning_rate": 0.0019448399552004798, + "loss": 0.1152, + "step": 11643 + }, + { + "epoch": 0.10107551149729603, + "grad_norm": 0.251953125, + "learning_rate": 0.0019448296954706142, + "loss": 0.1348, + "step": 11644 + }, + { + "epoch": 0.1010841919775002, + "grad_norm": 0.29296875, + "learning_rate": 0.0019448194348168557, + "loss": 0.0933, + "step": 11645 + }, + { + "epoch": 0.10109287245770436, + "grad_norm": 0.4765625, + "learning_rate": 0.0019448091732392153, + "loss": 0.123, + "step": 11646 + }, + { + "epoch": 0.10110155293790853, + "grad_norm": 0.25390625, + "learning_rate": 0.0019447989107377044, + "loss": 0.1069, + "step": 11647 + }, + { + "epoch": 0.10111023341811269, + "grad_norm": 0.1337890625, + "learning_rate": 0.001944788647312334, + "loss": 0.127, + "step": 11648 + }, + { + "epoch": 0.10111891389831686, + "grad_norm": 0.09912109375, + "learning_rate": 0.001944778382963115, + "loss": 0.0942, + "step": 11649 + }, + { + "epoch": 0.10112759437852102, + "grad_norm": 0.70703125, + "learning_rate": 0.0019447681176900599, + "loss": 0.1201, + "step": 11650 + }, + { + "epoch": 0.10113627485872519, + "grad_norm": 0.3828125, + "learning_rate": 0.0019447578514931786, + "loss": 0.1133, + "step": 11651 + }, + { + "epoch": 0.10114495533892935, + "grad_norm": 0.50390625, + "learning_rate": 0.0019447475843724826, + "loss": 0.1475, + "step": 11652 + }, + { + "epoch": 0.10115363581913352, + "grad_norm": 0.15234375, + "learning_rate": 0.001944737316327984, + "loss": 0.1113, + "step": 11653 + }, + { + "epoch": 0.10116231629933768, + "grad_norm": 0.55859375, + "learning_rate": 0.0019447270473596928, + "loss": 0.1484, + "step": 11654 + }, + { + "epoch": 0.10117099677954185, + "grad_norm": 0.12109375, + "learning_rate": 0.001944716777467621, + "loss": 0.1133, + "step": 11655 + }, + { + "epoch": 0.10117967725974601, + "grad_norm": 0.080078125, + "learning_rate": 0.0019447065066517796, + "loss": 0.1152, + "step": 11656 + }, + { + "epoch": 0.10118835773995018, + "grad_norm": 0.13671875, + "learning_rate": 0.0019446962349121797, + "loss": 0.1289, + "step": 11657 + }, + { + "epoch": 0.10119703822015434, + "grad_norm": 0.111328125, + "learning_rate": 0.001944685962248833, + "loss": 0.1167, + "step": 11658 + }, + { + "epoch": 0.10120571870035851, + "grad_norm": 0.123046875, + "learning_rate": 0.0019446756886617501, + "loss": 0.1245, + "step": 11659 + }, + { + "epoch": 0.10121439918056267, + "grad_norm": 0.48046875, + "learning_rate": 0.0019446654141509429, + "loss": 0.1084, + "step": 11660 + }, + { + "epoch": 0.10122307966076684, + "grad_norm": 0.140625, + "learning_rate": 0.001944655138716422, + "loss": 0.1055, + "step": 11661 + }, + { + "epoch": 0.101231760140971, + "grad_norm": 0.23046875, + "learning_rate": 0.0019446448623581987, + "loss": 0.1436, + "step": 11662 + }, + { + "epoch": 0.10124044062117517, + "grad_norm": 0.65234375, + "learning_rate": 0.0019446345850762847, + "loss": 0.1484, + "step": 11663 + }, + { + "epoch": 0.10124912110137933, + "grad_norm": 0.30078125, + "learning_rate": 0.0019446243068706914, + "loss": 0.1113, + "step": 11664 + }, + { + "epoch": 0.1012578015815835, + "grad_norm": 0.1611328125, + "learning_rate": 0.0019446140277414293, + "loss": 0.1045, + "step": 11665 + }, + { + "epoch": 0.10126648206178766, + "grad_norm": 0.359375, + 
"learning_rate": 0.0019446037476885099, + "loss": 0.1816, + "step": 11666 + }, + { + "epoch": 0.10127516254199183, + "grad_norm": 0.11181640625, + "learning_rate": 0.001944593466711945, + "loss": 0.1201, + "step": 11667 + }, + { + "epoch": 0.101283843022196, + "grad_norm": 0.8046875, + "learning_rate": 0.0019445831848117448, + "loss": 0.1445, + "step": 11668 + }, + { + "epoch": 0.10129252350240016, + "grad_norm": 0.212890625, + "learning_rate": 0.0019445729019879215, + "loss": 0.1104, + "step": 11669 + }, + { + "epoch": 0.10130120398260432, + "grad_norm": 0.166015625, + "learning_rate": 0.0019445626182404859, + "loss": 0.2012, + "step": 11670 + }, + { + "epoch": 0.10130988446280849, + "grad_norm": 0.40625, + "learning_rate": 0.001944552333569449, + "loss": 0.1045, + "step": 11671 + }, + { + "epoch": 0.10131856494301265, + "grad_norm": 0.41796875, + "learning_rate": 0.0019445420479748225, + "loss": 0.1387, + "step": 11672 + }, + { + "epoch": 0.10132724542321682, + "grad_norm": 0.07373046875, + "learning_rate": 0.0019445317614566178, + "loss": 0.1084, + "step": 11673 + }, + { + "epoch": 0.10133592590342098, + "grad_norm": 0.083984375, + "learning_rate": 0.0019445214740148457, + "loss": 0.1123, + "step": 11674 + }, + { + "epoch": 0.10134460638362514, + "grad_norm": 0.2080078125, + "learning_rate": 0.0019445111856495176, + "loss": 0.0942, + "step": 11675 + }, + { + "epoch": 0.1013532868638293, + "grad_norm": 0.0849609375, + "learning_rate": 0.0019445008963606448, + "loss": 0.082, + "step": 11676 + }, + { + "epoch": 0.10136196734403347, + "grad_norm": 0.412109375, + "learning_rate": 0.0019444906061482383, + "loss": 0.1006, + "step": 11677 + }, + { + "epoch": 0.10137064782423763, + "grad_norm": 0.1025390625, + "learning_rate": 0.0019444803150123096, + "loss": 0.1387, + "step": 11678 + }, + { + "epoch": 0.1013793283044418, + "grad_norm": 0.083984375, + "learning_rate": 0.00194447002295287, + "loss": 0.123, + "step": 11679 + }, + { + "epoch": 0.10138800878464596, + "grad_norm": 0.248046875, + "learning_rate": 0.0019444597299699308, + "loss": 0.0928, + "step": 11680 + }, + { + "epoch": 0.10139668926485013, + "grad_norm": 0.6015625, + "learning_rate": 0.001944449436063503, + "loss": 0.127, + "step": 11681 + }, + { + "epoch": 0.10140536974505429, + "grad_norm": 0.087890625, + "learning_rate": 0.0019444391412335978, + "loss": 0.1465, + "step": 11682 + }, + { + "epoch": 0.10141405022525846, + "grad_norm": 0.06298828125, + "learning_rate": 0.0019444288454802268, + "loss": 0.0903, + "step": 11683 + }, + { + "epoch": 0.10142273070546262, + "grad_norm": 0.224609375, + "learning_rate": 0.0019444185488034012, + "loss": 0.1406, + "step": 11684 + }, + { + "epoch": 0.10143141118566679, + "grad_norm": 0.103515625, + "learning_rate": 0.0019444082512031322, + "loss": 0.1084, + "step": 11685 + }, + { + "epoch": 0.10144009166587095, + "grad_norm": 0.46875, + "learning_rate": 0.0019443979526794308, + "loss": 0.1582, + "step": 11686 + }, + { + "epoch": 0.10144877214607512, + "grad_norm": 0.12109375, + "learning_rate": 0.0019443876532323086, + "loss": 0.123, + "step": 11687 + }, + { + "epoch": 0.10145745262627928, + "grad_norm": 0.51953125, + "learning_rate": 0.0019443773528617766, + "loss": 0.1846, + "step": 11688 + }, + { + "epoch": 0.10146613310648345, + "grad_norm": 0.12451171875, + "learning_rate": 0.0019443670515678464, + "loss": 0.1113, + "step": 11689 + }, + { + "epoch": 0.10147481358668761, + "grad_norm": 0.1669921875, + "learning_rate": 0.0019443567493505293, + "loss": 0.1201, + "step": 11690 + }, + { + 
"epoch": 0.10148349406689178, + "grad_norm": 0.171875, + "learning_rate": 0.001944346446209836, + "loss": 0.1006, + "step": 11691 + }, + { + "epoch": 0.10149217454709594, + "grad_norm": 0.09716796875, + "learning_rate": 0.001944336142145778, + "loss": 0.123, + "step": 11692 + }, + { + "epoch": 0.10150085502730011, + "grad_norm": 0.7890625, + "learning_rate": 0.001944325837158367, + "loss": 0.126, + "step": 11693 + }, + { + "epoch": 0.10150953550750427, + "grad_norm": 0.296875, + "learning_rate": 0.0019443155312476139, + "loss": 0.1201, + "step": 11694 + }, + { + "epoch": 0.10151821598770844, + "grad_norm": 0.474609375, + "learning_rate": 0.0019443052244135298, + "loss": 0.1001, + "step": 11695 + }, + { + "epoch": 0.1015268964679126, + "grad_norm": 0.7109375, + "learning_rate": 0.0019442949166561265, + "loss": 0.1196, + "step": 11696 + }, + { + "epoch": 0.10153557694811677, + "grad_norm": 0.66796875, + "learning_rate": 0.001944284607975415, + "loss": 0.1143, + "step": 11697 + }, + { + "epoch": 0.10154425742832093, + "grad_norm": 0.2490234375, + "learning_rate": 0.0019442742983714063, + "loss": 0.1494, + "step": 11698 + }, + { + "epoch": 0.1015529379085251, + "grad_norm": 0.57421875, + "learning_rate": 0.001944263987844112, + "loss": 0.1191, + "step": 11699 + }, + { + "epoch": 0.10156161838872926, + "grad_norm": 0.26953125, + "learning_rate": 0.001944253676393543, + "loss": 0.123, + "step": 11700 + }, + { + "epoch": 0.10157029886893343, + "grad_norm": 0.1591796875, + "learning_rate": 0.0019442433640197113, + "loss": 0.1113, + "step": 11701 + }, + { + "epoch": 0.1015789793491376, + "grad_norm": 0.1611328125, + "learning_rate": 0.0019442330507226273, + "loss": 0.124, + "step": 11702 + }, + { + "epoch": 0.10158765982934176, + "grad_norm": 0.140625, + "learning_rate": 0.0019442227365023032, + "loss": 0.1377, + "step": 11703 + }, + { + "epoch": 0.10159634030954592, + "grad_norm": 0.1650390625, + "learning_rate": 0.0019442124213587494, + "loss": 0.0889, + "step": 11704 + }, + { + "epoch": 0.10160502078975009, + "grad_norm": 0.455078125, + "learning_rate": 0.0019442021052919777, + "loss": 0.124, + "step": 11705 + }, + { + "epoch": 0.10161370126995425, + "grad_norm": 0.494140625, + "learning_rate": 0.0019441917883019995, + "loss": 0.1523, + "step": 11706 + }, + { + "epoch": 0.10162238175015842, + "grad_norm": 0.486328125, + "learning_rate": 0.0019441814703888257, + "loss": 0.0967, + "step": 11707 + }, + { + "epoch": 0.10163106223036258, + "grad_norm": 0.404296875, + "learning_rate": 0.0019441711515524676, + "loss": 0.1299, + "step": 11708 + }, + { + "epoch": 0.10163974271056675, + "grad_norm": 0.6875, + "learning_rate": 0.0019441608317929367, + "loss": 0.1797, + "step": 11709 + }, + { + "epoch": 0.10164842319077091, + "grad_norm": 0.0830078125, + "learning_rate": 0.0019441505111102441, + "loss": 0.1123, + "step": 11710 + }, + { + "epoch": 0.10165710367097508, + "grad_norm": 0.28125, + "learning_rate": 0.0019441401895044013, + "loss": 0.0674, + "step": 11711 + }, + { + "epoch": 0.10166578415117924, + "grad_norm": 0.08447265625, + "learning_rate": 0.0019441298669754196, + "loss": 0.1006, + "step": 11712 + }, + { + "epoch": 0.10167446463138341, + "grad_norm": 0.126953125, + "learning_rate": 0.0019441195435233101, + "loss": 0.1348, + "step": 11713 + }, + { + "epoch": 0.10168314511158758, + "grad_norm": 0.46484375, + "learning_rate": 0.0019441092191480839, + "loss": 0.104, + "step": 11714 + }, + { + "epoch": 0.10169182559179174, + "grad_norm": 0.51171875, + "learning_rate": 0.001944098893849753, + 
"loss": 0.1299, + "step": 11715 + }, + { + "epoch": 0.1017005060719959, + "grad_norm": 0.150390625, + "learning_rate": 0.0019440885676283274, + "loss": 0.1006, + "step": 11716 + }, + { + "epoch": 0.10170918655220007, + "grad_norm": 0.3671875, + "learning_rate": 0.00194407824048382, + "loss": 0.1074, + "step": 11717 + }, + { + "epoch": 0.10171786703240424, + "grad_norm": 0.51171875, + "learning_rate": 0.001944067912416241, + "loss": 0.1299, + "step": 11718 + }, + { + "epoch": 0.1017265475126084, + "grad_norm": 0.115234375, + "learning_rate": 0.0019440575834256022, + "loss": 0.1006, + "step": 11719 + }, + { + "epoch": 0.10173522799281257, + "grad_norm": 0.65234375, + "learning_rate": 0.0019440472535119148, + "loss": 0.1016, + "step": 11720 + }, + { + "epoch": 0.10174390847301673, + "grad_norm": 0.625, + "learning_rate": 0.0019440369226751894, + "loss": 0.1021, + "step": 11721 + }, + { + "epoch": 0.1017525889532209, + "grad_norm": 0.302734375, + "learning_rate": 0.0019440265909154385, + "loss": 0.1162, + "step": 11722 + }, + { + "epoch": 0.10176126943342506, + "grad_norm": 0.10791015625, + "learning_rate": 0.0019440162582326727, + "loss": 0.123, + "step": 11723 + }, + { + "epoch": 0.10176994991362923, + "grad_norm": 0.2197265625, + "learning_rate": 0.001944005924626903, + "loss": 0.1118, + "step": 11724 + }, + { + "epoch": 0.10177863039383339, + "grad_norm": 0.259765625, + "learning_rate": 0.0019439955900981416, + "loss": 0.1582, + "step": 11725 + }, + { + "epoch": 0.10178731087403756, + "grad_norm": 0.08544921875, + "learning_rate": 0.001943985254646399, + "loss": 0.1445, + "step": 11726 + }, + { + "epoch": 0.10179599135424172, + "grad_norm": 0.53515625, + "learning_rate": 0.001943974918271687, + "loss": 0.1191, + "step": 11727 + }, + { + "epoch": 0.10180467183444589, + "grad_norm": 0.953125, + "learning_rate": 0.0019439645809740168, + "loss": 0.127, + "step": 11728 + }, + { + "epoch": 0.10181335231465005, + "grad_norm": 1.3125, + "learning_rate": 0.0019439542427533993, + "loss": 0.1201, + "step": 11729 + }, + { + "epoch": 0.10182203279485422, + "grad_norm": 0.318359375, + "learning_rate": 0.0019439439036098463, + "loss": 0.1006, + "step": 11730 + }, + { + "epoch": 0.10183071327505838, + "grad_norm": 0.23046875, + "learning_rate": 0.0019439335635433688, + "loss": 0.1128, + "step": 11731 + }, + { + "epoch": 0.10183939375526255, + "grad_norm": 0.251953125, + "learning_rate": 0.0019439232225539785, + "loss": 0.1035, + "step": 11732 + }, + { + "epoch": 0.10184807423546671, + "grad_norm": 0.2275390625, + "learning_rate": 0.0019439128806416864, + "loss": 0.1143, + "step": 11733 + }, + { + "epoch": 0.10185675471567088, + "grad_norm": 0.5234375, + "learning_rate": 0.0019439025378065037, + "loss": 0.1221, + "step": 11734 + }, + { + "epoch": 0.10186543519587504, + "grad_norm": 0.08056640625, + "learning_rate": 0.001943892194048442, + "loss": 0.1011, + "step": 11735 + }, + { + "epoch": 0.1018741156760792, + "grad_norm": 0.189453125, + "learning_rate": 0.001943881849367512, + "loss": 0.0889, + "step": 11736 + }, + { + "epoch": 0.10188279615628336, + "grad_norm": 0.41796875, + "learning_rate": 0.001943871503763726, + "loss": 0.1367, + "step": 11737 + }, + { + "epoch": 0.10189147663648752, + "grad_norm": 0.2001953125, + "learning_rate": 0.0019438611572370945, + "loss": 0.1152, + "step": 11738 + }, + { + "epoch": 0.10190015711669169, + "grad_norm": 0.80078125, + "learning_rate": 0.0019438508097876295, + "loss": 0.1416, + "step": 11739 + }, + { + "epoch": 0.10190883759689585, + "grad_norm": 0.1650390625, + 
"learning_rate": 0.0019438404614153418, + "loss": 0.1523, + "step": 11740 + }, + { + "epoch": 0.10191751807710002, + "grad_norm": 0.09228515625, + "learning_rate": 0.0019438301121202425, + "loss": 0.124, + "step": 11741 + }, + { + "epoch": 0.10192619855730418, + "grad_norm": 0.82421875, + "learning_rate": 0.0019438197619023436, + "loss": 0.1533, + "step": 11742 + }, + { + "epoch": 0.10193487903750835, + "grad_norm": 0.162109375, + "learning_rate": 0.001943809410761656, + "loss": 0.084, + "step": 11743 + }, + { + "epoch": 0.10194355951771251, + "grad_norm": 0.1376953125, + "learning_rate": 0.001943799058698191, + "loss": 0.0938, + "step": 11744 + }, + { + "epoch": 0.10195223999791668, + "grad_norm": 0.439453125, + "learning_rate": 0.0019437887057119603, + "loss": 0.1196, + "step": 11745 + }, + { + "epoch": 0.10196092047812085, + "grad_norm": 0.103515625, + "learning_rate": 0.0019437783518029747, + "loss": 0.1396, + "step": 11746 + }, + { + "epoch": 0.10196960095832501, + "grad_norm": 0.427734375, + "learning_rate": 0.0019437679969712461, + "loss": 0.1289, + "step": 11747 + }, + { + "epoch": 0.10197828143852918, + "grad_norm": 0.65234375, + "learning_rate": 0.0019437576412167851, + "loss": 0.0879, + "step": 11748 + }, + { + "epoch": 0.10198696191873334, + "grad_norm": 0.1552734375, + "learning_rate": 0.0019437472845396037, + "loss": 0.1582, + "step": 11749 + }, + { + "epoch": 0.1019956423989375, + "grad_norm": 0.310546875, + "learning_rate": 0.0019437369269397126, + "loss": 0.1094, + "step": 11750 + }, + { + "epoch": 0.10200432287914167, + "grad_norm": 0.546875, + "learning_rate": 0.001943726568417124, + "loss": 0.166, + "step": 11751 + }, + { + "epoch": 0.10201300335934584, + "grad_norm": 0.2109375, + "learning_rate": 0.0019437162089718484, + "loss": 0.0991, + "step": 11752 + }, + { + "epoch": 0.10202168383955, + "grad_norm": 0.2734375, + "learning_rate": 0.0019437058486038972, + "loss": 0.1455, + "step": 11753 + }, + { + "epoch": 0.10203036431975417, + "grad_norm": 0.251953125, + "learning_rate": 0.0019436954873132822, + "loss": 0.0908, + "step": 11754 + }, + { + "epoch": 0.10203904479995833, + "grad_norm": 0.130859375, + "learning_rate": 0.0019436851251000147, + "loss": 0.1289, + "step": 11755 + }, + { + "epoch": 0.1020477252801625, + "grad_norm": 0.08154296875, + "learning_rate": 0.0019436747619641056, + "loss": 0.1289, + "step": 11756 + }, + { + "epoch": 0.10205640576036666, + "grad_norm": 0.357421875, + "learning_rate": 0.0019436643979055665, + "loss": 0.1309, + "step": 11757 + }, + { + "epoch": 0.10206508624057083, + "grad_norm": 0.5078125, + "learning_rate": 0.0019436540329244086, + "loss": 0.1016, + "step": 11758 + }, + { + "epoch": 0.10207376672077499, + "grad_norm": 0.193359375, + "learning_rate": 0.0019436436670206433, + "loss": 0.1406, + "step": 11759 + }, + { + "epoch": 0.10208244720097916, + "grad_norm": 0.384765625, + "learning_rate": 0.0019436333001942821, + "loss": 0.1455, + "step": 11760 + }, + { + "epoch": 0.10209112768118332, + "grad_norm": 0.09228515625, + "learning_rate": 0.001943622932445336, + "loss": 0.1064, + "step": 11761 + }, + { + "epoch": 0.10209980816138749, + "grad_norm": 0.208984375, + "learning_rate": 0.0019436125637738166, + "loss": 0.1094, + "step": 11762 + }, + { + "epoch": 0.10210848864159165, + "grad_norm": 0.49609375, + "learning_rate": 0.0019436021941797354, + "loss": 0.1191, + "step": 11763 + }, + { + "epoch": 0.10211716912179582, + "grad_norm": 0.1376953125, + "learning_rate": 0.0019435918236631036, + "loss": 0.0996, + "step": 11764 + }, + { + 
"epoch": 0.10212584960199998, + "grad_norm": 0.341796875, + "learning_rate": 0.0019435814522239321, + "loss": 0.1494, + "step": 11765 + }, + { + "epoch": 0.10213453008220415, + "grad_norm": 0.49609375, + "learning_rate": 0.0019435710798622328, + "loss": 0.0947, + "step": 11766 + }, + { + "epoch": 0.10214321056240831, + "grad_norm": 0.1923828125, + "learning_rate": 0.0019435607065780168, + "loss": 0.0942, + "step": 11767 + }, + { + "epoch": 0.10215189104261248, + "grad_norm": 1.125, + "learning_rate": 0.0019435503323712953, + "loss": 0.1177, + "step": 11768 + }, + { + "epoch": 0.10216057152281664, + "grad_norm": 0.80078125, + "learning_rate": 0.00194353995724208, + "loss": 0.166, + "step": 11769 + }, + { + "epoch": 0.10216925200302081, + "grad_norm": 0.166015625, + "learning_rate": 0.001943529581190382, + "loss": 0.0957, + "step": 11770 + }, + { + "epoch": 0.10217793248322497, + "grad_norm": 0.2255859375, + "learning_rate": 0.0019435192042162128, + "loss": 0.1348, + "step": 11771 + }, + { + "epoch": 0.10218661296342914, + "grad_norm": 0.5625, + "learning_rate": 0.0019435088263195838, + "loss": 0.0967, + "step": 11772 + }, + { + "epoch": 0.1021952934436333, + "grad_norm": 0.5234375, + "learning_rate": 0.0019434984475005062, + "loss": 0.0977, + "step": 11773 + }, + { + "epoch": 0.10220397392383747, + "grad_norm": 0.337890625, + "learning_rate": 0.0019434880677589913, + "loss": 0.0996, + "step": 11774 + }, + { + "epoch": 0.10221265440404163, + "grad_norm": 0.45703125, + "learning_rate": 0.0019434776870950504, + "loss": 0.1133, + "step": 11775 + }, + { + "epoch": 0.1022213348842458, + "grad_norm": 0.7109375, + "learning_rate": 0.0019434673055086949, + "loss": 0.1416, + "step": 11776 + }, + { + "epoch": 0.10223001536444996, + "grad_norm": 0.4296875, + "learning_rate": 0.0019434569229999366, + "loss": 0.1079, + "step": 11777 + }, + { + "epoch": 0.10223869584465413, + "grad_norm": 0.134765625, + "learning_rate": 0.001943446539568786, + "loss": 0.1533, + "step": 11778 + }, + { + "epoch": 0.1022473763248583, + "grad_norm": 0.10693359375, + "learning_rate": 0.0019434361552152556, + "loss": 0.0913, + "step": 11779 + }, + { + "epoch": 0.10225605680506246, + "grad_norm": 0.44921875, + "learning_rate": 0.0019434257699393555, + "loss": 0.106, + "step": 11780 + }, + { + "epoch": 0.10226473728526662, + "grad_norm": 0.1455078125, + "learning_rate": 0.0019434153837410977, + "loss": 0.1055, + "step": 11781 + }, + { + "epoch": 0.10227341776547079, + "grad_norm": 0.341796875, + "learning_rate": 0.0019434049966204937, + "loss": 0.0986, + "step": 11782 + }, + { + "epoch": 0.10228209824567495, + "grad_norm": 0.41015625, + "learning_rate": 0.0019433946085775547, + "loss": 0.125, + "step": 11783 + }, + { + "epoch": 0.10229077872587912, + "grad_norm": 0.31640625, + "learning_rate": 0.001943384219612292, + "loss": 0.124, + "step": 11784 + }, + { + "epoch": 0.10229945920608328, + "grad_norm": 0.26953125, + "learning_rate": 0.001943373829724717, + "loss": 0.1348, + "step": 11785 + }, + { + "epoch": 0.10230813968628745, + "grad_norm": 0.220703125, + "learning_rate": 0.0019433634389148407, + "loss": 0.0986, + "step": 11786 + }, + { + "epoch": 0.10231682016649161, + "grad_norm": 0.216796875, + "learning_rate": 0.0019433530471826752, + "loss": 0.0947, + "step": 11787 + }, + { + "epoch": 0.10232550064669578, + "grad_norm": 0.373046875, + "learning_rate": 0.0019433426545282313, + "loss": 0.125, + "step": 11788 + }, + { + "epoch": 0.10233418112689995, + "grad_norm": 0.1220703125, + "learning_rate": 0.0019433322609515207, + 
"loss": 0.1143, + "step": 11789 + }, + { + "epoch": 0.10234286160710411, + "grad_norm": 0.5, + "learning_rate": 0.0019433218664525545, + "loss": 0.1279, + "step": 11790 + }, + { + "epoch": 0.10235154208730828, + "grad_norm": 0.2392578125, + "learning_rate": 0.0019433114710313442, + "loss": 0.0723, + "step": 11791 + }, + { + "epoch": 0.10236022256751244, + "grad_norm": 0.365234375, + "learning_rate": 0.0019433010746879012, + "loss": 0.1484, + "step": 11792 + }, + { + "epoch": 0.1023689030477166, + "grad_norm": 0.49609375, + "learning_rate": 0.0019432906774222367, + "loss": 0.1221, + "step": 11793 + }, + { + "epoch": 0.10237758352792077, + "grad_norm": 0.07958984375, + "learning_rate": 0.0019432802792343625, + "loss": 0.0918, + "step": 11794 + }, + { + "epoch": 0.10238626400812494, + "grad_norm": 0.6953125, + "learning_rate": 0.0019432698801242893, + "loss": 0.1162, + "step": 11795 + }, + { + "epoch": 0.1023949444883291, + "grad_norm": 0.115234375, + "learning_rate": 0.001943259480092029, + "loss": 0.1221, + "step": 11796 + }, + { + "epoch": 0.10240362496853327, + "grad_norm": 0.859375, + "learning_rate": 0.0019432490791375925, + "loss": 0.1445, + "step": 11797 + }, + { + "epoch": 0.10241230544873742, + "grad_norm": 0.1240234375, + "learning_rate": 0.001943238677260992, + "loss": 0.1162, + "step": 11798 + }, + { + "epoch": 0.10242098592894158, + "grad_norm": 0.41015625, + "learning_rate": 0.001943228274462238, + "loss": 0.1084, + "step": 11799 + }, + { + "epoch": 0.10242966640914575, + "grad_norm": 1.953125, + "learning_rate": 0.0019432178707413424, + "loss": 0.1973, + "step": 11800 + }, + { + "epoch": 0.10243834688934991, + "grad_norm": 0.310546875, + "learning_rate": 0.0019432074660983162, + "loss": 0.1074, + "step": 11801 + }, + { + "epoch": 0.10244702736955408, + "grad_norm": 0.2314453125, + "learning_rate": 0.001943197060533171, + "loss": 0.1064, + "step": 11802 + }, + { + "epoch": 0.10245570784975824, + "grad_norm": 0.22265625, + "learning_rate": 0.0019431866540459183, + "loss": 0.1113, + "step": 11803 + }, + { + "epoch": 0.10246438832996241, + "grad_norm": 1.109375, + "learning_rate": 0.0019431762466365695, + "loss": 0.125, + "step": 11804 + }, + { + "epoch": 0.10247306881016657, + "grad_norm": 0.51953125, + "learning_rate": 0.0019431658383051355, + "loss": 0.1045, + "step": 11805 + }, + { + "epoch": 0.10248174929037074, + "grad_norm": 1.0546875, + "learning_rate": 0.001943155429051628, + "loss": 0.0928, + "step": 11806 + }, + { + "epoch": 0.1024904297705749, + "grad_norm": 0.291015625, + "learning_rate": 0.0019431450188760586, + "loss": 0.0845, + "step": 11807 + }, + { + "epoch": 0.10249911025077907, + "grad_norm": 0.10400390625, + "learning_rate": 0.0019431346077784385, + "loss": 0.103, + "step": 11808 + }, + { + "epoch": 0.10250779073098323, + "grad_norm": 0.251953125, + "learning_rate": 0.0019431241957587788, + "loss": 0.1338, + "step": 11809 + }, + { + "epoch": 0.1025164712111874, + "grad_norm": 0.68359375, + "learning_rate": 0.0019431137828170915, + "loss": 0.1406, + "step": 11810 + }, + { + "epoch": 0.10252515169139156, + "grad_norm": 0.5703125, + "learning_rate": 0.0019431033689533872, + "loss": 0.1328, + "step": 11811 + }, + { + "epoch": 0.10253383217159573, + "grad_norm": 0.416015625, + "learning_rate": 0.0019430929541676782, + "loss": 0.0845, + "step": 11812 + }, + { + "epoch": 0.1025425126517999, + "grad_norm": 0.625, + "learning_rate": 0.001943082538459975, + "loss": 0.1357, + "step": 11813 + }, + { + "epoch": 0.10255119313200406, + "grad_norm": 0.58984375, + 
"learning_rate": 0.0019430721218302897, + "loss": 0.1289, + "step": 11814 + }, + { + "epoch": 0.10255987361220822, + "grad_norm": 0.81640625, + "learning_rate": 0.0019430617042786333, + "loss": 0.0991, + "step": 11815 + }, + { + "epoch": 0.10256855409241239, + "grad_norm": 0.2451171875, + "learning_rate": 0.0019430512858050171, + "loss": 0.1172, + "step": 11816 + }, + { + "epoch": 0.10257723457261655, + "grad_norm": 0.107421875, + "learning_rate": 0.001943040866409453, + "loss": 0.1021, + "step": 11817 + }, + { + "epoch": 0.10258591505282072, + "grad_norm": 0.52734375, + "learning_rate": 0.001943030446091952, + "loss": 0.0913, + "step": 11818 + }, + { + "epoch": 0.10259459553302488, + "grad_norm": 0.671875, + "learning_rate": 0.0019430200248525254, + "loss": 0.0996, + "step": 11819 + }, + { + "epoch": 0.10260327601322905, + "grad_norm": 0.251953125, + "learning_rate": 0.001943009602691185, + "loss": 0.1621, + "step": 11820 + }, + { + "epoch": 0.10261195649343322, + "grad_norm": 0.2333984375, + "learning_rate": 0.0019429991796079415, + "loss": 0.1211, + "step": 11821 + }, + { + "epoch": 0.10262063697363738, + "grad_norm": 0.055419921875, + "learning_rate": 0.0019429887556028074, + "loss": 0.1094, + "step": 11822 + }, + { + "epoch": 0.10262931745384155, + "grad_norm": 0.671875, + "learning_rate": 0.0019429783306757929, + "loss": 0.1758, + "step": 11823 + }, + { + "epoch": 0.10263799793404571, + "grad_norm": 0.3671875, + "learning_rate": 0.0019429679048269103, + "loss": 0.1387, + "step": 11824 + }, + { + "epoch": 0.10264667841424988, + "grad_norm": 0.77734375, + "learning_rate": 0.0019429574780561708, + "loss": 0.207, + "step": 11825 + }, + { + "epoch": 0.10265535889445404, + "grad_norm": 0.328125, + "learning_rate": 0.0019429470503635853, + "loss": 0.0732, + "step": 11826 + }, + { + "epoch": 0.1026640393746582, + "grad_norm": 0.134765625, + "learning_rate": 0.0019429366217491657, + "loss": 0.1143, + "step": 11827 + }, + { + "epoch": 0.10267271985486237, + "grad_norm": 0.25, + "learning_rate": 0.0019429261922129233, + "loss": 0.1216, + "step": 11828 + }, + { + "epoch": 0.10268140033506654, + "grad_norm": 0.423828125, + "learning_rate": 0.0019429157617548694, + "loss": 0.1104, + "step": 11829 + }, + { + "epoch": 0.1026900808152707, + "grad_norm": 0.345703125, + "learning_rate": 0.0019429053303750157, + "loss": 0.0996, + "step": 11830 + }, + { + "epoch": 0.10269876129547487, + "grad_norm": 0.294921875, + "learning_rate": 0.0019428948980733732, + "loss": 0.1064, + "step": 11831 + }, + { + "epoch": 0.10270744177567903, + "grad_norm": 0.306640625, + "learning_rate": 0.0019428844648499534, + "loss": 0.1157, + "step": 11832 + }, + { + "epoch": 0.1027161222558832, + "grad_norm": 0.66796875, + "learning_rate": 0.0019428740307047684, + "loss": 0.1025, + "step": 11833 + }, + { + "epoch": 0.10272480273608736, + "grad_norm": 0.9296875, + "learning_rate": 0.0019428635956378283, + "loss": 0.127, + "step": 11834 + }, + { + "epoch": 0.10273348321629153, + "grad_norm": 0.2109375, + "learning_rate": 0.0019428531596491457, + "loss": 0.1455, + "step": 11835 + }, + { + "epoch": 0.10274216369649569, + "grad_norm": 0.13671875, + "learning_rate": 0.0019428427227387312, + "loss": 0.1455, + "step": 11836 + }, + { + "epoch": 0.10275084417669986, + "grad_norm": 0.50390625, + "learning_rate": 0.0019428322849065966, + "loss": 0.0996, + "step": 11837 + }, + { + "epoch": 0.10275952465690402, + "grad_norm": 0.208984375, + "learning_rate": 0.0019428218461527535, + "loss": 0.1055, + "step": 11838 + }, + { + "epoch": 
0.10276820513710819, + "grad_norm": 0.87890625, + "learning_rate": 0.0019428114064772128, + "loss": 0.1133, + "step": 11839 + }, + { + "epoch": 0.10277688561731235, + "grad_norm": 0.07177734375, + "learning_rate": 0.0019428009658799866, + "loss": 0.0864, + "step": 11840 + }, + { + "epoch": 0.10278556609751652, + "grad_norm": 0.08544921875, + "learning_rate": 0.001942790524361086, + "loss": 0.1133, + "step": 11841 + }, + { + "epoch": 0.10279424657772068, + "grad_norm": 0.2421875, + "learning_rate": 0.001942780081920522, + "loss": 0.0957, + "step": 11842 + }, + { + "epoch": 0.10280292705792485, + "grad_norm": 0.1640625, + "learning_rate": 0.0019427696385583065, + "loss": 0.1045, + "step": 11843 + }, + { + "epoch": 0.10281160753812901, + "grad_norm": 0.58203125, + "learning_rate": 0.0019427591942744508, + "loss": 0.1191, + "step": 11844 + }, + { + "epoch": 0.10282028801833318, + "grad_norm": 0.12255859375, + "learning_rate": 0.0019427487490689661, + "loss": 0.1016, + "step": 11845 + }, + { + "epoch": 0.10282896849853734, + "grad_norm": 0.73046875, + "learning_rate": 0.0019427383029418641, + "loss": 0.2158, + "step": 11846 + }, + { + "epoch": 0.10283764897874151, + "grad_norm": 0.6640625, + "learning_rate": 0.0019427278558931563, + "loss": 0.124, + "step": 11847 + }, + { + "epoch": 0.10284632945894567, + "grad_norm": 0.3359375, + "learning_rate": 0.001942717407922854, + "loss": 0.085, + "step": 11848 + }, + { + "epoch": 0.10285500993914984, + "grad_norm": 0.2001953125, + "learning_rate": 0.0019427069590309686, + "loss": 0.126, + "step": 11849 + }, + { + "epoch": 0.102863690419354, + "grad_norm": 0.2353515625, + "learning_rate": 0.0019426965092175113, + "loss": 0.1191, + "step": 11850 + }, + { + "epoch": 0.10287237089955817, + "grad_norm": 0.1474609375, + "learning_rate": 0.001942686058482494, + "loss": 0.1172, + "step": 11851 + }, + { + "epoch": 0.10288105137976233, + "grad_norm": 0.515625, + "learning_rate": 0.0019426756068259277, + "loss": 0.1064, + "step": 11852 + }, + { + "epoch": 0.1028897318599665, + "grad_norm": 0.361328125, + "learning_rate": 0.0019426651542478243, + "loss": 0.1338, + "step": 11853 + }, + { + "epoch": 0.10289841234017066, + "grad_norm": 0.458984375, + "learning_rate": 0.0019426547007481948, + "loss": 0.1216, + "step": 11854 + }, + { + "epoch": 0.10290709282037483, + "grad_norm": 0.3671875, + "learning_rate": 0.0019426442463270507, + "loss": 0.1182, + "step": 11855 + }, + { + "epoch": 0.102915773300579, + "grad_norm": 0.703125, + "learning_rate": 0.0019426337909844034, + "loss": 0.1836, + "step": 11856 + }, + { + "epoch": 0.10292445378078316, + "grad_norm": 0.66015625, + "learning_rate": 0.0019426233347202646, + "loss": 0.0957, + "step": 11857 + }, + { + "epoch": 0.10293313426098732, + "grad_norm": 0.150390625, + "learning_rate": 0.0019426128775346455, + "loss": 0.1201, + "step": 11858 + }, + { + "epoch": 0.10294181474119148, + "grad_norm": 0.33203125, + "learning_rate": 0.0019426024194275579, + "loss": 0.1167, + "step": 11859 + }, + { + "epoch": 0.10295049522139564, + "grad_norm": 0.1181640625, + "learning_rate": 0.0019425919603990127, + "loss": 0.1216, + "step": 11860 + }, + { + "epoch": 0.1029591757015998, + "grad_norm": 0.2109375, + "learning_rate": 0.0019425815004490215, + "loss": 0.0996, + "step": 11861 + }, + { + "epoch": 0.10296785618180397, + "grad_norm": 0.7421875, + "learning_rate": 0.0019425710395775961, + "loss": 0.1045, + "step": 11862 + }, + { + "epoch": 0.10297653666200814, + "grad_norm": 0.1494140625, + "learning_rate": 0.0019425605777847474, + "loss": 
0.1719, + "step": 11863 + }, + { + "epoch": 0.1029852171422123, + "grad_norm": 0.58203125, + "learning_rate": 0.0019425501150704873, + "loss": 0.0903, + "step": 11864 + }, + { + "epoch": 0.10299389762241647, + "grad_norm": 0.259765625, + "learning_rate": 0.001942539651434827, + "loss": 0.1475, + "step": 11865 + }, + { + "epoch": 0.10300257810262063, + "grad_norm": 1.4296875, + "learning_rate": 0.0019425291868777781, + "loss": 0.1299, + "step": 11866 + }, + { + "epoch": 0.1030112585828248, + "grad_norm": 0.11181640625, + "learning_rate": 0.0019425187213993518, + "loss": 0.127, + "step": 11867 + }, + { + "epoch": 0.10301993906302896, + "grad_norm": 0.259765625, + "learning_rate": 0.0019425082549995598, + "loss": 0.1035, + "step": 11868 + }, + { + "epoch": 0.10302861954323313, + "grad_norm": 0.2734375, + "learning_rate": 0.0019424977876784137, + "loss": 0.1143, + "step": 11869 + }, + { + "epoch": 0.10303730002343729, + "grad_norm": 0.6875, + "learning_rate": 0.001942487319435924, + "loss": 0.1104, + "step": 11870 + }, + { + "epoch": 0.10304598050364146, + "grad_norm": 0.06884765625, + "learning_rate": 0.0019424768502721032, + "loss": 0.1289, + "step": 11871 + }, + { + "epoch": 0.10305466098384562, + "grad_norm": 0.169921875, + "learning_rate": 0.0019424663801869623, + "loss": 0.1406, + "step": 11872 + }, + { + "epoch": 0.10306334146404979, + "grad_norm": 0.40234375, + "learning_rate": 0.0019424559091805128, + "loss": 0.1172, + "step": 11873 + }, + { + "epoch": 0.10307202194425395, + "grad_norm": 0.486328125, + "learning_rate": 0.0019424454372527662, + "loss": 0.1104, + "step": 11874 + }, + { + "epoch": 0.10308070242445812, + "grad_norm": 0.21875, + "learning_rate": 0.0019424349644037342, + "loss": 0.1133, + "step": 11875 + }, + { + "epoch": 0.10308938290466228, + "grad_norm": 0.53515625, + "learning_rate": 0.0019424244906334277, + "loss": 0.1445, + "step": 11876 + }, + { + "epoch": 0.10309806338486645, + "grad_norm": 0.470703125, + "learning_rate": 0.0019424140159418585, + "loss": 0.1885, + "step": 11877 + }, + { + "epoch": 0.10310674386507061, + "grad_norm": 0.1337890625, + "learning_rate": 0.001942403540329038, + "loss": 0.1426, + "step": 11878 + }, + { + "epoch": 0.10311542434527478, + "grad_norm": 0.40625, + "learning_rate": 0.0019423930637949777, + "loss": 0.1484, + "step": 11879 + }, + { + "epoch": 0.10312410482547894, + "grad_norm": 0.08740234375, + "learning_rate": 0.0019423825863396887, + "loss": 0.1143, + "step": 11880 + }, + { + "epoch": 0.10313278530568311, + "grad_norm": 0.6953125, + "learning_rate": 0.001942372107963183, + "loss": 0.1006, + "step": 11881 + }, + { + "epoch": 0.10314146578588727, + "grad_norm": 0.50390625, + "learning_rate": 0.0019423616286654718, + "loss": 0.1221, + "step": 11882 + }, + { + "epoch": 0.10315014626609144, + "grad_norm": 0.38671875, + "learning_rate": 0.0019423511484465668, + "loss": 0.1064, + "step": 11883 + }, + { + "epoch": 0.1031588267462956, + "grad_norm": 0.1328125, + "learning_rate": 0.001942340667306479, + "loss": 0.0933, + "step": 11884 + }, + { + "epoch": 0.10316750722649977, + "grad_norm": 0.314453125, + "learning_rate": 0.00194233018524522, + "loss": 0.1011, + "step": 11885 + }, + { + "epoch": 0.10317618770670393, + "grad_norm": 0.1533203125, + "learning_rate": 0.0019423197022628016, + "loss": 0.1035, + "step": 11886 + }, + { + "epoch": 0.1031848681869081, + "grad_norm": 0.1005859375, + "learning_rate": 0.0019423092183592347, + "loss": 0.0928, + "step": 11887 + }, + { + "epoch": 0.10319354866711226, + "grad_norm": 0.89453125, + 
"learning_rate": 0.0019422987335345314, + "loss": 0.1289, + "step": 11888 + }, + { + "epoch": 0.10320222914731643, + "grad_norm": 0.07958984375, + "learning_rate": 0.0019422882477887027, + "loss": 0.1035, + "step": 11889 + }, + { + "epoch": 0.1032109096275206, + "grad_norm": 0.1328125, + "learning_rate": 0.0019422777611217603, + "loss": 0.1455, + "step": 11890 + }, + { + "epoch": 0.10321959010772476, + "grad_norm": 0.2109375, + "learning_rate": 0.0019422672735337158, + "loss": 0.1758, + "step": 11891 + }, + { + "epoch": 0.10322827058792892, + "grad_norm": 0.244140625, + "learning_rate": 0.0019422567850245803, + "loss": 0.1211, + "step": 11892 + }, + { + "epoch": 0.10323695106813309, + "grad_norm": 0.33984375, + "learning_rate": 0.0019422462955943653, + "loss": 0.1377, + "step": 11893 + }, + { + "epoch": 0.10324563154833725, + "grad_norm": 0.3671875, + "learning_rate": 0.0019422358052430823, + "loss": 0.1045, + "step": 11894 + }, + { + "epoch": 0.10325431202854142, + "grad_norm": 0.1318359375, + "learning_rate": 0.0019422253139707436, + "loss": 0.1406, + "step": 11895 + }, + { + "epoch": 0.10326299250874559, + "grad_norm": 0.134765625, + "learning_rate": 0.001942214821777359, + "loss": 0.1143, + "step": 11896 + }, + { + "epoch": 0.10327167298894975, + "grad_norm": 0.82421875, + "learning_rate": 0.0019422043286629416, + "loss": 0.1123, + "step": 11897 + }, + { + "epoch": 0.10328035346915392, + "grad_norm": 0.236328125, + "learning_rate": 0.0019421938346275017, + "loss": 0.1699, + "step": 11898 + }, + { + "epoch": 0.10328903394935808, + "grad_norm": 0.734375, + "learning_rate": 0.0019421833396710516, + "loss": 0.1377, + "step": 11899 + }, + { + "epoch": 0.10329771442956225, + "grad_norm": 0.146484375, + "learning_rate": 0.0019421728437936023, + "loss": 0.1152, + "step": 11900 + }, + { + "epoch": 0.10330639490976641, + "grad_norm": 0.125, + "learning_rate": 0.001942162346995166, + "loss": 0.0957, + "step": 11901 + }, + { + "epoch": 0.10331507538997058, + "grad_norm": 0.1337890625, + "learning_rate": 0.0019421518492757528, + "loss": 0.1426, + "step": 11902 + }, + { + "epoch": 0.10332375587017474, + "grad_norm": 0.365234375, + "learning_rate": 0.0019421413506353754, + "loss": 0.1113, + "step": 11903 + }, + { + "epoch": 0.1033324363503789, + "grad_norm": 0.216796875, + "learning_rate": 0.0019421308510740448, + "loss": 0.168, + "step": 11904 + }, + { + "epoch": 0.10334111683058307, + "grad_norm": 0.490234375, + "learning_rate": 0.0019421203505917727, + "loss": 0.1611, + "step": 11905 + }, + { + "epoch": 0.10334979731078724, + "grad_norm": 1.453125, + "learning_rate": 0.00194210984918857, + "loss": 0.1162, + "step": 11906 + }, + { + "epoch": 0.1033584777909914, + "grad_norm": 0.2451171875, + "learning_rate": 0.001942099346864449, + "loss": 0.1289, + "step": 11907 + }, + { + "epoch": 0.10336715827119557, + "grad_norm": 0.20703125, + "learning_rate": 0.0019420888436194206, + "loss": 0.1699, + "step": 11908 + }, + { + "epoch": 0.10337583875139973, + "grad_norm": 0.11865234375, + "learning_rate": 0.0019420783394534969, + "loss": 0.1079, + "step": 11909 + }, + { + "epoch": 0.1033845192316039, + "grad_norm": 0.306640625, + "learning_rate": 0.0019420678343666884, + "loss": 0.1367, + "step": 11910 + }, + { + "epoch": 0.10339319971180806, + "grad_norm": 0.1708984375, + "learning_rate": 0.0019420573283590076, + "loss": 0.1299, + "step": 11911 + }, + { + "epoch": 0.10340188019201223, + "grad_norm": 0.57421875, + "learning_rate": 0.0019420468214304653, + "loss": 0.1406, + "step": 11912 + }, + { + "epoch": 
0.10341056067221639, + "grad_norm": 0.35546875, + "learning_rate": 0.0019420363135810735, + "loss": 0.1523, + "step": 11913 + }, + { + "epoch": 0.10341924115242056, + "grad_norm": 0.6953125, + "learning_rate": 0.001942025804810843, + "loss": 0.1187, + "step": 11914 + }, + { + "epoch": 0.10342792163262472, + "grad_norm": 0.09521484375, + "learning_rate": 0.0019420152951197863, + "loss": 0.1035, + "step": 11915 + }, + { + "epoch": 0.10343660211282889, + "grad_norm": 0.28125, + "learning_rate": 0.0019420047845079141, + "loss": 0.0933, + "step": 11916 + }, + { + "epoch": 0.10344528259303305, + "grad_norm": 0.376953125, + "learning_rate": 0.0019419942729752382, + "loss": 0.1562, + "step": 11917 + }, + { + "epoch": 0.10345396307323722, + "grad_norm": 0.71875, + "learning_rate": 0.0019419837605217697, + "loss": 0.1055, + "step": 11918 + }, + { + "epoch": 0.10346264355344138, + "grad_norm": 0.142578125, + "learning_rate": 0.0019419732471475204, + "loss": 0.1221, + "step": 11919 + }, + { + "epoch": 0.10347132403364555, + "grad_norm": 0.86328125, + "learning_rate": 0.0019419627328525022, + "loss": 0.127, + "step": 11920 + }, + { + "epoch": 0.1034800045138497, + "grad_norm": 0.55078125, + "learning_rate": 0.0019419522176367262, + "loss": 0.1074, + "step": 11921 + }, + { + "epoch": 0.10348868499405386, + "grad_norm": 0.271484375, + "learning_rate": 0.0019419417015002034, + "loss": 0.1504, + "step": 11922 + }, + { + "epoch": 0.10349736547425803, + "grad_norm": 0.2138671875, + "learning_rate": 0.0019419311844429462, + "loss": 0.1318, + "step": 11923 + }, + { + "epoch": 0.1035060459544622, + "grad_norm": 0.494140625, + "learning_rate": 0.0019419206664649657, + "loss": 0.1045, + "step": 11924 + }, + { + "epoch": 0.10351472643466636, + "grad_norm": 0.1669921875, + "learning_rate": 0.0019419101475662733, + "loss": 0.1553, + "step": 11925 + }, + { + "epoch": 0.10352340691487052, + "grad_norm": 0.09228515625, + "learning_rate": 0.0019418996277468807, + "loss": 0.1484, + "step": 11926 + }, + { + "epoch": 0.10353208739507469, + "grad_norm": 1.0625, + "learning_rate": 0.0019418891070067993, + "loss": 0.1099, + "step": 11927 + }, + { + "epoch": 0.10354076787527886, + "grad_norm": 1.3515625, + "learning_rate": 0.0019418785853460405, + "loss": 0.1836, + "step": 11928 + }, + { + "epoch": 0.10354944835548302, + "grad_norm": 0.28515625, + "learning_rate": 0.001941868062764616, + "loss": 0.1445, + "step": 11929 + }, + { + "epoch": 0.10355812883568719, + "grad_norm": 0.16015625, + "learning_rate": 0.0019418575392625374, + "loss": 0.1357, + "step": 11930 + }, + { + "epoch": 0.10356680931589135, + "grad_norm": 0.2236328125, + "learning_rate": 0.0019418470148398158, + "loss": 0.1797, + "step": 11931 + }, + { + "epoch": 0.10357548979609552, + "grad_norm": 0.3359375, + "learning_rate": 0.0019418364894964628, + "loss": 0.0972, + "step": 11932 + }, + { + "epoch": 0.10358417027629968, + "grad_norm": 0.625, + "learning_rate": 0.0019418259632324904, + "loss": 0.1514, + "step": 11933 + }, + { + "epoch": 0.10359285075650385, + "grad_norm": 0.28125, + "learning_rate": 0.00194181543604791, + "loss": 0.1543, + "step": 11934 + }, + { + "epoch": 0.10360153123670801, + "grad_norm": 0.4921875, + "learning_rate": 0.0019418049079427322, + "loss": 0.1699, + "step": 11935 + }, + { + "epoch": 0.10361021171691218, + "grad_norm": 0.08056640625, + "learning_rate": 0.0019417943789169698, + "loss": 0.1216, + "step": 11936 + }, + { + "epoch": 0.10361889219711634, + "grad_norm": 0.1298828125, + "learning_rate": 0.0019417838489706335, + "loss": 0.126, 
+ "step": 11937 + }, + { + "epoch": 0.1036275726773205, + "grad_norm": 0.11669921875, + "learning_rate": 0.001941773318103735, + "loss": 0.1289, + "step": 11938 + }, + { + "epoch": 0.10363625315752467, + "grad_norm": 0.4453125, + "learning_rate": 0.0019417627863162858, + "loss": 0.0986, + "step": 11939 + }, + { + "epoch": 0.10364493363772884, + "grad_norm": 0.25, + "learning_rate": 0.0019417522536082977, + "loss": 0.1367, + "step": 11940 + }, + { + "epoch": 0.103653614117933, + "grad_norm": 0.5078125, + "learning_rate": 0.0019417417199797816, + "loss": 0.1328, + "step": 11941 + }, + { + "epoch": 0.10366229459813717, + "grad_norm": 0.09130859375, + "learning_rate": 0.0019417311854307495, + "loss": 0.125, + "step": 11942 + }, + { + "epoch": 0.10367097507834133, + "grad_norm": 0.318359375, + "learning_rate": 0.0019417206499612132, + "loss": 0.1162, + "step": 11943 + }, + { + "epoch": 0.1036796555585455, + "grad_norm": 0.306640625, + "learning_rate": 0.0019417101135711834, + "loss": 0.1226, + "step": 11944 + }, + { + "epoch": 0.10368833603874966, + "grad_norm": 0.341796875, + "learning_rate": 0.0019416995762606725, + "loss": 0.1006, + "step": 11945 + }, + { + "epoch": 0.10369701651895383, + "grad_norm": 1.4375, + "learning_rate": 0.0019416890380296913, + "loss": 0.4785, + "step": 11946 + }, + { + "epoch": 0.10370569699915799, + "grad_norm": 1.1484375, + "learning_rate": 0.0019416784988782517, + "loss": 0.1289, + "step": 11947 + }, + { + "epoch": 0.10371437747936216, + "grad_norm": 0.19921875, + "learning_rate": 0.001941667958806365, + "loss": 0.104, + "step": 11948 + }, + { + "epoch": 0.10372305795956632, + "grad_norm": 0.2890625, + "learning_rate": 0.001941657417814043, + "loss": 0.0938, + "step": 11949 + }, + { + "epoch": 0.10373173843977049, + "grad_norm": 0.0693359375, + "learning_rate": 0.0019416468759012968, + "loss": 0.0698, + "step": 11950 + }, + { + "epoch": 0.10374041891997465, + "grad_norm": 0.59375, + "learning_rate": 0.0019416363330681387, + "loss": 0.1455, + "step": 11951 + }, + { + "epoch": 0.10374909940017882, + "grad_norm": 0.32421875, + "learning_rate": 0.0019416257893145794, + "loss": 0.1523, + "step": 11952 + }, + { + "epoch": 0.10375777988038298, + "grad_norm": 0.8671875, + "learning_rate": 0.001941615244640631, + "loss": 0.1221, + "step": 11953 + }, + { + "epoch": 0.10376646036058715, + "grad_norm": 0.421875, + "learning_rate": 0.0019416046990463048, + "loss": 0.1006, + "step": 11954 + }, + { + "epoch": 0.10377514084079131, + "grad_norm": 0.1982421875, + "learning_rate": 0.0019415941525316124, + "loss": 0.1182, + "step": 11955 + }, + { + "epoch": 0.10378382132099548, + "grad_norm": 1.171875, + "learning_rate": 0.0019415836050965653, + "loss": 0.1475, + "step": 11956 + }, + { + "epoch": 0.10379250180119964, + "grad_norm": 0.255859375, + "learning_rate": 0.0019415730567411745, + "loss": 0.1338, + "step": 11957 + }, + { + "epoch": 0.10380118228140381, + "grad_norm": 0.70703125, + "learning_rate": 0.0019415625074654527, + "loss": 0.1162, + "step": 11958 + }, + { + "epoch": 0.10380986276160797, + "grad_norm": 0.11962890625, + "learning_rate": 0.0019415519572694104, + "loss": 0.1152, + "step": 11959 + }, + { + "epoch": 0.10381854324181214, + "grad_norm": 0.52734375, + "learning_rate": 0.0019415414061530597, + "loss": 0.1406, + "step": 11960 + }, + { + "epoch": 0.1038272237220163, + "grad_norm": 0.275390625, + "learning_rate": 0.0019415308541164114, + "loss": 0.1299, + "step": 11961 + }, + { + "epoch": 0.10383590420222047, + "grad_norm": 0.396484375, + "learning_rate": 
0.0019415203011594784, + "loss": 0.1289, + "step": 11962 + }, + { + "epoch": 0.10384458468242463, + "grad_norm": 0.1708984375, + "learning_rate": 0.001941509747282271, + "loss": 0.1357, + "step": 11963 + }, + { + "epoch": 0.1038532651626288, + "grad_norm": 0.56640625, + "learning_rate": 0.0019414991924848013, + "loss": 0.126, + "step": 11964 + }, + { + "epoch": 0.10386194564283296, + "grad_norm": 0.1376953125, + "learning_rate": 0.0019414886367670808, + "loss": 0.1055, + "step": 11965 + }, + { + "epoch": 0.10387062612303713, + "grad_norm": 0.47265625, + "learning_rate": 0.001941478080129121, + "loss": 0.1504, + "step": 11966 + }, + { + "epoch": 0.1038793066032413, + "grad_norm": 0.107421875, + "learning_rate": 0.0019414675225709332, + "loss": 0.0977, + "step": 11967 + }, + { + "epoch": 0.10388798708344546, + "grad_norm": 0.0888671875, + "learning_rate": 0.0019414569640925292, + "loss": 0.0688, + "step": 11968 + }, + { + "epoch": 0.10389666756364963, + "grad_norm": 0.185546875, + "learning_rate": 0.0019414464046939206, + "loss": 0.1187, + "step": 11969 + }, + { + "epoch": 0.10390534804385379, + "grad_norm": 0.51171875, + "learning_rate": 0.001941435844375119, + "loss": 0.1357, + "step": 11970 + }, + { + "epoch": 0.10391402852405796, + "grad_norm": 0.3125, + "learning_rate": 0.0019414252831361357, + "loss": 0.126, + "step": 11971 + }, + { + "epoch": 0.10392270900426212, + "grad_norm": 0.482421875, + "learning_rate": 0.0019414147209769821, + "loss": 0.1465, + "step": 11972 + }, + { + "epoch": 0.10393138948446629, + "grad_norm": 0.1318359375, + "learning_rate": 0.0019414041578976705, + "loss": 0.1011, + "step": 11973 + }, + { + "epoch": 0.10394006996467045, + "grad_norm": 0.291015625, + "learning_rate": 0.0019413935938982117, + "loss": 0.1426, + "step": 11974 + }, + { + "epoch": 0.10394875044487462, + "grad_norm": 0.1328125, + "learning_rate": 0.0019413830289786171, + "loss": 0.1079, + "step": 11975 + }, + { + "epoch": 0.10395743092507878, + "grad_norm": 0.0986328125, + "learning_rate": 0.0019413724631388994, + "loss": 0.123, + "step": 11976 + }, + { + "epoch": 0.10396611140528295, + "grad_norm": 0.38671875, + "learning_rate": 0.0019413618963790688, + "loss": 0.1182, + "step": 11977 + }, + { + "epoch": 0.10397479188548711, + "grad_norm": 0.09228515625, + "learning_rate": 0.0019413513286991378, + "loss": 0.1001, + "step": 11978 + }, + { + "epoch": 0.10398347236569128, + "grad_norm": 0.369140625, + "learning_rate": 0.0019413407600991175, + "loss": 0.1387, + "step": 11979 + }, + { + "epoch": 0.10399215284589544, + "grad_norm": 0.1123046875, + "learning_rate": 0.0019413301905790196, + "loss": 0.1475, + "step": 11980 + }, + { + "epoch": 0.1040008333260996, + "grad_norm": 2.765625, + "learning_rate": 0.0019413196201388556, + "loss": 0.3945, + "step": 11981 + }, + { + "epoch": 0.10400951380630377, + "grad_norm": 0.14453125, + "learning_rate": 0.0019413090487786372, + "loss": 0.0908, + "step": 11982 + }, + { + "epoch": 0.10401819428650792, + "grad_norm": 0.64453125, + "learning_rate": 0.0019412984764983759, + "loss": 0.1406, + "step": 11983 + }, + { + "epoch": 0.10402687476671209, + "grad_norm": 0.166015625, + "learning_rate": 0.0019412879032980833, + "loss": 0.1055, + "step": 11984 + }, + { + "epoch": 0.10403555524691625, + "grad_norm": 0.09765625, + "learning_rate": 0.0019412773291777705, + "loss": 0.1147, + "step": 11985 + }, + { + "epoch": 0.10404423572712042, + "grad_norm": 0.08349609375, + "learning_rate": 0.0019412667541374498, + "loss": 0.1099, + "step": 11986 + }, + { + "epoch": 
0.10405291620732458, + "grad_norm": 0.10693359375, + "learning_rate": 0.0019412561781771323, + "loss": 0.1416, + "step": 11987 + }, + { + "epoch": 0.10406159668752875, + "grad_norm": 0.06884765625, + "learning_rate": 0.0019412456012968301, + "loss": 0.0879, + "step": 11988 + }, + { + "epoch": 0.10407027716773291, + "grad_norm": 0.2734375, + "learning_rate": 0.0019412350234965536, + "loss": 0.168, + "step": 11989 + }, + { + "epoch": 0.10407895764793708, + "grad_norm": 0.21875, + "learning_rate": 0.0019412244447763157, + "loss": 0.0942, + "step": 11990 + }, + { + "epoch": 0.10408763812814124, + "grad_norm": 0.498046875, + "learning_rate": 0.0019412138651361268, + "loss": 0.1357, + "step": 11991 + }, + { + "epoch": 0.10409631860834541, + "grad_norm": 0.427734375, + "learning_rate": 0.0019412032845759998, + "loss": 0.1406, + "step": 11992 + }, + { + "epoch": 0.10410499908854957, + "grad_norm": 0.5234375, + "learning_rate": 0.0019411927030959452, + "loss": 0.1211, + "step": 11993 + }, + { + "epoch": 0.10411367956875374, + "grad_norm": 0.1591796875, + "learning_rate": 0.0019411821206959745, + "loss": 0.1533, + "step": 11994 + }, + { + "epoch": 0.1041223600489579, + "grad_norm": 0.265625, + "learning_rate": 0.0019411715373761, + "loss": 0.1133, + "step": 11995 + }, + { + "epoch": 0.10413104052916207, + "grad_norm": 0.173828125, + "learning_rate": 0.001941160953136333, + "loss": 0.0928, + "step": 11996 + }, + { + "epoch": 0.10413972100936623, + "grad_norm": 0.10205078125, + "learning_rate": 0.0019411503679766851, + "loss": 0.125, + "step": 11997 + }, + { + "epoch": 0.1041484014895704, + "grad_norm": 0.068359375, + "learning_rate": 0.0019411397818971676, + "loss": 0.0796, + "step": 11998 + }, + { + "epoch": 0.10415708196977456, + "grad_norm": 0.466796875, + "learning_rate": 0.0019411291948977923, + "loss": 0.1143, + "step": 11999 + }, + { + "epoch": 0.10416576244997873, + "grad_norm": 0.08056640625, + "learning_rate": 0.0019411186069785708, + "loss": 0.1123, + "step": 12000 + }, + { + "epoch": 0.1041744429301829, + "grad_norm": 0.337890625, + "learning_rate": 0.0019411080181395148, + "loss": 0.1006, + "step": 12001 + }, + { + "epoch": 0.10418312341038706, + "grad_norm": 0.419921875, + "learning_rate": 0.0019410974283806356, + "loss": 0.1108, + "step": 12002 + }, + { + "epoch": 0.10419180389059123, + "grad_norm": 1.0859375, + "learning_rate": 0.0019410868377019448, + "loss": 0.1875, + "step": 12003 + }, + { + "epoch": 0.10420048437079539, + "grad_norm": 0.1923828125, + "learning_rate": 0.0019410762461034542, + "loss": 0.1396, + "step": 12004 + }, + { + "epoch": 0.10420916485099956, + "grad_norm": 1.1171875, + "learning_rate": 0.001941065653585175, + "loss": 0.1113, + "step": 12005 + }, + { + "epoch": 0.10421784533120372, + "grad_norm": 0.76953125, + "learning_rate": 0.0019410550601471194, + "loss": 0.1426, + "step": 12006 + }, + { + "epoch": 0.10422652581140789, + "grad_norm": 0.2890625, + "learning_rate": 0.0019410444657892983, + "loss": 0.1289, + "step": 12007 + }, + { + "epoch": 0.10423520629161205, + "grad_norm": 0.53125, + "learning_rate": 0.0019410338705117237, + "loss": 0.1191, + "step": 12008 + }, + { + "epoch": 0.10424388677181622, + "grad_norm": 0.32421875, + "learning_rate": 0.0019410232743144072, + "loss": 0.1309, + "step": 12009 + }, + { + "epoch": 0.10425256725202038, + "grad_norm": 0.4375, + "learning_rate": 0.0019410126771973602, + "loss": 0.1641, + "step": 12010 + }, + { + "epoch": 0.10426124773222455, + "grad_norm": 0.31640625, + "learning_rate": 0.0019410020791605941, + "loss": 
0.1104, + "step": 12011 + }, + { + "epoch": 0.10426992821242871, + "grad_norm": 0.10791015625, + "learning_rate": 0.0019409914802041212, + "loss": 0.1699, + "step": 12012 + }, + { + "epoch": 0.10427860869263288, + "grad_norm": 0.30078125, + "learning_rate": 0.0019409808803279526, + "loss": 0.1016, + "step": 12013 + }, + { + "epoch": 0.10428728917283704, + "grad_norm": 0.2138671875, + "learning_rate": 0.0019409702795320995, + "loss": 0.125, + "step": 12014 + }, + { + "epoch": 0.1042959696530412, + "grad_norm": 0.162109375, + "learning_rate": 0.0019409596778165745, + "loss": 0.125, + "step": 12015 + }, + { + "epoch": 0.10430465013324537, + "grad_norm": 0.51953125, + "learning_rate": 0.0019409490751813882, + "loss": 0.1455, + "step": 12016 + }, + { + "epoch": 0.10431333061344954, + "grad_norm": 0.2119140625, + "learning_rate": 0.001940938471626553, + "loss": 0.0991, + "step": 12017 + }, + { + "epoch": 0.1043220110936537, + "grad_norm": 0.310546875, + "learning_rate": 0.00194092786715208, + "loss": 0.1299, + "step": 12018 + }, + { + "epoch": 0.10433069157385787, + "grad_norm": 0.48828125, + "learning_rate": 0.0019409172617579809, + "loss": 0.1904, + "step": 12019 + }, + { + "epoch": 0.10433937205406203, + "grad_norm": 0.63671875, + "learning_rate": 0.0019409066554442673, + "loss": 0.1221, + "step": 12020 + }, + { + "epoch": 0.1043480525342662, + "grad_norm": 0.0830078125, + "learning_rate": 0.001940896048210951, + "loss": 0.1045, + "step": 12021 + }, + { + "epoch": 0.10435673301447036, + "grad_norm": 0.73828125, + "learning_rate": 0.0019408854400580434, + "loss": 0.1201, + "step": 12022 + }, + { + "epoch": 0.10436541349467453, + "grad_norm": 0.57421875, + "learning_rate": 0.0019408748309855555, + "loss": 0.1445, + "step": 12023 + }, + { + "epoch": 0.10437409397487869, + "grad_norm": 0.0859375, + "learning_rate": 0.0019408642209935002, + "loss": 0.1416, + "step": 12024 + }, + { + "epoch": 0.10438277445508286, + "grad_norm": 0.14453125, + "learning_rate": 0.0019408536100818882, + "loss": 0.1445, + "step": 12025 + }, + { + "epoch": 0.10439145493528702, + "grad_norm": 0.0947265625, + "learning_rate": 0.0019408429982507311, + "loss": 0.127, + "step": 12026 + }, + { + "epoch": 0.10440013541549119, + "grad_norm": 0.0810546875, + "learning_rate": 0.0019408323855000412, + "loss": 0.126, + "step": 12027 + }, + { + "epoch": 0.10440881589569535, + "grad_norm": 0.2236328125, + "learning_rate": 0.0019408217718298293, + "loss": 0.0967, + "step": 12028 + }, + { + "epoch": 0.10441749637589952, + "grad_norm": 0.69140625, + "learning_rate": 0.0019408111572401072, + "loss": 0.1113, + "step": 12029 + }, + { + "epoch": 0.10442617685610368, + "grad_norm": 0.19921875, + "learning_rate": 0.0019408005417308868, + "loss": 0.1094, + "step": 12030 + }, + { + "epoch": 0.10443485733630785, + "grad_norm": 0.72265625, + "learning_rate": 0.00194078992530218, + "loss": 0.1709, + "step": 12031 + }, + { + "epoch": 0.10444353781651201, + "grad_norm": 0.458984375, + "learning_rate": 0.0019407793079539975, + "loss": 0.1182, + "step": 12032 + }, + { + "epoch": 0.10445221829671618, + "grad_norm": 0.3515625, + "learning_rate": 0.0019407686896863515, + "loss": 0.1494, + "step": 12033 + }, + { + "epoch": 0.10446089877692034, + "grad_norm": 0.408203125, + "learning_rate": 0.0019407580704992532, + "loss": 0.1035, + "step": 12034 + }, + { + "epoch": 0.10446957925712451, + "grad_norm": 0.423828125, + "learning_rate": 0.0019407474503927151, + "loss": 0.1318, + "step": 12035 + }, + { + "epoch": 0.10447825973732867, + "grad_norm": 0.5234375, + 
"learning_rate": 0.001940736829366748, + "loss": 0.1191, + "step": 12036 + }, + { + "epoch": 0.10448694021753284, + "grad_norm": 0.134765625, + "learning_rate": 0.0019407262074213633, + "loss": 0.1396, + "step": 12037 + }, + { + "epoch": 0.104495620697737, + "grad_norm": 0.14453125, + "learning_rate": 0.0019407155845565737, + "loss": 0.126, + "step": 12038 + }, + { + "epoch": 0.10450430117794117, + "grad_norm": 0.291015625, + "learning_rate": 0.0019407049607723897, + "loss": 0.1582, + "step": 12039 + }, + { + "epoch": 0.10451298165814533, + "grad_norm": 0.2421875, + "learning_rate": 0.0019406943360688236, + "loss": 0.127, + "step": 12040 + }, + { + "epoch": 0.1045216621383495, + "grad_norm": 0.44921875, + "learning_rate": 0.0019406837104458867, + "loss": 0.1152, + "step": 12041 + }, + { + "epoch": 0.10453034261855366, + "grad_norm": 0.5546875, + "learning_rate": 0.0019406730839035906, + "loss": 0.1436, + "step": 12042 + }, + { + "epoch": 0.10453902309875783, + "grad_norm": 0.1533203125, + "learning_rate": 0.0019406624564419475, + "loss": 0.1035, + "step": 12043 + }, + { + "epoch": 0.10454770357896198, + "grad_norm": 0.609375, + "learning_rate": 0.0019406518280609684, + "loss": 0.1138, + "step": 12044 + }, + { + "epoch": 0.10455638405916615, + "grad_norm": 0.0595703125, + "learning_rate": 0.0019406411987606647, + "loss": 0.106, + "step": 12045 + }, + { + "epoch": 0.10456506453937031, + "grad_norm": 0.2431640625, + "learning_rate": 0.0019406305685410487, + "loss": 0.0996, + "step": 12046 + }, + { + "epoch": 0.10457374501957448, + "grad_norm": 0.384765625, + "learning_rate": 0.0019406199374021318, + "loss": 0.1299, + "step": 12047 + }, + { + "epoch": 0.10458242549977864, + "grad_norm": 0.12158203125, + "learning_rate": 0.0019406093053439255, + "loss": 0.106, + "step": 12048 + }, + { + "epoch": 0.1045911059799828, + "grad_norm": 0.34765625, + "learning_rate": 0.0019405986723664416, + "loss": 0.1338, + "step": 12049 + }, + { + "epoch": 0.10459978646018697, + "grad_norm": 0.50390625, + "learning_rate": 0.0019405880384696914, + "loss": 0.0942, + "step": 12050 + }, + { + "epoch": 0.10460846694039114, + "grad_norm": 0.416015625, + "learning_rate": 0.001940577403653687, + "loss": 0.1318, + "step": 12051 + }, + { + "epoch": 0.1046171474205953, + "grad_norm": 0.6328125, + "learning_rate": 0.0019405667679184395, + "loss": 0.1045, + "step": 12052 + }, + { + "epoch": 0.10462582790079947, + "grad_norm": 0.14453125, + "learning_rate": 0.0019405561312639613, + "loss": 0.0728, + "step": 12053 + }, + { + "epoch": 0.10463450838100363, + "grad_norm": 0.5234375, + "learning_rate": 0.0019405454936902628, + "loss": 0.1162, + "step": 12054 + }, + { + "epoch": 0.1046431888612078, + "grad_norm": 0.1279296875, + "learning_rate": 0.0019405348551973572, + "loss": 0.1084, + "step": 12055 + }, + { + "epoch": 0.10465186934141196, + "grad_norm": 0.96875, + "learning_rate": 0.0019405242157852548, + "loss": 0.1152, + "step": 12056 + }, + { + "epoch": 0.10466054982161613, + "grad_norm": 1.0, + "learning_rate": 0.0019405135754539677, + "loss": 0.1143, + "step": 12057 + }, + { + "epoch": 0.10466923030182029, + "grad_norm": 0.1494140625, + "learning_rate": 0.0019405029342035077, + "loss": 0.0815, + "step": 12058 + }, + { + "epoch": 0.10467791078202446, + "grad_norm": 0.2421875, + "learning_rate": 0.0019404922920338865, + "loss": 0.125, + "step": 12059 + }, + { + "epoch": 0.10468659126222862, + "grad_norm": 0.462890625, + "learning_rate": 0.0019404816489451153, + "loss": 0.1104, + "step": 12060 + }, + { + "epoch": 
0.10469527174243279, + "grad_norm": 0.302734375, + "learning_rate": 0.001940471004937206, + "loss": 0.1592, + "step": 12061 + }, + { + "epoch": 0.10470395222263695, + "grad_norm": 0.2138671875, + "learning_rate": 0.0019404603600101705, + "loss": 0.1504, + "step": 12062 + }, + { + "epoch": 0.10471263270284112, + "grad_norm": 0.1669921875, + "learning_rate": 0.00194044971416402, + "loss": 0.166, + "step": 12063 + }, + { + "epoch": 0.10472131318304528, + "grad_norm": 1.2109375, + "learning_rate": 0.0019404390673987664, + "loss": 0.1357, + "step": 12064 + }, + { + "epoch": 0.10472999366324945, + "grad_norm": 0.24609375, + "learning_rate": 0.001940428419714421, + "loss": 0.1143, + "step": 12065 + }, + { + "epoch": 0.10473867414345361, + "grad_norm": 4.3125, + "learning_rate": 0.001940417771110996, + "loss": 0.4766, + "step": 12066 + }, + { + "epoch": 0.10474735462365778, + "grad_norm": 0.3515625, + "learning_rate": 0.0019404071215885028, + "loss": 0.1245, + "step": 12067 + }, + { + "epoch": 0.10475603510386194, + "grad_norm": 0.36328125, + "learning_rate": 0.0019403964711469527, + "loss": 0.125, + "step": 12068 + }, + { + "epoch": 0.10476471558406611, + "grad_norm": 0.265625, + "learning_rate": 0.001940385819786358, + "loss": 0.1016, + "step": 12069 + }, + { + "epoch": 0.10477339606427027, + "grad_norm": 0.2001953125, + "learning_rate": 0.0019403751675067297, + "loss": 0.0981, + "step": 12070 + }, + { + "epoch": 0.10478207654447444, + "grad_norm": 0.171875, + "learning_rate": 0.0019403645143080798, + "loss": 0.2598, + "step": 12071 + }, + { + "epoch": 0.1047907570246786, + "grad_norm": 0.0966796875, + "learning_rate": 0.00194035386019042, + "loss": 0.1318, + "step": 12072 + }, + { + "epoch": 0.10479943750488277, + "grad_norm": 0.0927734375, + "learning_rate": 0.0019403432051537618, + "loss": 0.0923, + "step": 12073 + }, + { + "epoch": 0.10480811798508693, + "grad_norm": 0.34765625, + "learning_rate": 0.001940332549198117, + "loss": 0.1035, + "step": 12074 + }, + { + "epoch": 0.1048167984652911, + "grad_norm": 0.7578125, + "learning_rate": 0.001940321892323497, + "loss": 0.126, + "step": 12075 + }, + { + "epoch": 0.10482547894549527, + "grad_norm": 0.59375, + "learning_rate": 0.0019403112345299136, + "loss": 0.1177, + "step": 12076 + }, + { + "epoch": 0.10483415942569943, + "grad_norm": 0.10791015625, + "learning_rate": 0.0019403005758173784, + "loss": 0.1074, + "step": 12077 + }, + { + "epoch": 0.1048428399059036, + "grad_norm": 0.8203125, + "learning_rate": 0.0019402899161859034, + "loss": 0.124, + "step": 12078 + }, + { + "epoch": 0.10485152038610776, + "grad_norm": 0.216796875, + "learning_rate": 0.0019402792556354996, + "loss": 0.1094, + "step": 12079 + }, + { + "epoch": 0.10486020086631193, + "grad_norm": 0.23828125, + "learning_rate": 0.0019402685941661793, + "loss": 0.1172, + "step": 12080 + }, + { + "epoch": 0.10486888134651609, + "grad_norm": 0.482421875, + "learning_rate": 0.0019402579317779535, + "loss": 0.0986, + "step": 12081 + }, + { + "epoch": 0.10487756182672026, + "grad_norm": 0.2353515625, + "learning_rate": 0.0019402472684708347, + "loss": 0.1553, + "step": 12082 + }, + { + "epoch": 0.10488624230692442, + "grad_norm": 0.103515625, + "learning_rate": 0.001940236604244834, + "loss": 0.0986, + "step": 12083 + }, + { + "epoch": 0.10489492278712859, + "grad_norm": 0.2158203125, + "learning_rate": 0.001940225939099963, + "loss": 0.1621, + "step": 12084 + }, + { + "epoch": 0.10490360326733275, + "grad_norm": 0.08154296875, + "learning_rate": 0.001940215273036234, + "loss": 0.1118, + 
"step": 12085 + }, + { + "epoch": 0.10491228374753692, + "grad_norm": 0.494140625, + "learning_rate": 0.0019402046060536578, + "loss": 0.1025, + "step": 12086 + }, + { + "epoch": 0.10492096422774108, + "grad_norm": 0.10595703125, + "learning_rate": 0.0019401939381522466, + "loss": 0.1045, + "step": 12087 + }, + { + "epoch": 0.10492964470794525, + "grad_norm": 0.2890625, + "learning_rate": 0.0019401832693320118, + "loss": 0.1494, + "step": 12088 + }, + { + "epoch": 0.10493832518814941, + "grad_norm": 0.423828125, + "learning_rate": 0.0019401725995929655, + "loss": 0.1138, + "step": 12089 + }, + { + "epoch": 0.10494700566835358, + "grad_norm": 0.1181640625, + "learning_rate": 0.0019401619289351184, + "loss": 0.1133, + "step": 12090 + }, + { + "epoch": 0.10495568614855774, + "grad_norm": 0.45703125, + "learning_rate": 0.0019401512573584837, + "loss": 0.166, + "step": 12091 + }, + { + "epoch": 0.10496436662876191, + "grad_norm": 0.07958984375, + "learning_rate": 0.0019401405848630715, + "loss": 0.1104, + "step": 12092 + }, + { + "epoch": 0.10497304710896607, + "grad_norm": 0.359375, + "learning_rate": 0.0019401299114488945, + "loss": 0.1484, + "step": 12093 + }, + { + "epoch": 0.10498172758917024, + "grad_norm": 0.470703125, + "learning_rate": 0.0019401192371159644, + "loss": 0.1113, + "step": 12094 + }, + { + "epoch": 0.1049904080693744, + "grad_norm": 0.09814453125, + "learning_rate": 0.001940108561864292, + "loss": 0.1699, + "step": 12095 + }, + { + "epoch": 0.10499908854957857, + "grad_norm": 0.34765625, + "learning_rate": 0.0019400978856938895, + "loss": 0.1396, + "step": 12096 + }, + { + "epoch": 0.10500776902978273, + "grad_norm": 0.119140625, + "learning_rate": 0.0019400872086047687, + "loss": 0.1719, + "step": 12097 + }, + { + "epoch": 0.1050164495099869, + "grad_norm": 0.140625, + "learning_rate": 0.0019400765305969412, + "loss": 0.1162, + "step": 12098 + }, + { + "epoch": 0.10502512999019106, + "grad_norm": 0.10693359375, + "learning_rate": 0.0019400658516704186, + "loss": 0.1445, + "step": 12099 + }, + { + "epoch": 0.10503381047039523, + "grad_norm": 0.26953125, + "learning_rate": 0.0019400551718252127, + "loss": 0.1445, + "step": 12100 + }, + { + "epoch": 0.1050424909505994, + "grad_norm": 0.140625, + "learning_rate": 0.0019400444910613348, + "loss": 0.1113, + "step": 12101 + }, + { + "epoch": 0.10505117143080356, + "grad_norm": 0.365234375, + "learning_rate": 0.0019400338093787974, + "loss": 0.1826, + "step": 12102 + }, + { + "epoch": 0.10505985191100772, + "grad_norm": 0.7109375, + "learning_rate": 0.0019400231267776111, + "loss": 0.1279, + "step": 12103 + }, + { + "epoch": 0.10506853239121189, + "grad_norm": 0.12255859375, + "learning_rate": 0.0019400124432577884, + "loss": 0.1182, + "step": 12104 + }, + { + "epoch": 0.10507721287141605, + "grad_norm": 0.25, + "learning_rate": 0.0019400017588193407, + "loss": 0.1387, + "step": 12105 + }, + { + "epoch": 0.1050858933516202, + "grad_norm": 0.251953125, + "learning_rate": 0.0019399910734622797, + "loss": 0.1143, + "step": 12106 + }, + { + "epoch": 0.10509457383182437, + "grad_norm": 0.55078125, + "learning_rate": 0.0019399803871866168, + "loss": 0.1553, + "step": 12107 + }, + { + "epoch": 0.10510325431202854, + "grad_norm": 0.2373046875, + "learning_rate": 0.0019399696999923644, + "loss": 0.1025, + "step": 12108 + }, + { + "epoch": 0.1051119347922327, + "grad_norm": 0.298828125, + "learning_rate": 0.0019399590118795336, + "loss": 0.1089, + "step": 12109 + }, + { + "epoch": 0.10512061527243687, + "grad_norm": 0.4609375, + 
"learning_rate": 0.001939948322848136, + "loss": 0.1289, + "step": 12110 + }, + { + "epoch": 0.10512929575264103, + "grad_norm": 1.0078125, + "learning_rate": 0.0019399376328981838, + "loss": 0.1787, + "step": 12111 + }, + { + "epoch": 0.1051379762328452, + "grad_norm": 0.36328125, + "learning_rate": 0.0019399269420296884, + "loss": 0.1074, + "step": 12112 + }, + { + "epoch": 0.10514665671304936, + "grad_norm": 0.26953125, + "learning_rate": 0.0019399162502426615, + "loss": 0.1172, + "step": 12113 + }, + { + "epoch": 0.10515533719325353, + "grad_norm": 0.13671875, + "learning_rate": 0.001939905557537115, + "loss": 0.1211, + "step": 12114 + }, + { + "epoch": 0.10516401767345769, + "grad_norm": 0.63671875, + "learning_rate": 0.0019398948639130597, + "loss": 0.125, + "step": 12115 + }, + { + "epoch": 0.10517269815366186, + "grad_norm": 0.322265625, + "learning_rate": 0.0019398841693705087, + "loss": 0.1279, + "step": 12116 + }, + { + "epoch": 0.10518137863386602, + "grad_norm": 0.3203125, + "learning_rate": 0.0019398734739094728, + "loss": 0.1289, + "step": 12117 + }, + { + "epoch": 0.10519005911407019, + "grad_norm": 0.2080078125, + "learning_rate": 0.001939862777529964, + "loss": 0.127, + "step": 12118 + }, + { + "epoch": 0.10519873959427435, + "grad_norm": 0.23828125, + "learning_rate": 0.0019398520802319935, + "loss": 0.1777, + "step": 12119 + }, + { + "epoch": 0.10520742007447852, + "grad_norm": 0.10693359375, + "learning_rate": 0.0019398413820155738, + "loss": 0.1484, + "step": 12120 + }, + { + "epoch": 0.10521610055468268, + "grad_norm": 0.11083984375, + "learning_rate": 0.001939830682880716, + "loss": 0.1318, + "step": 12121 + }, + { + "epoch": 0.10522478103488685, + "grad_norm": 0.11865234375, + "learning_rate": 0.001939819982827432, + "loss": 0.1289, + "step": 12122 + }, + { + "epoch": 0.10523346151509101, + "grad_norm": 0.177734375, + "learning_rate": 0.0019398092818557337, + "loss": 0.1182, + "step": 12123 + }, + { + "epoch": 0.10524214199529518, + "grad_norm": 0.396484375, + "learning_rate": 0.0019397985799656321, + "loss": 0.1445, + "step": 12124 + }, + { + "epoch": 0.10525082247549934, + "grad_norm": 0.46484375, + "learning_rate": 0.0019397878771571398, + "loss": 0.1406, + "step": 12125 + }, + { + "epoch": 0.10525950295570351, + "grad_norm": 0.498046875, + "learning_rate": 0.001939777173430268, + "loss": 0.1611, + "step": 12126 + }, + { + "epoch": 0.10526818343590767, + "grad_norm": 0.427734375, + "learning_rate": 0.0019397664687850283, + "loss": 0.1118, + "step": 12127 + }, + { + "epoch": 0.10527686391611184, + "grad_norm": 0.26171875, + "learning_rate": 0.0019397557632214327, + "loss": 0.0889, + "step": 12128 + }, + { + "epoch": 0.105285544396316, + "grad_norm": 0.1025390625, + "learning_rate": 0.001939745056739493, + "loss": 0.1582, + "step": 12129 + }, + { + "epoch": 0.10529422487652017, + "grad_norm": 0.5703125, + "learning_rate": 0.0019397343493392206, + "loss": 0.125, + "step": 12130 + }, + { + "epoch": 0.10530290535672433, + "grad_norm": 0.64453125, + "learning_rate": 0.0019397236410206274, + "loss": 0.1406, + "step": 12131 + }, + { + "epoch": 0.1053115858369285, + "grad_norm": 0.30859375, + "learning_rate": 0.001939712931783725, + "loss": 0.1553, + "step": 12132 + }, + { + "epoch": 0.10532026631713266, + "grad_norm": 0.095703125, + "learning_rate": 0.001939702221628525, + "loss": 0.0762, + "step": 12133 + }, + { + "epoch": 0.10532894679733683, + "grad_norm": 0.2431640625, + "learning_rate": 0.0019396915105550395, + "loss": 0.1309, + "step": 12134 + }, + { + "epoch": 
0.105337627277541, + "grad_norm": 0.26953125, + "learning_rate": 0.0019396807985632798, + "loss": 0.1128, + "step": 12135 + }, + { + "epoch": 0.10534630775774516, + "grad_norm": 0.2470703125, + "learning_rate": 0.0019396700856532578, + "loss": 0.123, + "step": 12136 + }, + { + "epoch": 0.10535498823794932, + "grad_norm": 0.70703125, + "learning_rate": 0.0019396593718249853, + "loss": 0.0977, + "step": 12137 + }, + { + "epoch": 0.10536366871815349, + "grad_norm": 0.201171875, + "learning_rate": 0.0019396486570784739, + "loss": 0.0674, + "step": 12138 + }, + { + "epoch": 0.10537234919835765, + "grad_norm": 0.11474609375, + "learning_rate": 0.0019396379414137357, + "loss": 0.0991, + "step": 12139 + }, + { + "epoch": 0.10538102967856182, + "grad_norm": 0.107421875, + "learning_rate": 0.0019396272248307814, + "loss": 0.1084, + "step": 12140 + }, + { + "epoch": 0.10538971015876598, + "grad_norm": 0.201171875, + "learning_rate": 0.001939616507329624, + "loss": 0.1348, + "step": 12141 + }, + { + "epoch": 0.10539839063897015, + "grad_norm": 0.345703125, + "learning_rate": 0.001939605788910274, + "loss": 0.1221, + "step": 12142 + }, + { + "epoch": 0.10540707111917431, + "grad_norm": 0.384765625, + "learning_rate": 0.0019395950695727442, + "loss": 0.1104, + "step": 12143 + }, + { + "epoch": 0.10541575159937848, + "grad_norm": 0.1533203125, + "learning_rate": 0.0019395843493170456, + "loss": 0.207, + "step": 12144 + }, + { + "epoch": 0.10542443207958264, + "grad_norm": 0.5625, + "learning_rate": 0.0019395736281431904, + "loss": 0.1201, + "step": 12145 + }, + { + "epoch": 0.10543311255978681, + "grad_norm": 0.21875, + "learning_rate": 0.00193956290605119, + "loss": 0.1777, + "step": 12146 + }, + { + "epoch": 0.10544179303999097, + "grad_norm": 0.92578125, + "learning_rate": 0.001939552183041056, + "loss": 0.1016, + "step": 12147 + }, + { + "epoch": 0.10545047352019514, + "grad_norm": 0.15234375, + "learning_rate": 0.0019395414591128006, + "loss": 0.1123, + "step": 12148 + }, + { + "epoch": 0.1054591540003993, + "grad_norm": 0.2138671875, + "learning_rate": 0.0019395307342664353, + "loss": 0.1143, + "step": 12149 + }, + { + "epoch": 0.10546783448060347, + "grad_norm": 0.5546875, + "learning_rate": 0.0019395200085019716, + "loss": 0.0986, + "step": 12150 + }, + { + "epoch": 0.10547651496080764, + "grad_norm": 0.412109375, + "learning_rate": 0.0019395092818194214, + "loss": 0.1089, + "step": 12151 + }, + { + "epoch": 0.1054851954410118, + "grad_norm": 0.8828125, + "learning_rate": 0.0019394985542187965, + "loss": 0.1157, + "step": 12152 + }, + { + "epoch": 0.10549387592121597, + "grad_norm": 0.158203125, + "learning_rate": 0.0019394878257001087, + "loss": 0.0977, + "step": 12153 + }, + { + "epoch": 0.10550255640142013, + "grad_norm": 0.326171875, + "learning_rate": 0.0019394770962633697, + "loss": 0.084, + "step": 12154 + }, + { + "epoch": 0.1055112368816243, + "grad_norm": 0.52734375, + "learning_rate": 0.001939466365908591, + "loss": 0.0884, + "step": 12155 + }, + { + "epoch": 0.10551991736182846, + "grad_norm": 0.171875, + "learning_rate": 0.0019394556346357845, + "loss": 0.1514, + "step": 12156 + }, + { + "epoch": 0.10552859784203263, + "grad_norm": 0.404296875, + "learning_rate": 0.001939444902444962, + "loss": 0.127, + "step": 12157 + }, + { + "epoch": 0.10553727832223679, + "grad_norm": 0.302734375, + "learning_rate": 0.0019394341693361353, + "loss": 0.0962, + "step": 12158 + }, + { + "epoch": 0.10554595880244096, + "grad_norm": 0.6953125, + "learning_rate": 0.001939423435309316, + "loss": 0.1465, 
+ "step": 12159 + }, + { + "epoch": 0.10555463928264512, + "grad_norm": 0.384765625, + "learning_rate": 0.0019394127003645157, + "loss": 0.1172, + "step": 12160 + }, + { + "epoch": 0.10556331976284929, + "grad_norm": 0.07958984375, + "learning_rate": 0.001939401964501746, + "loss": 0.0991, + "step": 12161 + }, + { + "epoch": 0.10557200024305345, + "grad_norm": 0.1796875, + "learning_rate": 0.0019393912277210193, + "loss": 0.1133, + "step": 12162 + }, + { + "epoch": 0.10558068072325762, + "grad_norm": 0.333984375, + "learning_rate": 0.0019393804900223468, + "loss": 0.1797, + "step": 12163 + }, + { + "epoch": 0.10558936120346178, + "grad_norm": 0.48828125, + "learning_rate": 0.0019393697514057404, + "loss": 0.1201, + "step": 12164 + }, + { + "epoch": 0.10559804168366595, + "grad_norm": 0.064453125, + "learning_rate": 0.001939359011871212, + "loss": 0.0859, + "step": 12165 + }, + { + "epoch": 0.10560672216387011, + "grad_norm": 0.150390625, + "learning_rate": 0.0019393482714187731, + "loss": 0.1318, + "step": 12166 + }, + { + "epoch": 0.10561540264407426, + "grad_norm": 0.10986328125, + "learning_rate": 0.0019393375300484359, + "loss": 0.0996, + "step": 12167 + }, + { + "epoch": 0.10562408312427843, + "grad_norm": 0.15625, + "learning_rate": 0.0019393267877602111, + "loss": 0.1191, + "step": 12168 + }, + { + "epoch": 0.1056327636044826, + "grad_norm": 0.345703125, + "learning_rate": 0.0019393160445541119, + "loss": 0.1426, + "step": 12169 + }, + { + "epoch": 0.10564144408468676, + "grad_norm": 0.103515625, + "learning_rate": 0.0019393053004301485, + "loss": 0.1328, + "step": 12170 + }, + { + "epoch": 0.10565012456489092, + "grad_norm": 0.134765625, + "learning_rate": 0.001939294555388334, + "loss": 0.1045, + "step": 12171 + }, + { + "epoch": 0.10565880504509509, + "grad_norm": 0.240234375, + "learning_rate": 0.0019392838094286792, + "loss": 0.1064, + "step": 12172 + }, + { + "epoch": 0.10566748552529925, + "grad_norm": 0.09814453125, + "learning_rate": 0.0019392730625511965, + "loss": 0.1123, + "step": 12173 + }, + { + "epoch": 0.10567616600550342, + "grad_norm": 0.228515625, + "learning_rate": 0.0019392623147558978, + "loss": 0.083, + "step": 12174 + }, + { + "epoch": 0.10568484648570758, + "grad_norm": 0.15234375, + "learning_rate": 0.0019392515660427937, + "loss": 0.1377, + "step": 12175 + }, + { + "epoch": 0.10569352696591175, + "grad_norm": 0.1357421875, + "learning_rate": 0.001939240816411897, + "loss": 0.1006, + "step": 12176 + }, + { + "epoch": 0.10570220744611591, + "grad_norm": 0.1435546875, + "learning_rate": 0.0019392300658632189, + "loss": 0.1338, + "step": 12177 + }, + { + "epoch": 0.10571088792632008, + "grad_norm": 0.59375, + "learning_rate": 0.0019392193143967717, + "loss": 0.1016, + "step": 12178 + }, + { + "epoch": 0.10571956840652424, + "grad_norm": 0.263671875, + "learning_rate": 0.0019392085620125667, + "loss": 0.1396, + "step": 12179 + }, + { + "epoch": 0.10572824888672841, + "grad_norm": 0.412109375, + "learning_rate": 0.0019391978087106162, + "loss": 0.1318, + "step": 12180 + }, + { + "epoch": 0.10573692936693257, + "grad_norm": 0.57421875, + "learning_rate": 0.001939187054490931, + "loss": 0.1104, + "step": 12181 + }, + { + "epoch": 0.10574560984713674, + "grad_norm": 0.318359375, + "learning_rate": 0.001939176299353524, + "loss": 0.1699, + "step": 12182 + }, + { + "epoch": 0.1057542903273409, + "grad_norm": 0.2392578125, + "learning_rate": 0.0019391655432984061, + "loss": 0.1826, + "step": 12183 + }, + { + "epoch": 0.10576297080754507, + "grad_norm": 0.146484375, + 
"learning_rate": 0.0019391547863255893, + "loss": 0.1494, + "step": 12184 + }, + { + "epoch": 0.10577165128774924, + "grad_norm": 0.8046875, + "learning_rate": 0.0019391440284350857, + "loss": 0.1582, + "step": 12185 + }, + { + "epoch": 0.1057803317679534, + "grad_norm": 0.408203125, + "learning_rate": 0.0019391332696269065, + "loss": 0.1221, + "step": 12186 + }, + { + "epoch": 0.10578901224815757, + "grad_norm": 0.1962890625, + "learning_rate": 0.001939122509901064, + "loss": 0.1055, + "step": 12187 + }, + { + "epoch": 0.10579769272836173, + "grad_norm": 1.296875, + "learning_rate": 0.0019391117492575697, + "loss": 0.1699, + "step": 12188 + }, + { + "epoch": 0.1058063732085659, + "grad_norm": 0.59375, + "learning_rate": 0.0019391009876964353, + "loss": 0.1099, + "step": 12189 + }, + { + "epoch": 0.10581505368877006, + "grad_norm": 0.1787109375, + "learning_rate": 0.0019390902252176727, + "loss": 0.1084, + "step": 12190 + }, + { + "epoch": 0.10582373416897423, + "grad_norm": 0.177734375, + "learning_rate": 0.001939079461821294, + "loss": 0.123, + "step": 12191 + }, + { + "epoch": 0.10583241464917839, + "grad_norm": 0.2080078125, + "learning_rate": 0.00193906869750731, + "loss": 0.1377, + "step": 12192 + }, + { + "epoch": 0.10584109512938256, + "grad_norm": 0.515625, + "learning_rate": 0.0019390579322757335, + "loss": 0.1289, + "step": 12193 + }, + { + "epoch": 0.10584977560958672, + "grad_norm": 0.13671875, + "learning_rate": 0.0019390471661265756, + "loss": 0.125, + "step": 12194 + }, + { + "epoch": 0.10585845608979089, + "grad_norm": 0.302734375, + "learning_rate": 0.0019390363990598482, + "loss": 0.1602, + "step": 12195 + }, + { + "epoch": 0.10586713656999505, + "grad_norm": 1.046875, + "learning_rate": 0.0019390256310755636, + "loss": 0.1543, + "step": 12196 + }, + { + "epoch": 0.10587581705019922, + "grad_norm": 0.0830078125, + "learning_rate": 0.001939014862173733, + "loss": 0.1562, + "step": 12197 + }, + { + "epoch": 0.10588449753040338, + "grad_norm": 0.314453125, + "learning_rate": 0.0019390040923543683, + "loss": 0.1592, + "step": 12198 + }, + { + "epoch": 0.10589317801060755, + "grad_norm": 0.3203125, + "learning_rate": 0.0019389933216174815, + "loss": 0.127, + "step": 12199 + }, + { + "epoch": 0.10590185849081171, + "grad_norm": 1.7265625, + "learning_rate": 0.001938982549963084, + "loss": 0.1348, + "step": 12200 + }, + { + "epoch": 0.10591053897101588, + "grad_norm": 0.294921875, + "learning_rate": 0.001938971777391188, + "loss": 0.1357, + "step": 12201 + }, + { + "epoch": 0.10591921945122004, + "grad_norm": 0.5, + "learning_rate": 0.0019389610039018048, + "loss": 0.104, + "step": 12202 + }, + { + "epoch": 0.10592789993142421, + "grad_norm": 0.345703125, + "learning_rate": 0.0019389502294949467, + "loss": 0.1133, + "step": 12203 + }, + { + "epoch": 0.10593658041162837, + "grad_norm": 0.130859375, + "learning_rate": 0.0019389394541706251, + "loss": 0.0889, + "step": 12204 + }, + { + "epoch": 0.10594526089183254, + "grad_norm": 0.1875, + "learning_rate": 0.0019389286779288519, + "loss": 0.084, + "step": 12205 + }, + { + "epoch": 0.1059539413720367, + "grad_norm": 0.373046875, + "learning_rate": 0.0019389179007696389, + "loss": 0.123, + "step": 12206 + }, + { + "epoch": 0.10596262185224087, + "grad_norm": 0.486328125, + "learning_rate": 0.001938907122692998, + "loss": 0.123, + "step": 12207 + }, + { + "epoch": 0.10597130233244503, + "grad_norm": 0.1181640625, + "learning_rate": 0.001938896343698941, + "loss": 0.1055, + "step": 12208 + }, + { + "epoch": 0.1059799828126492, + 
"grad_norm": 0.5625, + "learning_rate": 0.0019388855637874792, + "loss": 0.0947, + "step": 12209 + }, + { + "epoch": 0.10598866329285336, + "grad_norm": 0.267578125, + "learning_rate": 0.0019388747829586252, + "loss": 0.1211, + "step": 12210 + }, + { + "epoch": 0.10599734377305753, + "grad_norm": 0.189453125, + "learning_rate": 0.0019388640012123898, + "loss": 0.0957, + "step": 12211 + }, + { + "epoch": 0.1060060242532617, + "grad_norm": 1.1328125, + "learning_rate": 0.0019388532185487858, + "loss": 0.1602, + "step": 12212 + }, + { + "epoch": 0.10601470473346586, + "grad_norm": 0.060302734375, + "learning_rate": 0.0019388424349678242, + "loss": 0.1074, + "step": 12213 + }, + { + "epoch": 0.10602338521367002, + "grad_norm": 0.765625, + "learning_rate": 0.0019388316504695172, + "loss": 0.1133, + "step": 12214 + }, + { + "epoch": 0.10603206569387419, + "grad_norm": 0.30078125, + "learning_rate": 0.0019388208650538766, + "loss": 0.1504, + "step": 12215 + }, + { + "epoch": 0.10604074617407835, + "grad_norm": 0.4375, + "learning_rate": 0.0019388100787209141, + "loss": 0.1602, + "step": 12216 + }, + { + "epoch": 0.10604942665428252, + "grad_norm": 0.25, + "learning_rate": 0.0019387992914706416, + "loss": 0.1167, + "step": 12217 + }, + { + "epoch": 0.10605810713448668, + "grad_norm": 0.3828125, + "learning_rate": 0.0019387885033030705, + "loss": 0.0991, + "step": 12218 + }, + { + "epoch": 0.10606678761469085, + "grad_norm": 0.5625, + "learning_rate": 0.0019387777142182133, + "loss": 0.1196, + "step": 12219 + }, + { + "epoch": 0.10607546809489501, + "grad_norm": 0.40625, + "learning_rate": 0.001938766924216081, + "loss": 0.1064, + "step": 12220 + }, + { + "epoch": 0.10608414857509918, + "grad_norm": 0.47265625, + "learning_rate": 0.0019387561332966858, + "loss": 0.1445, + "step": 12221 + }, + { + "epoch": 0.10609282905530334, + "grad_norm": 1.359375, + "learning_rate": 0.0019387453414600397, + "loss": 0.1348, + "step": 12222 + }, + { + "epoch": 0.10610150953550751, + "grad_norm": 0.21875, + "learning_rate": 0.0019387345487061542, + "loss": 0.1084, + "step": 12223 + }, + { + "epoch": 0.10611019001571168, + "grad_norm": 0.1982421875, + "learning_rate": 0.0019387237550350415, + "loss": 0.1035, + "step": 12224 + }, + { + "epoch": 0.10611887049591584, + "grad_norm": 0.17578125, + "learning_rate": 0.0019387129604467128, + "loss": 0.1094, + "step": 12225 + }, + { + "epoch": 0.10612755097612, + "grad_norm": 0.146484375, + "learning_rate": 0.0019387021649411801, + "loss": 0.1592, + "step": 12226 + }, + { + "epoch": 0.10613623145632417, + "grad_norm": 0.130859375, + "learning_rate": 0.0019386913685184555, + "loss": 0.1045, + "step": 12227 + }, + { + "epoch": 0.10614491193652834, + "grad_norm": 0.81640625, + "learning_rate": 0.0019386805711785506, + "loss": 0.127, + "step": 12228 + }, + { + "epoch": 0.10615359241673249, + "grad_norm": 0.2158203125, + "learning_rate": 0.001938669772921477, + "loss": 0.0952, + "step": 12229 + }, + { + "epoch": 0.10616227289693665, + "grad_norm": 0.12060546875, + "learning_rate": 0.0019386589737472471, + "loss": 0.1069, + "step": 12230 + }, + { + "epoch": 0.10617095337714082, + "grad_norm": 0.7421875, + "learning_rate": 0.0019386481736558723, + "loss": 0.1064, + "step": 12231 + }, + { + "epoch": 0.10617963385734498, + "grad_norm": 0.361328125, + "learning_rate": 0.0019386373726473644, + "loss": 0.1338, + "step": 12232 + }, + { + "epoch": 0.10618831433754915, + "grad_norm": 0.279296875, + "learning_rate": 0.0019386265707217352, + "loss": 0.1367, + "step": 12233 + }, + { + 
"epoch": 0.10619699481775331, + "grad_norm": 0.4296875, + "learning_rate": 0.0019386157678789966, + "loss": 0.1226, + "step": 12234 + }, + { + "epoch": 0.10620567529795748, + "grad_norm": 0.1904296875, + "learning_rate": 0.0019386049641191604, + "loss": 0.1465, + "step": 12235 + }, + { + "epoch": 0.10621435577816164, + "grad_norm": 0.39453125, + "learning_rate": 0.0019385941594422386, + "loss": 0.1504, + "step": 12236 + }, + { + "epoch": 0.10622303625836581, + "grad_norm": 0.48828125, + "learning_rate": 0.0019385833538482424, + "loss": 0.123, + "step": 12237 + }, + { + "epoch": 0.10623171673856997, + "grad_norm": 0.6484375, + "learning_rate": 0.0019385725473371844, + "loss": 0.083, + "step": 12238 + }, + { + "epoch": 0.10624039721877414, + "grad_norm": 0.66015625, + "learning_rate": 0.0019385617399090757, + "loss": 0.125, + "step": 12239 + }, + { + "epoch": 0.1062490776989783, + "grad_norm": 0.09814453125, + "learning_rate": 0.0019385509315639287, + "loss": 0.1006, + "step": 12240 + }, + { + "epoch": 0.10625775817918247, + "grad_norm": 0.86328125, + "learning_rate": 0.0019385401223017552, + "loss": 0.1235, + "step": 12241 + }, + { + "epoch": 0.10626643865938663, + "grad_norm": 0.279296875, + "learning_rate": 0.001938529312122567, + "loss": 0.1201, + "step": 12242 + }, + { + "epoch": 0.1062751191395908, + "grad_norm": 0.275390625, + "learning_rate": 0.0019385185010263749, + "loss": 0.1191, + "step": 12243 + }, + { + "epoch": 0.10628379961979496, + "grad_norm": 0.2353515625, + "learning_rate": 0.001938507689013192, + "loss": 0.1216, + "step": 12244 + }, + { + "epoch": 0.10629248009999913, + "grad_norm": 0.095703125, + "learning_rate": 0.0019384968760830302, + "loss": 0.1445, + "step": 12245 + }, + { + "epoch": 0.1063011605802033, + "grad_norm": 0.07763671875, + "learning_rate": 0.0019384860622359001, + "loss": 0.1807, + "step": 12246 + }, + { + "epoch": 0.10630984106040746, + "grad_norm": 0.8046875, + "learning_rate": 0.0019384752474718146, + "loss": 0.1562, + "step": 12247 + }, + { + "epoch": 0.10631852154061162, + "grad_norm": 0.53125, + "learning_rate": 0.001938464431790785, + "loss": 0.1289, + "step": 12248 + }, + { + "epoch": 0.10632720202081579, + "grad_norm": 0.37109375, + "learning_rate": 0.0019384536151928235, + "loss": 0.1328, + "step": 12249 + }, + { + "epoch": 0.10633588250101995, + "grad_norm": 0.337890625, + "learning_rate": 0.0019384427976779414, + "loss": 0.0864, + "step": 12250 + }, + { + "epoch": 0.10634456298122412, + "grad_norm": 0.11328125, + "learning_rate": 0.0019384319792461512, + "loss": 0.1084, + "step": 12251 + }, + { + "epoch": 0.10635324346142828, + "grad_norm": 0.384765625, + "learning_rate": 0.001938421159897464, + "loss": 0.1211, + "step": 12252 + }, + { + "epoch": 0.10636192394163245, + "grad_norm": 0.09912109375, + "learning_rate": 0.0019384103396318925, + "loss": 0.1113, + "step": 12253 + }, + { + "epoch": 0.10637060442183661, + "grad_norm": 0.255859375, + "learning_rate": 0.0019383995184494476, + "loss": 0.1187, + "step": 12254 + }, + { + "epoch": 0.10637928490204078, + "grad_norm": 0.1005859375, + "learning_rate": 0.0019383886963501421, + "loss": 0.0938, + "step": 12255 + }, + { + "epoch": 0.10638796538224495, + "grad_norm": 0.390625, + "learning_rate": 0.0019383778733339868, + "loss": 0.1279, + "step": 12256 + }, + { + "epoch": 0.10639664586244911, + "grad_norm": 0.103515625, + "learning_rate": 0.0019383670494009942, + "loss": 0.1299, + "step": 12257 + }, + { + "epoch": 0.10640532634265328, + "grad_norm": 0.1748046875, + "learning_rate": 
0.001938356224551176, + "loss": 0.1138, + "step": 12258 + }, + { + "epoch": 0.10641400682285744, + "grad_norm": 0.5625, + "learning_rate": 0.001938345398784544, + "loss": 0.1221, + "step": 12259 + }, + { + "epoch": 0.1064226873030616, + "grad_norm": 0.08203125, + "learning_rate": 0.0019383345721011103, + "loss": 0.1069, + "step": 12260 + }, + { + "epoch": 0.10643136778326577, + "grad_norm": 0.1591796875, + "learning_rate": 0.0019383237445008861, + "loss": 0.1113, + "step": 12261 + }, + { + "epoch": 0.10644004826346994, + "grad_norm": 0.1064453125, + "learning_rate": 0.0019383129159838842, + "loss": 0.1211, + "step": 12262 + }, + { + "epoch": 0.1064487287436741, + "grad_norm": 0.287109375, + "learning_rate": 0.0019383020865501152, + "loss": 0.103, + "step": 12263 + }, + { + "epoch": 0.10645740922387827, + "grad_norm": 0.08203125, + "learning_rate": 0.0019382912561995923, + "loss": 0.0894, + "step": 12264 + }, + { + "epoch": 0.10646608970408243, + "grad_norm": 0.59765625, + "learning_rate": 0.0019382804249323262, + "loss": 0.2109, + "step": 12265 + }, + { + "epoch": 0.1064747701842866, + "grad_norm": 0.1142578125, + "learning_rate": 0.0019382695927483299, + "loss": 0.0977, + "step": 12266 + }, + { + "epoch": 0.10648345066449076, + "grad_norm": 0.302734375, + "learning_rate": 0.0019382587596476137, + "loss": 0.1289, + "step": 12267 + }, + { + "epoch": 0.10649213114469493, + "grad_norm": 0.1103515625, + "learning_rate": 0.001938247925630191, + "loss": 0.1445, + "step": 12268 + }, + { + "epoch": 0.10650081162489909, + "grad_norm": 0.1640625, + "learning_rate": 0.0019382370906960724, + "loss": 0.1089, + "step": 12269 + }, + { + "epoch": 0.10650949210510326, + "grad_norm": 0.7421875, + "learning_rate": 0.0019382262548452707, + "loss": 0.1279, + "step": 12270 + }, + { + "epoch": 0.10651817258530742, + "grad_norm": 0.1455078125, + "learning_rate": 0.0019382154180777973, + "loss": 0.1309, + "step": 12271 + }, + { + "epoch": 0.10652685306551159, + "grad_norm": 0.09619140625, + "learning_rate": 0.0019382045803936641, + "loss": 0.0991, + "step": 12272 + }, + { + "epoch": 0.10653553354571575, + "grad_norm": 0.2294921875, + "learning_rate": 0.0019381937417928827, + "loss": 0.1465, + "step": 12273 + }, + { + "epoch": 0.10654421402591992, + "grad_norm": 0.376953125, + "learning_rate": 0.0019381829022754656, + "loss": 0.1172, + "step": 12274 + }, + { + "epoch": 0.10655289450612408, + "grad_norm": 0.75390625, + "learning_rate": 0.001938172061841424, + "loss": 0.1289, + "step": 12275 + }, + { + "epoch": 0.10656157498632825, + "grad_norm": 0.91015625, + "learning_rate": 0.00193816122049077, + "loss": 0.1152, + "step": 12276 + }, + { + "epoch": 0.10657025546653241, + "grad_norm": 0.2294921875, + "learning_rate": 0.001938150378223516, + "loss": 0.1309, + "step": 12277 + }, + { + "epoch": 0.10657893594673658, + "grad_norm": 2.09375, + "learning_rate": 0.001938139535039673, + "loss": 0.1836, + "step": 12278 + }, + { + "epoch": 0.10658761642694074, + "grad_norm": 0.1533203125, + "learning_rate": 0.001938128690939253, + "loss": 0.1611, + "step": 12279 + }, + { + "epoch": 0.10659629690714491, + "grad_norm": 0.14453125, + "learning_rate": 0.001938117845922268, + "loss": 0.1445, + "step": 12280 + }, + { + "epoch": 0.10660497738734907, + "grad_norm": 0.2236328125, + "learning_rate": 0.0019381069999887303, + "loss": 0.1328, + "step": 12281 + }, + { + "epoch": 0.10661365786755324, + "grad_norm": 0.328125, + "learning_rate": 0.0019380961531386511, + "loss": 0.1104, + "step": 12282 + }, + { + "epoch": 0.1066223383477574, + 
"grad_norm": 0.474609375, + "learning_rate": 0.0019380853053720427, + "loss": 0.1035, + "step": 12283 + }, + { + "epoch": 0.10663101882796157, + "grad_norm": 0.302734375, + "learning_rate": 0.0019380744566889169, + "loss": 0.105, + "step": 12284 + }, + { + "epoch": 0.10663969930816573, + "grad_norm": 0.388671875, + "learning_rate": 0.0019380636070892852, + "loss": 0.123, + "step": 12285 + }, + { + "epoch": 0.1066483797883699, + "grad_norm": 0.2109375, + "learning_rate": 0.0019380527565731598, + "loss": 0.1484, + "step": 12286 + }, + { + "epoch": 0.10665706026857406, + "grad_norm": 0.1787109375, + "learning_rate": 0.0019380419051405527, + "loss": 0.1436, + "step": 12287 + }, + { + "epoch": 0.10666574074877823, + "grad_norm": 0.1650390625, + "learning_rate": 0.0019380310527914752, + "loss": 0.1416, + "step": 12288 + }, + { + "epoch": 0.1066744212289824, + "grad_norm": 0.84765625, + "learning_rate": 0.00193802019952594, + "loss": 0.1387, + "step": 12289 + }, + { + "epoch": 0.10668310170918656, + "grad_norm": 0.2451171875, + "learning_rate": 0.001938009345343958, + "loss": 0.1104, + "step": 12290 + }, + { + "epoch": 0.10669178218939071, + "grad_norm": 0.357421875, + "learning_rate": 0.0019379984902455419, + "loss": 0.1064, + "step": 12291 + }, + { + "epoch": 0.10670046266959488, + "grad_norm": 0.400390625, + "learning_rate": 0.001937987634230703, + "loss": 0.0977, + "step": 12292 + }, + { + "epoch": 0.10670914314979904, + "grad_norm": 0.296875, + "learning_rate": 0.0019379767772994539, + "loss": 0.124, + "step": 12293 + }, + { + "epoch": 0.1067178236300032, + "grad_norm": 0.1572265625, + "learning_rate": 0.0019379659194518053, + "loss": 0.0923, + "step": 12294 + }, + { + "epoch": 0.10672650411020737, + "grad_norm": 0.3125, + "learning_rate": 0.0019379550606877702, + "loss": 0.1084, + "step": 12295 + }, + { + "epoch": 0.10673518459041154, + "grad_norm": 0.1552734375, + "learning_rate": 0.00193794420100736, + "loss": 0.1152, + "step": 12296 + }, + { + "epoch": 0.1067438650706157, + "grad_norm": 0.2255859375, + "learning_rate": 0.0019379333404105866, + "loss": 0.1475, + "step": 12297 + }, + { + "epoch": 0.10675254555081987, + "grad_norm": 0.369140625, + "learning_rate": 0.001937922478897462, + "loss": 0.2305, + "step": 12298 + }, + { + "epoch": 0.10676122603102403, + "grad_norm": 0.248046875, + "learning_rate": 0.0019379116164679976, + "loss": 0.1494, + "step": 12299 + }, + { + "epoch": 0.1067699065112282, + "grad_norm": 0.2275390625, + "learning_rate": 0.0019379007531222059, + "loss": 0.1562, + "step": 12300 + }, + { + "epoch": 0.10677858699143236, + "grad_norm": 1.578125, + "learning_rate": 0.0019378898888600985, + "loss": 0.1514, + "step": 12301 + }, + { + "epoch": 0.10678726747163653, + "grad_norm": 0.150390625, + "learning_rate": 0.0019378790236816875, + "loss": 0.1211, + "step": 12302 + }, + { + "epoch": 0.10679594795184069, + "grad_norm": 0.5859375, + "learning_rate": 0.0019378681575869842, + "loss": 0.0996, + "step": 12303 + }, + { + "epoch": 0.10680462843204486, + "grad_norm": 0.62890625, + "learning_rate": 0.001937857290576001, + "loss": 0.1309, + "step": 12304 + }, + { + "epoch": 0.10681330891224902, + "grad_norm": 0.55078125, + "learning_rate": 0.0019378464226487498, + "loss": 0.0894, + "step": 12305 + }, + { + "epoch": 0.10682198939245319, + "grad_norm": 0.279296875, + "learning_rate": 0.001937835553805242, + "loss": 0.1162, + "step": 12306 + }, + { + "epoch": 0.10683066987265735, + "grad_norm": 0.33203125, + "learning_rate": 0.0019378246840454903, + "loss": 0.1201, + "step": 12307 + 
}, + { + "epoch": 0.10683935035286152, + "grad_norm": 0.345703125, + "learning_rate": 0.001937813813369506, + "loss": 0.1221, + "step": 12308 + }, + { + "epoch": 0.10684803083306568, + "grad_norm": 1.1953125, + "learning_rate": 0.0019378029417773009, + "loss": 0.1406, + "step": 12309 + }, + { + "epoch": 0.10685671131326985, + "grad_norm": 0.2255859375, + "learning_rate": 0.0019377920692688873, + "loss": 0.1484, + "step": 12310 + }, + { + "epoch": 0.10686539179347401, + "grad_norm": 0.087890625, + "learning_rate": 0.0019377811958442765, + "loss": 0.1211, + "step": 12311 + }, + { + "epoch": 0.10687407227367818, + "grad_norm": 0.34765625, + "learning_rate": 0.001937770321503481, + "loss": 0.1064, + "step": 12312 + }, + { + "epoch": 0.10688275275388234, + "grad_norm": 0.412109375, + "learning_rate": 0.0019377594462465125, + "loss": 0.1641, + "step": 12313 + }, + { + "epoch": 0.10689143323408651, + "grad_norm": 0.1416015625, + "learning_rate": 0.001937748570073383, + "loss": 0.1475, + "step": 12314 + }, + { + "epoch": 0.10690011371429067, + "grad_norm": 0.095703125, + "learning_rate": 0.0019377376929841043, + "loss": 0.1445, + "step": 12315 + }, + { + "epoch": 0.10690879419449484, + "grad_norm": 0.4609375, + "learning_rate": 0.001937726814978688, + "loss": 0.1191, + "step": 12316 + }, + { + "epoch": 0.106917474674699, + "grad_norm": 0.392578125, + "learning_rate": 0.0019377159360571463, + "loss": 0.0962, + "step": 12317 + }, + { + "epoch": 0.10692615515490317, + "grad_norm": 0.365234375, + "learning_rate": 0.001937705056219491, + "loss": 0.0962, + "step": 12318 + }, + { + "epoch": 0.10693483563510733, + "grad_norm": 0.486328125, + "learning_rate": 0.0019376941754657337, + "loss": 0.123, + "step": 12319 + }, + { + "epoch": 0.1069435161153115, + "grad_norm": 0.515625, + "learning_rate": 0.0019376832937958873, + "loss": 0.1318, + "step": 12320 + }, + { + "epoch": 0.10695219659551566, + "grad_norm": 0.373046875, + "learning_rate": 0.0019376724112099627, + "loss": 0.0977, + "step": 12321 + }, + { + "epoch": 0.10696087707571983, + "grad_norm": 0.421875, + "learning_rate": 0.0019376615277079724, + "loss": 0.1562, + "step": 12322 + }, + { + "epoch": 0.106969557555924, + "grad_norm": 0.41015625, + "learning_rate": 0.0019376506432899276, + "loss": 0.1289, + "step": 12323 + }, + { + "epoch": 0.10697823803612816, + "grad_norm": 0.1103515625, + "learning_rate": 0.001937639757955841, + "loss": 0.1201, + "step": 12324 + }, + { + "epoch": 0.10698691851633232, + "grad_norm": 0.10791015625, + "learning_rate": 0.001937628871705724, + "loss": 0.126, + "step": 12325 + }, + { + "epoch": 0.10699559899653649, + "grad_norm": 0.125, + "learning_rate": 0.0019376179845395885, + "loss": 0.1011, + "step": 12326 + }, + { + "epoch": 0.10700427947674065, + "grad_norm": 0.193359375, + "learning_rate": 0.001937607096457447, + "loss": 0.1094, + "step": 12327 + }, + { + "epoch": 0.10701295995694482, + "grad_norm": 0.16015625, + "learning_rate": 0.0019375962074593103, + "loss": 0.0957, + "step": 12328 + }, + { + "epoch": 0.10702164043714898, + "grad_norm": 0.19140625, + "learning_rate": 0.0019375853175451917, + "loss": 0.126, + "step": 12329 + }, + { + "epoch": 0.10703032091735315, + "grad_norm": 0.56640625, + "learning_rate": 0.0019375744267151021, + "loss": 0.1318, + "step": 12330 + }, + { + "epoch": 0.10703900139755732, + "grad_norm": 0.361328125, + "learning_rate": 0.0019375635349690532, + "loss": 0.1348, + "step": 12331 + }, + { + "epoch": 0.10704768187776148, + "grad_norm": 0.126953125, + "learning_rate": 
0.0019375526423070582, + "loss": 0.1289, + "step": 12332 + }, + { + "epoch": 0.10705636235796565, + "grad_norm": 0.3828125, + "learning_rate": 0.0019375417487291276, + "loss": 0.127, + "step": 12333 + }, + { + "epoch": 0.10706504283816981, + "grad_norm": 0.16015625, + "learning_rate": 0.0019375308542352741, + "loss": 0.1348, + "step": 12334 + }, + { + "epoch": 0.10707372331837398, + "grad_norm": 0.443359375, + "learning_rate": 0.0019375199588255096, + "loss": 0.1123, + "step": 12335 + }, + { + "epoch": 0.10708240379857814, + "grad_norm": 0.09619140625, + "learning_rate": 0.0019375090624998458, + "loss": 0.168, + "step": 12336 + }, + { + "epoch": 0.1070910842787823, + "grad_norm": 0.52734375, + "learning_rate": 0.0019374981652582946, + "loss": 0.1973, + "step": 12337 + }, + { + "epoch": 0.10709976475898647, + "grad_norm": 0.421875, + "learning_rate": 0.001937487267100868, + "loss": 0.1104, + "step": 12338 + }, + { + "epoch": 0.10710844523919064, + "grad_norm": 0.53515625, + "learning_rate": 0.0019374763680275783, + "loss": 0.1211, + "step": 12339 + }, + { + "epoch": 0.1071171257193948, + "grad_norm": 0.349609375, + "learning_rate": 0.0019374654680384365, + "loss": 0.123, + "step": 12340 + }, + { + "epoch": 0.10712580619959897, + "grad_norm": 0.31640625, + "learning_rate": 0.001937454567133455, + "loss": 0.1167, + "step": 12341 + }, + { + "epoch": 0.10713448667980313, + "grad_norm": 1.2421875, + "learning_rate": 0.0019374436653126462, + "loss": 0.0996, + "step": 12342 + }, + { + "epoch": 0.1071431671600073, + "grad_norm": 0.5078125, + "learning_rate": 0.0019374327625760212, + "loss": 0.0957, + "step": 12343 + }, + { + "epoch": 0.10715184764021146, + "grad_norm": 0.1806640625, + "learning_rate": 0.0019374218589235926, + "loss": 0.0942, + "step": 12344 + }, + { + "epoch": 0.10716052812041563, + "grad_norm": 0.1259765625, + "learning_rate": 0.001937410954355372, + "loss": 0.0962, + "step": 12345 + }, + { + "epoch": 0.10716920860061979, + "grad_norm": 0.1669921875, + "learning_rate": 0.0019374000488713713, + "loss": 0.0845, + "step": 12346 + }, + { + "epoch": 0.10717788908082396, + "grad_norm": 0.443359375, + "learning_rate": 0.0019373891424716023, + "loss": 0.127, + "step": 12347 + }, + { + "epoch": 0.10718656956102812, + "grad_norm": 0.0732421875, + "learning_rate": 0.0019373782351560774, + "loss": 0.0952, + "step": 12348 + }, + { + "epoch": 0.10719525004123229, + "grad_norm": 0.37890625, + "learning_rate": 0.0019373673269248081, + "loss": 0.0918, + "step": 12349 + }, + { + "epoch": 0.10720393052143645, + "grad_norm": 0.08349609375, + "learning_rate": 0.0019373564177778064, + "loss": 0.0962, + "step": 12350 + }, + { + "epoch": 0.10721261100164062, + "grad_norm": 0.1259765625, + "learning_rate": 0.0019373455077150844, + "loss": 0.1348, + "step": 12351 + }, + { + "epoch": 0.10722129148184477, + "grad_norm": 0.294921875, + "learning_rate": 0.001937334596736654, + "loss": 0.1016, + "step": 12352 + }, + { + "epoch": 0.10722997196204893, + "grad_norm": 0.193359375, + "learning_rate": 0.001937323684842527, + "loss": 0.1279, + "step": 12353 + }, + { + "epoch": 0.1072386524422531, + "grad_norm": 0.5703125, + "learning_rate": 0.0019373127720327154, + "loss": 0.126, + "step": 12354 + }, + { + "epoch": 0.10724733292245726, + "grad_norm": 0.09423828125, + "learning_rate": 0.001937301858307231, + "loss": 0.0947, + "step": 12355 + }, + { + "epoch": 0.10725601340266143, + "grad_norm": 0.55078125, + "learning_rate": 0.0019372909436660862, + "loss": 0.1484, + "step": 12356 + }, + { + "epoch": 
0.1072646938828656, + "grad_norm": 0.142578125, + "learning_rate": 0.0019372800281092927, + "loss": 0.0957, + "step": 12357 + }, + { + "epoch": 0.10727337436306976, + "grad_norm": 0.09912109375, + "learning_rate": 0.0019372691116368618, + "loss": 0.124, + "step": 12358 + }, + { + "epoch": 0.10728205484327392, + "grad_norm": 0.28515625, + "learning_rate": 0.0019372581942488061, + "loss": 0.1079, + "step": 12359 + }, + { + "epoch": 0.10729073532347809, + "grad_norm": 0.2470703125, + "learning_rate": 0.0019372472759451378, + "loss": 0.123, + "step": 12360 + }, + { + "epoch": 0.10729941580368225, + "grad_norm": 0.2177734375, + "learning_rate": 0.0019372363567258682, + "loss": 0.1641, + "step": 12361 + }, + { + "epoch": 0.10730809628388642, + "grad_norm": 0.427734375, + "learning_rate": 0.0019372254365910093, + "loss": 0.1104, + "step": 12362 + }, + { + "epoch": 0.10731677676409059, + "grad_norm": 0.5390625, + "learning_rate": 0.0019372145155405734, + "loss": 0.166, + "step": 12363 + }, + { + "epoch": 0.10732545724429475, + "grad_norm": 0.796875, + "learning_rate": 0.0019372035935745725, + "loss": 0.2969, + "step": 12364 + }, + { + "epoch": 0.10733413772449892, + "grad_norm": 0.15234375, + "learning_rate": 0.0019371926706930182, + "loss": 0.0933, + "step": 12365 + }, + { + "epoch": 0.10734281820470308, + "grad_norm": 1.1640625, + "learning_rate": 0.0019371817468959226, + "loss": 0.1035, + "step": 12366 + }, + { + "epoch": 0.10735149868490725, + "grad_norm": 0.171875, + "learning_rate": 0.0019371708221832974, + "loss": 0.1094, + "step": 12367 + }, + { + "epoch": 0.10736017916511141, + "grad_norm": 0.8671875, + "learning_rate": 0.001937159896555155, + "loss": 0.1328, + "step": 12368 + }, + { + "epoch": 0.10736885964531558, + "grad_norm": 0.248046875, + "learning_rate": 0.001937148970011507, + "loss": 0.1328, + "step": 12369 + }, + { + "epoch": 0.10737754012551974, + "grad_norm": 0.275390625, + "learning_rate": 0.0019371380425523654, + "loss": 0.1143, + "step": 12370 + }, + { + "epoch": 0.1073862206057239, + "grad_norm": 0.205078125, + "learning_rate": 0.0019371271141777422, + "loss": 0.1064, + "step": 12371 + }, + { + "epoch": 0.10739490108592807, + "grad_norm": 0.05810546875, + "learning_rate": 0.0019371161848876497, + "loss": 0.0889, + "step": 12372 + }, + { + "epoch": 0.10740358156613224, + "grad_norm": 0.59765625, + "learning_rate": 0.0019371052546820994, + "loss": 0.2188, + "step": 12373 + }, + { + "epoch": 0.1074122620463364, + "grad_norm": 0.1689453125, + "learning_rate": 0.0019370943235611031, + "loss": 0.1602, + "step": 12374 + }, + { + "epoch": 0.10742094252654057, + "grad_norm": 0.48828125, + "learning_rate": 0.0019370833915246734, + "loss": 0.1387, + "step": 12375 + }, + { + "epoch": 0.10742962300674473, + "grad_norm": 0.1298828125, + "learning_rate": 0.0019370724585728216, + "loss": 0.123, + "step": 12376 + }, + { + "epoch": 0.1074383034869489, + "grad_norm": 0.1904296875, + "learning_rate": 0.0019370615247055602, + "loss": 0.1055, + "step": 12377 + }, + { + "epoch": 0.10744698396715306, + "grad_norm": 0.421875, + "learning_rate": 0.0019370505899229005, + "loss": 0.1533, + "step": 12378 + }, + { + "epoch": 0.10745566444735723, + "grad_norm": 0.51953125, + "learning_rate": 0.0019370396542248553, + "loss": 0.1777, + "step": 12379 + }, + { + "epoch": 0.10746434492756139, + "grad_norm": 0.53125, + "learning_rate": 0.0019370287176114357, + "loss": 0.1099, + "step": 12380 + }, + { + "epoch": 0.10747302540776556, + "grad_norm": 0.3046875, + "learning_rate": 0.0019370177800826542, + "loss": 
0.0996, + "step": 12381 + }, + { + "epoch": 0.10748170588796972, + "grad_norm": 0.072265625, + "learning_rate": 0.0019370068416385228, + "loss": 0.1201, + "step": 12382 + }, + { + "epoch": 0.10749038636817389, + "grad_norm": 0.2353515625, + "learning_rate": 0.001936995902279053, + "loss": 0.1309, + "step": 12383 + }, + { + "epoch": 0.10749906684837805, + "grad_norm": 0.1396484375, + "learning_rate": 0.0019369849620042572, + "loss": 0.126, + "step": 12384 + }, + { + "epoch": 0.10750774732858222, + "grad_norm": 0.1572265625, + "learning_rate": 0.0019369740208141472, + "loss": 0.1279, + "step": 12385 + }, + { + "epoch": 0.10751642780878638, + "grad_norm": 0.427734375, + "learning_rate": 0.0019369630787087352, + "loss": 0.1035, + "step": 12386 + }, + { + "epoch": 0.10752510828899055, + "grad_norm": 0.109375, + "learning_rate": 0.0019369521356880327, + "loss": 0.1045, + "step": 12387 + }, + { + "epoch": 0.10753378876919471, + "grad_norm": 0.388671875, + "learning_rate": 0.0019369411917520521, + "loss": 0.1152, + "step": 12388 + }, + { + "epoch": 0.10754246924939888, + "grad_norm": 0.1767578125, + "learning_rate": 0.0019369302469008053, + "loss": 0.1436, + "step": 12389 + }, + { + "epoch": 0.10755114972960304, + "grad_norm": 0.294921875, + "learning_rate": 0.0019369193011343035, + "loss": 0.1152, + "step": 12390 + }, + { + "epoch": 0.10755983020980721, + "grad_norm": 0.59765625, + "learning_rate": 0.00193690835445256, + "loss": 0.083, + "step": 12391 + }, + { + "epoch": 0.10756851069001137, + "grad_norm": 0.421875, + "learning_rate": 0.001936897406855586, + "loss": 0.1035, + "step": 12392 + }, + { + "epoch": 0.10757719117021554, + "grad_norm": 0.72265625, + "learning_rate": 0.0019368864583433933, + "loss": 0.1621, + "step": 12393 + }, + { + "epoch": 0.1075858716504197, + "grad_norm": 0.38671875, + "learning_rate": 0.0019368755089159944, + "loss": 0.1162, + "step": 12394 + }, + { + "epoch": 0.10759455213062387, + "grad_norm": 0.58203125, + "learning_rate": 0.0019368645585734008, + "loss": 0.2031, + "step": 12395 + }, + { + "epoch": 0.10760323261082803, + "grad_norm": 0.546875, + "learning_rate": 0.0019368536073156248, + "loss": 0.1006, + "step": 12396 + }, + { + "epoch": 0.1076119130910322, + "grad_norm": 0.921875, + "learning_rate": 0.0019368426551426782, + "loss": 0.1152, + "step": 12397 + }, + { + "epoch": 0.10762059357123636, + "grad_norm": 0.138671875, + "learning_rate": 0.001936831702054573, + "loss": 0.0903, + "step": 12398 + }, + { + "epoch": 0.10762927405144053, + "grad_norm": 0.4140625, + "learning_rate": 0.0019368207480513215, + "loss": 0.1157, + "step": 12399 + }, + { + "epoch": 0.1076379545316447, + "grad_norm": 0.259765625, + "learning_rate": 0.0019368097931329353, + "loss": 0.209, + "step": 12400 + }, + { + "epoch": 0.10764663501184886, + "grad_norm": 0.5703125, + "learning_rate": 0.0019367988372994264, + "loss": 0.1533, + "step": 12401 + }, + { + "epoch": 0.10765531549205302, + "grad_norm": 0.0830078125, + "learning_rate": 0.0019367878805508067, + "loss": 0.1167, + "step": 12402 + }, + { + "epoch": 0.10766399597225719, + "grad_norm": 0.265625, + "learning_rate": 0.0019367769228870887, + "loss": 0.1152, + "step": 12403 + }, + { + "epoch": 0.10767267645246135, + "grad_norm": 0.9140625, + "learning_rate": 0.0019367659643082838, + "loss": 0.1221, + "step": 12404 + }, + { + "epoch": 0.10768135693266552, + "grad_norm": 0.34375, + "learning_rate": 0.0019367550048144042, + "loss": 0.1396, + "step": 12405 + }, + { + "epoch": 0.10769003741286969, + "grad_norm": 0.2138671875, + 
"learning_rate": 0.001936744044405462, + "loss": 0.168, + "step": 12406 + }, + { + "epoch": 0.10769871789307385, + "grad_norm": 0.3203125, + "learning_rate": 0.0019367330830814692, + "loss": 0.1162, + "step": 12407 + }, + { + "epoch": 0.10770739837327802, + "grad_norm": 0.373046875, + "learning_rate": 0.0019367221208424375, + "loss": 0.1133, + "step": 12408 + }, + { + "epoch": 0.10771607885348218, + "grad_norm": 0.296875, + "learning_rate": 0.001936711157688379, + "loss": 0.1221, + "step": 12409 + }, + { + "epoch": 0.10772475933368635, + "grad_norm": 0.48828125, + "learning_rate": 0.001936700193619306, + "loss": 0.1602, + "step": 12410 + }, + { + "epoch": 0.10773343981389051, + "grad_norm": 0.265625, + "learning_rate": 0.00193668922863523, + "loss": 0.1113, + "step": 12411 + }, + { + "epoch": 0.10774212029409468, + "grad_norm": 0.33203125, + "learning_rate": 0.0019366782627361632, + "loss": 0.1289, + "step": 12412 + }, + { + "epoch": 0.10775080077429884, + "grad_norm": 0.248046875, + "learning_rate": 0.001936667295922118, + "loss": 0.1182, + "step": 12413 + }, + { + "epoch": 0.10775948125450299, + "grad_norm": 0.123046875, + "learning_rate": 0.0019366563281931054, + "loss": 0.0996, + "step": 12414 + }, + { + "epoch": 0.10776816173470716, + "grad_norm": 0.4609375, + "learning_rate": 0.0019366453595491384, + "loss": 0.1128, + "step": 12415 + }, + { + "epoch": 0.10777684221491132, + "grad_norm": 0.64453125, + "learning_rate": 0.0019366343899902287, + "loss": 0.1035, + "step": 12416 + }, + { + "epoch": 0.10778552269511549, + "grad_norm": 0.380859375, + "learning_rate": 0.0019366234195163882, + "loss": 0.125, + "step": 12417 + }, + { + "epoch": 0.10779420317531965, + "grad_norm": 0.294921875, + "learning_rate": 0.0019366124481276286, + "loss": 0.0854, + "step": 12418 + }, + { + "epoch": 0.10780288365552382, + "grad_norm": 0.1591796875, + "learning_rate": 0.0019366014758239623, + "loss": 0.1445, + "step": 12419 + }, + { + "epoch": 0.10781156413572798, + "grad_norm": 0.57421875, + "learning_rate": 0.0019365905026054013, + "loss": 0.1562, + "step": 12420 + }, + { + "epoch": 0.10782024461593215, + "grad_norm": 0.10498046875, + "learning_rate": 0.0019365795284719577, + "loss": 0.1367, + "step": 12421 + }, + { + "epoch": 0.10782892509613631, + "grad_norm": 0.2314453125, + "learning_rate": 0.0019365685534236427, + "loss": 0.1523, + "step": 12422 + }, + { + "epoch": 0.10783760557634048, + "grad_norm": 0.177734375, + "learning_rate": 0.0019365575774604696, + "loss": 0.123, + "step": 12423 + }, + { + "epoch": 0.10784628605654464, + "grad_norm": 0.1650390625, + "learning_rate": 0.0019365466005824492, + "loss": 0.1162, + "step": 12424 + }, + { + "epoch": 0.10785496653674881, + "grad_norm": 1.3125, + "learning_rate": 0.0019365356227895943, + "loss": 0.1465, + "step": 12425 + }, + { + "epoch": 0.10786364701695297, + "grad_norm": 0.25390625, + "learning_rate": 0.0019365246440819165, + "loss": 0.1387, + "step": 12426 + }, + { + "epoch": 0.10787232749715714, + "grad_norm": 0.765625, + "learning_rate": 0.0019365136644594278, + "loss": 0.2812, + "step": 12427 + }, + { + "epoch": 0.1078810079773613, + "grad_norm": 0.322265625, + "learning_rate": 0.0019365026839221402, + "loss": 0.1162, + "step": 12428 + }, + { + "epoch": 0.10788968845756547, + "grad_norm": 0.07763671875, + "learning_rate": 0.001936491702470066, + "loss": 0.1182, + "step": 12429 + }, + { + "epoch": 0.10789836893776963, + "grad_norm": 0.126953125, + "learning_rate": 0.0019364807201032173, + "loss": 0.1348, + "step": 12430 + }, + { + "epoch": 
0.1079070494179738, + "grad_norm": 0.65234375, + "learning_rate": 0.0019364697368216056, + "loss": 0.0947, + "step": 12431 + }, + { + "epoch": 0.10791572989817796, + "grad_norm": 0.50390625, + "learning_rate": 0.001936458752625243, + "loss": 0.1455, + "step": 12432 + }, + { + "epoch": 0.10792441037838213, + "grad_norm": 0.6015625, + "learning_rate": 0.001936447767514142, + "loss": 0.1699, + "step": 12433 + }, + { + "epoch": 0.1079330908585863, + "grad_norm": 0.384765625, + "learning_rate": 0.001936436781488314, + "loss": 0.1357, + "step": 12434 + }, + { + "epoch": 0.10794177133879046, + "grad_norm": 0.138671875, + "learning_rate": 0.0019364257945477716, + "loss": 0.1113, + "step": 12435 + }, + { + "epoch": 0.10795045181899462, + "grad_norm": 0.458984375, + "learning_rate": 0.0019364148066925264, + "loss": 0.1445, + "step": 12436 + }, + { + "epoch": 0.10795913229919879, + "grad_norm": 0.1357421875, + "learning_rate": 0.0019364038179225907, + "loss": 0.1309, + "step": 12437 + }, + { + "epoch": 0.10796781277940296, + "grad_norm": 0.94140625, + "learning_rate": 0.0019363928282379761, + "loss": 0.1494, + "step": 12438 + }, + { + "epoch": 0.10797649325960712, + "grad_norm": 0.119140625, + "learning_rate": 0.001936381837638695, + "loss": 0.124, + "step": 12439 + }, + { + "epoch": 0.10798517373981129, + "grad_norm": 0.1416015625, + "learning_rate": 0.0019363708461247594, + "loss": 0.1025, + "step": 12440 + }, + { + "epoch": 0.10799385422001545, + "grad_norm": 0.154296875, + "learning_rate": 0.001936359853696181, + "loss": 0.1289, + "step": 12441 + }, + { + "epoch": 0.10800253470021962, + "grad_norm": 0.12109375, + "learning_rate": 0.0019363488603529719, + "loss": 0.1123, + "step": 12442 + }, + { + "epoch": 0.10801121518042378, + "grad_norm": 0.10546875, + "learning_rate": 0.0019363378660951448, + "loss": 0.127, + "step": 12443 + }, + { + "epoch": 0.10801989566062795, + "grad_norm": 0.12060546875, + "learning_rate": 0.0019363268709227104, + "loss": 0.1133, + "step": 12444 + }, + { + "epoch": 0.10802857614083211, + "grad_norm": 0.80859375, + "learning_rate": 0.0019363158748356818, + "loss": 0.1162, + "step": 12445 + }, + { + "epoch": 0.10803725662103628, + "grad_norm": 0.171875, + "learning_rate": 0.001936304877834071, + "loss": 0.1367, + "step": 12446 + }, + { + "epoch": 0.10804593710124044, + "grad_norm": 0.419921875, + "learning_rate": 0.0019362938799178898, + "loss": 0.1318, + "step": 12447 + }, + { + "epoch": 0.1080546175814446, + "grad_norm": 0.431640625, + "learning_rate": 0.00193628288108715, + "loss": 0.1162, + "step": 12448 + }, + { + "epoch": 0.10806329806164877, + "grad_norm": 0.0869140625, + "learning_rate": 0.0019362718813418637, + "loss": 0.1045, + "step": 12449 + }, + { + "epoch": 0.10807197854185294, + "grad_norm": 0.1611328125, + "learning_rate": 0.001936260880682043, + "loss": 0.1279, + "step": 12450 + }, + { + "epoch": 0.1080806590220571, + "grad_norm": 0.08349609375, + "learning_rate": 0.0019362498791077, + "loss": 0.125, + "step": 12451 + }, + { + "epoch": 0.10808933950226127, + "grad_norm": 0.357421875, + "learning_rate": 0.0019362388766188468, + "loss": 0.0977, + "step": 12452 + }, + { + "epoch": 0.10809801998246543, + "grad_norm": 0.12255859375, + "learning_rate": 0.0019362278732154954, + "loss": 0.0898, + "step": 12453 + }, + { + "epoch": 0.1081067004626696, + "grad_norm": 0.423828125, + "learning_rate": 0.0019362168688976575, + "loss": 0.1172, + "step": 12454 + }, + { + "epoch": 0.10811538094287376, + "grad_norm": 0.56640625, + "learning_rate": 0.0019362058636653457, + 
"loss": 0.1377, + "step": 12455 + }, + { + "epoch": 0.10812406142307793, + "grad_norm": 0.22265625, + "learning_rate": 0.0019361948575185716, + "loss": 0.1816, + "step": 12456 + }, + { + "epoch": 0.10813274190328209, + "grad_norm": 0.322265625, + "learning_rate": 0.0019361838504573475, + "loss": 0.0928, + "step": 12457 + }, + { + "epoch": 0.10814142238348626, + "grad_norm": 1.5078125, + "learning_rate": 0.0019361728424816848, + "loss": 0.4199, + "step": 12458 + }, + { + "epoch": 0.10815010286369042, + "grad_norm": 0.171875, + "learning_rate": 0.0019361618335915964, + "loss": 0.1191, + "step": 12459 + }, + { + "epoch": 0.10815878334389459, + "grad_norm": 0.2431640625, + "learning_rate": 0.0019361508237870939, + "loss": 0.0918, + "step": 12460 + }, + { + "epoch": 0.10816746382409875, + "grad_norm": 0.482421875, + "learning_rate": 0.0019361398130681897, + "loss": 0.1113, + "step": 12461 + }, + { + "epoch": 0.10817614430430292, + "grad_norm": 0.1259765625, + "learning_rate": 0.0019361288014348953, + "loss": 0.1426, + "step": 12462 + }, + { + "epoch": 0.10818482478450708, + "grad_norm": 0.115234375, + "learning_rate": 0.0019361177888872231, + "loss": 0.1123, + "step": 12463 + }, + { + "epoch": 0.10819350526471125, + "grad_norm": 0.6171875, + "learning_rate": 0.0019361067754251852, + "loss": 0.1123, + "step": 12464 + }, + { + "epoch": 0.10820218574491541, + "grad_norm": 0.1640625, + "learning_rate": 0.001936095761048793, + "loss": 0.1641, + "step": 12465 + }, + { + "epoch": 0.10821086622511958, + "grad_norm": 0.30078125, + "learning_rate": 0.0019360847457580595, + "loss": 0.1074, + "step": 12466 + }, + { + "epoch": 0.10821954670532374, + "grad_norm": 0.39453125, + "learning_rate": 0.0019360737295529963, + "loss": 0.1162, + "step": 12467 + }, + { + "epoch": 0.10822822718552791, + "grad_norm": 0.330078125, + "learning_rate": 0.0019360627124336153, + "loss": 0.1318, + "step": 12468 + }, + { + "epoch": 0.10823690766573207, + "grad_norm": 0.52734375, + "learning_rate": 0.0019360516943999284, + "loss": 0.1699, + "step": 12469 + }, + { + "epoch": 0.10824558814593624, + "grad_norm": 0.42578125, + "learning_rate": 0.0019360406754519483, + "loss": 0.1504, + "step": 12470 + }, + { + "epoch": 0.1082542686261404, + "grad_norm": 0.2021484375, + "learning_rate": 0.0019360296555896866, + "loss": 0.0986, + "step": 12471 + }, + { + "epoch": 0.10826294910634457, + "grad_norm": 0.0966796875, + "learning_rate": 0.0019360186348131555, + "loss": 0.0835, + "step": 12472 + }, + { + "epoch": 0.10827162958654873, + "grad_norm": 0.205078125, + "learning_rate": 0.0019360076131223665, + "loss": 0.124, + "step": 12473 + }, + { + "epoch": 0.1082803100667529, + "grad_norm": 0.53515625, + "learning_rate": 0.0019359965905173327, + "loss": 0.1191, + "step": 12474 + }, + { + "epoch": 0.10828899054695705, + "grad_norm": 0.2412109375, + "learning_rate": 0.0019359855669980653, + "loss": 0.1436, + "step": 12475 + }, + { + "epoch": 0.10829767102716122, + "grad_norm": 0.177734375, + "learning_rate": 0.0019359745425645766, + "loss": 0.1104, + "step": 12476 + }, + { + "epoch": 0.10830635150736538, + "grad_norm": 0.126953125, + "learning_rate": 0.001935963517216879, + "loss": 0.1367, + "step": 12477 + }, + { + "epoch": 0.10831503198756955, + "grad_norm": 0.1982421875, + "learning_rate": 0.001935952490954984, + "loss": 0.1357, + "step": 12478 + }, + { + "epoch": 0.10832371246777371, + "grad_norm": 0.1962890625, + "learning_rate": 0.0019359414637789042, + "loss": 0.1426, + "step": 12479 + }, + { + "epoch": 0.10833239294797788, + "grad_norm": 
0.28125, + "learning_rate": 0.0019359304356886512, + "loss": 0.1582, + "step": 12480 + }, + { + "epoch": 0.10834107342818204, + "grad_norm": 0.150390625, + "learning_rate": 0.0019359194066842372, + "loss": 0.1152, + "step": 12481 + }, + { + "epoch": 0.1083497539083862, + "grad_norm": 0.2021484375, + "learning_rate": 0.0019359083767656743, + "loss": 0.1172, + "step": 12482 + }, + { + "epoch": 0.10835843438859037, + "grad_norm": 0.31640625, + "learning_rate": 0.0019358973459329745, + "loss": 0.1167, + "step": 12483 + }, + { + "epoch": 0.10836711486879454, + "grad_norm": 0.232421875, + "learning_rate": 0.00193588631418615, + "loss": 0.1582, + "step": 12484 + }, + { + "epoch": 0.1083757953489987, + "grad_norm": 0.1630859375, + "learning_rate": 0.0019358752815252127, + "loss": 0.1826, + "step": 12485 + }, + { + "epoch": 0.10838447582920287, + "grad_norm": 0.1708984375, + "learning_rate": 0.001935864247950175, + "loss": 0.0986, + "step": 12486 + }, + { + "epoch": 0.10839315630940703, + "grad_norm": 0.166015625, + "learning_rate": 0.0019358532134610485, + "loss": 0.124, + "step": 12487 + }, + { + "epoch": 0.1084018367896112, + "grad_norm": 0.12158203125, + "learning_rate": 0.0019358421780578455, + "loss": 0.1094, + "step": 12488 + }, + { + "epoch": 0.10841051726981536, + "grad_norm": 0.796875, + "learning_rate": 0.0019358311417405783, + "loss": 0.1504, + "step": 12489 + }, + { + "epoch": 0.10841919775001953, + "grad_norm": 0.3515625, + "learning_rate": 0.0019358201045092582, + "loss": 0.0884, + "step": 12490 + }, + { + "epoch": 0.10842787823022369, + "grad_norm": 0.6171875, + "learning_rate": 0.0019358090663638982, + "loss": 0.1089, + "step": 12491 + }, + { + "epoch": 0.10843655871042786, + "grad_norm": 0.232421875, + "learning_rate": 0.0019357980273045096, + "loss": 0.1182, + "step": 12492 + }, + { + "epoch": 0.10844523919063202, + "grad_norm": 0.330078125, + "learning_rate": 0.0019357869873311056, + "loss": 0.1396, + "step": 12493 + }, + { + "epoch": 0.10845391967083619, + "grad_norm": 0.0791015625, + "learning_rate": 0.0019357759464436969, + "loss": 0.0977, + "step": 12494 + }, + { + "epoch": 0.10846260015104035, + "grad_norm": 0.1669921875, + "learning_rate": 0.001935764904642296, + "loss": 0.1079, + "step": 12495 + }, + { + "epoch": 0.10847128063124452, + "grad_norm": 0.296875, + "learning_rate": 0.0019357538619269158, + "loss": 0.1504, + "step": 12496 + }, + { + "epoch": 0.10847996111144868, + "grad_norm": 0.291015625, + "learning_rate": 0.0019357428182975671, + "loss": 0.084, + "step": 12497 + }, + { + "epoch": 0.10848864159165285, + "grad_norm": 0.0810546875, + "learning_rate": 0.001935731773754263, + "loss": 0.1152, + "step": 12498 + }, + { + "epoch": 0.10849732207185701, + "grad_norm": 0.1416015625, + "learning_rate": 0.001935720728297015, + "loss": 0.1143, + "step": 12499 + }, + { + "epoch": 0.10850600255206118, + "grad_norm": 0.404296875, + "learning_rate": 0.0019357096819258355, + "loss": 0.1797, + "step": 12500 + }, + { + "epoch": 0.10851468303226534, + "grad_norm": 0.1396484375, + "learning_rate": 0.0019356986346407365, + "loss": 0.0977, + "step": 12501 + }, + { + "epoch": 0.10852336351246951, + "grad_norm": 0.248046875, + "learning_rate": 0.00193568758644173, + "loss": 0.1123, + "step": 12502 + }, + { + "epoch": 0.10853204399267367, + "grad_norm": 0.671875, + "learning_rate": 0.0019356765373288278, + "loss": 0.1299, + "step": 12503 + }, + { + "epoch": 0.10854072447287784, + "grad_norm": 0.35546875, + "learning_rate": 0.0019356654873020425, + "loss": 0.0762, + "step": 12504 + }, + 
{ + "epoch": 0.108549404953082, + "grad_norm": 0.10498046875, + "learning_rate": 0.0019356544363613862, + "loss": 0.1699, + "step": 12505 + }, + { + "epoch": 0.10855808543328617, + "grad_norm": 0.1806640625, + "learning_rate": 0.0019356433845068705, + "loss": 0.2031, + "step": 12506 + }, + { + "epoch": 0.10856676591349033, + "grad_norm": 0.39453125, + "learning_rate": 0.0019356323317385078, + "loss": 0.1152, + "step": 12507 + }, + { + "epoch": 0.1085754463936945, + "grad_norm": 0.353515625, + "learning_rate": 0.0019356212780563102, + "loss": 0.1562, + "step": 12508 + }, + { + "epoch": 0.10858412687389866, + "grad_norm": 0.09130859375, + "learning_rate": 0.0019356102234602897, + "loss": 0.1113, + "step": 12509 + }, + { + "epoch": 0.10859280735410283, + "grad_norm": 0.11962890625, + "learning_rate": 0.0019355991679504585, + "loss": 0.1147, + "step": 12510 + }, + { + "epoch": 0.108601487834307, + "grad_norm": 0.1357421875, + "learning_rate": 0.0019355881115268284, + "loss": 0.1582, + "step": 12511 + }, + { + "epoch": 0.10861016831451116, + "grad_norm": 0.16015625, + "learning_rate": 0.0019355770541894118, + "loss": 0.1104, + "step": 12512 + }, + { + "epoch": 0.10861884879471533, + "grad_norm": 0.68359375, + "learning_rate": 0.0019355659959382206, + "loss": 0.1289, + "step": 12513 + }, + { + "epoch": 0.10862752927491949, + "grad_norm": 0.125, + "learning_rate": 0.0019355549367732672, + "loss": 0.1143, + "step": 12514 + }, + { + "epoch": 0.10863620975512366, + "grad_norm": 0.32421875, + "learning_rate": 0.0019355438766945634, + "loss": 0.1064, + "step": 12515 + }, + { + "epoch": 0.10864489023532782, + "grad_norm": 1.125, + "learning_rate": 0.0019355328157021213, + "loss": 0.1152, + "step": 12516 + }, + { + "epoch": 0.10865357071553199, + "grad_norm": 0.640625, + "learning_rate": 0.001935521753795953, + "loss": 0.123, + "step": 12517 + }, + { + "epoch": 0.10866225119573615, + "grad_norm": 0.1083984375, + "learning_rate": 0.0019355106909760707, + "loss": 0.0977, + "step": 12518 + }, + { + "epoch": 0.10867093167594032, + "grad_norm": 0.078125, + "learning_rate": 0.0019354996272424867, + "loss": 0.1011, + "step": 12519 + }, + { + "epoch": 0.10867961215614448, + "grad_norm": 0.94921875, + "learning_rate": 0.0019354885625952128, + "loss": 0.168, + "step": 12520 + }, + { + "epoch": 0.10868829263634865, + "grad_norm": 0.0849609375, + "learning_rate": 0.0019354774970342609, + "loss": 0.1582, + "step": 12521 + }, + { + "epoch": 0.10869697311655281, + "grad_norm": 0.271484375, + "learning_rate": 0.0019354664305596435, + "loss": 0.0957, + "step": 12522 + }, + { + "epoch": 0.10870565359675698, + "grad_norm": 0.06396484375, + "learning_rate": 0.0019354553631713727, + "loss": 0.1045, + "step": 12523 + }, + { + "epoch": 0.10871433407696114, + "grad_norm": 0.10205078125, + "learning_rate": 0.00193544429486946, + "loss": 0.1172, + "step": 12524 + }, + { + "epoch": 0.1087230145571653, + "grad_norm": 0.51171875, + "learning_rate": 0.0019354332256539182, + "loss": 0.165, + "step": 12525 + }, + { + "epoch": 0.10873169503736947, + "grad_norm": 0.609375, + "learning_rate": 0.0019354221555247595, + "loss": 0.1338, + "step": 12526 + }, + { + "epoch": 0.10874037551757364, + "grad_norm": 0.34375, + "learning_rate": 0.0019354110844819956, + "loss": 0.1807, + "step": 12527 + }, + { + "epoch": 0.1087490559977778, + "grad_norm": 0.12890625, + "learning_rate": 0.0019354000125256384, + "loss": 0.1172, + "step": 12528 + }, + { + "epoch": 0.10875773647798197, + "grad_norm": 0.2109375, + "learning_rate": 0.0019353889396557006, + 
"loss": 0.1045, + "step": 12529 + }, + { + "epoch": 0.10876641695818613, + "grad_norm": 0.1845703125, + "learning_rate": 0.0019353778658721937, + "loss": 0.1123, + "step": 12530 + }, + { + "epoch": 0.1087750974383903, + "grad_norm": 0.1142578125, + "learning_rate": 0.0019353667911751302, + "loss": 0.1445, + "step": 12531 + }, + { + "epoch": 0.10878377791859446, + "grad_norm": 0.21875, + "learning_rate": 0.0019353557155645223, + "loss": 0.1279, + "step": 12532 + }, + { + "epoch": 0.10879245839879863, + "grad_norm": 0.1728515625, + "learning_rate": 0.001935344639040382, + "loss": 0.0869, + "step": 12533 + }, + { + "epoch": 0.10880113887900279, + "grad_norm": 0.482421875, + "learning_rate": 0.001935333561602721, + "loss": 0.1553, + "step": 12534 + }, + { + "epoch": 0.10880981935920696, + "grad_norm": 0.427734375, + "learning_rate": 0.001935322483251552, + "loss": 0.1357, + "step": 12535 + }, + { + "epoch": 0.10881849983941112, + "grad_norm": 0.1435546875, + "learning_rate": 0.0019353114039868868, + "loss": 0.125, + "step": 12536 + }, + { + "epoch": 0.10882718031961527, + "grad_norm": 0.08056640625, + "learning_rate": 0.0019353003238087377, + "loss": 0.1182, + "step": 12537 + }, + { + "epoch": 0.10883586079981944, + "grad_norm": 0.20703125, + "learning_rate": 0.0019352892427171166, + "loss": 0.1245, + "step": 12538 + }, + { + "epoch": 0.1088445412800236, + "grad_norm": 0.2021484375, + "learning_rate": 0.0019352781607120358, + "loss": 0.1445, + "step": 12539 + }, + { + "epoch": 0.10885322176022777, + "grad_norm": 0.09375, + "learning_rate": 0.0019352670777935074, + "loss": 0.1211, + "step": 12540 + }, + { + "epoch": 0.10886190224043193, + "grad_norm": 0.2890625, + "learning_rate": 0.0019352559939615436, + "loss": 0.1152, + "step": 12541 + }, + { + "epoch": 0.1088705827206361, + "grad_norm": 0.1865234375, + "learning_rate": 0.001935244909216156, + "loss": 0.1094, + "step": 12542 + }, + { + "epoch": 0.10887926320084026, + "grad_norm": 0.1044921875, + "learning_rate": 0.0019352338235573573, + "loss": 0.1084, + "step": 12543 + }, + { + "epoch": 0.10888794368104443, + "grad_norm": 0.5234375, + "learning_rate": 0.0019352227369851594, + "loss": 0.1074, + "step": 12544 + }, + { + "epoch": 0.1088966241612486, + "grad_norm": 0.08642578125, + "learning_rate": 0.0019352116494995746, + "loss": 0.1187, + "step": 12545 + }, + { + "epoch": 0.10890530464145276, + "grad_norm": 0.5859375, + "learning_rate": 0.0019352005611006147, + "loss": 0.3398, + "step": 12546 + }, + { + "epoch": 0.10891398512165693, + "grad_norm": 0.154296875, + "learning_rate": 0.0019351894717882924, + "loss": 0.1182, + "step": 12547 + }, + { + "epoch": 0.10892266560186109, + "grad_norm": 0.09912109375, + "learning_rate": 0.0019351783815626192, + "loss": 0.0996, + "step": 12548 + }, + { + "epoch": 0.10893134608206526, + "grad_norm": 0.93359375, + "learning_rate": 0.0019351672904236073, + "loss": 0.1328, + "step": 12549 + }, + { + "epoch": 0.10894002656226942, + "grad_norm": 0.1875, + "learning_rate": 0.001935156198371269, + "loss": 0.1084, + "step": 12550 + }, + { + "epoch": 0.10894870704247359, + "grad_norm": 1.0078125, + "learning_rate": 0.0019351451054056166, + "loss": 0.0996, + "step": 12551 + }, + { + "epoch": 0.10895738752267775, + "grad_norm": 0.4375, + "learning_rate": 0.0019351340115266625, + "loss": 0.0786, + "step": 12552 + }, + { + "epoch": 0.10896606800288192, + "grad_norm": 0.255859375, + "learning_rate": 0.0019351229167344176, + "loss": 0.0996, + "step": 12553 + }, + { + "epoch": 0.10897474848308608, + "grad_norm": 0.625, + 
"learning_rate": 0.0019351118210288953, + "loss": 0.1201, + "step": 12554 + }, + { + "epoch": 0.10898342896329025, + "grad_norm": 0.10986328125, + "learning_rate": 0.0019351007244101071, + "loss": 0.1211, + "step": 12555 + }, + { + "epoch": 0.10899210944349441, + "grad_norm": 0.24609375, + "learning_rate": 0.0019350896268780654, + "loss": 0.1191, + "step": 12556 + }, + { + "epoch": 0.10900078992369858, + "grad_norm": 0.5546875, + "learning_rate": 0.001935078528432782, + "loss": 0.1035, + "step": 12557 + }, + { + "epoch": 0.10900947040390274, + "grad_norm": 0.8203125, + "learning_rate": 0.0019350674290742693, + "loss": 0.1235, + "step": 12558 + }, + { + "epoch": 0.1090181508841069, + "grad_norm": 0.625, + "learning_rate": 0.0019350563288025396, + "loss": 0.1001, + "step": 12559 + }, + { + "epoch": 0.10902683136431107, + "grad_norm": 0.65234375, + "learning_rate": 0.001935045227617605, + "loss": 0.1123, + "step": 12560 + }, + { + "epoch": 0.10903551184451524, + "grad_norm": 0.1533203125, + "learning_rate": 0.001935034125519477, + "loss": 0.1729, + "step": 12561 + }, + { + "epoch": 0.1090441923247194, + "grad_norm": 0.14453125, + "learning_rate": 0.0019350230225081686, + "loss": 0.1133, + "step": 12562 + }, + { + "epoch": 0.10905287280492357, + "grad_norm": 0.0908203125, + "learning_rate": 0.0019350119185836915, + "loss": 0.0933, + "step": 12563 + }, + { + "epoch": 0.10906155328512773, + "grad_norm": 0.08642578125, + "learning_rate": 0.0019350008137460578, + "loss": 0.1436, + "step": 12564 + }, + { + "epoch": 0.1090702337653319, + "grad_norm": 0.337890625, + "learning_rate": 0.0019349897079952798, + "loss": 0.1826, + "step": 12565 + }, + { + "epoch": 0.10907891424553606, + "grad_norm": 0.7109375, + "learning_rate": 0.0019349786013313696, + "loss": 0.1484, + "step": 12566 + }, + { + "epoch": 0.10908759472574023, + "grad_norm": 0.314453125, + "learning_rate": 0.0019349674937543396, + "loss": 0.1191, + "step": 12567 + }, + { + "epoch": 0.10909627520594439, + "grad_norm": 0.07080078125, + "learning_rate": 0.0019349563852642014, + "loss": 0.1108, + "step": 12568 + }, + { + "epoch": 0.10910495568614856, + "grad_norm": 0.134765625, + "learning_rate": 0.0019349452758609673, + "loss": 0.0801, + "step": 12569 + }, + { + "epoch": 0.10911363616635272, + "grad_norm": 0.08056640625, + "learning_rate": 0.0019349341655446497, + "loss": 0.0903, + "step": 12570 + }, + { + "epoch": 0.10912231664655689, + "grad_norm": 0.150390625, + "learning_rate": 0.001934923054315261, + "loss": 0.1455, + "step": 12571 + }, + { + "epoch": 0.10913099712676105, + "grad_norm": 0.392578125, + "learning_rate": 0.0019349119421728127, + "loss": 0.1113, + "step": 12572 + }, + { + "epoch": 0.10913967760696522, + "grad_norm": 0.43359375, + "learning_rate": 0.0019349008291173172, + "loss": 0.1021, + "step": 12573 + }, + { + "epoch": 0.10914835808716938, + "grad_norm": 0.37109375, + "learning_rate": 0.0019348897151487867, + "loss": 0.1426, + "step": 12574 + }, + { + "epoch": 0.10915703856737355, + "grad_norm": 0.154296875, + "learning_rate": 0.0019348786002672334, + "loss": 0.1328, + "step": 12575 + }, + { + "epoch": 0.10916571904757771, + "grad_norm": 0.30078125, + "learning_rate": 0.0019348674844726695, + "loss": 0.1191, + "step": 12576 + }, + { + "epoch": 0.10917439952778188, + "grad_norm": 0.41015625, + "learning_rate": 0.001934856367765107, + "loss": 0.1387, + "step": 12577 + }, + { + "epoch": 0.10918308000798604, + "grad_norm": 0.365234375, + "learning_rate": 0.0019348452501445582, + "loss": 0.1123, + "step": 12578 + }, + { + 
"epoch": 0.10919176048819021, + "grad_norm": 0.10986328125, + "learning_rate": 0.0019348341316110349, + "loss": 0.1162, + "step": 12579 + }, + { + "epoch": 0.10920044096839437, + "grad_norm": 0.283203125, + "learning_rate": 0.0019348230121645495, + "loss": 0.1118, + "step": 12580 + }, + { + "epoch": 0.10920912144859854, + "grad_norm": 0.2080078125, + "learning_rate": 0.0019348118918051148, + "loss": 0.1006, + "step": 12581 + }, + { + "epoch": 0.1092178019288027, + "grad_norm": 0.07861328125, + "learning_rate": 0.001934800770532742, + "loss": 0.0908, + "step": 12582 + }, + { + "epoch": 0.10922648240900687, + "grad_norm": 0.28125, + "learning_rate": 0.0019347896483474434, + "loss": 0.1084, + "step": 12583 + }, + { + "epoch": 0.10923516288921103, + "grad_norm": 0.2353515625, + "learning_rate": 0.0019347785252492318, + "loss": 0.0806, + "step": 12584 + }, + { + "epoch": 0.1092438433694152, + "grad_norm": 0.341796875, + "learning_rate": 0.0019347674012381187, + "loss": 0.0972, + "step": 12585 + }, + { + "epoch": 0.10925252384961937, + "grad_norm": 0.2236328125, + "learning_rate": 0.0019347562763141165, + "loss": 0.1191, + "step": 12586 + }, + { + "epoch": 0.10926120432982353, + "grad_norm": 0.251953125, + "learning_rate": 0.0019347451504772373, + "loss": 0.1025, + "step": 12587 + }, + { + "epoch": 0.1092698848100277, + "grad_norm": 0.3984375, + "learning_rate": 0.0019347340237274937, + "loss": 0.1602, + "step": 12588 + }, + { + "epoch": 0.10927856529023186, + "grad_norm": 0.71875, + "learning_rate": 0.001934722896064897, + "loss": 0.1309, + "step": 12589 + }, + { + "epoch": 0.10928724577043603, + "grad_norm": 0.353515625, + "learning_rate": 0.0019347117674894604, + "loss": 0.1426, + "step": 12590 + }, + { + "epoch": 0.10929592625064019, + "grad_norm": 0.333984375, + "learning_rate": 0.0019347006380011954, + "loss": 0.1201, + "step": 12591 + }, + { + "epoch": 0.10930460673084436, + "grad_norm": 0.1376953125, + "learning_rate": 0.0019346895076001141, + "loss": 0.1484, + "step": 12592 + }, + { + "epoch": 0.10931328721104852, + "grad_norm": 0.2314453125, + "learning_rate": 0.0019346783762862292, + "loss": 0.0962, + "step": 12593 + }, + { + "epoch": 0.10932196769125269, + "grad_norm": 0.458984375, + "learning_rate": 0.0019346672440595522, + "loss": 0.1201, + "step": 12594 + }, + { + "epoch": 0.10933064817145685, + "grad_norm": 0.388671875, + "learning_rate": 0.001934656110920096, + "loss": 0.1045, + "step": 12595 + }, + { + "epoch": 0.10933932865166102, + "grad_norm": 0.3203125, + "learning_rate": 0.0019346449768678723, + "loss": 0.106, + "step": 12596 + }, + { + "epoch": 0.10934800913186518, + "grad_norm": 0.62109375, + "learning_rate": 0.0019346338419028932, + "loss": 0.1992, + "step": 12597 + }, + { + "epoch": 0.10935668961206935, + "grad_norm": 0.1416015625, + "learning_rate": 0.0019346227060251713, + "loss": 0.1367, + "step": 12598 + }, + { + "epoch": 0.1093653700922735, + "grad_norm": 0.1064453125, + "learning_rate": 0.0019346115692347183, + "loss": 0.1201, + "step": 12599 + }, + { + "epoch": 0.10937405057247766, + "grad_norm": 0.07763671875, + "learning_rate": 0.001934600431531547, + "loss": 0.1191, + "step": 12600 + }, + { + "epoch": 0.10938273105268183, + "grad_norm": 0.59375, + "learning_rate": 0.001934589292915669, + "loss": 0.1328, + "step": 12601 + }, + { + "epoch": 0.109391411532886, + "grad_norm": 0.70703125, + "learning_rate": 0.0019345781533870966, + "loss": 0.1562, + "step": 12602 + }, + { + "epoch": 0.10940009201309016, + "grad_norm": 0.2099609375, + "learning_rate": 
0.0019345670129458422, + "loss": 0.1104, + "step": 12603 + }, + { + "epoch": 0.10940877249329432, + "grad_norm": 0.15234375, + "learning_rate": 0.001934555871591918, + "loss": 0.1084, + "step": 12604 + }, + { + "epoch": 0.10941745297349849, + "grad_norm": 0.1044921875, + "learning_rate": 0.0019345447293253359, + "loss": 0.124, + "step": 12605 + }, + { + "epoch": 0.10942613345370265, + "grad_norm": 0.1875, + "learning_rate": 0.001934533586146108, + "loss": 0.104, + "step": 12606 + }, + { + "epoch": 0.10943481393390682, + "grad_norm": 0.0986328125, + "learning_rate": 0.0019345224420542467, + "loss": 0.0962, + "step": 12607 + }, + { + "epoch": 0.10944349441411098, + "grad_norm": 0.345703125, + "learning_rate": 0.0019345112970497646, + "loss": 0.1299, + "step": 12608 + }, + { + "epoch": 0.10945217489431515, + "grad_norm": 0.09228515625, + "learning_rate": 0.0019345001511326735, + "loss": 0.1445, + "step": 12609 + }, + { + "epoch": 0.10946085537451931, + "grad_norm": 0.61328125, + "learning_rate": 0.001934489004302985, + "loss": 0.125, + "step": 12610 + }, + { + "epoch": 0.10946953585472348, + "grad_norm": 0.12060546875, + "learning_rate": 0.0019344778565607123, + "loss": 0.1064, + "step": 12611 + }, + { + "epoch": 0.10947821633492764, + "grad_norm": 0.9609375, + "learning_rate": 0.001934466707905867, + "loss": 0.1484, + "step": 12612 + }, + { + "epoch": 0.10948689681513181, + "grad_norm": 0.341796875, + "learning_rate": 0.0019344555583384615, + "loss": 0.0815, + "step": 12613 + }, + { + "epoch": 0.10949557729533597, + "grad_norm": 0.11767578125, + "learning_rate": 0.001934444407858508, + "loss": 0.1455, + "step": 12614 + }, + { + "epoch": 0.10950425777554014, + "grad_norm": 0.48046875, + "learning_rate": 0.0019344332564660185, + "loss": 0.1045, + "step": 12615 + }, + { + "epoch": 0.1095129382557443, + "grad_norm": 0.466796875, + "learning_rate": 0.0019344221041610054, + "loss": 0.1025, + "step": 12616 + }, + { + "epoch": 0.10952161873594847, + "grad_norm": 0.466796875, + "learning_rate": 0.0019344109509434808, + "loss": 0.1348, + "step": 12617 + }, + { + "epoch": 0.10953029921615264, + "grad_norm": 0.267578125, + "learning_rate": 0.0019343997968134573, + "loss": 0.127, + "step": 12618 + }, + { + "epoch": 0.1095389796963568, + "grad_norm": 0.65625, + "learning_rate": 0.0019343886417709463, + "loss": 0.0996, + "step": 12619 + }, + { + "epoch": 0.10954766017656097, + "grad_norm": 0.42578125, + "learning_rate": 0.0019343774858159603, + "loss": 0.1504, + "step": 12620 + }, + { + "epoch": 0.10955634065676513, + "grad_norm": 1.28125, + "learning_rate": 0.0019343663289485117, + "loss": 0.165, + "step": 12621 + }, + { + "epoch": 0.1095650211369693, + "grad_norm": 0.142578125, + "learning_rate": 0.001934355171168613, + "loss": 0.1836, + "step": 12622 + }, + { + "epoch": 0.10957370161717346, + "grad_norm": 0.455078125, + "learning_rate": 0.0019343440124762756, + "loss": 0.1055, + "step": 12623 + }, + { + "epoch": 0.10958238209737763, + "grad_norm": 0.80078125, + "learning_rate": 0.0019343328528715125, + "loss": 0.1699, + "step": 12624 + }, + { + "epoch": 0.10959106257758179, + "grad_norm": 0.30859375, + "learning_rate": 0.0019343216923543352, + "loss": 0.1309, + "step": 12625 + }, + { + "epoch": 0.10959974305778596, + "grad_norm": 0.1982421875, + "learning_rate": 0.0019343105309247565, + "loss": 0.1182, + "step": 12626 + }, + { + "epoch": 0.10960842353799012, + "grad_norm": 0.52734375, + "learning_rate": 0.0019342993685827878, + "loss": 0.1138, + "step": 12627 + }, + { + "epoch": 0.10961710401819429, + 
"grad_norm": 0.60546875, + "learning_rate": 0.0019342882053284426, + "loss": 0.1348, + "step": 12628 + }, + { + "epoch": 0.10962578449839845, + "grad_norm": 0.400390625, + "learning_rate": 0.0019342770411617317, + "loss": 0.1084, + "step": 12629 + }, + { + "epoch": 0.10963446497860262, + "grad_norm": 0.11279296875, + "learning_rate": 0.0019342658760826683, + "loss": 0.1182, + "step": 12630 + }, + { + "epoch": 0.10964314545880678, + "grad_norm": 0.29296875, + "learning_rate": 0.0019342547100912645, + "loss": 0.1016, + "step": 12631 + }, + { + "epoch": 0.10965182593901095, + "grad_norm": 0.2314453125, + "learning_rate": 0.0019342435431875318, + "loss": 0.1309, + "step": 12632 + }, + { + "epoch": 0.10966050641921511, + "grad_norm": 0.08544921875, + "learning_rate": 0.0019342323753714832, + "loss": 0.1143, + "step": 12633 + }, + { + "epoch": 0.10966918689941928, + "grad_norm": 0.11865234375, + "learning_rate": 0.0019342212066431302, + "loss": 0.1138, + "step": 12634 + }, + { + "epoch": 0.10967786737962344, + "grad_norm": 0.2294921875, + "learning_rate": 0.0019342100370024857, + "loss": 0.1104, + "step": 12635 + }, + { + "epoch": 0.10968654785982761, + "grad_norm": 0.3671875, + "learning_rate": 0.001934198866449562, + "loss": 0.1406, + "step": 12636 + }, + { + "epoch": 0.10969522834003177, + "grad_norm": 0.349609375, + "learning_rate": 0.0019341876949843704, + "loss": 0.0942, + "step": 12637 + }, + { + "epoch": 0.10970390882023594, + "grad_norm": 0.2109375, + "learning_rate": 0.0019341765226069238, + "loss": 0.1689, + "step": 12638 + }, + { + "epoch": 0.1097125893004401, + "grad_norm": 0.412109375, + "learning_rate": 0.0019341653493172345, + "loss": 0.1035, + "step": 12639 + }, + { + "epoch": 0.10972126978064427, + "grad_norm": 0.3046875, + "learning_rate": 0.0019341541751153142, + "loss": 0.1079, + "step": 12640 + }, + { + "epoch": 0.10972995026084843, + "grad_norm": 0.5390625, + "learning_rate": 0.0019341430000011756, + "loss": 0.1533, + "step": 12641 + }, + { + "epoch": 0.1097386307410526, + "grad_norm": 0.26953125, + "learning_rate": 0.0019341318239748308, + "loss": 0.1309, + "step": 12642 + }, + { + "epoch": 0.10974731122125676, + "grad_norm": 0.1005859375, + "learning_rate": 0.001934120647036292, + "loss": 0.1328, + "step": 12643 + }, + { + "epoch": 0.10975599170146093, + "grad_norm": 0.134765625, + "learning_rate": 0.001934109469185571, + "loss": 0.1089, + "step": 12644 + }, + { + "epoch": 0.1097646721816651, + "grad_norm": 0.06982421875, + "learning_rate": 0.0019340982904226811, + "loss": 0.1016, + "step": 12645 + }, + { + "epoch": 0.10977335266186926, + "grad_norm": 0.5703125, + "learning_rate": 0.0019340871107476334, + "loss": 0.1084, + "step": 12646 + }, + { + "epoch": 0.10978203314207342, + "grad_norm": 0.58984375, + "learning_rate": 0.0019340759301604408, + "loss": 0.1094, + "step": 12647 + }, + { + "epoch": 0.10979071362227759, + "grad_norm": 0.1708984375, + "learning_rate": 0.001934064748661115, + "loss": 0.1523, + "step": 12648 + }, + { + "epoch": 0.10979939410248175, + "grad_norm": 0.2421875, + "learning_rate": 0.0019340535662496687, + "loss": 0.104, + "step": 12649 + }, + { + "epoch": 0.10980807458268592, + "grad_norm": 0.33984375, + "learning_rate": 0.001934042382926114, + "loss": 0.1113, + "step": 12650 + }, + { + "epoch": 0.10981675506289008, + "grad_norm": 0.375, + "learning_rate": 0.0019340311986904629, + "loss": 0.1211, + "step": 12651 + }, + { + "epoch": 0.10982543554309425, + "grad_norm": 0.1259765625, + "learning_rate": 0.001934020013542728, + "loss": 0.1289, + 
"step": 12652 + }, + { + "epoch": 0.10983411602329841, + "grad_norm": 0.2734375, + "learning_rate": 0.001934008827482921, + "loss": 0.1299, + "step": 12653 + }, + { + "epoch": 0.10984279650350258, + "grad_norm": 0.09375, + "learning_rate": 0.0019339976405110548, + "loss": 0.0815, + "step": 12654 + }, + { + "epoch": 0.10985147698370674, + "grad_norm": 0.216796875, + "learning_rate": 0.0019339864526271414, + "loss": 0.1191, + "step": 12655 + }, + { + "epoch": 0.10986015746391091, + "grad_norm": 0.24609375, + "learning_rate": 0.0019339752638311929, + "loss": 0.0928, + "step": 12656 + }, + { + "epoch": 0.10986883794411507, + "grad_norm": 0.23828125, + "learning_rate": 0.0019339640741232213, + "loss": 0.1387, + "step": 12657 + }, + { + "epoch": 0.10987751842431924, + "grad_norm": 1.1171875, + "learning_rate": 0.0019339528835032392, + "loss": 0.127, + "step": 12658 + }, + { + "epoch": 0.1098861989045234, + "grad_norm": 0.1748046875, + "learning_rate": 0.001933941691971259, + "loss": 0.1338, + "step": 12659 + }, + { + "epoch": 0.10989487938472756, + "grad_norm": 0.140625, + "learning_rate": 0.0019339304995272923, + "loss": 0.1104, + "step": 12660 + }, + { + "epoch": 0.10990355986493172, + "grad_norm": 0.08251953125, + "learning_rate": 0.0019339193061713522, + "loss": 0.1226, + "step": 12661 + }, + { + "epoch": 0.10991224034513589, + "grad_norm": 0.1962890625, + "learning_rate": 0.00193390811190345, + "loss": 0.0933, + "step": 12662 + }, + { + "epoch": 0.10992092082534005, + "grad_norm": 0.11572265625, + "learning_rate": 0.001933896916723599, + "loss": 0.1299, + "step": 12663 + }, + { + "epoch": 0.10992960130554422, + "grad_norm": 0.08447265625, + "learning_rate": 0.0019338857206318103, + "loss": 0.1016, + "step": 12664 + }, + { + "epoch": 0.10993828178574838, + "grad_norm": 0.158203125, + "learning_rate": 0.001933874523628097, + "loss": 0.1367, + "step": 12665 + }, + { + "epoch": 0.10994696226595255, + "grad_norm": 0.54296875, + "learning_rate": 0.001933863325712471, + "loss": 0.084, + "step": 12666 + }, + { + "epoch": 0.10995564274615671, + "grad_norm": 1.2890625, + "learning_rate": 0.0019338521268849444, + "loss": 0.1445, + "step": 12667 + }, + { + "epoch": 0.10996432322636088, + "grad_norm": 0.2041015625, + "learning_rate": 0.0019338409271455299, + "loss": 0.1162, + "step": 12668 + }, + { + "epoch": 0.10997300370656504, + "grad_norm": 0.166015625, + "learning_rate": 0.0019338297264942393, + "loss": 0.1182, + "step": 12669 + }, + { + "epoch": 0.10998168418676921, + "grad_norm": 0.474609375, + "learning_rate": 0.0019338185249310853, + "loss": 0.1797, + "step": 12670 + }, + { + "epoch": 0.10999036466697337, + "grad_norm": 0.35546875, + "learning_rate": 0.00193380732245608, + "loss": 0.1055, + "step": 12671 + }, + { + "epoch": 0.10999904514717754, + "grad_norm": 0.8359375, + "learning_rate": 0.001933796119069235, + "loss": 0.1289, + "step": 12672 + }, + { + "epoch": 0.1100077256273817, + "grad_norm": 0.1533203125, + "learning_rate": 0.0019337849147705634, + "loss": 0.0869, + "step": 12673 + }, + { + "epoch": 0.11001640610758587, + "grad_norm": 0.21484375, + "learning_rate": 0.001933773709560077, + "loss": 0.1074, + "step": 12674 + }, + { + "epoch": 0.11002508658779003, + "grad_norm": 0.328125, + "learning_rate": 0.0019337625034377883, + "loss": 0.1074, + "step": 12675 + }, + { + "epoch": 0.1100337670679942, + "grad_norm": 0.427734375, + "learning_rate": 0.0019337512964037096, + "loss": 0.1016, + "step": 12676 + }, + { + "epoch": 0.11004244754819836, + "grad_norm": 0.361328125, + "learning_rate": 
0.0019337400884578528, + "loss": 0.1328, + "step": 12677 + }, + { + "epoch": 0.11005112802840253, + "grad_norm": 0.43359375, + "learning_rate": 0.0019337288796002306, + "loss": 0.1484, + "step": 12678 + }, + { + "epoch": 0.1100598085086067, + "grad_norm": 0.1484375, + "learning_rate": 0.0019337176698308548, + "loss": 0.0928, + "step": 12679 + }, + { + "epoch": 0.11006848898881086, + "grad_norm": 0.375, + "learning_rate": 0.001933706459149738, + "loss": 0.125, + "step": 12680 + }, + { + "epoch": 0.11007716946901502, + "grad_norm": 0.51171875, + "learning_rate": 0.0019336952475568923, + "loss": 0.1328, + "step": 12681 + }, + { + "epoch": 0.11008584994921919, + "grad_norm": 0.1875, + "learning_rate": 0.00193368403505233, + "loss": 0.0928, + "step": 12682 + }, + { + "epoch": 0.11009453042942335, + "grad_norm": 0.427734375, + "learning_rate": 0.0019336728216360632, + "loss": 0.1104, + "step": 12683 + }, + { + "epoch": 0.11010321090962752, + "grad_norm": 0.2177734375, + "learning_rate": 0.0019336616073081046, + "loss": 0.1016, + "step": 12684 + }, + { + "epoch": 0.11011189138983168, + "grad_norm": 0.1728515625, + "learning_rate": 0.001933650392068466, + "loss": 0.1084, + "step": 12685 + }, + { + "epoch": 0.11012057187003585, + "grad_norm": 0.1103515625, + "learning_rate": 0.00193363917591716, + "loss": 0.1523, + "step": 12686 + }, + { + "epoch": 0.11012925235024001, + "grad_norm": 0.11962890625, + "learning_rate": 0.0019336279588541988, + "loss": 0.0903, + "step": 12687 + }, + { + "epoch": 0.11013793283044418, + "grad_norm": 0.3203125, + "learning_rate": 0.0019336167408795944, + "loss": 0.1069, + "step": 12688 + }, + { + "epoch": 0.11014661331064834, + "grad_norm": 0.1865234375, + "learning_rate": 0.0019336055219933593, + "loss": 0.1445, + "step": 12689 + }, + { + "epoch": 0.11015529379085251, + "grad_norm": 0.07666015625, + "learning_rate": 0.0019335943021955057, + "loss": 0.0991, + "step": 12690 + }, + { + "epoch": 0.11016397427105667, + "grad_norm": 0.154296875, + "learning_rate": 0.0019335830814860458, + "loss": 0.1436, + "step": 12691 + }, + { + "epoch": 0.11017265475126084, + "grad_norm": 0.21484375, + "learning_rate": 0.0019335718598649922, + "loss": 0.125, + "step": 12692 + }, + { + "epoch": 0.110181335231465, + "grad_norm": 0.58984375, + "learning_rate": 0.0019335606373323568, + "loss": 0.1416, + "step": 12693 + }, + { + "epoch": 0.11019001571166917, + "grad_norm": 0.65625, + "learning_rate": 0.0019335494138881522, + "loss": 0.1172, + "step": 12694 + }, + { + "epoch": 0.11019869619187334, + "grad_norm": 0.28125, + "learning_rate": 0.0019335381895323903, + "loss": 0.1006, + "step": 12695 + }, + { + "epoch": 0.1102073766720775, + "grad_norm": 0.353515625, + "learning_rate": 0.0019335269642650837, + "loss": 0.1621, + "step": 12696 + }, + { + "epoch": 0.11021605715228167, + "grad_norm": 0.353515625, + "learning_rate": 0.0019335157380862443, + "loss": 0.1387, + "step": 12697 + }, + { + "epoch": 0.11022473763248583, + "grad_norm": 0.154296875, + "learning_rate": 0.0019335045109958847, + "loss": 0.0771, + "step": 12698 + }, + { + "epoch": 0.11023341811269, + "grad_norm": 0.50390625, + "learning_rate": 0.001933493282994017, + "loss": 0.125, + "step": 12699 + }, + { + "epoch": 0.11024209859289416, + "grad_norm": 0.234375, + "learning_rate": 0.0019334820540806538, + "loss": 0.0957, + "step": 12700 + }, + { + "epoch": 0.11025077907309833, + "grad_norm": 0.341796875, + "learning_rate": 0.001933470824255807, + "loss": 0.0962, + "step": 12701 + }, + { + "epoch": 0.11025945955330249, + "grad_norm": 
0.177734375, + "learning_rate": 0.0019334595935194892, + "loss": 0.1221, + "step": 12702 + }, + { + "epoch": 0.11026814003350666, + "grad_norm": 0.1171875, + "learning_rate": 0.0019334483618717122, + "loss": 0.0952, + "step": 12703 + }, + { + "epoch": 0.11027682051371082, + "grad_norm": 0.30859375, + "learning_rate": 0.0019334371293124888, + "loss": 0.1523, + "step": 12704 + }, + { + "epoch": 0.11028550099391499, + "grad_norm": 0.380859375, + "learning_rate": 0.0019334258958418312, + "loss": 0.084, + "step": 12705 + }, + { + "epoch": 0.11029418147411915, + "grad_norm": 0.353515625, + "learning_rate": 0.0019334146614597512, + "loss": 0.1123, + "step": 12706 + }, + { + "epoch": 0.11030286195432332, + "grad_norm": 0.1005859375, + "learning_rate": 0.0019334034261662618, + "loss": 0.1133, + "step": 12707 + }, + { + "epoch": 0.11031154243452748, + "grad_norm": 1.09375, + "learning_rate": 0.0019333921899613747, + "loss": 0.0977, + "step": 12708 + }, + { + "epoch": 0.11032022291473165, + "grad_norm": 0.10791015625, + "learning_rate": 0.0019333809528451027, + "loss": 0.0898, + "step": 12709 + }, + { + "epoch": 0.11032890339493581, + "grad_norm": 0.68359375, + "learning_rate": 0.0019333697148174575, + "loss": 0.1138, + "step": 12710 + }, + { + "epoch": 0.11033758387513998, + "grad_norm": 0.21484375, + "learning_rate": 0.001933358475878452, + "loss": 0.0781, + "step": 12711 + }, + { + "epoch": 0.11034626435534414, + "grad_norm": 0.318359375, + "learning_rate": 0.001933347236028098, + "loss": 0.1006, + "step": 12712 + }, + { + "epoch": 0.11035494483554831, + "grad_norm": 0.369140625, + "learning_rate": 0.001933335995266408, + "loss": 0.0815, + "step": 12713 + }, + { + "epoch": 0.11036362531575247, + "grad_norm": 0.08447265625, + "learning_rate": 0.0019333247535933942, + "loss": 0.1001, + "step": 12714 + }, + { + "epoch": 0.11037230579595664, + "grad_norm": 0.50390625, + "learning_rate": 0.0019333135110090691, + "loss": 0.0938, + "step": 12715 + }, + { + "epoch": 0.1103809862761608, + "grad_norm": 0.1025390625, + "learning_rate": 0.0019333022675134448, + "loss": 0.126, + "step": 12716 + }, + { + "epoch": 0.11038966675636497, + "grad_norm": 0.1484375, + "learning_rate": 0.0019332910231065338, + "loss": 0.1328, + "step": 12717 + }, + { + "epoch": 0.11039834723656913, + "grad_norm": 0.90234375, + "learning_rate": 0.0019332797777883483, + "loss": 0.1035, + "step": 12718 + }, + { + "epoch": 0.1104070277167733, + "grad_norm": 0.318359375, + "learning_rate": 0.0019332685315589003, + "loss": 0.1348, + "step": 12719 + }, + { + "epoch": 0.11041570819697746, + "grad_norm": 0.95703125, + "learning_rate": 0.0019332572844182022, + "loss": 0.1914, + "step": 12720 + }, + { + "epoch": 0.11042438867718163, + "grad_norm": 0.1474609375, + "learning_rate": 0.0019332460363662668, + "loss": 0.125, + "step": 12721 + }, + { + "epoch": 0.11043306915738578, + "grad_norm": 0.1630859375, + "learning_rate": 0.0019332347874031063, + "loss": 0.0996, + "step": 12722 + }, + { + "epoch": 0.11044174963758994, + "grad_norm": 0.6171875, + "learning_rate": 0.0019332235375287322, + "loss": 0.0957, + "step": 12723 + }, + { + "epoch": 0.11045043011779411, + "grad_norm": 0.34765625, + "learning_rate": 0.0019332122867431576, + "loss": 0.1445, + "step": 12724 + }, + { + "epoch": 0.11045911059799828, + "grad_norm": 0.1982421875, + "learning_rate": 0.0019332010350463944, + "loss": 0.1094, + "step": 12725 + }, + { + "epoch": 0.11046779107820244, + "grad_norm": 0.271484375, + "learning_rate": 0.0019331897824384553, + "loss": 0.1055, + "step": 12726 + 
}, + { + "epoch": 0.1104764715584066, + "grad_norm": 0.296875, + "learning_rate": 0.0019331785289193521, + "loss": 0.1074, + "step": 12727 + }, + { + "epoch": 0.11048515203861077, + "grad_norm": 0.734375, + "learning_rate": 0.0019331672744890979, + "loss": 0.1182, + "step": 12728 + }, + { + "epoch": 0.11049383251881494, + "grad_norm": 0.380859375, + "learning_rate": 0.001933156019147704, + "loss": 0.1758, + "step": 12729 + }, + { + "epoch": 0.1105025129990191, + "grad_norm": 0.326171875, + "learning_rate": 0.0019331447628951834, + "loss": 0.1211, + "step": 12730 + }, + { + "epoch": 0.11051119347922327, + "grad_norm": 0.333984375, + "learning_rate": 0.001933133505731548, + "loss": 0.1396, + "step": 12731 + }, + { + "epoch": 0.11051987395942743, + "grad_norm": 0.37109375, + "learning_rate": 0.0019331222476568106, + "loss": 0.1006, + "step": 12732 + }, + { + "epoch": 0.1105285544396316, + "grad_norm": 0.640625, + "learning_rate": 0.001933110988670983, + "loss": 0.1592, + "step": 12733 + }, + { + "epoch": 0.11053723491983576, + "grad_norm": 0.078125, + "learning_rate": 0.0019330997287740777, + "loss": 0.1426, + "step": 12734 + }, + { + "epoch": 0.11054591540003993, + "grad_norm": 0.89453125, + "learning_rate": 0.0019330884679661072, + "loss": 0.0898, + "step": 12735 + }, + { + "epoch": 0.11055459588024409, + "grad_norm": 0.412109375, + "learning_rate": 0.001933077206247084, + "loss": 0.0815, + "step": 12736 + }, + { + "epoch": 0.11056327636044826, + "grad_norm": 0.61328125, + "learning_rate": 0.0019330659436170195, + "loss": 0.0889, + "step": 12737 + }, + { + "epoch": 0.11057195684065242, + "grad_norm": 0.1884765625, + "learning_rate": 0.0019330546800759268, + "loss": 0.1006, + "step": 12738 + }, + { + "epoch": 0.11058063732085659, + "grad_norm": 0.1826171875, + "learning_rate": 0.0019330434156238179, + "loss": 0.1367, + "step": 12739 + }, + { + "epoch": 0.11058931780106075, + "grad_norm": 0.318359375, + "learning_rate": 0.0019330321502607055, + "loss": 0.1318, + "step": 12740 + }, + { + "epoch": 0.11059799828126492, + "grad_norm": 0.5546875, + "learning_rate": 0.0019330208839866017, + "loss": 0.1172, + "step": 12741 + }, + { + "epoch": 0.11060667876146908, + "grad_norm": 0.18359375, + "learning_rate": 0.0019330096168015188, + "loss": 0.1094, + "step": 12742 + }, + { + "epoch": 0.11061535924167325, + "grad_norm": 0.353515625, + "learning_rate": 0.0019329983487054686, + "loss": 0.1348, + "step": 12743 + }, + { + "epoch": 0.11062403972187741, + "grad_norm": 0.51171875, + "learning_rate": 0.0019329870796984644, + "loss": 0.1436, + "step": 12744 + }, + { + "epoch": 0.11063272020208158, + "grad_norm": 0.66015625, + "learning_rate": 0.001932975809780518, + "loss": 0.1504, + "step": 12745 + }, + { + "epoch": 0.11064140068228574, + "grad_norm": 0.1767578125, + "learning_rate": 0.0019329645389516415, + "loss": 0.1021, + "step": 12746 + }, + { + "epoch": 0.11065008116248991, + "grad_norm": 0.32421875, + "learning_rate": 0.0019329532672118475, + "loss": 0.0967, + "step": 12747 + }, + { + "epoch": 0.11065876164269407, + "grad_norm": 1.2734375, + "learning_rate": 0.0019329419945611486, + "loss": 0.106, + "step": 12748 + }, + { + "epoch": 0.11066744212289824, + "grad_norm": 0.65625, + "learning_rate": 0.0019329307209995567, + "loss": 0.2051, + "step": 12749 + }, + { + "epoch": 0.1106761226031024, + "grad_norm": 0.439453125, + "learning_rate": 0.0019329194465270844, + "loss": 0.1016, + "step": 12750 + }, + { + "epoch": 0.11068480308330657, + "grad_norm": 0.208984375, + "learning_rate": 0.0019329081711437438, 
+ "loss": 0.1113, + "step": 12751 + }, + { + "epoch": 0.11069348356351073, + "grad_norm": 1.0546875, + "learning_rate": 0.0019328968948495472, + "loss": 0.1309, + "step": 12752 + }, + { + "epoch": 0.1107021640437149, + "grad_norm": 0.330078125, + "learning_rate": 0.0019328856176445072, + "loss": 0.1074, + "step": 12753 + }, + { + "epoch": 0.11071084452391906, + "grad_norm": 0.11474609375, + "learning_rate": 0.0019328743395286361, + "loss": 0.1182, + "step": 12754 + }, + { + "epoch": 0.11071952500412323, + "grad_norm": 0.15625, + "learning_rate": 0.001932863060501946, + "loss": 0.1074, + "step": 12755 + }, + { + "epoch": 0.1107282054843274, + "grad_norm": 0.2451171875, + "learning_rate": 0.0019328517805644496, + "loss": 0.1436, + "step": 12756 + }, + { + "epoch": 0.11073688596453156, + "grad_norm": 0.126953125, + "learning_rate": 0.0019328404997161588, + "loss": 0.1611, + "step": 12757 + }, + { + "epoch": 0.11074556644473572, + "grad_norm": 0.10302734375, + "learning_rate": 0.001932829217957086, + "loss": 0.123, + "step": 12758 + }, + { + "epoch": 0.11075424692493989, + "grad_norm": 0.11669921875, + "learning_rate": 0.001932817935287244, + "loss": 0.1289, + "step": 12759 + }, + { + "epoch": 0.11076292740514405, + "grad_norm": 0.7265625, + "learning_rate": 0.0019328066517066444, + "loss": 0.1055, + "step": 12760 + }, + { + "epoch": 0.11077160788534822, + "grad_norm": 0.58203125, + "learning_rate": 0.0019327953672153004, + "loss": 0.1309, + "step": 12761 + }, + { + "epoch": 0.11078028836555238, + "grad_norm": 0.412109375, + "learning_rate": 0.0019327840818132238, + "loss": 0.125, + "step": 12762 + }, + { + "epoch": 0.11078896884575655, + "grad_norm": 0.1982421875, + "learning_rate": 0.0019327727955004272, + "loss": 0.1396, + "step": 12763 + }, + { + "epoch": 0.11079764932596071, + "grad_norm": 0.6484375, + "learning_rate": 0.0019327615082769226, + "loss": 0.1016, + "step": 12764 + }, + { + "epoch": 0.11080632980616488, + "grad_norm": 0.29296875, + "learning_rate": 0.0019327502201427222, + "loss": 0.1562, + "step": 12765 + }, + { + "epoch": 0.11081501028636905, + "grad_norm": 0.65234375, + "learning_rate": 0.001932738931097839, + "loss": 0.0967, + "step": 12766 + }, + { + "epoch": 0.11082369076657321, + "grad_norm": 0.59765625, + "learning_rate": 0.0019327276411422848, + "loss": 0.126, + "step": 12767 + }, + { + "epoch": 0.11083237124677738, + "grad_norm": 0.33203125, + "learning_rate": 0.0019327163502760728, + "loss": 0.1289, + "step": 12768 + }, + { + "epoch": 0.11084105172698154, + "grad_norm": 0.14453125, + "learning_rate": 0.0019327050584992138, + "loss": 0.1689, + "step": 12769 + }, + { + "epoch": 0.1108497322071857, + "grad_norm": 0.279296875, + "learning_rate": 0.0019326937658117216, + "loss": 0.1328, + "step": 12770 + }, + { + "epoch": 0.11085841268738987, + "grad_norm": 0.53125, + "learning_rate": 0.001932682472213608, + "loss": 0.1357, + "step": 12771 + }, + { + "epoch": 0.11086709316759404, + "grad_norm": 0.341796875, + "learning_rate": 0.0019326711777048853, + "loss": 0.123, + "step": 12772 + }, + { + "epoch": 0.1108757736477982, + "grad_norm": 0.380859375, + "learning_rate": 0.001932659882285566, + "loss": 0.1035, + "step": 12773 + }, + { + "epoch": 0.11088445412800237, + "grad_norm": 0.0849609375, + "learning_rate": 0.001932648585955662, + "loss": 0.0933, + "step": 12774 + }, + { + "epoch": 0.11089313460820653, + "grad_norm": 0.2314453125, + "learning_rate": 0.0019326372887151866, + "loss": 0.1504, + "step": 12775 + }, + { + "epoch": 0.1109018150884107, + "grad_norm": 
0.48046875, + "learning_rate": 0.001932625990564151, + "loss": 0.1011, + "step": 12776 + }, + { + "epoch": 0.11091049556861486, + "grad_norm": 0.1005859375, + "learning_rate": 0.0019326146915025683, + "loss": 0.0928, + "step": 12777 + }, + { + "epoch": 0.11091917604881903, + "grad_norm": 0.435546875, + "learning_rate": 0.0019326033915304508, + "loss": 0.126, + "step": 12778 + }, + { + "epoch": 0.11092785652902319, + "grad_norm": 0.10302734375, + "learning_rate": 0.0019325920906478108, + "loss": 0.1084, + "step": 12779 + }, + { + "epoch": 0.11093653700922736, + "grad_norm": 0.216796875, + "learning_rate": 0.0019325807888546606, + "loss": 0.0869, + "step": 12780 + }, + { + "epoch": 0.11094521748943152, + "grad_norm": 0.671875, + "learning_rate": 0.0019325694861510121, + "loss": 0.0991, + "step": 12781 + }, + { + "epoch": 0.11095389796963569, + "grad_norm": 0.58203125, + "learning_rate": 0.0019325581825368789, + "loss": 0.1279, + "step": 12782 + }, + { + "epoch": 0.11096257844983984, + "grad_norm": 0.1220703125, + "learning_rate": 0.0019325468780122719, + "loss": 0.1025, + "step": 12783 + }, + { + "epoch": 0.110971258930044, + "grad_norm": 0.46484375, + "learning_rate": 0.0019325355725772042, + "loss": 0.1035, + "step": 12784 + }, + { + "epoch": 0.11097993941024817, + "grad_norm": 0.091796875, + "learning_rate": 0.0019325242662316885, + "loss": 0.0981, + "step": 12785 + }, + { + "epoch": 0.11098861989045233, + "grad_norm": 0.1259765625, + "learning_rate": 0.0019325129589757367, + "loss": 0.1318, + "step": 12786 + }, + { + "epoch": 0.1109973003706565, + "grad_norm": 0.310546875, + "learning_rate": 0.0019325016508093611, + "loss": 0.126, + "step": 12787 + }, + { + "epoch": 0.11100598085086066, + "grad_norm": 0.11767578125, + "learning_rate": 0.0019324903417325741, + "loss": 0.1152, + "step": 12788 + }, + { + "epoch": 0.11101466133106483, + "grad_norm": 0.2734375, + "learning_rate": 0.0019324790317453882, + "loss": 0.0981, + "step": 12789 + }, + { + "epoch": 0.111023341811269, + "grad_norm": 0.734375, + "learning_rate": 0.001932467720847816, + "loss": 0.0957, + "step": 12790 + }, + { + "epoch": 0.11103202229147316, + "grad_norm": 0.142578125, + "learning_rate": 0.0019324564090398692, + "loss": 0.1299, + "step": 12791 + }, + { + "epoch": 0.11104070277167732, + "grad_norm": 1.046875, + "learning_rate": 0.001932445096321561, + "loss": 0.123, + "step": 12792 + }, + { + "epoch": 0.11104938325188149, + "grad_norm": 0.125, + "learning_rate": 0.001932433782692903, + "loss": 0.0698, + "step": 12793 + }, + { + "epoch": 0.11105806373208565, + "grad_norm": 0.1484375, + "learning_rate": 0.0019324224681539083, + "loss": 0.1191, + "step": 12794 + }, + { + "epoch": 0.11106674421228982, + "grad_norm": 0.068359375, + "learning_rate": 0.0019324111527045885, + "loss": 0.084, + "step": 12795 + }, + { + "epoch": 0.11107542469249398, + "grad_norm": 0.5078125, + "learning_rate": 0.0019323998363449564, + "loss": 0.1094, + "step": 12796 + }, + { + "epoch": 0.11108410517269815, + "grad_norm": 0.41796875, + "learning_rate": 0.0019323885190750247, + "loss": 0.1396, + "step": 12797 + }, + { + "epoch": 0.11109278565290231, + "grad_norm": 0.087890625, + "learning_rate": 0.001932377200894805, + "loss": 0.1133, + "step": 12798 + }, + { + "epoch": 0.11110146613310648, + "grad_norm": 0.65625, + "learning_rate": 0.0019323658818043107, + "loss": 0.1426, + "step": 12799 + }, + { + "epoch": 0.11111014661331065, + "grad_norm": 0.0947265625, + "learning_rate": 0.001932354561803553, + "loss": 0.0811, + "step": 12800 + }, + { + "epoch": 
0.11111882709351481, + "grad_norm": 0.60546875, + "learning_rate": 0.001932343240892545, + "loss": 0.1348, + "step": 12801 + }, + { + "epoch": 0.11112750757371898, + "grad_norm": 0.625, + "learning_rate": 0.0019323319190712991, + "loss": 0.0996, + "step": 12802 + }, + { + "epoch": 0.11113618805392314, + "grad_norm": 0.1591796875, + "learning_rate": 0.0019323205963398273, + "loss": 0.1484, + "step": 12803 + }, + { + "epoch": 0.1111448685341273, + "grad_norm": 0.353515625, + "learning_rate": 0.0019323092726981424, + "loss": 0.1494, + "step": 12804 + }, + { + "epoch": 0.11115354901433147, + "grad_norm": 0.287109375, + "learning_rate": 0.0019322979481462563, + "loss": 0.1475, + "step": 12805 + }, + { + "epoch": 0.11116222949453564, + "grad_norm": 0.1640625, + "learning_rate": 0.001932286622684182, + "loss": 0.125, + "step": 12806 + }, + { + "epoch": 0.1111709099747398, + "grad_norm": 0.130859375, + "learning_rate": 0.0019322752963119314, + "loss": 0.0874, + "step": 12807 + }, + { + "epoch": 0.11117959045494397, + "grad_norm": 0.5546875, + "learning_rate": 0.001932263969029517, + "loss": 0.1099, + "step": 12808 + }, + { + "epoch": 0.11118827093514813, + "grad_norm": 0.1748046875, + "learning_rate": 0.0019322526408369514, + "loss": 0.126, + "step": 12809 + }, + { + "epoch": 0.1111969514153523, + "grad_norm": 0.453125, + "learning_rate": 0.0019322413117342466, + "loss": 0.1289, + "step": 12810 + }, + { + "epoch": 0.11120563189555646, + "grad_norm": 0.5546875, + "learning_rate": 0.0019322299817214152, + "loss": 0.1191, + "step": 12811 + }, + { + "epoch": 0.11121431237576063, + "grad_norm": 0.11865234375, + "learning_rate": 0.0019322186507984699, + "loss": 0.1152, + "step": 12812 + }, + { + "epoch": 0.11122299285596479, + "grad_norm": 0.7421875, + "learning_rate": 0.0019322073189654222, + "loss": 0.1484, + "step": 12813 + }, + { + "epoch": 0.11123167333616896, + "grad_norm": 0.126953125, + "learning_rate": 0.0019321959862222857, + "loss": 0.123, + "step": 12814 + }, + { + "epoch": 0.11124035381637312, + "grad_norm": 0.93359375, + "learning_rate": 0.0019321846525690721, + "loss": 0.1602, + "step": 12815 + }, + { + "epoch": 0.11124903429657729, + "grad_norm": 0.392578125, + "learning_rate": 0.0019321733180057937, + "loss": 0.1699, + "step": 12816 + }, + { + "epoch": 0.11125771477678145, + "grad_norm": 0.71875, + "learning_rate": 0.001932161982532463, + "loss": 0.1699, + "step": 12817 + }, + { + "epoch": 0.11126639525698562, + "grad_norm": 0.09765625, + "learning_rate": 0.0019321506461490924, + "loss": 0.0801, + "step": 12818 + }, + { + "epoch": 0.11127507573718978, + "grad_norm": 0.2158203125, + "learning_rate": 0.0019321393088556945, + "loss": 0.0879, + "step": 12819 + }, + { + "epoch": 0.11128375621739395, + "grad_norm": 0.41015625, + "learning_rate": 0.0019321279706522817, + "loss": 0.1235, + "step": 12820 + }, + { + "epoch": 0.11129243669759811, + "grad_norm": 0.08447265625, + "learning_rate": 0.001932116631538866, + "loss": 0.1094, + "step": 12821 + }, + { + "epoch": 0.11130111717780228, + "grad_norm": 0.203125, + "learning_rate": 0.0019321052915154601, + "loss": 0.1309, + "step": 12822 + }, + { + "epoch": 0.11130979765800644, + "grad_norm": 0.0908203125, + "learning_rate": 0.0019320939505820766, + "loss": 0.1348, + "step": 12823 + }, + { + "epoch": 0.11131847813821061, + "grad_norm": 0.60546875, + "learning_rate": 0.0019320826087387273, + "loss": 0.123, + "step": 12824 + }, + { + "epoch": 0.11132715861841477, + "grad_norm": 0.25, + "learning_rate": 0.0019320712659854251, + "loss": 0.1162, + 
"step": 12825 + }, + { + "epoch": 0.11133583909861894, + "grad_norm": 0.328125, + "learning_rate": 0.0019320599223221825, + "loss": 0.1328, + "step": 12826 + }, + { + "epoch": 0.1113445195788231, + "grad_norm": 0.62890625, + "learning_rate": 0.0019320485777490112, + "loss": 0.1582, + "step": 12827 + }, + { + "epoch": 0.11135320005902727, + "grad_norm": 0.35546875, + "learning_rate": 0.0019320372322659245, + "loss": 0.0933, + "step": 12828 + }, + { + "epoch": 0.11136188053923143, + "grad_norm": 0.15625, + "learning_rate": 0.0019320258858729342, + "loss": 0.1387, + "step": 12829 + }, + { + "epoch": 0.1113705610194356, + "grad_norm": 0.369140625, + "learning_rate": 0.0019320145385700528, + "loss": 0.1172, + "step": 12830 + }, + { + "epoch": 0.11137924149963976, + "grad_norm": 0.5703125, + "learning_rate": 0.001932003190357293, + "loss": 0.1318, + "step": 12831 + }, + { + "epoch": 0.11138792197984393, + "grad_norm": 0.259765625, + "learning_rate": 0.0019319918412346669, + "loss": 0.1162, + "step": 12832 + }, + { + "epoch": 0.1113966024600481, + "grad_norm": 0.74609375, + "learning_rate": 0.001931980491202187, + "loss": 0.1416, + "step": 12833 + }, + { + "epoch": 0.11140528294025226, + "grad_norm": 0.111328125, + "learning_rate": 0.0019319691402598657, + "loss": 0.1123, + "step": 12834 + }, + { + "epoch": 0.11141396342045642, + "grad_norm": 0.28125, + "learning_rate": 0.0019319577884077156, + "loss": 0.1016, + "step": 12835 + }, + { + "epoch": 0.11142264390066059, + "grad_norm": 0.140625, + "learning_rate": 0.001931946435645749, + "loss": 0.1035, + "step": 12836 + }, + { + "epoch": 0.11143132438086475, + "grad_norm": 0.5078125, + "learning_rate": 0.0019319350819739782, + "loss": 0.1289, + "step": 12837 + }, + { + "epoch": 0.11144000486106892, + "grad_norm": 0.76171875, + "learning_rate": 0.0019319237273924155, + "loss": 0.1768, + "step": 12838 + }, + { + "epoch": 0.11144868534127308, + "grad_norm": 0.53125, + "learning_rate": 0.0019319123719010738, + "loss": 0.1279, + "step": 12839 + }, + { + "epoch": 0.11145736582147725, + "grad_norm": 0.353515625, + "learning_rate": 0.001931901015499965, + "loss": 0.1084, + "step": 12840 + }, + { + "epoch": 0.11146604630168142, + "grad_norm": 0.1845703125, + "learning_rate": 0.001931889658189102, + "loss": 0.1177, + "step": 12841 + }, + { + "epoch": 0.11147472678188558, + "grad_norm": 0.12255859375, + "learning_rate": 0.0019318782999684968, + "loss": 0.0928, + "step": 12842 + }, + { + "epoch": 0.11148340726208975, + "grad_norm": 0.2255859375, + "learning_rate": 0.001931866940838162, + "loss": 0.124, + "step": 12843 + }, + { + "epoch": 0.11149208774229391, + "grad_norm": 0.12109375, + "learning_rate": 0.00193185558079811, + "loss": 0.1016, + "step": 12844 + }, + { + "epoch": 0.11150076822249806, + "grad_norm": 0.1513671875, + "learning_rate": 0.0019318442198483533, + "loss": 0.1074, + "step": 12845 + }, + { + "epoch": 0.11150944870270223, + "grad_norm": 1.046875, + "learning_rate": 0.0019318328579889042, + "loss": 0.1309, + "step": 12846 + }, + { + "epoch": 0.11151812918290639, + "grad_norm": 0.2265625, + "learning_rate": 0.0019318214952197751, + "loss": 0.2461, + "step": 12847 + }, + { + "epoch": 0.11152680966311056, + "grad_norm": 0.1357421875, + "learning_rate": 0.0019318101315409786, + "loss": 0.082, + "step": 12848 + }, + { + "epoch": 0.11153549014331472, + "grad_norm": 0.1162109375, + "learning_rate": 0.0019317987669525273, + "loss": 0.1318, + "step": 12849 + }, + { + "epoch": 0.11154417062351889, + "grad_norm": 0.2734375, + "learning_rate": 
0.001931787401454433, + "loss": 0.1025, + "step": 12850 + }, + { + "epoch": 0.11155285110372305, + "grad_norm": 0.248046875, + "learning_rate": 0.0019317760350467087, + "loss": 0.0898, + "step": 12851 + }, + { + "epoch": 0.11156153158392722, + "grad_norm": 0.62109375, + "learning_rate": 0.0019317646677293665, + "loss": 0.1201, + "step": 12852 + }, + { + "epoch": 0.11157021206413138, + "grad_norm": 0.53125, + "learning_rate": 0.001931753299502419, + "loss": 0.1416, + "step": 12853 + }, + { + "epoch": 0.11157889254433555, + "grad_norm": 0.328125, + "learning_rate": 0.0019317419303658786, + "loss": 0.0977, + "step": 12854 + }, + { + "epoch": 0.11158757302453971, + "grad_norm": 0.11572265625, + "learning_rate": 0.0019317305603197576, + "loss": 0.0903, + "step": 12855 + }, + { + "epoch": 0.11159625350474388, + "grad_norm": 0.37890625, + "learning_rate": 0.0019317191893640688, + "loss": 0.0918, + "step": 12856 + }, + { + "epoch": 0.11160493398494804, + "grad_norm": 0.2412109375, + "learning_rate": 0.001931707817498824, + "loss": 0.1045, + "step": 12857 + }, + { + "epoch": 0.11161361446515221, + "grad_norm": 5.75, + "learning_rate": 0.0019316964447240366, + "loss": 0.4863, + "step": 12858 + }, + { + "epoch": 0.11162229494535637, + "grad_norm": 0.10693359375, + "learning_rate": 0.001931685071039718, + "loss": 0.1543, + "step": 12859 + }, + { + "epoch": 0.11163097542556054, + "grad_norm": 0.26953125, + "learning_rate": 0.0019316736964458815, + "loss": 0.1113, + "step": 12860 + }, + { + "epoch": 0.1116396559057647, + "grad_norm": 0.27734375, + "learning_rate": 0.0019316623209425386, + "loss": 0.0903, + "step": 12861 + }, + { + "epoch": 0.11164833638596887, + "grad_norm": 0.82421875, + "learning_rate": 0.0019316509445297027, + "loss": 0.124, + "step": 12862 + }, + { + "epoch": 0.11165701686617303, + "grad_norm": 0.201171875, + "learning_rate": 0.0019316395672073856, + "loss": 0.1309, + "step": 12863 + }, + { + "epoch": 0.1116656973463772, + "grad_norm": 0.1416015625, + "learning_rate": 0.0019316281889756003, + "loss": 0.1064, + "step": 12864 + }, + { + "epoch": 0.11167437782658136, + "grad_norm": 0.373046875, + "learning_rate": 0.0019316168098343586, + "loss": 0.1162, + "step": 12865 + }, + { + "epoch": 0.11168305830678553, + "grad_norm": 0.30078125, + "learning_rate": 0.0019316054297836731, + "loss": 0.127, + "step": 12866 + }, + { + "epoch": 0.1116917387869897, + "grad_norm": 0.09228515625, + "learning_rate": 0.0019315940488235568, + "loss": 0.1143, + "step": 12867 + }, + { + "epoch": 0.11170041926719386, + "grad_norm": 0.1298828125, + "learning_rate": 0.0019315826669540216, + "loss": 0.1152, + "step": 12868 + }, + { + "epoch": 0.11170909974739802, + "grad_norm": 0.181640625, + "learning_rate": 0.00193157128417508, + "loss": 0.126, + "step": 12869 + }, + { + "epoch": 0.11171778022760219, + "grad_norm": 0.4609375, + "learning_rate": 0.0019315599004867446, + "loss": 0.1299, + "step": 12870 + }, + { + "epoch": 0.11172646070780635, + "grad_norm": 0.1435546875, + "learning_rate": 0.0019315485158890276, + "loss": 0.0952, + "step": 12871 + }, + { + "epoch": 0.11173514118801052, + "grad_norm": 0.396484375, + "learning_rate": 0.001931537130381942, + "loss": 0.123, + "step": 12872 + }, + { + "epoch": 0.11174382166821469, + "grad_norm": 0.2470703125, + "learning_rate": 0.0019315257439654994, + "loss": 0.0952, + "step": 12873 + }, + { + "epoch": 0.11175250214841885, + "grad_norm": 0.53515625, + "learning_rate": 0.001931514356639713, + "loss": 0.1777, + "step": 12874 + }, + { + "epoch": 0.11176118262862302, + 
"grad_norm": 0.267578125, + "learning_rate": 0.0019315029684045953, + "loss": 0.1055, + "step": 12875 + }, + { + "epoch": 0.11176986310882718, + "grad_norm": 0.609375, + "learning_rate": 0.001931491579260158, + "loss": 0.1367, + "step": 12876 + }, + { + "epoch": 0.11177854358903135, + "grad_norm": 0.1630859375, + "learning_rate": 0.0019314801892064142, + "loss": 0.0864, + "step": 12877 + }, + { + "epoch": 0.11178722406923551, + "grad_norm": 0.26953125, + "learning_rate": 0.0019314687982433761, + "loss": 0.106, + "step": 12878 + }, + { + "epoch": 0.11179590454943968, + "grad_norm": 0.1728515625, + "learning_rate": 0.0019314574063710563, + "loss": 0.1079, + "step": 12879 + }, + { + "epoch": 0.11180458502964384, + "grad_norm": 0.859375, + "learning_rate": 0.0019314460135894673, + "loss": 0.1416, + "step": 12880 + }, + { + "epoch": 0.111813265509848, + "grad_norm": 0.4609375, + "learning_rate": 0.0019314346198986213, + "loss": 0.0996, + "step": 12881 + }, + { + "epoch": 0.11182194599005217, + "grad_norm": 0.3515625, + "learning_rate": 0.0019314232252985307, + "loss": 0.124, + "step": 12882 + }, + { + "epoch": 0.11183062647025634, + "grad_norm": 0.06591796875, + "learning_rate": 0.0019314118297892085, + "loss": 0.1094, + "step": 12883 + }, + { + "epoch": 0.1118393069504605, + "grad_norm": 0.58984375, + "learning_rate": 0.0019314004333706666, + "loss": 0.1299, + "step": 12884 + }, + { + "epoch": 0.11184798743066467, + "grad_norm": 0.10693359375, + "learning_rate": 0.0019313890360429175, + "loss": 0.0957, + "step": 12885 + }, + { + "epoch": 0.11185666791086883, + "grad_norm": 0.19140625, + "learning_rate": 0.0019313776378059742, + "loss": 0.1426, + "step": 12886 + }, + { + "epoch": 0.111865348391073, + "grad_norm": 0.306640625, + "learning_rate": 0.0019313662386598485, + "loss": 0.1074, + "step": 12887 + }, + { + "epoch": 0.11187402887127716, + "grad_norm": 0.4296875, + "learning_rate": 0.0019313548386045533, + "loss": 0.123, + "step": 12888 + }, + { + "epoch": 0.11188270935148133, + "grad_norm": 0.47265625, + "learning_rate": 0.001931343437640101, + "loss": 0.125, + "step": 12889 + }, + { + "epoch": 0.11189138983168549, + "grad_norm": 0.1767578125, + "learning_rate": 0.0019313320357665041, + "loss": 0.1221, + "step": 12890 + }, + { + "epoch": 0.11190007031188966, + "grad_norm": 0.1201171875, + "learning_rate": 0.0019313206329837746, + "loss": 0.1377, + "step": 12891 + }, + { + "epoch": 0.11190875079209382, + "grad_norm": 0.431640625, + "learning_rate": 0.0019313092292919257, + "loss": 0.1143, + "step": 12892 + }, + { + "epoch": 0.11191743127229799, + "grad_norm": 0.7734375, + "learning_rate": 0.0019312978246909696, + "loss": 0.1543, + "step": 12893 + }, + { + "epoch": 0.11192611175250215, + "grad_norm": 0.06787109375, + "learning_rate": 0.0019312864191809182, + "loss": 0.0981, + "step": 12894 + }, + { + "epoch": 0.11193479223270632, + "grad_norm": 0.2275390625, + "learning_rate": 0.0019312750127617848, + "loss": 0.1504, + "step": 12895 + }, + { + "epoch": 0.11194347271291048, + "grad_norm": 0.21875, + "learning_rate": 0.0019312636054335814, + "loss": 0.0996, + "step": 12896 + }, + { + "epoch": 0.11195215319311465, + "grad_norm": 0.1416015625, + "learning_rate": 0.001931252197196321, + "loss": 0.1406, + "step": 12897 + }, + { + "epoch": 0.11196083367331881, + "grad_norm": 0.44140625, + "learning_rate": 0.0019312407880500153, + "loss": 0.1689, + "step": 12898 + }, + { + "epoch": 0.11196951415352298, + "grad_norm": 0.3515625, + "learning_rate": 0.001931229377994677, + "loss": 0.127, + "step": 12899 
+ }, + { + "epoch": 0.11197819463372714, + "grad_norm": 0.1142578125, + "learning_rate": 0.0019312179670303194, + "loss": 0.1104, + "step": 12900 + }, + { + "epoch": 0.11198687511393131, + "grad_norm": 0.11572265625, + "learning_rate": 0.0019312065551569538, + "loss": 0.0874, + "step": 12901 + }, + { + "epoch": 0.11199555559413547, + "grad_norm": 0.3671875, + "learning_rate": 0.0019311951423745934, + "loss": 0.1162, + "step": 12902 + }, + { + "epoch": 0.11200423607433964, + "grad_norm": 0.1376953125, + "learning_rate": 0.0019311837286832502, + "loss": 0.0977, + "step": 12903 + }, + { + "epoch": 0.1120129165545438, + "grad_norm": 0.3515625, + "learning_rate": 0.001931172314082937, + "loss": 0.1064, + "step": 12904 + }, + { + "epoch": 0.11202159703474797, + "grad_norm": 0.1748046875, + "learning_rate": 0.0019311608985736666, + "loss": 0.1367, + "step": 12905 + }, + { + "epoch": 0.11203027751495213, + "grad_norm": 0.228515625, + "learning_rate": 0.001931149482155451, + "loss": 0.1025, + "step": 12906 + }, + { + "epoch": 0.11203895799515629, + "grad_norm": 0.08203125, + "learning_rate": 0.0019311380648283027, + "loss": 0.1245, + "step": 12907 + }, + { + "epoch": 0.11204763847536045, + "grad_norm": 0.337890625, + "learning_rate": 0.0019311266465922342, + "loss": 0.1011, + "step": 12908 + }, + { + "epoch": 0.11205631895556462, + "grad_norm": 0.94140625, + "learning_rate": 0.0019311152274472583, + "loss": 0.1318, + "step": 12909 + }, + { + "epoch": 0.11206499943576878, + "grad_norm": 0.7109375, + "learning_rate": 0.0019311038073933874, + "loss": 0.0957, + "step": 12910 + }, + { + "epoch": 0.11207367991597295, + "grad_norm": 0.2197265625, + "learning_rate": 0.0019310923864306335, + "loss": 0.1162, + "step": 12911 + }, + { + "epoch": 0.11208236039617711, + "grad_norm": 0.1474609375, + "learning_rate": 0.0019310809645590097, + "loss": 0.1465, + "step": 12912 + }, + { + "epoch": 0.11209104087638128, + "grad_norm": 0.1005859375, + "learning_rate": 0.0019310695417785282, + "loss": 0.1465, + "step": 12913 + }, + { + "epoch": 0.11209972135658544, + "grad_norm": 0.451171875, + "learning_rate": 0.0019310581180892014, + "loss": 0.1201, + "step": 12914 + }, + { + "epoch": 0.1121084018367896, + "grad_norm": 0.1298828125, + "learning_rate": 0.0019310466934910422, + "loss": 0.0913, + "step": 12915 + }, + { + "epoch": 0.11211708231699377, + "grad_norm": 0.265625, + "learning_rate": 0.0019310352679840625, + "loss": 0.1201, + "step": 12916 + }, + { + "epoch": 0.11212576279719794, + "grad_norm": 0.345703125, + "learning_rate": 0.0019310238415682752, + "loss": 0.126, + "step": 12917 + }, + { + "epoch": 0.1121344432774021, + "grad_norm": 0.271484375, + "learning_rate": 0.001931012414243693, + "loss": 0.1299, + "step": 12918 + }, + { + "epoch": 0.11214312375760627, + "grad_norm": 0.24609375, + "learning_rate": 0.0019310009860103278, + "loss": 0.1465, + "step": 12919 + }, + { + "epoch": 0.11215180423781043, + "grad_norm": 0.98828125, + "learning_rate": 0.0019309895568681925, + "loss": 0.1875, + "step": 12920 + }, + { + "epoch": 0.1121604847180146, + "grad_norm": 0.234375, + "learning_rate": 0.0019309781268172994, + "loss": 0.1699, + "step": 12921 + }, + { + "epoch": 0.11216916519821876, + "grad_norm": 0.8984375, + "learning_rate": 0.0019309666958576613, + "loss": 0.0933, + "step": 12922 + }, + { + "epoch": 0.11217784567842293, + "grad_norm": 0.54296875, + "learning_rate": 0.0019309552639892905, + "loss": 0.1436, + "step": 12923 + }, + { + "epoch": 0.11218652615862709, + "grad_norm": 0.27734375, + "learning_rate": 
0.0019309438312121994, + "loss": 0.1084, + "step": 12924 + }, + { + "epoch": 0.11219520663883126, + "grad_norm": 0.50390625, + "learning_rate": 0.0019309323975264007, + "loss": 0.1069, + "step": 12925 + }, + { + "epoch": 0.11220388711903542, + "grad_norm": 0.1005859375, + "learning_rate": 0.0019309209629319063, + "loss": 0.1055, + "step": 12926 + }, + { + "epoch": 0.11221256759923959, + "grad_norm": 0.1201171875, + "learning_rate": 0.00193090952742873, + "loss": 0.1074, + "step": 12927 + }, + { + "epoch": 0.11222124807944375, + "grad_norm": 0.275390625, + "learning_rate": 0.0019308980910168832, + "loss": 0.103, + "step": 12928 + }, + { + "epoch": 0.11222992855964792, + "grad_norm": 0.1640625, + "learning_rate": 0.0019308866536963786, + "loss": 0.1064, + "step": 12929 + }, + { + "epoch": 0.11223860903985208, + "grad_norm": 0.59375, + "learning_rate": 0.0019308752154672294, + "loss": 0.1289, + "step": 12930 + }, + { + "epoch": 0.11224728952005625, + "grad_norm": 0.6875, + "learning_rate": 0.0019308637763294468, + "loss": 0.1328, + "step": 12931 + }, + { + "epoch": 0.11225597000026041, + "grad_norm": 0.2197265625, + "learning_rate": 0.0019308523362830446, + "loss": 0.0786, + "step": 12932 + }, + { + "epoch": 0.11226465048046458, + "grad_norm": 0.1064453125, + "learning_rate": 0.0019308408953280342, + "loss": 0.1768, + "step": 12933 + }, + { + "epoch": 0.11227333096066874, + "grad_norm": 0.34375, + "learning_rate": 0.0019308294534644292, + "loss": 0.167, + "step": 12934 + }, + { + "epoch": 0.11228201144087291, + "grad_norm": 0.1494140625, + "learning_rate": 0.0019308180106922415, + "loss": 0.1143, + "step": 12935 + }, + { + "epoch": 0.11229069192107707, + "grad_norm": 0.2158203125, + "learning_rate": 0.0019308065670114835, + "loss": 0.0938, + "step": 12936 + }, + { + "epoch": 0.11229937240128124, + "grad_norm": 0.578125, + "learning_rate": 0.001930795122422168, + "loss": 0.0986, + "step": 12937 + }, + { + "epoch": 0.1123080528814854, + "grad_norm": 0.07861328125, + "learning_rate": 0.0019307836769243078, + "loss": 0.1084, + "step": 12938 + }, + { + "epoch": 0.11231673336168957, + "grad_norm": 0.427734375, + "learning_rate": 0.0019307722305179145, + "loss": 0.1001, + "step": 12939 + }, + { + "epoch": 0.11232541384189373, + "grad_norm": 0.07080078125, + "learning_rate": 0.0019307607832030015, + "loss": 0.0938, + "step": 12940 + }, + { + "epoch": 0.1123340943220979, + "grad_norm": 0.0830078125, + "learning_rate": 0.0019307493349795808, + "loss": 0.1177, + "step": 12941 + }, + { + "epoch": 0.11234277480230206, + "grad_norm": 0.1396484375, + "learning_rate": 0.0019307378858476653, + "loss": 0.168, + "step": 12942 + }, + { + "epoch": 0.11235145528250623, + "grad_norm": 0.2353515625, + "learning_rate": 0.0019307264358072672, + "loss": 0.1172, + "step": 12943 + }, + { + "epoch": 0.1123601357627104, + "grad_norm": 0.79296875, + "learning_rate": 0.001930714984858399, + "loss": 0.1221, + "step": 12944 + }, + { + "epoch": 0.11236881624291456, + "grad_norm": 0.07275390625, + "learning_rate": 0.0019307035330010738, + "loss": 0.1099, + "step": 12945 + }, + { + "epoch": 0.11237749672311872, + "grad_norm": 0.27734375, + "learning_rate": 0.0019306920802353033, + "loss": 0.124, + "step": 12946 + }, + { + "epoch": 0.11238617720332289, + "grad_norm": 0.298828125, + "learning_rate": 0.0019306806265611005, + "loss": 0.0889, + "step": 12947 + }, + { + "epoch": 0.11239485768352706, + "grad_norm": 0.2236328125, + "learning_rate": 0.0019306691719784782, + "loss": 0.127, + "step": 12948 + }, + { + "epoch": 
0.11240353816373122, + "grad_norm": 0.32421875, + "learning_rate": 0.001930657716487448, + "loss": 0.1045, + "step": 12949 + }, + { + "epoch": 0.11241221864393539, + "grad_norm": 1.15625, + "learning_rate": 0.0019306462600880237, + "loss": 0.1133, + "step": 12950 + }, + { + "epoch": 0.11242089912413955, + "grad_norm": 0.86328125, + "learning_rate": 0.0019306348027802165, + "loss": 0.1895, + "step": 12951 + }, + { + "epoch": 0.11242957960434372, + "grad_norm": 0.38671875, + "learning_rate": 0.0019306233445640399, + "loss": 0.0903, + "step": 12952 + }, + { + "epoch": 0.11243826008454788, + "grad_norm": 1.3515625, + "learning_rate": 0.001930611885439506, + "loss": 0.1152, + "step": 12953 + }, + { + "epoch": 0.11244694056475205, + "grad_norm": 0.412109375, + "learning_rate": 0.0019306004254066274, + "loss": 0.1641, + "step": 12954 + }, + { + "epoch": 0.11245562104495621, + "grad_norm": 0.18359375, + "learning_rate": 0.0019305889644654164, + "loss": 0.123, + "step": 12955 + }, + { + "epoch": 0.11246430152516038, + "grad_norm": 0.24609375, + "learning_rate": 0.0019305775026158864, + "loss": 0.1187, + "step": 12956 + }, + { + "epoch": 0.11247298200536454, + "grad_norm": 0.5546875, + "learning_rate": 0.0019305660398580488, + "loss": 0.0972, + "step": 12957 + }, + { + "epoch": 0.1124816624855687, + "grad_norm": 0.10205078125, + "learning_rate": 0.001930554576191917, + "loss": 0.1001, + "step": 12958 + }, + { + "epoch": 0.11249034296577287, + "grad_norm": 0.57421875, + "learning_rate": 0.001930543111617503, + "loss": 0.1309, + "step": 12959 + }, + { + "epoch": 0.11249902344597704, + "grad_norm": 0.1103515625, + "learning_rate": 0.0019305316461348196, + "loss": 0.1191, + "step": 12960 + }, + { + "epoch": 0.1125077039261812, + "grad_norm": 0.60546875, + "learning_rate": 0.001930520179743879, + "loss": 0.1035, + "step": 12961 + }, + { + "epoch": 0.11251638440638537, + "grad_norm": 0.404296875, + "learning_rate": 0.0019305087124446942, + "loss": 0.127, + "step": 12962 + }, + { + "epoch": 0.11252506488658953, + "grad_norm": 0.125, + "learning_rate": 0.0019304972442372777, + "loss": 0.124, + "step": 12963 + }, + { + "epoch": 0.1125337453667937, + "grad_norm": 0.490234375, + "learning_rate": 0.0019304857751216418, + "loss": 0.0986, + "step": 12964 + }, + { + "epoch": 0.11254242584699786, + "grad_norm": 0.53125, + "learning_rate": 0.0019304743050977994, + "loss": 0.0957, + "step": 12965 + }, + { + "epoch": 0.11255110632720203, + "grad_norm": 0.734375, + "learning_rate": 0.0019304628341657623, + "loss": 0.1338, + "step": 12966 + }, + { + "epoch": 0.11255978680740619, + "grad_norm": 0.23046875, + "learning_rate": 0.0019304513623255436, + "loss": 0.127, + "step": 12967 + }, + { + "epoch": 0.11256846728761034, + "grad_norm": 1.265625, + "learning_rate": 0.001930439889577156, + "loss": 0.1182, + "step": 12968 + }, + { + "epoch": 0.11257714776781451, + "grad_norm": 0.68359375, + "learning_rate": 0.0019304284159206114, + "loss": 0.083, + "step": 12969 + }, + { + "epoch": 0.11258582824801867, + "grad_norm": 0.3671875, + "learning_rate": 0.0019304169413559232, + "loss": 0.1006, + "step": 12970 + }, + { + "epoch": 0.11259450872822284, + "grad_norm": 0.2734375, + "learning_rate": 0.0019304054658831034, + "loss": 0.1074, + "step": 12971 + }, + { + "epoch": 0.112603189208427, + "grad_norm": 0.8515625, + "learning_rate": 0.0019303939895021648, + "loss": 0.1172, + "step": 12972 + }, + { + "epoch": 0.11261186968863117, + "grad_norm": 0.109375, + "learning_rate": 0.0019303825122131196, + "loss": 0.1074, + "step": 12973 + }, 
+ { + "epoch": 0.11262055016883533, + "grad_norm": 0.462890625, + "learning_rate": 0.0019303710340159803, + "loss": 0.1133, + "step": 12974 + }, + { + "epoch": 0.1126292306490395, + "grad_norm": 0.283203125, + "learning_rate": 0.0019303595549107601, + "loss": 0.1426, + "step": 12975 + }, + { + "epoch": 0.11263791112924366, + "grad_norm": 0.48828125, + "learning_rate": 0.0019303480748974712, + "loss": 0.1172, + "step": 12976 + }, + { + "epoch": 0.11264659160944783, + "grad_norm": 0.1806640625, + "learning_rate": 0.0019303365939761258, + "loss": 0.1416, + "step": 12977 + }, + { + "epoch": 0.112655272089652, + "grad_norm": 0.94140625, + "learning_rate": 0.001930325112146737, + "loss": 0.1299, + "step": 12978 + }, + { + "epoch": 0.11266395256985616, + "grad_norm": 0.12353515625, + "learning_rate": 0.001930313629409317, + "loss": 0.1138, + "step": 12979 + }, + { + "epoch": 0.11267263305006033, + "grad_norm": 0.37890625, + "learning_rate": 0.0019303021457638787, + "loss": 0.1191, + "step": 12980 + }, + { + "epoch": 0.11268131353026449, + "grad_norm": 0.291015625, + "learning_rate": 0.0019302906612104346, + "loss": 0.1387, + "step": 12981 + }, + { + "epoch": 0.11268999401046866, + "grad_norm": 1.03125, + "learning_rate": 0.0019302791757489965, + "loss": 0.1836, + "step": 12982 + }, + { + "epoch": 0.11269867449067282, + "grad_norm": 0.361328125, + "learning_rate": 0.0019302676893795782, + "loss": 0.1562, + "step": 12983 + }, + { + "epoch": 0.11270735497087699, + "grad_norm": 0.365234375, + "learning_rate": 0.0019302562021021915, + "loss": 0.1387, + "step": 12984 + }, + { + "epoch": 0.11271603545108115, + "grad_norm": 0.171875, + "learning_rate": 0.0019302447139168486, + "loss": 0.1562, + "step": 12985 + }, + { + "epoch": 0.11272471593128532, + "grad_norm": 0.294921875, + "learning_rate": 0.0019302332248235631, + "loss": 0.1201, + "step": 12986 + }, + { + "epoch": 0.11273339641148948, + "grad_norm": 0.1142578125, + "learning_rate": 0.0019302217348223468, + "loss": 0.0977, + "step": 12987 + }, + { + "epoch": 0.11274207689169365, + "grad_norm": 0.1416015625, + "learning_rate": 0.0019302102439132125, + "loss": 0.123, + "step": 12988 + }, + { + "epoch": 0.11275075737189781, + "grad_norm": 0.322265625, + "learning_rate": 0.0019301987520961728, + "loss": 0.1445, + "step": 12989 + }, + { + "epoch": 0.11275943785210198, + "grad_norm": 0.3359375, + "learning_rate": 0.0019301872593712399, + "loss": 0.1328, + "step": 12990 + }, + { + "epoch": 0.11276811833230614, + "grad_norm": 0.2177734375, + "learning_rate": 0.0019301757657384274, + "loss": 0.1089, + "step": 12991 + }, + { + "epoch": 0.1127767988125103, + "grad_norm": 0.30859375, + "learning_rate": 0.0019301642711977466, + "loss": 0.105, + "step": 12992 + }, + { + "epoch": 0.11278547929271447, + "grad_norm": 0.23828125, + "learning_rate": 0.0019301527757492106, + "loss": 0.0752, + "step": 12993 + }, + { + "epoch": 0.11279415977291864, + "grad_norm": 0.34375, + "learning_rate": 0.001930141279392832, + "loss": 0.166, + "step": 12994 + }, + { + "epoch": 0.1128028402531228, + "grad_norm": 0.55078125, + "learning_rate": 0.0019301297821286238, + "loss": 0.1309, + "step": 12995 + }, + { + "epoch": 0.11281152073332697, + "grad_norm": 0.23046875, + "learning_rate": 0.0019301182839565976, + "loss": 0.1084, + "step": 12996 + }, + { + "epoch": 0.11282020121353113, + "grad_norm": 0.212890625, + "learning_rate": 0.001930106784876767, + "loss": 0.1426, + "step": 12997 + }, + { + "epoch": 0.1128288816937353, + "grad_norm": 0.333984375, + "learning_rate": 
0.0019300952848891437, + "loss": 0.1729, + "step": 12998 + }, + { + "epoch": 0.11283756217393946, + "grad_norm": 0.435546875, + "learning_rate": 0.0019300837839937408, + "loss": 0.1211, + "step": 12999 + }, + { + "epoch": 0.11284624265414363, + "grad_norm": 0.2421875, + "learning_rate": 0.001930072282190571, + "loss": 0.0894, + "step": 13000 + }, + { + "epoch": 0.11285492313434779, + "grad_norm": 0.404296875, + "learning_rate": 0.0019300607794796463, + "loss": 0.1089, + "step": 13001 + }, + { + "epoch": 0.11286360361455196, + "grad_norm": 0.54296875, + "learning_rate": 0.0019300492758609795, + "loss": 0.1299, + "step": 13002 + }, + { + "epoch": 0.11287228409475612, + "grad_norm": 0.40234375, + "learning_rate": 0.0019300377713345836, + "loss": 0.123, + "step": 13003 + }, + { + "epoch": 0.11288096457496029, + "grad_norm": 0.115234375, + "learning_rate": 0.0019300262659004707, + "loss": 0.1504, + "step": 13004 + }, + { + "epoch": 0.11288964505516445, + "grad_norm": 0.0927734375, + "learning_rate": 0.0019300147595586537, + "loss": 0.166, + "step": 13005 + }, + { + "epoch": 0.11289832553536862, + "grad_norm": 0.8515625, + "learning_rate": 0.0019300032523091452, + "loss": 0.127, + "step": 13006 + }, + { + "epoch": 0.11290700601557278, + "grad_norm": 0.158203125, + "learning_rate": 0.001929991744151957, + "loss": 0.123, + "step": 13007 + }, + { + "epoch": 0.11291568649577695, + "grad_norm": 0.1708984375, + "learning_rate": 0.0019299802350871028, + "loss": 0.1338, + "step": 13008 + }, + { + "epoch": 0.11292436697598111, + "grad_norm": 0.61328125, + "learning_rate": 0.0019299687251145945, + "loss": 0.0801, + "step": 13009 + }, + { + "epoch": 0.11293304745618528, + "grad_norm": 0.1328125, + "learning_rate": 0.001929957214234445, + "loss": 0.1045, + "step": 13010 + }, + { + "epoch": 0.11294172793638944, + "grad_norm": 0.3359375, + "learning_rate": 0.0019299457024466666, + "loss": 0.1621, + "step": 13011 + }, + { + "epoch": 0.11295040841659361, + "grad_norm": 0.291015625, + "learning_rate": 0.001929934189751272, + "loss": 0.1182, + "step": 13012 + }, + { + "epoch": 0.11295908889679777, + "grad_norm": 0.10986328125, + "learning_rate": 0.001929922676148274, + "loss": 0.1221, + "step": 13013 + }, + { + "epoch": 0.11296776937700194, + "grad_norm": 0.84765625, + "learning_rate": 0.001929911161637685, + "loss": 0.1006, + "step": 13014 + }, + { + "epoch": 0.1129764498572061, + "grad_norm": 0.09033203125, + "learning_rate": 0.0019298996462195175, + "loss": 0.0801, + "step": 13015 + }, + { + "epoch": 0.11298513033741027, + "grad_norm": 0.546875, + "learning_rate": 0.0019298881298937846, + "loss": 0.1221, + "step": 13016 + }, + { + "epoch": 0.11299381081761443, + "grad_norm": 0.203125, + "learning_rate": 0.0019298766126604981, + "loss": 0.1191, + "step": 13017 + }, + { + "epoch": 0.1130024912978186, + "grad_norm": 0.29296875, + "learning_rate": 0.0019298650945196712, + "loss": 0.0894, + "step": 13018 + }, + { + "epoch": 0.11301117177802276, + "grad_norm": 0.09326171875, + "learning_rate": 0.001929853575471316, + "loss": 0.0693, + "step": 13019 + }, + { + "epoch": 0.11301985225822693, + "grad_norm": 0.353515625, + "learning_rate": 0.0019298420555154457, + "loss": 0.1143, + "step": 13020 + }, + { + "epoch": 0.1130285327384311, + "grad_norm": 0.07568359375, + "learning_rate": 0.0019298305346520726, + "loss": 0.125, + "step": 13021 + }, + { + "epoch": 0.11303721321863526, + "grad_norm": 0.26171875, + "learning_rate": 0.0019298190128812093, + "loss": 0.0859, + "step": 13022 + }, + { + "epoch": 0.11304589369883943, + 
"grad_norm": 0.1201171875, + "learning_rate": 0.001929807490202868, + "loss": 0.1006, + "step": 13023 + }, + { + "epoch": 0.11305457417904359, + "grad_norm": 0.578125, + "learning_rate": 0.0019297959666170622, + "loss": 0.1465, + "step": 13024 + }, + { + "epoch": 0.11306325465924776, + "grad_norm": 0.0732421875, + "learning_rate": 0.0019297844421238039, + "loss": 0.0908, + "step": 13025 + }, + { + "epoch": 0.11307193513945192, + "grad_norm": 0.1953125, + "learning_rate": 0.0019297729167231056, + "loss": 0.1318, + "step": 13026 + }, + { + "epoch": 0.11308061561965609, + "grad_norm": 0.5625, + "learning_rate": 0.0019297613904149802, + "loss": 0.1328, + "step": 13027 + }, + { + "epoch": 0.11308929609986025, + "grad_norm": 0.3828125, + "learning_rate": 0.0019297498631994404, + "loss": 0.0884, + "step": 13028 + }, + { + "epoch": 0.11309797658006442, + "grad_norm": 0.376953125, + "learning_rate": 0.0019297383350764983, + "loss": 0.1631, + "step": 13029 + }, + { + "epoch": 0.11310665706026857, + "grad_norm": 0.46875, + "learning_rate": 0.001929726806046167, + "loss": 0.1162, + "step": 13030 + }, + { + "epoch": 0.11311533754047273, + "grad_norm": 0.54296875, + "learning_rate": 0.0019297152761084588, + "loss": 0.1387, + "step": 13031 + }, + { + "epoch": 0.1131240180206769, + "grad_norm": 0.64453125, + "learning_rate": 0.0019297037452633864, + "loss": 0.1069, + "step": 13032 + }, + { + "epoch": 0.11313269850088106, + "grad_norm": 0.69921875, + "learning_rate": 0.0019296922135109626, + "loss": 0.1074, + "step": 13033 + }, + { + "epoch": 0.11314137898108523, + "grad_norm": 1.1953125, + "learning_rate": 0.0019296806808511999, + "loss": 0.1602, + "step": 13034 + }, + { + "epoch": 0.11315005946128939, + "grad_norm": 0.443359375, + "learning_rate": 0.001929669147284111, + "loss": 0.0894, + "step": 13035 + }, + { + "epoch": 0.11315873994149356, + "grad_norm": 0.205078125, + "learning_rate": 0.0019296576128097083, + "loss": 0.1084, + "step": 13036 + }, + { + "epoch": 0.11316742042169772, + "grad_norm": 0.50390625, + "learning_rate": 0.0019296460774280045, + "loss": 0.1094, + "step": 13037 + }, + { + "epoch": 0.11317610090190189, + "grad_norm": 1.5390625, + "learning_rate": 0.001929634541139012, + "loss": 0.1338, + "step": 13038 + }, + { + "epoch": 0.11318478138210605, + "grad_norm": 0.1728515625, + "learning_rate": 0.001929623003942744, + "loss": 0.1562, + "step": 13039 + }, + { + "epoch": 0.11319346186231022, + "grad_norm": 0.447265625, + "learning_rate": 0.0019296114658392124, + "loss": 0.1211, + "step": 13040 + }, + { + "epoch": 0.11320214234251438, + "grad_norm": 0.5, + "learning_rate": 0.0019295999268284301, + "loss": 0.1206, + "step": 13041 + }, + { + "epoch": 0.11321082282271855, + "grad_norm": 0.1044921875, + "learning_rate": 0.00192958838691041, + "loss": 0.0889, + "step": 13042 + }, + { + "epoch": 0.11321950330292271, + "grad_norm": 0.609375, + "learning_rate": 0.0019295768460851645, + "loss": 0.1406, + "step": 13043 + }, + { + "epoch": 0.11322818378312688, + "grad_norm": 0.50390625, + "learning_rate": 0.001929565304352706, + "loss": 0.1504, + "step": 13044 + }, + { + "epoch": 0.11323686426333104, + "grad_norm": 0.1337890625, + "learning_rate": 0.0019295537617130475, + "loss": 0.1074, + "step": 13045 + }, + { + "epoch": 0.11324554474353521, + "grad_norm": 0.173828125, + "learning_rate": 0.0019295422181662017, + "loss": 0.1113, + "step": 13046 + }, + { + "epoch": 0.11325422522373937, + "grad_norm": 0.1591796875, + "learning_rate": 0.001929530673712181, + "loss": 0.1426, + "step": 13047 + }, + { + 
"epoch": 0.11326290570394354, + "grad_norm": 0.283203125, + "learning_rate": 0.0019295191283509978, + "loss": 0.0986, + "step": 13048 + }, + { + "epoch": 0.1132715861841477, + "grad_norm": 0.1552734375, + "learning_rate": 0.0019295075820826646, + "loss": 0.1318, + "step": 13049 + }, + { + "epoch": 0.11328026666435187, + "grad_norm": 0.5, + "learning_rate": 0.001929496034907195, + "loss": 0.1309, + "step": 13050 + }, + { + "epoch": 0.11328894714455603, + "grad_norm": 0.25, + "learning_rate": 0.0019294844868246006, + "loss": 0.1328, + "step": 13051 + }, + { + "epoch": 0.1132976276247602, + "grad_norm": 0.421875, + "learning_rate": 0.0019294729378348946, + "loss": 0.1006, + "step": 13052 + }, + { + "epoch": 0.11330630810496437, + "grad_norm": 0.0771484375, + "learning_rate": 0.0019294613879380896, + "loss": 0.1152, + "step": 13053 + }, + { + "epoch": 0.11331498858516853, + "grad_norm": 0.34375, + "learning_rate": 0.0019294498371341976, + "loss": 0.1172, + "step": 13054 + }, + { + "epoch": 0.1133236690653727, + "grad_norm": 0.08154296875, + "learning_rate": 0.0019294382854232323, + "loss": 0.1074, + "step": 13055 + }, + { + "epoch": 0.11333234954557686, + "grad_norm": 0.2431640625, + "learning_rate": 0.0019294267328052056, + "loss": 0.106, + "step": 13056 + }, + { + "epoch": 0.11334103002578103, + "grad_norm": 0.2080078125, + "learning_rate": 0.00192941517928013, + "loss": 0.0942, + "step": 13057 + }, + { + "epoch": 0.11334971050598519, + "grad_norm": 0.26171875, + "learning_rate": 0.0019294036248480185, + "loss": 0.1367, + "step": 13058 + }, + { + "epoch": 0.11335839098618936, + "grad_norm": 0.2392578125, + "learning_rate": 0.0019293920695088837, + "loss": 0.1338, + "step": 13059 + }, + { + "epoch": 0.11336707146639352, + "grad_norm": 0.5859375, + "learning_rate": 0.0019293805132627384, + "loss": 0.1001, + "step": 13060 + }, + { + "epoch": 0.11337575194659769, + "grad_norm": 0.59765625, + "learning_rate": 0.0019293689561095948, + "loss": 0.1147, + "step": 13061 + }, + { + "epoch": 0.11338443242680185, + "grad_norm": 0.78125, + "learning_rate": 0.0019293573980494658, + "loss": 0.125, + "step": 13062 + }, + { + "epoch": 0.11339311290700602, + "grad_norm": 0.35546875, + "learning_rate": 0.0019293458390823642, + "loss": 0.2207, + "step": 13063 + }, + { + "epoch": 0.11340179338721018, + "grad_norm": 0.21875, + "learning_rate": 0.0019293342792083023, + "loss": 0.1055, + "step": 13064 + }, + { + "epoch": 0.11341047386741435, + "grad_norm": 0.1259765625, + "learning_rate": 0.0019293227184272927, + "loss": 0.1064, + "step": 13065 + }, + { + "epoch": 0.11341915434761851, + "grad_norm": 0.9296875, + "learning_rate": 0.0019293111567393487, + "loss": 0.1309, + "step": 13066 + }, + { + "epoch": 0.11342783482782268, + "grad_norm": 0.296875, + "learning_rate": 0.0019292995941444823, + "loss": 0.1113, + "step": 13067 + }, + { + "epoch": 0.11343651530802684, + "grad_norm": 0.1640625, + "learning_rate": 0.0019292880306427062, + "loss": 0.0596, + "step": 13068 + }, + { + "epoch": 0.113445195788231, + "grad_norm": 0.1904296875, + "learning_rate": 0.001929276466234033, + "loss": 0.0845, + "step": 13069 + }, + { + "epoch": 0.11345387626843517, + "grad_norm": 0.34765625, + "learning_rate": 0.0019292649009184759, + "loss": 0.1387, + "step": 13070 + }, + { + "epoch": 0.11346255674863934, + "grad_norm": 0.15234375, + "learning_rate": 0.001929253334696047, + "loss": 0.1045, + "step": 13071 + }, + { + "epoch": 0.1134712372288435, + "grad_norm": 0.83984375, + "learning_rate": 0.001929241767566759, + "loss": 0.1348, + 
"step": 13072 + }, + { + "epoch": 0.11347991770904767, + "grad_norm": 0.1318359375, + "learning_rate": 0.001929230199530625, + "loss": 0.1357, + "step": 13073 + }, + { + "epoch": 0.11348859818925183, + "grad_norm": 0.2197265625, + "learning_rate": 0.0019292186305876569, + "loss": 0.0923, + "step": 13074 + }, + { + "epoch": 0.113497278669456, + "grad_norm": 0.65234375, + "learning_rate": 0.0019292070607378677, + "loss": 0.1328, + "step": 13075 + }, + { + "epoch": 0.11350595914966016, + "grad_norm": 0.169921875, + "learning_rate": 0.0019291954899812705, + "loss": 0.064, + "step": 13076 + }, + { + "epoch": 0.11351463962986433, + "grad_norm": 0.388671875, + "learning_rate": 0.0019291839183178774, + "loss": 0.0938, + "step": 13077 + }, + { + "epoch": 0.11352332011006849, + "grad_norm": 0.2216796875, + "learning_rate": 0.0019291723457477014, + "loss": 0.125, + "step": 13078 + }, + { + "epoch": 0.11353200059027266, + "grad_norm": 0.75390625, + "learning_rate": 0.001929160772270755, + "loss": 0.0977, + "step": 13079 + }, + { + "epoch": 0.11354068107047682, + "grad_norm": 0.1337890625, + "learning_rate": 0.0019291491978870505, + "loss": 0.1055, + "step": 13080 + }, + { + "epoch": 0.11354936155068099, + "grad_norm": 0.185546875, + "learning_rate": 0.001929137622596601, + "loss": 0.127, + "step": 13081 + }, + { + "epoch": 0.11355804203088515, + "grad_norm": 0.30859375, + "learning_rate": 0.0019291260463994194, + "loss": 0.1074, + "step": 13082 + }, + { + "epoch": 0.11356672251108932, + "grad_norm": 0.259765625, + "learning_rate": 0.0019291144692955176, + "loss": 0.1016, + "step": 13083 + }, + { + "epoch": 0.11357540299129348, + "grad_norm": 0.1064453125, + "learning_rate": 0.0019291028912849088, + "loss": 0.0703, + "step": 13084 + }, + { + "epoch": 0.11358408347149765, + "grad_norm": 0.197265625, + "learning_rate": 0.0019290913123676055, + "loss": 0.0767, + "step": 13085 + }, + { + "epoch": 0.11359276395170181, + "grad_norm": 0.09619140625, + "learning_rate": 0.0019290797325436204, + "loss": 0.0908, + "step": 13086 + }, + { + "epoch": 0.11360144443190598, + "grad_norm": 0.6171875, + "learning_rate": 0.0019290681518129664, + "loss": 0.1289, + "step": 13087 + }, + { + "epoch": 0.11361012491211014, + "grad_norm": 0.478515625, + "learning_rate": 0.0019290565701756556, + "loss": 0.1543, + "step": 13088 + }, + { + "epoch": 0.11361880539231431, + "grad_norm": 0.337890625, + "learning_rate": 0.0019290449876317012, + "loss": 0.124, + "step": 13089 + }, + { + "epoch": 0.11362748587251847, + "grad_norm": 0.72265625, + "learning_rate": 0.0019290334041811156, + "loss": 0.1074, + "step": 13090 + }, + { + "epoch": 0.11363616635272264, + "grad_norm": 0.1708984375, + "learning_rate": 0.0019290218198239114, + "loss": 0.1182, + "step": 13091 + }, + { + "epoch": 0.11364484683292679, + "grad_norm": 0.306640625, + "learning_rate": 0.0019290102345601015, + "loss": 0.125, + "step": 13092 + }, + { + "epoch": 0.11365352731313096, + "grad_norm": 0.1748046875, + "learning_rate": 0.0019289986483896984, + "loss": 0.168, + "step": 13093 + }, + { + "epoch": 0.11366220779333512, + "grad_norm": 0.69921875, + "learning_rate": 0.001928987061312715, + "loss": 0.1582, + "step": 13094 + }, + { + "epoch": 0.11367088827353929, + "grad_norm": 0.2099609375, + "learning_rate": 0.0019289754733291637, + "loss": 0.1099, + "step": 13095 + }, + { + "epoch": 0.11367956875374345, + "grad_norm": 0.1591796875, + "learning_rate": 0.0019289638844390573, + "loss": 0.125, + "step": 13096 + }, + { + "epoch": 0.11368824923394762, + "grad_norm": 0.31640625, + 
"learning_rate": 0.0019289522946424082, + "loss": 0.123, + "step": 13097 + }, + { + "epoch": 0.11369692971415178, + "grad_norm": 0.2412109375, + "learning_rate": 0.00192894070393923, + "loss": 0.0786, + "step": 13098 + }, + { + "epoch": 0.11370561019435595, + "grad_norm": 0.10498046875, + "learning_rate": 0.0019289291123295341, + "loss": 0.1045, + "step": 13099 + }, + { + "epoch": 0.11371429067456011, + "grad_norm": 0.546875, + "learning_rate": 0.001928917519813334, + "loss": 0.1201, + "step": 13100 + }, + { + "epoch": 0.11372297115476428, + "grad_norm": 0.365234375, + "learning_rate": 0.001928905926390642, + "loss": 0.127, + "step": 13101 + }, + { + "epoch": 0.11373165163496844, + "grad_norm": 0.1318359375, + "learning_rate": 0.001928894332061471, + "loss": 0.1279, + "step": 13102 + }, + { + "epoch": 0.11374033211517261, + "grad_norm": 1.03125, + "learning_rate": 0.0019288827368258337, + "loss": 0.124, + "step": 13103 + }, + { + "epoch": 0.11374901259537677, + "grad_norm": 0.09130859375, + "learning_rate": 0.0019288711406837424, + "loss": 0.1021, + "step": 13104 + }, + { + "epoch": 0.11375769307558094, + "grad_norm": 0.111328125, + "learning_rate": 0.0019288595436352106, + "loss": 0.0977, + "step": 13105 + }, + { + "epoch": 0.1137663735557851, + "grad_norm": 0.5390625, + "learning_rate": 0.00192884794568025, + "loss": 0.1543, + "step": 13106 + }, + { + "epoch": 0.11377505403598927, + "grad_norm": 0.1943359375, + "learning_rate": 0.001928836346818874, + "loss": 0.0947, + "step": 13107 + }, + { + "epoch": 0.11378373451619343, + "grad_norm": 0.275390625, + "learning_rate": 0.0019288247470510946, + "loss": 0.0962, + "step": 13108 + }, + { + "epoch": 0.1137924149963976, + "grad_norm": 0.123046875, + "learning_rate": 0.0019288131463769255, + "loss": 0.0977, + "step": 13109 + }, + { + "epoch": 0.11380109547660176, + "grad_norm": 0.10009765625, + "learning_rate": 0.0019288015447963785, + "loss": 0.0996, + "step": 13110 + }, + { + "epoch": 0.11380977595680593, + "grad_norm": 0.51953125, + "learning_rate": 0.0019287899423094665, + "loss": 0.1396, + "step": 13111 + }, + { + "epoch": 0.1138184564370101, + "grad_norm": 0.2421875, + "learning_rate": 0.0019287783389162025, + "loss": 0.0967, + "step": 13112 + }, + { + "epoch": 0.11382713691721426, + "grad_norm": 0.142578125, + "learning_rate": 0.0019287667346165988, + "loss": 0.1357, + "step": 13113 + }, + { + "epoch": 0.11383581739741842, + "grad_norm": 0.58203125, + "learning_rate": 0.0019287551294106683, + "loss": 0.125, + "step": 13114 + }, + { + "epoch": 0.11384449787762259, + "grad_norm": 0.1728515625, + "learning_rate": 0.0019287435232984236, + "loss": 0.1309, + "step": 13115 + }, + { + "epoch": 0.11385317835782675, + "grad_norm": 0.466796875, + "learning_rate": 0.0019287319162798776, + "loss": 0.1465, + "step": 13116 + }, + { + "epoch": 0.11386185883803092, + "grad_norm": 0.263671875, + "learning_rate": 0.0019287203083550425, + "loss": 0.1328, + "step": 13117 + }, + { + "epoch": 0.11387053931823508, + "grad_norm": 1.0625, + "learning_rate": 0.0019287086995239317, + "loss": 0.1377, + "step": 13118 + }, + { + "epoch": 0.11387921979843925, + "grad_norm": 1.1484375, + "learning_rate": 0.0019286970897865571, + "loss": 0.1348, + "step": 13119 + }, + { + "epoch": 0.11388790027864341, + "grad_norm": 0.0908203125, + "learning_rate": 0.0019286854791429322, + "loss": 0.1187, + "step": 13120 + }, + { + "epoch": 0.11389658075884758, + "grad_norm": 0.10302734375, + "learning_rate": 0.0019286738675930693, + "loss": 0.0942, + "step": 13121 + }, + { + "epoch": 
0.11390526123905174, + "grad_norm": 0.470703125, + "learning_rate": 0.001928662255136981, + "loss": 0.1191, + "step": 13122 + }, + { + "epoch": 0.11391394171925591, + "grad_norm": 0.1494140625, + "learning_rate": 0.0019286506417746801, + "loss": 0.1123, + "step": 13123 + }, + { + "epoch": 0.11392262219946007, + "grad_norm": 0.138671875, + "learning_rate": 0.0019286390275061792, + "loss": 0.1309, + "step": 13124 + }, + { + "epoch": 0.11393130267966424, + "grad_norm": 0.09326171875, + "learning_rate": 0.0019286274123314912, + "loss": 0.1348, + "step": 13125 + }, + { + "epoch": 0.1139399831598684, + "grad_norm": 0.2265625, + "learning_rate": 0.0019286157962506286, + "loss": 0.1445, + "step": 13126 + }, + { + "epoch": 0.11394866364007257, + "grad_norm": 0.0810546875, + "learning_rate": 0.0019286041792636044, + "loss": 0.1201, + "step": 13127 + }, + { + "epoch": 0.11395734412027674, + "grad_norm": 0.1376953125, + "learning_rate": 0.001928592561370431, + "loss": 0.1162, + "step": 13128 + }, + { + "epoch": 0.1139660246004809, + "grad_norm": 0.345703125, + "learning_rate": 0.0019285809425711215, + "loss": 0.1318, + "step": 13129 + }, + { + "epoch": 0.11397470508068507, + "grad_norm": 0.5625, + "learning_rate": 0.001928569322865688, + "loss": 0.1387, + "step": 13130 + }, + { + "epoch": 0.11398338556088923, + "grad_norm": 0.498046875, + "learning_rate": 0.0019285577022541435, + "loss": 0.1309, + "step": 13131 + }, + { + "epoch": 0.1139920660410934, + "grad_norm": 2.09375, + "learning_rate": 0.001928546080736501, + "loss": 0.1162, + "step": 13132 + }, + { + "epoch": 0.11400074652129756, + "grad_norm": 0.1728515625, + "learning_rate": 0.001928534458312773, + "loss": 0.1099, + "step": 13133 + }, + { + "epoch": 0.11400942700150173, + "grad_norm": 0.10546875, + "learning_rate": 0.0019285228349829717, + "loss": 0.1426, + "step": 13134 + }, + { + "epoch": 0.11401810748170589, + "grad_norm": 0.62890625, + "learning_rate": 0.0019285112107471107, + "loss": 0.0859, + "step": 13135 + }, + { + "epoch": 0.11402678796191006, + "grad_norm": 1.046875, + "learning_rate": 0.0019284995856052023, + "loss": 0.1465, + "step": 13136 + }, + { + "epoch": 0.11403546844211422, + "grad_norm": 0.09228515625, + "learning_rate": 0.0019284879595572592, + "loss": 0.1021, + "step": 13137 + }, + { + "epoch": 0.11404414892231839, + "grad_norm": 0.1025390625, + "learning_rate": 0.0019284763326032938, + "loss": 0.0972, + "step": 13138 + }, + { + "epoch": 0.11405282940252255, + "grad_norm": 0.484375, + "learning_rate": 0.0019284647047433192, + "loss": 0.127, + "step": 13139 + }, + { + "epoch": 0.11406150988272672, + "grad_norm": 1.2890625, + "learning_rate": 0.0019284530759773482, + "loss": 0.125, + "step": 13140 + }, + { + "epoch": 0.11407019036293088, + "grad_norm": 0.7578125, + "learning_rate": 0.0019284414463053934, + "loss": 0.1201, + "step": 13141 + }, + { + "epoch": 0.11407887084313505, + "grad_norm": 0.310546875, + "learning_rate": 0.0019284298157274675, + "loss": 0.1914, + "step": 13142 + }, + { + "epoch": 0.11408755132333921, + "grad_norm": 0.4375, + "learning_rate": 0.0019284181842435832, + "loss": 0.1289, + "step": 13143 + }, + { + "epoch": 0.11409623180354338, + "grad_norm": 0.2138671875, + "learning_rate": 0.001928406551853753, + "loss": 0.1172, + "step": 13144 + }, + { + "epoch": 0.11410491228374754, + "grad_norm": 0.310546875, + "learning_rate": 0.0019283949185579898, + "loss": 0.1084, + "step": 13145 + }, + { + "epoch": 0.11411359276395171, + "grad_norm": 0.251953125, + "learning_rate": 0.0019283832843563066, + "loss": 
0.0913, + "step": 13146 + }, + { + "epoch": 0.11412227324415587, + "grad_norm": 0.40625, + "learning_rate": 0.001928371649248716, + "loss": 0.0928, + "step": 13147 + }, + { + "epoch": 0.11413095372436004, + "grad_norm": 0.109375, + "learning_rate": 0.0019283600132352305, + "loss": 0.1201, + "step": 13148 + }, + { + "epoch": 0.1141396342045642, + "grad_norm": 0.1005859375, + "learning_rate": 0.0019283483763158627, + "loss": 0.1162, + "step": 13149 + }, + { + "epoch": 0.11414831468476837, + "grad_norm": 0.34765625, + "learning_rate": 0.0019283367384906255, + "loss": 0.1064, + "step": 13150 + }, + { + "epoch": 0.11415699516497253, + "grad_norm": 0.3828125, + "learning_rate": 0.001928325099759532, + "loss": 0.1436, + "step": 13151 + }, + { + "epoch": 0.1141656756451767, + "grad_norm": 0.1728515625, + "learning_rate": 0.0019283134601225946, + "loss": 0.1348, + "step": 13152 + }, + { + "epoch": 0.11417435612538085, + "grad_norm": 0.99609375, + "learning_rate": 0.0019283018195798258, + "loss": 0.1455, + "step": 13153 + }, + { + "epoch": 0.11418303660558501, + "grad_norm": 0.322265625, + "learning_rate": 0.001928290178131239, + "loss": 0.1143, + "step": 13154 + }, + { + "epoch": 0.11419171708578918, + "grad_norm": 0.67578125, + "learning_rate": 0.0019282785357768461, + "loss": 0.1602, + "step": 13155 + }, + { + "epoch": 0.11420039756599334, + "grad_norm": 0.208984375, + "learning_rate": 0.0019282668925166607, + "loss": 0.1055, + "step": 13156 + }, + { + "epoch": 0.11420907804619751, + "grad_norm": 0.58984375, + "learning_rate": 0.0019282552483506946, + "loss": 0.125, + "step": 13157 + }, + { + "epoch": 0.11421775852640167, + "grad_norm": 0.11865234375, + "learning_rate": 0.0019282436032789611, + "loss": 0.1543, + "step": 13158 + }, + { + "epoch": 0.11422643900660584, + "grad_norm": 1.109375, + "learning_rate": 0.0019282319573014732, + "loss": 0.1426, + "step": 13159 + }, + { + "epoch": 0.11423511948681, + "grad_norm": 0.296875, + "learning_rate": 0.001928220310418243, + "loss": 0.1543, + "step": 13160 + }, + { + "epoch": 0.11424379996701417, + "grad_norm": 0.1982421875, + "learning_rate": 0.0019282086626292833, + "loss": 0.125, + "step": 13161 + }, + { + "epoch": 0.11425248044721834, + "grad_norm": 0.44921875, + "learning_rate": 0.0019281970139346071, + "loss": 0.1094, + "step": 13162 + }, + { + "epoch": 0.1142611609274225, + "grad_norm": 0.1708984375, + "learning_rate": 0.0019281853643342275, + "loss": 0.1289, + "step": 13163 + }, + { + "epoch": 0.11426984140762667, + "grad_norm": 0.12890625, + "learning_rate": 0.0019281737138281566, + "loss": 0.0952, + "step": 13164 + }, + { + "epoch": 0.11427852188783083, + "grad_norm": 1.2890625, + "learning_rate": 0.0019281620624164073, + "loss": 0.083, + "step": 13165 + }, + { + "epoch": 0.114287202368035, + "grad_norm": 0.44140625, + "learning_rate": 0.0019281504100989925, + "loss": 0.0859, + "step": 13166 + }, + { + "epoch": 0.11429588284823916, + "grad_norm": 0.60546875, + "learning_rate": 0.0019281387568759248, + "loss": 0.1582, + "step": 13167 + }, + { + "epoch": 0.11430456332844333, + "grad_norm": 0.400390625, + "learning_rate": 0.001928127102747217, + "loss": 0.0962, + "step": 13168 + }, + { + "epoch": 0.11431324380864749, + "grad_norm": 0.55859375, + "learning_rate": 0.001928115447712882, + "loss": 0.1201, + "step": 13169 + }, + { + "epoch": 0.11432192428885166, + "grad_norm": 0.484375, + "learning_rate": 0.001928103791772932, + "loss": 0.126, + "step": 13170 + }, + { + "epoch": 0.11433060476905582, + "grad_norm": 0.392578125, + "learning_rate": 
0.0019280921349273804, + "loss": 0.0972, + "step": 13171 + }, + { + "epoch": 0.11433928524925999, + "grad_norm": 0.390625, + "learning_rate": 0.0019280804771762397, + "loss": 0.123, + "step": 13172 + }, + { + "epoch": 0.11434796572946415, + "grad_norm": 0.453125, + "learning_rate": 0.0019280688185195226, + "loss": 0.105, + "step": 13173 + }, + { + "epoch": 0.11435664620966832, + "grad_norm": 0.291015625, + "learning_rate": 0.0019280571589572418, + "loss": 0.082, + "step": 13174 + }, + { + "epoch": 0.11436532668987248, + "grad_norm": 0.091796875, + "learning_rate": 0.0019280454984894103, + "loss": 0.1108, + "step": 13175 + }, + { + "epoch": 0.11437400717007665, + "grad_norm": 0.306640625, + "learning_rate": 0.0019280338371160403, + "loss": 0.1143, + "step": 13176 + }, + { + "epoch": 0.11438268765028081, + "grad_norm": 0.255859375, + "learning_rate": 0.0019280221748371454, + "loss": 0.1191, + "step": 13177 + }, + { + "epoch": 0.11439136813048498, + "grad_norm": 0.310546875, + "learning_rate": 0.0019280105116527377, + "loss": 0.1182, + "step": 13178 + }, + { + "epoch": 0.11440004861068914, + "grad_norm": 0.1669921875, + "learning_rate": 0.0019279988475628302, + "loss": 0.0986, + "step": 13179 + }, + { + "epoch": 0.11440872909089331, + "grad_norm": 0.8671875, + "learning_rate": 0.0019279871825674355, + "loss": 0.1699, + "step": 13180 + }, + { + "epoch": 0.11441740957109747, + "grad_norm": 0.10498046875, + "learning_rate": 0.0019279755166665665, + "loss": 0.1211, + "step": 13181 + }, + { + "epoch": 0.11442609005130164, + "grad_norm": 0.138671875, + "learning_rate": 0.0019279638498602358, + "loss": 0.0869, + "step": 13182 + }, + { + "epoch": 0.1144347705315058, + "grad_norm": 0.4296875, + "learning_rate": 0.0019279521821484566, + "loss": 0.0977, + "step": 13183 + }, + { + "epoch": 0.11444345101170997, + "grad_norm": 0.34375, + "learning_rate": 0.0019279405135312411, + "loss": 0.125, + "step": 13184 + }, + { + "epoch": 0.11445213149191413, + "grad_norm": 1.0703125, + "learning_rate": 0.0019279288440086026, + "loss": 0.1094, + "step": 13185 + }, + { + "epoch": 0.1144608119721183, + "grad_norm": 0.0673828125, + "learning_rate": 0.0019279171735805533, + "loss": 0.0889, + "step": 13186 + }, + { + "epoch": 0.11446949245232246, + "grad_norm": 0.236328125, + "learning_rate": 0.001927905502247106, + "loss": 0.1123, + "step": 13187 + }, + { + "epoch": 0.11447817293252663, + "grad_norm": 0.5078125, + "learning_rate": 0.0019278938300082739, + "loss": 0.1152, + "step": 13188 + }, + { + "epoch": 0.1144868534127308, + "grad_norm": 0.1826171875, + "learning_rate": 0.0019278821568640694, + "loss": 0.1138, + "step": 13189 + }, + { + "epoch": 0.11449553389293496, + "grad_norm": 0.408203125, + "learning_rate": 0.0019278704828145058, + "loss": 0.1445, + "step": 13190 + }, + { + "epoch": 0.11450421437313912, + "grad_norm": 0.6796875, + "learning_rate": 0.0019278588078595952, + "loss": 0.1279, + "step": 13191 + }, + { + "epoch": 0.11451289485334329, + "grad_norm": 0.271484375, + "learning_rate": 0.0019278471319993508, + "loss": 0.1133, + "step": 13192 + }, + { + "epoch": 0.11452157533354745, + "grad_norm": 0.94140625, + "learning_rate": 0.0019278354552337846, + "loss": 0.1523, + "step": 13193 + }, + { + "epoch": 0.11453025581375162, + "grad_norm": 0.2255859375, + "learning_rate": 0.001927823777562911, + "loss": 0.0952, + "step": 13194 + }, + { + "epoch": 0.11453893629395578, + "grad_norm": 0.10498046875, + "learning_rate": 0.001927812098986741, + "loss": 0.1221, + "step": 13195 + }, + { + "epoch": 0.11454761677415995, 
+ "grad_norm": 0.09619140625, + "learning_rate": 0.0019278004195052883, + "loss": 0.1123, + "step": 13196 + }, + { + "epoch": 0.11455629725436411, + "grad_norm": 0.0693359375, + "learning_rate": 0.0019277887391185658, + "loss": 0.0645, + "step": 13197 + }, + { + "epoch": 0.11456497773456828, + "grad_norm": 0.103515625, + "learning_rate": 0.0019277770578265856, + "loss": 0.1152, + "step": 13198 + }, + { + "epoch": 0.11457365821477244, + "grad_norm": 0.138671875, + "learning_rate": 0.0019277653756293609, + "loss": 0.127, + "step": 13199 + }, + { + "epoch": 0.11458233869497661, + "grad_norm": 0.10791015625, + "learning_rate": 0.0019277536925269047, + "loss": 0.1011, + "step": 13200 + }, + { + "epoch": 0.11459101917518077, + "grad_norm": 0.07861328125, + "learning_rate": 0.0019277420085192292, + "loss": 0.104, + "step": 13201 + }, + { + "epoch": 0.11459969965538494, + "grad_norm": 0.419921875, + "learning_rate": 0.0019277303236063476, + "loss": 0.127, + "step": 13202 + }, + { + "epoch": 0.1146083801355891, + "grad_norm": 0.94921875, + "learning_rate": 0.0019277186377882722, + "loss": 0.1152, + "step": 13203 + }, + { + "epoch": 0.11461706061579327, + "grad_norm": 0.486328125, + "learning_rate": 0.0019277069510650166, + "loss": 0.0977, + "step": 13204 + }, + { + "epoch": 0.11462574109599744, + "grad_norm": 1.015625, + "learning_rate": 0.001927695263436593, + "loss": 0.1318, + "step": 13205 + }, + { + "epoch": 0.1146344215762016, + "grad_norm": 0.1396484375, + "learning_rate": 0.0019276835749030144, + "loss": 0.1104, + "step": 13206 + }, + { + "epoch": 0.11464310205640577, + "grad_norm": 0.11669921875, + "learning_rate": 0.0019276718854642934, + "loss": 0.1143, + "step": 13207 + }, + { + "epoch": 0.11465178253660993, + "grad_norm": 0.1259765625, + "learning_rate": 0.0019276601951204427, + "loss": 0.1416, + "step": 13208 + }, + { + "epoch": 0.1146604630168141, + "grad_norm": 0.267578125, + "learning_rate": 0.0019276485038714755, + "loss": 0.084, + "step": 13209 + }, + { + "epoch": 0.11466914349701826, + "grad_norm": 0.546875, + "learning_rate": 0.0019276368117174042, + "loss": 0.1201, + "step": 13210 + }, + { + "epoch": 0.11467782397722243, + "grad_norm": 0.41015625, + "learning_rate": 0.0019276251186582417, + "loss": 0.127, + "step": 13211 + }, + { + "epoch": 0.11468650445742659, + "grad_norm": 0.2451171875, + "learning_rate": 0.0019276134246940008, + "loss": 0.1553, + "step": 13212 + }, + { + "epoch": 0.11469518493763076, + "grad_norm": 0.1005859375, + "learning_rate": 0.0019276017298246939, + "loss": 0.1055, + "step": 13213 + }, + { + "epoch": 0.11470386541783492, + "grad_norm": 0.59765625, + "learning_rate": 0.0019275900340503347, + "loss": 0.1064, + "step": 13214 + }, + { + "epoch": 0.11471254589803907, + "grad_norm": 0.166015625, + "learning_rate": 0.0019275783373709358, + "loss": 0.1484, + "step": 13215 + }, + { + "epoch": 0.11472122637824324, + "grad_norm": 0.26953125, + "learning_rate": 0.0019275666397865088, + "loss": 0.1211, + "step": 13216 + }, + { + "epoch": 0.1147299068584474, + "grad_norm": 0.7109375, + "learning_rate": 0.0019275549412970676, + "loss": 0.1201, + "step": 13217 + }, + { + "epoch": 0.11473858733865157, + "grad_norm": 0.09033203125, + "learning_rate": 0.0019275432419026254, + "loss": 0.1191, + "step": 13218 + }, + { + "epoch": 0.11474726781885573, + "grad_norm": 0.169921875, + "learning_rate": 0.0019275315416031936, + "loss": 0.1079, + "step": 13219 + }, + { + "epoch": 0.1147559482990599, + "grad_norm": 0.29296875, + "learning_rate": 0.001927519840398786, + "loss": 
0.1123, + "step": 13220 + }, + { + "epoch": 0.11476462877926406, + "grad_norm": 0.60546875, + "learning_rate": 0.0019275081382894154, + "loss": 0.1357, + "step": 13221 + }, + { + "epoch": 0.11477330925946823, + "grad_norm": 0.74609375, + "learning_rate": 0.0019274964352750939, + "loss": 0.168, + "step": 13222 + }, + { + "epoch": 0.1147819897396724, + "grad_norm": 0.48828125, + "learning_rate": 0.0019274847313558348, + "loss": 0.1064, + "step": 13223 + }, + { + "epoch": 0.11479067021987656, + "grad_norm": 0.1279296875, + "learning_rate": 0.0019274730265316509, + "loss": 0.1357, + "step": 13224 + }, + { + "epoch": 0.11479935070008072, + "grad_norm": 0.62109375, + "learning_rate": 0.0019274613208025552, + "loss": 0.1396, + "step": 13225 + }, + { + "epoch": 0.11480803118028489, + "grad_norm": 0.390625, + "learning_rate": 0.0019274496141685601, + "loss": 0.1289, + "step": 13226 + }, + { + "epoch": 0.11481671166048905, + "grad_norm": 0.1279296875, + "learning_rate": 0.0019274379066296784, + "loss": 0.082, + "step": 13227 + }, + { + "epoch": 0.11482539214069322, + "grad_norm": 0.099609375, + "learning_rate": 0.001927426198185923, + "loss": 0.1289, + "step": 13228 + }, + { + "epoch": 0.11483407262089738, + "grad_norm": 0.2216796875, + "learning_rate": 0.001927414488837307, + "loss": 0.123, + "step": 13229 + }, + { + "epoch": 0.11484275310110155, + "grad_norm": 0.08837890625, + "learning_rate": 0.0019274027785838425, + "loss": 0.0952, + "step": 13230 + }, + { + "epoch": 0.11485143358130571, + "grad_norm": 0.076171875, + "learning_rate": 0.001927391067425543, + "loss": 0.0952, + "step": 13231 + }, + { + "epoch": 0.11486011406150988, + "grad_norm": 0.36328125, + "learning_rate": 0.0019273793553624212, + "loss": 0.1055, + "step": 13232 + }, + { + "epoch": 0.11486879454171404, + "grad_norm": 0.34375, + "learning_rate": 0.00192736764239449, + "loss": 0.1162, + "step": 13233 + }, + { + "epoch": 0.11487747502191821, + "grad_norm": 0.1025390625, + "learning_rate": 0.0019273559285217612, + "loss": 0.0996, + "step": 13234 + }, + { + "epoch": 0.11488615550212238, + "grad_norm": 0.30078125, + "learning_rate": 0.001927344213744249, + "loss": 0.0967, + "step": 13235 + }, + { + "epoch": 0.11489483598232654, + "grad_norm": 0.0986328125, + "learning_rate": 0.0019273324980619653, + "loss": 0.1445, + "step": 13236 + }, + { + "epoch": 0.1149035164625307, + "grad_norm": 0.4140625, + "learning_rate": 0.0019273207814749233, + "loss": 0.0972, + "step": 13237 + }, + { + "epoch": 0.11491219694273487, + "grad_norm": 0.60546875, + "learning_rate": 0.001927309063983136, + "loss": 0.1523, + "step": 13238 + }, + { + "epoch": 0.11492087742293904, + "grad_norm": 0.1298828125, + "learning_rate": 0.0019272973455866154, + "loss": 0.1104, + "step": 13239 + }, + { + "epoch": 0.1149295579031432, + "grad_norm": 0.52734375, + "learning_rate": 0.0019272856262853753, + "loss": 0.1279, + "step": 13240 + }, + { + "epoch": 0.11493823838334737, + "grad_norm": 0.232421875, + "learning_rate": 0.001927273906079428, + "loss": 0.1055, + "step": 13241 + }, + { + "epoch": 0.11494691886355153, + "grad_norm": 0.482421875, + "learning_rate": 0.001927262184968786, + "loss": 0.1484, + "step": 13242 + }, + { + "epoch": 0.1149555993437557, + "grad_norm": 0.310546875, + "learning_rate": 0.001927250462953463, + "loss": 0.0996, + "step": 13243 + }, + { + "epoch": 0.11496427982395986, + "grad_norm": 0.2265625, + "learning_rate": 0.001927238740033471, + "loss": 0.1387, + "step": 13244 + }, + { + "epoch": 0.11497296030416403, + "grad_norm": 0.4765625, + 
"learning_rate": 0.0019272270162088235, + "loss": 0.1172, + "step": 13245 + }, + { + "epoch": 0.11498164078436819, + "grad_norm": 0.08056640625, + "learning_rate": 0.0019272152914795325, + "loss": 0.0859, + "step": 13246 + }, + { + "epoch": 0.11499032126457236, + "grad_norm": 0.25, + "learning_rate": 0.0019272035658456114, + "loss": 0.1113, + "step": 13247 + }, + { + "epoch": 0.11499900174477652, + "grad_norm": 0.287109375, + "learning_rate": 0.0019271918393070733, + "loss": 0.1338, + "step": 13248 + }, + { + "epoch": 0.11500768222498069, + "grad_norm": 0.1220703125, + "learning_rate": 0.0019271801118639304, + "loss": 0.1152, + "step": 13249 + }, + { + "epoch": 0.11501636270518485, + "grad_norm": 0.27734375, + "learning_rate": 0.0019271683835161957, + "loss": 0.1211, + "step": 13250 + }, + { + "epoch": 0.11502504318538902, + "grad_norm": 0.2236328125, + "learning_rate": 0.001927156654263882, + "loss": 0.1504, + "step": 13251 + }, + { + "epoch": 0.11503372366559318, + "grad_norm": 0.63671875, + "learning_rate": 0.0019271449241070023, + "loss": 0.0928, + "step": 13252 + }, + { + "epoch": 0.11504240414579735, + "grad_norm": 0.1650390625, + "learning_rate": 0.0019271331930455693, + "loss": 0.1123, + "step": 13253 + }, + { + "epoch": 0.11505108462600151, + "grad_norm": 0.404296875, + "learning_rate": 0.001927121461079596, + "loss": 0.1445, + "step": 13254 + }, + { + "epoch": 0.11505976510620568, + "grad_norm": 0.6015625, + "learning_rate": 0.0019271097282090952, + "loss": 0.1069, + "step": 13255 + }, + { + "epoch": 0.11506844558640984, + "grad_norm": 0.208984375, + "learning_rate": 0.0019270979944340791, + "loss": 0.125, + "step": 13256 + }, + { + "epoch": 0.11507712606661401, + "grad_norm": 0.11181640625, + "learning_rate": 0.0019270862597545616, + "loss": 0.1016, + "step": 13257 + }, + { + "epoch": 0.11508580654681817, + "grad_norm": 0.7421875, + "learning_rate": 0.001927074524170555, + "loss": 0.1924, + "step": 13258 + }, + { + "epoch": 0.11509448702702234, + "grad_norm": 0.166015625, + "learning_rate": 0.001927062787682072, + "loss": 0.1133, + "step": 13259 + }, + { + "epoch": 0.1151031675072265, + "grad_norm": 0.18359375, + "learning_rate": 0.001927051050289125, + "loss": 0.0957, + "step": 13260 + }, + { + "epoch": 0.11511184798743067, + "grad_norm": 1.8125, + "learning_rate": 0.0019270393119917282, + "loss": 0.1445, + "step": 13261 + }, + { + "epoch": 0.11512052846763483, + "grad_norm": 0.263671875, + "learning_rate": 0.0019270275727898932, + "loss": 0.0864, + "step": 13262 + }, + { + "epoch": 0.115129208947839, + "grad_norm": 0.72265625, + "learning_rate": 0.0019270158326836334, + "loss": 0.1523, + "step": 13263 + }, + { + "epoch": 0.11513788942804316, + "grad_norm": 0.1044921875, + "learning_rate": 0.0019270040916729615, + "loss": 0.0801, + "step": 13264 + }, + { + "epoch": 0.11514656990824733, + "grad_norm": 0.267578125, + "learning_rate": 0.0019269923497578906, + "loss": 0.1191, + "step": 13265 + }, + { + "epoch": 0.1151552503884515, + "grad_norm": 2.15625, + "learning_rate": 0.001926980606938433, + "loss": 0.1895, + "step": 13266 + }, + { + "epoch": 0.11516393086865566, + "grad_norm": 0.41015625, + "learning_rate": 0.0019269688632146016, + "loss": 0.1133, + "step": 13267 + }, + { + "epoch": 0.11517261134885982, + "grad_norm": 0.2431640625, + "learning_rate": 0.00192695711858641, + "loss": 0.1426, + "step": 13268 + }, + { + "epoch": 0.11518129182906399, + "grad_norm": 0.38671875, + "learning_rate": 0.00192694537305387, + "loss": 0.1562, + "step": 13269 + }, + { + "epoch": 
0.11518997230926815, + "grad_norm": 0.193359375, + "learning_rate": 0.0019269336266169953, + "loss": 0.1299, + "step": 13270 + }, + { + "epoch": 0.11519865278947232, + "grad_norm": 0.5234375, + "learning_rate": 0.0019269218792757983, + "loss": 0.0957, + "step": 13271 + }, + { + "epoch": 0.11520733326967648, + "grad_norm": 0.12158203125, + "learning_rate": 0.001926910131030292, + "loss": 0.0938, + "step": 13272 + }, + { + "epoch": 0.11521601374988065, + "grad_norm": 0.765625, + "learning_rate": 0.0019268983818804893, + "loss": 0.1328, + "step": 13273 + }, + { + "epoch": 0.11522469423008481, + "grad_norm": 0.5703125, + "learning_rate": 0.0019268866318264029, + "loss": 0.0918, + "step": 13274 + }, + { + "epoch": 0.11523337471028898, + "grad_norm": 0.369140625, + "learning_rate": 0.0019268748808680455, + "loss": 0.1318, + "step": 13275 + }, + { + "epoch": 0.11524205519049313, + "grad_norm": 0.1806640625, + "learning_rate": 0.0019268631290054303, + "loss": 0.1279, + "step": 13276 + }, + { + "epoch": 0.1152507356706973, + "grad_norm": 1.140625, + "learning_rate": 0.0019268513762385703, + "loss": 0.1064, + "step": 13277 + }, + { + "epoch": 0.11525941615090146, + "grad_norm": 0.232421875, + "learning_rate": 0.0019268396225674777, + "loss": 0.1367, + "step": 13278 + }, + { + "epoch": 0.11526809663110563, + "grad_norm": 0.921875, + "learning_rate": 0.0019268278679921657, + "loss": 0.1064, + "step": 13279 + }, + { + "epoch": 0.11527677711130979, + "grad_norm": 0.84765625, + "learning_rate": 0.0019268161125126472, + "loss": 0.2969, + "step": 13280 + }, + { + "epoch": 0.11528545759151396, + "grad_norm": 0.16796875, + "learning_rate": 0.0019268043561289352, + "loss": 0.0938, + "step": 13281 + }, + { + "epoch": 0.11529413807171812, + "grad_norm": 0.1865234375, + "learning_rate": 0.0019267925988410424, + "loss": 0.1211, + "step": 13282 + }, + { + "epoch": 0.11530281855192229, + "grad_norm": 0.1982421875, + "learning_rate": 0.0019267808406489813, + "loss": 0.0894, + "step": 13283 + }, + { + "epoch": 0.11531149903212645, + "grad_norm": 0.2109375, + "learning_rate": 0.0019267690815527652, + "loss": 0.0996, + "step": 13284 + }, + { + "epoch": 0.11532017951233062, + "grad_norm": 0.5546875, + "learning_rate": 0.001926757321552407, + "loss": 0.1104, + "step": 13285 + }, + { + "epoch": 0.11532885999253478, + "grad_norm": 0.166015625, + "learning_rate": 0.0019267455606479195, + "loss": 0.1084, + "step": 13286 + }, + { + "epoch": 0.11533754047273895, + "grad_norm": 0.08837890625, + "learning_rate": 0.0019267337988393154, + "loss": 0.1172, + "step": 13287 + }, + { + "epoch": 0.11534622095294311, + "grad_norm": 0.671875, + "learning_rate": 0.0019267220361266076, + "loss": 0.1396, + "step": 13288 + }, + { + "epoch": 0.11535490143314728, + "grad_norm": 0.10400390625, + "learning_rate": 0.001926710272509809, + "loss": 0.084, + "step": 13289 + }, + { + "epoch": 0.11536358191335144, + "grad_norm": 0.1669921875, + "learning_rate": 0.0019266985079889323, + "loss": 0.1504, + "step": 13290 + }, + { + "epoch": 0.11537226239355561, + "grad_norm": 0.1982421875, + "learning_rate": 0.001926686742563991, + "loss": 0.1045, + "step": 13291 + }, + { + "epoch": 0.11538094287375977, + "grad_norm": 0.169921875, + "learning_rate": 0.0019266749762349969, + "loss": 0.1069, + "step": 13292 + }, + { + "epoch": 0.11538962335396394, + "grad_norm": 0.19921875, + "learning_rate": 0.0019266632090019639, + "loss": 0.1045, + "step": 13293 + }, + { + "epoch": 0.1153983038341681, + "grad_norm": 0.39453125, + "learning_rate": 0.001926651440864904, + 
"loss": 0.1182, + "step": 13294 + }, + { + "epoch": 0.11540698431437227, + "grad_norm": 0.2294921875, + "learning_rate": 0.0019266396718238311, + "loss": 0.1069, + "step": 13295 + }, + { + "epoch": 0.11541566479457643, + "grad_norm": 0.291015625, + "learning_rate": 0.0019266279018787572, + "loss": 0.104, + "step": 13296 + }, + { + "epoch": 0.1154243452747806, + "grad_norm": 0.18359375, + "learning_rate": 0.0019266161310296953, + "loss": 0.125, + "step": 13297 + }, + { + "epoch": 0.11543302575498476, + "grad_norm": 0.490234375, + "learning_rate": 0.0019266043592766585, + "loss": 0.1758, + "step": 13298 + }, + { + "epoch": 0.11544170623518893, + "grad_norm": 0.28515625, + "learning_rate": 0.0019265925866196597, + "loss": 0.1143, + "step": 13299 + }, + { + "epoch": 0.1154503867153931, + "grad_norm": 0.09521484375, + "learning_rate": 0.0019265808130587115, + "loss": 0.1309, + "step": 13300 + }, + { + "epoch": 0.11545906719559726, + "grad_norm": 0.171875, + "learning_rate": 0.001926569038593827, + "loss": 0.1328, + "step": 13301 + }, + { + "epoch": 0.11546774767580142, + "grad_norm": 0.19140625, + "learning_rate": 0.001926557263225019, + "loss": 0.0776, + "step": 13302 + }, + { + "epoch": 0.11547642815600559, + "grad_norm": 0.439453125, + "learning_rate": 0.0019265454869523005, + "loss": 0.0967, + "step": 13303 + }, + { + "epoch": 0.11548510863620975, + "grad_norm": 0.271484375, + "learning_rate": 0.001926533709775684, + "loss": 0.0947, + "step": 13304 + }, + { + "epoch": 0.11549378911641392, + "grad_norm": 0.07080078125, + "learning_rate": 0.0019265219316951833, + "loss": 0.082, + "step": 13305 + }, + { + "epoch": 0.11550246959661808, + "grad_norm": 0.46875, + "learning_rate": 0.0019265101527108101, + "loss": 0.0859, + "step": 13306 + }, + { + "epoch": 0.11551115007682225, + "grad_norm": 0.2255859375, + "learning_rate": 0.001926498372822578, + "loss": 0.0864, + "step": 13307 + }, + { + "epoch": 0.11551983055702642, + "grad_norm": 0.2255859375, + "learning_rate": 0.0019264865920304995, + "loss": 0.1133, + "step": 13308 + }, + { + "epoch": 0.11552851103723058, + "grad_norm": 0.13671875, + "learning_rate": 0.0019264748103345877, + "loss": 0.0972, + "step": 13309 + }, + { + "epoch": 0.11553719151743475, + "grad_norm": 0.1162109375, + "learning_rate": 0.0019264630277348556, + "loss": 0.1777, + "step": 13310 + }, + { + "epoch": 0.11554587199763891, + "grad_norm": 0.259765625, + "learning_rate": 0.0019264512442313162, + "loss": 0.1113, + "step": 13311 + }, + { + "epoch": 0.11555455247784308, + "grad_norm": 0.18359375, + "learning_rate": 0.0019264394598239817, + "loss": 0.0918, + "step": 13312 + }, + { + "epoch": 0.11556323295804724, + "grad_norm": 0.29296875, + "learning_rate": 0.0019264276745128658, + "loss": 0.1128, + "step": 13313 + }, + { + "epoch": 0.1155719134382514, + "grad_norm": 0.86328125, + "learning_rate": 0.0019264158882979804, + "loss": 0.1094, + "step": 13314 + }, + { + "epoch": 0.11558059391845557, + "grad_norm": 0.32421875, + "learning_rate": 0.0019264041011793397, + "loss": 0.1201, + "step": 13315 + }, + { + "epoch": 0.11558927439865974, + "grad_norm": 0.10546875, + "learning_rate": 0.0019263923131569553, + "loss": 0.1128, + "step": 13316 + }, + { + "epoch": 0.1155979548788639, + "grad_norm": 0.203125, + "learning_rate": 0.001926380524230841, + "loss": 0.0864, + "step": 13317 + }, + { + "epoch": 0.11560663535906807, + "grad_norm": 0.69921875, + "learning_rate": 0.0019263687344010095, + "loss": 0.1289, + "step": 13318 + }, + { + "epoch": 0.11561531583927223, + "grad_norm": 
0.1005859375, + "learning_rate": 0.0019263569436674733, + "loss": 0.1338, + "step": 13319 + }, + { + "epoch": 0.1156239963194764, + "grad_norm": 0.443359375, + "learning_rate": 0.0019263451520302457, + "loss": 0.1182, + "step": 13320 + }, + { + "epoch": 0.11563267679968056, + "grad_norm": 0.1669921875, + "learning_rate": 0.0019263333594893392, + "loss": 0.1377, + "step": 13321 + }, + { + "epoch": 0.11564135727988473, + "grad_norm": 0.466796875, + "learning_rate": 0.0019263215660447674, + "loss": 0.1309, + "step": 13322 + }, + { + "epoch": 0.11565003776008889, + "grad_norm": 0.4921875, + "learning_rate": 0.0019263097716965426, + "loss": 0.0801, + "step": 13323 + }, + { + "epoch": 0.11565871824029306, + "grad_norm": 0.66796875, + "learning_rate": 0.001926297976444678, + "loss": 0.1162, + "step": 13324 + }, + { + "epoch": 0.11566739872049722, + "grad_norm": 0.62109375, + "learning_rate": 0.001926286180289186, + "loss": 0.1216, + "step": 13325 + }, + { + "epoch": 0.11567607920070139, + "grad_norm": 0.19921875, + "learning_rate": 0.00192627438323008, + "loss": 0.0928, + "step": 13326 + }, + { + "epoch": 0.11568475968090555, + "grad_norm": 0.94921875, + "learning_rate": 0.0019262625852673727, + "loss": 0.1895, + "step": 13327 + }, + { + "epoch": 0.11569344016110972, + "grad_norm": 0.1376953125, + "learning_rate": 0.0019262507864010773, + "loss": 0.1328, + "step": 13328 + }, + { + "epoch": 0.11570212064131388, + "grad_norm": 0.142578125, + "learning_rate": 0.001926238986631206, + "loss": 0.126, + "step": 13329 + }, + { + "epoch": 0.11571080112151805, + "grad_norm": 0.361328125, + "learning_rate": 0.0019262271859577722, + "loss": 0.1055, + "step": 13330 + }, + { + "epoch": 0.11571948160172221, + "grad_norm": 0.267578125, + "learning_rate": 0.001926215384380789, + "loss": 0.1318, + "step": 13331 + }, + { + "epoch": 0.11572816208192638, + "grad_norm": 0.73828125, + "learning_rate": 0.0019262035819002693, + "loss": 0.1084, + "step": 13332 + }, + { + "epoch": 0.11573684256213054, + "grad_norm": 0.294921875, + "learning_rate": 0.0019261917785162253, + "loss": 0.1377, + "step": 13333 + }, + { + "epoch": 0.11574552304233471, + "grad_norm": 0.1904296875, + "learning_rate": 0.0019261799742286705, + "loss": 0.0757, + "step": 13334 + }, + { + "epoch": 0.11575420352253887, + "grad_norm": 0.33984375, + "learning_rate": 0.001926168169037618, + "loss": 0.1182, + "step": 13335 + }, + { + "epoch": 0.11576288400274304, + "grad_norm": 0.1845703125, + "learning_rate": 0.0019261563629430802, + "loss": 0.1074, + "step": 13336 + }, + { + "epoch": 0.1157715644829472, + "grad_norm": 0.31640625, + "learning_rate": 0.00192614455594507, + "loss": 0.1118, + "step": 13337 + }, + { + "epoch": 0.11578024496315135, + "grad_norm": 0.38671875, + "learning_rate": 0.001926132748043601, + "loss": 0.1279, + "step": 13338 + }, + { + "epoch": 0.11578892544335552, + "grad_norm": 0.53515625, + "learning_rate": 0.0019261209392386854, + "loss": 0.0908, + "step": 13339 + }, + { + "epoch": 0.11579760592355968, + "grad_norm": 0.408203125, + "learning_rate": 0.0019261091295303362, + "loss": 0.1367, + "step": 13340 + }, + { + "epoch": 0.11580628640376385, + "grad_norm": 0.365234375, + "learning_rate": 0.0019260973189185663, + "loss": 0.1055, + "step": 13341 + }, + { + "epoch": 0.11581496688396802, + "grad_norm": 0.1201171875, + "learning_rate": 0.0019260855074033892, + "loss": 0.0967, + "step": 13342 + }, + { + "epoch": 0.11582364736417218, + "grad_norm": 0.1328125, + "learning_rate": 0.0019260736949848175, + "loss": 0.1162, + "step": 13343 + }, 
+ { + "epoch": 0.11583232784437635, + "grad_norm": 0.2490234375, + "learning_rate": 0.0019260618816628638, + "loss": 0.0928, + "step": 13344 + }, + { + "epoch": 0.11584100832458051, + "grad_norm": 0.130859375, + "learning_rate": 0.0019260500674375411, + "loss": 0.126, + "step": 13345 + }, + { + "epoch": 0.11584968880478468, + "grad_norm": 0.45703125, + "learning_rate": 0.0019260382523088625, + "loss": 0.0923, + "step": 13346 + }, + { + "epoch": 0.11585836928498884, + "grad_norm": 0.58203125, + "learning_rate": 0.001926026436276841, + "loss": 0.1157, + "step": 13347 + }, + { + "epoch": 0.115867049765193, + "grad_norm": 0.275390625, + "learning_rate": 0.0019260146193414893, + "loss": 0.125, + "step": 13348 + }, + { + "epoch": 0.11587573024539717, + "grad_norm": 0.55078125, + "learning_rate": 0.0019260028015028205, + "loss": 0.1504, + "step": 13349 + }, + { + "epoch": 0.11588441072560134, + "grad_norm": 0.40625, + "learning_rate": 0.0019259909827608476, + "loss": 0.1299, + "step": 13350 + }, + { + "epoch": 0.1158930912058055, + "grad_norm": 0.07861328125, + "learning_rate": 0.0019259791631155829, + "loss": 0.0972, + "step": 13351 + }, + { + "epoch": 0.11590177168600967, + "grad_norm": 0.40625, + "learning_rate": 0.00192596734256704, + "loss": 0.0957, + "step": 13352 + }, + { + "epoch": 0.11591045216621383, + "grad_norm": 0.150390625, + "learning_rate": 0.001925955521115232, + "loss": 0.106, + "step": 13353 + }, + { + "epoch": 0.115919132646418, + "grad_norm": 0.1357421875, + "learning_rate": 0.0019259436987601713, + "loss": 0.106, + "step": 13354 + }, + { + "epoch": 0.11592781312662216, + "grad_norm": 0.4453125, + "learning_rate": 0.0019259318755018707, + "loss": 0.1357, + "step": 13355 + }, + { + "epoch": 0.11593649360682633, + "grad_norm": 0.1279296875, + "learning_rate": 0.0019259200513403435, + "loss": 0.1191, + "step": 13356 + }, + { + "epoch": 0.11594517408703049, + "grad_norm": 0.38671875, + "learning_rate": 0.0019259082262756027, + "loss": 0.1123, + "step": 13357 + }, + { + "epoch": 0.11595385456723466, + "grad_norm": 0.181640625, + "learning_rate": 0.0019258964003076608, + "loss": 0.1074, + "step": 13358 + }, + { + "epoch": 0.11596253504743882, + "grad_norm": 0.11083984375, + "learning_rate": 0.0019258845734365314, + "loss": 0.1367, + "step": 13359 + }, + { + "epoch": 0.11597121552764299, + "grad_norm": 0.10546875, + "learning_rate": 0.0019258727456622268, + "loss": 0.0947, + "step": 13360 + }, + { + "epoch": 0.11597989600784715, + "grad_norm": 0.37109375, + "learning_rate": 0.0019258609169847602, + "loss": 0.1216, + "step": 13361 + }, + { + "epoch": 0.11598857648805132, + "grad_norm": 0.244140625, + "learning_rate": 0.0019258490874041446, + "loss": 0.1157, + "step": 13362 + }, + { + "epoch": 0.11599725696825548, + "grad_norm": 0.0947265625, + "learning_rate": 0.0019258372569203929, + "loss": 0.1338, + "step": 13363 + }, + { + "epoch": 0.11600593744845965, + "grad_norm": 0.166015625, + "learning_rate": 0.0019258254255335176, + "loss": 0.1201, + "step": 13364 + }, + { + "epoch": 0.11601461792866381, + "grad_norm": 0.765625, + "learning_rate": 0.0019258135932435324, + "loss": 0.1553, + "step": 13365 + }, + { + "epoch": 0.11602329840886798, + "grad_norm": 0.609375, + "learning_rate": 0.0019258017600504499, + "loss": 0.1045, + "step": 13366 + }, + { + "epoch": 0.11603197888907214, + "grad_norm": 0.1923828125, + "learning_rate": 0.0019257899259542829, + "loss": 0.0996, + "step": 13367 + }, + { + "epoch": 0.11604065936927631, + "grad_norm": 0.14453125, + "learning_rate": 
0.0019257780909550442, + "loss": 0.1592, + "step": 13368 + }, + { + "epoch": 0.11604933984948047, + "grad_norm": 0.388671875, + "learning_rate": 0.0019257662550527471, + "loss": 0.1084, + "step": 13369 + }, + { + "epoch": 0.11605802032968464, + "grad_norm": 0.1103515625, + "learning_rate": 0.0019257544182474042, + "loss": 0.0898, + "step": 13370 + }, + { + "epoch": 0.1160667008098888, + "grad_norm": 0.138671875, + "learning_rate": 0.0019257425805390292, + "loss": 0.1094, + "step": 13371 + }, + { + "epoch": 0.11607538129009297, + "grad_norm": 0.2119140625, + "learning_rate": 0.0019257307419276342, + "loss": 0.1357, + "step": 13372 + }, + { + "epoch": 0.11608406177029713, + "grad_norm": 0.421875, + "learning_rate": 0.0019257189024132328, + "loss": 0.1172, + "step": 13373 + }, + { + "epoch": 0.1160927422505013, + "grad_norm": 0.376953125, + "learning_rate": 0.0019257070619958373, + "loss": 0.1055, + "step": 13374 + }, + { + "epoch": 0.11610142273070546, + "grad_norm": 0.26171875, + "learning_rate": 0.001925695220675461, + "loss": 0.0967, + "step": 13375 + }, + { + "epoch": 0.11611010321090963, + "grad_norm": 0.19140625, + "learning_rate": 0.0019256833784521167, + "loss": 0.1592, + "step": 13376 + }, + { + "epoch": 0.1161187836911138, + "grad_norm": 0.1025390625, + "learning_rate": 0.0019256715353258176, + "loss": 0.0947, + "step": 13377 + }, + { + "epoch": 0.11612746417131796, + "grad_norm": 0.1357421875, + "learning_rate": 0.0019256596912965768, + "loss": 0.0952, + "step": 13378 + }, + { + "epoch": 0.11613614465152212, + "grad_norm": 0.357421875, + "learning_rate": 0.0019256478463644065, + "loss": 0.105, + "step": 13379 + }, + { + "epoch": 0.11614482513172629, + "grad_norm": 0.8046875, + "learning_rate": 0.0019256360005293204, + "loss": 0.1289, + "step": 13380 + }, + { + "epoch": 0.11615350561193045, + "grad_norm": 0.5078125, + "learning_rate": 0.001925624153791331, + "loss": 0.1338, + "step": 13381 + }, + { + "epoch": 0.11616218609213462, + "grad_norm": 0.259765625, + "learning_rate": 0.0019256123061504515, + "loss": 0.0923, + "step": 13382 + }, + { + "epoch": 0.11617086657233879, + "grad_norm": 0.81640625, + "learning_rate": 0.0019256004576066947, + "loss": 0.1025, + "step": 13383 + }, + { + "epoch": 0.11617954705254295, + "grad_norm": 0.12890625, + "learning_rate": 0.0019255886081600737, + "loss": 0.1797, + "step": 13384 + }, + { + "epoch": 0.11618822753274712, + "grad_norm": 0.103515625, + "learning_rate": 0.0019255767578106016, + "loss": 0.0947, + "step": 13385 + }, + { + "epoch": 0.11619690801295128, + "grad_norm": 0.255859375, + "learning_rate": 0.001925564906558291, + "loss": 0.0869, + "step": 13386 + }, + { + "epoch": 0.11620558849315545, + "grad_norm": 0.328125, + "learning_rate": 0.0019255530544031551, + "loss": 0.1182, + "step": 13387 + }, + { + "epoch": 0.11621426897335961, + "grad_norm": 0.2138671875, + "learning_rate": 0.0019255412013452068, + "loss": 0.1396, + "step": 13388 + }, + { + "epoch": 0.11622294945356378, + "grad_norm": 0.11328125, + "learning_rate": 0.0019255293473844591, + "loss": 0.0688, + "step": 13389 + }, + { + "epoch": 0.11623162993376794, + "grad_norm": 0.392578125, + "learning_rate": 0.0019255174925209245, + "loss": 0.0801, + "step": 13390 + }, + { + "epoch": 0.1162403104139721, + "grad_norm": 0.390625, + "learning_rate": 0.0019255056367546166, + "loss": 0.1357, + "step": 13391 + }, + { + "epoch": 0.11624899089417627, + "grad_norm": 0.83203125, + "learning_rate": 0.0019254937800855484, + "loss": 0.0957, + "step": 13392 + }, + { + "epoch": 
0.11625767137438044, + "grad_norm": 0.13671875, + "learning_rate": 0.0019254819225137322, + "loss": 0.1035, + "step": 13393 + }, + { + "epoch": 0.1162663518545846, + "grad_norm": 0.2392578125, + "learning_rate": 0.0019254700640391818, + "loss": 0.1484, + "step": 13394 + }, + { + "epoch": 0.11627503233478877, + "grad_norm": 0.2451171875, + "learning_rate": 0.0019254582046619093, + "loss": 0.1182, + "step": 13395 + }, + { + "epoch": 0.11628371281499293, + "grad_norm": 0.2412109375, + "learning_rate": 0.001925446344381928, + "loss": 0.1006, + "step": 13396 + }, + { + "epoch": 0.1162923932951971, + "grad_norm": 0.30859375, + "learning_rate": 0.0019254344831992518, + "loss": 0.1621, + "step": 13397 + }, + { + "epoch": 0.11630107377540126, + "grad_norm": 0.333984375, + "learning_rate": 0.001925422621113892, + "loss": 0.0918, + "step": 13398 + }, + { + "epoch": 0.11630975425560543, + "grad_norm": 0.158203125, + "learning_rate": 0.001925410758125863, + "loss": 0.1318, + "step": 13399 + }, + { + "epoch": 0.11631843473580958, + "grad_norm": 0.376953125, + "learning_rate": 0.0019253988942351768, + "loss": 0.1172, + "step": 13400 + }, + { + "epoch": 0.11632711521601374, + "grad_norm": 0.69140625, + "learning_rate": 0.001925387029441847, + "loss": 0.1152, + "step": 13401 + }, + { + "epoch": 0.11633579569621791, + "grad_norm": 0.9140625, + "learning_rate": 0.0019253751637458864, + "loss": 0.1055, + "step": 13402 + }, + { + "epoch": 0.11634447617642207, + "grad_norm": 0.41015625, + "learning_rate": 0.0019253632971473079, + "loss": 0.126, + "step": 13403 + }, + { + "epoch": 0.11635315665662624, + "grad_norm": 0.1962890625, + "learning_rate": 0.0019253514296461243, + "loss": 0.1279, + "step": 13404 + }, + { + "epoch": 0.1163618371368304, + "grad_norm": 0.53125, + "learning_rate": 0.0019253395612423487, + "loss": 0.1377, + "step": 13405 + }, + { + "epoch": 0.11637051761703457, + "grad_norm": 0.251953125, + "learning_rate": 0.0019253276919359945, + "loss": 0.1289, + "step": 13406 + }, + { + "epoch": 0.11637919809723873, + "grad_norm": 0.26171875, + "learning_rate": 0.0019253158217270741, + "loss": 0.1191, + "step": 13407 + }, + { + "epoch": 0.1163878785774429, + "grad_norm": 0.11669921875, + "learning_rate": 0.001925303950615601, + "loss": 0.0874, + "step": 13408 + }, + { + "epoch": 0.11639655905764706, + "grad_norm": 0.16796875, + "learning_rate": 0.0019252920786015876, + "loss": 0.1138, + "step": 13409 + }, + { + "epoch": 0.11640523953785123, + "grad_norm": 0.5390625, + "learning_rate": 0.001925280205685047, + "loss": 0.1123, + "step": 13410 + }, + { + "epoch": 0.1164139200180554, + "grad_norm": 0.2255859375, + "learning_rate": 0.0019252683318659927, + "loss": 0.0947, + "step": 13411 + }, + { + "epoch": 0.11642260049825956, + "grad_norm": 0.1318359375, + "learning_rate": 0.0019252564571444375, + "loss": 0.103, + "step": 13412 + }, + { + "epoch": 0.11643128097846372, + "grad_norm": 0.1474609375, + "learning_rate": 0.001925244581520394, + "loss": 0.1201, + "step": 13413 + }, + { + "epoch": 0.11643996145866789, + "grad_norm": 0.37109375, + "learning_rate": 0.0019252327049938756, + "loss": 0.1201, + "step": 13414 + }, + { + "epoch": 0.11644864193887206, + "grad_norm": 0.1796875, + "learning_rate": 0.0019252208275648947, + "loss": 0.106, + "step": 13415 + }, + { + "epoch": 0.11645732241907622, + "grad_norm": 0.2216796875, + "learning_rate": 0.001925208949233465, + "loss": 0.1074, + "step": 13416 + }, + { + "epoch": 0.11646600289928039, + "grad_norm": 0.291015625, + "learning_rate": 0.0019251970699995992, + 
"loss": 0.1396, + "step": 13417 + }, + { + "epoch": 0.11647468337948455, + "grad_norm": 0.220703125, + "learning_rate": 0.0019251851898633102, + "loss": 0.1064, + "step": 13418 + }, + { + "epoch": 0.11648336385968872, + "grad_norm": 0.263671875, + "learning_rate": 0.0019251733088246114, + "loss": 0.0889, + "step": 13419 + }, + { + "epoch": 0.11649204433989288, + "grad_norm": 0.11181640625, + "learning_rate": 0.001925161426883515, + "loss": 0.1172, + "step": 13420 + }, + { + "epoch": 0.11650072482009705, + "grad_norm": 0.1484375, + "learning_rate": 0.0019251495440400347, + "loss": 0.1055, + "step": 13421 + }, + { + "epoch": 0.11650940530030121, + "grad_norm": 0.625, + "learning_rate": 0.001925137660294183, + "loss": 0.124, + "step": 13422 + }, + { + "epoch": 0.11651808578050538, + "grad_norm": 0.1201171875, + "learning_rate": 0.0019251257756459734, + "loss": 0.1191, + "step": 13423 + }, + { + "epoch": 0.11652676626070954, + "grad_norm": 0.42578125, + "learning_rate": 0.0019251138900954186, + "loss": 0.0757, + "step": 13424 + }, + { + "epoch": 0.1165354467409137, + "grad_norm": 0.1162109375, + "learning_rate": 0.0019251020036425317, + "loss": 0.103, + "step": 13425 + }, + { + "epoch": 0.11654412722111787, + "grad_norm": 0.71484375, + "learning_rate": 0.0019250901162873256, + "loss": 0.1309, + "step": 13426 + }, + { + "epoch": 0.11655280770132204, + "grad_norm": 0.38671875, + "learning_rate": 0.0019250782280298133, + "loss": 0.126, + "step": 13427 + }, + { + "epoch": 0.1165614881815262, + "grad_norm": 0.337890625, + "learning_rate": 0.001925066338870008, + "loss": 0.1011, + "step": 13428 + }, + { + "epoch": 0.11657016866173037, + "grad_norm": 0.2197265625, + "learning_rate": 0.0019250544488079223, + "loss": 0.1143, + "step": 13429 + }, + { + "epoch": 0.11657884914193453, + "grad_norm": 0.326171875, + "learning_rate": 0.0019250425578435698, + "loss": 0.0972, + "step": 13430 + }, + { + "epoch": 0.1165875296221387, + "grad_norm": 0.08837890625, + "learning_rate": 0.001925030665976963, + "loss": 0.0957, + "step": 13431 + }, + { + "epoch": 0.11659621010234286, + "grad_norm": 0.2392578125, + "learning_rate": 0.0019250187732081149, + "loss": 0.127, + "step": 13432 + }, + { + "epoch": 0.11660489058254703, + "grad_norm": 0.294921875, + "learning_rate": 0.0019250068795370388, + "loss": 0.1279, + "step": 13433 + }, + { + "epoch": 0.11661357106275119, + "grad_norm": 0.1474609375, + "learning_rate": 0.0019249949849637475, + "loss": 0.1201, + "step": 13434 + }, + { + "epoch": 0.11662225154295536, + "grad_norm": 0.2431640625, + "learning_rate": 0.0019249830894882544, + "loss": 0.123, + "step": 13435 + }, + { + "epoch": 0.11663093202315952, + "grad_norm": 0.10888671875, + "learning_rate": 0.0019249711931105715, + "loss": 0.1504, + "step": 13436 + }, + { + "epoch": 0.11663961250336369, + "grad_norm": 0.2431640625, + "learning_rate": 0.001924959295830713, + "loss": 0.0947, + "step": 13437 + }, + { + "epoch": 0.11664829298356785, + "grad_norm": 0.41015625, + "learning_rate": 0.0019249473976486913, + "loss": 0.126, + "step": 13438 + }, + { + "epoch": 0.11665697346377202, + "grad_norm": 0.5, + "learning_rate": 0.0019249354985645195, + "loss": 0.1396, + "step": 13439 + }, + { + "epoch": 0.11666565394397618, + "grad_norm": 0.087890625, + "learning_rate": 0.0019249235985782107, + "loss": 0.0781, + "step": 13440 + }, + { + "epoch": 0.11667433442418035, + "grad_norm": 0.1796875, + "learning_rate": 0.0019249116976897778, + "loss": 0.083, + "step": 13441 + }, + { + "epoch": 0.11668301490438451, + "grad_norm": 
0.0869140625, + "learning_rate": 0.001924899795899234, + "loss": 0.1006, + "step": 13442 + }, + { + "epoch": 0.11669169538458868, + "grad_norm": 0.10009765625, + "learning_rate": 0.0019248878932065921, + "loss": 0.1221, + "step": 13443 + }, + { + "epoch": 0.11670037586479284, + "grad_norm": 0.1796875, + "learning_rate": 0.001924875989611865, + "loss": 0.1465, + "step": 13444 + }, + { + "epoch": 0.11670905634499701, + "grad_norm": 0.07666015625, + "learning_rate": 0.0019248640851150661, + "loss": 0.1465, + "step": 13445 + }, + { + "epoch": 0.11671773682520117, + "grad_norm": 0.439453125, + "learning_rate": 0.0019248521797162082, + "loss": 0.1299, + "step": 13446 + }, + { + "epoch": 0.11672641730540534, + "grad_norm": 0.75390625, + "learning_rate": 0.0019248402734153046, + "loss": 0.1074, + "step": 13447 + }, + { + "epoch": 0.1167350977856095, + "grad_norm": 1.0234375, + "learning_rate": 0.0019248283662123675, + "loss": 0.1533, + "step": 13448 + }, + { + "epoch": 0.11674377826581367, + "grad_norm": 0.39453125, + "learning_rate": 0.0019248164581074107, + "loss": 0.1738, + "step": 13449 + }, + { + "epoch": 0.11675245874601783, + "grad_norm": 0.203125, + "learning_rate": 0.0019248045491004474, + "loss": 0.1602, + "step": 13450 + }, + { + "epoch": 0.116761139226222, + "grad_norm": 0.32421875, + "learning_rate": 0.00192479263919149, + "loss": 0.1152, + "step": 13451 + }, + { + "epoch": 0.11676981970642616, + "grad_norm": 0.1826171875, + "learning_rate": 0.0019247807283805517, + "loss": 0.1387, + "step": 13452 + }, + { + "epoch": 0.11677850018663033, + "grad_norm": 0.1328125, + "learning_rate": 0.0019247688166676455, + "loss": 0.1094, + "step": 13453 + }, + { + "epoch": 0.1167871806668345, + "grad_norm": 0.2578125, + "learning_rate": 0.0019247569040527846, + "loss": 0.1377, + "step": 13454 + }, + { + "epoch": 0.11679586114703866, + "grad_norm": 0.875, + "learning_rate": 0.001924744990535982, + "loss": 0.125, + "step": 13455 + }, + { + "epoch": 0.11680454162724282, + "grad_norm": 0.55859375, + "learning_rate": 0.0019247330761172506, + "loss": 0.1084, + "step": 13456 + }, + { + "epoch": 0.11681322210744699, + "grad_norm": 0.58203125, + "learning_rate": 0.0019247211607966036, + "loss": 0.1143, + "step": 13457 + }, + { + "epoch": 0.11682190258765116, + "grad_norm": 0.154296875, + "learning_rate": 0.0019247092445740536, + "loss": 0.0879, + "step": 13458 + }, + { + "epoch": 0.11683058306785532, + "grad_norm": 0.59765625, + "learning_rate": 0.0019246973274496145, + "loss": 0.1934, + "step": 13459 + }, + { + "epoch": 0.11683926354805949, + "grad_norm": 0.5625, + "learning_rate": 0.0019246854094232984, + "loss": 0.166, + "step": 13460 + }, + { + "epoch": 0.11684794402826364, + "grad_norm": 0.208984375, + "learning_rate": 0.001924673490495119, + "loss": 0.1113, + "step": 13461 + }, + { + "epoch": 0.1168566245084678, + "grad_norm": 0.26171875, + "learning_rate": 0.001924661570665089, + "loss": 0.1064, + "step": 13462 + }, + { + "epoch": 0.11686530498867197, + "grad_norm": 0.181640625, + "learning_rate": 0.0019246496499332212, + "loss": 0.1152, + "step": 13463 + }, + { + "epoch": 0.11687398546887613, + "grad_norm": 0.1240234375, + "learning_rate": 0.001924637728299529, + "loss": 0.1143, + "step": 13464 + }, + { + "epoch": 0.1168826659490803, + "grad_norm": 0.29296875, + "learning_rate": 0.0019246258057640257, + "loss": 0.1367, + "step": 13465 + }, + { + "epoch": 0.11689134642928446, + "grad_norm": 0.2060546875, + "learning_rate": 0.0019246138823267238, + "loss": 0.0791, + "step": 13466 + }, + { + "epoch": 
0.11690002690948863, + "grad_norm": 0.24609375, + "learning_rate": 0.0019246019579876367, + "loss": 0.166, + "step": 13467 + }, + { + "epoch": 0.11690870738969279, + "grad_norm": 0.08251953125, + "learning_rate": 0.001924590032746777, + "loss": 0.1182, + "step": 13468 + }, + { + "epoch": 0.11691738786989696, + "grad_norm": 0.0927734375, + "learning_rate": 0.0019245781066041584, + "loss": 0.1221, + "step": 13469 + }, + { + "epoch": 0.11692606835010112, + "grad_norm": 0.10791015625, + "learning_rate": 0.001924566179559793, + "loss": 0.1104, + "step": 13470 + }, + { + "epoch": 0.11693474883030529, + "grad_norm": 0.48828125, + "learning_rate": 0.001924554251613695, + "loss": 0.1001, + "step": 13471 + }, + { + "epoch": 0.11694342931050945, + "grad_norm": 0.1806640625, + "learning_rate": 0.0019245423227658765, + "loss": 0.124, + "step": 13472 + }, + { + "epoch": 0.11695210979071362, + "grad_norm": 0.369140625, + "learning_rate": 0.001924530393016351, + "loss": 0.124, + "step": 13473 + }, + { + "epoch": 0.11696079027091778, + "grad_norm": 0.337890625, + "learning_rate": 0.0019245184623651315, + "loss": 0.1045, + "step": 13474 + }, + { + "epoch": 0.11696947075112195, + "grad_norm": 0.29296875, + "learning_rate": 0.0019245065308122308, + "loss": 0.1064, + "step": 13475 + }, + { + "epoch": 0.11697815123132611, + "grad_norm": 0.50390625, + "learning_rate": 0.0019244945983576626, + "loss": 0.0884, + "step": 13476 + }, + { + "epoch": 0.11698683171153028, + "grad_norm": 0.400390625, + "learning_rate": 0.0019244826650014393, + "loss": 0.0908, + "step": 13477 + }, + { + "epoch": 0.11699551219173444, + "grad_norm": 0.390625, + "learning_rate": 0.001924470730743574, + "loss": 0.1348, + "step": 13478 + }, + { + "epoch": 0.11700419267193861, + "grad_norm": 0.11181640625, + "learning_rate": 0.0019244587955840797, + "loss": 0.1074, + "step": 13479 + }, + { + "epoch": 0.11701287315214277, + "grad_norm": 0.396484375, + "learning_rate": 0.00192444685952297, + "loss": 0.0908, + "step": 13480 + }, + { + "epoch": 0.11702155363234694, + "grad_norm": 0.462890625, + "learning_rate": 0.0019244349225602576, + "loss": 0.1064, + "step": 13481 + }, + { + "epoch": 0.1170302341125511, + "grad_norm": 0.111328125, + "learning_rate": 0.0019244229846959555, + "loss": 0.1064, + "step": 13482 + }, + { + "epoch": 0.11703891459275527, + "grad_norm": 0.1376953125, + "learning_rate": 0.0019244110459300768, + "loss": 0.1084, + "step": 13483 + }, + { + "epoch": 0.11704759507295943, + "grad_norm": 0.146484375, + "learning_rate": 0.0019243991062626348, + "loss": 0.1699, + "step": 13484 + }, + { + "epoch": 0.1170562755531636, + "grad_norm": 0.1953125, + "learning_rate": 0.0019243871656936418, + "loss": 0.1426, + "step": 13485 + }, + { + "epoch": 0.11706495603336776, + "grad_norm": 0.08056640625, + "learning_rate": 0.0019243752242231115, + "loss": 0.1143, + "step": 13486 + }, + { + "epoch": 0.11707363651357193, + "grad_norm": 0.25390625, + "learning_rate": 0.001924363281851057, + "loss": 0.0986, + "step": 13487 + }, + { + "epoch": 0.1170823169937761, + "grad_norm": 0.080078125, + "learning_rate": 0.0019243513385774913, + "loss": 0.1152, + "step": 13488 + }, + { + "epoch": 0.11709099747398026, + "grad_norm": 0.9140625, + "learning_rate": 0.0019243393944024271, + "loss": 0.1914, + "step": 13489 + }, + { + "epoch": 0.11709967795418443, + "grad_norm": 1.421875, + "learning_rate": 0.001924327449325878, + "loss": 0.1641, + "step": 13490 + }, + { + "epoch": 0.11710835843438859, + "grad_norm": 0.455078125, + "learning_rate": 0.0019243155033478567, + 
"loss": 0.0977, + "step": 13491 + }, + { + "epoch": 0.11711703891459276, + "grad_norm": 0.296875, + "learning_rate": 0.0019243035564683764, + "loss": 0.1475, + "step": 13492 + }, + { + "epoch": 0.11712571939479692, + "grad_norm": 0.2333984375, + "learning_rate": 0.0019242916086874501, + "loss": 0.1138, + "step": 13493 + }, + { + "epoch": 0.11713439987500109, + "grad_norm": 1.0546875, + "learning_rate": 0.0019242796600050904, + "loss": 0.1396, + "step": 13494 + }, + { + "epoch": 0.11714308035520525, + "grad_norm": 0.419921875, + "learning_rate": 0.0019242677104213117, + "loss": 0.2129, + "step": 13495 + }, + { + "epoch": 0.11715176083540942, + "grad_norm": 0.1728515625, + "learning_rate": 0.0019242557599361257, + "loss": 0.105, + "step": 13496 + }, + { + "epoch": 0.11716044131561358, + "grad_norm": 0.474609375, + "learning_rate": 0.0019242438085495459, + "loss": 0.1816, + "step": 13497 + }, + { + "epoch": 0.11716912179581775, + "grad_norm": 0.2470703125, + "learning_rate": 0.0019242318562615857, + "loss": 0.1191, + "step": 13498 + }, + { + "epoch": 0.11717780227602191, + "grad_norm": 0.126953125, + "learning_rate": 0.0019242199030722578, + "loss": 0.1416, + "step": 13499 + }, + { + "epoch": 0.11718648275622608, + "grad_norm": 0.2001953125, + "learning_rate": 0.0019242079489815757, + "loss": 0.1309, + "step": 13500 + }, + { + "epoch": 0.11719516323643024, + "grad_norm": 0.41015625, + "learning_rate": 0.0019241959939895518, + "loss": 0.0996, + "step": 13501 + }, + { + "epoch": 0.1172038437166344, + "grad_norm": 0.314453125, + "learning_rate": 0.0019241840380961994, + "loss": 0.1289, + "step": 13502 + }, + { + "epoch": 0.11721252419683857, + "grad_norm": 0.49609375, + "learning_rate": 0.0019241720813015322, + "loss": 0.1465, + "step": 13503 + }, + { + "epoch": 0.11722120467704274, + "grad_norm": 0.98046875, + "learning_rate": 0.0019241601236055625, + "loss": 0.1113, + "step": 13504 + }, + { + "epoch": 0.1172298851572469, + "grad_norm": 0.62890625, + "learning_rate": 0.0019241481650083038, + "loss": 0.1348, + "step": 13505 + }, + { + "epoch": 0.11723856563745107, + "grad_norm": 0.86328125, + "learning_rate": 0.001924136205509769, + "loss": 0.1079, + "step": 13506 + }, + { + "epoch": 0.11724724611765523, + "grad_norm": 0.5625, + "learning_rate": 0.001924124245109971, + "loss": 0.1533, + "step": 13507 + }, + { + "epoch": 0.1172559265978594, + "grad_norm": 0.15234375, + "learning_rate": 0.0019241122838089236, + "loss": 0.1328, + "step": 13508 + }, + { + "epoch": 0.11726460707806356, + "grad_norm": 0.314453125, + "learning_rate": 0.0019241003216066389, + "loss": 0.127, + "step": 13509 + }, + { + "epoch": 0.11727328755826773, + "grad_norm": 0.3984375, + "learning_rate": 0.0019240883585031308, + "loss": 0.1758, + "step": 13510 + }, + { + "epoch": 0.11728196803847189, + "grad_norm": 0.08349609375, + "learning_rate": 0.0019240763944984118, + "loss": 0.1143, + "step": 13511 + }, + { + "epoch": 0.11729064851867606, + "grad_norm": 0.37890625, + "learning_rate": 0.0019240644295924958, + "loss": 0.1338, + "step": 13512 + }, + { + "epoch": 0.11729932899888022, + "grad_norm": 0.1845703125, + "learning_rate": 0.0019240524637853947, + "loss": 0.1377, + "step": 13513 + }, + { + "epoch": 0.11730800947908439, + "grad_norm": 0.91796875, + "learning_rate": 0.0019240404970771224, + "loss": 0.1357, + "step": 13514 + }, + { + "epoch": 0.11731668995928855, + "grad_norm": 0.3046875, + "learning_rate": 0.0019240285294676918, + "loss": 0.1465, + "step": 13515 + }, + { + "epoch": 0.11732537043949272, + "grad_norm": 
0.43359375, + "learning_rate": 0.001924016560957116, + "loss": 0.1396, + "step": 13516 + }, + { + "epoch": 0.11733405091969688, + "grad_norm": 0.275390625, + "learning_rate": 0.0019240045915454081, + "loss": 0.1177, + "step": 13517 + }, + { + "epoch": 0.11734273139990105, + "grad_norm": 0.177734375, + "learning_rate": 0.001923992621232581, + "loss": 0.083, + "step": 13518 + }, + { + "epoch": 0.11735141188010521, + "grad_norm": 0.08984375, + "learning_rate": 0.0019239806500186481, + "loss": 0.1147, + "step": 13519 + }, + { + "epoch": 0.11736009236030938, + "grad_norm": 0.1279296875, + "learning_rate": 0.0019239686779036222, + "loss": 0.1133, + "step": 13520 + }, + { + "epoch": 0.11736877284051354, + "grad_norm": 0.337890625, + "learning_rate": 0.0019239567048875165, + "loss": 0.1025, + "step": 13521 + }, + { + "epoch": 0.11737745332071771, + "grad_norm": 0.10205078125, + "learning_rate": 0.0019239447309703445, + "loss": 0.1445, + "step": 13522 + }, + { + "epoch": 0.11738613380092186, + "grad_norm": 1.0546875, + "learning_rate": 0.0019239327561521187, + "loss": 0.1592, + "step": 13523 + }, + { + "epoch": 0.11739481428112603, + "grad_norm": 0.75390625, + "learning_rate": 0.0019239207804328524, + "loss": 0.123, + "step": 13524 + }, + { + "epoch": 0.11740349476133019, + "grad_norm": 0.1884765625, + "learning_rate": 0.001923908803812559, + "loss": 0.1094, + "step": 13525 + }, + { + "epoch": 0.11741217524153436, + "grad_norm": 0.271484375, + "learning_rate": 0.001923896826291251, + "loss": 0.123, + "step": 13526 + }, + { + "epoch": 0.11742085572173852, + "grad_norm": 0.369140625, + "learning_rate": 0.0019238848478689418, + "loss": 0.1045, + "step": 13527 + }, + { + "epoch": 0.11742953620194269, + "grad_norm": 0.337890625, + "learning_rate": 0.0019238728685456443, + "loss": 0.0811, + "step": 13528 + }, + { + "epoch": 0.11743821668214685, + "grad_norm": 0.1181640625, + "learning_rate": 0.001923860888321372, + "loss": 0.0869, + "step": 13529 + }, + { + "epoch": 0.11744689716235102, + "grad_norm": 0.1953125, + "learning_rate": 0.001923848907196138, + "loss": 0.0874, + "step": 13530 + }, + { + "epoch": 0.11745557764255518, + "grad_norm": 0.25, + "learning_rate": 0.001923836925169955, + "loss": 0.0977, + "step": 13531 + }, + { + "epoch": 0.11746425812275935, + "grad_norm": 0.107421875, + "learning_rate": 0.0019238249422428365, + "loss": 0.1113, + "step": 13532 + }, + { + "epoch": 0.11747293860296351, + "grad_norm": 0.404296875, + "learning_rate": 0.0019238129584147952, + "loss": 0.0962, + "step": 13533 + }, + { + "epoch": 0.11748161908316768, + "grad_norm": 0.6171875, + "learning_rate": 0.0019238009736858447, + "loss": 0.1074, + "step": 13534 + }, + { + "epoch": 0.11749029956337184, + "grad_norm": 1.0, + "learning_rate": 0.0019237889880559975, + "loss": 0.0825, + "step": 13535 + }, + { + "epoch": 0.117498980043576, + "grad_norm": 0.423828125, + "learning_rate": 0.0019237770015252673, + "loss": 0.1045, + "step": 13536 + }, + { + "epoch": 0.11750766052378017, + "grad_norm": 0.56640625, + "learning_rate": 0.0019237650140936668, + "loss": 0.1621, + "step": 13537 + }, + { + "epoch": 0.11751634100398434, + "grad_norm": 0.224609375, + "learning_rate": 0.0019237530257612092, + "loss": 0.0996, + "step": 13538 + }, + { + "epoch": 0.1175250214841885, + "grad_norm": 0.32421875, + "learning_rate": 0.001923741036527908, + "loss": 0.1289, + "step": 13539 + }, + { + "epoch": 0.11753370196439267, + "grad_norm": 0.4296875, + "learning_rate": 0.0019237290463937759, + "loss": 0.1523, + "step": 13540 + }, + { + "epoch": 
0.11754238244459683, + "grad_norm": 0.32421875, + "learning_rate": 0.001923717055358826, + "loss": 0.1045, + "step": 13541 + }, + { + "epoch": 0.117551062924801, + "grad_norm": 0.11474609375, + "learning_rate": 0.0019237050634230714, + "loss": 0.1162, + "step": 13542 + }, + { + "epoch": 0.11755974340500516, + "grad_norm": 0.2578125, + "learning_rate": 0.0019236930705865253, + "loss": 0.1299, + "step": 13543 + }, + { + "epoch": 0.11756842388520933, + "grad_norm": 0.07958984375, + "learning_rate": 0.001923681076849201, + "loss": 0.0996, + "step": 13544 + }, + { + "epoch": 0.11757710436541349, + "grad_norm": 0.337890625, + "learning_rate": 0.0019236690822111114, + "loss": 0.1338, + "step": 13545 + }, + { + "epoch": 0.11758578484561766, + "grad_norm": 0.357421875, + "learning_rate": 0.0019236570866722697, + "loss": 0.0859, + "step": 13546 + }, + { + "epoch": 0.11759446532582182, + "grad_norm": 0.2890625, + "learning_rate": 0.0019236450902326888, + "loss": 0.1104, + "step": 13547 + }, + { + "epoch": 0.11760314580602599, + "grad_norm": 0.419921875, + "learning_rate": 0.0019236330928923823, + "loss": 0.1221, + "step": 13548 + }, + { + "epoch": 0.11761182628623015, + "grad_norm": 0.34765625, + "learning_rate": 0.0019236210946513626, + "loss": 0.1289, + "step": 13549 + }, + { + "epoch": 0.11762050676643432, + "grad_norm": 0.53515625, + "learning_rate": 0.0019236090955096435, + "loss": 0.1484, + "step": 13550 + }, + { + "epoch": 0.11762918724663848, + "grad_norm": 0.15234375, + "learning_rate": 0.0019235970954672378, + "loss": 0.1309, + "step": 13551 + }, + { + "epoch": 0.11763786772684265, + "grad_norm": 0.1962890625, + "learning_rate": 0.0019235850945241587, + "loss": 0.0923, + "step": 13552 + }, + { + "epoch": 0.11764654820704681, + "grad_norm": 0.11767578125, + "learning_rate": 0.0019235730926804196, + "loss": 0.1211, + "step": 13553 + }, + { + "epoch": 0.11765522868725098, + "grad_norm": 0.33984375, + "learning_rate": 0.001923561089936033, + "loss": 0.0996, + "step": 13554 + }, + { + "epoch": 0.11766390916745514, + "grad_norm": 0.11865234375, + "learning_rate": 0.0019235490862910126, + "loss": 0.1367, + "step": 13555 + }, + { + "epoch": 0.11767258964765931, + "grad_norm": 0.40625, + "learning_rate": 0.0019235370817453712, + "loss": 0.1074, + "step": 13556 + }, + { + "epoch": 0.11768127012786347, + "grad_norm": 0.65625, + "learning_rate": 0.001923525076299122, + "loss": 0.1777, + "step": 13557 + }, + { + "epoch": 0.11768995060806764, + "grad_norm": 0.291015625, + "learning_rate": 0.001923513069952278, + "loss": 0.1289, + "step": 13558 + }, + { + "epoch": 0.1176986310882718, + "grad_norm": 0.287109375, + "learning_rate": 0.0019235010627048526, + "loss": 0.123, + "step": 13559 + }, + { + "epoch": 0.11770731156847597, + "grad_norm": 0.10498046875, + "learning_rate": 0.0019234890545568588, + "loss": 0.0967, + "step": 13560 + }, + { + "epoch": 0.11771599204868013, + "grad_norm": 0.400390625, + "learning_rate": 0.0019234770455083098, + "loss": 0.1562, + "step": 13561 + }, + { + "epoch": 0.1177246725288843, + "grad_norm": 0.08642578125, + "learning_rate": 0.0019234650355592186, + "loss": 0.0698, + "step": 13562 + }, + { + "epoch": 0.11773335300908847, + "grad_norm": 0.0751953125, + "learning_rate": 0.0019234530247095984, + "loss": 0.1084, + "step": 13563 + }, + { + "epoch": 0.11774203348929263, + "grad_norm": 0.11474609375, + "learning_rate": 0.0019234410129594626, + "loss": 0.0898, + "step": 13564 + }, + { + "epoch": 0.1177507139694968, + "grad_norm": 0.185546875, + "learning_rate": 
0.0019234290003088237, + "loss": 0.1104, + "step": 13565 + }, + { + "epoch": 0.11775939444970096, + "grad_norm": 1.65625, + "learning_rate": 0.0019234169867576954, + "loss": 0.1216, + "step": 13566 + }, + { + "epoch": 0.11776807492990513, + "grad_norm": 0.1591796875, + "learning_rate": 0.0019234049723060908, + "loss": 0.1367, + "step": 13567 + }, + { + "epoch": 0.11777675541010929, + "grad_norm": 0.796875, + "learning_rate": 0.0019233929569540226, + "loss": 0.1562, + "step": 13568 + }, + { + "epoch": 0.11778543589031346, + "grad_norm": 0.1630859375, + "learning_rate": 0.0019233809407015047, + "loss": 0.1055, + "step": 13569 + }, + { + "epoch": 0.11779411637051762, + "grad_norm": 0.8046875, + "learning_rate": 0.0019233689235485492, + "loss": 0.0801, + "step": 13570 + }, + { + "epoch": 0.11780279685072179, + "grad_norm": 0.32421875, + "learning_rate": 0.0019233569054951701, + "loss": 0.1167, + "step": 13571 + }, + { + "epoch": 0.11781147733092595, + "grad_norm": 0.77734375, + "learning_rate": 0.0019233448865413802, + "loss": 0.0869, + "step": 13572 + }, + { + "epoch": 0.11782015781113012, + "grad_norm": 0.328125, + "learning_rate": 0.001923332866687193, + "loss": 0.1094, + "step": 13573 + }, + { + "epoch": 0.11782883829133428, + "grad_norm": 0.1640625, + "learning_rate": 0.0019233208459326209, + "loss": 0.1064, + "step": 13574 + }, + { + "epoch": 0.11783751877153845, + "grad_norm": 0.08642578125, + "learning_rate": 0.0019233088242776777, + "loss": 0.0781, + "step": 13575 + }, + { + "epoch": 0.11784619925174261, + "grad_norm": 0.150390625, + "learning_rate": 0.0019232968017223763, + "loss": 0.1416, + "step": 13576 + }, + { + "epoch": 0.11785487973194678, + "grad_norm": 0.061279296875, + "learning_rate": 0.0019232847782667303, + "loss": 0.0854, + "step": 13577 + }, + { + "epoch": 0.11786356021215094, + "grad_norm": 0.55859375, + "learning_rate": 0.0019232727539107515, + "loss": 0.0996, + "step": 13578 + }, + { + "epoch": 0.1178722406923551, + "grad_norm": 1.2734375, + "learning_rate": 0.0019232607286544547, + "loss": 0.1416, + "step": 13579 + }, + { + "epoch": 0.11788092117255927, + "grad_norm": 0.74609375, + "learning_rate": 0.0019232487024978522, + "loss": 0.1113, + "step": 13580 + }, + { + "epoch": 0.11788960165276344, + "grad_norm": 0.1845703125, + "learning_rate": 0.0019232366754409571, + "loss": 0.1201, + "step": 13581 + }, + { + "epoch": 0.1178982821329676, + "grad_norm": 0.396484375, + "learning_rate": 0.001923224647483783, + "loss": 0.1182, + "step": 13582 + }, + { + "epoch": 0.11790696261317177, + "grad_norm": 0.490234375, + "learning_rate": 0.0019232126186263427, + "loss": 0.1025, + "step": 13583 + }, + { + "epoch": 0.11791564309337592, + "grad_norm": 0.392578125, + "learning_rate": 0.0019232005888686495, + "loss": 0.1157, + "step": 13584 + }, + { + "epoch": 0.11792432357358008, + "grad_norm": 0.33203125, + "learning_rate": 0.0019231885582107165, + "loss": 0.0996, + "step": 13585 + }, + { + "epoch": 0.11793300405378425, + "grad_norm": 0.1318359375, + "learning_rate": 0.0019231765266525568, + "loss": 0.1064, + "step": 13586 + }, + { + "epoch": 0.11794168453398841, + "grad_norm": 1.3828125, + "learning_rate": 0.0019231644941941834, + "loss": 0.126, + "step": 13587 + }, + { + "epoch": 0.11795036501419258, + "grad_norm": 0.078125, + "learning_rate": 0.0019231524608356101, + "loss": 0.126, + "step": 13588 + }, + { + "epoch": 0.11795904549439674, + "grad_norm": 0.06201171875, + "learning_rate": 0.0019231404265768492, + "loss": 0.0908, + "step": 13589 + }, + { + "epoch": 
0.11796772597460091, + "grad_norm": 0.85546875, + "learning_rate": 0.0019231283914179147, + "loss": 0.1484, + "step": 13590 + }, + { + "epoch": 0.11797640645480507, + "grad_norm": 1.109375, + "learning_rate": 0.001923116355358819, + "loss": 0.1289, + "step": 13591 + }, + { + "epoch": 0.11798508693500924, + "grad_norm": 0.361328125, + "learning_rate": 0.001923104318399576, + "loss": 0.1309, + "step": 13592 + }, + { + "epoch": 0.1179937674152134, + "grad_norm": 0.65625, + "learning_rate": 0.0019230922805401982, + "loss": 0.1104, + "step": 13593 + }, + { + "epoch": 0.11800244789541757, + "grad_norm": 0.20703125, + "learning_rate": 0.0019230802417806992, + "loss": 0.1357, + "step": 13594 + }, + { + "epoch": 0.11801112837562173, + "grad_norm": 0.58984375, + "learning_rate": 0.001923068202121092, + "loss": 0.0996, + "step": 13595 + }, + { + "epoch": 0.1180198088558259, + "grad_norm": 0.9296875, + "learning_rate": 0.0019230561615613897, + "loss": 0.1191, + "step": 13596 + }, + { + "epoch": 0.11802848933603007, + "grad_norm": 0.4140625, + "learning_rate": 0.0019230441201016058, + "loss": 0.0771, + "step": 13597 + }, + { + "epoch": 0.11803716981623423, + "grad_norm": 0.0966796875, + "learning_rate": 0.001923032077741753, + "loss": 0.1211, + "step": 13598 + }, + { + "epoch": 0.1180458502964384, + "grad_norm": 0.494140625, + "learning_rate": 0.0019230200344818446, + "loss": 0.1152, + "step": 13599 + }, + { + "epoch": 0.11805453077664256, + "grad_norm": 0.80078125, + "learning_rate": 0.0019230079903218937, + "loss": 0.1133, + "step": 13600 + }, + { + "epoch": 0.11806321125684673, + "grad_norm": 0.11279296875, + "learning_rate": 0.001922995945261914, + "loss": 0.0967, + "step": 13601 + }, + { + "epoch": 0.11807189173705089, + "grad_norm": 0.2119140625, + "learning_rate": 0.0019229838993019182, + "loss": 0.0737, + "step": 13602 + }, + { + "epoch": 0.11808057221725506, + "grad_norm": 0.283203125, + "learning_rate": 0.0019229718524419197, + "loss": 0.1562, + "step": 13603 + }, + { + "epoch": 0.11808925269745922, + "grad_norm": 0.7265625, + "learning_rate": 0.0019229598046819312, + "loss": 0.1191, + "step": 13604 + }, + { + "epoch": 0.11809793317766339, + "grad_norm": 1.484375, + "learning_rate": 0.0019229477560219663, + "loss": 0.1133, + "step": 13605 + }, + { + "epoch": 0.11810661365786755, + "grad_norm": 0.267578125, + "learning_rate": 0.0019229357064620386, + "loss": 0.0977, + "step": 13606 + }, + { + "epoch": 0.11811529413807172, + "grad_norm": 0.2333984375, + "learning_rate": 0.0019229236560021601, + "loss": 0.1084, + "step": 13607 + }, + { + "epoch": 0.11812397461827588, + "grad_norm": 0.08251953125, + "learning_rate": 0.001922911604642345, + "loss": 0.0728, + "step": 13608 + }, + { + "epoch": 0.11813265509848005, + "grad_norm": 0.263671875, + "learning_rate": 0.0019228995523826063, + "loss": 0.1123, + "step": 13609 + }, + { + "epoch": 0.11814133557868421, + "grad_norm": 0.10498046875, + "learning_rate": 0.0019228874992229567, + "loss": 0.1021, + "step": 13610 + }, + { + "epoch": 0.11815001605888838, + "grad_norm": 0.2421875, + "learning_rate": 0.0019228754451634097, + "loss": 0.1348, + "step": 13611 + }, + { + "epoch": 0.11815869653909254, + "grad_norm": 0.291015625, + "learning_rate": 0.0019228633902039786, + "loss": 0.0845, + "step": 13612 + }, + { + "epoch": 0.11816737701929671, + "grad_norm": 0.2373046875, + "learning_rate": 0.0019228513343446762, + "loss": 0.1631, + "step": 13613 + }, + { + "epoch": 0.11817605749950087, + "grad_norm": 0.2490234375, + "learning_rate": 0.0019228392775855164, + 
"loss": 0.127, + "step": 13614 + }, + { + "epoch": 0.11818473797970504, + "grad_norm": 0.349609375, + "learning_rate": 0.0019228272199265115, + "loss": 0.0947, + "step": 13615 + }, + { + "epoch": 0.1181934184599092, + "grad_norm": 0.51953125, + "learning_rate": 0.0019228151613676753, + "loss": 0.0654, + "step": 13616 + }, + { + "epoch": 0.11820209894011337, + "grad_norm": 0.193359375, + "learning_rate": 0.0019228031019090208, + "loss": 0.1133, + "step": 13617 + }, + { + "epoch": 0.11821077942031753, + "grad_norm": 0.2021484375, + "learning_rate": 0.0019227910415505615, + "loss": 0.1006, + "step": 13618 + }, + { + "epoch": 0.1182194599005217, + "grad_norm": 0.09814453125, + "learning_rate": 0.00192277898029231, + "loss": 0.1289, + "step": 13619 + }, + { + "epoch": 0.11822814038072586, + "grad_norm": 1.2890625, + "learning_rate": 0.0019227669181342796, + "loss": 0.1982, + "step": 13620 + }, + { + "epoch": 0.11823682086093003, + "grad_norm": 0.2490234375, + "learning_rate": 0.0019227548550764838, + "loss": 0.1226, + "step": 13621 + }, + { + "epoch": 0.1182455013411342, + "grad_norm": 1.0546875, + "learning_rate": 0.0019227427911189356, + "loss": 0.0996, + "step": 13622 + }, + { + "epoch": 0.11825418182133836, + "grad_norm": 0.78125, + "learning_rate": 0.0019227307262616481, + "loss": 0.124, + "step": 13623 + }, + { + "epoch": 0.11826286230154252, + "grad_norm": 1.546875, + "learning_rate": 0.0019227186605046353, + "loss": 0.1133, + "step": 13624 + }, + { + "epoch": 0.11827154278174669, + "grad_norm": 0.478515625, + "learning_rate": 0.0019227065938479092, + "loss": 0.0996, + "step": 13625 + }, + { + "epoch": 0.11828022326195085, + "grad_norm": 0.703125, + "learning_rate": 0.0019226945262914837, + "loss": 0.1177, + "step": 13626 + }, + { + "epoch": 0.11828890374215502, + "grad_norm": 0.29296875, + "learning_rate": 0.0019226824578353714, + "loss": 0.0913, + "step": 13627 + }, + { + "epoch": 0.11829758422235918, + "grad_norm": 0.40234375, + "learning_rate": 0.0019226703884795864, + "loss": 0.1016, + "step": 13628 + }, + { + "epoch": 0.11830626470256335, + "grad_norm": 0.1259765625, + "learning_rate": 0.0019226583182241413, + "loss": 0.125, + "step": 13629 + }, + { + "epoch": 0.11831494518276751, + "grad_norm": 0.37890625, + "learning_rate": 0.0019226462470690493, + "loss": 0.209, + "step": 13630 + }, + { + "epoch": 0.11832362566297168, + "grad_norm": 0.09912109375, + "learning_rate": 0.0019226341750143239, + "loss": 0.1006, + "step": 13631 + }, + { + "epoch": 0.11833230614317584, + "grad_norm": 0.416015625, + "learning_rate": 0.0019226221020599782, + "loss": 0.127, + "step": 13632 + }, + { + "epoch": 0.11834098662338001, + "grad_norm": 0.5234375, + "learning_rate": 0.0019226100282060251, + "loss": 0.0986, + "step": 13633 + }, + { + "epoch": 0.11834966710358417, + "grad_norm": 0.421875, + "learning_rate": 0.001922597953452478, + "loss": 0.0996, + "step": 13634 + }, + { + "epoch": 0.11835834758378834, + "grad_norm": 0.63671875, + "learning_rate": 0.0019225858777993507, + "loss": 0.0957, + "step": 13635 + }, + { + "epoch": 0.1183670280639925, + "grad_norm": 0.5546875, + "learning_rate": 0.0019225738012466549, + "loss": 0.1064, + "step": 13636 + }, + { + "epoch": 0.11837570854419667, + "grad_norm": 0.4140625, + "learning_rate": 0.0019225617237944054, + "loss": 0.0986, + "step": 13637 + }, + { + "epoch": 0.11838438902440084, + "grad_norm": 0.33203125, + "learning_rate": 0.001922549645442615, + "loss": 0.1367, + "step": 13638 + }, + { + "epoch": 0.118393069504605, + "grad_norm": 0.80078125, + 
"learning_rate": 0.001922537566191296, + "loss": 0.1221, + "step": 13639 + }, + { + "epoch": 0.11840174998480917, + "grad_norm": 0.1171875, + "learning_rate": 0.0019225254860404627, + "loss": 0.1533, + "step": 13640 + }, + { + "epoch": 0.11841043046501333, + "grad_norm": 0.365234375, + "learning_rate": 0.0019225134049901274, + "loss": 0.1016, + "step": 13641 + }, + { + "epoch": 0.1184191109452175, + "grad_norm": 0.18359375, + "learning_rate": 0.0019225013230403044, + "loss": 0.1035, + "step": 13642 + }, + { + "epoch": 0.11842779142542166, + "grad_norm": 0.404296875, + "learning_rate": 0.0019224892401910059, + "loss": 0.1729, + "step": 13643 + }, + { + "epoch": 0.11843647190562583, + "grad_norm": 0.296875, + "learning_rate": 0.0019224771564422456, + "loss": 0.1348, + "step": 13644 + }, + { + "epoch": 0.11844515238582999, + "grad_norm": 0.1396484375, + "learning_rate": 0.0019224650717940368, + "loss": 0.1021, + "step": 13645 + }, + { + "epoch": 0.11845383286603414, + "grad_norm": 0.287109375, + "learning_rate": 0.0019224529862463926, + "loss": 0.1699, + "step": 13646 + }, + { + "epoch": 0.11846251334623831, + "grad_norm": 0.279296875, + "learning_rate": 0.0019224408997993258, + "loss": 0.1177, + "step": 13647 + }, + { + "epoch": 0.11847119382644247, + "grad_norm": 0.390625, + "learning_rate": 0.0019224288124528503, + "loss": 0.1348, + "step": 13648 + }, + { + "epoch": 0.11847987430664664, + "grad_norm": 0.330078125, + "learning_rate": 0.001922416724206979, + "loss": 0.1348, + "step": 13649 + }, + { + "epoch": 0.1184885547868508, + "grad_norm": 0.8203125, + "learning_rate": 0.001922404635061725, + "loss": 0.1152, + "step": 13650 + }, + { + "epoch": 0.11849723526705497, + "grad_norm": 0.46875, + "learning_rate": 0.0019223925450171017, + "loss": 0.0898, + "step": 13651 + }, + { + "epoch": 0.11850591574725913, + "grad_norm": 0.65234375, + "learning_rate": 0.0019223804540731224, + "loss": 0.1064, + "step": 13652 + }, + { + "epoch": 0.1185145962274633, + "grad_norm": 0.41796875, + "learning_rate": 0.0019223683622298, + "loss": 0.125, + "step": 13653 + }, + { + "epoch": 0.11852327670766746, + "grad_norm": 0.287109375, + "learning_rate": 0.001922356269487148, + "loss": 0.083, + "step": 13654 + }, + { + "epoch": 0.11853195718787163, + "grad_norm": 0.7265625, + "learning_rate": 0.0019223441758451797, + "loss": 0.0947, + "step": 13655 + }, + { + "epoch": 0.1185406376680758, + "grad_norm": 0.65234375, + "learning_rate": 0.001922332081303908, + "loss": 0.1299, + "step": 13656 + }, + { + "epoch": 0.11854931814827996, + "grad_norm": 0.126953125, + "learning_rate": 0.001922319985863346, + "loss": 0.0806, + "step": 13657 + }, + { + "epoch": 0.11855799862848412, + "grad_norm": 0.1435546875, + "learning_rate": 0.0019223078895235079, + "loss": 0.0918, + "step": 13658 + }, + { + "epoch": 0.11856667910868829, + "grad_norm": 0.30859375, + "learning_rate": 0.0019222957922844055, + "loss": 0.0752, + "step": 13659 + }, + { + "epoch": 0.11857535958889245, + "grad_norm": 0.4453125, + "learning_rate": 0.0019222836941460532, + "loss": 0.124, + "step": 13660 + }, + { + "epoch": 0.11858404006909662, + "grad_norm": 0.2197265625, + "learning_rate": 0.0019222715951084637, + "loss": 0.1045, + "step": 13661 + }, + { + "epoch": 0.11859272054930078, + "grad_norm": 0.10986328125, + "learning_rate": 0.0019222594951716504, + "loss": 0.1001, + "step": 13662 + }, + { + "epoch": 0.11860140102950495, + "grad_norm": 0.4296875, + "learning_rate": 0.0019222473943356266, + "loss": 0.1104, + "step": 13663 + }, + { + "epoch": 
0.11861008150970911, + "grad_norm": 0.98046875, + "learning_rate": 0.0019222352926004053, + "loss": 0.1309, + "step": 13664 + }, + { + "epoch": 0.11861876198991328, + "grad_norm": 0.259765625, + "learning_rate": 0.0019222231899659998, + "loss": 0.1201, + "step": 13665 + }, + { + "epoch": 0.11862744247011744, + "grad_norm": 0.97265625, + "learning_rate": 0.0019222110864324231, + "loss": 0.1406, + "step": 13666 + }, + { + "epoch": 0.11863612295032161, + "grad_norm": 0.341796875, + "learning_rate": 0.001922198981999689, + "loss": 0.1279, + "step": 13667 + }, + { + "epoch": 0.11864480343052577, + "grad_norm": 0.73828125, + "learning_rate": 0.0019221868766678106, + "loss": 0.1387, + "step": 13668 + }, + { + "epoch": 0.11865348391072994, + "grad_norm": 0.19140625, + "learning_rate": 0.0019221747704368007, + "loss": 0.1074, + "step": 13669 + }, + { + "epoch": 0.1186621643909341, + "grad_norm": 0.22265625, + "learning_rate": 0.0019221626633066729, + "loss": 0.1123, + "step": 13670 + }, + { + "epoch": 0.11867084487113827, + "grad_norm": 0.59375, + "learning_rate": 0.0019221505552774406, + "loss": 0.0889, + "step": 13671 + }, + { + "epoch": 0.11867952535134244, + "grad_norm": 0.255859375, + "learning_rate": 0.0019221384463491164, + "loss": 0.0767, + "step": 13672 + }, + { + "epoch": 0.1186882058315466, + "grad_norm": 0.78515625, + "learning_rate": 0.001922126336521714, + "loss": 0.1187, + "step": 13673 + }, + { + "epoch": 0.11869688631175077, + "grad_norm": 0.337890625, + "learning_rate": 0.0019221142257952468, + "loss": 0.1318, + "step": 13674 + }, + { + "epoch": 0.11870556679195493, + "grad_norm": 0.625, + "learning_rate": 0.0019221021141697277, + "loss": 0.1196, + "step": 13675 + }, + { + "epoch": 0.1187142472721591, + "grad_norm": 0.5703125, + "learning_rate": 0.0019220900016451703, + "loss": 0.1309, + "step": 13676 + }, + { + "epoch": 0.11872292775236326, + "grad_norm": 0.392578125, + "learning_rate": 0.0019220778882215875, + "loss": 0.0977, + "step": 13677 + }, + { + "epoch": 0.11873160823256743, + "grad_norm": 0.578125, + "learning_rate": 0.0019220657738989928, + "loss": 0.1118, + "step": 13678 + }, + { + "epoch": 0.11874028871277159, + "grad_norm": 0.1083984375, + "learning_rate": 0.0019220536586773988, + "loss": 0.1328, + "step": 13679 + }, + { + "epoch": 0.11874896919297576, + "grad_norm": 0.1416015625, + "learning_rate": 0.00192204154255682, + "loss": 0.1201, + "step": 13680 + }, + { + "epoch": 0.11875764967317992, + "grad_norm": 0.1064453125, + "learning_rate": 0.0019220294255372684, + "loss": 0.0952, + "step": 13681 + }, + { + "epoch": 0.11876633015338409, + "grad_norm": 0.447265625, + "learning_rate": 0.001922017307618758, + "loss": 0.1162, + "step": 13682 + }, + { + "epoch": 0.11877501063358825, + "grad_norm": 0.310546875, + "learning_rate": 0.0019220051888013017, + "loss": 0.1484, + "step": 13683 + }, + { + "epoch": 0.11878369111379242, + "grad_norm": 0.5234375, + "learning_rate": 0.001921993069084913, + "loss": 0.1318, + "step": 13684 + }, + { + "epoch": 0.11879237159399658, + "grad_norm": 0.349609375, + "learning_rate": 0.001921980948469605, + "loss": 0.0938, + "step": 13685 + }, + { + "epoch": 0.11880105207420075, + "grad_norm": 0.1845703125, + "learning_rate": 0.001921968826955391, + "loss": 0.0952, + "step": 13686 + }, + { + "epoch": 0.11880973255440491, + "grad_norm": 0.0751953125, + "learning_rate": 0.0019219567045422842, + "loss": 0.0967, + "step": 13687 + }, + { + "epoch": 0.11881841303460908, + "grad_norm": 0.140625, + "learning_rate": 0.0019219445812302977, + "loss": 
0.123, + "step": 13688 + }, + { + "epoch": 0.11882709351481324, + "grad_norm": 0.318359375, + "learning_rate": 0.0019219324570194452, + "loss": 0.106, + "step": 13689 + }, + { + "epoch": 0.11883577399501741, + "grad_norm": 0.333984375, + "learning_rate": 0.0019219203319097396, + "loss": 0.1152, + "step": 13690 + }, + { + "epoch": 0.11884445447522157, + "grad_norm": 0.16796875, + "learning_rate": 0.0019219082059011943, + "loss": 0.0869, + "step": 13691 + }, + { + "epoch": 0.11885313495542574, + "grad_norm": 0.51171875, + "learning_rate": 0.0019218960789938227, + "loss": 0.0923, + "step": 13692 + }, + { + "epoch": 0.1188618154356299, + "grad_norm": 0.09130859375, + "learning_rate": 0.0019218839511876379, + "loss": 0.1436, + "step": 13693 + }, + { + "epoch": 0.11887049591583407, + "grad_norm": 0.7890625, + "learning_rate": 0.0019218718224826529, + "loss": 0.0957, + "step": 13694 + }, + { + "epoch": 0.11887917639603823, + "grad_norm": 0.07373046875, + "learning_rate": 0.0019218596928788812, + "loss": 0.0996, + "step": 13695 + }, + { + "epoch": 0.1188878568762424, + "grad_norm": 0.322265625, + "learning_rate": 0.0019218475623763362, + "loss": 0.1064, + "step": 13696 + }, + { + "epoch": 0.11889653735644656, + "grad_norm": 0.197265625, + "learning_rate": 0.0019218354309750313, + "loss": 0.1641, + "step": 13697 + }, + { + "epoch": 0.11890521783665073, + "grad_norm": 0.30078125, + "learning_rate": 0.001921823298674979, + "loss": 0.126, + "step": 13698 + }, + { + "epoch": 0.1189138983168549, + "grad_norm": 0.2470703125, + "learning_rate": 0.0019218111654761936, + "loss": 0.0742, + "step": 13699 + }, + { + "epoch": 0.11892257879705906, + "grad_norm": 0.443359375, + "learning_rate": 0.0019217990313786875, + "loss": 0.1182, + "step": 13700 + }, + { + "epoch": 0.11893125927726322, + "grad_norm": 0.443359375, + "learning_rate": 0.0019217868963824747, + "loss": 0.1543, + "step": 13701 + }, + { + "epoch": 0.11893993975746739, + "grad_norm": 0.984375, + "learning_rate": 0.0019217747604875677, + "loss": 0.1211, + "step": 13702 + }, + { + "epoch": 0.11894862023767155, + "grad_norm": 0.1767578125, + "learning_rate": 0.0019217626236939802, + "loss": 0.1279, + "step": 13703 + }, + { + "epoch": 0.11895730071787572, + "grad_norm": 0.08056640625, + "learning_rate": 0.001921750486001726, + "loss": 0.0957, + "step": 13704 + }, + { + "epoch": 0.11896598119807988, + "grad_norm": 0.58984375, + "learning_rate": 0.001921738347410817, + "loss": 0.1621, + "step": 13705 + }, + { + "epoch": 0.11897466167828405, + "grad_norm": 0.423828125, + "learning_rate": 0.0019217262079212678, + "loss": 0.1338, + "step": 13706 + }, + { + "epoch": 0.11898334215848821, + "grad_norm": 0.1201171875, + "learning_rate": 0.001921714067533091, + "loss": 0.1191, + "step": 13707 + }, + { + "epoch": 0.11899202263869237, + "grad_norm": 0.416015625, + "learning_rate": 0.0019217019262463002, + "loss": 0.1523, + "step": 13708 + }, + { + "epoch": 0.11900070311889653, + "grad_norm": 0.74609375, + "learning_rate": 0.0019216897840609085, + "loss": 0.1455, + "step": 13709 + }, + { + "epoch": 0.1190093835991007, + "grad_norm": 0.65625, + "learning_rate": 0.001921677640976929, + "loss": 0.1387, + "step": 13710 + }, + { + "epoch": 0.11901806407930486, + "grad_norm": 0.337890625, + "learning_rate": 0.0019216654969943753, + "loss": 0.1016, + "step": 13711 + }, + { + "epoch": 0.11902674455950903, + "grad_norm": 0.51953125, + "learning_rate": 0.0019216533521132604, + "loss": 0.0908, + "step": 13712 + }, + { + "epoch": 0.11903542503971319, + "grad_norm": 
0.1416015625, + "learning_rate": 0.0019216412063335982, + "loss": 0.0977, + "step": 13713 + }, + { + "epoch": 0.11904410551991736, + "grad_norm": 0.283203125, + "learning_rate": 0.001921629059655401, + "loss": 0.1196, + "step": 13714 + }, + { + "epoch": 0.11905278600012152, + "grad_norm": 0.17578125, + "learning_rate": 0.0019216169120786829, + "loss": 0.0947, + "step": 13715 + }, + { + "epoch": 0.11906146648032569, + "grad_norm": 0.5859375, + "learning_rate": 0.0019216047636034569, + "loss": 0.1621, + "step": 13716 + }, + { + "epoch": 0.11907014696052985, + "grad_norm": 0.94140625, + "learning_rate": 0.0019215926142297363, + "loss": 0.1113, + "step": 13717 + }, + { + "epoch": 0.11907882744073402, + "grad_norm": 0.4296875, + "learning_rate": 0.0019215804639575338, + "loss": 0.1318, + "step": 13718 + }, + { + "epoch": 0.11908750792093818, + "grad_norm": 0.2255859375, + "learning_rate": 0.0019215683127868637, + "loss": 0.1172, + "step": 13719 + }, + { + "epoch": 0.11909618840114235, + "grad_norm": 0.408203125, + "learning_rate": 0.001921556160717739, + "loss": 0.1523, + "step": 13720 + }, + { + "epoch": 0.11910486888134651, + "grad_norm": 0.6484375, + "learning_rate": 0.0019215440077501728, + "loss": 0.1211, + "step": 13721 + }, + { + "epoch": 0.11911354936155068, + "grad_norm": 0.306640625, + "learning_rate": 0.0019215318538841782, + "loss": 0.1143, + "step": 13722 + }, + { + "epoch": 0.11912222984175484, + "grad_norm": 0.099609375, + "learning_rate": 0.0019215196991197688, + "loss": 0.0962, + "step": 13723 + }, + { + "epoch": 0.11913091032195901, + "grad_norm": 1.1796875, + "learning_rate": 0.0019215075434569576, + "loss": 0.1367, + "step": 13724 + }, + { + "epoch": 0.11913959080216317, + "grad_norm": 0.1611328125, + "learning_rate": 0.0019214953868957583, + "loss": 0.1133, + "step": 13725 + }, + { + "epoch": 0.11914827128236734, + "grad_norm": 0.431640625, + "learning_rate": 0.0019214832294361841, + "loss": 0.0962, + "step": 13726 + }, + { + "epoch": 0.1191569517625715, + "grad_norm": 0.58203125, + "learning_rate": 0.0019214710710782482, + "loss": 0.1113, + "step": 13727 + }, + { + "epoch": 0.11916563224277567, + "grad_norm": 0.0693359375, + "learning_rate": 0.0019214589118219634, + "loss": 0.0732, + "step": 13728 + }, + { + "epoch": 0.11917431272297983, + "grad_norm": 0.404296875, + "learning_rate": 0.0019214467516673439, + "loss": 0.1621, + "step": 13729 + }, + { + "epoch": 0.119182993203184, + "grad_norm": 0.67578125, + "learning_rate": 0.0019214345906144023, + "loss": 0.1074, + "step": 13730 + }, + { + "epoch": 0.11919167368338816, + "grad_norm": 0.625, + "learning_rate": 0.0019214224286631526, + "loss": 0.0864, + "step": 13731 + }, + { + "epoch": 0.11920035416359233, + "grad_norm": 0.328125, + "learning_rate": 0.0019214102658136074, + "loss": 0.1602, + "step": 13732 + }, + { + "epoch": 0.1192090346437965, + "grad_norm": 0.439453125, + "learning_rate": 0.0019213981020657802, + "loss": 0.1377, + "step": 13733 + }, + { + "epoch": 0.11921771512400066, + "grad_norm": 0.2177734375, + "learning_rate": 0.0019213859374196845, + "loss": 0.1719, + "step": 13734 + }, + { + "epoch": 0.11922639560420482, + "grad_norm": 0.154296875, + "learning_rate": 0.0019213737718753335, + "loss": 0.0811, + "step": 13735 + }, + { + "epoch": 0.11923507608440899, + "grad_norm": 0.3125, + "learning_rate": 0.0019213616054327403, + "loss": 0.1377, + "step": 13736 + }, + { + "epoch": 0.11924375656461315, + "grad_norm": 0.33984375, + "learning_rate": 0.0019213494380919188, + "loss": 0.1807, + "step": 13737 + }, + { + 
"epoch": 0.11925243704481732, + "grad_norm": 0.3203125, + "learning_rate": 0.0019213372698528815, + "loss": 0.0986, + "step": 13738 + }, + { + "epoch": 0.11926111752502148, + "grad_norm": 0.73828125, + "learning_rate": 0.0019213251007156422, + "loss": 0.1309, + "step": 13739 + }, + { + "epoch": 0.11926979800522565, + "grad_norm": 0.326171875, + "learning_rate": 0.0019213129306802142, + "loss": 0.0874, + "step": 13740 + }, + { + "epoch": 0.11927847848542981, + "grad_norm": 0.390625, + "learning_rate": 0.0019213007597466106, + "loss": 0.0996, + "step": 13741 + }, + { + "epoch": 0.11928715896563398, + "grad_norm": 0.087890625, + "learning_rate": 0.0019212885879148448, + "loss": 0.0942, + "step": 13742 + }, + { + "epoch": 0.11929583944583814, + "grad_norm": 0.2451171875, + "learning_rate": 0.0019212764151849302, + "loss": 0.1289, + "step": 13743 + }, + { + "epoch": 0.11930451992604231, + "grad_norm": 0.068359375, + "learning_rate": 0.0019212642415568804, + "loss": 0.0796, + "step": 13744 + }, + { + "epoch": 0.11931320040624648, + "grad_norm": 0.08740234375, + "learning_rate": 0.0019212520670307078, + "loss": 0.1016, + "step": 13745 + }, + { + "epoch": 0.11932188088645064, + "grad_norm": 0.1083984375, + "learning_rate": 0.0019212398916064264, + "loss": 0.1377, + "step": 13746 + }, + { + "epoch": 0.1193305613666548, + "grad_norm": 0.107421875, + "learning_rate": 0.0019212277152840496, + "loss": 0.1162, + "step": 13747 + }, + { + "epoch": 0.11933924184685897, + "grad_norm": 0.953125, + "learning_rate": 0.0019212155380635902, + "loss": 0.1138, + "step": 13748 + }, + { + "epoch": 0.11934792232706314, + "grad_norm": 0.1435546875, + "learning_rate": 0.0019212033599450625, + "loss": 0.1484, + "step": 13749 + }, + { + "epoch": 0.1193566028072673, + "grad_norm": 0.1650390625, + "learning_rate": 0.0019211911809284786, + "loss": 0.1562, + "step": 13750 + }, + { + "epoch": 0.11936528328747147, + "grad_norm": 0.0986328125, + "learning_rate": 0.0019211790010138523, + "loss": 0.104, + "step": 13751 + }, + { + "epoch": 0.11937396376767563, + "grad_norm": 0.306640625, + "learning_rate": 0.0019211668202011973, + "loss": 0.0796, + "step": 13752 + }, + { + "epoch": 0.1193826442478798, + "grad_norm": 0.1728515625, + "learning_rate": 0.0019211546384905263, + "loss": 0.1152, + "step": 13753 + }, + { + "epoch": 0.11939132472808396, + "grad_norm": 0.1572265625, + "learning_rate": 0.0019211424558818533, + "loss": 0.1133, + "step": 13754 + }, + { + "epoch": 0.11940000520828813, + "grad_norm": 0.2275390625, + "learning_rate": 0.0019211302723751908, + "loss": 0.1377, + "step": 13755 + }, + { + "epoch": 0.11940868568849229, + "grad_norm": 0.2099609375, + "learning_rate": 0.001921118087970553, + "loss": 0.1143, + "step": 13756 + }, + { + "epoch": 0.11941736616869646, + "grad_norm": 0.2119140625, + "learning_rate": 0.0019211059026679523, + "loss": 0.0752, + "step": 13757 + }, + { + "epoch": 0.11942604664890062, + "grad_norm": 0.12109375, + "learning_rate": 0.001921093716467403, + "loss": 0.0972, + "step": 13758 + }, + { + "epoch": 0.11943472712910479, + "grad_norm": 0.09375, + "learning_rate": 0.0019210815293689177, + "loss": 0.1006, + "step": 13759 + }, + { + "epoch": 0.11944340760930895, + "grad_norm": 0.294921875, + "learning_rate": 0.00192106934137251, + "loss": 0.106, + "step": 13760 + }, + { + "epoch": 0.11945208808951312, + "grad_norm": 0.1767578125, + "learning_rate": 0.0019210571524781932, + "loss": 0.127, + "step": 13761 + }, + { + "epoch": 0.11946076856971728, + "grad_norm": 0.1474609375, + "learning_rate": 
0.0019210449626859806, + "loss": 0.1523, + "step": 13762 + }, + { + "epoch": 0.11946944904992145, + "grad_norm": 0.515625, + "learning_rate": 0.0019210327719958859, + "loss": 0.0986, + "step": 13763 + }, + { + "epoch": 0.11947812953012561, + "grad_norm": 0.5390625, + "learning_rate": 0.0019210205804079217, + "loss": 0.0928, + "step": 13764 + }, + { + "epoch": 0.11948681001032978, + "grad_norm": 0.1220703125, + "learning_rate": 0.0019210083879221019, + "loss": 0.1187, + "step": 13765 + }, + { + "epoch": 0.11949549049053394, + "grad_norm": 0.12158203125, + "learning_rate": 0.0019209961945384395, + "loss": 0.1099, + "step": 13766 + }, + { + "epoch": 0.11950417097073811, + "grad_norm": 0.271484375, + "learning_rate": 0.0019209840002569483, + "loss": 0.1963, + "step": 13767 + }, + { + "epoch": 0.11951285145094227, + "grad_norm": 0.25390625, + "learning_rate": 0.001920971805077641, + "loss": 0.0986, + "step": 13768 + }, + { + "epoch": 0.11952153193114642, + "grad_norm": 0.08544921875, + "learning_rate": 0.0019209596090005315, + "loss": 0.0933, + "step": 13769 + }, + { + "epoch": 0.11953021241135059, + "grad_norm": 0.3125, + "learning_rate": 0.0019209474120256327, + "loss": 0.1445, + "step": 13770 + }, + { + "epoch": 0.11953889289155475, + "grad_norm": 0.41015625, + "learning_rate": 0.001920935214152958, + "loss": 0.125, + "step": 13771 + }, + { + "epoch": 0.11954757337175892, + "grad_norm": 0.4375, + "learning_rate": 0.0019209230153825216, + "loss": 0.0986, + "step": 13772 + }, + { + "epoch": 0.11955625385196308, + "grad_norm": 0.34765625, + "learning_rate": 0.0019209108157143356, + "loss": 0.0933, + "step": 13773 + }, + { + "epoch": 0.11956493433216725, + "grad_norm": 0.11279296875, + "learning_rate": 0.001920898615148414, + "loss": 0.1172, + "step": 13774 + }, + { + "epoch": 0.11957361481237141, + "grad_norm": 0.162109375, + "learning_rate": 0.0019208864136847697, + "loss": 0.1084, + "step": 13775 + }, + { + "epoch": 0.11958229529257558, + "grad_norm": 0.11328125, + "learning_rate": 0.0019208742113234168, + "loss": 0.1367, + "step": 13776 + }, + { + "epoch": 0.11959097577277975, + "grad_norm": 0.07275390625, + "learning_rate": 0.001920862008064368, + "loss": 0.1182, + "step": 13777 + }, + { + "epoch": 0.11959965625298391, + "grad_norm": 1.6015625, + "learning_rate": 0.0019208498039076367, + "loss": 0.1719, + "step": 13778 + }, + { + "epoch": 0.11960833673318808, + "grad_norm": 0.2421875, + "learning_rate": 0.0019208375988532366, + "loss": 0.1006, + "step": 13779 + }, + { + "epoch": 0.11961701721339224, + "grad_norm": 0.283203125, + "learning_rate": 0.001920825392901181, + "loss": 0.1113, + "step": 13780 + }, + { + "epoch": 0.1196256976935964, + "grad_norm": 0.64453125, + "learning_rate": 0.0019208131860514825, + "loss": 0.1006, + "step": 13781 + }, + { + "epoch": 0.11963437817380057, + "grad_norm": 0.2431640625, + "learning_rate": 0.0019208009783041556, + "loss": 0.1592, + "step": 13782 + }, + { + "epoch": 0.11964305865400474, + "grad_norm": 0.373046875, + "learning_rate": 0.0019207887696592128, + "loss": 0.1621, + "step": 13783 + }, + { + "epoch": 0.1196517391342089, + "grad_norm": 0.74609375, + "learning_rate": 0.0019207765601166678, + "loss": 0.126, + "step": 13784 + }, + { + "epoch": 0.11966041961441307, + "grad_norm": 0.2109375, + "learning_rate": 0.001920764349676534, + "loss": 0.1504, + "step": 13785 + }, + { + "epoch": 0.11966910009461723, + "grad_norm": 0.63671875, + "learning_rate": 0.0019207521383388245, + "loss": 0.1094, + "step": 13786 + }, + { + "epoch": 0.1196777805748214, + 
"grad_norm": 0.3046875, + "learning_rate": 0.0019207399261035524, + "loss": 0.1201, + "step": 13787 + }, + { + "epoch": 0.11968646105502556, + "grad_norm": 0.3984375, + "learning_rate": 0.0019207277129707323, + "loss": 0.1562, + "step": 13788 + }, + { + "epoch": 0.11969514153522973, + "grad_norm": 0.0986328125, + "learning_rate": 0.0019207154989403761, + "loss": 0.1133, + "step": 13789 + }, + { + "epoch": 0.11970382201543389, + "grad_norm": 0.1865234375, + "learning_rate": 0.0019207032840124981, + "loss": 0.1138, + "step": 13790 + }, + { + "epoch": 0.11971250249563806, + "grad_norm": 0.1640625, + "learning_rate": 0.0019206910681871113, + "loss": 0.1025, + "step": 13791 + }, + { + "epoch": 0.11972118297584222, + "grad_norm": 0.09375, + "learning_rate": 0.0019206788514642286, + "loss": 0.1016, + "step": 13792 + }, + { + "epoch": 0.11972986345604639, + "grad_norm": 0.10888671875, + "learning_rate": 0.0019206666338438642, + "loss": 0.1162, + "step": 13793 + }, + { + "epoch": 0.11973854393625055, + "grad_norm": 0.1533203125, + "learning_rate": 0.0019206544153260313, + "loss": 0.1367, + "step": 13794 + }, + { + "epoch": 0.11974722441645472, + "grad_norm": 0.1552734375, + "learning_rate": 0.0019206421959107425, + "loss": 0.1138, + "step": 13795 + }, + { + "epoch": 0.11975590489665888, + "grad_norm": 0.115234375, + "learning_rate": 0.0019206299755980122, + "loss": 0.1108, + "step": 13796 + }, + { + "epoch": 0.11976458537686305, + "grad_norm": 0.140625, + "learning_rate": 0.001920617754387853, + "loss": 0.125, + "step": 13797 + }, + { + "epoch": 0.11977326585706721, + "grad_norm": 0.234375, + "learning_rate": 0.0019206055322802788, + "loss": 0.0977, + "step": 13798 + }, + { + "epoch": 0.11978194633727138, + "grad_norm": 0.134765625, + "learning_rate": 0.0019205933092753028, + "loss": 0.1221, + "step": 13799 + }, + { + "epoch": 0.11979062681747554, + "grad_norm": 0.59375, + "learning_rate": 0.0019205810853729379, + "loss": 0.1006, + "step": 13800 + }, + { + "epoch": 0.11979930729767971, + "grad_norm": 0.26171875, + "learning_rate": 0.001920568860573198, + "loss": 0.1025, + "step": 13801 + }, + { + "epoch": 0.11980798777788387, + "grad_norm": 0.380859375, + "learning_rate": 0.0019205566348760965, + "loss": 0.1025, + "step": 13802 + }, + { + "epoch": 0.11981666825808804, + "grad_norm": 0.177734375, + "learning_rate": 0.0019205444082816463, + "loss": 0.1172, + "step": 13803 + }, + { + "epoch": 0.1198253487382922, + "grad_norm": 0.66015625, + "learning_rate": 0.0019205321807898613, + "loss": 0.1309, + "step": 13804 + }, + { + "epoch": 0.11983402921849637, + "grad_norm": 0.79296875, + "learning_rate": 0.001920519952400755, + "loss": 0.0977, + "step": 13805 + }, + { + "epoch": 0.11984270969870053, + "grad_norm": 0.380859375, + "learning_rate": 0.0019205077231143398, + "loss": 0.1523, + "step": 13806 + }, + { + "epoch": 0.1198513901789047, + "grad_norm": 0.291015625, + "learning_rate": 0.0019204954929306298, + "loss": 0.0977, + "step": 13807 + }, + { + "epoch": 0.11986007065910886, + "grad_norm": 0.3515625, + "learning_rate": 0.0019204832618496383, + "loss": 0.1143, + "step": 13808 + }, + { + "epoch": 0.11986875113931303, + "grad_norm": 0.50390625, + "learning_rate": 0.0019204710298713786, + "loss": 0.1133, + "step": 13809 + }, + { + "epoch": 0.1198774316195172, + "grad_norm": 0.2080078125, + "learning_rate": 0.0019204587969958642, + "loss": 0.125, + "step": 13810 + }, + { + "epoch": 0.11988611209972136, + "grad_norm": 0.2177734375, + "learning_rate": 0.0019204465632231083, + "loss": 0.127, + "step": 13811 
+ }, + { + "epoch": 0.11989479257992552, + "grad_norm": 0.10107421875, + "learning_rate": 0.0019204343285531245, + "loss": 0.127, + "step": 13812 + }, + { + "epoch": 0.11990347306012969, + "grad_norm": 0.1591796875, + "learning_rate": 0.0019204220929859258, + "loss": 0.1045, + "step": 13813 + }, + { + "epoch": 0.11991215354033385, + "grad_norm": 0.53515625, + "learning_rate": 0.001920409856521526, + "loss": 0.1177, + "step": 13814 + }, + { + "epoch": 0.11992083402053802, + "grad_norm": 0.1357421875, + "learning_rate": 0.001920397619159938, + "loss": 0.1206, + "step": 13815 + }, + { + "epoch": 0.11992951450074218, + "grad_norm": 0.4375, + "learning_rate": 0.0019203853809011762, + "loss": 0.1426, + "step": 13816 + }, + { + "epoch": 0.11993819498094635, + "grad_norm": 0.1650390625, + "learning_rate": 0.0019203731417452529, + "loss": 0.1338, + "step": 13817 + }, + { + "epoch": 0.11994687546115052, + "grad_norm": 0.14453125, + "learning_rate": 0.0019203609016921817, + "loss": 0.0933, + "step": 13818 + }, + { + "epoch": 0.11995555594135468, + "grad_norm": 0.107421875, + "learning_rate": 0.0019203486607419762, + "loss": 0.0723, + "step": 13819 + }, + { + "epoch": 0.11996423642155885, + "grad_norm": 0.51953125, + "learning_rate": 0.00192033641889465, + "loss": 0.0967, + "step": 13820 + }, + { + "epoch": 0.11997291690176301, + "grad_norm": 0.84765625, + "learning_rate": 0.0019203241761502159, + "loss": 0.127, + "step": 13821 + }, + { + "epoch": 0.11998159738196718, + "grad_norm": 0.1005859375, + "learning_rate": 0.0019203119325086875, + "loss": 0.0879, + "step": 13822 + }, + { + "epoch": 0.11999027786217134, + "grad_norm": 0.392578125, + "learning_rate": 0.0019202996879700788, + "loss": 0.1152, + "step": 13823 + }, + { + "epoch": 0.1199989583423755, + "grad_norm": 0.45703125, + "learning_rate": 0.001920287442534402, + "loss": 0.123, + "step": 13824 + }, + { + "epoch": 0.12000763882257967, + "grad_norm": 0.66796875, + "learning_rate": 0.0019202751962016718, + "loss": 0.1426, + "step": 13825 + }, + { + "epoch": 0.12001631930278384, + "grad_norm": 0.0947265625, + "learning_rate": 0.001920262948971901, + "loss": 0.1206, + "step": 13826 + }, + { + "epoch": 0.120024999782988, + "grad_norm": 0.375, + "learning_rate": 0.0019202507008451025, + "loss": 0.1074, + "step": 13827 + }, + { + "epoch": 0.12003368026319217, + "grad_norm": 0.50390625, + "learning_rate": 0.0019202384518212903, + "loss": 0.0913, + "step": 13828 + }, + { + "epoch": 0.12004236074339633, + "grad_norm": 0.337890625, + "learning_rate": 0.0019202262019004776, + "loss": 0.1152, + "step": 13829 + }, + { + "epoch": 0.1200510412236005, + "grad_norm": 0.77734375, + "learning_rate": 0.0019202139510826781, + "loss": 0.127, + "step": 13830 + }, + { + "epoch": 0.12005972170380465, + "grad_norm": 0.443359375, + "learning_rate": 0.0019202016993679048, + "loss": 0.123, + "step": 13831 + }, + { + "epoch": 0.12006840218400881, + "grad_norm": 0.71484375, + "learning_rate": 0.0019201894467561712, + "loss": 0.1094, + "step": 13832 + }, + { + "epoch": 0.12007708266421298, + "grad_norm": 0.14453125, + "learning_rate": 0.001920177193247491, + "loss": 0.1338, + "step": 13833 + }, + { + "epoch": 0.12008576314441714, + "grad_norm": 0.09765625, + "learning_rate": 0.0019201649388418769, + "loss": 0.1699, + "step": 13834 + }, + { + "epoch": 0.12009444362462131, + "grad_norm": 0.3359375, + "learning_rate": 0.0019201526835393432, + "loss": 0.1055, + "step": 13835 + }, + { + "epoch": 0.12010312410482547, + "grad_norm": 0.388671875, + "learning_rate": 
0.0019201404273399027, + "loss": 0.0947, + "step": 13836 + }, + { + "epoch": 0.12011180458502964, + "grad_norm": 0.255859375, + "learning_rate": 0.001920128170243569, + "loss": 0.1128, + "step": 13837 + }, + { + "epoch": 0.1201204850652338, + "grad_norm": 0.546875, + "learning_rate": 0.0019201159122503551, + "loss": 0.1084, + "step": 13838 + }, + { + "epoch": 0.12012916554543797, + "grad_norm": 0.232421875, + "learning_rate": 0.001920103653360275, + "loss": 0.1055, + "step": 13839 + }, + { + "epoch": 0.12013784602564213, + "grad_norm": 0.1708984375, + "learning_rate": 0.0019200913935733417, + "loss": 0.1113, + "step": 13840 + }, + { + "epoch": 0.1201465265058463, + "grad_norm": 0.404296875, + "learning_rate": 0.001920079132889569, + "loss": 0.1157, + "step": 13841 + }, + { + "epoch": 0.12015520698605046, + "grad_norm": 0.41015625, + "learning_rate": 0.00192006687130897, + "loss": 0.1357, + "step": 13842 + }, + { + "epoch": 0.12016388746625463, + "grad_norm": 0.494140625, + "learning_rate": 0.0019200546088315583, + "loss": 0.1318, + "step": 13843 + }, + { + "epoch": 0.1201725679464588, + "grad_norm": 0.62890625, + "learning_rate": 0.0019200423454573472, + "loss": 0.167, + "step": 13844 + }, + { + "epoch": 0.12018124842666296, + "grad_norm": 0.54296875, + "learning_rate": 0.0019200300811863503, + "loss": 0.123, + "step": 13845 + }, + { + "epoch": 0.12018992890686712, + "grad_norm": 0.1640625, + "learning_rate": 0.0019200178160185804, + "loss": 0.124, + "step": 13846 + }, + { + "epoch": 0.12019860938707129, + "grad_norm": 0.07568359375, + "learning_rate": 0.0019200055499540515, + "loss": 0.0708, + "step": 13847 + }, + { + "epoch": 0.12020728986727545, + "grad_norm": 0.3125, + "learning_rate": 0.0019199932829927766, + "loss": 0.1367, + "step": 13848 + }, + { + "epoch": 0.12021597034747962, + "grad_norm": 0.0966796875, + "learning_rate": 0.0019199810151347696, + "loss": 0.1055, + "step": 13849 + }, + { + "epoch": 0.12022465082768378, + "grad_norm": 0.212890625, + "learning_rate": 0.001919968746380044, + "loss": 0.0898, + "step": 13850 + }, + { + "epoch": 0.12023333130788795, + "grad_norm": 0.5078125, + "learning_rate": 0.0019199564767286123, + "loss": 0.1152, + "step": 13851 + }, + { + "epoch": 0.12024201178809212, + "grad_norm": 0.07958984375, + "learning_rate": 0.0019199442061804887, + "loss": 0.0947, + "step": 13852 + }, + { + "epoch": 0.12025069226829628, + "grad_norm": 0.33203125, + "learning_rate": 0.0019199319347356868, + "loss": 0.1143, + "step": 13853 + }, + { + "epoch": 0.12025937274850045, + "grad_norm": 0.3671875, + "learning_rate": 0.0019199196623942193, + "loss": 0.1309, + "step": 13854 + }, + { + "epoch": 0.12026805322870461, + "grad_norm": 0.94140625, + "learning_rate": 0.0019199073891561004, + "loss": 0.1182, + "step": 13855 + }, + { + "epoch": 0.12027673370890878, + "grad_norm": 1.015625, + "learning_rate": 0.0019198951150213423, + "loss": 0.1816, + "step": 13856 + }, + { + "epoch": 0.12028541418911294, + "grad_norm": 0.1064453125, + "learning_rate": 0.00191988283998996, + "loss": 0.0967, + "step": 13857 + }, + { + "epoch": 0.1202940946693171, + "grad_norm": 0.166015625, + "learning_rate": 0.0019198705640619657, + "loss": 0.0957, + "step": 13858 + }, + { + "epoch": 0.12030277514952127, + "grad_norm": 0.71875, + "learning_rate": 0.0019198582872373736, + "loss": 0.1289, + "step": 13859 + }, + { + "epoch": 0.12031145562972544, + "grad_norm": 0.71484375, + "learning_rate": 0.0019198460095161962, + "loss": 0.1309, + "step": 13860 + }, + { + "epoch": 0.1203201361099296, + 
"grad_norm": 0.90234375, + "learning_rate": 0.0019198337308984482, + "loss": 0.168, + "step": 13861 + }, + { + "epoch": 0.12032881659013377, + "grad_norm": 0.119140625, + "learning_rate": 0.001919821451384142, + "loss": 0.1162, + "step": 13862 + }, + { + "epoch": 0.12033749707033793, + "grad_norm": 1.1015625, + "learning_rate": 0.0019198091709732915, + "loss": 0.0962, + "step": 13863 + }, + { + "epoch": 0.1203461775505421, + "grad_norm": 0.390625, + "learning_rate": 0.0019197968896659098, + "loss": 0.1553, + "step": 13864 + }, + { + "epoch": 0.12035485803074626, + "grad_norm": 0.9140625, + "learning_rate": 0.001919784607462011, + "loss": 0.1592, + "step": 13865 + }, + { + "epoch": 0.12036353851095043, + "grad_norm": 0.326171875, + "learning_rate": 0.0019197723243616076, + "loss": 0.1162, + "step": 13866 + }, + { + "epoch": 0.12037221899115459, + "grad_norm": 0.1142578125, + "learning_rate": 0.0019197600403647137, + "loss": 0.1758, + "step": 13867 + }, + { + "epoch": 0.12038089947135876, + "grad_norm": 0.07763671875, + "learning_rate": 0.0019197477554713425, + "loss": 0.105, + "step": 13868 + }, + { + "epoch": 0.12038957995156292, + "grad_norm": 0.43359375, + "learning_rate": 0.0019197354696815075, + "loss": 0.1396, + "step": 13869 + }, + { + "epoch": 0.12039826043176709, + "grad_norm": 0.1240234375, + "learning_rate": 0.001919723182995222, + "loss": 0.1299, + "step": 13870 + }, + { + "epoch": 0.12040694091197125, + "grad_norm": 0.291015625, + "learning_rate": 0.0019197108954124998, + "loss": 0.1025, + "step": 13871 + }, + { + "epoch": 0.12041562139217542, + "grad_norm": 0.40625, + "learning_rate": 0.001919698606933354, + "loss": 0.1914, + "step": 13872 + }, + { + "epoch": 0.12042430187237958, + "grad_norm": 0.6796875, + "learning_rate": 0.0019196863175577979, + "loss": 0.1035, + "step": 13873 + }, + { + "epoch": 0.12043298235258375, + "grad_norm": 1.3203125, + "learning_rate": 0.0019196740272858454, + "loss": 0.6211, + "step": 13874 + }, + { + "epoch": 0.12044166283278791, + "grad_norm": 0.27734375, + "learning_rate": 0.0019196617361175094, + "loss": 0.1445, + "step": 13875 + }, + { + "epoch": 0.12045034331299208, + "grad_norm": 0.50390625, + "learning_rate": 0.001919649444052804, + "loss": 0.1445, + "step": 13876 + }, + { + "epoch": 0.12045902379319624, + "grad_norm": 0.0859375, + "learning_rate": 0.001919637151091742, + "loss": 0.0913, + "step": 13877 + }, + { + "epoch": 0.12046770427340041, + "grad_norm": 0.2431640625, + "learning_rate": 0.0019196248572343373, + "loss": 0.1279, + "step": 13878 + }, + { + "epoch": 0.12047638475360457, + "grad_norm": 0.2392578125, + "learning_rate": 0.0019196125624806031, + "loss": 0.1074, + "step": 13879 + }, + { + "epoch": 0.12048506523380874, + "grad_norm": 0.1103515625, + "learning_rate": 0.001919600266830553, + "loss": 0.1533, + "step": 13880 + }, + { + "epoch": 0.1204937457140129, + "grad_norm": 0.353515625, + "learning_rate": 0.0019195879702842004, + "loss": 0.0908, + "step": 13881 + }, + { + "epoch": 0.12050242619421707, + "grad_norm": 0.71875, + "learning_rate": 0.0019195756728415583, + "loss": 0.1069, + "step": 13882 + }, + { + "epoch": 0.12051110667442123, + "grad_norm": 0.140625, + "learning_rate": 0.001919563374502641, + "loss": 0.0947, + "step": 13883 + }, + { + "epoch": 0.1205197871546254, + "grad_norm": 0.0986328125, + "learning_rate": 0.0019195510752674611, + "loss": 0.1182, + "step": 13884 + }, + { + "epoch": 0.12052846763482956, + "grad_norm": 0.12353515625, + "learning_rate": 0.0019195387751360332, + "loss": 0.1602, + "step": 13885 + 
}, + { + "epoch": 0.12053714811503373, + "grad_norm": 1.0546875, + "learning_rate": 0.0019195264741083693, + "loss": 0.1289, + "step": 13886 + }, + { + "epoch": 0.1205458285952379, + "grad_norm": 0.11279296875, + "learning_rate": 0.0019195141721844837, + "loss": 0.1445, + "step": 13887 + }, + { + "epoch": 0.12055450907544206, + "grad_norm": 0.453125, + "learning_rate": 0.00191950186936439, + "loss": 0.1045, + "step": 13888 + }, + { + "epoch": 0.12056318955564622, + "grad_norm": 0.259765625, + "learning_rate": 0.0019194895656481008, + "loss": 0.1328, + "step": 13889 + }, + { + "epoch": 0.12057187003585039, + "grad_norm": 0.46484375, + "learning_rate": 0.0019194772610356308, + "loss": 0.1621, + "step": 13890 + }, + { + "epoch": 0.12058055051605455, + "grad_norm": 0.267578125, + "learning_rate": 0.0019194649555269922, + "loss": 0.0986, + "step": 13891 + }, + { + "epoch": 0.1205892309962587, + "grad_norm": 0.365234375, + "learning_rate": 0.0019194526491221994, + "loss": 0.1211, + "step": 13892 + }, + { + "epoch": 0.12059791147646287, + "grad_norm": 0.08251953125, + "learning_rate": 0.0019194403418212656, + "loss": 0.1084, + "step": 13893 + }, + { + "epoch": 0.12060659195666704, + "grad_norm": 0.50390625, + "learning_rate": 0.001919428033624204, + "loss": 0.127, + "step": 13894 + }, + { + "epoch": 0.1206152724368712, + "grad_norm": 0.65625, + "learning_rate": 0.001919415724531028, + "loss": 0.1211, + "step": 13895 + }, + { + "epoch": 0.12062395291707537, + "grad_norm": 0.75, + "learning_rate": 0.001919403414541752, + "loss": 0.0874, + "step": 13896 + }, + { + "epoch": 0.12063263339727953, + "grad_norm": 0.71875, + "learning_rate": 0.0019193911036563878, + "loss": 0.1211, + "step": 13897 + }, + { + "epoch": 0.1206413138774837, + "grad_norm": 0.1796875, + "learning_rate": 0.0019193787918749504, + "loss": 0.1016, + "step": 13898 + }, + { + "epoch": 0.12064999435768786, + "grad_norm": 0.08837890625, + "learning_rate": 0.0019193664791974524, + "loss": 0.0957, + "step": 13899 + }, + { + "epoch": 0.12065867483789203, + "grad_norm": 1.015625, + "learning_rate": 0.0019193541656239075, + "loss": 0.1348, + "step": 13900 + }, + { + "epoch": 0.12066735531809619, + "grad_norm": 0.16015625, + "learning_rate": 0.0019193418511543293, + "loss": 0.0835, + "step": 13901 + }, + { + "epoch": 0.12067603579830036, + "grad_norm": 0.251953125, + "learning_rate": 0.001919329535788731, + "loss": 0.1201, + "step": 13902 + }, + { + "epoch": 0.12068471627850452, + "grad_norm": 0.404296875, + "learning_rate": 0.0019193172195271265, + "loss": 0.3574, + "step": 13903 + }, + { + "epoch": 0.12069339675870869, + "grad_norm": 0.314453125, + "learning_rate": 0.0019193049023695289, + "loss": 0.0928, + "step": 13904 + }, + { + "epoch": 0.12070207723891285, + "grad_norm": 1.0625, + "learning_rate": 0.0019192925843159517, + "loss": 0.1543, + "step": 13905 + }, + { + "epoch": 0.12071075771911702, + "grad_norm": 0.61328125, + "learning_rate": 0.0019192802653664085, + "loss": 0.0928, + "step": 13906 + }, + { + "epoch": 0.12071943819932118, + "grad_norm": 0.291015625, + "learning_rate": 0.0019192679455209128, + "loss": 0.1738, + "step": 13907 + }, + { + "epoch": 0.12072811867952535, + "grad_norm": 0.09033203125, + "learning_rate": 0.0019192556247794776, + "loss": 0.1074, + "step": 13908 + }, + { + "epoch": 0.12073679915972951, + "grad_norm": 0.08154296875, + "learning_rate": 0.0019192433031421173, + "loss": 0.1011, + "step": 13909 + }, + { + "epoch": 0.12074547963993368, + "grad_norm": 0.64453125, + "learning_rate": 0.0019192309806088444, + 
"loss": 0.124, + "step": 13910 + }, + { + "epoch": 0.12075416012013784, + "grad_norm": 0.4140625, + "learning_rate": 0.001919218657179673, + "loss": 0.0903, + "step": 13911 + }, + { + "epoch": 0.12076284060034201, + "grad_norm": 0.625, + "learning_rate": 0.0019192063328546164, + "loss": 0.1387, + "step": 13912 + }, + { + "epoch": 0.12077152108054617, + "grad_norm": 0.1982421875, + "learning_rate": 0.001919194007633688, + "loss": 0.126, + "step": 13913 + }, + { + "epoch": 0.12078020156075034, + "grad_norm": 0.5078125, + "learning_rate": 0.0019191816815169013, + "loss": 0.1797, + "step": 13914 + }, + { + "epoch": 0.1207888820409545, + "grad_norm": 0.26171875, + "learning_rate": 0.0019191693545042702, + "loss": 0.1147, + "step": 13915 + }, + { + "epoch": 0.12079756252115867, + "grad_norm": 0.291015625, + "learning_rate": 0.001919157026595807, + "loss": 0.0918, + "step": 13916 + }, + { + "epoch": 0.12080624300136283, + "grad_norm": 0.10693359375, + "learning_rate": 0.0019191446977915268, + "loss": 0.0938, + "step": 13917 + }, + { + "epoch": 0.120814923481567, + "grad_norm": 0.359375, + "learning_rate": 0.001919132368091442, + "loss": 0.1177, + "step": 13918 + }, + { + "epoch": 0.12082360396177116, + "grad_norm": 1.2265625, + "learning_rate": 0.0019191200374955662, + "loss": 0.1221, + "step": 13919 + }, + { + "epoch": 0.12083228444197533, + "grad_norm": 0.2734375, + "learning_rate": 0.001919107706003913, + "loss": 0.1396, + "step": 13920 + }, + { + "epoch": 0.1208409649221795, + "grad_norm": 0.6796875, + "learning_rate": 0.0019190953736164963, + "loss": 0.1514, + "step": 13921 + }, + { + "epoch": 0.12084964540238366, + "grad_norm": 0.55859375, + "learning_rate": 0.0019190830403333287, + "loss": 0.1094, + "step": 13922 + }, + { + "epoch": 0.12085832588258782, + "grad_norm": 0.79296875, + "learning_rate": 0.0019190707061544246, + "loss": 0.1348, + "step": 13923 + }, + { + "epoch": 0.12086700636279199, + "grad_norm": 0.55078125, + "learning_rate": 0.0019190583710797968, + "loss": 0.085, + "step": 13924 + }, + { + "epoch": 0.12087568684299616, + "grad_norm": 0.146484375, + "learning_rate": 0.0019190460351094594, + "loss": 0.0913, + "step": 13925 + }, + { + "epoch": 0.12088436732320032, + "grad_norm": 0.11083984375, + "learning_rate": 0.0019190336982434254, + "loss": 0.1328, + "step": 13926 + }, + { + "epoch": 0.12089304780340449, + "grad_norm": 0.271484375, + "learning_rate": 0.0019190213604817084, + "loss": 0.1396, + "step": 13927 + }, + { + "epoch": 0.12090172828360865, + "grad_norm": 0.166015625, + "learning_rate": 0.0019190090218243216, + "loss": 0.1133, + "step": 13928 + }, + { + "epoch": 0.12091040876381282, + "grad_norm": 0.388671875, + "learning_rate": 0.0019189966822712795, + "loss": 0.1055, + "step": 13929 + }, + { + "epoch": 0.12091908924401698, + "grad_norm": 0.1650390625, + "learning_rate": 0.0019189843418225946, + "loss": 0.1738, + "step": 13930 + }, + { + "epoch": 0.12092776972422115, + "grad_norm": 0.20703125, + "learning_rate": 0.0019189720004782807, + "loss": 0.1064, + "step": 13931 + }, + { + "epoch": 0.12093645020442531, + "grad_norm": 0.22265625, + "learning_rate": 0.0019189596582383512, + "loss": 0.1084, + "step": 13932 + }, + { + "epoch": 0.12094513068462948, + "grad_norm": 0.734375, + "learning_rate": 0.00191894731510282, + "loss": 0.1152, + "step": 13933 + }, + { + "epoch": 0.12095381116483364, + "grad_norm": 1.0625, + "learning_rate": 0.0019189349710717004, + "loss": 0.1182, + "step": 13934 + }, + { + "epoch": 0.1209624916450378, + "grad_norm": 0.1435546875, + 
"learning_rate": 0.0019189226261450054, + "loss": 0.084, + "step": 13935 + }, + { + "epoch": 0.12097117212524197, + "grad_norm": 0.64453125, + "learning_rate": 0.0019189102803227494, + "loss": 0.1006, + "step": 13936 + }, + { + "epoch": 0.12097985260544614, + "grad_norm": 0.15625, + "learning_rate": 0.0019188979336049452, + "loss": 0.1191, + "step": 13937 + }, + { + "epoch": 0.1209885330856503, + "grad_norm": 0.62109375, + "learning_rate": 0.0019188855859916063, + "loss": 0.0752, + "step": 13938 + }, + { + "epoch": 0.12099721356585447, + "grad_norm": 0.34375, + "learning_rate": 0.001918873237482747, + "loss": 0.1514, + "step": 13939 + }, + { + "epoch": 0.12100589404605863, + "grad_norm": 0.10205078125, + "learning_rate": 0.0019188608880783798, + "loss": 0.0972, + "step": 13940 + }, + { + "epoch": 0.1210145745262628, + "grad_norm": 0.40625, + "learning_rate": 0.0019188485377785185, + "loss": 0.0801, + "step": 13941 + }, + { + "epoch": 0.12102325500646696, + "grad_norm": 0.2431640625, + "learning_rate": 0.0019188361865831772, + "loss": 0.1182, + "step": 13942 + }, + { + "epoch": 0.12103193548667113, + "grad_norm": 0.474609375, + "learning_rate": 0.0019188238344923685, + "loss": 0.1309, + "step": 13943 + }, + { + "epoch": 0.12104061596687529, + "grad_norm": 1.3984375, + "learning_rate": 0.0019188114815061066, + "loss": 0.1855, + "step": 13944 + }, + { + "epoch": 0.12104929644707946, + "grad_norm": 0.546875, + "learning_rate": 0.0019187991276244049, + "loss": 0.1523, + "step": 13945 + }, + { + "epoch": 0.12105797692728362, + "grad_norm": 0.234375, + "learning_rate": 0.0019187867728472766, + "loss": 0.1348, + "step": 13946 + }, + { + "epoch": 0.12106665740748779, + "grad_norm": 0.203125, + "learning_rate": 0.0019187744171747351, + "loss": 0.1133, + "step": 13947 + }, + { + "epoch": 0.12107533788769195, + "grad_norm": 0.193359375, + "learning_rate": 0.0019187620606067946, + "loss": 0.0737, + "step": 13948 + }, + { + "epoch": 0.12108401836789612, + "grad_norm": 0.232421875, + "learning_rate": 0.001918749703143468, + "loss": 0.1133, + "step": 13949 + }, + { + "epoch": 0.12109269884810028, + "grad_norm": 1.4765625, + "learning_rate": 0.001918737344784769, + "loss": 0.2305, + "step": 13950 + }, + { + "epoch": 0.12110137932830445, + "grad_norm": 0.1240234375, + "learning_rate": 0.0019187249855307114, + "loss": 0.1172, + "step": 13951 + }, + { + "epoch": 0.12111005980850861, + "grad_norm": 0.2333984375, + "learning_rate": 0.0019187126253813083, + "loss": 0.1445, + "step": 13952 + }, + { + "epoch": 0.12111874028871278, + "grad_norm": 0.435546875, + "learning_rate": 0.0019187002643365736, + "loss": 0.1309, + "step": 13953 + }, + { + "epoch": 0.12112742076891693, + "grad_norm": 0.076171875, + "learning_rate": 0.0019186879023965204, + "loss": 0.1367, + "step": 13954 + }, + { + "epoch": 0.1211361012491211, + "grad_norm": 0.228515625, + "learning_rate": 0.0019186755395611624, + "loss": 0.0942, + "step": 13955 + }, + { + "epoch": 0.12114478172932526, + "grad_norm": 0.421875, + "learning_rate": 0.001918663175830513, + "loss": 0.0938, + "step": 13956 + }, + { + "epoch": 0.12115346220952943, + "grad_norm": 0.19921875, + "learning_rate": 0.0019186508112045862, + "loss": 0.1309, + "step": 13957 + }, + { + "epoch": 0.12116214268973359, + "grad_norm": 0.9765625, + "learning_rate": 0.0019186384456833948, + "loss": 0.1152, + "step": 13958 + }, + { + "epoch": 0.12117082316993776, + "grad_norm": 0.1259765625, + "learning_rate": 0.001918626079266953, + "loss": 0.0801, + "step": 13959 + }, + { + "epoch": 
0.12117950365014192, + "grad_norm": 0.051513671875, + "learning_rate": 0.0019186137119552738, + "loss": 0.1006, + "step": 13960 + }, + { + "epoch": 0.12118818413034609, + "grad_norm": 0.12890625, + "learning_rate": 0.001918601343748371, + "loss": 0.1338, + "step": 13961 + }, + { + "epoch": 0.12119686461055025, + "grad_norm": 0.28515625, + "learning_rate": 0.001918588974646258, + "loss": 0.1357, + "step": 13962 + }, + { + "epoch": 0.12120554509075442, + "grad_norm": 0.68359375, + "learning_rate": 0.0019185766046489485, + "loss": 0.0791, + "step": 13963 + }, + { + "epoch": 0.12121422557095858, + "grad_norm": 0.2109375, + "learning_rate": 0.001918564233756456, + "loss": 0.1064, + "step": 13964 + }, + { + "epoch": 0.12122290605116275, + "grad_norm": 0.359375, + "learning_rate": 0.0019185518619687938, + "loss": 0.1309, + "step": 13965 + }, + { + "epoch": 0.12123158653136691, + "grad_norm": 0.265625, + "learning_rate": 0.0019185394892859758, + "loss": 0.123, + "step": 13966 + }, + { + "epoch": 0.12124026701157108, + "grad_norm": 0.81640625, + "learning_rate": 0.0019185271157080152, + "loss": 0.0991, + "step": 13967 + }, + { + "epoch": 0.12124894749177524, + "grad_norm": 0.373046875, + "learning_rate": 0.0019185147412349253, + "loss": 0.1504, + "step": 13968 + }, + { + "epoch": 0.1212576279719794, + "grad_norm": 0.10009765625, + "learning_rate": 0.0019185023658667203, + "loss": 0.0938, + "step": 13969 + }, + { + "epoch": 0.12126630845218357, + "grad_norm": 0.1591796875, + "learning_rate": 0.0019184899896034133, + "loss": 0.1279, + "step": 13970 + }, + { + "epoch": 0.12127498893238774, + "grad_norm": 0.109375, + "learning_rate": 0.0019184776124450183, + "loss": 0.1367, + "step": 13971 + }, + { + "epoch": 0.1212836694125919, + "grad_norm": 0.77734375, + "learning_rate": 0.0019184652343915481, + "loss": 0.1143, + "step": 13972 + }, + { + "epoch": 0.12129234989279607, + "grad_norm": 0.06591796875, + "learning_rate": 0.0019184528554430165, + "loss": 0.1055, + "step": 13973 + }, + { + "epoch": 0.12130103037300023, + "grad_norm": 0.09130859375, + "learning_rate": 0.0019184404755994376, + "loss": 0.1348, + "step": 13974 + }, + { + "epoch": 0.1213097108532044, + "grad_norm": 0.400390625, + "learning_rate": 0.0019184280948608243, + "loss": 0.1162, + "step": 13975 + }, + { + "epoch": 0.12131839133340856, + "grad_norm": 0.267578125, + "learning_rate": 0.0019184157132271903, + "loss": 0.1104, + "step": 13976 + }, + { + "epoch": 0.12132707181361273, + "grad_norm": 0.2001953125, + "learning_rate": 0.001918403330698549, + "loss": 0.1436, + "step": 13977 + }, + { + "epoch": 0.12133575229381689, + "grad_norm": 0.91015625, + "learning_rate": 0.0019183909472749144, + "loss": 0.0806, + "step": 13978 + }, + { + "epoch": 0.12134443277402106, + "grad_norm": 0.3046875, + "learning_rate": 0.0019183785629562997, + "loss": 0.1123, + "step": 13979 + }, + { + "epoch": 0.12135311325422522, + "grad_norm": 0.1875, + "learning_rate": 0.0019183661777427187, + "loss": 0.124, + "step": 13980 + }, + { + "epoch": 0.12136179373442939, + "grad_norm": 0.2119140625, + "learning_rate": 0.0019183537916341842, + "loss": 0.0913, + "step": 13981 + }, + { + "epoch": 0.12137047421463355, + "grad_norm": 0.123046875, + "learning_rate": 0.0019183414046307107, + "loss": 0.1133, + "step": 13982 + }, + { + "epoch": 0.12137915469483772, + "grad_norm": 0.205078125, + "learning_rate": 0.0019183290167323114, + "loss": 0.1201, + "step": 13983 + }, + { + "epoch": 0.12138783517504188, + "grad_norm": 0.177734375, + "learning_rate": 0.0019183166279389992, + 
"loss": 0.1177, + "step": 13984 + }, + { + "epoch": 0.12139651565524605, + "grad_norm": 0.10888671875, + "learning_rate": 0.0019183042382507887, + "loss": 0.1455, + "step": 13985 + }, + { + "epoch": 0.12140519613545021, + "grad_norm": 0.08349609375, + "learning_rate": 0.0019182918476676931, + "loss": 0.1025, + "step": 13986 + }, + { + "epoch": 0.12141387661565438, + "grad_norm": 0.1884765625, + "learning_rate": 0.0019182794561897257, + "loss": 0.0957, + "step": 13987 + }, + { + "epoch": 0.12142255709585854, + "grad_norm": 0.28125, + "learning_rate": 0.0019182670638168999, + "loss": 0.1602, + "step": 13988 + }, + { + "epoch": 0.12143123757606271, + "grad_norm": 0.322265625, + "learning_rate": 0.00191825467054923, + "loss": 0.1045, + "step": 13989 + }, + { + "epoch": 0.12143991805626687, + "grad_norm": 0.287109375, + "learning_rate": 0.0019182422763867287, + "loss": 0.1143, + "step": 13990 + }, + { + "epoch": 0.12144859853647104, + "grad_norm": 0.515625, + "learning_rate": 0.00191822988132941, + "loss": 0.1104, + "step": 13991 + }, + { + "epoch": 0.1214572790166752, + "grad_norm": 0.2265625, + "learning_rate": 0.0019182174853772877, + "loss": 0.1025, + "step": 13992 + }, + { + "epoch": 0.12146595949687937, + "grad_norm": 0.15625, + "learning_rate": 0.0019182050885303747, + "loss": 0.1162, + "step": 13993 + }, + { + "epoch": 0.12147463997708353, + "grad_norm": 0.314453125, + "learning_rate": 0.0019181926907886851, + "loss": 0.1211, + "step": 13994 + }, + { + "epoch": 0.1214833204572877, + "grad_norm": 0.7265625, + "learning_rate": 0.0019181802921522323, + "loss": 0.1309, + "step": 13995 + }, + { + "epoch": 0.12149200093749186, + "grad_norm": 0.126953125, + "learning_rate": 0.0019181678926210295, + "loss": 0.1016, + "step": 13996 + }, + { + "epoch": 0.12150068141769603, + "grad_norm": 0.984375, + "learning_rate": 0.001918155492195091, + "loss": 0.1348, + "step": 13997 + }, + { + "epoch": 0.1215093618979002, + "grad_norm": 0.47265625, + "learning_rate": 0.0019181430908744297, + "loss": 0.1328, + "step": 13998 + }, + { + "epoch": 0.12151804237810436, + "grad_norm": 0.193359375, + "learning_rate": 0.0019181306886590596, + "loss": 0.1113, + "step": 13999 + }, + { + "epoch": 0.12152672285830853, + "grad_norm": 0.328125, + "learning_rate": 0.0019181182855489939, + "loss": 0.1152, + "step": 14000 + }, + { + "epoch": 0.12153540333851269, + "grad_norm": 0.302734375, + "learning_rate": 0.0019181058815442463, + "loss": 0.1143, + "step": 14001 + }, + { + "epoch": 0.12154408381871686, + "grad_norm": 0.11865234375, + "learning_rate": 0.0019180934766448306, + "loss": 0.1162, + "step": 14002 + }, + { + "epoch": 0.12155276429892102, + "grad_norm": 0.150390625, + "learning_rate": 0.0019180810708507598, + "loss": 0.1123, + "step": 14003 + }, + { + "epoch": 0.12156144477912519, + "grad_norm": 0.2490234375, + "learning_rate": 0.0019180686641620483, + "loss": 0.0972, + "step": 14004 + }, + { + "epoch": 0.12157012525932935, + "grad_norm": 0.10986328125, + "learning_rate": 0.0019180562565787088, + "loss": 0.1406, + "step": 14005 + }, + { + "epoch": 0.12157880573953352, + "grad_norm": 0.3203125, + "learning_rate": 0.0019180438481007556, + "loss": 0.1182, + "step": 14006 + }, + { + "epoch": 0.12158748621973768, + "grad_norm": 0.9453125, + "learning_rate": 0.001918031438728202, + "loss": 0.1309, + "step": 14007 + }, + { + "epoch": 0.12159616669994185, + "grad_norm": 0.1630859375, + "learning_rate": 0.001918019028461061, + "loss": 0.1245, + "step": 14008 + }, + { + "epoch": 0.12160484718014601, + "grad_norm": 0.29296875, 
+ "learning_rate": 0.001918006617299347, + "loss": 0.0996, + "step": 14009 + }, + { + "epoch": 0.12161352766035018, + "grad_norm": 0.08447265625, + "learning_rate": 0.0019179942052430733, + "loss": 0.125, + "step": 14010 + }, + { + "epoch": 0.12162220814055434, + "grad_norm": 0.5234375, + "learning_rate": 0.0019179817922922533, + "loss": 0.1187, + "step": 14011 + }, + { + "epoch": 0.1216308886207585, + "grad_norm": 0.61328125, + "learning_rate": 0.0019179693784469008, + "loss": 0.124, + "step": 14012 + }, + { + "epoch": 0.12163956910096267, + "grad_norm": 0.154296875, + "learning_rate": 0.001917956963707029, + "loss": 0.1816, + "step": 14013 + }, + { + "epoch": 0.12164824958116684, + "grad_norm": 0.3515625, + "learning_rate": 0.0019179445480726521, + "loss": 0.1016, + "step": 14014 + }, + { + "epoch": 0.121656930061371, + "grad_norm": 0.1064453125, + "learning_rate": 0.0019179321315437835, + "loss": 0.1221, + "step": 14015 + }, + { + "epoch": 0.12166561054157515, + "grad_norm": 0.201171875, + "learning_rate": 0.0019179197141204363, + "loss": 0.1357, + "step": 14016 + }, + { + "epoch": 0.12167429102177932, + "grad_norm": 0.373046875, + "learning_rate": 0.0019179072958026243, + "loss": 0.1387, + "step": 14017 + }, + { + "epoch": 0.12168297150198348, + "grad_norm": 0.76953125, + "learning_rate": 0.0019178948765903611, + "loss": 0.1104, + "step": 14018 + }, + { + "epoch": 0.12169165198218765, + "grad_norm": 0.23828125, + "learning_rate": 0.0019178824564836609, + "loss": 0.1074, + "step": 14019 + }, + { + "epoch": 0.12170033246239181, + "grad_norm": 0.26953125, + "learning_rate": 0.001917870035482536, + "loss": 0.123, + "step": 14020 + }, + { + "epoch": 0.12170901294259598, + "grad_norm": 0.90625, + "learning_rate": 0.0019178576135870011, + "loss": 0.123, + "step": 14021 + }, + { + "epoch": 0.12171769342280014, + "grad_norm": 0.32421875, + "learning_rate": 0.0019178451907970696, + "loss": 0.1182, + "step": 14022 + }, + { + "epoch": 0.12172637390300431, + "grad_norm": 0.412109375, + "learning_rate": 0.0019178327671127546, + "loss": 0.1602, + "step": 14023 + }, + { + "epoch": 0.12173505438320847, + "grad_norm": 0.443359375, + "learning_rate": 0.00191782034253407, + "loss": 0.1143, + "step": 14024 + }, + { + "epoch": 0.12174373486341264, + "grad_norm": 0.44140625, + "learning_rate": 0.0019178079170610294, + "loss": 0.1465, + "step": 14025 + }, + { + "epoch": 0.1217524153436168, + "grad_norm": 0.1123046875, + "learning_rate": 0.0019177954906936467, + "loss": 0.1143, + "step": 14026 + }, + { + "epoch": 0.12176109582382097, + "grad_norm": 0.86328125, + "learning_rate": 0.0019177830634319346, + "loss": 0.1104, + "step": 14027 + }, + { + "epoch": 0.12176977630402513, + "grad_norm": 0.25390625, + "learning_rate": 0.0019177706352759077, + "loss": 0.1094, + "step": 14028 + }, + { + "epoch": 0.1217784567842293, + "grad_norm": 0.2392578125, + "learning_rate": 0.0019177582062255788, + "loss": 0.0991, + "step": 14029 + }, + { + "epoch": 0.12178713726443346, + "grad_norm": 0.1611328125, + "learning_rate": 0.0019177457762809618, + "loss": 0.0952, + "step": 14030 + }, + { + "epoch": 0.12179581774463763, + "grad_norm": 0.14453125, + "learning_rate": 0.0019177333454420704, + "loss": 0.165, + "step": 14031 + }, + { + "epoch": 0.1218044982248418, + "grad_norm": 0.15625, + "learning_rate": 0.001917720913708918, + "loss": 0.1123, + "step": 14032 + }, + { + "epoch": 0.12181317870504596, + "grad_norm": 0.2578125, + "learning_rate": 0.0019177084810815183, + "loss": 0.0947, + "step": 14033 + }, + { + "epoch": 
0.12182185918525013, + "grad_norm": 0.1875, + "learning_rate": 0.001917696047559885, + "loss": 0.1045, + "step": 14034 + }, + { + "epoch": 0.12183053966545429, + "grad_norm": 0.55859375, + "learning_rate": 0.0019176836131440313, + "loss": 0.0752, + "step": 14035 + }, + { + "epoch": 0.12183922014565846, + "grad_norm": 0.19921875, + "learning_rate": 0.0019176711778339714, + "loss": 0.0908, + "step": 14036 + }, + { + "epoch": 0.12184790062586262, + "grad_norm": 0.6171875, + "learning_rate": 0.0019176587416297186, + "loss": 0.0967, + "step": 14037 + }, + { + "epoch": 0.12185658110606679, + "grad_norm": 0.2333984375, + "learning_rate": 0.0019176463045312863, + "loss": 0.126, + "step": 14038 + }, + { + "epoch": 0.12186526158627095, + "grad_norm": 0.404296875, + "learning_rate": 0.0019176338665386886, + "loss": 0.0742, + "step": 14039 + }, + { + "epoch": 0.12187394206647512, + "grad_norm": 0.2333984375, + "learning_rate": 0.0019176214276519386, + "loss": 0.1172, + "step": 14040 + }, + { + "epoch": 0.12188262254667928, + "grad_norm": 0.1484375, + "learning_rate": 0.00191760898787105, + "loss": 0.1348, + "step": 14041 + }, + { + "epoch": 0.12189130302688345, + "grad_norm": 0.57421875, + "learning_rate": 0.0019175965471960367, + "loss": 0.1133, + "step": 14042 + }, + { + "epoch": 0.12189998350708761, + "grad_norm": 0.2314453125, + "learning_rate": 0.001917584105626912, + "loss": 0.123, + "step": 14043 + }, + { + "epoch": 0.12190866398729178, + "grad_norm": 0.400390625, + "learning_rate": 0.0019175716631636894, + "loss": 0.1357, + "step": 14044 + }, + { + "epoch": 0.12191734446749594, + "grad_norm": 0.1474609375, + "learning_rate": 0.0019175592198063828, + "loss": 0.1113, + "step": 14045 + }, + { + "epoch": 0.1219260249477001, + "grad_norm": 0.29296875, + "learning_rate": 0.001917546775555006, + "loss": 0.1191, + "step": 14046 + }, + { + "epoch": 0.12193470542790427, + "grad_norm": 0.515625, + "learning_rate": 0.0019175343304095724, + "loss": 0.1094, + "step": 14047 + }, + { + "epoch": 0.12194338590810844, + "grad_norm": 0.70703125, + "learning_rate": 0.001917521884370095, + "loss": 0.0991, + "step": 14048 + }, + { + "epoch": 0.1219520663883126, + "grad_norm": 0.470703125, + "learning_rate": 0.0019175094374365882, + "loss": 0.1094, + "step": 14049 + }, + { + "epoch": 0.12196074686851677, + "grad_norm": 0.134765625, + "learning_rate": 0.0019174969896090656, + "loss": 0.1426, + "step": 14050 + }, + { + "epoch": 0.12196942734872093, + "grad_norm": 0.29296875, + "learning_rate": 0.0019174845408875402, + "loss": 0.0869, + "step": 14051 + }, + { + "epoch": 0.1219781078289251, + "grad_norm": 0.1953125, + "learning_rate": 0.001917472091272026, + "loss": 0.1367, + "step": 14052 + }, + { + "epoch": 0.12198678830912926, + "grad_norm": 0.248046875, + "learning_rate": 0.0019174596407625368, + "loss": 0.1104, + "step": 14053 + }, + { + "epoch": 0.12199546878933343, + "grad_norm": 1.4921875, + "learning_rate": 0.001917447189359086, + "loss": 0.1533, + "step": 14054 + }, + { + "epoch": 0.12200414926953759, + "grad_norm": 0.2451171875, + "learning_rate": 0.001917434737061687, + "loss": 0.126, + "step": 14055 + }, + { + "epoch": 0.12201282974974176, + "grad_norm": 0.34375, + "learning_rate": 0.0019174222838703543, + "loss": 0.1074, + "step": 14056 + }, + { + "epoch": 0.12202151022994592, + "grad_norm": 0.52734375, + "learning_rate": 0.0019174098297851004, + "loss": 0.1191, + "step": 14057 + }, + { + "epoch": 0.12203019071015009, + "grad_norm": 0.1484375, + "learning_rate": 0.0019173973748059393, + "loss": 0.0977, + 
"step": 14058 + }, + { + "epoch": 0.12203887119035425, + "grad_norm": 0.53515625, + "learning_rate": 0.001917384918932885, + "loss": 0.0845, + "step": 14059 + }, + { + "epoch": 0.12204755167055842, + "grad_norm": 0.62109375, + "learning_rate": 0.0019173724621659506, + "loss": 0.0981, + "step": 14060 + }, + { + "epoch": 0.12205623215076258, + "grad_norm": 0.236328125, + "learning_rate": 0.00191736000450515, + "loss": 0.127, + "step": 14061 + }, + { + "epoch": 0.12206491263096675, + "grad_norm": 0.328125, + "learning_rate": 0.001917347545950497, + "loss": 0.0986, + "step": 14062 + }, + { + "epoch": 0.12207359311117091, + "grad_norm": 0.1552734375, + "learning_rate": 0.0019173350865020048, + "loss": 0.127, + "step": 14063 + }, + { + "epoch": 0.12208227359137508, + "grad_norm": 0.259765625, + "learning_rate": 0.001917322626159687, + "loss": 0.1504, + "step": 14064 + }, + { + "epoch": 0.12209095407157924, + "grad_norm": 0.404296875, + "learning_rate": 0.0019173101649235579, + "loss": 0.1484, + "step": 14065 + }, + { + "epoch": 0.12209963455178341, + "grad_norm": 0.341796875, + "learning_rate": 0.0019172977027936302, + "loss": 0.1045, + "step": 14066 + }, + { + "epoch": 0.12210831503198757, + "grad_norm": 0.28125, + "learning_rate": 0.0019172852397699185, + "loss": 0.1045, + "step": 14067 + }, + { + "epoch": 0.12211699551219174, + "grad_norm": 0.29296875, + "learning_rate": 0.0019172727758524357, + "loss": 0.1245, + "step": 14068 + }, + { + "epoch": 0.1221256759923959, + "grad_norm": 0.5, + "learning_rate": 0.0019172603110411957, + "loss": 0.1055, + "step": 14069 + }, + { + "epoch": 0.12213435647260007, + "grad_norm": 0.609375, + "learning_rate": 0.001917247845336212, + "loss": 0.104, + "step": 14070 + }, + { + "epoch": 0.12214303695280423, + "grad_norm": 0.69140625, + "learning_rate": 0.0019172353787374988, + "loss": 0.166, + "step": 14071 + }, + { + "epoch": 0.1221517174330084, + "grad_norm": 0.16796875, + "learning_rate": 0.0019172229112450687, + "loss": 0.1143, + "step": 14072 + }, + { + "epoch": 0.12216039791321257, + "grad_norm": 0.11083984375, + "learning_rate": 0.001917210442858936, + "loss": 0.1074, + "step": 14073 + }, + { + "epoch": 0.12216907839341673, + "grad_norm": 0.16796875, + "learning_rate": 0.0019171979735791145, + "loss": 0.0835, + "step": 14074 + }, + { + "epoch": 0.1221777588736209, + "grad_norm": 0.640625, + "learning_rate": 0.0019171855034056174, + "loss": 0.1377, + "step": 14075 + }, + { + "epoch": 0.12218643935382506, + "grad_norm": 0.435546875, + "learning_rate": 0.0019171730323384586, + "loss": 0.1221, + "step": 14076 + }, + { + "epoch": 0.12219511983402921, + "grad_norm": 0.2255859375, + "learning_rate": 0.0019171605603776516, + "loss": 0.0903, + "step": 14077 + }, + { + "epoch": 0.12220380031423338, + "grad_norm": 0.1708984375, + "learning_rate": 0.00191714808752321, + "loss": 0.126, + "step": 14078 + }, + { + "epoch": 0.12221248079443754, + "grad_norm": 1.078125, + "learning_rate": 0.0019171356137751478, + "loss": 0.207, + "step": 14079 + }, + { + "epoch": 0.12222116127464171, + "grad_norm": 0.10009765625, + "learning_rate": 0.0019171231391334781, + "loss": 0.083, + "step": 14080 + }, + { + "epoch": 0.12222984175484587, + "grad_norm": 0.3984375, + "learning_rate": 0.0019171106635982149, + "loss": 0.125, + "step": 14081 + }, + { + "epoch": 0.12223852223505004, + "grad_norm": 1.2734375, + "learning_rate": 0.0019170981871693717, + "loss": 0.1562, + "step": 14082 + }, + { + "epoch": 0.1222472027152542, + "grad_norm": 0.12255859375, + "learning_rate": 
0.0019170857098469625, + "loss": 0.1338, + "step": 14083 + }, + { + "epoch": 0.12225588319545837, + "grad_norm": 0.45703125, + "learning_rate": 0.0019170732316310002, + "loss": 0.085, + "step": 14084 + }, + { + "epoch": 0.12226456367566253, + "grad_norm": 0.1875, + "learning_rate": 0.0019170607525214988, + "loss": 0.127, + "step": 14085 + }, + { + "epoch": 0.1222732441558667, + "grad_norm": 0.126953125, + "learning_rate": 0.0019170482725184728, + "loss": 0.1108, + "step": 14086 + }, + { + "epoch": 0.12228192463607086, + "grad_norm": 0.46875, + "learning_rate": 0.0019170357916219345, + "loss": 0.1602, + "step": 14087 + }, + { + "epoch": 0.12229060511627503, + "grad_norm": 0.5859375, + "learning_rate": 0.001917023309831898, + "loss": 0.1289, + "step": 14088 + }, + { + "epoch": 0.1222992855964792, + "grad_norm": 0.490234375, + "learning_rate": 0.0019170108271483776, + "loss": 0.1582, + "step": 14089 + }, + { + "epoch": 0.12230796607668336, + "grad_norm": 0.1474609375, + "learning_rate": 0.001916998343571386, + "loss": 0.0825, + "step": 14090 + }, + { + "epoch": 0.12231664655688752, + "grad_norm": 0.388671875, + "learning_rate": 0.0019169858591009375, + "loss": 0.1001, + "step": 14091 + }, + { + "epoch": 0.12232532703709169, + "grad_norm": 0.294921875, + "learning_rate": 0.0019169733737370454, + "loss": 0.1357, + "step": 14092 + }, + { + "epoch": 0.12233400751729585, + "grad_norm": 0.08544921875, + "learning_rate": 0.0019169608874797237, + "loss": 0.1172, + "step": 14093 + }, + { + "epoch": 0.12234268799750002, + "grad_norm": 0.25390625, + "learning_rate": 0.0019169484003289858, + "loss": 0.1216, + "step": 14094 + }, + { + "epoch": 0.12235136847770418, + "grad_norm": 0.56640625, + "learning_rate": 0.0019169359122848454, + "loss": 0.0815, + "step": 14095 + }, + { + "epoch": 0.12236004895790835, + "grad_norm": 0.37109375, + "learning_rate": 0.0019169234233473158, + "loss": 0.1416, + "step": 14096 + }, + { + "epoch": 0.12236872943811251, + "grad_norm": 0.095703125, + "learning_rate": 0.0019169109335164115, + "loss": 0.0947, + "step": 14097 + }, + { + "epoch": 0.12237740991831668, + "grad_norm": 0.1201171875, + "learning_rate": 0.0019168984427921454, + "loss": 0.1426, + "step": 14098 + }, + { + "epoch": 0.12238609039852084, + "grad_norm": 0.2294921875, + "learning_rate": 0.0019168859511745316, + "loss": 0.124, + "step": 14099 + }, + { + "epoch": 0.12239477087872501, + "grad_norm": 0.134765625, + "learning_rate": 0.0019168734586635834, + "loss": 0.1084, + "step": 14100 + }, + { + "epoch": 0.12240345135892917, + "grad_norm": 0.51953125, + "learning_rate": 0.001916860965259315, + "loss": 0.0913, + "step": 14101 + }, + { + "epoch": 0.12241213183913334, + "grad_norm": 0.8203125, + "learning_rate": 0.0019168484709617395, + "loss": 0.1064, + "step": 14102 + }, + { + "epoch": 0.1224208123193375, + "grad_norm": 0.10205078125, + "learning_rate": 0.001916835975770871, + "loss": 0.1377, + "step": 14103 + }, + { + "epoch": 0.12242949279954167, + "grad_norm": 0.2412109375, + "learning_rate": 0.0019168234796867224, + "loss": 0.0938, + "step": 14104 + }, + { + "epoch": 0.12243817327974583, + "grad_norm": 0.2333984375, + "learning_rate": 0.0019168109827093084, + "loss": 0.1079, + "step": 14105 + }, + { + "epoch": 0.12244685375995, + "grad_norm": 0.1064453125, + "learning_rate": 0.001916798484838642, + "loss": 0.127, + "step": 14106 + }, + { + "epoch": 0.12245553424015417, + "grad_norm": 0.10205078125, + "learning_rate": 0.0019167859860747371, + "loss": 0.0991, + "step": 14107 + }, + { + "epoch": 
0.12246421472035833, + "grad_norm": 0.494140625, + "learning_rate": 0.0019167734864176075, + "loss": 0.0933, + "step": 14108 + }, + { + "epoch": 0.1224728952005625, + "grad_norm": 0.2275390625, + "learning_rate": 0.001916760985867266, + "loss": 0.1187, + "step": 14109 + }, + { + "epoch": 0.12248157568076666, + "grad_norm": 0.185546875, + "learning_rate": 0.0019167484844237277, + "loss": 0.1182, + "step": 14110 + }, + { + "epoch": 0.12249025616097083, + "grad_norm": 0.255859375, + "learning_rate": 0.0019167359820870052, + "loss": 0.1191, + "step": 14111 + }, + { + "epoch": 0.12249893664117499, + "grad_norm": 0.55859375, + "learning_rate": 0.0019167234788571128, + "loss": 0.1143, + "step": 14112 + }, + { + "epoch": 0.12250761712137916, + "grad_norm": 0.2255859375, + "learning_rate": 0.0019167109747340635, + "loss": 0.1211, + "step": 14113 + }, + { + "epoch": 0.12251629760158332, + "grad_norm": 0.447265625, + "learning_rate": 0.0019166984697178717, + "loss": 0.1279, + "step": 14114 + }, + { + "epoch": 0.12252497808178749, + "grad_norm": 0.51171875, + "learning_rate": 0.0019166859638085503, + "loss": 0.1211, + "step": 14115 + }, + { + "epoch": 0.12253365856199165, + "grad_norm": 0.71484375, + "learning_rate": 0.0019166734570061138, + "loss": 0.1035, + "step": 14116 + }, + { + "epoch": 0.12254233904219582, + "grad_norm": 0.1650390625, + "learning_rate": 0.0019166609493105752, + "loss": 0.1279, + "step": 14117 + }, + { + "epoch": 0.12255101952239998, + "grad_norm": 0.57421875, + "learning_rate": 0.0019166484407219485, + "loss": 0.104, + "step": 14118 + }, + { + "epoch": 0.12255970000260415, + "grad_norm": 0.1796875, + "learning_rate": 0.0019166359312402475, + "loss": 0.1191, + "step": 14119 + }, + { + "epoch": 0.12256838048280831, + "grad_norm": 0.65234375, + "learning_rate": 0.0019166234208654854, + "loss": 0.1387, + "step": 14120 + }, + { + "epoch": 0.12257706096301248, + "grad_norm": 0.39453125, + "learning_rate": 0.0019166109095976765, + "loss": 0.1562, + "step": 14121 + }, + { + "epoch": 0.12258574144321664, + "grad_norm": 0.251953125, + "learning_rate": 0.0019165983974368341, + "loss": 0.0996, + "step": 14122 + }, + { + "epoch": 0.12259442192342081, + "grad_norm": 0.154296875, + "learning_rate": 0.001916585884382972, + "loss": 0.1377, + "step": 14123 + }, + { + "epoch": 0.12260310240362497, + "grad_norm": 0.2314453125, + "learning_rate": 0.0019165733704361039, + "loss": 0.0767, + "step": 14124 + }, + { + "epoch": 0.12261178288382914, + "grad_norm": 0.201171875, + "learning_rate": 0.0019165608555962433, + "loss": 0.1357, + "step": 14125 + }, + { + "epoch": 0.1226204633640333, + "grad_norm": 0.74609375, + "learning_rate": 0.0019165483398634038, + "loss": 0.1152, + "step": 14126 + }, + { + "epoch": 0.12262914384423747, + "grad_norm": 0.126953125, + "learning_rate": 0.0019165358232375999, + "loss": 0.0967, + "step": 14127 + }, + { + "epoch": 0.12263782432444163, + "grad_norm": 0.16015625, + "learning_rate": 0.0019165233057188442, + "loss": 0.1074, + "step": 14128 + }, + { + "epoch": 0.1226465048046458, + "grad_norm": 0.28515625, + "learning_rate": 0.001916510787307151, + "loss": 0.1216, + "step": 14129 + }, + { + "epoch": 0.12265518528484996, + "grad_norm": 0.1142578125, + "learning_rate": 0.001916498268002534, + "loss": 0.1221, + "step": 14130 + }, + { + "epoch": 0.12266386576505413, + "grad_norm": 0.72265625, + "learning_rate": 0.0019164857478050066, + "loss": 0.168, + "step": 14131 + }, + { + "epoch": 0.1226725462452583, + "grad_norm": 0.134765625, + "learning_rate": 0.0019164732267145828, 
+ "loss": 0.1504, + "step": 14132 + }, + { + "epoch": 0.12268122672546246, + "grad_norm": 0.11669921875, + "learning_rate": 0.0019164607047312762, + "loss": 0.1045, + "step": 14133 + }, + { + "epoch": 0.12268990720566662, + "grad_norm": 0.73046875, + "learning_rate": 0.0019164481818551, + "loss": 0.1104, + "step": 14134 + }, + { + "epoch": 0.12269858768587079, + "grad_norm": 0.1591796875, + "learning_rate": 0.0019164356580860687, + "loss": 0.1074, + "step": 14135 + }, + { + "epoch": 0.12270726816607495, + "grad_norm": 0.17578125, + "learning_rate": 0.0019164231334241955, + "loss": 0.1021, + "step": 14136 + }, + { + "epoch": 0.12271594864627912, + "grad_norm": 0.44140625, + "learning_rate": 0.0019164106078694942, + "loss": 0.085, + "step": 14137 + }, + { + "epoch": 0.12272462912648328, + "grad_norm": 0.1474609375, + "learning_rate": 0.0019163980814219787, + "loss": 0.0942, + "step": 14138 + }, + { + "epoch": 0.12273330960668744, + "grad_norm": 0.83203125, + "learning_rate": 0.0019163855540816623, + "loss": 0.1089, + "step": 14139 + }, + { + "epoch": 0.1227419900868916, + "grad_norm": 0.1328125, + "learning_rate": 0.0019163730258485593, + "loss": 0.1133, + "step": 14140 + }, + { + "epoch": 0.12275067056709577, + "grad_norm": 0.287109375, + "learning_rate": 0.0019163604967226827, + "loss": 0.1074, + "step": 14141 + }, + { + "epoch": 0.12275935104729993, + "grad_norm": 0.470703125, + "learning_rate": 0.0019163479667040468, + "loss": 0.1416, + "step": 14142 + }, + { + "epoch": 0.1227680315275041, + "grad_norm": 0.283203125, + "learning_rate": 0.0019163354357926645, + "loss": 0.1191, + "step": 14143 + }, + { + "epoch": 0.12277671200770826, + "grad_norm": 0.275390625, + "learning_rate": 0.0019163229039885505, + "loss": 0.1299, + "step": 14144 + }, + { + "epoch": 0.12278539248791243, + "grad_norm": 0.134765625, + "learning_rate": 0.001916310371291718, + "loss": 0.126, + "step": 14145 + }, + { + "epoch": 0.12279407296811659, + "grad_norm": 0.486328125, + "learning_rate": 0.0019162978377021806, + "loss": 0.127, + "step": 14146 + }, + { + "epoch": 0.12280275344832076, + "grad_norm": 2.75, + "learning_rate": 0.0019162853032199524, + "loss": 0.3652, + "step": 14147 + }, + { + "epoch": 0.12281143392852492, + "grad_norm": 0.14453125, + "learning_rate": 0.0019162727678450468, + "loss": 0.0986, + "step": 14148 + }, + { + "epoch": 0.12282011440872909, + "grad_norm": 0.11572265625, + "learning_rate": 0.0019162602315774772, + "loss": 0.1143, + "step": 14149 + }, + { + "epoch": 0.12282879488893325, + "grad_norm": 0.314453125, + "learning_rate": 0.0019162476944172577, + "loss": 0.085, + "step": 14150 + }, + { + "epoch": 0.12283747536913742, + "grad_norm": 1.125, + "learning_rate": 0.0019162351563644022, + "loss": 0.1357, + "step": 14151 + }, + { + "epoch": 0.12284615584934158, + "grad_norm": 0.625, + "learning_rate": 0.0019162226174189243, + "loss": 0.1201, + "step": 14152 + }, + { + "epoch": 0.12285483632954575, + "grad_norm": 0.109375, + "learning_rate": 0.0019162100775808376, + "loss": 0.1309, + "step": 14153 + }, + { + "epoch": 0.12286351680974991, + "grad_norm": 0.2080078125, + "learning_rate": 0.0019161975368501556, + "loss": 0.1465, + "step": 14154 + }, + { + "epoch": 0.12287219728995408, + "grad_norm": 0.091796875, + "learning_rate": 0.0019161849952268925, + "loss": 0.1582, + "step": 14155 + }, + { + "epoch": 0.12288087777015824, + "grad_norm": 0.09326171875, + "learning_rate": 0.0019161724527110616, + "loss": 0.1641, + "step": 14156 + }, + { + "epoch": 0.12288955825036241, + "grad_norm": 0.064453125, + 
"learning_rate": 0.0019161599093026767, + "loss": 0.0811, + "step": 14157 + }, + { + "epoch": 0.12289823873056657, + "grad_norm": 0.3359375, + "learning_rate": 0.0019161473650017515, + "loss": 0.0918, + "step": 14158 + }, + { + "epoch": 0.12290691921077074, + "grad_norm": 0.484375, + "learning_rate": 0.0019161348198083001, + "loss": 0.0815, + "step": 14159 + }, + { + "epoch": 0.1229155996909749, + "grad_norm": 0.48046875, + "learning_rate": 0.0019161222737223357, + "loss": 0.083, + "step": 14160 + }, + { + "epoch": 0.12292428017117907, + "grad_norm": 0.08349609375, + "learning_rate": 0.0019161097267438726, + "loss": 0.0962, + "step": 14161 + }, + { + "epoch": 0.12293296065138323, + "grad_norm": 0.330078125, + "learning_rate": 0.0019160971788729237, + "loss": 0.124, + "step": 14162 + }, + { + "epoch": 0.1229416411315874, + "grad_norm": 0.78125, + "learning_rate": 0.0019160846301095036, + "loss": 0.1069, + "step": 14163 + }, + { + "epoch": 0.12295032161179156, + "grad_norm": 0.41015625, + "learning_rate": 0.0019160720804536253, + "loss": 0.1406, + "step": 14164 + }, + { + "epoch": 0.12295900209199573, + "grad_norm": 0.10595703125, + "learning_rate": 0.001916059529905303, + "loss": 0.1147, + "step": 14165 + }, + { + "epoch": 0.1229676825721999, + "grad_norm": 0.1181640625, + "learning_rate": 0.0019160469784645503, + "loss": 0.1021, + "step": 14166 + }, + { + "epoch": 0.12297636305240406, + "grad_norm": 0.42578125, + "learning_rate": 0.0019160344261313805, + "loss": 0.1152, + "step": 14167 + }, + { + "epoch": 0.12298504353260822, + "grad_norm": 0.5703125, + "learning_rate": 0.0019160218729058081, + "loss": 0.1543, + "step": 14168 + }, + { + "epoch": 0.12299372401281239, + "grad_norm": 0.3671875, + "learning_rate": 0.0019160093187878461, + "loss": 0.1143, + "step": 14169 + }, + { + "epoch": 0.12300240449301655, + "grad_norm": 0.1884765625, + "learning_rate": 0.0019159967637775088, + "loss": 0.1074, + "step": 14170 + }, + { + "epoch": 0.12301108497322072, + "grad_norm": 0.11376953125, + "learning_rate": 0.00191598420787481, + "loss": 0.1436, + "step": 14171 + }, + { + "epoch": 0.12301976545342488, + "grad_norm": 0.0849609375, + "learning_rate": 0.0019159716510797628, + "loss": 0.1162, + "step": 14172 + }, + { + "epoch": 0.12302844593362905, + "grad_norm": 0.69140625, + "learning_rate": 0.0019159590933923811, + "loss": 0.1045, + "step": 14173 + }, + { + "epoch": 0.12303712641383321, + "grad_norm": 0.92578125, + "learning_rate": 0.001915946534812679, + "loss": 0.1465, + "step": 14174 + }, + { + "epoch": 0.12304580689403738, + "grad_norm": 0.06982421875, + "learning_rate": 0.0019159339753406703, + "loss": 0.1006, + "step": 14175 + }, + { + "epoch": 0.12305448737424154, + "grad_norm": 0.30078125, + "learning_rate": 0.001915921414976368, + "loss": 0.1001, + "step": 14176 + }, + { + "epoch": 0.12306316785444571, + "grad_norm": 0.91796875, + "learning_rate": 0.0019159088537197865, + "loss": 0.0933, + "step": 14177 + }, + { + "epoch": 0.12307184833464987, + "grad_norm": 0.8125, + "learning_rate": 0.0019158962915709393, + "loss": 0.127, + "step": 14178 + }, + { + "epoch": 0.12308052881485404, + "grad_norm": 0.5859375, + "learning_rate": 0.0019158837285298404, + "loss": 0.0942, + "step": 14179 + }, + { + "epoch": 0.1230892092950582, + "grad_norm": 0.314453125, + "learning_rate": 0.0019158711645965029, + "loss": 0.0928, + "step": 14180 + }, + { + "epoch": 0.12309788977526237, + "grad_norm": 0.333984375, + "learning_rate": 0.0019158585997709411, + "loss": 0.1201, + "step": 14181 + }, + { + "epoch": 
0.12310657025546654, + "grad_norm": 0.2890625, + "learning_rate": 0.0019158460340531686, + "loss": 0.0889, + "step": 14182 + }, + { + "epoch": 0.1231152507356707, + "grad_norm": 0.66796875, + "learning_rate": 0.0019158334674431992, + "loss": 0.0742, + "step": 14183 + }, + { + "epoch": 0.12312393121587487, + "grad_norm": 0.55078125, + "learning_rate": 0.0019158208999410467, + "loss": 0.085, + "step": 14184 + }, + { + "epoch": 0.12313261169607903, + "grad_norm": 0.34765625, + "learning_rate": 0.0019158083315467245, + "loss": 0.1572, + "step": 14185 + }, + { + "epoch": 0.1231412921762832, + "grad_norm": 0.71484375, + "learning_rate": 0.0019157957622602468, + "loss": 0.1602, + "step": 14186 + }, + { + "epoch": 0.12314997265648736, + "grad_norm": 0.671875, + "learning_rate": 0.001915783192081627, + "loss": 0.1572, + "step": 14187 + }, + { + "epoch": 0.12315865313669153, + "grad_norm": 0.671875, + "learning_rate": 0.001915770621010879, + "loss": 0.1094, + "step": 14188 + }, + { + "epoch": 0.12316733361689569, + "grad_norm": 0.38671875, + "learning_rate": 0.0019157580490480161, + "loss": 0.1006, + "step": 14189 + }, + { + "epoch": 0.12317601409709986, + "grad_norm": 0.28125, + "learning_rate": 0.001915745476193053, + "loss": 0.1143, + "step": 14190 + }, + { + "epoch": 0.12318469457730402, + "grad_norm": 0.13671875, + "learning_rate": 0.0019157329024460025, + "loss": 0.0938, + "step": 14191 + }, + { + "epoch": 0.12319337505750819, + "grad_norm": 0.388671875, + "learning_rate": 0.0019157203278068792, + "loss": 0.1768, + "step": 14192 + }, + { + "epoch": 0.12320205553771235, + "grad_norm": 0.10888671875, + "learning_rate": 0.0019157077522756957, + "loss": 0.1533, + "step": 14193 + }, + { + "epoch": 0.12321073601791652, + "grad_norm": 0.32421875, + "learning_rate": 0.001915695175852467, + "loss": 0.0845, + "step": 14194 + }, + { + "epoch": 0.12321941649812068, + "grad_norm": 0.181640625, + "learning_rate": 0.0019156825985372065, + "loss": 0.1143, + "step": 14195 + }, + { + "epoch": 0.12322809697832485, + "grad_norm": 0.2294921875, + "learning_rate": 0.0019156700203299274, + "loss": 0.127, + "step": 14196 + }, + { + "epoch": 0.12323677745852901, + "grad_norm": 0.5234375, + "learning_rate": 0.0019156574412306437, + "loss": 0.1084, + "step": 14197 + }, + { + "epoch": 0.12324545793873318, + "grad_norm": 0.1298828125, + "learning_rate": 0.0019156448612393695, + "loss": 0.0986, + "step": 14198 + }, + { + "epoch": 0.12325413841893734, + "grad_norm": 0.1884765625, + "learning_rate": 0.0019156322803561183, + "loss": 0.1299, + "step": 14199 + }, + { + "epoch": 0.1232628188991415, + "grad_norm": 0.205078125, + "learning_rate": 0.001915619698580904, + "loss": 0.124, + "step": 14200 + }, + { + "epoch": 0.12327149937934566, + "grad_norm": 0.173828125, + "learning_rate": 0.0019156071159137401, + "loss": 0.1133, + "step": 14201 + }, + { + "epoch": 0.12328017985954982, + "grad_norm": 0.0673828125, + "learning_rate": 0.0019155945323546407, + "loss": 0.0986, + "step": 14202 + }, + { + "epoch": 0.12328886033975399, + "grad_norm": 0.09912109375, + "learning_rate": 0.0019155819479036195, + "loss": 0.1064, + "step": 14203 + }, + { + "epoch": 0.12329754081995815, + "grad_norm": 0.58984375, + "learning_rate": 0.0019155693625606896, + "loss": 0.1582, + "step": 14204 + }, + { + "epoch": 0.12330622130016232, + "grad_norm": 0.85546875, + "learning_rate": 0.0019155567763258657, + "loss": 0.123, + "step": 14205 + }, + { + "epoch": 0.12331490178036648, + "grad_norm": 0.08349609375, + "learning_rate": 0.0019155441891991608, + 
"loss": 0.0825, + "step": 14206 + }, + { + "epoch": 0.12332358226057065, + "grad_norm": 0.2490234375, + "learning_rate": 0.0019155316011805893, + "loss": 0.125, + "step": 14207 + }, + { + "epoch": 0.12333226274077481, + "grad_norm": 0.07958984375, + "learning_rate": 0.0019155190122701648, + "loss": 0.104, + "step": 14208 + }, + { + "epoch": 0.12334094322097898, + "grad_norm": 0.6640625, + "learning_rate": 0.0019155064224679007, + "loss": 0.0918, + "step": 14209 + }, + { + "epoch": 0.12334962370118314, + "grad_norm": 0.13671875, + "learning_rate": 0.0019154938317738112, + "loss": 0.1338, + "step": 14210 + }, + { + "epoch": 0.12335830418138731, + "grad_norm": 0.2060546875, + "learning_rate": 0.0019154812401879098, + "loss": 0.1201, + "step": 14211 + }, + { + "epoch": 0.12336698466159148, + "grad_norm": 0.60546875, + "learning_rate": 0.0019154686477102106, + "loss": 0.1211, + "step": 14212 + }, + { + "epoch": 0.12337566514179564, + "grad_norm": 0.12353515625, + "learning_rate": 0.0019154560543407269, + "loss": 0.1836, + "step": 14213 + }, + { + "epoch": 0.1233843456219998, + "grad_norm": 0.06640625, + "learning_rate": 0.0019154434600794727, + "loss": 0.0977, + "step": 14214 + }, + { + "epoch": 0.12339302610220397, + "grad_norm": 0.33203125, + "learning_rate": 0.0019154308649264617, + "loss": 0.1338, + "step": 14215 + }, + { + "epoch": 0.12340170658240814, + "grad_norm": 0.2734375, + "learning_rate": 0.0019154182688817078, + "loss": 0.0791, + "step": 14216 + }, + { + "epoch": 0.1234103870626123, + "grad_norm": 0.474609375, + "learning_rate": 0.0019154056719452254, + "loss": 0.0938, + "step": 14217 + }, + { + "epoch": 0.12341906754281647, + "grad_norm": 0.12353515625, + "learning_rate": 0.0019153930741170268, + "loss": 0.0938, + "step": 14218 + }, + { + "epoch": 0.12342774802302063, + "grad_norm": 0.333984375, + "learning_rate": 0.001915380475397127, + "loss": 0.2539, + "step": 14219 + }, + { + "epoch": 0.1234364285032248, + "grad_norm": 0.2890625, + "learning_rate": 0.001915367875785539, + "loss": 0.0786, + "step": 14220 + }, + { + "epoch": 0.12344510898342896, + "grad_norm": 0.71875, + "learning_rate": 0.001915355275282277, + "loss": 0.1084, + "step": 14221 + }, + { + "epoch": 0.12345378946363313, + "grad_norm": 0.34765625, + "learning_rate": 0.001915342673887355, + "loss": 0.0874, + "step": 14222 + }, + { + "epoch": 0.12346246994383729, + "grad_norm": 0.474609375, + "learning_rate": 0.0019153300716007865, + "loss": 0.1191, + "step": 14223 + }, + { + "epoch": 0.12347115042404146, + "grad_norm": 0.37109375, + "learning_rate": 0.0019153174684225852, + "loss": 0.0972, + "step": 14224 + }, + { + "epoch": 0.12347983090424562, + "grad_norm": 0.1025390625, + "learning_rate": 0.0019153048643527648, + "loss": 0.0825, + "step": 14225 + }, + { + "epoch": 0.12348851138444979, + "grad_norm": 0.765625, + "learning_rate": 0.0019152922593913397, + "loss": 0.0933, + "step": 14226 + }, + { + "epoch": 0.12349719186465395, + "grad_norm": 0.1923828125, + "learning_rate": 0.0019152796535383224, + "loss": 0.1484, + "step": 14227 + }, + { + "epoch": 0.12350587234485812, + "grad_norm": 0.6796875, + "learning_rate": 0.0019152670467937282, + "loss": 0.1543, + "step": 14228 + }, + { + "epoch": 0.12351455282506228, + "grad_norm": 0.103515625, + "learning_rate": 0.0019152544391575698, + "loss": 0.1123, + "step": 14229 + }, + { + "epoch": 0.12352323330526645, + "grad_norm": 0.39453125, + "learning_rate": 0.0019152418306298618, + "loss": 0.1436, + "step": 14230 + }, + { + "epoch": 0.12353191378547061, + "grad_norm": 
0.13671875, + "learning_rate": 0.0019152292212106176, + "loss": 0.1006, + "step": 14231 + }, + { + "epoch": 0.12354059426567478, + "grad_norm": 0.0732421875, + "learning_rate": 0.0019152166108998506, + "loss": 0.1152, + "step": 14232 + }, + { + "epoch": 0.12354927474587894, + "grad_norm": 0.228515625, + "learning_rate": 0.0019152039996975752, + "loss": 0.1328, + "step": 14233 + }, + { + "epoch": 0.12355795522608311, + "grad_norm": 0.181640625, + "learning_rate": 0.0019151913876038046, + "loss": 0.125, + "step": 14234 + }, + { + "epoch": 0.12356663570628727, + "grad_norm": 0.515625, + "learning_rate": 0.0019151787746185536, + "loss": 0.1094, + "step": 14235 + }, + { + "epoch": 0.12357531618649144, + "grad_norm": 0.796875, + "learning_rate": 0.0019151661607418348, + "loss": 0.1357, + "step": 14236 + }, + { + "epoch": 0.1235839966666956, + "grad_norm": 0.4375, + "learning_rate": 0.001915153545973663, + "loss": 0.1069, + "step": 14237 + }, + { + "epoch": 0.12359267714689977, + "grad_norm": 0.26171875, + "learning_rate": 0.001915140930314051, + "loss": 0.1006, + "step": 14238 + }, + { + "epoch": 0.12360135762710393, + "grad_norm": 0.2578125, + "learning_rate": 0.0019151283137630135, + "loss": 0.1113, + "step": 14239 + }, + { + "epoch": 0.1236100381073081, + "grad_norm": 0.5078125, + "learning_rate": 0.0019151156963205637, + "loss": 0.0869, + "step": 14240 + }, + { + "epoch": 0.12361871858751226, + "grad_norm": 0.4296875, + "learning_rate": 0.0019151030779867159, + "loss": 0.1094, + "step": 14241 + }, + { + "epoch": 0.12362739906771643, + "grad_norm": 0.0576171875, + "learning_rate": 0.0019150904587614832, + "loss": 0.0928, + "step": 14242 + }, + { + "epoch": 0.1236360795479206, + "grad_norm": 0.1044921875, + "learning_rate": 0.0019150778386448802, + "loss": 0.1079, + "step": 14243 + }, + { + "epoch": 0.12364476002812476, + "grad_norm": 0.2236328125, + "learning_rate": 0.00191506521763692, + "loss": 0.1387, + "step": 14244 + }, + { + "epoch": 0.12365344050832892, + "grad_norm": 0.333984375, + "learning_rate": 0.001915052595737617, + "loss": 0.1719, + "step": 14245 + }, + { + "epoch": 0.12366212098853309, + "grad_norm": 0.51953125, + "learning_rate": 0.0019150399729469848, + "loss": 0.1152, + "step": 14246 + }, + { + "epoch": 0.12367080146873725, + "grad_norm": 0.17578125, + "learning_rate": 0.0019150273492650367, + "loss": 0.0776, + "step": 14247 + }, + { + "epoch": 0.12367948194894142, + "grad_norm": 0.375, + "learning_rate": 0.0019150147246917875, + "loss": 0.1152, + "step": 14248 + }, + { + "epoch": 0.12368816242914558, + "grad_norm": 0.1806640625, + "learning_rate": 0.00191500209922725, + "loss": 0.0952, + "step": 14249 + }, + { + "epoch": 0.12369684290934975, + "grad_norm": 0.5234375, + "learning_rate": 0.0019149894728714388, + "loss": 0.1069, + "step": 14250 + }, + { + "epoch": 0.12370552338955391, + "grad_norm": 0.3203125, + "learning_rate": 0.001914976845624367, + "loss": 0.1128, + "step": 14251 + }, + { + "epoch": 0.12371420386975808, + "grad_norm": 0.1845703125, + "learning_rate": 0.0019149642174860488, + "loss": 0.1309, + "step": 14252 + }, + { + "epoch": 0.12372288434996224, + "grad_norm": 0.3984375, + "learning_rate": 0.0019149515884564982, + "loss": 0.1006, + "step": 14253 + }, + { + "epoch": 0.12373156483016641, + "grad_norm": 0.37890625, + "learning_rate": 0.0019149389585357286, + "loss": 0.1504, + "step": 14254 + }, + { + "epoch": 0.12374024531037058, + "grad_norm": 0.26171875, + "learning_rate": 0.0019149263277237541, + "loss": 0.1035, + "step": 14255 + }, + { + "epoch": 
0.12374892579057474, + "grad_norm": 0.1318359375, + "learning_rate": 0.0019149136960205887, + "loss": 0.1118, + "step": 14256 + }, + { + "epoch": 0.1237576062707789, + "grad_norm": 0.380859375, + "learning_rate": 0.0019149010634262455, + "loss": 0.1289, + "step": 14257 + }, + { + "epoch": 0.12376628675098307, + "grad_norm": 0.107421875, + "learning_rate": 0.0019148884299407386, + "loss": 0.1445, + "step": 14258 + }, + { + "epoch": 0.12377496723118724, + "grad_norm": 0.1806640625, + "learning_rate": 0.0019148757955640824, + "loss": 0.1084, + "step": 14259 + }, + { + "epoch": 0.1237836477113914, + "grad_norm": 0.80859375, + "learning_rate": 0.00191486316029629, + "loss": 0.1201, + "step": 14260 + }, + { + "epoch": 0.12379232819159557, + "grad_norm": 0.107421875, + "learning_rate": 0.0019148505241373757, + "loss": 0.1143, + "step": 14261 + }, + { + "epoch": 0.12380100867179972, + "grad_norm": 0.115234375, + "learning_rate": 0.0019148378870873528, + "loss": 0.1025, + "step": 14262 + }, + { + "epoch": 0.12380968915200388, + "grad_norm": 0.2197265625, + "learning_rate": 0.0019148252491462358, + "loss": 0.1152, + "step": 14263 + }, + { + "epoch": 0.12381836963220805, + "grad_norm": 0.419921875, + "learning_rate": 0.001914812610314038, + "loss": 0.1328, + "step": 14264 + }, + { + "epoch": 0.12382705011241221, + "grad_norm": 0.1552734375, + "learning_rate": 0.0019147999705907733, + "loss": 0.0742, + "step": 14265 + }, + { + "epoch": 0.12383573059261638, + "grad_norm": 0.2314453125, + "learning_rate": 0.0019147873299764555, + "loss": 0.1328, + "step": 14266 + }, + { + "epoch": 0.12384441107282054, + "grad_norm": 0.85546875, + "learning_rate": 0.0019147746884710989, + "loss": 0.1533, + "step": 14267 + }, + { + "epoch": 0.12385309155302471, + "grad_norm": 0.0830078125, + "learning_rate": 0.0019147620460747163, + "loss": 0.1475, + "step": 14268 + }, + { + "epoch": 0.12386177203322887, + "grad_norm": 0.44921875, + "learning_rate": 0.0019147494027873227, + "loss": 0.103, + "step": 14269 + }, + { + "epoch": 0.12387045251343304, + "grad_norm": 0.380859375, + "learning_rate": 0.0019147367586089315, + "loss": 0.0894, + "step": 14270 + }, + { + "epoch": 0.1238791329936372, + "grad_norm": 0.330078125, + "learning_rate": 0.001914724113539556, + "loss": 0.1035, + "step": 14271 + }, + { + "epoch": 0.12388781347384137, + "grad_norm": 0.64453125, + "learning_rate": 0.0019147114675792105, + "loss": 0.1602, + "step": 14272 + }, + { + "epoch": 0.12389649395404553, + "grad_norm": 0.318359375, + "learning_rate": 0.0019146988207279093, + "loss": 0.0918, + "step": 14273 + }, + { + "epoch": 0.1239051744342497, + "grad_norm": 0.240234375, + "learning_rate": 0.0019146861729856654, + "loss": 0.1172, + "step": 14274 + }, + { + "epoch": 0.12391385491445386, + "grad_norm": 0.46875, + "learning_rate": 0.0019146735243524926, + "loss": 0.0835, + "step": 14275 + }, + { + "epoch": 0.12392253539465803, + "grad_norm": 0.54296875, + "learning_rate": 0.0019146608748284054, + "loss": 0.1504, + "step": 14276 + }, + { + "epoch": 0.1239312158748622, + "grad_norm": 0.22265625, + "learning_rate": 0.0019146482244134172, + "loss": 0.1025, + "step": 14277 + }, + { + "epoch": 0.12393989635506636, + "grad_norm": 0.46875, + "learning_rate": 0.001914635573107542, + "loss": 0.0767, + "step": 14278 + }, + { + "epoch": 0.12394857683527052, + "grad_norm": 0.3515625, + "learning_rate": 0.0019146229209107935, + "loss": 0.125, + "step": 14279 + }, + { + "epoch": 0.12395725731547469, + "grad_norm": 0.1953125, + "learning_rate": 0.001914610267823186, + 
"loss": 0.0947, + "step": 14280 + }, + { + "epoch": 0.12396593779567885, + "grad_norm": 0.24609375, + "learning_rate": 0.0019145976138447323, + "loss": 0.1162, + "step": 14281 + }, + { + "epoch": 0.12397461827588302, + "grad_norm": 0.46875, + "learning_rate": 0.0019145849589754472, + "loss": 0.1416, + "step": 14282 + }, + { + "epoch": 0.12398329875608718, + "grad_norm": 0.53125, + "learning_rate": 0.0019145723032153443, + "loss": 0.1504, + "step": 14283 + }, + { + "epoch": 0.12399197923629135, + "grad_norm": 0.427734375, + "learning_rate": 0.0019145596465644375, + "loss": 0.1309, + "step": 14284 + }, + { + "epoch": 0.12400065971649551, + "grad_norm": 0.48046875, + "learning_rate": 0.0019145469890227403, + "loss": 0.0996, + "step": 14285 + }, + { + "epoch": 0.12400934019669968, + "grad_norm": 0.126953125, + "learning_rate": 0.0019145343305902668, + "loss": 0.1001, + "step": 14286 + }, + { + "epoch": 0.12401802067690385, + "grad_norm": 1.15625, + "learning_rate": 0.0019145216712670309, + "loss": 0.1328, + "step": 14287 + }, + { + "epoch": 0.12402670115710801, + "grad_norm": 0.1572265625, + "learning_rate": 0.001914509011053046, + "loss": 0.0928, + "step": 14288 + }, + { + "epoch": 0.12403538163731218, + "grad_norm": 0.314453125, + "learning_rate": 0.0019144963499483265, + "loss": 0.0942, + "step": 14289 + }, + { + "epoch": 0.12404406211751634, + "grad_norm": 0.1748046875, + "learning_rate": 0.0019144836879528859, + "loss": 0.1396, + "step": 14290 + }, + { + "epoch": 0.1240527425977205, + "grad_norm": 0.06103515625, + "learning_rate": 0.0019144710250667382, + "loss": 0.0957, + "step": 14291 + }, + { + "epoch": 0.12406142307792467, + "grad_norm": 0.37109375, + "learning_rate": 0.0019144583612898975, + "loss": 0.1113, + "step": 14292 + }, + { + "epoch": 0.12407010355812884, + "grad_norm": 0.330078125, + "learning_rate": 0.0019144456966223773, + "loss": 0.1123, + "step": 14293 + }, + { + "epoch": 0.124078784038333, + "grad_norm": 0.1064453125, + "learning_rate": 0.0019144330310641915, + "loss": 0.1133, + "step": 14294 + }, + { + "epoch": 0.12408746451853717, + "grad_norm": 0.796875, + "learning_rate": 0.0019144203646153538, + "loss": 0.0898, + "step": 14295 + }, + { + "epoch": 0.12409614499874133, + "grad_norm": 1.2734375, + "learning_rate": 0.0019144076972758784, + "loss": 0.1289, + "step": 14296 + }, + { + "epoch": 0.1241048254789455, + "grad_norm": 0.33203125, + "learning_rate": 0.0019143950290457787, + "loss": 0.1338, + "step": 14297 + }, + { + "epoch": 0.12411350595914966, + "grad_norm": 0.37890625, + "learning_rate": 0.001914382359925069, + "loss": 0.0845, + "step": 14298 + }, + { + "epoch": 0.12412218643935383, + "grad_norm": 0.365234375, + "learning_rate": 0.001914369689913763, + "loss": 0.1099, + "step": 14299 + }, + { + "epoch": 0.12413086691955799, + "grad_norm": 0.376953125, + "learning_rate": 0.0019143570190118748, + "loss": 0.0752, + "step": 14300 + }, + { + "epoch": 0.12413954739976216, + "grad_norm": 0.3359375, + "learning_rate": 0.0019143443472194178, + "loss": 0.1064, + "step": 14301 + }, + { + "epoch": 0.12414822787996632, + "grad_norm": 0.16015625, + "learning_rate": 0.001914331674536406, + "loss": 0.1108, + "step": 14302 + }, + { + "epoch": 0.12415690836017049, + "grad_norm": 0.287109375, + "learning_rate": 0.0019143190009628533, + "loss": 0.1748, + "step": 14303 + }, + { + "epoch": 0.12416558884037465, + "grad_norm": 0.61328125, + "learning_rate": 0.0019143063264987735, + "loss": 0.124, + "step": 14304 + }, + { + "epoch": 0.12417426932057882, + "grad_norm": 0.17578125, + 
"learning_rate": 0.0019142936511441805, + "loss": 0.1318, + "step": 14305 + }, + { + "epoch": 0.12418294980078298, + "grad_norm": 0.37890625, + "learning_rate": 0.0019142809748990883, + "loss": 0.1006, + "step": 14306 + }, + { + "epoch": 0.12419163028098715, + "grad_norm": 0.828125, + "learning_rate": 0.0019142682977635111, + "loss": 0.1719, + "step": 14307 + }, + { + "epoch": 0.12420031076119131, + "grad_norm": 0.2890625, + "learning_rate": 0.0019142556197374616, + "loss": 0.1279, + "step": 14308 + }, + { + "epoch": 0.12420899124139548, + "grad_norm": 0.74609375, + "learning_rate": 0.001914242940820955, + "loss": 0.1426, + "step": 14309 + }, + { + "epoch": 0.12421767172159964, + "grad_norm": 0.404296875, + "learning_rate": 0.001914230261014004, + "loss": 0.1172, + "step": 14310 + }, + { + "epoch": 0.12422635220180381, + "grad_norm": 0.28125, + "learning_rate": 0.0019142175803166233, + "loss": 0.1182, + "step": 14311 + }, + { + "epoch": 0.12423503268200797, + "grad_norm": 0.51953125, + "learning_rate": 0.0019142048987288265, + "loss": 0.1279, + "step": 14312 + }, + { + "epoch": 0.12424371316221214, + "grad_norm": 0.45703125, + "learning_rate": 0.0019141922162506275, + "loss": 0.1455, + "step": 14313 + }, + { + "epoch": 0.1242523936424163, + "grad_norm": 0.30859375, + "learning_rate": 0.00191417953288204, + "loss": 0.1416, + "step": 14314 + }, + { + "epoch": 0.12426107412262047, + "grad_norm": 0.25390625, + "learning_rate": 0.0019141668486230782, + "loss": 0.1426, + "step": 14315 + }, + { + "epoch": 0.12426975460282463, + "grad_norm": 0.478515625, + "learning_rate": 0.0019141541634737555, + "loss": 0.1143, + "step": 14316 + }, + { + "epoch": 0.1242784350830288, + "grad_norm": 0.07275390625, + "learning_rate": 0.0019141414774340863, + "loss": 0.0884, + "step": 14317 + }, + { + "epoch": 0.12428711556323296, + "grad_norm": 0.06640625, + "learning_rate": 0.001914128790504084, + "loss": 0.0864, + "step": 14318 + }, + { + "epoch": 0.12429579604343713, + "grad_norm": 0.8125, + "learning_rate": 0.0019141161026837627, + "loss": 0.1523, + "step": 14319 + }, + { + "epoch": 0.1243044765236413, + "grad_norm": 0.66796875, + "learning_rate": 0.001914103413973136, + "loss": 0.168, + "step": 14320 + }, + { + "epoch": 0.12431315700384546, + "grad_norm": 0.703125, + "learning_rate": 0.0019140907243722185, + "loss": 0.1387, + "step": 14321 + }, + { + "epoch": 0.12432183748404962, + "grad_norm": 0.7109375, + "learning_rate": 0.0019140780338810237, + "loss": 0.1182, + "step": 14322 + }, + { + "epoch": 0.12433051796425379, + "grad_norm": 0.451171875, + "learning_rate": 0.0019140653424995651, + "loss": 0.167, + "step": 14323 + }, + { + "epoch": 0.12433919844445794, + "grad_norm": 0.1318359375, + "learning_rate": 0.001914052650227857, + "loss": 0.1152, + "step": 14324 + }, + { + "epoch": 0.1243478789246621, + "grad_norm": 0.984375, + "learning_rate": 0.001914039957065913, + "loss": 0.1025, + "step": 14325 + }, + { + "epoch": 0.12435655940486627, + "grad_norm": 0.08837890625, + "learning_rate": 0.0019140272630137475, + "loss": 0.0928, + "step": 14326 + }, + { + "epoch": 0.12436523988507044, + "grad_norm": 0.28515625, + "learning_rate": 0.0019140145680713737, + "loss": 0.0879, + "step": 14327 + }, + { + "epoch": 0.1243739203652746, + "grad_norm": 0.58203125, + "learning_rate": 0.0019140018722388057, + "loss": 0.1211, + "step": 14328 + }, + { + "epoch": 0.12438260084547877, + "grad_norm": 0.126953125, + "learning_rate": 0.001913989175516058, + "loss": 0.0996, + "step": 14329 + }, + { + "epoch": 0.12439128132568293, + 
"grad_norm": 0.201171875, + "learning_rate": 0.0019139764779031432, + "loss": 0.1309, + "step": 14330 + }, + { + "epoch": 0.1243999618058871, + "grad_norm": 2.546875, + "learning_rate": 0.0019139637794000765, + "loss": 0.1572, + "step": 14331 + }, + { + "epoch": 0.12440864228609126, + "grad_norm": 0.42578125, + "learning_rate": 0.0019139510800068708, + "loss": 0.1143, + "step": 14332 + }, + { + "epoch": 0.12441732276629543, + "grad_norm": 0.2001953125, + "learning_rate": 0.001913938379723541, + "loss": 0.168, + "step": 14333 + }, + { + "epoch": 0.12442600324649959, + "grad_norm": 0.703125, + "learning_rate": 0.0019139256785501004, + "loss": 0.1299, + "step": 14334 + }, + { + "epoch": 0.12443468372670376, + "grad_norm": 0.25, + "learning_rate": 0.0019139129764865624, + "loss": 0.1143, + "step": 14335 + }, + { + "epoch": 0.12444336420690792, + "grad_norm": 0.3671875, + "learning_rate": 0.0019139002735329414, + "loss": 0.1641, + "step": 14336 + }, + { + "epoch": 0.12445204468711209, + "grad_norm": 0.84765625, + "learning_rate": 0.0019138875696892517, + "loss": 0.1162, + "step": 14337 + }, + { + "epoch": 0.12446072516731625, + "grad_norm": 0.4609375, + "learning_rate": 0.0019138748649555066, + "loss": 0.1211, + "step": 14338 + }, + { + "epoch": 0.12446940564752042, + "grad_norm": 0.341796875, + "learning_rate": 0.0019138621593317202, + "loss": 0.1099, + "step": 14339 + }, + { + "epoch": 0.12447808612772458, + "grad_norm": 0.44921875, + "learning_rate": 0.0019138494528179065, + "loss": 0.1328, + "step": 14340 + }, + { + "epoch": 0.12448676660792875, + "grad_norm": 0.41796875, + "learning_rate": 0.0019138367454140791, + "loss": 0.21, + "step": 14341 + }, + { + "epoch": 0.12449544708813291, + "grad_norm": 0.2265625, + "learning_rate": 0.001913824037120252, + "loss": 0.1211, + "step": 14342 + }, + { + "epoch": 0.12450412756833708, + "grad_norm": 0.83984375, + "learning_rate": 0.0019138113279364394, + "loss": 0.1182, + "step": 14343 + }, + { + "epoch": 0.12451280804854124, + "grad_norm": 0.27734375, + "learning_rate": 0.0019137986178626547, + "loss": 0.1543, + "step": 14344 + }, + { + "epoch": 0.12452148852874541, + "grad_norm": 0.12060546875, + "learning_rate": 0.0019137859068989122, + "loss": 0.0952, + "step": 14345 + }, + { + "epoch": 0.12453016900894957, + "grad_norm": 0.8046875, + "learning_rate": 0.0019137731950452255, + "loss": 0.1348, + "step": 14346 + }, + { + "epoch": 0.12453884948915374, + "grad_norm": 0.52734375, + "learning_rate": 0.0019137604823016088, + "loss": 0.125, + "step": 14347 + }, + { + "epoch": 0.1245475299693579, + "grad_norm": 0.7890625, + "learning_rate": 0.001913747768668076, + "loss": 0.1177, + "step": 14348 + }, + { + "epoch": 0.12455621044956207, + "grad_norm": 0.53515625, + "learning_rate": 0.0019137350541446403, + "loss": 0.083, + "step": 14349 + }, + { + "epoch": 0.12456489092976623, + "grad_norm": 0.91796875, + "learning_rate": 0.001913722338731317, + "loss": 0.1328, + "step": 14350 + }, + { + "epoch": 0.1245735714099704, + "grad_norm": 0.6015625, + "learning_rate": 0.0019137096224281186, + "loss": 0.1094, + "step": 14351 + }, + { + "epoch": 0.12458225189017456, + "grad_norm": 0.298828125, + "learning_rate": 0.00191369690523506, + "loss": 0.1177, + "step": 14352 + }, + { + "epoch": 0.12459093237037873, + "grad_norm": 0.451171875, + "learning_rate": 0.0019136841871521541, + "loss": 0.0898, + "step": 14353 + }, + { + "epoch": 0.1245996128505829, + "grad_norm": 0.1083984375, + "learning_rate": 0.0019136714681794158, + "loss": 0.084, + "step": 14354 + }, + { + 
"epoch": 0.12460829333078706, + "grad_norm": 0.0947265625, + "learning_rate": 0.0019136587483168585, + "loss": 0.1006, + "step": 14355 + }, + { + "epoch": 0.12461697381099122, + "grad_norm": 0.36328125, + "learning_rate": 0.0019136460275644962, + "loss": 0.0981, + "step": 14356 + }, + { + "epoch": 0.12462565429119539, + "grad_norm": 0.1123046875, + "learning_rate": 0.0019136333059223429, + "loss": 0.0825, + "step": 14357 + }, + { + "epoch": 0.12463433477139955, + "grad_norm": 0.09130859375, + "learning_rate": 0.0019136205833904126, + "loss": 0.103, + "step": 14358 + }, + { + "epoch": 0.12464301525160372, + "grad_norm": 0.76171875, + "learning_rate": 0.001913607859968719, + "loss": 0.1172, + "step": 14359 + }, + { + "epoch": 0.12465169573180788, + "grad_norm": 0.408203125, + "learning_rate": 0.001913595135657276, + "loss": 0.1016, + "step": 14360 + }, + { + "epoch": 0.12466037621201205, + "grad_norm": 0.318359375, + "learning_rate": 0.0019135824104560973, + "loss": 0.0879, + "step": 14361 + }, + { + "epoch": 0.12466905669221622, + "grad_norm": 0.2353515625, + "learning_rate": 0.0019135696843651975, + "loss": 0.124, + "step": 14362 + }, + { + "epoch": 0.12467773717242038, + "grad_norm": 0.25390625, + "learning_rate": 0.0019135569573845899, + "loss": 0.083, + "step": 14363 + }, + { + "epoch": 0.12468641765262455, + "grad_norm": 0.44921875, + "learning_rate": 0.001913544229514289, + "loss": 0.1562, + "step": 14364 + }, + { + "epoch": 0.12469509813282871, + "grad_norm": 0.54296875, + "learning_rate": 0.0019135315007543078, + "loss": 0.1104, + "step": 14365 + }, + { + "epoch": 0.12470377861303288, + "grad_norm": 0.7109375, + "learning_rate": 0.001913518771104661, + "loss": 0.0811, + "step": 14366 + }, + { + "epoch": 0.12471245909323704, + "grad_norm": 0.10595703125, + "learning_rate": 0.0019135060405653624, + "loss": 0.0771, + "step": 14367 + }, + { + "epoch": 0.1247211395734412, + "grad_norm": 0.384765625, + "learning_rate": 0.0019134933091364257, + "loss": 0.1201, + "step": 14368 + }, + { + "epoch": 0.12472982005364537, + "grad_norm": 1.09375, + "learning_rate": 0.001913480576817865, + "loss": 0.125, + "step": 14369 + }, + { + "epoch": 0.12473850053384954, + "grad_norm": 0.1337890625, + "learning_rate": 0.001913467843609694, + "loss": 0.1094, + "step": 14370 + }, + { + "epoch": 0.1247471810140537, + "grad_norm": 0.2255859375, + "learning_rate": 0.001913455109511927, + "loss": 0.1133, + "step": 14371 + }, + { + "epoch": 0.12475586149425787, + "grad_norm": 0.3359375, + "learning_rate": 0.0019134423745245774, + "loss": 0.1104, + "step": 14372 + }, + { + "epoch": 0.12476454197446203, + "grad_norm": 0.365234375, + "learning_rate": 0.0019134296386476596, + "loss": 0.0884, + "step": 14373 + }, + { + "epoch": 0.1247732224546662, + "grad_norm": 0.11767578125, + "learning_rate": 0.0019134169018811878, + "loss": 0.126, + "step": 14374 + }, + { + "epoch": 0.12478190293487036, + "grad_norm": 0.3515625, + "learning_rate": 0.001913404164225175, + "loss": 0.1221, + "step": 14375 + }, + { + "epoch": 0.12479058341507453, + "grad_norm": 0.1240234375, + "learning_rate": 0.0019133914256796356, + "loss": 0.1016, + "step": 14376 + }, + { + "epoch": 0.12479926389527869, + "grad_norm": 0.498046875, + "learning_rate": 0.001913378686244584, + "loss": 0.0957, + "step": 14377 + }, + { + "epoch": 0.12480794437548286, + "grad_norm": 0.4375, + "learning_rate": 0.0019133659459200335, + "loss": 0.1416, + "step": 14378 + }, + { + "epoch": 0.12481662485568702, + "grad_norm": 0.1845703125, + "learning_rate": 0.001913353204705998, 
+ "loss": 0.1094, + "step": 14379 + }, + { + "epoch": 0.12482530533589119, + "grad_norm": 0.1416015625, + "learning_rate": 0.0019133404626024918, + "loss": 0.0854, + "step": 14380 + }, + { + "epoch": 0.12483398581609535, + "grad_norm": 0.36328125, + "learning_rate": 0.0019133277196095285, + "loss": 0.1069, + "step": 14381 + }, + { + "epoch": 0.12484266629629952, + "grad_norm": 0.466796875, + "learning_rate": 0.0019133149757271227, + "loss": 0.1123, + "step": 14382 + }, + { + "epoch": 0.12485134677650368, + "grad_norm": 0.30859375, + "learning_rate": 0.0019133022309552874, + "loss": 0.1094, + "step": 14383 + }, + { + "epoch": 0.12486002725670785, + "grad_norm": 0.45703125, + "learning_rate": 0.0019132894852940373, + "loss": 0.1328, + "step": 14384 + }, + { + "epoch": 0.124868707736912, + "grad_norm": 0.1875, + "learning_rate": 0.001913276738743386, + "loss": 0.1113, + "step": 14385 + }, + { + "epoch": 0.12487738821711616, + "grad_norm": 0.373046875, + "learning_rate": 0.0019132639913033473, + "loss": 0.1455, + "step": 14386 + }, + { + "epoch": 0.12488606869732033, + "grad_norm": 0.39453125, + "learning_rate": 0.0019132512429739354, + "loss": 0.0889, + "step": 14387 + }, + { + "epoch": 0.1248947491775245, + "grad_norm": 0.404296875, + "learning_rate": 0.0019132384937551642, + "loss": 0.1094, + "step": 14388 + }, + { + "epoch": 0.12490342965772866, + "grad_norm": 0.4140625, + "learning_rate": 0.0019132257436470475, + "loss": 0.1128, + "step": 14389 + }, + { + "epoch": 0.12491211013793282, + "grad_norm": 0.11474609375, + "learning_rate": 0.0019132129926495995, + "loss": 0.1128, + "step": 14390 + }, + { + "epoch": 0.12492079061813699, + "grad_norm": 1.0234375, + "learning_rate": 0.0019132002407628338, + "loss": 0.1396, + "step": 14391 + }, + { + "epoch": 0.12492947109834115, + "grad_norm": 0.65234375, + "learning_rate": 0.0019131874879867645, + "loss": 0.1562, + "step": 14392 + }, + { + "epoch": 0.12493815157854532, + "grad_norm": 0.63671875, + "learning_rate": 0.001913174734321406, + "loss": 0.1885, + "step": 14393 + }, + { + "epoch": 0.12494683205874949, + "grad_norm": 0.111328125, + "learning_rate": 0.0019131619797667716, + "loss": 0.1348, + "step": 14394 + }, + { + "epoch": 0.12495551253895365, + "grad_norm": 0.291015625, + "learning_rate": 0.0019131492243228752, + "loss": 0.0977, + "step": 14395 + }, + { + "epoch": 0.12496419301915782, + "grad_norm": 1.0625, + "learning_rate": 0.0019131364679897313, + "loss": 0.083, + "step": 14396 + }, + { + "epoch": 0.12497287349936198, + "grad_norm": 0.1064453125, + "learning_rate": 0.0019131237107673537, + "loss": 0.106, + "step": 14397 + }, + { + "epoch": 0.12498155397956615, + "grad_norm": 0.3203125, + "learning_rate": 0.001913110952655756, + "loss": 0.1455, + "step": 14398 + }, + { + "epoch": 0.12499023445977031, + "grad_norm": 0.1220703125, + "learning_rate": 0.0019130981936549523, + "loss": 0.1235, + "step": 14399 + }, + { + "epoch": 0.12499891493997448, + "grad_norm": 0.162109375, + "learning_rate": 0.001913085433764957, + "loss": 0.1279, + "step": 14400 + }, + { + "epoch": 0.12500759542017864, + "grad_norm": 0.1611328125, + "learning_rate": 0.0019130726729857835, + "loss": 0.1143, + "step": 14401 + }, + { + "epoch": 0.1250162759003828, + "grad_norm": 0.10009765625, + "learning_rate": 0.0019130599113174457, + "loss": 0.1162, + "step": 14402 + }, + { + "epoch": 0.12502495638058697, + "grad_norm": 0.51953125, + "learning_rate": 0.001913047148759958, + "loss": 0.1064, + "step": 14403 + }, + { + "epoch": 0.12503363686079114, + "grad_norm": 
0.458984375, + "learning_rate": 0.0019130343853133343, + "loss": 0.1201, + "step": 14404 + }, + { + "epoch": 0.1250423173409953, + "grad_norm": 0.384765625, + "learning_rate": 0.0019130216209775881, + "loss": 0.1113, + "step": 14405 + }, + { + "epoch": 0.12505099782119947, + "grad_norm": 0.1552734375, + "learning_rate": 0.0019130088557527338, + "loss": 0.125, + "step": 14406 + }, + { + "epoch": 0.12505967830140363, + "grad_norm": 0.1884765625, + "learning_rate": 0.0019129960896387855, + "loss": 0.0947, + "step": 14407 + }, + { + "epoch": 0.1250683587816078, + "grad_norm": 0.1796875, + "learning_rate": 0.0019129833226357565, + "loss": 0.1191, + "step": 14408 + }, + { + "epoch": 0.12507703926181196, + "grad_norm": 0.94921875, + "learning_rate": 0.0019129705547436613, + "loss": 0.0967, + "step": 14409 + }, + { + "epoch": 0.12508571974201613, + "grad_norm": 0.63671875, + "learning_rate": 0.0019129577859625137, + "loss": 0.1113, + "step": 14410 + }, + { + "epoch": 0.1250944002222203, + "grad_norm": 0.447265625, + "learning_rate": 0.0019129450162923276, + "loss": 0.0776, + "step": 14411 + }, + { + "epoch": 0.12510308070242446, + "grad_norm": 0.2236328125, + "learning_rate": 0.0019129322457331174, + "loss": 0.0898, + "step": 14412 + }, + { + "epoch": 0.12511176118262862, + "grad_norm": 0.267578125, + "learning_rate": 0.0019129194742848966, + "loss": 0.0957, + "step": 14413 + }, + { + "epoch": 0.1251204416628328, + "grad_norm": 0.58203125, + "learning_rate": 0.0019129067019476788, + "loss": 0.1436, + "step": 14414 + }, + { + "epoch": 0.12512912214303695, + "grad_norm": 0.1357421875, + "learning_rate": 0.001912893928721479, + "loss": 0.1445, + "step": 14415 + }, + { + "epoch": 0.12513780262324112, + "grad_norm": 0.61328125, + "learning_rate": 0.0019128811546063103, + "loss": 0.085, + "step": 14416 + }, + { + "epoch": 0.12514648310344528, + "grad_norm": 0.72265625, + "learning_rate": 0.001912868379602187, + "loss": 0.1602, + "step": 14417 + }, + { + "epoch": 0.12515516358364945, + "grad_norm": 0.310546875, + "learning_rate": 0.0019128556037091233, + "loss": 0.1123, + "step": 14418 + }, + { + "epoch": 0.1251638440638536, + "grad_norm": 0.1376953125, + "learning_rate": 0.0019128428269271326, + "loss": 0.1211, + "step": 14419 + }, + { + "epoch": 0.12517252454405778, + "grad_norm": 0.96484375, + "learning_rate": 0.0019128300492562294, + "loss": 0.1221, + "step": 14420 + }, + { + "epoch": 0.12518120502426194, + "grad_norm": 0.423828125, + "learning_rate": 0.0019128172706964273, + "loss": 0.1191, + "step": 14421 + }, + { + "epoch": 0.1251898855044661, + "grad_norm": 0.921875, + "learning_rate": 0.0019128044912477408, + "loss": 0.0928, + "step": 14422 + }, + { + "epoch": 0.12519856598467027, + "grad_norm": 0.265625, + "learning_rate": 0.0019127917109101832, + "loss": 0.1021, + "step": 14423 + }, + { + "epoch": 0.12520724646487444, + "grad_norm": 0.1484375, + "learning_rate": 0.001912778929683769, + "loss": 0.0977, + "step": 14424 + }, + { + "epoch": 0.1252159269450786, + "grad_norm": 0.384765625, + "learning_rate": 0.0019127661475685117, + "loss": 0.127, + "step": 14425 + }, + { + "epoch": 0.12522460742528277, + "grad_norm": 0.302734375, + "learning_rate": 0.0019127533645644258, + "loss": 0.1299, + "step": 14426 + }, + { + "epoch": 0.12523328790548693, + "grad_norm": 0.298828125, + "learning_rate": 0.0019127405806715248, + "loss": 0.1191, + "step": 14427 + }, + { + "epoch": 0.1252419683856911, + "grad_norm": 0.1318359375, + "learning_rate": 0.0019127277958898231, + "loss": 0.1406, + "step": 14428 + }, + { 
+ "epoch": 0.12525064886589526, + "grad_norm": 0.8828125, + "learning_rate": 0.0019127150102193344, + "loss": 0.2129, + "step": 14429 + }, + { + "epoch": 0.12525932934609943, + "grad_norm": 0.11279296875, + "learning_rate": 0.001912702223660073, + "loss": 0.1133, + "step": 14430 + }, + { + "epoch": 0.1252680098263036, + "grad_norm": 0.421875, + "learning_rate": 0.0019126894362120523, + "loss": 0.1006, + "step": 14431 + }, + { + "epoch": 0.12527669030650776, + "grad_norm": 0.7890625, + "learning_rate": 0.001912676647875287, + "loss": 0.1963, + "step": 14432 + }, + { + "epoch": 0.12528537078671192, + "grad_norm": 0.142578125, + "learning_rate": 0.0019126638586497904, + "loss": 0.1182, + "step": 14433 + }, + { + "epoch": 0.1252940512669161, + "grad_norm": 0.10009765625, + "learning_rate": 0.001912651068535577, + "loss": 0.0923, + "step": 14434 + }, + { + "epoch": 0.12530273174712026, + "grad_norm": 0.2177734375, + "learning_rate": 0.0019126382775326604, + "loss": 0.1182, + "step": 14435 + }, + { + "epoch": 0.12531141222732442, + "grad_norm": 0.2275390625, + "learning_rate": 0.0019126254856410554, + "loss": 0.1089, + "step": 14436 + }, + { + "epoch": 0.12532009270752859, + "grad_norm": 0.453125, + "learning_rate": 0.0019126126928607746, + "loss": 0.1055, + "step": 14437 + }, + { + "epoch": 0.12532877318773275, + "grad_norm": 0.43359375, + "learning_rate": 0.0019125998991918333, + "loss": 0.0825, + "step": 14438 + }, + { + "epoch": 0.12533745366793692, + "grad_norm": 0.130859375, + "learning_rate": 0.0019125871046342447, + "loss": 0.1235, + "step": 14439 + }, + { + "epoch": 0.12534613414814108, + "grad_norm": 0.1689453125, + "learning_rate": 0.0019125743091880233, + "loss": 0.082, + "step": 14440 + }, + { + "epoch": 0.12535481462834525, + "grad_norm": 0.279296875, + "learning_rate": 0.0019125615128531826, + "loss": 0.1035, + "step": 14441 + }, + { + "epoch": 0.1253634951085494, + "grad_norm": 0.1044921875, + "learning_rate": 0.001912548715629737, + "loss": 0.1167, + "step": 14442 + }, + { + "epoch": 0.12537217558875358, + "grad_norm": 0.09765625, + "learning_rate": 0.0019125359175177, + "loss": 0.1328, + "step": 14443 + }, + { + "epoch": 0.12538085606895774, + "grad_norm": 0.27734375, + "learning_rate": 0.001912523118517086, + "loss": 0.123, + "step": 14444 + }, + { + "epoch": 0.1253895365491619, + "grad_norm": 0.2119140625, + "learning_rate": 0.0019125103186279093, + "loss": 0.0928, + "step": 14445 + }, + { + "epoch": 0.12539821702936607, + "grad_norm": 0.451171875, + "learning_rate": 0.0019124975178501835, + "loss": 0.1738, + "step": 14446 + }, + { + "epoch": 0.12540689750957024, + "grad_norm": 0.0947265625, + "learning_rate": 0.0019124847161839226, + "loss": 0.1064, + "step": 14447 + }, + { + "epoch": 0.1254155779897744, + "grad_norm": 0.58203125, + "learning_rate": 0.0019124719136291402, + "loss": 0.1426, + "step": 14448 + }, + { + "epoch": 0.12542425846997857, + "grad_norm": 0.3046875, + "learning_rate": 0.001912459110185851, + "loss": 0.0952, + "step": 14449 + }, + { + "epoch": 0.12543293895018273, + "grad_norm": 0.4375, + "learning_rate": 0.0019124463058540686, + "loss": 0.1108, + "step": 14450 + }, + { + "epoch": 0.1254416194303869, + "grad_norm": 0.51953125, + "learning_rate": 0.0019124335006338073, + "loss": 0.1523, + "step": 14451 + }, + { + "epoch": 0.12545029991059106, + "grad_norm": 0.1962890625, + "learning_rate": 0.0019124206945250809, + "loss": 0.0859, + "step": 14452 + }, + { + "epoch": 0.12545898039079523, + "grad_norm": 0.162109375, + "learning_rate": 0.0019124078875279035, 
+ "loss": 0.1318, + "step": 14453 + }, + { + "epoch": 0.1254676608709994, + "grad_norm": 0.078125, + "learning_rate": 0.001912395079642289, + "loss": 0.0928, + "step": 14454 + }, + { + "epoch": 0.12547634135120356, + "grad_norm": 0.4609375, + "learning_rate": 0.001912382270868251, + "loss": 0.1309, + "step": 14455 + }, + { + "epoch": 0.12548502183140772, + "grad_norm": 0.265625, + "learning_rate": 0.0019123694612058046, + "loss": 0.0938, + "step": 14456 + }, + { + "epoch": 0.1254937023116119, + "grad_norm": 0.162109375, + "learning_rate": 0.0019123566506549629, + "loss": 0.1016, + "step": 14457 + }, + { + "epoch": 0.12550238279181605, + "grad_norm": 0.0908203125, + "learning_rate": 0.00191234383921574, + "loss": 0.0967, + "step": 14458 + }, + { + "epoch": 0.12551106327202022, + "grad_norm": 0.240234375, + "learning_rate": 0.0019123310268881503, + "loss": 0.1187, + "step": 14459 + }, + { + "epoch": 0.12551974375222438, + "grad_norm": 0.30859375, + "learning_rate": 0.0019123182136722074, + "loss": 0.1387, + "step": 14460 + }, + { + "epoch": 0.12552842423242855, + "grad_norm": 0.126953125, + "learning_rate": 0.0019123053995679258, + "loss": 0.103, + "step": 14461 + }, + { + "epoch": 0.1255371047126327, + "grad_norm": 0.283203125, + "learning_rate": 0.0019122925845753193, + "loss": 0.1504, + "step": 14462 + }, + { + "epoch": 0.12554578519283688, + "grad_norm": 0.2451171875, + "learning_rate": 0.0019122797686944013, + "loss": 0.1221, + "step": 14463 + }, + { + "epoch": 0.12555446567304104, + "grad_norm": 0.361328125, + "learning_rate": 0.0019122669519251868, + "loss": 0.103, + "step": 14464 + }, + { + "epoch": 0.1255631461532452, + "grad_norm": 0.166015625, + "learning_rate": 0.0019122541342676894, + "loss": 0.127, + "step": 14465 + }, + { + "epoch": 0.12557182663344937, + "grad_norm": 0.431640625, + "learning_rate": 0.0019122413157219229, + "loss": 0.1357, + "step": 14466 + }, + { + "epoch": 0.12558050711365354, + "grad_norm": 0.291015625, + "learning_rate": 0.0019122284962879014, + "loss": 0.1514, + "step": 14467 + }, + { + "epoch": 0.1255891875938577, + "grad_norm": 0.58203125, + "learning_rate": 0.001912215675965639, + "loss": 0.1211, + "step": 14468 + }, + { + "epoch": 0.12559786807406187, + "grad_norm": 0.1279296875, + "learning_rate": 0.0019122028547551497, + "loss": 0.1748, + "step": 14469 + }, + { + "epoch": 0.12560654855426603, + "grad_norm": 0.392578125, + "learning_rate": 0.0019121900326564479, + "loss": 0.1221, + "step": 14470 + }, + { + "epoch": 0.1256152290344702, + "grad_norm": 0.4375, + "learning_rate": 0.001912177209669547, + "loss": 0.165, + "step": 14471 + }, + { + "epoch": 0.12562390951467436, + "grad_norm": 0.53515625, + "learning_rate": 0.0019121643857944617, + "loss": 0.1299, + "step": 14472 + }, + { + "epoch": 0.12563258999487853, + "grad_norm": 0.1611328125, + "learning_rate": 0.0019121515610312052, + "loss": 0.1484, + "step": 14473 + }, + { + "epoch": 0.1256412704750827, + "grad_norm": 1.296875, + "learning_rate": 0.0019121387353797923, + "loss": 0.1348, + "step": 14474 + }, + { + "epoch": 0.12564995095528686, + "grad_norm": 0.1845703125, + "learning_rate": 0.0019121259088402363, + "loss": 0.1157, + "step": 14475 + }, + { + "epoch": 0.12565863143549102, + "grad_norm": 0.1416015625, + "learning_rate": 0.0019121130814125518, + "loss": 0.1011, + "step": 14476 + }, + { + "epoch": 0.1256673119156952, + "grad_norm": 0.08984375, + "learning_rate": 0.0019121002530967527, + "loss": 0.127, + "step": 14477 + }, + { + "epoch": 0.12567599239589933, + "grad_norm": 0.8203125, + 
"learning_rate": 0.001912087423892853, + "loss": 0.1025, + "step": 14478 + }, + { + "epoch": 0.1256846728761035, + "grad_norm": 0.11572265625, + "learning_rate": 0.0019120745938008663, + "loss": 0.0698, + "step": 14479 + }, + { + "epoch": 0.12569335335630766, + "grad_norm": 0.099609375, + "learning_rate": 0.0019120617628208077, + "loss": 0.1387, + "step": 14480 + }, + { + "epoch": 0.12570203383651182, + "grad_norm": 0.48828125, + "learning_rate": 0.00191204893095269, + "loss": 0.127, + "step": 14481 + }, + { + "epoch": 0.125710714316716, + "grad_norm": 0.177734375, + "learning_rate": 0.001912036098196528, + "loss": 0.1748, + "step": 14482 + }, + { + "epoch": 0.12571939479692015, + "grad_norm": 0.2431640625, + "learning_rate": 0.0019120232645523354, + "loss": 0.1309, + "step": 14483 + }, + { + "epoch": 0.12572807527712432, + "grad_norm": 0.125, + "learning_rate": 0.0019120104300201267, + "loss": 0.1348, + "step": 14484 + }, + { + "epoch": 0.12573675575732848, + "grad_norm": 0.53515625, + "learning_rate": 0.001911997594599915, + "loss": 0.1299, + "step": 14485 + }, + { + "epoch": 0.12574543623753265, + "grad_norm": 0.224609375, + "learning_rate": 0.0019119847582917153, + "loss": 0.0913, + "step": 14486 + }, + { + "epoch": 0.1257541167177368, + "grad_norm": 0.453125, + "learning_rate": 0.0019119719210955414, + "loss": 0.1348, + "step": 14487 + }, + { + "epoch": 0.12576279719794098, + "grad_norm": 0.1123046875, + "learning_rate": 0.001911959083011407, + "loss": 0.1025, + "step": 14488 + }, + { + "epoch": 0.12577147767814514, + "grad_norm": 0.3984375, + "learning_rate": 0.0019119462440393262, + "loss": 0.1641, + "step": 14489 + }, + { + "epoch": 0.1257801581583493, + "grad_norm": 0.1484375, + "learning_rate": 0.0019119334041793135, + "loss": 0.0635, + "step": 14490 + }, + { + "epoch": 0.12578883863855347, + "grad_norm": 0.498046875, + "learning_rate": 0.0019119205634313826, + "loss": 0.1064, + "step": 14491 + }, + { + "epoch": 0.12579751911875764, + "grad_norm": 0.10888671875, + "learning_rate": 0.0019119077217955474, + "loss": 0.1416, + "step": 14492 + }, + { + "epoch": 0.1258061995989618, + "grad_norm": 0.27734375, + "learning_rate": 0.0019118948792718222, + "loss": 0.1182, + "step": 14493 + }, + { + "epoch": 0.12581488007916597, + "grad_norm": 0.423828125, + "learning_rate": 0.0019118820358602212, + "loss": 0.0854, + "step": 14494 + }, + { + "epoch": 0.12582356055937013, + "grad_norm": 0.1259765625, + "learning_rate": 0.0019118691915607578, + "loss": 0.1084, + "step": 14495 + }, + { + "epoch": 0.1258322410395743, + "grad_norm": 0.2470703125, + "learning_rate": 0.0019118563463734464, + "loss": 0.1191, + "step": 14496 + }, + { + "epoch": 0.12584092151977846, + "grad_norm": 0.259765625, + "learning_rate": 0.0019118435002983016, + "loss": 0.1206, + "step": 14497 + }, + { + "epoch": 0.12584960199998263, + "grad_norm": 0.2578125, + "learning_rate": 0.0019118306533353368, + "loss": 0.1348, + "step": 14498 + }, + { + "epoch": 0.1258582824801868, + "grad_norm": 0.2109375, + "learning_rate": 0.0019118178054845657, + "loss": 0.0957, + "step": 14499 + }, + { + "epoch": 0.12586696296039096, + "grad_norm": 0.2119140625, + "learning_rate": 0.0019118049567460034, + "loss": 0.1299, + "step": 14500 + }, + { + "epoch": 0.12587564344059513, + "grad_norm": 0.259765625, + "learning_rate": 0.0019117921071196632, + "loss": 0.1621, + "step": 14501 + }, + { + "epoch": 0.1258843239207993, + "grad_norm": 0.294921875, + "learning_rate": 0.0019117792566055592, + "loss": 0.1069, + "step": 14502 + }, + { + "epoch": 
0.12589300440100346, + "grad_norm": 0.140625, + "learning_rate": 0.0019117664052037057, + "loss": 0.1162, + "step": 14503 + }, + { + "epoch": 0.12590168488120762, + "grad_norm": 0.154296875, + "learning_rate": 0.0019117535529141168, + "loss": 0.1328, + "step": 14504 + }, + { + "epoch": 0.12591036536141179, + "grad_norm": 0.1689453125, + "learning_rate": 0.0019117406997368064, + "loss": 0.127, + "step": 14505 + }, + { + "epoch": 0.12591904584161595, + "grad_norm": 0.1533203125, + "learning_rate": 0.0019117278456717885, + "loss": 0.1416, + "step": 14506 + }, + { + "epoch": 0.12592772632182012, + "grad_norm": 0.310546875, + "learning_rate": 0.0019117149907190772, + "loss": 0.126, + "step": 14507 + }, + { + "epoch": 0.12593640680202428, + "grad_norm": 0.1357421875, + "learning_rate": 0.0019117021348786865, + "loss": 0.1592, + "step": 14508 + }, + { + "epoch": 0.12594508728222845, + "grad_norm": 0.57421875, + "learning_rate": 0.0019116892781506306, + "loss": 0.1621, + "step": 14509 + }, + { + "epoch": 0.1259537677624326, + "grad_norm": 0.1298828125, + "learning_rate": 0.0019116764205349235, + "loss": 0.1289, + "step": 14510 + }, + { + "epoch": 0.12596244824263678, + "grad_norm": 0.57421875, + "learning_rate": 0.0019116635620315793, + "loss": 0.1367, + "step": 14511 + }, + { + "epoch": 0.12597112872284094, + "grad_norm": 0.5, + "learning_rate": 0.0019116507026406121, + "loss": 0.1011, + "step": 14512 + }, + { + "epoch": 0.1259798092030451, + "grad_norm": 0.2041015625, + "learning_rate": 0.0019116378423620355, + "loss": 0.1152, + "step": 14513 + }, + { + "epoch": 0.12598848968324927, + "grad_norm": 0.1748046875, + "learning_rate": 0.0019116249811958644, + "loss": 0.1035, + "step": 14514 + }, + { + "epoch": 0.12599717016345344, + "grad_norm": 0.294921875, + "learning_rate": 0.0019116121191421122, + "loss": 0.1084, + "step": 14515 + }, + { + "epoch": 0.1260058506436576, + "grad_norm": 0.56640625, + "learning_rate": 0.0019115992562007934, + "loss": 0.1523, + "step": 14516 + }, + { + "epoch": 0.12601453112386177, + "grad_norm": 0.220703125, + "learning_rate": 0.0019115863923719217, + "loss": 0.1328, + "step": 14517 + }, + { + "epoch": 0.12602321160406593, + "grad_norm": 0.255859375, + "learning_rate": 0.001911573527655511, + "loss": 0.123, + "step": 14518 + }, + { + "epoch": 0.1260318920842701, + "grad_norm": 0.482421875, + "learning_rate": 0.0019115606620515762, + "loss": 0.1055, + "step": 14519 + }, + { + "epoch": 0.12604057256447426, + "grad_norm": 0.373046875, + "learning_rate": 0.0019115477955601306, + "loss": 0.1338, + "step": 14520 + }, + { + "epoch": 0.12604925304467843, + "grad_norm": 0.2138671875, + "learning_rate": 0.0019115349281811887, + "loss": 0.105, + "step": 14521 + }, + { + "epoch": 0.1260579335248826, + "grad_norm": 0.27734375, + "learning_rate": 0.0019115220599147643, + "loss": 0.1621, + "step": 14522 + }, + { + "epoch": 0.12606661400508676, + "grad_norm": 0.310546875, + "learning_rate": 0.0019115091907608713, + "loss": 0.1123, + "step": 14523 + }, + { + "epoch": 0.12607529448529092, + "grad_norm": 0.1611328125, + "learning_rate": 0.0019114963207195244, + "loss": 0.1069, + "step": 14524 + }, + { + "epoch": 0.1260839749654951, + "grad_norm": 0.400390625, + "learning_rate": 0.0019114834497907373, + "loss": 0.1094, + "step": 14525 + }, + { + "epoch": 0.12609265544569925, + "grad_norm": 0.09619140625, + "learning_rate": 0.001911470577974524, + "loss": 0.1187, + "step": 14526 + }, + { + "epoch": 0.12610133592590342, + "grad_norm": 0.9140625, + "learning_rate": 0.0019114577052708984, + 
"loss": 0.1465, + "step": 14527 + }, + { + "epoch": 0.12611001640610758, + "grad_norm": 0.3046875, + "learning_rate": 0.0019114448316798751, + "loss": 0.0874, + "step": 14528 + }, + { + "epoch": 0.12611869688631175, + "grad_norm": 0.1767578125, + "learning_rate": 0.001911431957201468, + "loss": 0.083, + "step": 14529 + }, + { + "epoch": 0.1261273773665159, + "grad_norm": 0.421875, + "learning_rate": 0.0019114190818356911, + "loss": 0.1523, + "step": 14530 + }, + { + "epoch": 0.12613605784672008, + "grad_norm": 0.279296875, + "learning_rate": 0.0019114062055825582, + "loss": 0.126, + "step": 14531 + }, + { + "epoch": 0.12614473832692424, + "grad_norm": 0.62890625, + "learning_rate": 0.0019113933284420838, + "loss": 0.1172, + "step": 14532 + }, + { + "epoch": 0.1261534188071284, + "grad_norm": 1.296875, + "learning_rate": 0.001911380450414282, + "loss": 0.1289, + "step": 14533 + }, + { + "epoch": 0.12616209928733257, + "grad_norm": 0.7109375, + "learning_rate": 0.001911367571499167, + "loss": 0.1191, + "step": 14534 + }, + { + "epoch": 0.12617077976753674, + "grad_norm": 0.2021484375, + "learning_rate": 0.0019113546916967518, + "loss": 0.1406, + "step": 14535 + }, + { + "epoch": 0.1261794602477409, + "grad_norm": 0.244140625, + "learning_rate": 0.0019113418110070519, + "loss": 0.1162, + "step": 14536 + }, + { + "epoch": 0.12618814072794507, + "grad_norm": 0.15234375, + "learning_rate": 0.0019113289294300808, + "loss": 0.1699, + "step": 14537 + }, + { + "epoch": 0.12619682120814923, + "grad_norm": 0.349609375, + "learning_rate": 0.0019113160469658524, + "loss": 0.1143, + "step": 14538 + }, + { + "epoch": 0.1262055016883534, + "grad_norm": 0.29296875, + "learning_rate": 0.001911303163614381, + "loss": 0.1289, + "step": 14539 + }, + { + "epoch": 0.12621418216855756, + "grad_norm": 0.70703125, + "learning_rate": 0.0019112902793756805, + "loss": 0.1328, + "step": 14540 + }, + { + "epoch": 0.12622286264876173, + "grad_norm": 0.10400390625, + "learning_rate": 0.0019112773942497655, + "loss": 0.1055, + "step": 14541 + }, + { + "epoch": 0.1262315431289659, + "grad_norm": 0.51171875, + "learning_rate": 0.0019112645082366494, + "loss": 0.1299, + "step": 14542 + }, + { + "epoch": 0.12624022360917006, + "grad_norm": 0.1845703125, + "learning_rate": 0.0019112516213363464, + "loss": 0.1064, + "step": 14543 + }, + { + "epoch": 0.12624890408937423, + "grad_norm": 0.421875, + "learning_rate": 0.0019112387335488712, + "loss": 0.1113, + "step": 14544 + }, + { + "epoch": 0.1262575845695784, + "grad_norm": 0.21875, + "learning_rate": 0.0019112258448742377, + "loss": 0.127, + "step": 14545 + }, + { + "epoch": 0.12626626504978256, + "grad_norm": 0.171875, + "learning_rate": 0.0019112129553124596, + "loss": 0.0957, + "step": 14546 + }, + { + "epoch": 0.12627494552998672, + "grad_norm": 0.34375, + "learning_rate": 0.0019112000648635512, + "loss": 0.1084, + "step": 14547 + }, + { + "epoch": 0.12628362601019089, + "grad_norm": 0.78125, + "learning_rate": 0.0019111871735275265, + "loss": 0.1475, + "step": 14548 + }, + { + "epoch": 0.12629230649039505, + "grad_norm": 0.349609375, + "learning_rate": 0.0019111742813043998, + "loss": 0.085, + "step": 14549 + }, + { + "epoch": 0.12630098697059922, + "grad_norm": 0.11328125, + "learning_rate": 0.001911161388194185, + "loss": 0.1074, + "step": 14550 + }, + { + "epoch": 0.12630966745080338, + "grad_norm": 0.150390625, + "learning_rate": 0.0019111484941968962, + "loss": 0.1025, + "step": 14551 + }, + { + "epoch": 0.12631834793100755, + "grad_norm": 0.375, + "learning_rate": 
0.001911135599312548, + "loss": 0.0879, + "step": 14552 + }, + { + "epoch": 0.1263270284112117, + "grad_norm": 0.796875, + "learning_rate": 0.0019111227035411536, + "loss": 0.0996, + "step": 14553 + }, + { + "epoch": 0.12633570889141588, + "grad_norm": 0.73828125, + "learning_rate": 0.001911109806882728, + "loss": 0.1172, + "step": 14554 + }, + { + "epoch": 0.12634438937162004, + "grad_norm": 0.1064453125, + "learning_rate": 0.0019110969093372844, + "loss": 0.0962, + "step": 14555 + }, + { + "epoch": 0.1263530698518242, + "grad_norm": 0.7109375, + "learning_rate": 0.001911084010904838, + "loss": 0.1152, + "step": 14556 + }, + { + "epoch": 0.12636175033202837, + "grad_norm": 0.412109375, + "learning_rate": 0.0019110711115854018, + "loss": 0.0835, + "step": 14557 + }, + { + "epoch": 0.12637043081223254, + "grad_norm": 0.1064453125, + "learning_rate": 0.0019110582113789907, + "loss": 0.1172, + "step": 14558 + }, + { + "epoch": 0.1263791112924367, + "grad_norm": 0.1201171875, + "learning_rate": 0.0019110453102856187, + "loss": 0.1406, + "step": 14559 + }, + { + "epoch": 0.12638779177264087, + "grad_norm": 0.2470703125, + "learning_rate": 0.0019110324083052995, + "loss": 0.1016, + "step": 14560 + }, + { + "epoch": 0.12639647225284503, + "grad_norm": 0.48828125, + "learning_rate": 0.0019110195054380474, + "loss": 0.1357, + "step": 14561 + }, + { + "epoch": 0.1264051527330492, + "grad_norm": 0.29296875, + "learning_rate": 0.0019110066016838768, + "loss": 0.1211, + "step": 14562 + }, + { + "epoch": 0.12641383321325336, + "grad_norm": 0.2236328125, + "learning_rate": 0.0019109936970428012, + "loss": 0.0879, + "step": 14563 + }, + { + "epoch": 0.12642251369345753, + "grad_norm": 0.2099609375, + "learning_rate": 0.001910980791514835, + "loss": 0.0923, + "step": 14564 + }, + { + "epoch": 0.1264311941736617, + "grad_norm": 0.08740234375, + "learning_rate": 0.0019109678850999928, + "loss": 0.125, + "step": 14565 + }, + { + "epoch": 0.12643987465386586, + "grad_norm": 0.201171875, + "learning_rate": 0.0019109549777982884, + "loss": 0.1113, + "step": 14566 + }, + { + "epoch": 0.12644855513407002, + "grad_norm": 0.60546875, + "learning_rate": 0.0019109420696097354, + "loss": 0.0786, + "step": 14567 + }, + { + "epoch": 0.1264572356142742, + "grad_norm": 0.515625, + "learning_rate": 0.0019109291605343486, + "loss": 0.1167, + "step": 14568 + }, + { + "epoch": 0.12646591609447835, + "grad_norm": 0.10205078125, + "learning_rate": 0.0019109162505721417, + "loss": 0.0957, + "step": 14569 + }, + { + "epoch": 0.12647459657468252, + "grad_norm": 0.73046875, + "learning_rate": 0.001910903339723129, + "loss": 0.085, + "step": 14570 + }, + { + "epoch": 0.12648327705488668, + "grad_norm": 0.7734375, + "learning_rate": 0.0019108904279873248, + "loss": 0.0967, + "step": 14571 + }, + { + "epoch": 0.12649195753509085, + "grad_norm": 0.6484375, + "learning_rate": 0.001910877515364743, + "loss": 0.1279, + "step": 14572 + }, + { + "epoch": 0.126500638015295, + "grad_norm": 0.470703125, + "learning_rate": 0.0019108646018553975, + "loss": 0.1006, + "step": 14573 + }, + { + "epoch": 0.12650931849549918, + "grad_norm": 0.3125, + "learning_rate": 0.0019108516874593027, + "loss": 0.0938, + "step": 14574 + }, + { + "epoch": 0.12651799897570334, + "grad_norm": 0.81640625, + "learning_rate": 0.0019108387721764727, + "loss": 0.124, + "step": 14575 + }, + { + "epoch": 0.1265266794559075, + "grad_norm": 0.69921875, + "learning_rate": 0.0019108258560069217, + "loss": 0.123, + "step": 14576 + }, + { + "epoch": 0.12653535993611167, + 
"grad_norm": 0.1689453125, + "learning_rate": 0.0019108129389506635, + "loss": 0.1367, + "step": 14577 + }, + { + "epoch": 0.12654404041631584, + "grad_norm": 0.44921875, + "learning_rate": 0.0019108000210077125, + "loss": 0.1152, + "step": 14578 + }, + { + "epoch": 0.12655272089652, + "grad_norm": 0.2294921875, + "learning_rate": 0.0019107871021780832, + "loss": 0.127, + "step": 14579 + }, + { + "epoch": 0.12656140137672417, + "grad_norm": 0.58203125, + "learning_rate": 0.0019107741824617888, + "loss": 0.2031, + "step": 14580 + }, + { + "epoch": 0.12657008185692833, + "grad_norm": 0.091796875, + "learning_rate": 0.001910761261858844, + "loss": 0.0952, + "step": 14581 + }, + { + "epoch": 0.1265787623371325, + "grad_norm": 0.09765625, + "learning_rate": 0.001910748340369263, + "loss": 0.0938, + "step": 14582 + }, + { + "epoch": 0.12658744281733667, + "grad_norm": 0.43359375, + "learning_rate": 0.0019107354179930599, + "loss": 0.1162, + "step": 14583 + }, + { + "epoch": 0.12659612329754083, + "grad_norm": 0.25390625, + "learning_rate": 0.0019107224947302486, + "loss": 0.1465, + "step": 14584 + }, + { + "epoch": 0.126604803777745, + "grad_norm": 0.158203125, + "learning_rate": 0.0019107095705808432, + "loss": 0.1182, + "step": 14585 + }, + { + "epoch": 0.12661348425794916, + "grad_norm": 0.1923828125, + "learning_rate": 0.0019106966455448582, + "loss": 0.082, + "step": 14586 + }, + { + "epoch": 0.12662216473815333, + "grad_norm": 0.54296875, + "learning_rate": 0.0019106837196223075, + "loss": 0.1201, + "step": 14587 + }, + { + "epoch": 0.1266308452183575, + "grad_norm": 0.5390625, + "learning_rate": 0.0019106707928132054, + "loss": 0.165, + "step": 14588 + }, + { + "epoch": 0.12663952569856166, + "grad_norm": 0.1767578125, + "learning_rate": 0.001910657865117566, + "loss": 0.125, + "step": 14589 + }, + { + "epoch": 0.12664820617876582, + "grad_norm": 0.10205078125, + "learning_rate": 0.0019106449365354028, + "loss": 0.127, + "step": 14590 + }, + { + "epoch": 0.12665688665896999, + "grad_norm": 1.0390625, + "learning_rate": 0.0019106320070667309, + "loss": 0.1406, + "step": 14591 + }, + { + "epoch": 0.12666556713917415, + "grad_norm": 0.283203125, + "learning_rate": 0.0019106190767115639, + "loss": 0.1426, + "step": 14592 + }, + { + "epoch": 0.12667424761937832, + "grad_norm": 0.107421875, + "learning_rate": 0.0019106061454699159, + "loss": 0.1089, + "step": 14593 + }, + { + "epoch": 0.12668292809958248, + "grad_norm": 0.0986328125, + "learning_rate": 0.0019105932133418014, + "loss": 0.0796, + "step": 14594 + }, + { + "epoch": 0.12669160857978665, + "grad_norm": 1.421875, + "learning_rate": 0.0019105802803272343, + "loss": 0.1816, + "step": 14595 + }, + { + "epoch": 0.1267002890599908, + "grad_norm": 0.1474609375, + "learning_rate": 0.0019105673464262288, + "loss": 0.084, + "step": 14596 + }, + { + "epoch": 0.12670896954019498, + "grad_norm": 0.1796875, + "learning_rate": 0.0019105544116387989, + "loss": 0.1289, + "step": 14597 + }, + { + "epoch": 0.12671765002039914, + "grad_norm": 1.9140625, + "learning_rate": 0.0019105414759649588, + "loss": 0.2041, + "step": 14598 + }, + { + "epoch": 0.1267263305006033, + "grad_norm": 0.76171875, + "learning_rate": 0.0019105285394047232, + "loss": 0.1191, + "step": 14599 + }, + { + "epoch": 0.12673501098080747, + "grad_norm": 0.115234375, + "learning_rate": 0.0019105156019581055, + "loss": 0.124, + "step": 14600 + }, + { + "epoch": 0.1267436914610116, + "grad_norm": 0.1513671875, + "learning_rate": 0.0019105026636251203, + "loss": 0.1699, + "step": 14601 + 
}, + { + "epoch": 0.12675237194121577, + "grad_norm": 0.5703125, + "learning_rate": 0.0019104897244057808, + "loss": 0.1299, + "step": 14602 + }, + { + "epoch": 0.12676105242141994, + "grad_norm": 0.1591796875, + "learning_rate": 0.0019104767843001027, + "loss": 0.1357, + "step": 14603 + }, + { + "epoch": 0.1267697329016241, + "grad_norm": 0.1572265625, + "learning_rate": 0.0019104638433080992, + "loss": 0.1016, + "step": 14604 + }, + { + "epoch": 0.12677841338182827, + "grad_norm": 0.4140625, + "learning_rate": 0.0019104509014297847, + "loss": 0.1143, + "step": 14605 + }, + { + "epoch": 0.12678709386203244, + "grad_norm": 0.17578125, + "learning_rate": 0.0019104379586651727, + "loss": 0.127, + "step": 14606 + }, + { + "epoch": 0.1267957743422366, + "grad_norm": 0.859375, + "learning_rate": 0.0019104250150142785, + "loss": 0.1084, + "step": 14607 + }, + { + "epoch": 0.12680445482244077, + "grad_norm": 0.1875, + "learning_rate": 0.0019104120704771155, + "loss": 0.1152, + "step": 14608 + }, + { + "epoch": 0.12681313530264493, + "grad_norm": 0.486328125, + "learning_rate": 0.001910399125053698, + "loss": 0.083, + "step": 14609 + }, + { + "epoch": 0.1268218157828491, + "grad_norm": 0.2490234375, + "learning_rate": 0.0019103861787440402, + "loss": 0.1035, + "step": 14610 + }, + { + "epoch": 0.12683049626305326, + "grad_norm": 0.4609375, + "learning_rate": 0.0019103732315481564, + "loss": 0.125, + "step": 14611 + }, + { + "epoch": 0.12683917674325743, + "grad_norm": 2.40625, + "learning_rate": 0.0019103602834660606, + "loss": 0.1494, + "step": 14612 + }, + { + "epoch": 0.1268478572234616, + "grad_norm": 0.353515625, + "learning_rate": 0.001910347334497767, + "loss": 0.0996, + "step": 14613 + }, + { + "epoch": 0.12685653770366576, + "grad_norm": 0.796875, + "learning_rate": 0.0019103343846432895, + "loss": 0.1025, + "step": 14614 + }, + { + "epoch": 0.12686521818386992, + "grad_norm": 0.443359375, + "learning_rate": 0.0019103214339026425, + "loss": 0.1177, + "step": 14615 + }, + { + "epoch": 0.1268738986640741, + "grad_norm": 0.10400390625, + "learning_rate": 0.0019103084822758402, + "loss": 0.1387, + "step": 14616 + }, + { + "epoch": 0.12688257914427825, + "grad_norm": 0.10888671875, + "learning_rate": 0.0019102955297628968, + "loss": 0.1543, + "step": 14617 + }, + { + "epoch": 0.12689125962448242, + "grad_norm": 0.1875, + "learning_rate": 0.0019102825763638266, + "loss": 0.1426, + "step": 14618 + }, + { + "epoch": 0.12689994010468658, + "grad_norm": 0.09619140625, + "learning_rate": 0.0019102696220786432, + "loss": 0.1465, + "step": 14619 + }, + { + "epoch": 0.12690862058489075, + "grad_norm": 0.16796875, + "learning_rate": 0.0019102566669073609, + "loss": 0.1475, + "step": 14620 + }, + { + "epoch": 0.1269173010650949, + "grad_norm": 0.29296875, + "learning_rate": 0.0019102437108499946, + "loss": 0.1279, + "step": 14621 + }, + { + "epoch": 0.12692598154529908, + "grad_norm": 0.53515625, + "learning_rate": 0.0019102307539065578, + "loss": 0.1416, + "step": 14622 + }, + { + "epoch": 0.12693466202550324, + "grad_norm": 0.423828125, + "learning_rate": 0.0019102177960770648, + "loss": 0.084, + "step": 14623 + }, + { + "epoch": 0.1269433425057074, + "grad_norm": 0.98046875, + "learning_rate": 0.0019102048373615297, + "loss": 0.0947, + "step": 14624 + }, + { + "epoch": 0.12695202298591157, + "grad_norm": 0.7265625, + "learning_rate": 0.0019101918777599668, + "loss": 0.1025, + "step": 14625 + }, + { + "epoch": 0.12696070346611574, + "grad_norm": 0.1865234375, + "learning_rate": 0.0019101789172723904, + 
"loss": 0.0996, + "step": 14626 + }, + { + "epoch": 0.1269693839463199, + "grad_norm": 0.12109375, + "learning_rate": 0.0019101659558988142, + "loss": 0.1064, + "step": 14627 + }, + { + "epoch": 0.12697806442652407, + "grad_norm": 0.283203125, + "learning_rate": 0.001910152993639253, + "loss": 0.1113, + "step": 14628 + }, + { + "epoch": 0.12698674490672823, + "grad_norm": 0.6484375, + "learning_rate": 0.0019101400304937205, + "loss": 0.1221, + "step": 14629 + }, + { + "epoch": 0.1269954253869324, + "grad_norm": 0.490234375, + "learning_rate": 0.001910127066462231, + "loss": 0.1104, + "step": 14630 + }, + { + "epoch": 0.12700410586713656, + "grad_norm": 0.31640625, + "learning_rate": 0.001910114101544799, + "loss": 0.1758, + "step": 14631 + }, + { + "epoch": 0.12701278634734073, + "grad_norm": 0.431640625, + "learning_rate": 0.001910101135741438, + "loss": 0.1162, + "step": 14632 + }, + { + "epoch": 0.1270214668275449, + "grad_norm": 0.369140625, + "learning_rate": 0.0019100881690521628, + "loss": 0.1143, + "step": 14633 + }, + { + "epoch": 0.12703014730774906, + "grad_norm": 0.2119140625, + "learning_rate": 0.001910075201476987, + "loss": 0.1021, + "step": 14634 + }, + { + "epoch": 0.12703882778795322, + "grad_norm": 0.267578125, + "learning_rate": 0.0019100622330159255, + "loss": 0.1055, + "step": 14635 + }, + { + "epoch": 0.1270475082681574, + "grad_norm": 0.298828125, + "learning_rate": 0.001910049263668992, + "loss": 0.0811, + "step": 14636 + }, + { + "epoch": 0.12705618874836155, + "grad_norm": 0.8046875, + "learning_rate": 0.001910036293436201, + "loss": 0.1279, + "step": 14637 + }, + { + "epoch": 0.12706486922856572, + "grad_norm": 0.92578125, + "learning_rate": 0.0019100233223175664, + "loss": 0.1172, + "step": 14638 + }, + { + "epoch": 0.12707354970876988, + "grad_norm": 0.4375, + "learning_rate": 0.001910010350313102, + "loss": 0.0928, + "step": 14639 + }, + { + "epoch": 0.12708223018897405, + "grad_norm": 0.1357421875, + "learning_rate": 0.001909997377422823, + "loss": 0.1172, + "step": 14640 + }, + { + "epoch": 0.12709091066917821, + "grad_norm": 0.25390625, + "learning_rate": 0.0019099844036467425, + "loss": 0.1201, + "step": 14641 + }, + { + "epoch": 0.12709959114938238, + "grad_norm": 0.365234375, + "learning_rate": 0.0019099714289848757, + "loss": 0.1367, + "step": 14642 + }, + { + "epoch": 0.12710827162958654, + "grad_norm": 0.32421875, + "learning_rate": 0.0019099584534372363, + "loss": 0.1426, + "step": 14643 + }, + { + "epoch": 0.1271169521097907, + "grad_norm": 0.1494140625, + "learning_rate": 0.0019099454770038385, + "loss": 0.1089, + "step": 14644 + }, + { + "epoch": 0.12712563258999487, + "grad_norm": 0.1416015625, + "learning_rate": 0.0019099324996846962, + "loss": 0.1025, + "step": 14645 + }, + { + "epoch": 0.12713431307019904, + "grad_norm": 0.275390625, + "learning_rate": 0.0019099195214798243, + "loss": 0.0986, + "step": 14646 + }, + { + "epoch": 0.1271429935504032, + "grad_norm": 0.84375, + "learning_rate": 0.0019099065423892362, + "loss": 0.1465, + "step": 14647 + }, + { + "epoch": 0.12715167403060737, + "grad_norm": 0.1572265625, + "learning_rate": 0.0019098935624129467, + "loss": 0.1328, + "step": 14648 + }, + { + "epoch": 0.12716035451081154, + "grad_norm": 0.49609375, + "learning_rate": 0.0019098805815509696, + "loss": 0.1338, + "step": 14649 + }, + { + "epoch": 0.1271690349910157, + "grad_norm": 0.185546875, + "learning_rate": 0.0019098675998033194, + "loss": 0.1406, + "step": 14650 + }, + { + "epoch": 0.12717771547121987, + "grad_norm": 0.1572265625, + 
"learning_rate": 0.00190985461717001, + "loss": 0.1221, + "step": 14651 + }, + { + "epoch": 0.12718639595142403, + "grad_norm": 0.11962890625, + "learning_rate": 0.0019098416336510563, + "loss": 0.1035, + "step": 14652 + }, + { + "epoch": 0.1271950764316282, + "grad_norm": 0.4453125, + "learning_rate": 0.0019098286492464714, + "loss": 0.1523, + "step": 14653 + }, + { + "epoch": 0.12720375691183236, + "grad_norm": 0.56640625, + "learning_rate": 0.0019098156639562702, + "loss": 0.1465, + "step": 14654 + }, + { + "epoch": 0.12721243739203653, + "grad_norm": 0.404296875, + "learning_rate": 0.0019098026777804669, + "loss": 0.0835, + "step": 14655 + }, + { + "epoch": 0.1272211178722407, + "grad_norm": 0.1796875, + "learning_rate": 0.0019097896907190753, + "loss": 0.168, + "step": 14656 + }, + { + "epoch": 0.12722979835244486, + "grad_norm": 0.373046875, + "learning_rate": 0.00190977670277211, + "loss": 0.1187, + "step": 14657 + }, + { + "epoch": 0.12723847883264902, + "grad_norm": 0.427734375, + "learning_rate": 0.001909763713939585, + "loss": 0.1025, + "step": 14658 + }, + { + "epoch": 0.1272471593128532, + "grad_norm": 0.44921875, + "learning_rate": 0.0019097507242215146, + "loss": 0.0874, + "step": 14659 + }, + { + "epoch": 0.12725583979305735, + "grad_norm": 0.1328125, + "learning_rate": 0.001909737733617913, + "loss": 0.1289, + "step": 14660 + }, + { + "epoch": 0.12726452027326152, + "grad_norm": 0.1708984375, + "learning_rate": 0.0019097247421287944, + "loss": 0.1133, + "step": 14661 + }, + { + "epoch": 0.12727320075346568, + "grad_norm": 0.345703125, + "learning_rate": 0.0019097117497541727, + "loss": 0.0996, + "step": 14662 + }, + { + "epoch": 0.12728188123366985, + "grad_norm": 0.07958984375, + "learning_rate": 0.001909698756494063, + "loss": 0.1069, + "step": 14663 + }, + { + "epoch": 0.127290561713874, + "grad_norm": 0.2177734375, + "learning_rate": 0.0019096857623484786, + "loss": 0.1289, + "step": 14664 + }, + { + "epoch": 0.12729924219407818, + "grad_norm": 0.1767578125, + "learning_rate": 0.0019096727673174338, + "loss": 0.0825, + "step": 14665 + }, + { + "epoch": 0.12730792267428234, + "grad_norm": 0.1875, + "learning_rate": 0.001909659771400943, + "loss": 0.1157, + "step": 14666 + }, + { + "epoch": 0.1273166031544865, + "grad_norm": 0.2177734375, + "learning_rate": 0.0019096467745990209, + "loss": 0.1211, + "step": 14667 + }, + { + "epoch": 0.12732528363469067, + "grad_norm": 0.5234375, + "learning_rate": 0.001909633776911681, + "loss": 0.1021, + "step": 14668 + }, + { + "epoch": 0.12733396411489484, + "grad_norm": 0.37890625, + "learning_rate": 0.0019096207783389377, + "loss": 0.1172, + "step": 14669 + }, + { + "epoch": 0.127342644595099, + "grad_norm": 0.5, + "learning_rate": 0.0019096077788808054, + "loss": 0.1406, + "step": 14670 + }, + { + "epoch": 0.12735132507530317, + "grad_norm": 0.26953125, + "learning_rate": 0.001909594778537298, + "loss": 0.1084, + "step": 14671 + }, + { + "epoch": 0.12736000555550733, + "grad_norm": 0.130859375, + "learning_rate": 0.0019095817773084301, + "loss": 0.1484, + "step": 14672 + }, + { + "epoch": 0.1273686860357115, + "grad_norm": 0.380859375, + "learning_rate": 0.0019095687751942154, + "loss": 0.1152, + "step": 14673 + }, + { + "epoch": 0.12737736651591566, + "grad_norm": 0.953125, + "learning_rate": 0.001909555772194669, + "loss": 0.1084, + "step": 14674 + }, + { + "epoch": 0.12738604699611983, + "grad_norm": 1.015625, + "learning_rate": 0.0019095427683098044, + "loss": 0.0791, + "step": 14675 + }, + { + "epoch": 0.127394727476324, + 
"grad_norm": 0.17578125, + "learning_rate": 0.001909529763539636, + "loss": 0.1006, + "step": 14676 + }, + { + "epoch": 0.12740340795652816, + "grad_norm": 0.07177734375, + "learning_rate": 0.001909516757884178, + "loss": 0.1152, + "step": 14677 + }, + { + "epoch": 0.12741208843673232, + "grad_norm": 0.5234375, + "learning_rate": 0.0019095037513434442, + "loss": 0.0933, + "step": 14678 + }, + { + "epoch": 0.1274207689169365, + "grad_norm": 0.23828125, + "learning_rate": 0.0019094907439174498, + "loss": 0.1377, + "step": 14679 + }, + { + "epoch": 0.12742944939714065, + "grad_norm": 0.0947265625, + "learning_rate": 0.001909477735606208, + "loss": 0.1289, + "step": 14680 + }, + { + "epoch": 0.12743812987734482, + "grad_norm": 0.08203125, + "learning_rate": 0.001909464726409734, + "loss": 0.0918, + "step": 14681 + }, + { + "epoch": 0.12744681035754898, + "grad_norm": 0.2158203125, + "learning_rate": 0.001909451716328041, + "loss": 0.1348, + "step": 14682 + }, + { + "epoch": 0.12745549083775315, + "grad_norm": 0.3671875, + "learning_rate": 0.0019094387053611446, + "loss": 0.1719, + "step": 14683 + }, + { + "epoch": 0.12746417131795731, + "grad_norm": 0.9375, + "learning_rate": 0.0019094256935090574, + "loss": 0.3652, + "step": 14684 + }, + { + "epoch": 0.12747285179816148, + "grad_norm": 0.169921875, + "learning_rate": 0.001909412680771795, + "loss": 0.1045, + "step": 14685 + }, + { + "epoch": 0.12748153227836564, + "grad_norm": 0.1982421875, + "learning_rate": 0.0019093996671493705, + "loss": 0.0967, + "step": 14686 + }, + { + "epoch": 0.1274902127585698, + "grad_norm": 0.0986328125, + "learning_rate": 0.0019093866526417988, + "loss": 0.123, + "step": 14687 + }, + { + "epoch": 0.12749889323877397, + "grad_norm": 0.21875, + "learning_rate": 0.0019093736372490947, + "loss": 0.0801, + "step": 14688 + }, + { + "epoch": 0.12750757371897814, + "grad_norm": 0.130859375, + "learning_rate": 0.001909360620971271, + "loss": 0.1152, + "step": 14689 + }, + { + "epoch": 0.1275162541991823, + "grad_norm": 0.12060546875, + "learning_rate": 0.001909347603808343, + "loss": 0.1074, + "step": 14690 + }, + { + "epoch": 0.12752493467938647, + "grad_norm": 0.3125, + "learning_rate": 0.0019093345857603243, + "loss": 0.1309, + "step": 14691 + }, + { + "epoch": 0.12753361515959064, + "grad_norm": 0.130859375, + "learning_rate": 0.00190932156682723, + "loss": 0.104, + "step": 14692 + }, + { + "epoch": 0.1275422956397948, + "grad_norm": 0.423828125, + "learning_rate": 0.0019093085470090734, + "loss": 0.1553, + "step": 14693 + }, + { + "epoch": 0.12755097611999897, + "grad_norm": 0.1044921875, + "learning_rate": 0.0019092955263058692, + "loss": 0.0957, + "step": 14694 + }, + { + "epoch": 0.12755965660020313, + "grad_norm": 0.12890625, + "learning_rate": 0.0019092825047176314, + "loss": 0.1108, + "step": 14695 + }, + { + "epoch": 0.1275683370804073, + "grad_norm": 0.353515625, + "learning_rate": 0.0019092694822443748, + "loss": 0.0806, + "step": 14696 + }, + { + "epoch": 0.12757701756061146, + "grad_norm": 0.1611328125, + "learning_rate": 0.001909256458886113, + "loss": 0.1104, + "step": 14697 + }, + { + "epoch": 0.12758569804081563, + "grad_norm": 0.33203125, + "learning_rate": 0.0019092434346428607, + "loss": 0.1064, + "step": 14698 + }, + { + "epoch": 0.1275943785210198, + "grad_norm": 0.154296875, + "learning_rate": 0.0019092304095146317, + "loss": 0.1318, + "step": 14699 + }, + { + "epoch": 0.12760305900122396, + "grad_norm": 0.58203125, + "learning_rate": 0.0019092173835014406, + "loss": 0.2109, + "step": 14700 + }, 
+ { + "epoch": 0.12761173948142812, + "grad_norm": 0.1435546875, + "learning_rate": 0.0019092043566033014, + "loss": 0.0879, + "step": 14701 + }, + { + "epoch": 0.1276204199616323, + "grad_norm": 0.640625, + "learning_rate": 0.0019091913288202284, + "loss": 0.166, + "step": 14702 + }, + { + "epoch": 0.12762910044183645, + "grad_norm": 0.388671875, + "learning_rate": 0.0019091783001522363, + "loss": 0.0967, + "step": 14703 + }, + { + "epoch": 0.12763778092204062, + "grad_norm": 0.4375, + "learning_rate": 0.0019091652705993385, + "loss": 0.1377, + "step": 14704 + }, + { + "epoch": 0.12764646140224478, + "grad_norm": 0.158203125, + "learning_rate": 0.0019091522401615503, + "loss": 0.1455, + "step": 14705 + }, + { + "epoch": 0.12765514188244895, + "grad_norm": 0.12890625, + "learning_rate": 0.0019091392088388848, + "loss": 0.1162, + "step": 14706 + }, + { + "epoch": 0.1276638223626531, + "grad_norm": 0.0888671875, + "learning_rate": 0.0019091261766313575, + "loss": 0.1079, + "step": 14707 + }, + { + "epoch": 0.12767250284285728, + "grad_norm": 0.2236328125, + "learning_rate": 0.0019091131435389813, + "loss": 0.1206, + "step": 14708 + }, + { + "epoch": 0.12768118332306144, + "grad_norm": 0.1962890625, + "learning_rate": 0.0019091001095617713, + "loss": 0.0923, + "step": 14709 + }, + { + "epoch": 0.1276898638032656, + "grad_norm": 0.29296875, + "learning_rate": 0.0019090870746997415, + "loss": 0.103, + "step": 14710 + }, + { + "epoch": 0.12769854428346977, + "grad_norm": 0.2197265625, + "learning_rate": 0.0019090740389529066, + "loss": 0.1045, + "step": 14711 + }, + { + "epoch": 0.12770722476367394, + "grad_norm": 0.421875, + "learning_rate": 0.0019090610023212802, + "loss": 0.1416, + "step": 14712 + }, + { + "epoch": 0.1277159052438781, + "grad_norm": 0.0751953125, + "learning_rate": 0.001909047964804877, + "loss": 0.124, + "step": 14713 + }, + { + "epoch": 0.12772458572408227, + "grad_norm": 0.294921875, + "learning_rate": 0.0019090349264037111, + "loss": 0.1045, + "step": 14714 + }, + { + "epoch": 0.12773326620428643, + "grad_norm": 0.1484375, + "learning_rate": 0.0019090218871177964, + "loss": 0.0952, + "step": 14715 + }, + { + "epoch": 0.1277419466844906, + "grad_norm": 0.193359375, + "learning_rate": 0.0019090088469471478, + "loss": 0.1406, + "step": 14716 + }, + { + "epoch": 0.12775062716469476, + "grad_norm": 0.4453125, + "learning_rate": 0.0019089958058917797, + "loss": 0.1055, + "step": 14717 + }, + { + "epoch": 0.12775930764489893, + "grad_norm": 0.220703125, + "learning_rate": 0.0019089827639517053, + "loss": 0.1406, + "step": 14718 + }, + { + "epoch": 0.1277679881251031, + "grad_norm": 1.1875, + "learning_rate": 0.0019089697211269394, + "loss": 0.2324, + "step": 14719 + }, + { + "epoch": 0.12777666860530726, + "grad_norm": 0.39453125, + "learning_rate": 0.001908956677417497, + "loss": 0.1426, + "step": 14720 + }, + { + "epoch": 0.12778534908551142, + "grad_norm": 0.37890625, + "learning_rate": 0.001908943632823391, + "loss": 0.0845, + "step": 14721 + }, + { + "epoch": 0.1277940295657156, + "grad_norm": 0.36328125, + "learning_rate": 0.001908930587344637, + "loss": 0.1221, + "step": 14722 + }, + { + "epoch": 0.12780271004591975, + "grad_norm": 0.53125, + "learning_rate": 0.0019089175409812487, + "loss": 0.103, + "step": 14723 + }, + { + "epoch": 0.1278113905261239, + "grad_norm": 0.1279296875, + "learning_rate": 0.0019089044937332397, + "loss": 0.1211, + "step": 14724 + }, + { + "epoch": 0.12782007100632806, + "grad_norm": 0.1640625, + "learning_rate": 0.0019088914456006254, + 
"loss": 0.0947, + "step": 14725 + }, + { + "epoch": 0.12782875148653222, + "grad_norm": 0.2109375, + "learning_rate": 0.0019088783965834195, + "loss": 0.0991, + "step": 14726 + }, + { + "epoch": 0.1278374319667364, + "grad_norm": 0.72265625, + "learning_rate": 0.0019088653466816363, + "loss": 0.1699, + "step": 14727 + }, + { + "epoch": 0.12784611244694055, + "grad_norm": 0.189453125, + "learning_rate": 0.0019088522958952902, + "loss": 0.1377, + "step": 14728 + }, + { + "epoch": 0.12785479292714472, + "grad_norm": 0.1513671875, + "learning_rate": 0.0019088392442243953, + "loss": 0.1182, + "step": 14729 + }, + { + "epoch": 0.12786347340734888, + "grad_norm": 0.3359375, + "learning_rate": 0.001908826191668966, + "loss": 0.1465, + "step": 14730 + }, + { + "epoch": 0.12787215388755305, + "grad_norm": 0.08203125, + "learning_rate": 0.0019088131382290164, + "loss": 0.1006, + "step": 14731 + }, + { + "epoch": 0.1278808343677572, + "grad_norm": 0.07861328125, + "learning_rate": 0.001908800083904561, + "loss": 0.1035, + "step": 14732 + }, + { + "epoch": 0.12788951484796138, + "grad_norm": 0.24609375, + "learning_rate": 0.0019087870286956141, + "loss": 0.0933, + "step": 14733 + }, + { + "epoch": 0.12789819532816554, + "grad_norm": 0.1474609375, + "learning_rate": 0.0019087739726021899, + "loss": 0.1279, + "step": 14734 + }, + { + "epoch": 0.1279068758083697, + "grad_norm": 0.20703125, + "learning_rate": 0.0019087609156243023, + "loss": 0.0674, + "step": 14735 + }, + { + "epoch": 0.12791555628857387, + "grad_norm": 0.474609375, + "learning_rate": 0.001908747857761966, + "loss": 0.1602, + "step": 14736 + }, + { + "epoch": 0.12792423676877804, + "grad_norm": 0.1796875, + "learning_rate": 0.0019087347990151954, + "loss": 0.1084, + "step": 14737 + }, + { + "epoch": 0.1279329172489822, + "grad_norm": 0.451171875, + "learning_rate": 0.0019087217393840045, + "loss": 0.1201, + "step": 14738 + }, + { + "epoch": 0.12794159772918637, + "grad_norm": 0.54296875, + "learning_rate": 0.0019087086788684073, + "loss": 0.0908, + "step": 14739 + }, + { + "epoch": 0.12795027820939053, + "grad_norm": 0.27734375, + "learning_rate": 0.0019086956174684188, + "loss": 0.0947, + "step": 14740 + }, + { + "epoch": 0.1279589586895947, + "grad_norm": 0.333984375, + "learning_rate": 0.0019086825551840527, + "loss": 0.1396, + "step": 14741 + }, + { + "epoch": 0.12796763916979886, + "grad_norm": 0.59375, + "learning_rate": 0.0019086694920153239, + "loss": 0.0996, + "step": 14742 + }, + { + "epoch": 0.12797631965000303, + "grad_norm": 0.875, + "learning_rate": 0.001908656427962246, + "loss": 0.0996, + "step": 14743 + }, + { + "epoch": 0.1279850001302072, + "grad_norm": 0.09228515625, + "learning_rate": 0.0019086433630248337, + "loss": 0.126, + "step": 14744 + }, + { + "epoch": 0.12799368061041136, + "grad_norm": 0.07958984375, + "learning_rate": 0.001908630297203101, + "loss": 0.1123, + "step": 14745 + }, + { + "epoch": 0.12800236109061552, + "grad_norm": 0.671875, + "learning_rate": 0.0019086172304970629, + "loss": 0.0967, + "step": 14746 + }, + { + "epoch": 0.1280110415708197, + "grad_norm": 0.08544921875, + "learning_rate": 0.0019086041629067325, + "loss": 0.1113, + "step": 14747 + }, + { + "epoch": 0.12801972205102385, + "grad_norm": 0.80078125, + "learning_rate": 0.0019085910944321247, + "loss": 0.125, + "step": 14748 + }, + { + "epoch": 0.12802840253122802, + "grad_norm": 0.5078125, + "learning_rate": 0.0019085780250732542, + "loss": 0.1328, + "step": 14749 + }, + { + "epoch": 0.12803708301143218, + "grad_norm": 0.984375, + 
"learning_rate": 0.0019085649548301348, + "loss": 0.1318, + "step": 14750 + }, + { + "epoch": 0.12804576349163635, + "grad_norm": 0.50390625, + "learning_rate": 0.0019085518837027812, + "loss": 0.1191, + "step": 14751 + }, + { + "epoch": 0.12805444397184051, + "grad_norm": 0.345703125, + "learning_rate": 0.001908538811691207, + "loss": 0.1387, + "step": 14752 + }, + { + "epoch": 0.12806312445204468, + "grad_norm": 0.08984375, + "learning_rate": 0.001908525738795427, + "loss": 0.0942, + "step": 14753 + }, + { + "epoch": 0.12807180493224884, + "grad_norm": 0.119140625, + "learning_rate": 0.0019085126650154555, + "loss": 0.1094, + "step": 14754 + }, + { + "epoch": 0.128080485412453, + "grad_norm": 0.341796875, + "learning_rate": 0.0019084995903513064, + "loss": 0.1108, + "step": 14755 + }, + { + "epoch": 0.12808916589265718, + "grad_norm": 0.201171875, + "learning_rate": 0.0019084865148029945, + "loss": 0.1172, + "step": 14756 + }, + { + "epoch": 0.12809784637286134, + "grad_norm": 0.486328125, + "learning_rate": 0.001908473438370534, + "loss": 0.1289, + "step": 14757 + }, + { + "epoch": 0.1281065268530655, + "grad_norm": 0.1259765625, + "learning_rate": 0.001908460361053939, + "loss": 0.1348, + "step": 14758 + }, + { + "epoch": 0.12811520733326967, + "grad_norm": 0.07763671875, + "learning_rate": 0.001908447282853224, + "loss": 0.1191, + "step": 14759 + }, + { + "epoch": 0.12812388781347384, + "grad_norm": 0.09130859375, + "learning_rate": 0.001908434203768403, + "loss": 0.0991, + "step": 14760 + }, + { + "epoch": 0.128132568293678, + "grad_norm": 0.1337890625, + "learning_rate": 0.0019084211237994906, + "loss": 0.1182, + "step": 14761 + }, + { + "epoch": 0.12814124877388217, + "grad_norm": 0.67578125, + "learning_rate": 0.001908408042946501, + "loss": 0.1396, + "step": 14762 + }, + { + "epoch": 0.12814992925408633, + "grad_norm": 0.87109375, + "learning_rate": 0.0019083949612094487, + "loss": 0.1094, + "step": 14763 + }, + { + "epoch": 0.1281586097342905, + "grad_norm": 0.69921875, + "learning_rate": 0.0019083818785883475, + "loss": 0.2793, + "step": 14764 + }, + { + "epoch": 0.12816729021449466, + "grad_norm": 0.4296875, + "learning_rate": 0.0019083687950832123, + "loss": 0.0996, + "step": 14765 + }, + { + "epoch": 0.12817597069469883, + "grad_norm": 0.310546875, + "learning_rate": 0.001908355710694057, + "loss": 0.1602, + "step": 14766 + }, + { + "epoch": 0.128184651174903, + "grad_norm": 0.67578125, + "learning_rate": 0.0019083426254208962, + "loss": 0.1406, + "step": 14767 + }, + { + "epoch": 0.12819333165510716, + "grad_norm": 0.412109375, + "learning_rate": 0.0019083295392637437, + "loss": 0.1611, + "step": 14768 + }, + { + "epoch": 0.12820201213531132, + "grad_norm": 0.66015625, + "learning_rate": 0.0019083164522226141, + "loss": 0.1079, + "step": 14769 + }, + { + "epoch": 0.1282106926155155, + "grad_norm": 0.287109375, + "learning_rate": 0.0019083033642975224, + "loss": 0.1572, + "step": 14770 + }, + { + "epoch": 0.12821937309571965, + "grad_norm": 0.4296875, + "learning_rate": 0.0019082902754884816, + "loss": 0.1221, + "step": 14771 + }, + { + "epoch": 0.12822805357592382, + "grad_norm": 0.498046875, + "learning_rate": 0.0019082771857955073, + "loss": 0.1104, + "step": 14772 + }, + { + "epoch": 0.12823673405612798, + "grad_norm": 0.2890625, + "learning_rate": 0.0019082640952186127, + "loss": 0.1226, + "step": 14773 + }, + { + "epoch": 0.12824541453633215, + "grad_norm": 0.08544921875, + "learning_rate": 0.001908251003757813, + "loss": 0.0942, + "step": 14774 + }, + { + "epoch": 
0.1282540950165363, + "grad_norm": 0.1572265625, + "learning_rate": 0.001908237911413122, + "loss": 0.1377, + "step": 14775 + }, + { + "epoch": 0.12826277549674048, + "grad_norm": 0.458984375, + "learning_rate": 0.001908224818184554, + "loss": 0.1553, + "step": 14776 + }, + { + "epoch": 0.12827145597694464, + "grad_norm": 0.400390625, + "learning_rate": 0.0019082117240721238, + "loss": 0.1011, + "step": 14777 + }, + { + "epoch": 0.1282801364571488, + "grad_norm": 0.17578125, + "learning_rate": 0.001908198629075845, + "loss": 0.1543, + "step": 14778 + }, + { + "epoch": 0.12828881693735297, + "grad_norm": 0.54296875, + "learning_rate": 0.0019081855331957328, + "loss": 0.0894, + "step": 14779 + }, + { + "epoch": 0.12829749741755714, + "grad_norm": 0.357421875, + "learning_rate": 0.0019081724364318007, + "loss": 0.1216, + "step": 14780 + }, + { + "epoch": 0.1283061778977613, + "grad_norm": 0.453125, + "learning_rate": 0.0019081593387840635, + "loss": 0.1016, + "step": 14781 + }, + { + "epoch": 0.12831485837796547, + "grad_norm": 0.392578125, + "learning_rate": 0.0019081462402525354, + "loss": 0.1221, + "step": 14782 + }, + { + "epoch": 0.12832353885816963, + "grad_norm": 0.8828125, + "learning_rate": 0.001908133140837231, + "loss": 0.1094, + "step": 14783 + }, + { + "epoch": 0.1283322193383738, + "grad_norm": 0.1171875, + "learning_rate": 0.0019081200405381637, + "loss": 0.1206, + "step": 14784 + }, + { + "epoch": 0.12834089981857796, + "grad_norm": 1.1171875, + "learning_rate": 0.0019081069393553488, + "loss": 0.1523, + "step": 14785 + }, + { + "epoch": 0.12834958029878213, + "grad_norm": 0.30078125, + "learning_rate": 0.0019080938372888002, + "loss": 0.1035, + "step": 14786 + }, + { + "epoch": 0.1283582607789863, + "grad_norm": 0.193359375, + "learning_rate": 0.0019080807343385324, + "loss": 0.0996, + "step": 14787 + }, + { + "epoch": 0.12836694125919046, + "grad_norm": 0.55078125, + "learning_rate": 0.0019080676305045599, + "loss": 0.1016, + "step": 14788 + }, + { + "epoch": 0.12837562173939462, + "grad_norm": 0.44140625, + "learning_rate": 0.0019080545257868964, + "loss": 0.0918, + "step": 14789 + }, + { + "epoch": 0.1283843022195988, + "grad_norm": 0.2041015625, + "learning_rate": 0.0019080414201855568, + "loss": 0.1226, + "step": 14790 + }, + { + "epoch": 0.12839298269980295, + "grad_norm": 0.5078125, + "learning_rate": 0.0019080283137005549, + "loss": 0.126, + "step": 14791 + }, + { + "epoch": 0.12840166318000712, + "grad_norm": 0.64453125, + "learning_rate": 0.0019080152063319057, + "loss": 0.104, + "step": 14792 + }, + { + "epoch": 0.12841034366021128, + "grad_norm": 0.59765625, + "learning_rate": 0.0019080020980796234, + "loss": 0.1084, + "step": 14793 + }, + { + "epoch": 0.12841902414041545, + "grad_norm": 0.54296875, + "learning_rate": 0.0019079889889437216, + "loss": 0.0977, + "step": 14794 + }, + { + "epoch": 0.12842770462061961, + "grad_norm": 0.1953125, + "learning_rate": 0.0019079758789242156, + "loss": 0.0957, + "step": 14795 + }, + { + "epoch": 0.12843638510082378, + "grad_norm": 0.361328125, + "learning_rate": 0.0019079627680211194, + "loss": 0.1299, + "step": 14796 + }, + { + "epoch": 0.12844506558102795, + "grad_norm": 0.1201171875, + "learning_rate": 0.001907949656234447, + "loss": 0.1021, + "step": 14797 + }, + { + "epoch": 0.1284537460612321, + "grad_norm": 0.10595703125, + "learning_rate": 0.001907936543564213, + "loss": 0.0952, + "step": 14798 + }, + { + "epoch": 0.12846242654143628, + "grad_norm": 0.3046875, + "learning_rate": 0.001907923430010432, + "loss": 0.0981, 
+ "step": 14799 + }, + { + "epoch": 0.12847110702164044, + "grad_norm": 0.236328125, + "learning_rate": 0.0019079103155731178, + "loss": 0.0874, + "step": 14800 + }, + { + "epoch": 0.1284797875018446, + "grad_norm": 0.111328125, + "learning_rate": 0.001907897200252285, + "loss": 0.1035, + "step": 14801 + }, + { + "epoch": 0.12848846798204877, + "grad_norm": 0.271484375, + "learning_rate": 0.0019078840840479484, + "loss": 0.1167, + "step": 14802 + }, + { + "epoch": 0.12849714846225294, + "grad_norm": 0.25, + "learning_rate": 0.001907870966960121, + "loss": 0.1299, + "step": 14803 + }, + { + "epoch": 0.1285058289424571, + "grad_norm": 0.11083984375, + "learning_rate": 0.001907857848988819, + "loss": 0.0933, + "step": 14804 + }, + { + "epoch": 0.12851450942266127, + "grad_norm": 0.3984375, + "learning_rate": 0.0019078447301340553, + "loss": 0.1064, + "step": 14805 + }, + { + "epoch": 0.12852318990286543, + "grad_norm": 0.421875, + "learning_rate": 0.0019078316103958448, + "loss": 0.0898, + "step": 14806 + }, + { + "epoch": 0.1285318703830696, + "grad_norm": 1.6875, + "learning_rate": 0.0019078184897742018, + "loss": 0.1738, + "step": 14807 + }, + { + "epoch": 0.12854055086327376, + "grad_norm": 0.244140625, + "learning_rate": 0.0019078053682691407, + "loss": 0.1196, + "step": 14808 + }, + { + "epoch": 0.12854923134347793, + "grad_norm": 0.35546875, + "learning_rate": 0.0019077922458806758, + "loss": 0.127, + "step": 14809 + }, + { + "epoch": 0.1285579118236821, + "grad_norm": 0.3984375, + "learning_rate": 0.0019077791226088212, + "loss": 0.1211, + "step": 14810 + }, + { + "epoch": 0.12856659230388626, + "grad_norm": 0.46484375, + "learning_rate": 0.001907765998453592, + "loss": 0.0884, + "step": 14811 + }, + { + "epoch": 0.12857527278409042, + "grad_norm": 0.373046875, + "learning_rate": 0.0019077528734150015, + "loss": 0.1191, + "step": 14812 + }, + { + "epoch": 0.1285839532642946, + "grad_norm": 0.546875, + "learning_rate": 0.001907739747493065, + "loss": 0.1162, + "step": 14813 + }, + { + "epoch": 0.12859263374449875, + "grad_norm": 0.59375, + "learning_rate": 0.001907726620687796, + "loss": 0.1748, + "step": 14814 + }, + { + "epoch": 0.12860131422470292, + "grad_norm": 0.099609375, + "learning_rate": 0.0019077134929992097, + "loss": 0.1025, + "step": 14815 + }, + { + "epoch": 0.12860999470490708, + "grad_norm": 0.271484375, + "learning_rate": 0.0019077003644273202, + "loss": 0.1064, + "step": 14816 + }, + { + "epoch": 0.12861867518511125, + "grad_norm": 0.640625, + "learning_rate": 0.0019076872349721413, + "loss": 0.1426, + "step": 14817 + }, + { + "epoch": 0.1286273556653154, + "grad_norm": 0.52734375, + "learning_rate": 0.001907674104633688, + "loss": 0.1328, + "step": 14818 + }, + { + "epoch": 0.12863603614551958, + "grad_norm": 0.625, + "learning_rate": 0.0019076609734119743, + "loss": 0.123, + "step": 14819 + }, + { + "epoch": 0.12864471662572374, + "grad_norm": 0.1923828125, + "learning_rate": 0.0019076478413070147, + "loss": 0.1055, + "step": 14820 + }, + { + "epoch": 0.1286533971059279, + "grad_norm": 0.97265625, + "learning_rate": 0.0019076347083188236, + "loss": 0.0942, + "step": 14821 + }, + { + "epoch": 0.12866207758613207, + "grad_norm": 0.1552734375, + "learning_rate": 0.0019076215744474152, + "loss": 0.1133, + "step": 14822 + }, + { + "epoch": 0.12867075806633624, + "grad_norm": 0.4375, + "learning_rate": 0.0019076084396928042, + "loss": 0.1074, + "step": 14823 + }, + { + "epoch": 0.1286794385465404, + "grad_norm": 0.345703125, + "learning_rate": 0.0019075953040550045, + 
"loss": 0.1104, + "step": 14824 + }, + { + "epoch": 0.12868811902674457, + "grad_norm": 0.197265625, + "learning_rate": 0.0019075821675340313, + "loss": 0.124, + "step": 14825 + }, + { + "epoch": 0.12869679950694873, + "grad_norm": 0.1513671875, + "learning_rate": 0.0019075690301298977, + "loss": 0.0957, + "step": 14826 + }, + { + "epoch": 0.1287054799871529, + "grad_norm": 1.109375, + "learning_rate": 0.0019075558918426189, + "loss": 0.1426, + "step": 14827 + }, + { + "epoch": 0.12871416046735706, + "grad_norm": 0.1982421875, + "learning_rate": 0.0019075427526722092, + "loss": 0.1328, + "step": 14828 + }, + { + "epoch": 0.12872284094756123, + "grad_norm": 0.193359375, + "learning_rate": 0.001907529612618683, + "loss": 0.1641, + "step": 14829 + }, + { + "epoch": 0.1287315214277654, + "grad_norm": 0.494140625, + "learning_rate": 0.0019075164716820543, + "loss": 0.1787, + "step": 14830 + }, + { + "epoch": 0.12874020190796956, + "grad_norm": 0.16796875, + "learning_rate": 0.0019075033298623378, + "loss": 0.0938, + "step": 14831 + }, + { + "epoch": 0.12874888238817372, + "grad_norm": 0.2451171875, + "learning_rate": 0.0019074901871595478, + "loss": 0.1162, + "step": 14832 + }, + { + "epoch": 0.1287575628683779, + "grad_norm": 0.275390625, + "learning_rate": 0.0019074770435736988, + "loss": 0.1309, + "step": 14833 + }, + { + "epoch": 0.12876624334858205, + "grad_norm": 0.73046875, + "learning_rate": 0.001907463899104805, + "loss": 0.1416, + "step": 14834 + }, + { + "epoch": 0.12877492382878622, + "grad_norm": 0.59375, + "learning_rate": 0.0019074507537528804, + "loss": 0.124, + "step": 14835 + }, + { + "epoch": 0.12878360430899038, + "grad_norm": 0.67578125, + "learning_rate": 0.0019074376075179404, + "loss": 0.1182, + "step": 14836 + }, + { + "epoch": 0.12879228478919455, + "grad_norm": 0.51171875, + "learning_rate": 0.0019074244603999983, + "loss": 0.1016, + "step": 14837 + }, + { + "epoch": 0.12880096526939872, + "grad_norm": 0.1962890625, + "learning_rate": 0.0019074113123990692, + "loss": 0.1895, + "step": 14838 + }, + { + "epoch": 0.12880964574960288, + "grad_norm": 0.828125, + "learning_rate": 0.0019073981635151672, + "loss": 0.1143, + "step": 14839 + }, + { + "epoch": 0.12881832622980705, + "grad_norm": 0.111328125, + "learning_rate": 0.0019073850137483067, + "loss": 0.1299, + "step": 14840 + }, + { + "epoch": 0.1288270067100112, + "grad_norm": 0.466796875, + "learning_rate": 0.001907371863098502, + "loss": 0.1094, + "step": 14841 + }, + { + "epoch": 0.12883568719021538, + "grad_norm": 0.3828125, + "learning_rate": 0.0019073587115657677, + "loss": 0.1055, + "step": 14842 + }, + { + "epoch": 0.12884436767041954, + "grad_norm": 0.1953125, + "learning_rate": 0.0019073455591501179, + "loss": 0.1377, + "step": 14843 + }, + { + "epoch": 0.1288530481506237, + "grad_norm": 0.3515625, + "learning_rate": 0.0019073324058515673, + "loss": 0.1523, + "step": 14844 + }, + { + "epoch": 0.12886172863082787, + "grad_norm": 0.328125, + "learning_rate": 0.0019073192516701298, + "loss": 0.1191, + "step": 14845 + }, + { + "epoch": 0.12887040911103204, + "grad_norm": 1.984375, + "learning_rate": 0.0019073060966058202, + "loss": 0.6523, + "step": 14846 + }, + { + "epoch": 0.12887908959123617, + "grad_norm": 0.341796875, + "learning_rate": 0.001907292940658653, + "loss": 0.0938, + "step": 14847 + }, + { + "epoch": 0.12888777007144034, + "grad_norm": 0.171875, + "learning_rate": 0.0019072797838286422, + "loss": 0.1465, + "step": 14848 + }, + { + "epoch": 0.1288964505516445, + "grad_norm": 0.1376953125, + 
"learning_rate": 0.0019072666261158026, + "loss": 0.0718, + "step": 14849 + }, + { + "epoch": 0.12890513103184867, + "grad_norm": 0.337890625, + "learning_rate": 0.0019072534675201478, + "loss": 0.1201, + "step": 14850 + }, + { + "epoch": 0.12891381151205283, + "grad_norm": 0.1708984375, + "learning_rate": 0.0019072403080416932, + "loss": 0.1074, + "step": 14851 + }, + { + "epoch": 0.128922491992257, + "grad_norm": 0.40625, + "learning_rate": 0.0019072271476804523, + "loss": 0.1157, + "step": 14852 + }, + { + "epoch": 0.12893117247246116, + "grad_norm": 0.337890625, + "learning_rate": 0.0019072139864364404, + "loss": 0.1045, + "step": 14853 + }, + { + "epoch": 0.12893985295266533, + "grad_norm": 0.2470703125, + "learning_rate": 0.0019072008243096713, + "loss": 0.1113, + "step": 14854 + }, + { + "epoch": 0.1289485334328695, + "grad_norm": 0.13671875, + "learning_rate": 0.0019071876613001592, + "loss": 0.1143, + "step": 14855 + }, + { + "epoch": 0.12895721391307366, + "grad_norm": 0.388671875, + "learning_rate": 0.001907174497407919, + "loss": 0.0947, + "step": 14856 + }, + { + "epoch": 0.12896589439327782, + "grad_norm": 0.0791015625, + "learning_rate": 0.001907161332632965, + "loss": 0.084, + "step": 14857 + }, + { + "epoch": 0.128974574873482, + "grad_norm": 0.248046875, + "learning_rate": 0.0019071481669753116, + "loss": 0.0864, + "step": 14858 + }, + { + "epoch": 0.12898325535368615, + "grad_norm": 0.3125, + "learning_rate": 0.0019071350004349727, + "loss": 0.1172, + "step": 14859 + }, + { + "epoch": 0.12899193583389032, + "grad_norm": 0.29296875, + "learning_rate": 0.0019071218330119636, + "loss": 0.0859, + "step": 14860 + }, + { + "epoch": 0.12900061631409449, + "grad_norm": 0.15234375, + "learning_rate": 0.0019071086647062978, + "loss": 0.106, + "step": 14861 + }, + { + "epoch": 0.12900929679429865, + "grad_norm": 0.080078125, + "learning_rate": 0.0019070954955179903, + "loss": 0.1084, + "step": 14862 + }, + { + "epoch": 0.12901797727450282, + "grad_norm": 0.12158203125, + "learning_rate": 0.0019070823254470548, + "loss": 0.1079, + "step": 14863 + }, + { + "epoch": 0.12902665775470698, + "grad_norm": 0.10546875, + "learning_rate": 0.001907069154493507, + "loss": 0.1016, + "step": 14864 + }, + { + "epoch": 0.12903533823491115, + "grad_norm": 0.279296875, + "learning_rate": 0.0019070559826573597, + "loss": 0.1045, + "step": 14865 + }, + { + "epoch": 0.1290440187151153, + "grad_norm": 0.171875, + "learning_rate": 0.0019070428099386288, + "loss": 0.1533, + "step": 14866 + }, + { + "epoch": 0.12905269919531948, + "grad_norm": 0.08544921875, + "learning_rate": 0.0019070296363373274, + "loss": 0.1387, + "step": 14867 + }, + { + "epoch": 0.12906137967552364, + "grad_norm": 0.08642578125, + "learning_rate": 0.0019070164618534711, + "loss": 0.1016, + "step": 14868 + }, + { + "epoch": 0.1290700601557278, + "grad_norm": 0.345703125, + "learning_rate": 0.0019070032864870732, + "loss": 0.0898, + "step": 14869 + }, + { + "epoch": 0.12907874063593197, + "grad_norm": 0.58203125, + "learning_rate": 0.0019069901102381488, + "loss": 0.1191, + "step": 14870 + }, + { + "epoch": 0.12908742111613614, + "grad_norm": 0.498046875, + "learning_rate": 0.0019069769331067125, + "loss": 0.1318, + "step": 14871 + }, + { + "epoch": 0.1290961015963403, + "grad_norm": 0.201171875, + "learning_rate": 0.0019069637550927778, + "loss": 0.1377, + "step": 14872 + }, + { + "epoch": 0.12910478207654447, + "grad_norm": 0.6015625, + "learning_rate": 0.0019069505761963599, + "loss": 0.126, + "step": 14873 + }, + { + "epoch": 
0.12911346255674863, + "grad_norm": 0.578125, + "learning_rate": 0.001906937396417473, + "loss": 0.1025, + "step": 14874 + }, + { + "epoch": 0.1291221430369528, + "grad_norm": 0.48828125, + "learning_rate": 0.0019069242157561314, + "loss": 0.0957, + "step": 14875 + }, + { + "epoch": 0.12913082351715696, + "grad_norm": 0.2021484375, + "learning_rate": 0.0019069110342123498, + "loss": 0.1123, + "step": 14876 + }, + { + "epoch": 0.12913950399736113, + "grad_norm": 0.2197265625, + "learning_rate": 0.0019068978517861423, + "loss": 0.1016, + "step": 14877 + }, + { + "epoch": 0.1291481844775653, + "grad_norm": 0.361328125, + "learning_rate": 0.001906884668477523, + "loss": 0.1455, + "step": 14878 + }, + { + "epoch": 0.12915686495776946, + "grad_norm": 0.09423828125, + "learning_rate": 0.0019068714842865077, + "loss": 0.1582, + "step": 14879 + }, + { + "epoch": 0.12916554543797362, + "grad_norm": 0.490234375, + "learning_rate": 0.0019068582992131091, + "loss": 0.0869, + "step": 14880 + }, + { + "epoch": 0.1291742259181778, + "grad_norm": 0.75390625, + "learning_rate": 0.0019068451132573429, + "loss": 0.1289, + "step": 14881 + }, + { + "epoch": 0.12918290639838195, + "grad_norm": 0.11279296875, + "learning_rate": 0.0019068319264192225, + "loss": 0.1631, + "step": 14882 + }, + { + "epoch": 0.12919158687858612, + "grad_norm": 0.12353515625, + "learning_rate": 0.0019068187386987632, + "loss": 0.1074, + "step": 14883 + }, + { + "epoch": 0.12920026735879028, + "grad_norm": 0.3125, + "learning_rate": 0.0019068055500959789, + "loss": 0.0713, + "step": 14884 + }, + { + "epoch": 0.12920894783899445, + "grad_norm": 0.126953125, + "learning_rate": 0.001906792360610884, + "loss": 0.1216, + "step": 14885 + }, + { + "epoch": 0.1292176283191986, + "grad_norm": 0.22265625, + "learning_rate": 0.0019067791702434933, + "loss": 0.1436, + "step": 14886 + }, + { + "epoch": 0.12922630879940278, + "grad_norm": 0.27734375, + "learning_rate": 0.0019067659789938213, + "loss": 0.124, + "step": 14887 + }, + { + "epoch": 0.12923498927960694, + "grad_norm": 0.193359375, + "learning_rate": 0.0019067527868618817, + "loss": 0.1279, + "step": 14888 + }, + { + "epoch": 0.1292436697598111, + "grad_norm": 0.1904296875, + "learning_rate": 0.0019067395938476897, + "loss": 0.0928, + "step": 14889 + }, + { + "epoch": 0.12925235024001527, + "grad_norm": 0.20703125, + "learning_rate": 0.001906726399951259, + "loss": 0.0967, + "step": 14890 + }, + { + "epoch": 0.12926103072021944, + "grad_norm": 0.11767578125, + "learning_rate": 0.0019067132051726046, + "loss": 0.0981, + "step": 14891 + }, + { + "epoch": 0.1292697112004236, + "grad_norm": 0.271484375, + "learning_rate": 0.001906700009511741, + "loss": 0.1025, + "step": 14892 + }, + { + "epoch": 0.12927839168062777, + "grad_norm": 0.7890625, + "learning_rate": 0.0019066868129686823, + "loss": 0.1357, + "step": 14893 + }, + { + "epoch": 0.12928707216083193, + "grad_norm": 0.54296875, + "learning_rate": 0.0019066736155434428, + "loss": 0.3633, + "step": 14894 + }, + { + "epoch": 0.1292957526410361, + "grad_norm": 0.25390625, + "learning_rate": 0.001906660417236037, + "loss": 0.1138, + "step": 14895 + }, + { + "epoch": 0.12930443312124026, + "grad_norm": 0.08349609375, + "learning_rate": 0.00190664721804648, + "loss": 0.124, + "step": 14896 + }, + { + "epoch": 0.12931311360144443, + "grad_norm": 0.71484375, + "learning_rate": 0.0019066340179747857, + "loss": 0.124, + "step": 14897 + }, + { + "epoch": 0.1293217940816486, + "grad_norm": 0.318359375, + "learning_rate": 0.001906620817020968, + 
"loss": 0.0889, + "step": 14898 + }, + { + "epoch": 0.12933047456185276, + "grad_norm": 0.359375, + "learning_rate": 0.0019066076151850423, + "loss": 0.1001, + "step": 14899 + }, + { + "epoch": 0.12933915504205692, + "grad_norm": 1.9140625, + "learning_rate": 0.0019065944124670225, + "loss": 0.1328, + "step": 14900 + }, + { + "epoch": 0.1293478355222611, + "grad_norm": 0.06591796875, + "learning_rate": 0.0019065812088669232, + "loss": 0.1055, + "step": 14901 + }, + { + "epoch": 0.12935651600246525, + "grad_norm": 0.50390625, + "learning_rate": 0.001906568004384759, + "loss": 0.1104, + "step": 14902 + }, + { + "epoch": 0.12936519648266942, + "grad_norm": 0.65625, + "learning_rate": 0.001906554799020544, + "loss": 0.0879, + "step": 14903 + }, + { + "epoch": 0.12937387696287359, + "grad_norm": 0.92578125, + "learning_rate": 0.0019065415927742926, + "loss": 0.1611, + "step": 14904 + }, + { + "epoch": 0.12938255744307775, + "grad_norm": 0.306640625, + "learning_rate": 0.0019065283856460199, + "loss": 0.1094, + "step": 14905 + }, + { + "epoch": 0.12939123792328192, + "grad_norm": 0.5703125, + "learning_rate": 0.0019065151776357394, + "loss": 0.1084, + "step": 14906 + }, + { + "epoch": 0.12939991840348608, + "grad_norm": 0.7265625, + "learning_rate": 0.0019065019687434664, + "loss": 0.1426, + "step": 14907 + }, + { + "epoch": 0.12940859888369025, + "grad_norm": 0.220703125, + "learning_rate": 0.0019064887589692148, + "loss": 0.1064, + "step": 14908 + }, + { + "epoch": 0.1294172793638944, + "grad_norm": 0.248046875, + "learning_rate": 0.0019064755483129989, + "loss": 0.104, + "step": 14909 + }, + { + "epoch": 0.12942595984409858, + "grad_norm": 0.19921875, + "learning_rate": 0.001906462336774834, + "loss": 0.1191, + "step": 14910 + }, + { + "epoch": 0.12943464032430274, + "grad_norm": 0.56640625, + "learning_rate": 0.0019064491243547335, + "loss": 0.1299, + "step": 14911 + }, + { + "epoch": 0.1294433208045069, + "grad_norm": 0.44140625, + "learning_rate": 0.0019064359110527128, + "loss": 0.1016, + "step": 14912 + }, + { + "epoch": 0.12945200128471107, + "grad_norm": 0.10107421875, + "learning_rate": 0.0019064226968687856, + "loss": 0.123, + "step": 14913 + }, + { + "epoch": 0.12946068176491524, + "grad_norm": 0.2353515625, + "learning_rate": 0.0019064094818029668, + "loss": 0.1084, + "step": 14914 + }, + { + "epoch": 0.1294693622451194, + "grad_norm": 0.333984375, + "learning_rate": 0.0019063962658552706, + "loss": 0.1113, + "step": 14915 + }, + { + "epoch": 0.12947804272532357, + "grad_norm": 0.25390625, + "learning_rate": 0.0019063830490257117, + "loss": 0.1201, + "step": 14916 + }, + { + "epoch": 0.12948672320552773, + "grad_norm": 0.57421875, + "learning_rate": 0.0019063698313143043, + "loss": 0.0859, + "step": 14917 + }, + { + "epoch": 0.1294954036857319, + "grad_norm": 0.392578125, + "learning_rate": 0.0019063566127210632, + "loss": 0.125, + "step": 14918 + }, + { + "epoch": 0.12950408416593606, + "grad_norm": 1.921875, + "learning_rate": 0.0019063433932460024, + "loss": 0.1172, + "step": 14919 + }, + { + "epoch": 0.12951276464614023, + "grad_norm": 0.11962890625, + "learning_rate": 0.0019063301728891365, + "loss": 0.1289, + "step": 14920 + }, + { + "epoch": 0.1295214451263444, + "grad_norm": 0.7578125, + "learning_rate": 0.0019063169516504801, + "loss": 0.1152, + "step": 14921 + }, + { + "epoch": 0.12953012560654856, + "grad_norm": 0.375, + "learning_rate": 0.0019063037295300475, + "loss": 0.0791, + "step": 14922 + }, + { + "epoch": 0.12953880608675272, + "grad_norm": 0.1044921875, + 
"learning_rate": 0.0019062905065278537, + "loss": 0.0825, + "step": 14923 + }, + { + "epoch": 0.1295474865669569, + "grad_norm": 0.18359375, + "learning_rate": 0.0019062772826439122, + "loss": 0.0908, + "step": 14924 + }, + { + "epoch": 0.12955616704716105, + "grad_norm": 0.6015625, + "learning_rate": 0.0019062640578782384, + "loss": 0.1279, + "step": 14925 + }, + { + "epoch": 0.12956484752736522, + "grad_norm": 0.3046875, + "learning_rate": 0.0019062508322308463, + "loss": 0.1533, + "step": 14926 + }, + { + "epoch": 0.12957352800756938, + "grad_norm": 0.70703125, + "learning_rate": 0.0019062376057017501, + "loss": 0.1006, + "step": 14927 + }, + { + "epoch": 0.12958220848777355, + "grad_norm": 0.423828125, + "learning_rate": 0.0019062243782909647, + "loss": 0.0894, + "step": 14928 + }, + { + "epoch": 0.1295908889679777, + "grad_norm": 0.130859375, + "learning_rate": 0.0019062111499985044, + "loss": 0.1133, + "step": 14929 + }, + { + "epoch": 0.12959956944818188, + "grad_norm": 0.3984375, + "learning_rate": 0.0019061979208243839, + "loss": 0.1133, + "step": 14930 + }, + { + "epoch": 0.12960824992838604, + "grad_norm": 0.486328125, + "learning_rate": 0.0019061846907686174, + "loss": 0.209, + "step": 14931 + }, + { + "epoch": 0.1296169304085902, + "grad_norm": 0.330078125, + "learning_rate": 0.0019061714598312192, + "loss": 0.0996, + "step": 14932 + }, + { + "epoch": 0.12962561088879437, + "grad_norm": 0.1611328125, + "learning_rate": 0.0019061582280122042, + "loss": 0.1079, + "step": 14933 + }, + { + "epoch": 0.12963429136899854, + "grad_norm": 0.26953125, + "learning_rate": 0.0019061449953115866, + "loss": 0.1055, + "step": 14934 + }, + { + "epoch": 0.1296429718492027, + "grad_norm": 0.07421875, + "learning_rate": 0.0019061317617293809, + "loss": 0.1094, + "step": 14935 + }, + { + "epoch": 0.12965165232940687, + "grad_norm": 0.197265625, + "learning_rate": 0.0019061185272656017, + "loss": 0.1465, + "step": 14936 + }, + { + "epoch": 0.12966033280961103, + "grad_norm": 0.5546875, + "learning_rate": 0.0019061052919202635, + "loss": 0.1484, + "step": 14937 + }, + { + "epoch": 0.1296690132898152, + "grad_norm": 0.447265625, + "learning_rate": 0.0019060920556933808, + "loss": 0.1465, + "step": 14938 + }, + { + "epoch": 0.12967769377001936, + "grad_norm": 0.09814453125, + "learning_rate": 0.0019060788185849672, + "loss": 0.0898, + "step": 14939 + }, + { + "epoch": 0.12968637425022353, + "grad_norm": 0.30859375, + "learning_rate": 0.0019060655805950386, + "loss": 0.1216, + "step": 14940 + }, + { + "epoch": 0.1296950547304277, + "grad_norm": 0.08642578125, + "learning_rate": 0.0019060523417236087, + "loss": 0.085, + "step": 14941 + }, + { + "epoch": 0.12970373521063186, + "grad_norm": 0.4375, + "learning_rate": 0.0019060391019706919, + "loss": 0.0967, + "step": 14942 + }, + { + "epoch": 0.12971241569083602, + "grad_norm": 0.275390625, + "learning_rate": 0.001906025861336303, + "loss": 0.1172, + "step": 14943 + }, + { + "epoch": 0.1297210961710402, + "grad_norm": 0.05810546875, + "learning_rate": 0.0019060126198204561, + "loss": 0.0908, + "step": 14944 + }, + { + "epoch": 0.12972977665124436, + "grad_norm": 0.35546875, + "learning_rate": 0.001905999377423166, + "loss": 0.126, + "step": 14945 + }, + { + "epoch": 0.12973845713144852, + "grad_norm": 0.3828125, + "learning_rate": 0.0019059861341444472, + "loss": 0.1445, + "step": 14946 + }, + { + "epoch": 0.12974713761165269, + "grad_norm": 0.11474609375, + "learning_rate": 0.0019059728899843142, + "loss": 0.1162, + "step": 14947 + }, + { + "epoch": 
0.12975581809185685, + "grad_norm": 0.234375, + "learning_rate": 0.0019059596449427809, + "loss": 0.1396, + "step": 14948 + }, + { + "epoch": 0.12976449857206102, + "grad_norm": 0.10107421875, + "learning_rate": 0.0019059463990198624, + "loss": 0.1348, + "step": 14949 + }, + { + "epoch": 0.12977317905226518, + "grad_norm": 0.1474609375, + "learning_rate": 0.0019059331522155733, + "loss": 0.1016, + "step": 14950 + }, + { + "epoch": 0.12978185953246935, + "grad_norm": 0.1923828125, + "learning_rate": 0.0019059199045299278, + "loss": 0.1162, + "step": 14951 + }, + { + "epoch": 0.1297905400126735, + "grad_norm": 0.59765625, + "learning_rate": 0.0019059066559629404, + "loss": 0.0977, + "step": 14952 + }, + { + "epoch": 0.12979922049287768, + "grad_norm": 0.251953125, + "learning_rate": 0.0019058934065146254, + "loss": 0.1064, + "step": 14953 + }, + { + "epoch": 0.12980790097308184, + "grad_norm": 0.07470703125, + "learning_rate": 0.0019058801561849974, + "loss": 0.127, + "step": 14954 + }, + { + "epoch": 0.129816581453286, + "grad_norm": 0.474609375, + "learning_rate": 0.0019058669049740711, + "loss": 0.1289, + "step": 14955 + }, + { + "epoch": 0.12982526193349017, + "grad_norm": 0.73046875, + "learning_rate": 0.0019058536528818609, + "loss": 0.1396, + "step": 14956 + }, + { + "epoch": 0.12983394241369434, + "grad_norm": 0.25, + "learning_rate": 0.0019058403999083812, + "loss": 0.1123, + "step": 14957 + }, + { + "epoch": 0.1298426228938985, + "grad_norm": 0.1708984375, + "learning_rate": 0.0019058271460536465, + "loss": 0.0908, + "step": 14958 + }, + { + "epoch": 0.12985130337410267, + "grad_norm": 1.515625, + "learning_rate": 0.0019058138913176715, + "loss": 0.3477, + "step": 14959 + }, + { + "epoch": 0.12985998385430683, + "grad_norm": 0.1982421875, + "learning_rate": 0.0019058006357004702, + "loss": 0.1035, + "step": 14960 + }, + { + "epoch": 0.129868664334511, + "grad_norm": 0.11328125, + "learning_rate": 0.0019057873792020577, + "loss": 0.0771, + "step": 14961 + }, + { + "epoch": 0.12987734481471516, + "grad_norm": 0.1103515625, + "learning_rate": 0.0019057741218224484, + "loss": 0.1143, + "step": 14962 + }, + { + "epoch": 0.12988602529491933, + "grad_norm": 0.1875, + "learning_rate": 0.0019057608635616563, + "loss": 0.1064, + "step": 14963 + }, + { + "epoch": 0.1298947057751235, + "grad_norm": 0.41015625, + "learning_rate": 0.0019057476044196963, + "loss": 0.1289, + "step": 14964 + }, + { + "epoch": 0.12990338625532766, + "grad_norm": 0.287109375, + "learning_rate": 0.001905734344396583, + "loss": 0.1406, + "step": 14965 + }, + { + "epoch": 0.12991206673553182, + "grad_norm": 0.103515625, + "learning_rate": 0.0019057210834923306, + "loss": 0.1289, + "step": 14966 + }, + { + "epoch": 0.129920747215736, + "grad_norm": 0.65234375, + "learning_rate": 0.0019057078217069538, + "loss": 0.1279, + "step": 14967 + }, + { + "epoch": 0.12992942769594015, + "grad_norm": 0.1484375, + "learning_rate": 0.0019056945590404668, + "loss": 0.1133, + "step": 14968 + }, + { + "epoch": 0.12993810817614432, + "grad_norm": 0.38671875, + "learning_rate": 0.0019056812954928845, + "loss": 0.1504, + "step": 14969 + }, + { + "epoch": 0.12994678865634846, + "grad_norm": 0.20703125, + "learning_rate": 0.0019056680310642212, + "loss": 0.1318, + "step": 14970 + }, + { + "epoch": 0.12995546913655262, + "grad_norm": 0.154296875, + "learning_rate": 0.0019056547657544914, + "loss": 0.1152, + "step": 14971 + }, + { + "epoch": 0.12996414961675679, + "grad_norm": 0.11279296875, + "learning_rate": 0.0019056414995637096, + "loss": 
0.1553, + "step": 14972 + }, + { + "epoch": 0.12997283009696095, + "grad_norm": 0.109375, + "learning_rate": 0.0019056282324918905, + "loss": 0.0938, + "step": 14973 + }, + { + "epoch": 0.12998151057716512, + "grad_norm": 0.1875, + "learning_rate": 0.0019056149645390486, + "loss": 0.0923, + "step": 14974 + }, + { + "epoch": 0.12999019105736928, + "grad_norm": 0.087890625, + "learning_rate": 0.0019056016957051984, + "loss": 0.1021, + "step": 14975 + }, + { + "epoch": 0.12999887153757345, + "grad_norm": 0.6796875, + "learning_rate": 0.001905588425990354, + "loss": 0.1045, + "step": 14976 + }, + { + "epoch": 0.1300075520177776, + "grad_norm": 0.61328125, + "learning_rate": 0.0019055751553945302, + "loss": 0.0942, + "step": 14977 + }, + { + "epoch": 0.13001623249798178, + "grad_norm": 0.349609375, + "learning_rate": 0.0019055618839177416, + "loss": 0.1777, + "step": 14978 + }, + { + "epoch": 0.13002491297818594, + "grad_norm": 0.09375, + "learning_rate": 0.0019055486115600027, + "loss": 0.1416, + "step": 14979 + }, + { + "epoch": 0.1300335934583901, + "grad_norm": 0.11181640625, + "learning_rate": 0.0019055353383213276, + "loss": 0.0869, + "step": 14980 + }, + { + "epoch": 0.13004227393859427, + "grad_norm": 0.70703125, + "learning_rate": 0.0019055220642017312, + "loss": 0.1055, + "step": 14981 + }, + { + "epoch": 0.13005095441879844, + "grad_norm": 0.1513671875, + "learning_rate": 0.0019055087892012284, + "loss": 0.1504, + "step": 14982 + }, + { + "epoch": 0.1300596348990026, + "grad_norm": 0.08837890625, + "learning_rate": 0.0019054955133198331, + "loss": 0.1104, + "step": 14983 + }, + { + "epoch": 0.13006831537920677, + "grad_norm": 0.2236328125, + "learning_rate": 0.00190548223655756, + "loss": 0.1035, + "step": 14984 + }, + { + "epoch": 0.13007699585941093, + "grad_norm": 0.08642578125, + "learning_rate": 0.0019054689589144233, + "loss": 0.0938, + "step": 14985 + }, + { + "epoch": 0.1300856763396151, + "grad_norm": 0.16015625, + "learning_rate": 0.0019054556803904381, + "loss": 0.0957, + "step": 14986 + }, + { + "epoch": 0.13009435681981926, + "grad_norm": 0.09765625, + "learning_rate": 0.0019054424009856189, + "loss": 0.1133, + "step": 14987 + }, + { + "epoch": 0.13010303730002343, + "grad_norm": 1.0078125, + "learning_rate": 0.0019054291206999797, + "loss": 0.1279, + "step": 14988 + }, + { + "epoch": 0.1301117177802276, + "grad_norm": 0.310546875, + "learning_rate": 0.0019054158395335355, + "loss": 0.0771, + "step": 14989 + }, + { + "epoch": 0.13012039826043176, + "grad_norm": 0.146484375, + "learning_rate": 0.0019054025574863004, + "loss": 0.0957, + "step": 14990 + }, + { + "epoch": 0.13012907874063592, + "grad_norm": 0.197265625, + "learning_rate": 0.0019053892745582895, + "loss": 0.0879, + "step": 14991 + }, + { + "epoch": 0.1301377592208401, + "grad_norm": 0.130859375, + "learning_rate": 0.0019053759907495167, + "loss": 0.0923, + "step": 14992 + }, + { + "epoch": 0.13014643970104425, + "grad_norm": 0.7421875, + "learning_rate": 0.0019053627060599968, + "loss": 0.3535, + "step": 14993 + }, + { + "epoch": 0.13015512018124842, + "grad_norm": 0.55078125, + "learning_rate": 0.0019053494204897446, + "loss": 0.084, + "step": 14994 + }, + { + "epoch": 0.13016380066145258, + "grad_norm": 0.255859375, + "learning_rate": 0.0019053361340387743, + "loss": 0.0942, + "step": 14995 + }, + { + "epoch": 0.13017248114165675, + "grad_norm": 0.2314453125, + "learning_rate": 0.0019053228467071001, + "loss": 0.1143, + "step": 14996 + }, + { + "epoch": 0.1301811616218609, + "grad_norm": 0.29296875, + 
"learning_rate": 0.0019053095584947374, + "loss": 0.1221, + "step": 14997 + }, + { + "epoch": 0.13018984210206508, + "grad_norm": 0.2275390625, + "learning_rate": 0.0019052962694017003, + "loss": 0.1064, + "step": 14998 + }, + { + "epoch": 0.13019852258226924, + "grad_norm": 0.125, + "learning_rate": 0.0019052829794280028, + "loss": 0.1221, + "step": 14999 + }, + { + "epoch": 0.1302072030624734, + "grad_norm": 0.263671875, + "learning_rate": 0.0019052696885736608, + "loss": 0.1206, + "step": 15000 + }, + { + "epoch": 0.13021588354267757, + "grad_norm": 0.109375, + "learning_rate": 0.001905256396838687, + "loss": 0.0762, + "step": 15001 + }, + { + "epoch": 0.13022456402288174, + "grad_norm": 0.2138671875, + "learning_rate": 0.0019052431042230976, + "loss": 0.1445, + "step": 15002 + }, + { + "epoch": 0.1302332445030859, + "grad_norm": 0.10009765625, + "learning_rate": 0.0019052298107269062, + "loss": 0.1128, + "step": 15003 + }, + { + "epoch": 0.13024192498329007, + "grad_norm": 0.19140625, + "learning_rate": 0.0019052165163501273, + "loss": 0.0947, + "step": 15004 + }, + { + "epoch": 0.13025060546349423, + "grad_norm": 0.2578125, + "learning_rate": 0.001905203221092776, + "loss": 0.1445, + "step": 15005 + }, + { + "epoch": 0.1302592859436984, + "grad_norm": 0.41796875, + "learning_rate": 0.0019051899249548667, + "loss": 0.0854, + "step": 15006 + }, + { + "epoch": 0.13026796642390256, + "grad_norm": 0.73046875, + "learning_rate": 0.0019051766279364135, + "loss": 0.1562, + "step": 15007 + }, + { + "epoch": 0.13027664690410673, + "grad_norm": 0.400390625, + "learning_rate": 0.0019051633300374313, + "loss": 0.1211, + "step": 15008 + }, + { + "epoch": 0.1302853273843109, + "grad_norm": 0.703125, + "learning_rate": 0.0019051500312579348, + "loss": 0.1523, + "step": 15009 + }, + { + "epoch": 0.13029400786451506, + "grad_norm": 0.1181640625, + "learning_rate": 0.0019051367315979382, + "loss": 0.1719, + "step": 15010 + }, + { + "epoch": 0.13030268834471923, + "grad_norm": 0.470703125, + "learning_rate": 0.0019051234310574561, + "loss": 0.1133, + "step": 15011 + }, + { + "epoch": 0.1303113688249234, + "grad_norm": 0.15234375, + "learning_rate": 0.001905110129636503, + "loss": 0.1084, + "step": 15012 + }, + { + "epoch": 0.13032004930512756, + "grad_norm": 0.18359375, + "learning_rate": 0.0019050968273350939, + "loss": 0.1045, + "step": 15013 + }, + { + "epoch": 0.13032872978533172, + "grad_norm": 0.140625, + "learning_rate": 0.0019050835241532426, + "loss": 0.1172, + "step": 15014 + }, + { + "epoch": 0.13033741026553589, + "grad_norm": 0.87109375, + "learning_rate": 0.0019050702200909644, + "loss": 0.2773, + "step": 15015 + }, + { + "epoch": 0.13034609074574005, + "grad_norm": 0.10595703125, + "learning_rate": 0.0019050569151482736, + "loss": 0.1338, + "step": 15016 + }, + { + "epoch": 0.13035477122594422, + "grad_norm": 0.51953125, + "learning_rate": 0.0019050436093251842, + "loss": 0.1543, + "step": 15017 + }, + { + "epoch": 0.13036345170614838, + "grad_norm": 0.5546875, + "learning_rate": 0.0019050303026217116, + "loss": 0.104, + "step": 15018 + }, + { + "epoch": 0.13037213218635255, + "grad_norm": 1.59375, + "learning_rate": 0.0019050169950378696, + "loss": 0.1094, + "step": 15019 + }, + { + "epoch": 0.1303808126665567, + "grad_norm": 0.294921875, + "learning_rate": 0.0019050036865736733, + "loss": 0.1484, + "step": 15020 + }, + { + "epoch": 0.13038949314676088, + "grad_norm": 0.1044921875, + "learning_rate": 0.001904990377229137, + "loss": 0.1133, + "step": 15021 + }, + { + "epoch": 
0.13039817362696504, + "grad_norm": 0.169921875, + "learning_rate": 0.0019049770670042756, + "loss": 0.127, + "step": 15022 + }, + { + "epoch": 0.1304068541071692, + "grad_norm": 0.12451171875, + "learning_rate": 0.001904963755899103, + "loss": 0.1592, + "step": 15023 + }, + { + "epoch": 0.13041553458737337, + "grad_norm": 0.67578125, + "learning_rate": 0.0019049504439136342, + "loss": 0.1318, + "step": 15024 + }, + { + "epoch": 0.13042421506757754, + "grad_norm": 0.2060546875, + "learning_rate": 0.0019049371310478837, + "loss": 0.1104, + "step": 15025 + }, + { + "epoch": 0.1304328955477817, + "grad_norm": 0.2109375, + "learning_rate": 0.0019049238173018661, + "loss": 0.1523, + "step": 15026 + }, + { + "epoch": 0.13044157602798587, + "grad_norm": 0.2294921875, + "learning_rate": 0.0019049105026755961, + "loss": 0.1719, + "step": 15027 + }, + { + "epoch": 0.13045025650819003, + "grad_norm": 0.3359375, + "learning_rate": 0.001904897187169088, + "loss": 0.0986, + "step": 15028 + }, + { + "epoch": 0.1304589369883942, + "grad_norm": 0.287109375, + "learning_rate": 0.0019048838707823562, + "loss": 0.0913, + "step": 15029 + }, + { + "epoch": 0.13046761746859836, + "grad_norm": 0.1416015625, + "learning_rate": 0.0019048705535154157, + "loss": 0.1631, + "step": 15030 + }, + { + "epoch": 0.13047629794880253, + "grad_norm": 0.29296875, + "learning_rate": 0.0019048572353682807, + "loss": 0.1279, + "step": 15031 + }, + { + "epoch": 0.1304849784290067, + "grad_norm": 0.15625, + "learning_rate": 0.001904843916340966, + "loss": 0.1426, + "step": 15032 + }, + { + "epoch": 0.13049365890921086, + "grad_norm": 0.2314453125, + "learning_rate": 0.001904830596433486, + "loss": 0.1445, + "step": 15033 + }, + { + "epoch": 0.13050233938941502, + "grad_norm": 0.10791015625, + "learning_rate": 0.0019048172756458556, + "loss": 0.1182, + "step": 15034 + }, + { + "epoch": 0.1305110198696192, + "grad_norm": 0.2392578125, + "learning_rate": 0.0019048039539780889, + "loss": 0.1128, + "step": 15035 + }, + { + "epoch": 0.13051970034982335, + "grad_norm": 0.11962890625, + "learning_rate": 0.0019047906314302008, + "loss": 0.1299, + "step": 15036 + }, + { + "epoch": 0.13052838083002752, + "grad_norm": 0.251953125, + "learning_rate": 0.0019047773080022056, + "loss": 0.1084, + "step": 15037 + }, + { + "epoch": 0.13053706131023168, + "grad_norm": 0.271484375, + "learning_rate": 0.0019047639836941182, + "loss": 0.1035, + "step": 15038 + }, + { + "epoch": 0.13054574179043585, + "grad_norm": 0.4609375, + "learning_rate": 0.0019047506585059529, + "loss": 0.1045, + "step": 15039 + }, + { + "epoch": 0.13055442227064, + "grad_norm": 0.48828125, + "learning_rate": 0.0019047373324377244, + "loss": 0.1318, + "step": 15040 + }, + { + "epoch": 0.13056310275084418, + "grad_norm": 0.1318359375, + "learning_rate": 0.001904724005489447, + "loss": 0.1113, + "step": 15041 + }, + { + "epoch": 0.13057178323104834, + "grad_norm": 0.45703125, + "learning_rate": 0.001904710677661136, + "loss": 0.1128, + "step": 15042 + }, + { + "epoch": 0.1305804637112525, + "grad_norm": 0.53515625, + "learning_rate": 0.001904697348952805, + "loss": 0.0977, + "step": 15043 + }, + { + "epoch": 0.13058914419145667, + "grad_norm": 0.9765625, + "learning_rate": 0.0019046840193644695, + "loss": 0.2363, + "step": 15044 + }, + { + "epoch": 0.13059782467166084, + "grad_norm": 0.2734375, + "learning_rate": 0.0019046706888961435, + "loss": 0.1191, + "step": 15045 + }, + { + "epoch": 0.130606505151865, + "grad_norm": 0.298828125, + "learning_rate": 0.0019046573575478417, + "loss": 
0.1143, + "step": 15046 + }, + { + "epoch": 0.13061518563206917, + "grad_norm": 0.515625, + "learning_rate": 0.0019046440253195785, + "loss": 0.1025, + "step": 15047 + }, + { + "epoch": 0.13062386611227333, + "grad_norm": 0.224609375, + "learning_rate": 0.001904630692211369, + "loss": 0.1182, + "step": 15048 + }, + { + "epoch": 0.1306325465924775, + "grad_norm": 0.49609375, + "learning_rate": 0.0019046173582232273, + "loss": 0.1172, + "step": 15049 + }, + { + "epoch": 0.13064122707268166, + "grad_norm": 0.267578125, + "learning_rate": 0.0019046040233551686, + "loss": 0.1299, + "step": 15050 + }, + { + "epoch": 0.13064990755288583, + "grad_norm": 0.416015625, + "learning_rate": 0.0019045906876072063, + "loss": 0.1191, + "step": 15051 + }, + { + "epoch": 0.13065858803309, + "grad_norm": 0.3515625, + "learning_rate": 0.001904577350979356, + "loss": 0.1064, + "step": 15052 + }, + { + "epoch": 0.13066726851329416, + "grad_norm": 0.7578125, + "learning_rate": 0.0019045640134716323, + "loss": 0.1758, + "step": 15053 + }, + { + "epoch": 0.13067594899349833, + "grad_norm": 0.123046875, + "learning_rate": 0.0019045506750840489, + "loss": 0.126, + "step": 15054 + }, + { + "epoch": 0.1306846294737025, + "grad_norm": 0.25, + "learning_rate": 0.0019045373358166215, + "loss": 0.1016, + "step": 15055 + }, + { + "epoch": 0.13069330995390666, + "grad_norm": 0.1328125, + "learning_rate": 0.0019045239956693638, + "loss": 0.1113, + "step": 15056 + }, + { + "epoch": 0.13070199043411082, + "grad_norm": 0.333984375, + "learning_rate": 0.0019045106546422908, + "loss": 0.1133, + "step": 15057 + }, + { + "epoch": 0.13071067091431499, + "grad_norm": 0.474609375, + "learning_rate": 0.001904497312735417, + "loss": 0.1416, + "step": 15058 + }, + { + "epoch": 0.13071935139451915, + "grad_norm": 0.1357421875, + "learning_rate": 0.0019044839699487574, + "loss": 0.1279, + "step": 15059 + }, + { + "epoch": 0.13072803187472332, + "grad_norm": 0.09521484375, + "learning_rate": 0.0019044706262823258, + "loss": 0.1133, + "step": 15060 + }, + { + "epoch": 0.13073671235492748, + "grad_norm": 0.162109375, + "learning_rate": 0.0019044572817361375, + "loss": 0.0747, + "step": 15061 + }, + { + "epoch": 0.13074539283513165, + "grad_norm": 0.48828125, + "learning_rate": 0.0019044439363102065, + "loss": 0.084, + "step": 15062 + }, + { + "epoch": 0.1307540733153358, + "grad_norm": 0.9765625, + "learning_rate": 0.001904430590004548, + "loss": 0.1328, + "step": 15063 + }, + { + "epoch": 0.13076275379553998, + "grad_norm": 0.154296875, + "learning_rate": 0.0019044172428191763, + "loss": 0.1069, + "step": 15064 + }, + { + "epoch": 0.13077143427574414, + "grad_norm": 0.1982421875, + "learning_rate": 0.0019044038947541055, + "loss": 0.1001, + "step": 15065 + }, + { + "epoch": 0.1307801147559483, + "grad_norm": 0.134765625, + "learning_rate": 0.0019043905458093514, + "loss": 0.1309, + "step": 15066 + }, + { + "epoch": 0.13078879523615247, + "grad_norm": 0.083984375, + "learning_rate": 0.0019043771959849274, + "loss": 0.1016, + "step": 15067 + }, + { + "epoch": 0.13079747571635664, + "grad_norm": 0.328125, + "learning_rate": 0.0019043638452808486, + "loss": 0.0986, + "step": 15068 + }, + { + "epoch": 0.1308061561965608, + "grad_norm": 0.09033203125, + "learning_rate": 0.0019043504936971296, + "loss": 0.0981, + "step": 15069 + }, + { + "epoch": 0.13081483667676497, + "grad_norm": 0.08251953125, + "learning_rate": 0.0019043371412337854, + "loss": 0.0947, + "step": 15070 + }, + { + "epoch": 0.13082351715696913, + "grad_norm": 0.451171875, + 
"learning_rate": 0.0019043237878908297, + "loss": 0.1182, + "step": 15071 + }, + { + "epoch": 0.1308321976371733, + "grad_norm": 0.4765625, + "learning_rate": 0.001904310433668278, + "loss": 0.1279, + "step": 15072 + }, + { + "epoch": 0.13084087811737746, + "grad_norm": 0.25390625, + "learning_rate": 0.0019042970785661441, + "loss": 0.1416, + "step": 15073 + }, + { + "epoch": 0.13084955859758163, + "grad_norm": 0.578125, + "learning_rate": 0.0019042837225844431, + "loss": 0.1123, + "step": 15074 + }, + { + "epoch": 0.1308582390777858, + "grad_norm": 0.30859375, + "learning_rate": 0.0019042703657231896, + "loss": 0.1494, + "step": 15075 + }, + { + "epoch": 0.13086691955798996, + "grad_norm": 0.1103515625, + "learning_rate": 0.001904257007982398, + "loss": 0.1201, + "step": 15076 + }, + { + "epoch": 0.13087560003819412, + "grad_norm": 0.21484375, + "learning_rate": 0.0019042436493620831, + "loss": 0.0986, + "step": 15077 + }, + { + "epoch": 0.1308842805183983, + "grad_norm": 0.3515625, + "learning_rate": 0.0019042302898622594, + "loss": 0.084, + "step": 15078 + }, + { + "epoch": 0.13089296099860245, + "grad_norm": 0.09716796875, + "learning_rate": 0.0019042169294829416, + "loss": 0.0903, + "step": 15079 + }, + { + "epoch": 0.13090164147880662, + "grad_norm": 0.34375, + "learning_rate": 0.0019042035682241443, + "loss": 0.1113, + "step": 15080 + }, + { + "epoch": 0.13091032195901078, + "grad_norm": 0.26171875, + "learning_rate": 0.0019041902060858817, + "loss": 0.0762, + "step": 15081 + }, + { + "epoch": 0.13091900243921495, + "grad_norm": 0.125, + "learning_rate": 0.0019041768430681694, + "loss": 0.1084, + "step": 15082 + }, + { + "epoch": 0.1309276829194191, + "grad_norm": 0.83203125, + "learning_rate": 0.001904163479171021, + "loss": 0.1011, + "step": 15083 + }, + { + "epoch": 0.13093636339962328, + "grad_norm": 0.4375, + "learning_rate": 0.0019041501143944514, + "loss": 0.1514, + "step": 15084 + }, + { + "epoch": 0.13094504387982744, + "grad_norm": 0.1025390625, + "learning_rate": 0.0019041367487384756, + "loss": 0.0625, + "step": 15085 + }, + { + "epoch": 0.1309537243600316, + "grad_norm": 0.302734375, + "learning_rate": 0.0019041233822031079, + "loss": 0.1367, + "step": 15086 + }, + { + "epoch": 0.13096240484023577, + "grad_norm": 0.8203125, + "learning_rate": 0.0019041100147883627, + "loss": 0.1416, + "step": 15087 + }, + { + "epoch": 0.13097108532043994, + "grad_norm": 0.328125, + "learning_rate": 0.0019040966464942551, + "loss": 0.1328, + "step": 15088 + }, + { + "epoch": 0.1309797658006441, + "grad_norm": 0.2431640625, + "learning_rate": 0.0019040832773207991, + "loss": 0.1045, + "step": 15089 + }, + { + "epoch": 0.13098844628084827, + "grad_norm": 0.26171875, + "learning_rate": 0.00190406990726801, + "loss": 0.123, + "step": 15090 + }, + { + "epoch": 0.13099712676105243, + "grad_norm": 0.09521484375, + "learning_rate": 0.0019040565363359024, + "loss": 0.127, + "step": 15091 + }, + { + "epoch": 0.1310058072412566, + "grad_norm": 0.59375, + "learning_rate": 0.0019040431645244902, + "loss": 0.1035, + "step": 15092 + }, + { + "epoch": 0.13101448772146077, + "grad_norm": 0.2197265625, + "learning_rate": 0.0019040297918337889, + "loss": 0.1025, + "step": 15093 + }, + { + "epoch": 0.1310231682016649, + "grad_norm": 0.09033203125, + "learning_rate": 0.0019040164182638122, + "loss": 0.1172, + "step": 15094 + }, + { + "epoch": 0.13103184868186907, + "grad_norm": 0.337890625, + "learning_rate": 0.0019040030438145757, + "loss": 0.1094, + "step": 15095 + }, + { + "epoch": 0.13104052916207323, + 
"grad_norm": 0.2333984375, + "learning_rate": 0.0019039896684860931, + "loss": 0.1221, + "step": 15096 + }, + { + "epoch": 0.1310492096422774, + "grad_norm": 0.2255859375, + "learning_rate": 0.0019039762922783798, + "loss": 0.1123, + "step": 15097 + }, + { + "epoch": 0.13105789012248156, + "grad_norm": 0.22265625, + "learning_rate": 0.0019039629151914501, + "loss": 0.1201, + "step": 15098 + }, + { + "epoch": 0.13106657060268573, + "grad_norm": 0.2099609375, + "learning_rate": 0.0019039495372253185, + "loss": 0.1211, + "step": 15099 + }, + { + "epoch": 0.1310752510828899, + "grad_norm": 0.9765625, + "learning_rate": 0.0019039361583799996, + "loss": 0.0933, + "step": 15100 + }, + { + "epoch": 0.13108393156309406, + "grad_norm": 0.08447265625, + "learning_rate": 0.0019039227786555087, + "loss": 0.1064, + "step": 15101 + }, + { + "epoch": 0.13109261204329822, + "grad_norm": 0.34375, + "learning_rate": 0.0019039093980518596, + "loss": 0.1035, + "step": 15102 + }, + { + "epoch": 0.1311012925235024, + "grad_norm": 0.416015625, + "learning_rate": 0.0019038960165690673, + "loss": 0.1035, + "step": 15103 + }, + { + "epoch": 0.13110997300370655, + "grad_norm": 0.18359375, + "learning_rate": 0.0019038826342071464, + "loss": 0.1099, + "step": 15104 + }, + { + "epoch": 0.13111865348391072, + "grad_norm": 0.1025390625, + "learning_rate": 0.0019038692509661117, + "loss": 0.1377, + "step": 15105 + }, + { + "epoch": 0.13112733396411488, + "grad_norm": 0.423828125, + "learning_rate": 0.0019038558668459775, + "loss": 0.1055, + "step": 15106 + }, + { + "epoch": 0.13113601444431905, + "grad_norm": 0.142578125, + "learning_rate": 0.0019038424818467587, + "loss": 0.0791, + "step": 15107 + }, + { + "epoch": 0.13114469492452321, + "grad_norm": 0.21484375, + "learning_rate": 0.0019038290959684694, + "loss": 0.1289, + "step": 15108 + }, + { + "epoch": 0.13115337540472738, + "grad_norm": 0.3125, + "learning_rate": 0.0019038157092111253, + "loss": 0.0967, + "step": 15109 + }, + { + "epoch": 0.13116205588493154, + "grad_norm": 0.478515625, + "learning_rate": 0.00190380232157474, + "loss": 0.1406, + "step": 15110 + }, + { + "epoch": 0.1311707363651357, + "grad_norm": 0.0859375, + "learning_rate": 0.0019037889330593286, + "loss": 0.1377, + "step": 15111 + }, + { + "epoch": 0.13117941684533987, + "grad_norm": 0.2734375, + "learning_rate": 0.0019037755436649058, + "loss": 0.1299, + "step": 15112 + }, + { + "epoch": 0.13118809732554404, + "grad_norm": 0.3515625, + "learning_rate": 0.0019037621533914862, + "loss": 0.1133, + "step": 15113 + }, + { + "epoch": 0.1311967778057482, + "grad_norm": 0.6640625, + "learning_rate": 0.0019037487622390844, + "loss": 0.0967, + "step": 15114 + }, + { + "epoch": 0.13120545828595237, + "grad_norm": 0.330078125, + "learning_rate": 0.001903735370207715, + "loss": 0.0771, + "step": 15115 + }, + { + "epoch": 0.13121413876615654, + "grad_norm": 1.046875, + "learning_rate": 0.0019037219772973927, + "loss": 0.1377, + "step": 15116 + }, + { + "epoch": 0.1312228192463607, + "grad_norm": 0.2412109375, + "learning_rate": 0.0019037085835081318, + "loss": 0.124, + "step": 15117 + }, + { + "epoch": 0.13123149972656487, + "grad_norm": 0.41796875, + "learning_rate": 0.0019036951888399473, + "loss": 0.1484, + "step": 15118 + }, + { + "epoch": 0.13124018020676903, + "grad_norm": 0.408203125, + "learning_rate": 0.0019036817932928543, + "loss": 0.1455, + "step": 15119 + }, + { + "epoch": 0.1312488606869732, + "grad_norm": 0.296875, + "learning_rate": 0.0019036683968668667, + "loss": 0.1143, + "step": 15120 + }, 
+ { + "epoch": 0.13125754116717736, + "grad_norm": 0.060302734375, + "learning_rate": 0.0019036549995619996, + "loss": 0.1016, + "step": 15121 + }, + { + "epoch": 0.13126622164738153, + "grad_norm": 0.11669921875, + "learning_rate": 0.001903641601378267, + "loss": 0.0952, + "step": 15122 + }, + { + "epoch": 0.1312749021275857, + "grad_norm": 0.2138671875, + "learning_rate": 0.0019036282023156843, + "loss": 0.0762, + "step": 15123 + }, + { + "epoch": 0.13128358260778986, + "grad_norm": 0.37109375, + "learning_rate": 0.0019036148023742659, + "loss": 0.1602, + "step": 15124 + }, + { + "epoch": 0.13129226308799402, + "grad_norm": 0.5078125, + "learning_rate": 0.0019036014015540263, + "loss": 0.125, + "step": 15125 + }, + { + "epoch": 0.1313009435681982, + "grad_norm": 0.3046875, + "learning_rate": 0.0019035879998549805, + "loss": 0.1191, + "step": 15126 + }, + { + "epoch": 0.13130962404840235, + "grad_norm": 0.33984375, + "learning_rate": 0.0019035745972771428, + "loss": 0.127, + "step": 15127 + }, + { + "epoch": 0.13131830452860652, + "grad_norm": 0.64453125, + "learning_rate": 0.0019035611938205282, + "loss": 0.3496, + "step": 15128 + }, + { + "epoch": 0.13132698500881068, + "grad_norm": 0.0927734375, + "learning_rate": 0.0019035477894851509, + "loss": 0.1309, + "step": 15129 + }, + { + "epoch": 0.13133566548901485, + "grad_norm": 1.015625, + "learning_rate": 0.0019035343842710257, + "loss": 0.1758, + "step": 15130 + }, + { + "epoch": 0.131344345969219, + "grad_norm": 0.0986328125, + "learning_rate": 0.0019035209781781675, + "loss": 0.126, + "step": 15131 + }, + { + "epoch": 0.13135302644942318, + "grad_norm": 0.2109375, + "learning_rate": 0.001903507571206591, + "loss": 0.1914, + "step": 15132 + }, + { + "epoch": 0.13136170692962734, + "grad_norm": 0.9140625, + "learning_rate": 0.0019034941633563106, + "loss": 0.1055, + "step": 15133 + }, + { + "epoch": 0.1313703874098315, + "grad_norm": 0.8125, + "learning_rate": 0.001903480754627341, + "loss": 0.1562, + "step": 15134 + }, + { + "epoch": 0.13137906789003567, + "grad_norm": 0.189453125, + "learning_rate": 0.0019034673450196969, + "loss": 0.1328, + "step": 15135 + }, + { + "epoch": 0.13138774837023984, + "grad_norm": 0.89453125, + "learning_rate": 0.001903453934533393, + "loss": 0.0962, + "step": 15136 + }, + { + "epoch": 0.131396428850444, + "grad_norm": 0.1357421875, + "learning_rate": 0.0019034405231684443, + "loss": 0.1318, + "step": 15137 + }, + { + "epoch": 0.13140510933064817, + "grad_norm": 0.7734375, + "learning_rate": 0.0019034271109248646, + "loss": 0.1445, + "step": 15138 + }, + { + "epoch": 0.13141378981085233, + "grad_norm": 0.1259765625, + "learning_rate": 0.0019034136978026692, + "loss": 0.1211, + "step": 15139 + }, + { + "epoch": 0.1314224702910565, + "grad_norm": 0.2080078125, + "learning_rate": 0.0019034002838018726, + "loss": 0.1128, + "step": 15140 + }, + { + "epoch": 0.13143115077126066, + "grad_norm": 0.318359375, + "learning_rate": 0.0019033868689224897, + "loss": 0.1855, + "step": 15141 + }, + { + "epoch": 0.13143983125146483, + "grad_norm": 0.345703125, + "learning_rate": 0.0019033734531645353, + "loss": 0.166, + "step": 15142 + }, + { + "epoch": 0.131448511731669, + "grad_norm": 0.3671875, + "learning_rate": 0.0019033600365280232, + "loss": 0.1211, + "step": 15143 + }, + { + "epoch": 0.13145719221187316, + "grad_norm": 0.5859375, + "learning_rate": 0.001903346619012969, + "loss": 0.1104, + "step": 15144 + }, + { + "epoch": 0.13146587269207732, + "grad_norm": 1.2421875, + "learning_rate": 0.0019033332006193867, + 
"loss": 0.1182, + "step": 15145 + }, + { + "epoch": 0.1314745531722815, + "grad_norm": 1.1484375, + "learning_rate": 0.0019033197813472914, + "loss": 0.1641, + "step": 15146 + }, + { + "epoch": 0.13148323365248565, + "grad_norm": 0.37109375, + "learning_rate": 0.0019033063611966978, + "loss": 0.1592, + "step": 15147 + }, + { + "epoch": 0.13149191413268982, + "grad_norm": 0.1591796875, + "learning_rate": 0.0019032929401676202, + "loss": 0.1113, + "step": 15148 + }, + { + "epoch": 0.13150059461289398, + "grad_norm": 0.34375, + "learning_rate": 0.0019032795182600734, + "loss": 0.0991, + "step": 15149 + }, + { + "epoch": 0.13150927509309815, + "grad_norm": 0.26171875, + "learning_rate": 0.0019032660954740727, + "loss": 0.1045, + "step": 15150 + }, + { + "epoch": 0.13151795557330231, + "grad_norm": 0.3125, + "learning_rate": 0.001903252671809632, + "loss": 0.1279, + "step": 15151 + }, + { + "epoch": 0.13152663605350648, + "grad_norm": 0.21875, + "learning_rate": 0.0019032392472667662, + "loss": 0.1309, + "step": 15152 + }, + { + "epoch": 0.13153531653371064, + "grad_norm": 0.78515625, + "learning_rate": 0.00190322582184549, + "loss": 0.1045, + "step": 15153 + }, + { + "epoch": 0.1315439970139148, + "grad_norm": 0.57421875, + "learning_rate": 0.0019032123955458182, + "loss": 0.127, + "step": 15154 + }, + { + "epoch": 0.13155267749411897, + "grad_norm": 0.439453125, + "learning_rate": 0.0019031989683677654, + "loss": 0.0996, + "step": 15155 + }, + { + "epoch": 0.13156135797432314, + "grad_norm": 0.390625, + "learning_rate": 0.001903185540311346, + "loss": 0.1221, + "step": 15156 + }, + { + "epoch": 0.1315700384545273, + "grad_norm": 0.470703125, + "learning_rate": 0.0019031721113765753, + "loss": 0.1328, + "step": 15157 + }, + { + "epoch": 0.13157871893473147, + "grad_norm": 0.49609375, + "learning_rate": 0.0019031586815634676, + "loss": 0.1221, + "step": 15158 + }, + { + "epoch": 0.13158739941493564, + "grad_norm": 0.2890625, + "learning_rate": 0.0019031452508720375, + "loss": 0.1289, + "step": 15159 + }, + { + "epoch": 0.1315960798951398, + "grad_norm": 0.140625, + "learning_rate": 0.0019031318193023, + "loss": 0.124, + "step": 15160 + }, + { + "epoch": 0.13160476037534397, + "grad_norm": 0.1787109375, + "learning_rate": 0.0019031183868542694, + "loss": 0.1094, + "step": 15161 + }, + { + "epoch": 0.13161344085554813, + "grad_norm": 0.76171875, + "learning_rate": 0.0019031049535279607, + "loss": 0.1348, + "step": 15162 + }, + { + "epoch": 0.1316221213357523, + "grad_norm": 0.09521484375, + "learning_rate": 0.0019030915193233887, + "loss": 0.1084, + "step": 15163 + }, + { + "epoch": 0.13163080181595646, + "grad_norm": 0.55078125, + "learning_rate": 0.0019030780842405675, + "loss": 0.1758, + "step": 15164 + }, + { + "epoch": 0.13163948229616063, + "grad_norm": 0.734375, + "learning_rate": 0.0019030646482795124, + "loss": 0.1201, + "step": 15165 + }, + { + "epoch": 0.1316481627763648, + "grad_norm": 0.419921875, + "learning_rate": 0.0019030512114402377, + "loss": 0.1523, + "step": 15166 + }, + { + "epoch": 0.13165684325656896, + "grad_norm": 0.1953125, + "learning_rate": 0.0019030377737227586, + "loss": 0.124, + "step": 15167 + }, + { + "epoch": 0.13166552373677312, + "grad_norm": 0.1416015625, + "learning_rate": 0.001903024335127089, + "loss": 0.1147, + "step": 15168 + }, + { + "epoch": 0.1316742042169773, + "grad_norm": 0.150390625, + "learning_rate": 0.0019030108956532446, + "loss": 0.1582, + "step": 15169 + }, + { + "epoch": 0.13168288469718145, + "grad_norm": 0.08544921875, + "learning_rate": 
0.0019029974553012392, + "loss": 0.1152, + "step": 15170 + }, + { + "epoch": 0.13169156517738562, + "grad_norm": 0.5625, + "learning_rate": 0.001902984014071088, + "loss": 0.1484, + "step": 15171 + }, + { + "epoch": 0.13170024565758978, + "grad_norm": 0.1220703125, + "learning_rate": 0.001902970571962805, + "loss": 0.1084, + "step": 15172 + }, + { + "epoch": 0.13170892613779395, + "grad_norm": 0.1640625, + "learning_rate": 0.001902957128976406, + "loss": 0.1279, + "step": 15173 + }, + { + "epoch": 0.1317176066179981, + "grad_norm": 0.1162109375, + "learning_rate": 0.0019029436851119047, + "loss": 0.1104, + "step": 15174 + }, + { + "epoch": 0.13172628709820228, + "grad_norm": 0.490234375, + "learning_rate": 0.0019029302403693168, + "loss": 0.0918, + "step": 15175 + }, + { + "epoch": 0.13173496757840644, + "grad_norm": 0.28125, + "learning_rate": 0.0019029167947486564, + "loss": 0.1387, + "step": 15176 + }, + { + "epoch": 0.1317436480586106, + "grad_norm": 0.75, + "learning_rate": 0.0019029033482499378, + "loss": 0.0977, + "step": 15177 + }, + { + "epoch": 0.13175232853881477, + "grad_norm": 0.43359375, + "learning_rate": 0.0019028899008731767, + "loss": 0.0977, + "step": 15178 + }, + { + "epoch": 0.13176100901901894, + "grad_norm": 0.671875, + "learning_rate": 0.0019028764526183867, + "loss": 0.084, + "step": 15179 + }, + { + "epoch": 0.1317696894992231, + "grad_norm": 0.248046875, + "learning_rate": 0.0019028630034855839, + "loss": 0.0898, + "step": 15180 + }, + { + "epoch": 0.13177836997942727, + "grad_norm": 0.365234375, + "learning_rate": 0.0019028495534747815, + "loss": 0.1191, + "step": 15181 + }, + { + "epoch": 0.13178705045963143, + "grad_norm": 0.43359375, + "learning_rate": 0.001902836102585995, + "loss": 0.1191, + "step": 15182 + }, + { + "epoch": 0.1317957309398356, + "grad_norm": 0.5546875, + "learning_rate": 0.001902822650819239, + "loss": 0.125, + "step": 15183 + }, + { + "epoch": 0.13180441142003976, + "grad_norm": 0.275390625, + "learning_rate": 0.0019028091981745283, + "loss": 0.1221, + "step": 15184 + }, + { + "epoch": 0.13181309190024393, + "grad_norm": 0.2412109375, + "learning_rate": 0.0019027957446518776, + "loss": 0.0908, + "step": 15185 + }, + { + "epoch": 0.1318217723804481, + "grad_norm": 0.373046875, + "learning_rate": 0.0019027822902513013, + "loss": 0.1562, + "step": 15186 + }, + { + "epoch": 0.13183045286065226, + "grad_norm": 0.421875, + "learning_rate": 0.0019027688349728146, + "loss": 0.1001, + "step": 15187 + }, + { + "epoch": 0.13183913334085642, + "grad_norm": 0.1455078125, + "learning_rate": 0.001902755378816432, + "loss": 0.0957, + "step": 15188 + }, + { + "epoch": 0.1318478138210606, + "grad_norm": 0.55859375, + "learning_rate": 0.0019027419217821679, + "loss": 0.1318, + "step": 15189 + }, + { + "epoch": 0.13185649430126475, + "grad_norm": 0.66796875, + "learning_rate": 0.0019027284638700374, + "loss": 0.1289, + "step": 15190 + }, + { + "epoch": 0.13186517478146892, + "grad_norm": 0.96484375, + "learning_rate": 0.001902715005080055, + "loss": 0.1133, + "step": 15191 + }, + { + "epoch": 0.13187385526167308, + "grad_norm": 0.154296875, + "learning_rate": 0.0019027015454122357, + "loss": 0.1211, + "step": 15192 + }, + { + "epoch": 0.13188253574187725, + "grad_norm": 0.08349609375, + "learning_rate": 0.0019026880848665943, + "loss": 0.1162, + "step": 15193 + }, + { + "epoch": 0.13189121622208141, + "grad_norm": 0.1572265625, + "learning_rate": 0.0019026746234431446, + "loss": 0.1279, + "step": 15194 + }, + { + "epoch": 0.13189989670228558, + "grad_norm": 
0.185546875, + "learning_rate": 0.0019026611611419025, + "loss": 0.1104, + "step": 15195 + }, + { + "epoch": 0.13190857718248974, + "grad_norm": 0.107421875, + "learning_rate": 0.0019026476979628819, + "loss": 0.0942, + "step": 15196 + }, + { + "epoch": 0.1319172576626939, + "grad_norm": 0.0849609375, + "learning_rate": 0.0019026342339060983, + "loss": 0.1211, + "step": 15197 + }, + { + "epoch": 0.13192593814289807, + "grad_norm": 0.279296875, + "learning_rate": 0.0019026207689715655, + "loss": 0.1104, + "step": 15198 + }, + { + "epoch": 0.13193461862310224, + "grad_norm": 0.60546875, + "learning_rate": 0.001902607303159299, + "loss": 0.1494, + "step": 15199 + }, + { + "epoch": 0.1319432991033064, + "grad_norm": 0.2021484375, + "learning_rate": 0.001902593836469313, + "loss": 0.0845, + "step": 15200 + }, + { + "epoch": 0.13195197958351057, + "grad_norm": 0.091796875, + "learning_rate": 0.0019025803689016226, + "loss": 0.1328, + "step": 15201 + }, + { + "epoch": 0.13196066006371474, + "grad_norm": 0.279296875, + "learning_rate": 0.0019025669004562424, + "loss": 0.1025, + "step": 15202 + }, + { + "epoch": 0.1319693405439189, + "grad_norm": 0.07568359375, + "learning_rate": 0.0019025534311331868, + "loss": 0.0869, + "step": 15203 + }, + { + "epoch": 0.13197802102412307, + "grad_norm": 0.06201171875, + "learning_rate": 0.001902539960932471, + "loss": 0.0962, + "step": 15204 + }, + { + "epoch": 0.13198670150432723, + "grad_norm": 0.42578125, + "learning_rate": 0.0019025264898541096, + "loss": 0.125, + "step": 15205 + }, + { + "epoch": 0.1319953819845314, + "grad_norm": 0.08544921875, + "learning_rate": 0.0019025130178981174, + "loss": 0.1133, + "step": 15206 + }, + { + "epoch": 0.13200406246473556, + "grad_norm": 0.0830078125, + "learning_rate": 0.0019024995450645087, + "loss": 0.1226, + "step": 15207 + }, + { + "epoch": 0.13201274294493973, + "grad_norm": 0.19140625, + "learning_rate": 0.0019024860713532988, + "loss": 0.0859, + "step": 15208 + }, + { + "epoch": 0.1320214234251439, + "grad_norm": 0.296875, + "learning_rate": 0.001902472596764502, + "loss": 0.126, + "step": 15209 + }, + { + "epoch": 0.13203010390534806, + "grad_norm": 1.0859375, + "learning_rate": 0.0019024591212981337, + "loss": 0.127, + "step": 15210 + }, + { + "epoch": 0.13203878438555222, + "grad_norm": 0.2578125, + "learning_rate": 0.0019024456449542077, + "loss": 0.1289, + "step": 15211 + }, + { + "epoch": 0.1320474648657564, + "grad_norm": 0.1298828125, + "learning_rate": 0.0019024321677327393, + "loss": 0.1221, + "step": 15212 + }, + { + "epoch": 0.13205614534596055, + "grad_norm": 0.1650390625, + "learning_rate": 0.0019024186896337432, + "loss": 0.0928, + "step": 15213 + }, + { + "epoch": 0.13206482582616472, + "grad_norm": 0.1123046875, + "learning_rate": 0.0019024052106572339, + "loss": 0.0718, + "step": 15214 + }, + { + "epoch": 0.13207350630636888, + "grad_norm": 0.2353515625, + "learning_rate": 0.0019023917308032263, + "loss": 0.0938, + "step": 15215 + }, + { + "epoch": 0.13208218678657305, + "grad_norm": 0.41015625, + "learning_rate": 0.0019023782500717354, + "loss": 0.1279, + "step": 15216 + }, + { + "epoch": 0.13209086726677718, + "grad_norm": 0.146484375, + "learning_rate": 0.0019023647684627755, + "loss": 0.0962, + "step": 15217 + }, + { + "epoch": 0.13209954774698135, + "grad_norm": 0.1513671875, + "learning_rate": 0.0019023512859763618, + "loss": 0.0996, + "step": 15218 + }, + { + "epoch": 0.13210822822718551, + "grad_norm": 0.228515625, + "learning_rate": 0.0019023378026125086, + "loss": 0.1611, + "step": 
15219 + }, + { + "epoch": 0.13211690870738968, + "grad_norm": 0.251953125, + "learning_rate": 0.001902324318371231, + "loss": 0.1309, + "step": 15220 + }, + { + "epoch": 0.13212558918759384, + "grad_norm": 0.50390625, + "learning_rate": 0.0019023108332525433, + "loss": 0.1152, + "step": 15221 + }, + { + "epoch": 0.132134269667798, + "grad_norm": 0.17578125, + "learning_rate": 0.0019022973472564608, + "loss": 0.1445, + "step": 15222 + }, + { + "epoch": 0.13214295014800218, + "grad_norm": 0.0908203125, + "learning_rate": 0.0019022838603829978, + "loss": 0.1035, + "step": 15223 + }, + { + "epoch": 0.13215163062820634, + "grad_norm": 0.10400390625, + "learning_rate": 0.0019022703726321692, + "loss": 0.1406, + "step": 15224 + }, + { + "epoch": 0.1321603111084105, + "grad_norm": 0.5, + "learning_rate": 0.00190225688400399, + "loss": 0.1367, + "step": 15225 + }, + { + "epoch": 0.13216899158861467, + "grad_norm": 0.7578125, + "learning_rate": 0.0019022433944984745, + "loss": 0.1035, + "step": 15226 + }, + { + "epoch": 0.13217767206881884, + "grad_norm": 0.271484375, + "learning_rate": 0.0019022299041156377, + "loss": 0.1719, + "step": 15227 + }, + { + "epoch": 0.132186352549023, + "grad_norm": 0.1357421875, + "learning_rate": 0.0019022164128554943, + "loss": 0.127, + "step": 15228 + }, + { + "epoch": 0.13219503302922717, + "grad_norm": 0.1318359375, + "learning_rate": 0.0019022029207180593, + "loss": 0.1143, + "step": 15229 + }, + { + "epoch": 0.13220371350943133, + "grad_norm": 0.484375, + "learning_rate": 0.0019021894277033472, + "loss": 0.1309, + "step": 15230 + }, + { + "epoch": 0.1322123939896355, + "grad_norm": 0.2431640625, + "learning_rate": 0.0019021759338113727, + "loss": 0.1045, + "step": 15231 + }, + { + "epoch": 0.13222107446983966, + "grad_norm": 0.240234375, + "learning_rate": 0.0019021624390421507, + "loss": 0.125, + "step": 15232 + }, + { + "epoch": 0.13222975495004383, + "grad_norm": 0.44140625, + "learning_rate": 0.0019021489433956958, + "loss": 0.166, + "step": 15233 + }, + { + "epoch": 0.132238435430248, + "grad_norm": 0.5625, + "learning_rate": 0.0019021354468720228, + "loss": 0.1201, + "step": 15234 + }, + { + "epoch": 0.13224711591045216, + "grad_norm": 0.0732421875, + "learning_rate": 0.0019021219494711468, + "loss": 0.1104, + "step": 15235 + }, + { + "epoch": 0.13225579639065632, + "grad_norm": 0.1865234375, + "learning_rate": 0.0019021084511930825, + "loss": 0.124, + "step": 15236 + }, + { + "epoch": 0.1322644768708605, + "grad_norm": 0.126953125, + "learning_rate": 0.001902094952037844, + "loss": 0.1108, + "step": 15237 + }, + { + "epoch": 0.13227315735106465, + "grad_norm": 0.1826171875, + "learning_rate": 0.0019020814520054464, + "loss": 0.1094, + "step": 15238 + }, + { + "epoch": 0.13228183783126882, + "grad_norm": 0.478515625, + "learning_rate": 0.0019020679510959046, + "loss": 0.0845, + "step": 15239 + }, + { + "epoch": 0.13229051831147298, + "grad_norm": 0.19140625, + "learning_rate": 0.0019020544493092337, + "loss": 0.0898, + "step": 15240 + }, + { + "epoch": 0.13229919879167715, + "grad_norm": 0.5390625, + "learning_rate": 0.0019020409466454478, + "loss": 0.0986, + "step": 15241 + }, + { + "epoch": 0.1323078792718813, + "grad_norm": 0.365234375, + "learning_rate": 0.0019020274431045623, + "loss": 0.1543, + "step": 15242 + }, + { + "epoch": 0.13231655975208548, + "grad_norm": 0.193359375, + "learning_rate": 0.0019020139386865913, + "loss": 0.1128, + "step": 15243 + }, + { + "epoch": 0.13232524023228964, + "grad_norm": 0.64453125, + "learning_rate": 
0.0019020004333915501, + "loss": 0.0952, + "step": 15244 + }, + { + "epoch": 0.1323339207124938, + "grad_norm": 0.162109375, + "learning_rate": 0.0019019869272194534, + "loss": 0.1211, + "step": 15245 + }, + { + "epoch": 0.13234260119269797, + "grad_norm": 0.2158203125, + "learning_rate": 0.0019019734201703154, + "loss": 0.1416, + "step": 15246 + }, + { + "epoch": 0.13235128167290214, + "grad_norm": 0.8203125, + "learning_rate": 0.0019019599122441517, + "loss": 0.1074, + "step": 15247 + }, + { + "epoch": 0.1323599621531063, + "grad_norm": 0.33984375, + "learning_rate": 0.0019019464034409767, + "loss": 0.1377, + "step": 15248 + }, + { + "epoch": 0.13236864263331047, + "grad_norm": 0.36328125, + "learning_rate": 0.001901932893760805, + "loss": 0.1162, + "step": 15249 + }, + { + "epoch": 0.13237732311351463, + "grad_norm": 0.283203125, + "learning_rate": 0.0019019193832036515, + "loss": 0.0957, + "step": 15250 + }, + { + "epoch": 0.1323860035937188, + "grad_norm": 0.1943359375, + "learning_rate": 0.001901905871769531, + "loss": 0.1816, + "step": 15251 + }, + { + "epoch": 0.13239468407392296, + "grad_norm": 0.48046875, + "learning_rate": 0.0019018923594584585, + "loss": 0.1875, + "step": 15252 + }, + { + "epoch": 0.13240336455412713, + "grad_norm": 0.1142578125, + "learning_rate": 0.0019018788462704483, + "loss": 0.1445, + "step": 15253 + }, + { + "epoch": 0.1324120450343313, + "grad_norm": 0.189453125, + "learning_rate": 0.0019018653322055157, + "loss": 0.1182, + "step": 15254 + }, + { + "epoch": 0.13242072551453546, + "grad_norm": 0.12890625, + "learning_rate": 0.0019018518172636751, + "loss": 0.1045, + "step": 15255 + }, + { + "epoch": 0.13242940599473962, + "grad_norm": 0.115234375, + "learning_rate": 0.0019018383014449414, + "loss": 0.1025, + "step": 15256 + }, + { + "epoch": 0.1324380864749438, + "grad_norm": 0.1552734375, + "learning_rate": 0.0019018247847493294, + "loss": 0.0918, + "step": 15257 + }, + { + "epoch": 0.13244676695514795, + "grad_norm": 0.19140625, + "learning_rate": 0.001901811267176854, + "loss": 0.1211, + "step": 15258 + }, + { + "epoch": 0.13245544743535212, + "grad_norm": 1.09375, + "learning_rate": 0.0019017977487275296, + "loss": 0.1084, + "step": 15259 + }, + { + "epoch": 0.13246412791555628, + "grad_norm": 0.1962890625, + "learning_rate": 0.0019017842294013713, + "loss": 0.1143, + "step": 15260 + }, + { + "epoch": 0.13247280839576045, + "grad_norm": 0.09619140625, + "learning_rate": 0.0019017707091983937, + "loss": 0.1328, + "step": 15261 + }, + { + "epoch": 0.13248148887596461, + "grad_norm": 0.546875, + "learning_rate": 0.0019017571881186118, + "loss": 0.1436, + "step": 15262 + }, + { + "epoch": 0.13249016935616878, + "grad_norm": 0.6171875, + "learning_rate": 0.0019017436661620407, + "loss": 0.1211, + "step": 15263 + }, + { + "epoch": 0.13249884983637294, + "grad_norm": 0.240234375, + "learning_rate": 0.0019017301433286943, + "loss": 0.085, + "step": 15264 + }, + { + "epoch": 0.1325075303165771, + "grad_norm": 0.26171875, + "learning_rate": 0.0019017166196185877, + "loss": 0.1348, + "step": 15265 + }, + { + "epoch": 0.13251621079678128, + "grad_norm": 0.388671875, + "learning_rate": 0.0019017030950317362, + "loss": 0.1064, + "step": 15266 + }, + { + "epoch": 0.13252489127698544, + "grad_norm": 0.28125, + "learning_rate": 0.0019016895695681542, + "loss": 0.1426, + "step": 15267 + }, + { + "epoch": 0.1325335717571896, + "grad_norm": 0.09228515625, + "learning_rate": 0.0019016760432278563, + "loss": 0.0874, + "step": 15268 + }, + { + "epoch": 0.13254225223739377, 
+ "grad_norm": 0.271484375, + "learning_rate": 0.0019016625160108579, + "loss": 0.1465, + "step": 15269 + }, + { + "epoch": 0.13255093271759794, + "grad_norm": 0.1064453125, + "learning_rate": 0.0019016489879171731, + "loss": 0.1406, + "step": 15270 + }, + { + "epoch": 0.1325596131978021, + "grad_norm": 0.2138671875, + "learning_rate": 0.0019016354589468172, + "loss": 0.1309, + "step": 15271 + }, + { + "epoch": 0.13256829367800627, + "grad_norm": 0.26171875, + "learning_rate": 0.0019016219290998047, + "loss": 0.1602, + "step": 15272 + }, + { + "epoch": 0.13257697415821043, + "grad_norm": 0.421875, + "learning_rate": 0.0019016083983761506, + "loss": 0.1016, + "step": 15273 + }, + { + "epoch": 0.1325856546384146, + "grad_norm": 0.1337890625, + "learning_rate": 0.0019015948667758696, + "loss": 0.1289, + "step": 15274 + }, + { + "epoch": 0.13259433511861876, + "grad_norm": 0.2216796875, + "learning_rate": 0.0019015813342989766, + "loss": 0.1182, + "step": 15275 + }, + { + "epoch": 0.13260301559882293, + "grad_norm": 0.255859375, + "learning_rate": 0.0019015678009454858, + "loss": 0.0889, + "step": 15276 + }, + { + "epoch": 0.1326116960790271, + "grad_norm": 0.53515625, + "learning_rate": 0.0019015542667154127, + "loss": 0.0869, + "step": 15277 + }, + { + "epoch": 0.13262037655923126, + "grad_norm": 0.4296875, + "learning_rate": 0.001901540731608772, + "loss": 0.127, + "step": 15278 + }, + { + "epoch": 0.13262905703943542, + "grad_norm": 0.298828125, + "learning_rate": 0.0019015271956255785, + "loss": 0.0781, + "step": 15279 + }, + { + "epoch": 0.1326377375196396, + "grad_norm": 0.12060546875, + "learning_rate": 0.001901513658765847, + "loss": 0.1084, + "step": 15280 + }, + { + "epoch": 0.13264641799984375, + "grad_norm": 0.1318359375, + "learning_rate": 0.001901500121029592, + "loss": 0.1074, + "step": 15281 + }, + { + "epoch": 0.13265509848004792, + "grad_norm": 0.14453125, + "learning_rate": 0.0019014865824168285, + "loss": 0.1123, + "step": 15282 + }, + { + "epoch": 0.13266377896025208, + "grad_norm": 0.515625, + "learning_rate": 0.0019014730429275713, + "loss": 0.0933, + "step": 15283 + }, + { + "epoch": 0.13267245944045625, + "grad_norm": 0.1123046875, + "learning_rate": 0.0019014595025618353, + "loss": 0.1094, + "step": 15284 + }, + { + "epoch": 0.1326811399206604, + "grad_norm": 0.07470703125, + "learning_rate": 0.0019014459613196353, + "loss": 0.0786, + "step": 15285 + }, + { + "epoch": 0.13268982040086458, + "grad_norm": 0.478515625, + "learning_rate": 0.001901432419200986, + "loss": 0.1099, + "step": 15286 + }, + { + "epoch": 0.13269850088106874, + "grad_norm": 0.5859375, + "learning_rate": 0.001901418876205902, + "loss": 0.1387, + "step": 15287 + }, + { + "epoch": 0.1327071813612729, + "grad_norm": 0.9375, + "learning_rate": 0.0019014053323343988, + "loss": 0.1191, + "step": 15288 + }, + { + "epoch": 0.13271586184147707, + "grad_norm": 0.1181640625, + "learning_rate": 0.0019013917875864903, + "loss": 0.1348, + "step": 15289 + }, + { + "epoch": 0.13272454232168124, + "grad_norm": 0.2431640625, + "learning_rate": 0.001901378241962192, + "loss": 0.1045, + "step": 15290 + }, + { + "epoch": 0.1327332228018854, + "grad_norm": 0.263671875, + "learning_rate": 0.0019013646954615185, + "loss": 0.1465, + "step": 15291 + }, + { + "epoch": 0.13274190328208957, + "grad_norm": 0.240234375, + "learning_rate": 0.0019013511480844847, + "loss": 0.1021, + "step": 15292 + }, + { + "epoch": 0.13275058376229373, + "grad_norm": 0.1005859375, + "learning_rate": 0.001901337599831105, + "loss": 0.1289, + 
"step": 15293 + }, + { + "epoch": 0.1327592642424979, + "grad_norm": 0.423828125, + "learning_rate": 0.001901324050701395, + "loss": 0.103, + "step": 15294 + }, + { + "epoch": 0.13276794472270206, + "grad_norm": 0.1181640625, + "learning_rate": 0.001901310500695369, + "loss": 0.1289, + "step": 15295 + }, + { + "epoch": 0.13277662520290623, + "grad_norm": 0.65625, + "learning_rate": 0.0019012969498130414, + "loss": 0.1289, + "step": 15296 + }, + { + "epoch": 0.1327853056831104, + "grad_norm": 0.078125, + "learning_rate": 0.0019012833980544279, + "loss": 0.1021, + "step": 15297 + }, + { + "epoch": 0.13279398616331456, + "grad_norm": 0.2275390625, + "learning_rate": 0.0019012698454195429, + "loss": 0.1167, + "step": 15298 + }, + { + "epoch": 0.13280266664351872, + "grad_norm": 1.3125, + "learning_rate": 0.0019012562919084014, + "loss": 0.1309, + "step": 15299 + }, + { + "epoch": 0.1328113471237229, + "grad_norm": 0.169921875, + "learning_rate": 0.0019012427375210176, + "loss": 0.126, + "step": 15300 + }, + { + "epoch": 0.13282002760392705, + "grad_norm": 0.296875, + "learning_rate": 0.0019012291822574068, + "loss": 0.1123, + "step": 15301 + }, + { + "epoch": 0.13282870808413122, + "grad_norm": 0.349609375, + "learning_rate": 0.0019012156261175841, + "loss": 0.1289, + "step": 15302 + }, + { + "epoch": 0.13283738856433538, + "grad_norm": 0.17578125, + "learning_rate": 0.001901202069101564, + "loss": 0.0879, + "step": 15303 + }, + { + "epoch": 0.13284606904453955, + "grad_norm": 0.07666015625, + "learning_rate": 0.0019011885112093611, + "loss": 0.0801, + "step": 15304 + }, + { + "epoch": 0.13285474952474371, + "grad_norm": 0.57421875, + "learning_rate": 0.001901174952440991, + "loss": 0.1465, + "step": 15305 + }, + { + "epoch": 0.13286343000494788, + "grad_norm": 0.10693359375, + "learning_rate": 0.0019011613927964672, + "loss": 0.1221, + "step": 15306 + }, + { + "epoch": 0.13287211048515205, + "grad_norm": 0.322265625, + "learning_rate": 0.0019011478322758062, + "loss": 0.105, + "step": 15307 + }, + { + "epoch": 0.1328807909653562, + "grad_norm": 0.0986328125, + "learning_rate": 0.0019011342708790213, + "loss": 0.1143, + "step": 15308 + }, + { + "epoch": 0.13288947144556038, + "grad_norm": 0.19921875, + "learning_rate": 0.0019011207086061285, + "loss": 0.1367, + "step": 15309 + }, + { + "epoch": 0.13289815192576454, + "grad_norm": 0.138671875, + "learning_rate": 0.001901107145457142, + "loss": 0.0918, + "step": 15310 + }, + { + "epoch": 0.1329068324059687, + "grad_norm": 0.330078125, + "learning_rate": 0.0019010935814320765, + "loss": 0.1475, + "step": 15311 + }, + { + "epoch": 0.13291551288617287, + "grad_norm": 0.16796875, + "learning_rate": 0.0019010800165309473, + "loss": 0.0981, + "step": 15312 + }, + { + "epoch": 0.13292419336637704, + "grad_norm": 0.201171875, + "learning_rate": 0.001901066450753769, + "loss": 0.125, + "step": 15313 + }, + { + "epoch": 0.1329328738465812, + "grad_norm": 0.1708984375, + "learning_rate": 0.0019010528841005565, + "loss": 0.1191, + "step": 15314 + }, + { + "epoch": 0.13294155432678537, + "grad_norm": 0.455078125, + "learning_rate": 0.0019010393165713245, + "loss": 0.166, + "step": 15315 + }, + { + "epoch": 0.13295023480698953, + "grad_norm": 0.2197265625, + "learning_rate": 0.001901025748166088, + "loss": 0.0874, + "step": 15316 + }, + { + "epoch": 0.1329589152871937, + "grad_norm": 0.419921875, + "learning_rate": 0.001901012178884862, + "loss": 0.1094, + "step": 15317 + }, + { + "epoch": 0.13296759576739786, + "grad_norm": 0.439453125, + "learning_rate": 
0.0019009986087276608, + "loss": 0.1045, + "step": 15318 + }, + { + "epoch": 0.13297627624760203, + "grad_norm": 0.5703125, + "learning_rate": 0.0019009850376944994, + "loss": 0.1196, + "step": 15319 + }, + { + "epoch": 0.1329849567278062, + "grad_norm": 0.55859375, + "learning_rate": 0.0019009714657853931, + "loss": 0.1406, + "step": 15320 + }, + { + "epoch": 0.13299363720801036, + "grad_norm": 0.0966796875, + "learning_rate": 0.0019009578930003566, + "loss": 0.1016, + "step": 15321 + }, + { + "epoch": 0.13300231768821452, + "grad_norm": 0.3046875, + "learning_rate": 0.0019009443193394042, + "loss": 0.1187, + "step": 15322 + }, + { + "epoch": 0.1330109981684187, + "grad_norm": 0.09130859375, + "learning_rate": 0.0019009307448025513, + "loss": 0.0723, + "step": 15323 + }, + { + "epoch": 0.13301967864862285, + "grad_norm": 0.453125, + "learning_rate": 0.0019009171693898125, + "loss": 0.1514, + "step": 15324 + }, + { + "epoch": 0.13302835912882702, + "grad_norm": 0.1689453125, + "learning_rate": 0.001900903593101203, + "loss": 0.1289, + "step": 15325 + }, + { + "epoch": 0.13303703960903118, + "grad_norm": 0.10888671875, + "learning_rate": 0.001900890015936737, + "loss": 0.0923, + "step": 15326 + }, + { + "epoch": 0.13304572008923535, + "grad_norm": 0.1650390625, + "learning_rate": 0.00190087643789643, + "loss": 0.1055, + "step": 15327 + }, + { + "epoch": 0.1330544005694395, + "grad_norm": 0.31640625, + "learning_rate": 0.0019008628589802963, + "loss": 0.1289, + "step": 15328 + }, + { + "epoch": 0.13306308104964368, + "grad_norm": 0.3046875, + "learning_rate": 0.001900849279188351, + "loss": 0.1348, + "step": 15329 + }, + { + "epoch": 0.13307176152984784, + "grad_norm": 0.09716796875, + "learning_rate": 0.0019008356985206092, + "loss": 0.1143, + "step": 15330 + }, + { + "epoch": 0.133080442010052, + "grad_norm": 0.2890625, + "learning_rate": 0.0019008221169770857, + "loss": 0.1348, + "step": 15331 + }, + { + "epoch": 0.13308912249025617, + "grad_norm": 0.326171875, + "learning_rate": 0.0019008085345577949, + "loss": 0.0923, + "step": 15332 + }, + { + "epoch": 0.13309780297046034, + "grad_norm": 0.41796875, + "learning_rate": 0.0019007949512627519, + "loss": 0.1201, + "step": 15333 + }, + { + "epoch": 0.1331064834506645, + "grad_norm": 0.212890625, + "learning_rate": 0.0019007813670919714, + "loss": 0.124, + "step": 15334 + }, + { + "epoch": 0.13311516393086867, + "grad_norm": 0.10888671875, + "learning_rate": 0.0019007677820454686, + "loss": 0.1113, + "step": 15335 + }, + { + "epoch": 0.13312384441107283, + "grad_norm": 0.62109375, + "learning_rate": 0.0019007541961232582, + "loss": 0.1631, + "step": 15336 + }, + { + "epoch": 0.133132524891277, + "grad_norm": 0.50390625, + "learning_rate": 0.0019007406093253549, + "loss": 0.0898, + "step": 15337 + }, + { + "epoch": 0.13314120537148116, + "grad_norm": 0.1455078125, + "learning_rate": 0.001900727021651774, + "loss": 0.1396, + "step": 15338 + }, + { + "epoch": 0.13314988585168533, + "grad_norm": 0.890625, + "learning_rate": 0.0019007134331025298, + "loss": 0.1719, + "step": 15339 + }, + { + "epoch": 0.13315856633188947, + "grad_norm": 0.3125, + "learning_rate": 0.0019006998436776376, + "loss": 0.125, + "step": 15340 + }, + { + "epoch": 0.13316724681209363, + "grad_norm": 0.2197265625, + "learning_rate": 0.0019006862533771118, + "loss": 0.1025, + "step": 15341 + }, + { + "epoch": 0.1331759272922978, + "grad_norm": 0.361328125, + "learning_rate": 0.0019006726622009676, + "loss": 0.1289, + "step": 15342 + }, + { + "epoch": 0.13318460777250196, + 
"grad_norm": 0.1962890625, + "learning_rate": 0.00190065907014922, + "loss": 0.0952, + "step": 15343 + }, + { + "epoch": 0.13319328825270613, + "grad_norm": 0.1552734375, + "learning_rate": 0.0019006454772218838, + "loss": 0.1357, + "step": 15344 + }, + { + "epoch": 0.1332019687329103, + "grad_norm": 0.91796875, + "learning_rate": 0.0019006318834189732, + "loss": 0.1133, + "step": 15345 + }, + { + "epoch": 0.13321064921311446, + "grad_norm": 0.09521484375, + "learning_rate": 0.0019006182887405043, + "loss": 0.1084, + "step": 15346 + }, + { + "epoch": 0.13321932969331862, + "grad_norm": 0.32421875, + "learning_rate": 0.0019006046931864908, + "loss": 0.1206, + "step": 15347 + }, + { + "epoch": 0.1332280101735228, + "grad_norm": 0.69921875, + "learning_rate": 0.0019005910967569485, + "loss": 0.1328, + "step": 15348 + }, + { + "epoch": 0.13323669065372695, + "grad_norm": 0.244140625, + "learning_rate": 0.0019005774994518912, + "loss": 0.1143, + "step": 15349 + }, + { + "epoch": 0.13324537113393112, + "grad_norm": 0.5625, + "learning_rate": 0.0019005639012713346, + "loss": 0.1191, + "step": 15350 + }, + { + "epoch": 0.13325405161413528, + "grad_norm": 0.07861328125, + "learning_rate": 0.0019005503022152934, + "loss": 0.0874, + "step": 15351 + }, + { + "epoch": 0.13326273209433945, + "grad_norm": 0.51953125, + "learning_rate": 0.0019005367022837823, + "loss": 0.1582, + "step": 15352 + }, + { + "epoch": 0.1332714125745436, + "grad_norm": 4.21875, + "learning_rate": 0.0019005231014768162, + "loss": 0.3047, + "step": 15353 + }, + { + "epoch": 0.13328009305474778, + "grad_norm": 0.234375, + "learning_rate": 0.0019005094997944104, + "loss": 0.1328, + "step": 15354 + }, + { + "epoch": 0.13328877353495194, + "grad_norm": 0.07666015625, + "learning_rate": 0.0019004958972365796, + "loss": 0.0894, + "step": 15355 + }, + { + "epoch": 0.1332974540151561, + "grad_norm": 0.162109375, + "learning_rate": 0.001900482293803338, + "loss": 0.1104, + "step": 15356 + }, + { + "epoch": 0.13330613449536027, + "grad_norm": 0.2490234375, + "learning_rate": 0.0019004686894947011, + "loss": 0.0918, + "step": 15357 + }, + { + "epoch": 0.13331481497556444, + "grad_norm": 0.1328125, + "learning_rate": 0.0019004550843106838, + "loss": 0.1152, + "step": 15358 + }, + { + "epoch": 0.1333234954557686, + "grad_norm": 0.416015625, + "learning_rate": 0.0019004414782513007, + "loss": 0.1147, + "step": 15359 + }, + { + "epoch": 0.13333217593597277, + "grad_norm": 0.3125, + "learning_rate": 0.0019004278713165667, + "loss": 0.0996, + "step": 15360 + }, + { + "epoch": 0.13334085641617693, + "grad_norm": 0.482421875, + "learning_rate": 0.0019004142635064974, + "loss": 0.1191, + "step": 15361 + }, + { + "epoch": 0.1333495368963811, + "grad_norm": 0.208984375, + "learning_rate": 0.0019004006548211068, + "loss": 0.085, + "step": 15362 + }, + { + "epoch": 0.13335821737658526, + "grad_norm": 0.6796875, + "learning_rate": 0.00190038704526041, + "loss": 0.0889, + "step": 15363 + }, + { + "epoch": 0.13336689785678943, + "grad_norm": 0.271484375, + "learning_rate": 0.0019003734348244218, + "loss": 0.125, + "step": 15364 + }, + { + "epoch": 0.1333755783369936, + "grad_norm": 0.55859375, + "learning_rate": 0.0019003598235131573, + "loss": 0.1318, + "step": 15365 + }, + { + "epoch": 0.13338425881719776, + "grad_norm": 0.07470703125, + "learning_rate": 0.0019003462113266316, + "loss": 0.1064, + "step": 15366 + }, + { + "epoch": 0.13339293929740192, + "grad_norm": 0.08984375, + "learning_rate": 0.001900332598264859, + "loss": 0.0903, + "step": 15367 + 
}, + { + "epoch": 0.1334016197776061, + "grad_norm": 0.294921875, + "learning_rate": 0.001900318984327855, + "loss": 0.1309, + "step": 15368 + }, + { + "epoch": 0.13341030025781025, + "grad_norm": 0.21875, + "learning_rate": 0.0019003053695156335, + "loss": 0.126, + "step": 15369 + }, + { + "epoch": 0.13341898073801442, + "grad_norm": 0.51171875, + "learning_rate": 0.001900291753828211, + "loss": 0.125, + "step": 15370 + }, + { + "epoch": 0.13342766121821859, + "grad_norm": 0.2353515625, + "learning_rate": 0.0019002781372656008, + "loss": 0.1289, + "step": 15371 + }, + { + "epoch": 0.13343634169842275, + "grad_norm": 0.416015625, + "learning_rate": 0.001900264519827819, + "loss": 0.1147, + "step": 15372 + }, + { + "epoch": 0.13344502217862692, + "grad_norm": 0.6796875, + "learning_rate": 0.0019002509015148797, + "loss": 0.1523, + "step": 15373 + }, + { + "epoch": 0.13345370265883108, + "grad_norm": 0.3125, + "learning_rate": 0.0019002372823267977, + "loss": 0.0986, + "step": 15374 + }, + { + "epoch": 0.13346238313903525, + "grad_norm": 0.28125, + "learning_rate": 0.0019002236622635887, + "loss": 0.1162, + "step": 15375 + }, + { + "epoch": 0.1334710636192394, + "grad_norm": 0.341796875, + "learning_rate": 0.0019002100413252666, + "loss": 0.0898, + "step": 15376 + }, + { + "epoch": 0.13347974409944358, + "grad_norm": 0.50390625, + "learning_rate": 0.0019001964195118474, + "loss": 0.125, + "step": 15377 + }, + { + "epoch": 0.13348842457964774, + "grad_norm": 0.078125, + "learning_rate": 0.001900182796823345, + "loss": 0.1084, + "step": 15378 + }, + { + "epoch": 0.1334971050598519, + "grad_norm": 0.8125, + "learning_rate": 0.0019001691732597747, + "loss": 0.1797, + "step": 15379 + }, + { + "epoch": 0.13350578554005607, + "grad_norm": 0.18359375, + "learning_rate": 0.0019001555488211519, + "loss": 0.106, + "step": 15380 + }, + { + "epoch": 0.13351446602026024, + "grad_norm": 0.3671875, + "learning_rate": 0.0019001419235074907, + "loss": 0.1035, + "step": 15381 + }, + { + "epoch": 0.1335231465004644, + "grad_norm": 0.1474609375, + "learning_rate": 0.0019001282973188061, + "loss": 0.1055, + "step": 15382 + }, + { + "epoch": 0.13353182698066857, + "grad_norm": 0.65234375, + "learning_rate": 0.0019001146702551136, + "loss": 0.0977, + "step": 15383 + }, + { + "epoch": 0.13354050746087273, + "grad_norm": 0.33203125, + "learning_rate": 0.0019001010423164272, + "loss": 0.083, + "step": 15384 + }, + { + "epoch": 0.1335491879410769, + "grad_norm": 0.375, + "learning_rate": 0.001900087413502763, + "loss": 0.1172, + "step": 15385 + }, + { + "epoch": 0.13355786842128106, + "grad_norm": 0.265625, + "learning_rate": 0.0019000737838141349, + "loss": 0.1436, + "step": 15386 + }, + { + "epoch": 0.13356654890148523, + "grad_norm": 0.140625, + "learning_rate": 0.0019000601532505582, + "loss": 0.1084, + "step": 15387 + }, + { + "epoch": 0.1335752293816894, + "grad_norm": 0.60546875, + "learning_rate": 0.0019000465218120474, + "loss": 0.1182, + "step": 15388 + }, + { + "epoch": 0.13358390986189356, + "grad_norm": 0.0673828125, + "learning_rate": 0.001900032889498618, + "loss": 0.1011, + "step": 15389 + }, + { + "epoch": 0.13359259034209772, + "grad_norm": 0.2353515625, + "learning_rate": 0.001900019256310285, + "loss": 0.1221, + "step": 15390 + }, + { + "epoch": 0.1336012708223019, + "grad_norm": 0.490234375, + "learning_rate": 0.0019000056222470624, + "loss": 0.1089, + "step": 15391 + }, + { + "epoch": 0.13360995130250605, + "grad_norm": 0.07763671875, + "learning_rate": 0.001899991987308966, + "loss": 0.125, + 
"step": 15392 + }, + { + "epoch": 0.13361863178271022, + "grad_norm": 0.169921875, + "learning_rate": 0.0018999783514960103, + "loss": 0.0967, + "step": 15393 + }, + { + "epoch": 0.13362731226291438, + "grad_norm": 0.134765625, + "learning_rate": 0.0018999647148082105, + "loss": 0.1133, + "step": 15394 + }, + { + "epoch": 0.13363599274311855, + "grad_norm": 0.07958984375, + "learning_rate": 0.0018999510772455808, + "loss": 0.0942, + "step": 15395 + }, + { + "epoch": 0.1336446732233227, + "grad_norm": 1.09375, + "learning_rate": 0.001899937438808137, + "loss": 0.1162, + "step": 15396 + }, + { + "epoch": 0.13365335370352688, + "grad_norm": 0.298828125, + "learning_rate": 0.0018999237994958935, + "loss": 0.1152, + "step": 15397 + }, + { + "epoch": 0.13366203418373104, + "grad_norm": 0.294921875, + "learning_rate": 0.0018999101593088655, + "loss": 0.1328, + "step": 15398 + }, + { + "epoch": 0.1336707146639352, + "grad_norm": 0.4609375, + "learning_rate": 0.0018998965182470675, + "loss": 0.1152, + "step": 15399 + }, + { + "epoch": 0.13367939514413937, + "grad_norm": 0.06591796875, + "learning_rate": 0.001899882876310515, + "loss": 0.1006, + "step": 15400 + }, + { + "epoch": 0.13368807562434354, + "grad_norm": 0.23828125, + "learning_rate": 0.0018998692334992223, + "loss": 0.1523, + "step": 15401 + }, + { + "epoch": 0.1336967561045477, + "grad_norm": 0.10302734375, + "learning_rate": 0.0018998555898132048, + "loss": 0.1045, + "step": 15402 + }, + { + "epoch": 0.13370543658475187, + "grad_norm": 0.09130859375, + "learning_rate": 0.0018998419452524774, + "loss": 0.1143, + "step": 15403 + }, + { + "epoch": 0.13371411706495603, + "grad_norm": 0.484375, + "learning_rate": 0.0018998282998170543, + "loss": 0.127, + "step": 15404 + }, + { + "epoch": 0.1337227975451602, + "grad_norm": 0.275390625, + "learning_rate": 0.0018998146535069518, + "loss": 0.0898, + "step": 15405 + }, + { + "epoch": 0.13373147802536436, + "grad_norm": 0.4296875, + "learning_rate": 0.0018998010063221835, + "loss": 0.1055, + "step": 15406 + }, + { + "epoch": 0.13374015850556853, + "grad_norm": 0.466796875, + "learning_rate": 0.001899787358262765, + "loss": 0.0923, + "step": 15407 + }, + { + "epoch": 0.1337488389857727, + "grad_norm": 0.427734375, + "learning_rate": 0.0018997737093287108, + "loss": 0.126, + "step": 15408 + }, + { + "epoch": 0.13375751946597686, + "grad_norm": 0.07763671875, + "learning_rate": 0.0018997600595200364, + "loss": 0.1084, + "step": 15409 + }, + { + "epoch": 0.13376619994618102, + "grad_norm": 0.1708984375, + "learning_rate": 0.0018997464088367562, + "loss": 0.1016, + "step": 15410 + }, + { + "epoch": 0.1337748804263852, + "grad_norm": 0.287109375, + "learning_rate": 0.0018997327572788853, + "loss": 0.0996, + "step": 15411 + }, + { + "epoch": 0.13378356090658935, + "grad_norm": 0.10888671875, + "learning_rate": 0.0018997191048464388, + "loss": 0.1079, + "step": 15412 + }, + { + "epoch": 0.13379224138679352, + "grad_norm": 0.162109375, + "learning_rate": 0.0018997054515394317, + "loss": 0.1318, + "step": 15413 + }, + { + "epoch": 0.13380092186699769, + "grad_norm": 0.8359375, + "learning_rate": 0.0018996917973578785, + "loss": 0.1025, + "step": 15414 + }, + { + "epoch": 0.13380960234720185, + "grad_norm": 0.37109375, + "learning_rate": 0.0018996781423017944, + "loss": 0.1245, + "step": 15415 + }, + { + "epoch": 0.13381828282740602, + "grad_norm": 0.06298828125, + "learning_rate": 0.0018996644863711943, + "loss": 0.0903, + "step": 15416 + }, + { + "epoch": 0.13382696330761018, + "grad_norm": 0.38671875, + 
"learning_rate": 0.001899650829566093, + "loss": 0.1074, + "step": 15417 + }, + { + "epoch": 0.13383564378781435, + "grad_norm": 0.2138671875, + "learning_rate": 0.0018996371718865057, + "loss": 0.1226, + "step": 15418 + }, + { + "epoch": 0.1338443242680185, + "grad_norm": 0.32421875, + "learning_rate": 0.0018996235133324473, + "loss": 0.1328, + "step": 15419 + }, + { + "epoch": 0.13385300474822268, + "grad_norm": 0.359375, + "learning_rate": 0.0018996098539039326, + "loss": 0.1396, + "step": 15420 + }, + { + "epoch": 0.13386168522842684, + "grad_norm": 0.08251953125, + "learning_rate": 0.0018995961936009768, + "loss": 0.0908, + "step": 15421 + }, + { + "epoch": 0.133870365708631, + "grad_norm": 0.1630859375, + "learning_rate": 0.001899582532423594, + "loss": 0.1069, + "step": 15422 + }, + { + "epoch": 0.13387904618883517, + "grad_norm": 0.365234375, + "learning_rate": 0.0018995688703718002, + "loss": 0.1172, + "step": 15423 + }, + { + "epoch": 0.13388772666903934, + "grad_norm": 0.546875, + "learning_rate": 0.0018995552074456098, + "loss": 0.1348, + "step": 15424 + }, + { + "epoch": 0.1338964071492435, + "grad_norm": 0.064453125, + "learning_rate": 0.001899541543645038, + "loss": 0.0928, + "step": 15425 + }, + { + "epoch": 0.13390508762944767, + "grad_norm": 0.06201171875, + "learning_rate": 0.0018995278789700992, + "loss": 0.0957, + "step": 15426 + }, + { + "epoch": 0.13391376810965183, + "grad_norm": 0.44140625, + "learning_rate": 0.001899514213420809, + "loss": 0.1172, + "step": 15427 + }, + { + "epoch": 0.133922448589856, + "grad_norm": 0.427734375, + "learning_rate": 0.001899500546997182, + "loss": 0.1152, + "step": 15428 + }, + { + "epoch": 0.13393112907006016, + "grad_norm": 0.11181640625, + "learning_rate": 0.0018994868796992333, + "loss": 0.1289, + "step": 15429 + }, + { + "epoch": 0.13393980955026433, + "grad_norm": 0.10107421875, + "learning_rate": 0.0018994732115269776, + "loss": 0.0898, + "step": 15430 + }, + { + "epoch": 0.1339484900304685, + "grad_norm": 0.12451171875, + "learning_rate": 0.0018994595424804302, + "loss": 0.1069, + "step": 15431 + }, + { + "epoch": 0.13395717051067266, + "grad_norm": 0.5625, + "learning_rate": 0.0018994458725596058, + "loss": 0.1123, + "step": 15432 + }, + { + "epoch": 0.13396585099087682, + "grad_norm": 0.08984375, + "learning_rate": 0.0018994322017645193, + "loss": 0.124, + "step": 15433 + }, + { + "epoch": 0.133974531471081, + "grad_norm": 0.255859375, + "learning_rate": 0.0018994185300951856, + "loss": 0.127, + "step": 15434 + }, + { + "epoch": 0.13398321195128515, + "grad_norm": 0.59375, + "learning_rate": 0.0018994048575516204, + "loss": 0.1309, + "step": 15435 + }, + { + "epoch": 0.13399189243148932, + "grad_norm": 0.22265625, + "learning_rate": 0.0018993911841338374, + "loss": 0.1133, + "step": 15436 + }, + { + "epoch": 0.13400057291169348, + "grad_norm": 0.375, + "learning_rate": 0.0018993775098418526, + "loss": 0.1084, + "step": 15437 + }, + { + "epoch": 0.13400925339189765, + "grad_norm": 0.373046875, + "learning_rate": 0.0018993638346756804, + "loss": 0.1064, + "step": 15438 + }, + { + "epoch": 0.1340179338721018, + "grad_norm": 0.181640625, + "learning_rate": 0.001899350158635336, + "loss": 0.0942, + "step": 15439 + }, + { + "epoch": 0.13402661435230598, + "grad_norm": 0.412109375, + "learning_rate": 0.0018993364817208342, + "loss": 0.2285, + "step": 15440 + }, + { + "epoch": 0.13403529483251014, + "grad_norm": 0.0732421875, + "learning_rate": 0.00189932280393219, + "loss": 0.0811, + "step": 15441 + }, + { + "epoch": 
0.1340439753127143, + "grad_norm": 0.6171875, + "learning_rate": 0.0018993091252694188, + "loss": 0.1406, + "step": 15442 + }, + { + "epoch": 0.13405265579291847, + "grad_norm": 0.77734375, + "learning_rate": 0.0018992954457325347, + "loss": 0.0977, + "step": 15443 + }, + { + "epoch": 0.13406133627312264, + "grad_norm": 0.2099609375, + "learning_rate": 0.0018992817653215532, + "loss": 0.1021, + "step": 15444 + }, + { + "epoch": 0.1340700167533268, + "grad_norm": 0.5390625, + "learning_rate": 0.001899268084036489, + "loss": 0.0854, + "step": 15445 + }, + { + "epoch": 0.13407869723353097, + "grad_norm": 0.09228515625, + "learning_rate": 0.0018992544018773577, + "loss": 0.0928, + "step": 15446 + }, + { + "epoch": 0.13408737771373513, + "grad_norm": 0.1259765625, + "learning_rate": 0.0018992407188441734, + "loss": 0.085, + "step": 15447 + }, + { + "epoch": 0.1340960581939393, + "grad_norm": 0.21484375, + "learning_rate": 0.0018992270349369516, + "loss": 0.0972, + "step": 15448 + }, + { + "epoch": 0.13410473867414346, + "grad_norm": 0.134765625, + "learning_rate": 0.0018992133501557072, + "loss": 0.0864, + "step": 15449 + }, + { + "epoch": 0.13411341915434763, + "grad_norm": 0.390625, + "learning_rate": 0.001899199664500455, + "loss": 0.0913, + "step": 15450 + }, + { + "epoch": 0.1341220996345518, + "grad_norm": 0.296875, + "learning_rate": 0.00189918597797121, + "loss": 0.1289, + "step": 15451 + }, + { + "epoch": 0.13413078011475596, + "grad_norm": 0.1279296875, + "learning_rate": 0.0018991722905679874, + "loss": 0.1221, + "step": 15452 + }, + { + "epoch": 0.13413946059496012, + "grad_norm": 0.1328125, + "learning_rate": 0.0018991586022908016, + "loss": 0.127, + "step": 15453 + }, + { + "epoch": 0.1341481410751643, + "grad_norm": 0.1328125, + "learning_rate": 0.0018991449131396682, + "loss": 0.0767, + "step": 15454 + }, + { + "epoch": 0.13415682155536846, + "grad_norm": 0.259765625, + "learning_rate": 0.0018991312231146024, + "loss": 0.1128, + "step": 15455 + }, + { + "epoch": 0.13416550203557262, + "grad_norm": 0.302734375, + "learning_rate": 0.001899117532215618, + "loss": 0.1094, + "step": 15456 + }, + { + "epoch": 0.13417418251577679, + "grad_norm": 0.1357421875, + "learning_rate": 0.001899103840442731, + "loss": 0.1143, + "step": 15457 + }, + { + "epoch": 0.13418286299598095, + "grad_norm": 0.3203125, + "learning_rate": 0.001899090147795956, + "loss": 0.1074, + "step": 15458 + }, + { + "epoch": 0.13419154347618512, + "grad_norm": 0.388671875, + "learning_rate": 0.001899076454275308, + "loss": 0.1211, + "step": 15459 + }, + { + "epoch": 0.13420022395638928, + "grad_norm": 0.14453125, + "learning_rate": 0.001899062759880802, + "loss": 0.0913, + "step": 15460 + }, + { + "epoch": 0.13420890443659345, + "grad_norm": 0.4765625, + "learning_rate": 0.001899049064612453, + "loss": 0.1094, + "step": 15461 + }, + { + "epoch": 0.1342175849167976, + "grad_norm": 0.173828125, + "learning_rate": 0.0018990353684702759, + "loss": 0.0967, + "step": 15462 + }, + { + "epoch": 0.13422626539700175, + "grad_norm": 0.7109375, + "learning_rate": 0.001899021671454286, + "loss": 0.0957, + "step": 15463 + }, + { + "epoch": 0.1342349458772059, + "grad_norm": 1.2421875, + "learning_rate": 0.0018990079735644973, + "loss": 0.1377, + "step": 15464 + }, + { + "epoch": 0.13424362635741008, + "grad_norm": 0.359375, + "learning_rate": 0.0018989942748009261, + "loss": 0.0981, + "step": 15465 + }, + { + "epoch": 0.13425230683761424, + "grad_norm": 0.0771484375, + "learning_rate": 0.0018989805751635864, + "loss": 0.085, + 
"step": 15466 + }, + { + "epoch": 0.1342609873178184, + "grad_norm": 0.48046875, + "learning_rate": 0.0018989668746524939, + "loss": 0.1152, + "step": 15467 + }, + { + "epoch": 0.13426966779802257, + "grad_norm": 0.2001953125, + "learning_rate": 0.0018989531732676631, + "loss": 0.1143, + "step": 15468 + }, + { + "epoch": 0.13427834827822674, + "grad_norm": 0.1376953125, + "learning_rate": 0.0018989394710091092, + "loss": 0.126, + "step": 15469 + }, + { + "epoch": 0.1342870287584309, + "grad_norm": 0.33984375, + "learning_rate": 0.0018989257678768468, + "loss": 0.0996, + "step": 15470 + }, + { + "epoch": 0.13429570923863507, + "grad_norm": 0.1767578125, + "learning_rate": 0.0018989120638708914, + "loss": 0.1138, + "step": 15471 + }, + { + "epoch": 0.13430438971883923, + "grad_norm": 0.30859375, + "learning_rate": 0.001898898358991258, + "loss": 0.124, + "step": 15472 + }, + { + "epoch": 0.1343130701990434, + "grad_norm": 0.435546875, + "learning_rate": 0.0018988846532379609, + "loss": 0.1299, + "step": 15473 + }, + { + "epoch": 0.13432175067924756, + "grad_norm": 0.36328125, + "learning_rate": 0.0018988709466110157, + "loss": 0.1699, + "step": 15474 + }, + { + "epoch": 0.13433043115945173, + "grad_norm": 0.228515625, + "learning_rate": 0.0018988572391104373, + "loss": 0.1221, + "step": 15475 + }, + { + "epoch": 0.1343391116396559, + "grad_norm": 0.150390625, + "learning_rate": 0.0018988435307362406, + "loss": 0.168, + "step": 15476 + }, + { + "epoch": 0.13434779211986006, + "grad_norm": 0.7578125, + "learning_rate": 0.0018988298214884407, + "loss": 0.127, + "step": 15477 + }, + { + "epoch": 0.13435647260006423, + "grad_norm": 0.0703125, + "learning_rate": 0.0018988161113670523, + "loss": 0.0938, + "step": 15478 + }, + { + "epoch": 0.1343651530802684, + "grad_norm": 0.90234375, + "learning_rate": 0.0018988024003720907, + "loss": 0.0825, + "step": 15479 + }, + { + "epoch": 0.13437383356047256, + "grad_norm": 0.6640625, + "learning_rate": 0.001898788688503571, + "loss": 0.1445, + "step": 15480 + }, + { + "epoch": 0.13438251404067672, + "grad_norm": 0.197265625, + "learning_rate": 0.001898774975761508, + "loss": 0.1436, + "step": 15481 + }, + { + "epoch": 0.13439119452088089, + "grad_norm": 0.52734375, + "learning_rate": 0.0018987612621459164, + "loss": 0.1113, + "step": 15482 + }, + { + "epoch": 0.13439987500108505, + "grad_norm": 0.328125, + "learning_rate": 0.0018987475476568117, + "loss": 0.1133, + "step": 15483 + }, + { + "epoch": 0.13440855548128922, + "grad_norm": 0.267578125, + "learning_rate": 0.0018987338322942085, + "loss": 0.0913, + "step": 15484 + }, + { + "epoch": 0.13441723596149338, + "grad_norm": 0.2392578125, + "learning_rate": 0.001898720116058122, + "loss": 0.1572, + "step": 15485 + }, + { + "epoch": 0.13442591644169755, + "grad_norm": 0.490234375, + "learning_rate": 0.0018987063989485673, + "loss": 0.1177, + "step": 15486 + }, + { + "epoch": 0.1344345969219017, + "grad_norm": 0.390625, + "learning_rate": 0.0018986926809655593, + "loss": 0.125, + "step": 15487 + }, + { + "epoch": 0.13444327740210588, + "grad_norm": 0.55859375, + "learning_rate": 0.0018986789621091129, + "loss": 0.1113, + "step": 15488 + }, + { + "epoch": 0.13445195788231004, + "grad_norm": 0.12060546875, + "learning_rate": 0.0018986652423792433, + "loss": 0.1147, + "step": 15489 + }, + { + "epoch": 0.1344606383625142, + "grad_norm": 0.484375, + "learning_rate": 0.0018986515217759652, + "loss": 0.1104, + "step": 15490 + }, + { + "epoch": 0.13446931884271837, + "grad_norm": 0.09423828125, + "learning_rate": 
0.001898637800299294, + "loss": 0.103, + "step": 15491 + }, + { + "epoch": 0.13447799932292254, + "grad_norm": 0.11328125, + "learning_rate": 0.0018986240779492443, + "loss": 0.1143, + "step": 15492 + }, + { + "epoch": 0.1344866798031267, + "grad_norm": 0.93359375, + "learning_rate": 0.0018986103547258312, + "loss": 0.0942, + "step": 15493 + }, + { + "epoch": 0.13449536028333087, + "grad_norm": 0.30078125, + "learning_rate": 0.00189859663062907, + "loss": 0.0732, + "step": 15494 + }, + { + "epoch": 0.13450404076353503, + "grad_norm": 0.134765625, + "learning_rate": 0.0018985829056589756, + "loss": 0.125, + "step": 15495 + }, + { + "epoch": 0.1345127212437392, + "grad_norm": 0.0673828125, + "learning_rate": 0.0018985691798155632, + "loss": 0.1167, + "step": 15496 + }, + { + "epoch": 0.13452140172394336, + "grad_norm": 0.154296875, + "learning_rate": 0.001898555453098847, + "loss": 0.1133, + "step": 15497 + }, + { + "epoch": 0.13453008220414753, + "grad_norm": 0.1337890625, + "learning_rate": 0.0018985417255088426, + "loss": 0.0962, + "step": 15498 + }, + { + "epoch": 0.1345387626843517, + "grad_norm": 0.279296875, + "learning_rate": 0.0018985279970455654, + "loss": 0.1133, + "step": 15499 + }, + { + "epoch": 0.13454744316455586, + "grad_norm": 0.244140625, + "learning_rate": 0.0018985142677090297, + "loss": 0.0996, + "step": 15500 + }, + { + "epoch": 0.13455612364476002, + "grad_norm": 0.158203125, + "learning_rate": 0.0018985005374992506, + "loss": 0.1055, + "step": 15501 + }, + { + "epoch": 0.1345648041249642, + "grad_norm": 0.09716796875, + "learning_rate": 0.0018984868064162437, + "loss": 0.0957, + "step": 15502 + }, + { + "epoch": 0.13457348460516835, + "grad_norm": 0.369140625, + "learning_rate": 0.0018984730744600233, + "loss": 0.168, + "step": 15503 + }, + { + "epoch": 0.13458216508537252, + "grad_norm": 0.5390625, + "learning_rate": 0.0018984593416306048, + "loss": 0.1172, + "step": 15504 + }, + { + "epoch": 0.13459084556557668, + "grad_norm": 0.23046875, + "learning_rate": 0.0018984456079280032, + "loss": 0.1602, + "step": 15505 + }, + { + "epoch": 0.13459952604578085, + "grad_norm": 0.318359375, + "learning_rate": 0.001898431873352234, + "loss": 0.1221, + "step": 15506 + }, + { + "epoch": 0.134608206525985, + "grad_norm": 0.265625, + "learning_rate": 0.001898418137903311, + "loss": 0.0898, + "step": 15507 + }, + { + "epoch": 0.13461688700618918, + "grad_norm": 0.1904296875, + "learning_rate": 0.0018984044015812503, + "loss": 0.1562, + "step": 15508 + }, + { + "epoch": 0.13462556748639334, + "grad_norm": 0.4375, + "learning_rate": 0.0018983906643860661, + "loss": 0.1123, + "step": 15509 + }, + { + "epoch": 0.1346342479665975, + "grad_norm": 0.291015625, + "learning_rate": 0.0018983769263177742, + "loss": 0.1484, + "step": 15510 + }, + { + "epoch": 0.13464292844680167, + "grad_norm": 0.462890625, + "learning_rate": 0.0018983631873763893, + "loss": 0.0952, + "step": 15511 + }, + { + "epoch": 0.13465160892700584, + "grad_norm": 0.318359375, + "learning_rate": 0.0018983494475619262, + "loss": 0.1143, + "step": 15512 + }, + { + "epoch": 0.13466028940721, + "grad_norm": 0.6328125, + "learning_rate": 0.0018983357068744005, + "loss": 0.1016, + "step": 15513 + }, + { + "epoch": 0.13466896988741417, + "grad_norm": 0.09228515625, + "learning_rate": 0.0018983219653138266, + "loss": 0.1177, + "step": 15514 + }, + { + "epoch": 0.13467765036761833, + "grad_norm": 0.314453125, + "learning_rate": 0.00189830822288022, + "loss": 0.1187, + "step": 15515 + }, + { + "epoch": 0.1346863308478225, + 
"grad_norm": 0.2060546875, + "learning_rate": 0.0018982944795735953, + "loss": 0.0977, + "step": 15516 + }, + { + "epoch": 0.13469501132802666, + "grad_norm": 0.109375, + "learning_rate": 0.0018982807353939677, + "loss": 0.1118, + "step": 15517 + }, + { + "epoch": 0.13470369180823083, + "grad_norm": 0.103515625, + "learning_rate": 0.0018982669903413526, + "loss": 0.1016, + "step": 15518 + }, + { + "epoch": 0.134712372288435, + "grad_norm": 0.09814453125, + "learning_rate": 0.0018982532444157645, + "loss": 0.0938, + "step": 15519 + }, + { + "epoch": 0.13472105276863916, + "grad_norm": 0.2265625, + "learning_rate": 0.0018982394976172186, + "loss": 0.0864, + "step": 15520 + }, + { + "epoch": 0.13472973324884333, + "grad_norm": 0.39453125, + "learning_rate": 0.00189822574994573, + "loss": 0.1328, + "step": 15521 + }, + { + "epoch": 0.1347384137290475, + "grad_norm": 0.1796875, + "learning_rate": 0.0018982120014013138, + "loss": 0.1377, + "step": 15522 + }, + { + "epoch": 0.13474709420925166, + "grad_norm": 0.74609375, + "learning_rate": 0.0018981982519839851, + "loss": 0.125, + "step": 15523 + }, + { + "epoch": 0.13475577468945582, + "grad_norm": 0.625, + "learning_rate": 0.0018981845016937584, + "loss": 0.1797, + "step": 15524 + }, + { + "epoch": 0.13476445516965999, + "grad_norm": 0.26171875, + "learning_rate": 0.0018981707505306492, + "loss": 0.0957, + "step": 15525 + }, + { + "epoch": 0.13477313564986415, + "grad_norm": 0.4375, + "learning_rate": 0.001898156998494673, + "loss": 0.1133, + "step": 15526 + }, + { + "epoch": 0.13478181613006832, + "grad_norm": 1.2265625, + "learning_rate": 0.0018981432455858435, + "loss": 0.1836, + "step": 15527 + }, + { + "epoch": 0.13479049661027248, + "grad_norm": 0.1318359375, + "learning_rate": 0.0018981294918041772, + "loss": 0.1045, + "step": 15528 + }, + { + "epoch": 0.13479917709047665, + "grad_norm": 0.412109375, + "learning_rate": 0.0018981157371496882, + "loss": 0.082, + "step": 15529 + }, + { + "epoch": 0.1348078575706808, + "grad_norm": 0.1923828125, + "learning_rate": 0.0018981019816223916, + "loss": 0.0679, + "step": 15530 + }, + { + "epoch": 0.13481653805088498, + "grad_norm": 0.107421875, + "learning_rate": 0.001898088225222303, + "loss": 0.1206, + "step": 15531 + }, + { + "epoch": 0.13482521853108914, + "grad_norm": 0.4765625, + "learning_rate": 0.001898074467949437, + "loss": 0.1621, + "step": 15532 + }, + { + "epoch": 0.1348338990112933, + "grad_norm": 0.205078125, + "learning_rate": 0.0018980607098038084, + "loss": 0.1021, + "step": 15533 + }, + { + "epoch": 0.13484257949149747, + "grad_norm": 0.66796875, + "learning_rate": 0.0018980469507854333, + "loss": 0.127, + "step": 15534 + }, + { + "epoch": 0.13485125997170164, + "grad_norm": 0.3984375, + "learning_rate": 0.0018980331908943254, + "loss": 0.0986, + "step": 15535 + }, + { + "epoch": 0.1348599404519058, + "grad_norm": 0.26953125, + "learning_rate": 0.001898019430130501, + "loss": 0.1309, + "step": 15536 + }, + { + "epoch": 0.13486862093210997, + "grad_norm": 0.169921875, + "learning_rate": 0.001898005668493974, + "loss": 0.1123, + "step": 15537 + }, + { + "epoch": 0.13487730141231413, + "grad_norm": 0.1572265625, + "learning_rate": 0.0018979919059847604, + "loss": 0.1104, + "step": 15538 + }, + { + "epoch": 0.1348859818925183, + "grad_norm": 0.296875, + "learning_rate": 0.0018979781426028746, + "loss": 0.166, + "step": 15539 + }, + { + "epoch": 0.13489466237272246, + "grad_norm": 0.2353515625, + "learning_rate": 0.0018979643783483319, + "loss": 0.1504, + "step": 15540 + }, + { + 
"epoch": 0.13490334285292663, + "grad_norm": 0.267578125, + "learning_rate": 0.0018979506132211475, + "loss": 0.1133, + "step": 15541 + }, + { + "epoch": 0.1349120233331308, + "grad_norm": 0.447265625, + "learning_rate": 0.0018979368472213359, + "loss": 0.1143, + "step": 15542 + }, + { + "epoch": 0.13492070381333496, + "grad_norm": 0.279296875, + "learning_rate": 0.0018979230803489132, + "loss": 0.1147, + "step": 15543 + }, + { + "epoch": 0.13492938429353912, + "grad_norm": 0.212890625, + "learning_rate": 0.0018979093126038932, + "loss": 0.1523, + "step": 15544 + }, + { + "epoch": 0.1349380647737433, + "grad_norm": 0.0849609375, + "learning_rate": 0.0018978955439862917, + "loss": 0.1021, + "step": 15545 + }, + { + "epoch": 0.13494674525394745, + "grad_norm": 0.6640625, + "learning_rate": 0.0018978817744961238, + "loss": 0.1455, + "step": 15546 + }, + { + "epoch": 0.13495542573415162, + "grad_norm": 0.64453125, + "learning_rate": 0.0018978680041334044, + "loss": 0.1533, + "step": 15547 + }, + { + "epoch": 0.13496410621435578, + "grad_norm": 0.228515625, + "learning_rate": 0.0018978542328981485, + "loss": 0.1162, + "step": 15548 + }, + { + "epoch": 0.13497278669455995, + "grad_norm": 0.2412109375, + "learning_rate": 0.0018978404607903712, + "loss": 0.0894, + "step": 15549 + }, + { + "epoch": 0.1349814671747641, + "grad_norm": 0.25, + "learning_rate": 0.0018978266878100873, + "loss": 0.1074, + "step": 15550 + }, + { + "epoch": 0.13499014765496828, + "grad_norm": 0.76953125, + "learning_rate": 0.0018978129139573124, + "loss": 0.1318, + "step": 15551 + }, + { + "epoch": 0.13499882813517244, + "grad_norm": 0.7890625, + "learning_rate": 0.0018977991392320616, + "loss": 0.127, + "step": 15552 + }, + { + "epoch": 0.1350075086153766, + "grad_norm": 0.203125, + "learning_rate": 0.001897785363634349, + "loss": 0.1201, + "step": 15553 + }, + { + "epoch": 0.13501618909558077, + "grad_norm": 0.208984375, + "learning_rate": 0.0018977715871641907, + "loss": 0.1196, + "step": 15554 + }, + { + "epoch": 0.13502486957578494, + "grad_norm": 0.294921875, + "learning_rate": 0.001897757809821601, + "loss": 0.127, + "step": 15555 + }, + { + "epoch": 0.1350335500559891, + "grad_norm": 0.1953125, + "learning_rate": 0.0018977440316065956, + "loss": 0.1318, + "step": 15556 + }, + { + "epoch": 0.13504223053619327, + "grad_norm": 0.109375, + "learning_rate": 0.0018977302525191896, + "loss": 0.1406, + "step": 15557 + }, + { + "epoch": 0.13505091101639743, + "grad_norm": 0.234375, + "learning_rate": 0.0018977164725593975, + "loss": 0.1182, + "step": 15558 + }, + { + "epoch": 0.1350595914966016, + "grad_norm": 0.345703125, + "learning_rate": 0.0018977026917272343, + "loss": 0.0903, + "step": 15559 + }, + { + "epoch": 0.13506827197680576, + "grad_norm": 0.11181640625, + "learning_rate": 0.0018976889100227158, + "loss": 0.1289, + "step": 15560 + }, + { + "epoch": 0.13507695245700993, + "grad_norm": 0.10986328125, + "learning_rate": 0.0018976751274458567, + "loss": 0.1035, + "step": 15561 + }, + { + "epoch": 0.1350856329372141, + "grad_norm": 0.12255859375, + "learning_rate": 0.0018976613439966722, + "loss": 0.1641, + "step": 15562 + }, + { + "epoch": 0.13509431341741826, + "grad_norm": 0.11669921875, + "learning_rate": 0.0018976475596751772, + "loss": 0.0889, + "step": 15563 + }, + { + "epoch": 0.13510299389762243, + "grad_norm": 0.3125, + "learning_rate": 0.0018976337744813865, + "loss": 0.1318, + "step": 15564 + }, + { + "epoch": 0.1351116743778266, + "grad_norm": 0.7734375, + "learning_rate": 0.0018976199884153157, + 
"loss": 0.1143, + "step": 15565 + }, + { + "epoch": 0.13512035485803076, + "grad_norm": 0.2119140625, + "learning_rate": 0.0018976062014769794, + "loss": 0.0908, + "step": 15566 + }, + { + "epoch": 0.13512903533823492, + "grad_norm": 0.46484375, + "learning_rate": 0.0018975924136663933, + "loss": 0.1592, + "step": 15567 + }, + { + "epoch": 0.13513771581843909, + "grad_norm": 0.322265625, + "learning_rate": 0.0018975786249835723, + "loss": 0.0928, + "step": 15568 + }, + { + "epoch": 0.13514639629864325, + "grad_norm": 0.083984375, + "learning_rate": 0.0018975648354285308, + "loss": 0.1348, + "step": 15569 + }, + { + "epoch": 0.13515507677884742, + "grad_norm": 0.431640625, + "learning_rate": 0.0018975510450012847, + "loss": 0.0977, + "step": 15570 + }, + { + "epoch": 0.13516375725905158, + "grad_norm": 0.2578125, + "learning_rate": 0.0018975372537018485, + "loss": 0.1445, + "step": 15571 + }, + { + "epoch": 0.13517243773925575, + "grad_norm": 0.400390625, + "learning_rate": 0.0018975234615302378, + "loss": 0.1318, + "step": 15572 + }, + { + "epoch": 0.1351811182194599, + "grad_norm": 0.16015625, + "learning_rate": 0.0018975096684864673, + "loss": 0.1367, + "step": 15573 + }, + { + "epoch": 0.13518979869966408, + "grad_norm": 0.1015625, + "learning_rate": 0.0018974958745705522, + "loss": 0.1123, + "step": 15574 + }, + { + "epoch": 0.13519847917986824, + "grad_norm": 0.361328125, + "learning_rate": 0.0018974820797825075, + "loss": 0.0981, + "step": 15575 + }, + { + "epoch": 0.1352071596600724, + "grad_norm": 0.126953125, + "learning_rate": 0.0018974682841223485, + "loss": 0.1396, + "step": 15576 + }, + { + "epoch": 0.13521584014027657, + "grad_norm": 0.88671875, + "learning_rate": 0.00189745448759009, + "loss": 0.1299, + "step": 15577 + }, + { + "epoch": 0.13522452062048074, + "grad_norm": 0.244140625, + "learning_rate": 0.0018974406901857476, + "loss": 0.1069, + "step": 15578 + }, + { + "epoch": 0.1352332011006849, + "grad_norm": 0.1630859375, + "learning_rate": 0.001897426891909336, + "loss": 0.1387, + "step": 15579 + }, + { + "epoch": 0.13524188158088907, + "grad_norm": 0.19140625, + "learning_rate": 0.0018974130927608699, + "loss": 0.0918, + "step": 15580 + }, + { + "epoch": 0.13525056206109323, + "grad_norm": 0.2158203125, + "learning_rate": 0.0018973992927403653, + "loss": 0.0967, + "step": 15581 + }, + { + "epoch": 0.1352592425412974, + "grad_norm": 0.1875, + "learning_rate": 0.0018973854918478367, + "loss": 0.1338, + "step": 15582 + }, + { + "epoch": 0.13526792302150156, + "grad_norm": 0.43359375, + "learning_rate": 0.0018973716900832992, + "loss": 0.1074, + "step": 15583 + }, + { + "epoch": 0.13527660350170573, + "grad_norm": 0.80078125, + "learning_rate": 0.0018973578874467679, + "loss": 0.1152, + "step": 15584 + }, + { + "epoch": 0.1352852839819099, + "grad_norm": 0.140625, + "learning_rate": 0.001897344083938258, + "loss": 0.0791, + "step": 15585 + }, + { + "epoch": 0.13529396446211406, + "grad_norm": 0.97265625, + "learning_rate": 0.0018973302795577847, + "loss": 0.1104, + "step": 15586 + }, + { + "epoch": 0.1353026449423182, + "grad_norm": 0.5390625, + "learning_rate": 0.001897316474305363, + "loss": 0.1475, + "step": 15587 + }, + { + "epoch": 0.13531132542252236, + "grad_norm": 0.330078125, + "learning_rate": 0.001897302668181008, + "loss": 0.0996, + "step": 15588 + }, + { + "epoch": 0.13532000590272653, + "grad_norm": 0.177734375, + "learning_rate": 0.001897288861184735, + "loss": 0.1138, + "step": 15589 + }, + { + "epoch": 0.1353286863829307, + "grad_norm": 0.177734375, + 
"learning_rate": 0.0018972750533165585, + "loss": 0.1133, + "step": 15590 + }, + { + "epoch": 0.13533736686313486, + "grad_norm": 0.33203125, + "learning_rate": 0.0018972612445764942, + "loss": 0.1035, + "step": 15591 + }, + { + "epoch": 0.13534604734333902, + "grad_norm": 0.328125, + "learning_rate": 0.0018972474349645567, + "loss": 0.1641, + "step": 15592 + }, + { + "epoch": 0.13535472782354319, + "grad_norm": 0.3515625, + "learning_rate": 0.0018972336244807617, + "loss": 0.1455, + "step": 15593 + }, + { + "epoch": 0.13536340830374735, + "grad_norm": 0.2890625, + "learning_rate": 0.0018972198131251238, + "loss": 0.126, + "step": 15594 + }, + { + "epoch": 0.13537208878395152, + "grad_norm": 0.1455078125, + "learning_rate": 0.0018972060008976583, + "loss": 0.1045, + "step": 15595 + }, + { + "epoch": 0.13538076926415568, + "grad_norm": 0.07177734375, + "learning_rate": 0.0018971921877983804, + "loss": 0.1211, + "step": 15596 + }, + { + "epoch": 0.13538944974435985, + "grad_norm": 0.3046875, + "learning_rate": 0.0018971783738273053, + "loss": 0.1523, + "step": 15597 + }, + { + "epoch": 0.135398130224564, + "grad_norm": 0.09423828125, + "learning_rate": 0.0018971645589844475, + "loss": 0.0879, + "step": 15598 + }, + { + "epoch": 0.13540681070476818, + "grad_norm": 0.41796875, + "learning_rate": 0.0018971507432698224, + "loss": 0.1299, + "step": 15599 + }, + { + "epoch": 0.13541549118497234, + "grad_norm": 0.1826171875, + "learning_rate": 0.0018971369266834457, + "loss": 0.1104, + "step": 15600 + }, + { + "epoch": 0.1354241716651765, + "grad_norm": 0.0771484375, + "learning_rate": 0.0018971231092253318, + "loss": 0.1006, + "step": 15601 + }, + { + "epoch": 0.13543285214538067, + "grad_norm": 0.2373046875, + "learning_rate": 0.0018971092908954964, + "loss": 0.1021, + "step": 15602 + }, + { + "epoch": 0.13544153262558484, + "grad_norm": 0.1318359375, + "learning_rate": 0.001897095471693954, + "loss": 0.1191, + "step": 15603 + }, + { + "epoch": 0.135450213105789, + "grad_norm": 0.421875, + "learning_rate": 0.00189708165162072, + "loss": 0.1279, + "step": 15604 + }, + { + "epoch": 0.13545889358599317, + "grad_norm": 0.306640625, + "learning_rate": 0.001897067830675809, + "loss": 0.1172, + "step": 15605 + }, + { + "epoch": 0.13546757406619733, + "grad_norm": 0.10302734375, + "learning_rate": 0.0018970540088592371, + "loss": 0.1221, + "step": 15606 + }, + { + "epoch": 0.1354762545464015, + "grad_norm": 0.1767578125, + "learning_rate": 0.0018970401861710187, + "loss": 0.0967, + "step": 15607 + }, + { + "epoch": 0.13548493502660566, + "grad_norm": 0.234375, + "learning_rate": 0.0018970263626111694, + "loss": 0.1006, + "step": 15608 + }, + { + "epoch": 0.13549361550680983, + "grad_norm": 0.2138671875, + "learning_rate": 0.001897012538179704, + "loss": 0.1123, + "step": 15609 + }, + { + "epoch": 0.135502295987014, + "grad_norm": 0.193359375, + "learning_rate": 0.0018969987128766375, + "loss": 0.1108, + "step": 15610 + }, + { + "epoch": 0.13551097646721816, + "grad_norm": 0.51953125, + "learning_rate": 0.001896984886701985, + "loss": 0.1543, + "step": 15611 + }, + { + "epoch": 0.13551965694742232, + "grad_norm": 0.1796875, + "learning_rate": 0.0018969710596557622, + "loss": 0.0942, + "step": 15612 + }, + { + "epoch": 0.1355283374276265, + "grad_norm": 1.1328125, + "learning_rate": 0.0018969572317379837, + "loss": 0.2158, + "step": 15613 + }, + { + "epoch": 0.13553701790783065, + "grad_norm": 0.609375, + "learning_rate": 0.0018969434029486646, + "loss": 0.1279, + "step": 15614 + }, + { + "epoch": 
0.13554569838803482, + "grad_norm": 0.498046875, + "learning_rate": 0.0018969295732878203, + "loss": 0.1133, + "step": 15615 + }, + { + "epoch": 0.13555437886823898, + "grad_norm": 0.349609375, + "learning_rate": 0.0018969157427554657, + "loss": 0.1533, + "step": 15616 + }, + { + "epoch": 0.13556305934844315, + "grad_norm": 3.421875, + "learning_rate": 0.001896901911351616, + "loss": 0.7305, + "step": 15617 + }, + { + "epoch": 0.13557173982864731, + "grad_norm": 0.31640625, + "learning_rate": 0.0018968880790762863, + "loss": 0.1016, + "step": 15618 + }, + { + "epoch": 0.13558042030885148, + "grad_norm": 0.38671875, + "learning_rate": 0.0018968742459294918, + "loss": 0.1191, + "step": 15619 + }, + { + "epoch": 0.13558910078905564, + "grad_norm": 0.189453125, + "learning_rate": 0.0018968604119112477, + "loss": 0.0874, + "step": 15620 + }, + { + "epoch": 0.1355977812692598, + "grad_norm": 1.0859375, + "learning_rate": 0.0018968465770215692, + "loss": 0.1445, + "step": 15621 + }, + { + "epoch": 0.13560646174946397, + "grad_norm": 0.90234375, + "learning_rate": 0.0018968327412604712, + "loss": 0.1465, + "step": 15622 + }, + { + "epoch": 0.13561514222966814, + "grad_norm": 1.1015625, + "learning_rate": 0.0018968189046279687, + "loss": 0.1118, + "step": 15623 + }, + { + "epoch": 0.1356238227098723, + "grad_norm": 0.21484375, + "learning_rate": 0.0018968050671240768, + "loss": 0.1201, + "step": 15624 + }, + { + "epoch": 0.13563250319007647, + "grad_norm": 0.87890625, + "learning_rate": 0.0018967912287488112, + "loss": 0.2363, + "step": 15625 + }, + { + "epoch": 0.13564118367028064, + "grad_norm": 0.17578125, + "learning_rate": 0.0018967773895021865, + "loss": 0.1094, + "step": 15626 + }, + { + "epoch": 0.1356498641504848, + "grad_norm": 0.154296875, + "learning_rate": 0.0018967635493842186, + "loss": 0.123, + "step": 15627 + }, + { + "epoch": 0.13565854463068897, + "grad_norm": 0.2470703125, + "learning_rate": 0.0018967497083949214, + "loss": 0.1357, + "step": 15628 + }, + { + "epoch": 0.13566722511089313, + "grad_norm": 0.3046875, + "learning_rate": 0.0018967358665343108, + "loss": 0.1191, + "step": 15629 + }, + { + "epoch": 0.1356759055910973, + "grad_norm": 0.65234375, + "learning_rate": 0.0018967220238024017, + "loss": 0.1367, + "step": 15630 + }, + { + "epoch": 0.13568458607130146, + "grad_norm": 0.1708984375, + "learning_rate": 0.0018967081801992097, + "loss": 0.1201, + "step": 15631 + }, + { + "epoch": 0.13569326655150563, + "grad_norm": 0.21875, + "learning_rate": 0.0018966943357247494, + "loss": 0.1367, + "step": 15632 + }, + { + "epoch": 0.1357019470317098, + "grad_norm": 0.5390625, + "learning_rate": 0.0018966804903790364, + "loss": 0.126, + "step": 15633 + }, + { + "epoch": 0.13571062751191396, + "grad_norm": 0.490234375, + "learning_rate": 0.0018966666441620854, + "loss": 0.1172, + "step": 15634 + }, + { + "epoch": 0.13571930799211812, + "grad_norm": 0.4296875, + "learning_rate": 0.0018966527970739118, + "loss": 0.1465, + "step": 15635 + }, + { + "epoch": 0.1357279884723223, + "grad_norm": 0.326171875, + "learning_rate": 0.0018966389491145306, + "loss": 0.1602, + "step": 15636 + }, + { + "epoch": 0.13573666895252645, + "grad_norm": 0.22265625, + "learning_rate": 0.0018966251002839568, + "loss": 0.1118, + "step": 15637 + }, + { + "epoch": 0.13574534943273062, + "grad_norm": 0.0947265625, + "learning_rate": 0.001896611250582206, + "loss": 0.1006, + "step": 15638 + }, + { + "epoch": 0.13575402991293478, + "grad_norm": 0.1025390625, + "learning_rate": 0.001896597400009293, + "loss": 
0.0991, + "step": 15639 + }, + { + "epoch": 0.13576271039313895, + "grad_norm": 0.109375, + "learning_rate": 0.0018965835485652332, + "loss": 0.1309, + "step": 15640 + }, + { + "epoch": 0.1357713908733431, + "grad_norm": 0.404296875, + "learning_rate": 0.0018965696962500416, + "loss": 0.1396, + "step": 15641 + }, + { + "epoch": 0.13578007135354728, + "grad_norm": 0.099609375, + "learning_rate": 0.001896555843063733, + "loss": 0.0879, + "step": 15642 + }, + { + "epoch": 0.13578875183375144, + "grad_norm": 0.6796875, + "learning_rate": 0.0018965419890063233, + "loss": 0.1191, + "step": 15643 + }, + { + "epoch": 0.1357974323139556, + "grad_norm": 0.08837890625, + "learning_rate": 0.0018965281340778274, + "loss": 0.1484, + "step": 15644 + }, + { + "epoch": 0.13580611279415977, + "grad_norm": 0.20703125, + "learning_rate": 0.0018965142782782598, + "loss": 0.1523, + "step": 15645 + }, + { + "epoch": 0.13581479327436394, + "grad_norm": 0.248046875, + "learning_rate": 0.0018965004216076364, + "loss": 0.1055, + "step": 15646 + }, + { + "epoch": 0.1358234737545681, + "grad_norm": 0.07861328125, + "learning_rate": 0.001896486564065972, + "loss": 0.1484, + "step": 15647 + }, + { + "epoch": 0.13583215423477227, + "grad_norm": 0.2099609375, + "learning_rate": 0.0018964727056532819, + "loss": 0.1436, + "step": 15648 + }, + { + "epoch": 0.13584083471497643, + "grad_norm": 0.318359375, + "learning_rate": 0.0018964588463695811, + "loss": 0.1123, + "step": 15649 + }, + { + "epoch": 0.1358495151951806, + "grad_norm": 0.734375, + "learning_rate": 0.001896444986214885, + "loss": 0.1484, + "step": 15650 + }, + { + "epoch": 0.13585819567538476, + "grad_norm": 0.62109375, + "learning_rate": 0.001896431125189209, + "loss": 0.1147, + "step": 15651 + }, + { + "epoch": 0.13586687615558893, + "grad_norm": 0.14453125, + "learning_rate": 0.0018964172632925673, + "loss": 0.0928, + "step": 15652 + }, + { + "epoch": 0.1358755566357931, + "grad_norm": 0.91015625, + "learning_rate": 0.0018964034005249759, + "loss": 0.1484, + "step": 15653 + }, + { + "epoch": 0.13588423711599726, + "grad_norm": 0.3828125, + "learning_rate": 0.0018963895368864499, + "loss": 0.1094, + "step": 15654 + }, + { + "epoch": 0.13589291759620142, + "grad_norm": 0.25390625, + "learning_rate": 0.0018963756723770037, + "loss": 0.1328, + "step": 15655 + }, + { + "epoch": 0.1359015980764056, + "grad_norm": 0.220703125, + "learning_rate": 0.0018963618069966538, + "loss": 0.0986, + "step": 15656 + }, + { + "epoch": 0.13591027855660975, + "grad_norm": 0.50390625, + "learning_rate": 0.0018963479407454141, + "loss": 0.1318, + "step": 15657 + }, + { + "epoch": 0.13591895903681392, + "grad_norm": 0.78125, + "learning_rate": 0.0018963340736233001, + "loss": 0.1113, + "step": 15658 + }, + { + "epoch": 0.13592763951701808, + "grad_norm": 0.330078125, + "learning_rate": 0.0018963202056303276, + "loss": 0.1396, + "step": 15659 + }, + { + "epoch": 0.13593631999722225, + "grad_norm": 0.232421875, + "learning_rate": 0.001896306336766511, + "loss": 0.0928, + "step": 15660 + }, + { + "epoch": 0.13594500047742641, + "grad_norm": 0.20703125, + "learning_rate": 0.0018962924670318658, + "loss": 0.0918, + "step": 15661 + }, + { + "epoch": 0.13595368095763058, + "grad_norm": 0.1279296875, + "learning_rate": 0.0018962785964264073, + "loss": 0.1001, + "step": 15662 + }, + { + "epoch": 0.13596236143783474, + "grad_norm": 0.234375, + "learning_rate": 0.0018962647249501502, + "loss": 0.1475, + "step": 15663 + }, + { + "epoch": 0.1359710419180389, + "grad_norm": 0.126953125, + 
"learning_rate": 0.0018962508526031102, + "loss": 0.0981, + "step": 15664 + }, + { + "epoch": 0.13597972239824307, + "grad_norm": 0.2080078125, + "learning_rate": 0.0018962369793853022, + "loss": 0.1426, + "step": 15665 + }, + { + "epoch": 0.13598840287844724, + "grad_norm": 0.298828125, + "learning_rate": 0.0018962231052967412, + "loss": 0.1162, + "step": 15666 + }, + { + "epoch": 0.1359970833586514, + "grad_norm": 0.359375, + "learning_rate": 0.0018962092303374427, + "loss": 0.0918, + "step": 15667 + }, + { + "epoch": 0.13600576383885557, + "grad_norm": 0.396484375, + "learning_rate": 0.0018961953545074217, + "loss": 0.0986, + "step": 15668 + }, + { + "epoch": 0.13601444431905974, + "grad_norm": 0.27734375, + "learning_rate": 0.0018961814778066936, + "loss": 0.106, + "step": 15669 + }, + { + "epoch": 0.1360231247992639, + "grad_norm": 0.20703125, + "learning_rate": 0.0018961676002352732, + "loss": 0.105, + "step": 15670 + }, + { + "epoch": 0.13603180527946807, + "grad_norm": 0.2353515625, + "learning_rate": 0.0018961537217931758, + "loss": 0.0889, + "step": 15671 + }, + { + "epoch": 0.13604048575967223, + "grad_norm": 0.390625, + "learning_rate": 0.0018961398424804168, + "loss": 0.1138, + "step": 15672 + }, + { + "epoch": 0.1360491662398764, + "grad_norm": 0.2578125, + "learning_rate": 0.0018961259622970112, + "loss": 0.1152, + "step": 15673 + }, + { + "epoch": 0.13605784672008056, + "grad_norm": 0.734375, + "learning_rate": 0.001896112081242974, + "loss": 0.1299, + "step": 15674 + }, + { + "epoch": 0.13606652720028473, + "grad_norm": 0.1943359375, + "learning_rate": 0.001896098199318321, + "loss": 0.1025, + "step": 15675 + }, + { + "epoch": 0.1360752076804889, + "grad_norm": 0.1279296875, + "learning_rate": 0.0018960843165230668, + "loss": 0.1299, + "step": 15676 + }, + { + "epoch": 0.13608388816069306, + "grad_norm": 1.0234375, + "learning_rate": 0.001896070432857227, + "loss": 0.0908, + "step": 15677 + }, + { + "epoch": 0.13609256864089722, + "grad_norm": 0.201171875, + "learning_rate": 0.0018960565483208162, + "loss": 0.125, + "step": 15678 + }, + { + "epoch": 0.1361012491211014, + "grad_norm": 0.1552734375, + "learning_rate": 0.0018960426629138498, + "loss": 0.0991, + "step": 15679 + }, + { + "epoch": 0.13610992960130555, + "grad_norm": 0.734375, + "learning_rate": 0.0018960287766363432, + "loss": 0.168, + "step": 15680 + }, + { + "epoch": 0.13611861008150972, + "grad_norm": 0.365234375, + "learning_rate": 0.001896014889488312, + "loss": 0.1377, + "step": 15681 + }, + { + "epoch": 0.13612729056171388, + "grad_norm": 0.08837890625, + "learning_rate": 0.00189600100146977, + "loss": 0.1289, + "step": 15682 + }, + { + "epoch": 0.13613597104191805, + "grad_norm": 0.478515625, + "learning_rate": 0.001895987112580734, + "loss": 0.1133, + "step": 15683 + }, + { + "epoch": 0.1361446515221222, + "grad_norm": 0.38671875, + "learning_rate": 0.0018959732228212183, + "loss": 0.1338, + "step": 15684 + }, + { + "epoch": 0.13615333200232638, + "grad_norm": 0.09521484375, + "learning_rate": 0.001895959332191238, + "loss": 0.1201, + "step": 15685 + }, + { + "epoch": 0.13616201248253054, + "grad_norm": 0.2236328125, + "learning_rate": 0.0018959454406908085, + "loss": 0.1445, + "step": 15686 + }, + { + "epoch": 0.1361706929627347, + "grad_norm": 0.1787109375, + "learning_rate": 0.0018959315483199452, + "loss": 0.1699, + "step": 15687 + }, + { + "epoch": 0.13617937344293887, + "grad_norm": 0.1015625, + "learning_rate": 0.0018959176550786634, + "loss": 0.1436, + "step": 15688 + }, + { + "epoch": 
0.13618805392314304, + "grad_norm": 0.91015625, + "learning_rate": 0.0018959037609669778, + "loss": 0.084, + "step": 15689 + }, + { + "epoch": 0.1361967344033472, + "grad_norm": 0.59375, + "learning_rate": 0.0018958898659849036, + "loss": 0.0947, + "step": 15690 + }, + { + "epoch": 0.13620541488355137, + "grad_norm": 0.53515625, + "learning_rate": 0.0018958759701324561, + "loss": 0.0913, + "step": 15691 + }, + { + "epoch": 0.13621409536375553, + "grad_norm": 0.91796875, + "learning_rate": 0.001895862073409651, + "loss": 0.1089, + "step": 15692 + }, + { + "epoch": 0.1362227758439597, + "grad_norm": 0.11865234375, + "learning_rate": 0.001895848175816503, + "loss": 0.1196, + "step": 15693 + }, + { + "epoch": 0.13623145632416386, + "grad_norm": 0.494140625, + "learning_rate": 0.0018958342773530274, + "loss": 0.1006, + "step": 15694 + }, + { + "epoch": 0.13624013680436803, + "grad_norm": 0.220703125, + "learning_rate": 0.001895820378019239, + "loss": 0.1074, + "step": 15695 + }, + { + "epoch": 0.1362488172845722, + "grad_norm": 0.47265625, + "learning_rate": 0.001895806477815154, + "loss": 0.123, + "step": 15696 + }, + { + "epoch": 0.13625749776477636, + "grad_norm": 0.36328125, + "learning_rate": 0.0018957925767407865, + "loss": 0.1484, + "step": 15697 + }, + { + "epoch": 0.13626617824498052, + "grad_norm": 0.0830078125, + "learning_rate": 0.0018957786747961524, + "loss": 0.0835, + "step": 15698 + }, + { + "epoch": 0.1362748587251847, + "grad_norm": 0.287109375, + "learning_rate": 0.0018957647719812668, + "loss": 0.0967, + "step": 15699 + }, + { + "epoch": 0.13628353920538885, + "grad_norm": 0.181640625, + "learning_rate": 0.0018957508682961445, + "loss": 0.1221, + "step": 15700 + }, + { + "epoch": 0.13629221968559302, + "grad_norm": 0.75390625, + "learning_rate": 0.0018957369637408012, + "loss": 0.1152, + "step": 15701 + }, + { + "epoch": 0.13630090016579718, + "grad_norm": 0.6875, + "learning_rate": 0.001895723058315252, + "loss": 0.1416, + "step": 15702 + }, + { + "epoch": 0.13630958064600135, + "grad_norm": 0.2197265625, + "learning_rate": 0.0018957091520195118, + "loss": 0.1074, + "step": 15703 + }, + { + "epoch": 0.13631826112620551, + "grad_norm": 0.484375, + "learning_rate": 0.0018956952448535961, + "loss": 0.2109, + "step": 15704 + }, + { + "epoch": 0.13632694160640968, + "grad_norm": 0.09619140625, + "learning_rate": 0.00189568133681752, + "loss": 0.1348, + "step": 15705 + }, + { + "epoch": 0.13633562208661384, + "grad_norm": 0.458984375, + "learning_rate": 0.0018956674279112988, + "loss": 0.1201, + "step": 15706 + }, + { + "epoch": 0.136344302566818, + "grad_norm": 0.1640625, + "learning_rate": 0.0018956535181349479, + "loss": 0.1133, + "step": 15707 + }, + { + "epoch": 0.13635298304702217, + "grad_norm": 0.640625, + "learning_rate": 0.001895639607488482, + "loss": 0.1289, + "step": 15708 + }, + { + "epoch": 0.13636166352722634, + "grad_norm": 0.443359375, + "learning_rate": 0.0018956256959719168, + "loss": 0.1235, + "step": 15709 + }, + { + "epoch": 0.13637034400743048, + "grad_norm": 0.259765625, + "learning_rate": 0.001895611783585267, + "loss": 0.167, + "step": 15710 + }, + { + "epoch": 0.13637902448763464, + "grad_norm": 0.310546875, + "learning_rate": 0.0018955978703285481, + "loss": 0.1162, + "step": 15711 + }, + { + "epoch": 0.1363877049678388, + "grad_norm": 0.19140625, + "learning_rate": 0.0018955839562017757, + "loss": 0.085, + "step": 15712 + }, + { + "epoch": 0.13639638544804297, + "grad_norm": 0.46484375, + "learning_rate": 0.0018955700412049642, + "loss": 0.0796, + 
"step": 15713 + }, + { + "epoch": 0.13640506592824714, + "grad_norm": 0.09716796875, + "learning_rate": 0.0018955561253381295, + "loss": 0.1289, + "step": 15714 + }, + { + "epoch": 0.1364137464084513, + "grad_norm": 0.2265625, + "learning_rate": 0.0018955422086012863, + "loss": 0.1201, + "step": 15715 + }, + { + "epoch": 0.13642242688865547, + "grad_norm": 0.73828125, + "learning_rate": 0.0018955282909944504, + "loss": 0.0806, + "step": 15716 + }, + { + "epoch": 0.13643110736885963, + "grad_norm": 0.5859375, + "learning_rate": 0.0018955143725176368, + "loss": 0.1016, + "step": 15717 + }, + { + "epoch": 0.1364397878490638, + "grad_norm": 0.1669921875, + "learning_rate": 0.0018955004531708605, + "loss": 0.1699, + "step": 15718 + }, + { + "epoch": 0.13644846832926796, + "grad_norm": 0.482421875, + "learning_rate": 0.0018954865329541366, + "loss": 0.1025, + "step": 15719 + }, + { + "epoch": 0.13645714880947213, + "grad_norm": 0.373046875, + "learning_rate": 0.001895472611867481, + "loss": 0.1226, + "step": 15720 + }, + { + "epoch": 0.1364658292896763, + "grad_norm": 0.3671875, + "learning_rate": 0.0018954586899109084, + "loss": 0.1211, + "step": 15721 + }, + { + "epoch": 0.13647450976988046, + "grad_norm": 0.125, + "learning_rate": 0.0018954447670844338, + "loss": 0.1416, + "step": 15722 + }, + { + "epoch": 0.13648319025008462, + "grad_norm": 0.177734375, + "learning_rate": 0.001895430843388073, + "loss": 0.1221, + "step": 15723 + }, + { + "epoch": 0.1364918707302888, + "grad_norm": 0.10498046875, + "learning_rate": 0.001895416918821841, + "loss": 0.0913, + "step": 15724 + }, + { + "epoch": 0.13650055121049295, + "grad_norm": 0.58984375, + "learning_rate": 0.0018954029933857528, + "loss": 0.1123, + "step": 15725 + }, + { + "epoch": 0.13650923169069712, + "grad_norm": 0.1396484375, + "learning_rate": 0.0018953890670798243, + "loss": 0.1069, + "step": 15726 + }, + { + "epoch": 0.13651791217090128, + "grad_norm": 0.296875, + "learning_rate": 0.0018953751399040698, + "loss": 0.083, + "step": 15727 + }, + { + "epoch": 0.13652659265110545, + "grad_norm": 0.369140625, + "learning_rate": 0.001895361211858505, + "loss": 0.1162, + "step": 15728 + }, + { + "epoch": 0.13653527313130961, + "grad_norm": 0.208984375, + "learning_rate": 0.0018953472829431456, + "loss": 0.1084, + "step": 15729 + }, + { + "epoch": 0.13654395361151378, + "grad_norm": 0.546875, + "learning_rate": 0.001895333353158006, + "loss": 0.0918, + "step": 15730 + }, + { + "epoch": 0.13655263409171794, + "grad_norm": 0.091796875, + "learning_rate": 0.0018953194225031019, + "loss": 0.1064, + "step": 15731 + }, + { + "epoch": 0.1365613145719221, + "grad_norm": 0.220703125, + "learning_rate": 0.0018953054909784483, + "loss": 0.1357, + "step": 15732 + }, + { + "epoch": 0.13656999505212628, + "grad_norm": 0.08740234375, + "learning_rate": 0.0018952915585840604, + "loss": 0.1104, + "step": 15733 + }, + { + "epoch": 0.13657867553233044, + "grad_norm": 0.271484375, + "learning_rate": 0.001895277625319954, + "loss": 0.1108, + "step": 15734 + }, + { + "epoch": 0.1365873560125346, + "grad_norm": 1.546875, + "learning_rate": 0.001895263691186144, + "loss": 0.0952, + "step": 15735 + }, + { + "epoch": 0.13659603649273877, + "grad_norm": 0.1396484375, + "learning_rate": 0.0018952497561826455, + "loss": 0.0947, + "step": 15736 + }, + { + "epoch": 0.13660471697294294, + "grad_norm": 0.4453125, + "learning_rate": 0.0018952358203094734, + "loss": 0.1279, + "step": 15737 + }, + { + "epoch": 0.1366133974531471, + "grad_norm": 0.28515625, + "learning_rate": 
0.0018952218835666438, + "loss": 0.1118, + "step": 15738 + }, + { + "epoch": 0.13662207793335127, + "grad_norm": 0.67578125, + "learning_rate": 0.0018952079459541712, + "loss": 0.1279, + "step": 15739 + }, + { + "epoch": 0.13663075841355543, + "grad_norm": 0.240234375, + "learning_rate": 0.0018951940074720714, + "loss": 0.125, + "step": 15740 + }, + { + "epoch": 0.1366394388937596, + "grad_norm": 0.2177734375, + "learning_rate": 0.001895180068120359, + "loss": 0.1299, + "step": 15741 + }, + { + "epoch": 0.13664811937396376, + "grad_norm": 0.326171875, + "learning_rate": 0.00189516612789905, + "loss": 0.1064, + "step": 15742 + }, + { + "epoch": 0.13665679985416793, + "grad_norm": 0.9375, + "learning_rate": 0.0018951521868081592, + "loss": 0.126, + "step": 15743 + }, + { + "epoch": 0.1366654803343721, + "grad_norm": 0.1865234375, + "learning_rate": 0.001895138244847702, + "loss": 0.1592, + "step": 15744 + }, + { + "epoch": 0.13667416081457626, + "grad_norm": 0.15625, + "learning_rate": 0.0018951243020176934, + "loss": 0.1504, + "step": 15745 + }, + { + "epoch": 0.13668284129478042, + "grad_norm": 0.1103515625, + "learning_rate": 0.0018951103583181486, + "loss": 0.1406, + "step": 15746 + }, + { + "epoch": 0.1366915217749846, + "grad_norm": 0.57421875, + "learning_rate": 0.0018950964137490834, + "loss": 0.1299, + "step": 15747 + }, + { + "epoch": 0.13670020225518875, + "grad_norm": 0.08984375, + "learning_rate": 0.0018950824683105125, + "loss": 0.0933, + "step": 15748 + }, + { + "epoch": 0.13670888273539292, + "grad_norm": 0.33203125, + "learning_rate": 0.0018950685220024514, + "loss": 0.1621, + "step": 15749 + }, + { + "epoch": 0.13671756321559708, + "grad_norm": 0.107421875, + "learning_rate": 0.0018950545748249157, + "loss": 0.1426, + "step": 15750 + }, + { + "epoch": 0.13672624369580125, + "grad_norm": 0.1337890625, + "learning_rate": 0.00189504062677792, + "loss": 0.0918, + "step": 15751 + }, + { + "epoch": 0.1367349241760054, + "grad_norm": 0.322265625, + "learning_rate": 0.0018950266778614796, + "loss": 0.1011, + "step": 15752 + }, + { + "epoch": 0.13674360465620958, + "grad_norm": 0.125, + "learning_rate": 0.00189501272807561, + "loss": 0.1338, + "step": 15753 + }, + { + "epoch": 0.13675228513641374, + "grad_norm": 0.29296875, + "learning_rate": 0.0018949987774203266, + "loss": 0.1201, + "step": 15754 + }, + { + "epoch": 0.1367609656166179, + "grad_norm": 0.1220703125, + "learning_rate": 0.0018949848258956446, + "loss": 0.1309, + "step": 15755 + }, + { + "epoch": 0.13676964609682207, + "grad_norm": 0.10302734375, + "learning_rate": 0.001894970873501579, + "loss": 0.1191, + "step": 15756 + }, + { + "epoch": 0.13677832657702624, + "grad_norm": 0.091796875, + "learning_rate": 0.001894956920238145, + "loss": 0.0947, + "step": 15757 + }, + { + "epoch": 0.1367870070572304, + "grad_norm": 1.1484375, + "learning_rate": 0.0018949429661053582, + "loss": 0.1533, + "step": 15758 + }, + { + "epoch": 0.13679568753743457, + "grad_norm": 0.53125, + "learning_rate": 0.0018949290111032338, + "loss": 0.1143, + "step": 15759 + }, + { + "epoch": 0.13680436801763873, + "grad_norm": 0.08544921875, + "learning_rate": 0.0018949150552317869, + "loss": 0.126, + "step": 15760 + }, + { + "epoch": 0.1368130484978429, + "grad_norm": 0.84765625, + "learning_rate": 0.0018949010984910327, + "loss": 0.1074, + "step": 15761 + }, + { + "epoch": 0.13682172897804706, + "grad_norm": 0.08740234375, + "learning_rate": 0.0018948871408809868, + "loss": 0.0957, + "step": 15762 + }, + { + "epoch": 0.13683040945825123, + 
"grad_norm": 0.30859375, + "learning_rate": 0.0018948731824016641, + "loss": 0.1138, + "step": 15763 + }, + { + "epoch": 0.1368390899384554, + "grad_norm": 0.365234375, + "learning_rate": 0.0018948592230530804, + "loss": 0.1138, + "step": 15764 + }, + { + "epoch": 0.13684777041865956, + "grad_norm": 0.2060546875, + "learning_rate": 0.0018948452628352501, + "loss": 0.1992, + "step": 15765 + }, + { + "epoch": 0.13685645089886372, + "grad_norm": 0.34375, + "learning_rate": 0.0018948313017481894, + "loss": 0.0991, + "step": 15766 + }, + { + "epoch": 0.1368651313790679, + "grad_norm": 0.50390625, + "learning_rate": 0.0018948173397919132, + "loss": 0.1699, + "step": 15767 + }, + { + "epoch": 0.13687381185927205, + "grad_norm": 0.375, + "learning_rate": 0.0018948033769664362, + "loss": 0.127, + "step": 15768 + }, + { + "epoch": 0.13688249233947622, + "grad_norm": 0.298828125, + "learning_rate": 0.0018947894132717742, + "loss": 0.1182, + "step": 15769 + }, + { + "epoch": 0.13689117281968038, + "grad_norm": 0.99609375, + "learning_rate": 0.001894775448707943, + "loss": 0.1816, + "step": 15770 + }, + { + "epoch": 0.13689985329988455, + "grad_norm": 0.296875, + "learning_rate": 0.001894761483274957, + "loss": 0.1021, + "step": 15771 + }, + { + "epoch": 0.13690853378008871, + "grad_norm": 0.52734375, + "learning_rate": 0.0018947475169728315, + "loss": 0.1064, + "step": 15772 + }, + { + "epoch": 0.13691721426029288, + "grad_norm": 0.1748046875, + "learning_rate": 0.0018947335498015823, + "loss": 0.1201, + "step": 15773 + }, + { + "epoch": 0.13692589474049705, + "grad_norm": 0.0966796875, + "learning_rate": 0.0018947195817612247, + "loss": 0.0996, + "step": 15774 + }, + { + "epoch": 0.1369345752207012, + "grad_norm": 0.1962890625, + "learning_rate": 0.0018947056128517732, + "loss": 0.1406, + "step": 15775 + }, + { + "epoch": 0.13694325570090538, + "grad_norm": 0.318359375, + "learning_rate": 0.001894691643073244, + "loss": 0.0957, + "step": 15776 + }, + { + "epoch": 0.13695193618110954, + "grad_norm": 0.35546875, + "learning_rate": 0.0018946776724256514, + "loss": 0.1104, + "step": 15777 + }, + { + "epoch": 0.1369606166613137, + "grad_norm": 0.291015625, + "learning_rate": 0.0018946637009090118, + "loss": 0.085, + "step": 15778 + }, + { + "epoch": 0.13696929714151787, + "grad_norm": 0.6015625, + "learning_rate": 0.0018946497285233397, + "loss": 0.1162, + "step": 15779 + }, + { + "epoch": 0.13697797762172204, + "grad_norm": 0.3046875, + "learning_rate": 0.0018946357552686506, + "loss": 0.124, + "step": 15780 + }, + { + "epoch": 0.1369866581019262, + "grad_norm": 1.015625, + "learning_rate": 0.0018946217811449597, + "loss": 0.0713, + "step": 15781 + }, + { + "epoch": 0.13699533858213037, + "grad_norm": 0.5625, + "learning_rate": 0.0018946078061522827, + "loss": 0.1172, + "step": 15782 + }, + { + "epoch": 0.13700401906233453, + "grad_norm": 0.1318359375, + "learning_rate": 0.0018945938302906344, + "loss": 0.1367, + "step": 15783 + }, + { + "epoch": 0.1370126995425387, + "grad_norm": 0.3984375, + "learning_rate": 0.00189457985356003, + "loss": 0.1196, + "step": 15784 + }, + { + "epoch": 0.13702138002274286, + "grad_norm": 0.07861328125, + "learning_rate": 0.001894565875960485, + "loss": 0.0923, + "step": 15785 + }, + { + "epoch": 0.13703006050294703, + "grad_norm": 0.33203125, + "learning_rate": 0.0018945518974920151, + "loss": 0.1279, + "step": 15786 + }, + { + "epoch": 0.1370387409831512, + "grad_norm": 0.10009765625, + "learning_rate": 0.001894537918154635, + "loss": 0.0967, + "step": 15787 + }, + { + 
"epoch": 0.13704742146335536, + "grad_norm": 0.267578125, + "learning_rate": 0.0018945239379483598, + "loss": 0.1289, + "step": 15788 + }, + { + "epoch": 0.13705610194355952, + "grad_norm": 0.080078125, + "learning_rate": 0.0018945099568732054, + "loss": 0.083, + "step": 15789 + }, + { + "epoch": 0.1370647824237637, + "grad_norm": 0.2099609375, + "learning_rate": 0.0018944959749291868, + "loss": 0.0894, + "step": 15790 + }, + { + "epoch": 0.13707346290396785, + "grad_norm": 0.5859375, + "learning_rate": 0.0018944819921163196, + "loss": 0.1064, + "step": 15791 + }, + { + "epoch": 0.13708214338417202, + "grad_norm": 0.3359375, + "learning_rate": 0.0018944680084346187, + "loss": 0.1445, + "step": 15792 + }, + { + "epoch": 0.13709082386437618, + "grad_norm": 0.11572265625, + "learning_rate": 0.0018944540238840994, + "loss": 0.1152, + "step": 15793 + }, + { + "epoch": 0.13709950434458035, + "grad_norm": 0.390625, + "learning_rate": 0.0018944400384647772, + "loss": 0.1279, + "step": 15794 + }, + { + "epoch": 0.1371081848247845, + "grad_norm": 0.06640625, + "learning_rate": 0.0018944260521766673, + "loss": 0.1104, + "step": 15795 + }, + { + "epoch": 0.13711686530498868, + "grad_norm": 0.1982421875, + "learning_rate": 0.0018944120650197854, + "loss": 0.1177, + "step": 15796 + }, + { + "epoch": 0.13712554578519284, + "grad_norm": 0.466796875, + "learning_rate": 0.001894398076994146, + "loss": 0.1465, + "step": 15797 + }, + { + "epoch": 0.137134226265397, + "grad_norm": 0.271484375, + "learning_rate": 0.0018943840880997647, + "loss": 0.0991, + "step": 15798 + }, + { + "epoch": 0.13714290674560117, + "grad_norm": 1.265625, + "learning_rate": 0.001894370098336657, + "loss": 0.1338, + "step": 15799 + }, + { + "epoch": 0.13715158722580534, + "grad_norm": 0.1630859375, + "learning_rate": 0.0018943561077048381, + "loss": 0.1367, + "step": 15800 + }, + { + "epoch": 0.1371602677060095, + "grad_norm": 0.12451171875, + "learning_rate": 0.0018943421162043233, + "loss": 0.1157, + "step": 15801 + }, + { + "epoch": 0.13716894818621367, + "grad_norm": 0.484375, + "learning_rate": 0.001894328123835128, + "loss": 0.1592, + "step": 15802 + }, + { + "epoch": 0.13717762866641783, + "grad_norm": 0.404296875, + "learning_rate": 0.0018943141305972674, + "loss": 0.1426, + "step": 15803 + }, + { + "epoch": 0.137186309146622, + "grad_norm": 0.35546875, + "learning_rate": 0.0018943001364907567, + "loss": 0.1182, + "step": 15804 + }, + { + "epoch": 0.13719498962682616, + "grad_norm": 0.224609375, + "learning_rate": 0.001894286141515611, + "loss": 0.0889, + "step": 15805 + }, + { + "epoch": 0.13720367010703033, + "grad_norm": 0.5546875, + "learning_rate": 0.0018942721456718461, + "loss": 0.125, + "step": 15806 + }, + { + "epoch": 0.1372123505872345, + "grad_norm": 0.2197265625, + "learning_rate": 0.0018942581489594774, + "loss": 0.1758, + "step": 15807 + }, + { + "epoch": 0.13722103106743866, + "grad_norm": 1.125, + "learning_rate": 0.00189424415137852, + "loss": 0.1094, + "step": 15808 + }, + { + "epoch": 0.13722971154764282, + "grad_norm": 0.83203125, + "learning_rate": 0.001894230152928989, + "loss": 0.1475, + "step": 15809 + }, + { + "epoch": 0.137238392027847, + "grad_norm": 0.12158203125, + "learning_rate": 0.0018942161536108994, + "loss": 0.1309, + "step": 15810 + }, + { + "epoch": 0.13724707250805115, + "grad_norm": 0.111328125, + "learning_rate": 0.0018942021534242672, + "loss": 0.0957, + "step": 15811 + }, + { + "epoch": 0.13725575298825532, + "grad_norm": 0.11865234375, + "learning_rate": 0.0018941881523691076, + 
"loss": 0.1094, + "step": 15812 + }, + { + "epoch": 0.13726443346845948, + "grad_norm": 0.5859375, + "learning_rate": 0.001894174150445436, + "loss": 0.1143, + "step": 15813 + }, + { + "epoch": 0.13727311394866365, + "grad_norm": 0.0947265625, + "learning_rate": 0.0018941601476532668, + "loss": 0.1099, + "step": 15814 + }, + { + "epoch": 0.13728179442886781, + "grad_norm": 0.392578125, + "learning_rate": 0.0018941461439926166, + "loss": 0.1094, + "step": 15815 + }, + { + "epoch": 0.13729047490907198, + "grad_norm": 0.546875, + "learning_rate": 0.0018941321394634998, + "loss": 0.0928, + "step": 15816 + }, + { + "epoch": 0.13729915538927615, + "grad_norm": 0.12060546875, + "learning_rate": 0.0018941181340659322, + "loss": 0.1113, + "step": 15817 + }, + { + "epoch": 0.1373078358694803, + "grad_norm": 0.244140625, + "learning_rate": 0.0018941041277999286, + "loss": 0.1533, + "step": 15818 + }, + { + "epoch": 0.13731651634968448, + "grad_norm": 0.3046875, + "learning_rate": 0.0018940901206655053, + "loss": 0.1172, + "step": 15819 + }, + { + "epoch": 0.13732519682988864, + "grad_norm": 0.083984375, + "learning_rate": 0.0018940761126626765, + "loss": 0.1089, + "step": 15820 + }, + { + "epoch": 0.1373338773100928, + "grad_norm": 0.09033203125, + "learning_rate": 0.001894062103791458, + "loss": 0.1152, + "step": 15821 + }, + { + "epoch": 0.13734255779029697, + "grad_norm": 0.11376953125, + "learning_rate": 0.0018940480940518652, + "loss": 0.1055, + "step": 15822 + }, + { + "epoch": 0.13735123827050114, + "grad_norm": 0.42578125, + "learning_rate": 0.0018940340834439134, + "loss": 0.1582, + "step": 15823 + }, + { + "epoch": 0.1373599187507053, + "grad_norm": 0.76953125, + "learning_rate": 0.001894020071967618, + "loss": 0.0967, + "step": 15824 + }, + { + "epoch": 0.13736859923090947, + "grad_norm": 0.69921875, + "learning_rate": 0.0018940060596229936, + "loss": 0.1147, + "step": 15825 + }, + { + "epoch": 0.13737727971111363, + "grad_norm": 0.09912109375, + "learning_rate": 0.0018939920464100567, + "loss": 0.1147, + "step": 15826 + }, + { + "epoch": 0.1373859601913178, + "grad_norm": 0.625, + "learning_rate": 0.0018939780323288218, + "loss": 0.1182, + "step": 15827 + }, + { + "epoch": 0.13739464067152196, + "grad_norm": 0.279296875, + "learning_rate": 0.0018939640173793045, + "loss": 0.104, + "step": 15828 + }, + { + "epoch": 0.13740332115172613, + "grad_norm": 0.392578125, + "learning_rate": 0.00189395000156152, + "loss": 0.1191, + "step": 15829 + }, + { + "epoch": 0.1374120016319303, + "grad_norm": 0.062255859375, + "learning_rate": 0.0018939359848754835, + "loss": 0.1074, + "step": 15830 + }, + { + "epoch": 0.13742068211213446, + "grad_norm": 0.11962890625, + "learning_rate": 0.001893921967321211, + "loss": 0.1104, + "step": 15831 + }, + { + "epoch": 0.13742936259233862, + "grad_norm": 0.0771484375, + "learning_rate": 0.0018939079488987168, + "loss": 0.1016, + "step": 15832 + }, + { + "epoch": 0.13743804307254276, + "grad_norm": 0.71484375, + "learning_rate": 0.0018938939296080173, + "loss": 0.125, + "step": 15833 + }, + { + "epoch": 0.13744672355274692, + "grad_norm": 0.1953125, + "learning_rate": 0.0018938799094491271, + "loss": 0.125, + "step": 15834 + }, + { + "epoch": 0.1374554040329511, + "grad_norm": 0.1728515625, + "learning_rate": 0.0018938658884220618, + "loss": 0.085, + "step": 15835 + }, + { + "epoch": 0.13746408451315525, + "grad_norm": 0.10302734375, + "learning_rate": 0.0018938518665268368, + "loss": 0.1543, + "step": 15836 + }, + { + "epoch": 0.13747276499335942, + "grad_norm": 
0.1064453125, + "learning_rate": 0.001893837843763467, + "loss": 0.1357, + "step": 15837 + }, + { + "epoch": 0.13748144547356358, + "grad_norm": 0.38671875, + "learning_rate": 0.0018938238201319686, + "loss": 0.1006, + "step": 15838 + }, + { + "epoch": 0.13749012595376775, + "grad_norm": 0.388671875, + "learning_rate": 0.001893809795632356, + "loss": 0.1045, + "step": 15839 + }, + { + "epoch": 0.13749880643397192, + "grad_norm": 0.07373046875, + "learning_rate": 0.0018937957702646448, + "loss": 0.0967, + "step": 15840 + }, + { + "epoch": 0.13750748691417608, + "grad_norm": 0.271484375, + "learning_rate": 0.0018937817440288509, + "loss": 0.1621, + "step": 15841 + }, + { + "epoch": 0.13751616739438025, + "grad_norm": 0.1845703125, + "learning_rate": 0.001893767716924989, + "loss": 0.1177, + "step": 15842 + }, + { + "epoch": 0.1375248478745844, + "grad_norm": 0.251953125, + "learning_rate": 0.001893753688953075, + "loss": 0.1035, + "step": 15843 + }, + { + "epoch": 0.13753352835478858, + "grad_norm": 0.1767578125, + "learning_rate": 0.0018937396601131232, + "loss": 0.1748, + "step": 15844 + }, + { + "epoch": 0.13754220883499274, + "grad_norm": 0.25390625, + "learning_rate": 0.0018937256304051501, + "loss": 0.0928, + "step": 15845 + }, + { + "epoch": 0.1375508893151969, + "grad_norm": 0.5625, + "learning_rate": 0.0018937115998291702, + "loss": 0.0928, + "step": 15846 + }, + { + "epoch": 0.13755956979540107, + "grad_norm": 0.263671875, + "learning_rate": 0.0018936975683851995, + "loss": 0.1211, + "step": 15847 + }, + { + "epoch": 0.13756825027560524, + "grad_norm": 0.302734375, + "learning_rate": 0.0018936835360732532, + "loss": 0.1172, + "step": 15848 + }, + { + "epoch": 0.1375769307558094, + "grad_norm": 0.1279296875, + "learning_rate": 0.0018936695028933465, + "loss": 0.124, + "step": 15849 + }, + { + "epoch": 0.13758561123601357, + "grad_norm": 1.03125, + "learning_rate": 0.0018936554688454945, + "loss": 0.0942, + "step": 15850 + }, + { + "epoch": 0.13759429171621773, + "grad_norm": 0.1181640625, + "learning_rate": 0.0018936414339297129, + "loss": 0.1123, + "step": 15851 + }, + { + "epoch": 0.1376029721964219, + "grad_norm": 1.28125, + "learning_rate": 0.0018936273981460172, + "loss": 0.1719, + "step": 15852 + }, + { + "epoch": 0.13761165267662606, + "grad_norm": 0.375, + "learning_rate": 0.0018936133614944222, + "loss": 0.1582, + "step": 15853 + }, + { + "epoch": 0.13762033315683023, + "grad_norm": 0.59375, + "learning_rate": 0.0018935993239749436, + "loss": 0.1172, + "step": 15854 + }, + { + "epoch": 0.1376290136370344, + "grad_norm": 0.197265625, + "learning_rate": 0.001893585285587597, + "loss": 0.1523, + "step": 15855 + }, + { + "epoch": 0.13763769411723856, + "grad_norm": 0.58203125, + "learning_rate": 0.0018935712463323967, + "loss": 0.1973, + "step": 15856 + }, + { + "epoch": 0.13764637459744272, + "grad_norm": 0.11083984375, + "learning_rate": 0.0018935572062093594, + "loss": 0.1064, + "step": 15857 + }, + { + "epoch": 0.1376550550776469, + "grad_norm": 0.130859375, + "learning_rate": 0.0018935431652184998, + "loss": 0.1167, + "step": 15858 + }, + { + "epoch": 0.13766373555785105, + "grad_norm": 0.419921875, + "learning_rate": 0.0018935291233598331, + "loss": 0.1055, + "step": 15859 + }, + { + "epoch": 0.13767241603805522, + "grad_norm": 0.5, + "learning_rate": 0.001893515080633375, + "loss": 0.1504, + "step": 15860 + }, + { + "epoch": 0.13768109651825938, + "grad_norm": 0.322265625, + "learning_rate": 0.001893501037039141, + "loss": 0.1406, + "step": 15861 + }, + { + "epoch": 
0.13768977699846355, + "grad_norm": 0.6484375, + "learning_rate": 0.001893486992577146, + "loss": 0.1143, + "step": 15862 + }, + { + "epoch": 0.1376984574786677, + "grad_norm": 0.1513671875, + "learning_rate": 0.0018934729472474055, + "loss": 0.1309, + "step": 15863 + }, + { + "epoch": 0.13770713795887188, + "grad_norm": 0.30859375, + "learning_rate": 0.001893458901049935, + "loss": 0.1553, + "step": 15864 + }, + { + "epoch": 0.13771581843907604, + "grad_norm": 0.083984375, + "learning_rate": 0.0018934448539847494, + "loss": 0.0957, + "step": 15865 + }, + { + "epoch": 0.1377244989192802, + "grad_norm": 0.55859375, + "learning_rate": 0.001893430806051865, + "loss": 0.1406, + "step": 15866 + }, + { + "epoch": 0.13773317939948437, + "grad_norm": 0.6484375, + "learning_rate": 0.0018934167572512962, + "loss": 0.1016, + "step": 15867 + }, + { + "epoch": 0.13774185987968854, + "grad_norm": 0.89453125, + "learning_rate": 0.001893402707583059, + "loss": 0.1357, + "step": 15868 + }, + { + "epoch": 0.1377505403598927, + "grad_norm": 0.1669921875, + "learning_rate": 0.0018933886570471684, + "loss": 0.1309, + "step": 15869 + }, + { + "epoch": 0.13775922084009687, + "grad_norm": 0.080078125, + "learning_rate": 0.0018933746056436398, + "loss": 0.1133, + "step": 15870 + }, + { + "epoch": 0.13776790132030103, + "grad_norm": 0.6640625, + "learning_rate": 0.0018933605533724886, + "loss": 0.105, + "step": 15871 + }, + { + "epoch": 0.1377765818005052, + "grad_norm": 0.248046875, + "learning_rate": 0.0018933465002337305, + "loss": 0.0879, + "step": 15872 + }, + { + "epoch": 0.13778526228070936, + "grad_norm": 0.396484375, + "learning_rate": 0.0018933324462273803, + "loss": 0.0918, + "step": 15873 + }, + { + "epoch": 0.13779394276091353, + "grad_norm": 0.294921875, + "learning_rate": 0.0018933183913534537, + "loss": 0.1035, + "step": 15874 + }, + { + "epoch": 0.1378026232411177, + "grad_norm": 0.578125, + "learning_rate": 0.0018933043356119663, + "loss": 0.1406, + "step": 15875 + }, + { + "epoch": 0.13781130372132186, + "grad_norm": 0.1826171875, + "learning_rate": 0.001893290279002933, + "loss": 0.0991, + "step": 15876 + }, + { + "epoch": 0.13781998420152602, + "grad_norm": 0.1162109375, + "learning_rate": 0.0018932762215263692, + "loss": 0.1084, + "step": 15877 + }, + { + "epoch": 0.1378286646817302, + "grad_norm": 0.27734375, + "learning_rate": 0.0018932621631822905, + "loss": 0.1279, + "step": 15878 + }, + { + "epoch": 0.13783734516193435, + "grad_norm": 1.0234375, + "learning_rate": 0.0018932481039707126, + "loss": 0.1055, + "step": 15879 + }, + { + "epoch": 0.13784602564213852, + "grad_norm": 0.09814453125, + "learning_rate": 0.0018932340438916502, + "loss": 0.1699, + "step": 15880 + }, + { + "epoch": 0.13785470612234269, + "grad_norm": 0.5078125, + "learning_rate": 0.001893219982945119, + "loss": 0.123, + "step": 15881 + }, + { + "epoch": 0.13786338660254685, + "grad_norm": 0.5390625, + "learning_rate": 0.0018932059211311343, + "loss": 0.1445, + "step": 15882 + }, + { + "epoch": 0.13787206708275102, + "grad_norm": 0.07861328125, + "learning_rate": 0.0018931918584497115, + "loss": 0.0977, + "step": 15883 + }, + { + "epoch": 0.13788074756295518, + "grad_norm": 0.390625, + "learning_rate": 0.001893177794900866, + "loss": 0.124, + "step": 15884 + }, + { + "epoch": 0.13788942804315935, + "grad_norm": 0.1376953125, + "learning_rate": 0.0018931637304846133, + "loss": 0.0913, + "step": 15885 + }, + { + "epoch": 0.1378981085233635, + "grad_norm": 0.220703125, + "learning_rate": 0.0018931496652009685, + "loss": 
0.084, + "step": 15886 + }, + { + "epoch": 0.13790678900356768, + "grad_norm": 0.1708984375, + "learning_rate": 0.0018931355990499475, + "loss": 0.1543, + "step": 15887 + }, + { + "epoch": 0.13791546948377184, + "grad_norm": 0.875, + "learning_rate": 0.0018931215320315649, + "loss": 0.1113, + "step": 15888 + }, + { + "epoch": 0.137924149963976, + "grad_norm": 0.3203125, + "learning_rate": 0.0018931074641458368, + "loss": 0.1118, + "step": 15889 + }, + { + "epoch": 0.13793283044418017, + "grad_norm": 0.85546875, + "learning_rate": 0.001893093395392778, + "loss": 0.1064, + "step": 15890 + }, + { + "epoch": 0.13794151092438434, + "grad_norm": 0.16796875, + "learning_rate": 0.0018930793257724045, + "loss": 0.1216, + "step": 15891 + }, + { + "epoch": 0.1379501914045885, + "grad_norm": 0.09619140625, + "learning_rate": 0.0018930652552847313, + "loss": 0.127, + "step": 15892 + }, + { + "epoch": 0.13795887188479267, + "grad_norm": 0.1435546875, + "learning_rate": 0.001893051183929774, + "loss": 0.1147, + "step": 15893 + }, + { + "epoch": 0.13796755236499683, + "grad_norm": 0.0830078125, + "learning_rate": 0.0018930371117075473, + "loss": 0.0913, + "step": 15894 + }, + { + "epoch": 0.137976232845201, + "grad_norm": 0.498046875, + "learning_rate": 0.0018930230386180673, + "loss": 0.0884, + "step": 15895 + }, + { + "epoch": 0.13798491332540516, + "grad_norm": 0.69140625, + "learning_rate": 0.0018930089646613496, + "loss": 0.1289, + "step": 15896 + }, + { + "epoch": 0.13799359380560933, + "grad_norm": 0.330078125, + "learning_rate": 0.001892994889837409, + "loss": 0.0938, + "step": 15897 + }, + { + "epoch": 0.1380022742858135, + "grad_norm": 0.97265625, + "learning_rate": 0.001892980814146261, + "loss": 0.1396, + "step": 15898 + }, + { + "epoch": 0.13801095476601766, + "grad_norm": 0.2373046875, + "learning_rate": 0.0018929667375879211, + "loss": 0.1001, + "step": 15899 + }, + { + "epoch": 0.13801963524622182, + "grad_norm": 0.1376953125, + "learning_rate": 0.001892952660162405, + "loss": 0.1055, + "step": 15900 + }, + { + "epoch": 0.138028315726426, + "grad_norm": 0.1435546875, + "learning_rate": 0.0018929385818697274, + "loss": 0.1152, + "step": 15901 + }, + { + "epoch": 0.13803699620663015, + "grad_norm": 0.15625, + "learning_rate": 0.0018929245027099043, + "loss": 0.1118, + "step": 15902 + }, + { + "epoch": 0.13804567668683432, + "grad_norm": 0.24609375, + "learning_rate": 0.0018929104226829507, + "loss": 0.0986, + "step": 15903 + }, + { + "epoch": 0.13805435716703848, + "grad_norm": 0.6953125, + "learning_rate": 0.0018928963417888822, + "loss": 0.1187, + "step": 15904 + }, + { + "epoch": 0.13806303764724265, + "grad_norm": 0.0849609375, + "learning_rate": 0.0018928822600277142, + "loss": 0.1055, + "step": 15905 + }, + { + "epoch": 0.1380717181274468, + "grad_norm": 0.2734375, + "learning_rate": 0.0018928681773994623, + "loss": 0.1162, + "step": 15906 + }, + { + "epoch": 0.13808039860765098, + "grad_norm": 0.36328125, + "learning_rate": 0.0018928540939041416, + "loss": 0.1113, + "step": 15907 + }, + { + "epoch": 0.13808907908785514, + "grad_norm": 0.3828125, + "learning_rate": 0.0018928400095417676, + "loss": 0.124, + "step": 15908 + }, + { + "epoch": 0.1380977595680593, + "grad_norm": 0.291015625, + "learning_rate": 0.0018928259243123553, + "loss": 0.1094, + "step": 15909 + }, + { + "epoch": 0.13810644004826347, + "grad_norm": 0.42578125, + "learning_rate": 0.0018928118382159208, + "loss": 0.0942, + "step": 15910 + }, + { + "epoch": 0.13811512052846764, + "grad_norm": 0.19140625, + 
"learning_rate": 0.0018927977512524792, + "loss": 0.127, + "step": 15911 + }, + { + "epoch": 0.1381238010086718, + "grad_norm": 0.69921875, + "learning_rate": 0.0018927836634220457, + "loss": 0.1177, + "step": 15912 + }, + { + "epoch": 0.13813248148887597, + "grad_norm": 0.09521484375, + "learning_rate": 0.001892769574724636, + "loss": 0.1152, + "step": 15913 + }, + { + "epoch": 0.13814116196908013, + "grad_norm": 1.1328125, + "learning_rate": 0.0018927554851602656, + "loss": 0.1094, + "step": 15914 + }, + { + "epoch": 0.1381498424492843, + "grad_norm": 0.490234375, + "learning_rate": 0.0018927413947289495, + "loss": 0.1387, + "step": 15915 + }, + { + "epoch": 0.13815852292948846, + "grad_norm": 0.5390625, + "learning_rate": 0.0018927273034307034, + "loss": 0.1143, + "step": 15916 + }, + { + "epoch": 0.13816720340969263, + "grad_norm": 0.0908203125, + "learning_rate": 0.0018927132112655425, + "loss": 0.1094, + "step": 15917 + }, + { + "epoch": 0.1381758838898968, + "grad_norm": 0.251953125, + "learning_rate": 0.0018926991182334823, + "loss": 0.1074, + "step": 15918 + }, + { + "epoch": 0.13818456437010096, + "grad_norm": 0.1640625, + "learning_rate": 0.0018926850243345384, + "loss": 0.1001, + "step": 15919 + }, + { + "epoch": 0.13819324485030512, + "grad_norm": 0.640625, + "learning_rate": 0.0018926709295687262, + "loss": 0.1758, + "step": 15920 + }, + { + "epoch": 0.1382019253305093, + "grad_norm": 0.337890625, + "learning_rate": 0.0018926568339360608, + "loss": 0.123, + "step": 15921 + }, + { + "epoch": 0.13821060581071345, + "grad_norm": 0.50390625, + "learning_rate": 0.0018926427374365577, + "loss": 0.1094, + "step": 15922 + }, + { + "epoch": 0.13821928629091762, + "grad_norm": 0.1650390625, + "learning_rate": 0.0018926286400702326, + "loss": 0.1147, + "step": 15923 + }, + { + "epoch": 0.13822796677112179, + "grad_norm": 0.494140625, + "learning_rate": 0.0018926145418371007, + "loss": 0.1094, + "step": 15924 + }, + { + "epoch": 0.13823664725132595, + "grad_norm": 0.166015625, + "learning_rate": 0.0018926004427371774, + "loss": 0.103, + "step": 15925 + }, + { + "epoch": 0.13824532773153012, + "grad_norm": 0.392578125, + "learning_rate": 0.0018925863427704782, + "loss": 0.1289, + "step": 15926 + }, + { + "epoch": 0.13825400821173428, + "grad_norm": 1.1328125, + "learning_rate": 0.0018925722419370185, + "loss": 0.1445, + "step": 15927 + }, + { + "epoch": 0.13826268869193845, + "grad_norm": 0.84765625, + "learning_rate": 0.0018925581402368134, + "loss": 0.0815, + "step": 15928 + }, + { + "epoch": 0.1382713691721426, + "grad_norm": 0.609375, + "learning_rate": 0.001892544037669879, + "loss": 0.1123, + "step": 15929 + }, + { + "epoch": 0.13828004965234678, + "grad_norm": 0.86328125, + "learning_rate": 0.0018925299342362303, + "loss": 0.1309, + "step": 15930 + }, + { + "epoch": 0.13828873013255094, + "grad_norm": 0.138671875, + "learning_rate": 0.0018925158299358826, + "loss": 0.0889, + "step": 15931 + }, + { + "epoch": 0.1382974106127551, + "grad_norm": 0.33203125, + "learning_rate": 0.001892501724768852, + "loss": 0.1426, + "step": 15932 + }, + { + "epoch": 0.13830609109295927, + "grad_norm": 0.1943359375, + "learning_rate": 0.0018924876187351527, + "loss": 0.1177, + "step": 15933 + }, + { + "epoch": 0.13831477157316344, + "grad_norm": 0.41015625, + "learning_rate": 0.001892473511834801, + "loss": 0.1226, + "step": 15934 + }, + { + "epoch": 0.1383234520533676, + "grad_norm": 0.26953125, + "learning_rate": 0.0018924594040678124, + "loss": 0.0786, + "step": 15935 + }, + { + "epoch": 
0.13833213253357177, + "grad_norm": 0.703125, + "learning_rate": 0.0018924452954342025, + "loss": 0.1045, + "step": 15936 + }, + { + "epoch": 0.13834081301377593, + "grad_norm": 0.10693359375, + "learning_rate": 0.0018924311859339858, + "loss": 0.1074, + "step": 15937 + }, + { + "epoch": 0.1383494934939801, + "grad_norm": 0.375, + "learning_rate": 0.0018924170755671781, + "loss": 0.1172, + "step": 15938 + }, + { + "epoch": 0.13835817397418426, + "grad_norm": 0.1435546875, + "learning_rate": 0.0018924029643337954, + "loss": 0.124, + "step": 15939 + }, + { + "epoch": 0.13836685445438843, + "grad_norm": 0.27734375, + "learning_rate": 0.0018923888522338523, + "loss": 0.1201, + "step": 15940 + }, + { + "epoch": 0.1383755349345926, + "grad_norm": 0.15625, + "learning_rate": 0.001892374739267365, + "loss": 0.0659, + "step": 15941 + }, + { + "epoch": 0.13838421541479676, + "grad_norm": 0.287109375, + "learning_rate": 0.0018923606254343487, + "loss": 0.1162, + "step": 15942 + }, + { + "epoch": 0.13839289589500092, + "grad_norm": 0.2080078125, + "learning_rate": 0.0018923465107348184, + "loss": 0.0762, + "step": 15943 + }, + { + "epoch": 0.1384015763752051, + "grad_norm": 0.150390625, + "learning_rate": 0.0018923323951687903, + "loss": 0.1157, + "step": 15944 + }, + { + "epoch": 0.13841025685540925, + "grad_norm": 0.2099609375, + "learning_rate": 0.0018923182787362789, + "loss": 0.1069, + "step": 15945 + }, + { + "epoch": 0.13841893733561342, + "grad_norm": 0.1484375, + "learning_rate": 0.0018923041614373002, + "loss": 0.1396, + "step": 15946 + }, + { + "epoch": 0.13842761781581758, + "grad_norm": 0.40234375, + "learning_rate": 0.0018922900432718696, + "loss": 0.1064, + "step": 15947 + }, + { + "epoch": 0.13843629829602175, + "grad_norm": 0.38671875, + "learning_rate": 0.0018922759242400028, + "loss": 0.085, + "step": 15948 + }, + { + "epoch": 0.1384449787762259, + "grad_norm": 0.97265625, + "learning_rate": 0.0018922618043417149, + "loss": 0.124, + "step": 15949 + }, + { + "epoch": 0.13845365925643008, + "grad_norm": 0.361328125, + "learning_rate": 0.001892247683577021, + "loss": 0.1172, + "step": 15950 + }, + { + "epoch": 0.13846233973663424, + "grad_norm": 0.1044921875, + "learning_rate": 0.0018922335619459372, + "loss": 0.1025, + "step": 15951 + }, + { + "epoch": 0.1384710202168384, + "grad_norm": 0.2353515625, + "learning_rate": 0.0018922194394484788, + "loss": 0.0977, + "step": 15952 + }, + { + "epoch": 0.13847970069704257, + "grad_norm": 0.3125, + "learning_rate": 0.001892205316084661, + "loss": 0.1289, + "step": 15953 + }, + { + "epoch": 0.13848838117724674, + "grad_norm": 0.33984375, + "learning_rate": 0.0018921911918544994, + "loss": 0.1221, + "step": 15954 + }, + { + "epoch": 0.1384970616574509, + "grad_norm": 0.275390625, + "learning_rate": 0.0018921770667580094, + "loss": 0.0776, + "step": 15955 + }, + { + "epoch": 0.13850574213765504, + "grad_norm": 0.38671875, + "learning_rate": 0.0018921629407952062, + "loss": 0.0977, + "step": 15956 + }, + { + "epoch": 0.1385144226178592, + "grad_norm": 0.1337890625, + "learning_rate": 0.0018921488139661059, + "loss": 0.1934, + "step": 15957 + }, + { + "epoch": 0.13852310309806337, + "grad_norm": 0.216796875, + "learning_rate": 0.0018921346862707234, + "loss": 0.1123, + "step": 15958 + }, + { + "epoch": 0.13853178357826754, + "grad_norm": 0.08837890625, + "learning_rate": 0.0018921205577090746, + "loss": 0.0791, + "step": 15959 + }, + { + "epoch": 0.1385404640584717, + "grad_norm": 0.228515625, + "learning_rate": 0.0018921064282811741, + "loss": 
0.1016, + "step": 15960 + }, + { + "epoch": 0.13854914453867587, + "grad_norm": 0.71484375, + "learning_rate": 0.0018920922979870382, + "loss": 0.1055, + "step": 15961 + }, + { + "epoch": 0.13855782501888003, + "grad_norm": 0.5859375, + "learning_rate": 0.0018920781668266822, + "loss": 0.0967, + "step": 15962 + }, + { + "epoch": 0.1385665054990842, + "grad_norm": 0.83203125, + "learning_rate": 0.0018920640348001213, + "loss": 0.1289, + "step": 15963 + }, + { + "epoch": 0.13857518597928836, + "grad_norm": 0.427734375, + "learning_rate": 0.001892049901907371, + "loss": 0.0884, + "step": 15964 + }, + { + "epoch": 0.13858386645949253, + "grad_norm": 0.2216796875, + "learning_rate": 0.0018920357681484468, + "loss": 0.1445, + "step": 15965 + }, + { + "epoch": 0.1385925469396967, + "grad_norm": 0.5, + "learning_rate": 0.0018920216335233642, + "loss": 0.165, + "step": 15966 + }, + { + "epoch": 0.13860122741990086, + "grad_norm": 0.251953125, + "learning_rate": 0.0018920074980321388, + "loss": 0.1216, + "step": 15967 + }, + { + "epoch": 0.13860990790010502, + "grad_norm": 0.5625, + "learning_rate": 0.0018919933616747858, + "loss": 0.0869, + "step": 15968 + }, + { + "epoch": 0.1386185883803092, + "grad_norm": 0.140625, + "learning_rate": 0.0018919792244513208, + "loss": 0.1006, + "step": 15969 + }, + { + "epoch": 0.13862726886051335, + "grad_norm": 0.39453125, + "learning_rate": 0.0018919650863617591, + "loss": 0.0962, + "step": 15970 + }, + { + "epoch": 0.13863594934071752, + "grad_norm": 0.23828125, + "learning_rate": 0.0018919509474061163, + "loss": 0.085, + "step": 15971 + }, + { + "epoch": 0.13864462982092168, + "grad_norm": 0.3828125, + "learning_rate": 0.001891936807584408, + "loss": 0.1016, + "step": 15972 + }, + { + "epoch": 0.13865331030112585, + "grad_norm": 1.0, + "learning_rate": 0.0018919226668966493, + "loss": 0.104, + "step": 15973 + }, + { + "epoch": 0.13866199078133, + "grad_norm": 0.0966796875, + "learning_rate": 0.0018919085253428561, + "loss": 0.1348, + "step": 15974 + }, + { + "epoch": 0.13867067126153418, + "grad_norm": 0.115234375, + "learning_rate": 0.0018918943829230434, + "loss": 0.1055, + "step": 15975 + }, + { + "epoch": 0.13867935174173834, + "grad_norm": 0.6484375, + "learning_rate": 0.0018918802396372272, + "loss": 0.127, + "step": 15976 + }, + { + "epoch": 0.1386880322219425, + "grad_norm": 0.1328125, + "learning_rate": 0.0018918660954854224, + "loss": 0.1099, + "step": 15977 + }, + { + "epoch": 0.13869671270214667, + "grad_norm": 0.361328125, + "learning_rate": 0.0018918519504676449, + "loss": 0.1611, + "step": 15978 + }, + { + "epoch": 0.13870539318235084, + "grad_norm": 1.0546875, + "learning_rate": 0.00189183780458391, + "loss": 0.1235, + "step": 15979 + }, + { + "epoch": 0.138714073662555, + "grad_norm": 0.416015625, + "learning_rate": 0.0018918236578342332, + "loss": 0.1147, + "step": 15980 + }, + { + "epoch": 0.13872275414275917, + "grad_norm": 0.1103515625, + "learning_rate": 0.0018918095102186297, + "loss": 0.0947, + "step": 15981 + }, + { + "epoch": 0.13873143462296333, + "grad_norm": 0.63671875, + "learning_rate": 0.0018917953617371152, + "loss": 0.1289, + "step": 15982 + }, + { + "epoch": 0.1387401151031675, + "grad_norm": 0.1494140625, + "learning_rate": 0.0018917812123897056, + "loss": 0.1162, + "step": 15983 + }, + { + "epoch": 0.13874879558337166, + "grad_norm": 0.091796875, + "learning_rate": 0.0018917670621764156, + "loss": 0.1025, + "step": 15984 + }, + { + "epoch": 0.13875747606357583, + "grad_norm": 0.2236328125, + "learning_rate": 
0.001891752911097261, + "loss": 0.1279, + "step": 15985 + }, + { + "epoch": 0.13876615654378, + "grad_norm": 0.486328125, + "learning_rate": 0.0018917387591522578, + "loss": 0.1445, + "step": 15986 + }, + { + "epoch": 0.13877483702398416, + "grad_norm": 0.53125, + "learning_rate": 0.0018917246063414204, + "loss": 0.1348, + "step": 15987 + }, + { + "epoch": 0.13878351750418833, + "grad_norm": 0.373046875, + "learning_rate": 0.0018917104526647653, + "loss": 0.0889, + "step": 15988 + }, + { + "epoch": 0.1387921979843925, + "grad_norm": 0.65234375, + "learning_rate": 0.0018916962981223072, + "loss": 0.1289, + "step": 15989 + }, + { + "epoch": 0.13880087846459666, + "grad_norm": 0.14453125, + "learning_rate": 0.001891682142714062, + "loss": 0.1318, + "step": 15990 + }, + { + "epoch": 0.13880955894480082, + "grad_norm": 0.59765625, + "learning_rate": 0.0018916679864400454, + "loss": 0.1143, + "step": 15991 + }, + { + "epoch": 0.13881823942500499, + "grad_norm": 0.279296875, + "learning_rate": 0.0018916538293002722, + "loss": 0.1348, + "step": 15992 + }, + { + "epoch": 0.13882691990520915, + "grad_norm": 0.2236328125, + "learning_rate": 0.0018916396712947586, + "loss": 0.0938, + "step": 15993 + }, + { + "epoch": 0.13883560038541332, + "grad_norm": 1.0625, + "learning_rate": 0.0018916255124235195, + "loss": 0.1367, + "step": 15994 + }, + { + "epoch": 0.13884428086561748, + "grad_norm": 0.54296875, + "learning_rate": 0.0018916113526865708, + "loss": 0.1094, + "step": 15995 + }, + { + "epoch": 0.13885296134582165, + "grad_norm": 1.0703125, + "learning_rate": 0.0018915971920839276, + "loss": 0.127, + "step": 15996 + }, + { + "epoch": 0.1388616418260258, + "grad_norm": 0.57421875, + "learning_rate": 0.0018915830306156058, + "loss": 0.1211, + "step": 15997 + }, + { + "epoch": 0.13887032230622998, + "grad_norm": 0.72265625, + "learning_rate": 0.0018915688682816206, + "loss": 0.0967, + "step": 15998 + }, + { + "epoch": 0.13887900278643414, + "grad_norm": 0.54296875, + "learning_rate": 0.001891554705081988, + "loss": 0.0938, + "step": 15999 + }, + { + "epoch": 0.1388876832666383, + "grad_norm": 0.1328125, + "learning_rate": 0.0018915405410167225, + "loss": 0.1094, + "step": 16000 + }, + { + "epoch": 0.13889636374684247, + "grad_norm": 0.458984375, + "learning_rate": 0.0018915263760858401, + "loss": 0.1582, + "step": 16001 + }, + { + "epoch": 0.13890504422704664, + "grad_norm": 0.212890625, + "learning_rate": 0.0018915122102893568, + "loss": 0.125, + "step": 16002 + }, + { + "epoch": 0.1389137247072508, + "grad_norm": 0.58203125, + "learning_rate": 0.0018914980436272873, + "loss": 0.1406, + "step": 16003 + }, + { + "epoch": 0.13892240518745497, + "grad_norm": 0.11279296875, + "learning_rate": 0.0018914838760996475, + "loss": 0.0864, + "step": 16004 + }, + { + "epoch": 0.13893108566765913, + "grad_norm": 0.271484375, + "learning_rate": 0.0018914697077064531, + "loss": 0.0869, + "step": 16005 + }, + { + "epoch": 0.1389397661478633, + "grad_norm": 0.251953125, + "learning_rate": 0.001891455538447719, + "loss": 0.125, + "step": 16006 + }, + { + "epoch": 0.13894844662806746, + "grad_norm": 0.169921875, + "learning_rate": 0.0018914413683234614, + "loss": 0.0801, + "step": 16007 + }, + { + "epoch": 0.13895712710827163, + "grad_norm": 0.0771484375, + "learning_rate": 0.0018914271973336947, + "loss": 0.1074, + "step": 16008 + }, + { + "epoch": 0.1389658075884758, + "grad_norm": 0.5625, + "learning_rate": 0.0018914130254784355, + "loss": 0.1699, + "step": 16009 + }, + { + "epoch": 0.13897448806867996, + "grad_norm": 
0.451171875, + "learning_rate": 0.0018913988527576994, + "loss": 0.123, + "step": 16010 + }, + { + "epoch": 0.13898316854888412, + "grad_norm": 0.09423828125, + "learning_rate": 0.0018913846791715006, + "loss": 0.0894, + "step": 16011 + }, + { + "epoch": 0.1389918490290883, + "grad_norm": 0.1181640625, + "learning_rate": 0.0018913705047198558, + "loss": 0.1191, + "step": 16012 + }, + { + "epoch": 0.13900052950929245, + "grad_norm": 0.3828125, + "learning_rate": 0.00189135632940278, + "loss": 0.1514, + "step": 16013 + }, + { + "epoch": 0.13900920998949662, + "grad_norm": 0.4140625, + "learning_rate": 0.0018913421532202887, + "loss": 0.0972, + "step": 16014 + }, + { + "epoch": 0.13901789046970078, + "grad_norm": 0.62890625, + "learning_rate": 0.001891327976172398, + "loss": 0.1826, + "step": 16015 + }, + { + "epoch": 0.13902657094990495, + "grad_norm": 0.279296875, + "learning_rate": 0.0018913137982591226, + "loss": 0.0967, + "step": 16016 + }, + { + "epoch": 0.1390352514301091, + "grad_norm": 0.08984375, + "learning_rate": 0.0018912996194804783, + "loss": 0.1084, + "step": 16017 + }, + { + "epoch": 0.13904393191031328, + "grad_norm": 0.1328125, + "learning_rate": 0.0018912854398364805, + "loss": 0.0889, + "step": 16018 + }, + { + "epoch": 0.13905261239051744, + "grad_norm": 0.07373046875, + "learning_rate": 0.001891271259327145, + "loss": 0.103, + "step": 16019 + }, + { + "epoch": 0.1390612928707216, + "grad_norm": 0.2421875, + "learning_rate": 0.001891257077952487, + "loss": 0.1348, + "step": 16020 + }, + { + "epoch": 0.13906997335092577, + "grad_norm": 0.275390625, + "learning_rate": 0.0018912428957125222, + "loss": 0.1094, + "step": 16021 + }, + { + "epoch": 0.13907865383112994, + "grad_norm": 0.3671875, + "learning_rate": 0.0018912287126072664, + "loss": 0.0762, + "step": 16022 + }, + { + "epoch": 0.1390873343113341, + "grad_norm": 0.32421875, + "learning_rate": 0.0018912145286367344, + "loss": 0.1182, + "step": 16023 + }, + { + "epoch": 0.13909601479153827, + "grad_norm": 0.333984375, + "learning_rate": 0.0018912003438009421, + "loss": 0.1045, + "step": 16024 + }, + { + "epoch": 0.13910469527174243, + "grad_norm": 0.578125, + "learning_rate": 0.0018911861580999053, + "loss": 0.0791, + "step": 16025 + }, + { + "epoch": 0.1391133757519466, + "grad_norm": 0.181640625, + "learning_rate": 0.0018911719715336388, + "loss": 0.1289, + "step": 16026 + }, + { + "epoch": 0.13912205623215076, + "grad_norm": 0.11474609375, + "learning_rate": 0.0018911577841021587, + "loss": 0.1162, + "step": 16027 + }, + { + "epoch": 0.13913073671235493, + "grad_norm": 0.40234375, + "learning_rate": 0.0018911435958054806, + "loss": 0.1484, + "step": 16028 + }, + { + "epoch": 0.1391394171925591, + "grad_norm": 0.384765625, + "learning_rate": 0.001891129406643619, + "loss": 0.0898, + "step": 16029 + }, + { + "epoch": 0.13914809767276326, + "grad_norm": 0.1435546875, + "learning_rate": 0.0018911152166165908, + "loss": 0.1167, + "step": 16030 + }, + { + "epoch": 0.13915677815296743, + "grad_norm": 0.10009765625, + "learning_rate": 0.0018911010257244105, + "loss": 0.1162, + "step": 16031 + }, + { + "epoch": 0.1391654586331716, + "grad_norm": 0.078125, + "learning_rate": 0.0018910868339670945, + "loss": 0.0889, + "step": 16032 + }, + { + "epoch": 0.13917413911337576, + "grad_norm": 0.55078125, + "learning_rate": 0.0018910726413446577, + "loss": 0.1377, + "step": 16033 + }, + { + "epoch": 0.13918281959357992, + "grad_norm": 0.41796875, + "learning_rate": 0.0018910584478571155, + "loss": 0.0898, + "step": 16034 + }, + { + 
"epoch": 0.13919150007378409, + "grad_norm": 0.490234375, + "learning_rate": 0.0018910442535044836, + "loss": 0.1182, + "step": 16035 + }, + { + "epoch": 0.13920018055398825, + "grad_norm": 0.1748046875, + "learning_rate": 0.0018910300582867779, + "loss": 0.125, + "step": 16036 + }, + { + "epoch": 0.13920886103419242, + "grad_norm": 0.25390625, + "learning_rate": 0.0018910158622040135, + "loss": 0.0698, + "step": 16037 + }, + { + "epoch": 0.13921754151439658, + "grad_norm": 0.5390625, + "learning_rate": 0.0018910016652562058, + "loss": 0.123, + "step": 16038 + }, + { + "epoch": 0.13922622199460075, + "grad_norm": 0.279296875, + "learning_rate": 0.001890987467443371, + "loss": 0.1016, + "step": 16039 + }, + { + "epoch": 0.1392349024748049, + "grad_norm": 0.294921875, + "learning_rate": 0.001890973268765524, + "loss": 0.104, + "step": 16040 + }, + { + "epoch": 0.13924358295500908, + "grad_norm": 0.189453125, + "learning_rate": 0.0018909590692226803, + "loss": 0.0938, + "step": 16041 + }, + { + "epoch": 0.13925226343521324, + "grad_norm": 0.65625, + "learning_rate": 0.0018909448688148559, + "loss": 0.1045, + "step": 16042 + }, + { + "epoch": 0.1392609439154174, + "grad_norm": 0.111328125, + "learning_rate": 0.001890930667542066, + "loss": 0.1289, + "step": 16043 + }, + { + "epoch": 0.13926962439562157, + "grad_norm": 0.1796875, + "learning_rate": 0.001890916465404326, + "loss": 0.1582, + "step": 16044 + }, + { + "epoch": 0.13927830487582574, + "grad_norm": 0.74609375, + "learning_rate": 0.001890902262401652, + "loss": 0.166, + "step": 16045 + }, + { + "epoch": 0.1392869853560299, + "grad_norm": 0.10693359375, + "learning_rate": 0.0018908880585340592, + "loss": 0.0737, + "step": 16046 + }, + { + "epoch": 0.13929566583623407, + "grad_norm": 0.3515625, + "learning_rate": 0.0018908738538015628, + "loss": 0.1396, + "step": 16047 + }, + { + "epoch": 0.13930434631643823, + "grad_norm": 0.37109375, + "learning_rate": 0.0018908596482041786, + "loss": 0.1152, + "step": 16048 + }, + { + "epoch": 0.1393130267966424, + "grad_norm": 0.546875, + "learning_rate": 0.0018908454417419223, + "loss": 0.123, + "step": 16049 + }, + { + "epoch": 0.13932170727684656, + "grad_norm": 0.08837890625, + "learning_rate": 0.0018908312344148095, + "loss": 0.0996, + "step": 16050 + }, + { + "epoch": 0.13933038775705073, + "grad_norm": 0.1826171875, + "learning_rate": 0.0018908170262228552, + "loss": 0.103, + "step": 16051 + }, + { + "epoch": 0.1393390682372549, + "grad_norm": 0.1474609375, + "learning_rate": 0.0018908028171660753, + "loss": 0.0991, + "step": 16052 + }, + { + "epoch": 0.13934774871745906, + "grad_norm": 0.5390625, + "learning_rate": 0.0018907886072444858, + "loss": 0.1187, + "step": 16053 + }, + { + "epoch": 0.13935642919766322, + "grad_norm": 0.140625, + "learning_rate": 0.0018907743964581012, + "loss": 0.1279, + "step": 16054 + }, + { + "epoch": 0.1393651096778674, + "grad_norm": 0.14453125, + "learning_rate": 0.001890760184806938, + "loss": 0.1592, + "step": 16055 + }, + { + "epoch": 0.13937379015807155, + "grad_norm": 0.7421875, + "learning_rate": 0.001890745972291011, + "loss": 0.1338, + "step": 16056 + }, + { + "epoch": 0.13938247063827572, + "grad_norm": 0.72265625, + "learning_rate": 0.0018907317589103364, + "loss": 0.0874, + "step": 16057 + }, + { + "epoch": 0.13939115111847988, + "grad_norm": 0.390625, + "learning_rate": 0.0018907175446649291, + "loss": 0.0942, + "step": 16058 + }, + { + "epoch": 0.13939983159868405, + "grad_norm": 0.11865234375, + "learning_rate": 0.0018907033295548052, + "loss": 
0.0913, + "step": 16059 + }, + { + "epoch": 0.1394085120788882, + "grad_norm": 0.353515625, + "learning_rate": 0.00189068911357998, + "loss": 0.1348, + "step": 16060 + }, + { + "epoch": 0.13941719255909238, + "grad_norm": 0.1865234375, + "learning_rate": 0.0018906748967404692, + "loss": 0.0938, + "step": 16061 + }, + { + "epoch": 0.13942587303929654, + "grad_norm": 0.0830078125, + "learning_rate": 0.0018906606790362879, + "loss": 0.1006, + "step": 16062 + }, + { + "epoch": 0.1394345535195007, + "grad_norm": 0.294921875, + "learning_rate": 0.0018906464604674523, + "loss": 0.1475, + "step": 16063 + }, + { + "epoch": 0.13944323399970487, + "grad_norm": 0.50390625, + "learning_rate": 0.0018906322410339774, + "loss": 0.1094, + "step": 16064 + }, + { + "epoch": 0.13945191447990904, + "grad_norm": 0.267578125, + "learning_rate": 0.001890618020735879, + "loss": 0.1338, + "step": 16065 + }, + { + "epoch": 0.1394605949601132, + "grad_norm": 0.40234375, + "learning_rate": 0.0018906037995731727, + "loss": 0.1299, + "step": 16066 + }, + { + "epoch": 0.13946927544031737, + "grad_norm": 0.67578125, + "learning_rate": 0.001890589577545874, + "loss": 0.1162, + "step": 16067 + }, + { + "epoch": 0.13947795592052153, + "grad_norm": 0.4453125, + "learning_rate": 0.0018905753546539978, + "loss": 0.0991, + "step": 16068 + }, + { + "epoch": 0.1394866364007257, + "grad_norm": 0.1767578125, + "learning_rate": 0.001890561130897561, + "loss": 0.0986, + "step": 16069 + }, + { + "epoch": 0.13949531688092986, + "grad_norm": 0.18359375, + "learning_rate": 0.0018905469062765782, + "loss": 0.1504, + "step": 16070 + }, + { + "epoch": 0.13950399736113403, + "grad_norm": 0.1337890625, + "learning_rate": 0.001890532680791065, + "loss": 0.1118, + "step": 16071 + }, + { + "epoch": 0.1395126778413382, + "grad_norm": 0.72265625, + "learning_rate": 0.0018905184544410374, + "loss": 0.1465, + "step": 16072 + }, + { + "epoch": 0.13952135832154236, + "grad_norm": 0.224609375, + "learning_rate": 0.0018905042272265105, + "loss": 0.1279, + "step": 16073 + }, + { + "epoch": 0.13953003880174653, + "grad_norm": 0.380859375, + "learning_rate": 0.0018904899991475004, + "loss": 0.0776, + "step": 16074 + }, + { + "epoch": 0.1395387192819507, + "grad_norm": 0.09375, + "learning_rate": 0.0018904757702040218, + "loss": 0.1094, + "step": 16075 + }, + { + "epoch": 0.13954739976215486, + "grad_norm": 0.140625, + "learning_rate": 0.0018904615403960913, + "loss": 0.1055, + "step": 16076 + }, + { + "epoch": 0.13955608024235902, + "grad_norm": 0.89453125, + "learning_rate": 0.0018904473097237236, + "loss": 0.1729, + "step": 16077 + }, + { + "epoch": 0.13956476072256319, + "grad_norm": 0.09423828125, + "learning_rate": 0.0018904330781869345, + "loss": 0.0962, + "step": 16078 + }, + { + "epoch": 0.13957344120276732, + "grad_norm": 0.263671875, + "learning_rate": 0.00189041884578574, + "loss": 0.1299, + "step": 16079 + }, + { + "epoch": 0.1395821216829715, + "grad_norm": 0.546875, + "learning_rate": 0.0018904046125201553, + "loss": 0.1113, + "step": 16080 + }, + { + "epoch": 0.13959080216317565, + "grad_norm": 0.7109375, + "learning_rate": 0.0018903903783901957, + "loss": 0.0947, + "step": 16081 + }, + { + "epoch": 0.13959948264337982, + "grad_norm": 0.279296875, + "learning_rate": 0.0018903761433958772, + "loss": 0.1289, + "step": 16082 + }, + { + "epoch": 0.13960816312358398, + "grad_norm": 0.3828125, + "learning_rate": 0.0018903619075372154, + "loss": 0.0859, + "step": 16083 + }, + { + "epoch": 0.13961684360378815, + "grad_norm": 0.62890625, + 
"learning_rate": 0.0018903476708142253, + "loss": 0.1201, + "step": 16084 + }, + { + "epoch": 0.13962552408399231, + "grad_norm": 0.1494140625, + "learning_rate": 0.001890333433226923, + "loss": 0.1211, + "step": 16085 + }, + { + "epoch": 0.13963420456419648, + "grad_norm": 0.197265625, + "learning_rate": 0.0018903191947753242, + "loss": 0.1279, + "step": 16086 + }, + { + "epoch": 0.13964288504440064, + "grad_norm": 0.462890625, + "learning_rate": 0.001890304955459444, + "loss": 0.0981, + "step": 16087 + }, + { + "epoch": 0.1396515655246048, + "grad_norm": 0.1572265625, + "learning_rate": 0.001890290715279298, + "loss": 0.1299, + "step": 16088 + }, + { + "epoch": 0.13966024600480897, + "grad_norm": 0.150390625, + "learning_rate": 0.0018902764742349024, + "loss": 0.1196, + "step": 16089 + }, + { + "epoch": 0.13966892648501314, + "grad_norm": 0.259765625, + "learning_rate": 0.001890262232326272, + "loss": 0.0967, + "step": 16090 + }, + { + "epoch": 0.1396776069652173, + "grad_norm": 0.388671875, + "learning_rate": 0.0018902479895534228, + "loss": 0.0908, + "step": 16091 + }, + { + "epoch": 0.13968628744542147, + "grad_norm": 0.2333984375, + "learning_rate": 0.0018902337459163704, + "loss": 0.0732, + "step": 16092 + }, + { + "epoch": 0.13969496792562563, + "grad_norm": 0.1376953125, + "learning_rate": 0.0018902195014151297, + "loss": 0.1172, + "step": 16093 + }, + { + "epoch": 0.1397036484058298, + "grad_norm": 0.1708984375, + "learning_rate": 0.0018902052560497173, + "loss": 0.1074, + "step": 16094 + }, + { + "epoch": 0.13971232888603397, + "grad_norm": 0.828125, + "learning_rate": 0.0018901910098201484, + "loss": 0.1709, + "step": 16095 + }, + { + "epoch": 0.13972100936623813, + "grad_norm": 0.1826171875, + "learning_rate": 0.0018901767627264384, + "loss": 0.0977, + "step": 16096 + }, + { + "epoch": 0.1397296898464423, + "grad_norm": 0.2109375, + "learning_rate": 0.0018901625147686029, + "loss": 0.0835, + "step": 16097 + }, + { + "epoch": 0.13973837032664646, + "grad_norm": 0.1455078125, + "learning_rate": 0.0018901482659466574, + "loss": 0.0928, + "step": 16098 + }, + { + "epoch": 0.13974705080685063, + "grad_norm": 0.142578125, + "learning_rate": 0.0018901340162606178, + "loss": 0.1367, + "step": 16099 + }, + { + "epoch": 0.1397557312870548, + "grad_norm": 0.333984375, + "learning_rate": 0.0018901197657104993, + "loss": 0.1025, + "step": 16100 + }, + { + "epoch": 0.13976441176725896, + "grad_norm": 0.64453125, + "learning_rate": 0.001890105514296318, + "loss": 0.2041, + "step": 16101 + }, + { + "epoch": 0.13977309224746312, + "grad_norm": 0.609375, + "learning_rate": 0.001890091262018089, + "loss": 0.1216, + "step": 16102 + }, + { + "epoch": 0.13978177272766729, + "grad_norm": 0.09130859375, + "learning_rate": 0.0018900770088758278, + "loss": 0.0918, + "step": 16103 + }, + { + "epoch": 0.13979045320787145, + "grad_norm": 0.2314453125, + "learning_rate": 0.0018900627548695506, + "loss": 0.1021, + "step": 16104 + }, + { + "epoch": 0.13979913368807562, + "grad_norm": 0.7421875, + "learning_rate": 0.0018900484999992727, + "loss": 0.1279, + "step": 16105 + }, + { + "epoch": 0.13980781416827978, + "grad_norm": 0.166015625, + "learning_rate": 0.0018900342442650095, + "loss": 0.1289, + "step": 16106 + }, + { + "epoch": 0.13981649464848395, + "grad_norm": 0.68359375, + "learning_rate": 0.0018900199876667769, + "loss": 0.1094, + "step": 16107 + }, + { + "epoch": 0.1398251751286881, + "grad_norm": 0.375, + "learning_rate": 0.00189000573020459, + "loss": 0.1729, + "step": 16108 + }, + { + "epoch": 
0.13983385560889228, + "grad_norm": 0.16796875, + "learning_rate": 0.001889991471878465, + "loss": 0.0972, + "step": 16109 + }, + { + "epoch": 0.13984253608909644, + "grad_norm": 0.640625, + "learning_rate": 0.001889977212688417, + "loss": 0.1216, + "step": 16110 + }, + { + "epoch": 0.1398512165693006, + "grad_norm": 0.189453125, + "learning_rate": 0.001889962952634462, + "loss": 0.1104, + "step": 16111 + }, + { + "epoch": 0.13985989704950477, + "grad_norm": 0.51953125, + "learning_rate": 0.001889948691716615, + "loss": 0.0771, + "step": 16112 + }, + { + "epoch": 0.13986857752970894, + "grad_norm": 0.62890625, + "learning_rate": 0.0018899344299348922, + "loss": 0.1138, + "step": 16113 + }, + { + "epoch": 0.1398772580099131, + "grad_norm": 0.08203125, + "learning_rate": 0.0018899201672893092, + "loss": 0.1157, + "step": 16114 + }, + { + "epoch": 0.13988593849011727, + "grad_norm": 0.107421875, + "learning_rate": 0.001889905903779881, + "loss": 0.1177, + "step": 16115 + }, + { + "epoch": 0.13989461897032143, + "grad_norm": 0.451171875, + "learning_rate": 0.0018898916394066237, + "loss": 0.1074, + "step": 16116 + }, + { + "epoch": 0.1399032994505256, + "grad_norm": 0.39453125, + "learning_rate": 0.001889877374169553, + "loss": 0.1318, + "step": 16117 + }, + { + "epoch": 0.13991197993072976, + "grad_norm": 0.1474609375, + "learning_rate": 0.0018898631080686839, + "loss": 0.0908, + "step": 16118 + }, + { + "epoch": 0.13992066041093393, + "grad_norm": 0.263671875, + "learning_rate": 0.0018898488411040327, + "loss": 0.1074, + "step": 16119 + }, + { + "epoch": 0.1399293408911381, + "grad_norm": 0.30078125, + "learning_rate": 0.0018898345732756144, + "loss": 0.1934, + "step": 16120 + }, + { + "epoch": 0.13993802137134226, + "grad_norm": 0.470703125, + "learning_rate": 0.001889820304583445, + "loss": 0.1045, + "step": 16121 + }, + { + "epoch": 0.13994670185154642, + "grad_norm": 0.28515625, + "learning_rate": 0.00188980603502754, + "loss": 0.1309, + "step": 16122 + }, + { + "epoch": 0.1399553823317506, + "grad_norm": 0.205078125, + "learning_rate": 0.0018897917646079148, + "loss": 0.1006, + "step": 16123 + }, + { + "epoch": 0.13996406281195475, + "grad_norm": 0.2734375, + "learning_rate": 0.0018897774933245852, + "loss": 0.0986, + "step": 16124 + }, + { + "epoch": 0.13997274329215892, + "grad_norm": 0.8203125, + "learning_rate": 0.001889763221177567, + "loss": 0.1113, + "step": 16125 + }, + { + "epoch": 0.13998142377236308, + "grad_norm": 0.76171875, + "learning_rate": 0.0018897489481668756, + "loss": 0.1396, + "step": 16126 + }, + { + "epoch": 0.13999010425256725, + "grad_norm": 0.28125, + "learning_rate": 0.001889734674292527, + "loss": 0.0918, + "step": 16127 + }, + { + "epoch": 0.13999878473277141, + "grad_norm": 0.259765625, + "learning_rate": 0.0018897203995545356, + "loss": 0.085, + "step": 16128 + }, + { + "epoch": 0.14000746521297558, + "grad_norm": 0.19140625, + "learning_rate": 0.0018897061239529186, + "loss": 0.1387, + "step": 16129 + }, + { + "epoch": 0.14001614569317974, + "grad_norm": 0.35546875, + "learning_rate": 0.0018896918474876905, + "loss": 0.0908, + "step": 16130 + }, + { + "epoch": 0.1400248261733839, + "grad_norm": 0.12109375, + "learning_rate": 0.0018896775701588674, + "loss": 0.1221, + "step": 16131 + }, + { + "epoch": 0.14003350665358807, + "grad_norm": 0.322265625, + "learning_rate": 0.0018896632919664644, + "loss": 0.1016, + "step": 16132 + }, + { + "epoch": 0.14004218713379224, + "grad_norm": 0.205078125, + "learning_rate": 0.0018896490129104975, + "loss": 0.1123, + 
"step": 16133 + }, + { + "epoch": 0.1400508676139964, + "grad_norm": 0.1513671875, + "learning_rate": 0.0018896347329909826, + "loss": 0.1162, + "step": 16134 + }, + { + "epoch": 0.14005954809420057, + "grad_norm": 0.25390625, + "learning_rate": 0.001889620452207935, + "loss": 0.1279, + "step": 16135 + }, + { + "epoch": 0.14006822857440474, + "grad_norm": 0.26953125, + "learning_rate": 0.0018896061705613702, + "loss": 0.1094, + "step": 16136 + }, + { + "epoch": 0.1400769090546089, + "grad_norm": 0.1748046875, + "learning_rate": 0.0018895918880513043, + "loss": 0.1416, + "step": 16137 + }, + { + "epoch": 0.14008558953481307, + "grad_norm": 0.333984375, + "learning_rate": 0.001889577604677752, + "loss": 0.1089, + "step": 16138 + }, + { + "epoch": 0.14009427001501723, + "grad_norm": 0.53125, + "learning_rate": 0.0018895633204407299, + "loss": 0.1191, + "step": 16139 + }, + { + "epoch": 0.1401029504952214, + "grad_norm": 0.1123046875, + "learning_rate": 0.001889549035340253, + "loss": 0.1514, + "step": 16140 + }, + { + "epoch": 0.14011163097542556, + "grad_norm": 0.44921875, + "learning_rate": 0.0018895347493763372, + "loss": 0.0918, + "step": 16141 + }, + { + "epoch": 0.14012031145562973, + "grad_norm": 0.126953125, + "learning_rate": 0.0018895204625489982, + "loss": 0.1152, + "step": 16142 + }, + { + "epoch": 0.1401289919358339, + "grad_norm": 0.330078125, + "learning_rate": 0.0018895061748582514, + "loss": 0.0767, + "step": 16143 + }, + { + "epoch": 0.14013767241603806, + "grad_norm": 0.1513671875, + "learning_rate": 0.0018894918863041127, + "loss": 0.1133, + "step": 16144 + }, + { + "epoch": 0.14014635289624222, + "grad_norm": 0.353515625, + "learning_rate": 0.001889477596886597, + "loss": 0.1455, + "step": 16145 + }, + { + "epoch": 0.1401550333764464, + "grad_norm": 0.15625, + "learning_rate": 0.001889463306605721, + "loss": 0.1108, + "step": 16146 + }, + { + "epoch": 0.14016371385665055, + "grad_norm": 0.25, + "learning_rate": 0.0018894490154614996, + "loss": 0.1221, + "step": 16147 + }, + { + "epoch": 0.14017239433685472, + "grad_norm": 0.99609375, + "learning_rate": 0.0018894347234539486, + "loss": 0.1348, + "step": 16148 + }, + { + "epoch": 0.14018107481705888, + "grad_norm": 0.322265625, + "learning_rate": 0.0018894204305830837, + "loss": 0.1128, + "step": 16149 + }, + { + "epoch": 0.14018975529726305, + "grad_norm": 0.482421875, + "learning_rate": 0.0018894061368489204, + "loss": 0.1436, + "step": 16150 + }, + { + "epoch": 0.1401984357774672, + "grad_norm": 0.10498046875, + "learning_rate": 0.0018893918422514745, + "loss": 0.106, + "step": 16151 + }, + { + "epoch": 0.14020711625767138, + "grad_norm": 0.224609375, + "learning_rate": 0.0018893775467907616, + "loss": 0.1445, + "step": 16152 + }, + { + "epoch": 0.14021579673787554, + "grad_norm": 0.40625, + "learning_rate": 0.0018893632504667973, + "loss": 0.1885, + "step": 16153 + }, + { + "epoch": 0.1402244772180797, + "grad_norm": 0.28125, + "learning_rate": 0.0018893489532795969, + "loss": 0.1182, + "step": 16154 + }, + { + "epoch": 0.14023315769828387, + "grad_norm": 0.498046875, + "learning_rate": 0.0018893346552291766, + "loss": 0.1094, + "step": 16155 + }, + { + "epoch": 0.14024183817848804, + "grad_norm": 0.3984375, + "learning_rate": 0.0018893203563155514, + "loss": 0.0972, + "step": 16156 + }, + { + "epoch": 0.1402505186586922, + "grad_norm": 0.69921875, + "learning_rate": 0.0018893060565387378, + "loss": 0.0957, + "step": 16157 + }, + { + "epoch": 0.14025919913889637, + "grad_norm": 0.275390625, + "learning_rate": 
0.001889291755898751, + "loss": 0.1211, + "step": 16158 + }, + { + "epoch": 0.14026787961910053, + "grad_norm": 0.8984375, + "learning_rate": 0.001889277454395606, + "loss": 0.1494, + "step": 16159 + }, + { + "epoch": 0.1402765600993047, + "grad_norm": 0.2109375, + "learning_rate": 0.0018892631520293194, + "loss": 0.1113, + "step": 16160 + }, + { + "epoch": 0.14028524057950886, + "grad_norm": 0.4765625, + "learning_rate": 0.0018892488487999067, + "loss": 0.1787, + "step": 16161 + }, + { + "epoch": 0.14029392105971303, + "grad_norm": 0.423828125, + "learning_rate": 0.0018892345447073832, + "loss": 0.1367, + "step": 16162 + }, + { + "epoch": 0.1403026015399172, + "grad_norm": 0.67578125, + "learning_rate": 0.0018892202397517643, + "loss": 0.1064, + "step": 16163 + }, + { + "epoch": 0.14031128202012136, + "grad_norm": 0.3984375, + "learning_rate": 0.0018892059339330662, + "loss": 0.0986, + "step": 16164 + }, + { + "epoch": 0.14031996250032552, + "grad_norm": 0.375, + "learning_rate": 0.0018891916272513044, + "loss": 0.1035, + "step": 16165 + }, + { + "epoch": 0.1403286429805297, + "grad_norm": 0.25390625, + "learning_rate": 0.0018891773197064947, + "loss": 0.0977, + "step": 16166 + }, + { + "epoch": 0.14033732346073385, + "grad_norm": 0.53125, + "learning_rate": 0.0018891630112986522, + "loss": 0.0811, + "step": 16167 + }, + { + "epoch": 0.14034600394093802, + "grad_norm": 0.10302734375, + "learning_rate": 0.001889148702027793, + "loss": 0.1084, + "step": 16168 + }, + { + "epoch": 0.14035468442114218, + "grad_norm": 0.2060546875, + "learning_rate": 0.0018891343918939327, + "loss": 0.1113, + "step": 16169 + }, + { + "epoch": 0.14036336490134635, + "grad_norm": 0.30859375, + "learning_rate": 0.001889120080897087, + "loss": 0.1191, + "step": 16170 + }, + { + "epoch": 0.14037204538155051, + "grad_norm": 0.11474609375, + "learning_rate": 0.0018891057690372712, + "loss": 0.1216, + "step": 16171 + }, + { + "epoch": 0.14038072586175468, + "grad_norm": 0.140625, + "learning_rate": 0.0018890914563145015, + "loss": 0.0977, + "step": 16172 + }, + { + "epoch": 0.14038940634195884, + "grad_norm": 0.462890625, + "learning_rate": 0.0018890771427287926, + "loss": 0.1699, + "step": 16173 + }, + { + "epoch": 0.140398086822163, + "grad_norm": 1.2109375, + "learning_rate": 0.0018890628282801616, + "loss": 0.1465, + "step": 16174 + }, + { + "epoch": 0.14040676730236717, + "grad_norm": 0.126953125, + "learning_rate": 0.001889048512968623, + "loss": 0.1621, + "step": 16175 + }, + { + "epoch": 0.14041544778257134, + "grad_norm": 0.0888671875, + "learning_rate": 0.0018890341967941926, + "loss": 0.0869, + "step": 16176 + }, + { + "epoch": 0.1404241282627755, + "grad_norm": 0.62109375, + "learning_rate": 0.0018890198797568863, + "loss": 0.1338, + "step": 16177 + }, + { + "epoch": 0.14043280874297967, + "grad_norm": 0.314453125, + "learning_rate": 0.0018890055618567198, + "loss": 0.1045, + "step": 16178 + }, + { + "epoch": 0.14044148922318384, + "grad_norm": 0.390625, + "learning_rate": 0.0018889912430937088, + "loss": 0.085, + "step": 16179 + }, + { + "epoch": 0.140450169703388, + "grad_norm": 0.474609375, + "learning_rate": 0.0018889769234678687, + "loss": 0.0693, + "step": 16180 + }, + { + "epoch": 0.14045885018359217, + "grad_norm": 0.466796875, + "learning_rate": 0.0018889626029792153, + "loss": 0.375, + "step": 16181 + }, + { + "epoch": 0.14046753066379633, + "grad_norm": 0.4453125, + "learning_rate": 0.0018889482816277644, + "loss": 0.1221, + "step": 16182 + }, + { + "epoch": 0.1404762111440005, + "grad_norm": 
4.65625, + "learning_rate": 0.0018889339594135312, + "loss": 0.1387, + "step": 16183 + }, + { + "epoch": 0.14048489162420466, + "grad_norm": 0.68359375, + "learning_rate": 0.001888919636336532, + "loss": 0.1543, + "step": 16184 + }, + { + "epoch": 0.14049357210440883, + "grad_norm": 0.35546875, + "learning_rate": 0.001888905312396782, + "loss": 0.1133, + "step": 16185 + }, + { + "epoch": 0.140502252584613, + "grad_norm": 0.1240234375, + "learning_rate": 0.001888890987594297, + "loss": 0.0806, + "step": 16186 + }, + { + "epoch": 0.14051093306481716, + "grad_norm": 0.357421875, + "learning_rate": 0.0018888766619290928, + "loss": 0.1943, + "step": 16187 + }, + { + "epoch": 0.14051961354502132, + "grad_norm": 1.0703125, + "learning_rate": 0.0018888623354011847, + "loss": 0.1465, + "step": 16188 + }, + { + "epoch": 0.1405282940252255, + "grad_norm": 0.1826171875, + "learning_rate": 0.0018888480080105888, + "loss": 0.0713, + "step": 16189 + }, + { + "epoch": 0.14053697450542965, + "grad_norm": 0.302734375, + "learning_rate": 0.0018888336797573204, + "loss": 0.0884, + "step": 16190 + }, + { + "epoch": 0.14054565498563382, + "grad_norm": 0.166015625, + "learning_rate": 0.0018888193506413954, + "loss": 0.1113, + "step": 16191 + }, + { + "epoch": 0.14055433546583798, + "grad_norm": 0.63671875, + "learning_rate": 0.0018888050206628294, + "loss": 0.0986, + "step": 16192 + }, + { + "epoch": 0.14056301594604215, + "grad_norm": 0.25390625, + "learning_rate": 0.0018887906898216382, + "loss": 0.1406, + "step": 16193 + }, + { + "epoch": 0.1405716964262463, + "grad_norm": 0.2431640625, + "learning_rate": 0.0018887763581178375, + "loss": 0.1182, + "step": 16194 + }, + { + "epoch": 0.14058037690645048, + "grad_norm": 0.2578125, + "learning_rate": 0.0018887620255514425, + "loss": 0.1133, + "step": 16195 + }, + { + "epoch": 0.14058905738665464, + "grad_norm": 0.11572265625, + "learning_rate": 0.001888747692122469, + "loss": 0.0972, + "step": 16196 + }, + { + "epoch": 0.1405977378668588, + "grad_norm": 0.52734375, + "learning_rate": 0.0018887333578309334, + "loss": 0.103, + "step": 16197 + }, + { + "epoch": 0.14060641834706297, + "grad_norm": 0.37109375, + "learning_rate": 0.0018887190226768505, + "loss": 0.1191, + "step": 16198 + }, + { + "epoch": 0.14061509882726714, + "grad_norm": 0.06982421875, + "learning_rate": 0.0018887046866602366, + "loss": 0.0806, + "step": 16199 + }, + { + "epoch": 0.1406237793074713, + "grad_norm": 0.66015625, + "learning_rate": 0.0018886903497811068, + "loss": 0.1357, + "step": 16200 + }, + { + "epoch": 0.14063245978767547, + "grad_norm": 0.31640625, + "learning_rate": 0.0018886760120394772, + "loss": 0.1572, + "step": 16201 + }, + { + "epoch": 0.14064114026787963, + "grad_norm": 0.2392578125, + "learning_rate": 0.0018886616734353635, + "loss": 0.1152, + "step": 16202 + }, + { + "epoch": 0.14064982074808377, + "grad_norm": 0.279296875, + "learning_rate": 0.001888647333968781, + "loss": 0.1602, + "step": 16203 + }, + { + "epoch": 0.14065850122828794, + "grad_norm": 0.41015625, + "learning_rate": 0.001888632993639746, + "loss": 0.1147, + "step": 16204 + }, + { + "epoch": 0.1406671817084921, + "grad_norm": 0.142578125, + "learning_rate": 0.0018886186524482735, + "loss": 0.1523, + "step": 16205 + }, + { + "epoch": 0.14067586218869627, + "grad_norm": 0.09228515625, + "learning_rate": 0.0018886043103943796, + "loss": 0.0918, + "step": 16206 + }, + { + "epoch": 0.14068454266890043, + "grad_norm": 0.314453125, + "learning_rate": 0.0018885899674780796, + "loss": 0.0859, + "step": 16207 + }, + 
{ + "epoch": 0.1406932231491046, + "grad_norm": 0.59765625, + "learning_rate": 0.00188857562369939, + "loss": 0.0859, + "step": 16208 + }, + { + "epoch": 0.14070190362930876, + "grad_norm": 0.30859375, + "learning_rate": 0.0018885612790583255, + "loss": 0.1348, + "step": 16209 + }, + { + "epoch": 0.14071058410951293, + "grad_norm": 0.1826171875, + "learning_rate": 0.0018885469335549023, + "loss": 0.0967, + "step": 16210 + }, + { + "epoch": 0.1407192645897171, + "grad_norm": 0.130859375, + "learning_rate": 0.0018885325871891361, + "loss": 0.1182, + "step": 16211 + }, + { + "epoch": 0.14072794506992126, + "grad_norm": 0.09765625, + "learning_rate": 0.0018885182399610426, + "loss": 0.1445, + "step": 16212 + }, + { + "epoch": 0.14073662555012542, + "grad_norm": 0.146484375, + "learning_rate": 0.0018885038918706374, + "loss": 0.1221, + "step": 16213 + }, + { + "epoch": 0.1407453060303296, + "grad_norm": 0.462890625, + "learning_rate": 0.0018884895429179358, + "loss": 0.126, + "step": 16214 + }, + { + "epoch": 0.14075398651053375, + "grad_norm": 0.703125, + "learning_rate": 0.0018884751931029546, + "loss": 0.1162, + "step": 16215 + }, + { + "epoch": 0.14076266699073792, + "grad_norm": 0.6171875, + "learning_rate": 0.0018884608424257082, + "loss": 0.1172, + "step": 16216 + }, + { + "epoch": 0.14077134747094208, + "grad_norm": 0.2734375, + "learning_rate": 0.0018884464908862132, + "loss": 0.0991, + "step": 16217 + }, + { + "epoch": 0.14078002795114625, + "grad_norm": 0.62109375, + "learning_rate": 0.0018884321384844847, + "loss": 0.1162, + "step": 16218 + }, + { + "epoch": 0.1407887084313504, + "grad_norm": 0.185546875, + "learning_rate": 0.0018884177852205386, + "loss": 0.1201, + "step": 16219 + }, + { + "epoch": 0.14079738891155458, + "grad_norm": 0.5703125, + "learning_rate": 0.001888403431094391, + "loss": 0.124, + "step": 16220 + }, + { + "epoch": 0.14080606939175874, + "grad_norm": 1.453125, + "learning_rate": 0.0018883890761060569, + "loss": 0.1152, + "step": 16221 + }, + { + "epoch": 0.1408147498719629, + "grad_norm": 0.50390625, + "learning_rate": 0.0018883747202555526, + "loss": 0.1177, + "step": 16222 + }, + { + "epoch": 0.14082343035216707, + "grad_norm": 0.3125, + "learning_rate": 0.0018883603635428934, + "loss": 0.1099, + "step": 16223 + }, + { + "epoch": 0.14083211083237124, + "grad_norm": 0.12158203125, + "learning_rate": 0.0018883460059680953, + "loss": 0.1348, + "step": 16224 + }, + { + "epoch": 0.1408407913125754, + "grad_norm": 0.11083984375, + "learning_rate": 0.001888331647531174, + "loss": 0.1245, + "step": 16225 + }, + { + "epoch": 0.14084947179277957, + "grad_norm": 0.333984375, + "learning_rate": 0.0018883172882321443, + "loss": 0.0918, + "step": 16226 + }, + { + "epoch": 0.14085815227298373, + "grad_norm": 0.359375, + "learning_rate": 0.0018883029280710234, + "loss": 0.1006, + "step": 16227 + }, + { + "epoch": 0.1408668327531879, + "grad_norm": 0.09814453125, + "learning_rate": 0.0018882885670478261, + "loss": 0.123, + "step": 16228 + }, + { + "epoch": 0.14087551323339206, + "grad_norm": 0.1328125, + "learning_rate": 0.001888274205162568, + "loss": 0.1104, + "step": 16229 + }, + { + "epoch": 0.14088419371359623, + "grad_norm": 0.203125, + "learning_rate": 0.0018882598424152654, + "loss": 0.1777, + "step": 16230 + }, + { + "epoch": 0.1408928741938004, + "grad_norm": 0.48046875, + "learning_rate": 0.0018882454788059335, + "loss": 0.1621, + "step": 16231 + }, + { + "epoch": 0.14090155467400456, + "grad_norm": 0.45703125, + "learning_rate": 0.0018882311143345884, + "loss": 
0.1299, + "step": 16232 + }, + { + "epoch": 0.14091023515420872, + "grad_norm": 0.5078125, + "learning_rate": 0.0018882167490012454, + "loss": 0.0913, + "step": 16233 + }, + { + "epoch": 0.1409189156344129, + "grad_norm": 0.490234375, + "learning_rate": 0.0018882023828059205, + "loss": 0.1426, + "step": 16234 + }, + { + "epoch": 0.14092759611461705, + "grad_norm": 0.2001953125, + "learning_rate": 0.0018881880157486292, + "loss": 0.2129, + "step": 16235 + }, + { + "epoch": 0.14093627659482122, + "grad_norm": 0.435546875, + "learning_rate": 0.0018881736478293873, + "loss": 0.0952, + "step": 16236 + }, + { + "epoch": 0.14094495707502538, + "grad_norm": 0.1025390625, + "learning_rate": 0.001888159279048211, + "loss": 0.1133, + "step": 16237 + }, + { + "epoch": 0.14095363755522955, + "grad_norm": 2.328125, + "learning_rate": 0.001888144909405115, + "loss": 0.1992, + "step": 16238 + }, + { + "epoch": 0.14096231803543371, + "grad_norm": 0.08984375, + "learning_rate": 0.001888130538900116, + "loss": 0.1172, + "step": 16239 + }, + { + "epoch": 0.14097099851563788, + "grad_norm": 0.1591796875, + "learning_rate": 0.0018881161675332288, + "loss": 0.1064, + "step": 16240 + }, + { + "epoch": 0.14097967899584204, + "grad_norm": 0.107421875, + "learning_rate": 0.00188810179530447, + "loss": 0.1289, + "step": 16241 + }, + { + "epoch": 0.1409883594760462, + "grad_norm": 0.27734375, + "learning_rate": 0.0018880874222138547, + "loss": 0.1396, + "step": 16242 + }, + { + "epoch": 0.14099703995625038, + "grad_norm": 1.0390625, + "learning_rate": 0.0018880730482613987, + "loss": 0.0718, + "step": 16243 + }, + { + "epoch": 0.14100572043645454, + "grad_norm": 0.37890625, + "learning_rate": 0.0018880586734471181, + "loss": 0.1221, + "step": 16244 + }, + { + "epoch": 0.1410144009166587, + "grad_norm": 0.2236328125, + "learning_rate": 0.0018880442977710283, + "loss": 0.1055, + "step": 16245 + }, + { + "epoch": 0.14102308139686287, + "grad_norm": 0.66015625, + "learning_rate": 0.0018880299212331454, + "loss": 0.125, + "step": 16246 + }, + { + "epoch": 0.14103176187706704, + "grad_norm": 0.212890625, + "learning_rate": 0.0018880155438334846, + "loss": 0.1152, + "step": 16247 + }, + { + "epoch": 0.1410404423572712, + "grad_norm": 0.56640625, + "learning_rate": 0.0018880011655720615, + "loss": 0.1172, + "step": 16248 + }, + { + "epoch": 0.14104912283747537, + "grad_norm": 0.1455078125, + "learning_rate": 0.0018879867864488925, + "loss": 0.0874, + "step": 16249 + }, + { + "epoch": 0.14105780331767953, + "grad_norm": 0.298828125, + "learning_rate": 0.001887972406463993, + "loss": 0.1162, + "step": 16250 + }, + { + "epoch": 0.1410664837978837, + "grad_norm": 0.77734375, + "learning_rate": 0.0018879580256173788, + "loss": 0.0972, + "step": 16251 + }, + { + "epoch": 0.14107516427808786, + "grad_norm": 0.6875, + "learning_rate": 0.0018879436439090654, + "loss": 0.1035, + "step": 16252 + }, + { + "epoch": 0.14108384475829203, + "grad_norm": 0.2412109375, + "learning_rate": 0.0018879292613390684, + "loss": 0.0864, + "step": 16253 + }, + { + "epoch": 0.1410925252384962, + "grad_norm": 0.25390625, + "learning_rate": 0.0018879148779074042, + "loss": 0.0879, + "step": 16254 + }, + { + "epoch": 0.14110120571870036, + "grad_norm": 0.5078125, + "learning_rate": 0.0018879004936140878, + "loss": 0.1104, + "step": 16255 + }, + { + "epoch": 0.14110988619890452, + "grad_norm": 0.1259765625, + "learning_rate": 0.0018878861084591356, + "loss": 0.1543, + "step": 16256 + }, + { + "epoch": 0.1411185666791087, + "grad_norm": 0.47265625, + 
"learning_rate": 0.001887871722442563, + "loss": 0.1143, + "step": 16257 + }, + { + "epoch": 0.14112724715931285, + "grad_norm": 0.609375, + "learning_rate": 0.0018878573355643855, + "loss": 0.1162, + "step": 16258 + }, + { + "epoch": 0.14113592763951702, + "grad_norm": 0.1005859375, + "learning_rate": 0.0018878429478246192, + "loss": 0.1128, + "step": 16259 + }, + { + "epoch": 0.14114460811972118, + "grad_norm": 0.09912109375, + "learning_rate": 0.0018878285592232795, + "loss": 0.1504, + "step": 16260 + }, + { + "epoch": 0.14115328859992535, + "grad_norm": 0.1171875, + "learning_rate": 0.0018878141697603824, + "loss": 0.1123, + "step": 16261 + }, + { + "epoch": 0.1411619690801295, + "grad_norm": 0.1591796875, + "learning_rate": 0.0018877997794359439, + "loss": 0.0942, + "step": 16262 + }, + { + "epoch": 0.14117064956033368, + "grad_norm": 1.1640625, + "learning_rate": 0.001887785388249979, + "loss": 0.1621, + "step": 16263 + }, + { + "epoch": 0.14117933004053784, + "grad_norm": 0.51171875, + "learning_rate": 0.0018877709962025037, + "loss": 0.1445, + "step": 16264 + }, + { + "epoch": 0.141188010520742, + "grad_norm": 0.56640625, + "learning_rate": 0.001887756603293534, + "loss": 0.1211, + "step": 16265 + }, + { + "epoch": 0.14119669100094617, + "grad_norm": 0.1708984375, + "learning_rate": 0.0018877422095230857, + "loss": 0.1318, + "step": 16266 + }, + { + "epoch": 0.14120537148115034, + "grad_norm": 0.2197265625, + "learning_rate": 0.0018877278148911746, + "loss": 0.1079, + "step": 16267 + }, + { + "epoch": 0.1412140519613545, + "grad_norm": 1.2734375, + "learning_rate": 0.0018877134193978156, + "loss": 0.0918, + "step": 16268 + }, + { + "epoch": 0.14122273244155867, + "grad_norm": 0.173828125, + "learning_rate": 0.0018876990230430252, + "loss": 0.1006, + "step": 16269 + }, + { + "epoch": 0.14123141292176283, + "grad_norm": 0.201171875, + "learning_rate": 0.0018876846258268191, + "loss": 0.1035, + "step": 16270 + }, + { + "epoch": 0.141240093401967, + "grad_norm": 0.69140625, + "learning_rate": 0.001887670227749213, + "loss": 0.1143, + "step": 16271 + }, + { + "epoch": 0.14124877388217116, + "grad_norm": 0.10498046875, + "learning_rate": 0.0018876558288102227, + "loss": 0.0977, + "step": 16272 + }, + { + "epoch": 0.14125745436237533, + "grad_norm": 0.314453125, + "learning_rate": 0.0018876414290098635, + "loss": 0.1543, + "step": 16273 + }, + { + "epoch": 0.1412661348425795, + "grad_norm": 0.333984375, + "learning_rate": 0.0018876270283481515, + "loss": 0.1533, + "step": 16274 + }, + { + "epoch": 0.14127481532278366, + "grad_norm": 0.2138671875, + "learning_rate": 0.0018876126268251026, + "loss": 0.123, + "step": 16275 + }, + { + "epoch": 0.14128349580298782, + "grad_norm": 0.17578125, + "learning_rate": 0.0018875982244407324, + "loss": 0.1191, + "step": 16276 + }, + { + "epoch": 0.141292176283192, + "grad_norm": 0.1884765625, + "learning_rate": 0.0018875838211950562, + "loss": 0.1436, + "step": 16277 + }, + { + "epoch": 0.14130085676339615, + "grad_norm": 0.322265625, + "learning_rate": 0.0018875694170880906, + "loss": 0.0859, + "step": 16278 + }, + { + "epoch": 0.14130953724360032, + "grad_norm": 0.1650390625, + "learning_rate": 0.0018875550121198509, + "loss": 0.0879, + "step": 16279 + }, + { + "epoch": 0.14131821772380448, + "grad_norm": 0.25390625, + "learning_rate": 0.0018875406062903526, + "loss": 0.1055, + "step": 16280 + }, + { + "epoch": 0.14132689820400865, + "grad_norm": 0.248046875, + "learning_rate": 0.0018875261995996118, + "loss": 0.1309, + "step": 16281 + }, + { + 
"epoch": 0.14133557868421281, + "grad_norm": 0.291015625, + "learning_rate": 0.0018875117920476444, + "loss": 0.1357, + "step": 16282 + }, + { + "epoch": 0.14134425916441698, + "grad_norm": 0.984375, + "learning_rate": 0.0018874973836344657, + "loss": 0.1465, + "step": 16283 + }, + { + "epoch": 0.14135293964462115, + "grad_norm": 0.1318359375, + "learning_rate": 0.0018874829743600917, + "loss": 0.1279, + "step": 16284 + }, + { + "epoch": 0.1413616201248253, + "grad_norm": 0.1552734375, + "learning_rate": 0.0018874685642245384, + "loss": 0.0947, + "step": 16285 + }, + { + "epoch": 0.14137030060502948, + "grad_norm": 0.765625, + "learning_rate": 0.0018874541532278208, + "loss": 0.0986, + "step": 16286 + }, + { + "epoch": 0.14137898108523364, + "grad_norm": 0.48828125, + "learning_rate": 0.0018874397413699557, + "loss": 0.1182, + "step": 16287 + }, + { + "epoch": 0.1413876615654378, + "grad_norm": 0.265625, + "learning_rate": 0.001887425328650958, + "loss": 0.1318, + "step": 16288 + }, + { + "epoch": 0.14139634204564197, + "grad_norm": 0.60546875, + "learning_rate": 0.0018874109150708441, + "loss": 0.1094, + "step": 16289 + }, + { + "epoch": 0.14140502252584614, + "grad_norm": 0.5078125, + "learning_rate": 0.0018873965006296293, + "loss": 0.1045, + "step": 16290 + }, + { + "epoch": 0.1414137030060503, + "grad_norm": 0.462890625, + "learning_rate": 0.0018873820853273294, + "loss": 0.1001, + "step": 16291 + }, + { + "epoch": 0.14142238348625447, + "grad_norm": 3.84375, + "learning_rate": 0.0018873676691639606, + "loss": 0.2891, + "step": 16292 + }, + { + "epoch": 0.14143106396645863, + "grad_norm": 0.2060546875, + "learning_rate": 0.001887353252139538, + "loss": 0.1118, + "step": 16293 + }, + { + "epoch": 0.1414397444466628, + "grad_norm": 0.35546875, + "learning_rate": 0.001887338834254078, + "loss": 0.123, + "step": 16294 + }, + { + "epoch": 0.14144842492686696, + "grad_norm": 0.291015625, + "learning_rate": 0.0018873244155075957, + "loss": 0.1055, + "step": 16295 + }, + { + "epoch": 0.14145710540707113, + "grad_norm": 0.30078125, + "learning_rate": 0.0018873099959001076, + "loss": 0.1484, + "step": 16296 + }, + { + "epoch": 0.1414657858872753, + "grad_norm": 0.1259765625, + "learning_rate": 0.001887295575431629, + "loss": 0.1426, + "step": 16297 + }, + { + "epoch": 0.14147446636747946, + "grad_norm": 0.5703125, + "learning_rate": 0.0018872811541021757, + "loss": 0.127, + "step": 16298 + }, + { + "epoch": 0.14148314684768362, + "grad_norm": 0.67578125, + "learning_rate": 0.0018872667319117637, + "loss": 0.1089, + "step": 16299 + }, + { + "epoch": 0.1414918273278878, + "grad_norm": 0.73046875, + "learning_rate": 0.0018872523088604085, + "loss": 0.0967, + "step": 16300 + }, + { + "epoch": 0.14150050780809195, + "grad_norm": 0.55078125, + "learning_rate": 0.001887237884948126, + "loss": 0.1123, + "step": 16301 + }, + { + "epoch": 0.14150918828829612, + "grad_norm": 0.11376953125, + "learning_rate": 0.0018872234601749322, + "loss": 0.1191, + "step": 16302 + }, + { + "epoch": 0.14151786876850028, + "grad_norm": 0.40625, + "learning_rate": 0.0018872090345408423, + "loss": 0.1104, + "step": 16303 + }, + { + "epoch": 0.14152654924870445, + "grad_norm": 0.63671875, + "learning_rate": 0.0018871946080458728, + "loss": 0.1172, + "step": 16304 + }, + { + "epoch": 0.1415352297289086, + "grad_norm": 0.1201171875, + "learning_rate": 0.001887180180690039, + "loss": 0.1289, + "step": 16305 + }, + { + "epoch": 0.14154391020911278, + "grad_norm": 0.0810546875, + "learning_rate": 0.0018871657524733567, + "loss": 
0.0967, + "step": 16306 + }, + { + "epoch": 0.14155259068931694, + "grad_norm": 0.05810546875, + "learning_rate": 0.0018871513233958417, + "loss": 0.0713, + "step": 16307 + }, + { + "epoch": 0.1415612711695211, + "grad_norm": 0.2021484375, + "learning_rate": 0.00188713689345751, + "loss": 0.1543, + "step": 16308 + }, + { + "epoch": 0.14156995164972527, + "grad_norm": 0.396484375, + "learning_rate": 0.001887122462658377, + "loss": 0.1621, + "step": 16309 + }, + { + "epoch": 0.14157863212992944, + "grad_norm": 0.57421875, + "learning_rate": 0.001887108030998459, + "loss": 0.1709, + "step": 16310 + }, + { + "epoch": 0.1415873126101336, + "grad_norm": 0.14453125, + "learning_rate": 0.001887093598477771, + "loss": 0.1182, + "step": 16311 + }, + { + "epoch": 0.14159599309033777, + "grad_norm": 0.197265625, + "learning_rate": 0.00188707916509633, + "loss": 0.1406, + "step": 16312 + }, + { + "epoch": 0.14160467357054193, + "grad_norm": 0.1884765625, + "learning_rate": 0.0018870647308541503, + "loss": 0.0889, + "step": 16313 + }, + { + "epoch": 0.1416133540507461, + "grad_norm": 0.6875, + "learning_rate": 0.001887050295751249, + "loss": 0.1152, + "step": 16314 + }, + { + "epoch": 0.14162203453095026, + "grad_norm": 0.21484375, + "learning_rate": 0.0018870358597876408, + "loss": 0.1416, + "step": 16315 + }, + { + "epoch": 0.14163071501115443, + "grad_norm": 0.30859375, + "learning_rate": 0.0018870214229633425, + "loss": 0.1406, + "step": 16316 + }, + { + "epoch": 0.1416393954913586, + "grad_norm": 0.412109375, + "learning_rate": 0.0018870069852783692, + "loss": 0.1206, + "step": 16317 + }, + { + "epoch": 0.14164807597156276, + "grad_norm": 0.06396484375, + "learning_rate": 0.0018869925467327367, + "loss": 0.0957, + "step": 16318 + }, + { + "epoch": 0.14165675645176692, + "grad_norm": 0.515625, + "learning_rate": 0.0018869781073264614, + "loss": 0.1387, + "step": 16319 + }, + { + "epoch": 0.1416654369319711, + "grad_norm": 0.25390625, + "learning_rate": 0.0018869636670595582, + "loss": 0.1289, + "step": 16320 + }, + { + "epoch": 0.14167411741217525, + "grad_norm": 0.333984375, + "learning_rate": 0.0018869492259320439, + "loss": 0.0908, + "step": 16321 + }, + { + "epoch": 0.14168279789237942, + "grad_norm": 0.1328125, + "learning_rate": 0.0018869347839439333, + "loss": 0.125, + "step": 16322 + }, + { + "epoch": 0.14169147837258358, + "grad_norm": 0.443359375, + "learning_rate": 0.0018869203410952426, + "loss": 0.1406, + "step": 16323 + }, + { + "epoch": 0.14170015885278775, + "grad_norm": 0.09814453125, + "learning_rate": 0.0018869058973859881, + "loss": 0.1191, + "step": 16324 + }, + { + "epoch": 0.14170883933299191, + "grad_norm": 0.220703125, + "learning_rate": 0.0018868914528161846, + "loss": 0.1162, + "step": 16325 + }, + { + "epoch": 0.14171751981319605, + "grad_norm": 0.84375, + "learning_rate": 0.0018868770073858488, + "loss": 0.1016, + "step": 16326 + }, + { + "epoch": 0.14172620029340022, + "grad_norm": 0.12451171875, + "learning_rate": 0.0018868625610949958, + "loss": 0.1348, + "step": 16327 + }, + { + "epoch": 0.14173488077360438, + "grad_norm": 0.11865234375, + "learning_rate": 0.001886848113943642, + "loss": 0.123, + "step": 16328 + }, + { + "epoch": 0.14174356125380855, + "grad_norm": 0.58203125, + "learning_rate": 0.001886833665931803, + "loss": 0.1367, + "step": 16329 + }, + { + "epoch": 0.1417522417340127, + "grad_norm": 0.08984375, + "learning_rate": 0.0018868192170594944, + "loss": 0.1113, + "step": 16330 + }, + { + "epoch": 0.14176092221421688, + "grad_norm": 0.498046875, + 
"learning_rate": 0.001886804767326732, + "loss": 0.1289, + "step": 16331 + }, + { + "epoch": 0.14176960269442104, + "grad_norm": 0.703125, + "learning_rate": 0.0018867903167335321, + "loss": 0.0962, + "step": 16332 + }, + { + "epoch": 0.1417782831746252, + "grad_norm": 0.09375, + "learning_rate": 0.0018867758652799096, + "loss": 0.1221, + "step": 16333 + }, + { + "epoch": 0.14178696365482937, + "grad_norm": 0.8125, + "learning_rate": 0.001886761412965881, + "loss": 0.1289, + "step": 16334 + }, + { + "epoch": 0.14179564413503354, + "grad_norm": 0.298828125, + "learning_rate": 0.0018867469597914621, + "loss": 0.1387, + "step": 16335 + }, + { + "epoch": 0.1418043246152377, + "grad_norm": 0.31640625, + "learning_rate": 0.0018867325057566684, + "loss": 0.1328, + "step": 16336 + }, + { + "epoch": 0.14181300509544187, + "grad_norm": 0.169921875, + "learning_rate": 0.001886718050861516, + "loss": 0.1079, + "step": 16337 + }, + { + "epoch": 0.14182168557564603, + "grad_norm": 0.1982421875, + "learning_rate": 0.0018867035951060207, + "loss": 0.1133, + "step": 16338 + }, + { + "epoch": 0.1418303660558502, + "grad_norm": 0.5078125, + "learning_rate": 0.001886689138490198, + "loss": 0.1309, + "step": 16339 + }, + { + "epoch": 0.14183904653605436, + "grad_norm": 0.1728515625, + "learning_rate": 0.0018866746810140638, + "loss": 0.1074, + "step": 16340 + }, + { + "epoch": 0.14184772701625853, + "grad_norm": 0.08203125, + "learning_rate": 0.0018866602226776342, + "loss": 0.1035, + "step": 16341 + }, + { + "epoch": 0.1418564074964627, + "grad_norm": 0.0791015625, + "learning_rate": 0.0018866457634809245, + "loss": 0.126, + "step": 16342 + }, + { + "epoch": 0.14186508797666686, + "grad_norm": 0.1552734375, + "learning_rate": 0.0018866313034239506, + "loss": 0.1562, + "step": 16343 + }, + { + "epoch": 0.14187376845687102, + "grad_norm": 0.87109375, + "learning_rate": 0.001886616842506729, + "loss": 0.0991, + "step": 16344 + }, + { + "epoch": 0.1418824489370752, + "grad_norm": 0.1474609375, + "learning_rate": 0.0018866023807292752, + "loss": 0.1035, + "step": 16345 + }, + { + "epoch": 0.14189112941727935, + "grad_norm": 0.33203125, + "learning_rate": 0.0018865879180916044, + "loss": 0.1094, + "step": 16346 + }, + { + "epoch": 0.14189980989748352, + "grad_norm": 0.734375, + "learning_rate": 0.001886573454593733, + "loss": 0.1426, + "step": 16347 + }, + { + "epoch": 0.14190849037768768, + "grad_norm": 0.435546875, + "learning_rate": 0.001886558990235677, + "loss": 0.1064, + "step": 16348 + }, + { + "epoch": 0.14191717085789185, + "grad_norm": 0.12109375, + "learning_rate": 0.0018865445250174514, + "loss": 0.126, + "step": 16349 + }, + { + "epoch": 0.14192585133809602, + "grad_norm": 0.2177734375, + "learning_rate": 0.0018865300589390728, + "loss": 0.1309, + "step": 16350 + }, + { + "epoch": 0.14193453181830018, + "grad_norm": 0.2158203125, + "learning_rate": 0.0018865155920005565, + "loss": 0.1367, + "step": 16351 + }, + { + "epoch": 0.14194321229850435, + "grad_norm": 0.08203125, + "learning_rate": 0.001886501124201919, + "loss": 0.0981, + "step": 16352 + }, + { + "epoch": 0.1419518927787085, + "grad_norm": 0.2255859375, + "learning_rate": 0.0018864866555431755, + "loss": 0.1279, + "step": 16353 + }, + { + "epoch": 0.14196057325891268, + "grad_norm": 0.08935546875, + "learning_rate": 0.001886472186024342, + "loss": 0.0986, + "step": 16354 + }, + { + "epoch": 0.14196925373911684, + "grad_norm": 0.875, + "learning_rate": 0.0018864577156454342, + "loss": 0.1426, + "step": 16355 + }, + { + "epoch": 
0.141977934219321, + "grad_norm": 0.427734375, + "learning_rate": 0.0018864432444064682, + "loss": 0.1699, + "step": 16356 + }, + { + "epoch": 0.14198661469952517, + "grad_norm": 0.25390625, + "learning_rate": 0.0018864287723074596, + "loss": 0.1562, + "step": 16357 + }, + { + "epoch": 0.14199529517972934, + "grad_norm": 0.53125, + "learning_rate": 0.0018864142993484244, + "loss": 0.1299, + "step": 16358 + }, + { + "epoch": 0.1420039756599335, + "grad_norm": 0.09814453125, + "learning_rate": 0.0018863998255293781, + "loss": 0.1494, + "step": 16359 + }, + { + "epoch": 0.14201265614013767, + "grad_norm": 0.189453125, + "learning_rate": 0.001886385350850337, + "loss": 0.1055, + "step": 16360 + }, + { + "epoch": 0.14202133662034183, + "grad_norm": 0.8046875, + "learning_rate": 0.001886370875311317, + "loss": 0.1016, + "step": 16361 + }, + { + "epoch": 0.142030017100546, + "grad_norm": 0.765625, + "learning_rate": 0.0018863563989123331, + "loss": 0.1006, + "step": 16362 + }, + { + "epoch": 0.14203869758075016, + "grad_norm": 0.73828125, + "learning_rate": 0.001886341921653402, + "loss": 0.0801, + "step": 16363 + }, + { + "epoch": 0.14204737806095433, + "grad_norm": 0.2275390625, + "learning_rate": 0.0018863274435345389, + "loss": 0.0996, + "step": 16364 + }, + { + "epoch": 0.1420560585411585, + "grad_norm": 0.58984375, + "learning_rate": 0.00188631296455576, + "loss": 0.1162, + "step": 16365 + }, + { + "epoch": 0.14206473902136266, + "grad_norm": 0.181640625, + "learning_rate": 0.001886298484717081, + "loss": 0.1182, + "step": 16366 + }, + { + "epoch": 0.14207341950156682, + "grad_norm": 0.1826171875, + "learning_rate": 0.0018862840040185182, + "loss": 0.0889, + "step": 16367 + }, + { + "epoch": 0.142082099981771, + "grad_norm": 0.486328125, + "learning_rate": 0.0018862695224600864, + "loss": 0.1328, + "step": 16368 + }, + { + "epoch": 0.14209078046197515, + "grad_norm": 0.828125, + "learning_rate": 0.0018862550400418026, + "loss": 0.1641, + "step": 16369 + }, + { + "epoch": 0.14209946094217932, + "grad_norm": 0.18359375, + "learning_rate": 0.0018862405567636818, + "loss": 0.1152, + "step": 16370 + }, + { + "epoch": 0.14210814142238348, + "grad_norm": 0.076171875, + "learning_rate": 0.0018862260726257402, + "loss": 0.0933, + "step": 16371 + }, + { + "epoch": 0.14211682190258765, + "grad_norm": 0.052978515625, + "learning_rate": 0.0018862115876279936, + "loss": 0.0957, + "step": 16372 + }, + { + "epoch": 0.1421255023827918, + "grad_norm": 0.11865234375, + "learning_rate": 0.001886197101770458, + "loss": 0.0908, + "step": 16373 + }, + { + "epoch": 0.14213418286299598, + "grad_norm": 0.73046875, + "learning_rate": 0.0018861826150531487, + "loss": 0.0752, + "step": 16374 + }, + { + "epoch": 0.14214286334320014, + "grad_norm": 1.0078125, + "learning_rate": 0.0018861681274760823, + "loss": 0.0835, + "step": 16375 + }, + { + "epoch": 0.1421515438234043, + "grad_norm": 0.369140625, + "learning_rate": 0.001886153639039274, + "loss": 0.1172, + "step": 16376 + }, + { + "epoch": 0.14216022430360847, + "grad_norm": 0.62890625, + "learning_rate": 0.00188613914974274, + "loss": 0.1475, + "step": 16377 + }, + { + "epoch": 0.14216890478381264, + "grad_norm": 0.34765625, + "learning_rate": 0.0018861246595864955, + "loss": 0.166, + "step": 16378 + }, + { + "epoch": 0.1421775852640168, + "grad_norm": 0.44140625, + "learning_rate": 0.0018861101685705578, + "loss": 0.1582, + "step": 16379 + }, + { + "epoch": 0.14218626574422097, + "grad_norm": 0.1552734375, + "learning_rate": 0.0018860956766949414, + "loss": 0.127, 
+ "step": 16380 + }, + { + "epoch": 0.14219494622442513, + "grad_norm": 0.259765625, + "learning_rate": 0.0018860811839596623, + "loss": 0.103, + "step": 16381 + }, + { + "epoch": 0.1422036267046293, + "grad_norm": 0.357421875, + "learning_rate": 0.001886066690364737, + "loss": 0.1133, + "step": 16382 + }, + { + "epoch": 0.14221230718483346, + "grad_norm": 0.2451171875, + "learning_rate": 0.001886052195910181, + "loss": 0.1182, + "step": 16383 + }, + { + "epoch": 0.14222098766503763, + "grad_norm": 0.2255859375, + "learning_rate": 0.00188603770059601, + "loss": 0.1172, + "step": 16384 + }, + { + "epoch": 0.1422296681452418, + "grad_norm": 0.326171875, + "learning_rate": 0.00188602320442224, + "loss": 0.1387, + "step": 16385 + }, + { + "epoch": 0.14223834862544596, + "grad_norm": 0.234375, + "learning_rate": 0.0018860087073888866, + "loss": 0.0864, + "step": 16386 + }, + { + "epoch": 0.14224702910565012, + "grad_norm": 0.75, + "learning_rate": 0.0018859942094959664, + "loss": 0.0942, + "step": 16387 + }, + { + "epoch": 0.1422557095858543, + "grad_norm": 0.546875, + "learning_rate": 0.001885979710743494, + "loss": 0.0957, + "step": 16388 + }, + { + "epoch": 0.14226439006605845, + "grad_norm": 0.4609375, + "learning_rate": 0.0018859652111314867, + "loss": 0.1533, + "step": 16389 + }, + { + "epoch": 0.14227307054626262, + "grad_norm": 0.31640625, + "learning_rate": 0.0018859507106599594, + "loss": 0.1367, + "step": 16390 + }, + { + "epoch": 0.14228175102646679, + "grad_norm": 0.12451171875, + "learning_rate": 0.001885936209328928, + "loss": 0.103, + "step": 16391 + }, + { + "epoch": 0.14229043150667095, + "grad_norm": 0.1220703125, + "learning_rate": 0.0018859217071384088, + "loss": 0.0669, + "step": 16392 + }, + { + "epoch": 0.14229911198687512, + "grad_norm": 0.455078125, + "learning_rate": 0.0018859072040884176, + "loss": 0.0991, + "step": 16393 + }, + { + "epoch": 0.14230779246707928, + "grad_norm": 0.15625, + "learning_rate": 0.0018858927001789698, + "loss": 0.1719, + "step": 16394 + }, + { + "epoch": 0.14231647294728345, + "grad_norm": 0.11328125, + "learning_rate": 0.0018858781954100817, + "loss": 0.0889, + "step": 16395 + }, + { + "epoch": 0.1423251534274876, + "grad_norm": 0.06640625, + "learning_rate": 0.001885863689781769, + "loss": 0.0801, + "step": 16396 + }, + { + "epoch": 0.14233383390769178, + "grad_norm": 0.376953125, + "learning_rate": 0.0018858491832940477, + "loss": 0.1143, + "step": 16397 + }, + { + "epoch": 0.14234251438789594, + "grad_norm": 0.48046875, + "learning_rate": 0.001885834675946933, + "loss": 0.1445, + "step": 16398 + }, + { + "epoch": 0.1423511948681001, + "grad_norm": 0.4140625, + "learning_rate": 0.0018858201677404418, + "loss": 0.1211, + "step": 16399 + }, + { + "epoch": 0.14235987534830427, + "grad_norm": 0.1982421875, + "learning_rate": 0.0018858056586745892, + "loss": 0.083, + "step": 16400 + }, + { + "epoch": 0.14236855582850844, + "grad_norm": 0.91796875, + "learning_rate": 0.0018857911487493917, + "loss": 0.1211, + "step": 16401 + }, + { + "epoch": 0.1423772363087126, + "grad_norm": 0.36328125, + "learning_rate": 0.0018857766379648646, + "loss": 0.1123, + "step": 16402 + }, + { + "epoch": 0.14238591678891677, + "grad_norm": 0.46875, + "learning_rate": 0.001885762126321024, + "loss": 0.1099, + "step": 16403 + }, + { + "epoch": 0.14239459726912093, + "grad_norm": 0.94921875, + "learning_rate": 0.001885747613817886, + "loss": 0.1396, + "step": 16404 + }, + { + "epoch": 0.1424032777493251, + "grad_norm": 0.353515625, + "learning_rate": 
0.0018857331004554657, + "loss": 0.1167, + "step": 16405 + }, + { + "epoch": 0.14241195822952926, + "grad_norm": 0.365234375, + "learning_rate": 0.0018857185862337798, + "loss": 0.1191, + "step": 16406 + }, + { + "epoch": 0.14242063870973343, + "grad_norm": 0.310546875, + "learning_rate": 0.0018857040711528438, + "loss": 0.125, + "step": 16407 + }, + { + "epoch": 0.1424293191899376, + "grad_norm": 0.2890625, + "learning_rate": 0.0018856895552126737, + "loss": 0.1289, + "step": 16408 + }, + { + "epoch": 0.14243799967014176, + "grad_norm": 0.0810546875, + "learning_rate": 0.001885675038413285, + "loss": 0.1289, + "step": 16409 + }, + { + "epoch": 0.14244668015034592, + "grad_norm": 0.412109375, + "learning_rate": 0.001885660520754694, + "loss": 0.1064, + "step": 16410 + }, + { + "epoch": 0.1424553606305501, + "grad_norm": 0.111328125, + "learning_rate": 0.0018856460022369165, + "loss": 0.1064, + "step": 16411 + }, + { + "epoch": 0.14246404111075425, + "grad_norm": 0.455078125, + "learning_rate": 0.0018856314828599686, + "loss": 0.1348, + "step": 16412 + }, + { + "epoch": 0.14247272159095842, + "grad_norm": 0.3515625, + "learning_rate": 0.0018856169626238657, + "loss": 0.0659, + "step": 16413 + }, + { + "epoch": 0.14248140207116258, + "grad_norm": 0.41015625, + "learning_rate": 0.001885602441528624, + "loss": 0.127, + "step": 16414 + }, + { + "epoch": 0.14249008255136675, + "grad_norm": 0.470703125, + "learning_rate": 0.0018855879195742592, + "loss": 0.1108, + "step": 16415 + }, + { + "epoch": 0.1424987630315709, + "grad_norm": 0.240234375, + "learning_rate": 0.0018855733967607874, + "loss": 0.0752, + "step": 16416 + }, + { + "epoch": 0.14250744351177508, + "grad_norm": 0.09814453125, + "learning_rate": 0.0018855588730882237, + "loss": 0.0967, + "step": 16417 + }, + { + "epoch": 0.14251612399197924, + "grad_norm": 0.177734375, + "learning_rate": 0.0018855443485565854, + "loss": 0.1074, + "step": 16418 + }, + { + "epoch": 0.1425248044721834, + "grad_norm": 0.11279296875, + "learning_rate": 0.0018855298231658874, + "loss": 0.1025, + "step": 16419 + }, + { + "epoch": 0.14253348495238757, + "grad_norm": 0.77734375, + "learning_rate": 0.0018855152969161455, + "loss": 0.1416, + "step": 16420 + }, + { + "epoch": 0.14254216543259174, + "grad_norm": 0.2158203125, + "learning_rate": 0.001885500769807376, + "loss": 0.1191, + "step": 16421 + }, + { + "epoch": 0.1425508459127959, + "grad_norm": 0.1328125, + "learning_rate": 0.0018854862418395948, + "loss": 0.1289, + "step": 16422 + }, + { + "epoch": 0.14255952639300007, + "grad_norm": 0.08984375, + "learning_rate": 0.0018854717130128175, + "loss": 0.1162, + "step": 16423 + }, + { + "epoch": 0.14256820687320423, + "grad_norm": 0.4140625, + "learning_rate": 0.0018854571833270602, + "loss": 0.1191, + "step": 16424 + }, + { + "epoch": 0.1425768873534084, + "grad_norm": 0.220703125, + "learning_rate": 0.0018854426527823388, + "loss": 0.1504, + "step": 16425 + }, + { + "epoch": 0.14258556783361256, + "grad_norm": 0.1376953125, + "learning_rate": 0.0018854281213786692, + "loss": 0.127, + "step": 16426 + }, + { + "epoch": 0.14259424831381673, + "grad_norm": 0.30859375, + "learning_rate": 0.0018854135891160668, + "loss": 0.1235, + "step": 16427 + }, + { + "epoch": 0.1426029287940209, + "grad_norm": 0.66796875, + "learning_rate": 0.0018853990559945483, + "loss": 0.1182, + "step": 16428 + }, + { + "epoch": 0.14261160927422506, + "grad_norm": 0.1357421875, + "learning_rate": 0.001885384522014129, + "loss": 0.1113, + "step": 16429 + }, + { + "epoch": 
0.14262028975442922, + "grad_norm": 0.08447265625, + "learning_rate": 0.0018853699871748252, + "loss": 0.103, + "step": 16430 + }, + { + "epoch": 0.1426289702346334, + "grad_norm": 0.134765625, + "learning_rate": 0.0018853554514766523, + "loss": 0.127, + "step": 16431 + }, + { + "epoch": 0.14263765071483755, + "grad_norm": 0.80859375, + "learning_rate": 0.0018853409149196265, + "loss": 0.1387, + "step": 16432 + }, + { + "epoch": 0.14264633119504172, + "grad_norm": 0.1904296875, + "learning_rate": 0.0018853263775037638, + "loss": 0.0864, + "step": 16433 + }, + { + "epoch": 0.14265501167524589, + "grad_norm": 0.0791015625, + "learning_rate": 0.00188531183922908, + "loss": 0.1138, + "step": 16434 + }, + { + "epoch": 0.14266369215545005, + "grad_norm": 0.0859375, + "learning_rate": 0.001885297300095591, + "loss": 0.0996, + "step": 16435 + }, + { + "epoch": 0.14267237263565422, + "grad_norm": 0.474609375, + "learning_rate": 0.0018852827601033124, + "loss": 0.1221, + "step": 16436 + }, + { + "epoch": 0.14268105311585838, + "grad_norm": 0.373046875, + "learning_rate": 0.0018852682192522605, + "loss": 0.0752, + "step": 16437 + }, + { + "epoch": 0.14268973359606255, + "grad_norm": 0.359375, + "learning_rate": 0.0018852536775424511, + "loss": 0.1484, + "step": 16438 + }, + { + "epoch": 0.1426984140762667, + "grad_norm": 0.63671875, + "learning_rate": 0.0018852391349739003, + "loss": 0.127, + "step": 16439 + }, + { + "epoch": 0.14270709455647088, + "grad_norm": 1.8828125, + "learning_rate": 0.0018852245915466235, + "loss": 0.1162, + "step": 16440 + }, + { + "epoch": 0.14271577503667504, + "grad_norm": 0.85546875, + "learning_rate": 0.001885210047260637, + "loss": 0.1162, + "step": 16441 + }, + { + "epoch": 0.1427244555168792, + "grad_norm": 0.34765625, + "learning_rate": 0.0018851955021159566, + "loss": 0.1133, + "step": 16442 + }, + { + "epoch": 0.14273313599708337, + "grad_norm": 0.20703125, + "learning_rate": 0.0018851809561125981, + "loss": 0.1533, + "step": 16443 + }, + { + "epoch": 0.14274181647728754, + "grad_norm": 0.578125, + "learning_rate": 0.0018851664092505778, + "loss": 0.1416, + "step": 16444 + }, + { + "epoch": 0.1427504969574917, + "grad_norm": 0.12060546875, + "learning_rate": 0.0018851518615299108, + "loss": 0.0962, + "step": 16445 + }, + { + "epoch": 0.14275917743769587, + "grad_norm": 0.54296875, + "learning_rate": 0.001885137312950614, + "loss": 0.1113, + "step": 16446 + }, + { + "epoch": 0.14276785791790003, + "grad_norm": 0.8046875, + "learning_rate": 0.0018851227635127024, + "loss": 0.1108, + "step": 16447 + }, + { + "epoch": 0.1427765383981042, + "grad_norm": 0.0888671875, + "learning_rate": 0.0018851082132161927, + "loss": 0.0947, + "step": 16448 + }, + { + "epoch": 0.14278521887830833, + "grad_norm": 0.11572265625, + "learning_rate": 0.0018850936620611002, + "loss": 0.0918, + "step": 16449 + }, + { + "epoch": 0.1427938993585125, + "grad_norm": 0.1728515625, + "learning_rate": 0.0018850791100474414, + "loss": 0.1699, + "step": 16450 + }, + { + "epoch": 0.14280257983871666, + "grad_norm": 0.255859375, + "learning_rate": 0.0018850645571752316, + "loss": 0.0806, + "step": 16451 + }, + { + "epoch": 0.14281126031892083, + "grad_norm": 0.12890625, + "learning_rate": 0.001885050003444487, + "loss": 0.1338, + "step": 16452 + }, + { + "epoch": 0.142819940799125, + "grad_norm": 0.205078125, + "learning_rate": 0.0018850354488552238, + "loss": 0.0967, + "step": 16453 + }, + { + "epoch": 0.14282862127932916, + "grad_norm": 0.3125, + "learning_rate": 0.001885020893407457, + "loss": 
0.0996, + "step": 16454 + }, + { + "epoch": 0.14283730175953332, + "grad_norm": 1.03125, + "learning_rate": 0.0018850063371012038, + "loss": 0.1084, + "step": 16455 + }, + { + "epoch": 0.1428459822397375, + "grad_norm": 0.1640625, + "learning_rate": 0.001884991779936479, + "loss": 0.0977, + "step": 16456 + }, + { + "epoch": 0.14285466271994166, + "grad_norm": 0.55859375, + "learning_rate": 0.0018849772219132993, + "loss": 0.126, + "step": 16457 + }, + { + "epoch": 0.14286334320014582, + "grad_norm": 0.255859375, + "learning_rate": 0.0018849626630316801, + "loss": 0.1445, + "step": 16458 + }, + { + "epoch": 0.14287202368034999, + "grad_norm": 0.1630859375, + "learning_rate": 0.0018849481032916378, + "loss": 0.0869, + "step": 16459 + }, + { + "epoch": 0.14288070416055415, + "grad_norm": 0.173828125, + "learning_rate": 0.0018849335426931878, + "loss": 0.1035, + "step": 16460 + }, + { + "epoch": 0.14288938464075832, + "grad_norm": 0.369140625, + "learning_rate": 0.0018849189812363465, + "loss": 0.1328, + "step": 16461 + }, + { + "epoch": 0.14289806512096248, + "grad_norm": 0.6328125, + "learning_rate": 0.0018849044189211293, + "loss": 0.0952, + "step": 16462 + }, + { + "epoch": 0.14290674560116665, + "grad_norm": 0.404296875, + "learning_rate": 0.0018848898557475525, + "loss": 0.0918, + "step": 16463 + }, + { + "epoch": 0.1429154260813708, + "grad_norm": 0.447265625, + "learning_rate": 0.001884875291715632, + "loss": 0.2246, + "step": 16464 + }, + { + "epoch": 0.14292410656157498, + "grad_norm": 0.4140625, + "learning_rate": 0.0018848607268253839, + "loss": 0.1016, + "step": 16465 + }, + { + "epoch": 0.14293278704177914, + "grad_norm": 0.2265625, + "learning_rate": 0.0018848461610768237, + "loss": 0.1367, + "step": 16466 + }, + { + "epoch": 0.1429414675219833, + "grad_norm": 0.302734375, + "learning_rate": 0.0018848315944699674, + "loss": 0.1484, + "step": 16467 + }, + { + "epoch": 0.14295014800218747, + "grad_norm": 0.208984375, + "learning_rate": 0.0018848170270048313, + "loss": 0.123, + "step": 16468 + }, + { + "epoch": 0.14295882848239164, + "grad_norm": 0.1416015625, + "learning_rate": 0.0018848024586814308, + "loss": 0.1592, + "step": 16469 + }, + { + "epoch": 0.1429675089625958, + "grad_norm": 0.63671875, + "learning_rate": 0.0018847878894997826, + "loss": 0.3047, + "step": 16470 + }, + { + "epoch": 0.14297618944279997, + "grad_norm": 0.41015625, + "learning_rate": 0.0018847733194599014, + "loss": 0.1006, + "step": 16471 + }, + { + "epoch": 0.14298486992300413, + "grad_norm": 0.62109375, + "learning_rate": 0.0018847587485618046, + "loss": 0.1201, + "step": 16472 + }, + { + "epoch": 0.1429935504032083, + "grad_norm": 0.2255859375, + "learning_rate": 0.001884744176805507, + "loss": 0.126, + "step": 16473 + }, + { + "epoch": 0.14300223088341246, + "grad_norm": 0.2392578125, + "learning_rate": 0.001884729604191025, + "loss": 0.1025, + "step": 16474 + }, + { + "epoch": 0.14301091136361663, + "grad_norm": 0.90234375, + "learning_rate": 0.0018847150307183748, + "loss": 0.165, + "step": 16475 + }, + { + "epoch": 0.1430195918438208, + "grad_norm": 0.88671875, + "learning_rate": 0.0018847004563875718, + "loss": 0.1016, + "step": 16476 + }, + { + "epoch": 0.14302827232402496, + "grad_norm": 0.10205078125, + "learning_rate": 0.0018846858811986324, + "loss": 0.1113, + "step": 16477 + }, + { + "epoch": 0.14303695280422912, + "grad_norm": 1.671875, + "learning_rate": 0.001884671305151572, + "loss": 0.3145, + "step": 16478 + }, + { + "epoch": 0.1430456332844333, + "grad_norm": 0.42578125, + 
"learning_rate": 0.001884656728246407, + "loss": 0.083, + "step": 16479 + }, + { + "epoch": 0.14305431376463745, + "grad_norm": 0.09912109375, + "learning_rate": 0.0018846421504831532, + "loss": 0.0854, + "step": 16480 + }, + { + "epoch": 0.14306299424484162, + "grad_norm": 0.4375, + "learning_rate": 0.0018846275718618264, + "loss": 0.1108, + "step": 16481 + }, + { + "epoch": 0.14307167472504578, + "grad_norm": 0.5703125, + "learning_rate": 0.0018846129923824429, + "loss": 0.1084, + "step": 16482 + }, + { + "epoch": 0.14308035520524995, + "grad_norm": 0.421875, + "learning_rate": 0.0018845984120450185, + "loss": 0.1582, + "step": 16483 + }, + { + "epoch": 0.1430890356854541, + "grad_norm": 0.71484375, + "learning_rate": 0.0018845838308495686, + "loss": 0.125, + "step": 16484 + }, + { + "epoch": 0.14309771616565828, + "grad_norm": 0.1904296875, + "learning_rate": 0.00188456924879611, + "loss": 0.0967, + "step": 16485 + }, + { + "epoch": 0.14310639664586244, + "grad_norm": 0.11083984375, + "learning_rate": 0.0018845546658846583, + "loss": 0.083, + "step": 16486 + }, + { + "epoch": 0.1431150771260666, + "grad_norm": 0.498046875, + "learning_rate": 0.001884540082115229, + "loss": 0.1641, + "step": 16487 + }, + { + "epoch": 0.14312375760627077, + "grad_norm": 0.9921875, + "learning_rate": 0.0018845254974878386, + "loss": 0.1426, + "step": 16488 + }, + { + "epoch": 0.14313243808647494, + "grad_norm": 0.4296875, + "learning_rate": 0.0018845109120025033, + "loss": 0.1406, + "step": 16489 + }, + { + "epoch": 0.1431411185666791, + "grad_norm": 0.5546875, + "learning_rate": 0.001884496325659238, + "loss": 0.1182, + "step": 16490 + }, + { + "epoch": 0.14314979904688327, + "grad_norm": 0.5390625, + "learning_rate": 0.0018844817384580598, + "loss": 0.1025, + "step": 16491 + }, + { + "epoch": 0.14315847952708743, + "grad_norm": 0.384765625, + "learning_rate": 0.0018844671503989842, + "loss": 0.1055, + "step": 16492 + }, + { + "epoch": 0.1431671600072916, + "grad_norm": 0.1640625, + "learning_rate": 0.0018844525614820267, + "loss": 0.1196, + "step": 16493 + }, + { + "epoch": 0.14317584048749576, + "grad_norm": 0.546875, + "learning_rate": 0.0018844379717072037, + "loss": 0.0845, + "step": 16494 + }, + { + "epoch": 0.14318452096769993, + "grad_norm": 0.455078125, + "learning_rate": 0.0018844233810745314, + "loss": 0.0913, + "step": 16495 + }, + { + "epoch": 0.1431932014479041, + "grad_norm": 0.62109375, + "learning_rate": 0.0018844087895840257, + "loss": 0.1562, + "step": 16496 + }, + { + "epoch": 0.14320188192810826, + "grad_norm": 0.26171875, + "learning_rate": 0.001884394197235702, + "loss": 0.1279, + "step": 16497 + }, + { + "epoch": 0.14321056240831243, + "grad_norm": 0.162109375, + "learning_rate": 0.0018843796040295764, + "loss": 0.1191, + "step": 16498 + }, + { + "epoch": 0.1432192428885166, + "grad_norm": 0.53125, + "learning_rate": 0.0018843650099656653, + "loss": 0.123, + "step": 16499 + }, + { + "epoch": 0.14322792336872076, + "grad_norm": 0.66796875, + "learning_rate": 0.0018843504150439843, + "loss": 0.1108, + "step": 16500 + }, + { + "epoch": 0.14323660384892492, + "grad_norm": 0.08056640625, + "learning_rate": 0.0018843358192645494, + "loss": 0.103, + "step": 16501 + }, + { + "epoch": 0.14324528432912909, + "grad_norm": 0.140625, + "learning_rate": 0.001884321222627377, + "loss": 0.0752, + "step": 16502 + }, + { + "epoch": 0.14325396480933325, + "grad_norm": 0.5703125, + "learning_rate": 0.0018843066251324825, + "loss": 0.1562, + "step": 16503 + }, + { + "epoch": 0.14326264528953742, + 
"grad_norm": 0.287109375, + "learning_rate": 0.001884292026779882, + "loss": 0.1279, + "step": 16504 + }, + { + "epoch": 0.14327132576974158, + "grad_norm": 0.19921875, + "learning_rate": 0.0018842774275695915, + "loss": 0.1328, + "step": 16505 + }, + { + "epoch": 0.14328000624994575, + "grad_norm": 0.19921875, + "learning_rate": 0.001884262827501627, + "loss": 0.1211, + "step": 16506 + }, + { + "epoch": 0.1432886867301499, + "grad_norm": 0.265625, + "learning_rate": 0.0018842482265760045, + "loss": 0.1318, + "step": 16507 + }, + { + "epoch": 0.14329736721035408, + "grad_norm": 0.32421875, + "learning_rate": 0.0018842336247927398, + "loss": 0.1465, + "step": 16508 + }, + { + "epoch": 0.14330604769055824, + "grad_norm": 0.2412109375, + "learning_rate": 0.0018842190221518493, + "loss": 0.126, + "step": 16509 + }, + { + "epoch": 0.1433147281707624, + "grad_norm": 0.453125, + "learning_rate": 0.0018842044186533485, + "loss": 0.1079, + "step": 16510 + }, + { + "epoch": 0.14332340865096657, + "grad_norm": 0.216796875, + "learning_rate": 0.0018841898142972536, + "loss": 0.1133, + "step": 16511 + }, + { + "epoch": 0.14333208913117074, + "grad_norm": 0.2099609375, + "learning_rate": 0.0018841752090835802, + "loss": 0.1279, + "step": 16512 + }, + { + "epoch": 0.1433407696113749, + "grad_norm": 0.111328125, + "learning_rate": 0.0018841606030123449, + "loss": 0.1074, + "step": 16513 + }, + { + "epoch": 0.14334945009157907, + "grad_norm": 0.0859375, + "learning_rate": 0.0018841459960835631, + "loss": 0.1089, + "step": 16514 + }, + { + "epoch": 0.14335813057178323, + "grad_norm": 0.35546875, + "learning_rate": 0.0018841313882972512, + "loss": 0.127, + "step": 16515 + }, + { + "epoch": 0.1433668110519874, + "grad_norm": 0.373046875, + "learning_rate": 0.0018841167796534249, + "loss": 0.1543, + "step": 16516 + }, + { + "epoch": 0.14337549153219156, + "grad_norm": 0.130859375, + "learning_rate": 0.0018841021701521002, + "loss": 0.083, + "step": 16517 + }, + { + "epoch": 0.14338417201239573, + "grad_norm": 0.61328125, + "learning_rate": 0.0018840875597932933, + "loss": 0.1226, + "step": 16518 + }, + { + "epoch": 0.1433928524925999, + "grad_norm": 0.337890625, + "learning_rate": 0.00188407294857702, + "loss": 0.0889, + "step": 16519 + }, + { + "epoch": 0.14340153297280406, + "grad_norm": 0.1357421875, + "learning_rate": 0.0018840583365032965, + "loss": 0.0977, + "step": 16520 + }, + { + "epoch": 0.14341021345300822, + "grad_norm": 0.119140625, + "learning_rate": 0.0018840437235721384, + "loss": 0.1426, + "step": 16521 + }, + { + "epoch": 0.1434188939332124, + "grad_norm": 0.263671875, + "learning_rate": 0.0018840291097835615, + "loss": 0.124, + "step": 16522 + }, + { + "epoch": 0.14342757441341655, + "grad_norm": 0.3203125, + "learning_rate": 0.0018840144951375827, + "loss": 0.126, + "step": 16523 + }, + { + "epoch": 0.14343625489362072, + "grad_norm": 0.1796875, + "learning_rate": 0.001883999879634217, + "loss": 0.1025, + "step": 16524 + }, + { + "epoch": 0.14344493537382488, + "grad_norm": 0.87890625, + "learning_rate": 0.0018839852632734813, + "loss": 0.1436, + "step": 16525 + }, + { + "epoch": 0.14345361585402905, + "grad_norm": 0.498046875, + "learning_rate": 0.0018839706460553906, + "loss": 0.1289, + "step": 16526 + }, + { + "epoch": 0.1434622963342332, + "grad_norm": 0.375, + "learning_rate": 0.0018839560279799614, + "loss": 0.1196, + "step": 16527 + }, + { + "epoch": 0.14347097681443738, + "grad_norm": 0.384765625, + "learning_rate": 0.00188394140904721, + "loss": 0.1289, + "step": 16528 + }, + { + 
"epoch": 0.14347965729464154, + "grad_norm": 0.328125, + "learning_rate": 0.0018839267892571522, + "loss": 0.1426, + "step": 16529 + }, + { + "epoch": 0.1434883377748457, + "grad_norm": 0.353515625, + "learning_rate": 0.0018839121686098036, + "loss": 0.1201, + "step": 16530 + }, + { + "epoch": 0.14349701825504987, + "grad_norm": 0.13671875, + "learning_rate": 0.0018838975471051802, + "loss": 0.0957, + "step": 16531 + }, + { + "epoch": 0.14350569873525404, + "grad_norm": 0.1728515625, + "learning_rate": 0.0018838829247432983, + "loss": 0.0947, + "step": 16532 + }, + { + "epoch": 0.1435143792154582, + "grad_norm": 0.85546875, + "learning_rate": 0.0018838683015241738, + "loss": 0.1523, + "step": 16533 + }, + { + "epoch": 0.14352305969566237, + "grad_norm": 0.57421875, + "learning_rate": 0.0018838536774478233, + "loss": 0.0981, + "step": 16534 + }, + { + "epoch": 0.14353174017586653, + "grad_norm": 0.65234375, + "learning_rate": 0.0018838390525142614, + "loss": 0.1406, + "step": 16535 + }, + { + "epoch": 0.1435404206560707, + "grad_norm": 0.609375, + "learning_rate": 0.0018838244267235051, + "loss": 0.1465, + "step": 16536 + }, + { + "epoch": 0.14354910113627486, + "grad_norm": 0.2177734375, + "learning_rate": 0.0018838098000755704, + "loss": 0.0903, + "step": 16537 + }, + { + "epoch": 0.14355778161647903, + "grad_norm": 0.349609375, + "learning_rate": 0.0018837951725704728, + "loss": 0.1279, + "step": 16538 + }, + { + "epoch": 0.1435664620966832, + "grad_norm": 1.3515625, + "learning_rate": 0.0018837805442082289, + "loss": 0.0981, + "step": 16539 + }, + { + "epoch": 0.14357514257688736, + "grad_norm": 0.1201171875, + "learning_rate": 0.0018837659149888541, + "loss": 0.1245, + "step": 16540 + }, + { + "epoch": 0.14358382305709153, + "grad_norm": 0.67578125, + "learning_rate": 0.001883751284912365, + "loss": 0.1191, + "step": 16541 + }, + { + "epoch": 0.1435925035372957, + "grad_norm": 0.486328125, + "learning_rate": 0.0018837366539787767, + "loss": 0.1162, + "step": 16542 + }, + { + "epoch": 0.14360118401749986, + "grad_norm": 0.2001953125, + "learning_rate": 0.0018837220221881064, + "loss": 0.0967, + "step": 16543 + }, + { + "epoch": 0.14360986449770402, + "grad_norm": 0.392578125, + "learning_rate": 0.0018837073895403692, + "loss": 0.1162, + "step": 16544 + }, + { + "epoch": 0.14361854497790819, + "grad_norm": 1.5078125, + "learning_rate": 0.0018836927560355815, + "loss": 0.1504, + "step": 16545 + }, + { + "epoch": 0.14362722545811235, + "grad_norm": 0.75, + "learning_rate": 0.001883678121673759, + "loss": 0.1562, + "step": 16546 + }, + { + "epoch": 0.14363590593831652, + "grad_norm": 0.765625, + "learning_rate": 0.001883663486454918, + "loss": 0.1338, + "step": 16547 + }, + { + "epoch": 0.14364458641852068, + "grad_norm": 0.1708984375, + "learning_rate": 0.0018836488503790744, + "loss": 0.1074, + "step": 16548 + }, + { + "epoch": 0.14365326689872485, + "grad_norm": 0.1865234375, + "learning_rate": 0.0018836342134462439, + "loss": 0.1143, + "step": 16549 + }, + { + "epoch": 0.143661947378929, + "grad_norm": 0.375, + "learning_rate": 0.001883619575656443, + "loss": 0.0771, + "step": 16550 + }, + { + "epoch": 0.14367062785913318, + "grad_norm": 0.193359375, + "learning_rate": 0.0018836049370096876, + "loss": 0.1172, + "step": 16551 + }, + { + "epoch": 0.14367930833933734, + "grad_norm": 0.486328125, + "learning_rate": 0.0018835902975059935, + "loss": 0.0991, + "step": 16552 + }, + { + "epoch": 0.1436879888195415, + "grad_norm": 1.140625, + "learning_rate": 0.001883575657145377, + "loss": 
0.127, + "step": 16553 + }, + { + "epoch": 0.14369666929974567, + "grad_norm": 0.11083984375, + "learning_rate": 0.0018835610159278536, + "loss": 0.1035, + "step": 16554 + }, + { + "epoch": 0.14370534977994984, + "grad_norm": 0.52734375, + "learning_rate": 0.00188354637385344, + "loss": 0.1152, + "step": 16555 + }, + { + "epoch": 0.143714030260154, + "grad_norm": 0.78125, + "learning_rate": 0.0018835317309221515, + "loss": 0.0977, + "step": 16556 + }, + { + "epoch": 0.14372271074035817, + "grad_norm": 0.703125, + "learning_rate": 0.0018835170871340048, + "loss": 0.084, + "step": 16557 + }, + { + "epoch": 0.14373139122056233, + "grad_norm": 0.1962890625, + "learning_rate": 0.0018835024424890153, + "loss": 0.1055, + "step": 16558 + }, + { + "epoch": 0.1437400717007665, + "grad_norm": 0.5, + "learning_rate": 0.0018834877969871994, + "loss": 0.1152, + "step": 16559 + }, + { + "epoch": 0.14374875218097066, + "grad_norm": 0.1904296875, + "learning_rate": 0.001883473150628573, + "loss": 0.0991, + "step": 16560 + }, + { + "epoch": 0.14375743266117483, + "grad_norm": 0.298828125, + "learning_rate": 0.0018834585034131525, + "loss": 0.1074, + "step": 16561 + }, + { + "epoch": 0.143766113141379, + "grad_norm": 0.1533203125, + "learning_rate": 0.0018834438553409533, + "loss": 0.1377, + "step": 16562 + }, + { + "epoch": 0.14377479362158316, + "grad_norm": 0.15234375, + "learning_rate": 0.0018834292064119915, + "loss": 0.1719, + "step": 16563 + }, + { + "epoch": 0.14378347410178732, + "grad_norm": 0.19921875, + "learning_rate": 0.0018834145566262834, + "loss": 0.1006, + "step": 16564 + }, + { + "epoch": 0.1437921545819915, + "grad_norm": 0.33203125, + "learning_rate": 0.0018833999059838448, + "loss": 0.1299, + "step": 16565 + }, + { + "epoch": 0.14380083506219565, + "grad_norm": 1.265625, + "learning_rate": 0.0018833852544846918, + "loss": 0.1436, + "step": 16566 + }, + { + "epoch": 0.14380951554239982, + "grad_norm": 0.2392578125, + "learning_rate": 0.001883370602128841, + "loss": 0.0938, + "step": 16567 + }, + { + "epoch": 0.14381819602260398, + "grad_norm": 1.359375, + "learning_rate": 0.0018833559489163075, + "loss": 0.1104, + "step": 16568 + }, + { + "epoch": 0.14382687650280815, + "grad_norm": 0.640625, + "learning_rate": 0.0018833412948471076, + "loss": 0.1011, + "step": 16569 + }, + { + "epoch": 0.1438355569830123, + "grad_norm": 0.283203125, + "learning_rate": 0.0018833266399212573, + "loss": 0.0986, + "step": 16570 + }, + { + "epoch": 0.14384423746321648, + "grad_norm": 0.458984375, + "learning_rate": 0.0018833119841387732, + "loss": 0.1309, + "step": 16571 + }, + { + "epoch": 0.14385291794342062, + "grad_norm": 0.109375, + "learning_rate": 0.0018832973274996708, + "loss": 0.1162, + "step": 16572 + }, + { + "epoch": 0.14386159842362478, + "grad_norm": 0.71484375, + "learning_rate": 0.0018832826700039657, + "loss": 0.1064, + "step": 16573 + }, + { + "epoch": 0.14387027890382895, + "grad_norm": 0.27734375, + "learning_rate": 0.001883268011651675, + "loss": 0.123, + "step": 16574 + }, + { + "epoch": 0.1438789593840331, + "grad_norm": 0.80859375, + "learning_rate": 0.001883253352442814, + "loss": 0.1602, + "step": 16575 + }, + { + "epoch": 0.14388763986423728, + "grad_norm": 0.1943359375, + "learning_rate": 0.0018832386923773993, + "loss": 0.0957, + "step": 16576 + }, + { + "epoch": 0.14389632034444144, + "grad_norm": 1.5234375, + "learning_rate": 0.0018832240314554459, + "loss": 0.1318, + "step": 16577 + }, + { + "epoch": 0.1439050008246456, + "grad_norm": 0.70703125, + "learning_rate": 
0.0018832093696769705, + "loss": 0.1104, + "step": 16578 + }, + { + "epoch": 0.14391368130484977, + "grad_norm": 0.73828125, + "learning_rate": 0.0018831947070419896, + "loss": 0.082, + "step": 16579 + }, + { + "epoch": 0.14392236178505394, + "grad_norm": 0.2119140625, + "learning_rate": 0.0018831800435505181, + "loss": 0.127, + "step": 16580 + }, + { + "epoch": 0.1439310422652581, + "grad_norm": 0.11669921875, + "learning_rate": 0.0018831653792025732, + "loss": 0.1035, + "step": 16581 + }, + { + "epoch": 0.14393972274546227, + "grad_norm": 0.361328125, + "learning_rate": 0.0018831507139981702, + "loss": 0.1309, + "step": 16582 + }, + { + "epoch": 0.14394840322566643, + "grad_norm": 0.44921875, + "learning_rate": 0.0018831360479373254, + "loss": 0.1738, + "step": 16583 + }, + { + "epoch": 0.1439570837058706, + "grad_norm": 0.439453125, + "learning_rate": 0.0018831213810200547, + "loss": 0.1416, + "step": 16584 + }, + { + "epoch": 0.14396576418607476, + "grad_norm": 0.09326171875, + "learning_rate": 0.0018831067132463741, + "loss": 0.127, + "step": 16585 + }, + { + "epoch": 0.14397444466627893, + "grad_norm": 1.296875, + "learning_rate": 0.0018830920446163003, + "loss": 0.1777, + "step": 16586 + }, + { + "epoch": 0.1439831251464831, + "grad_norm": 0.26953125, + "learning_rate": 0.0018830773751298481, + "loss": 0.1045, + "step": 16587 + }, + { + "epoch": 0.14399180562668726, + "grad_norm": 0.40234375, + "learning_rate": 0.0018830627047870345, + "loss": 0.1123, + "step": 16588 + }, + { + "epoch": 0.14400048610689142, + "grad_norm": 0.37109375, + "learning_rate": 0.0018830480335878755, + "loss": 0.0889, + "step": 16589 + }, + { + "epoch": 0.1440091665870956, + "grad_norm": 0.1533203125, + "learning_rate": 0.0018830333615323866, + "loss": 0.0977, + "step": 16590 + }, + { + "epoch": 0.14401784706729975, + "grad_norm": 0.140625, + "learning_rate": 0.0018830186886205848, + "loss": 0.1367, + "step": 16591 + }, + { + "epoch": 0.14402652754750392, + "grad_norm": 0.126953125, + "learning_rate": 0.0018830040148524849, + "loss": 0.1543, + "step": 16592 + }, + { + "epoch": 0.14403520802770808, + "grad_norm": 0.10986328125, + "learning_rate": 0.001882989340228104, + "loss": 0.0864, + "step": 16593 + }, + { + "epoch": 0.14404388850791225, + "grad_norm": 0.376953125, + "learning_rate": 0.0018829746647474575, + "loss": 0.083, + "step": 16594 + }, + { + "epoch": 0.14405256898811641, + "grad_norm": 0.76171875, + "learning_rate": 0.0018829599884105618, + "loss": 0.1069, + "step": 16595 + }, + { + "epoch": 0.14406124946832058, + "grad_norm": 0.1865234375, + "learning_rate": 0.0018829453112174325, + "loss": 0.0996, + "step": 16596 + }, + { + "epoch": 0.14406992994852474, + "grad_norm": 0.412109375, + "learning_rate": 0.0018829306331680862, + "loss": 0.1396, + "step": 16597 + }, + { + "epoch": 0.1440786104287289, + "grad_norm": 1.0625, + "learning_rate": 0.0018829159542625385, + "loss": 0.1416, + "step": 16598 + }, + { + "epoch": 0.14408729090893307, + "grad_norm": 0.37109375, + "learning_rate": 0.001882901274500806, + "loss": 0.1152, + "step": 16599 + }, + { + "epoch": 0.14409597138913724, + "grad_norm": 0.2060546875, + "learning_rate": 0.0018828865938829043, + "loss": 0.1348, + "step": 16600 + }, + { + "epoch": 0.1441046518693414, + "grad_norm": 0.2578125, + "learning_rate": 0.0018828719124088498, + "loss": 0.1045, + "step": 16601 + }, + { + "epoch": 0.14411333234954557, + "grad_norm": 0.7109375, + "learning_rate": 0.001882857230078658, + "loss": 0.062, + "step": 16602 + }, + { + "epoch": 0.14412201282974973, + 
"grad_norm": 0.1025390625, + "learning_rate": 0.0018828425468923455, + "loss": 0.1191, + "step": 16603 + }, + { + "epoch": 0.1441306933099539, + "grad_norm": 0.1103515625, + "learning_rate": 0.0018828278628499281, + "loss": 0.0952, + "step": 16604 + }, + { + "epoch": 0.14413937379015807, + "grad_norm": 0.765625, + "learning_rate": 0.0018828131779514217, + "loss": 0.1172, + "step": 16605 + }, + { + "epoch": 0.14414805427036223, + "grad_norm": 0.84375, + "learning_rate": 0.001882798492196843, + "loss": 0.1504, + "step": 16606 + }, + { + "epoch": 0.1441567347505664, + "grad_norm": 0.361328125, + "learning_rate": 0.0018827838055862072, + "loss": 0.125, + "step": 16607 + }, + { + "epoch": 0.14416541523077056, + "grad_norm": 0.1357421875, + "learning_rate": 0.001882769118119531, + "loss": 0.1025, + "step": 16608 + }, + { + "epoch": 0.14417409571097473, + "grad_norm": 0.08056640625, + "learning_rate": 0.0018827544297968302, + "loss": 0.1074, + "step": 16609 + }, + { + "epoch": 0.1441827761911789, + "grad_norm": 0.384765625, + "learning_rate": 0.001882739740618121, + "loss": 0.1543, + "step": 16610 + }, + { + "epoch": 0.14419145667138306, + "grad_norm": 0.2197265625, + "learning_rate": 0.0018827250505834194, + "loss": 0.1055, + "step": 16611 + }, + { + "epoch": 0.14420013715158722, + "grad_norm": 0.62109375, + "learning_rate": 0.0018827103596927414, + "loss": 0.0781, + "step": 16612 + }, + { + "epoch": 0.1442088176317914, + "grad_norm": 0.11669921875, + "learning_rate": 0.001882695667946103, + "loss": 0.0854, + "step": 16613 + }, + { + "epoch": 0.14421749811199555, + "grad_norm": 0.275390625, + "learning_rate": 0.0018826809753435204, + "loss": 0.1133, + "step": 16614 + }, + { + "epoch": 0.14422617859219972, + "grad_norm": 0.59765625, + "learning_rate": 0.0018826662818850098, + "loss": 0.1152, + "step": 16615 + }, + { + "epoch": 0.14423485907240388, + "grad_norm": 0.349609375, + "learning_rate": 0.001882651587570587, + "loss": 0.1416, + "step": 16616 + }, + { + "epoch": 0.14424353955260805, + "grad_norm": 0.390625, + "learning_rate": 0.001882636892400268, + "loss": 0.0957, + "step": 16617 + }, + { + "epoch": 0.1442522200328122, + "grad_norm": 0.22265625, + "learning_rate": 0.0018826221963740696, + "loss": 0.0947, + "step": 16618 + }, + { + "epoch": 0.14426090051301638, + "grad_norm": 0.255859375, + "learning_rate": 0.001882607499492007, + "loss": 0.1484, + "step": 16619 + }, + { + "epoch": 0.14426958099322054, + "grad_norm": 0.365234375, + "learning_rate": 0.0018825928017540963, + "loss": 0.0981, + "step": 16620 + }, + { + "epoch": 0.1442782614734247, + "grad_norm": 0.671875, + "learning_rate": 0.0018825781031603544, + "loss": 0.1123, + "step": 16621 + }, + { + "epoch": 0.14428694195362887, + "grad_norm": 0.06884765625, + "learning_rate": 0.0018825634037107964, + "loss": 0.0791, + "step": 16622 + }, + { + "epoch": 0.14429562243383304, + "grad_norm": 1.21875, + "learning_rate": 0.001882548703405439, + "loss": 0.1484, + "step": 16623 + }, + { + "epoch": 0.1443043029140372, + "grad_norm": 0.65234375, + "learning_rate": 0.0018825340022442982, + "loss": 0.1162, + "step": 16624 + }, + { + "epoch": 0.14431298339424137, + "grad_norm": 0.310546875, + "learning_rate": 0.0018825193002273899, + "loss": 0.0879, + "step": 16625 + }, + { + "epoch": 0.14432166387444553, + "grad_norm": 0.08154296875, + "learning_rate": 0.00188250459735473, + "loss": 0.1001, + "step": 16626 + }, + { + "epoch": 0.1443303443546497, + "grad_norm": 0.58984375, + "learning_rate": 0.0018824898936263353, + "loss": 0.1191, + "step": 16627 + 
}, + { + "epoch": 0.14433902483485386, + "grad_norm": 0.2099609375, + "learning_rate": 0.001882475189042221, + "loss": 0.1035, + "step": 16628 + }, + { + "epoch": 0.14434770531505803, + "grad_norm": 0.291015625, + "learning_rate": 0.0018824604836024039, + "loss": 0.1299, + "step": 16629 + }, + { + "epoch": 0.1443563857952622, + "grad_norm": 0.267578125, + "learning_rate": 0.0018824457773068998, + "loss": 0.1094, + "step": 16630 + }, + { + "epoch": 0.14436506627546636, + "grad_norm": 0.24609375, + "learning_rate": 0.0018824310701557243, + "loss": 0.1299, + "step": 16631 + }, + { + "epoch": 0.14437374675567052, + "grad_norm": 0.384765625, + "learning_rate": 0.0018824163621488942, + "loss": 0.1035, + "step": 16632 + }, + { + "epoch": 0.1443824272358747, + "grad_norm": 0.22265625, + "learning_rate": 0.0018824016532864257, + "loss": 0.125, + "step": 16633 + }, + { + "epoch": 0.14439110771607885, + "grad_norm": 0.2373046875, + "learning_rate": 0.0018823869435683341, + "loss": 0.1064, + "step": 16634 + }, + { + "epoch": 0.14439978819628302, + "grad_norm": 0.15625, + "learning_rate": 0.0018823722329946357, + "loss": 0.0796, + "step": 16635 + }, + { + "epoch": 0.14440846867648718, + "grad_norm": 0.240234375, + "learning_rate": 0.0018823575215653472, + "loss": 0.1494, + "step": 16636 + }, + { + "epoch": 0.14441714915669135, + "grad_norm": 0.31640625, + "learning_rate": 0.001882342809280484, + "loss": 0.1406, + "step": 16637 + }, + { + "epoch": 0.14442582963689551, + "grad_norm": 0.72265625, + "learning_rate": 0.0018823280961400627, + "loss": 0.207, + "step": 16638 + }, + { + "epoch": 0.14443451011709968, + "grad_norm": 0.09130859375, + "learning_rate": 0.0018823133821440987, + "loss": 0.1079, + "step": 16639 + }, + { + "epoch": 0.14444319059730384, + "grad_norm": 0.07958984375, + "learning_rate": 0.001882298667292609, + "loss": 0.0796, + "step": 16640 + }, + { + "epoch": 0.144451871077508, + "grad_norm": 0.423828125, + "learning_rate": 0.0018822839515856091, + "loss": 0.1191, + "step": 16641 + }, + { + "epoch": 0.14446055155771217, + "grad_norm": 0.228515625, + "learning_rate": 0.0018822692350231151, + "loss": 0.0879, + "step": 16642 + }, + { + "epoch": 0.14446923203791634, + "grad_norm": 0.07373046875, + "learning_rate": 0.0018822545176051435, + "loss": 0.1035, + "step": 16643 + }, + { + "epoch": 0.1444779125181205, + "grad_norm": 0.095703125, + "learning_rate": 0.00188223979933171, + "loss": 0.1206, + "step": 16644 + }, + { + "epoch": 0.14448659299832467, + "grad_norm": 0.498046875, + "learning_rate": 0.0018822250802028307, + "loss": 0.1143, + "step": 16645 + }, + { + "epoch": 0.14449527347852884, + "grad_norm": 0.35546875, + "learning_rate": 0.001882210360218522, + "loss": 0.1289, + "step": 16646 + }, + { + "epoch": 0.144503953958733, + "grad_norm": 0.453125, + "learning_rate": 0.0018821956393787996, + "loss": 0.0933, + "step": 16647 + }, + { + "epoch": 0.14451263443893717, + "grad_norm": 1.125, + "learning_rate": 0.0018821809176836797, + "loss": 0.1406, + "step": 16648 + }, + { + "epoch": 0.14452131491914133, + "grad_norm": 0.2734375, + "learning_rate": 0.0018821661951331787, + "loss": 0.1152, + "step": 16649 + }, + { + "epoch": 0.1445299953993455, + "grad_norm": 0.375, + "learning_rate": 0.0018821514717273125, + "loss": 0.0938, + "step": 16650 + }, + { + "epoch": 0.14453867587954966, + "grad_norm": 0.50390625, + "learning_rate": 0.001882136747466097, + "loss": 0.1162, + "step": 16651 + }, + { + "epoch": 0.14454735635975383, + "grad_norm": 0.0947265625, + "learning_rate": 0.0018821220223495486, 
+ "loss": 0.1133, + "step": 16652 + }, + { + "epoch": 0.144556036839958, + "grad_norm": 0.1962890625, + "learning_rate": 0.0018821072963776835, + "loss": 0.1108, + "step": 16653 + }, + { + "epoch": 0.14456471732016216, + "grad_norm": 0.0859375, + "learning_rate": 0.0018820925695505176, + "loss": 0.0967, + "step": 16654 + }, + { + "epoch": 0.14457339780036632, + "grad_norm": 0.2451171875, + "learning_rate": 0.0018820778418680668, + "loss": 0.0913, + "step": 16655 + }, + { + "epoch": 0.1445820782805705, + "grad_norm": 3.609375, + "learning_rate": 0.0018820631133303475, + "loss": 0.3047, + "step": 16656 + }, + { + "epoch": 0.14459075876077465, + "grad_norm": 0.76953125, + "learning_rate": 0.001882048383937376, + "loss": 0.1289, + "step": 16657 + }, + { + "epoch": 0.14459943924097882, + "grad_norm": 0.1884765625, + "learning_rate": 0.0018820336536891679, + "loss": 0.1738, + "step": 16658 + }, + { + "epoch": 0.14460811972118298, + "grad_norm": 0.51171875, + "learning_rate": 0.0018820189225857394, + "loss": 0.1045, + "step": 16659 + }, + { + "epoch": 0.14461680020138715, + "grad_norm": 0.140625, + "learning_rate": 0.0018820041906271068, + "loss": 0.1348, + "step": 16660 + }, + { + "epoch": 0.1446254806815913, + "grad_norm": 0.36328125, + "learning_rate": 0.0018819894578132861, + "loss": 0.1113, + "step": 16661 + }, + { + "epoch": 0.14463416116179548, + "grad_norm": 0.283203125, + "learning_rate": 0.0018819747241442938, + "loss": 0.1211, + "step": 16662 + }, + { + "epoch": 0.14464284164199964, + "grad_norm": 0.416015625, + "learning_rate": 0.0018819599896201455, + "loss": 0.1484, + "step": 16663 + }, + { + "epoch": 0.1446515221222038, + "grad_norm": 0.33984375, + "learning_rate": 0.0018819452542408574, + "loss": 0.0957, + "step": 16664 + }, + { + "epoch": 0.14466020260240797, + "grad_norm": 0.333984375, + "learning_rate": 0.001881930518006446, + "loss": 0.1035, + "step": 16665 + }, + { + "epoch": 0.14466888308261214, + "grad_norm": 0.74609375, + "learning_rate": 0.0018819157809169269, + "loss": 0.1235, + "step": 16666 + }, + { + "epoch": 0.1446775635628163, + "grad_norm": 0.1533203125, + "learning_rate": 0.0018819010429723167, + "loss": 0.1309, + "step": 16667 + }, + { + "epoch": 0.14468624404302047, + "grad_norm": 0.142578125, + "learning_rate": 0.001881886304172631, + "loss": 0.0796, + "step": 16668 + }, + { + "epoch": 0.14469492452322463, + "grad_norm": 0.84375, + "learning_rate": 0.0018818715645178862, + "loss": 0.126, + "step": 16669 + }, + { + "epoch": 0.1447036050034288, + "grad_norm": 0.138671875, + "learning_rate": 0.0018818568240080986, + "loss": 0.1299, + "step": 16670 + }, + { + "epoch": 0.14471228548363296, + "grad_norm": 0.205078125, + "learning_rate": 0.001881842082643284, + "loss": 0.1089, + "step": 16671 + }, + { + "epoch": 0.14472096596383713, + "grad_norm": 0.5546875, + "learning_rate": 0.0018818273404234587, + "loss": 0.1094, + "step": 16672 + }, + { + "epoch": 0.1447296464440413, + "grad_norm": 0.54296875, + "learning_rate": 0.0018818125973486387, + "loss": 0.0957, + "step": 16673 + }, + { + "epoch": 0.14473832692424546, + "grad_norm": 0.279296875, + "learning_rate": 0.0018817978534188403, + "loss": 0.1167, + "step": 16674 + }, + { + "epoch": 0.14474700740444962, + "grad_norm": 1.0, + "learning_rate": 0.0018817831086340793, + "loss": 0.1182, + "step": 16675 + }, + { + "epoch": 0.1447556878846538, + "grad_norm": 0.384765625, + "learning_rate": 0.0018817683629943722, + "loss": 0.1309, + "step": 16676 + }, + { + "epoch": 0.14476436836485795, + "grad_norm": 0.1396484375, + 
"learning_rate": 0.0018817536164997348, + "loss": 0.1309, + "step": 16677 + }, + { + "epoch": 0.14477304884506212, + "grad_norm": 0.365234375, + "learning_rate": 0.0018817388691501837, + "loss": 0.1221, + "step": 16678 + }, + { + "epoch": 0.14478172932526628, + "grad_norm": 0.2041015625, + "learning_rate": 0.0018817241209457343, + "loss": 0.1172, + "step": 16679 + }, + { + "epoch": 0.14479040980547045, + "grad_norm": 0.53125, + "learning_rate": 0.0018817093718864035, + "loss": 0.1221, + "step": 16680 + }, + { + "epoch": 0.14479909028567461, + "grad_norm": 0.322265625, + "learning_rate": 0.001881694621972207, + "loss": 0.1162, + "step": 16681 + }, + { + "epoch": 0.14480777076587878, + "grad_norm": 0.1533203125, + "learning_rate": 0.001881679871203161, + "loss": 0.1445, + "step": 16682 + }, + { + "epoch": 0.14481645124608294, + "grad_norm": 0.8828125, + "learning_rate": 0.0018816651195792816, + "loss": 0.0957, + "step": 16683 + }, + { + "epoch": 0.1448251317262871, + "grad_norm": 0.2021484375, + "learning_rate": 0.0018816503671005849, + "loss": 0.1348, + "step": 16684 + }, + { + "epoch": 0.14483381220649127, + "grad_norm": 0.26171875, + "learning_rate": 0.001881635613767087, + "loss": 0.1123, + "step": 16685 + }, + { + "epoch": 0.14484249268669544, + "grad_norm": 0.8046875, + "learning_rate": 0.0018816208595788047, + "loss": 0.0942, + "step": 16686 + }, + { + "epoch": 0.1448511731668996, + "grad_norm": 0.275390625, + "learning_rate": 0.0018816061045357532, + "loss": 0.1172, + "step": 16687 + }, + { + "epoch": 0.14485985364710377, + "grad_norm": 0.6953125, + "learning_rate": 0.001881591348637949, + "loss": 0.1309, + "step": 16688 + }, + { + "epoch": 0.14486853412730794, + "grad_norm": 0.75, + "learning_rate": 0.001881576591885408, + "loss": 0.1172, + "step": 16689 + }, + { + "epoch": 0.1448772146075121, + "grad_norm": 0.275390625, + "learning_rate": 0.001881561834278147, + "loss": 0.1191, + "step": 16690 + }, + { + "epoch": 0.14488589508771627, + "grad_norm": 0.5234375, + "learning_rate": 0.0018815470758161814, + "loss": 0.1172, + "step": 16691 + }, + { + "epoch": 0.14489457556792043, + "grad_norm": 0.458984375, + "learning_rate": 0.001881532316499528, + "loss": 0.1387, + "step": 16692 + }, + { + "epoch": 0.1449032560481246, + "grad_norm": 0.287109375, + "learning_rate": 0.0018815175563282023, + "loss": 0.1338, + "step": 16693 + }, + { + "epoch": 0.14491193652832876, + "grad_norm": 0.0888671875, + "learning_rate": 0.0018815027953022207, + "loss": 0.1196, + "step": 16694 + }, + { + "epoch": 0.1449206170085329, + "grad_norm": 0.2734375, + "learning_rate": 0.0018814880334215999, + "loss": 0.1055, + "step": 16695 + }, + { + "epoch": 0.14492929748873706, + "grad_norm": 0.59765625, + "learning_rate": 0.001881473270686355, + "loss": 0.1348, + "step": 16696 + }, + { + "epoch": 0.14493797796894123, + "grad_norm": 0.2578125, + "learning_rate": 0.001881458507096503, + "loss": 0.1328, + "step": 16697 + }, + { + "epoch": 0.1449466584491454, + "grad_norm": 0.1435546875, + "learning_rate": 0.0018814437426520594, + "loss": 0.124, + "step": 16698 + }, + { + "epoch": 0.14495533892934956, + "grad_norm": 0.16015625, + "learning_rate": 0.0018814289773530406, + "loss": 0.1211, + "step": 16699 + }, + { + "epoch": 0.14496401940955372, + "grad_norm": 0.265625, + "learning_rate": 0.0018814142111994632, + "loss": 0.1069, + "step": 16700 + }, + { + "epoch": 0.1449726998897579, + "grad_norm": 0.29296875, + "learning_rate": 0.0018813994441913427, + "loss": 0.0942, + "step": 16701 + }, + { + "epoch": 0.14498138036996205, + 
"grad_norm": 0.267578125, + "learning_rate": 0.0018813846763286958, + "loss": 0.1123, + "step": 16702 + }, + { + "epoch": 0.14499006085016622, + "grad_norm": 0.1640625, + "learning_rate": 0.0018813699076115381, + "loss": 0.1221, + "step": 16703 + }, + { + "epoch": 0.14499874133037038, + "grad_norm": 0.275390625, + "learning_rate": 0.001881355138039886, + "loss": 0.1221, + "step": 16704 + }, + { + "epoch": 0.14500742181057455, + "grad_norm": 0.0771484375, + "learning_rate": 0.0018813403676137556, + "loss": 0.0752, + "step": 16705 + }, + { + "epoch": 0.14501610229077871, + "grad_norm": 0.423828125, + "learning_rate": 0.0018813255963331633, + "loss": 0.1182, + "step": 16706 + }, + { + "epoch": 0.14502478277098288, + "grad_norm": 0.2255859375, + "learning_rate": 0.001881310824198125, + "loss": 0.1055, + "step": 16707 + }, + { + "epoch": 0.14503346325118704, + "grad_norm": 0.22265625, + "learning_rate": 0.001881296051208657, + "loss": 0.1045, + "step": 16708 + }, + { + "epoch": 0.1450421437313912, + "grad_norm": 0.56640625, + "learning_rate": 0.001881281277364775, + "loss": 0.0806, + "step": 16709 + }, + { + "epoch": 0.14505082421159537, + "grad_norm": 0.263671875, + "learning_rate": 0.001881266502666496, + "loss": 0.0898, + "step": 16710 + }, + { + "epoch": 0.14505950469179954, + "grad_norm": 0.24609375, + "learning_rate": 0.001881251727113836, + "loss": 0.1348, + "step": 16711 + }, + { + "epoch": 0.1450681851720037, + "grad_norm": 0.306640625, + "learning_rate": 0.00188123695070681, + "loss": 0.1152, + "step": 16712 + }, + { + "epoch": 0.14507686565220787, + "grad_norm": 0.1318359375, + "learning_rate": 0.0018812221734454356, + "loss": 0.0996, + "step": 16713 + }, + { + "epoch": 0.14508554613241204, + "grad_norm": 0.91796875, + "learning_rate": 0.0018812073953297281, + "loss": 0.2363, + "step": 16714 + }, + { + "epoch": 0.1450942266126162, + "grad_norm": 0.2216796875, + "learning_rate": 0.001881192616359704, + "loss": 0.1113, + "step": 16715 + }, + { + "epoch": 0.14510290709282037, + "grad_norm": 0.61328125, + "learning_rate": 0.001881177836535379, + "loss": 0.0996, + "step": 16716 + }, + { + "epoch": 0.14511158757302453, + "grad_norm": 0.1826171875, + "learning_rate": 0.0018811630558567703, + "loss": 0.1123, + "step": 16717 + }, + { + "epoch": 0.1451202680532287, + "grad_norm": 0.095703125, + "learning_rate": 0.0018811482743238933, + "loss": 0.0815, + "step": 16718 + }, + { + "epoch": 0.14512894853343286, + "grad_norm": 0.1572265625, + "learning_rate": 0.001881133491936764, + "loss": 0.1348, + "step": 16719 + }, + { + "epoch": 0.14513762901363703, + "grad_norm": 1.0234375, + "learning_rate": 0.001881118708695399, + "loss": 0.1357, + "step": 16720 + }, + { + "epoch": 0.1451463094938412, + "grad_norm": 0.875, + "learning_rate": 0.0018811039245998143, + "loss": 0.103, + "step": 16721 + }, + { + "epoch": 0.14515498997404536, + "grad_norm": 0.1533203125, + "learning_rate": 0.001881089139650026, + "loss": 0.1348, + "step": 16722 + }, + { + "epoch": 0.14516367045424952, + "grad_norm": 0.2490234375, + "learning_rate": 0.0018810743538460505, + "loss": 0.0996, + "step": 16723 + }, + { + "epoch": 0.1451723509344537, + "grad_norm": 0.6875, + "learning_rate": 0.0018810595671879037, + "loss": 0.0796, + "step": 16724 + }, + { + "epoch": 0.14518103141465785, + "grad_norm": 0.103515625, + "learning_rate": 0.001881044779675602, + "loss": 0.1104, + "step": 16725 + }, + { + "epoch": 0.14518971189486202, + "grad_norm": 0.400390625, + "learning_rate": 0.0018810299913091615, + "loss": 0.1221, + "step": 16726 + }, 
+ { + "epoch": 0.14519839237506618, + "grad_norm": 0.1142578125, + "learning_rate": 0.0018810152020885982, + "loss": 0.1426, + "step": 16727 + }, + { + "epoch": 0.14520707285527035, + "grad_norm": 0.29296875, + "learning_rate": 0.0018810004120139283, + "loss": 0.1143, + "step": 16728 + }, + { + "epoch": 0.1452157533354745, + "grad_norm": 0.1337890625, + "learning_rate": 0.0018809856210851683, + "loss": 0.085, + "step": 16729 + }, + { + "epoch": 0.14522443381567868, + "grad_norm": 0.515625, + "learning_rate": 0.0018809708293023343, + "loss": 0.1299, + "step": 16730 + }, + { + "epoch": 0.14523311429588284, + "grad_norm": 0.09814453125, + "learning_rate": 0.0018809560366654422, + "loss": 0.1133, + "step": 16731 + }, + { + "epoch": 0.145241794776087, + "grad_norm": 0.423828125, + "learning_rate": 0.0018809412431745078, + "loss": 0.0825, + "step": 16732 + }, + { + "epoch": 0.14525047525629117, + "grad_norm": 0.12255859375, + "learning_rate": 0.0018809264488295486, + "loss": 0.1445, + "step": 16733 + }, + { + "epoch": 0.14525915573649534, + "grad_norm": 0.091796875, + "learning_rate": 0.0018809116536305794, + "loss": 0.1094, + "step": 16734 + }, + { + "epoch": 0.1452678362166995, + "grad_norm": 0.08740234375, + "learning_rate": 0.0018808968575776174, + "loss": 0.0947, + "step": 16735 + }, + { + "epoch": 0.14527651669690367, + "grad_norm": 0.11865234375, + "learning_rate": 0.001880882060670678, + "loss": 0.1104, + "step": 16736 + }, + { + "epoch": 0.14528519717710783, + "grad_norm": 0.369140625, + "learning_rate": 0.0018808672629097775, + "loss": 0.0996, + "step": 16737 + }, + { + "epoch": 0.145293877657312, + "grad_norm": 0.73828125, + "learning_rate": 0.001880852464294933, + "loss": 0.1406, + "step": 16738 + }, + { + "epoch": 0.14530255813751616, + "grad_norm": 0.322265625, + "learning_rate": 0.0018808376648261594, + "loss": 0.1123, + "step": 16739 + }, + { + "epoch": 0.14531123861772033, + "grad_norm": 1.75, + "learning_rate": 0.0018808228645034739, + "loss": 0.3457, + "step": 16740 + }, + { + "epoch": 0.1453199190979245, + "grad_norm": 0.296875, + "learning_rate": 0.0018808080633268918, + "loss": 0.1221, + "step": 16741 + }, + { + "epoch": 0.14532859957812866, + "grad_norm": 0.482421875, + "learning_rate": 0.00188079326129643, + "loss": 0.1133, + "step": 16742 + }, + { + "epoch": 0.14533728005833282, + "grad_norm": 0.0830078125, + "learning_rate": 0.0018807784584121041, + "loss": 0.0854, + "step": 16743 + }, + { + "epoch": 0.145345960538537, + "grad_norm": 0.13671875, + "learning_rate": 0.0018807636546739309, + "loss": 0.1172, + "step": 16744 + }, + { + "epoch": 0.14535464101874115, + "grad_norm": 0.1552734375, + "learning_rate": 0.0018807488500819263, + "loss": 0.1216, + "step": 16745 + }, + { + "epoch": 0.14536332149894532, + "grad_norm": 0.53515625, + "learning_rate": 0.0018807340446361065, + "loss": 0.1309, + "step": 16746 + }, + { + "epoch": 0.14537200197914948, + "grad_norm": 0.22265625, + "learning_rate": 0.0018807192383364878, + "loss": 0.1016, + "step": 16747 + }, + { + "epoch": 0.14538068245935365, + "grad_norm": 0.78125, + "learning_rate": 0.001880704431183086, + "loss": 0.1138, + "step": 16748 + }, + { + "epoch": 0.14538936293955781, + "grad_norm": 0.171875, + "learning_rate": 0.0018806896231759174, + "loss": 0.1011, + "step": 16749 + }, + { + "epoch": 0.14539804341976198, + "grad_norm": 0.158203125, + "learning_rate": 0.0018806748143149988, + "loss": 0.127, + "step": 16750 + }, + { + "epoch": 0.14540672389996614, + "grad_norm": 0.1591796875, + "learning_rate": 
0.0018806600046003456, + "loss": 0.0986, + "step": 16751 + }, + { + "epoch": 0.1454154043801703, + "grad_norm": 0.0751953125, + "learning_rate": 0.001880645194031975, + "loss": 0.1045, + "step": 16752 + }, + { + "epoch": 0.14542408486037448, + "grad_norm": 0.158203125, + "learning_rate": 0.0018806303826099015, + "loss": 0.1084, + "step": 16753 + }, + { + "epoch": 0.14543276534057864, + "grad_norm": 0.16796875, + "learning_rate": 0.001880615570334143, + "loss": 0.1206, + "step": 16754 + }, + { + "epoch": 0.1454414458207828, + "grad_norm": 0.1005859375, + "learning_rate": 0.001880600757204715, + "loss": 0.1436, + "step": 16755 + }, + { + "epoch": 0.14545012630098697, + "grad_norm": 0.328125, + "learning_rate": 0.0018805859432216338, + "loss": 0.1318, + "step": 16756 + }, + { + "epoch": 0.14545880678119114, + "grad_norm": 0.5390625, + "learning_rate": 0.0018805711283849155, + "loss": 0.1406, + "step": 16757 + }, + { + "epoch": 0.1454674872613953, + "grad_norm": 0.6484375, + "learning_rate": 0.001880556312694576, + "loss": 0.1543, + "step": 16758 + }, + { + "epoch": 0.14547616774159947, + "grad_norm": 0.1787109375, + "learning_rate": 0.0018805414961506322, + "loss": 0.1191, + "step": 16759 + }, + { + "epoch": 0.14548484822180363, + "grad_norm": 0.2490234375, + "learning_rate": 0.0018805266787530997, + "loss": 0.126, + "step": 16760 + }, + { + "epoch": 0.1454935287020078, + "grad_norm": 0.41015625, + "learning_rate": 0.001880511860501995, + "loss": 0.126, + "step": 16761 + }, + { + "epoch": 0.14550220918221196, + "grad_norm": 0.208984375, + "learning_rate": 0.0018804970413973346, + "loss": 0.1279, + "step": 16762 + }, + { + "epoch": 0.14551088966241613, + "grad_norm": 0.1552734375, + "learning_rate": 0.001880482221439134, + "loss": 0.1021, + "step": 16763 + }, + { + "epoch": 0.1455195701426203, + "grad_norm": 0.4609375, + "learning_rate": 0.00188046740062741, + "loss": 0.1436, + "step": 16764 + }, + { + "epoch": 0.14552825062282446, + "grad_norm": 0.59375, + "learning_rate": 0.0018804525789621785, + "loss": 0.0908, + "step": 16765 + }, + { + "epoch": 0.14553693110302862, + "grad_norm": 0.126953125, + "learning_rate": 0.0018804377564434556, + "loss": 0.1455, + "step": 16766 + }, + { + "epoch": 0.1455456115832328, + "grad_norm": 0.216796875, + "learning_rate": 0.001880422933071258, + "loss": 0.1348, + "step": 16767 + }, + { + "epoch": 0.14555429206343695, + "grad_norm": 0.08349609375, + "learning_rate": 0.0018804081088456012, + "loss": 0.125, + "step": 16768 + }, + { + "epoch": 0.14556297254364112, + "grad_norm": 0.115234375, + "learning_rate": 0.001880393283766502, + "loss": 0.0811, + "step": 16769 + }, + { + "epoch": 0.14557165302384528, + "grad_norm": 0.37109375, + "learning_rate": 0.001880378457833977, + "loss": 0.1084, + "step": 16770 + }, + { + "epoch": 0.14558033350404945, + "grad_norm": 0.5234375, + "learning_rate": 0.001880363631048041, + "loss": 0.1309, + "step": 16771 + }, + { + "epoch": 0.1455890139842536, + "grad_norm": 0.267578125, + "learning_rate": 0.0018803488034087115, + "loss": 0.1484, + "step": 16772 + }, + { + "epoch": 0.14559769446445778, + "grad_norm": 0.240234375, + "learning_rate": 0.0018803339749160041, + "loss": 0.0815, + "step": 16773 + }, + { + "epoch": 0.14560637494466194, + "grad_norm": 0.1572265625, + "learning_rate": 0.0018803191455699355, + "loss": 0.1172, + "step": 16774 + }, + { + "epoch": 0.1456150554248661, + "grad_norm": 0.66796875, + "learning_rate": 0.0018803043153705213, + "loss": 0.1758, + "step": 16775 + }, + { + "epoch": 0.14562373590507027, + 
"grad_norm": 0.5, + "learning_rate": 0.0018802894843177778, + "loss": 0.1875, + "step": 16776 + }, + { + "epoch": 0.14563241638527444, + "grad_norm": 0.1533203125, + "learning_rate": 0.001880274652411722, + "loss": 0.0869, + "step": 16777 + }, + { + "epoch": 0.1456410968654786, + "grad_norm": 0.48828125, + "learning_rate": 0.0018802598196523692, + "loss": 0.0996, + "step": 16778 + }, + { + "epoch": 0.14564977734568277, + "grad_norm": 0.09326171875, + "learning_rate": 0.0018802449860397363, + "loss": 0.1108, + "step": 16779 + }, + { + "epoch": 0.14565845782588693, + "grad_norm": 0.1806640625, + "learning_rate": 0.0018802301515738389, + "loss": 0.1113, + "step": 16780 + }, + { + "epoch": 0.1456671383060911, + "grad_norm": 2.140625, + "learning_rate": 0.0018802153162546936, + "loss": 0.1865, + "step": 16781 + }, + { + "epoch": 0.14567581878629526, + "grad_norm": 0.3515625, + "learning_rate": 0.0018802004800823164, + "loss": 0.1123, + "step": 16782 + }, + { + "epoch": 0.14568449926649943, + "grad_norm": 0.125, + "learning_rate": 0.001880185643056724, + "loss": 0.1162, + "step": 16783 + }, + { + "epoch": 0.1456931797467036, + "grad_norm": 0.1533203125, + "learning_rate": 0.001880170805177932, + "loss": 0.1299, + "step": 16784 + }, + { + "epoch": 0.14570186022690776, + "grad_norm": 0.31640625, + "learning_rate": 0.001880155966445957, + "loss": 0.1025, + "step": 16785 + }, + { + "epoch": 0.14571054070711192, + "grad_norm": 0.65234375, + "learning_rate": 0.0018801411268608156, + "loss": 0.1631, + "step": 16786 + }, + { + "epoch": 0.1457192211873161, + "grad_norm": 0.2265625, + "learning_rate": 0.001880126286422523, + "loss": 0.1084, + "step": 16787 + }, + { + "epoch": 0.14572790166752025, + "grad_norm": 0.373046875, + "learning_rate": 0.0018801114451310963, + "loss": 0.1309, + "step": 16788 + }, + { + "epoch": 0.14573658214772442, + "grad_norm": 0.345703125, + "learning_rate": 0.0018800966029865515, + "loss": 0.1914, + "step": 16789 + }, + { + "epoch": 0.14574526262792858, + "grad_norm": 0.0986328125, + "learning_rate": 0.0018800817599889047, + "loss": 0.0908, + "step": 16790 + }, + { + "epoch": 0.14575394310813275, + "grad_norm": 0.380859375, + "learning_rate": 0.0018800669161381722, + "loss": 0.0811, + "step": 16791 + }, + { + "epoch": 0.14576262358833691, + "grad_norm": 0.484375, + "learning_rate": 0.00188005207143437, + "loss": 0.1328, + "step": 16792 + }, + { + "epoch": 0.14577130406854108, + "grad_norm": 0.58203125, + "learning_rate": 0.0018800372258775148, + "loss": 0.165, + "step": 16793 + }, + { + "epoch": 0.14577998454874525, + "grad_norm": 0.439453125, + "learning_rate": 0.0018800223794676228, + "loss": 0.1108, + "step": 16794 + }, + { + "epoch": 0.1457886650289494, + "grad_norm": 0.0888671875, + "learning_rate": 0.0018800075322047097, + "loss": 0.0967, + "step": 16795 + }, + { + "epoch": 0.14579734550915358, + "grad_norm": 0.248046875, + "learning_rate": 0.0018799926840887922, + "loss": 0.1133, + "step": 16796 + }, + { + "epoch": 0.14580602598935774, + "grad_norm": 0.484375, + "learning_rate": 0.0018799778351198863, + "loss": 0.1826, + "step": 16797 + }, + { + "epoch": 0.1458147064695619, + "grad_norm": 0.296875, + "learning_rate": 0.0018799629852980088, + "loss": 0.1338, + "step": 16798 + }, + { + "epoch": 0.14582338694976607, + "grad_norm": 0.1552734375, + "learning_rate": 0.001879948134623175, + "loss": 0.0874, + "step": 16799 + }, + { + "epoch": 0.14583206742997024, + "grad_norm": 0.6171875, + "learning_rate": 0.0018799332830954018, + "loss": 0.1406, + "step": 16800 + }, + { + 
"epoch": 0.1458407479101744, + "grad_norm": 0.3828125, + "learning_rate": 0.0018799184307147054, + "loss": 0.1357, + "step": 16801 + }, + { + "epoch": 0.14584942839037857, + "grad_norm": 0.21484375, + "learning_rate": 0.001879903577481102, + "loss": 0.0981, + "step": 16802 + }, + { + "epoch": 0.14585810887058273, + "grad_norm": 0.11328125, + "learning_rate": 0.0018798887233946076, + "loss": 0.1182, + "step": 16803 + }, + { + "epoch": 0.1458667893507869, + "grad_norm": 0.298828125, + "learning_rate": 0.0018798738684552385, + "loss": 0.0947, + "step": 16804 + }, + { + "epoch": 0.14587546983099106, + "grad_norm": 0.259765625, + "learning_rate": 0.0018798590126630113, + "loss": 0.0981, + "step": 16805 + }, + { + "epoch": 0.14588415031119523, + "grad_norm": 0.6171875, + "learning_rate": 0.0018798441560179417, + "loss": 0.1221, + "step": 16806 + }, + { + "epoch": 0.1458928307913994, + "grad_norm": 0.31640625, + "learning_rate": 0.0018798292985200465, + "loss": 0.0991, + "step": 16807 + }, + { + "epoch": 0.14590151127160356, + "grad_norm": 0.54296875, + "learning_rate": 0.0018798144401693416, + "loss": 0.1196, + "step": 16808 + }, + { + "epoch": 0.14591019175180772, + "grad_norm": 0.1328125, + "learning_rate": 0.0018797995809658435, + "loss": 0.105, + "step": 16809 + }, + { + "epoch": 0.1459188722320119, + "grad_norm": 0.1748046875, + "learning_rate": 0.001879784720909568, + "loss": 0.1172, + "step": 16810 + }, + { + "epoch": 0.14592755271221605, + "grad_norm": 0.578125, + "learning_rate": 0.0018797698600005318, + "loss": 0.1143, + "step": 16811 + }, + { + "epoch": 0.14593623319242022, + "grad_norm": 0.27734375, + "learning_rate": 0.0018797549982387512, + "loss": 0.1758, + "step": 16812 + }, + { + "epoch": 0.14594491367262438, + "grad_norm": 0.36328125, + "learning_rate": 0.001879740135624242, + "loss": 0.1445, + "step": 16813 + }, + { + "epoch": 0.14595359415282855, + "grad_norm": 0.6015625, + "learning_rate": 0.0018797252721570207, + "loss": 0.0981, + "step": 16814 + }, + { + "epoch": 0.1459622746330327, + "grad_norm": 0.1943359375, + "learning_rate": 0.0018797104078371034, + "loss": 0.124, + "step": 16815 + }, + { + "epoch": 0.14597095511323688, + "grad_norm": 0.326171875, + "learning_rate": 0.001879695542664507, + "loss": 0.1045, + "step": 16816 + }, + { + "epoch": 0.14597963559344104, + "grad_norm": 0.474609375, + "learning_rate": 0.001879680676639247, + "loss": 0.1367, + "step": 16817 + }, + { + "epoch": 0.1459883160736452, + "grad_norm": 0.201171875, + "learning_rate": 0.00187966580976134, + "loss": 0.1089, + "step": 16818 + }, + { + "epoch": 0.14599699655384935, + "grad_norm": 0.1435546875, + "learning_rate": 0.001879650942030802, + "loss": 0.1201, + "step": 16819 + }, + { + "epoch": 0.1460056770340535, + "grad_norm": 0.4296875, + "learning_rate": 0.0018796360734476497, + "loss": 0.1016, + "step": 16820 + }, + { + "epoch": 0.14601435751425768, + "grad_norm": 0.455078125, + "learning_rate": 0.0018796212040118987, + "loss": 0.1143, + "step": 16821 + }, + { + "epoch": 0.14602303799446184, + "grad_norm": 0.34765625, + "learning_rate": 0.0018796063337235657, + "loss": 0.126, + "step": 16822 + }, + { + "epoch": 0.146031718474666, + "grad_norm": 0.6171875, + "learning_rate": 0.0018795914625826676, + "loss": 0.1084, + "step": 16823 + }, + { + "epoch": 0.14604039895487017, + "grad_norm": 0.39453125, + "learning_rate": 0.0018795765905892192, + "loss": 0.126, + "step": 16824 + }, + { + "epoch": 0.14604907943507434, + "grad_norm": 0.2099609375, + "learning_rate": 0.001879561717743238, + "loss": 
0.1348, + "step": 16825 + }, + { + "epoch": 0.1460577599152785, + "grad_norm": 0.2119140625, + "learning_rate": 0.0018795468440447399, + "loss": 0.1494, + "step": 16826 + }, + { + "epoch": 0.14606644039548267, + "grad_norm": 0.1435546875, + "learning_rate": 0.0018795319694937408, + "loss": 0.123, + "step": 16827 + }, + { + "epoch": 0.14607512087568683, + "grad_norm": 0.384765625, + "learning_rate": 0.0018795170940902573, + "loss": 0.126, + "step": 16828 + }, + { + "epoch": 0.146083801355891, + "grad_norm": 0.703125, + "learning_rate": 0.0018795022178343058, + "loss": 0.0947, + "step": 16829 + }, + { + "epoch": 0.14609248183609516, + "grad_norm": 0.5390625, + "learning_rate": 0.0018794873407259021, + "loss": 0.123, + "step": 16830 + }, + { + "epoch": 0.14610116231629933, + "grad_norm": 0.058837890625, + "learning_rate": 0.0018794724627650627, + "loss": 0.085, + "step": 16831 + }, + { + "epoch": 0.1461098427965035, + "grad_norm": 0.271484375, + "learning_rate": 0.0018794575839518042, + "loss": 0.1123, + "step": 16832 + }, + { + "epoch": 0.14611852327670766, + "grad_norm": 0.3984375, + "learning_rate": 0.0018794427042861427, + "loss": 0.1338, + "step": 16833 + }, + { + "epoch": 0.14612720375691182, + "grad_norm": 0.333984375, + "learning_rate": 0.001879427823768094, + "loss": 0.0703, + "step": 16834 + }, + { + "epoch": 0.146135884237116, + "grad_norm": 0.23828125, + "learning_rate": 0.001879412942397675, + "loss": 0.1064, + "step": 16835 + }, + { + "epoch": 0.14614456471732015, + "grad_norm": 0.2041015625, + "learning_rate": 0.0018793980601749017, + "loss": 0.1055, + "step": 16836 + }, + { + "epoch": 0.14615324519752432, + "grad_norm": 0.11669921875, + "learning_rate": 0.0018793831770997904, + "loss": 0.0791, + "step": 16837 + }, + { + "epoch": 0.14616192567772848, + "grad_norm": 0.0908203125, + "learning_rate": 0.0018793682931723572, + "loss": 0.1387, + "step": 16838 + }, + { + "epoch": 0.14617060615793265, + "grad_norm": 0.220703125, + "learning_rate": 0.0018793534083926186, + "loss": 0.0967, + "step": 16839 + }, + { + "epoch": 0.1461792866381368, + "grad_norm": 0.74609375, + "learning_rate": 0.001879338522760591, + "loss": 0.1226, + "step": 16840 + }, + { + "epoch": 0.14618796711834098, + "grad_norm": 0.15625, + "learning_rate": 0.0018793236362762904, + "loss": 0.1167, + "step": 16841 + }, + { + "epoch": 0.14619664759854514, + "grad_norm": 0.47265625, + "learning_rate": 0.001879308748939733, + "loss": 0.1396, + "step": 16842 + }, + { + "epoch": 0.1462053280787493, + "grad_norm": 0.6796875, + "learning_rate": 0.0018792938607509356, + "loss": 0.0903, + "step": 16843 + }, + { + "epoch": 0.14621400855895347, + "grad_norm": 0.94921875, + "learning_rate": 0.001879278971709914, + "loss": 0.1279, + "step": 16844 + }, + { + "epoch": 0.14622268903915764, + "grad_norm": 0.55078125, + "learning_rate": 0.0018792640818166846, + "loss": 0.1348, + "step": 16845 + }, + { + "epoch": 0.1462313695193618, + "grad_norm": 0.58984375, + "learning_rate": 0.0018792491910712638, + "loss": 0.0859, + "step": 16846 + }, + { + "epoch": 0.14624004999956597, + "grad_norm": 0.62890625, + "learning_rate": 0.0018792342994736677, + "loss": 0.1064, + "step": 16847 + }, + { + "epoch": 0.14624873047977013, + "grad_norm": 0.34375, + "learning_rate": 0.0018792194070239125, + "loss": 0.1226, + "step": 16848 + }, + { + "epoch": 0.1462574109599743, + "grad_norm": 0.10693359375, + "learning_rate": 0.001879204513722015, + "loss": 0.085, + "step": 16849 + }, + { + "epoch": 0.14626609144017846, + "grad_norm": 0.1484375, + 
"learning_rate": 0.001879189619567991, + "loss": 0.1196, + "step": 16850 + }, + { + "epoch": 0.14627477192038263, + "grad_norm": 0.53125, + "learning_rate": 0.0018791747245618566, + "loss": 0.1494, + "step": 16851 + }, + { + "epoch": 0.1462834524005868, + "grad_norm": 0.1923828125, + "learning_rate": 0.0018791598287036292, + "loss": 0.0972, + "step": 16852 + }, + { + "epoch": 0.14629213288079096, + "grad_norm": 0.09375, + "learning_rate": 0.0018791449319933238, + "loss": 0.1064, + "step": 16853 + }, + { + "epoch": 0.14630081336099512, + "grad_norm": 0.2236328125, + "learning_rate": 0.0018791300344309571, + "loss": 0.0923, + "step": 16854 + }, + { + "epoch": 0.1463094938411993, + "grad_norm": 0.1474609375, + "learning_rate": 0.0018791151360165457, + "loss": 0.1396, + "step": 16855 + }, + { + "epoch": 0.14631817432140345, + "grad_norm": 0.6328125, + "learning_rate": 0.0018791002367501057, + "loss": 0.1201, + "step": 16856 + }, + { + "epoch": 0.14632685480160762, + "grad_norm": 0.09619140625, + "learning_rate": 0.0018790853366316533, + "loss": 0.1011, + "step": 16857 + }, + { + "epoch": 0.14633553528181178, + "grad_norm": 0.349609375, + "learning_rate": 0.001879070435661205, + "loss": 0.1816, + "step": 16858 + }, + { + "epoch": 0.14634421576201595, + "grad_norm": 0.390625, + "learning_rate": 0.0018790555338387771, + "loss": 0.1445, + "step": 16859 + }, + { + "epoch": 0.14635289624222012, + "grad_norm": 0.52734375, + "learning_rate": 0.0018790406311643853, + "loss": 0.1143, + "step": 16860 + }, + { + "epoch": 0.14636157672242428, + "grad_norm": 0.9296875, + "learning_rate": 0.0018790257276380467, + "loss": 0.1377, + "step": 16861 + }, + { + "epoch": 0.14637025720262845, + "grad_norm": 0.388671875, + "learning_rate": 0.0018790108232597774, + "loss": 0.0967, + "step": 16862 + }, + { + "epoch": 0.1463789376828326, + "grad_norm": 1.15625, + "learning_rate": 0.0018789959180295934, + "loss": 0.1152, + "step": 16863 + }, + { + "epoch": 0.14638761816303678, + "grad_norm": 0.6796875, + "learning_rate": 0.0018789810119475112, + "loss": 0.1035, + "step": 16864 + }, + { + "epoch": 0.14639629864324094, + "grad_norm": 0.46484375, + "learning_rate": 0.0018789661050135468, + "loss": 0.1299, + "step": 16865 + }, + { + "epoch": 0.1464049791234451, + "grad_norm": 0.189453125, + "learning_rate": 0.001878951197227717, + "loss": 0.1348, + "step": 16866 + }, + { + "epoch": 0.14641365960364927, + "grad_norm": 0.26953125, + "learning_rate": 0.001878936288590038, + "loss": 0.1206, + "step": 16867 + }, + { + "epoch": 0.14642234008385344, + "grad_norm": 0.1025390625, + "learning_rate": 0.001878921379100526, + "loss": 0.1016, + "step": 16868 + }, + { + "epoch": 0.1464310205640576, + "grad_norm": 0.361328125, + "learning_rate": 0.0018789064687591973, + "loss": 0.0801, + "step": 16869 + }, + { + "epoch": 0.14643970104426177, + "grad_norm": 0.11083984375, + "learning_rate": 0.0018788915575660677, + "loss": 0.125, + "step": 16870 + }, + { + "epoch": 0.14644838152446593, + "grad_norm": 0.56640625, + "learning_rate": 0.0018788766455211544, + "loss": 0.1396, + "step": 16871 + }, + { + "epoch": 0.1464570620046701, + "grad_norm": 0.84375, + "learning_rate": 0.0018788617326244731, + "loss": 0.0859, + "step": 16872 + }, + { + "epoch": 0.14646574248487426, + "grad_norm": 0.53125, + "learning_rate": 0.0018788468188760405, + "loss": 0.1084, + "step": 16873 + }, + { + "epoch": 0.14647442296507843, + "grad_norm": 0.34765625, + "learning_rate": 0.0018788319042758728, + "loss": 0.0854, + "step": 16874 + }, + { + "epoch": 
0.1464831034452826, + "grad_norm": 3.109375, + "learning_rate": 0.001878816988823986, + "loss": 0.5625, + "step": 16875 + }, + { + "epoch": 0.14649178392548676, + "grad_norm": 0.1796875, + "learning_rate": 0.0018788020725203968, + "loss": 0.0879, + "step": 16876 + }, + { + "epoch": 0.14650046440569092, + "grad_norm": 0.296875, + "learning_rate": 0.0018787871553651212, + "loss": 0.0889, + "step": 16877 + }, + { + "epoch": 0.1465091448858951, + "grad_norm": 0.197265625, + "learning_rate": 0.001878772237358176, + "loss": 0.166, + "step": 16878 + }, + { + "epoch": 0.14651782536609925, + "grad_norm": 0.201171875, + "learning_rate": 0.0018787573184995768, + "loss": 0.1006, + "step": 16879 + }, + { + "epoch": 0.14652650584630342, + "grad_norm": 1.3671875, + "learning_rate": 0.0018787423987893406, + "loss": 0.1177, + "step": 16880 + }, + { + "epoch": 0.14653518632650758, + "grad_norm": 0.26953125, + "learning_rate": 0.001878727478227483, + "loss": 0.0972, + "step": 16881 + }, + { + "epoch": 0.14654386680671175, + "grad_norm": 0.09375, + "learning_rate": 0.0018787125568140212, + "loss": 0.1016, + "step": 16882 + }, + { + "epoch": 0.1465525472869159, + "grad_norm": 0.240234375, + "learning_rate": 0.0018786976345489706, + "loss": 0.1299, + "step": 16883 + }, + { + "epoch": 0.14656122776712008, + "grad_norm": 0.71484375, + "learning_rate": 0.0018786827114323484, + "loss": 0.0967, + "step": 16884 + }, + { + "epoch": 0.14656990824732424, + "grad_norm": 0.6875, + "learning_rate": 0.0018786677874641704, + "loss": 0.1562, + "step": 16885 + }, + { + "epoch": 0.1465785887275284, + "grad_norm": 0.341796875, + "learning_rate": 0.0018786528626444528, + "loss": 0.1011, + "step": 16886 + }, + { + "epoch": 0.14658726920773257, + "grad_norm": 0.54296875, + "learning_rate": 0.001878637936973212, + "loss": 0.1201, + "step": 16887 + }, + { + "epoch": 0.14659594968793674, + "grad_norm": 0.609375, + "learning_rate": 0.0018786230104504648, + "loss": 0.0801, + "step": 16888 + }, + { + "epoch": 0.1466046301681409, + "grad_norm": 0.0888671875, + "learning_rate": 0.001878608083076227, + "loss": 0.1143, + "step": 16889 + }, + { + "epoch": 0.14661331064834507, + "grad_norm": 0.51953125, + "learning_rate": 0.0018785931548505152, + "loss": 0.1074, + "step": 16890 + }, + { + "epoch": 0.14662199112854923, + "grad_norm": 0.103515625, + "learning_rate": 0.0018785782257733454, + "loss": 0.1387, + "step": 16891 + }, + { + "epoch": 0.1466306716087534, + "grad_norm": 0.9140625, + "learning_rate": 0.0018785632958447345, + "loss": 0.1133, + "step": 16892 + }, + { + "epoch": 0.14663935208895756, + "grad_norm": 0.9453125, + "learning_rate": 0.0018785483650646982, + "loss": 0.1504, + "step": 16893 + }, + { + "epoch": 0.14664803256916173, + "grad_norm": 0.158203125, + "learning_rate": 0.0018785334334332534, + "loss": 0.1836, + "step": 16894 + }, + { + "epoch": 0.1466567130493659, + "grad_norm": 0.302734375, + "learning_rate": 0.001878518500950416, + "loss": 0.1157, + "step": 16895 + }, + { + "epoch": 0.14666539352957006, + "grad_norm": 0.2734375, + "learning_rate": 0.0018785035676162024, + "loss": 0.1055, + "step": 16896 + }, + { + "epoch": 0.14667407400977422, + "grad_norm": 0.060302734375, + "learning_rate": 0.001878488633430629, + "loss": 0.105, + "step": 16897 + }, + { + "epoch": 0.1466827544899784, + "grad_norm": 0.53515625, + "learning_rate": 0.001878473698393712, + "loss": 0.1387, + "step": 16898 + }, + { + "epoch": 0.14669143497018255, + "grad_norm": 0.490234375, + "learning_rate": 0.0018784587625054678, + "loss": 0.1196, + "step": 
16899 + }, + { + "epoch": 0.14670011545038672, + "grad_norm": 0.609375, + "learning_rate": 0.0018784438257659132, + "loss": 0.1318, + "step": 16900 + }, + { + "epoch": 0.14670879593059089, + "grad_norm": 0.3828125, + "learning_rate": 0.0018784288881750638, + "loss": 0.1221, + "step": 16901 + }, + { + "epoch": 0.14671747641079505, + "grad_norm": 0.3984375, + "learning_rate": 0.0018784139497329364, + "loss": 0.0913, + "step": 16902 + }, + { + "epoch": 0.14672615689099922, + "grad_norm": 0.361328125, + "learning_rate": 0.0018783990104395472, + "loss": 0.1592, + "step": 16903 + }, + { + "epoch": 0.14673483737120338, + "grad_norm": 1.25, + "learning_rate": 0.0018783840702949126, + "loss": 0.1582, + "step": 16904 + }, + { + "epoch": 0.14674351785140755, + "grad_norm": 0.1103515625, + "learning_rate": 0.0018783691292990489, + "loss": 0.1187, + "step": 16905 + }, + { + "epoch": 0.1467521983316117, + "grad_norm": 0.2314453125, + "learning_rate": 0.001878354187451972, + "loss": 0.1299, + "step": 16906 + }, + { + "epoch": 0.14676087881181588, + "grad_norm": 1.265625, + "learning_rate": 0.001878339244753699, + "loss": 0.1152, + "step": 16907 + }, + { + "epoch": 0.14676955929202004, + "grad_norm": 0.8046875, + "learning_rate": 0.001878324301204246, + "loss": 0.0859, + "step": 16908 + }, + { + "epoch": 0.1467782397722242, + "grad_norm": 0.369140625, + "learning_rate": 0.0018783093568036288, + "loss": 0.1328, + "step": 16909 + }, + { + "epoch": 0.14678692025242837, + "grad_norm": 0.20703125, + "learning_rate": 0.0018782944115518648, + "loss": 0.0996, + "step": 16910 + }, + { + "epoch": 0.14679560073263254, + "grad_norm": 0.2177734375, + "learning_rate": 0.0018782794654489694, + "loss": 0.1357, + "step": 16911 + }, + { + "epoch": 0.1468042812128367, + "grad_norm": 0.1982421875, + "learning_rate": 0.0018782645184949591, + "loss": 0.1079, + "step": 16912 + }, + { + "epoch": 0.14681296169304087, + "grad_norm": 0.828125, + "learning_rate": 0.0018782495706898509, + "loss": 0.125, + "step": 16913 + }, + { + "epoch": 0.14682164217324503, + "grad_norm": 0.1494140625, + "learning_rate": 0.0018782346220336602, + "loss": 0.127, + "step": 16914 + }, + { + "epoch": 0.1468303226534492, + "grad_norm": 0.58984375, + "learning_rate": 0.001878219672526404, + "loss": 0.1465, + "step": 16915 + }, + { + "epoch": 0.14683900313365336, + "grad_norm": 0.86328125, + "learning_rate": 0.0018782047221680982, + "loss": 0.3555, + "step": 16916 + }, + { + "epoch": 0.14684768361385753, + "grad_norm": 0.1904296875, + "learning_rate": 0.0018781897709587599, + "loss": 0.1055, + "step": 16917 + }, + { + "epoch": 0.1468563640940617, + "grad_norm": 0.47265625, + "learning_rate": 0.0018781748188984044, + "loss": 0.0771, + "step": 16918 + }, + { + "epoch": 0.14686504457426586, + "grad_norm": 0.5859375, + "learning_rate": 0.001878159865987049, + "loss": 0.1152, + "step": 16919 + }, + { + "epoch": 0.14687372505447002, + "grad_norm": 0.181640625, + "learning_rate": 0.0018781449122247096, + "loss": 0.1377, + "step": 16920 + }, + { + "epoch": 0.1468824055346742, + "grad_norm": 0.07958984375, + "learning_rate": 0.0018781299576114024, + "loss": 0.1006, + "step": 16921 + }, + { + "epoch": 0.14689108601487835, + "grad_norm": 0.1689453125, + "learning_rate": 0.0018781150021471443, + "loss": 0.123, + "step": 16922 + }, + { + "epoch": 0.14689976649508252, + "grad_norm": 0.330078125, + "learning_rate": 0.001878100045831951, + "loss": 0.1309, + "step": 16923 + }, + { + "epoch": 0.14690844697528668, + "grad_norm": 0.279296875, + "learning_rate": 
0.0018780850886658395, + "loss": 0.1221, + "step": 16924 + }, + { + "epoch": 0.14691712745549085, + "grad_norm": 0.451171875, + "learning_rate": 0.0018780701306488254, + "loss": 0.1406, + "step": 16925 + }, + { + "epoch": 0.146925807935695, + "grad_norm": 0.427734375, + "learning_rate": 0.001878055171780926, + "loss": 0.1157, + "step": 16926 + }, + { + "epoch": 0.14693448841589918, + "grad_norm": 0.69921875, + "learning_rate": 0.0018780402120621572, + "loss": 0.1621, + "step": 16927 + }, + { + "epoch": 0.14694316889610334, + "grad_norm": 0.59765625, + "learning_rate": 0.0018780252514925347, + "loss": 0.1162, + "step": 16928 + }, + { + "epoch": 0.1469518493763075, + "grad_norm": 0.71484375, + "learning_rate": 0.001878010290072076, + "loss": 0.1143, + "step": 16929 + }, + { + "epoch": 0.14696052985651167, + "grad_norm": 0.26171875, + "learning_rate": 0.0018779953278007966, + "loss": 0.1123, + "step": 16930 + }, + { + "epoch": 0.14696921033671584, + "grad_norm": 0.1474609375, + "learning_rate": 0.0018779803646787135, + "loss": 0.1299, + "step": 16931 + }, + { + "epoch": 0.14697789081692, + "grad_norm": 0.396484375, + "learning_rate": 0.0018779654007058427, + "loss": 0.0835, + "step": 16932 + }, + { + "epoch": 0.14698657129712417, + "grad_norm": 0.333984375, + "learning_rate": 0.0018779504358822005, + "loss": 0.1216, + "step": 16933 + }, + { + "epoch": 0.14699525177732833, + "grad_norm": 1.0859375, + "learning_rate": 0.0018779354702078033, + "loss": 0.1143, + "step": 16934 + }, + { + "epoch": 0.1470039322575325, + "grad_norm": 0.18359375, + "learning_rate": 0.0018779205036826676, + "loss": 0.1426, + "step": 16935 + }, + { + "epoch": 0.14701261273773666, + "grad_norm": 0.51171875, + "learning_rate": 0.00187790553630681, + "loss": 0.1152, + "step": 16936 + }, + { + "epoch": 0.14702129321794083, + "grad_norm": 0.384765625, + "learning_rate": 0.0018778905680802464, + "loss": 0.0991, + "step": 16937 + }, + { + "epoch": 0.147029973698145, + "grad_norm": 0.298828125, + "learning_rate": 0.0018778755990029935, + "loss": 0.1543, + "step": 16938 + }, + { + "epoch": 0.14703865417834916, + "grad_norm": 0.1552734375, + "learning_rate": 0.0018778606290750673, + "loss": 0.1934, + "step": 16939 + }, + { + "epoch": 0.14704733465855332, + "grad_norm": 0.19140625, + "learning_rate": 0.0018778456582964845, + "loss": 0.1035, + "step": 16940 + }, + { + "epoch": 0.1470560151387575, + "grad_norm": 0.51171875, + "learning_rate": 0.0018778306866672613, + "loss": 0.1992, + "step": 16941 + }, + { + "epoch": 0.14706469561896163, + "grad_norm": 0.671875, + "learning_rate": 0.0018778157141874144, + "loss": 0.2188, + "step": 16942 + }, + { + "epoch": 0.1470733760991658, + "grad_norm": 0.23046875, + "learning_rate": 0.0018778007408569598, + "loss": 0.1191, + "step": 16943 + }, + { + "epoch": 0.14708205657936996, + "grad_norm": 1.25, + "learning_rate": 0.001877785766675914, + "loss": 0.1436, + "step": 16944 + }, + { + "epoch": 0.14709073705957412, + "grad_norm": 0.26953125, + "learning_rate": 0.0018777707916442933, + "loss": 0.1182, + "step": 16945 + }, + { + "epoch": 0.1470994175397783, + "grad_norm": 0.2451171875, + "learning_rate": 0.001877755815762114, + "loss": 0.1143, + "step": 16946 + }, + { + "epoch": 0.14710809801998245, + "grad_norm": 0.1845703125, + "learning_rate": 0.0018777408390293928, + "loss": 0.1309, + "step": 16947 + }, + { + "epoch": 0.14711677850018662, + "grad_norm": 0.291015625, + "learning_rate": 0.001877725861446146, + "loss": 0.1338, + "step": 16948 + }, + { + "epoch": 0.14712545898039078, + 
"grad_norm": 0.224609375, + "learning_rate": 0.0018777108830123899, + "loss": 0.0991, + "step": 16949 + }, + { + "epoch": 0.14713413946059495, + "grad_norm": 0.333984375, + "learning_rate": 0.0018776959037281405, + "loss": 0.083, + "step": 16950 + }, + { + "epoch": 0.1471428199407991, + "grad_norm": 0.51953125, + "learning_rate": 0.001877680923593415, + "loss": 0.1543, + "step": 16951 + }, + { + "epoch": 0.14715150042100328, + "grad_norm": 0.205078125, + "learning_rate": 0.001877665942608229, + "loss": 0.1338, + "step": 16952 + }, + { + "epoch": 0.14716018090120744, + "grad_norm": 0.271484375, + "learning_rate": 0.0018776509607725993, + "loss": 0.1357, + "step": 16953 + }, + { + "epoch": 0.1471688613814116, + "grad_norm": 0.19140625, + "learning_rate": 0.0018776359780865424, + "loss": 0.1123, + "step": 16954 + }, + { + "epoch": 0.14717754186161577, + "grad_norm": 0.4296875, + "learning_rate": 0.0018776209945500742, + "loss": 0.1289, + "step": 16955 + }, + { + "epoch": 0.14718622234181994, + "grad_norm": 0.0888671875, + "learning_rate": 0.0018776060101632116, + "loss": 0.1011, + "step": 16956 + }, + { + "epoch": 0.1471949028220241, + "grad_norm": 0.92578125, + "learning_rate": 0.0018775910249259704, + "loss": 0.0991, + "step": 16957 + }, + { + "epoch": 0.14720358330222827, + "grad_norm": 0.318359375, + "learning_rate": 0.0018775760388383676, + "loss": 0.1084, + "step": 16958 + }, + { + "epoch": 0.14721226378243243, + "grad_norm": 0.07958984375, + "learning_rate": 0.001877561051900419, + "loss": 0.1426, + "step": 16959 + }, + { + "epoch": 0.1472209442626366, + "grad_norm": 0.27734375, + "learning_rate": 0.001877546064112142, + "loss": 0.1123, + "step": 16960 + }, + { + "epoch": 0.14722962474284076, + "grad_norm": 0.166015625, + "learning_rate": 0.0018775310754735517, + "loss": 0.1416, + "step": 16961 + }, + { + "epoch": 0.14723830522304493, + "grad_norm": 0.10498046875, + "learning_rate": 0.0018775160859846656, + "loss": 0.1484, + "step": 16962 + }, + { + "epoch": 0.1472469857032491, + "grad_norm": 0.275390625, + "learning_rate": 0.0018775010956454993, + "loss": 0.0854, + "step": 16963 + }, + { + "epoch": 0.14725566618345326, + "grad_norm": 0.384765625, + "learning_rate": 0.0018774861044560693, + "loss": 0.1191, + "step": 16964 + }, + { + "epoch": 0.14726434666365742, + "grad_norm": 0.267578125, + "learning_rate": 0.0018774711124163925, + "loss": 0.1016, + "step": 16965 + }, + { + "epoch": 0.1472730271438616, + "grad_norm": 0.326171875, + "learning_rate": 0.0018774561195264849, + "loss": 0.1123, + "step": 16966 + }, + { + "epoch": 0.14728170762406576, + "grad_norm": 0.404296875, + "learning_rate": 0.0018774411257863625, + "loss": 0.1367, + "step": 16967 + }, + { + "epoch": 0.14729038810426992, + "grad_norm": 0.263671875, + "learning_rate": 0.0018774261311960427, + "loss": 0.124, + "step": 16968 + }, + { + "epoch": 0.14729906858447409, + "grad_norm": 0.09228515625, + "learning_rate": 0.0018774111357555412, + "loss": 0.1299, + "step": 16969 + }, + { + "epoch": 0.14730774906467825, + "grad_norm": 0.2275390625, + "learning_rate": 0.0018773961394648747, + "loss": 0.1455, + "step": 16970 + }, + { + "epoch": 0.14731642954488242, + "grad_norm": 0.66015625, + "learning_rate": 0.0018773811423240595, + "loss": 0.1279, + "step": 16971 + }, + { + "epoch": 0.14732511002508658, + "grad_norm": 0.2578125, + "learning_rate": 0.0018773661443331115, + "loss": 0.0947, + "step": 16972 + }, + { + "epoch": 0.14733379050529075, + "grad_norm": 0.12158203125, + "learning_rate": 0.001877351145492048, + "loss": 0.1206, 
+ "step": 16973 + }, + { + "epoch": 0.1473424709854949, + "grad_norm": 0.21875, + "learning_rate": 0.0018773361458008849, + "loss": 0.104, + "step": 16974 + }, + { + "epoch": 0.14735115146569908, + "grad_norm": 0.40625, + "learning_rate": 0.0018773211452596387, + "loss": 0.1045, + "step": 16975 + }, + { + "epoch": 0.14735983194590324, + "grad_norm": 0.16015625, + "learning_rate": 0.0018773061438683257, + "loss": 0.1465, + "step": 16976 + }, + { + "epoch": 0.1473685124261074, + "grad_norm": 0.1474609375, + "learning_rate": 0.0018772911416269622, + "loss": 0.1006, + "step": 16977 + }, + { + "epoch": 0.14737719290631157, + "grad_norm": 0.380859375, + "learning_rate": 0.001877276138535565, + "loss": 0.1426, + "step": 16978 + }, + { + "epoch": 0.14738587338651574, + "grad_norm": 0.10791015625, + "learning_rate": 0.0018772611345941504, + "loss": 0.1094, + "step": 16979 + }, + { + "epoch": 0.1473945538667199, + "grad_norm": 0.0966796875, + "learning_rate": 0.0018772461298027345, + "loss": 0.1357, + "step": 16980 + }, + { + "epoch": 0.14740323434692407, + "grad_norm": 0.66796875, + "learning_rate": 0.001877231124161334, + "loss": 0.0977, + "step": 16981 + }, + { + "epoch": 0.14741191482712823, + "grad_norm": 0.2421875, + "learning_rate": 0.001877216117669965, + "loss": 0.1348, + "step": 16982 + }, + { + "epoch": 0.1474205953073324, + "grad_norm": 0.85546875, + "learning_rate": 0.0018772011103286444, + "loss": 0.1221, + "step": 16983 + }, + { + "epoch": 0.14742927578753656, + "grad_norm": 0.94140625, + "learning_rate": 0.0018771861021373883, + "loss": 0.1162, + "step": 16984 + }, + { + "epoch": 0.14743795626774073, + "grad_norm": 0.46875, + "learning_rate": 0.0018771710930962132, + "loss": 0.0859, + "step": 16985 + }, + { + "epoch": 0.1474466367479449, + "grad_norm": 1.1328125, + "learning_rate": 0.0018771560832051353, + "loss": 0.1895, + "step": 16986 + }, + { + "epoch": 0.14745531722814906, + "grad_norm": 0.140625, + "learning_rate": 0.001877141072464171, + "loss": 0.1104, + "step": 16987 + }, + { + "epoch": 0.14746399770835322, + "grad_norm": 0.232421875, + "learning_rate": 0.0018771260608733374, + "loss": 0.083, + "step": 16988 + }, + { + "epoch": 0.1474726781885574, + "grad_norm": 0.58203125, + "learning_rate": 0.0018771110484326501, + "loss": 0.0952, + "step": 16989 + }, + { + "epoch": 0.14748135866876155, + "grad_norm": 0.1259765625, + "learning_rate": 0.001877096035142126, + "loss": 0.1147, + "step": 16990 + }, + { + "epoch": 0.14749003914896572, + "grad_norm": 0.234375, + "learning_rate": 0.0018770810210017813, + "loss": 0.0908, + "step": 16991 + }, + { + "epoch": 0.14749871962916988, + "grad_norm": 0.53515625, + "learning_rate": 0.0018770660060116325, + "loss": 0.1611, + "step": 16992 + }, + { + "epoch": 0.14750740010937405, + "grad_norm": 0.62890625, + "learning_rate": 0.0018770509901716956, + "loss": 0.1074, + "step": 16993 + }, + { + "epoch": 0.1475160805895782, + "grad_norm": 0.30078125, + "learning_rate": 0.001877035973481988, + "loss": 0.1143, + "step": 16994 + }, + { + "epoch": 0.14752476106978238, + "grad_norm": 0.1708984375, + "learning_rate": 0.0018770209559425253, + "loss": 0.1357, + "step": 16995 + }, + { + "epoch": 0.14753344154998654, + "grad_norm": 0.11865234375, + "learning_rate": 0.0018770059375533242, + "loss": 0.1035, + "step": 16996 + }, + { + "epoch": 0.1475421220301907, + "grad_norm": 0.6171875, + "learning_rate": 0.001876990918314401, + "loss": 0.1201, + "step": 16997 + }, + { + "epoch": 0.14755080251039487, + "grad_norm": 0.345703125, + "learning_rate": 
0.001876975898225772, + "loss": 0.1367, + "step": 16998 + }, + { + "epoch": 0.14755948299059904, + "grad_norm": 0.1484375, + "learning_rate": 0.0018769608772874543, + "loss": 0.1182, + "step": 16999 + }, + { + "epoch": 0.1475681634708032, + "grad_norm": 0.1650390625, + "learning_rate": 0.0018769458554994635, + "loss": 0.1348, + "step": 17000 + }, + { + "epoch": 0.14757684395100737, + "grad_norm": 0.40234375, + "learning_rate": 0.0018769308328618166, + "loss": 0.125, + "step": 17001 + }, + { + "epoch": 0.14758552443121153, + "grad_norm": 0.181640625, + "learning_rate": 0.0018769158093745295, + "loss": 0.0874, + "step": 17002 + }, + { + "epoch": 0.1475942049114157, + "grad_norm": 0.427734375, + "learning_rate": 0.0018769007850376193, + "loss": 0.0781, + "step": 17003 + }, + { + "epoch": 0.14760288539161986, + "grad_norm": 0.484375, + "learning_rate": 0.0018768857598511018, + "loss": 0.1406, + "step": 17004 + }, + { + "epoch": 0.14761156587182403, + "grad_norm": 0.7265625, + "learning_rate": 0.001876870733814994, + "loss": 0.1016, + "step": 17005 + }, + { + "epoch": 0.1476202463520282, + "grad_norm": 0.1748046875, + "learning_rate": 0.0018768557069293117, + "loss": 0.1113, + "step": 17006 + }, + { + "epoch": 0.14762892683223236, + "grad_norm": 0.08349609375, + "learning_rate": 0.001876840679194072, + "loss": 0.1177, + "step": 17007 + }, + { + "epoch": 0.14763760731243653, + "grad_norm": 0.14453125, + "learning_rate": 0.001876825650609291, + "loss": 0.1426, + "step": 17008 + }, + { + "epoch": 0.1476462877926407, + "grad_norm": 0.23828125, + "learning_rate": 0.0018768106211749847, + "loss": 0.0977, + "step": 17009 + }, + { + "epoch": 0.14765496827284486, + "grad_norm": 0.1591796875, + "learning_rate": 0.0018767955908911705, + "loss": 0.1025, + "step": 17010 + }, + { + "epoch": 0.14766364875304902, + "grad_norm": 0.4609375, + "learning_rate": 0.001876780559757864, + "loss": 0.1309, + "step": 17011 + }, + { + "epoch": 0.14767232923325319, + "grad_norm": 0.2490234375, + "learning_rate": 0.0018767655277750822, + "loss": 0.1484, + "step": 17012 + }, + { + "epoch": 0.14768100971345735, + "grad_norm": 0.220703125, + "learning_rate": 0.001876750494942841, + "loss": 0.1299, + "step": 17013 + }, + { + "epoch": 0.14768969019366152, + "grad_norm": 0.61328125, + "learning_rate": 0.0018767354612611571, + "loss": 0.126, + "step": 17014 + }, + { + "epoch": 0.14769837067386568, + "grad_norm": 0.5546875, + "learning_rate": 0.001876720426730047, + "loss": 0.1416, + "step": 17015 + }, + { + "epoch": 0.14770705115406985, + "grad_norm": 0.224609375, + "learning_rate": 0.0018767053913495274, + "loss": 0.1226, + "step": 17016 + }, + { + "epoch": 0.147715731634274, + "grad_norm": 0.5390625, + "learning_rate": 0.0018766903551196141, + "loss": 0.0898, + "step": 17017 + }, + { + "epoch": 0.14772441211447818, + "grad_norm": 0.1767578125, + "learning_rate": 0.0018766753180403245, + "loss": 0.1426, + "step": 17018 + }, + { + "epoch": 0.14773309259468234, + "grad_norm": 0.275390625, + "learning_rate": 0.001876660280111674, + "loss": 0.1514, + "step": 17019 + }, + { + "epoch": 0.1477417730748865, + "grad_norm": 0.5625, + "learning_rate": 0.0018766452413336793, + "loss": 0.168, + "step": 17020 + }, + { + "epoch": 0.14775045355509067, + "grad_norm": 0.09326171875, + "learning_rate": 0.0018766302017063571, + "loss": 0.1016, + "step": 17021 + }, + { + "epoch": 0.14775913403529484, + "grad_norm": 0.52734375, + "learning_rate": 0.001876615161229724, + "loss": 0.106, + "step": 17022 + }, + { + "epoch": 0.147767814515499, + 
"grad_norm": 0.80078125, + "learning_rate": 0.0018766001199037963, + "loss": 0.166, + "step": 17023 + }, + { + "epoch": 0.14777649499570317, + "grad_norm": 0.404296875, + "learning_rate": 0.0018765850777285901, + "loss": 0.1855, + "step": 17024 + }, + { + "epoch": 0.14778517547590733, + "grad_norm": 0.1162109375, + "learning_rate": 0.0018765700347041223, + "loss": 0.0986, + "step": 17025 + }, + { + "epoch": 0.1477938559561115, + "grad_norm": 0.28125, + "learning_rate": 0.0018765549908304092, + "loss": 0.1436, + "step": 17026 + }, + { + "epoch": 0.14780253643631566, + "grad_norm": 0.318359375, + "learning_rate": 0.001876539946107467, + "loss": 0.166, + "step": 17027 + }, + { + "epoch": 0.14781121691651983, + "grad_norm": 0.427734375, + "learning_rate": 0.0018765249005353128, + "loss": 0.1445, + "step": 17028 + }, + { + "epoch": 0.147819897396724, + "grad_norm": 0.16796875, + "learning_rate": 0.0018765098541139622, + "loss": 0.1396, + "step": 17029 + }, + { + "epoch": 0.14782857787692816, + "grad_norm": 0.1533203125, + "learning_rate": 0.0018764948068434324, + "loss": 0.082, + "step": 17030 + }, + { + "epoch": 0.14783725835713232, + "grad_norm": 0.080078125, + "learning_rate": 0.0018764797587237393, + "loss": 0.1182, + "step": 17031 + }, + { + "epoch": 0.1478459388373365, + "grad_norm": 0.46484375, + "learning_rate": 0.0018764647097548996, + "loss": 0.0977, + "step": 17032 + }, + { + "epoch": 0.14785461931754065, + "grad_norm": 0.24609375, + "learning_rate": 0.00187644965993693, + "loss": 0.1348, + "step": 17033 + }, + { + "epoch": 0.14786329979774482, + "grad_norm": 0.10888671875, + "learning_rate": 0.0018764346092698468, + "loss": 0.1069, + "step": 17034 + }, + { + "epoch": 0.14787198027794898, + "grad_norm": 0.423828125, + "learning_rate": 0.001876419557753666, + "loss": 0.1387, + "step": 17035 + }, + { + "epoch": 0.14788066075815315, + "grad_norm": 0.140625, + "learning_rate": 0.0018764045053884049, + "loss": 0.1113, + "step": 17036 + }, + { + "epoch": 0.1478893412383573, + "grad_norm": 0.2021484375, + "learning_rate": 0.001876389452174079, + "loss": 0.1309, + "step": 17037 + }, + { + "epoch": 0.14789802171856148, + "grad_norm": 0.3125, + "learning_rate": 0.0018763743981107055, + "loss": 0.1011, + "step": 17038 + }, + { + "epoch": 0.14790670219876564, + "grad_norm": 0.10009765625, + "learning_rate": 0.0018763593431983007, + "loss": 0.0781, + "step": 17039 + }, + { + "epoch": 0.1479153826789698, + "grad_norm": 0.48828125, + "learning_rate": 0.0018763442874368808, + "loss": 0.2041, + "step": 17040 + }, + { + "epoch": 0.14792406315917397, + "grad_norm": 0.107421875, + "learning_rate": 0.0018763292308264623, + "loss": 0.1211, + "step": 17041 + }, + { + "epoch": 0.14793274363937814, + "grad_norm": 0.1767578125, + "learning_rate": 0.001876314173367062, + "loss": 0.124, + "step": 17042 + }, + { + "epoch": 0.1479414241195823, + "grad_norm": 0.115234375, + "learning_rate": 0.0018762991150586964, + "loss": 0.1147, + "step": 17043 + }, + { + "epoch": 0.14795010459978647, + "grad_norm": 0.31640625, + "learning_rate": 0.0018762840559013816, + "loss": 0.1172, + "step": 17044 + }, + { + "epoch": 0.14795878507999063, + "grad_norm": 0.3125, + "learning_rate": 0.0018762689958951345, + "loss": 0.1289, + "step": 17045 + }, + { + "epoch": 0.1479674655601948, + "grad_norm": 0.51953125, + "learning_rate": 0.0018762539350399708, + "loss": 0.1211, + "step": 17046 + }, + { + "epoch": 0.14797614604039896, + "grad_norm": 0.431640625, + "learning_rate": 0.0018762388733359076, + "loss": 0.1035, + "step": 17047 + }, + 
{ + "epoch": 0.14798482652060313, + "grad_norm": 0.40234375, + "learning_rate": 0.0018762238107829612, + "loss": 0.1621, + "step": 17048 + }, + { + "epoch": 0.1479935070008073, + "grad_norm": 0.57421875, + "learning_rate": 0.0018762087473811484, + "loss": 0.0947, + "step": 17049 + }, + { + "epoch": 0.14800218748101146, + "grad_norm": 0.70703125, + "learning_rate": 0.001876193683130485, + "loss": 0.1631, + "step": 17050 + }, + { + "epoch": 0.14801086796121563, + "grad_norm": 0.263671875, + "learning_rate": 0.0018761786180309882, + "loss": 0.1162, + "step": 17051 + }, + { + "epoch": 0.1480195484414198, + "grad_norm": 0.1376953125, + "learning_rate": 0.0018761635520826738, + "loss": 0.0962, + "step": 17052 + }, + { + "epoch": 0.14802822892162396, + "grad_norm": 0.21875, + "learning_rate": 0.0018761484852855591, + "loss": 0.1152, + "step": 17053 + }, + { + "epoch": 0.14803690940182812, + "grad_norm": 0.5859375, + "learning_rate": 0.0018761334176396597, + "loss": 0.0874, + "step": 17054 + }, + { + "epoch": 0.14804558988203229, + "grad_norm": 0.9609375, + "learning_rate": 0.0018761183491449922, + "loss": 0.1025, + "step": 17055 + }, + { + "epoch": 0.14805427036223645, + "grad_norm": 0.17578125, + "learning_rate": 0.0018761032798015737, + "loss": 0.1143, + "step": 17056 + }, + { + "epoch": 0.14806295084244062, + "grad_norm": 0.26171875, + "learning_rate": 0.0018760882096094202, + "loss": 0.1123, + "step": 17057 + }, + { + "epoch": 0.14807163132264478, + "grad_norm": 0.353515625, + "learning_rate": 0.0018760731385685481, + "loss": 0.1377, + "step": 17058 + }, + { + "epoch": 0.14808031180284895, + "grad_norm": 0.30078125, + "learning_rate": 0.0018760580666789745, + "loss": 0.1562, + "step": 17059 + }, + { + "epoch": 0.1480889922830531, + "grad_norm": 0.404296875, + "learning_rate": 0.0018760429939407154, + "loss": 0.1143, + "step": 17060 + }, + { + "epoch": 0.14809767276325728, + "grad_norm": 0.412109375, + "learning_rate": 0.0018760279203537868, + "loss": 0.1104, + "step": 17061 + }, + { + "epoch": 0.14810635324346144, + "grad_norm": 0.130859375, + "learning_rate": 0.0018760128459182064, + "loss": 0.1221, + "step": 17062 + }, + { + "epoch": 0.1481150337236656, + "grad_norm": 0.1181640625, + "learning_rate": 0.0018759977706339897, + "loss": 0.1553, + "step": 17063 + }, + { + "epoch": 0.14812371420386977, + "grad_norm": 0.81640625, + "learning_rate": 0.0018759826945011536, + "loss": 0.1445, + "step": 17064 + }, + { + "epoch": 0.1481323946840739, + "grad_norm": 0.373046875, + "learning_rate": 0.0018759676175197143, + "loss": 0.1406, + "step": 17065 + }, + { + "epoch": 0.14814107516427807, + "grad_norm": 0.1728515625, + "learning_rate": 0.0018759525396896885, + "loss": 0.1328, + "step": 17066 + }, + { + "epoch": 0.14814975564448224, + "grad_norm": 0.345703125, + "learning_rate": 0.0018759374610110926, + "loss": 0.1279, + "step": 17067 + }, + { + "epoch": 0.1481584361246864, + "grad_norm": 0.11865234375, + "learning_rate": 0.0018759223814839435, + "loss": 0.1094, + "step": 17068 + }, + { + "epoch": 0.14816711660489057, + "grad_norm": 0.2099609375, + "learning_rate": 0.0018759073011082573, + "loss": 0.1445, + "step": 17069 + }, + { + "epoch": 0.14817579708509473, + "grad_norm": 0.44140625, + "learning_rate": 0.0018758922198840503, + "loss": 0.1465, + "step": 17070 + }, + { + "epoch": 0.1481844775652989, + "grad_norm": 0.294921875, + "learning_rate": 0.0018758771378113393, + "loss": 0.1895, + "step": 17071 + }, + { + "epoch": 0.14819315804550307, + "grad_norm": 0.60546875, + "learning_rate": 
0.0018758620548901409, + "loss": 0.1016, + "step": 17072 + }, + { + "epoch": 0.14820183852570723, + "grad_norm": 0.0927734375, + "learning_rate": 0.0018758469711204712, + "loss": 0.1201, + "step": 17073 + }, + { + "epoch": 0.1482105190059114, + "grad_norm": 0.1298828125, + "learning_rate": 0.0018758318865023467, + "loss": 0.1484, + "step": 17074 + }, + { + "epoch": 0.14821919948611556, + "grad_norm": 0.28515625, + "learning_rate": 0.0018758168010357843, + "loss": 0.1152, + "step": 17075 + }, + { + "epoch": 0.14822787996631973, + "grad_norm": 0.0986328125, + "learning_rate": 0.0018758017147208004, + "loss": 0.1221, + "step": 17076 + }, + { + "epoch": 0.1482365604465239, + "grad_norm": 0.263671875, + "learning_rate": 0.0018757866275574114, + "loss": 0.1104, + "step": 17077 + }, + { + "epoch": 0.14824524092672806, + "grad_norm": 0.291015625, + "learning_rate": 0.0018757715395456337, + "loss": 0.1309, + "step": 17078 + }, + { + "epoch": 0.14825392140693222, + "grad_norm": 0.78125, + "learning_rate": 0.001875756450685484, + "loss": 0.1201, + "step": 17079 + }, + { + "epoch": 0.14826260188713639, + "grad_norm": 0.1640625, + "learning_rate": 0.001875741360976979, + "loss": 0.1113, + "step": 17080 + }, + { + "epoch": 0.14827128236734055, + "grad_norm": 0.388671875, + "learning_rate": 0.0018757262704201343, + "loss": 0.1611, + "step": 17081 + }, + { + "epoch": 0.14827996284754472, + "grad_norm": 0.453125, + "learning_rate": 0.0018757111790149673, + "loss": 0.0903, + "step": 17082 + }, + { + "epoch": 0.14828864332774888, + "grad_norm": 0.158203125, + "learning_rate": 0.0018756960867614943, + "loss": 0.1025, + "step": 17083 + }, + { + "epoch": 0.14829732380795305, + "grad_norm": 0.142578125, + "learning_rate": 0.0018756809936597313, + "loss": 0.0957, + "step": 17084 + }, + { + "epoch": 0.1483060042881572, + "grad_norm": 0.12353515625, + "learning_rate": 0.0018756658997096955, + "loss": 0.124, + "step": 17085 + }, + { + "epoch": 0.14831468476836138, + "grad_norm": 0.1728515625, + "learning_rate": 0.0018756508049114033, + "loss": 0.1348, + "step": 17086 + }, + { + "epoch": 0.14832336524856554, + "grad_norm": 0.2890625, + "learning_rate": 0.0018756357092648709, + "loss": 0.0986, + "step": 17087 + }, + { + "epoch": 0.1483320457287697, + "grad_norm": 0.359375, + "learning_rate": 0.0018756206127701147, + "loss": 0.1309, + "step": 17088 + }, + { + "epoch": 0.14834072620897387, + "grad_norm": 0.373046875, + "learning_rate": 0.001875605515427152, + "loss": 0.0938, + "step": 17089 + }, + { + "epoch": 0.14834940668917804, + "grad_norm": 0.29296875, + "learning_rate": 0.0018755904172359983, + "loss": 0.127, + "step": 17090 + }, + { + "epoch": 0.1483580871693822, + "grad_norm": 0.58203125, + "learning_rate": 0.0018755753181966706, + "loss": 0.1318, + "step": 17091 + }, + { + "epoch": 0.14836676764958637, + "grad_norm": 0.1982421875, + "learning_rate": 0.0018755602183091857, + "loss": 0.0981, + "step": 17092 + }, + { + "epoch": 0.14837544812979053, + "grad_norm": 0.1640625, + "learning_rate": 0.0018755451175735593, + "loss": 0.124, + "step": 17093 + }, + { + "epoch": 0.1483841286099947, + "grad_norm": 0.59765625, + "learning_rate": 0.0018755300159898087, + "loss": 0.1177, + "step": 17094 + }, + { + "epoch": 0.14839280909019886, + "grad_norm": 0.17578125, + "learning_rate": 0.0018755149135579504, + "loss": 0.1426, + "step": 17095 + }, + { + "epoch": 0.14840148957040303, + "grad_norm": 0.55078125, + "learning_rate": 0.0018754998102780004, + "loss": 0.1104, + "step": 17096 + }, + { + "epoch": 0.1484101700506072, + 
"grad_norm": 0.421875, + "learning_rate": 0.0018754847061499753, + "loss": 0.1094, + "step": 17097 + }, + { + "epoch": 0.14841885053081136, + "grad_norm": 0.083984375, + "learning_rate": 0.001875469601173892, + "loss": 0.0977, + "step": 17098 + }, + { + "epoch": 0.14842753101101552, + "grad_norm": 0.62890625, + "learning_rate": 0.0018754544953497668, + "loss": 0.1309, + "step": 17099 + }, + { + "epoch": 0.1484362114912197, + "grad_norm": 0.4375, + "learning_rate": 0.0018754393886776157, + "loss": 0.1982, + "step": 17100 + }, + { + "epoch": 0.14844489197142385, + "grad_norm": 0.484375, + "learning_rate": 0.0018754242811574564, + "loss": 0.0811, + "step": 17101 + }, + { + "epoch": 0.14845357245162802, + "grad_norm": 0.2255859375, + "learning_rate": 0.0018754091727893043, + "loss": 0.1016, + "step": 17102 + }, + { + "epoch": 0.14846225293183218, + "grad_norm": 0.1650390625, + "learning_rate": 0.0018753940635731765, + "loss": 0.1387, + "step": 17103 + }, + { + "epoch": 0.14847093341203635, + "grad_norm": 0.38671875, + "learning_rate": 0.0018753789535090896, + "loss": 0.0874, + "step": 17104 + }, + { + "epoch": 0.14847961389224051, + "grad_norm": 0.23046875, + "learning_rate": 0.0018753638425970597, + "loss": 0.0815, + "step": 17105 + }, + { + "epoch": 0.14848829437244468, + "grad_norm": 0.96875, + "learning_rate": 0.0018753487308371037, + "loss": 0.1074, + "step": 17106 + }, + { + "epoch": 0.14849697485264884, + "grad_norm": 0.609375, + "learning_rate": 0.001875333618229238, + "loss": 0.0933, + "step": 17107 + }, + { + "epoch": 0.148505655332853, + "grad_norm": 0.130859375, + "learning_rate": 0.001875318504773479, + "loss": 0.1367, + "step": 17108 + }, + { + "epoch": 0.14851433581305717, + "grad_norm": 0.10693359375, + "learning_rate": 0.0018753033904698436, + "loss": 0.1387, + "step": 17109 + }, + { + "epoch": 0.14852301629326134, + "grad_norm": 0.0849609375, + "learning_rate": 0.0018752882753183477, + "loss": 0.105, + "step": 17110 + }, + { + "epoch": 0.1485316967734655, + "grad_norm": 0.1240234375, + "learning_rate": 0.0018752731593190083, + "loss": 0.1455, + "step": 17111 + }, + { + "epoch": 0.14854037725366967, + "grad_norm": 0.353515625, + "learning_rate": 0.0018752580424718417, + "loss": 0.1055, + "step": 17112 + }, + { + "epoch": 0.14854905773387383, + "grad_norm": 0.56640625, + "learning_rate": 0.001875242924776865, + "loss": 0.1592, + "step": 17113 + }, + { + "epoch": 0.148557738214078, + "grad_norm": 0.322265625, + "learning_rate": 0.001875227806234094, + "loss": 0.1143, + "step": 17114 + }, + { + "epoch": 0.14856641869428217, + "grad_norm": 0.62890625, + "learning_rate": 0.0018752126868435456, + "loss": 0.124, + "step": 17115 + }, + { + "epoch": 0.14857509917448633, + "grad_norm": 0.54296875, + "learning_rate": 0.0018751975666052361, + "loss": 0.0869, + "step": 17116 + }, + { + "epoch": 0.1485837796546905, + "grad_norm": 1.2421875, + "learning_rate": 0.0018751824455191819, + "loss": 0.1064, + "step": 17117 + }, + { + "epoch": 0.14859246013489466, + "grad_norm": 0.47265625, + "learning_rate": 0.0018751673235854005, + "loss": 0.125, + "step": 17118 + }, + { + "epoch": 0.14860114061509883, + "grad_norm": 0.333984375, + "learning_rate": 0.0018751522008039075, + "loss": 0.1289, + "step": 17119 + }, + { + "epoch": 0.148609821095303, + "grad_norm": 0.490234375, + "learning_rate": 0.0018751370771747196, + "loss": 0.1426, + "step": 17120 + }, + { + "epoch": 0.14861850157550716, + "grad_norm": 0.109375, + "learning_rate": 0.0018751219526978535, + "loss": 0.1113, + "step": 17121 + }, + { + 
"epoch": 0.14862718205571132, + "grad_norm": 0.181640625, + "learning_rate": 0.0018751068273733253, + "loss": 0.0957, + "step": 17122 + }, + { + "epoch": 0.1486358625359155, + "grad_norm": 0.1806640625, + "learning_rate": 0.0018750917012011524, + "loss": 0.1021, + "step": 17123 + }, + { + "epoch": 0.14864454301611965, + "grad_norm": 0.287109375, + "learning_rate": 0.001875076574181351, + "loss": 0.1367, + "step": 17124 + }, + { + "epoch": 0.14865322349632382, + "grad_norm": 0.115234375, + "learning_rate": 0.0018750614463139371, + "loss": 0.1201, + "step": 17125 + }, + { + "epoch": 0.14866190397652798, + "grad_norm": 0.54296875, + "learning_rate": 0.0018750463175989276, + "loss": 0.1211, + "step": 17126 + }, + { + "epoch": 0.14867058445673215, + "grad_norm": 0.26171875, + "learning_rate": 0.0018750311880363394, + "loss": 0.0894, + "step": 17127 + }, + { + "epoch": 0.1486792649369363, + "grad_norm": 0.130859375, + "learning_rate": 0.0018750160576261885, + "loss": 0.1855, + "step": 17128 + }, + { + "epoch": 0.14868794541714048, + "grad_norm": 0.09521484375, + "learning_rate": 0.001875000926368492, + "loss": 0.1113, + "step": 17129 + }, + { + "epoch": 0.14869662589734464, + "grad_norm": 0.48828125, + "learning_rate": 0.001874985794263266, + "loss": 0.1211, + "step": 17130 + }, + { + "epoch": 0.1487053063775488, + "grad_norm": 0.0888671875, + "learning_rate": 0.001874970661310527, + "loss": 0.0918, + "step": 17131 + }, + { + "epoch": 0.14871398685775297, + "grad_norm": 0.1181640625, + "learning_rate": 0.0018749555275102917, + "loss": 0.1191, + "step": 17132 + }, + { + "epoch": 0.14872266733795714, + "grad_norm": 0.1318359375, + "learning_rate": 0.001874940392862577, + "loss": 0.124, + "step": 17133 + }, + { + "epoch": 0.1487313478181613, + "grad_norm": 0.482421875, + "learning_rate": 0.001874925257367399, + "loss": 0.1377, + "step": 17134 + }, + { + "epoch": 0.14874002829836547, + "grad_norm": 0.2431640625, + "learning_rate": 0.001874910121024774, + "loss": 0.1177, + "step": 17135 + }, + { + "epoch": 0.14874870877856963, + "grad_norm": 0.1572265625, + "learning_rate": 0.0018748949838347196, + "loss": 0.1133, + "step": 17136 + }, + { + "epoch": 0.1487573892587738, + "grad_norm": 0.7890625, + "learning_rate": 0.0018748798457972511, + "loss": 0.0986, + "step": 17137 + }, + { + "epoch": 0.14876606973897796, + "grad_norm": 0.369140625, + "learning_rate": 0.0018748647069123858, + "loss": 0.1211, + "step": 17138 + }, + { + "epoch": 0.14877475021918213, + "grad_norm": 0.09375, + "learning_rate": 0.0018748495671801404, + "loss": 0.0977, + "step": 17139 + }, + { + "epoch": 0.1487834306993863, + "grad_norm": 0.138671875, + "learning_rate": 0.001874834426600531, + "loss": 0.0942, + "step": 17140 + }, + { + "epoch": 0.14879211117959046, + "grad_norm": 0.38671875, + "learning_rate": 0.0018748192851735742, + "loss": 0.1084, + "step": 17141 + }, + { + "epoch": 0.14880079165979462, + "grad_norm": 0.166015625, + "learning_rate": 0.001874804142899287, + "loss": 0.125, + "step": 17142 + }, + { + "epoch": 0.1488094721399988, + "grad_norm": 0.0908203125, + "learning_rate": 0.001874788999777685, + "loss": 0.1123, + "step": 17143 + }, + { + "epoch": 0.14881815262020295, + "grad_norm": 0.1396484375, + "learning_rate": 0.001874773855808786, + "loss": 0.1064, + "step": 17144 + }, + { + "epoch": 0.14882683310040712, + "grad_norm": 0.5234375, + "learning_rate": 0.0018747587109926057, + "loss": 0.1309, + "step": 17145 + }, + { + "epoch": 0.14883551358061128, + "grad_norm": 0.296875, + "learning_rate": 0.0018747435653291608, 
+ "loss": 0.1162, + "step": 17146 + }, + { + "epoch": 0.14884419406081545, + "grad_norm": 0.07177734375, + "learning_rate": 0.001874728418818468, + "loss": 0.1089, + "step": 17147 + }, + { + "epoch": 0.14885287454101961, + "grad_norm": 0.46875, + "learning_rate": 0.001874713271460544, + "loss": 0.1846, + "step": 17148 + }, + { + "epoch": 0.14886155502122378, + "grad_norm": 0.2890625, + "learning_rate": 0.0018746981232554052, + "loss": 0.1182, + "step": 17149 + }, + { + "epoch": 0.14887023550142794, + "grad_norm": 0.2353515625, + "learning_rate": 0.0018746829742030682, + "loss": 0.1582, + "step": 17150 + }, + { + "epoch": 0.1488789159816321, + "grad_norm": 0.1572265625, + "learning_rate": 0.0018746678243035492, + "loss": 0.1118, + "step": 17151 + }, + { + "epoch": 0.14888759646183627, + "grad_norm": 0.29296875, + "learning_rate": 0.0018746526735568653, + "loss": 0.0737, + "step": 17152 + }, + { + "epoch": 0.14889627694204044, + "grad_norm": 0.87109375, + "learning_rate": 0.0018746375219630333, + "loss": 0.127, + "step": 17153 + }, + { + "epoch": 0.1489049574222446, + "grad_norm": 0.1337890625, + "learning_rate": 0.001874622369522069, + "loss": 0.168, + "step": 17154 + }, + { + "epoch": 0.14891363790244877, + "grad_norm": 0.291015625, + "learning_rate": 0.0018746072162339892, + "loss": 0.1289, + "step": 17155 + }, + { + "epoch": 0.14892231838265294, + "grad_norm": 0.08251953125, + "learning_rate": 0.0018745920620988107, + "loss": 0.1016, + "step": 17156 + }, + { + "epoch": 0.1489309988628571, + "grad_norm": 0.2109375, + "learning_rate": 0.00187457690711655, + "loss": 0.1641, + "step": 17157 + }, + { + "epoch": 0.14893967934306127, + "grad_norm": 0.1142578125, + "learning_rate": 0.0018745617512872235, + "loss": 0.1025, + "step": 17158 + }, + { + "epoch": 0.14894835982326543, + "grad_norm": 0.255859375, + "learning_rate": 0.001874546594610848, + "loss": 0.0938, + "step": 17159 + }, + { + "epoch": 0.1489570403034696, + "grad_norm": 0.15625, + "learning_rate": 0.0018745314370874398, + "loss": 0.166, + "step": 17160 + }, + { + "epoch": 0.14896572078367376, + "grad_norm": 0.65234375, + "learning_rate": 0.001874516278717016, + "loss": 0.1777, + "step": 17161 + }, + { + "epoch": 0.14897440126387793, + "grad_norm": 0.83203125, + "learning_rate": 0.0018745011194995927, + "loss": 0.1348, + "step": 17162 + }, + { + "epoch": 0.1489830817440821, + "grad_norm": 0.18359375, + "learning_rate": 0.0018744859594351864, + "loss": 0.123, + "step": 17163 + }, + { + "epoch": 0.14899176222428626, + "grad_norm": 0.171875, + "learning_rate": 0.0018744707985238142, + "loss": 0.1045, + "step": 17164 + }, + { + "epoch": 0.14900044270449042, + "grad_norm": 0.2119140625, + "learning_rate": 0.001874455636765492, + "loss": 0.1318, + "step": 17165 + }, + { + "epoch": 0.1490091231846946, + "grad_norm": 0.33203125, + "learning_rate": 0.001874440474160237, + "loss": 0.1396, + "step": 17166 + }, + { + "epoch": 0.14901780366489875, + "grad_norm": 0.37109375, + "learning_rate": 0.0018744253107080655, + "loss": 0.1084, + "step": 17167 + }, + { + "epoch": 0.14902648414510292, + "grad_norm": 0.1357421875, + "learning_rate": 0.0018744101464089945, + "loss": 0.1123, + "step": 17168 + }, + { + "epoch": 0.14903516462530708, + "grad_norm": 0.2333984375, + "learning_rate": 0.0018743949812630397, + "loss": 0.1108, + "step": 17169 + }, + { + "epoch": 0.14904384510551125, + "grad_norm": 0.2734375, + "learning_rate": 0.001874379815270218, + "loss": 0.1016, + "step": 17170 + }, + { + "epoch": 0.1490525255857154, + "grad_norm": 0.71875, + 
"learning_rate": 0.0018743646484305468, + "loss": 0.0928, + "step": 17171 + }, + { + "epoch": 0.14906120606591958, + "grad_norm": 0.09619140625, + "learning_rate": 0.0018743494807440416, + "loss": 0.0889, + "step": 17172 + }, + { + "epoch": 0.14906988654612374, + "grad_norm": 0.337890625, + "learning_rate": 0.0018743343122107196, + "loss": 0.1689, + "step": 17173 + }, + { + "epoch": 0.1490785670263279, + "grad_norm": 0.59765625, + "learning_rate": 0.0018743191428305976, + "loss": 0.1064, + "step": 17174 + }, + { + "epoch": 0.14908724750653207, + "grad_norm": 1.359375, + "learning_rate": 0.001874303972603691, + "loss": 0.1143, + "step": 17175 + }, + { + "epoch": 0.14909592798673624, + "grad_norm": 0.2412109375, + "learning_rate": 0.0018742888015300177, + "loss": 0.127, + "step": 17176 + }, + { + "epoch": 0.1491046084669404, + "grad_norm": 0.1962890625, + "learning_rate": 0.0018742736296095937, + "loss": 0.1416, + "step": 17177 + }, + { + "epoch": 0.14911328894714457, + "grad_norm": 0.059814453125, + "learning_rate": 0.0018742584568424358, + "loss": 0.0728, + "step": 17178 + }, + { + "epoch": 0.14912196942734873, + "grad_norm": 0.16796875, + "learning_rate": 0.0018742432832285604, + "loss": 0.1133, + "step": 17179 + }, + { + "epoch": 0.1491306499075529, + "grad_norm": 0.42578125, + "learning_rate": 0.001874228108767984, + "loss": 0.1396, + "step": 17180 + }, + { + "epoch": 0.14913933038775706, + "grad_norm": 0.2421875, + "learning_rate": 0.0018742129334607235, + "loss": 0.1191, + "step": 17181 + }, + { + "epoch": 0.14914801086796123, + "grad_norm": 0.39453125, + "learning_rate": 0.0018741977573067956, + "loss": 0.1543, + "step": 17182 + }, + { + "epoch": 0.1491566913481654, + "grad_norm": 0.21875, + "learning_rate": 0.0018741825803062168, + "loss": 0.1143, + "step": 17183 + }, + { + "epoch": 0.14916537182836956, + "grad_norm": 0.296875, + "learning_rate": 0.0018741674024590031, + "loss": 0.0967, + "step": 17184 + }, + { + "epoch": 0.14917405230857372, + "grad_norm": 0.251953125, + "learning_rate": 0.0018741522237651718, + "loss": 0.1099, + "step": 17185 + }, + { + "epoch": 0.1491827327887779, + "grad_norm": 0.3125, + "learning_rate": 0.0018741370442247392, + "loss": 0.083, + "step": 17186 + }, + { + "epoch": 0.14919141326898205, + "grad_norm": 0.2421875, + "learning_rate": 0.0018741218638377217, + "loss": 0.1182, + "step": 17187 + }, + { + "epoch": 0.1492000937491862, + "grad_norm": 0.416015625, + "learning_rate": 0.0018741066826041366, + "loss": 0.0898, + "step": 17188 + }, + { + "epoch": 0.14920877422939036, + "grad_norm": 0.138671875, + "learning_rate": 0.0018740915005239999, + "loss": 0.1689, + "step": 17189 + }, + { + "epoch": 0.14921745470959452, + "grad_norm": 0.5, + "learning_rate": 0.0018740763175973281, + "loss": 0.1099, + "step": 17190 + }, + { + "epoch": 0.1492261351897987, + "grad_norm": 0.0703125, + "learning_rate": 0.0018740611338241384, + "loss": 0.0996, + "step": 17191 + }, + { + "epoch": 0.14923481567000285, + "grad_norm": 0.2890625, + "learning_rate": 0.0018740459492044468, + "loss": 0.1035, + "step": 17192 + }, + { + "epoch": 0.14924349615020702, + "grad_norm": 0.6171875, + "learning_rate": 0.0018740307637382706, + "loss": 0.1143, + "step": 17193 + }, + { + "epoch": 0.14925217663041118, + "grad_norm": 0.486328125, + "learning_rate": 0.0018740155774256257, + "loss": 0.0815, + "step": 17194 + }, + { + "epoch": 0.14926085711061535, + "grad_norm": 0.1376953125, + "learning_rate": 0.001874000390266529, + "loss": 0.123, + "step": 17195 + }, + { + "epoch": 0.1492695375908195, + 
"grad_norm": 0.55859375, + "learning_rate": 0.0018739852022609971, + "loss": 0.1133, + "step": 17196 + }, + { + "epoch": 0.14927821807102368, + "grad_norm": 0.10546875, + "learning_rate": 0.0018739700134090466, + "loss": 0.1021, + "step": 17197 + }, + { + "epoch": 0.14928689855122784, + "grad_norm": 0.640625, + "learning_rate": 0.0018739548237106944, + "loss": 0.1084, + "step": 17198 + }, + { + "epoch": 0.149295579031432, + "grad_norm": 0.306640625, + "learning_rate": 0.0018739396331659563, + "loss": 0.1396, + "step": 17199 + }, + { + "epoch": 0.14930425951163617, + "grad_norm": 1.234375, + "learning_rate": 0.0018739244417748501, + "loss": 0.1309, + "step": 17200 + }, + { + "epoch": 0.14931293999184034, + "grad_norm": 0.07861328125, + "learning_rate": 0.0018739092495373912, + "loss": 0.0811, + "step": 17201 + }, + { + "epoch": 0.1493216204720445, + "grad_norm": 0.16015625, + "learning_rate": 0.001873894056453597, + "loss": 0.1543, + "step": 17202 + }, + { + "epoch": 0.14933030095224867, + "grad_norm": 0.59375, + "learning_rate": 0.0018738788625234839, + "loss": 0.1055, + "step": 17203 + }, + { + "epoch": 0.14933898143245283, + "grad_norm": 0.228515625, + "learning_rate": 0.0018738636677470687, + "loss": 0.0986, + "step": 17204 + }, + { + "epoch": 0.149347661912657, + "grad_norm": 0.474609375, + "learning_rate": 0.0018738484721243674, + "loss": 0.1133, + "step": 17205 + }, + { + "epoch": 0.14935634239286116, + "grad_norm": 0.091796875, + "learning_rate": 0.0018738332756553973, + "loss": 0.123, + "step": 17206 + }, + { + "epoch": 0.14936502287306533, + "grad_norm": 0.3046875, + "learning_rate": 0.0018738180783401746, + "loss": 0.1357, + "step": 17207 + }, + { + "epoch": 0.1493737033532695, + "grad_norm": 0.3984375, + "learning_rate": 0.001873802880178716, + "loss": 0.0981, + "step": 17208 + }, + { + "epoch": 0.14938238383347366, + "grad_norm": 0.1748046875, + "learning_rate": 0.0018737876811710387, + "loss": 0.0918, + "step": 17209 + }, + { + "epoch": 0.14939106431367782, + "grad_norm": 0.1748046875, + "learning_rate": 0.0018737724813171583, + "loss": 0.127, + "step": 17210 + }, + { + "epoch": 0.149399744793882, + "grad_norm": 0.11376953125, + "learning_rate": 0.0018737572806170925, + "loss": 0.1064, + "step": 17211 + }, + { + "epoch": 0.14940842527408615, + "grad_norm": 0.189453125, + "learning_rate": 0.0018737420790708565, + "loss": 0.1279, + "step": 17212 + }, + { + "epoch": 0.14941710575429032, + "grad_norm": 0.5859375, + "learning_rate": 0.0018737268766784684, + "loss": 0.1079, + "step": 17213 + }, + { + "epoch": 0.14942578623449448, + "grad_norm": 0.62109375, + "learning_rate": 0.0018737116734399441, + "loss": 0.1367, + "step": 17214 + }, + { + "epoch": 0.14943446671469865, + "grad_norm": 0.302734375, + "learning_rate": 0.0018736964693553003, + "loss": 0.1162, + "step": 17215 + }, + { + "epoch": 0.14944314719490281, + "grad_norm": 0.057861328125, + "learning_rate": 0.0018736812644245537, + "loss": 0.0815, + "step": 17216 + }, + { + "epoch": 0.14945182767510698, + "grad_norm": 0.91015625, + "learning_rate": 0.001873666058647721, + "loss": 0.0801, + "step": 17217 + }, + { + "epoch": 0.14946050815531114, + "grad_norm": 0.462890625, + "learning_rate": 0.0018736508520248185, + "loss": 0.127, + "step": 17218 + }, + { + "epoch": 0.1494691886355153, + "grad_norm": 0.470703125, + "learning_rate": 0.0018736356445558631, + "loss": 0.1079, + "step": 17219 + }, + { + "epoch": 0.14947786911571947, + "grad_norm": 0.49609375, + "learning_rate": 0.0018736204362408717, + "loss": 0.1069, + "step": 17220 
+ }, + { + "epoch": 0.14948654959592364, + "grad_norm": 0.283203125, + "learning_rate": 0.0018736052270798603, + "loss": 0.1055, + "step": 17221 + }, + { + "epoch": 0.1494952300761278, + "grad_norm": 0.11865234375, + "learning_rate": 0.0018735900170728458, + "loss": 0.1084, + "step": 17222 + }, + { + "epoch": 0.14950391055633197, + "grad_norm": 0.1015625, + "learning_rate": 0.001873574806219845, + "loss": 0.1143, + "step": 17223 + }, + { + "epoch": 0.14951259103653614, + "grad_norm": 0.0986328125, + "learning_rate": 0.0018735595945208743, + "loss": 0.0942, + "step": 17224 + }, + { + "epoch": 0.1495212715167403, + "grad_norm": 0.498046875, + "learning_rate": 0.0018735443819759506, + "loss": 0.0859, + "step": 17225 + }, + { + "epoch": 0.14952995199694447, + "grad_norm": 0.64453125, + "learning_rate": 0.0018735291685850904, + "loss": 0.0854, + "step": 17226 + }, + { + "epoch": 0.14953863247714863, + "grad_norm": 0.16015625, + "learning_rate": 0.0018735139543483105, + "loss": 0.124, + "step": 17227 + }, + { + "epoch": 0.1495473129573528, + "grad_norm": 0.56640625, + "learning_rate": 0.001873498739265627, + "loss": 0.106, + "step": 17228 + }, + { + "epoch": 0.14955599343755696, + "grad_norm": 0.34765625, + "learning_rate": 0.001873483523337057, + "loss": 0.1328, + "step": 17229 + }, + { + "epoch": 0.14956467391776113, + "grad_norm": 0.498046875, + "learning_rate": 0.001873468306562617, + "loss": 0.0835, + "step": 17230 + }, + { + "epoch": 0.1495733543979653, + "grad_norm": 0.30078125, + "learning_rate": 0.0018734530889423237, + "loss": 0.1084, + "step": 17231 + }, + { + "epoch": 0.14958203487816946, + "grad_norm": 0.373046875, + "learning_rate": 0.0018734378704761939, + "loss": 0.1133, + "step": 17232 + }, + { + "epoch": 0.14959071535837362, + "grad_norm": 0.373046875, + "learning_rate": 0.0018734226511642439, + "loss": 0.1064, + "step": 17233 + }, + { + "epoch": 0.1495993958385778, + "grad_norm": 0.35546875, + "learning_rate": 0.0018734074310064906, + "loss": 0.0947, + "step": 17234 + }, + { + "epoch": 0.14960807631878195, + "grad_norm": 0.08984375, + "learning_rate": 0.0018733922100029503, + "loss": 0.0942, + "step": 17235 + }, + { + "epoch": 0.14961675679898612, + "grad_norm": 0.1845703125, + "learning_rate": 0.0018733769881536402, + "loss": 0.1387, + "step": 17236 + }, + { + "epoch": 0.14962543727919028, + "grad_norm": 0.58203125, + "learning_rate": 0.0018733617654585763, + "loss": 0.1289, + "step": 17237 + }, + { + "epoch": 0.14963411775939445, + "grad_norm": 0.7109375, + "learning_rate": 0.001873346541917776, + "loss": 0.1328, + "step": 17238 + }, + { + "epoch": 0.1496427982395986, + "grad_norm": 0.275390625, + "learning_rate": 0.0018733313175312554, + "loss": 0.1309, + "step": 17239 + }, + { + "epoch": 0.14965147871980278, + "grad_norm": 0.08447265625, + "learning_rate": 0.0018733160922990314, + "loss": 0.1021, + "step": 17240 + }, + { + "epoch": 0.14966015920000694, + "grad_norm": 0.11083984375, + "learning_rate": 0.0018733008662211203, + "loss": 0.0952, + "step": 17241 + }, + { + "epoch": 0.1496688396802111, + "grad_norm": 0.451171875, + "learning_rate": 0.001873285639297539, + "loss": 0.0908, + "step": 17242 + }, + { + "epoch": 0.14967752016041527, + "grad_norm": 0.099609375, + "learning_rate": 0.001873270411528304, + "loss": 0.0962, + "step": 17243 + }, + { + "epoch": 0.14968620064061944, + "grad_norm": 3.6875, + "learning_rate": 0.0018732551829134325, + "loss": 0.252, + "step": 17244 + }, + { + "epoch": 0.1496948811208236, + "grad_norm": 0.8359375, + "learning_rate": 
0.0018732399534529408, + "loss": 0.1201, + "step": 17245 + }, + { + "epoch": 0.14970356160102777, + "grad_norm": 0.10107421875, + "learning_rate": 0.0018732247231468453, + "loss": 0.0952, + "step": 17246 + }, + { + "epoch": 0.14971224208123193, + "grad_norm": 0.130859375, + "learning_rate": 0.0018732094919951627, + "loss": 0.126, + "step": 17247 + }, + { + "epoch": 0.1497209225614361, + "grad_norm": 0.8359375, + "learning_rate": 0.00187319425999791, + "loss": 0.1191, + "step": 17248 + }, + { + "epoch": 0.14972960304164026, + "grad_norm": 0.341796875, + "learning_rate": 0.0018731790271551037, + "loss": 0.0986, + "step": 17249 + }, + { + "epoch": 0.14973828352184443, + "grad_norm": 0.5078125, + "learning_rate": 0.0018731637934667603, + "loss": 0.1523, + "step": 17250 + }, + { + "epoch": 0.1497469640020486, + "grad_norm": 0.10693359375, + "learning_rate": 0.0018731485589328968, + "loss": 0.1182, + "step": 17251 + }, + { + "epoch": 0.14975564448225276, + "grad_norm": 0.259765625, + "learning_rate": 0.0018731333235535296, + "loss": 0.1455, + "step": 17252 + }, + { + "epoch": 0.14976432496245692, + "grad_norm": 0.58203125, + "learning_rate": 0.0018731180873286753, + "loss": 0.126, + "step": 17253 + }, + { + "epoch": 0.1497730054426611, + "grad_norm": 0.412109375, + "learning_rate": 0.0018731028502583505, + "loss": 0.1196, + "step": 17254 + }, + { + "epoch": 0.14978168592286525, + "grad_norm": 0.1396484375, + "learning_rate": 0.001873087612342572, + "loss": 0.1064, + "step": 17255 + }, + { + "epoch": 0.14979036640306942, + "grad_norm": 0.1728515625, + "learning_rate": 0.001873072373581357, + "loss": 0.1118, + "step": 17256 + }, + { + "epoch": 0.14979904688327358, + "grad_norm": 0.2890625, + "learning_rate": 0.0018730571339747215, + "loss": 0.1348, + "step": 17257 + }, + { + "epoch": 0.14980772736347775, + "grad_norm": 0.384765625, + "learning_rate": 0.0018730418935226823, + "loss": 0.0806, + "step": 17258 + }, + { + "epoch": 0.14981640784368191, + "grad_norm": 0.150390625, + "learning_rate": 0.0018730266522252559, + "loss": 0.1006, + "step": 17259 + }, + { + "epoch": 0.14982508832388608, + "grad_norm": 0.1416015625, + "learning_rate": 0.0018730114100824593, + "loss": 0.1025, + "step": 17260 + }, + { + "epoch": 0.14983376880409024, + "grad_norm": 0.2451171875, + "learning_rate": 0.0018729961670943091, + "loss": 0.125, + "step": 17261 + }, + { + "epoch": 0.1498424492842944, + "grad_norm": 0.1435546875, + "learning_rate": 0.001872980923260822, + "loss": 0.1245, + "step": 17262 + }, + { + "epoch": 0.14985112976449858, + "grad_norm": 0.1875, + "learning_rate": 0.0018729656785820147, + "loss": 0.1699, + "step": 17263 + }, + { + "epoch": 0.14985981024470274, + "grad_norm": 0.5390625, + "learning_rate": 0.0018729504330579033, + "loss": 0.1309, + "step": 17264 + }, + { + "epoch": 0.1498684907249069, + "grad_norm": 0.640625, + "learning_rate": 0.001872935186688505, + "loss": 0.1064, + "step": 17265 + }, + { + "epoch": 0.14987717120511107, + "grad_norm": 0.154296875, + "learning_rate": 0.0018729199394738364, + "loss": 0.1055, + "step": 17266 + }, + { + "epoch": 0.14988585168531524, + "grad_norm": 0.10693359375, + "learning_rate": 0.0018729046914139145, + "loss": 0.1201, + "step": 17267 + }, + { + "epoch": 0.1498945321655194, + "grad_norm": 0.466796875, + "learning_rate": 0.001872889442508755, + "loss": 0.1641, + "step": 17268 + }, + { + "epoch": 0.14990321264572357, + "grad_norm": 0.33203125, + "learning_rate": 0.001872874192758376, + "loss": 0.1162, + "step": 17269 + }, + { + "epoch": 0.14991189312592773, 
+ "grad_norm": 0.130859375, + "learning_rate": 0.0018728589421627931, + "loss": 0.1445, + "step": 17270 + }, + { + "epoch": 0.1499205736061319, + "grad_norm": 0.271484375, + "learning_rate": 0.001872843690722023, + "loss": 0.1123, + "step": 17271 + }, + { + "epoch": 0.14992925408633606, + "grad_norm": 0.1201171875, + "learning_rate": 0.001872828438436083, + "loss": 0.0879, + "step": 17272 + }, + { + "epoch": 0.14993793456654023, + "grad_norm": 0.201171875, + "learning_rate": 0.001872813185304989, + "loss": 0.125, + "step": 17273 + }, + { + "epoch": 0.1499466150467444, + "grad_norm": 0.82421875, + "learning_rate": 0.0018727979313287587, + "loss": 0.1094, + "step": 17274 + }, + { + "epoch": 0.14995529552694856, + "grad_norm": 0.109375, + "learning_rate": 0.0018727826765074077, + "loss": 0.1279, + "step": 17275 + }, + { + "epoch": 0.14996397600715272, + "grad_norm": 0.1826171875, + "learning_rate": 0.0018727674208409532, + "loss": 0.1348, + "step": 17276 + }, + { + "epoch": 0.1499726564873569, + "grad_norm": 0.11474609375, + "learning_rate": 0.0018727521643294124, + "loss": 0.1177, + "step": 17277 + }, + { + "epoch": 0.14998133696756105, + "grad_norm": 0.328125, + "learning_rate": 0.001872736906972801, + "loss": 0.1484, + "step": 17278 + }, + { + "epoch": 0.14999001744776522, + "grad_norm": 0.1259765625, + "learning_rate": 0.0018727216487711361, + "loss": 0.1035, + "step": 17279 + }, + { + "epoch": 0.14999869792796938, + "grad_norm": 0.1748046875, + "learning_rate": 0.0018727063897244344, + "loss": 0.1235, + "step": 17280 + }, + { + "epoch": 0.15000737840817355, + "grad_norm": 0.470703125, + "learning_rate": 0.0018726911298327126, + "loss": 0.123, + "step": 17281 + }, + { + "epoch": 0.1500160588883777, + "grad_norm": 0.07666015625, + "learning_rate": 0.0018726758690959875, + "loss": 0.0825, + "step": 17282 + }, + { + "epoch": 0.15002473936858188, + "grad_norm": 0.337890625, + "learning_rate": 0.0018726606075142758, + "loss": 0.0942, + "step": 17283 + }, + { + "epoch": 0.15003341984878604, + "grad_norm": 0.146484375, + "learning_rate": 0.001872645345087594, + "loss": 0.1016, + "step": 17284 + }, + { + "epoch": 0.1500421003289902, + "grad_norm": 0.29296875, + "learning_rate": 0.0018726300818159586, + "loss": 0.1113, + "step": 17285 + }, + { + "epoch": 0.15005078080919437, + "grad_norm": 0.486328125, + "learning_rate": 0.001872614817699387, + "loss": 0.0991, + "step": 17286 + }, + { + "epoch": 0.15005946128939854, + "grad_norm": 0.291015625, + "learning_rate": 0.001872599552737895, + "loss": 0.1104, + "step": 17287 + }, + { + "epoch": 0.1500681417696027, + "grad_norm": 1.265625, + "learning_rate": 0.0018725842869314999, + "loss": 0.1426, + "step": 17288 + }, + { + "epoch": 0.15007682224980687, + "grad_norm": 0.341796875, + "learning_rate": 0.0018725690202802184, + "loss": 0.1123, + "step": 17289 + }, + { + "epoch": 0.15008550273001103, + "grad_norm": 0.12158203125, + "learning_rate": 0.0018725537527840667, + "loss": 0.1348, + "step": 17290 + }, + { + "epoch": 0.1500941832102152, + "grad_norm": 0.8359375, + "learning_rate": 0.0018725384844430623, + "loss": 0.1045, + "step": 17291 + }, + { + "epoch": 0.15010286369041936, + "grad_norm": 0.50390625, + "learning_rate": 0.0018725232152572212, + "loss": 0.1025, + "step": 17292 + }, + { + "epoch": 0.15011154417062353, + "grad_norm": 0.1630859375, + "learning_rate": 0.00187250794522656, + "loss": 0.1167, + "step": 17293 + }, + { + "epoch": 0.1501202246508277, + "grad_norm": 0.169921875, + "learning_rate": 0.0018724926743510961, + "loss": 0.0796, + 
"step": 17294 + }, + { + "epoch": 0.15012890513103186, + "grad_norm": 0.7109375, + "learning_rate": 0.0018724774026308458, + "loss": 0.1367, + "step": 17295 + }, + { + "epoch": 0.15013758561123602, + "grad_norm": 0.2470703125, + "learning_rate": 0.0018724621300658256, + "loss": 0.1719, + "step": 17296 + }, + { + "epoch": 0.1501462660914402, + "grad_norm": 0.412109375, + "learning_rate": 0.0018724468566560526, + "loss": 0.1309, + "step": 17297 + }, + { + "epoch": 0.15015494657164435, + "grad_norm": 0.10009765625, + "learning_rate": 0.0018724315824015431, + "loss": 0.1104, + "step": 17298 + }, + { + "epoch": 0.15016362705184852, + "grad_norm": 0.2470703125, + "learning_rate": 0.0018724163073023145, + "loss": 0.127, + "step": 17299 + }, + { + "epoch": 0.15017230753205268, + "grad_norm": 0.14453125, + "learning_rate": 0.0018724010313583826, + "loss": 0.1133, + "step": 17300 + }, + { + "epoch": 0.15018098801225685, + "grad_norm": 0.14453125, + "learning_rate": 0.0018723857545697646, + "loss": 0.1475, + "step": 17301 + }, + { + "epoch": 0.15018966849246101, + "grad_norm": 0.09033203125, + "learning_rate": 0.001872370476936477, + "loss": 0.0923, + "step": 17302 + }, + { + "epoch": 0.15019834897266518, + "grad_norm": 0.337890625, + "learning_rate": 0.0018723551984585372, + "loss": 0.1152, + "step": 17303 + }, + { + "epoch": 0.15020702945286935, + "grad_norm": 0.0947265625, + "learning_rate": 0.0018723399191359607, + "loss": 0.1201, + "step": 17304 + }, + { + "epoch": 0.1502157099330735, + "grad_norm": 0.62109375, + "learning_rate": 0.0018723246389687651, + "loss": 0.0869, + "step": 17305 + }, + { + "epoch": 0.15022439041327768, + "grad_norm": 0.42578125, + "learning_rate": 0.0018723093579569672, + "loss": 0.0898, + "step": 17306 + }, + { + "epoch": 0.15023307089348184, + "grad_norm": 0.1904296875, + "learning_rate": 0.0018722940761005831, + "loss": 0.0996, + "step": 17307 + }, + { + "epoch": 0.150241751373686, + "grad_norm": 0.8828125, + "learning_rate": 0.0018722787933996299, + "loss": 0.3867, + "step": 17308 + }, + { + "epoch": 0.15025043185389017, + "grad_norm": 0.181640625, + "learning_rate": 0.0018722635098541243, + "loss": 0.0933, + "step": 17309 + }, + { + "epoch": 0.15025911233409434, + "grad_norm": 0.1669921875, + "learning_rate": 0.0018722482254640827, + "loss": 0.1328, + "step": 17310 + }, + { + "epoch": 0.15026779281429847, + "grad_norm": 0.51171875, + "learning_rate": 0.0018722329402295222, + "loss": 0.0996, + "step": 17311 + }, + { + "epoch": 0.15027647329450264, + "grad_norm": 0.65625, + "learning_rate": 0.0018722176541504593, + "loss": 0.1572, + "step": 17312 + }, + { + "epoch": 0.1502851537747068, + "grad_norm": 1.234375, + "learning_rate": 0.0018722023672269107, + "loss": 0.1523, + "step": 17313 + }, + { + "epoch": 0.15029383425491097, + "grad_norm": 0.65234375, + "learning_rate": 0.001872187079458893, + "loss": 0.1055, + "step": 17314 + }, + { + "epoch": 0.15030251473511513, + "grad_norm": 0.1015625, + "learning_rate": 0.0018721717908464235, + "loss": 0.1113, + "step": 17315 + }, + { + "epoch": 0.1503111952153193, + "grad_norm": 0.455078125, + "learning_rate": 0.0018721565013895185, + "loss": 0.1338, + "step": 17316 + }, + { + "epoch": 0.15031987569552346, + "grad_norm": 0.91796875, + "learning_rate": 0.0018721412110881946, + "loss": 0.1338, + "step": 17317 + }, + { + "epoch": 0.15032855617572763, + "grad_norm": 0.337890625, + "learning_rate": 0.0018721259199424686, + "loss": 0.1162, + "step": 17318 + }, + { + "epoch": 0.1503372366559318, + "grad_norm": 0.68359375, + 
"learning_rate": 0.0018721106279523576, + "loss": 0.1445, + "step": 17319 + }, + { + "epoch": 0.15034591713613596, + "grad_norm": 0.447265625, + "learning_rate": 0.0018720953351178777, + "loss": 0.0903, + "step": 17320 + }, + { + "epoch": 0.15035459761634012, + "grad_norm": 1.0234375, + "learning_rate": 0.0018720800414390463, + "loss": 0.1084, + "step": 17321 + }, + { + "epoch": 0.1503632780965443, + "grad_norm": 1.0390625, + "learning_rate": 0.0018720647469158795, + "loss": 0.1348, + "step": 17322 + }, + { + "epoch": 0.15037195857674845, + "grad_norm": 0.47265625, + "learning_rate": 0.0018720494515483944, + "loss": 0.0928, + "step": 17323 + }, + { + "epoch": 0.15038063905695262, + "grad_norm": 0.185546875, + "learning_rate": 0.0018720341553366074, + "loss": 0.1318, + "step": 17324 + }, + { + "epoch": 0.15038931953715678, + "grad_norm": 0.671875, + "learning_rate": 0.0018720188582805357, + "loss": 0.0874, + "step": 17325 + }, + { + "epoch": 0.15039800001736095, + "grad_norm": 0.08056640625, + "learning_rate": 0.001872003560380196, + "loss": 0.1064, + "step": 17326 + }, + { + "epoch": 0.15040668049756512, + "grad_norm": 0.61328125, + "learning_rate": 0.0018719882616356042, + "loss": 0.1133, + "step": 17327 + }, + { + "epoch": 0.15041536097776928, + "grad_norm": 0.51171875, + "learning_rate": 0.001871972962046778, + "loss": 0.1309, + "step": 17328 + }, + { + "epoch": 0.15042404145797345, + "grad_norm": 0.26171875, + "learning_rate": 0.0018719576616137335, + "loss": 0.1221, + "step": 17329 + }, + { + "epoch": 0.1504327219381776, + "grad_norm": 0.640625, + "learning_rate": 0.001871942360336488, + "loss": 0.1021, + "step": 17330 + }, + { + "epoch": 0.15044140241838178, + "grad_norm": 0.322265625, + "learning_rate": 0.0018719270582150578, + "loss": 0.0977, + "step": 17331 + }, + { + "epoch": 0.15045008289858594, + "grad_norm": 0.69140625, + "learning_rate": 0.00187191175524946, + "loss": 0.1064, + "step": 17332 + }, + { + "epoch": 0.1504587633787901, + "grad_norm": 0.15625, + "learning_rate": 0.001871896451439711, + "loss": 0.1191, + "step": 17333 + }, + { + "epoch": 0.15046744385899427, + "grad_norm": 0.21484375, + "learning_rate": 0.0018718811467858274, + "loss": 0.0947, + "step": 17334 + }, + { + "epoch": 0.15047612433919844, + "grad_norm": 0.271484375, + "learning_rate": 0.0018718658412878262, + "loss": 0.1045, + "step": 17335 + }, + { + "epoch": 0.1504848048194026, + "grad_norm": 0.2236328125, + "learning_rate": 0.0018718505349457244, + "loss": 0.1152, + "step": 17336 + }, + { + "epoch": 0.15049348529960677, + "grad_norm": 0.451171875, + "learning_rate": 0.001871835227759538, + "loss": 0.1367, + "step": 17337 + }, + { + "epoch": 0.15050216577981093, + "grad_norm": 0.12158203125, + "learning_rate": 0.0018718199197292847, + "loss": 0.1367, + "step": 17338 + }, + { + "epoch": 0.1505108462600151, + "grad_norm": 0.388671875, + "learning_rate": 0.0018718046108549804, + "loss": 0.0986, + "step": 17339 + }, + { + "epoch": 0.15051952674021926, + "grad_norm": 0.5234375, + "learning_rate": 0.0018717893011366423, + "loss": 0.1162, + "step": 17340 + }, + { + "epoch": 0.15052820722042343, + "grad_norm": 0.119140625, + "learning_rate": 0.0018717739905742873, + "loss": 0.104, + "step": 17341 + }, + { + "epoch": 0.1505368877006276, + "grad_norm": 0.087890625, + "learning_rate": 0.0018717586791679314, + "loss": 0.1289, + "step": 17342 + }, + { + "epoch": 0.15054556818083176, + "grad_norm": 0.474609375, + "learning_rate": 0.001871743366917592, + "loss": 0.1729, + "step": 17343 + }, + { + "epoch": 
0.15055424866103592, + "grad_norm": 0.1025390625, + "learning_rate": 0.0018717280538232855, + "loss": 0.103, + "step": 17344 + }, + { + "epoch": 0.1505629291412401, + "grad_norm": 0.16015625, + "learning_rate": 0.0018717127398850289, + "loss": 0.127, + "step": 17345 + }, + { + "epoch": 0.15057160962144425, + "grad_norm": 0.146484375, + "learning_rate": 0.0018716974251028389, + "loss": 0.0977, + "step": 17346 + }, + { + "epoch": 0.15058029010164842, + "grad_norm": 1.28125, + "learning_rate": 0.001871682109476732, + "loss": 0.1426, + "step": 17347 + }, + { + "epoch": 0.15058897058185258, + "grad_norm": 0.49609375, + "learning_rate": 0.0018716667930067253, + "loss": 0.0903, + "step": 17348 + }, + { + "epoch": 0.15059765106205675, + "grad_norm": 0.427734375, + "learning_rate": 0.0018716514756928354, + "loss": 0.123, + "step": 17349 + }, + { + "epoch": 0.1506063315422609, + "grad_norm": 0.447265625, + "learning_rate": 0.001871636157535079, + "loss": 0.1113, + "step": 17350 + }, + { + "epoch": 0.15061501202246508, + "grad_norm": 0.62109375, + "learning_rate": 0.001871620838533473, + "loss": 0.0898, + "step": 17351 + }, + { + "epoch": 0.15062369250266924, + "grad_norm": 0.1513671875, + "learning_rate": 0.0018716055186880339, + "loss": 0.127, + "step": 17352 + }, + { + "epoch": 0.1506323729828734, + "grad_norm": 0.302734375, + "learning_rate": 0.0018715901979987786, + "loss": 0.123, + "step": 17353 + }, + { + "epoch": 0.15064105346307757, + "grad_norm": 0.119140625, + "learning_rate": 0.0018715748764657242, + "loss": 0.125, + "step": 17354 + }, + { + "epoch": 0.15064973394328174, + "grad_norm": 0.76171875, + "learning_rate": 0.0018715595540888868, + "loss": 0.1523, + "step": 17355 + }, + { + "epoch": 0.1506584144234859, + "grad_norm": 0.56640625, + "learning_rate": 0.001871544230868283, + "loss": 0.1367, + "step": 17356 + }, + { + "epoch": 0.15066709490369007, + "grad_norm": 1.2421875, + "learning_rate": 0.0018715289068039307, + "loss": 0.1514, + "step": 17357 + }, + { + "epoch": 0.15067577538389423, + "grad_norm": 0.416015625, + "learning_rate": 0.001871513581895846, + "loss": 0.0967, + "step": 17358 + }, + { + "epoch": 0.1506844558640984, + "grad_norm": 0.271484375, + "learning_rate": 0.001871498256144045, + "loss": 0.083, + "step": 17359 + }, + { + "epoch": 0.15069313634430256, + "grad_norm": 0.16015625, + "learning_rate": 0.0018714829295485458, + "loss": 0.1182, + "step": 17360 + }, + { + "epoch": 0.15070181682450673, + "grad_norm": 0.37109375, + "learning_rate": 0.0018714676021093642, + "loss": 0.209, + "step": 17361 + }, + { + "epoch": 0.1507104973047109, + "grad_norm": 0.48046875, + "learning_rate": 0.0018714522738265172, + "loss": 0.1582, + "step": 17362 + }, + { + "epoch": 0.15071917778491506, + "grad_norm": 0.12451171875, + "learning_rate": 0.0018714369447000213, + "loss": 0.1064, + "step": 17363 + }, + { + "epoch": 0.15072785826511922, + "grad_norm": 0.11279296875, + "learning_rate": 0.001871421614729894, + "loss": 0.1211, + "step": 17364 + }, + { + "epoch": 0.1507365387453234, + "grad_norm": 0.24609375, + "learning_rate": 0.0018714062839161514, + "loss": 0.1221, + "step": 17365 + }, + { + "epoch": 0.15074521922552755, + "grad_norm": 0.30078125, + "learning_rate": 0.0018713909522588106, + "loss": 0.1084, + "step": 17366 + }, + { + "epoch": 0.15075389970573172, + "grad_norm": 1.328125, + "learning_rate": 0.0018713756197578882, + "loss": 0.1177, + "step": 17367 + }, + { + "epoch": 0.15076258018593588, + "grad_norm": 0.271484375, + "learning_rate": 0.0018713602864134007, + "loss": 0.1069, 
+ "step": 17368 + }, + { + "epoch": 0.15077126066614005, + "grad_norm": 0.515625, + "learning_rate": 0.0018713449522253654, + "loss": 0.0981, + "step": 17369 + }, + { + "epoch": 0.15077994114634422, + "grad_norm": 0.140625, + "learning_rate": 0.001871329617193799, + "loss": 0.1318, + "step": 17370 + }, + { + "epoch": 0.15078862162654838, + "grad_norm": 0.201171875, + "learning_rate": 0.0018713142813187181, + "loss": 0.0986, + "step": 17371 + }, + { + "epoch": 0.15079730210675255, + "grad_norm": 0.30859375, + "learning_rate": 0.0018712989446001394, + "loss": 0.126, + "step": 17372 + }, + { + "epoch": 0.1508059825869567, + "grad_norm": 0.384765625, + "learning_rate": 0.0018712836070380796, + "loss": 0.1045, + "step": 17373 + }, + { + "epoch": 0.15081466306716088, + "grad_norm": 0.21875, + "learning_rate": 0.0018712682686325558, + "loss": 0.0918, + "step": 17374 + }, + { + "epoch": 0.15082334354736504, + "grad_norm": 0.515625, + "learning_rate": 0.0018712529293835847, + "loss": 0.1387, + "step": 17375 + }, + { + "epoch": 0.1508320240275692, + "grad_norm": 0.306640625, + "learning_rate": 0.001871237589291183, + "loss": 0.168, + "step": 17376 + }, + { + "epoch": 0.15084070450777337, + "grad_norm": 0.2490234375, + "learning_rate": 0.0018712222483553672, + "loss": 0.1138, + "step": 17377 + }, + { + "epoch": 0.15084938498797754, + "grad_norm": 0.419921875, + "learning_rate": 0.0018712069065761545, + "loss": 0.1406, + "step": 17378 + }, + { + "epoch": 0.1508580654681817, + "grad_norm": 0.150390625, + "learning_rate": 0.0018711915639535617, + "loss": 0.0933, + "step": 17379 + }, + { + "epoch": 0.15086674594838587, + "grad_norm": 0.291015625, + "learning_rate": 0.0018711762204876052, + "loss": 0.0806, + "step": 17380 + }, + { + "epoch": 0.15087542642859003, + "grad_norm": 0.3125, + "learning_rate": 0.0018711608761783022, + "loss": 0.1138, + "step": 17381 + }, + { + "epoch": 0.1508841069087942, + "grad_norm": 1.484375, + "learning_rate": 0.0018711455310256686, + "loss": 0.1475, + "step": 17382 + }, + { + "epoch": 0.15089278738899836, + "grad_norm": 0.353515625, + "learning_rate": 0.0018711301850297224, + "loss": 0.0977, + "step": 17383 + }, + { + "epoch": 0.15090146786920253, + "grad_norm": 0.1669921875, + "learning_rate": 0.0018711148381904796, + "loss": 0.0967, + "step": 17384 + }, + { + "epoch": 0.1509101483494067, + "grad_norm": 0.255859375, + "learning_rate": 0.0018710994905079572, + "loss": 0.1484, + "step": 17385 + }, + { + "epoch": 0.15091882882961086, + "grad_norm": 0.578125, + "learning_rate": 0.0018710841419821722, + "loss": 0.124, + "step": 17386 + }, + { + "epoch": 0.15092750930981502, + "grad_norm": 1.2890625, + "learning_rate": 0.0018710687926131412, + "loss": 0.1953, + "step": 17387 + }, + { + "epoch": 0.1509361897900192, + "grad_norm": 0.10693359375, + "learning_rate": 0.0018710534424008806, + "loss": 0.1172, + "step": 17388 + }, + { + "epoch": 0.15094487027022335, + "grad_norm": 0.68359375, + "learning_rate": 0.0018710380913454078, + "loss": 0.1475, + "step": 17389 + }, + { + "epoch": 0.15095355075042752, + "grad_norm": 0.5234375, + "learning_rate": 0.0018710227394467393, + "loss": 0.1309, + "step": 17390 + }, + { + "epoch": 0.15096223123063168, + "grad_norm": 0.345703125, + "learning_rate": 0.0018710073867048915, + "loss": 0.0811, + "step": 17391 + }, + { + "epoch": 0.15097091171083585, + "grad_norm": 0.3828125, + "learning_rate": 0.0018709920331198822, + "loss": 0.1104, + "step": 17392 + }, + { + "epoch": 0.15097959219104, + "grad_norm": 0.1240234375, + "learning_rate": 
0.0018709766786917273, + "loss": 0.1006, + "step": 17393 + }, + { + "epoch": 0.15098827267124418, + "grad_norm": 0.29296875, + "learning_rate": 0.001870961323420444, + "loss": 0.1279, + "step": 17394 + }, + { + "epoch": 0.15099695315144834, + "grad_norm": 0.46484375, + "learning_rate": 0.0018709459673060492, + "loss": 0.1025, + "step": 17395 + }, + { + "epoch": 0.1510056336316525, + "grad_norm": 0.4296875, + "learning_rate": 0.001870930610348559, + "loss": 0.1211, + "step": 17396 + }, + { + "epoch": 0.15101431411185667, + "grad_norm": 0.314453125, + "learning_rate": 0.001870915252547991, + "loss": 0.1504, + "step": 17397 + }, + { + "epoch": 0.15102299459206084, + "grad_norm": 0.1337890625, + "learning_rate": 0.0018708998939043614, + "loss": 0.1074, + "step": 17398 + }, + { + "epoch": 0.151031675072265, + "grad_norm": 0.6015625, + "learning_rate": 0.0018708845344176878, + "loss": 0.1123, + "step": 17399 + }, + { + "epoch": 0.15104035555246917, + "grad_norm": 0.1279296875, + "learning_rate": 0.0018708691740879858, + "loss": 0.1445, + "step": 17400 + }, + { + "epoch": 0.15104903603267333, + "grad_norm": 0.29296875, + "learning_rate": 0.0018708538129152733, + "loss": 0.1006, + "step": 17401 + }, + { + "epoch": 0.1510577165128775, + "grad_norm": 0.0927734375, + "learning_rate": 0.0018708384508995663, + "loss": 0.1123, + "step": 17402 + }, + { + "epoch": 0.15106639699308166, + "grad_norm": 0.1240234375, + "learning_rate": 0.001870823088040882, + "loss": 0.0864, + "step": 17403 + }, + { + "epoch": 0.15107507747328583, + "grad_norm": 0.154296875, + "learning_rate": 0.0018708077243392372, + "loss": 0.1011, + "step": 17404 + }, + { + "epoch": 0.15108375795349, + "grad_norm": 0.41796875, + "learning_rate": 0.0018707923597946487, + "loss": 0.126, + "step": 17405 + }, + { + "epoch": 0.15109243843369416, + "grad_norm": 0.7265625, + "learning_rate": 0.0018707769944071332, + "loss": 0.1133, + "step": 17406 + }, + { + "epoch": 0.15110111891389832, + "grad_norm": 0.07666015625, + "learning_rate": 0.0018707616281767078, + "loss": 0.1011, + "step": 17407 + }, + { + "epoch": 0.1511097993941025, + "grad_norm": 0.1591796875, + "learning_rate": 0.0018707462611033889, + "loss": 0.1328, + "step": 17408 + }, + { + "epoch": 0.15111847987430665, + "grad_norm": 0.248046875, + "learning_rate": 0.0018707308931871936, + "loss": 0.0889, + "step": 17409 + }, + { + "epoch": 0.15112716035451082, + "grad_norm": 0.19140625, + "learning_rate": 0.0018707155244281383, + "loss": 0.1582, + "step": 17410 + }, + { + "epoch": 0.15113584083471499, + "grad_norm": 0.42578125, + "learning_rate": 0.00187070015482624, + "loss": 0.1094, + "step": 17411 + }, + { + "epoch": 0.15114452131491915, + "grad_norm": 0.1240234375, + "learning_rate": 0.001870684784381516, + "loss": 0.166, + "step": 17412 + }, + { + "epoch": 0.15115320179512332, + "grad_norm": 0.2216796875, + "learning_rate": 0.0018706694130939828, + "loss": 0.1016, + "step": 17413 + }, + { + "epoch": 0.15116188227532748, + "grad_norm": 0.2099609375, + "learning_rate": 0.0018706540409636565, + "loss": 0.1016, + "step": 17414 + }, + { + "epoch": 0.15117056275553165, + "grad_norm": 0.31640625, + "learning_rate": 0.001870638667990555, + "loss": 0.0879, + "step": 17415 + }, + { + "epoch": 0.1511792432357358, + "grad_norm": 0.1708984375, + "learning_rate": 0.0018706232941746944, + "loss": 0.0933, + "step": 17416 + }, + { + "epoch": 0.15118792371593998, + "grad_norm": 0.95703125, + "learning_rate": 0.0018706079195160916, + "loss": 0.1445, + "step": 17417 + }, + { + "epoch": 
0.15119660419614414, + "grad_norm": 0.0830078125, + "learning_rate": 0.001870592544014764, + "loss": 0.1543, + "step": 17418 + }, + { + "epoch": 0.1512052846763483, + "grad_norm": 0.61328125, + "learning_rate": 0.0018705771676707278, + "loss": 0.0962, + "step": 17419 + }, + { + "epoch": 0.15121396515655247, + "grad_norm": 0.287109375, + "learning_rate": 0.001870561790484, + "loss": 0.1836, + "step": 17420 + }, + { + "epoch": 0.15122264563675664, + "grad_norm": 0.365234375, + "learning_rate": 0.0018705464124545974, + "loss": 0.125, + "step": 17421 + }, + { + "epoch": 0.1512313261169608, + "grad_norm": 0.255859375, + "learning_rate": 0.0018705310335825366, + "loss": 0.0791, + "step": 17422 + }, + { + "epoch": 0.15124000659716497, + "grad_norm": 0.1689453125, + "learning_rate": 0.001870515653867835, + "loss": 0.1357, + "step": 17423 + }, + { + "epoch": 0.15124868707736913, + "grad_norm": 1.296875, + "learning_rate": 0.0018705002733105088, + "loss": 0.1934, + "step": 17424 + }, + { + "epoch": 0.1512573675575733, + "grad_norm": 0.216796875, + "learning_rate": 0.001870484891910575, + "loss": 0.0801, + "step": 17425 + }, + { + "epoch": 0.15126604803777746, + "grad_norm": 0.37109375, + "learning_rate": 0.0018704695096680506, + "loss": 0.103, + "step": 17426 + }, + { + "epoch": 0.15127472851798163, + "grad_norm": 0.8125, + "learning_rate": 0.0018704541265829524, + "loss": 0.1309, + "step": 17427 + }, + { + "epoch": 0.1512834089981858, + "grad_norm": 0.28125, + "learning_rate": 0.0018704387426552972, + "loss": 0.0874, + "step": 17428 + }, + { + "epoch": 0.15129208947838996, + "grad_norm": 0.265625, + "learning_rate": 0.0018704233578851015, + "loss": 0.0986, + "step": 17429 + }, + { + "epoch": 0.15130076995859412, + "grad_norm": 1.1953125, + "learning_rate": 0.0018704079722723825, + "loss": 0.1299, + "step": 17430 + }, + { + "epoch": 0.1513094504387983, + "grad_norm": 0.298828125, + "learning_rate": 0.0018703925858171572, + "loss": 0.1309, + "step": 17431 + }, + { + "epoch": 0.15131813091900245, + "grad_norm": 0.146484375, + "learning_rate": 0.0018703771985194417, + "loss": 0.0962, + "step": 17432 + }, + { + "epoch": 0.15132681139920662, + "grad_norm": 0.076171875, + "learning_rate": 0.0018703618103792536, + "loss": 0.105, + "step": 17433 + }, + { + "epoch": 0.15133549187941078, + "grad_norm": 0.4453125, + "learning_rate": 0.0018703464213966092, + "loss": 0.1543, + "step": 17434 + }, + { + "epoch": 0.15134417235961492, + "grad_norm": 0.10693359375, + "learning_rate": 0.0018703310315715256, + "loss": 0.1523, + "step": 17435 + }, + { + "epoch": 0.15135285283981909, + "grad_norm": 0.1689453125, + "learning_rate": 0.0018703156409040197, + "loss": 0.1416, + "step": 17436 + }, + { + "epoch": 0.15136153332002325, + "grad_norm": 0.408203125, + "learning_rate": 0.001870300249394108, + "loss": 0.1094, + "step": 17437 + }, + { + "epoch": 0.15137021380022742, + "grad_norm": 0.72265625, + "learning_rate": 0.0018702848570418076, + "loss": 0.1582, + "step": 17438 + }, + { + "epoch": 0.15137889428043158, + "grad_norm": 1.3359375, + "learning_rate": 0.0018702694638471353, + "loss": 0.1719, + "step": 17439 + }, + { + "epoch": 0.15138757476063575, + "grad_norm": 0.3828125, + "learning_rate": 0.0018702540698101078, + "loss": 0.1406, + "step": 17440 + }, + { + "epoch": 0.1513962552408399, + "grad_norm": 0.484375, + "learning_rate": 0.0018702386749307421, + "loss": 0.0986, + "step": 17441 + }, + { + "epoch": 0.15140493572104408, + "grad_norm": 1.0546875, + "learning_rate": 0.001870223279209055, + "loss": 0.168, + "step": 
17442 + }, + { + "epoch": 0.15141361620124824, + "grad_norm": 0.2236328125, + "learning_rate": 0.001870207882645063, + "loss": 0.1357, + "step": 17443 + }, + { + "epoch": 0.1514222966814524, + "grad_norm": 0.5546875, + "learning_rate": 0.0018701924852387835, + "loss": 0.1177, + "step": 17444 + }, + { + "epoch": 0.15143097716165657, + "grad_norm": 0.25390625, + "learning_rate": 0.001870177086990233, + "loss": 0.1289, + "step": 17445 + }, + { + "epoch": 0.15143965764186074, + "grad_norm": 0.1279296875, + "learning_rate": 0.0018701616878994286, + "loss": 0.1025, + "step": 17446 + }, + { + "epoch": 0.1514483381220649, + "grad_norm": 0.265625, + "learning_rate": 0.0018701462879663867, + "loss": 0.083, + "step": 17447 + }, + { + "epoch": 0.15145701860226907, + "grad_norm": 0.220703125, + "learning_rate": 0.0018701308871911242, + "loss": 0.1348, + "step": 17448 + }, + { + "epoch": 0.15146569908247323, + "grad_norm": 0.32421875, + "learning_rate": 0.0018701154855736586, + "loss": 0.1338, + "step": 17449 + }, + { + "epoch": 0.1514743795626774, + "grad_norm": 0.138671875, + "learning_rate": 0.001870100083114006, + "loss": 0.1396, + "step": 17450 + }, + { + "epoch": 0.15148306004288156, + "grad_norm": 0.177734375, + "learning_rate": 0.0018700846798121835, + "loss": 0.0903, + "step": 17451 + }, + { + "epoch": 0.15149174052308573, + "grad_norm": 0.35546875, + "learning_rate": 0.0018700692756682081, + "loss": 0.1191, + "step": 17452 + }, + { + "epoch": 0.1515004210032899, + "grad_norm": 0.0859375, + "learning_rate": 0.0018700538706820965, + "loss": 0.1006, + "step": 17453 + }, + { + "epoch": 0.15150910148349406, + "grad_norm": 1.4296875, + "learning_rate": 0.0018700384648538653, + "loss": 0.1484, + "step": 17454 + }, + { + "epoch": 0.15151778196369822, + "grad_norm": 0.1845703125, + "learning_rate": 0.001870023058183532, + "loss": 0.0869, + "step": 17455 + }, + { + "epoch": 0.1515264624439024, + "grad_norm": 0.310546875, + "learning_rate": 0.001870007650671113, + "loss": 0.1113, + "step": 17456 + }, + { + "epoch": 0.15153514292410655, + "grad_norm": 0.515625, + "learning_rate": 0.0018699922423166249, + "loss": 0.084, + "step": 17457 + }, + { + "epoch": 0.15154382340431072, + "grad_norm": 0.140625, + "learning_rate": 0.001869976833120085, + "loss": 0.1514, + "step": 17458 + }, + { + "epoch": 0.15155250388451488, + "grad_norm": 0.48046875, + "learning_rate": 0.00186996142308151, + "loss": 0.127, + "step": 17459 + }, + { + "epoch": 0.15156118436471905, + "grad_norm": 0.47265625, + "learning_rate": 0.0018699460122009169, + "loss": 0.1074, + "step": 17460 + }, + { + "epoch": 0.1515698648449232, + "grad_norm": 0.447265625, + "learning_rate": 0.0018699306004783223, + "loss": 0.1279, + "step": 17461 + }, + { + "epoch": 0.15157854532512738, + "grad_norm": 0.244140625, + "learning_rate": 0.0018699151879137434, + "loss": 0.1113, + "step": 17462 + }, + { + "epoch": 0.15158722580533154, + "grad_norm": 0.3046875, + "learning_rate": 0.0018698997745071962, + "loss": 0.1191, + "step": 17463 + }, + { + "epoch": 0.1515959062855357, + "grad_norm": 0.271484375, + "learning_rate": 0.0018698843602586986, + "loss": 0.1235, + "step": 17464 + }, + { + "epoch": 0.15160458676573987, + "grad_norm": 0.150390625, + "learning_rate": 0.001869868945168267, + "loss": 0.1406, + "step": 17465 + }, + { + "epoch": 0.15161326724594404, + "grad_norm": 0.546875, + "learning_rate": 0.0018698535292359184, + "loss": 0.1523, + "step": 17466 + }, + { + "epoch": 0.1516219477261482, + "grad_norm": 0.16796875, + "learning_rate": 0.0018698381124616695, 
+ "loss": 0.0928, + "step": 17467 + }, + { + "epoch": 0.15163062820635237, + "grad_norm": 0.232421875, + "learning_rate": 0.001869822694845537, + "loss": 0.0728, + "step": 17468 + }, + { + "epoch": 0.15163930868655653, + "grad_norm": 0.640625, + "learning_rate": 0.001869807276387538, + "loss": 0.0991, + "step": 17469 + }, + { + "epoch": 0.1516479891667607, + "grad_norm": 0.4609375, + "learning_rate": 0.0018697918570876894, + "loss": 0.1016, + "step": 17470 + }, + { + "epoch": 0.15165666964696486, + "grad_norm": 0.24609375, + "learning_rate": 0.0018697764369460079, + "loss": 0.1338, + "step": 17471 + }, + { + "epoch": 0.15166535012716903, + "grad_norm": 0.09521484375, + "learning_rate": 0.0018697610159625108, + "loss": 0.0791, + "step": 17472 + }, + { + "epoch": 0.1516740306073732, + "grad_norm": 0.306640625, + "learning_rate": 0.0018697455941372144, + "loss": 0.1143, + "step": 17473 + }, + { + "epoch": 0.15168271108757736, + "grad_norm": 0.91015625, + "learning_rate": 0.0018697301714701356, + "loss": 0.1064, + "step": 17474 + }, + { + "epoch": 0.15169139156778152, + "grad_norm": 0.21875, + "learning_rate": 0.0018697147479612916, + "loss": 0.0986, + "step": 17475 + }, + { + "epoch": 0.1517000720479857, + "grad_norm": 0.357421875, + "learning_rate": 0.0018696993236106995, + "loss": 0.127, + "step": 17476 + }, + { + "epoch": 0.15170875252818986, + "grad_norm": 0.1357421875, + "learning_rate": 0.0018696838984183753, + "loss": 0.0762, + "step": 17477 + }, + { + "epoch": 0.15171743300839402, + "grad_norm": 0.06005859375, + "learning_rate": 0.0018696684723843364, + "loss": 0.0874, + "step": 17478 + }, + { + "epoch": 0.15172611348859819, + "grad_norm": 0.314453125, + "learning_rate": 0.0018696530455085997, + "loss": 0.1299, + "step": 17479 + }, + { + "epoch": 0.15173479396880235, + "grad_norm": 0.25, + "learning_rate": 0.001869637617791182, + "loss": 0.0791, + "step": 17480 + }, + { + "epoch": 0.15174347444900652, + "grad_norm": 0.1982421875, + "learning_rate": 0.0018696221892321, + "loss": 0.1377, + "step": 17481 + }, + { + "epoch": 0.15175215492921068, + "grad_norm": 0.13671875, + "learning_rate": 0.0018696067598313708, + "loss": 0.0986, + "step": 17482 + }, + { + "epoch": 0.15176083540941485, + "grad_norm": 0.09619140625, + "learning_rate": 0.0018695913295890116, + "loss": 0.0889, + "step": 17483 + }, + { + "epoch": 0.151769515889619, + "grad_norm": 0.68359375, + "learning_rate": 0.0018695758985050384, + "loss": 0.1211, + "step": 17484 + }, + { + "epoch": 0.15177819636982318, + "grad_norm": 0.1748046875, + "learning_rate": 0.0018695604665794688, + "loss": 0.0762, + "step": 17485 + }, + { + "epoch": 0.15178687685002734, + "grad_norm": 0.365234375, + "learning_rate": 0.0018695450338123195, + "loss": 0.1201, + "step": 17486 + }, + { + "epoch": 0.1517955573302315, + "grad_norm": 0.26171875, + "learning_rate": 0.001869529600203607, + "loss": 0.1475, + "step": 17487 + }, + { + "epoch": 0.15180423781043567, + "grad_norm": 0.609375, + "learning_rate": 0.0018695141657533488, + "loss": 0.1426, + "step": 17488 + }, + { + "epoch": 0.15181291829063984, + "grad_norm": 0.310546875, + "learning_rate": 0.0018694987304615613, + "loss": 0.0952, + "step": 17489 + }, + { + "epoch": 0.151821598770844, + "grad_norm": 0.703125, + "learning_rate": 0.0018694832943282615, + "loss": 0.1123, + "step": 17490 + }, + { + "epoch": 0.15183027925104817, + "grad_norm": 1.46875, + "learning_rate": 0.0018694678573534667, + "loss": 0.1455, + "step": 17491 + }, + { + "epoch": 0.15183895973125233, + "grad_norm": 0.08203125, + 
"learning_rate": 0.0018694524195371931, + "loss": 0.0898, + "step": 17492 + }, + { + "epoch": 0.1518476402114565, + "grad_norm": 0.177734375, + "learning_rate": 0.001869436980879458, + "loss": 0.1162, + "step": 17493 + }, + { + "epoch": 0.15185632069166066, + "grad_norm": 0.09228515625, + "learning_rate": 0.0018694215413802783, + "loss": 0.1035, + "step": 17494 + }, + { + "epoch": 0.15186500117186483, + "grad_norm": 0.13671875, + "learning_rate": 0.0018694061010396704, + "loss": 0.0928, + "step": 17495 + }, + { + "epoch": 0.151873681652069, + "grad_norm": 0.09619140625, + "learning_rate": 0.0018693906598576519, + "loss": 0.0996, + "step": 17496 + }, + { + "epoch": 0.15188236213227316, + "grad_norm": 0.453125, + "learning_rate": 0.001869375217834239, + "loss": 0.0967, + "step": 17497 + }, + { + "epoch": 0.15189104261247732, + "grad_norm": 0.416015625, + "learning_rate": 0.001869359774969449, + "loss": 0.0869, + "step": 17498 + }, + { + "epoch": 0.1518997230926815, + "grad_norm": 0.44140625, + "learning_rate": 0.001869344331263299, + "loss": 0.2012, + "step": 17499 + }, + { + "epoch": 0.15190840357288565, + "grad_norm": 0.29296875, + "learning_rate": 0.0018693288867158053, + "loss": 0.1104, + "step": 17500 + }, + { + "epoch": 0.15191708405308982, + "grad_norm": 0.30859375, + "learning_rate": 0.0018693134413269852, + "loss": 0.1094, + "step": 17501 + }, + { + "epoch": 0.15192576453329398, + "grad_norm": 0.26953125, + "learning_rate": 0.0018692979950968554, + "loss": 0.1108, + "step": 17502 + }, + { + "epoch": 0.15193444501349815, + "grad_norm": 0.6484375, + "learning_rate": 0.0018692825480254329, + "loss": 0.1104, + "step": 17503 + }, + { + "epoch": 0.1519431254937023, + "grad_norm": 0.6953125, + "learning_rate": 0.0018692671001127347, + "loss": 0.1191, + "step": 17504 + }, + { + "epoch": 0.15195180597390648, + "grad_norm": 0.08740234375, + "learning_rate": 0.0018692516513587774, + "loss": 0.1113, + "step": 17505 + }, + { + "epoch": 0.15196048645411064, + "grad_norm": 0.236328125, + "learning_rate": 0.0018692362017635783, + "loss": 0.0972, + "step": 17506 + }, + { + "epoch": 0.1519691669343148, + "grad_norm": 0.11669921875, + "learning_rate": 0.0018692207513271537, + "loss": 0.127, + "step": 17507 + }, + { + "epoch": 0.15197784741451897, + "grad_norm": 0.51171875, + "learning_rate": 0.001869205300049521, + "loss": 0.1211, + "step": 17508 + }, + { + "epoch": 0.15198652789472314, + "grad_norm": 0.24609375, + "learning_rate": 0.0018691898479306968, + "loss": 0.0967, + "step": 17509 + }, + { + "epoch": 0.1519952083749273, + "grad_norm": 0.609375, + "learning_rate": 0.0018691743949706984, + "loss": 0.1289, + "step": 17510 + }, + { + "epoch": 0.15200388885513147, + "grad_norm": 0.251953125, + "learning_rate": 0.0018691589411695424, + "loss": 0.1128, + "step": 17511 + }, + { + "epoch": 0.15201256933533563, + "grad_norm": 1.171875, + "learning_rate": 0.0018691434865272456, + "loss": 0.1396, + "step": 17512 + }, + { + "epoch": 0.1520212498155398, + "grad_norm": 0.87890625, + "learning_rate": 0.001869128031043825, + "loss": 0.1289, + "step": 17513 + }, + { + "epoch": 0.15202993029574396, + "grad_norm": 0.19921875, + "learning_rate": 0.0018691125747192976, + "loss": 0.1318, + "step": 17514 + }, + { + "epoch": 0.15203861077594813, + "grad_norm": 0.62890625, + "learning_rate": 0.0018690971175536807, + "loss": 0.1504, + "step": 17515 + }, + { + "epoch": 0.1520472912561523, + "grad_norm": 0.6328125, + "learning_rate": 0.0018690816595469902, + "loss": 0.1816, + "step": 17516 + }, + { + "epoch": 
0.15205597173635646, + "grad_norm": 0.1396484375, + "learning_rate": 0.0018690662006992436, + "loss": 0.1201, + "step": 17517 + }, + { + "epoch": 0.15206465221656063, + "grad_norm": 0.30078125, + "learning_rate": 0.0018690507410104578, + "loss": 0.1113, + "step": 17518 + }, + { + "epoch": 0.1520733326967648, + "grad_norm": 0.396484375, + "learning_rate": 0.0018690352804806496, + "loss": 0.0908, + "step": 17519 + }, + { + "epoch": 0.15208201317696896, + "grad_norm": 0.126953125, + "learning_rate": 0.0018690198191098361, + "loss": 0.1162, + "step": 17520 + }, + { + "epoch": 0.15209069365717312, + "grad_norm": 0.353515625, + "learning_rate": 0.0018690043568980344, + "loss": 0.1104, + "step": 17521 + }, + { + "epoch": 0.15209937413737729, + "grad_norm": 0.412109375, + "learning_rate": 0.0018689888938452604, + "loss": 0.0918, + "step": 17522 + }, + { + "epoch": 0.15210805461758145, + "grad_norm": 0.396484375, + "learning_rate": 0.0018689734299515322, + "loss": 0.1016, + "step": 17523 + }, + { + "epoch": 0.15211673509778562, + "grad_norm": 0.65625, + "learning_rate": 0.001868957965216866, + "loss": 0.1025, + "step": 17524 + }, + { + "epoch": 0.15212541557798978, + "grad_norm": 0.36328125, + "learning_rate": 0.0018689424996412793, + "loss": 0.1177, + "step": 17525 + }, + { + "epoch": 0.15213409605819395, + "grad_norm": 0.1962890625, + "learning_rate": 0.001868927033224788, + "loss": 0.1162, + "step": 17526 + }, + { + "epoch": 0.1521427765383981, + "grad_norm": 0.66796875, + "learning_rate": 0.0018689115659674099, + "loss": 0.1104, + "step": 17527 + }, + { + "epoch": 0.15215145701860228, + "grad_norm": 0.63671875, + "learning_rate": 0.0018688960978691619, + "loss": 0.1338, + "step": 17528 + }, + { + "epoch": 0.15216013749880644, + "grad_norm": 0.546875, + "learning_rate": 0.0018688806289300604, + "loss": 0.1123, + "step": 17529 + }, + { + "epoch": 0.1521688179790106, + "grad_norm": 0.1943359375, + "learning_rate": 0.0018688651591501227, + "loss": 0.1426, + "step": 17530 + }, + { + "epoch": 0.15217749845921477, + "grad_norm": 0.1123046875, + "learning_rate": 0.0018688496885293658, + "loss": 0.0737, + "step": 17531 + }, + { + "epoch": 0.15218617893941894, + "grad_norm": 0.62109375, + "learning_rate": 0.001868834217067806, + "loss": 0.0825, + "step": 17532 + }, + { + "epoch": 0.1521948594196231, + "grad_norm": 0.373046875, + "learning_rate": 0.001868818744765461, + "loss": 0.0947, + "step": 17533 + }, + { + "epoch": 0.15220353989982727, + "grad_norm": 0.376953125, + "learning_rate": 0.0018688032716223473, + "loss": 0.0879, + "step": 17534 + }, + { + "epoch": 0.15221222038003143, + "grad_norm": 0.54296875, + "learning_rate": 0.0018687877976384817, + "loss": 0.1055, + "step": 17535 + }, + { + "epoch": 0.1522209008602356, + "grad_norm": 0.462890625, + "learning_rate": 0.0018687723228138818, + "loss": 0.1328, + "step": 17536 + }, + { + "epoch": 0.15222958134043976, + "grad_norm": 0.55078125, + "learning_rate": 0.0018687568471485636, + "loss": 0.1211, + "step": 17537 + }, + { + "epoch": 0.15223826182064393, + "grad_norm": 0.419921875, + "learning_rate": 0.0018687413706425444, + "loss": 0.1055, + "step": 17538 + }, + { + "epoch": 0.1522469423008481, + "grad_norm": 0.234375, + "learning_rate": 0.0018687258932958415, + "loss": 0.1543, + "step": 17539 + }, + { + "epoch": 0.15225562278105226, + "grad_norm": 0.95703125, + "learning_rate": 0.0018687104151084715, + "loss": 0.1387, + "step": 17540 + }, + { + "epoch": 0.15226430326125642, + "grad_norm": 0.609375, + "learning_rate": 0.001868694936080451, + "loss": 
0.0957, + "step": 17541 + }, + { + "epoch": 0.1522729837414606, + "grad_norm": 0.365234375, + "learning_rate": 0.0018686794562117978, + "loss": 0.1357, + "step": 17542 + }, + { + "epoch": 0.15228166422166475, + "grad_norm": 0.37890625, + "learning_rate": 0.0018686639755025282, + "loss": 0.127, + "step": 17543 + }, + { + "epoch": 0.15229034470186892, + "grad_norm": 0.3828125, + "learning_rate": 0.0018686484939526588, + "loss": 0.0952, + "step": 17544 + }, + { + "epoch": 0.15229902518207308, + "grad_norm": 0.58984375, + "learning_rate": 0.0018686330115622073, + "loss": 0.1206, + "step": 17545 + }, + { + "epoch": 0.15230770566227725, + "grad_norm": 1.0703125, + "learning_rate": 0.00186861752833119, + "loss": 0.1836, + "step": 17546 + }, + { + "epoch": 0.1523163861424814, + "grad_norm": 0.263671875, + "learning_rate": 0.0018686020442596244, + "loss": 0.1064, + "step": 17547 + }, + { + "epoch": 0.15232506662268558, + "grad_norm": 0.3828125, + "learning_rate": 0.0018685865593475268, + "loss": 0.0928, + "step": 17548 + }, + { + "epoch": 0.15233374710288974, + "grad_norm": 0.10595703125, + "learning_rate": 0.0018685710735949147, + "loss": 0.1396, + "step": 17549 + }, + { + "epoch": 0.1523424275830939, + "grad_norm": 0.12158203125, + "learning_rate": 0.0018685555870018048, + "loss": 0.1055, + "step": 17550 + }, + { + "epoch": 0.15235110806329807, + "grad_norm": 0.25390625, + "learning_rate": 0.0018685400995682143, + "loss": 0.0986, + "step": 17551 + }, + { + "epoch": 0.15235978854350224, + "grad_norm": 0.24609375, + "learning_rate": 0.0018685246112941597, + "loss": 0.0977, + "step": 17552 + }, + { + "epoch": 0.1523684690237064, + "grad_norm": 0.126953125, + "learning_rate": 0.0018685091221796583, + "loss": 0.0894, + "step": 17553 + }, + { + "epoch": 0.15237714950391057, + "grad_norm": 0.330078125, + "learning_rate": 0.0018684936322247263, + "loss": 0.1196, + "step": 17554 + }, + { + "epoch": 0.15238582998411473, + "grad_norm": 0.1357421875, + "learning_rate": 0.001868478141429382, + "loss": 0.1016, + "step": 17555 + }, + { + "epoch": 0.1523945104643189, + "grad_norm": 0.302734375, + "learning_rate": 0.001868462649793641, + "loss": 0.1089, + "step": 17556 + }, + { + "epoch": 0.15240319094452306, + "grad_norm": 0.314453125, + "learning_rate": 0.001868447157317521, + "loss": 0.126, + "step": 17557 + }, + { + "epoch": 0.1524118714247272, + "grad_norm": 0.10791015625, + "learning_rate": 0.0018684316640010389, + "loss": 0.1553, + "step": 17558 + }, + { + "epoch": 0.15242055190493137, + "grad_norm": 0.279296875, + "learning_rate": 0.001868416169844211, + "loss": 0.1357, + "step": 17559 + }, + { + "epoch": 0.15242923238513553, + "grad_norm": 0.169921875, + "learning_rate": 0.001868400674847055, + "loss": 0.1138, + "step": 17560 + }, + { + "epoch": 0.1524379128653397, + "grad_norm": 0.455078125, + "learning_rate": 0.0018683851790095873, + "loss": 0.0811, + "step": 17561 + }, + { + "epoch": 0.15244659334554386, + "grad_norm": 0.1796875, + "learning_rate": 0.0018683696823318257, + "loss": 0.1045, + "step": 17562 + }, + { + "epoch": 0.15245527382574803, + "grad_norm": 0.248046875, + "learning_rate": 0.001868354184813786, + "loss": 0.1445, + "step": 17563 + }, + { + "epoch": 0.1524639543059522, + "grad_norm": 0.07568359375, + "learning_rate": 0.0018683386864554862, + "loss": 0.124, + "step": 17564 + }, + { + "epoch": 0.15247263478615636, + "grad_norm": 0.46484375, + "learning_rate": 0.0018683231872569422, + "loss": 0.0928, + "step": 17565 + }, + { + "epoch": 0.15248131526636052, + "grad_norm": 0.138671875, + 
"learning_rate": 0.001868307687218172, + "loss": 0.166, + "step": 17566 + }, + { + "epoch": 0.1524899957465647, + "grad_norm": 0.2099609375, + "learning_rate": 0.0018682921863391918, + "loss": 0.1162, + "step": 17567 + }, + { + "epoch": 0.15249867622676885, + "grad_norm": 0.1533203125, + "learning_rate": 0.0018682766846200188, + "loss": 0.1553, + "step": 17568 + }, + { + "epoch": 0.15250735670697302, + "grad_norm": 0.73828125, + "learning_rate": 0.0018682611820606701, + "loss": 0.1914, + "step": 17569 + }, + { + "epoch": 0.15251603718717718, + "grad_norm": 0.35546875, + "learning_rate": 0.0018682456786611624, + "loss": 0.1299, + "step": 17570 + }, + { + "epoch": 0.15252471766738135, + "grad_norm": 0.154296875, + "learning_rate": 0.0018682301744215128, + "loss": 0.1289, + "step": 17571 + }, + { + "epoch": 0.1525333981475855, + "grad_norm": 0.1552734375, + "learning_rate": 0.0018682146693417383, + "loss": 0.1104, + "step": 17572 + }, + { + "epoch": 0.15254207862778968, + "grad_norm": 0.36328125, + "learning_rate": 0.0018681991634218555, + "loss": 0.0962, + "step": 17573 + }, + { + "epoch": 0.15255075910799384, + "grad_norm": 0.1474609375, + "learning_rate": 0.001868183656661882, + "loss": 0.1211, + "step": 17574 + }, + { + "epoch": 0.152559439588198, + "grad_norm": 0.6640625, + "learning_rate": 0.001868168149061834, + "loss": 0.1562, + "step": 17575 + }, + { + "epoch": 0.15256812006840217, + "grad_norm": 0.1298828125, + "learning_rate": 0.0018681526406217294, + "loss": 0.1084, + "step": 17576 + }, + { + "epoch": 0.15257680054860634, + "grad_norm": 0.12255859375, + "learning_rate": 0.0018681371313415843, + "loss": 0.1279, + "step": 17577 + }, + { + "epoch": 0.1525854810288105, + "grad_norm": 0.12451171875, + "learning_rate": 0.001868121621221416, + "loss": 0.1318, + "step": 17578 + }, + { + "epoch": 0.15259416150901467, + "grad_norm": 0.1875, + "learning_rate": 0.0018681061102612413, + "loss": 0.123, + "step": 17579 + }, + { + "epoch": 0.15260284198921883, + "grad_norm": 0.3046875, + "learning_rate": 0.0018680905984610773, + "loss": 0.0938, + "step": 17580 + }, + { + "epoch": 0.152611522469423, + "grad_norm": 0.169921875, + "learning_rate": 0.0018680750858209411, + "loss": 0.1445, + "step": 17581 + }, + { + "epoch": 0.15262020294962717, + "grad_norm": 0.578125, + "learning_rate": 0.0018680595723408499, + "loss": 0.1079, + "step": 17582 + }, + { + "epoch": 0.15262888342983133, + "grad_norm": 0.3125, + "learning_rate": 0.0018680440580208198, + "loss": 0.1475, + "step": 17583 + }, + { + "epoch": 0.1526375639100355, + "grad_norm": 0.3671875, + "learning_rate": 0.0018680285428608683, + "loss": 0.0991, + "step": 17584 + }, + { + "epoch": 0.15264624439023966, + "grad_norm": 0.58984375, + "learning_rate": 0.0018680130268610125, + "loss": 0.0894, + "step": 17585 + }, + { + "epoch": 0.15265492487044383, + "grad_norm": 0.29296875, + "learning_rate": 0.0018679975100212689, + "loss": 0.1992, + "step": 17586 + }, + { + "epoch": 0.152663605350648, + "grad_norm": 0.1796875, + "learning_rate": 0.001867981992341655, + "loss": 0.1387, + "step": 17587 + }, + { + "epoch": 0.15267228583085216, + "grad_norm": 0.373046875, + "learning_rate": 0.0018679664738221876, + "loss": 0.1387, + "step": 17588 + }, + { + "epoch": 0.15268096631105632, + "grad_norm": 0.357421875, + "learning_rate": 0.0018679509544628835, + "loss": 0.1191, + "step": 17589 + }, + { + "epoch": 0.15268964679126049, + "grad_norm": 0.14453125, + "learning_rate": 0.0018679354342637595, + "loss": 0.0996, + "step": 17590 + }, + { + "epoch": 
0.15269832727146465, + "grad_norm": 0.421875, + "learning_rate": 0.001867919913224833, + "loss": 0.1152, + "step": 17591 + }, + { + "epoch": 0.15270700775166882, + "grad_norm": 0.1015625, + "learning_rate": 0.0018679043913461212, + "loss": 0.1055, + "step": 17592 + }, + { + "epoch": 0.15271568823187298, + "grad_norm": 0.275390625, + "learning_rate": 0.0018678888686276404, + "loss": 0.0762, + "step": 17593 + }, + { + "epoch": 0.15272436871207715, + "grad_norm": 0.12060546875, + "learning_rate": 0.0018678733450694077, + "loss": 0.1064, + "step": 17594 + }, + { + "epoch": 0.1527330491922813, + "grad_norm": 0.08154296875, + "learning_rate": 0.0018678578206714404, + "loss": 0.0938, + "step": 17595 + }, + { + "epoch": 0.15274172967248548, + "grad_norm": 0.5546875, + "learning_rate": 0.0018678422954337556, + "loss": 0.127, + "step": 17596 + }, + { + "epoch": 0.15275041015268964, + "grad_norm": 0.486328125, + "learning_rate": 0.0018678267693563698, + "loss": 0.1045, + "step": 17597 + }, + { + "epoch": 0.1527590906328938, + "grad_norm": 0.54296875, + "learning_rate": 0.0018678112424392998, + "loss": 0.1436, + "step": 17598 + }, + { + "epoch": 0.15276777111309797, + "grad_norm": 0.61328125, + "learning_rate": 0.0018677957146825633, + "loss": 0.1025, + "step": 17599 + }, + { + "epoch": 0.15277645159330214, + "grad_norm": 0.10400390625, + "learning_rate": 0.0018677801860861772, + "loss": 0.1523, + "step": 17600 + }, + { + "epoch": 0.1527851320735063, + "grad_norm": 0.1572265625, + "learning_rate": 0.0018677646566501577, + "loss": 0.1113, + "step": 17601 + }, + { + "epoch": 0.15279381255371047, + "grad_norm": 0.55859375, + "learning_rate": 0.0018677491263745225, + "loss": 0.1074, + "step": 17602 + }, + { + "epoch": 0.15280249303391463, + "grad_norm": 1.03125, + "learning_rate": 0.0018677335952592886, + "loss": 0.1064, + "step": 17603 + }, + { + "epoch": 0.1528111735141188, + "grad_norm": 0.1171875, + "learning_rate": 0.0018677180633044728, + "loss": 0.1426, + "step": 17604 + }, + { + "epoch": 0.15281985399432296, + "grad_norm": 0.431640625, + "learning_rate": 0.0018677025305100917, + "loss": 0.1309, + "step": 17605 + }, + { + "epoch": 0.15282853447452713, + "grad_norm": 0.1123046875, + "learning_rate": 0.001867686996876163, + "loss": 0.1035, + "step": 17606 + }, + { + "epoch": 0.1528372149547313, + "grad_norm": 0.1943359375, + "learning_rate": 0.001867671462402703, + "loss": 0.0933, + "step": 17607 + }, + { + "epoch": 0.15284589543493546, + "grad_norm": 1.0, + "learning_rate": 0.0018676559270897294, + "loss": 0.1206, + "step": 17608 + }, + { + "epoch": 0.15285457591513962, + "grad_norm": 0.375, + "learning_rate": 0.0018676403909372583, + "loss": 0.1113, + "step": 17609 + }, + { + "epoch": 0.1528632563953438, + "grad_norm": 0.470703125, + "learning_rate": 0.0018676248539453077, + "loss": 0.1504, + "step": 17610 + }, + { + "epoch": 0.15287193687554795, + "grad_norm": 0.11572265625, + "learning_rate": 0.0018676093161138939, + "loss": 0.1074, + "step": 17611 + }, + { + "epoch": 0.15288061735575212, + "grad_norm": 0.1572265625, + "learning_rate": 0.0018675937774430343, + "loss": 0.1035, + "step": 17612 + }, + { + "epoch": 0.15288929783595628, + "grad_norm": 0.25390625, + "learning_rate": 0.0018675782379327453, + "loss": 0.1074, + "step": 17613 + }, + { + "epoch": 0.15289797831616045, + "grad_norm": 0.54296875, + "learning_rate": 0.0018675626975830444, + "loss": 0.1475, + "step": 17614 + }, + { + "epoch": 0.15290665879636461, + "grad_norm": 0.435546875, + "learning_rate": 0.0018675471563939486, + "loss": 
0.1055, + "step": 17615 + }, + { + "epoch": 0.15291533927656878, + "grad_norm": 0.59375, + "learning_rate": 0.001867531614365475, + "loss": 0.1875, + "step": 17616 + }, + { + "epoch": 0.15292401975677294, + "grad_norm": 0.171875, + "learning_rate": 0.0018675160714976397, + "loss": 0.1299, + "step": 17617 + }, + { + "epoch": 0.1529327002369771, + "grad_norm": 0.27734375, + "learning_rate": 0.0018675005277904607, + "loss": 0.1084, + "step": 17618 + }, + { + "epoch": 0.15294138071718127, + "grad_norm": 0.1005859375, + "learning_rate": 0.0018674849832439546, + "loss": 0.083, + "step": 17619 + }, + { + "epoch": 0.15295006119738544, + "grad_norm": 0.138671875, + "learning_rate": 0.0018674694378581385, + "loss": 0.1289, + "step": 17620 + }, + { + "epoch": 0.1529587416775896, + "grad_norm": 0.359375, + "learning_rate": 0.0018674538916330294, + "loss": 0.085, + "step": 17621 + }, + { + "epoch": 0.15296742215779377, + "grad_norm": 0.30078125, + "learning_rate": 0.001867438344568644, + "loss": 0.1025, + "step": 17622 + }, + { + "epoch": 0.15297610263799793, + "grad_norm": 0.45703125, + "learning_rate": 0.001867422796665, + "loss": 0.1064, + "step": 17623 + }, + { + "epoch": 0.1529847831182021, + "grad_norm": 0.5234375, + "learning_rate": 0.0018674072479221138, + "loss": 0.1562, + "step": 17624 + }, + { + "epoch": 0.15299346359840627, + "grad_norm": 0.388671875, + "learning_rate": 0.0018673916983400023, + "loss": 0.1113, + "step": 17625 + }, + { + "epoch": 0.15300214407861043, + "grad_norm": 0.279296875, + "learning_rate": 0.001867376147918683, + "loss": 0.1191, + "step": 17626 + }, + { + "epoch": 0.1530108245588146, + "grad_norm": 0.224609375, + "learning_rate": 0.0018673605966581725, + "loss": 0.125, + "step": 17627 + }, + { + "epoch": 0.15301950503901876, + "grad_norm": 0.1259765625, + "learning_rate": 0.0018673450445584883, + "loss": 0.1289, + "step": 17628 + }, + { + "epoch": 0.15302818551922293, + "grad_norm": 0.265625, + "learning_rate": 0.0018673294916196467, + "loss": 0.0928, + "step": 17629 + }, + { + "epoch": 0.1530368659994271, + "grad_norm": 0.1826171875, + "learning_rate": 0.0018673139378416657, + "loss": 0.1045, + "step": 17630 + }, + { + "epoch": 0.15304554647963126, + "grad_norm": 0.384765625, + "learning_rate": 0.0018672983832245611, + "loss": 0.1162, + "step": 17631 + }, + { + "epoch": 0.15305422695983542, + "grad_norm": 0.1904296875, + "learning_rate": 0.0018672828277683507, + "loss": 0.0986, + "step": 17632 + }, + { + "epoch": 0.1530629074400396, + "grad_norm": 0.61328125, + "learning_rate": 0.0018672672714730514, + "loss": 0.1177, + "step": 17633 + }, + { + "epoch": 0.15307158792024375, + "grad_norm": 0.46875, + "learning_rate": 0.0018672517143386801, + "loss": 0.1719, + "step": 17634 + }, + { + "epoch": 0.15308026840044792, + "grad_norm": 0.3203125, + "learning_rate": 0.001867236156365254, + "loss": 0.1191, + "step": 17635 + }, + { + "epoch": 0.15308894888065208, + "grad_norm": 0.25390625, + "learning_rate": 0.0018672205975527895, + "loss": 0.1128, + "step": 17636 + }, + { + "epoch": 0.15309762936085625, + "grad_norm": 0.1796875, + "learning_rate": 0.0018672050379013043, + "loss": 0.1191, + "step": 17637 + }, + { + "epoch": 0.1531063098410604, + "grad_norm": 0.263671875, + "learning_rate": 0.0018671894774108153, + "loss": 0.124, + "step": 17638 + }, + { + "epoch": 0.15311499032126458, + "grad_norm": 0.8125, + "learning_rate": 0.0018671739160813394, + "loss": 0.0815, + "step": 17639 + }, + { + "epoch": 0.15312367080146874, + "grad_norm": 0.33203125, + "learning_rate": 
0.0018671583539128938, + "loss": 0.1484, + "step": 17640 + }, + { + "epoch": 0.1531323512816729, + "grad_norm": 0.380859375, + "learning_rate": 0.0018671427909054952, + "loss": 0.0942, + "step": 17641 + }, + { + "epoch": 0.15314103176187707, + "grad_norm": 0.1455078125, + "learning_rate": 0.0018671272270591609, + "loss": 0.1006, + "step": 17642 + }, + { + "epoch": 0.15314971224208124, + "grad_norm": 0.1728515625, + "learning_rate": 0.0018671116623739078, + "loss": 0.1543, + "step": 17643 + }, + { + "epoch": 0.1531583927222854, + "grad_norm": 0.35546875, + "learning_rate": 0.0018670960968497526, + "loss": 0.127, + "step": 17644 + }, + { + "epoch": 0.15316707320248957, + "grad_norm": 0.58203125, + "learning_rate": 0.0018670805304867126, + "loss": 0.1621, + "step": 17645 + }, + { + "epoch": 0.15317575368269373, + "grad_norm": 0.27734375, + "learning_rate": 0.001867064963284805, + "loss": 0.1416, + "step": 17646 + }, + { + "epoch": 0.1531844341628979, + "grad_norm": 0.1279296875, + "learning_rate": 0.0018670493952440469, + "loss": 0.104, + "step": 17647 + }, + { + "epoch": 0.15319311464310206, + "grad_norm": 0.1298828125, + "learning_rate": 0.0018670338263644549, + "loss": 0.1201, + "step": 17648 + }, + { + "epoch": 0.15320179512330623, + "grad_norm": 0.470703125, + "learning_rate": 0.001867018256646046, + "loss": 0.1211, + "step": 17649 + }, + { + "epoch": 0.1532104756035104, + "grad_norm": 0.80078125, + "learning_rate": 0.001867002686088838, + "loss": 0.0918, + "step": 17650 + }, + { + "epoch": 0.15321915608371456, + "grad_norm": 0.78125, + "learning_rate": 0.0018669871146928473, + "loss": 0.0947, + "step": 17651 + }, + { + "epoch": 0.15322783656391872, + "grad_norm": 0.1123046875, + "learning_rate": 0.0018669715424580906, + "loss": 0.0967, + "step": 17652 + }, + { + "epoch": 0.1532365170441229, + "grad_norm": 0.3515625, + "learning_rate": 0.0018669559693845854, + "loss": 0.0762, + "step": 17653 + }, + { + "epoch": 0.15324519752432705, + "grad_norm": 0.58203125, + "learning_rate": 0.001866940395472349, + "loss": 0.1523, + "step": 17654 + }, + { + "epoch": 0.15325387800453122, + "grad_norm": 0.150390625, + "learning_rate": 0.001866924820721398, + "loss": 0.0991, + "step": 17655 + }, + { + "epoch": 0.15326255848473538, + "grad_norm": 0.14453125, + "learning_rate": 0.0018669092451317495, + "loss": 0.1074, + "step": 17656 + }, + { + "epoch": 0.15327123896493955, + "grad_norm": 0.349609375, + "learning_rate": 0.0018668936687034207, + "loss": 0.0811, + "step": 17657 + }, + { + "epoch": 0.15327991944514371, + "grad_norm": 0.4453125, + "learning_rate": 0.0018668780914364283, + "loss": 0.1104, + "step": 17658 + }, + { + "epoch": 0.15328859992534788, + "grad_norm": 0.5859375, + "learning_rate": 0.0018668625133307895, + "loss": 0.0928, + "step": 17659 + }, + { + "epoch": 0.15329728040555204, + "grad_norm": 0.1591796875, + "learning_rate": 0.0018668469343865216, + "loss": 0.165, + "step": 17660 + }, + { + "epoch": 0.1533059608857562, + "grad_norm": 0.208984375, + "learning_rate": 0.0018668313546036414, + "loss": 0.0933, + "step": 17661 + }, + { + "epoch": 0.15331464136596037, + "grad_norm": 1.328125, + "learning_rate": 0.0018668157739821659, + "loss": 0.1514, + "step": 17662 + }, + { + "epoch": 0.15332332184616454, + "grad_norm": 0.0947265625, + "learning_rate": 0.0018668001925221118, + "loss": 0.104, + "step": 17663 + }, + { + "epoch": 0.1533320023263687, + "grad_norm": 0.72265625, + "learning_rate": 0.0018667846102234972, + "loss": 0.123, + "step": 17664 + }, + { + "epoch": 0.15334068280657287, + 
"grad_norm": 0.68359375, + "learning_rate": 0.001866769027086338, + "loss": 0.1689, + "step": 17665 + }, + { + "epoch": 0.15334936328677704, + "grad_norm": 1.1484375, + "learning_rate": 0.001866753443110652, + "loss": 0.1436, + "step": 17666 + }, + { + "epoch": 0.1533580437669812, + "grad_norm": 0.10546875, + "learning_rate": 0.0018667378582964559, + "loss": 0.1279, + "step": 17667 + }, + { + "epoch": 0.15336672424718537, + "grad_norm": 0.1015625, + "learning_rate": 0.0018667222726437664, + "loss": 0.1045, + "step": 17668 + }, + { + "epoch": 0.15337540472738953, + "grad_norm": 0.65234375, + "learning_rate": 0.0018667066861526012, + "loss": 0.1748, + "step": 17669 + }, + { + "epoch": 0.1533840852075937, + "grad_norm": 0.08984375, + "learning_rate": 0.0018666910988229775, + "loss": 0.1016, + "step": 17670 + }, + { + "epoch": 0.15339276568779786, + "grad_norm": 0.341796875, + "learning_rate": 0.0018666755106549113, + "loss": 0.1318, + "step": 17671 + }, + { + "epoch": 0.15340144616800203, + "grad_norm": 0.384765625, + "learning_rate": 0.0018666599216484207, + "loss": 0.1035, + "step": 17672 + }, + { + "epoch": 0.1534101266482062, + "grad_norm": 0.1220703125, + "learning_rate": 0.0018666443318035221, + "loss": 0.0894, + "step": 17673 + }, + { + "epoch": 0.15341880712841036, + "grad_norm": 0.447265625, + "learning_rate": 0.0018666287411202326, + "loss": 0.168, + "step": 17674 + }, + { + "epoch": 0.15342748760861452, + "grad_norm": 0.99609375, + "learning_rate": 0.0018666131495985701, + "loss": 0.1357, + "step": 17675 + }, + { + "epoch": 0.1534361680888187, + "grad_norm": 0.7578125, + "learning_rate": 0.0018665975572385504, + "loss": 0.1367, + "step": 17676 + }, + { + "epoch": 0.15344484856902285, + "grad_norm": 0.1884765625, + "learning_rate": 0.0018665819640401914, + "loss": 0.1328, + "step": 17677 + }, + { + "epoch": 0.15345352904922702, + "grad_norm": 0.66015625, + "learning_rate": 0.0018665663700035099, + "loss": 0.1021, + "step": 17678 + }, + { + "epoch": 0.15346220952943118, + "grad_norm": 0.34375, + "learning_rate": 0.0018665507751285228, + "loss": 0.1357, + "step": 17679 + }, + { + "epoch": 0.15347089000963535, + "grad_norm": 0.95703125, + "learning_rate": 0.001866535179415247, + "loss": 0.1289, + "step": 17680 + }, + { + "epoch": 0.15347957048983948, + "grad_norm": 0.625, + "learning_rate": 0.0018665195828637003, + "loss": 0.1074, + "step": 17681 + }, + { + "epoch": 0.15348825097004365, + "grad_norm": 0.1376953125, + "learning_rate": 0.0018665039854738991, + "loss": 0.0835, + "step": 17682 + }, + { + "epoch": 0.15349693145024781, + "grad_norm": 0.28125, + "learning_rate": 0.001866488387245861, + "loss": 0.1621, + "step": 17683 + }, + { + "epoch": 0.15350561193045198, + "grad_norm": 0.49609375, + "learning_rate": 0.0018664727881796022, + "loss": 0.0967, + "step": 17684 + }, + { + "epoch": 0.15351429241065614, + "grad_norm": 0.70703125, + "learning_rate": 0.0018664571882751405, + "loss": 0.104, + "step": 17685 + }, + { + "epoch": 0.1535229728908603, + "grad_norm": 0.111328125, + "learning_rate": 0.0018664415875324926, + "loss": 0.1094, + "step": 17686 + }, + { + "epoch": 0.15353165337106447, + "grad_norm": 0.248046875, + "learning_rate": 0.001866425985951676, + "loss": 0.1299, + "step": 17687 + }, + { + "epoch": 0.15354033385126864, + "grad_norm": 0.0703125, + "learning_rate": 0.0018664103835327073, + "loss": 0.0908, + "step": 17688 + }, + { + "epoch": 0.1535490143314728, + "grad_norm": 0.142578125, + "learning_rate": 0.0018663947802756038, + "loss": 0.125, + "step": 17689 + }, + { + 
"epoch": 0.15355769481167697, + "grad_norm": 0.14453125, + "learning_rate": 0.0018663791761803824, + "loss": 0.0991, + "step": 17690 + }, + { + "epoch": 0.15356637529188114, + "grad_norm": 0.64453125, + "learning_rate": 0.0018663635712470603, + "loss": 0.1445, + "step": 17691 + }, + { + "epoch": 0.1535750557720853, + "grad_norm": 0.3359375, + "learning_rate": 0.0018663479654756543, + "loss": 0.1338, + "step": 17692 + }, + { + "epoch": 0.15358373625228947, + "grad_norm": 0.203125, + "learning_rate": 0.001866332358866182, + "loss": 0.1104, + "step": 17693 + }, + { + "epoch": 0.15359241673249363, + "grad_norm": 0.09228515625, + "learning_rate": 0.0018663167514186602, + "loss": 0.1162, + "step": 17694 + }, + { + "epoch": 0.1536010972126978, + "grad_norm": 0.72265625, + "learning_rate": 0.0018663011431331053, + "loss": 0.1318, + "step": 17695 + }, + { + "epoch": 0.15360977769290196, + "grad_norm": 0.1240234375, + "learning_rate": 0.0018662855340095355, + "loss": 0.124, + "step": 17696 + }, + { + "epoch": 0.15361845817310613, + "grad_norm": 0.345703125, + "learning_rate": 0.0018662699240479675, + "loss": 0.0879, + "step": 17697 + }, + { + "epoch": 0.1536271386533103, + "grad_norm": 0.3515625, + "learning_rate": 0.0018662543132484176, + "loss": 0.1001, + "step": 17698 + }, + { + "epoch": 0.15363581913351446, + "grad_norm": 0.2138671875, + "learning_rate": 0.001866238701610904, + "loss": 0.1006, + "step": 17699 + }, + { + "epoch": 0.15364449961371862, + "grad_norm": 0.248046875, + "learning_rate": 0.0018662230891354434, + "loss": 0.0854, + "step": 17700 + }, + { + "epoch": 0.1536531800939228, + "grad_norm": 0.20703125, + "learning_rate": 0.0018662074758220523, + "loss": 0.1069, + "step": 17701 + }, + { + "epoch": 0.15366186057412695, + "grad_norm": 0.1689453125, + "learning_rate": 0.0018661918616707483, + "loss": 0.0918, + "step": 17702 + }, + { + "epoch": 0.15367054105433112, + "grad_norm": 0.10888671875, + "learning_rate": 0.0018661762466815484, + "loss": 0.1484, + "step": 17703 + }, + { + "epoch": 0.15367922153453528, + "grad_norm": 0.0888671875, + "learning_rate": 0.0018661606308544695, + "loss": 0.1309, + "step": 17704 + }, + { + "epoch": 0.15368790201473945, + "grad_norm": 0.10888671875, + "learning_rate": 0.0018661450141895292, + "loss": 0.0952, + "step": 17705 + }, + { + "epoch": 0.1536965824949436, + "grad_norm": 0.095703125, + "learning_rate": 0.001866129396686744, + "loss": 0.0962, + "step": 17706 + }, + { + "epoch": 0.15370526297514778, + "grad_norm": 0.298828125, + "learning_rate": 0.0018661137783461314, + "loss": 0.1182, + "step": 17707 + }, + { + "epoch": 0.15371394345535194, + "grad_norm": 0.484375, + "learning_rate": 0.0018660981591677084, + "loss": 0.1006, + "step": 17708 + }, + { + "epoch": 0.1537226239355561, + "grad_norm": 0.146484375, + "learning_rate": 0.0018660825391514917, + "loss": 0.0771, + "step": 17709 + }, + { + "epoch": 0.15373130441576027, + "grad_norm": 0.65234375, + "learning_rate": 0.0018660669182974988, + "loss": 0.1299, + "step": 17710 + }, + { + "epoch": 0.15373998489596444, + "grad_norm": 0.41015625, + "learning_rate": 0.0018660512966057466, + "loss": 0.1143, + "step": 17711 + }, + { + "epoch": 0.1537486653761686, + "grad_norm": 0.212890625, + "learning_rate": 0.0018660356740762521, + "loss": 0.1338, + "step": 17712 + }, + { + "epoch": 0.15375734585637277, + "grad_norm": 0.37890625, + "learning_rate": 0.0018660200507090324, + "loss": 0.1006, + "step": 17713 + }, + { + "epoch": 0.15376602633657693, + "grad_norm": 0.455078125, + "learning_rate": 
0.0018660044265041047, + "loss": 0.1006, + "step": 17714 + }, + { + "epoch": 0.1537747068167811, + "grad_norm": 0.2080078125, + "learning_rate": 0.0018659888014614862, + "loss": 0.1104, + "step": 17715 + }, + { + "epoch": 0.15378338729698526, + "grad_norm": 0.36328125, + "learning_rate": 0.0018659731755811943, + "loss": 0.1328, + "step": 17716 + }, + { + "epoch": 0.15379206777718943, + "grad_norm": 0.224609375, + "learning_rate": 0.0018659575488632447, + "loss": 0.1133, + "step": 17717 + }, + { + "epoch": 0.1538007482573936, + "grad_norm": 0.109375, + "learning_rate": 0.0018659419213076561, + "loss": 0.0664, + "step": 17718 + }, + { + "epoch": 0.15380942873759776, + "grad_norm": 0.69140625, + "learning_rate": 0.0018659262929144448, + "loss": 0.0908, + "step": 17719 + }, + { + "epoch": 0.15381810921780192, + "grad_norm": 0.61328125, + "learning_rate": 0.0018659106636836277, + "loss": 0.0806, + "step": 17720 + }, + { + "epoch": 0.1538267896980061, + "grad_norm": 0.1767578125, + "learning_rate": 0.0018658950336152228, + "loss": 0.1206, + "step": 17721 + }, + { + "epoch": 0.15383547017821025, + "grad_norm": 0.232421875, + "learning_rate": 0.0018658794027092464, + "loss": 0.082, + "step": 17722 + }, + { + "epoch": 0.15384415065841442, + "grad_norm": 0.31640625, + "learning_rate": 0.0018658637709657153, + "loss": 0.1172, + "step": 17723 + }, + { + "epoch": 0.15385283113861858, + "grad_norm": 0.42578125, + "learning_rate": 0.0018658481383846475, + "loss": 0.1011, + "step": 17724 + }, + { + "epoch": 0.15386151161882275, + "grad_norm": 1.21875, + "learning_rate": 0.0018658325049660598, + "loss": 0.1328, + "step": 17725 + }, + { + "epoch": 0.15387019209902691, + "grad_norm": 0.45703125, + "learning_rate": 0.0018658168707099687, + "loss": 0.1133, + "step": 17726 + }, + { + "epoch": 0.15387887257923108, + "grad_norm": 1.21875, + "learning_rate": 0.0018658012356163923, + "loss": 0.1387, + "step": 17727 + }, + { + "epoch": 0.15388755305943524, + "grad_norm": 0.10205078125, + "learning_rate": 0.001865785599685347, + "loss": 0.0918, + "step": 17728 + }, + { + "epoch": 0.1538962335396394, + "grad_norm": 0.18359375, + "learning_rate": 0.0018657699629168502, + "loss": 0.1099, + "step": 17729 + }, + { + "epoch": 0.15390491401984358, + "grad_norm": 0.07177734375, + "learning_rate": 0.0018657543253109185, + "loss": 0.0811, + "step": 17730 + }, + { + "epoch": 0.15391359450004774, + "grad_norm": 0.35546875, + "learning_rate": 0.0018657386868675698, + "loss": 0.1377, + "step": 17731 + }, + { + "epoch": 0.1539222749802519, + "grad_norm": 0.2109375, + "learning_rate": 0.0018657230475868202, + "loss": 0.1191, + "step": 17732 + }, + { + "epoch": 0.15393095546045607, + "grad_norm": 0.244140625, + "learning_rate": 0.001865707407468688, + "loss": 0.1123, + "step": 17733 + }, + { + "epoch": 0.15393963594066024, + "grad_norm": 0.130859375, + "learning_rate": 0.0018656917665131893, + "loss": 0.126, + "step": 17734 + }, + { + "epoch": 0.1539483164208644, + "grad_norm": 0.2490234375, + "learning_rate": 0.0018656761247203418, + "loss": 0.0972, + "step": 17735 + }, + { + "epoch": 0.15395699690106857, + "grad_norm": 0.296875, + "learning_rate": 0.0018656604820901622, + "loss": 0.0977, + "step": 17736 + }, + { + "epoch": 0.15396567738127273, + "grad_norm": 0.51953125, + "learning_rate": 0.0018656448386226681, + "loss": 0.1934, + "step": 17737 + }, + { + "epoch": 0.1539743578614769, + "grad_norm": 0.875, + "learning_rate": 0.001865629194317876, + "loss": 0.1216, + "step": 17738 + }, + { + "epoch": 0.15398303834168106, + 
"grad_norm": 0.130859375, + "learning_rate": 0.0018656135491758033, + "loss": 0.1016, + "step": 17739 + }, + { + "epoch": 0.15399171882188523, + "grad_norm": 0.35546875, + "learning_rate": 0.0018655979031964674, + "loss": 0.1123, + "step": 17740 + }, + { + "epoch": 0.1540003993020894, + "grad_norm": 0.6171875, + "learning_rate": 0.0018655822563798847, + "loss": 0.1025, + "step": 17741 + }, + { + "epoch": 0.15400907978229356, + "grad_norm": 1.1640625, + "learning_rate": 0.0018655666087260734, + "loss": 0.1162, + "step": 17742 + }, + { + "epoch": 0.15401776026249772, + "grad_norm": 0.361328125, + "learning_rate": 0.0018655509602350496, + "loss": 0.1338, + "step": 17743 + }, + { + "epoch": 0.1540264407427019, + "grad_norm": 0.0908203125, + "learning_rate": 0.0018655353109068306, + "loss": 0.125, + "step": 17744 + }, + { + "epoch": 0.15403512122290605, + "grad_norm": 0.359375, + "learning_rate": 0.001865519660741434, + "loss": 0.1348, + "step": 17745 + }, + { + "epoch": 0.15404380170311022, + "grad_norm": 0.125, + "learning_rate": 0.0018655040097388762, + "loss": 0.1523, + "step": 17746 + }, + { + "epoch": 0.15405248218331438, + "grad_norm": 0.27734375, + "learning_rate": 0.0018654883578991754, + "loss": 0.1045, + "step": 17747 + }, + { + "epoch": 0.15406116266351855, + "grad_norm": 0.326171875, + "learning_rate": 0.0018654727052223473, + "loss": 0.1143, + "step": 17748 + }, + { + "epoch": 0.1540698431437227, + "grad_norm": 0.2080078125, + "learning_rate": 0.0018654570517084102, + "loss": 0.1162, + "step": 17749 + }, + { + "epoch": 0.15407852362392688, + "grad_norm": 0.15234375, + "learning_rate": 0.0018654413973573805, + "loss": 0.1143, + "step": 17750 + }, + { + "epoch": 0.15408720410413104, + "grad_norm": 0.70703125, + "learning_rate": 0.0018654257421692757, + "loss": 0.1074, + "step": 17751 + }, + { + "epoch": 0.1540958845843352, + "grad_norm": 0.306640625, + "learning_rate": 0.001865410086144113, + "loss": 0.1377, + "step": 17752 + }, + { + "epoch": 0.15410456506453937, + "grad_norm": 0.1904296875, + "learning_rate": 0.001865394429281909, + "loss": 0.1211, + "step": 17753 + }, + { + "epoch": 0.15411324554474354, + "grad_norm": 0.2236328125, + "learning_rate": 0.0018653787715826816, + "loss": 0.0957, + "step": 17754 + }, + { + "epoch": 0.1541219260249477, + "grad_norm": 0.369140625, + "learning_rate": 0.001865363113046447, + "loss": 0.1001, + "step": 17755 + }, + { + "epoch": 0.15413060650515187, + "grad_norm": 0.6484375, + "learning_rate": 0.001865347453673223, + "loss": 0.1465, + "step": 17756 + }, + { + "epoch": 0.15413928698535603, + "grad_norm": 0.380859375, + "learning_rate": 0.0018653317934630262, + "loss": 0.1094, + "step": 17757 + }, + { + "epoch": 0.1541479674655602, + "grad_norm": 0.353515625, + "learning_rate": 0.0018653161324158748, + "loss": 0.1118, + "step": 17758 + }, + { + "epoch": 0.15415664794576436, + "grad_norm": 0.203125, + "learning_rate": 0.0018653004705317846, + "loss": 0.1514, + "step": 17759 + }, + { + "epoch": 0.15416532842596853, + "grad_norm": 0.138671875, + "learning_rate": 0.0018652848078107734, + "loss": 0.1011, + "step": 17760 + }, + { + "epoch": 0.1541740089061727, + "grad_norm": 0.12109375, + "learning_rate": 0.0018652691442528585, + "loss": 0.1147, + "step": 17761 + }, + { + "epoch": 0.15418268938637686, + "grad_norm": 0.177734375, + "learning_rate": 0.0018652534798580562, + "loss": 0.085, + "step": 17762 + }, + { + "epoch": 0.15419136986658102, + "grad_norm": 0.1337890625, + "learning_rate": 0.0018652378146263849, + "loss": 0.1172, + "step": 17763 + }, 
+ { + "epoch": 0.1542000503467852, + "grad_norm": 0.22265625, + "learning_rate": 0.0018652221485578604, + "loss": 0.1118, + "step": 17764 + }, + { + "epoch": 0.15420873082698935, + "grad_norm": 0.08251953125, + "learning_rate": 0.0018652064816525005, + "loss": 0.1045, + "step": 17765 + }, + { + "epoch": 0.15421741130719352, + "grad_norm": 0.74609375, + "learning_rate": 0.0018651908139103227, + "loss": 0.1123, + "step": 17766 + }, + { + "epoch": 0.15422609178739768, + "grad_norm": 0.169921875, + "learning_rate": 0.0018651751453313435, + "loss": 0.103, + "step": 17767 + }, + { + "epoch": 0.15423477226760185, + "grad_norm": 0.63671875, + "learning_rate": 0.0018651594759155803, + "loss": 0.1025, + "step": 17768 + }, + { + "epoch": 0.15424345274780601, + "grad_norm": 0.11669921875, + "learning_rate": 0.00186514380566305, + "loss": 0.1133, + "step": 17769 + }, + { + "epoch": 0.15425213322801018, + "grad_norm": 0.10888671875, + "learning_rate": 0.0018651281345737703, + "loss": 0.1099, + "step": 17770 + }, + { + "epoch": 0.15426081370821434, + "grad_norm": 0.18359375, + "learning_rate": 0.0018651124626477577, + "loss": 0.0972, + "step": 17771 + }, + { + "epoch": 0.1542694941884185, + "grad_norm": 0.12255859375, + "learning_rate": 0.0018650967898850297, + "loss": 0.1094, + "step": 17772 + }, + { + "epoch": 0.15427817466862268, + "grad_norm": 0.30078125, + "learning_rate": 0.0018650811162856033, + "loss": 0.1309, + "step": 17773 + }, + { + "epoch": 0.15428685514882684, + "grad_norm": 0.60546875, + "learning_rate": 0.0018650654418494957, + "loss": 0.209, + "step": 17774 + }, + { + "epoch": 0.154295535629031, + "grad_norm": 0.19921875, + "learning_rate": 0.001865049766576724, + "loss": 0.1699, + "step": 17775 + }, + { + "epoch": 0.15430421610923517, + "grad_norm": 0.8671875, + "learning_rate": 0.0018650340904673055, + "loss": 0.1011, + "step": 17776 + }, + { + "epoch": 0.15431289658943934, + "grad_norm": 0.06640625, + "learning_rate": 0.0018650184135212571, + "loss": 0.0737, + "step": 17777 + }, + { + "epoch": 0.1543215770696435, + "grad_norm": 0.6796875, + "learning_rate": 0.001865002735738596, + "loss": 0.103, + "step": 17778 + }, + { + "epoch": 0.15433025754984767, + "grad_norm": 0.25390625, + "learning_rate": 0.0018649870571193394, + "loss": 0.1201, + "step": 17779 + }, + { + "epoch": 0.15433893803005183, + "grad_norm": 0.06005859375, + "learning_rate": 0.0018649713776635046, + "loss": 0.0977, + "step": 17780 + }, + { + "epoch": 0.154347618510256, + "grad_norm": 0.3203125, + "learning_rate": 0.0018649556973711085, + "loss": 0.0874, + "step": 17781 + }, + { + "epoch": 0.15435629899046016, + "grad_norm": 0.1396484375, + "learning_rate": 0.0018649400162421687, + "loss": 0.1562, + "step": 17782 + }, + { + "epoch": 0.15436497947066433, + "grad_norm": 0.44921875, + "learning_rate": 0.0018649243342767016, + "loss": 0.1338, + "step": 17783 + }, + { + "epoch": 0.1543736599508685, + "grad_norm": 0.1396484375, + "learning_rate": 0.0018649086514747244, + "loss": 0.1064, + "step": 17784 + }, + { + "epoch": 0.15438234043107266, + "grad_norm": 0.0986328125, + "learning_rate": 0.001864892967836255, + "loss": 0.1035, + "step": 17785 + }, + { + "epoch": 0.15439102091127682, + "grad_norm": 0.0859375, + "learning_rate": 0.0018648772833613104, + "loss": 0.1318, + "step": 17786 + }, + { + "epoch": 0.154399701391481, + "grad_norm": 0.51953125, + "learning_rate": 0.0018648615980499073, + "loss": 0.1387, + "step": 17787 + }, + { + "epoch": 0.15440838187168515, + "grad_norm": 0.55859375, + "learning_rate": 
0.001864845911902063, + "loss": 0.0972, + "step": 17788 + }, + { + "epoch": 0.15441706235188932, + "grad_norm": 0.6328125, + "learning_rate": 0.0018648302249177947, + "loss": 0.1514, + "step": 17789 + }, + { + "epoch": 0.15442574283209348, + "grad_norm": 0.57421875, + "learning_rate": 0.0018648145370971192, + "loss": 0.1182, + "step": 17790 + }, + { + "epoch": 0.15443442331229765, + "grad_norm": 0.9140625, + "learning_rate": 0.0018647988484400547, + "loss": 0.1494, + "step": 17791 + }, + { + "epoch": 0.1544431037925018, + "grad_norm": 0.2490234375, + "learning_rate": 0.001864783158946617, + "loss": 0.1445, + "step": 17792 + }, + { + "epoch": 0.15445178427270598, + "grad_norm": 0.31640625, + "learning_rate": 0.0018647674686168244, + "loss": 0.1211, + "step": 17793 + }, + { + "epoch": 0.15446046475291014, + "grad_norm": 0.67578125, + "learning_rate": 0.0018647517774506932, + "loss": 0.1064, + "step": 17794 + }, + { + "epoch": 0.1544691452331143, + "grad_norm": 0.484375, + "learning_rate": 0.0018647360854482414, + "loss": 0.1123, + "step": 17795 + }, + { + "epoch": 0.15447782571331847, + "grad_norm": 0.201171875, + "learning_rate": 0.0018647203926094853, + "loss": 0.126, + "step": 17796 + }, + { + "epoch": 0.15448650619352264, + "grad_norm": 0.42578125, + "learning_rate": 0.0018647046989344429, + "loss": 0.0957, + "step": 17797 + }, + { + "epoch": 0.1544951866737268, + "grad_norm": 1.140625, + "learning_rate": 0.0018646890044231308, + "loss": 0.0996, + "step": 17798 + }, + { + "epoch": 0.15450386715393097, + "grad_norm": 0.1630859375, + "learning_rate": 0.0018646733090755658, + "loss": 0.0942, + "step": 17799 + }, + { + "epoch": 0.15451254763413513, + "grad_norm": 0.12060546875, + "learning_rate": 0.001864657612891766, + "loss": 0.1104, + "step": 17800 + }, + { + "epoch": 0.1545212281143393, + "grad_norm": 0.671875, + "learning_rate": 0.0018646419158717482, + "loss": 0.1426, + "step": 17801 + }, + { + "epoch": 0.15452990859454346, + "grad_norm": 0.072265625, + "learning_rate": 0.0018646262180155293, + "loss": 0.0977, + "step": 17802 + }, + { + "epoch": 0.15453858907474763, + "grad_norm": 0.0791015625, + "learning_rate": 0.0018646105193231267, + "loss": 0.1055, + "step": 17803 + }, + { + "epoch": 0.15454726955495177, + "grad_norm": 0.2158203125, + "learning_rate": 0.0018645948197945576, + "loss": 0.1245, + "step": 17804 + }, + { + "epoch": 0.15455595003515593, + "grad_norm": 0.302734375, + "learning_rate": 0.001864579119429839, + "loss": 0.0869, + "step": 17805 + }, + { + "epoch": 0.1545646305153601, + "grad_norm": 0.15625, + "learning_rate": 0.0018645634182289882, + "loss": 0.0981, + "step": 17806 + }, + { + "epoch": 0.15457331099556426, + "grad_norm": 0.337890625, + "learning_rate": 0.0018645477161920224, + "loss": 0.0791, + "step": 17807 + }, + { + "epoch": 0.15458199147576843, + "grad_norm": 0.25, + "learning_rate": 0.0018645320133189585, + "loss": 0.0996, + "step": 17808 + }, + { + "epoch": 0.1545906719559726, + "grad_norm": 0.77734375, + "learning_rate": 0.0018645163096098138, + "loss": 0.1396, + "step": 17809 + }, + { + "epoch": 0.15459935243617676, + "grad_norm": 0.2333984375, + "learning_rate": 0.0018645006050646061, + "loss": 0.1309, + "step": 17810 + }, + { + "epoch": 0.15460803291638092, + "grad_norm": 1.109375, + "learning_rate": 0.0018644848996833515, + "loss": 0.1055, + "step": 17811 + }, + { + "epoch": 0.1546167133965851, + "grad_norm": 0.19140625, + "learning_rate": 0.0018644691934660677, + "loss": 0.1147, + "step": 17812 + }, + { + "epoch": 0.15462539387678925, + "grad_norm": 
0.259765625, + "learning_rate": 0.0018644534864127723, + "loss": 0.1562, + "step": 17813 + }, + { + "epoch": 0.15463407435699342, + "grad_norm": 0.52734375, + "learning_rate": 0.0018644377785234816, + "loss": 0.1118, + "step": 17814 + }, + { + "epoch": 0.15464275483719758, + "grad_norm": 0.326171875, + "learning_rate": 0.0018644220697982136, + "loss": 0.1182, + "step": 17815 + }, + { + "epoch": 0.15465143531740175, + "grad_norm": 0.53125, + "learning_rate": 0.0018644063602369849, + "loss": 0.1001, + "step": 17816 + }, + { + "epoch": 0.1546601157976059, + "grad_norm": 0.244140625, + "learning_rate": 0.0018643906498398129, + "loss": 0.0972, + "step": 17817 + }, + { + "epoch": 0.15466879627781008, + "grad_norm": 1.53125, + "learning_rate": 0.0018643749386067147, + "loss": 0.2148, + "step": 17818 + }, + { + "epoch": 0.15467747675801424, + "grad_norm": 0.193359375, + "learning_rate": 0.0018643592265377075, + "loss": 0.1006, + "step": 17819 + }, + { + "epoch": 0.1546861572382184, + "grad_norm": 0.51953125, + "learning_rate": 0.001864343513632809, + "loss": 0.1182, + "step": 17820 + }, + { + "epoch": 0.15469483771842257, + "grad_norm": 0.115234375, + "learning_rate": 0.0018643277998920357, + "loss": 0.1001, + "step": 17821 + }, + { + "epoch": 0.15470351819862674, + "grad_norm": 0.087890625, + "learning_rate": 0.0018643120853154048, + "loss": 0.127, + "step": 17822 + }, + { + "epoch": 0.1547121986788309, + "grad_norm": 0.6328125, + "learning_rate": 0.001864296369902934, + "loss": 0.1719, + "step": 17823 + }, + { + "epoch": 0.15472087915903507, + "grad_norm": 0.10009765625, + "learning_rate": 0.0018642806536546398, + "loss": 0.0908, + "step": 17824 + }, + { + "epoch": 0.15472955963923923, + "grad_norm": 0.185546875, + "learning_rate": 0.00186426493657054, + "loss": 0.1367, + "step": 17825 + }, + { + "epoch": 0.1547382401194434, + "grad_norm": 0.130859375, + "learning_rate": 0.0018642492186506517, + "loss": 0.1357, + "step": 17826 + }, + { + "epoch": 0.15474692059964756, + "grad_norm": 0.431640625, + "learning_rate": 0.0018642334998949914, + "loss": 0.0615, + "step": 17827 + }, + { + "epoch": 0.15475560107985173, + "grad_norm": 0.51953125, + "learning_rate": 0.0018642177803035772, + "loss": 0.0928, + "step": 17828 + }, + { + "epoch": 0.1547642815600559, + "grad_norm": 0.7421875, + "learning_rate": 0.001864202059876426, + "loss": 0.1719, + "step": 17829 + }, + { + "epoch": 0.15477296204026006, + "grad_norm": 0.5546875, + "learning_rate": 0.0018641863386135546, + "loss": 0.1001, + "step": 17830 + }, + { + "epoch": 0.15478164252046422, + "grad_norm": 0.251953125, + "learning_rate": 0.0018641706165149808, + "loss": 0.1211, + "step": 17831 + }, + { + "epoch": 0.1547903230006684, + "grad_norm": 0.3515625, + "learning_rate": 0.0018641548935807212, + "loss": 0.1006, + "step": 17832 + }, + { + "epoch": 0.15479900348087255, + "grad_norm": 0.2265625, + "learning_rate": 0.0018641391698107934, + "loss": 0.0977, + "step": 17833 + }, + { + "epoch": 0.15480768396107672, + "grad_norm": 0.95703125, + "learning_rate": 0.0018641234452052148, + "loss": 0.1162, + "step": 17834 + }, + { + "epoch": 0.15481636444128088, + "grad_norm": 0.8828125, + "learning_rate": 0.001864107719764002, + "loss": 0.1445, + "step": 17835 + }, + { + "epoch": 0.15482504492148505, + "grad_norm": 0.84375, + "learning_rate": 0.0018640919934871725, + "loss": 0.1025, + "step": 17836 + }, + { + "epoch": 0.15483372540168922, + "grad_norm": 0.10986328125, + "learning_rate": 0.0018640762663747435, + "loss": 0.0835, + "step": 17837 + }, + { + "epoch": 
0.15484240588189338, + "grad_norm": 0.490234375, + "learning_rate": 0.0018640605384267319, + "loss": 0.166, + "step": 17838 + }, + { + "epoch": 0.15485108636209755, + "grad_norm": 0.4140625, + "learning_rate": 0.0018640448096431555, + "loss": 0.0874, + "step": 17839 + }, + { + "epoch": 0.1548597668423017, + "grad_norm": 0.134765625, + "learning_rate": 0.0018640290800240311, + "loss": 0.1387, + "step": 17840 + }, + { + "epoch": 0.15486844732250588, + "grad_norm": 0.09521484375, + "learning_rate": 0.0018640133495693761, + "loss": 0.125, + "step": 17841 + }, + { + "epoch": 0.15487712780271004, + "grad_norm": 0.279296875, + "learning_rate": 0.0018639976182792074, + "loss": 0.106, + "step": 17842 + }, + { + "epoch": 0.1548858082829142, + "grad_norm": 0.32421875, + "learning_rate": 0.0018639818861535425, + "loss": 0.0913, + "step": 17843 + }, + { + "epoch": 0.15489448876311837, + "grad_norm": 0.4296875, + "learning_rate": 0.0018639661531923985, + "loss": 0.1475, + "step": 17844 + }, + { + "epoch": 0.15490316924332254, + "grad_norm": 0.23828125, + "learning_rate": 0.0018639504193957925, + "loss": 0.1162, + "step": 17845 + }, + { + "epoch": 0.1549118497235267, + "grad_norm": 0.84375, + "learning_rate": 0.0018639346847637416, + "loss": 0.1069, + "step": 17846 + }, + { + "epoch": 0.15492053020373087, + "grad_norm": 0.2080078125, + "learning_rate": 0.0018639189492962636, + "loss": 0.1309, + "step": 17847 + }, + { + "epoch": 0.15492921068393503, + "grad_norm": 0.94921875, + "learning_rate": 0.001863903212993375, + "loss": 0.1289, + "step": 17848 + }, + { + "epoch": 0.1549378911641392, + "grad_norm": 0.2099609375, + "learning_rate": 0.0018638874758550937, + "loss": 0.0996, + "step": 17849 + }, + { + "epoch": 0.15494657164434336, + "grad_norm": 0.5390625, + "learning_rate": 0.0018638717378814363, + "loss": 0.0869, + "step": 17850 + }, + { + "epoch": 0.15495525212454753, + "grad_norm": 0.33203125, + "learning_rate": 0.0018638559990724203, + "loss": 0.1084, + "step": 17851 + }, + { + "epoch": 0.1549639326047517, + "grad_norm": 0.51953125, + "learning_rate": 0.0018638402594280629, + "loss": 0.082, + "step": 17852 + }, + { + "epoch": 0.15497261308495586, + "grad_norm": 0.5234375, + "learning_rate": 0.001863824518948381, + "loss": 0.1182, + "step": 17853 + }, + { + "epoch": 0.15498129356516002, + "grad_norm": 0.1943359375, + "learning_rate": 0.0018638087776333924, + "loss": 0.1523, + "step": 17854 + }, + { + "epoch": 0.1549899740453642, + "grad_norm": 0.478515625, + "learning_rate": 0.001863793035483114, + "loss": 0.0933, + "step": 17855 + }, + { + "epoch": 0.15499865452556835, + "grad_norm": 1.484375, + "learning_rate": 0.0018637772924975627, + "loss": 0.1289, + "step": 17856 + }, + { + "epoch": 0.15500733500577252, + "grad_norm": 0.494140625, + "learning_rate": 0.0018637615486767562, + "loss": 0.123, + "step": 17857 + }, + { + "epoch": 0.15501601548597668, + "grad_norm": 1.25, + "learning_rate": 0.0018637458040207115, + "loss": 0.1396, + "step": 17858 + }, + { + "epoch": 0.15502469596618085, + "grad_norm": 0.1328125, + "learning_rate": 0.001863730058529446, + "loss": 0.1133, + "step": 17859 + }, + { + "epoch": 0.155033376446385, + "grad_norm": 0.486328125, + "learning_rate": 0.0018637143122029764, + "loss": 0.0947, + "step": 17860 + }, + { + "epoch": 0.15504205692658918, + "grad_norm": 0.287109375, + "learning_rate": 0.0018636985650413208, + "loss": 0.1021, + "step": 17861 + }, + { + "epoch": 0.15505073740679334, + "grad_norm": 0.26171875, + "learning_rate": 0.0018636828170444954, + "loss": 0.1221, + 
"step": 17862 + }, + { + "epoch": 0.1550594178869975, + "grad_norm": 0.251953125, + "learning_rate": 0.0018636670682125186, + "loss": 0.1201, + "step": 17863 + }, + { + "epoch": 0.15506809836720167, + "grad_norm": 0.263671875, + "learning_rate": 0.0018636513185454064, + "loss": 0.0938, + "step": 17864 + }, + { + "epoch": 0.15507677884740584, + "grad_norm": 0.1455078125, + "learning_rate": 0.0018636355680431772, + "loss": 0.1211, + "step": 17865 + }, + { + "epoch": 0.15508545932761, + "grad_norm": 0.1279296875, + "learning_rate": 0.001863619816705847, + "loss": 0.123, + "step": 17866 + }, + { + "epoch": 0.15509413980781417, + "grad_norm": 0.2021484375, + "learning_rate": 0.001863604064533434, + "loss": 0.0991, + "step": 17867 + }, + { + "epoch": 0.15510282028801833, + "grad_norm": 0.0634765625, + "learning_rate": 0.0018635883115259548, + "loss": 0.0991, + "step": 17868 + }, + { + "epoch": 0.1551115007682225, + "grad_norm": 0.458984375, + "learning_rate": 0.001863572557683427, + "loss": 0.0938, + "step": 17869 + }, + { + "epoch": 0.15512018124842666, + "grad_norm": 0.4140625, + "learning_rate": 0.001863556803005868, + "loss": 0.123, + "step": 17870 + }, + { + "epoch": 0.15512886172863083, + "grad_norm": 0.13671875, + "learning_rate": 0.0018635410474932943, + "loss": 0.1289, + "step": 17871 + }, + { + "epoch": 0.155137542208835, + "grad_norm": 0.482421875, + "learning_rate": 0.0018635252911457238, + "loss": 0.0913, + "step": 17872 + }, + { + "epoch": 0.15514622268903916, + "grad_norm": 0.15625, + "learning_rate": 0.0018635095339631735, + "loss": 0.1484, + "step": 17873 + }, + { + "epoch": 0.15515490316924332, + "grad_norm": 0.287109375, + "learning_rate": 0.0018634937759456607, + "loss": 0.1328, + "step": 17874 + }, + { + "epoch": 0.1551635836494475, + "grad_norm": 0.357421875, + "learning_rate": 0.0018634780170932027, + "loss": 0.1089, + "step": 17875 + }, + { + "epoch": 0.15517226412965165, + "grad_norm": 0.13671875, + "learning_rate": 0.0018634622574058163, + "loss": 0.0942, + "step": 17876 + }, + { + "epoch": 0.15518094460985582, + "grad_norm": 0.51171875, + "learning_rate": 0.0018634464968835193, + "loss": 0.1099, + "step": 17877 + }, + { + "epoch": 0.15518962509005998, + "grad_norm": 0.1337890625, + "learning_rate": 0.0018634307355263285, + "loss": 0.1152, + "step": 17878 + }, + { + "epoch": 0.15519830557026415, + "grad_norm": 0.345703125, + "learning_rate": 0.0018634149733342616, + "loss": 0.125, + "step": 17879 + }, + { + "epoch": 0.15520698605046832, + "grad_norm": 0.275390625, + "learning_rate": 0.001863399210307335, + "loss": 0.1191, + "step": 17880 + }, + { + "epoch": 0.15521566653067248, + "grad_norm": 0.234375, + "learning_rate": 0.0018633834464455673, + "loss": 0.1133, + "step": 17881 + }, + { + "epoch": 0.15522434701087665, + "grad_norm": 0.2734375, + "learning_rate": 0.0018633676817489742, + "loss": 0.1182, + "step": 17882 + }, + { + "epoch": 0.1552330274910808, + "grad_norm": 0.337890625, + "learning_rate": 0.0018633519162175739, + "loss": 0.1816, + "step": 17883 + }, + { + "epoch": 0.15524170797128498, + "grad_norm": 0.228515625, + "learning_rate": 0.0018633361498513836, + "loss": 0.1123, + "step": 17884 + }, + { + "epoch": 0.15525038845148914, + "grad_norm": 0.15234375, + "learning_rate": 0.0018633203826504202, + "loss": 0.1143, + "step": 17885 + }, + { + "epoch": 0.1552590689316933, + "grad_norm": 0.09033203125, + "learning_rate": 0.001863304614614701, + "loss": 0.0967, + "step": 17886 + }, + { + "epoch": 0.15526774941189747, + "grad_norm": 0.400390625, + "learning_rate": 
0.0018632888457442436, + "loss": 0.168, + "step": 17887 + }, + { + "epoch": 0.15527642989210164, + "grad_norm": 0.0810546875, + "learning_rate": 0.0018632730760390647, + "loss": 0.0942, + "step": 17888 + }, + { + "epoch": 0.1552851103723058, + "grad_norm": 0.99609375, + "learning_rate": 0.001863257305499182, + "loss": 0.104, + "step": 17889 + }, + { + "epoch": 0.15529379085250997, + "grad_norm": 0.08837890625, + "learning_rate": 0.0018632415341246127, + "loss": 0.0889, + "step": 17890 + }, + { + "epoch": 0.15530247133271413, + "grad_norm": 0.71484375, + "learning_rate": 0.0018632257619153737, + "loss": 0.0908, + "step": 17891 + }, + { + "epoch": 0.1553111518129183, + "grad_norm": 0.203125, + "learning_rate": 0.0018632099888714825, + "loss": 0.124, + "step": 17892 + }, + { + "epoch": 0.15531983229312246, + "grad_norm": 0.84375, + "learning_rate": 0.0018631942149929567, + "loss": 0.1167, + "step": 17893 + }, + { + "epoch": 0.15532851277332663, + "grad_norm": 0.150390625, + "learning_rate": 0.0018631784402798129, + "loss": 0.1455, + "step": 17894 + }, + { + "epoch": 0.1553371932535308, + "grad_norm": 0.1396484375, + "learning_rate": 0.0018631626647320682, + "loss": 0.1006, + "step": 17895 + }, + { + "epoch": 0.15534587373373496, + "grad_norm": 1.1328125, + "learning_rate": 0.001863146888349741, + "loss": 0.1182, + "step": 17896 + }, + { + "epoch": 0.15535455421393912, + "grad_norm": 0.08740234375, + "learning_rate": 0.0018631311111328474, + "loss": 0.1299, + "step": 17897 + }, + { + "epoch": 0.1553632346941433, + "grad_norm": 0.107421875, + "learning_rate": 0.001863115333081405, + "loss": 0.0864, + "step": 17898 + }, + { + "epoch": 0.15537191517434745, + "grad_norm": 0.140625, + "learning_rate": 0.0018630995541954315, + "loss": 0.0957, + "step": 17899 + }, + { + "epoch": 0.15538059565455162, + "grad_norm": 0.6015625, + "learning_rate": 0.0018630837744749436, + "loss": 0.1133, + "step": 17900 + }, + { + "epoch": 0.15538927613475578, + "grad_norm": 0.158203125, + "learning_rate": 0.0018630679939199586, + "loss": 0.1357, + "step": 17901 + }, + { + "epoch": 0.15539795661495995, + "grad_norm": 0.162109375, + "learning_rate": 0.001863052212530494, + "loss": 0.1118, + "step": 17902 + }, + { + "epoch": 0.1554066370951641, + "grad_norm": 0.11572265625, + "learning_rate": 0.0018630364303065673, + "loss": 0.1572, + "step": 17903 + }, + { + "epoch": 0.15541531757536828, + "grad_norm": 0.1083984375, + "learning_rate": 0.0018630206472481949, + "loss": 0.0938, + "step": 17904 + }, + { + "epoch": 0.15542399805557244, + "grad_norm": 0.302734375, + "learning_rate": 0.0018630048633553951, + "loss": 0.124, + "step": 17905 + }, + { + "epoch": 0.1554326785357766, + "grad_norm": 0.87109375, + "learning_rate": 0.001862989078628184, + "loss": 0.168, + "step": 17906 + }, + { + "epoch": 0.15544135901598077, + "grad_norm": 0.8671875, + "learning_rate": 0.00186297329306658, + "loss": 0.1094, + "step": 17907 + }, + { + "epoch": 0.15545003949618494, + "grad_norm": 0.27734375, + "learning_rate": 0.0018629575066705997, + "loss": 0.0811, + "step": 17908 + }, + { + "epoch": 0.1554587199763891, + "grad_norm": 0.302734375, + "learning_rate": 0.0018629417194402607, + "loss": 0.1182, + "step": 17909 + }, + { + "epoch": 0.15546740045659327, + "grad_norm": 0.1318359375, + "learning_rate": 0.00186292593137558, + "loss": 0.1055, + "step": 17910 + }, + { + "epoch": 0.15547608093679743, + "grad_norm": 0.75, + "learning_rate": 0.0018629101424765748, + "loss": 0.1113, + "step": 17911 + }, + { + "epoch": 0.1554847614170016, + "grad_norm": 
0.42578125, + "learning_rate": 0.001862894352743263, + "loss": 0.1758, + "step": 17912 + }, + { + "epoch": 0.15549344189720576, + "grad_norm": 0.2421875, + "learning_rate": 0.001862878562175661, + "loss": 0.1123, + "step": 17913 + }, + { + "epoch": 0.15550212237740993, + "grad_norm": 0.1005859375, + "learning_rate": 0.0018628627707737864, + "loss": 0.0869, + "step": 17914 + }, + { + "epoch": 0.1555108028576141, + "grad_norm": 0.2373046875, + "learning_rate": 0.001862846978537657, + "loss": 0.0913, + "step": 17915 + }, + { + "epoch": 0.15551948333781826, + "grad_norm": 0.53125, + "learning_rate": 0.001862831185467289, + "loss": 0.0977, + "step": 17916 + }, + { + "epoch": 0.15552816381802242, + "grad_norm": 0.125, + "learning_rate": 0.0018628153915627005, + "loss": 0.1094, + "step": 17917 + }, + { + "epoch": 0.1555368442982266, + "grad_norm": 0.85546875, + "learning_rate": 0.001862799596823909, + "loss": 0.0879, + "step": 17918 + }, + { + "epoch": 0.15554552477843075, + "grad_norm": 0.60546875, + "learning_rate": 0.0018627838012509309, + "loss": 0.1426, + "step": 17919 + }, + { + "epoch": 0.15555420525863492, + "grad_norm": 0.265625, + "learning_rate": 0.0018627680048437837, + "loss": 0.1465, + "step": 17920 + }, + { + "epoch": 0.15556288573883909, + "grad_norm": 0.7734375, + "learning_rate": 0.001862752207602485, + "loss": 0.1035, + "step": 17921 + }, + { + "epoch": 0.15557156621904325, + "grad_norm": 0.4765625, + "learning_rate": 0.001862736409527052, + "loss": 0.1055, + "step": 17922 + }, + { + "epoch": 0.15558024669924742, + "grad_norm": 0.236328125, + "learning_rate": 0.001862720610617502, + "loss": 0.1328, + "step": 17923 + }, + { + "epoch": 0.15558892717945158, + "grad_norm": 0.07421875, + "learning_rate": 0.0018627048108738522, + "loss": 0.0815, + "step": 17924 + }, + { + "epoch": 0.15559760765965575, + "grad_norm": 0.318359375, + "learning_rate": 0.00186268901029612, + "loss": 0.0854, + "step": 17925 + }, + { + "epoch": 0.1556062881398599, + "grad_norm": 0.52734375, + "learning_rate": 0.0018626732088843224, + "loss": 0.1084, + "step": 17926 + }, + { + "epoch": 0.15561496862006408, + "grad_norm": 0.24609375, + "learning_rate": 0.001862657406638477, + "loss": 0.1377, + "step": 17927 + }, + { + "epoch": 0.1556236491002682, + "grad_norm": 0.91015625, + "learning_rate": 0.0018626416035586004, + "loss": 0.1143, + "step": 17928 + }, + { + "epoch": 0.15563232958047238, + "grad_norm": 0.54296875, + "learning_rate": 0.0018626257996447108, + "loss": 0.1523, + "step": 17929 + }, + { + "epoch": 0.15564101006067654, + "grad_norm": 0.1435546875, + "learning_rate": 0.0018626099948968252, + "loss": 0.1533, + "step": 17930 + }, + { + "epoch": 0.1556496905408807, + "grad_norm": 0.365234375, + "learning_rate": 0.0018625941893149606, + "loss": 0.1602, + "step": 17931 + }, + { + "epoch": 0.15565837102108487, + "grad_norm": 0.62109375, + "learning_rate": 0.0018625783828991346, + "loss": 0.1533, + "step": 17932 + }, + { + "epoch": 0.15566705150128904, + "grad_norm": 0.412109375, + "learning_rate": 0.001862562575649364, + "loss": 0.1445, + "step": 17933 + }, + { + "epoch": 0.1556757319814932, + "grad_norm": 0.5078125, + "learning_rate": 0.0018625467675656667, + "loss": 0.0967, + "step": 17934 + }, + { + "epoch": 0.15568441246169737, + "grad_norm": 0.125, + "learning_rate": 0.0018625309586480596, + "loss": 0.1074, + "step": 17935 + }, + { + "epoch": 0.15569309294190153, + "grad_norm": 0.244140625, + "learning_rate": 0.0018625151488965601, + "loss": 0.1094, + "step": 17936 + }, + { + "epoch": 
0.1557017734221057, + "grad_norm": 0.384765625, + "learning_rate": 0.0018624993383111856, + "loss": 0.0957, + "step": 17937 + }, + { + "epoch": 0.15571045390230986, + "grad_norm": 0.23046875, + "learning_rate": 0.0018624835268919533, + "loss": 0.1172, + "step": 17938 + }, + { + "epoch": 0.15571913438251403, + "grad_norm": 0.69140625, + "learning_rate": 0.0018624677146388804, + "loss": 0.085, + "step": 17939 + }, + { + "epoch": 0.1557278148627182, + "grad_norm": 0.2431640625, + "learning_rate": 0.001862451901551984, + "loss": 0.1309, + "step": 17940 + }, + { + "epoch": 0.15573649534292236, + "grad_norm": 0.150390625, + "learning_rate": 0.001862436087631282, + "loss": 0.1187, + "step": 17941 + }, + { + "epoch": 0.15574517582312652, + "grad_norm": 0.2265625, + "learning_rate": 0.0018624202728767913, + "loss": 0.0957, + "step": 17942 + }, + { + "epoch": 0.1557538563033307, + "grad_norm": 0.5078125, + "learning_rate": 0.0018624044572885293, + "loss": 0.125, + "step": 17943 + }, + { + "epoch": 0.15576253678353486, + "grad_norm": 0.75, + "learning_rate": 0.001862388640866513, + "loss": 0.1504, + "step": 17944 + }, + { + "epoch": 0.15577121726373902, + "grad_norm": 0.421875, + "learning_rate": 0.00186237282361076, + "loss": 0.1064, + "step": 17945 + }, + { + "epoch": 0.15577989774394319, + "grad_norm": 0.294921875, + "learning_rate": 0.0018623570055212877, + "loss": 0.1504, + "step": 17946 + }, + { + "epoch": 0.15578857822414735, + "grad_norm": 0.1767578125, + "learning_rate": 0.0018623411865981133, + "loss": 0.0898, + "step": 17947 + }, + { + "epoch": 0.15579725870435152, + "grad_norm": 0.09912109375, + "learning_rate": 0.001862325366841254, + "loss": 0.0947, + "step": 17948 + }, + { + "epoch": 0.15580593918455568, + "grad_norm": 0.365234375, + "learning_rate": 0.0018623095462507267, + "loss": 0.1182, + "step": 17949 + }, + { + "epoch": 0.15581461966475985, + "grad_norm": 0.126953125, + "learning_rate": 0.0018622937248265494, + "loss": 0.123, + "step": 17950 + }, + { + "epoch": 0.155823300144964, + "grad_norm": 0.1259765625, + "learning_rate": 0.0018622779025687392, + "loss": 0.1133, + "step": 17951 + }, + { + "epoch": 0.15583198062516818, + "grad_norm": 0.51953125, + "learning_rate": 0.0018622620794773136, + "loss": 0.125, + "step": 17952 + }, + { + "epoch": 0.15584066110537234, + "grad_norm": 1.21875, + "learning_rate": 0.0018622462555522893, + "loss": 0.1855, + "step": 17953 + }, + { + "epoch": 0.1558493415855765, + "grad_norm": 0.234375, + "learning_rate": 0.001862230430793684, + "loss": 0.1045, + "step": 17954 + }, + { + "epoch": 0.15585802206578067, + "grad_norm": 0.10107421875, + "learning_rate": 0.001862214605201515, + "loss": 0.0952, + "step": 17955 + }, + { + "epoch": 0.15586670254598484, + "grad_norm": 0.369140625, + "learning_rate": 0.0018621987787757997, + "loss": 0.1562, + "step": 17956 + }, + { + "epoch": 0.155875383026189, + "grad_norm": 0.59765625, + "learning_rate": 0.0018621829515165548, + "loss": 0.1104, + "step": 17957 + }, + { + "epoch": 0.15588406350639317, + "grad_norm": 0.09033203125, + "learning_rate": 0.0018621671234237982, + "loss": 0.0938, + "step": 17958 + }, + { + "epoch": 0.15589274398659733, + "grad_norm": 0.265625, + "learning_rate": 0.0018621512944975475, + "loss": 0.1377, + "step": 17959 + }, + { + "epoch": 0.1559014244668015, + "grad_norm": 0.162109375, + "learning_rate": 0.0018621354647378192, + "loss": 0.1104, + "step": 17960 + }, + { + "epoch": 0.15591010494700566, + "grad_norm": 1.0625, + "learning_rate": 0.001862119634144631, + "loss": 0.1895, + "step": 
17961 + }, + { + "epoch": 0.15591878542720983, + "grad_norm": 0.337890625, + "learning_rate": 0.0018621038027180005, + "loss": 0.1396, + "step": 17962 + }, + { + "epoch": 0.155927465907414, + "grad_norm": 0.177734375, + "learning_rate": 0.0018620879704579446, + "loss": 0.1211, + "step": 17963 + }, + { + "epoch": 0.15593614638761816, + "grad_norm": 0.1943359375, + "learning_rate": 0.0018620721373644809, + "loss": 0.0942, + "step": 17964 + }, + { + "epoch": 0.15594482686782232, + "grad_norm": 0.203125, + "learning_rate": 0.0018620563034376263, + "loss": 0.125, + "step": 17965 + }, + { + "epoch": 0.1559535073480265, + "grad_norm": 0.2158203125, + "learning_rate": 0.0018620404686773987, + "loss": 0.1113, + "step": 17966 + }, + { + "epoch": 0.15596218782823065, + "grad_norm": 0.1728515625, + "learning_rate": 0.0018620246330838149, + "loss": 0.106, + "step": 17967 + }, + { + "epoch": 0.15597086830843482, + "grad_norm": 0.1337890625, + "learning_rate": 0.0018620087966568922, + "loss": 0.0942, + "step": 17968 + }, + { + "epoch": 0.15597954878863898, + "grad_norm": 0.279296875, + "learning_rate": 0.0018619929593966485, + "loss": 0.123, + "step": 17969 + }, + { + "epoch": 0.15598822926884315, + "grad_norm": 0.189453125, + "learning_rate": 0.0018619771213031005, + "loss": 0.1504, + "step": 17970 + }, + { + "epoch": 0.1559969097490473, + "grad_norm": 0.8046875, + "learning_rate": 0.0018619612823762657, + "loss": 0.1289, + "step": 17971 + }, + { + "epoch": 0.15600559022925148, + "grad_norm": 0.306640625, + "learning_rate": 0.0018619454426161617, + "loss": 0.1328, + "step": 17972 + }, + { + "epoch": 0.15601427070945564, + "grad_norm": 0.11962890625, + "learning_rate": 0.0018619296020228055, + "loss": 0.1338, + "step": 17973 + }, + { + "epoch": 0.1560229511896598, + "grad_norm": 0.1826171875, + "learning_rate": 0.0018619137605962147, + "loss": 0.1689, + "step": 17974 + }, + { + "epoch": 0.15603163166986397, + "grad_norm": 0.1552734375, + "learning_rate": 0.0018618979183364062, + "loss": 0.0767, + "step": 17975 + }, + { + "epoch": 0.15604031215006814, + "grad_norm": 0.474609375, + "learning_rate": 0.001861882075243398, + "loss": 0.1196, + "step": 17976 + }, + { + "epoch": 0.1560489926302723, + "grad_norm": 0.12158203125, + "learning_rate": 0.0018618662313172065, + "loss": 0.1201, + "step": 17977 + }, + { + "epoch": 0.15605767311047647, + "grad_norm": 0.416015625, + "learning_rate": 0.0018618503865578497, + "loss": 0.0957, + "step": 17978 + }, + { + "epoch": 0.15606635359068063, + "grad_norm": 0.0888671875, + "learning_rate": 0.001861834540965345, + "loss": 0.0874, + "step": 17979 + }, + { + "epoch": 0.1560750340708848, + "grad_norm": 0.5625, + "learning_rate": 0.0018618186945397093, + "loss": 0.063, + "step": 17980 + }, + { + "epoch": 0.15608371455108896, + "grad_norm": 0.734375, + "learning_rate": 0.00186180284728096, + "loss": 0.1426, + "step": 17981 + }, + { + "epoch": 0.15609239503129313, + "grad_norm": 0.1640625, + "learning_rate": 0.001861786999189115, + "loss": 0.1758, + "step": 17982 + }, + { + "epoch": 0.1561010755114973, + "grad_norm": 0.65234375, + "learning_rate": 0.0018617711502641905, + "loss": 0.1553, + "step": 17983 + }, + { + "epoch": 0.15610975599170146, + "grad_norm": 0.345703125, + "learning_rate": 0.001861755300506205, + "loss": 0.1572, + "step": 17984 + }, + { + "epoch": 0.15611843647190563, + "grad_norm": 0.90234375, + "learning_rate": 0.0018617394499151755, + "loss": 0.1138, + "step": 17985 + }, + { + "epoch": 0.1561271169521098, + "grad_norm": 0.58203125, + "learning_rate": 
0.0018617235984911187, + "loss": 0.085, + "step": 17986 + }, + { + "epoch": 0.15613579743231396, + "grad_norm": 0.1923828125, + "learning_rate": 0.0018617077462340526, + "loss": 0.1523, + "step": 17987 + }, + { + "epoch": 0.15614447791251812, + "grad_norm": 0.462890625, + "learning_rate": 0.0018616918931439943, + "loss": 0.1069, + "step": 17988 + }, + { + "epoch": 0.15615315839272229, + "grad_norm": 0.08349609375, + "learning_rate": 0.0018616760392209616, + "loss": 0.1011, + "step": 17989 + }, + { + "epoch": 0.15616183887292645, + "grad_norm": 0.93359375, + "learning_rate": 0.0018616601844649713, + "loss": 0.125, + "step": 17990 + }, + { + "epoch": 0.15617051935313062, + "grad_norm": 0.09814453125, + "learning_rate": 0.0018616443288760405, + "loss": 0.1143, + "step": 17991 + }, + { + "epoch": 0.15617919983333478, + "grad_norm": 0.19140625, + "learning_rate": 0.001861628472454187, + "loss": 0.1035, + "step": 17992 + }, + { + "epoch": 0.15618788031353895, + "grad_norm": 0.1923828125, + "learning_rate": 0.0018616126151994282, + "loss": 0.1426, + "step": 17993 + }, + { + "epoch": 0.1561965607937431, + "grad_norm": 0.1318359375, + "learning_rate": 0.0018615967571117811, + "loss": 0.127, + "step": 17994 + }, + { + "epoch": 0.15620524127394728, + "grad_norm": 0.302734375, + "learning_rate": 0.0018615808981912634, + "loss": 0.1172, + "step": 17995 + }, + { + "epoch": 0.15621392175415144, + "grad_norm": 0.8828125, + "learning_rate": 0.0018615650384378921, + "loss": 0.1357, + "step": 17996 + }, + { + "epoch": 0.1562226022343556, + "grad_norm": 0.94140625, + "learning_rate": 0.0018615491778516849, + "loss": 0.1406, + "step": 17997 + }, + { + "epoch": 0.15623128271455977, + "grad_norm": 0.337890625, + "learning_rate": 0.0018615333164326588, + "loss": 0.1299, + "step": 17998 + }, + { + "epoch": 0.15623996319476394, + "grad_norm": 0.6640625, + "learning_rate": 0.0018615174541808314, + "loss": 0.0977, + "step": 17999 + }, + { + "epoch": 0.1562486436749681, + "grad_norm": 0.1337890625, + "learning_rate": 0.00186150159109622, + "loss": 0.0928, + "step": 18000 + }, + { + "epoch": 0.15625732415517227, + "grad_norm": 1.1484375, + "learning_rate": 0.0018614857271788418, + "loss": 0.1426, + "step": 18001 + }, + { + "epoch": 0.15626600463537643, + "grad_norm": 0.330078125, + "learning_rate": 0.0018614698624287145, + "loss": 0.123, + "step": 18002 + }, + { + "epoch": 0.1562746851155806, + "grad_norm": 0.10009765625, + "learning_rate": 0.001861453996845855, + "loss": 0.1338, + "step": 18003 + }, + { + "epoch": 0.15628336559578476, + "grad_norm": 0.267578125, + "learning_rate": 0.001861438130430281, + "loss": 0.1187, + "step": 18004 + }, + { + "epoch": 0.15629204607598893, + "grad_norm": 0.19921875, + "learning_rate": 0.0018614222631820093, + "loss": 0.1279, + "step": 18005 + }, + { + "epoch": 0.1563007265561931, + "grad_norm": 0.984375, + "learning_rate": 0.0018614063951010581, + "loss": 0.1211, + "step": 18006 + }, + { + "epoch": 0.15630940703639726, + "grad_norm": 0.62890625, + "learning_rate": 0.001861390526187444, + "loss": 0.1201, + "step": 18007 + }, + { + "epoch": 0.15631808751660142, + "grad_norm": 0.328125, + "learning_rate": 0.0018613746564411849, + "loss": 0.105, + "step": 18008 + }, + { + "epoch": 0.1563267679968056, + "grad_norm": 0.0888671875, + "learning_rate": 0.0018613587858622976, + "loss": 0.1328, + "step": 18009 + }, + { + "epoch": 0.15633544847700975, + "grad_norm": 0.546875, + "learning_rate": 0.0018613429144508, + "loss": 0.1108, + "step": 18010 + }, + { + "epoch": 0.15634412895721392, + 
"grad_norm": 0.71875, + "learning_rate": 0.0018613270422067096, + "loss": 0.0938, + "step": 18011 + }, + { + "epoch": 0.15635280943741808, + "grad_norm": 1.640625, + "learning_rate": 0.0018613111691300427, + "loss": 0.1289, + "step": 18012 + }, + { + "epoch": 0.15636148991762225, + "grad_norm": 0.97265625, + "learning_rate": 0.0018612952952208176, + "loss": 0.0947, + "step": 18013 + }, + { + "epoch": 0.1563701703978264, + "grad_norm": 0.5546875, + "learning_rate": 0.0018612794204790513, + "loss": 0.1172, + "step": 18014 + }, + { + "epoch": 0.15637885087803058, + "grad_norm": 0.2119140625, + "learning_rate": 0.0018612635449047614, + "loss": 0.1152, + "step": 18015 + }, + { + "epoch": 0.15638753135823474, + "grad_norm": 0.29296875, + "learning_rate": 0.0018612476684979654, + "loss": 0.0938, + "step": 18016 + }, + { + "epoch": 0.1563962118384389, + "grad_norm": 0.212890625, + "learning_rate": 0.00186123179125868, + "loss": 0.0918, + "step": 18017 + }, + { + "epoch": 0.15640489231864307, + "grad_norm": 0.12255859375, + "learning_rate": 0.001861215913186923, + "loss": 0.1177, + "step": 18018 + }, + { + "epoch": 0.15641357279884724, + "grad_norm": 0.734375, + "learning_rate": 0.0018612000342827118, + "loss": 0.1221, + "step": 18019 + }, + { + "epoch": 0.1564222532790514, + "grad_norm": 0.369140625, + "learning_rate": 0.0018611841545460635, + "loss": 0.1084, + "step": 18020 + }, + { + "epoch": 0.15643093375925557, + "grad_norm": 0.189453125, + "learning_rate": 0.0018611682739769959, + "loss": 0.1133, + "step": 18021 + }, + { + "epoch": 0.15643961423945973, + "grad_norm": 0.828125, + "learning_rate": 0.001861152392575526, + "loss": 0.1309, + "step": 18022 + }, + { + "epoch": 0.1564482947196639, + "grad_norm": 0.56640625, + "learning_rate": 0.0018611365103416713, + "loss": 0.1348, + "step": 18023 + }, + { + "epoch": 0.15645697519986806, + "grad_norm": 0.1357421875, + "learning_rate": 0.001861120627275449, + "loss": 0.0898, + "step": 18024 + }, + { + "epoch": 0.15646565568007223, + "grad_norm": 0.15625, + "learning_rate": 0.0018611047433768771, + "loss": 0.1172, + "step": 18025 + }, + { + "epoch": 0.1564743361602764, + "grad_norm": 0.11376953125, + "learning_rate": 0.001861088858645972, + "loss": 0.1094, + "step": 18026 + }, + { + "epoch": 0.15648301664048056, + "grad_norm": 0.6875, + "learning_rate": 0.0018610729730827516, + "loss": 0.0859, + "step": 18027 + }, + { + "epoch": 0.15649169712068473, + "grad_norm": 0.234375, + "learning_rate": 0.0018610570866872333, + "loss": 0.1016, + "step": 18028 + }, + { + "epoch": 0.1565003776008889, + "grad_norm": 0.248046875, + "learning_rate": 0.0018610411994594346, + "loss": 0.1016, + "step": 18029 + }, + { + "epoch": 0.15650905808109306, + "grad_norm": 0.1650390625, + "learning_rate": 0.0018610253113993723, + "loss": 0.0654, + "step": 18030 + }, + { + "epoch": 0.15651773856129722, + "grad_norm": 0.150390625, + "learning_rate": 0.0018610094225070643, + "loss": 0.1621, + "step": 18031 + }, + { + "epoch": 0.15652641904150139, + "grad_norm": 0.330078125, + "learning_rate": 0.001860993532782528, + "loss": 0.1064, + "step": 18032 + }, + { + "epoch": 0.15653509952170555, + "grad_norm": 0.62890625, + "learning_rate": 0.0018609776422257808, + "loss": 0.1465, + "step": 18033 + }, + { + "epoch": 0.15654378000190972, + "grad_norm": 0.365234375, + "learning_rate": 0.0018609617508368395, + "loss": 0.1035, + "step": 18034 + }, + { + "epoch": 0.15655246048211388, + "grad_norm": 0.9296875, + "learning_rate": 0.0018609458586157218, + "loss": 0.1084, + "step": 18035 + }, + { + 
"epoch": 0.15656114096231805, + "grad_norm": 0.2421875, + "learning_rate": 0.0018609299655624454, + "loss": 0.1465, + "step": 18036 + }, + { + "epoch": 0.1565698214425222, + "grad_norm": 1.703125, + "learning_rate": 0.0018609140716770271, + "loss": 0.1299, + "step": 18037 + }, + { + "epoch": 0.15657850192272638, + "grad_norm": 0.46484375, + "learning_rate": 0.001860898176959485, + "loss": 0.1543, + "step": 18038 + }, + { + "epoch": 0.15658718240293054, + "grad_norm": 0.12158203125, + "learning_rate": 0.0018608822814098359, + "loss": 0.0977, + "step": 18039 + }, + { + "epoch": 0.1565958628831347, + "grad_norm": 0.1064453125, + "learning_rate": 0.0018608663850280974, + "loss": 0.1113, + "step": 18040 + }, + { + "epoch": 0.15660454336333887, + "grad_norm": 0.1298828125, + "learning_rate": 0.0018608504878142868, + "loss": 0.0986, + "step": 18041 + }, + { + "epoch": 0.15661322384354304, + "grad_norm": 0.201171875, + "learning_rate": 0.0018608345897684218, + "loss": 0.1143, + "step": 18042 + }, + { + "epoch": 0.1566219043237472, + "grad_norm": 0.08056640625, + "learning_rate": 0.001860818690890519, + "loss": 0.0938, + "step": 18043 + }, + { + "epoch": 0.15663058480395137, + "grad_norm": 0.1474609375, + "learning_rate": 0.0018608027911805967, + "loss": 0.083, + "step": 18044 + }, + { + "epoch": 0.15663926528415553, + "grad_norm": 0.08740234375, + "learning_rate": 0.0018607868906386719, + "loss": 0.1001, + "step": 18045 + }, + { + "epoch": 0.1566479457643597, + "grad_norm": 0.5703125, + "learning_rate": 0.0018607709892647618, + "loss": 0.0977, + "step": 18046 + }, + { + "epoch": 0.15665662624456386, + "grad_norm": 0.220703125, + "learning_rate": 0.0018607550870588842, + "loss": 0.1289, + "step": 18047 + }, + { + "epoch": 0.15666530672476803, + "grad_norm": 0.30859375, + "learning_rate": 0.001860739184021056, + "loss": 0.0806, + "step": 18048 + }, + { + "epoch": 0.1566739872049722, + "grad_norm": 0.0927734375, + "learning_rate": 0.001860723280151295, + "loss": 0.1094, + "step": 18049 + }, + { + "epoch": 0.15668266768517636, + "grad_norm": 0.171875, + "learning_rate": 0.0018607073754496185, + "loss": 0.1025, + "step": 18050 + }, + { + "epoch": 0.1566913481653805, + "grad_norm": 0.328125, + "learning_rate": 0.0018606914699160438, + "loss": 0.1377, + "step": 18051 + }, + { + "epoch": 0.15670002864558466, + "grad_norm": 0.140625, + "learning_rate": 0.0018606755635505884, + "loss": 0.1006, + "step": 18052 + }, + { + "epoch": 0.15670870912578883, + "grad_norm": 0.61328125, + "learning_rate": 0.0018606596563532694, + "loss": 0.1152, + "step": 18053 + }, + { + "epoch": 0.156717389605993, + "grad_norm": 0.2255859375, + "learning_rate": 0.0018606437483241044, + "loss": 0.0947, + "step": 18054 + }, + { + "epoch": 0.15672607008619716, + "grad_norm": 0.6015625, + "learning_rate": 0.0018606278394631113, + "loss": 0.2012, + "step": 18055 + }, + { + "epoch": 0.15673475056640132, + "grad_norm": 0.09228515625, + "learning_rate": 0.0018606119297703066, + "loss": 0.0933, + "step": 18056 + }, + { + "epoch": 0.15674343104660549, + "grad_norm": 0.470703125, + "learning_rate": 0.0018605960192457083, + "loss": 0.1309, + "step": 18057 + }, + { + "epoch": 0.15675211152680965, + "grad_norm": 0.26171875, + "learning_rate": 0.0018605801078893335, + "loss": 0.1914, + "step": 18058 + }, + { + "epoch": 0.15676079200701382, + "grad_norm": 0.486328125, + "learning_rate": 0.0018605641957012, + "loss": 0.1504, + "step": 18059 + }, + { + "epoch": 0.15676947248721798, + "grad_norm": 0.671875, + "learning_rate": 0.0018605482826813244, + 
"loss": 0.125, + "step": 18060 + }, + { + "epoch": 0.15677815296742215, + "grad_norm": 0.28125, + "learning_rate": 0.001860532368829725, + "loss": 0.1064, + "step": 18061 + }, + { + "epoch": 0.1567868334476263, + "grad_norm": 0.1953125, + "learning_rate": 0.0018605164541464188, + "loss": 0.0811, + "step": 18062 + }, + { + "epoch": 0.15679551392783048, + "grad_norm": 0.578125, + "learning_rate": 0.0018605005386314233, + "loss": 0.126, + "step": 18063 + }, + { + "epoch": 0.15680419440803464, + "grad_norm": 0.1064453125, + "learning_rate": 0.0018604846222847558, + "loss": 0.084, + "step": 18064 + }, + { + "epoch": 0.1568128748882388, + "grad_norm": 0.451171875, + "learning_rate": 0.0018604687051064336, + "loss": 0.1035, + "step": 18065 + }, + { + "epoch": 0.15682155536844297, + "grad_norm": 0.2060546875, + "learning_rate": 0.0018604527870964742, + "loss": 0.0933, + "step": 18066 + }, + { + "epoch": 0.15683023584864714, + "grad_norm": 0.1904296875, + "learning_rate": 0.0018604368682548953, + "loss": 0.0854, + "step": 18067 + }, + { + "epoch": 0.1568389163288513, + "grad_norm": 0.103515625, + "learning_rate": 0.001860420948581714, + "loss": 0.1152, + "step": 18068 + }, + { + "epoch": 0.15684759680905547, + "grad_norm": 0.1240234375, + "learning_rate": 0.0018604050280769478, + "loss": 0.0952, + "step": 18069 + }, + { + "epoch": 0.15685627728925963, + "grad_norm": 0.609375, + "learning_rate": 0.001860389106740614, + "loss": 0.1289, + "step": 18070 + }, + { + "epoch": 0.1568649577694638, + "grad_norm": 0.130859375, + "learning_rate": 0.0018603731845727302, + "loss": 0.1279, + "step": 18071 + }, + { + "epoch": 0.15687363824966796, + "grad_norm": 0.1123046875, + "learning_rate": 0.0018603572615733136, + "loss": 0.1162, + "step": 18072 + }, + { + "epoch": 0.15688231872987213, + "grad_norm": 0.4921875, + "learning_rate": 0.0018603413377423818, + "loss": 0.1201, + "step": 18073 + }, + { + "epoch": 0.1568909992100763, + "grad_norm": 0.4609375, + "learning_rate": 0.001860325413079952, + "loss": 0.1211, + "step": 18074 + }, + { + "epoch": 0.15689967969028046, + "grad_norm": 0.8125, + "learning_rate": 0.0018603094875860417, + "loss": 0.1533, + "step": 18075 + }, + { + "epoch": 0.15690836017048462, + "grad_norm": 0.23046875, + "learning_rate": 0.0018602935612606689, + "loss": 0.1123, + "step": 18076 + }, + { + "epoch": 0.1569170406506888, + "grad_norm": 0.24609375, + "learning_rate": 0.00186027763410385, + "loss": 0.103, + "step": 18077 + }, + { + "epoch": 0.15692572113089295, + "grad_norm": 0.0869140625, + "learning_rate": 0.0018602617061156028, + "loss": 0.1973, + "step": 18078 + }, + { + "epoch": 0.15693440161109712, + "grad_norm": 0.1953125, + "learning_rate": 0.001860245777295945, + "loss": 0.1338, + "step": 18079 + }, + { + "epoch": 0.15694308209130128, + "grad_norm": 0.06689453125, + "learning_rate": 0.001860229847644894, + "loss": 0.1025, + "step": 18080 + }, + { + "epoch": 0.15695176257150545, + "grad_norm": 0.36328125, + "learning_rate": 0.0018602139171624667, + "loss": 0.1543, + "step": 18081 + }, + { + "epoch": 0.1569604430517096, + "grad_norm": 0.62109375, + "learning_rate": 0.0018601979858486812, + "loss": 0.1211, + "step": 18082 + }, + { + "epoch": 0.15696912353191378, + "grad_norm": 0.09716796875, + "learning_rate": 0.0018601820537035546, + "loss": 0.1069, + "step": 18083 + }, + { + "epoch": 0.15697780401211794, + "grad_norm": 0.2138671875, + "learning_rate": 0.0018601661207271044, + "loss": 0.1309, + "step": 18084 + }, + { + "epoch": 0.1569864844923221, + "grad_norm": 0.90625, + 
"learning_rate": 0.0018601501869193475, + "loss": 0.1001, + "step": 18085 + }, + { + "epoch": 0.15699516497252627, + "grad_norm": 0.388671875, + "learning_rate": 0.001860134252280302, + "loss": 0.1299, + "step": 18086 + }, + { + "epoch": 0.15700384545273044, + "grad_norm": 0.345703125, + "learning_rate": 0.0018601183168099852, + "loss": 0.1211, + "step": 18087 + }, + { + "epoch": 0.1570125259329346, + "grad_norm": 0.64453125, + "learning_rate": 0.0018601023805084143, + "loss": 0.1211, + "step": 18088 + }, + { + "epoch": 0.15702120641313877, + "grad_norm": 0.259765625, + "learning_rate": 0.001860086443375607, + "loss": 0.1455, + "step": 18089 + }, + { + "epoch": 0.15702988689334293, + "grad_norm": 0.1484375, + "learning_rate": 0.0018600705054115808, + "loss": 0.1152, + "step": 18090 + }, + { + "epoch": 0.1570385673735471, + "grad_norm": 0.68359375, + "learning_rate": 0.0018600545666163527, + "loss": 0.1328, + "step": 18091 + }, + { + "epoch": 0.15704724785375127, + "grad_norm": 0.275390625, + "learning_rate": 0.00186003862698994, + "loss": 0.1045, + "step": 18092 + }, + { + "epoch": 0.15705592833395543, + "grad_norm": 0.228515625, + "learning_rate": 0.001860022686532361, + "loss": 0.0781, + "step": 18093 + }, + { + "epoch": 0.1570646088141596, + "grad_norm": 0.1884765625, + "learning_rate": 0.0018600067452436324, + "loss": 0.1104, + "step": 18094 + }, + { + "epoch": 0.15707328929436376, + "grad_norm": 0.12890625, + "learning_rate": 0.001859990803123772, + "loss": 0.0664, + "step": 18095 + }, + { + "epoch": 0.15708196977456793, + "grad_norm": 0.126953125, + "learning_rate": 0.0018599748601727966, + "loss": 0.1143, + "step": 18096 + }, + { + "epoch": 0.1570906502547721, + "grad_norm": 0.267578125, + "learning_rate": 0.0018599589163907245, + "loss": 0.1162, + "step": 18097 + }, + { + "epoch": 0.15709933073497626, + "grad_norm": 0.435546875, + "learning_rate": 0.0018599429717775728, + "loss": 0.1309, + "step": 18098 + }, + { + "epoch": 0.15710801121518042, + "grad_norm": 0.1162109375, + "learning_rate": 0.0018599270263333588, + "loss": 0.0889, + "step": 18099 + }, + { + "epoch": 0.15711669169538459, + "grad_norm": 1.328125, + "learning_rate": 0.0018599110800580996, + "loss": 0.126, + "step": 18100 + }, + { + "epoch": 0.15712537217558875, + "grad_norm": 0.43359375, + "learning_rate": 0.0018598951329518137, + "loss": 0.1377, + "step": 18101 + }, + { + "epoch": 0.15713405265579292, + "grad_norm": 2.625, + "learning_rate": 0.0018598791850145174, + "loss": 0.1211, + "step": 18102 + }, + { + "epoch": 0.15714273313599708, + "grad_norm": 0.55078125, + "learning_rate": 0.0018598632362462292, + "loss": 0.127, + "step": 18103 + }, + { + "epoch": 0.15715141361620125, + "grad_norm": 0.498046875, + "learning_rate": 0.0018598472866469655, + "loss": 0.0918, + "step": 18104 + }, + { + "epoch": 0.1571600940964054, + "grad_norm": 0.318359375, + "learning_rate": 0.0018598313362167443, + "loss": 0.1055, + "step": 18105 + }, + { + "epoch": 0.15716877457660958, + "grad_norm": 0.146484375, + "learning_rate": 0.0018598153849555832, + "loss": 0.082, + "step": 18106 + }, + { + "epoch": 0.15717745505681374, + "grad_norm": 0.21484375, + "learning_rate": 0.001859799432863499, + "loss": 0.1025, + "step": 18107 + }, + { + "epoch": 0.1571861355370179, + "grad_norm": 0.2041015625, + "learning_rate": 0.00185978347994051, + "loss": 0.1074, + "step": 18108 + }, + { + "epoch": 0.15719481601722207, + "grad_norm": 0.390625, + "learning_rate": 0.001859767526186633, + "loss": 0.1504, + "step": 18109 + }, + { + "epoch": 
0.15720349649742624, + "grad_norm": 0.1103515625, + "learning_rate": 0.0018597515716018856, + "loss": 0.1338, + "step": 18110 + }, + { + "epoch": 0.1572121769776304, + "grad_norm": 0.10595703125, + "learning_rate": 0.0018597356161862853, + "loss": 0.1338, + "step": 18111 + }, + { + "epoch": 0.15722085745783457, + "grad_norm": 0.275390625, + "learning_rate": 0.0018597196599398499, + "loss": 0.0898, + "step": 18112 + }, + { + "epoch": 0.15722953793803873, + "grad_norm": 0.546875, + "learning_rate": 0.001859703702862596, + "loss": 0.1367, + "step": 18113 + }, + { + "epoch": 0.1572382184182429, + "grad_norm": 0.50390625, + "learning_rate": 0.0018596877449545417, + "loss": 0.1045, + "step": 18114 + }, + { + "epoch": 0.15724689889844706, + "grad_norm": 0.07666015625, + "learning_rate": 0.0018596717862157043, + "loss": 0.1104, + "step": 18115 + }, + { + "epoch": 0.15725557937865123, + "grad_norm": 0.287109375, + "learning_rate": 0.0018596558266461014, + "loss": 0.0923, + "step": 18116 + }, + { + "epoch": 0.1572642598588554, + "grad_norm": 0.296875, + "learning_rate": 0.00185963986624575, + "loss": 0.1465, + "step": 18117 + }, + { + "epoch": 0.15727294033905956, + "grad_norm": 1.046875, + "learning_rate": 0.0018596239050146682, + "loss": 0.1709, + "step": 18118 + }, + { + "epoch": 0.15728162081926372, + "grad_norm": 0.478515625, + "learning_rate": 0.001859607942952873, + "loss": 0.1172, + "step": 18119 + }, + { + "epoch": 0.1572903012994679, + "grad_norm": 0.578125, + "learning_rate": 0.001859591980060382, + "loss": 0.1182, + "step": 18120 + }, + { + "epoch": 0.15729898177967205, + "grad_norm": 0.498046875, + "learning_rate": 0.0018595760163372126, + "loss": 0.0869, + "step": 18121 + }, + { + "epoch": 0.15730766225987622, + "grad_norm": 0.25, + "learning_rate": 0.001859560051783382, + "loss": 0.1113, + "step": 18122 + }, + { + "epoch": 0.15731634274008038, + "grad_norm": 0.1611328125, + "learning_rate": 0.0018595440863989083, + "loss": 0.0728, + "step": 18123 + }, + { + "epoch": 0.15732502322028455, + "grad_norm": 0.83203125, + "learning_rate": 0.0018595281201838083, + "loss": 0.1191, + "step": 18124 + }, + { + "epoch": 0.15733370370048871, + "grad_norm": 0.1806640625, + "learning_rate": 0.0018595121531380999, + "loss": 0.1147, + "step": 18125 + }, + { + "epoch": 0.15734238418069288, + "grad_norm": 0.203125, + "learning_rate": 0.0018594961852618005, + "loss": 0.0923, + "step": 18126 + }, + { + "epoch": 0.15735106466089704, + "grad_norm": 0.08154296875, + "learning_rate": 0.0018594802165549276, + "loss": 0.1089, + "step": 18127 + }, + { + "epoch": 0.1573597451411012, + "grad_norm": 0.0869140625, + "learning_rate": 0.0018594642470174984, + "loss": 0.1167, + "step": 18128 + }, + { + "epoch": 0.15736842562130537, + "grad_norm": 0.087890625, + "learning_rate": 0.0018594482766495304, + "loss": 0.1069, + "step": 18129 + }, + { + "epoch": 0.15737710610150954, + "grad_norm": 0.07080078125, + "learning_rate": 0.0018594323054510415, + "loss": 0.0981, + "step": 18130 + }, + { + "epoch": 0.1573857865817137, + "grad_norm": 0.451171875, + "learning_rate": 0.0018594163334220482, + "loss": 0.1074, + "step": 18131 + }, + { + "epoch": 0.15739446706191787, + "grad_norm": 0.07275390625, + "learning_rate": 0.0018594003605625692, + "loss": 0.1016, + "step": 18132 + }, + { + "epoch": 0.15740314754212203, + "grad_norm": 0.55078125, + "learning_rate": 0.0018593843868726211, + "loss": 0.1123, + "step": 18133 + }, + { + "epoch": 0.1574118280223262, + "grad_norm": 0.11279296875, + "learning_rate": 0.0018593684123522217, + 
"loss": 0.1279, + "step": 18134 + }, + { + "epoch": 0.15742050850253037, + "grad_norm": 0.1337890625, + "learning_rate": 0.0018593524370013885, + "loss": 0.0991, + "step": 18135 + }, + { + "epoch": 0.15742918898273453, + "grad_norm": 0.4609375, + "learning_rate": 0.001859336460820139, + "loss": 0.1396, + "step": 18136 + }, + { + "epoch": 0.1574378694629387, + "grad_norm": 0.453125, + "learning_rate": 0.00185932048380849, + "loss": 0.1357, + "step": 18137 + }, + { + "epoch": 0.15744654994314286, + "grad_norm": 0.1572265625, + "learning_rate": 0.0018593045059664603, + "loss": 0.1206, + "step": 18138 + }, + { + "epoch": 0.15745523042334703, + "grad_norm": 0.388671875, + "learning_rate": 0.001859288527294066, + "loss": 0.1123, + "step": 18139 + }, + { + "epoch": 0.1574639109035512, + "grad_norm": 0.06103515625, + "learning_rate": 0.0018592725477913255, + "loss": 0.0967, + "step": 18140 + }, + { + "epoch": 0.15747259138375536, + "grad_norm": 0.10009765625, + "learning_rate": 0.0018592565674582558, + "loss": 0.104, + "step": 18141 + }, + { + "epoch": 0.15748127186395952, + "grad_norm": 0.1650390625, + "learning_rate": 0.0018592405862948747, + "loss": 0.0684, + "step": 18142 + }, + { + "epoch": 0.1574899523441637, + "grad_norm": 0.490234375, + "learning_rate": 0.0018592246043011994, + "loss": 0.1216, + "step": 18143 + }, + { + "epoch": 0.15749863282436785, + "grad_norm": 0.1416015625, + "learning_rate": 0.0018592086214772475, + "loss": 0.123, + "step": 18144 + }, + { + "epoch": 0.15750731330457202, + "grad_norm": 0.384765625, + "learning_rate": 0.0018591926378230359, + "loss": 0.083, + "step": 18145 + }, + { + "epoch": 0.15751599378477618, + "grad_norm": 0.12109375, + "learning_rate": 0.0018591766533385837, + "loss": 0.0933, + "step": 18146 + }, + { + "epoch": 0.15752467426498035, + "grad_norm": 0.6875, + "learning_rate": 0.0018591606680239065, + "loss": 0.123, + "step": 18147 + }, + { + "epoch": 0.1575333547451845, + "grad_norm": 0.1455078125, + "learning_rate": 0.001859144681879023, + "loss": 0.0991, + "step": 18148 + }, + { + "epoch": 0.15754203522538868, + "grad_norm": 0.1572265625, + "learning_rate": 0.0018591286949039502, + "loss": 0.1021, + "step": 18149 + }, + { + "epoch": 0.15755071570559284, + "grad_norm": 0.65625, + "learning_rate": 0.0018591127070987055, + "loss": 0.1182, + "step": 18150 + }, + { + "epoch": 0.157559396185797, + "grad_norm": 0.33203125, + "learning_rate": 0.0018590967184633068, + "loss": 0.1777, + "step": 18151 + }, + { + "epoch": 0.15756807666600117, + "grad_norm": 0.28515625, + "learning_rate": 0.0018590807289977712, + "loss": 0.1133, + "step": 18152 + }, + { + "epoch": 0.15757675714620534, + "grad_norm": 0.72265625, + "learning_rate": 0.001859064738702116, + "loss": 0.1309, + "step": 18153 + }, + { + "epoch": 0.1575854376264095, + "grad_norm": 0.1640625, + "learning_rate": 0.0018590487475763596, + "loss": 0.1289, + "step": 18154 + }, + { + "epoch": 0.15759411810661367, + "grad_norm": 0.177734375, + "learning_rate": 0.0018590327556205185, + "loss": 0.1025, + "step": 18155 + }, + { + "epoch": 0.15760279858681783, + "grad_norm": 0.169921875, + "learning_rate": 0.0018590167628346106, + "loss": 0.1025, + "step": 18156 + }, + { + "epoch": 0.157611479067022, + "grad_norm": 0.1787109375, + "learning_rate": 0.0018590007692186534, + "loss": 0.125, + "step": 18157 + }, + { + "epoch": 0.15762015954722616, + "grad_norm": 0.3046875, + "learning_rate": 0.0018589847747726646, + "loss": 0.1133, + "step": 18158 + }, + { + "epoch": 0.15762884002743033, + "grad_norm": 0.12890625, + 
"learning_rate": 0.0018589687794966611, + "loss": 0.1025, + "step": 18159 + }, + { + "epoch": 0.1576375205076345, + "grad_norm": 0.224609375, + "learning_rate": 0.001858952783390661, + "loss": 0.1562, + "step": 18160 + }, + { + "epoch": 0.15764620098783866, + "grad_norm": 0.15234375, + "learning_rate": 0.0018589367864546816, + "loss": 0.1396, + "step": 18161 + }, + { + "epoch": 0.15765488146804282, + "grad_norm": 0.162109375, + "learning_rate": 0.0018589207886887402, + "loss": 0.0757, + "step": 18162 + }, + { + "epoch": 0.157663561948247, + "grad_norm": 0.2138671875, + "learning_rate": 0.0018589047900928544, + "loss": 0.1279, + "step": 18163 + }, + { + "epoch": 0.15767224242845115, + "grad_norm": 0.58203125, + "learning_rate": 0.0018588887906670418, + "loss": 0.1074, + "step": 18164 + }, + { + "epoch": 0.15768092290865532, + "grad_norm": 0.42578125, + "learning_rate": 0.0018588727904113196, + "loss": 0.1133, + "step": 18165 + }, + { + "epoch": 0.15768960338885948, + "grad_norm": 0.109375, + "learning_rate": 0.0018588567893257058, + "loss": 0.1143, + "step": 18166 + }, + { + "epoch": 0.15769828386906365, + "grad_norm": 0.287109375, + "learning_rate": 0.0018588407874102176, + "loss": 0.1328, + "step": 18167 + }, + { + "epoch": 0.15770696434926781, + "grad_norm": 0.275390625, + "learning_rate": 0.0018588247846648722, + "loss": 0.0977, + "step": 18168 + }, + { + "epoch": 0.15771564482947198, + "grad_norm": 0.423828125, + "learning_rate": 0.0018588087810896877, + "loss": 0.105, + "step": 18169 + }, + { + "epoch": 0.15772432530967614, + "grad_norm": 0.435546875, + "learning_rate": 0.0018587927766846814, + "loss": 0.1104, + "step": 18170 + }, + { + "epoch": 0.1577330057898803, + "grad_norm": 0.31640625, + "learning_rate": 0.0018587767714498705, + "loss": 0.1426, + "step": 18171 + }, + { + "epoch": 0.15774168627008447, + "grad_norm": 0.87890625, + "learning_rate": 0.0018587607653852726, + "loss": 0.1133, + "step": 18172 + }, + { + "epoch": 0.15775036675028864, + "grad_norm": 0.484375, + "learning_rate": 0.0018587447584909056, + "loss": 0.1064, + "step": 18173 + }, + { + "epoch": 0.15775904723049278, + "grad_norm": 0.44921875, + "learning_rate": 0.0018587287507667867, + "loss": 0.1191, + "step": 18174 + }, + { + "epoch": 0.15776772771069694, + "grad_norm": 0.39453125, + "learning_rate": 0.0018587127422129335, + "loss": 0.0747, + "step": 18175 + }, + { + "epoch": 0.1577764081909011, + "grad_norm": 0.41796875, + "learning_rate": 0.0018586967328293633, + "loss": 0.0942, + "step": 18176 + }, + { + "epoch": 0.15778508867110527, + "grad_norm": 0.27734375, + "learning_rate": 0.0018586807226160938, + "loss": 0.0952, + "step": 18177 + }, + { + "epoch": 0.15779376915130944, + "grad_norm": 0.1708984375, + "learning_rate": 0.0018586647115731425, + "loss": 0.0957, + "step": 18178 + }, + { + "epoch": 0.1578024496315136, + "grad_norm": 0.4296875, + "learning_rate": 0.0018586486997005268, + "loss": 0.1973, + "step": 18179 + }, + { + "epoch": 0.15781113011171777, + "grad_norm": 0.546875, + "learning_rate": 0.001858632686998264, + "loss": 0.0977, + "step": 18180 + }, + { + "epoch": 0.15781981059192193, + "grad_norm": 0.390625, + "learning_rate": 0.0018586166734663725, + "loss": 0.1118, + "step": 18181 + }, + { + "epoch": 0.1578284910721261, + "grad_norm": 0.75, + "learning_rate": 0.0018586006591048687, + "loss": 0.1445, + "step": 18182 + }, + { + "epoch": 0.15783717155233026, + "grad_norm": 0.1318359375, + "learning_rate": 0.0018585846439137712, + "loss": 0.1406, + "step": 18183 + }, + { + "epoch": 
0.15784585203253443, + "grad_norm": 0.11279296875, + "learning_rate": 0.0018585686278930961, + "loss": 0.1152, + "step": 18184 + }, + { + "epoch": 0.1578545325127386, + "grad_norm": 0.5234375, + "learning_rate": 0.0018585526110428627, + "loss": 0.1309, + "step": 18185 + }, + { + "epoch": 0.15786321299294276, + "grad_norm": 0.162109375, + "learning_rate": 0.0018585365933630868, + "loss": 0.1777, + "step": 18186 + }, + { + "epoch": 0.15787189347314692, + "grad_norm": 0.375, + "learning_rate": 0.001858520574853787, + "loss": 0.0996, + "step": 18187 + }, + { + "epoch": 0.1578805739533511, + "grad_norm": 0.87890625, + "learning_rate": 0.0018585045555149803, + "loss": 0.0859, + "step": 18188 + }, + { + "epoch": 0.15788925443355525, + "grad_norm": 0.388671875, + "learning_rate": 0.0018584885353466847, + "loss": 0.0986, + "step": 18189 + }, + { + "epoch": 0.15789793491375942, + "grad_norm": 0.134765625, + "learning_rate": 0.0018584725143489173, + "loss": 0.1396, + "step": 18190 + }, + { + "epoch": 0.15790661539396358, + "grad_norm": 0.123046875, + "learning_rate": 0.0018584564925216958, + "loss": 0.1406, + "step": 18191 + }, + { + "epoch": 0.15791529587416775, + "grad_norm": 0.384765625, + "learning_rate": 0.0018584404698650378, + "loss": 0.1436, + "step": 18192 + }, + { + "epoch": 0.15792397635437191, + "grad_norm": 0.11328125, + "learning_rate": 0.0018584244463789603, + "loss": 0.1348, + "step": 18193 + }, + { + "epoch": 0.15793265683457608, + "grad_norm": 0.150390625, + "learning_rate": 0.0018584084220634814, + "loss": 0.1504, + "step": 18194 + }, + { + "epoch": 0.15794133731478024, + "grad_norm": 0.51171875, + "learning_rate": 0.0018583923969186184, + "loss": 0.1133, + "step": 18195 + }, + { + "epoch": 0.1579500177949844, + "grad_norm": 0.34765625, + "learning_rate": 0.001858376370944389, + "loss": 0.1123, + "step": 18196 + }, + { + "epoch": 0.15795869827518857, + "grad_norm": 0.36328125, + "learning_rate": 0.0018583603441408106, + "loss": 0.0942, + "step": 18197 + }, + { + "epoch": 0.15796737875539274, + "grad_norm": 0.65625, + "learning_rate": 0.0018583443165079009, + "loss": 0.0913, + "step": 18198 + }, + { + "epoch": 0.1579760592355969, + "grad_norm": 1.2578125, + "learning_rate": 0.001858328288045677, + "loss": 0.1172, + "step": 18199 + }, + { + "epoch": 0.15798473971580107, + "grad_norm": 0.2041015625, + "learning_rate": 0.0018583122587541563, + "loss": 0.1328, + "step": 18200 + }, + { + "epoch": 0.15799342019600524, + "grad_norm": 0.146484375, + "learning_rate": 0.0018582962286333572, + "loss": 0.1177, + "step": 18201 + }, + { + "epoch": 0.1580021006762094, + "grad_norm": 0.1044921875, + "learning_rate": 0.0018582801976832965, + "loss": 0.1016, + "step": 18202 + }, + { + "epoch": 0.15801078115641357, + "grad_norm": 0.13671875, + "learning_rate": 0.0018582641659039925, + "loss": 0.1309, + "step": 18203 + }, + { + "epoch": 0.15801946163661773, + "grad_norm": 0.1953125, + "learning_rate": 0.0018582481332954618, + "loss": 0.1108, + "step": 18204 + }, + { + "epoch": 0.1580281421168219, + "grad_norm": 0.09716796875, + "learning_rate": 0.0018582320998577222, + "loss": 0.0854, + "step": 18205 + }, + { + "epoch": 0.15803682259702606, + "grad_norm": 0.6015625, + "learning_rate": 0.0018582160655907917, + "loss": 0.1299, + "step": 18206 + }, + { + "epoch": 0.15804550307723023, + "grad_norm": 0.087890625, + "learning_rate": 0.001858200030494687, + "loss": 0.1289, + "step": 18207 + }, + { + "epoch": 0.1580541835574344, + "grad_norm": 0.310546875, + "learning_rate": 0.0018581839945694266, + "loss": 
0.1074, + "step": 18208 + }, + { + "epoch": 0.15806286403763856, + "grad_norm": 0.298828125, + "learning_rate": 0.0018581679578150274, + "loss": 0.1475, + "step": 18209 + }, + { + "epoch": 0.15807154451784272, + "grad_norm": 0.63671875, + "learning_rate": 0.0018581519202315068, + "loss": 0.1035, + "step": 18210 + }, + { + "epoch": 0.1580802249980469, + "grad_norm": 0.2109375, + "learning_rate": 0.0018581358818188834, + "loss": 0.1494, + "step": 18211 + }, + { + "epoch": 0.15808890547825105, + "grad_norm": 0.60546875, + "learning_rate": 0.0018581198425771735, + "loss": 0.1172, + "step": 18212 + }, + { + "epoch": 0.15809758595845522, + "grad_norm": 0.10400390625, + "learning_rate": 0.0018581038025063952, + "loss": 0.1387, + "step": 18213 + }, + { + "epoch": 0.15810626643865938, + "grad_norm": 0.27734375, + "learning_rate": 0.0018580877616065658, + "loss": 0.0845, + "step": 18214 + }, + { + "epoch": 0.15811494691886355, + "grad_norm": 1.2109375, + "learning_rate": 0.0018580717198777032, + "loss": 0.1523, + "step": 18215 + }, + { + "epoch": 0.1581236273990677, + "grad_norm": 0.53515625, + "learning_rate": 0.001858055677319825, + "loss": 0.0723, + "step": 18216 + }, + { + "epoch": 0.15813230787927188, + "grad_norm": 0.212890625, + "learning_rate": 0.0018580396339329482, + "loss": 0.1128, + "step": 18217 + }, + { + "epoch": 0.15814098835947604, + "grad_norm": 0.66796875, + "learning_rate": 0.0018580235897170908, + "loss": 0.085, + "step": 18218 + }, + { + "epoch": 0.1581496688396802, + "grad_norm": 0.7265625, + "learning_rate": 0.00185800754467227, + "loss": 0.1504, + "step": 18219 + }, + { + "epoch": 0.15815834931988437, + "grad_norm": 0.3359375, + "learning_rate": 0.0018579914987985036, + "loss": 0.1235, + "step": 18220 + }, + { + "epoch": 0.15816702980008854, + "grad_norm": 0.2890625, + "learning_rate": 0.0018579754520958091, + "loss": 0.1045, + "step": 18221 + }, + { + "epoch": 0.1581757102802927, + "grad_norm": 0.263671875, + "learning_rate": 0.0018579594045642041, + "loss": 0.1289, + "step": 18222 + }, + { + "epoch": 0.15818439076049687, + "grad_norm": 0.1298828125, + "learning_rate": 0.001857943356203706, + "loss": 0.1309, + "step": 18223 + }, + { + "epoch": 0.15819307124070103, + "grad_norm": 0.1103515625, + "learning_rate": 0.0018579273070143325, + "loss": 0.0845, + "step": 18224 + }, + { + "epoch": 0.1582017517209052, + "grad_norm": 0.30078125, + "learning_rate": 0.0018579112569961011, + "loss": 0.1084, + "step": 18225 + }, + { + "epoch": 0.15821043220110936, + "grad_norm": 0.8125, + "learning_rate": 0.0018578952061490293, + "loss": 0.1211, + "step": 18226 + }, + { + "epoch": 0.15821911268131353, + "grad_norm": 0.2001953125, + "learning_rate": 0.0018578791544731346, + "loss": 0.0957, + "step": 18227 + }, + { + "epoch": 0.1582277931615177, + "grad_norm": 0.78515625, + "learning_rate": 0.001857863101968435, + "loss": 0.1094, + "step": 18228 + }, + { + "epoch": 0.15823647364172186, + "grad_norm": 0.1611328125, + "learning_rate": 0.0018578470486349473, + "loss": 0.0796, + "step": 18229 + }, + { + "epoch": 0.15824515412192602, + "grad_norm": 0.2490234375, + "learning_rate": 0.0018578309944726895, + "loss": 0.104, + "step": 18230 + }, + { + "epoch": 0.1582538346021302, + "grad_norm": 0.275390625, + "learning_rate": 0.001857814939481679, + "loss": 0.084, + "step": 18231 + }, + { + "epoch": 0.15826251508233435, + "grad_norm": 0.447265625, + "learning_rate": 0.0018577988836619337, + "loss": 0.1328, + "step": 18232 + }, + { + "epoch": 0.15827119556253852, + "grad_norm": 0.5703125, + 
"learning_rate": 0.0018577828270134708, + "loss": 0.0801, + "step": 18233 + }, + { + "epoch": 0.15827987604274268, + "grad_norm": 0.56640625, + "learning_rate": 0.001857766769536308, + "loss": 0.1172, + "step": 18234 + }, + { + "epoch": 0.15828855652294685, + "grad_norm": 0.1640625, + "learning_rate": 0.001857750711230463, + "loss": 0.1621, + "step": 18235 + }, + { + "epoch": 0.15829723700315101, + "grad_norm": 0.11669921875, + "learning_rate": 0.0018577346520959532, + "loss": 0.1094, + "step": 18236 + }, + { + "epoch": 0.15830591748335518, + "grad_norm": 0.134765625, + "learning_rate": 0.0018577185921327963, + "loss": 0.105, + "step": 18237 + }, + { + "epoch": 0.15831459796355934, + "grad_norm": 0.10400390625, + "learning_rate": 0.0018577025313410095, + "loss": 0.1172, + "step": 18238 + }, + { + "epoch": 0.1583232784437635, + "grad_norm": 0.265625, + "learning_rate": 0.0018576864697206106, + "loss": 0.1182, + "step": 18239 + }, + { + "epoch": 0.15833195892396768, + "grad_norm": 0.453125, + "learning_rate": 0.0018576704072716173, + "loss": 0.1426, + "step": 18240 + }, + { + "epoch": 0.15834063940417184, + "grad_norm": 0.490234375, + "learning_rate": 0.0018576543439940468, + "loss": 0.127, + "step": 18241 + }, + { + "epoch": 0.158349319884376, + "grad_norm": 0.240234375, + "learning_rate": 0.001857638279887917, + "loss": 0.0933, + "step": 18242 + }, + { + "epoch": 0.15835800036458017, + "grad_norm": 1.4296875, + "learning_rate": 0.0018576222149532453, + "loss": 0.1289, + "step": 18243 + }, + { + "epoch": 0.15836668084478434, + "grad_norm": 0.69921875, + "learning_rate": 0.0018576061491900496, + "loss": 0.1025, + "step": 18244 + }, + { + "epoch": 0.1583753613249885, + "grad_norm": 0.1591796875, + "learning_rate": 0.0018575900825983469, + "loss": 0.1211, + "step": 18245 + }, + { + "epoch": 0.15838404180519267, + "grad_norm": 0.11328125, + "learning_rate": 0.001857574015178155, + "loss": 0.0874, + "step": 18246 + }, + { + "epoch": 0.15839272228539683, + "grad_norm": 0.287109375, + "learning_rate": 0.0018575579469294915, + "loss": 0.1152, + "step": 18247 + }, + { + "epoch": 0.158401402765601, + "grad_norm": 0.30859375, + "learning_rate": 0.0018575418778523744, + "loss": 0.1221, + "step": 18248 + }, + { + "epoch": 0.15841008324580516, + "grad_norm": 0.1318359375, + "learning_rate": 0.0018575258079468206, + "loss": 0.1309, + "step": 18249 + }, + { + "epoch": 0.15841876372600933, + "grad_norm": 0.66796875, + "learning_rate": 0.0018575097372128483, + "loss": 0.1582, + "step": 18250 + }, + { + "epoch": 0.1584274442062135, + "grad_norm": 0.78515625, + "learning_rate": 0.0018574936656504744, + "loss": 0.1592, + "step": 18251 + }, + { + "epoch": 0.15843612468641766, + "grad_norm": 0.439453125, + "learning_rate": 0.001857477593259717, + "loss": 0.1221, + "step": 18252 + }, + { + "epoch": 0.15844480516662182, + "grad_norm": 0.232421875, + "learning_rate": 0.0018574615200405934, + "loss": 0.083, + "step": 18253 + }, + { + "epoch": 0.158453485646826, + "grad_norm": 0.390625, + "learning_rate": 0.001857445445993121, + "loss": 0.1328, + "step": 18254 + }, + { + "epoch": 0.15846216612703015, + "grad_norm": 0.359375, + "learning_rate": 0.001857429371117318, + "loss": 0.1084, + "step": 18255 + }, + { + "epoch": 0.15847084660723432, + "grad_norm": 0.267578125, + "learning_rate": 0.0018574132954132015, + "loss": 0.1211, + "step": 18256 + }, + { + "epoch": 0.15847952708743848, + "grad_norm": 0.10498046875, + "learning_rate": 0.0018573972188807891, + "loss": 0.1016, + "step": 18257 + }, + { + "epoch": 
0.15848820756764265, + "grad_norm": 0.353515625, + "learning_rate": 0.0018573811415200986, + "loss": 0.1074, + "step": 18258 + }, + { + "epoch": 0.1584968880478468, + "grad_norm": 0.341796875, + "learning_rate": 0.0018573650633311475, + "loss": 0.1177, + "step": 18259 + }, + { + "epoch": 0.15850556852805098, + "grad_norm": 0.185546875, + "learning_rate": 0.0018573489843139532, + "loss": 0.0889, + "step": 18260 + }, + { + "epoch": 0.15851424900825514, + "grad_norm": 0.20703125, + "learning_rate": 0.0018573329044685334, + "loss": 0.127, + "step": 18261 + }, + { + "epoch": 0.1585229294884593, + "grad_norm": 0.1630859375, + "learning_rate": 0.0018573168237949061, + "loss": 0.1025, + "step": 18262 + }, + { + "epoch": 0.15853160996866347, + "grad_norm": 0.27734375, + "learning_rate": 0.0018573007422930884, + "loss": 0.1001, + "step": 18263 + }, + { + "epoch": 0.15854029044886764, + "grad_norm": 0.28125, + "learning_rate": 0.0018572846599630977, + "loss": 0.0796, + "step": 18264 + }, + { + "epoch": 0.1585489709290718, + "grad_norm": 0.10693359375, + "learning_rate": 0.001857268576804952, + "loss": 0.1152, + "step": 18265 + }, + { + "epoch": 0.15855765140927597, + "grad_norm": 0.4140625, + "learning_rate": 0.0018572524928186687, + "loss": 0.1074, + "step": 18266 + }, + { + "epoch": 0.15856633188948013, + "grad_norm": 0.2451171875, + "learning_rate": 0.0018572364080042655, + "loss": 0.1118, + "step": 18267 + }, + { + "epoch": 0.1585750123696843, + "grad_norm": 0.45703125, + "learning_rate": 0.00185722032236176, + "loss": 0.1123, + "step": 18268 + }, + { + "epoch": 0.15858369284988846, + "grad_norm": 0.10546875, + "learning_rate": 0.0018572042358911695, + "loss": 0.1113, + "step": 18269 + }, + { + "epoch": 0.15859237333009263, + "grad_norm": 0.2412109375, + "learning_rate": 0.0018571881485925123, + "loss": 0.0669, + "step": 18270 + }, + { + "epoch": 0.1586010538102968, + "grad_norm": 0.73046875, + "learning_rate": 0.0018571720604658051, + "loss": 0.1084, + "step": 18271 + }, + { + "epoch": 0.15860973429050096, + "grad_norm": 0.390625, + "learning_rate": 0.001857155971511066, + "loss": 0.084, + "step": 18272 + }, + { + "epoch": 0.15861841477070512, + "grad_norm": 0.7890625, + "learning_rate": 0.0018571398817283126, + "loss": 0.1582, + "step": 18273 + }, + { + "epoch": 0.1586270952509093, + "grad_norm": 0.2001953125, + "learning_rate": 0.0018571237911175623, + "loss": 0.1084, + "step": 18274 + }, + { + "epoch": 0.15863577573111345, + "grad_norm": 0.443359375, + "learning_rate": 0.0018571076996788327, + "loss": 0.1123, + "step": 18275 + }, + { + "epoch": 0.15864445621131762, + "grad_norm": 0.1513671875, + "learning_rate": 0.0018570916074121416, + "loss": 0.1211, + "step": 18276 + }, + { + "epoch": 0.15865313669152178, + "grad_norm": 1.0546875, + "learning_rate": 0.0018570755143175067, + "loss": 0.1133, + "step": 18277 + }, + { + "epoch": 0.15866181717172595, + "grad_norm": 0.40625, + "learning_rate": 0.0018570594203949453, + "loss": 0.1621, + "step": 18278 + }, + { + "epoch": 0.15867049765193011, + "grad_norm": 0.216796875, + "learning_rate": 0.001857043325644475, + "loss": 0.0918, + "step": 18279 + }, + { + "epoch": 0.15867917813213428, + "grad_norm": 0.37890625, + "learning_rate": 0.0018570272300661133, + "loss": 0.1855, + "step": 18280 + }, + { + "epoch": 0.15868785861233844, + "grad_norm": 0.53515625, + "learning_rate": 0.0018570111336598783, + "loss": 0.1289, + "step": 18281 + }, + { + "epoch": 0.1586965390925426, + "grad_norm": 0.12890625, + "learning_rate": 0.0018569950364257873, + "loss": 
0.1201, + "step": 18282 + }, + { + "epoch": 0.15870521957274678, + "grad_norm": 0.4765625, + "learning_rate": 0.0018569789383638578, + "loss": 0.1377, + "step": 18283 + }, + { + "epoch": 0.15871390005295094, + "grad_norm": 0.1826171875, + "learning_rate": 0.0018569628394741074, + "loss": 0.0879, + "step": 18284 + }, + { + "epoch": 0.1587225805331551, + "grad_norm": 0.60546875, + "learning_rate": 0.0018569467397565541, + "loss": 0.1025, + "step": 18285 + }, + { + "epoch": 0.15873126101335927, + "grad_norm": 0.8828125, + "learning_rate": 0.001856930639211215, + "loss": 0.1523, + "step": 18286 + }, + { + "epoch": 0.15873994149356344, + "grad_norm": 0.1201171875, + "learning_rate": 0.0018569145378381081, + "loss": 0.0977, + "step": 18287 + }, + { + "epoch": 0.1587486219737676, + "grad_norm": 0.2060546875, + "learning_rate": 0.0018568984356372505, + "loss": 0.1211, + "step": 18288 + }, + { + "epoch": 0.15875730245397177, + "grad_norm": 0.349609375, + "learning_rate": 0.0018568823326086603, + "loss": 0.1104, + "step": 18289 + }, + { + "epoch": 0.15876598293417593, + "grad_norm": 0.205078125, + "learning_rate": 0.001856866228752355, + "loss": 0.1514, + "step": 18290 + }, + { + "epoch": 0.1587746634143801, + "grad_norm": 0.193359375, + "learning_rate": 0.0018568501240683521, + "loss": 0.0869, + "step": 18291 + }, + { + "epoch": 0.15878334389458426, + "grad_norm": 0.08642578125, + "learning_rate": 0.0018568340185566696, + "loss": 0.1016, + "step": 18292 + }, + { + "epoch": 0.15879202437478843, + "grad_norm": 0.162109375, + "learning_rate": 0.0018568179122173242, + "loss": 0.0957, + "step": 18293 + }, + { + "epoch": 0.1588007048549926, + "grad_norm": 0.421875, + "learning_rate": 0.0018568018050503345, + "loss": 0.127, + "step": 18294 + }, + { + "epoch": 0.15880938533519676, + "grad_norm": 0.859375, + "learning_rate": 0.0018567856970557177, + "loss": 0.1367, + "step": 18295 + }, + { + "epoch": 0.15881806581540092, + "grad_norm": 0.310546875, + "learning_rate": 0.0018567695882334913, + "loss": 0.1069, + "step": 18296 + }, + { + "epoch": 0.15882674629560506, + "grad_norm": 0.1015625, + "learning_rate": 0.001856753478583673, + "loss": 0.1162, + "step": 18297 + }, + { + "epoch": 0.15883542677580922, + "grad_norm": 0.294921875, + "learning_rate": 0.001856737368106281, + "loss": 0.0967, + "step": 18298 + }, + { + "epoch": 0.1588441072560134, + "grad_norm": 0.28125, + "learning_rate": 0.001856721256801332, + "loss": 0.1309, + "step": 18299 + }, + { + "epoch": 0.15885278773621755, + "grad_norm": 0.40625, + "learning_rate": 0.001856705144668844, + "loss": 0.1245, + "step": 18300 + }, + { + "epoch": 0.15886146821642172, + "grad_norm": 0.3046875, + "learning_rate": 0.0018566890317088343, + "loss": 0.1328, + "step": 18301 + }, + { + "epoch": 0.15887014869662588, + "grad_norm": 0.1484375, + "learning_rate": 0.0018566729179213214, + "loss": 0.0928, + "step": 18302 + }, + { + "epoch": 0.15887882917683005, + "grad_norm": 0.796875, + "learning_rate": 0.001856656803306322, + "loss": 0.1021, + "step": 18303 + }, + { + "epoch": 0.15888750965703421, + "grad_norm": 0.57421875, + "learning_rate": 0.001856640687863854, + "loss": 0.0957, + "step": 18304 + }, + { + "epoch": 0.15889619013723838, + "grad_norm": 0.1796875, + "learning_rate": 0.0018566245715939355, + "loss": 0.0825, + "step": 18305 + }, + { + "epoch": 0.15890487061744255, + "grad_norm": 0.06103515625, + "learning_rate": 0.0018566084544965836, + "loss": 0.0918, + "step": 18306 + }, + { + "epoch": 0.1589135510976467, + "grad_norm": 0.1318359375, + "learning_rate": 
0.001856592336571816, + "loss": 0.1074, + "step": 18307 + }, + { + "epoch": 0.15892223157785088, + "grad_norm": 0.09912109375, + "learning_rate": 0.0018565762178196503, + "loss": 0.1167, + "step": 18308 + }, + { + "epoch": 0.15893091205805504, + "grad_norm": 0.1357421875, + "learning_rate": 0.0018565600982401046, + "loss": 0.1475, + "step": 18309 + }, + { + "epoch": 0.1589395925382592, + "grad_norm": 0.146484375, + "learning_rate": 0.0018565439778331957, + "loss": 0.1191, + "step": 18310 + }, + { + "epoch": 0.15894827301846337, + "grad_norm": 0.5625, + "learning_rate": 0.001856527856598942, + "loss": 0.1367, + "step": 18311 + }, + { + "epoch": 0.15895695349866754, + "grad_norm": 1.2734375, + "learning_rate": 0.0018565117345373605, + "loss": 0.1182, + "step": 18312 + }, + { + "epoch": 0.1589656339788717, + "grad_norm": 0.53125, + "learning_rate": 0.0018564956116484692, + "loss": 0.1021, + "step": 18313 + }, + { + "epoch": 0.15897431445907587, + "grad_norm": 0.203125, + "learning_rate": 0.0018564794879322858, + "loss": 0.1357, + "step": 18314 + }, + { + "epoch": 0.15898299493928003, + "grad_norm": 0.58984375, + "learning_rate": 0.0018564633633888275, + "loss": 0.0986, + "step": 18315 + }, + { + "epoch": 0.1589916754194842, + "grad_norm": 0.89453125, + "learning_rate": 0.0018564472380181126, + "loss": 0.1118, + "step": 18316 + }, + { + "epoch": 0.15900035589968836, + "grad_norm": 0.171875, + "learning_rate": 0.0018564311118201581, + "loss": 0.0957, + "step": 18317 + }, + { + "epoch": 0.15900903637989253, + "grad_norm": 0.31640625, + "learning_rate": 0.0018564149847949823, + "loss": 0.1133, + "step": 18318 + }, + { + "epoch": 0.1590177168600967, + "grad_norm": 0.57421875, + "learning_rate": 0.0018563988569426022, + "loss": 0.0952, + "step": 18319 + }, + { + "epoch": 0.15902639734030086, + "grad_norm": 0.2099609375, + "learning_rate": 0.0018563827282630356, + "loss": 0.1201, + "step": 18320 + }, + { + "epoch": 0.15903507782050502, + "grad_norm": 0.453125, + "learning_rate": 0.0018563665987563, + "loss": 0.1108, + "step": 18321 + }, + { + "epoch": 0.1590437583007092, + "grad_norm": 0.369140625, + "learning_rate": 0.0018563504684224138, + "loss": 0.1094, + "step": 18322 + }, + { + "epoch": 0.15905243878091335, + "grad_norm": 0.216796875, + "learning_rate": 0.0018563343372613936, + "loss": 0.1191, + "step": 18323 + }, + { + "epoch": 0.15906111926111752, + "grad_norm": 0.2890625, + "learning_rate": 0.0018563182052732578, + "loss": 0.1289, + "step": 18324 + }, + { + "epoch": 0.15906979974132168, + "grad_norm": 0.341796875, + "learning_rate": 0.001856302072458024, + "loss": 0.123, + "step": 18325 + }, + { + "epoch": 0.15907848022152585, + "grad_norm": 0.3359375, + "learning_rate": 0.001856285938815709, + "loss": 0.1406, + "step": 18326 + }, + { + "epoch": 0.15908716070173, + "grad_norm": 0.357421875, + "learning_rate": 0.0018562698043463313, + "loss": 0.1089, + "step": 18327 + }, + { + "epoch": 0.15909584118193418, + "grad_norm": 0.455078125, + "learning_rate": 0.0018562536690499086, + "loss": 0.1357, + "step": 18328 + }, + { + "epoch": 0.15910452166213834, + "grad_norm": 0.515625, + "learning_rate": 0.0018562375329264581, + "loss": 0.085, + "step": 18329 + }, + { + "epoch": 0.1591132021423425, + "grad_norm": 0.1015625, + "learning_rate": 0.0018562213959759973, + "loss": 0.1123, + "step": 18330 + }, + { + "epoch": 0.15912188262254667, + "grad_norm": 0.8125, + "learning_rate": 0.0018562052581985447, + "loss": 0.103, + "step": 18331 + }, + { + "epoch": 0.15913056310275084, + "grad_norm": 0.34375, + 
"learning_rate": 0.001856189119594117, + "loss": 0.1475, + "step": 18332 + }, + { + "epoch": 0.159139243582955, + "grad_norm": 0.265625, + "learning_rate": 0.0018561729801627321, + "loss": 0.0977, + "step": 18333 + }, + { + "epoch": 0.15914792406315917, + "grad_norm": 0.8046875, + "learning_rate": 0.0018561568399044083, + "loss": 0.1387, + "step": 18334 + }, + { + "epoch": 0.15915660454336333, + "grad_norm": 0.1337890625, + "learning_rate": 0.0018561406988191623, + "loss": 0.1035, + "step": 18335 + }, + { + "epoch": 0.1591652850235675, + "grad_norm": 1.328125, + "learning_rate": 0.0018561245569070123, + "loss": 0.3164, + "step": 18336 + }, + { + "epoch": 0.15917396550377166, + "grad_norm": 0.21484375, + "learning_rate": 0.0018561084141679757, + "loss": 0.0913, + "step": 18337 + }, + { + "epoch": 0.15918264598397583, + "grad_norm": 0.09619140625, + "learning_rate": 0.0018560922706020706, + "loss": 0.1504, + "step": 18338 + }, + { + "epoch": 0.15919132646418, + "grad_norm": 0.17578125, + "learning_rate": 0.0018560761262093142, + "loss": 0.1045, + "step": 18339 + }, + { + "epoch": 0.15920000694438416, + "grad_norm": 2.3125, + "learning_rate": 0.0018560599809897245, + "loss": 0.1543, + "step": 18340 + }, + { + "epoch": 0.15920868742458832, + "grad_norm": 0.154296875, + "learning_rate": 0.0018560438349433186, + "loss": 0.1445, + "step": 18341 + }, + { + "epoch": 0.1592173679047925, + "grad_norm": 0.134765625, + "learning_rate": 0.001856027688070115, + "loss": 0.1504, + "step": 18342 + }, + { + "epoch": 0.15922604838499665, + "grad_norm": 0.353515625, + "learning_rate": 0.0018560115403701304, + "loss": 0.0977, + "step": 18343 + }, + { + "epoch": 0.15923472886520082, + "grad_norm": 0.6328125, + "learning_rate": 0.001855995391843383, + "loss": 0.1514, + "step": 18344 + }, + { + "epoch": 0.15924340934540498, + "grad_norm": 0.404296875, + "learning_rate": 0.0018559792424898908, + "loss": 0.1113, + "step": 18345 + }, + { + "epoch": 0.15925208982560915, + "grad_norm": 0.07421875, + "learning_rate": 0.001855963092309671, + "loss": 0.1089, + "step": 18346 + }, + { + "epoch": 0.15926077030581332, + "grad_norm": 0.98046875, + "learning_rate": 0.001855946941302741, + "loss": 0.0957, + "step": 18347 + }, + { + "epoch": 0.15926945078601748, + "grad_norm": 0.84765625, + "learning_rate": 0.0018559307894691185, + "loss": 0.1445, + "step": 18348 + }, + { + "epoch": 0.15927813126622165, + "grad_norm": 0.0791015625, + "learning_rate": 0.0018559146368088222, + "loss": 0.124, + "step": 18349 + }, + { + "epoch": 0.1592868117464258, + "grad_norm": 0.2392578125, + "learning_rate": 0.0018558984833218686, + "loss": 0.1094, + "step": 18350 + }, + { + "epoch": 0.15929549222662998, + "grad_norm": 0.1025390625, + "learning_rate": 0.0018558823290082758, + "loss": 0.1172, + "step": 18351 + }, + { + "epoch": 0.15930417270683414, + "grad_norm": 0.392578125, + "learning_rate": 0.0018558661738680615, + "loss": 0.0894, + "step": 18352 + }, + { + "epoch": 0.1593128531870383, + "grad_norm": 0.51953125, + "learning_rate": 0.0018558500179012435, + "loss": 0.1172, + "step": 18353 + }, + { + "epoch": 0.15932153366724247, + "grad_norm": 0.2216796875, + "learning_rate": 0.0018558338611078392, + "loss": 0.1514, + "step": 18354 + }, + { + "epoch": 0.15933021414744664, + "grad_norm": 0.263671875, + "learning_rate": 0.0018558177034878659, + "loss": 0.1543, + "step": 18355 + }, + { + "epoch": 0.1593388946276508, + "grad_norm": 0.1923828125, + "learning_rate": 0.0018558015450413422, + "loss": 0.1357, + "step": 18356 + }, + { + "epoch": 
0.15934757510785497, + "grad_norm": 0.60546875, + "learning_rate": 0.001855785385768285, + "loss": 0.1357, + "step": 18357 + }, + { + "epoch": 0.15935625558805913, + "grad_norm": 0.07958984375, + "learning_rate": 0.0018557692256687124, + "loss": 0.123, + "step": 18358 + }, + { + "epoch": 0.1593649360682633, + "grad_norm": 0.11083984375, + "learning_rate": 0.001855753064742642, + "loss": 0.1094, + "step": 18359 + }, + { + "epoch": 0.15937361654846746, + "grad_norm": 0.35546875, + "learning_rate": 0.0018557369029900913, + "loss": 0.127, + "step": 18360 + }, + { + "epoch": 0.15938229702867163, + "grad_norm": 0.291015625, + "learning_rate": 0.0018557207404110779, + "loss": 0.1162, + "step": 18361 + }, + { + "epoch": 0.1593909775088758, + "grad_norm": 0.390625, + "learning_rate": 0.0018557045770056198, + "loss": 0.1123, + "step": 18362 + }, + { + "epoch": 0.15939965798907996, + "grad_norm": 0.453125, + "learning_rate": 0.0018556884127737345, + "loss": 0.0967, + "step": 18363 + }, + { + "epoch": 0.15940833846928412, + "grad_norm": 0.2060546875, + "learning_rate": 0.00185567224771544, + "loss": 0.1582, + "step": 18364 + }, + { + "epoch": 0.1594170189494883, + "grad_norm": 0.296875, + "learning_rate": 0.0018556560818307536, + "loss": 0.1147, + "step": 18365 + }, + { + "epoch": 0.15942569942969245, + "grad_norm": 0.173828125, + "learning_rate": 0.0018556399151196928, + "loss": 0.1143, + "step": 18366 + }, + { + "epoch": 0.15943437990989662, + "grad_norm": 0.423828125, + "learning_rate": 0.0018556237475822754, + "loss": 0.1074, + "step": 18367 + }, + { + "epoch": 0.15944306039010078, + "grad_norm": 0.2734375, + "learning_rate": 0.0018556075792185197, + "loss": 0.1113, + "step": 18368 + }, + { + "epoch": 0.15945174087030495, + "grad_norm": 0.10400390625, + "learning_rate": 0.0018555914100284428, + "loss": 0.0996, + "step": 18369 + }, + { + "epoch": 0.1594604213505091, + "grad_norm": 0.5390625, + "learning_rate": 0.0018555752400120622, + "loss": 0.1011, + "step": 18370 + }, + { + "epoch": 0.15946910183071328, + "grad_norm": 0.232421875, + "learning_rate": 0.001855559069169396, + "loss": 0.1094, + "step": 18371 + }, + { + "epoch": 0.15947778231091744, + "grad_norm": 0.10302734375, + "learning_rate": 0.0018555428975004617, + "loss": 0.1055, + "step": 18372 + }, + { + "epoch": 0.1594864627911216, + "grad_norm": 0.154296875, + "learning_rate": 0.0018555267250052775, + "loss": 0.126, + "step": 18373 + }, + { + "epoch": 0.15949514327132577, + "grad_norm": 0.671875, + "learning_rate": 0.0018555105516838602, + "loss": 0.1631, + "step": 18374 + }, + { + "epoch": 0.15950382375152994, + "grad_norm": 0.1474609375, + "learning_rate": 0.0018554943775362278, + "loss": 0.125, + "step": 18375 + }, + { + "epoch": 0.1595125042317341, + "grad_norm": 0.51171875, + "learning_rate": 0.0018554782025623983, + "loss": 0.124, + "step": 18376 + }, + { + "epoch": 0.15952118471193827, + "grad_norm": 0.83203125, + "learning_rate": 0.0018554620267623892, + "loss": 0.1001, + "step": 18377 + }, + { + "epoch": 0.15952986519214243, + "grad_norm": 0.458984375, + "learning_rate": 0.0018554458501362182, + "loss": 0.168, + "step": 18378 + }, + { + "epoch": 0.1595385456723466, + "grad_norm": 0.6171875, + "learning_rate": 0.0018554296726839027, + "loss": 0.1455, + "step": 18379 + }, + { + "epoch": 0.15954722615255076, + "grad_norm": 0.494140625, + "learning_rate": 0.001855413494405461, + "loss": 0.0977, + "step": 18380 + }, + { + "epoch": 0.15955590663275493, + "grad_norm": 0.5625, + "learning_rate": 0.0018553973153009103, + "loss": 0.0903, + 
"step": 18381 + }, + { + "epoch": 0.1595645871129591, + "grad_norm": 0.2890625, + "learning_rate": 0.0018553811353702685, + "loss": 0.1289, + "step": 18382 + }, + { + "epoch": 0.15957326759316326, + "grad_norm": 0.193359375, + "learning_rate": 0.001855364954613553, + "loss": 0.1025, + "step": 18383 + }, + { + "epoch": 0.15958194807336742, + "grad_norm": 0.234375, + "learning_rate": 0.0018553487730307823, + "loss": 0.1504, + "step": 18384 + }, + { + "epoch": 0.1595906285535716, + "grad_norm": 0.291015625, + "learning_rate": 0.001855332590621973, + "loss": 0.1016, + "step": 18385 + }, + { + "epoch": 0.15959930903377575, + "grad_norm": 0.236328125, + "learning_rate": 0.0018553164073871434, + "loss": 0.1348, + "step": 18386 + }, + { + "epoch": 0.15960798951397992, + "grad_norm": 0.447265625, + "learning_rate": 0.0018553002233263115, + "loss": 0.1914, + "step": 18387 + }, + { + "epoch": 0.15961666999418408, + "grad_norm": 1.078125, + "learning_rate": 0.0018552840384394944, + "loss": 0.1177, + "step": 18388 + }, + { + "epoch": 0.15962535047438825, + "grad_norm": 0.1572265625, + "learning_rate": 0.0018552678527267098, + "loss": 0.104, + "step": 18389 + }, + { + "epoch": 0.15963403095459242, + "grad_norm": 0.283203125, + "learning_rate": 0.0018552516661879757, + "loss": 0.1387, + "step": 18390 + }, + { + "epoch": 0.15964271143479658, + "grad_norm": 0.796875, + "learning_rate": 0.00185523547882331, + "loss": 0.0942, + "step": 18391 + }, + { + "epoch": 0.15965139191500075, + "grad_norm": 0.125, + "learning_rate": 0.0018552192906327299, + "loss": 0.0972, + "step": 18392 + }, + { + "epoch": 0.1596600723952049, + "grad_norm": 0.255859375, + "learning_rate": 0.0018552031016162533, + "loss": 0.0776, + "step": 18393 + }, + { + "epoch": 0.15966875287540908, + "grad_norm": 0.140625, + "learning_rate": 0.0018551869117738981, + "loss": 0.127, + "step": 18394 + }, + { + "epoch": 0.15967743335561324, + "grad_norm": 0.15625, + "learning_rate": 0.0018551707211056818, + "loss": 0.1104, + "step": 18395 + }, + { + "epoch": 0.1596861138358174, + "grad_norm": 0.310546875, + "learning_rate": 0.001855154529611622, + "loss": 0.125, + "step": 18396 + }, + { + "epoch": 0.15969479431602157, + "grad_norm": 0.162109375, + "learning_rate": 0.0018551383372917367, + "loss": 0.1143, + "step": 18397 + }, + { + "epoch": 0.15970347479622574, + "grad_norm": 0.3359375, + "learning_rate": 0.0018551221441460435, + "loss": 0.1123, + "step": 18398 + }, + { + "epoch": 0.1597121552764299, + "grad_norm": 0.0732421875, + "learning_rate": 0.00185510595017456, + "loss": 0.0894, + "step": 18399 + }, + { + "epoch": 0.15972083575663407, + "grad_norm": 0.1943359375, + "learning_rate": 0.0018550897553773038, + "loss": 0.0918, + "step": 18400 + }, + { + "epoch": 0.15972951623683823, + "grad_norm": 0.111328125, + "learning_rate": 0.001855073559754293, + "loss": 0.0947, + "step": 18401 + }, + { + "epoch": 0.1597381967170424, + "grad_norm": 0.06298828125, + "learning_rate": 0.0018550573633055452, + "loss": 0.0864, + "step": 18402 + }, + { + "epoch": 0.15974687719724656, + "grad_norm": 0.138671875, + "learning_rate": 0.0018550411660310778, + "loss": 0.1172, + "step": 18403 + }, + { + "epoch": 0.15975555767745073, + "grad_norm": 0.10595703125, + "learning_rate": 0.001855024967930909, + "loss": 0.1416, + "step": 18404 + }, + { + "epoch": 0.1597642381576549, + "grad_norm": 0.7890625, + "learning_rate": 0.0018550087690050558, + "loss": 0.0986, + "step": 18405 + }, + { + "epoch": 0.15977291863785906, + "grad_norm": 0.171875, + "learning_rate": 
0.0018549925692535366, + "loss": 0.1387, + "step": 18406 + }, + { + "epoch": 0.15978159911806322, + "grad_norm": 0.265625, + "learning_rate": 0.0018549763686763689, + "loss": 0.0967, + "step": 18407 + }, + { + "epoch": 0.1597902795982674, + "grad_norm": 0.46484375, + "learning_rate": 0.0018549601672735703, + "loss": 0.0801, + "step": 18408 + }, + { + "epoch": 0.15979896007847155, + "grad_norm": 0.1103515625, + "learning_rate": 0.0018549439650451588, + "loss": 0.1035, + "step": 18409 + }, + { + "epoch": 0.15980764055867572, + "grad_norm": 0.15234375, + "learning_rate": 0.0018549277619911515, + "loss": 0.0796, + "step": 18410 + }, + { + "epoch": 0.15981632103887988, + "grad_norm": 0.64453125, + "learning_rate": 0.001854911558111567, + "loss": 0.124, + "step": 18411 + }, + { + "epoch": 0.15982500151908405, + "grad_norm": 0.08935546875, + "learning_rate": 0.0018548953534064223, + "loss": 0.1035, + "step": 18412 + }, + { + "epoch": 0.1598336819992882, + "grad_norm": 0.158203125, + "learning_rate": 0.0018548791478757353, + "loss": 0.1025, + "step": 18413 + }, + { + "epoch": 0.15984236247949238, + "grad_norm": 0.283203125, + "learning_rate": 0.0018548629415195241, + "loss": 0.1289, + "step": 18414 + }, + { + "epoch": 0.15985104295969654, + "grad_norm": 0.43359375, + "learning_rate": 0.0018548467343378061, + "loss": 0.1187, + "step": 18415 + }, + { + "epoch": 0.1598597234399007, + "grad_norm": 0.126953125, + "learning_rate": 0.001854830526330599, + "loss": 0.1406, + "step": 18416 + }, + { + "epoch": 0.15986840392010487, + "grad_norm": 1.0546875, + "learning_rate": 0.0018548143174979203, + "loss": 0.1045, + "step": 18417 + }, + { + "epoch": 0.15987708440030904, + "grad_norm": 0.1162109375, + "learning_rate": 0.0018547981078397885, + "loss": 0.1426, + "step": 18418 + }, + { + "epoch": 0.1598857648805132, + "grad_norm": 0.279296875, + "learning_rate": 0.0018547818973562206, + "loss": 0.1113, + "step": 18419 + }, + { + "epoch": 0.15989444536071734, + "grad_norm": 0.189453125, + "learning_rate": 0.0018547656860472344, + "loss": 0.1289, + "step": 18420 + }, + { + "epoch": 0.1599031258409215, + "grad_norm": 0.1923828125, + "learning_rate": 0.001854749473912848, + "loss": 0.1416, + "step": 18421 + }, + { + "epoch": 0.15991180632112567, + "grad_norm": 0.107421875, + "learning_rate": 0.0018547332609530787, + "loss": 0.0752, + "step": 18422 + }, + { + "epoch": 0.15992048680132984, + "grad_norm": 0.11474609375, + "learning_rate": 0.0018547170471679443, + "loss": 0.123, + "step": 18423 + }, + { + "epoch": 0.159929167281534, + "grad_norm": 0.341796875, + "learning_rate": 0.0018547008325574632, + "loss": 0.1104, + "step": 18424 + }, + { + "epoch": 0.15993784776173817, + "grad_norm": 0.1953125, + "learning_rate": 0.0018546846171216523, + "loss": 0.1113, + "step": 18425 + }, + { + "epoch": 0.15994652824194233, + "grad_norm": 0.11962890625, + "learning_rate": 0.0018546684008605295, + "loss": 0.1104, + "step": 18426 + }, + { + "epoch": 0.1599552087221465, + "grad_norm": 0.11865234375, + "learning_rate": 0.0018546521837741132, + "loss": 0.0884, + "step": 18427 + }, + { + "epoch": 0.15996388920235066, + "grad_norm": 0.197265625, + "learning_rate": 0.00185463596586242, + "loss": 0.1631, + "step": 18428 + }, + { + "epoch": 0.15997256968255483, + "grad_norm": 0.32421875, + "learning_rate": 0.0018546197471254688, + "loss": 0.127, + "step": 18429 + }, + { + "epoch": 0.159981250162759, + "grad_norm": 0.0966796875, + "learning_rate": 0.0018546035275632763, + "loss": 0.1045, + "step": 18430 + }, + { + "epoch": 
0.15998993064296316, + "grad_norm": 0.232421875, + "learning_rate": 0.001854587307175861, + "loss": 0.0991, + "step": 18431 + }, + { + "epoch": 0.15999861112316732, + "grad_norm": 0.337890625, + "learning_rate": 0.0018545710859632404, + "loss": 0.1289, + "step": 18432 + }, + { + "epoch": 0.1600072916033715, + "grad_norm": 0.1142578125, + "learning_rate": 0.0018545548639254319, + "loss": 0.1484, + "step": 18433 + }, + { + "epoch": 0.16001597208357565, + "grad_norm": 0.3828125, + "learning_rate": 0.0018545386410624537, + "loss": 0.1128, + "step": 18434 + }, + { + "epoch": 0.16002465256377982, + "grad_norm": 0.400390625, + "learning_rate": 0.0018545224173743234, + "loss": 0.1162, + "step": 18435 + }, + { + "epoch": 0.16003333304398398, + "grad_norm": 0.443359375, + "learning_rate": 0.0018545061928610588, + "loss": 0.1494, + "step": 18436 + }, + { + "epoch": 0.16004201352418815, + "grad_norm": 0.1640625, + "learning_rate": 0.0018544899675226774, + "loss": 0.1045, + "step": 18437 + }, + { + "epoch": 0.1600506940043923, + "grad_norm": 0.365234375, + "learning_rate": 0.0018544737413591973, + "loss": 0.127, + "step": 18438 + }, + { + "epoch": 0.16005937448459648, + "grad_norm": 0.216796875, + "learning_rate": 0.0018544575143706358, + "loss": 0.1064, + "step": 18439 + }, + { + "epoch": 0.16006805496480064, + "grad_norm": 0.34375, + "learning_rate": 0.001854441286557011, + "loss": 0.1533, + "step": 18440 + }, + { + "epoch": 0.1600767354450048, + "grad_norm": 0.353515625, + "learning_rate": 0.0018544250579183405, + "loss": 0.0864, + "step": 18441 + }, + { + "epoch": 0.16008541592520897, + "grad_norm": 0.4375, + "learning_rate": 0.0018544088284546422, + "loss": 0.0947, + "step": 18442 + }, + { + "epoch": 0.16009409640541314, + "grad_norm": 0.1484375, + "learning_rate": 0.0018543925981659337, + "loss": 0.0928, + "step": 18443 + }, + { + "epoch": 0.1601027768856173, + "grad_norm": 0.357421875, + "learning_rate": 0.001854376367052233, + "loss": 0.1152, + "step": 18444 + }, + { + "epoch": 0.16011145736582147, + "grad_norm": 0.61328125, + "learning_rate": 0.0018543601351135575, + "loss": 0.1113, + "step": 18445 + }, + { + "epoch": 0.16012013784602563, + "grad_norm": 0.2421875, + "learning_rate": 0.001854343902349925, + "loss": 0.1055, + "step": 18446 + }, + { + "epoch": 0.1601288183262298, + "grad_norm": 0.53125, + "learning_rate": 0.0018543276687613534, + "loss": 0.1025, + "step": 18447 + }, + { + "epoch": 0.16013749880643396, + "grad_norm": 0.2490234375, + "learning_rate": 0.0018543114343478602, + "loss": 0.1123, + "step": 18448 + }, + { + "epoch": 0.16014617928663813, + "grad_norm": 0.1689453125, + "learning_rate": 0.0018542951991094637, + "loss": 0.1133, + "step": 18449 + }, + { + "epoch": 0.1601548597668423, + "grad_norm": 0.328125, + "learning_rate": 0.001854278963046181, + "loss": 0.0986, + "step": 18450 + }, + { + "epoch": 0.16016354024704646, + "grad_norm": 0.08349609375, + "learning_rate": 0.0018542627261580302, + "loss": 0.103, + "step": 18451 + }, + { + "epoch": 0.16017222072725062, + "grad_norm": 0.296875, + "learning_rate": 0.0018542464884450291, + "loss": 0.1226, + "step": 18452 + }, + { + "epoch": 0.1601809012074548, + "grad_norm": 0.45703125, + "learning_rate": 0.0018542302499071957, + "loss": 0.1025, + "step": 18453 + }, + { + "epoch": 0.16018958168765896, + "grad_norm": 0.54296875, + "learning_rate": 0.001854214010544547, + "loss": 0.1094, + "step": 18454 + }, + { + "epoch": 0.16019826216786312, + "grad_norm": 0.11572265625, + "learning_rate": 0.0018541977703571014, + "loss": 0.0947, + 
"step": 18455 + }, + { + "epoch": 0.16020694264806729, + "grad_norm": 0.087890625, + "learning_rate": 0.0018541815293448762, + "loss": 0.0991, + "step": 18456 + }, + { + "epoch": 0.16021562312827145, + "grad_norm": 0.271484375, + "learning_rate": 0.0018541652875078893, + "loss": 0.1113, + "step": 18457 + }, + { + "epoch": 0.16022430360847562, + "grad_norm": 0.09130859375, + "learning_rate": 0.001854149044846159, + "loss": 0.0898, + "step": 18458 + }, + { + "epoch": 0.16023298408867978, + "grad_norm": 0.5078125, + "learning_rate": 0.0018541328013597026, + "loss": 0.1206, + "step": 18459 + }, + { + "epoch": 0.16024166456888395, + "grad_norm": 0.1650390625, + "learning_rate": 0.0018541165570485376, + "loss": 0.1289, + "step": 18460 + }, + { + "epoch": 0.1602503450490881, + "grad_norm": 0.2119140625, + "learning_rate": 0.0018541003119126823, + "loss": 0.1289, + "step": 18461 + }, + { + "epoch": 0.16025902552929228, + "grad_norm": 0.283203125, + "learning_rate": 0.0018540840659521543, + "loss": 0.1475, + "step": 18462 + }, + { + "epoch": 0.16026770600949644, + "grad_norm": 0.42578125, + "learning_rate": 0.001854067819166971, + "loss": 0.1152, + "step": 18463 + }, + { + "epoch": 0.1602763864897006, + "grad_norm": 0.474609375, + "learning_rate": 0.0018540515715571506, + "loss": 0.1045, + "step": 18464 + }, + { + "epoch": 0.16028506696990477, + "grad_norm": 0.0810546875, + "learning_rate": 0.0018540353231227106, + "loss": 0.1211, + "step": 18465 + }, + { + "epoch": 0.16029374745010894, + "grad_norm": 0.34375, + "learning_rate": 0.0018540190738636692, + "loss": 0.0874, + "step": 18466 + }, + { + "epoch": 0.1603024279303131, + "grad_norm": 0.59375, + "learning_rate": 0.0018540028237800437, + "loss": 0.125, + "step": 18467 + }, + { + "epoch": 0.16031110841051727, + "grad_norm": 0.171875, + "learning_rate": 0.0018539865728718521, + "loss": 0.1299, + "step": 18468 + }, + { + "epoch": 0.16031978889072143, + "grad_norm": 0.10986328125, + "learning_rate": 0.001853970321139112, + "loss": 0.0957, + "step": 18469 + }, + { + "epoch": 0.1603284693709256, + "grad_norm": 0.126953125, + "learning_rate": 0.0018539540685818415, + "loss": 0.1162, + "step": 18470 + }, + { + "epoch": 0.16033714985112976, + "grad_norm": 0.3515625, + "learning_rate": 0.0018539378152000578, + "loss": 0.0986, + "step": 18471 + }, + { + "epoch": 0.16034583033133393, + "grad_norm": 0.70703125, + "learning_rate": 0.0018539215609937793, + "loss": 0.166, + "step": 18472 + }, + { + "epoch": 0.1603545108115381, + "grad_norm": 1.1171875, + "learning_rate": 0.0018539053059630235, + "loss": 0.103, + "step": 18473 + }, + { + "epoch": 0.16036319129174226, + "grad_norm": 0.5390625, + "learning_rate": 0.0018538890501078082, + "loss": 0.168, + "step": 18474 + }, + { + "epoch": 0.16037187177194642, + "grad_norm": 0.08642578125, + "learning_rate": 0.0018538727934281512, + "loss": 0.1045, + "step": 18475 + }, + { + "epoch": 0.1603805522521506, + "grad_norm": 0.1279296875, + "learning_rate": 0.00185385653592407, + "loss": 0.0996, + "step": 18476 + }, + { + "epoch": 0.16038923273235475, + "grad_norm": 0.2158203125, + "learning_rate": 0.0018538402775955826, + "loss": 0.1699, + "step": 18477 + }, + { + "epoch": 0.16039791321255892, + "grad_norm": 0.0751953125, + "learning_rate": 0.001853824018442707, + "loss": 0.0879, + "step": 18478 + }, + { + "epoch": 0.16040659369276308, + "grad_norm": 0.1455078125, + "learning_rate": 0.0018538077584654606, + "loss": 0.1533, + "step": 18479 + }, + { + "epoch": 0.16041527417296725, + "grad_norm": 0.40234375, + 
"learning_rate": 0.0018537914976638618, + "loss": 0.1348, + "step": 18480 + }, + { + "epoch": 0.1604239546531714, + "grad_norm": 0.28125, + "learning_rate": 0.0018537752360379277, + "loss": 0.1338, + "step": 18481 + }, + { + "epoch": 0.16043263513337558, + "grad_norm": 0.361328125, + "learning_rate": 0.001853758973587676, + "loss": 0.1191, + "step": 18482 + }, + { + "epoch": 0.16044131561357974, + "grad_norm": 0.291015625, + "learning_rate": 0.0018537427103131252, + "loss": 0.123, + "step": 18483 + }, + { + "epoch": 0.1604499960937839, + "grad_norm": 0.546875, + "learning_rate": 0.0018537264462142925, + "loss": 0.1099, + "step": 18484 + }, + { + "epoch": 0.16045867657398807, + "grad_norm": 0.2431640625, + "learning_rate": 0.001853710181291196, + "loss": 0.1475, + "step": 18485 + }, + { + "epoch": 0.16046735705419224, + "grad_norm": 0.283203125, + "learning_rate": 0.0018536939155438534, + "loss": 0.1113, + "step": 18486 + }, + { + "epoch": 0.1604760375343964, + "grad_norm": 0.79296875, + "learning_rate": 0.0018536776489722824, + "loss": 0.1006, + "step": 18487 + }, + { + "epoch": 0.16048471801460057, + "grad_norm": 0.609375, + "learning_rate": 0.0018536613815765008, + "loss": 0.1196, + "step": 18488 + }, + { + "epoch": 0.16049339849480473, + "grad_norm": 0.2734375, + "learning_rate": 0.0018536451133565265, + "loss": 0.1367, + "step": 18489 + }, + { + "epoch": 0.1605020789750089, + "grad_norm": 0.1357421875, + "learning_rate": 0.0018536288443123771, + "loss": 0.1245, + "step": 18490 + }, + { + "epoch": 0.16051075945521306, + "grad_norm": 0.4453125, + "learning_rate": 0.0018536125744440706, + "loss": 0.1172, + "step": 18491 + }, + { + "epoch": 0.16051943993541723, + "grad_norm": 0.1689453125, + "learning_rate": 0.0018535963037516249, + "loss": 0.0723, + "step": 18492 + }, + { + "epoch": 0.1605281204156214, + "grad_norm": 0.09814453125, + "learning_rate": 0.0018535800322350576, + "loss": 0.0967, + "step": 18493 + }, + { + "epoch": 0.16053680089582556, + "grad_norm": 0.0859375, + "learning_rate": 0.0018535637598943863, + "loss": 0.0649, + "step": 18494 + }, + { + "epoch": 0.16054548137602973, + "grad_norm": 0.333984375, + "learning_rate": 0.001853547486729629, + "loss": 0.1367, + "step": 18495 + }, + { + "epoch": 0.1605541618562339, + "grad_norm": 0.1650390625, + "learning_rate": 0.0018535312127408035, + "loss": 0.0967, + "step": 18496 + }, + { + "epoch": 0.16056284233643806, + "grad_norm": 0.5625, + "learning_rate": 0.0018535149379279273, + "loss": 0.1846, + "step": 18497 + }, + { + "epoch": 0.16057152281664222, + "grad_norm": 0.291015625, + "learning_rate": 0.001853498662291019, + "loss": 0.1211, + "step": 18498 + }, + { + "epoch": 0.16058020329684639, + "grad_norm": 0.267578125, + "learning_rate": 0.0018534823858300958, + "loss": 0.1338, + "step": 18499 + }, + { + "epoch": 0.16058888377705055, + "grad_norm": 0.609375, + "learning_rate": 0.0018534661085451755, + "loss": 0.1221, + "step": 18500 + }, + { + "epoch": 0.16059756425725472, + "grad_norm": 0.48046875, + "learning_rate": 0.0018534498304362756, + "loss": 0.2109, + "step": 18501 + }, + { + "epoch": 0.16060624473745888, + "grad_norm": 0.2333984375, + "learning_rate": 0.0018534335515034148, + "loss": 0.1182, + "step": 18502 + }, + { + "epoch": 0.16061492521766305, + "grad_norm": 0.2734375, + "learning_rate": 0.0018534172717466102, + "loss": 0.1113, + "step": 18503 + }, + { + "epoch": 0.1606236056978672, + "grad_norm": 0.2216796875, + "learning_rate": 0.00185340099116588, + "loss": 0.1011, + "step": 18504 + }, + { + "epoch": 
0.16063228617807138, + "grad_norm": 0.1923828125, + "learning_rate": 0.0018533847097612413, + "loss": 0.1172, + "step": 18505 + }, + { + "epoch": 0.16064096665827554, + "grad_norm": 0.1611328125, + "learning_rate": 0.0018533684275327128, + "loss": 0.1221, + "step": 18506 + }, + { + "epoch": 0.1606496471384797, + "grad_norm": 0.0869140625, + "learning_rate": 0.0018533521444803118, + "loss": 0.1104, + "step": 18507 + }, + { + "epoch": 0.16065832761868387, + "grad_norm": 0.7578125, + "learning_rate": 0.0018533358606040561, + "loss": 0.1328, + "step": 18508 + }, + { + "epoch": 0.16066700809888804, + "grad_norm": 0.12255859375, + "learning_rate": 0.001853319575903964, + "loss": 0.0942, + "step": 18509 + }, + { + "epoch": 0.1606756885790922, + "grad_norm": 0.376953125, + "learning_rate": 0.0018533032903800526, + "loss": 0.1143, + "step": 18510 + }, + { + "epoch": 0.16068436905929637, + "grad_norm": 0.1015625, + "learning_rate": 0.0018532870040323401, + "loss": 0.0928, + "step": 18511 + }, + { + "epoch": 0.16069304953950053, + "grad_norm": 0.158203125, + "learning_rate": 0.0018532707168608443, + "loss": 0.1348, + "step": 18512 + }, + { + "epoch": 0.1607017300197047, + "grad_norm": 0.6484375, + "learning_rate": 0.001853254428865583, + "loss": 0.1182, + "step": 18513 + }, + { + "epoch": 0.16071041049990886, + "grad_norm": 0.17578125, + "learning_rate": 0.0018532381400465736, + "loss": 0.0928, + "step": 18514 + }, + { + "epoch": 0.16071909098011303, + "grad_norm": 0.80078125, + "learning_rate": 0.0018532218504038348, + "loss": 0.126, + "step": 18515 + }, + { + "epoch": 0.1607277714603172, + "grad_norm": 0.234375, + "learning_rate": 0.0018532055599373837, + "loss": 0.0947, + "step": 18516 + }, + { + "epoch": 0.16073645194052136, + "grad_norm": 0.490234375, + "learning_rate": 0.0018531892686472382, + "loss": 0.1167, + "step": 18517 + }, + { + "epoch": 0.16074513242072552, + "grad_norm": 0.2373046875, + "learning_rate": 0.0018531729765334163, + "loss": 0.0898, + "step": 18518 + }, + { + "epoch": 0.1607538129009297, + "grad_norm": 0.109375, + "learning_rate": 0.0018531566835959358, + "loss": 0.1025, + "step": 18519 + }, + { + "epoch": 0.16076249338113385, + "grad_norm": 0.373046875, + "learning_rate": 0.0018531403898348147, + "loss": 0.1006, + "step": 18520 + }, + { + "epoch": 0.16077117386133802, + "grad_norm": 0.107421875, + "learning_rate": 0.0018531240952500702, + "loss": 0.1035, + "step": 18521 + }, + { + "epoch": 0.16077985434154218, + "grad_norm": 0.0810546875, + "learning_rate": 0.0018531077998417207, + "loss": 0.1055, + "step": 18522 + }, + { + "epoch": 0.16078853482174635, + "grad_norm": 0.58203125, + "learning_rate": 0.001853091503609784, + "loss": 0.1309, + "step": 18523 + }, + { + "epoch": 0.1607972153019505, + "grad_norm": 1.0, + "learning_rate": 0.0018530752065542774, + "loss": 0.1162, + "step": 18524 + }, + { + "epoch": 0.16080589578215468, + "grad_norm": 0.1171875, + "learning_rate": 0.0018530589086752191, + "loss": 0.0981, + "step": 18525 + }, + { + "epoch": 0.16081457626235884, + "grad_norm": 0.125, + "learning_rate": 0.001853042609972627, + "loss": 0.1235, + "step": 18526 + }, + { + "epoch": 0.160823256742563, + "grad_norm": 0.11083984375, + "learning_rate": 0.0018530263104465188, + "loss": 0.125, + "step": 18527 + }, + { + "epoch": 0.16083193722276717, + "grad_norm": 0.154296875, + "learning_rate": 0.0018530100100969124, + "loss": 0.1123, + "step": 18528 + }, + { + "epoch": 0.16084061770297134, + "grad_norm": 0.3515625, + "learning_rate": 0.0018529937089238253, + "loss": 0.106, + 
"step": 18529 + }, + { + "epoch": 0.1608492981831755, + "grad_norm": 0.2294921875, + "learning_rate": 0.001852977406927276, + "loss": 0.0781, + "step": 18530 + }, + { + "epoch": 0.16085797866337967, + "grad_norm": 0.3828125, + "learning_rate": 0.0018529611041072818, + "loss": 0.1138, + "step": 18531 + }, + { + "epoch": 0.16086665914358383, + "grad_norm": 0.19140625, + "learning_rate": 0.0018529448004638606, + "loss": 0.084, + "step": 18532 + }, + { + "epoch": 0.160875339623788, + "grad_norm": 0.73046875, + "learning_rate": 0.0018529284959970301, + "loss": 0.1079, + "step": 18533 + }, + { + "epoch": 0.16088402010399216, + "grad_norm": 0.34765625, + "learning_rate": 0.0018529121907068086, + "loss": 0.0586, + "step": 18534 + }, + { + "epoch": 0.16089270058419633, + "grad_norm": 0.091796875, + "learning_rate": 0.0018528958845932134, + "loss": 0.1104, + "step": 18535 + }, + { + "epoch": 0.1609013810644005, + "grad_norm": 0.2119140625, + "learning_rate": 0.0018528795776562627, + "loss": 0.124, + "step": 18536 + }, + { + "epoch": 0.16091006154460466, + "grad_norm": 0.1923828125, + "learning_rate": 0.0018528632698959742, + "loss": 0.1221, + "step": 18537 + }, + { + "epoch": 0.16091874202480883, + "grad_norm": 0.0927734375, + "learning_rate": 0.0018528469613123657, + "loss": 0.0815, + "step": 18538 + }, + { + "epoch": 0.160927422505013, + "grad_norm": 0.09716796875, + "learning_rate": 0.0018528306519054552, + "loss": 0.0991, + "step": 18539 + }, + { + "epoch": 0.16093610298521716, + "grad_norm": 0.52734375, + "learning_rate": 0.0018528143416752603, + "loss": 0.1006, + "step": 18540 + }, + { + "epoch": 0.16094478346542132, + "grad_norm": 0.52734375, + "learning_rate": 0.0018527980306217988, + "loss": 0.0845, + "step": 18541 + }, + { + "epoch": 0.16095346394562549, + "grad_norm": 0.134765625, + "learning_rate": 0.0018527817187450888, + "loss": 0.0962, + "step": 18542 + }, + { + "epoch": 0.16096214442582965, + "grad_norm": 0.1806640625, + "learning_rate": 0.0018527654060451482, + "loss": 0.1504, + "step": 18543 + }, + { + "epoch": 0.1609708249060338, + "grad_norm": 0.349609375, + "learning_rate": 0.0018527490925219945, + "loss": 0.1836, + "step": 18544 + }, + { + "epoch": 0.16097950538623795, + "grad_norm": 0.1533203125, + "learning_rate": 0.0018527327781756456, + "loss": 0.1006, + "step": 18545 + }, + { + "epoch": 0.16098818586644212, + "grad_norm": 0.77734375, + "learning_rate": 0.0018527164630061197, + "loss": 0.1514, + "step": 18546 + }, + { + "epoch": 0.16099686634664628, + "grad_norm": 0.1572265625, + "learning_rate": 0.001852700147013434, + "loss": 0.082, + "step": 18547 + }, + { + "epoch": 0.16100554682685045, + "grad_norm": 0.185546875, + "learning_rate": 0.0018526838301976072, + "loss": 0.0986, + "step": 18548 + }, + { + "epoch": 0.1610142273070546, + "grad_norm": 0.15234375, + "learning_rate": 0.0018526675125586562, + "loss": 0.1123, + "step": 18549 + }, + { + "epoch": 0.16102290778725878, + "grad_norm": 0.47265625, + "learning_rate": 0.0018526511940966, + "loss": 0.0913, + "step": 18550 + }, + { + "epoch": 0.16103158826746294, + "grad_norm": 0.07568359375, + "learning_rate": 0.0018526348748114552, + "loss": 0.1045, + "step": 18551 + }, + { + "epoch": 0.1610402687476671, + "grad_norm": 0.314453125, + "learning_rate": 0.0018526185547032403, + "loss": 0.0918, + "step": 18552 + }, + { + "epoch": 0.16104894922787127, + "grad_norm": 0.0791015625, + "learning_rate": 0.001852602233771973, + "loss": 0.0723, + "step": 18553 + }, + { + "epoch": 0.16105762970807544, + "grad_norm": 0.392578125, + 
"learning_rate": 0.0018525859120176714, + "loss": 0.0977, + "step": 18554 + }, + { + "epoch": 0.1610663101882796, + "grad_norm": 0.412109375, + "learning_rate": 0.0018525695894403532, + "loss": 0.1367, + "step": 18555 + }, + { + "epoch": 0.16107499066848377, + "grad_norm": 0.8984375, + "learning_rate": 0.001852553266040036, + "loss": 0.1621, + "step": 18556 + }, + { + "epoch": 0.16108367114868793, + "grad_norm": 0.16015625, + "learning_rate": 0.0018525369418167378, + "loss": 0.0977, + "step": 18557 + }, + { + "epoch": 0.1610923516288921, + "grad_norm": 0.359375, + "learning_rate": 0.0018525206167704766, + "loss": 0.1367, + "step": 18558 + }, + { + "epoch": 0.16110103210909626, + "grad_norm": 0.333984375, + "learning_rate": 0.0018525042909012701, + "loss": 0.1289, + "step": 18559 + }, + { + "epoch": 0.16110971258930043, + "grad_norm": 0.1748046875, + "learning_rate": 0.0018524879642091365, + "loss": 0.2305, + "step": 18560 + }, + { + "epoch": 0.1611183930695046, + "grad_norm": 0.451171875, + "learning_rate": 0.0018524716366940933, + "loss": 0.1094, + "step": 18561 + }, + { + "epoch": 0.16112707354970876, + "grad_norm": 0.2021484375, + "learning_rate": 0.0018524553083561581, + "loss": 0.1035, + "step": 18562 + }, + { + "epoch": 0.16113575402991293, + "grad_norm": 0.265625, + "learning_rate": 0.0018524389791953492, + "loss": 0.1123, + "step": 18563 + }, + { + "epoch": 0.1611444345101171, + "grad_norm": 0.1728515625, + "learning_rate": 0.0018524226492116845, + "loss": 0.0947, + "step": 18564 + }, + { + "epoch": 0.16115311499032126, + "grad_norm": 0.1640625, + "learning_rate": 0.0018524063184051818, + "loss": 0.0977, + "step": 18565 + }, + { + "epoch": 0.16116179547052542, + "grad_norm": 0.62109375, + "learning_rate": 0.0018523899867758584, + "loss": 0.1143, + "step": 18566 + }, + { + "epoch": 0.16117047595072959, + "grad_norm": 0.1484375, + "learning_rate": 0.0018523736543237328, + "loss": 0.1118, + "step": 18567 + }, + { + "epoch": 0.16117915643093375, + "grad_norm": 0.1630859375, + "learning_rate": 0.001852357321048823, + "loss": 0.0898, + "step": 18568 + }, + { + "epoch": 0.16118783691113792, + "grad_norm": 0.28515625, + "learning_rate": 0.0018523409869511463, + "loss": 0.0991, + "step": 18569 + }, + { + "epoch": 0.16119651739134208, + "grad_norm": 0.2109375, + "learning_rate": 0.001852324652030721, + "loss": 0.0957, + "step": 18570 + }, + { + "epoch": 0.16120519787154625, + "grad_norm": 0.7109375, + "learning_rate": 0.0018523083162875643, + "loss": 0.1367, + "step": 18571 + }, + { + "epoch": 0.1612138783517504, + "grad_norm": 0.333984375, + "learning_rate": 0.0018522919797216949, + "loss": 0.1201, + "step": 18572 + }, + { + "epoch": 0.16122255883195458, + "grad_norm": 0.65234375, + "learning_rate": 0.0018522756423331298, + "loss": 0.0947, + "step": 18573 + }, + { + "epoch": 0.16123123931215874, + "grad_norm": 0.3984375, + "learning_rate": 0.0018522593041218878, + "loss": 0.0967, + "step": 18574 + }, + { + "epoch": 0.1612399197923629, + "grad_norm": 0.1611328125, + "learning_rate": 0.0018522429650879863, + "loss": 0.1245, + "step": 18575 + }, + { + "epoch": 0.16124860027256707, + "grad_norm": 0.2734375, + "learning_rate": 0.0018522266252314434, + "loss": 0.0977, + "step": 18576 + }, + { + "epoch": 0.16125728075277124, + "grad_norm": 0.193359375, + "learning_rate": 0.0018522102845522765, + "loss": 0.0947, + "step": 18577 + }, + { + "epoch": 0.1612659612329754, + "grad_norm": 0.328125, + "learning_rate": 0.0018521939430505036, + "loss": 0.082, + "step": 18578 + }, + { + "epoch": 
0.16127464171317957, + "grad_norm": 0.49609375, + "learning_rate": 0.0018521776007261429, + "loss": 0.0771, + "step": 18579 + }, + { + "epoch": 0.16128332219338373, + "grad_norm": 0.40234375, + "learning_rate": 0.0018521612575792123, + "loss": 0.1533, + "step": 18580 + }, + { + "epoch": 0.1612920026735879, + "grad_norm": 0.6171875, + "learning_rate": 0.001852144913609729, + "loss": 0.1172, + "step": 18581 + }, + { + "epoch": 0.16130068315379206, + "grad_norm": 0.51171875, + "learning_rate": 0.0018521285688177114, + "loss": 0.0957, + "step": 18582 + }, + { + "epoch": 0.16130936363399623, + "grad_norm": 0.8671875, + "learning_rate": 0.0018521122232031775, + "loss": 0.1006, + "step": 18583 + }, + { + "epoch": 0.1613180441142004, + "grad_norm": 0.10400390625, + "learning_rate": 0.001852095876766145, + "loss": 0.1318, + "step": 18584 + }, + { + "epoch": 0.16132672459440456, + "grad_norm": 0.345703125, + "learning_rate": 0.0018520795295066314, + "loss": 0.0972, + "step": 18585 + }, + { + "epoch": 0.16133540507460872, + "grad_norm": 0.498046875, + "learning_rate": 0.0018520631814246554, + "loss": 0.1289, + "step": 18586 + }, + { + "epoch": 0.1613440855548129, + "grad_norm": 0.1220703125, + "learning_rate": 0.001852046832520234, + "loss": 0.1006, + "step": 18587 + }, + { + "epoch": 0.16135276603501705, + "grad_norm": 0.08935546875, + "learning_rate": 0.0018520304827933855, + "loss": 0.1123, + "step": 18588 + }, + { + "epoch": 0.16136144651522122, + "grad_norm": 1.0234375, + "learning_rate": 0.0018520141322441281, + "loss": 0.1504, + "step": 18589 + }, + { + "epoch": 0.16137012699542538, + "grad_norm": 0.2353515625, + "learning_rate": 0.001851997780872479, + "loss": 0.1035, + "step": 18590 + }, + { + "epoch": 0.16137880747562955, + "grad_norm": 0.19921875, + "learning_rate": 0.0018519814286784566, + "loss": 0.1191, + "step": 18591 + }, + { + "epoch": 0.1613874879558337, + "grad_norm": 0.69140625, + "learning_rate": 0.001851965075662079, + "loss": 0.1152, + "step": 18592 + }, + { + "epoch": 0.16139616843603788, + "grad_norm": 0.392578125, + "learning_rate": 0.001851948721823363, + "loss": 0.1089, + "step": 18593 + }, + { + "epoch": 0.16140484891624204, + "grad_norm": 0.279296875, + "learning_rate": 0.0018519323671623276, + "loss": 0.1191, + "step": 18594 + }, + { + "epoch": 0.1614135293964462, + "grad_norm": 0.3046875, + "learning_rate": 0.0018519160116789901, + "loss": 0.1221, + "step": 18595 + }, + { + "epoch": 0.16142220987665037, + "grad_norm": 0.70703125, + "learning_rate": 0.0018518996553733686, + "loss": 0.0918, + "step": 18596 + }, + { + "epoch": 0.16143089035685454, + "grad_norm": 0.1025390625, + "learning_rate": 0.0018518832982454807, + "loss": 0.1914, + "step": 18597 + }, + { + "epoch": 0.1614395708370587, + "grad_norm": 0.61328125, + "learning_rate": 0.0018518669402953448, + "loss": 0.1064, + "step": 18598 + }, + { + "epoch": 0.16144825131726287, + "grad_norm": 0.91796875, + "learning_rate": 0.0018518505815229786, + "loss": 0.1152, + "step": 18599 + }, + { + "epoch": 0.16145693179746703, + "grad_norm": 0.0927734375, + "learning_rate": 0.0018518342219283994, + "loss": 0.1006, + "step": 18600 + }, + { + "epoch": 0.1614656122776712, + "grad_norm": 0.8671875, + "learning_rate": 0.001851817861511626, + "loss": 0.0981, + "step": 18601 + }, + { + "epoch": 0.16147429275787537, + "grad_norm": 0.51953125, + "learning_rate": 0.001851801500272676, + "loss": 0.126, + "step": 18602 + }, + { + "epoch": 0.16148297323807953, + "grad_norm": 0.80078125, + "learning_rate": 0.0018517851382115669, + "loss": 
0.1357, + "step": 18603 + }, + { + "epoch": 0.1614916537182837, + "grad_norm": 0.1640625, + "learning_rate": 0.001851768775328317, + "loss": 0.1523, + "step": 18604 + }, + { + "epoch": 0.16150033419848786, + "grad_norm": 0.2138671875, + "learning_rate": 0.001851752411622944, + "loss": 0.0933, + "step": 18605 + }, + { + "epoch": 0.16150901467869203, + "grad_norm": 0.67578125, + "learning_rate": 0.0018517360470954659, + "loss": 0.0967, + "step": 18606 + }, + { + "epoch": 0.1615176951588962, + "grad_norm": 0.240234375, + "learning_rate": 0.0018517196817459003, + "loss": 0.1016, + "step": 18607 + }, + { + "epoch": 0.16152637563910036, + "grad_norm": 0.54296875, + "learning_rate": 0.0018517033155742658, + "loss": 0.1182, + "step": 18608 + }, + { + "epoch": 0.16153505611930452, + "grad_norm": 0.11767578125, + "learning_rate": 0.0018516869485805797, + "loss": 0.0923, + "step": 18609 + }, + { + "epoch": 0.16154373659950869, + "grad_norm": 0.2021484375, + "learning_rate": 0.0018516705807648597, + "loss": 0.1426, + "step": 18610 + }, + { + "epoch": 0.16155241707971285, + "grad_norm": 0.765625, + "learning_rate": 0.0018516542121271246, + "loss": 0.1797, + "step": 18611 + }, + { + "epoch": 0.16156109755991702, + "grad_norm": 0.396484375, + "learning_rate": 0.0018516378426673915, + "loss": 0.1074, + "step": 18612 + }, + { + "epoch": 0.16156977804012118, + "grad_norm": 0.78125, + "learning_rate": 0.0018516214723856784, + "loss": 0.0986, + "step": 18613 + }, + { + "epoch": 0.16157845852032535, + "grad_norm": 0.16796875, + "learning_rate": 0.0018516051012820035, + "loss": 0.0952, + "step": 18614 + }, + { + "epoch": 0.1615871390005295, + "grad_norm": 0.08056640625, + "learning_rate": 0.0018515887293563845, + "loss": 0.0796, + "step": 18615 + }, + { + "epoch": 0.16159581948073368, + "grad_norm": 0.56640625, + "learning_rate": 0.0018515723566088394, + "loss": 0.0981, + "step": 18616 + }, + { + "epoch": 0.16160449996093784, + "grad_norm": 0.1806640625, + "learning_rate": 0.0018515559830393857, + "loss": 0.1016, + "step": 18617 + }, + { + "epoch": 0.161613180441142, + "grad_norm": 0.1748046875, + "learning_rate": 0.0018515396086480423, + "loss": 0.1128, + "step": 18618 + }, + { + "epoch": 0.16162186092134617, + "grad_norm": 0.126953125, + "learning_rate": 0.0018515232334348263, + "loss": 0.1045, + "step": 18619 + }, + { + "epoch": 0.16163054140155034, + "grad_norm": 0.146484375, + "learning_rate": 0.0018515068573997555, + "loss": 0.1543, + "step": 18620 + }, + { + "epoch": 0.1616392218817545, + "grad_norm": 0.263671875, + "learning_rate": 0.0018514904805428483, + "loss": 0.1318, + "step": 18621 + }, + { + "epoch": 0.16164790236195867, + "grad_norm": 0.41796875, + "learning_rate": 0.0018514741028641223, + "loss": 0.0815, + "step": 18622 + }, + { + "epoch": 0.16165658284216283, + "grad_norm": 0.203125, + "learning_rate": 0.0018514577243635957, + "loss": 0.0957, + "step": 18623 + }, + { + "epoch": 0.161665263322367, + "grad_norm": 0.291015625, + "learning_rate": 0.001851441345041286, + "loss": 0.1138, + "step": 18624 + }, + { + "epoch": 0.16167394380257116, + "grad_norm": 0.099609375, + "learning_rate": 0.0018514249648972116, + "loss": 0.0933, + "step": 18625 + }, + { + "epoch": 0.16168262428277533, + "grad_norm": 0.53125, + "learning_rate": 0.00185140858393139, + "loss": 0.1206, + "step": 18626 + }, + { + "epoch": 0.1616913047629795, + "grad_norm": 0.1162109375, + "learning_rate": 0.0018513922021438389, + "loss": 0.0933, + "step": 18627 + }, + { + "epoch": 0.16169998524318366, + "grad_norm": 0.0830078125, + 
"learning_rate": 0.0018513758195345769, + "loss": 0.1182, + "step": 18628 + }, + { + "epoch": 0.16170866572338782, + "grad_norm": 0.0888671875, + "learning_rate": 0.0018513594361036219, + "loss": 0.1094, + "step": 18629 + }, + { + "epoch": 0.161717346203592, + "grad_norm": 0.12451171875, + "learning_rate": 0.0018513430518509913, + "loss": 0.1245, + "step": 18630 + }, + { + "epoch": 0.16172602668379615, + "grad_norm": 0.09130859375, + "learning_rate": 0.001851326666776703, + "loss": 0.1162, + "step": 18631 + }, + { + "epoch": 0.16173470716400032, + "grad_norm": 0.16796875, + "learning_rate": 0.0018513102808807753, + "loss": 0.1387, + "step": 18632 + }, + { + "epoch": 0.16174338764420448, + "grad_norm": 0.3125, + "learning_rate": 0.001851293894163226, + "loss": 0.1021, + "step": 18633 + }, + { + "epoch": 0.16175206812440865, + "grad_norm": 0.10400390625, + "learning_rate": 0.0018512775066240727, + "loss": 0.1182, + "step": 18634 + }, + { + "epoch": 0.16176074860461281, + "grad_norm": 0.072265625, + "learning_rate": 0.001851261118263334, + "loss": 0.0903, + "step": 18635 + }, + { + "epoch": 0.16176942908481698, + "grad_norm": 0.0966796875, + "learning_rate": 0.0018512447290810275, + "loss": 0.1187, + "step": 18636 + }, + { + "epoch": 0.16177810956502114, + "grad_norm": 0.56640625, + "learning_rate": 0.0018512283390771705, + "loss": 0.127, + "step": 18637 + }, + { + "epoch": 0.1617867900452253, + "grad_norm": 0.53125, + "learning_rate": 0.001851211948251782, + "loss": 0.0903, + "step": 18638 + }, + { + "epoch": 0.16179547052542947, + "grad_norm": 0.1328125, + "learning_rate": 0.0018511955566048793, + "loss": 0.1152, + "step": 18639 + }, + { + "epoch": 0.16180415100563364, + "grad_norm": 0.3125, + "learning_rate": 0.0018511791641364803, + "loss": 0.1221, + "step": 18640 + }, + { + "epoch": 0.1618128314858378, + "grad_norm": 0.74609375, + "learning_rate": 0.0018511627708466033, + "loss": 0.084, + "step": 18641 + }, + { + "epoch": 0.16182151196604197, + "grad_norm": 0.2451171875, + "learning_rate": 0.0018511463767352655, + "loss": 0.0938, + "step": 18642 + }, + { + "epoch": 0.16183019244624613, + "grad_norm": 0.0830078125, + "learning_rate": 0.0018511299818024856, + "loss": 0.0776, + "step": 18643 + }, + { + "epoch": 0.1618388729264503, + "grad_norm": 0.09814453125, + "learning_rate": 0.001851113586048281, + "loss": 0.103, + "step": 18644 + }, + { + "epoch": 0.16184755340665447, + "grad_norm": 0.083984375, + "learning_rate": 0.0018510971894726703, + "loss": 0.124, + "step": 18645 + }, + { + "epoch": 0.16185623388685863, + "grad_norm": 0.30078125, + "learning_rate": 0.0018510807920756706, + "loss": 0.1592, + "step": 18646 + }, + { + "epoch": 0.1618649143670628, + "grad_norm": 0.41796875, + "learning_rate": 0.0018510643938573006, + "loss": 0.1167, + "step": 18647 + }, + { + "epoch": 0.16187359484726696, + "grad_norm": 0.9609375, + "learning_rate": 0.001851047994817578, + "loss": 0.1289, + "step": 18648 + }, + { + "epoch": 0.16188227532747113, + "grad_norm": 0.1552734375, + "learning_rate": 0.0018510315949565198, + "loss": 0.1289, + "step": 18649 + }, + { + "epoch": 0.1618909558076753, + "grad_norm": 0.0908203125, + "learning_rate": 0.001851015194274145, + "loss": 0.1533, + "step": 18650 + }, + { + "epoch": 0.16189963628787946, + "grad_norm": 0.609375, + "learning_rate": 0.0018509987927704716, + "loss": 0.1206, + "step": 18651 + }, + { + "epoch": 0.16190831676808362, + "grad_norm": 0.10205078125, + "learning_rate": 0.0018509823904455174, + "loss": 0.0796, + "step": 18652 + }, + { + "epoch": 
0.1619169972482878, + "grad_norm": 0.1708984375, + "learning_rate": 0.0018509659872992996, + "loss": 0.1152, + "step": 18653 + }, + { + "epoch": 0.16192567772849195, + "grad_norm": 0.11181640625, + "learning_rate": 0.001850949583331837, + "loss": 0.124, + "step": 18654 + }, + { + "epoch": 0.16193435820869612, + "grad_norm": 0.26953125, + "learning_rate": 0.001850933178543147, + "loss": 0.0854, + "step": 18655 + }, + { + "epoch": 0.16194303868890028, + "grad_norm": 0.275390625, + "learning_rate": 0.001850916772933248, + "loss": 0.1562, + "step": 18656 + }, + { + "epoch": 0.16195171916910445, + "grad_norm": 0.29296875, + "learning_rate": 0.0018509003665021576, + "loss": 0.1172, + "step": 18657 + }, + { + "epoch": 0.1619603996493086, + "grad_norm": 0.392578125, + "learning_rate": 0.001850883959249894, + "loss": 0.1201, + "step": 18658 + }, + { + "epoch": 0.16196908012951278, + "grad_norm": 0.296875, + "learning_rate": 0.0018508675511764749, + "loss": 0.1064, + "step": 18659 + }, + { + "epoch": 0.16197776060971694, + "grad_norm": 0.5, + "learning_rate": 0.0018508511422819182, + "loss": 0.127, + "step": 18660 + }, + { + "epoch": 0.1619864410899211, + "grad_norm": 0.0791015625, + "learning_rate": 0.0018508347325662418, + "loss": 0.0757, + "step": 18661 + }, + { + "epoch": 0.16199512157012527, + "grad_norm": 0.09375, + "learning_rate": 0.0018508183220294638, + "loss": 0.0796, + "step": 18662 + }, + { + "epoch": 0.16200380205032944, + "grad_norm": 0.373046875, + "learning_rate": 0.0018508019106716026, + "loss": 0.123, + "step": 18663 + }, + { + "epoch": 0.1620124825305336, + "grad_norm": 0.51171875, + "learning_rate": 0.0018507854984926753, + "loss": 0.1104, + "step": 18664 + }, + { + "epoch": 0.16202116301073777, + "grad_norm": 0.6328125, + "learning_rate": 0.0018507690854927005, + "loss": 0.0825, + "step": 18665 + }, + { + "epoch": 0.16202984349094193, + "grad_norm": 0.1826171875, + "learning_rate": 0.001850752671671696, + "loss": 0.0967, + "step": 18666 + }, + { + "epoch": 0.16203852397114607, + "grad_norm": 0.388671875, + "learning_rate": 0.0018507362570296792, + "loss": 0.0757, + "step": 18667 + }, + { + "epoch": 0.16204720445135024, + "grad_norm": 0.49609375, + "learning_rate": 0.001850719841566669, + "loss": 0.1064, + "step": 18668 + }, + { + "epoch": 0.1620558849315544, + "grad_norm": 0.326171875, + "learning_rate": 0.0018507034252826828, + "loss": 0.1123, + "step": 18669 + }, + { + "epoch": 0.16206456541175857, + "grad_norm": 0.318359375, + "learning_rate": 0.0018506870081777382, + "loss": 0.0869, + "step": 18670 + }, + { + "epoch": 0.16207324589196273, + "grad_norm": 0.255859375, + "learning_rate": 0.0018506705902518541, + "loss": 0.1562, + "step": 18671 + }, + { + "epoch": 0.1620819263721669, + "grad_norm": 0.375, + "learning_rate": 0.0018506541715050478, + "loss": 0.1367, + "step": 18672 + }, + { + "epoch": 0.16209060685237106, + "grad_norm": 0.388671875, + "learning_rate": 0.0018506377519373373, + "loss": 0.125, + "step": 18673 + }, + { + "epoch": 0.16209928733257523, + "grad_norm": 0.53515625, + "learning_rate": 0.0018506213315487405, + "loss": 0.1016, + "step": 18674 + }, + { + "epoch": 0.1621079678127794, + "grad_norm": 0.45703125, + "learning_rate": 0.0018506049103392758, + "loss": 0.105, + "step": 18675 + }, + { + "epoch": 0.16211664829298356, + "grad_norm": 0.09375, + "learning_rate": 0.0018505884883089606, + "loss": 0.127, + "step": 18676 + }, + { + "epoch": 0.16212532877318772, + "grad_norm": 0.42578125, + "learning_rate": 0.0018505720654578128, + "loss": 0.0747, + "step": 
18677 + }, + { + "epoch": 0.1621340092533919, + "grad_norm": 0.251953125, + "learning_rate": 0.001850555641785851, + "loss": 0.0942, + "step": 18678 + }, + { + "epoch": 0.16214268973359605, + "grad_norm": 0.333984375, + "learning_rate": 0.0018505392172930928, + "loss": 0.125, + "step": 18679 + }, + { + "epoch": 0.16215137021380022, + "grad_norm": 0.0830078125, + "learning_rate": 0.0018505227919795564, + "loss": 0.104, + "step": 18680 + }, + { + "epoch": 0.16216005069400438, + "grad_norm": 0.65625, + "learning_rate": 0.0018505063658452595, + "loss": 0.1221, + "step": 18681 + }, + { + "epoch": 0.16216873117420855, + "grad_norm": 0.447265625, + "learning_rate": 0.0018504899388902196, + "loss": 0.1094, + "step": 18682 + }, + { + "epoch": 0.1621774116544127, + "grad_norm": 0.2041015625, + "learning_rate": 0.0018504735111144557, + "loss": 0.0918, + "step": 18683 + }, + { + "epoch": 0.16218609213461688, + "grad_norm": 0.34375, + "learning_rate": 0.001850457082517985, + "loss": 0.0928, + "step": 18684 + }, + { + "epoch": 0.16219477261482104, + "grad_norm": 0.18359375, + "learning_rate": 0.001850440653100826, + "loss": 0.1318, + "step": 18685 + }, + { + "epoch": 0.1622034530950252, + "grad_norm": 0.10498046875, + "learning_rate": 0.001850424222862996, + "loss": 0.1338, + "step": 18686 + }, + { + "epoch": 0.16221213357522937, + "grad_norm": 1.359375, + "learning_rate": 0.001850407791804514, + "loss": 0.1523, + "step": 18687 + }, + { + "epoch": 0.16222081405543354, + "grad_norm": 0.53515625, + "learning_rate": 0.0018503913599253966, + "loss": 0.1099, + "step": 18688 + }, + { + "epoch": 0.1622294945356377, + "grad_norm": 0.126953125, + "learning_rate": 0.001850374927225663, + "loss": 0.0815, + "step": 18689 + }, + { + "epoch": 0.16223817501584187, + "grad_norm": 0.447265625, + "learning_rate": 0.0018503584937053304, + "loss": 0.1621, + "step": 18690 + }, + { + "epoch": 0.16224685549604603, + "grad_norm": 0.1962890625, + "learning_rate": 0.0018503420593644169, + "loss": 0.085, + "step": 18691 + }, + { + "epoch": 0.1622555359762502, + "grad_norm": 0.19140625, + "learning_rate": 0.0018503256242029407, + "loss": 0.0728, + "step": 18692 + }, + { + "epoch": 0.16226421645645436, + "grad_norm": 0.98046875, + "learning_rate": 0.0018503091882209196, + "loss": 0.1123, + "step": 18693 + }, + { + "epoch": 0.16227289693665853, + "grad_norm": 0.38671875, + "learning_rate": 0.0018502927514183719, + "loss": 0.1445, + "step": 18694 + }, + { + "epoch": 0.1622815774168627, + "grad_norm": 0.1767578125, + "learning_rate": 0.0018502763137953151, + "loss": 0.1045, + "step": 18695 + }, + { + "epoch": 0.16229025789706686, + "grad_norm": 0.37109375, + "learning_rate": 0.0018502598753517673, + "loss": 0.1641, + "step": 18696 + }, + { + "epoch": 0.16229893837727102, + "grad_norm": 0.15625, + "learning_rate": 0.001850243436087747, + "loss": 0.0986, + "step": 18697 + }, + { + "epoch": 0.1623076188574752, + "grad_norm": 0.275390625, + "learning_rate": 0.0018502269960032713, + "loss": 0.1055, + "step": 18698 + }, + { + "epoch": 0.16231629933767935, + "grad_norm": 0.263671875, + "learning_rate": 0.0018502105550983587, + "loss": 0.0923, + "step": 18699 + }, + { + "epoch": 0.16232497981788352, + "grad_norm": 0.0859375, + "learning_rate": 0.0018501941133730274, + "loss": 0.0874, + "step": 18700 + }, + { + "epoch": 0.16233366029808768, + "grad_norm": 0.259765625, + "learning_rate": 0.001850177670827295, + "loss": 0.0835, + "step": 18701 + }, + { + "epoch": 0.16234234077829185, + "grad_norm": 0.640625, + "learning_rate": 
0.0018501612274611791, + "loss": 0.0869, + "step": 18702 + }, + { + "epoch": 0.16235102125849601, + "grad_norm": 0.1708984375, + "learning_rate": 0.0018501447832746988, + "loss": 0.123, + "step": 18703 + }, + { + "epoch": 0.16235970173870018, + "grad_norm": 0.267578125, + "learning_rate": 0.001850128338267871, + "loss": 0.0869, + "step": 18704 + }, + { + "epoch": 0.16236838221890434, + "grad_norm": 0.1884765625, + "learning_rate": 0.0018501118924407144, + "loss": 0.085, + "step": 18705 + }, + { + "epoch": 0.1623770626991085, + "grad_norm": 0.09814453125, + "learning_rate": 0.0018500954457932467, + "loss": 0.0859, + "step": 18706 + }, + { + "epoch": 0.16238574317931267, + "grad_norm": 0.10400390625, + "learning_rate": 0.0018500789983254857, + "loss": 0.1133, + "step": 18707 + }, + { + "epoch": 0.16239442365951684, + "grad_norm": 0.1865234375, + "learning_rate": 0.0018500625500374498, + "loss": 0.1426, + "step": 18708 + }, + { + "epoch": 0.162403104139721, + "grad_norm": 0.27734375, + "learning_rate": 0.0018500461009291562, + "loss": 0.0938, + "step": 18709 + }, + { + "epoch": 0.16241178461992517, + "grad_norm": 0.50390625, + "learning_rate": 0.0018500296510006243, + "loss": 0.1104, + "step": 18710 + }, + { + "epoch": 0.16242046510012934, + "grad_norm": 0.2431640625, + "learning_rate": 0.0018500132002518706, + "loss": 0.0952, + "step": 18711 + }, + { + "epoch": 0.1624291455803335, + "grad_norm": 0.3515625, + "learning_rate": 0.0018499967486829139, + "loss": 0.1289, + "step": 18712 + }, + { + "epoch": 0.16243782606053767, + "grad_norm": 0.41796875, + "learning_rate": 0.0018499802962937723, + "loss": 0.0884, + "step": 18713 + }, + { + "epoch": 0.16244650654074183, + "grad_norm": 0.224609375, + "learning_rate": 0.0018499638430844632, + "loss": 0.085, + "step": 18714 + }, + { + "epoch": 0.162455187020946, + "grad_norm": 0.0625, + "learning_rate": 0.001849947389055005, + "loss": 0.084, + "step": 18715 + }, + { + "epoch": 0.16246386750115016, + "grad_norm": 0.97265625, + "learning_rate": 0.0018499309342054156, + "loss": 0.1309, + "step": 18716 + }, + { + "epoch": 0.16247254798135433, + "grad_norm": 0.8515625, + "learning_rate": 0.0018499144785357133, + "loss": 0.0908, + "step": 18717 + }, + { + "epoch": 0.1624812284615585, + "grad_norm": 0.166015625, + "learning_rate": 0.0018498980220459152, + "loss": 0.0972, + "step": 18718 + }, + { + "epoch": 0.16248990894176266, + "grad_norm": 0.375, + "learning_rate": 0.00184988156473604, + "loss": 0.1494, + "step": 18719 + }, + { + "epoch": 0.16249858942196682, + "grad_norm": 0.10498046875, + "learning_rate": 0.0018498651066061061, + "loss": 0.1147, + "step": 18720 + }, + { + "epoch": 0.162507269902171, + "grad_norm": 0.11767578125, + "learning_rate": 0.0018498486476561304, + "loss": 0.1465, + "step": 18721 + }, + { + "epoch": 0.16251595038237515, + "grad_norm": 0.1357421875, + "learning_rate": 0.0018498321878861318, + "loss": 0.1123, + "step": 18722 + }, + { + "epoch": 0.16252463086257932, + "grad_norm": 0.84375, + "learning_rate": 0.0018498157272961277, + "loss": 0.0908, + "step": 18723 + }, + { + "epoch": 0.16253331134278348, + "grad_norm": 0.091796875, + "learning_rate": 0.0018497992658861367, + "loss": 0.1387, + "step": 18724 + }, + { + "epoch": 0.16254199182298765, + "grad_norm": 0.734375, + "learning_rate": 0.0018497828036561767, + "loss": 0.1055, + "step": 18725 + }, + { + "epoch": 0.1625506723031918, + "grad_norm": 0.1083984375, + "learning_rate": 0.001849766340606265, + "loss": 0.0781, + "step": 18726 + }, + { + "epoch": 0.16255935278339598, + 
"grad_norm": 0.11279296875, + "learning_rate": 0.0018497498767364206, + "loss": 0.0962, + "step": 18727 + }, + { + "epoch": 0.16256803326360014, + "grad_norm": 0.5390625, + "learning_rate": 0.0018497334120466604, + "loss": 0.0742, + "step": 18728 + }, + { + "epoch": 0.1625767137438043, + "grad_norm": 0.97265625, + "learning_rate": 0.0018497169465370035, + "loss": 0.1367, + "step": 18729 + }, + { + "epoch": 0.16258539422400847, + "grad_norm": 0.9453125, + "learning_rate": 0.0018497004802074672, + "loss": 0.1191, + "step": 18730 + }, + { + "epoch": 0.16259407470421264, + "grad_norm": 0.78125, + "learning_rate": 0.0018496840130580695, + "loss": 0.0903, + "step": 18731 + }, + { + "epoch": 0.1626027551844168, + "grad_norm": 0.8828125, + "learning_rate": 0.001849667545088829, + "loss": 0.0947, + "step": 18732 + }, + { + "epoch": 0.16261143566462097, + "grad_norm": 0.7734375, + "learning_rate": 0.001849651076299763, + "loss": 0.1055, + "step": 18733 + }, + { + "epoch": 0.16262011614482513, + "grad_norm": 0.53515625, + "learning_rate": 0.00184963460669089, + "loss": 0.0869, + "step": 18734 + }, + { + "epoch": 0.1626287966250293, + "grad_norm": 0.8359375, + "learning_rate": 0.001849618136262228, + "loss": 0.0977, + "step": 18735 + }, + { + "epoch": 0.16263747710523346, + "grad_norm": 0.306640625, + "learning_rate": 0.0018496016650137949, + "loss": 0.1357, + "step": 18736 + }, + { + "epoch": 0.16264615758543763, + "grad_norm": 0.423828125, + "learning_rate": 0.0018495851929456084, + "loss": 0.1128, + "step": 18737 + }, + { + "epoch": 0.1626548380656418, + "grad_norm": 0.212890625, + "learning_rate": 0.0018495687200576869, + "loss": 0.123, + "step": 18738 + }, + { + "epoch": 0.16266351854584596, + "grad_norm": 0.244140625, + "learning_rate": 0.0018495522463500484, + "loss": 0.1416, + "step": 18739 + }, + { + "epoch": 0.16267219902605012, + "grad_norm": 0.77734375, + "learning_rate": 0.0018495357718227108, + "loss": 0.0879, + "step": 18740 + }, + { + "epoch": 0.1626808795062543, + "grad_norm": 1.0859375, + "learning_rate": 0.001849519296475692, + "loss": 0.1064, + "step": 18741 + }, + { + "epoch": 0.16268955998645845, + "grad_norm": 0.59765625, + "learning_rate": 0.0018495028203090104, + "loss": 0.1025, + "step": 18742 + }, + { + "epoch": 0.16269824046666262, + "grad_norm": 0.3359375, + "learning_rate": 0.0018494863433226837, + "loss": 0.1426, + "step": 18743 + }, + { + "epoch": 0.16270692094686678, + "grad_norm": 0.61328125, + "learning_rate": 0.00184946986551673, + "loss": 0.1162, + "step": 18744 + }, + { + "epoch": 0.16271560142707095, + "grad_norm": 0.57421875, + "learning_rate": 0.0018494533868911674, + "loss": 0.0928, + "step": 18745 + }, + { + "epoch": 0.16272428190727511, + "grad_norm": 0.1533203125, + "learning_rate": 0.0018494369074460136, + "loss": 0.1221, + "step": 18746 + }, + { + "epoch": 0.16273296238747928, + "grad_norm": 0.2412109375, + "learning_rate": 0.001849420427181287, + "loss": 0.125, + "step": 18747 + }, + { + "epoch": 0.16274164286768344, + "grad_norm": 0.2412109375, + "learning_rate": 0.0018494039460970053, + "loss": 0.1113, + "step": 18748 + }, + { + "epoch": 0.1627503233478876, + "grad_norm": 0.1142578125, + "learning_rate": 0.001849387464193187, + "loss": 0.1235, + "step": 18749 + }, + { + "epoch": 0.16275900382809178, + "grad_norm": 0.1552734375, + "learning_rate": 0.0018493709814698493, + "loss": 0.0928, + "step": 18750 + }, + { + "epoch": 0.16276768430829594, + "grad_norm": 0.1025390625, + "learning_rate": 0.0018493544979270114, + "loss": 0.1143, + "step": 18751 + }, + 
{ + "epoch": 0.1627763647885001, + "grad_norm": 1.28125, + "learning_rate": 0.0018493380135646905, + "loss": 0.1816, + "step": 18752 + }, + { + "epoch": 0.16278504526870427, + "grad_norm": 0.42578125, + "learning_rate": 0.0018493215283829046, + "loss": 0.0801, + "step": 18753 + }, + { + "epoch": 0.16279372574890844, + "grad_norm": 0.8203125, + "learning_rate": 0.001849305042381672, + "loss": 0.124, + "step": 18754 + }, + { + "epoch": 0.1628024062291126, + "grad_norm": 0.330078125, + "learning_rate": 0.0018492885555610105, + "loss": 0.1064, + "step": 18755 + }, + { + "epoch": 0.16281108670931677, + "grad_norm": 0.60546875, + "learning_rate": 0.0018492720679209387, + "loss": 0.1426, + "step": 18756 + }, + { + "epoch": 0.16281976718952093, + "grad_norm": 0.3984375, + "learning_rate": 0.001849255579461474, + "loss": 0.1445, + "step": 18757 + }, + { + "epoch": 0.1628284476697251, + "grad_norm": 0.16796875, + "learning_rate": 0.0018492390901826347, + "loss": 0.1475, + "step": 18758 + }, + { + "epoch": 0.16283712814992926, + "grad_norm": 0.158203125, + "learning_rate": 0.0018492226000844387, + "loss": 0.1201, + "step": 18759 + }, + { + "epoch": 0.16284580863013343, + "grad_norm": 0.1328125, + "learning_rate": 0.0018492061091669044, + "loss": 0.0898, + "step": 18760 + }, + { + "epoch": 0.1628544891103376, + "grad_norm": 0.09423828125, + "learning_rate": 0.0018491896174300491, + "loss": 0.085, + "step": 18761 + }, + { + "epoch": 0.16286316959054176, + "grad_norm": 0.61328125, + "learning_rate": 0.0018491731248738916, + "loss": 0.1191, + "step": 18762 + }, + { + "epoch": 0.16287185007074592, + "grad_norm": 0.5078125, + "learning_rate": 0.00184915663149845, + "loss": 0.1318, + "step": 18763 + }, + { + "epoch": 0.1628805305509501, + "grad_norm": 0.224609375, + "learning_rate": 0.0018491401373037413, + "loss": 0.1147, + "step": 18764 + }, + { + "epoch": 0.16288921103115425, + "grad_norm": 0.291015625, + "learning_rate": 0.0018491236422897843, + "loss": 0.1318, + "step": 18765 + }, + { + "epoch": 0.16289789151135842, + "grad_norm": 0.12255859375, + "learning_rate": 0.001849107146456597, + "loss": 0.1484, + "step": 18766 + }, + { + "epoch": 0.16290657199156258, + "grad_norm": 0.265625, + "learning_rate": 0.0018490906498041977, + "loss": 0.1113, + "step": 18767 + }, + { + "epoch": 0.16291525247176675, + "grad_norm": 0.28125, + "learning_rate": 0.0018490741523326038, + "loss": 0.124, + "step": 18768 + }, + { + "epoch": 0.1629239329519709, + "grad_norm": 0.1591796875, + "learning_rate": 0.0018490576540418336, + "loss": 0.1084, + "step": 18769 + }, + { + "epoch": 0.16293261343217508, + "grad_norm": 0.126953125, + "learning_rate": 0.0018490411549319055, + "loss": 0.1299, + "step": 18770 + }, + { + "epoch": 0.16294129391237924, + "grad_norm": 0.5078125, + "learning_rate": 0.0018490246550028368, + "loss": 0.1299, + "step": 18771 + }, + { + "epoch": 0.1629499743925834, + "grad_norm": 0.1611328125, + "learning_rate": 0.0018490081542546465, + "loss": 0.1206, + "step": 18772 + }, + { + "epoch": 0.16295865487278757, + "grad_norm": 0.51171875, + "learning_rate": 0.0018489916526873518, + "loss": 0.1235, + "step": 18773 + }, + { + "epoch": 0.16296733535299174, + "grad_norm": 0.20703125, + "learning_rate": 0.0018489751503009712, + "loss": 0.1235, + "step": 18774 + }, + { + "epoch": 0.1629760158331959, + "grad_norm": 0.69140625, + "learning_rate": 0.001848958647095523, + "loss": 0.1035, + "step": 18775 + }, + { + "epoch": 0.16298469631340007, + "grad_norm": 0.263671875, + "learning_rate": 0.0018489421430710243, + "loss": 
0.1113, + "step": 18776 + }, + { + "epoch": 0.16299337679360423, + "grad_norm": 0.0908203125, + "learning_rate": 0.001848925638227494, + "loss": 0.1338, + "step": 18777 + }, + { + "epoch": 0.1630020572738084, + "grad_norm": 0.1044921875, + "learning_rate": 0.0018489091325649496, + "loss": 0.1348, + "step": 18778 + }, + { + "epoch": 0.16301073775401256, + "grad_norm": 0.474609375, + "learning_rate": 0.0018488926260834095, + "loss": 0.1592, + "step": 18779 + }, + { + "epoch": 0.16301941823421673, + "grad_norm": 0.73828125, + "learning_rate": 0.0018488761187828919, + "loss": 0.1133, + "step": 18780 + }, + { + "epoch": 0.1630280987144209, + "grad_norm": 0.3671875, + "learning_rate": 0.0018488596106634148, + "loss": 0.0771, + "step": 18781 + }, + { + "epoch": 0.16303677919462506, + "grad_norm": 0.12890625, + "learning_rate": 0.0018488431017249958, + "loss": 0.1123, + "step": 18782 + }, + { + "epoch": 0.16304545967482922, + "grad_norm": 0.17578125, + "learning_rate": 0.001848826591967653, + "loss": 0.0981, + "step": 18783 + }, + { + "epoch": 0.1630541401550334, + "grad_norm": 0.0849609375, + "learning_rate": 0.0018488100813914053, + "loss": 0.0854, + "step": 18784 + }, + { + "epoch": 0.16306282063523755, + "grad_norm": 0.3203125, + "learning_rate": 0.0018487935699962696, + "loss": 0.1201, + "step": 18785 + }, + { + "epoch": 0.16307150111544172, + "grad_norm": 0.10986328125, + "learning_rate": 0.0018487770577822647, + "loss": 0.1562, + "step": 18786 + }, + { + "epoch": 0.16308018159564588, + "grad_norm": 0.337890625, + "learning_rate": 0.0018487605447494084, + "loss": 0.1572, + "step": 18787 + }, + { + "epoch": 0.16308886207585005, + "grad_norm": 0.62109375, + "learning_rate": 0.0018487440308977189, + "loss": 0.1738, + "step": 18788 + }, + { + "epoch": 0.16309754255605421, + "grad_norm": 0.1181640625, + "learning_rate": 0.0018487275162272138, + "loss": 0.1245, + "step": 18789 + }, + { + "epoch": 0.16310622303625835, + "grad_norm": 0.671875, + "learning_rate": 0.0018487110007379122, + "loss": 0.124, + "step": 18790 + }, + { + "epoch": 0.16311490351646252, + "grad_norm": 0.30078125, + "learning_rate": 0.001848694484429831, + "loss": 0.0986, + "step": 18791 + }, + { + "epoch": 0.16312358399666668, + "grad_norm": 0.33984375, + "learning_rate": 0.0018486779673029892, + "loss": 0.1162, + "step": 18792 + }, + { + "epoch": 0.16313226447687085, + "grad_norm": 0.08837890625, + "learning_rate": 0.001848661449357404, + "loss": 0.1094, + "step": 18793 + }, + { + "epoch": 0.163140944957075, + "grad_norm": 0.25, + "learning_rate": 0.0018486449305930943, + "loss": 0.1152, + "step": 18794 + }, + { + "epoch": 0.16314962543727918, + "grad_norm": 0.35546875, + "learning_rate": 0.0018486284110100773, + "loss": 0.1357, + "step": 18795 + }, + { + "epoch": 0.16315830591748334, + "grad_norm": 0.400390625, + "learning_rate": 0.0018486118906083718, + "loss": 0.1182, + "step": 18796 + }, + { + "epoch": 0.1631669863976875, + "grad_norm": 0.08349609375, + "learning_rate": 0.0018485953693879956, + "loss": 0.0688, + "step": 18797 + }, + { + "epoch": 0.16317566687789167, + "grad_norm": 0.25, + "learning_rate": 0.0018485788473489666, + "loss": 0.0918, + "step": 18798 + }, + { + "epoch": 0.16318434735809584, + "grad_norm": 0.11669921875, + "learning_rate": 0.001848562324491303, + "loss": 0.0972, + "step": 18799 + }, + { + "epoch": 0.1631930278383, + "grad_norm": 0.10888671875, + "learning_rate": 0.0018485458008150229, + "loss": 0.1006, + "step": 18800 + }, + { + "epoch": 0.16320170831850417, + "grad_norm": 0.314453125, + 
"learning_rate": 0.0018485292763201445, + "loss": 0.1289, + "step": 18801 + }, + { + "epoch": 0.16321038879870833, + "grad_norm": 0.58984375, + "learning_rate": 0.0018485127510066857, + "loss": 0.1094, + "step": 18802 + }, + { + "epoch": 0.1632190692789125, + "grad_norm": 0.486328125, + "learning_rate": 0.0018484962248746645, + "loss": 0.1377, + "step": 18803 + }, + { + "epoch": 0.16322774975911666, + "grad_norm": 0.06396484375, + "learning_rate": 0.0018484796979240992, + "loss": 0.0703, + "step": 18804 + }, + { + "epoch": 0.16323643023932083, + "grad_norm": 0.1337890625, + "learning_rate": 0.0018484631701550078, + "loss": 0.1357, + "step": 18805 + }, + { + "epoch": 0.163245110719525, + "grad_norm": 0.45703125, + "learning_rate": 0.001848446641567408, + "loss": 0.0923, + "step": 18806 + }, + { + "epoch": 0.16325379119972916, + "grad_norm": 0.65625, + "learning_rate": 0.0018484301121613185, + "loss": 0.168, + "step": 18807 + }, + { + "epoch": 0.16326247167993332, + "grad_norm": 0.1015625, + "learning_rate": 0.001848413581936757, + "loss": 0.1074, + "step": 18808 + }, + { + "epoch": 0.1632711521601375, + "grad_norm": 0.82421875, + "learning_rate": 0.0018483970508937418, + "loss": 0.1367, + "step": 18809 + }, + { + "epoch": 0.16327983264034165, + "grad_norm": 0.162109375, + "learning_rate": 0.0018483805190322904, + "loss": 0.1152, + "step": 18810 + }, + { + "epoch": 0.16328851312054582, + "grad_norm": 0.057373046875, + "learning_rate": 0.0018483639863524216, + "loss": 0.0659, + "step": 18811 + }, + { + "epoch": 0.16329719360074998, + "grad_norm": 0.2109375, + "learning_rate": 0.001848347452854153, + "loss": 0.1191, + "step": 18812 + }, + { + "epoch": 0.16330587408095415, + "grad_norm": 0.1240234375, + "learning_rate": 0.0018483309185375032, + "loss": 0.1553, + "step": 18813 + }, + { + "epoch": 0.16331455456115831, + "grad_norm": 0.080078125, + "learning_rate": 0.0018483143834024897, + "loss": 0.0933, + "step": 18814 + }, + { + "epoch": 0.16332323504136248, + "grad_norm": 0.07568359375, + "learning_rate": 0.0018482978474491309, + "loss": 0.0967, + "step": 18815 + }, + { + "epoch": 0.16333191552156665, + "grad_norm": 0.2333984375, + "learning_rate": 0.0018482813106774447, + "loss": 0.1064, + "step": 18816 + }, + { + "epoch": 0.1633405960017708, + "grad_norm": 0.12255859375, + "learning_rate": 0.0018482647730874496, + "loss": 0.1113, + "step": 18817 + }, + { + "epoch": 0.16334927648197498, + "grad_norm": 0.828125, + "learning_rate": 0.001848248234679163, + "loss": 0.1162, + "step": 18818 + }, + { + "epoch": 0.16335795696217914, + "grad_norm": 0.0888671875, + "learning_rate": 0.0018482316954526038, + "loss": 0.0898, + "step": 18819 + }, + { + "epoch": 0.1633666374423833, + "grad_norm": 0.4375, + "learning_rate": 0.0018482151554077896, + "loss": 0.1445, + "step": 18820 + }, + { + "epoch": 0.16337531792258747, + "grad_norm": 0.7265625, + "learning_rate": 0.0018481986145447383, + "loss": 0.0825, + "step": 18821 + }, + { + "epoch": 0.16338399840279164, + "grad_norm": 1.203125, + "learning_rate": 0.0018481820728634684, + "loss": 0.1064, + "step": 18822 + }, + { + "epoch": 0.1633926788829958, + "grad_norm": 1.46875, + "learning_rate": 0.0018481655303639976, + "loss": 0.1104, + "step": 18823 + }, + { + "epoch": 0.16340135936319997, + "grad_norm": 0.2431640625, + "learning_rate": 0.0018481489870463444, + "loss": 0.126, + "step": 18824 + }, + { + "epoch": 0.16341003984340413, + "grad_norm": 0.13671875, + "learning_rate": 0.0018481324429105268, + "loss": 0.0947, + "step": 18825 + }, + { + "epoch": 
0.1634187203236083, + "grad_norm": 0.79296875, + "learning_rate": 0.0018481158979565624, + "loss": 0.1602, + "step": 18826 + }, + { + "epoch": 0.16342740080381246, + "grad_norm": 0.1796875, + "learning_rate": 0.00184809935218447, + "loss": 0.0879, + "step": 18827 + }, + { + "epoch": 0.16343608128401663, + "grad_norm": 0.154296875, + "learning_rate": 0.0018480828055942677, + "loss": 0.0913, + "step": 18828 + }, + { + "epoch": 0.1634447617642208, + "grad_norm": 0.4296875, + "learning_rate": 0.0018480662581859728, + "loss": 0.1279, + "step": 18829 + }, + { + "epoch": 0.16345344224442496, + "grad_norm": 0.609375, + "learning_rate": 0.0018480497099596042, + "loss": 0.0923, + "step": 18830 + }, + { + "epoch": 0.16346212272462912, + "grad_norm": 0.56640625, + "learning_rate": 0.0018480331609151792, + "loss": 0.1426, + "step": 18831 + }, + { + "epoch": 0.1634708032048333, + "grad_norm": 0.1962890625, + "learning_rate": 0.0018480166110527167, + "loss": 0.0913, + "step": 18832 + }, + { + "epoch": 0.16347948368503745, + "grad_norm": 0.1416015625, + "learning_rate": 0.0018480000603722346, + "loss": 0.0791, + "step": 18833 + }, + { + "epoch": 0.16348816416524162, + "grad_norm": 0.1689453125, + "learning_rate": 0.001847983508873751, + "loss": 0.125, + "step": 18834 + }, + { + "epoch": 0.16349684464544578, + "grad_norm": 0.26171875, + "learning_rate": 0.0018479669565572834, + "loss": 0.1196, + "step": 18835 + }, + { + "epoch": 0.16350552512564995, + "grad_norm": 0.36328125, + "learning_rate": 0.0018479504034228508, + "loss": 0.0898, + "step": 18836 + }, + { + "epoch": 0.1635142056058541, + "grad_norm": 0.061767578125, + "learning_rate": 0.0018479338494704703, + "loss": 0.0781, + "step": 18837 + }, + { + "epoch": 0.16352288608605828, + "grad_norm": 0.0927734375, + "learning_rate": 0.0018479172947001611, + "loss": 0.0845, + "step": 18838 + }, + { + "epoch": 0.16353156656626244, + "grad_norm": 0.4140625, + "learning_rate": 0.0018479007391119405, + "loss": 0.0898, + "step": 18839 + }, + { + "epoch": 0.1635402470464666, + "grad_norm": 0.32421875, + "learning_rate": 0.001847884182705827, + "loss": 0.1182, + "step": 18840 + }, + { + "epoch": 0.16354892752667077, + "grad_norm": 0.61328125, + "learning_rate": 0.0018478676254818384, + "loss": 0.1162, + "step": 18841 + }, + { + "epoch": 0.16355760800687494, + "grad_norm": 0.166015625, + "learning_rate": 0.0018478510674399934, + "loss": 0.1357, + "step": 18842 + }, + { + "epoch": 0.1635662884870791, + "grad_norm": 0.09521484375, + "learning_rate": 0.0018478345085803094, + "loss": 0.123, + "step": 18843 + }, + { + "epoch": 0.16357496896728327, + "grad_norm": 0.103515625, + "learning_rate": 0.0018478179489028052, + "loss": 0.1016, + "step": 18844 + }, + { + "epoch": 0.16358364944748743, + "grad_norm": 0.1103515625, + "learning_rate": 0.001847801388407498, + "loss": 0.1162, + "step": 18845 + }, + { + "epoch": 0.1635923299276916, + "grad_norm": 0.10400390625, + "learning_rate": 0.001847784827094407, + "loss": 0.105, + "step": 18846 + }, + { + "epoch": 0.16360101040789576, + "grad_norm": 0.2265625, + "learning_rate": 0.0018477682649635492, + "loss": 0.1113, + "step": 18847 + }, + { + "epoch": 0.16360969088809993, + "grad_norm": 0.12158203125, + "learning_rate": 0.0018477517020149437, + "loss": 0.0996, + "step": 18848 + }, + { + "epoch": 0.1636183713683041, + "grad_norm": 0.2197265625, + "learning_rate": 0.0018477351382486077, + "loss": 0.1045, + "step": 18849 + }, + { + "epoch": 0.16362705184850826, + "grad_norm": 0.60546875, + "learning_rate": 0.0018477185736645602, + 
"loss": 0.1201, + "step": 18850 + }, + { + "epoch": 0.16363573232871242, + "grad_norm": 0.0830078125, + "learning_rate": 0.0018477020082628187, + "loss": 0.1104, + "step": 18851 + }, + { + "epoch": 0.1636444128089166, + "grad_norm": 0.1357421875, + "learning_rate": 0.0018476854420434016, + "loss": 0.127, + "step": 18852 + }, + { + "epoch": 0.16365309328912075, + "grad_norm": 0.55859375, + "learning_rate": 0.0018476688750063272, + "loss": 0.1074, + "step": 18853 + }, + { + "epoch": 0.16366177376932492, + "grad_norm": 0.2412109375, + "learning_rate": 0.001847652307151613, + "loss": 0.1406, + "step": 18854 + }, + { + "epoch": 0.16367045424952908, + "grad_norm": 0.1142578125, + "learning_rate": 0.0018476357384792775, + "loss": 0.1069, + "step": 18855 + }, + { + "epoch": 0.16367913472973325, + "grad_norm": 0.193359375, + "learning_rate": 0.0018476191689893386, + "loss": 0.1064, + "step": 18856 + }, + { + "epoch": 0.16368781520993742, + "grad_norm": 0.1845703125, + "learning_rate": 0.0018476025986818151, + "loss": 0.1025, + "step": 18857 + }, + { + "epoch": 0.16369649569014158, + "grad_norm": 0.177734375, + "learning_rate": 0.0018475860275567243, + "loss": 0.1172, + "step": 18858 + }, + { + "epoch": 0.16370517617034575, + "grad_norm": 0.46875, + "learning_rate": 0.0018475694556140847, + "loss": 0.106, + "step": 18859 + }, + { + "epoch": 0.1637138566505499, + "grad_norm": 0.1328125, + "learning_rate": 0.0018475528828539142, + "loss": 0.1006, + "step": 18860 + }, + { + "epoch": 0.16372253713075408, + "grad_norm": 0.5234375, + "learning_rate": 0.0018475363092762315, + "loss": 0.1328, + "step": 18861 + }, + { + "epoch": 0.16373121761095824, + "grad_norm": 0.2109375, + "learning_rate": 0.0018475197348810542, + "loss": 0.0742, + "step": 18862 + }, + { + "epoch": 0.1637398980911624, + "grad_norm": 0.318359375, + "learning_rate": 0.0018475031596684002, + "loss": 0.1318, + "step": 18863 + }, + { + "epoch": 0.16374857857136657, + "grad_norm": 0.125, + "learning_rate": 0.0018474865836382883, + "loss": 0.0996, + "step": 18864 + }, + { + "epoch": 0.16375725905157074, + "grad_norm": 0.1298828125, + "learning_rate": 0.0018474700067907363, + "loss": 0.0942, + "step": 18865 + }, + { + "epoch": 0.1637659395317749, + "grad_norm": 0.181640625, + "learning_rate": 0.0018474534291257623, + "loss": 0.1621, + "step": 18866 + }, + { + "epoch": 0.16377462001197907, + "grad_norm": 0.3203125, + "learning_rate": 0.0018474368506433843, + "loss": 0.0713, + "step": 18867 + }, + { + "epoch": 0.16378330049218323, + "grad_norm": 0.15234375, + "learning_rate": 0.0018474202713436208, + "loss": 0.0996, + "step": 18868 + }, + { + "epoch": 0.1637919809723874, + "grad_norm": 0.1435546875, + "learning_rate": 0.0018474036912264895, + "loss": 0.1699, + "step": 18869 + }, + { + "epoch": 0.16380066145259156, + "grad_norm": 0.10498046875, + "learning_rate": 0.0018473871102920093, + "loss": 0.1113, + "step": 18870 + }, + { + "epoch": 0.16380934193279573, + "grad_norm": 0.326171875, + "learning_rate": 0.0018473705285401974, + "loss": 0.0898, + "step": 18871 + }, + { + "epoch": 0.1638180224129999, + "grad_norm": 0.251953125, + "learning_rate": 0.0018473539459710722, + "loss": 0.1318, + "step": 18872 + }, + { + "epoch": 0.16382670289320406, + "grad_norm": 0.265625, + "learning_rate": 0.0018473373625846519, + "loss": 0.1387, + "step": 18873 + }, + { + "epoch": 0.16383538337340822, + "grad_norm": 0.3828125, + "learning_rate": 0.001847320778380955, + "loss": 0.0986, + "step": 18874 + }, + { + "epoch": 0.1638440638536124, + "grad_norm": 0.1015625, + 
"learning_rate": 0.001847304193359999, + "loss": 0.105, + "step": 18875 + }, + { + "epoch": 0.16385274433381655, + "grad_norm": 0.1318359375, + "learning_rate": 0.0018472876075218026, + "loss": 0.1318, + "step": 18876 + }, + { + "epoch": 0.16386142481402072, + "grad_norm": 0.419921875, + "learning_rate": 0.0018472710208663836, + "loss": 0.0986, + "step": 18877 + }, + { + "epoch": 0.16387010529422488, + "grad_norm": 0.73828125, + "learning_rate": 0.0018472544333937603, + "loss": 0.103, + "step": 18878 + }, + { + "epoch": 0.16387878577442905, + "grad_norm": 0.3828125, + "learning_rate": 0.0018472378451039508, + "loss": 0.0884, + "step": 18879 + }, + { + "epoch": 0.1638874662546332, + "grad_norm": 0.099609375, + "learning_rate": 0.0018472212559969732, + "loss": 0.1094, + "step": 18880 + }, + { + "epoch": 0.16389614673483738, + "grad_norm": 0.16796875, + "learning_rate": 0.0018472046660728456, + "loss": 0.1143, + "step": 18881 + }, + { + "epoch": 0.16390482721504154, + "grad_norm": 0.3515625, + "learning_rate": 0.0018471880753315865, + "loss": 0.1118, + "step": 18882 + }, + { + "epoch": 0.1639135076952457, + "grad_norm": 0.142578125, + "learning_rate": 0.0018471714837732132, + "loss": 0.1719, + "step": 18883 + }, + { + "epoch": 0.16392218817544987, + "grad_norm": 0.19921875, + "learning_rate": 0.0018471548913977448, + "loss": 0.1357, + "step": 18884 + }, + { + "epoch": 0.16393086865565404, + "grad_norm": 0.3125, + "learning_rate": 0.001847138298205199, + "loss": 0.1064, + "step": 18885 + }, + { + "epoch": 0.1639395491358582, + "grad_norm": 0.326171875, + "learning_rate": 0.001847121704195594, + "loss": 0.1113, + "step": 18886 + }, + { + "epoch": 0.16394822961606237, + "grad_norm": 0.36328125, + "learning_rate": 0.0018471051093689476, + "loss": 0.1128, + "step": 18887 + }, + { + "epoch": 0.16395691009626653, + "grad_norm": 0.1728515625, + "learning_rate": 0.0018470885137252787, + "loss": 0.1104, + "step": 18888 + }, + { + "epoch": 0.1639655905764707, + "grad_norm": 0.21875, + "learning_rate": 0.0018470719172646046, + "loss": 0.0791, + "step": 18889 + }, + { + "epoch": 0.16397427105667486, + "grad_norm": 0.130859375, + "learning_rate": 0.0018470553199869443, + "loss": 0.1016, + "step": 18890 + }, + { + "epoch": 0.16398295153687903, + "grad_norm": 0.15234375, + "learning_rate": 0.0018470387218923153, + "loss": 0.0996, + "step": 18891 + }, + { + "epoch": 0.1639916320170832, + "grad_norm": 0.44140625, + "learning_rate": 0.0018470221229807359, + "loss": 0.0986, + "step": 18892 + }, + { + "epoch": 0.16400031249728736, + "grad_norm": 0.220703125, + "learning_rate": 0.0018470055232522244, + "loss": 0.0957, + "step": 18893 + }, + { + "epoch": 0.16400899297749152, + "grad_norm": 0.55859375, + "learning_rate": 0.0018469889227067991, + "loss": 0.1094, + "step": 18894 + }, + { + "epoch": 0.1640176734576957, + "grad_norm": 0.21875, + "learning_rate": 0.0018469723213444778, + "loss": 0.1289, + "step": 18895 + }, + { + "epoch": 0.16402635393789985, + "grad_norm": 0.150390625, + "learning_rate": 0.001846955719165279, + "loss": 0.1758, + "step": 18896 + }, + { + "epoch": 0.16403503441810402, + "grad_norm": 0.1513671875, + "learning_rate": 0.00184693911616922, + "loss": 0.1113, + "step": 18897 + }, + { + "epoch": 0.16404371489830818, + "grad_norm": 0.359375, + "learning_rate": 0.0018469225123563203, + "loss": 0.0869, + "step": 18898 + }, + { + "epoch": 0.16405239537851235, + "grad_norm": 0.57421875, + "learning_rate": 0.001846905907726597, + "loss": 0.0879, + "step": 18899 + }, + { + "epoch": 0.16406107585871652, 
+ "grad_norm": 0.28125, + "learning_rate": 0.0018468893022800686, + "loss": 0.0981, + "step": 18900 + }, + { + "epoch": 0.16406975633892068, + "grad_norm": 0.625, + "learning_rate": 0.0018468726960167534, + "loss": 0.1279, + "step": 18901 + }, + { + "epoch": 0.16407843681912485, + "grad_norm": 0.3203125, + "learning_rate": 0.0018468560889366695, + "loss": 0.1504, + "step": 18902 + }, + { + "epoch": 0.164087117299329, + "grad_norm": 0.421875, + "learning_rate": 0.0018468394810398348, + "loss": 0.0898, + "step": 18903 + }, + { + "epoch": 0.16409579777953318, + "grad_norm": 0.30859375, + "learning_rate": 0.0018468228723262676, + "loss": 0.0835, + "step": 18904 + }, + { + "epoch": 0.16410447825973734, + "grad_norm": 0.107421875, + "learning_rate": 0.0018468062627959865, + "loss": 0.0664, + "step": 18905 + }, + { + "epoch": 0.1641131587399415, + "grad_norm": 0.359375, + "learning_rate": 0.001846789652449009, + "loss": 0.1094, + "step": 18906 + }, + { + "epoch": 0.16412183922014567, + "grad_norm": 0.54296875, + "learning_rate": 0.001846773041285354, + "loss": 0.1064, + "step": 18907 + }, + { + "epoch": 0.16413051970034984, + "grad_norm": 0.291015625, + "learning_rate": 0.0018467564293050385, + "loss": 0.1289, + "step": 18908 + }, + { + "epoch": 0.164139200180554, + "grad_norm": 0.2041015625, + "learning_rate": 0.001846739816508082, + "loss": 0.1216, + "step": 18909 + }, + { + "epoch": 0.16414788066075817, + "grad_norm": 0.10888671875, + "learning_rate": 0.0018467232028945015, + "loss": 0.125, + "step": 18910 + }, + { + "epoch": 0.16415656114096233, + "grad_norm": 0.1796875, + "learning_rate": 0.0018467065884643162, + "loss": 0.1123, + "step": 18911 + }, + { + "epoch": 0.1641652416211665, + "grad_norm": 0.283203125, + "learning_rate": 0.0018466899732175438, + "loss": 0.0996, + "step": 18912 + }, + { + "epoch": 0.16417392210137063, + "grad_norm": 0.2021484375, + "learning_rate": 0.0018466733571542023, + "loss": 0.1006, + "step": 18913 + }, + { + "epoch": 0.1641826025815748, + "grad_norm": 0.3671875, + "learning_rate": 0.0018466567402743103, + "loss": 0.1309, + "step": 18914 + }, + { + "epoch": 0.16419128306177896, + "grad_norm": 0.2119140625, + "learning_rate": 0.0018466401225778852, + "loss": 0.1377, + "step": 18915 + }, + { + "epoch": 0.16419996354198313, + "grad_norm": 0.33984375, + "learning_rate": 0.0018466235040649457, + "loss": 0.1055, + "step": 18916 + }, + { + "epoch": 0.1642086440221873, + "grad_norm": 0.388671875, + "learning_rate": 0.0018466068847355103, + "loss": 0.1016, + "step": 18917 + }, + { + "epoch": 0.16421732450239146, + "grad_norm": 0.1923828125, + "learning_rate": 0.001846590264589597, + "loss": 0.1172, + "step": 18918 + }, + { + "epoch": 0.16422600498259562, + "grad_norm": 0.96484375, + "learning_rate": 0.0018465736436272231, + "loss": 0.1162, + "step": 18919 + }, + { + "epoch": 0.1642346854627998, + "grad_norm": 0.2216796875, + "learning_rate": 0.0018465570218484083, + "loss": 0.1138, + "step": 18920 + }, + { + "epoch": 0.16424336594300395, + "grad_norm": 0.875, + "learning_rate": 0.0018465403992531694, + "loss": 0.1416, + "step": 18921 + }, + { + "epoch": 0.16425204642320812, + "grad_norm": 0.318359375, + "learning_rate": 0.0018465237758415252, + "loss": 0.1211, + "step": 18922 + }, + { + "epoch": 0.16426072690341229, + "grad_norm": 0.07421875, + "learning_rate": 0.001846507151613494, + "loss": 0.0835, + "step": 18923 + }, + { + "epoch": 0.16426940738361645, + "grad_norm": 0.1572265625, + "learning_rate": 0.001846490526569094, + "loss": 0.0869, + "step": 18924 + }, + { + 
"epoch": 0.16427808786382062, + "grad_norm": 0.322265625, + "learning_rate": 0.0018464739007083427, + "loss": 0.1084, + "step": 18925 + }, + { + "epoch": 0.16428676834402478, + "grad_norm": 0.953125, + "learning_rate": 0.001846457274031259, + "loss": 0.1348, + "step": 18926 + }, + { + "epoch": 0.16429544882422895, + "grad_norm": 0.671875, + "learning_rate": 0.0018464406465378605, + "loss": 0.1152, + "step": 18927 + }, + { + "epoch": 0.1643041293044331, + "grad_norm": 0.35546875, + "learning_rate": 0.001846424018228166, + "loss": 0.1182, + "step": 18928 + }, + { + "epoch": 0.16431280978463728, + "grad_norm": 0.365234375, + "learning_rate": 0.0018464073891021936, + "loss": 0.0796, + "step": 18929 + }, + { + "epoch": 0.16432149026484144, + "grad_norm": 0.294921875, + "learning_rate": 0.0018463907591599609, + "loss": 0.0923, + "step": 18930 + }, + { + "epoch": 0.1643301707450456, + "grad_norm": 0.1005859375, + "learning_rate": 0.0018463741284014867, + "loss": 0.1143, + "step": 18931 + }, + { + "epoch": 0.16433885122524977, + "grad_norm": 0.1376953125, + "learning_rate": 0.0018463574968267892, + "loss": 0.1406, + "step": 18932 + }, + { + "epoch": 0.16434753170545394, + "grad_norm": 0.228515625, + "learning_rate": 0.001846340864435886, + "loss": 0.1191, + "step": 18933 + }, + { + "epoch": 0.1643562121856581, + "grad_norm": 0.43359375, + "learning_rate": 0.0018463242312287958, + "loss": 0.1875, + "step": 18934 + }, + { + "epoch": 0.16436489266586227, + "grad_norm": 0.3203125, + "learning_rate": 0.0018463075972055365, + "loss": 0.1582, + "step": 18935 + }, + { + "epoch": 0.16437357314606643, + "grad_norm": 0.54296875, + "learning_rate": 0.0018462909623661267, + "loss": 0.1167, + "step": 18936 + }, + { + "epoch": 0.1643822536262706, + "grad_norm": 0.1630859375, + "learning_rate": 0.001846274326710584, + "loss": 0.1084, + "step": 18937 + }, + { + "epoch": 0.16439093410647476, + "grad_norm": 0.07373046875, + "learning_rate": 0.001846257690238927, + "loss": 0.0806, + "step": 18938 + }, + { + "epoch": 0.16439961458667893, + "grad_norm": 0.11083984375, + "learning_rate": 0.001846241052951174, + "loss": 0.0781, + "step": 18939 + }, + { + "epoch": 0.1644082950668831, + "grad_norm": 0.212890625, + "learning_rate": 0.0018462244148473425, + "loss": 0.1191, + "step": 18940 + }, + { + "epoch": 0.16441697554708726, + "grad_norm": 0.337890625, + "learning_rate": 0.0018462077759274517, + "loss": 0.1309, + "step": 18941 + }, + { + "epoch": 0.16442565602729142, + "grad_norm": 0.2333984375, + "learning_rate": 0.0018461911361915193, + "loss": 0.1211, + "step": 18942 + }, + { + "epoch": 0.1644343365074956, + "grad_norm": 0.7421875, + "learning_rate": 0.0018461744956395632, + "loss": 0.125, + "step": 18943 + }, + { + "epoch": 0.16444301698769975, + "grad_norm": 0.2275390625, + "learning_rate": 0.001846157854271602, + "loss": 0.084, + "step": 18944 + }, + { + "epoch": 0.16445169746790392, + "grad_norm": 0.388671875, + "learning_rate": 0.001846141212087654, + "loss": 0.1406, + "step": 18945 + }, + { + "epoch": 0.16446037794810808, + "grad_norm": 0.78125, + "learning_rate": 0.0018461245690877366, + "loss": 0.1064, + "step": 18946 + }, + { + "epoch": 0.16446905842831225, + "grad_norm": 0.70703125, + "learning_rate": 0.001846107925271869, + "loss": 0.1504, + "step": 18947 + }, + { + "epoch": 0.1644777389085164, + "grad_norm": 0.2373046875, + "learning_rate": 0.001846091280640069, + "loss": 0.0986, + "step": 18948 + }, + { + "epoch": 0.16448641938872058, + "grad_norm": 0.1201171875, + "learning_rate": 0.0018460746351923547, 
+ "loss": 0.1128, + "step": 18949 + }, + { + "epoch": 0.16449509986892474, + "grad_norm": 0.365234375, + "learning_rate": 0.0018460579889287444, + "loss": 0.1494, + "step": 18950 + }, + { + "epoch": 0.1645037803491289, + "grad_norm": 0.220703125, + "learning_rate": 0.0018460413418492562, + "loss": 0.1211, + "step": 18951 + }, + { + "epoch": 0.16451246082933307, + "grad_norm": 0.1259765625, + "learning_rate": 0.0018460246939539086, + "loss": 0.0752, + "step": 18952 + }, + { + "epoch": 0.16452114130953724, + "grad_norm": 0.1416015625, + "learning_rate": 0.0018460080452427192, + "loss": 0.1748, + "step": 18953 + }, + { + "epoch": 0.1645298217897414, + "grad_norm": 0.50390625, + "learning_rate": 0.0018459913957157071, + "loss": 0.1001, + "step": 18954 + }, + { + "epoch": 0.16453850226994557, + "grad_norm": 0.1591796875, + "learning_rate": 0.0018459747453728897, + "loss": 0.0933, + "step": 18955 + }, + { + "epoch": 0.16454718275014973, + "grad_norm": 0.43359375, + "learning_rate": 0.0018459580942142857, + "loss": 0.1328, + "step": 18956 + }, + { + "epoch": 0.1645558632303539, + "grad_norm": 0.244140625, + "learning_rate": 0.001845941442239913, + "loss": 0.0918, + "step": 18957 + }, + { + "epoch": 0.16456454371055806, + "grad_norm": 0.322265625, + "learning_rate": 0.0018459247894497901, + "loss": 0.0874, + "step": 18958 + }, + { + "epoch": 0.16457322419076223, + "grad_norm": 0.310546875, + "learning_rate": 0.0018459081358439351, + "loss": 0.0928, + "step": 18959 + }, + { + "epoch": 0.1645819046709664, + "grad_norm": 1.703125, + "learning_rate": 0.001845891481422366, + "loss": 0.293, + "step": 18960 + }, + { + "epoch": 0.16459058515117056, + "grad_norm": 0.1103515625, + "learning_rate": 0.0018458748261851012, + "loss": 0.1211, + "step": 18961 + }, + { + "epoch": 0.16459926563137472, + "grad_norm": 0.421875, + "learning_rate": 0.0018458581701321591, + "loss": 0.1221, + "step": 18962 + }, + { + "epoch": 0.1646079461115789, + "grad_norm": 0.52734375, + "learning_rate": 0.0018458415132635573, + "loss": 0.1191, + "step": 18963 + }, + { + "epoch": 0.16461662659178306, + "grad_norm": 0.5, + "learning_rate": 0.0018458248555793147, + "loss": 0.0918, + "step": 18964 + }, + { + "epoch": 0.16462530707198722, + "grad_norm": 0.13671875, + "learning_rate": 0.0018458081970794491, + "loss": 0.1699, + "step": 18965 + }, + { + "epoch": 0.16463398755219139, + "grad_norm": 0.1328125, + "learning_rate": 0.001845791537763979, + "loss": 0.0918, + "step": 18966 + }, + { + "epoch": 0.16464266803239555, + "grad_norm": 0.203125, + "learning_rate": 0.0018457748776329226, + "loss": 0.0938, + "step": 18967 + }, + { + "epoch": 0.16465134851259972, + "grad_norm": 0.482421875, + "learning_rate": 0.0018457582166862979, + "loss": 0.125, + "step": 18968 + }, + { + "epoch": 0.16466002899280388, + "grad_norm": 0.384765625, + "learning_rate": 0.0018457415549241233, + "loss": 0.1455, + "step": 18969 + }, + { + "epoch": 0.16466870947300805, + "grad_norm": 0.81640625, + "learning_rate": 0.0018457248923464168, + "loss": 0.1211, + "step": 18970 + }, + { + "epoch": 0.1646773899532122, + "grad_norm": 0.65625, + "learning_rate": 0.0018457082289531966, + "loss": 0.1035, + "step": 18971 + }, + { + "epoch": 0.16468607043341638, + "grad_norm": 0.55078125, + "learning_rate": 0.0018456915647444812, + "loss": 0.3848, + "step": 18972 + }, + { + "epoch": 0.16469475091362054, + "grad_norm": 0.1396484375, + "learning_rate": 0.0018456748997202888, + "loss": 0.1104, + "step": 18973 + }, + { + "epoch": 0.1647034313938247, + "grad_norm": 0.251953125, + 
"learning_rate": 0.0018456582338806375, + "loss": 0.0928, + "step": 18974 + }, + { + "epoch": 0.16471211187402887, + "grad_norm": 0.384765625, + "learning_rate": 0.0018456415672255457, + "loss": 0.0903, + "step": 18975 + }, + { + "epoch": 0.16472079235423304, + "grad_norm": 0.34375, + "learning_rate": 0.001845624899755031, + "loss": 0.1104, + "step": 18976 + }, + { + "epoch": 0.1647294728344372, + "grad_norm": 0.08935546875, + "learning_rate": 0.0018456082314691126, + "loss": 0.1221, + "step": 18977 + }, + { + "epoch": 0.16473815331464137, + "grad_norm": 0.2373046875, + "learning_rate": 0.001845591562367808, + "loss": 0.1201, + "step": 18978 + }, + { + "epoch": 0.16474683379484553, + "grad_norm": 0.1455078125, + "learning_rate": 0.001845574892451136, + "loss": 0.1123, + "step": 18979 + }, + { + "epoch": 0.1647555142750497, + "grad_norm": 0.13671875, + "learning_rate": 0.001845558221719114, + "loss": 0.1396, + "step": 18980 + }, + { + "epoch": 0.16476419475525386, + "grad_norm": 0.65625, + "learning_rate": 0.0018455415501717613, + "loss": 0.0796, + "step": 18981 + }, + { + "epoch": 0.16477287523545803, + "grad_norm": 0.51171875, + "learning_rate": 0.0018455248778090952, + "loss": 0.1299, + "step": 18982 + }, + { + "epoch": 0.1647815557156622, + "grad_norm": 0.2412109375, + "learning_rate": 0.0018455082046311343, + "loss": 0.1172, + "step": 18983 + }, + { + "epoch": 0.16479023619586636, + "grad_norm": 0.29296875, + "learning_rate": 0.001845491530637897, + "loss": 0.1562, + "step": 18984 + }, + { + "epoch": 0.16479891667607052, + "grad_norm": 0.79296875, + "learning_rate": 0.0018454748558294012, + "loss": 0.1348, + "step": 18985 + }, + { + "epoch": 0.1648075971562747, + "grad_norm": 1.0078125, + "learning_rate": 0.0018454581802056654, + "loss": 0.1533, + "step": 18986 + }, + { + "epoch": 0.16481627763647885, + "grad_norm": 0.0751953125, + "learning_rate": 0.0018454415037667076, + "loss": 0.0771, + "step": 18987 + }, + { + "epoch": 0.16482495811668302, + "grad_norm": 0.1455078125, + "learning_rate": 0.0018454248265125461, + "loss": 0.0908, + "step": 18988 + }, + { + "epoch": 0.16483363859688718, + "grad_norm": 0.4375, + "learning_rate": 0.0018454081484431996, + "loss": 0.123, + "step": 18989 + }, + { + "epoch": 0.16484231907709135, + "grad_norm": 0.53125, + "learning_rate": 0.0018453914695586856, + "loss": 0.1338, + "step": 18990 + }, + { + "epoch": 0.1648509995572955, + "grad_norm": 1.5390625, + "learning_rate": 0.0018453747898590227, + "loss": 0.0908, + "step": 18991 + }, + { + "epoch": 0.16485968003749968, + "grad_norm": 0.57421875, + "learning_rate": 0.0018453581093442293, + "loss": 0.1406, + "step": 18992 + }, + { + "epoch": 0.16486836051770384, + "grad_norm": 0.38671875, + "learning_rate": 0.0018453414280143238, + "loss": 0.1099, + "step": 18993 + }, + { + "epoch": 0.164877040997908, + "grad_norm": 0.65625, + "learning_rate": 0.0018453247458693235, + "loss": 0.124, + "step": 18994 + }, + { + "epoch": 0.16488572147811217, + "grad_norm": 0.09765625, + "learning_rate": 0.0018453080629092472, + "loss": 0.1221, + "step": 18995 + }, + { + "epoch": 0.16489440195831634, + "grad_norm": 1.109375, + "learning_rate": 0.0018452913791341136, + "loss": 0.1309, + "step": 18996 + }, + { + "epoch": 0.1649030824385205, + "grad_norm": 0.2275390625, + "learning_rate": 0.0018452746945439402, + "loss": 0.1123, + "step": 18997 + }, + { + "epoch": 0.16491176291872467, + "grad_norm": 0.109375, + "learning_rate": 0.001845258009138746, + "loss": 0.1357, + "step": 18998 + }, + { + "epoch": 0.16492044339892883, + 
"grad_norm": 0.146484375, + "learning_rate": 0.0018452413229185487, + "loss": 0.1133, + "step": 18999 + }, + { + "epoch": 0.164929123879133, + "grad_norm": 0.1220703125, + "learning_rate": 0.0018452246358833667, + "loss": 0.1582, + "step": 19000 + }, + { + "epoch": 0.16493780435933716, + "grad_norm": 0.7734375, + "learning_rate": 0.001845207948033218, + "loss": 0.3594, + "step": 19001 + }, + { + "epoch": 0.16494648483954133, + "grad_norm": 0.2275390625, + "learning_rate": 0.0018451912593681213, + "loss": 0.1367, + "step": 19002 + }, + { + "epoch": 0.1649551653197455, + "grad_norm": 0.51171875, + "learning_rate": 0.0018451745698880946, + "loss": 0.0771, + "step": 19003 + }, + { + "epoch": 0.16496384579994966, + "grad_norm": 0.09423828125, + "learning_rate": 0.0018451578795931564, + "loss": 0.1191, + "step": 19004 + }, + { + "epoch": 0.16497252628015383, + "grad_norm": 0.75, + "learning_rate": 0.0018451411884833242, + "loss": 0.1152, + "step": 19005 + }, + { + "epoch": 0.164981206760358, + "grad_norm": 0.208984375, + "learning_rate": 0.0018451244965586172, + "loss": 0.1074, + "step": 19006 + }, + { + "epoch": 0.16498988724056216, + "grad_norm": 0.447265625, + "learning_rate": 0.001845107803819053, + "loss": 0.126, + "step": 19007 + }, + { + "epoch": 0.16499856772076632, + "grad_norm": 0.1337890625, + "learning_rate": 0.0018450911102646504, + "loss": 0.1396, + "step": 19008 + }, + { + "epoch": 0.16500724820097049, + "grad_norm": 0.69140625, + "learning_rate": 0.0018450744158954272, + "loss": 0.1006, + "step": 19009 + }, + { + "epoch": 0.16501592868117465, + "grad_norm": 0.30859375, + "learning_rate": 0.0018450577207114018, + "loss": 0.1069, + "step": 19010 + }, + { + "epoch": 0.16502460916137882, + "grad_norm": 0.6484375, + "learning_rate": 0.0018450410247125924, + "loss": 0.1426, + "step": 19011 + }, + { + "epoch": 0.16503328964158298, + "grad_norm": 0.263671875, + "learning_rate": 0.0018450243278990176, + "loss": 0.1533, + "step": 19012 + }, + { + "epoch": 0.16504197012178715, + "grad_norm": 0.119140625, + "learning_rate": 0.001845007630270695, + "loss": 0.1074, + "step": 19013 + }, + { + "epoch": 0.1650506506019913, + "grad_norm": 0.66796875, + "learning_rate": 0.0018449909318276436, + "loss": 0.1416, + "step": 19014 + }, + { + "epoch": 0.16505933108219548, + "grad_norm": 0.12451171875, + "learning_rate": 0.001844974232569881, + "loss": 0.1133, + "step": 19015 + }, + { + "epoch": 0.16506801156239964, + "grad_norm": 0.765625, + "learning_rate": 0.001844957532497426, + "loss": 0.1426, + "step": 19016 + }, + { + "epoch": 0.1650766920426038, + "grad_norm": 0.2197265625, + "learning_rate": 0.001844940831610297, + "loss": 0.1533, + "step": 19017 + }, + { + "epoch": 0.16508537252280797, + "grad_norm": 0.06884765625, + "learning_rate": 0.0018449241299085114, + "loss": 0.0771, + "step": 19018 + }, + { + "epoch": 0.16509405300301214, + "grad_norm": 0.09765625, + "learning_rate": 0.0018449074273920877, + "loss": 0.1094, + "step": 19019 + }, + { + "epoch": 0.1651027334832163, + "grad_norm": 0.12255859375, + "learning_rate": 0.0018448907240610451, + "loss": 0.1123, + "step": 19020 + }, + { + "epoch": 0.16511141396342047, + "grad_norm": 0.494140625, + "learning_rate": 0.0018448740199154005, + "loss": 0.126, + "step": 19021 + }, + { + "epoch": 0.16512009444362463, + "grad_norm": 0.205078125, + "learning_rate": 0.0018448573149551734, + "loss": 0.1016, + "step": 19022 + }, + { + "epoch": 0.1651287749238288, + "grad_norm": 0.1474609375, + "learning_rate": 0.0018448406091803814, + "loss": 0.1279, + "step": 
19023 + }, + { + "epoch": 0.16513745540403296, + "grad_norm": 0.1689453125, + "learning_rate": 0.001844823902591043, + "loss": 0.1465, + "step": 19024 + }, + { + "epoch": 0.16514613588423713, + "grad_norm": 0.283203125, + "learning_rate": 0.001844807195187176, + "loss": 0.1182, + "step": 19025 + }, + { + "epoch": 0.1651548163644413, + "grad_norm": 0.126953125, + "learning_rate": 0.0018447904869687993, + "loss": 0.0952, + "step": 19026 + }, + { + "epoch": 0.16516349684464546, + "grad_norm": 0.1025390625, + "learning_rate": 0.001844773777935931, + "loss": 0.1016, + "step": 19027 + }, + { + "epoch": 0.16517217732484962, + "grad_norm": 0.1708984375, + "learning_rate": 0.001844757068088589, + "loss": 0.1201, + "step": 19028 + }, + { + "epoch": 0.1651808578050538, + "grad_norm": 0.353515625, + "learning_rate": 0.0018447403574267923, + "loss": 0.1064, + "step": 19029 + }, + { + "epoch": 0.16518953828525795, + "grad_norm": 0.240234375, + "learning_rate": 0.0018447236459505584, + "loss": 0.1289, + "step": 19030 + }, + { + "epoch": 0.16519821876546212, + "grad_norm": 0.1025390625, + "learning_rate": 0.001844706933659906, + "loss": 0.085, + "step": 19031 + }, + { + "epoch": 0.16520689924566628, + "grad_norm": 0.3125, + "learning_rate": 0.001844690220554853, + "loss": 0.1006, + "step": 19032 + }, + { + "epoch": 0.16521557972587045, + "grad_norm": 0.1416015625, + "learning_rate": 0.0018446735066354185, + "loss": 0.0889, + "step": 19033 + }, + { + "epoch": 0.1652242602060746, + "grad_norm": 0.08642578125, + "learning_rate": 0.0018446567919016197, + "loss": 0.0977, + "step": 19034 + }, + { + "epoch": 0.16523294068627878, + "grad_norm": 0.1748046875, + "learning_rate": 0.0018446400763534759, + "loss": 0.1074, + "step": 19035 + }, + { + "epoch": 0.16524162116648292, + "grad_norm": 0.1484375, + "learning_rate": 0.0018446233599910048, + "loss": 0.1133, + "step": 19036 + }, + { + "epoch": 0.16525030164668708, + "grad_norm": 0.203125, + "learning_rate": 0.0018446066428142247, + "loss": 0.126, + "step": 19037 + }, + { + "epoch": 0.16525898212689125, + "grad_norm": 0.328125, + "learning_rate": 0.0018445899248231536, + "loss": 0.1113, + "step": 19038 + }, + { + "epoch": 0.1652676626070954, + "grad_norm": 0.625, + "learning_rate": 0.0018445732060178106, + "loss": 0.1523, + "step": 19039 + }, + { + "epoch": 0.16527634308729958, + "grad_norm": 0.0908203125, + "learning_rate": 0.0018445564863982134, + "loss": 0.0879, + "step": 19040 + }, + { + "epoch": 0.16528502356750374, + "grad_norm": 0.25390625, + "learning_rate": 0.0018445397659643805, + "loss": 0.1279, + "step": 19041 + }, + { + "epoch": 0.1652937040477079, + "grad_norm": 0.25390625, + "learning_rate": 0.00184452304471633, + "loss": 0.123, + "step": 19042 + }, + { + "epoch": 0.16530238452791207, + "grad_norm": 0.291015625, + "learning_rate": 0.0018445063226540804, + "loss": 0.0981, + "step": 19043 + }, + { + "epoch": 0.16531106500811624, + "grad_norm": 0.09814453125, + "learning_rate": 0.0018444895997776496, + "loss": 0.1279, + "step": 19044 + }, + { + "epoch": 0.1653197454883204, + "grad_norm": 0.2001953125, + "learning_rate": 0.0018444728760870564, + "loss": 0.127, + "step": 19045 + }, + { + "epoch": 0.16532842596852457, + "grad_norm": 0.162109375, + "learning_rate": 0.001844456151582319, + "loss": 0.1182, + "step": 19046 + }, + { + "epoch": 0.16533710644872873, + "grad_norm": 0.1943359375, + "learning_rate": 0.0018444394262634549, + "loss": 0.0811, + "step": 19047 + }, + { + "epoch": 0.1653457869289329, + "grad_norm": 0.10302734375, + "learning_rate": 
0.0018444227001304836, + "loss": 0.0957, + "step": 19048 + }, + { + "epoch": 0.16535446740913706, + "grad_norm": 0.130859375, + "learning_rate": 0.0018444059731834229, + "loss": 0.1006, + "step": 19049 + }, + { + "epoch": 0.16536314788934123, + "grad_norm": 0.267578125, + "learning_rate": 0.0018443892454222907, + "loss": 0.0947, + "step": 19050 + }, + { + "epoch": 0.1653718283695454, + "grad_norm": 0.1416015625, + "learning_rate": 0.0018443725168471054, + "loss": 0.1553, + "step": 19051 + }, + { + "epoch": 0.16538050884974956, + "grad_norm": 0.5, + "learning_rate": 0.001844355787457886, + "loss": 0.1133, + "step": 19052 + }, + { + "epoch": 0.16538918932995372, + "grad_norm": 0.08740234375, + "learning_rate": 0.0018443390572546502, + "loss": 0.1191, + "step": 19053 + }, + { + "epoch": 0.1653978698101579, + "grad_norm": 0.08349609375, + "learning_rate": 0.0018443223262374164, + "loss": 0.1128, + "step": 19054 + }, + { + "epoch": 0.16540655029036205, + "grad_norm": 0.33203125, + "learning_rate": 0.0018443055944062025, + "loss": 0.1201, + "step": 19055 + }, + { + "epoch": 0.16541523077056622, + "grad_norm": 1.078125, + "learning_rate": 0.0018442888617610277, + "loss": 0.0986, + "step": 19056 + }, + { + "epoch": 0.16542391125077038, + "grad_norm": 0.345703125, + "learning_rate": 0.0018442721283019093, + "loss": 0.1299, + "step": 19057 + }, + { + "epoch": 0.16543259173097455, + "grad_norm": 0.29296875, + "learning_rate": 0.0018442553940288663, + "loss": 0.0996, + "step": 19058 + }, + { + "epoch": 0.1654412722111787, + "grad_norm": 0.322265625, + "learning_rate": 0.001844238658941917, + "loss": 0.0986, + "step": 19059 + }, + { + "epoch": 0.16544995269138288, + "grad_norm": 0.1337890625, + "learning_rate": 0.0018442219230410792, + "loss": 0.0869, + "step": 19060 + }, + { + "epoch": 0.16545863317158704, + "grad_norm": 0.58203125, + "learning_rate": 0.0018442051863263719, + "loss": 0.1221, + "step": 19061 + }, + { + "epoch": 0.1654673136517912, + "grad_norm": 0.296875, + "learning_rate": 0.0018441884487978124, + "loss": 0.1118, + "step": 19062 + }, + { + "epoch": 0.16547599413199537, + "grad_norm": 0.095703125, + "learning_rate": 0.0018441717104554199, + "loss": 0.0991, + "step": 19063 + }, + { + "epoch": 0.16548467461219954, + "grad_norm": 0.345703125, + "learning_rate": 0.0018441549712992123, + "loss": 0.127, + "step": 19064 + }, + { + "epoch": 0.1654933550924037, + "grad_norm": 0.0869140625, + "learning_rate": 0.0018441382313292081, + "loss": 0.1108, + "step": 19065 + }, + { + "epoch": 0.16550203557260787, + "grad_norm": 0.095703125, + "learning_rate": 0.0018441214905454255, + "loss": 0.1074, + "step": 19066 + }, + { + "epoch": 0.16551071605281203, + "grad_norm": 0.6171875, + "learning_rate": 0.001844104748947883, + "loss": 0.1191, + "step": 19067 + }, + { + "epoch": 0.1655193965330162, + "grad_norm": 0.1826171875, + "learning_rate": 0.0018440880065365983, + "loss": 0.1748, + "step": 19068 + }, + { + "epoch": 0.16552807701322036, + "grad_norm": 0.1728515625, + "learning_rate": 0.00184407126331159, + "loss": 0.1377, + "step": 19069 + }, + { + "epoch": 0.16553675749342453, + "grad_norm": 0.177734375, + "learning_rate": 0.001844054519272877, + "loss": 0.1016, + "step": 19070 + }, + { + "epoch": 0.1655454379736287, + "grad_norm": 0.193359375, + "learning_rate": 0.0018440377744204772, + "loss": 0.0996, + "step": 19071 + }, + { + "epoch": 0.16555411845383286, + "grad_norm": 0.32421875, + "learning_rate": 0.0018440210287544085, + "loss": 0.1689, + "step": 19072 + }, + { + "epoch": 0.16556279893403703, + 
"grad_norm": 0.10986328125, + "learning_rate": 0.0018440042822746902, + "loss": 0.124, + "step": 19073 + }, + { + "epoch": 0.1655714794142412, + "grad_norm": 0.66015625, + "learning_rate": 0.0018439875349813394, + "loss": 0.1328, + "step": 19074 + }, + { + "epoch": 0.16558015989444536, + "grad_norm": 0.16015625, + "learning_rate": 0.0018439707868743755, + "loss": 0.123, + "step": 19075 + }, + { + "epoch": 0.16558884037464952, + "grad_norm": 0.326171875, + "learning_rate": 0.0018439540379538157, + "loss": 0.104, + "step": 19076 + }, + { + "epoch": 0.16559752085485369, + "grad_norm": 0.57421875, + "learning_rate": 0.0018439372882196794, + "loss": 0.1084, + "step": 19077 + }, + { + "epoch": 0.16560620133505785, + "grad_norm": 0.09619140625, + "learning_rate": 0.0018439205376719846, + "loss": 0.1309, + "step": 19078 + }, + { + "epoch": 0.16561488181526202, + "grad_norm": 0.1865234375, + "learning_rate": 0.0018439037863107488, + "loss": 0.1211, + "step": 19079 + }, + { + "epoch": 0.16562356229546618, + "grad_norm": 0.1337890625, + "learning_rate": 0.0018438870341359916, + "loss": 0.1162, + "step": 19080 + }, + { + "epoch": 0.16563224277567035, + "grad_norm": 0.3203125, + "learning_rate": 0.0018438702811477304, + "loss": 0.1377, + "step": 19081 + }, + { + "epoch": 0.1656409232558745, + "grad_norm": 0.0869140625, + "learning_rate": 0.001843853527345984, + "loss": 0.1216, + "step": 19082 + }, + { + "epoch": 0.16564960373607868, + "grad_norm": 0.2412109375, + "learning_rate": 0.0018438367727307706, + "loss": 0.1318, + "step": 19083 + }, + { + "epoch": 0.16565828421628284, + "grad_norm": 0.15625, + "learning_rate": 0.0018438200173021085, + "loss": 0.0986, + "step": 19084 + }, + { + "epoch": 0.165666964696487, + "grad_norm": 0.1181640625, + "learning_rate": 0.0018438032610600158, + "loss": 0.0889, + "step": 19085 + }, + { + "epoch": 0.16567564517669117, + "grad_norm": 0.265625, + "learning_rate": 0.0018437865040045111, + "loss": 0.1992, + "step": 19086 + }, + { + "epoch": 0.16568432565689534, + "grad_norm": 1.8125, + "learning_rate": 0.0018437697461356126, + "loss": 0.0972, + "step": 19087 + }, + { + "epoch": 0.1656930061370995, + "grad_norm": 0.2060546875, + "learning_rate": 0.0018437529874533389, + "loss": 0.1465, + "step": 19088 + }, + { + "epoch": 0.16570168661730367, + "grad_norm": 0.2080078125, + "learning_rate": 0.001843736227957708, + "loss": 0.1035, + "step": 19089 + }, + { + "epoch": 0.16571036709750783, + "grad_norm": 0.08349609375, + "learning_rate": 0.0018437194676487383, + "loss": 0.0986, + "step": 19090 + }, + { + "epoch": 0.165719047577712, + "grad_norm": 0.265625, + "learning_rate": 0.0018437027065264483, + "loss": 0.1191, + "step": 19091 + }, + { + "epoch": 0.16572772805791616, + "grad_norm": 0.3125, + "learning_rate": 0.001843685944590856, + "loss": 0.1309, + "step": 19092 + }, + { + "epoch": 0.16573640853812033, + "grad_norm": 0.33984375, + "learning_rate": 0.0018436691818419799, + "loss": 0.103, + "step": 19093 + }, + { + "epoch": 0.1657450890183245, + "grad_norm": 0.33984375, + "learning_rate": 0.0018436524182798384, + "loss": 0.1064, + "step": 19094 + }, + { + "epoch": 0.16575376949852866, + "grad_norm": 0.171875, + "learning_rate": 0.0018436356539044498, + "loss": 0.1143, + "step": 19095 + }, + { + "epoch": 0.16576244997873282, + "grad_norm": 0.1923828125, + "learning_rate": 0.0018436188887158325, + "loss": 0.0952, + "step": 19096 + }, + { + "epoch": 0.165771130458937, + "grad_norm": 0.173828125, + "learning_rate": 0.0018436021227140047, + "loss": 0.0859, + "step": 19097 + }, 
+ { + "epoch": 0.16577981093914115, + "grad_norm": 0.263671875, + "learning_rate": 0.001843585355898985, + "loss": 0.1157, + "step": 19098 + }, + { + "epoch": 0.16578849141934532, + "grad_norm": 0.37890625, + "learning_rate": 0.0018435685882707913, + "loss": 0.1123, + "step": 19099 + }, + { + "epoch": 0.16579717189954948, + "grad_norm": 0.0986328125, + "learning_rate": 0.0018435518198294423, + "loss": 0.1079, + "step": 19100 + }, + { + "epoch": 0.16580585237975365, + "grad_norm": 0.244140625, + "learning_rate": 0.001843535050574956, + "loss": 0.0879, + "step": 19101 + }, + { + "epoch": 0.16581453285995781, + "grad_norm": 0.34375, + "learning_rate": 0.001843518280507351, + "loss": 0.0815, + "step": 19102 + }, + { + "epoch": 0.16582321334016198, + "grad_norm": 0.080078125, + "learning_rate": 0.0018435015096266456, + "loss": 0.0889, + "step": 19103 + }, + { + "epoch": 0.16583189382036614, + "grad_norm": 0.462890625, + "learning_rate": 0.0018434847379328583, + "loss": 0.1055, + "step": 19104 + }, + { + "epoch": 0.1658405743005703, + "grad_norm": 0.1513671875, + "learning_rate": 0.001843467965426007, + "loss": 0.1094, + "step": 19105 + }, + { + "epoch": 0.16584925478077447, + "grad_norm": 0.1982421875, + "learning_rate": 0.0018434511921061104, + "loss": 0.0752, + "step": 19106 + }, + { + "epoch": 0.16585793526097864, + "grad_norm": 0.11962890625, + "learning_rate": 0.0018434344179731868, + "loss": 0.1104, + "step": 19107 + }, + { + "epoch": 0.1658666157411828, + "grad_norm": 0.59765625, + "learning_rate": 0.0018434176430272545, + "loss": 0.0854, + "step": 19108 + }, + { + "epoch": 0.16587529622138697, + "grad_norm": 0.7109375, + "learning_rate": 0.0018434008672683316, + "loss": 0.1514, + "step": 19109 + }, + { + "epoch": 0.16588397670159113, + "grad_norm": 0.2041015625, + "learning_rate": 0.0018433840906964371, + "loss": 0.1338, + "step": 19110 + }, + { + "epoch": 0.1658926571817953, + "grad_norm": 0.369140625, + "learning_rate": 0.0018433673133115885, + "loss": 0.1211, + "step": 19111 + }, + { + "epoch": 0.16590133766199947, + "grad_norm": 0.1279296875, + "learning_rate": 0.0018433505351138052, + "loss": 0.1045, + "step": 19112 + }, + { + "epoch": 0.16591001814220363, + "grad_norm": 0.15625, + "learning_rate": 0.0018433337561031043, + "loss": 0.1367, + "step": 19113 + }, + { + "epoch": 0.1659186986224078, + "grad_norm": 0.57421875, + "learning_rate": 0.0018433169762795047, + "loss": 0.1152, + "step": 19114 + }, + { + "epoch": 0.16592737910261196, + "grad_norm": 0.28515625, + "learning_rate": 0.0018433001956430254, + "loss": 0.1001, + "step": 19115 + }, + { + "epoch": 0.16593605958281613, + "grad_norm": 0.0966796875, + "learning_rate": 0.0018432834141936837, + "loss": 0.1201, + "step": 19116 + }, + { + "epoch": 0.1659447400630203, + "grad_norm": 0.181640625, + "learning_rate": 0.0018432666319314988, + "loss": 0.1279, + "step": 19117 + }, + { + "epoch": 0.16595342054322446, + "grad_norm": 0.11767578125, + "learning_rate": 0.001843249848856488, + "loss": 0.1035, + "step": 19118 + }, + { + "epoch": 0.16596210102342862, + "grad_norm": 0.5703125, + "learning_rate": 0.001843233064968671, + "loss": 0.1299, + "step": 19119 + }, + { + "epoch": 0.16597078150363279, + "grad_norm": 0.32421875, + "learning_rate": 0.0018432162802680652, + "loss": 0.1211, + "step": 19120 + }, + { + "epoch": 0.16597946198383695, + "grad_norm": 0.2412109375, + "learning_rate": 0.0018431994947546895, + "loss": 0.1201, + "step": 19121 + }, + { + "epoch": 0.16598814246404112, + "grad_norm": 0.6171875, + "learning_rate": 
0.001843182708428562, + "loss": 0.1055, + "step": 19122 + }, + { + "epoch": 0.16599682294424528, + "grad_norm": 0.515625, + "learning_rate": 0.0018431659212897007, + "loss": 0.1367, + "step": 19123 + }, + { + "epoch": 0.16600550342444945, + "grad_norm": 0.431640625, + "learning_rate": 0.0018431491333381245, + "loss": 0.1035, + "step": 19124 + }, + { + "epoch": 0.1660141839046536, + "grad_norm": 0.4296875, + "learning_rate": 0.0018431323445738516, + "loss": 0.1035, + "step": 19125 + }, + { + "epoch": 0.16602286438485778, + "grad_norm": 0.55078125, + "learning_rate": 0.0018431155549969002, + "loss": 0.0996, + "step": 19126 + }, + { + "epoch": 0.16603154486506194, + "grad_norm": 0.1279296875, + "learning_rate": 0.001843098764607289, + "loss": 0.123, + "step": 19127 + }, + { + "epoch": 0.1660402253452661, + "grad_norm": 0.19140625, + "learning_rate": 0.0018430819734050361, + "loss": 0.1377, + "step": 19128 + }, + { + "epoch": 0.16604890582547027, + "grad_norm": 0.453125, + "learning_rate": 0.0018430651813901597, + "loss": 0.1201, + "step": 19129 + }, + { + "epoch": 0.16605758630567444, + "grad_norm": 0.68359375, + "learning_rate": 0.0018430483885626785, + "loss": 0.1069, + "step": 19130 + }, + { + "epoch": 0.1660662667858786, + "grad_norm": 0.1865234375, + "learning_rate": 0.0018430315949226111, + "loss": 0.1211, + "step": 19131 + }, + { + "epoch": 0.16607494726608277, + "grad_norm": 0.482421875, + "learning_rate": 0.0018430148004699753, + "loss": 0.1138, + "step": 19132 + }, + { + "epoch": 0.16608362774628693, + "grad_norm": 0.65234375, + "learning_rate": 0.0018429980052047893, + "loss": 0.1523, + "step": 19133 + }, + { + "epoch": 0.1660923082264911, + "grad_norm": 0.32421875, + "learning_rate": 0.001842981209127072, + "loss": 0.1123, + "step": 19134 + }, + { + "epoch": 0.16610098870669526, + "grad_norm": 0.412109375, + "learning_rate": 0.0018429644122368418, + "loss": 0.0986, + "step": 19135 + }, + { + "epoch": 0.16610966918689943, + "grad_norm": 0.1435546875, + "learning_rate": 0.0018429476145341171, + "loss": 0.1167, + "step": 19136 + }, + { + "epoch": 0.1661183496671036, + "grad_norm": 0.396484375, + "learning_rate": 0.0018429308160189159, + "loss": 0.1016, + "step": 19137 + }, + { + "epoch": 0.16612703014730776, + "grad_norm": 0.302734375, + "learning_rate": 0.0018429140166912566, + "loss": 0.1445, + "step": 19138 + }, + { + "epoch": 0.16613571062751192, + "grad_norm": 0.185546875, + "learning_rate": 0.0018428972165511575, + "loss": 0.1895, + "step": 19139 + }, + { + "epoch": 0.1661443911077161, + "grad_norm": 0.154296875, + "learning_rate": 0.0018428804155986374, + "loss": 0.126, + "step": 19140 + }, + { + "epoch": 0.16615307158792025, + "grad_norm": 0.0693359375, + "learning_rate": 0.0018428636138337145, + "loss": 0.0747, + "step": 19141 + }, + { + "epoch": 0.16616175206812442, + "grad_norm": 0.1494140625, + "learning_rate": 0.0018428468112564069, + "loss": 0.1201, + "step": 19142 + }, + { + "epoch": 0.16617043254832858, + "grad_norm": 0.609375, + "learning_rate": 0.0018428300078667333, + "loss": 0.123, + "step": 19143 + }, + { + "epoch": 0.16617911302853275, + "grad_norm": 0.234375, + "learning_rate": 0.001842813203664712, + "loss": 0.1133, + "step": 19144 + }, + { + "epoch": 0.16618779350873691, + "grad_norm": 0.1220703125, + "learning_rate": 0.0018427963986503613, + "loss": 0.1123, + "step": 19145 + }, + { + "epoch": 0.16619647398894108, + "grad_norm": 0.3046875, + "learning_rate": 0.0018427795928236998, + "loss": 0.1045, + "step": 19146 + }, + { + "epoch": 0.16620515446914524, + 
"grad_norm": 0.400390625, + "learning_rate": 0.0018427627861847453, + "loss": 0.0981, + "step": 19147 + }, + { + "epoch": 0.1662138349493494, + "grad_norm": 0.1171875, + "learning_rate": 0.0018427459787335168, + "loss": 0.1328, + "step": 19148 + }, + { + "epoch": 0.16622251542955357, + "grad_norm": 0.349609375, + "learning_rate": 0.0018427291704700326, + "loss": 0.1094, + "step": 19149 + }, + { + "epoch": 0.16623119590975774, + "grad_norm": 0.7578125, + "learning_rate": 0.0018427123613943108, + "loss": 0.1855, + "step": 19150 + }, + { + "epoch": 0.1662398763899619, + "grad_norm": 0.08544921875, + "learning_rate": 0.0018426955515063702, + "loss": 0.1279, + "step": 19151 + }, + { + "epoch": 0.16624855687016607, + "grad_norm": 0.73046875, + "learning_rate": 0.0018426787408062281, + "loss": 0.0967, + "step": 19152 + }, + { + "epoch": 0.16625723735037023, + "grad_norm": 0.09619140625, + "learning_rate": 0.0018426619292939044, + "loss": 0.0957, + "step": 19153 + }, + { + "epoch": 0.1662659178305744, + "grad_norm": 0.228515625, + "learning_rate": 0.0018426451169694165, + "loss": 0.0854, + "step": 19154 + }, + { + "epoch": 0.16627459831077857, + "grad_norm": 0.11669921875, + "learning_rate": 0.0018426283038327832, + "loss": 0.0791, + "step": 19155 + }, + { + "epoch": 0.16628327879098273, + "grad_norm": 0.1240234375, + "learning_rate": 0.0018426114898840227, + "loss": 0.0947, + "step": 19156 + }, + { + "epoch": 0.1662919592711869, + "grad_norm": 0.56640625, + "learning_rate": 0.0018425946751231535, + "loss": 0.1318, + "step": 19157 + }, + { + "epoch": 0.16630063975139106, + "grad_norm": 0.1591796875, + "learning_rate": 0.0018425778595501936, + "loss": 0.1543, + "step": 19158 + }, + { + "epoch": 0.16630932023159523, + "grad_norm": 0.640625, + "learning_rate": 0.0018425610431651624, + "loss": 0.0986, + "step": 19159 + }, + { + "epoch": 0.16631800071179936, + "grad_norm": 0.8046875, + "learning_rate": 0.0018425442259680771, + "loss": 0.0996, + "step": 19160 + }, + { + "epoch": 0.16632668119200353, + "grad_norm": 0.435546875, + "learning_rate": 0.0018425274079589566, + "loss": 0.166, + "step": 19161 + }, + { + "epoch": 0.1663353616722077, + "grad_norm": 0.462890625, + "learning_rate": 0.0018425105891378192, + "loss": 0.1079, + "step": 19162 + }, + { + "epoch": 0.16634404215241186, + "grad_norm": 0.10986328125, + "learning_rate": 0.0018424937695046834, + "loss": 0.1357, + "step": 19163 + }, + { + "epoch": 0.16635272263261602, + "grad_norm": 0.1455078125, + "learning_rate": 0.001842476949059568, + "loss": 0.1484, + "step": 19164 + }, + { + "epoch": 0.1663614031128202, + "grad_norm": 0.46484375, + "learning_rate": 0.0018424601278024906, + "loss": 0.1387, + "step": 19165 + }, + { + "epoch": 0.16637008359302435, + "grad_norm": 0.51171875, + "learning_rate": 0.00184244330573347, + "loss": 0.0972, + "step": 19166 + }, + { + "epoch": 0.16637876407322852, + "grad_norm": 0.9453125, + "learning_rate": 0.0018424264828525245, + "loss": 0.1436, + "step": 19167 + }, + { + "epoch": 0.16638744455343268, + "grad_norm": 0.0986328125, + "learning_rate": 0.0018424096591596723, + "loss": 0.0977, + "step": 19168 + }, + { + "epoch": 0.16639612503363685, + "grad_norm": 0.51953125, + "learning_rate": 0.0018423928346549326, + "loss": 0.1191, + "step": 19169 + }, + { + "epoch": 0.16640480551384101, + "grad_norm": 0.95703125, + "learning_rate": 0.001842376009338323, + "loss": 0.1582, + "step": 19170 + }, + { + "epoch": 0.16641348599404518, + "grad_norm": 0.6796875, + "learning_rate": 0.001842359183209862, + "loss": 0.0977, + 
"step": 19171 + }, + { + "epoch": 0.16642216647424934, + "grad_norm": 0.12451171875, + "learning_rate": 0.0018423423562695684, + "loss": 0.1309, + "step": 19172 + }, + { + "epoch": 0.1664308469544535, + "grad_norm": 0.1279296875, + "learning_rate": 0.0018423255285174604, + "loss": 0.1348, + "step": 19173 + }, + { + "epoch": 0.16643952743465767, + "grad_norm": 0.234375, + "learning_rate": 0.001842308699953556, + "loss": 0.0981, + "step": 19174 + }, + { + "epoch": 0.16644820791486184, + "grad_norm": 0.76171875, + "learning_rate": 0.0018422918705778741, + "loss": 0.0669, + "step": 19175 + }, + { + "epoch": 0.166456888395066, + "grad_norm": 0.66015625, + "learning_rate": 0.0018422750403904334, + "loss": 0.1387, + "step": 19176 + }, + { + "epoch": 0.16646556887527017, + "grad_norm": 0.80859375, + "learning_rate": 0.0018422582093912513, + "loss": 0.3906, + "step": 19177 + }, + { + "epoch": 0.16647424935547434, + "grad_norm": 0.6015625, + "learning_rate": 0.001842241377580347, + "loss": 0.1172, + "step": 19178 + }, + { + "epoch": 0.1664829298356785, + "grad_norm": 0.259765625, + "learning_rate": 0.0018422245449577386, + "loss": 0.0933, + "step": 19179 + }, + { + "epoch": 0.16649161031588267, + "grad_norm": 0.87890625, + "learning_rate": 0.0018422077115234448, + "loss": 0.0918, + "step": 19180 + }, + { + "epoch": 0.16650029079608683, + "grad_norm": 0.1943359375, + "learning_rate": 0.0018421908772774836, + "loss": 0.1494, + "step": 19181 + }, + { + "epoch": 0.166508971276291, + "grad_norm": 0.427734375, + "learning_rate": 0.0018421740422198737, + "loss": 0.1328, + "step": 19182 + }, + { + "epoch": 0.16651765175649516, + "grad_norm": 0.8671875, + "learning_rate": 0.0018421572063506333, + "loss": 0.1504, + "step": 19183 + }, + { + "epoch": 0.16652633223669933, + "grad_norm": 0.0888671875, + "learning_rate": 0.001842140369669781, + "loss": 0.1235, + "step": 19184 + }, + { + "epoch": 0.1665350127169035, + "grad_norm": 0.255859375, + "learning_rate": 0.0018421235321773352, + "loss": 0.1348, + "step": 19185 + }, + { + "epoch": 0.16654369319710766, + "grad_norm": 0.33203125, + "learning_rate": 0.001842106693873314, + "loss": 0.1445, + "step": 19186 + }, + { + "epoch": 0.16655237367731182, + "grad_norm": 0.67578125, + "learning_rate": 0.0018420898547577365, + "loss": 0.1309, + "step": 19187 + }, + { + "epoch": 0.166561054157516, + "grad_norm": 0.08935546875, + "learning_rate": 0.0018420730148306205, + "loss": 0.124, + "step": 19188 + }, + { + "epoch": 0.16656973463772015, + "grad_norm": 0.248046875, + "learning_rate": 0.0018420561740919847, + "loss": 0.1377, + "step": 19189 + }, + { + "epoch": 0.16657841511792432, + "grad_norm": 0.13671875, + "learning_rate": 0.001842039332541847, + "loss": 0.1084, + "step": 19190 + }, + { + "epoch": 0.16658709559812848, + "grad_norm": 0.1669921875, + "learning_rate": 0.0018420224901802267, + "loss": 0.1445, + "step": 19191 + }, + { + "epoch": 0.16659577607833265, + "grad_norm": 0.28125, + "learning_rate": 0.0018420056470071415, + "loss": 0.1055, + "step": 19192 + }, + { + "epoch": 0.1666044565585368, + "grad_norm": 0.466796875, + "learning_rate": 0.00184198880302261, + "loss": 0.1279, + "step": 19193 + }, + { + "epoch": 0.16661313703874098, + "grad_norm": 0.2734375, + "learning_rate": 0.001841971958226651, + "loss": 0.0908, + "step": 19194 + }, + { + "epoch": 0.16662181751894514, + "grad_norm": 0.6953125, + "learning_rate": 0.0018419551126192825, + "loss": 0.0771, + "step": 19195 + }, + { + "epoch": 0.1666304979991493, + "grad_norm": 0.47265625, + "learning_rate": 
0.0018419382662005228, + "loss": 0.0947, + "step": 19196 + }, + { + "epoch": 0.16663917847935347, + "grad_norm": 0.095703125, + "learning_rate": 0.0018419214189703908, + "loss": 0.1074, + "step": 19197 + }, + { + "epoch": 0.16664785895955764, + "grad_norm": 0.3515625, + "learning_rate": 0.0018419045709289045, + "loss": 0.1455, + "step": 19198 + }, + { + "epoch": 0.1666565394397618, + "grad_norm": 0.09619140625, + "learning_rate": 0.0018418877220760827, + "loss": 0.0957, + "step": 19199 + }, + { + "epoch": 0.16666521991996597, + "grad_norm": 0.3046875, + "learning_rate": 0.0018418708724119437, + "loss": 0.1016, + "step": 19200 + }, + { + "epoch": 0.16667390040017013, + "grad_norm": 0.3828125, + "learning_rate": 0.001841854021936506, + "loss": 0.1621, + "step": 19201 + }, + { + "epoch": 0.1666825808803743, + "grad_norm": 0.490234375, + "learning_rate": 0.0018418371706497873, + "loss": 0.1338, + "step": 19202 + }, + { + "epoch": 0.16669126136057846, + "grad_norm": 0.154296875, + "learning_rate": 0.001841820318551807, + "loss": 0.1084, + "step": 19203 + }, + { + "epoch": 0.16669994184078263, + "grad_norm": 0.60546875, + "learning_rate": 0.0018418034656425828, + "loss": 0.0889, + "step": 19204 + }, + { + "epoch": 0.1667086223209868, + "grad_norm": 0.87890625, + "learning_rate": 0.001841786611922134, + "loss": 0.123, + "step": 19205 + }, + { + "epoch": 0.16671730280119096, + "grad_norm": 0.2490234375, + "learning_rate": 0.001841769757390478, + "loss": 0.0986, + "step": 19206 + }, + { + "epoch": 0.16672598328139512, + "grad_norm": 0.291015625, + "learning_rate": 0.0018417529020476342, + "loss": 0.1055, + "step": 19207 + }, + { + "epoch": 0.1667346637615993, + "grad_norm": 0.373046875, + "learning_rate": 0.0018417360458936206, + "loss": 0.1641, + "step": 19208 + }, + { + "epoch": 0.16674334424180345, + "grad_norm": 0.341796875, + "learning_rate": 0.0018417191889284553, + "loss": 0.1123, + "step": 19209 + }, + { + "epoch": 0.16675202472200762, + "grad_norm": 0.197265625, + "learning_rate": 0.001841702331152157, + "loss": 0.1699, + "step": 19210 + }, + { + "epoch": 0.16676070520221178, + "grad_norm": 0.29296875, + "learning_rate": 0.0018416854725647443, + "loss": 0.0845, + "step": 19211 + }, + { + "epoch": 0.16676938568241595, + "grad_norm": 0.103515625, + "learning_rate": 0.0018416686131662355, + "loss": 0.1494, + "step": 19212 + }, + { + "epoch": 0.16677806616262011, + "grad_norm": 0.4296875, + "learning_rate": 0.001841651752956649, + "loss": 0.1582, + "step": 19213 + }, + { + "epoch": 0.16678674664282428, + "grad_norm": 0.302734375, + "learning_rate": 0.0018416348919360033, + "loss": 0.1572, + "step": 19214 + }, + { + "epoch": 0.16679542712302844, + "grad_norm": 0.1904296875, + "learning_rate": 0.001841618030104317, + "loss": 0.1484, + "step": 19215 + }, + { + "epoch": 0.1668041076032326, + "grad_norm": 0.30859375, + "learning_rate": 0.001841601167461608, + "loss": 0.0903, + "step": 19216 + }, + { + "epoch": 0.16681278808343677, + "grad_norm": 0.39453125, + "learning_rate": 0.0018415843040078955, + "loss": 0.0898, + "step": 19217 + }, + { + "epoch": 0.16682146856364094, + "grad_norm": 0.318359375, + "learning_rate": 0.0018415674397431977, + "loss": 0.1094, + "step": 19218 + }, + { + "epoch": 0.1668301490438451, + "grad_norm": 0.19140625, + "learning_rate": 0.0018415505746675323, + "loss": 0.0923, + "step": 19219 + }, + { + "epoch": 0.16683882952404927, + "grad_norm": 0.1767578125, + "learning_rate": 0.0018415337087809185, + "loss": 0.1191, + "step": 19220 + }, + { + "epoch": 0.16684751000425344, 
+ "grad_norm": 0.259765625, + "learning_rate": 0.0018415168420833747, + "loss": 0.124, + "step": 19221 + }, + { + "epoch": 0.1668561904844576, + "grad_norm": 0.3359375, + "learning_rate": 0.0018414999745749192, + "loss": 0.1084, + "step": 19222 + }, + { + "epoch": 0.16686487096466177, + "grad_norm": 0.2490234375, + "learning_rate": 0.0018414831062555706, + "loss": 0.1221, + "step": 19223 + }, + { + "epoch": 0.16687355144486593, + "grad_norm": 0.134765625, + "learning_rate": 0.001841466237125347, + "loss": 0.1045, + "step": 19224 + }, + { + "epoch": 0.1668822319250701, + "grad_norm": 0.1611328125, + "learning_rate": 0.0018414493671842671, + "loss": 0.1045, + "step": 19225 + }, + { + "epoch": 0.16689091240527426, + "grad_norm": 0.11328125, + "learning_rate": 0.0018414324964323495, + "loss": 0.0854, + "step": 19226 + }, + { + "epoch": 0.16689959288547843, + "grad_norm": 0.26171875, + "learning_rate": 0.0018414156248696123, + "loss": 0.1094, + "step": 19227 + }, + { + "epoch": 0.1669082733656826, + "grad_norm": 0.1591796875, + "learning_rate": 0.001841398752496074, + "loss": 0.0796, + "step": 19228 + }, + { + "epoch": 0.16691695384588676, + "grad_norm": 0.71484375, + "learning_rate": 0.0018413818793117534, + "loss": 0.1426, + "step": 19229 + }, + { + "epoch": 0.16692563432609092, + "grad_norm": 0.80859375, + "learning_rate": 0.0018413650053166685, + "loss": 0.1172, + "step": 19230 + }, + { + "epoch": 0.1669343148062951, + "grad_norm": 0.30078125, + "learning_rate": 0.0018413481305108383, + "loss": 0.0908, + "step": 19231 + }, + { + "epoch": 0.16694299528649925, + "grad_norm": 0.30078125, + "learning_rate": 0.0018413312548942807, + "loss": 0.1138, + "step": 19232 + }, + { + "epoch": 0.16695167576670342, + "grad_norm": 0.828125, + "learning_rate": 0.0018413143784670144, + "loss": 0.1006, + "step": 19233 + }, + { + "epoch": 0.16696035624690758, + "grad_norm": 0.11181640625, + "learning_rate": 0.0018412975012290578, + "loss": 0.1143, + "step": 19234 + }, + { + "epoch": 0.16696903672711175, + "grad_norm": 0.154296875, + "learning_rate": 0.0018412806231804296, + "loss": 0.125, + "step": 19235 + }, + { + "epoch": 0.1669777172073159, + "grad_norm": 0.130859375, + "learning_rate": 0.0018412637443211476, + "loss": 0.1064, + "step": 19236 + }, + { + "epoch": 0.16698639768752008, + "grad_norm": 0.09912109375, + "learning_rate": 0.001841246864651231, + "loss": 0.126, + "step": 19237 + }, + { + "epoch": 0.16699507816772424, + "grad_norm": 0.080078125, + "learning_rate": 0.001841229984170698, + "loss": 0.0713, + "step": 19238 + }, + { + "epoch": 0.1670037586479284, + "grad_norm": 0.431640625, + "learning_rate": 0.0018412131028795668, + "loss": 0.1074, + "step": 19239 + }, + { + "epoch": 0.16701243912813257, + "grad_norm": 0.185546875, + "learning_rate": 0.0018411962207778565, + "loss": 0.0908, + "step": 19240 + }, + { + "epoch": 0.16702111960833674, + "grad_norm": 0.1591796875, + "learning_rate": 0.0018411793378655845, + "loss": 0.1064, + "step": 19241 + }, + { + "epoch": 0.1670298000885409, + "grad_norm": 0.10107421875, + "learning_rate": 0.0018411624541427704, + "loss": 0.1172, + "step": 19242 + }, + { + "epoch": 0.16703848056874507, + "grad_norm": 0.427734375, + "learning_rate": 0.001841145569609432, + "loss": 0.1191, + "step": 19243 + }, + { + "epoch": 0.16704716104894923, + "grad_norm": 0.125, + "learning_rate": 0.001841128684265588, + "loss": 0.1133, + "step": 19244 + }, + { + "epoch": 0.1670558415291534, + "grad_norm": 0.087890625, + "learning_rate": 0.001841111798111257, + "loss": 0.1104, + "step": 
19245 + }, + { + "epoch": 0.16706452200935756, + "grad_norm": 0.1494140625, + "learning_rate": 0.0018410949111464573, + "loss": 0.0957, + "step": 19246 + }, + { + "epoch": 0.16707320248956173, + "grad_norm": 0.12158203125, + "learning_rate": 0.001841078023371207, + "loss": 0.105, + "step": 19247 + }, + { + "epoch": 0.1670818829697659, + "grad_norm": 0.2119140625, + "learning_rate": 0.0018410611347855251, + "loss": 0.1123, + "step": 19248 + }, + { + "epoch": 0.16709056344997006, + "grad_norm": 0.2470703125, + "learning_rate": 0.00184104424538943, + "loss": 0.0957, + "step": 19249 + }, + { + "epoch": 0.16709924393017422, + "grad_norm": 0.40625, + "learning_rate": 0.0018410273551829396, + "loss": 0.1523, + "step": 19250 + }, + { + "epoch": 0.1671079244103784, + "grad_norm": 0.5546875, + "learning_rate": 0.0018410104641660731, + "loss": 0.1338, + "step": 19251 + }, + { + "epoch": 0.16711660489058255, + "grad_norm": 0.08935546875, + "learning_rate": 0.0018409935723388488, + "loss": 0.1089, + "step": 19252 + }, + { + "epoch": 0.16712528537078672, + "grad_norm": 0.259765625, + "learning_rate": 0.001840976679701285, + "loss": 0.1177, + "step": 19253 + }, + { + "epoch": 0.16713396585099088, + "grad_norm": 0.2421875, + "learning_rate": 0.0018409597862534, + "loss": 0.0996, + "step": 19254 + }, + { + "epoch": 0.16714264633119505, + "grad_norm": 0.32421875, + "learning_rate": 0.0018409428919952129, + "loss": 0.1099, + "step": 19255 + }, + { + "epoch": 0.16715132681139921, + "grad_norm": 0.1953125, + "learning_rate": 0.0018409259969267414, + "loss": 0.127, + "step": 19256 + }, + { + "epoch": 0.16716000729160338, + "grad_norm": 0.181640625, + "learning_rate": 0.0018409091010480045, + "loss": 0.1582, + "step": 19257 + }, + { + "epoch": 0.16716868777180754, + "grad_norm": 0.404296875, + "learning_rate": 0.0018408922043590206, + "loss": 0.1357, + "step": 19258 + }, + { + "epoch": 0.1671773682520117, + "grad_norm": 0.42578125, + "learning_rate": 0.0018408753068598082, + "loss": 0.1367, + "step": 19259 + }, + { + "epoch": 0.16718604873221588, + "grad_norm": 0.251953125, + "learning_rate": 0.0018408584085503855, + "loss": 0.1191, + "step": 19260 + }, + { + "epoch": 0.16719472921242004, + "grad_norm": 0.462890625, + "learning_rate": 0.0018408415094307714, + "loss": 0.123, + "step": 19261 + }, + { + "epoch": 0.1672034096926242, + "grad_norm": 0.259765625, + "learning_rate": 0.0018408246095009842, + "loss": 0.1187, + "step": 19262 + }, + { + "epoch": 0.16721209017282837, + "grad_norm": 0.3203125, + "learning_rate": 0.001840807708761042, + "loss": 0.1592, + "step": 19263 + }, + { + "epoch": 0.16722077065303254, + "grad_norm": 0.306640625, + "learning_rate": 0.0018407908072109638, + "loss": 0.0942, + "step": 19264 + }, + { + "epoch": 0.1672294511332367, + "grad_norm": 0.1201171875, + "learning_rate": 0.0018407739048507679, + "loss": 0.0977, + "step": 19265 + }, + { + "epoch": 0.16723813161344087, + "grad_norm": 0.275390625, + "learning_rate": 0.0018407570016804728, + "loss": 0.1152, + "step": 19266 + }, + { + "epoch": 0.16724681209364503, + "grad_norm": 0.60546875, + "learning_rate": 0.0018407400977000967, + "loss": 0.1074, + "step": 19267 + }, + { + "epoch": 0.1672554925738492, + "grad_norm": 0.47265625, + "learning_rate": 0.0018407231929096588, + "loss": 0.1621, + "step": 19268 + }, + { + "epoch": 0.16726417305405336, + "grad_norm": 0.9140625, + "learning_rate": 0.0018407062873091768, + "loss": 0.1177, + "step": 19269 + }, + { + "epoch": 0.16727285353425753, + "grad_norm": 0.26953125, + "learning_rate": 
0.0018406893808986698, + "loss": 0.1191, + "step": 19270 + }, + { + "epoch": 0.1672815340144617, + "grad_norm": 0.396484375, + "learning_rate": 0.001840672473678156, + "loss": 0.126, + "step": 19271 + }, + { + "epoch": 0.16729021449466586, + "grad_norm": 0.419921875, + "learning_rate": 0.0018406555656476536, + "loss": 0.084, + "step": 19272 + }, + { + "epoch": 0.16729889497487002, + "grad_norm": 1.2890625, + "learning_rate": 0.0018406386568071817, + "loss": 0.0938, + "step": 19273 + }, + { + "epoch": 0.1673075754550742, + "grad_norm": 0.11181640625, + "learning_rate": 0.0018406217471567585, + "loss": 0.1064, + "step": 19274 + }, + { + "epoch": 0.16731625593527835, + "grad_norm": 0.55859375, + "learning_rate": 0.0018406048366964023, + "loss": 0.1133, + "step": 19275 + }, + { + "epoch": 0.16732493641548252, + "grad_norm": 0.2265625, + "learning_rate": 0.0018405879254261319, + "loss": 0.0771, + "step": 19276 + }, + { + "epoch": 0.16733361689568668, + "grad_norm": 0.76953125, + "learning_rate": 0.0018405710133459657, + "loss": 0.1338, + "step": 19277 + }, + { + "epoch": 0.16734229737589085, + "grad_norm": 0.25, + "learning_rate": 0.0018405541004559223, + "loss": 0.1426, + "step": 19278 + }, + { + "epoch": 0.167350977856095, + "grad_norm": 0.2060546875, + "learning_rate": 0.0018405371867560197, + "loss": 0.0942, + "step": 19279 + }, + { + "epoch": 0.16735965833629918, + "grad_norm": 0.58984375, + "learning_rate": 0.0018405202722462767, + "loss": 0.1631, + "step": 19280 + }, + { + "epoch": 0.16736833881650334, + "grad_norm": 0.357421875, + "learning_rate": 0.0018405033569267124, + "loss": 0.0986, + "step": 19281 + }, + { + "epoch": 0.1673770192967075, + "grad_norm": 0.1630859375, + "learning_rate": 0.0018404864407973445, + "loss": 0.1025, + "step": 19282 + }, + { + "epoch": 0.16738569977691165, + "grad_norm": 0.17578125, + "learning_rate": 0.0018404695238581916, + "loss": 0.1777, + "step": 19283 + }, + { + "epoch": 0.1673943802571158, + "grad_norm": 0.412109375, + "learning_rate": 0.0018404526061092724, + "loss": 0.1533, + "step": 19284 + }, + { + "epoch": 0.16740306073731998, + "grad_norm": 0.62109375, + "learning_rate": 0.0018404356875506055, + "loss": 0.1709, + "step": 19285 + }, + { + "epoch": 0.16741174121752414, + "grad_norm": 0.79296875, + "learning_rate": 0.0018404187681822092, + "loss": 0.1099, + "step": 19286 + }, + { + "epoch": 0.1674204216977283, + "grad_norm": 0.73046875, + "learning_rate": 0.001840401848004102, + "loss": 0.1309, + "step": 19287 + }, + { + "epoch": 0.16742910217793247, + "grad_norm": 0.490234375, + "learning_rate": 0.0018403849270163025, + "loss": 0.1196, + "step": 19288 + }, + { + "epoch": 0.16743778265813664, + "grad_norm": 0.171875, + "learning_rate": 0.0018403680052188293, + "loss": 0.1328, + "step": 19289 + }, + { + "epoch": 0.1674464631383408, + "grad_norm": 0.3984375, + "learning_rate": 0.0018403510826117005, + "loss": 0.0879, + "step": 19290 + }, + { + "epoch": 0.16745514361854497, + "grad_norm": 0.12451171875, + "learning_rate": 0.001840334159194935, + "loss": 0.1006, + "step": 19291 + }, + { + "epoch": 0.16746382409874913, + "grad_norm": 0.0771484375, + "learning_rate": 0.0018403172349685513, + "loss": 0.1113, + "step": 19292 + }, + { + "epoch": 0.1674725045789533, + "grad_norm": 0.98828125, + "learning_rate": 0.0018403003099325675, + "loss": 0.1162, + "step": 19293 + }, + { + "epoch": 0.16748118505915746, + "grad_norm": 0.26171875, + "learning_rate": 0.0018402833840870025, + "loss": 0.1069, + "step": 19294 + }, + { + "epoch": 0.16748986553936163, + 
"grad_norm": 0.2421875, + "learning_rate": 0.001840266457431875, + "loss": 0.1045, + "step": 19295 + }, + { + "epoch": 0.1674985460195658, + "grad_norm": 0.2060546875, + "learning_rate": 0.0018402495299672031, + "loss": 0.0908, + "step": 19296 + }, + { + "epoch": 0.16750722649976996, + "grad_norm": 0.1181640625, + "learning_rate": 0.0018402326016930051, + "loss": 0.0947, + "step": 19297 + }, + { + "epoch": 0.16751590697997412, + "grad_norm": 0.384765625, + "learning_rate": 0.0018402156726093002, + "loss": 0.0825, + "step": 19298 + }, + { + "epoch": 0.1675245874601783, + "grad_norm": 0.46484375, + "learning_rate": 0.0018401987427161064, + "loss": 0.1504, + "step": 19299 + }, + { + "epoch": 0.16753326794038245, + "grad_norm": 0.447265625, + "learning_rate": 0.0018401818120134426, + "loss": 0.0942, + "step": 19300 + }, + { + "epoch": 0.16754194842058662, + "grad_norm": 0.74609375, + "learning_rate": 0.001840164880501327, + "loss": 0.1167, + "step": 19301 + }, + { + "epoch": 0.16755062890079078, + "grad_norm": 0.095703125, + "learning_rate": 0.0018401479481797783, + "loss": 0.0771, + "step": 19302 + }, + { + "epoch": 0.16755930938099495, + "grad_norm": 0.2890625, + "learning_rate": 0.0018401310150488147, + "loss": 0.1348, + "step": 19303 + }, + { + "epoch": 0.1675679898611991, + "grad_norm": 0.57421875, + "learning_rate": 0.001840114081108455, + "loss": 0.1143, + "step": 19304 + }, + { + "epoch": 0.16757667034140328, + "grad_norm": 0.11376953125, + "learning_rate": 0.0018400971463587176, + "loss": 0.1279, + "step": 19305 + }, + { + "epoch": 0.16758535082160744, + "grad_norm": 0.09912109375, + "learning_rate": 0.0018400802107996213, + "loss": 0.1177, + "step": 19306 + }, + { + "epoch": 0.1675940313018116, + "grad_norm": 0.59765625, + "learning_rate": 0.0018400632744311844, + "loss": 0.125, + "step": 19307 + }, + { + "epoch": 0.16760271178201577, + "grad_norm": 0.283203125, + "learning_rate": 0.001840046337253425, + "loss": 0.1094, + "step": 19308 + }, + { + "epoch": 0.16761139226221994, + "grad_norm": 0.337890625, + "learning_rate": 0.0018400293992663623, + "loss": 0.1289, + "step": 19309 + }, + { + "epoch": 0.1676200727424241, + "grad_norm": 0.10400390625, + "learning_rate": 0.0018400124604700148, + "loss": 0.166, + "step": 19310 + }, + { + "epoch": 0.16762875322262827, + "grad_norm": 0.119140625, + "learning_rate": 0.0018399955208644005, + "loss": 0.1348, + "step": 19311 + }, + { + "epoch": 0.16763743370283243, + "grad_norm": 0.3828125, + "learning_rate": 0.0018399785804495386, + "loss": 0.1318, + "step": 19312 + }, + { + "epoch": 0.1676461141830366, + "grad_norm": 0.09814453125, + "learning_rate": 0.001839961639225447, + "loss": 0.1191, + "step": 19313 + }, + { + "epoch": 0.16765479466324076, + "grad_norm": 0.11669921875, + "learning_rate": 0.0018399446971921443, + "loss": 0.0967, + "step": 19314 + }, + { + "epoch": 0.16766347514344493, + "grad_norm": 0.357421875, + "learning_rate": 0.0018399277543496493, + "loss": 0.1328, + "step": 19315 + }, + { + "epoch": 0.1676721556236491, + "grad_norm": 0.4375, + "learning_rate": 0.0018399108106979803, + "loss": 0.1147, + "step": 19316 + }, + { + "epoch": 0.16768083610385326, + "grad_norm": 0.07421875, + "learning_rate": 0.0018398938662371562, + "loss": 0.0957, + "step": 19317 + }, + { + "epoch": 0.16768951658405742, + "grad_norm": 0.11376953125, + "learning_rate": 0.0018398769209671951, + "loss": 0.0962, + "step": 19318 + }, + { + "epoch": 0.1676981970642616, + "grad_norm": 0.0986328125, + "learning_rate": 0.0018398599748881159, + "loss": 0.0986, + 
"step": 19319 + }, + { + "epoch": 0.16770687754446575, + "grad_norm": 0.302734375, + "learning_rate": 0.0018398430279999366, + "loss": 0.0898, + "step": 19320 + }, + { + "epoch": 0.16771555802466992, + "grad_norm": 0.1640625, + "learning_rate": 0.0018398260803026765, + "loss": 0.0996, + "step": 19321 + }, + { + "epoch": 0.16772423850487408, + "grad_norm": 0.255859375, + "learning_rate": 0.0018398091317963538, + "loss": 0.1196, + "step": 19322 + }, + { + "epoch": 0.16773291898507825, + "grad_norm": 0.318359375, + "learning_rate": 0.0018397921824809865, + "loss": 0.0962, + "step": 19323 + }, + { + "epoch": 0.16774159946528241, + "grad_norm": 0.1279296875, + "learning_rate": 0.001839775232356594, + "loss": 0.1309, + "step": 19324 + }, + { + "epoch": 0.16775027994548658, + "grad_norm": 0.1357421875, + "learning_rate": 0.001839758281423194, + "loss": 0.1089, + "step": 19325 + }, + { + "epoch": 0.16775896042569075, + "grad_norm": 0.107421875, + "learning_rate": 0.0018397413296808056, + "loss": 0.0947, + "step": 19326 + }, + { + "epoch": 0.1677676409058949, + "grad_norm": 0.48828125, + "learning_rate": 0.001839724377129447, + "loss": 0.0942, + "step": 19327 + }, + { + "epoch": 0.16777632138609908, + "grad_norm": 0.234375, + "learning_rate": 0.0018397074237691373, + "loss": 0.1152, + "step": 19328 + }, + { + "epoch": 0.16778500186630324, + "grad_norm": 0.255859375, + "learning_rate": 0.001839690469599895, + "loss": 0.0615, + "step": 19329 + }, + { + "epoch": 0.1677936823465074, + "grad_norm": 0.0869140625, + "learning_rate": 0.0018396735146217376, + "loss": 0.1426, + "step": 19330 + }, + { + "epoch": 0.16780236282671157, + "grad_norm": 0.478515625, + "learning_rate": 0.0018396565588346848, + "loss": 0.0947, + "step": 19331 + }, + { + "epoch": 0.16781104330691574, + "grad_norm": 0.27734375, + "learning_rate": 0.0018396396022387542, + "loss": 0.1104, + "step": 19332 + }, + { + "epoch": 0.1678197237871199, + "grad_norm": 0.234375, + "learning_rate": 0.0018396226448339655, + "loss": 0.0835, + "step": 19333 + }, + { + "epoch": 0.16782840426732407, + "grad_norm": 0.111328125, + "learning_rate": 0.0018396056866203361, + "loss": 0.1162, + "step": 19334 + }, + { + "epoch": 0.16783708474752823, + "grad_norm": 0.296875, + "learning_rate": 0.0018395887275978852, + "loss": 0.0996, + "step": 19335 + }, + { + "epoch": 0.1678457652277324, + "grad_norm": 0.1376953125, + "learning_rate": 0.0018395717677666312, + "loss": 0.0957, + "step": 19336 + }, + { + "epoch": 0.16785444570793656, + "grad_norm": 0.625, + "learning_rate": 0.0018395548071265927, + "loss": 0.1006, + "step": 19337 + }, + { + "epoch": 0.16786312618814073, + "grad_norm": 0.296875, + "learning_rate": 0.0018395378456777882, + "loss": 0.1279, + "step": 19338 + }, + { + "epoch": 0.1678718066683449, + "grad_norm": 0.091796875, + "learning_rate": 0.001839520883420236, + "loss": 0.1172, + "step": 19339 + }, + { + "epoch": 0.16788048714854906, + "grad_norm": 0.28515625, + "learning_rate": 0.0018395039203539547, + "loss": 0.0791, + "step": 19340 + }, + { + "epoch": 0.16788916762875322, + "grad_norm": 1.0, + "learning_rate": 0.0018394869564789635, + "loss": 0.1621, + "step": 19341 + }, + { + "epoch": 0.1678978481089574, + "grad_norm": 0.48828125, + "learning_rate": 0.0018394699917952803, + "loss": 0.1699, + "step": 19342 + }, + { + "epoch": 0.16790652858916155, + "grad_norm": 0.091796875, + "learning_rate": 0.0018394530263029238, + "loss": 0.0898, + "step": 19343 + }, + { + "epoch": 0.16791520906936572, + "grad_norm": 0.1064453125, + "learning_rate": 
0.0018394360600019126, + "loss": 0.0879, + "step": 19344 + }, + { + "epoch": 0.16792388954956988, + "grad_norm": 0.4921875, + "learning_rate": 0.0018394190928922652, + "loss": 0.1182, + "step": 19345 + }, + { + "epoch": 0.16793257002977405, + "grad_norm": 0.15625, + "learning_rate": 0.0018394021249740003, + "loss": 0.0972, + "step": 19346 + }, + { + "epoch": 0.1679412505099782, + "grad_norm": 0.921875, + "learning_rate": 0.0018393851562471361, + "loss": 0.0996, + "step": 19347 + }, + { + "epoch": 0.16794993099018238, + "grad_norm": 0.337890625, + "learning_rate": 0.0018393681867116918, + "loss": 0.209, + "step": 19348 + }, + { + "epoch": 0.16795861147038654, + "grad_norm": 0.2041015625, + "learning_rate": 0.0018393512163676854, + "loss": 0.1787, + "step": 19349 + }, + { + "epoch": 0.1679672919505907, + "grad_norm": 0.4609375, + "learning_rate": 0.0018393342452151353, + "loss": 0.0869, + "step": 19350 + }, + { + "epoch": 0.16797597243079487, + "grad_norm": 0.11572265625, + "learning_rate": 0.0018393172732540607, + "loss": 0.1133, + "step": 19351 + }, + { + "epoch": 0.16798465291099904, + "grad_norm": 0.3671875, + "learning_rate": 0.0018393003004844796, + "loss": 0.1504, + "step": 19352 + }, + { + "epoch": 0.1679933333912032, + "grad_norm": 0.439453125, + "learning_rate": 0.0018392833269064112, + "loss": 0.126, + "step": 19353 + }, + { + "epoch": 0.16800201387140737, + "grad_norm": 0.3515625, + "learning_rate": 0.0018392663525198733, + "loss": 0.1494, + "step": 19354 + }, + { + "epoch": 0.16801069435161153, + "grad_norm": 0.4453125, + "learning_rate": 0.0018392493773248851, + "loss": 0.1084, + "step": 19355 + }, + { + "epoch": 0.1680193748318157, + "grad_norm": 0.201171875, + "learning_rate": 0.0018392324013214644, + "loss": 0.125, + "step": 19356 + }, + { + "epoch": 0.16802805531201986, + "grad_norm": 0.07275390625, + "learning_rate": 0.001839215424509631, + "loss": 0.084, + "step": 19357 + }, + { + "epoch": 0.16803673579222403, + "grad_norm": 0.5546875, + "learning_rate": 0.001839198446889402, + "loss": 0.1436, + "step": 19358 + }, + { + "epoch": 0.1680454162724282, + "grad_norm": 0.65234375, + "learning_rate": 0.0018391814684607971, + "loss": 0.1157, + "step": 19359 + }, + { + "epoch": 0.16805409675263236, + "grad_norm": 0.37890625, + "learning_rate": 0.0018391644892238343, + "loss": 0.1387, + "step": 19360 + }, + { + "epoch": 0.16806277723283652, + "grad_norm": 0.259765625, + "learning_rate": 0.0018391475091785324, + "loss": 0.0967, + "step": 19361 + }, + { + "epoch": 0.1680714577130407, + "grad_norm": 0.287109375, + "learning_rate": 0.00183913052832491, + "loss": 0.1279, + "step": 19362 + }, + { + "epoch": 0.16808013819324485, + "grad_norm": 0.25, + "learning_rate": 0.0018391135466629854, + "loss": 0.1035, + "step": 19363 + }, + { + "epoch": 0.16808881867344902, + "grad_norm": 0.138671875, + "learning_rate": 0.0018390965641927773, + "loss": 0.1602, + "step": 19364 + }, + { + "epoch": 0.16809749915365318, + "grad_norm": 0.95703125, + "learning_rate": 0.0018390795809143043, + "loss": 0.1367, + "step": 19365 + }, + { + "epoch": 0.16810617963385735, + "grad_norm": 0.5078125, + "learning_rate": 0.001839062596827585, + "loss": 0.1182, + "step": 19366 + }, + { + "epoch": 0.16811486011406152, + "grad_norm": 0.337890625, + "learning_rate": 0.0018390456119326381, + "loss": 0.1055, + "step": 19367 + }, + { + "epoch": 0.16812354059426568, + "grad_norm": 0.4375, + "learning_rate": 0.0018390286262294818, + "loss": 0.1895, + "step": 19368 + }, + { + "epoch": 0.16813222107446985, + "grad_norm": 
0.216796875, + "learning_rate": 0.001839011639718135, + "loss": 0.0879, + "step": 19369 + }, + { + "epoch": 0.168140901554674, + "grad_norm": 0.16796875, + "learning_rate": 0.0018389946523986159, + "loss": 0.1201, + "step": 19370 + }, + { + "epoch": 0.16814958203487818, + "grad_norm": 0.2197265625, + "learning_rate": 0.0018389776642709437, + "loss": 0.1328, + "step": 19371 + }, + { + "epoch": 0.16815826251508234, + "grad_norm": 0.345703125, + "learning_rate": 0.001838960675335137, + "loss": 0.1152, + "step": 19372 + }, + { + "epoch": 0.1681669429952865, + "grad_norm": 0.419921875, + "learning_rate": 0.0018389436855912131, + "loss": 0.1045, + "step": 19373 + }, + { + "epoch": 0.16817562347549067, + "grad_norm": 1.1484375, + "learning_rate": 0.001838926695039192, + "loss": 0.1572, + "step": 19374 + }, + { + "epoch": 0.16818430395569484, + "grad_norm": 0.2080078125, + "learning_rate": 0.0018389097036790917, + "loss": 0.0928, + "step": 19375 + }, + { + "epoch": 0.168192984435899, + "grad_norm": 0.1865234375, + "learning_rate": 0.0018388927115109309, + "loss": 0.1118, + "step": 19376 + }, + { + "epoch": 0.16820166491610317, + "grad_norm": 0.1845703125, + "learning_rate": 0.0018388757185347283, + "loss": 0.1396, + "step": 19377 + }, + { + "epoch": 0.16821034539630733, + "grad_norm": 0.1953125, + "learning_rate": 0.0018388587247505019, + "loss": 0.127, + "step": 19378 + }, + { + "epoch": 0.1682190258765115, + "grad_norm": 0.18359375, + "learning_rate": 0.001838841730158271, + "loss": 0.1387, + "step": 19379 + }, + { + "epoch": 0.16822770635671566, + "grad_norm": 0.44921875, + "learning_rate": 0.0018388247347580538, + "loss": 0.1191, + "step": 19380 + }, + { + "epoch": 0.16823638683691983, + "grad_norm": 0.09619140625, + "learning_rate": 0.0018388077385498689, + "loss": 0.1289, + "step": 19381 + }, + { + "epoch": 0.168245067317124, + "grad_norm": 0.306640625, + "learning_rate": 0.0018387907415337351, + "loss": 0.123, + "step": 19382 + }, + { + "epoch": 0.16825374779732816, + "grad_norm": 0.423828125, + "learning_rate": 0.0018387737437096706, + "loss": 0.0923, + "step": 19383 + }, + { + "epoch": 0.16826242827753232, + "grad_norm": 0.1474609375, + "learning_rate": 0.0018387567450776944, + "loss": 0.0923, + "step": 19384 + }, + { + "epoch": 0.1682711087577365, + "grad_norm": 0.51953125, + "learning_rate": 0.001838739745637825, + "loss": 0.1543, + "step": 19385 + }, + { + "epoch": 0.16827978923794065, + "grad_norm": 0.734375, + "learning_rate": 0.0018387227453900807, + "loss": 0.1641, + "step": 19386 + }, + { + "epoch": 0.16828846971814482, + "grad_norm": 0.458984375, + "learning_rate": 0.0018387057443344805, + "loss": 0.0908, + "step": 19387 + }, + { + "epoch": 0.16829715019834898, + "grad_norm": 0.10009765625, + "learning_rate": 0.0018386887424710425, + "loss": 0.1328, + "step": 19388 + }, + { + "epoch": 0.16830583067855315, + "grad_norm": 0.1376953125, + "learning_rate": 0.0018386717397997862, + "loss": 0.1123, + "step": 19389 + }, + { + "epoch": 0.1683145111587573, + "grad_norm": 0.287109375, + "learning_rate": 0.001838654736320729, + "loss": 0.1318, + "step": 19390 + }, + { + "epoch": 0.16832319163896148, + "grad_norm": 0.66796875, + "learning_rate": 0.0018386377320338903, + "loss": 0.1221, + "step": 19391 + }, + { + "epoch": 0.16833187211916564, + "grad_norm": 0.5859375, + "learning_rate": 0.0018386207269392887, + "loss": 0.1191, + "step": 19392 + }, + { + "epoch": 0.1683405525993698, + "grad_norm": 0.083984375, + "learning_rate": 0.0018386037210369422, + "loss": 0.0928, + "step": 19393 + }, + { 
+ "epoch": 0.16834923307957397, + "grad_norm": 0.52734375, + "learning_rate": 0.0018385867143268697, + "loss": 0.1289, + "step": 19394 + }, + { + "epoch": 0.16835791355977814, + "grad_norm": 0.86328125, + "learning_rate": 0.0018385697068090904, + "loss": 0.1562, + "step": 19395 + }, + { + "epoch": 0.1683665940399823, + "grad_norm": 0.09130859375, + "learning_rate": 0.0018385526984836219, + "loss": 0.105, + "step": 19396 + }, + { + "epoch": 0.16837527452018647, + "grad_norm": 0.1005859375, + "learning_rate": 0.0018385356893504832, + "loss": 0.0781, + "step": 19397 + }, + { + "epoch": 0.16838395500039063, + "grad_norm": 0.2099609375, + "learning_rate": 0.0018385186794096934, + "loss": 0.1816, + "step": 19398 + }, + { + "epoch": 0.1683926354805948, + "grad_norm": 0.2578125, + "learning_rate": 0.0018385016686612704, + "loss": 0.1021, + "step": 19399 + }, + { + "epoch": 0.16840131596079896, + "grad_norm": 0.265625, + "learning_rate": 0.0018384846571052333, + "loss": 0.1309, + "step": 19400 + }, + { + "epoch": 0.16840999644100313, + "grad_norm": 0.609375, + "learning_rate": 0.0018384676447416, + "loss": 0.0967, + "step": 19401 + }, + { + "epoch": 0.1684186769212073, + "grad_norm": 0.10595703125, + "learning_rate": 0.0018384506315703899, + "loss": 0.0693, + "step": 19402 + }, + { + "epoch": 0.16842735740141146, + "grad_norm": 0.37890625, + "learning_rate": 0.0018384336175916215, + "loss": 0.1123, + "step": 19403 + }, + { + "epoch": 0.16843603788161562, + "grad_norm": 0.45703125, + "learning_rate": 0.0018384166028053127, + "loss": 0.1143, + "step": 19404 + }, + { + "epoch": 0.1684447183618198, + "grad_norm": 0.267578125, + "learning_rate": 0.0018383995872114827, + "loss": 0.1025, + "step": 19405 + }, + { + "epoch": 0.16845339884202393, + "grad_norm": 0.408203125, + "learning_rate": 0.0018383825708101502, + "loss": 0.0938, + "step": 19406 + }, + { + "epoch": 0.1684620793222281, + "grad_norm": 0.625, + "learning_rate": 0.0018383655536013338, + "loss": 0.0957, + "step": 19407 + }, + { + "epoch": 0.16847075980243226, + "grad_norm": 0.44140625, + "learning_rate": 0.0018383485355850519, + "loss": 0.0874, + "step": 19408 + }, + { + "epoch": 0.16847944028263642, + "grad_norm": 0.8203125, + "learning_rate": 0.0018383315167613227, + "loss": 0.1406, + "step": 19409 + }, + { + "epoch": 0.1684881207628406, + "grad_norm": 0.16796875, + "learning_rate": 0.0018383144971301653, + "loss": 0.1094, + "step": 19410 + }, + { + "epoch": 0.16849680124304475, + "grad_norm": 0.083984375, + "learning_rate": 0.0018382974766915985, + "loss": 0.1328, + "step": 19411 + }, + { + "epoch": 0.16850548172324892, + "grad_norm": 0.126953125, + "learning_rate": 0.0018382804554456405, + "loss": 0.1787, + "step": 19412 + }, + { + "epoch": 0.16851416220345308, + "grad_norm": 0.21875, + "learning_rate": 0.0018382634333923102, + "loss": 0.1406, + "step": 19413 + }, + { + "epoch": 0.16852284268365725, + "grad_norm": 0.75, + "learning_rate": 0.001838246410531626, + "loss": 0.1309, + "step": 19414 + }, + { + "epoch": 0.1685315231638614, + "grad_norm": 0.333984375, + "learning_rate": 0.001838229386863607, + "loss": 0.0908, + "step": 19415 + }, + { + "epoch": 0.16854020364406558, + "grad_norm": 0.5546875, + "learning_rate": 0.0018382123623882712, + "loss": 0.1182, + "step": 19416 + }, + { + "epoch": 0.16854888412426974, + "grad_norm": 0.234375, + "learning_rate": 0.001838195337105637, + "loss": 0.0874, + "step": 19417 + }, + { + "epoch": 0.1685575646044739, + "grad_norm": 0.193359375, + "learning_rate": 0.0018381783110157238, + "loss": 0.1348, + 
"step": 19418 + }, + { + "epoch": 0.16856624508467807, + "grad_norm": 0.1103515625, + "learning_rate": 0.00183816128411855, + "loss": 0.0859, + "step": 19419 + }, + { + "epoch": 0.16857492556488224, + "grad_norm": 0.1796875, + "learning_rate": 0.0018381442564141342, + "loss": 0.1523, + "step": 19420 + }, + { + "epoch": 0.1685836060450864, + "grad_norm": 0.1953125, + "learning_rate": 0.0018381272279024948, + "loss": 0.0972, + "step": 19421 + }, + { + "epoch": 0.16859228652529057, + "grad_norm": 0.765625, + "learning_rate": 0.0018381101985836506, + "loss": 0.1113, + "step": 19422 + }, + { + "epoch": 0.16860096700549473, + "grad_norm": 0.28125, + "learning_rate": 0.0018380931684576199, + "loss": 0.123, + "step": 19423 + }, + { + "epoch": 0.1686096474856989, + "grad_norm": 0.28125, + "learning_rate": 0.0018380761375244219, + "loss": 0.0752, + "step": 19424 + }, + { + "epoch": 0.16861832796590306, + "grad_norm": 0.08447265625, + "learning_rate": 0.001838059105784075, + "loss": 0.1123, + "step": 19425 + }, + { + "epoch": 0.16862700844610723, + "grad_norm": 0.33203125, + "learning_rate": 0.0018380420732365974, + "loss": 0.1182, + "step": 19426 + }, + { + "epoch": 0.1686356889263114, + "grad_norm": 0.2197265625, + "learning_rate": 0.0018380250398820084, + "loss": 0.1001, + "step": 19427 + }, + { + "epoch": 0.16864436940651556, + "grad_norm": 0.10693359375, + "learning_rate": 0.001838008005720326, + "loss": 0.1157, + "step": 19428 + }, + { + "epoch": 0.16865304988671972, + "grad_norm": 0.32421875, + "learning_rate": 0.0018379909707515695, + "loss": 0.1279, + "step": 19429 + }, + { + "epoch": 0.1686617303669239, + "grad_norm": 0.2333984375, + "learning_rate": 0.0018379739349757569, + "loss": 0.0967, + "step": 19430 + }, + { + "epoch": 0.16867041084712805, + "grad_norm": 0.671875, + "learning_rate": 0.0018379568983929071, + "loss": 0.1221, + "step": 19431 + }, + { + "epoch": 0.16867909132733222, + "grad_norm": 0.474609375, + "learning_rate": 0.0018379398610030386, + "loss": 0.0801, + "step": 19432 + }, + { + "epoch": 0.16868777180753639, + "grad_norm": 0.2099609375, + "learning_rate": 0.0018379228228061707, + "loss": 0.1099, + "step": 19433 + }, + { + "epoch": 0.16869645228774055, + "grad_norm": 0.06982421875, + "learning_rate": 0.0018379057838023211, + "loss": 0.0977, + "step": 19434 + }, + { + "epoch": 0.16870513276794472, + "grad_norm": 0.0712890625, + "learning_rate": 0.0018378887439915088, + "loss": 0.1104, + "step": 19435 + }, + { + "epoch": 0.16871381324814888, + "grad_norm": 0.1669921875, + "learning_rate": 0.0018378717033737525, + "loss": 0.1279, + "step": 19436 + }, + { + "epoch": 0.16872249372835305, + "grad_norm": 0.328125, + "learning_rate": 0.0018378546619490707, + "loss": 0.1426, + "step": 19437 + }, + { + "epoch": 0.1687311742085572, + "grad_norm": 0.56640625, + "learning_rate": 0.0018378376197174823, + "loss": 0.0962, + "step": 19438 + }, + { + "epoch": 0.16873985468876138, + "grad_norm": 1.5546875, + "learning_rate": 0.0018378205766790058, + "loss": 0.127, + "step": 19439 + }, + { + "epoch": 0.16874853516896554, + "grad_norm": 0.32421875, + "learning_rate": 0.0018378035328336596, + "loss": 0.125, + "step": 19440 + }, + { + "epoch": 0.1687572156491697, + "grad_norm": 0.10986328125, + "learning_rate": 0.0018377864881814626, + "loss": 0.1089, + "step": 19441 + }, + { + "epoch": 0.16876589612937387, + "grad_norm": 0.392578125, + "learning_rate": 0.0018377694427224333, + "loss": 0.1172, + "step": 19442 + }, + { + "epoch": 0.16877457660957804, + "grad_norm": 0.279296875, + 
"learning_rate": 0.0018377523964565904, + "loss": 0.1162, + "step": 19443 + }, + { + "epoch": 0.1687832570897822, + "grad_norm": 0.1953125, + "learning_rate": 0.001837735349383953, + "loss": 0.1211, + "step": 19444 + }, + { + "epoch": 0.16879193756998637, + "grad_norm": 0.37109375, + "learning_rate": 0.001837718301504539, + "loss": 0.0801, + "step": 19445 + }, + { + "epoch": 0.16880061805019053, + "grad_norm": 0.10009765625, + "learning_rate": 0.0018377012528183673, + "loss": 0.1035, + "step": 19446 + }, + { + "epoch": 0.1688092985303947, + "grad_norm": 0.125, + "learning_rate": 0.0018376842033254565, + "loss": 0.1113, + "step": 19447 + }, + { + "epoch": 0.16881797901059886, + "grad_norm": 0.69921875, + "learning_rate": 0.0018376671530258258, + "loss": 0.1387, + "step": 19448 + }, + { + "epoch": 0.16882665949080303, + "grad_norm": 0.28515625, + "learning_rate": 0.0018376501019194932, + "loss": 0.127, + "step": 19449 + }, + { + "epoch": 0.1688353399710072, + "grad_norm": 0.078125, + "learning_rate": 0.0018376330500064772, + "loss": 0.085, + "step": 19450 + }, + { + "epoch": 0.16884402045121136, + "grad_norm": 0.390625, + "learning_rate": 0.0018376159972867969, + "loss": 0.1621, + "step": 19451 + }, + { + "epoch": 0.16885270093141552, + "grad_norm": 0.5078125, + "learning_rate": 0.001837598943760471, + "loss": 0.0972, + "step": 19452 + }, + { + "epoch": 0.1688613814116197, + "grad_norm": 0.443359375, + "learning_rate": 0.001837581889427518, + "loss": 0.1216, + "step": 19453 + }, + { + "epoch": 0.16887006189182385, + "grad_norm": 0.7109375, + "learning_rate": 0.0018375648342879563, + "loss": 0.1045, + "step": 19454 + }, + { + "epoch": 0.16887874237202802, + "grad_norm": 0.283203125, + "learning_rate": 0.001837547778341805, + "loss": 0.1226, + "step": 19455 + }, + { + "epoch": 0.16888742285223218, + "grad_norm": 0.267578125, + "learning_rate": 0.0018375307215890823, + "loss": 0.0864, + "step": 19456 + }, + { + "epoch": 0.16889610333243635, + "grad_norm": 0.1318359375, + "learning_rate": 0.0018375136640298073, + "loss": 0.1211, + "step": 19457 + }, + { + "epoch": 0.1689047838126405, + "grad_norm": 0.0947265625, + "learning_rate": 0.0018374966056639984, + "loss": 0.1074, + "step": 19458 + }, + { + "epoch": 0.16891346429284468, + "grad_norm": 0.43359375, + "learning_rate": 0.0018374795464916745, + "loss": 0.123, + "step": 19459 + }, + { + "epoch": 0.16892214477304884, + "grad_norm": 0.07958984375, + "learning_rate": 0.0018374624865128534, + "loss": 0.106, + "step": 19460 + }, + { + "epoch": 0.168930825253253, + "grad_norm": 0.71484375, + "learning_rate": 0.001837445425727555, + "loss": 0.1504, + "step": 19461 + }, + { + "epoch": 0.16893950573345717, + "grad_norm": 0.1259765625, + "learning_rate": 0.0018374283641357971, + "loss": 0.0913, + "step": 19462 + }, + { + "epoch": 0.16894818621366134, + "grad_norm": 0.1708984375, + "learning_rate": 0.0018374113017375986, + "loss": 0.1426, + "step": 19463 + }, + { + "epoch": 0.1689568666938655, + "grad_norm": 1.2265625, + "learning_rate": 0.0018373942385329785, + "loss": 0.1133, + "step": 19464 + }, + { + "epoch": 0.16896554717406967, + "grad_norm": 0.5546875, + "learning_rate": 0.0018373771745219547, + "loss": 0.0957, + "step": 19465 + }, + { + "epoch": 0.16897422765427383, + "grad_norm": 0.66796875, + "learning_rate": 0.0018373601097045468, + "loss": 0.123, + "step": 19466 + }, + { + "epoch": 0.168982908134478, + "grad_norm": 0.16015625, + "learning_rate": 0.0018373430440807726, + "loss": 0.0947, + "step": 19467 + }, + { + "epoch": 0.16899158861468216, + 
"grad_norm": 0.37890625, + "learning_rate": 0.001837325977650651, + "loss": 0.126, + "step": 19468 + }, + { + "epoch": 0.16900026909488633, + "grad_norm": 0.0908203125, + "learning_rate": 0.0018373089104142008, + "loss": 0.1328, + "step": 19469 + }, + { + "epoch": 0.1690089495750905, + "grad_norm": 0.50390625, + "learning_rate": 0.001837291842371441, + "loss": 0.1133, + "step": 19470 + }, + { + "epoch": 0.16901763005529466, + "grad_norm": 0.50390625, + "learning_rate": 0.0018372747735223899, + "loss": 0.1816, + "step": 19471 + }, + { + "epoch": 0.16902631053549882, + "grad_norm": 1.7734375, + "learning_rate": 0.001837257703867066, + "loss": 0.166, + "step": 19472 + }, + { + "epoch": 0.169034991015703, + "grad_norm": 0.474609375, + "learning_rate": 0.0018372406334054882, + "loss": 0.1543, + "step": 19473 + }, + { + "epoch": 0.16904367149590716, + "grad_norm": 0.1669921875, + "learning_rate": 0.0018372235621376752, + "loss": 0.1318, + "step": 19474 + }, + { + "epoch": 0.16905235197611132, + "grad_norm": 0.7421875, + "learning_rate": 0.0018372064900636456, + "loss": 0.1089, + "step": 19475 + }, + { + "epoch": 0.16906103245631549, + "grad_norm": 0.06591796875, + "learning_rate": 0.0018371894171834177, + "loss": 0.0732, + "step": 19476 + }, + { + "epoch": 0.16906971293651965, + "grad_norm": 0.115234375, + "learning_rate": 0.001837172343497011, + "loss": 0.1436, + "step": 19477 + }, + { + "epoch": 0.16907839341672382, + "grad_norm": 0.1787109375, + "learning_rate": 0.0018371552690044435, + "loss": 0.1133, + "step": 19478 + }, + { + "epoch": 0.16908707389692798, + "grad_norm": 0.255859375, + "learning_rate": 0.001837138193705734, + "loss": 0.1016, + "step": 19479 + }, + { + "epoch": 0.16909575437713215, + "grad_norm": 0.150390625, + "learning_rate": 0.0018371211176009012, + "loss": 0.0938, + "step": 19480 + }, + { + "epoch": 0.1691044348573363, + "grad_norm": 0.103515625, + "learning_rate": 0.001837104040689964, + "loss": 0.1016, + "step": 19481 + }, + { + "epoch": 0.16911311533754048, + "grad_norm": 0.60546875, + "learning_rate": 0.001837086962972941, + "loss": 0.103, + "step": 19482 + }, + { + "epoch": 0.16912179581774464, + "grad_norm": 0.7578125, + "learning_rate": 0.0018370698844498506, + "loss": 0.1377, + "step": 19483 + }, + { + "epoch": 0.1691304762979488, + "grad_norm": 0.6640625, + "learning_rate": 0.0018370528051207117, + "loss": 0.125, + "step": 19484 + }, + { + "epoch": 0.16913915677815297, + "grad_norm": 0.26171875, + "learning_rate": 0.0018370357249855426, + "loss": 0.0981, + "step": 19485 + }, + { + "epoch": 0.16914783725835714, + "grad_norm": 0.3984375, + "learning_rate": 0.0018370186440443625, + "loss": 0.123, + "step": 19486 + }, + { + "epoch": 0.1691565177385613, + "grad_norm": 0.1875, + "learning_rate": 0.0018370015622971901, + "loss": 0.1006, + "step": 19487 + }, + { + "epoch": 0.16916519821876547, + "grad_norm": 0.419921875, + "learning_rate": 0.0018369844797440438, + "loss": 0.1582, + "step": 19488 + }, + { + "epoch": 0.16917387869896963, + "grad_norm": 0.5859375, + "learning_rate": 0.0018369673963849426, + "loss": 0.126, + "step": 19489 + }, + { + "epoch": 0.1691825591791738, + "grad_norm": 0.5, + "learning_rate": 0.0018369503122199042, + "loss": 0.1123, + "step": 19490 + }, + { + "epoch": 0.16919123965937796, + "grad_norm": 0.423828125, + "learning_rate": 0.0018369332272489486, + "loss": 0.1514, + "step": 19491 + }, + { + "epoch": 0.16919992013958213, + "grad_norm": 0.7578125, + "learning_rate": 0.0018369161414720937, + "loss": 0.1787, + "step": 19492 + }, + { + "epoch": 
0.1692086006197863, + "grad_norm": 0.275390625, + "learning_rate": 0.0018368990548893582, + "loss": 0.0913, + "step": 19493 + }, + { + "epoch": 0.16921728109999046, + "grad_norm": 0.142578125, + "learning_rate": 0.0018368819675007612, + "loss": 0.1055, + "step": 19494 + }, + { + "epoch": 0.16922596158019462, + "grad_norm": 0.0751953125, + "learning_rate": 0.0018368648793063212, + "loss": 0.0894, + "step": 19495 + }, + { + "epoch": 0.1692346420603988, + "grad_norm": 0.30078125, + "learning_rate": 0.0018368477903060567, + "loss": 0.1045, + "step": 19496 + }, + { + "epoch": 0.16924332254060295, + "grad_norm": 0.12109375, + "learning_rate": 0.0018368307004999866, + "loss": 0.0869, + "step": 19497 + }, + { + "epoch": 0.16925200302080712, + "grad_norm": 0.08447265625, + "learning_rate": 0.0018368136098881294, + "loss": 0.1504, + "step": 19498 + }, + { + "epoch": 0.16926068350101128, + "grad_norm": 0.119140625, + "learning_rate": 0.0018367965184705043, + "loss": 0.0747, + "step": 19499 + }, + { + "epoch": 0.16926936398121545, + "grad_norm": 0.205078125, + "learning_rate": 0.0018367794262471291, + "loss": 0.0918, + "step": 19500 + }, + { + "epoch": 0.1692780444614196, + "grad_norm": 0.1484375, + "learning_rate": 0.0018367623332180235, + "loss": 0.1689, + "step": 19501 + }, + { + "epoch": 0.16928672494162378, + "grad_norm": 0.0830078125, + "learning_rate": 0.0018367452393832052, + "loss": 0.1094, + "step": 19502 + }, + { + "epoch": 0.16929540542182794, + "grad_norm": 0.2578125, + "learning_rate": 0.0018367281447426932, + "loss": 0.1406, + "step": 19503 + }, + { + "epoch": 0.1693040859020321, + "grad_norm": 0.53125, + "learning_rate": 0.001836711049296507, + "loss": 0.0918, + "step": 19504 + }, + { + "epoch": 0.16931276638223627, + "grad_norm": 0.333984375, + "learning_rate": 0.0018366939530446644, + "loss": 0.1104, + "step": 19505 + }, + { + "epoch": 0.16932144686244044, + "grad_norm": 0.27734375, + "learning_rate": 0.0018366768559871843, + "loss": 0.1118, + "step": 19506 + }, + { + "epoch": 0.1693301273426446, + "grad_norm": 0.33203125, + "learning_rate": 0.0018366597581240855, + "loss": 0.124, + "step": 19507 + }, + { + "epoch": 0.16933880782284877, + "grad_norm": 0.2578125, + "learning_rate": 0.0018366426594553867, + "loss": 0.1172, + "step": 19508 + }, + { + "epoch": 0.16934748830305293, + "grad_norm": 0.1064453125, + "learning_rate": 0.0018366255599811063, + "loss": 0.127, + "step": 19509 + }, + { + "epoch": 0.1693561687832571, + "grad_norm": 0.2119140625, + "learning_rate": 0.0018366084597012638, + "loss": 0.1123, + "step": 19510 + }, + { + "epoch": 0.16936484926346126, + "grad_norm": 0.1494140625, + "learning_rate": 0.0018365913586158768, + "loss": 0.1001, + "step": 19511 + }, + { + "epoch": 0.16937352974366543, + "grad_norm": 0.296875, + "learning_rate": 0.0018365742567249647, + "loss": 0.0771, + "step": 19512 + }, + { + "epoch": 0.1693822102238696, + "grad_norm": 0.8515625, + "learning_rate": 0.0018365571540285464, + "loss": 0.1221, + "step": 19513 + }, + { + "epoch": 0.16939089070407376, + "grad_norm": 0.380859375, + "learning_rate": 0.0018365400505266402, + "loss": 0.1445, + "step": 19514 + }, + { + "epoch": 0.16939957118427793, + "grad_norm": 0.314453125, + "learning_rate": 0.0018365229462192643, + "loss": 0.1084, + "step": 19515 + }, + { + "epoch": 0.1694082516644821, + "grad_norm": 0.482421875, + "learning_rate": 0.0018365058411064386, + "loss": 0.0967, + "step": 19516 + }, + { + "epoch": 0.16941693214468626, + "grad_norm": 0.212890625, + "learning_rate": 0.0018364887351881808, + 
"loss": 0.0771, + "step": 19517 + }, + { + "epoch": 0.16942561262489042, + "grad_norm": 0.1279296875, + "learning_rate": 0.00183647162846451, + "loss": 0.1436, + "step": 19518 + }, + { + "epoch": 0.16943429310509459, + "grad_norm": 0.2578125, + "learning_rate": 0.001836454520935445, + "loss": 0.0737, + "step": 19519 + }, + { + "epoch": 0.16944297358529875, + "grad_norm": 0.318359375, + "learning_rate": 0.0018364374126010046, + "loss": 0.126, + "step": 19520 + }, + { + "epoch": 0.16945165406550292, + "grad_norm": 0.11669921875, + "learning_rate": 0.001836420303461207, + "loss": 0.1074, + "step": 19521 + }, + { + "epoch": 0.16946033454570708, + "grad_norm": 0.263671875, + "learning_rate": 0.0018364031935160713, + "loss": 0.2031, + "step": 19522 + }, + { + "epoch": 0.16946901502591125, + "grad_norm": 0.302734375, + "learning_rate": 0.0018363860827656162, + "loss": 0.0938, + "step": 19523 + }, + { + "epoch": 0.1694776955061154, + "grad_norm": 0.65234375, + "learning_rate": 0.0018363689712098603, + "loss": 0.1055, + "step": 19524 + }, + { + "epoch": 0.16948637598631958, + "grad_norm": 0.58984375, + "learning_rate": 0.0018363518588488223, + "loss": 0.125, + "step": 19525 + }, + { + "epoch": 0.16949505646652374, + "grad_norm": 0.6171875, + "learning_rate": 0.001836334745682521, + "loss": 0.0942, + "step": 19526 + }, + { + "epoch": 0.1695037369467279, + "grad_norm": 0.34375, + "learning_rate": 0.001836317631710975, + "loss": 0.1025, + "step": 19527 + }, + { + "epoch": 0.16951241742693207, + "grad_norm": 0.953125, + "learning_rate": 0.0018363005169342033, + "loss": 0.0698, + "step": 19528 + }, + { + "epoch": 0.1695210979071362, + "grad_norm": 0.423828125, + "learning_rate": 0.0018362834013522244, + "loss": 0.1396, + "step": 19529 + }, + { + "epoch": 0.16952977838734037, + "grad_norm": 0.287109375, + "learning_rate": 0.001836266284965057, + "loss": 0.0811, + "step": 19530 + }, + { + "epoch": 0.16953845886754454, + "grad_norm": 0.62890625, + "learning_rate": 0.00183624916777272, + "loss": 0.1426, + "step": 19531 + }, + { + "epoch": 0.1695471393477487, + "grad_norm": 0.16796875, + "learning_rate": 0.0018362320497752318, + "loss": 0.1465, + "step": 19532 + }, + { + "epoch": 0.16955581982795287, + "grad_norm": 0.119140625, + "learning_rate": 0.0018362149309726114, + "loss": 0.1211, + "step": 19533 + }, + { + "epoch": 0.16956450030815703, + "grad_norm": 0.298828125, + "learning_rate": 0.0018361978113648771, + "loss": 0.1196, + "step": 19534 + }, + { + "epoch": 0.1695731807883612, + "grad_norm": 1.015625, + "learning_rate": 0.0018361806909520484, + "loss": 0.1045, + "step": 19535 + }, + { + "epoch": 0.16958186126856536, + "grad_norm": 0.490234375, + "learning_rate": 0.001836163569734143, + "loss": 0.1309, + "step": 19536 + }, + { + "epoch": 0.16959054174876953, + "grad_norm": 0.431640625, + "learning_rate": 0.0018361464477111807, + "loss": 0.1182, + "step": 19537 + }, + { + "epoch": 0.1695992222289737, + "grad_norm": 0.1259765625, + "learning_rate": 0.0018361293248831793, + "loss": 0.0811, + "step": 19538 + }, + { + "epoch": 0.16960790270917786, + "grad_norm": 0.890625, + "learning_rate": 0.001836112201250158, + "loss": 0.1377, + "step": 19539 + }, + { + "epoch": 0.16961658318938203, + "grad_norm": 0.421875, + "learning_rate": 0.0018360950768121357, + "loss": 0.125, + "step": 19540 + }, + { + "epoch": 0.1696252636695862, + "grad_norm": 0.2041015625, + "learning_rate": 0.0018360779515691308, + "loss": 0.0845, + "step": 19541 + }, + { + "epoch": 0.16963394414979036, + "grad_norm": 0.40625, + 
"learning_rate": 0.001836060825521162, + "loss": 0.1416, + "step": 19542 + }, + { + "epoch": 0.16964262462999452, + "grad_norm": 0.50390625, + "learning_rate": 0.0018360436986682483, + "loss": 0.1094, + "step": 19543 + }, + { + "epoch": 0.16965130511019869, + "grad_norm": 0.302734375, + "learning_rate": 0.0018360265710104082, + "loss": 0.1079, + "step": 19544 + }, + { + "epoch": 0.16965998559040285, + "grad_norm": 0.2138671875, + "learning_rate": 0.0018360094425476606, + "loss": 0.1426, + "step": 19545 + }, + { + "epoch": 0.16966866607060702, + "grad_norm": 0.40234375, + "learning_rate": 0.0018359923132800237, + "loss": 0.1079, + "step": 19546 + }, + { + "epoch": 0.16967734655081118, + "grad_norm": 0.29296875, + "learning_rate": 0.0018359751832075172, + "loss": 0.1367, + "step": 19547 + }, + { + "epoch": 0.16968602703101535, + "grad_norm": 0.56640625, + "learning_rate": 0.001835958052330159, + "loss": 0.0996, + "step": 19548 + }, + { + "epoch": 0.1696947075112195, + "grad_norm": 0.27734375, + "learning_rate": 0.0018359409206479683, + "loss": 0.083, + "step": 19549 + }, + { + "epoch": 0.16970338799142368, + "grad_norm": 0.201171875, + "learning_rate": 0.0018359237881609635, + "loss": 0.0967, + "step": 19550 + }, + { + "epoch": 0.16971206847162784, + "grad_norm": 0.65234375, + "learning_rate": 0.0018359066548691635, + "loss": 0.1084, + "step": 19551 + }, + { + "epoch": 0.169720748951832, + "grad_norm": 0.130859375, + "learning_rate": 0.0018358895207725872, + "loss": 0.0806, + "step": 19552 + }, + { + "epoch": 0.16972942943203617, + "grad_norm": 0.365234375, + "learning_rate": 0.0018358723858712532, + "loss": 0.1084, + "step": 19553 + }, + { + "epoch": 0.16973810991224034, + "grad_norm": 0.6015625, + "learning_rate": 0.00183585525016518, + "loss": 0.0996, + "step": 19554 + }, + { + "epoch": 0.1697467903924445, + "grad_norm": 0.1064453125, + "learning_rate": 0.0018358381136543867, + "loss": 0.1221, + "step": 19555 + }, + { + "epoch": 0.16975547087264867, + "grad_norm": 0.5, + "learning_rate": 0.0018358209763388918, + "loss": 0.2852, + "step": 19556 + }, + { + "epoch": 0.16976415135285283, + "grad_norm": 0.15234375, + "learning_rate": 0.0018358038382187143, + "loss": 0.2285, + "step": 19557 + }, + { + "epoch": 0.169772831833057, + "grad_norm": 0.58984375, + "learning_rate": 0.0018357866992938726, + "loss": 0.125, + "step": 19558 + }, + { + "epoch": 0.16978151231326116, + "grad_norm": 0.30859375, + "learning_rate": 0.0018357695595643858, + "loss": 0.0742, + "step": 19559 + }, + { + "epoch": 0.16979019279346533, + "grad_norm": 0.1640625, + "learning_rate": 0.001835752419030272, + "loss": 0.0854, + "step": 19560 + }, + { + "epoch": 0.1697988732736695, + "grad_norm": 0.73828125, + "learning_rate": 0.001835735277691551, + "loss": 0.1455, + "step": 19561 + }, + { + "epoch": 0.16980755375387366, + "grad_norm": 0.0849609375, + "learning_rate": 0.0018357181355482409, + "loss": 0.1138, + "step": 19562 + }, + { + "epoch": 0.16981623423407782, + "grad_norm": 0.5078125, + "learning_rate": 0.00183570099260036, + "loss": 0.1094, + "step": 19563 + }, + { + "epoch": 0.169824914714282, + "grad_norm": 0.2353515625, + "learning_rate": 0.001835683848847928, + "loss": 0.1338, + "step": 19564 + }, + { + "epoch": 0.16983359519448615, + "grad_norm": 0.09033203125, + "learning_rate": 0.001835666704290963, + "loss": 0.0854, + "step": 19565 + }, + { + "epoch": 0.16984227567469032, + "grad_norm": 0.1416015625, + "learning_rate": 0.001835649558929484, + "loss": 0.105, + "step": 19566 + }, + { + "epoch": 0.16985095615489448, 
+ "grad_norm": 0.640625, + "learning_rate": 0.0018356324127635098, + "loss": 0.1064, + "step": 19567 + }, + { + "epoch": 0.16985963663509865, + "grad_norm": 0.087890625, + "learning_rate": 0.0018356152657930589, + "loss": 0.0825, + "step": 19568 + }, + { + "epoch": 0.1698683171153028, + "grad_norm": 0.26953125, + "learning_rate": 0.0018355981180181503, + "loss": 0.0986, + "step": 19569 + }, + { + "epoch": 0.16987699759550698, + "grad_norm": 0.11572265625, + "learning_rate": 0.0018355809694388023, + "loss": 0.1045, + "step": 19570 + }, + { + "epoch": 0.16988567807571114, + "grad_norm": 0.984375, + "learning_rate": 0.0018355638200550346, + "loss": 0.1406, + "step": 19571 + }, + { + "epoch": 0.1698943585559153, + "grad_norm": 0.357421875, + "learning_rate": 0.001835546669866865, + "loss": 0.127, + "step": 19572 + }, + { + "epoch": 0.16990303903611947, + "grad_norm": 0.5, + "learning_rate": 0.001835529518874313, + "loss": 0.1328, + "step": 19573 + }, + { + "epoch": 0.16991171951632364, + "grad_norm": 0.1513671875, + "learning_rate": 0.0018355123670773966, + "loss": 0.1924, + "step": 19574 + }, + { + "epoch": 0.1699203999965278, + "grad_norm": 0.1904296875, + "learning_rate": 0.0018354952144761347, + "loss": 0.106, + "step": 19575 + }, + { + "epoch": 0.16992908047673197, + "grad_norm": 0.1728515625, + "learning_rate": 0.0018354780610705466, + "loss": 0.1069, + "step": 19576 + }, + { + "epoch": 0.16993776095693613, + "grad_norm": 0.125, + "learning_rate": 0.0018354609068606507, + "loss": 0.1108, + "step": 19577 + }, + { + "epoch": 0.1699464414371403, + "grad_norm": 0.3203125, + "learning_rate": 0.0018354437518464657, + "loss": 0.1074, + "step": 19578 + }, + { + "epoch": 0.16995512191734446, + "grad_norm": 0.2255859375, + "learning_rate": 0.0018354265960280108, + "loss": 0.1045, + "step": 19579 + }, + { + "epoch": 0.16996380239754863, + "grad_norm": 0.291015625, + "learning_rate": 0.0018354094394053041, + "loss": 0.1074, + "step": 19580 + }, + { + "epoch": 0.1699724828777528, + "grad_norm": 0.12353515625, + "learning_rate": 0.0018353922819783646, + "loss": 0.0659, + "step": 19581 + }, + { + "epoch": 0.16998116335795696, + "grad_norm": 0.1533203125, + "learning_rate": 0.0018353751237472117, + "loss": 0.1108, + "step": 19582 + }, + { + "epoch": 0.16998984383816113, + "grad_norm": 0.6640625, + "learning_rate": 0.001835357964711863, + "loss": 0.1494, + "step": 19583 + }, + { + "epoch": 0.1699985243183653, + "grad_norm": 0.158203125, + "learning_rate": 0.001835340804872338, + "loss": 0.1108, + "step": 19584 + }, + { + "epoch": 0.17000720479856946, + "grad_norm": 0.369140625, + "learning_rate": 0.0018353236442286557, + "loss": 0.1406, + "step": 19585 + }, + { + "epoch": 0.17001588527877362, + "grad_norm": 0.4296875, + "learning_rate": 0.001835306482780834, + "loss": 0.1328, + "step": 19586 + }, + { + "epoch": 0.17002456575897779, + "grad_norm": 0.08935546875, + "learning_rate": 0.0018352893205288929, + "loss": 0.0894, + "step": 19587 + }, + { + "epoch": 0.17003324623918195, + "grad_norm": 0.8046875, + "learning_rate": 0.0018352721574728496, + "loss": 0.1157, + "step": 19588 + }, + { + "epoch": 0.17004192671938612, + "grad_norm": 0.0888671875, + "learning_rate": 0.0018352549936127241, + "loss": 0.1279, + "step": 19589 + }, + { + "epoch": 0.17005060719959028, + "grad_norm": 0.2392578125, + "learning_rate": 0.001835237828948535, + "loss": 0.1309, + "step": 19590 + }, + { + "epoch": 0.17005928767979445, + "grad_norm": 0.361328125, + "learning_rate": 0.0018352206634803007, + "loss": 0.1089, + "step": 19591 
+ }, + { + "epoch": 0.1700679681599986, + "grad_norm": 0.1884765625, + "learning_rate": 0.0018352034972080398, + "loss": 0.1118, + "step": 19592 + }, + { + "epoch": 0.17007664864020278, + "grad_norm": 0.2001953125, + "learning_rate": 0.0018351863301317717, + "loss": 0.0786, + "step": 19593 + }, + { + "epoch": 0.17008532912040694, + "grad_norm": 0.275390625, + "learning_rate": 0.0018351691622515152, + "loss": 0.0845, + "step": 19594 + }, + { + "epoch": 0.1700940096006111, + "grad_norm": 0.1064453125, + "learning_rate": 0.0018351519935672883, + "loss": 0.1367, + "step": 19595 + }, + { + "epoch": 0.17010269008081527, + "grad_norm": 0.16015625, + "learning_rate": 0.00183513482407911, + "loss": 0.1128, + "step": 19596 + }, + { + "epoch": 0.17011137056101944, + "grad_norm": 0.3203125, + "learning_rate": 0.001835117653787, + "loss": 0.1143, + "step": 19597 + }, + { + "epoch": 0.1701200510412236, + "grad_norm": 0.337890625, + "learning_rate": 0.001835100482690976, + "loss": 0.0762, + "step": 19598 + }, + { + "epoch": 0.17012873152142777, + "grad_norm": 0.6640625, + "learning_rate": 0.001835083310791057, + "loss": 0.2793, + "step": 19599 + }, + { + "epoch": 0.17013741200163193, + "grad_norm": 0.265625, + "learning_rate": 0.0018350661380872621, + "loss": 0.1074, + "step": 19600 + }, + { + "epoch": 0.1701460924818361, + "grad_norm": 0.515625, + "learning_rate": 0.00183504896457961, + "loss": 0.1426, + "step": 19601 + }, + { + "epoch": 0.17015477296204026, + "grad_norm": 0.1611328125, + "learning_rate": 0.0018350317902681194, + "loss": 0.124, + "step": 19602 + }, + { + "epoch": 0.17016345344224443, + "grad_norm": 0.58984375, + "learning_rate": 0.0018350146151528092, + "loss": 0.1309, + "step": 19603 + }, + { + "epoch": 0.1701721339224486, + "grad_norm": 0.546875, + "learning_rate": 0.0018349974392336977, + "loss": 0.0967, + "step": 19604 + }, + { + "epoch": 0.17018081440265276, + "grad_norm": 0.51953125, + "learning_rate": 0.001834980262510804, + "loss": 0.1064, + "step": 19605 + }, + { + "epoch": 0.17018949488285692, + "grad_norm": 0.255859375, + "learning_rate": 0.0018349630849841471, + "loss": 0.1074, + "step": 19606 + }, + { + "epoch": 0.1701981753630611, + "grad_norm": 0.65234375, + "learning_rate": 0.001834945906653746, + "loss": 0.123, + "step": 19607 + }, + { + "epoch": 0.17020685584326525, + "grad_norm": 1.1015625, + "learning_rate": 0.0018349287275196188, + "loss": 0.1621, + "step": 19608 + }, + { + "epoch": 0.17021553632346942, + "grad_norm": 0.34765625, + "learning_rate": 0.0018349115475817844, + "loss": 0.1309, + "step": 19609 + }, + { + "epoch": 0.17022421680367358, + "grad_norm": 0.1298828125, + "learning_rate": 0.0018348943668402618, + "loss": 0.1099, + "step": 19610 + }, + { + "epoch": 0.17023289728387775, + "grad_norm": 0.419921875, + "learning_rate": 0.0018348771852950698, + "loss": 0.0991, + "step": 19611 + }, + { + "epoch": 0.17024157776408191, + "grad_norm": 0.8046875, + "learning_rate": 0.0018348600029462275, + "loss": 0.6641, + "step": 19612 + }, + { + "epoch": 0.17025025824428608, + "grad_norm": 0.380859375, + "learning_rate": 0.0018348428197937528, + "loss": 0.1211, + "step": 19613 + }, + { + "epoch": 0.17025893872449024, + "grad_norm": 0.265625, + "learning_rate": 0.0018348256358376653, + "loss": 0.1128, + "step": 19614 + }, + { + "epoch": 0.1702676192046944, + "grad_norm": 0.130859375, + "learning_rate": 0.0018348084510779834, + "loss": 0.083, + "step": 19615 + }, + { + "epoch": 0.17027629968489857, + "grad_norm": 0.9296875, + "learning_rate": 0.0018347912655147262, + 
"loss": 0.1406, + "step": 19616 + }, + { + "epoch": 0.17028498016510274, + "grad_norm": 0.275390625, + "learning_rate": 0.0018347740791479124, + "loss": 0.1152, + "step": 19617 + }, + { + "epoch": 0.1702936606453069, + "grad_norm": 1.65625, + "learning_rate": 0.0018347568919775603, + "loss": 0.0918, + "step": 19618 + }, + { + "epoch": 0.17030234112551107, + "grad_norm": 0.21484375, + "learning_rate": 0.0018347397040036895, + "loss": 0.1396, + "step": 19619 + }, + { + "epoch": 0.17031102160571523, + "grad_norm": 0.1875, + "learning_rate": 0.0018347225152263184, + "loss": 0.1104, + "step": 19620 + }, + { + "epoch": 0.1703197020859194, + "grad_norm": 0.271484375, + "learning_rate": 0.0018347053256454657, + "loss": 0.1914, + "step": 19621 + }, + { + "epoch": 0.17032838256612357, + "grad_norm": 0.51953125, + "learning_rate": 0.00183468813526115, + "loss": 0.082, + "step": 19622 + }, + { + "epoch": 0.17033706304632773, + "grad_norm": 0.1796875, + "learning_rate": 0.0018346709440733907, + "loss": 0.0781, + "step": 19623 + }, + { + "epoch": 0.1703457435265319, + "grad_norm": 0.2431640625, + "learning_rate": 0.0018346537520822065, + "loss": 0.1113, + "step": 19624 + }, + { + "epoch": 0.17035442400673606, + "grad_norm": 0.26171875, + "learning_rate": 0.0018346365592876157, + "loss": 0.0947, + "step": 19625 + }, + { + "epoch": 0.17036310448694023, + "grad_norm": 0.2353515625, + "learning_rate": 0.0018346193656896372, + "loss": 0.0898, + "step": 19626 + }, + { + "epoch": 0.1703717849671444, + "grad_norm": 0.255859375, + "learning_rate": 0.0018346021712882903, + "loss": 0.1279, + "step": 19627 + }, + { + "epoch": 0.17038046544734856, + "grad_norm": 0.67578125, + "learning_rate": 0.0018345849760835935, + "loss": 0.125, + "step": 19628 + }, + { + "epoch": 0.17038914592755272, + "grad_norm": 0.1376953125, + "learning_rate": 0.0018345677800755656, + "loss": 0.0806, + "step": 19629 + }, + { + "epoch": 0.17039782640775689, + "grad_norm": 0.53125, + "learning_rate": 0.0018345505832642252, + "loss": 0.1035, + "step": 19630 + }, + { + "epoch": 0.17040650688796105, + "grad_norm": 0.08251953125, + "learning_rate": 0.0018345333856495914, + "loss": 0.0845, + "step": 19631 + }, + { + "epoch": 0.17041518736816522, + "grad_norm": 0.12060546875, + "learning_rate": 0.001834516187231683, + "loss": 0.1191, + "step": 19632 + }, + { + "epoch": 0.17042386784836938, + "grad_norm": 0.275390625, + "learning_rate": 0.0018344989880105188, + "loss": 0.0859, + "step": 19633 + }, + { + "epoch": 0.17043254832857355, + "grad_norm": 0.10400390625, + "learning_rate": 0.0018344817879861175, + "loss": 0.125, + "step": 19634 + }, + { + "epoch": 0.1704412288087777, + "grad_norm": 0.142578125, + "learning_rate": 0.0018344645871584979, + "loss": 0.1064, + "step": 19635 + }, + { + "epoch": 0.17044990928898188, + "grad_norm": 0.158203125, + "learning_rate": 0.001834447385527679, + "loss": 0.1045, + "step": 19636 + }, + { + "epoch": 0.17045858976918604, + "grad_norm": 0.09814453125, + "learning_rate": 0.0018344301830936793, + "loss": 0.103, + "step": 19637 + }, + { + "epoch": 0.1704672702493902, + "grad_norm": 0.1923828125, + "learning_rate": 0.0018344129798565179, + "loss": 0.1406, + "step": 19638 + }, + { + "epoch": 0.17047595072959437, + "grad_norm": 0.12353515625, + "learning_rate": 0.0018343957758162135, + "loss": 0.082, + "step": 19639 + }, + { + "epoch": 0.17048463120979854, + "grad_norm": 0.671875, + "learning_rate": 0.0018343785709727847, + "loss": 0.1289, + "step": 19640 + }, + { + "epoch": 0.1704933116900027, + "grad_norm": 
0.2451171875, + "learning_rate": 0.0018343613653262509, + "loss": 0.127, + "step": 19641 + }, + { + "epoch": 0.17050199217020687, + "grad_norm": 0.1611328125, + "learning_rate": 0.00183434415887663, + "loss": 0.1387, + "step": 19642 + }, + { + "epoch": 0.17051067265041103, + "grad_norm": 0.115234375, + "learning_rate": 0.0018343269516239418, + "loss": 0.1328, + "step": 19643 + }, + { + "epoch": 0.1705193531306152, + "grad_norm": 0.1904296875, + "learning_rate": 0.0018343097435682047, + "loss": 0.1162, + "step": 19644 + }, + { + "epoch": 0.17052803361081936, + "grad_norm": 0.1103515625, + "learning_rate": 0.0018342925347094373, + "loss": 0.1201, + "step": 19645 + }, + { + "epoch": 0.17053671409102353, + "grad_norm": 0.345703125, + "learning_rate": 0.0018342753250476587, + "loss": 0.1162, + "step": 19646 + }, + { + "epoch": 0.1705453945712277, + "grad_norm": 0.353515625, + "learning_rate": 0.0018342581145828877, + "loss": 0.123, + "step": 19647 + }, + { + "epoch": 0.17055407505143186, + "grad_norm": 0.123046875, + "learning_rate": 0.0018342409033151427, + "loss": 0.0908, + "step": 19648 + }, + { + "epoch": 0.17056275553163602, + "grad_norm": 0.1640625, + "learning_rate": 0.0018342236912444435, + "loss": 0.1387, + "step": 19649 + }, + { + "epoch": 0.1705714360118402, + "grad_norm": 0.6484375, + "learning_rate": 0.001834206478370808, + "loss": 0.207, + "step": 19650 + }, + { + "epoch": 0.17058011649204435, + "grad_norm": 0.20703125, + "learning_rate": 0.001834189264694255, + "loss": 0.083, + "step": 19651 + }, + { + "epoch": 0.1705887969722485, + "grad_norm": 0.09130859375, + "learning_rate": 0.0018341720502148041, + "loss": 0.1396, + "step": 19652 + }, + { + "epoch": 0.17059747745245266, + "grad_norm": 0.1689453125, + "learning_rate": 0.0018341548349324736, + "loss": 0.082, + "step": 19653 + }, + { + "epoch": 0.17060615793265682, + "grad_norm": 0.216796875, + "learning_rate": 0.0018341376188472825, + "loss": 0.1074, + "step": 19654 + }, + { + "epoch": 0.170614838412861, + "grad_norm": 0.0966796875, + "learning_rate": 0.0018341204019592492, + "loss": 0.1396, + "step": 19655 + }, + { + "epoch": 0.17062351889306515, + "grad_norm": 0.16796875, + "learning_rate": 0.0018341031842683928, + "loss": 0.0996, + "step": 19656 + }, + { + "epoch": 0.17063219937326932, + "grad_norm": 0.4140625, + "learning_rate": 0.0018340859657747324, + "loss": 0.1064, + "step": 19657 + }, + { + "epoch": 0.17064087985347348, + "grad_norm": 0.1943359375, + "learning_rate": 0.0018340687464782866, + "loss": 0.1377, + "step": 19658 + }, + { + "epoch": 0.17064956033367765, + "grad_norm": 0.390625, + "learning_rate": 0.0018340515263790742, + "loss": 0.0938, + "step": 19659 + }, + { + "epoch": 0.1706582408138818, + "grad_norm": 0.5, + "learning_rate": 0.0018340343054771143, + "loss": 0.1172, + "step": 19660 + }, + { + "epoch": 0.17066692129408598, + "grad_norm": 0.578125, + "learning_rate": 0.0018340170837724253, + "loss": 0.127, + "step": 19661 + }, + { + "epoch": 0.17067560177429014, + "grad_norm": 0.333984375, + "learning_rate": 0.0018339998612650263, + "loss": 0.1074, + "step": 19662 + }, + { + "epoch": 0.1706842822544943, + "grad_norm": 0.515625, + "learning_rate": 0.0018339826379549363, + "loss": 0.1484, + "step": 19663 + }, + { + "epoch": 0.17069296273469847, + "grad_norm": 0.251953125, + "learning_rate": 0.0018339654138421735, + "loss": 0.1084, + "step": 19664 + }, + { + "epoch": 0.17070164321490264, + "grad_norm": 0.435546875, + "learning_rate": 0.0018339481889267574, + "loss": 0.1299, + "step": 19665 + }, + { + "epoch": 
0.1707103236951068, + "grad_norm": 0.208984375, + "learning_rate": 0.0018339309632087065, + "loss": 0.103, + "step": 19666 + }, + { + "epoch": 0.17071900417531097, + "grad_norm": 0.0654296875, + "learning_rate": 0.00183391373668804, + "loss": 0.0713, + "step": 19667 + }, + { + "epoch": 0.17072768465551513, + "grad_norm": 0.09765625, + "learning_rate": 0.001833896509364776, + "loss": 0.0884, + "step": 19668 + }, + { + "epoch": 0.1707363651357193, + "grad_norm": 0.59765625, + "learning_rate": 0.0018338792812389341, + "loss": 0.1143, + "step": 19669 + }, + { + "epoch": 0.17074504561592346, + "grad_norm": 0.640625, + "learning_rate": 0.0018338620523105328, + "loss": 0.1797, + "step": 19670 + }, + { + "epoch": 0.17075372609612763, + "grad_norm": 0.0927734375, + "learning_rate": 0.0018338448225795909, + "loss": 0.1514, + "step": 19671 + }, + { + "epoch": 0.1707624065763318, + "grad_norm": 0.515625, + "learning_rate": 0.0018338275920461273, + "loss": 0.1357, + "step": 19672 + }, + { + "epoch": 0.17077108705653596, + "grad_norm": 0.439453125, + "learning_rate": 0.0018338103607101611, + "loss": 0.085, + "step": 19673 + }, + { + "epoch": 0.17077976753674012, + "grad_norm": 0.443359375, + "learning_rate": 0.0018337931285717108, + "loss": 0.1152, + "step": 19674 + }, + { + "epoch": 0.1707884480169443, + "grad_norm": 0.26953125, + "learning_rate": 0.0018337758956307954, + "loss": 0.1826, + "step": 19675 + }, + { + "epoch": 0.17079712849714845, + "grad_norm": 0.28125, + "learning_rate": 0.0018337586618874335, + "loss": 0.1445, + "step": 19676 + }, + { + "epoch": 0.17080580897735262, + "grad_norm": 0.306640625, + "learning_rate": 0.0018337414273416443, + "loss": 0.127, + "step": 19677 + }, + { + "epoch": 0.17081448945755678, + "grad_norm": 0.21875, + "learning_rate": 0.0018337241919934466, + "loss": 0.1133, + "step": 19678 + }, + { + "epoch": 0.17082316993776095, + "grad_norm": 0.287109375, + "learning_rate": 0.001833706955842859, + "loss": 0.1289, + "step": 19679 + }, + { + "epoch": 0.17083185041796511, + "grad_norm": 0.2099609375, + "learning_rate": 0.0018336897188899004, + "loss": 0.0913, + "step": 19680 + }, + { + "epoch": 0.17084053089816928, + "grad_norm": 0.125, + "learning_rate": 0.0018336724811345902, + "loss": 0.0889, + "step": 19681 + }, + { + "epoch": 0.17084921137837344, + "grad_norm": 0.150390625, + "learning_rate": 0.0018336552425769464, + "loss": 0.127, + "step": 19682 + }, + { + "epoch": 0.1708578918585776, + "grad_norm": 0.1572265625, + "learning_rate": 0.0018336380032169884, + "loss": 0.0835, + "step": 19683 + }, + { + "epoch": 0.17086657233878177, + "grad_norm": 0.1474609375, + "learning_rate": 0.0018336207630547346, + "loss": 0.1504, + "step": 19684 + }, + { + "epoch": 0.17087525281898594, + "grad_norm": 0.99609375, + "learning_rate": 0.0018336035220902045, + "loss": 0.1191, + "step": 19685 + }, + { + "epoch": 0.1708839332991901, + "grad_norm": 0.310546875, + "learning_rate": 0.0018335862803234165, + "loss": 0.0723, + "step": 19686 + }, + { + "epoch": 0.17089261377939427, + "grad_norm": 0.318359375, + "learning_rate": 0.0018335690377543896, + "loss": 0.1045, + "step": 19687 + }, + { + "epoch": 0.17090129425959844, + "grad_norm": 0.12255859375, + "learning_rate": 0.0018335517943831423, + "loss": 0.1045, + "step": 19688 + }, + { + "epoch": 0.1709099747398026, + "grad_norm": 0.404296875, + "learning_rate": 0.0018335345502096946, + "loss": 0.1094, + "step": 19689 + }, + { + "epoch": 0.17091865522000677, + "grad_norm": 0.162109375, + "learning_rate": 0.0018335173052340637, + "loss": 
0.0977, + "step": 19690 + }, + { + "epoch": 0.17092733570021093, + "grad_norm": 0.134765625, + "learning_rate": 0.0018335000594562694, + "loss": 0.1143, + "step": 19691 + }, + { + "epoch": 0.1709360161804151, + "grad_norm": 0.09423828125, + "learning_rate": 0.001833482812876331, + "loss": 0.0928, + "step": 19692 + }, + { + "epoch": 0.17094469666061926, + "grad_norm": 0.09912109375, + "learning_rate": 0.0018334655654942665, + "loss": 0.0991, + "step": 19693 + }, + { + "epoch": 0.17095337714082343, + "grad_norm": 0.1787109375, + "learning_rate": 0.001833448317310095, + "loss": 0.1094, + "step": 19694 + }, + { + "epoch": 0.1709620576210276, + "grad_norm": 0.1552734375, + "learning_rate": 0.0018334310683238353, + "loss": 0.1079, + "step": 19695 + }, + { + "epoch": 0.17097073810123176, + "grad_norm": 0.74609375, + "learning_rate": 0.0018334138185355066, + "loss": 0.1035, + "step": 19696 + }, + { + "epoch": 0.17097941858143592, + "grad_norm": 0.11376953125, + "learning_rate": 0.0018333965679451277, + "loss": 0.1152, + "step": 19697 + }, + { + "epoch": 0.1709880990616401, + "grad_norm": 0.1669921875, + "learning_rate": 0.0018333793165527172, + "loss": 0.1367, + "step": 19698 + }, + { + "epoch": 0.17099677954184425, + "grad_norm": 0.68359375, + "learning_rate": 0.0018333620643582941, + "loss": 0.0952, + "step": 19699 + }, + { + "epoch": 0.17100546002204842, + "grad_norm": 0.37890625, + "learning_rate": 0.0018333448113618772, + "loss": 0.0947, + "step": 19700 + }, + { + "epoch": 0.17101414050225258, + "grad_norm": 0.072265625, + "learning_rate": 0.0018333275575634854, + "loss": 0.1162, + "step": 19701 + }, + { + "epoch": 0.17102282098245675, + "grad_norm": 0.51953125, + "learning_rate": 0.0018333103029631378, + "loss": 0.1455, + "step": 19702 + }, + { + "epoch": 0.1710315014626609, + "grad_norm": 0.173828125, + "learning_rate": 0.0018332930475608532, + "loss": 0.1289, + "step": 19703 + }, + { + "epoch": 0.17104018194286508, + "grad_norm": 0.263671875, + "learning_rate": 0.00183327579135665, + "loss": 0.1582, + "step": 19704 + }, + { + "epoch": 0.17104886242306924, + "grad_norm": 0.1259765625, + "learning_rate": 0.0018332585343505475, + "loss": 0.1357, + "step": 19705 + }, + { + "epoch": 0.1710575429032734, + "grad_norm": 0.10009765625, + "learning_rate": 0.0018332412765425642, + "loss": 0.0771, + "step": 19706 + }, + { + "epoch": 0.17106622338347757, + "grad_norm": 0.859375, + "learning_rate": 0.0018332240179327198, + "loss": 0.1064, + "step": 19707 + }, + { + "epoch": 0.17107490386368174, + "grad_norm": 0.123046875, + "learning_rate": 0.0018332067585210324, + "loss": 0.1152, + "step": 19708 + }, + { + "epoch": 0.1710835843438859, + "grad_norm": 1.0234375, + "learning_rate": 0.001833189498307521, + "loss": 0.0977, + "step": 19709 + }, + { + "epoch": 0.17109226482409007, + "grad_norm": 0.1962890625, + "learning_rate": 0.0018331722372922046, + "loss": 0.1504, + "step": 19710 + }, + { + "epoch": 0.17110094530429423, + "grad_norm": 0.10546875, + "learning_rate": 0.0018331549754751022, + "loss": 0.1113, + "step": 19711 + }, + { + "epoch": 0.1711096257844984, + "grad_norm": 0.29296875, + "learning_rate": 0.0018331377128562325, + "loss": 0.1348, + "step": 19712 + }, + { + "epoch": 0.17111830626470256, + "grad_norm": 0.330078125, + "learning_rate": 0.0018331204494356143, + "loss": 0.1387, + "step": 19713 + }, + { + "epoch": 0.17112698674490673, + "grad_norm": 0.0908203125, + "learning_rate": 0.0018331031852132666, + "loss": 0.0708, + "step": 19714 + }, + { + "epoch": 0.1711356672251109, + "grad_norm": 
0.111328125, + "learning_rate": 0.0018330859201892086, + "loss": 0.0977, + "step": 19715 + }, + { + "epoch": 0.17114434770531506, + "grad_norm": 0.1845703125, + "learning_rate": 0.0018330686543634586, + "loss": 0.0879, + "step": 19716 + }, + { + "epoch": 0.17115302818551922, + "grad_norm": 0.2265625, + "learning_rate": 0.0018330513877360356, + "loss": 0.0781, + "step": 19717 + }, + { + "epoch": 0.1711617086657234, + "grad_norm": 0.271484375, + "learning_rate": 0.0018330341203069587, + "loss": 0.1069, + "step": 19718 + }, + { + "epoch": 0.17117038914592755, + "grad_norm": 0.140625, + "learning_rate": 0.0018330168520762466, + "loss": 0.1484, + "step": 19719 + }, + { + "epoch": 0.17117906962613172, + "grad_norm": 0.85546875, + "learning_rate": 0.0018329995830439186, + "loss": 0.0996, + "step": 19720 + }, + { + "epoch": 0.17118775010633588, + "grad_norm": 0.08935546875, + "learning_rate": 0.0018329823132099932, + "loss": 0.1045, + "step": 19721 + }, + { + "epoch": 0.17119643058654005, + "grad_norm": 0.1962890625, + "learning_rate": 0.0018329650425744893, + "loss": 0.1475, + "step": 19722 + }, + { + "epoch": 0.17120511106674421, + "grad_norm": 0.22265625, + "learning_rate": 0.0018329477711374255, + "loss": 0.1699, + "step": 19723 + }, + { + "epoch": 0.17121379154694838, + "grad_norm": 0.1044921875, + "learning_rate": 0.0018329304988988215, + "loss": 0.1279, + "step": 19724 + }, + { + "epoch": 0.17122247202715254, + "grad_norm": 0.11962890625, + "learning_rate": 0.0018329132258586955, + "loss": 0.1289, + "step": 19725 + }, + { + "epoch": 0.1712311525073567, + "grad_norm": 0.267578125, + "learning_rate": 0.0018328959520170667, + "loss": 0.124, + "step": 19726 + }, + { + "epoch": 0.17123983298756087, + "grad_norm": 0.67578125, + "learning_rate": 0.0018328786773739538, + "loss": 0.1611, + "step": 19727 + }, + { + "epoch": 0.17124851346776504, + "grad_norm": 0.296875, + "learning_rate": 0.001832861401929376, + "loss": 0.1387, + "step": 19728 + }, + { + "epoch": 0.1712571939479692, + "grad_norm": 0.1416015625, + "learning_rate": 0.0018328441256833519, + "loss": 0.0972, + "step": 19729 + }, + { + "epoch": 0.17126587442817337, + "grad_norm": 0.30859375, + "learning_rate": 0.0018328268486359003, + "loss": 0.1104, + "step": 19730 + }, + { + "epoch": 0.17127455490837754, + "grad_norm": 0.49609375, + "learning_rate": 0.0018328095707870402, + "loss": 0.127, + "step": 19731 + }, + { + "epoch": 0.1712832353885817, + "grad_norm": 0.70703125, + "learning_rate": 0.001832792292136791, + "loss": 0.1572, + "step": 19732 + }, + { + "epoch": 0.17129191586878587, + "grad_norm": 0.30859375, + "learning_rate": 0.0018327750126851709, + "loss": 0.1025, + "step": 19733 + }, + { + "epoch": 0.17130059634899003, + "grad_norm": 0.41796875, + "learning_rate": 0.0018327577324321988, + "loss": 0.1211, + "step": 19734 + }, + { + "epoch": 0.1713092768291942, + "grad_norm": 0.318359375, + "learning_rate": 0.001832740451377894, + "loss": 0.0933, + "step": 19735 + }, + { + "epoch": 0.17131795730939836, + "grad_norm": 0.359375, + "learning_rate": 0.0018327231695222754, + "loss": 0.1055, + "step": 19736 + }, + { + "epoch": 0.17132663778960253, + "grad_norm": 0.48828125, + "learning_rate": 0.001832705886865362, + "loss": 0.0938, + "step": 19737 + }, + { + "epoch": 0.1713353182698067, + "grad_norm": 0.2333984375, + "learning_rate": 0.001832688603407172, + "loss": 0.0762, + "step": 19738 + }, + { + "epoch": 0.17134399875001086, + "grad_norm": 0.1630859375, + "learning_rate": 0.0018326713191477249, + "loss": 0.1006, + "step": 19739 + }, + { 
+ "epoch": 0.17135267923021502, + "grad_norm": 0.1103515625, + "learning_rate": 0.0018326540340870394, + "loss": 0.123, + "step": 19740 + }, + { + "epoch": 0.1713613597104192, + "grad_norm": 0.11474609375, + "learning_rate": 0.0018326367482251345, + "loss": 0.1299, + "step": 19741 + }, + { + "epoch": 0.17137004019062335, + "grad_norm": 0.10595703125, + "learning_rate": 0.0018326194615620292, + "loss": 0.1582, + "step": 19742 + }, + { + "epoch": 0.17137872067082752, + "grad_norm": 0.08056640625, + "learning_rate": 0.0018326021740977419, + "loss": 0.1157, + "step": 19743 + }, + { + "epoch": 0.17138740115103168, + "grad_norm": 0.357421875, + "learning_rate": 0.0018325848858322922, + "loss": 0.1201, + "step": 19744 + }, + { + "epoch": 0.17139608163123585, + "grad_norm": 0.69140625, + "learning_rate": 0.0018325675967656987, + "loss": 0.0967, + "step": 19745 + }, + { + "epoch": 0.17140476211144, + "grad_norm": 0.44921875, + "learning_rate": 0.0018325503068979802, + "loss": 0.1025, + "step": 19746 + }, + { + "epoch": 0.17141344259164418, + "grad_norm": 0.1806640625, + "learning_rate": 0.0018325330162291555, + "loss": 0.1299, + "step": 19747 + }, + { + "epoch": 0.17142212307184834, + "grad_norm": 0.66015625, + "learning_rate": 0.0018325157247592436, + "loss": 0.1099, + "step": 19748 + }, + { + "epoch": 0.1714308035520525, + "grad_norm": 0.546875, + "learning_rate": 0.0018324984324882643, + "loss": 0.1475, + "step": 19749 + }, + { + "epoch": 0.17143948403225667, + "grad_norm": 0.55078125, + "learning_rate": 0.001832481139416235, + "loss": 0.1035, + "step": 19750 + }, + { + "epoch": 0.17144816451246084, + "grad_norm": 0.10546875, + "learning_rate": 0.0018324638455431755, + "loss": 0.1094, + "step": 19751 + }, + { + "epoch": 0.171456844992665, + "grad_norm": 0.353515625, + "learning_rate": 0.0018324465508691046, + "loss": 0.1494, + "step": 19752 + }, + { + "epoch": 0.17146552547286917, + "grad_norm": 0.474609375, + "learning_rate": 0.0018324292553940412, + "loss": 0.085, + "step": 19753 + }, + { + "epoch": 0.17147420595307333, + "grad_norm": 0.34375, + "learning_rate": 0.001832411959118004, + "loss": 0.123, + "step": 19754 + }, + { + "epoch": 0.1714828864332775, + "grad_norm": 0.416015625, + "learning_rate": 0.001832394662041012, + "loss": 0.1221, + "step": 19755 + }, + { + "epoch": 0.17149156691348166, + "grad_norm": 0.1982421875, + "learning_rate": 0.0018323773641630847, + "loss": 0.124, + "step": 19756 + }, + { + "epoch": 0.17150024739368583, + "grad_norm": 0.06591796875, + "learning_rate": 0.00183236006548424, + "loss": 0.0708, + "step": 19757 + }, + { + "epoch": 0.17150892787389, + "grad_norm": 0.89453125, + "learning_rate": 0.0018323427660044973, + "loss": 0.0952, + "step": 19758 + }, + { + "epoch": 0.17151760835409416, + "grad_norm": 1.1484375, + "learning_rate": 0.0018323254657238762, + "loss": 0.0981, + "step": 19759 + }, + { + "epoch": 0.17152628883429832, + "grad_norm": 0.2392578125, + "learning_rate": 0.0018323081646423945, + "loss": 0.0977, + "step": 19760 + }, + { + "epoch": 0.1715349693145025, + "grad_norm": 0.25390625, + "learning_rate": 0.0018322908627600714, + "loss": 0.1299, + "step": 19761 + }, + { + "epoch": 0.17154364979470665, + "grad_norm": 0.26953125, + "learning_rate": 0.0018322735600769264, + "loss": 0.1641, + "step": 19762 + }, + { + "epoch": 0.17155233027491082, + "grad_norm": 0.33203125, + "learning_rate": 0.001832256256592978, + "loss": 0.085, + "step": 19763 + }, + { + "epoch": 0.17156101075511498, + "grad_norm": 0.1357421875, + "learning_rate": 0.0018322389523082452, 
+ "loss": 0.1182, + "step": 19764 + }, + { + "epoch": 0.17156969123531915, + "grad_norm": 0.5859375, + "learning_rate": 0.0018322216472227467, + "loss": 0.1309, + "step": 19765 + }, + { + "epoch": 0.17157837171552331, + "grad_norm": 0.2236328125, + "learning_rate": 0.0018322043413365014, + "loss": 0.125, + "step": 19766 + }, + { + "epoch": 0.17158705219572748, + "grad_norm": 0.27734375, + "learning_rate": 0.0018321870346495288, + "loss": 0.123, + "step": 19767 + }, + { + "epoch": 0.17159573267593164, + "grad_norm": 0.1318359375, + "learning_rate": 0.0018321697271618475, + "loss": 0.1055, + "step": 19768 + }, + { + "epoch": 0.1716044131561358, + "grad_norm": 0.275390625, + "learning_rate": 0.0018321524188734761, + "loss": 0.1138, + "step": 19769 + }, + { + "epoch": 0.17161309363633998, + "grad_norm": 0.12353515625, + "learning_rate": 0.0018321351097844339, + "loss": 0.1602, + "step": 19770 + }, + { + "epoch": 0.17162177411654414, + "grad_norm": 0.10693359375, + "learning_rate": 0.0018321177998947397, + "loss": 0.1045, + "step": 19771 + }, + { + "epoch": 0.1716304545967483, + "grad_norm": 0.2001953125, + "learning_rate": 0.0018321004892044126, + "loss": 0.1416, + "step": 19772 + }, + { + "epoch": 0.17163913507695247, + "grad_norm": 0.259765625, + "learning_rate": 0.0018320831777134715, + "loss": 0.1162, + "step": 19773 + }, + { + "epoch": 0.17164781555715664, + "grad_norm": 0.2021484375, + "learning_rate": 0.0018320658654219352, + "loss": 0.0972, + "step": 19774 + }, + { + "epoch": 0.1716564960373608, + "grad_norm": 0.1865234375, + "learning_rate": 0.0018320485523298225, + "loss": 0.1631, + "step": 19775 + }, + { + "epoch": 0.17166517651756494, + "grad_norm": 0.375, + "learning_rate": 0.0018320312384371524, + "loss": 0.0977, + "step": 19776 + }, + { + "epoch": 0.1716738569977691, + "grad_norm": 0.1591796875, + "learning_rate": 0.0018320139237439444, + "loss": 0.1328, + "step": 19777 + }, + { + "epoch": 0.17168253747797327, + "grad_norm": 0.33984375, + "learning_rate": 0.0018319966082502167, + "loss": 0.0811, + "step": 19778 + }, + { + "epoch": 0.17169121795817743, + "grad_norm": 0.91796875, + "learning_rate": 0.0018319792919559883, + "loss": 0.1113, + "step": 19779 + }, + { + "epoch": 0.1716998984383816, + "grad_norm": 0.1630859375, + "learning_rate": 0.0018319619748612787, + "loss": 0.1201, + "step": 19780 + }, + { + "epoch": 0.17170857891858576, + "grad_norm": 0.310546875, + "learning_rate": 0.0018319446569661061, + "loss": 0.0889, + "step": 19781 + }, + { + "epoch": 0.17171725939878993, + "grad_norm": 0.130859375, + "learning_rate": 0.00183192733827049, + "loss": 0.0898, + "step": 19782 + }, + { + "epoch": 0.1717259398789941, + "grad_norm": 0.373046875, + "learning_rate": 0.0018319100187744496, + "loss": 0.1191, + "step": 19783 + }, + { + "epoch": 0.17173462035919826, + "grad_norm": 0.396484375, + "learning_rate": 0.001831892698478003, + "loss": 0.1001, + "step": 19784 + }, + { + "epoch": 0.17174330083940242, + "grad_norm": 0.3046875, + "learning_rate": 0.0018318753773811695, + "loss": 0.1143, + "step": 19785 + }, + { + "epoch": 0.1717519813196066, + "grad_norm": 0.345703125, + "learning_rate": 0.0018318580554839681, + "loss": 0.1143, + "step": 19786 + }, + { + "epoch": 0.17176066179981075, + "grad_norm": 0.11865234375, + "learning_rate": 0.0018318407327864176, + "loss": 0.0996, + "step": 19787 + }, + { + "epoch": 0.17176934228001492, + "grad_norm": 0.1865234375, + "learning_rate": 0.0018318234092885373, + "loss": 0.1104, + "step": 19788 + }, + { + "epoch": 0.17177802276021908, + 
"grad_norm": 0.5546875, + "learning_rate": 0.001831806084990346, + "loss": 0.1309, + "step": 19789 + }, + { + "epoch": 0.17178670324042325, + "grad_norm": 0.251953125, + "learning_rate": 0.001831788759891862, + "loss": 0.0908, + "step": 19790 + }, + { + "epoch": 0.17179538372062741, + "grad_norm": 0.67578125, + "learning_rate": 0.0018317714339931056, + "loss": 0.1172, + "step": 19791 + }, + { + "epoch": 0.17180406420083158, + "grad_norm": 1.7109375, + "learning_rate": 0.0018317541072940947, + "loss": 0.1826, + "step": 19792 + }, + { + "epoch": 0.17181274468103575, + "grad_norm": 0.77734375, + "learning_rate": 0.0018317367797948483, + "loss": 0.0996, + "step": 19793 + }, + { + "epoch": 0.1718214251612399, + "grad_norm": 0.314453125, + "learning_rate": 0.0018317194514953854, + "loss": 0.1758, + "step": 19794 + }, + { + "epoch": 0.17183010564144408, + "grad_norm": 0.546875, + "learning_rate": 0.0018317021223957252, + "loss": 0.0981, + "step": 19795 + }, + { + "epoch": 0.17183878612164824, + "grad_norm": 0.57421875, + "learning_rate": 0.0018316847924958867, + "loss": 0.1104, + "step": 19796 + }, + { + "epoch": 0.1718474666018524, + "grad_norm": 0.27734375, + "learning_rate": 0.0018316674617958888, + "loss": 0.1318, + "step": 19797 + }, + { + "epoch": 0.17185614708205657, + "grad_norm": 0.1279296875, + "learning_rate": 0.0018316501302957503, + "loss": 0.0933, + "step": 19798 + }, + { + "epoch": 0.17186482756226074, + "grad_norm": 0.451171875, + "learning_rate": 0.0018316327979954904, + "loss": 0.0874, + "step": 19799 + }, + { + "epoch": 0.1718735080424649, + "grad_norm": 0.15625, + "learning_rate": 0.0018316154648951274, + "loss": 0.0796, + "step": 19800 + }, + { + "epoch": 0.17188218852266907, + "grad_norm": 0.3359375, + "learning_rate": 0.001831598130994681, + "loss": 0.1416, + "step": 19801 + }, + { + "epoch": 0.17189086900287323, + "grad_norm": 0.216796875, + "learning_rate": 0.0018315807962941695, + "loss": 0.0986, + "step": 19802 + }, + { + "epoch": 0.1718995494830774, + "grad_norm": 0.2578125, + "learning_rate": 0.0018315634607936127, + "loss": 0.0781, + "step": 19803 + }, + { + "epoch": 0.17190822996328156, + "grad_norm": 0.09765625, + "learning_rate": 0.0018315461244930293, + "loss": 0.0928, + "step": 19804 + }, + { + "epoch": 0.17191691044348573, + "grad_norm": 0.8515625, + "learning_rate": 0.0018315287873924373, + "loss": 0.1211, + "step": 19805 + }, + { + "epoch": 0.1719255909236899, + "grad_norm": 0.60546875, + "learning_rate": 0.0018315114494918568, + "loss": 0.1074, + "step": 19806 + }, + { + "epoch": 0.17193427140389406, + "grad_norm": 0.181640625, + "learning_rate": 0.0018314941107913064, + "loss": 0.1094, + "step": 19807 + }, + { + "epoch": 0.17194295188409822, + "grad_norm": 0.267578125, + "learning_rate": 0.0018314767712908052, + "loss": 0.0859, + "step": 19808 + }, + { + "epoch": 0.1719516323643024, + "grad_norm": 0.07958984375, + "learning_rate": 0.0018314594309903716, + "loss": 0.1133, + "step": 19809 + }, + { + "epoch": 0.17196031284450655, + "grad_norm": 0.2734375, + "learning_rate": 0.001831442089890025, + "loss": 0.1211, + "step": 19810 + }, + { + "epoch": 0.17196899332471072, + "grad_norm": 0.146484375, + "learning_rate": 0.0018314247479897846, + "loss": 0.126, + "step": 19811 + }, + { + "epoch": 0.17197767380491488, + "grad_norm": 0.365234375, + "learning_rate": 0.001831407405289669, + "loss": 0.0811, + "step": 19812 + }, + { + "epoch": 0.17198635428511905, + "grad_norm": 0.13671875, + "learning_rate": 0.001831390061789697, + "loss": 0.1089, + "step": 19813 + }, + { 
+ "epoch": 0.1719950347653232, + "grad_norm": 0.365234375, + "learning_rate": 0.0018313727174898884, + "loss": 0.0889, + "step": 19814 + }, + { + "epoch": 0.17200371524552738, + "grad_norm": 0.271484375, + "learning_rate": 0.001831355372390261, + "loss": 0.0923, + "step": 19815 + }, + { + "epoch": 0.17201239572573154, + "grad_norm": 0.1484375, + "learning_rate": 0.0018313380264908345, + "loss": 0.1465, + "step": 19816 + }, + { + "epoch": 0.1720210762059357, + "grad_norm": 0.140625, + "learning_rate": 0.001831320679791628, + "loss": 0.1035, + "step": 19817 + }, + { + "epoch": 0.17202975668613987, + "grad_norm": 0.2421875, + "learning_rate": 0.0018313033322926598, + "loss": 0.0889, + "step": 19818 + }, + { + "epoch": 0.17203843716634404, + "grad_norm": 0.47265625, + "learning_rate": 0.0018312859839939492, + "loss": 0.1079, + "step": 19819 + }, + { + "epoch": 0.1720471176465482, + "grad_norm": 0.380859375, + "learning_rate": 0.0018312686348955153, + "loss": 0.1021, + "step": 19820 + }, + { + "epoch": 0.17205579812675237, + "grad_norm": 0.2216796875, + "learning_rate": 0.0018312512849973776, + "loss": 0.1436, + "step": 19821 + }, + { + "epoch": 0.17206447860695653, + "grad_norm": 0.6953125, + "learning_rate": 0.001831233934299554, + "loss": 0.0737, + "step": 19822 + }, + { + "epoch": 0.1720731590871607, + "grad_norm": 0.6015625, + "learning_rate": 0.0018312165828020639, + "loss": 0.1221, + "step": 19823 + }, + { + "epoch": 0.17208183956736486, + "grad_norm": 0.169921875, + "learning_rate": 0.0018311992305049265, + "loss": 0.1016, + "step": 19824 + }, + { + "epoch": 0.17209052004756903, + "grad_norm": 0.3359375, + "learning_rate": 0.0018311818774081605, + "loss": 0.1475, + "step": 19825 + }, + { + "epoch": 0.1720992005277732, + "grad_norm": 0.171875, + "learning_rate": 0.001831164523511785, + "loss": 0.1147, + "step": 19826 + }, + { + "epoch": 0.17210788100797736, + "grad_norm": 0.3984375, + "learning_rate": 0.001831147168815819, + "loss": 0.1426, + "step": 19827 + }, + { + "epoch": 0.17211656148818152, + "grad_norm": 0.267578125, + "learning_rate": 0.0018311298133202817, + "loss": 0.1133, + "step": 19828 + }, + { + "epoch": 0.1721252419683857, + "grad_norm": 0.263671875, + "learning_rate": 0.0018311124570251913, + "loss": 0.1357, + "step": 19829 + }, + { + "epoch": 0.17213392244858985, + "grad_norm": 0.2734375, + "learning_rate": 0.0018310950999305675, + "loss": 0.1011, + "step": 19830 + }, + { + "epoch": 0.17214260292879402, + "grad_norm": 0.341796875, + "learning_rate": 0.0018310777420364292, + "loss": 0.0835, + "step": 19831 + }, + { + "epoch": 0.17215128340899818, + "grad_norm": 0.474609375, + "learning_rate": 0.0018310603833427955, + "loss": 0.1445, + "step": 19832 + }, + { + "epoch": 0.17215996388920235, + "grad_norm": 0.21875, + "learning_rate": 0.001831043023849685, + "loss": 0.1006, + "step": 19833 + }, + { + "epoch": 0.17216864436940651, + "grad_norm": 0.130859375, + "learning_rate": 0.0018310256635571166, + "loss": 0.1387, + "step": 19834 + }, + { + "epoch": 0.17217732484961068, + "grad_norm": 0.1640625, + "learning_rate": 0.0018310083024651095, + "loss": 0.1162, + "step": 19835 + }, + { + "epoch": 0.17218600532981485, + "grad_norm": 0.314453125, + "learning_rate": 0.001830990940573683, + "loss": 0.1562, + "step": 19836 + }, + { + "epoch": 0.172194685810019, + "grad_norm": 0.3671875, + "learning_rate": 0.0018309735778828553, + "loss": 0.124, + "step": 19837 + }, + { + "epoch": 0.17220336629022318, + "grad_norm": 0.5078125, + "learning_rate": 0.0018309562143926463, + "loss": 0.1187, 
+ "step": 19838 + }, + { + "epoch": 0.17221204677042734, + "grad_norm": 0.1708984375, + "learning_rate": 0.0018309388501030744, + "loss": 0.168, + "step": 19839 + }, + { + "epoch": 0.1722207272506315, + "grad_norm": 0.1337890625, + "learning_rate": 0.001830921485014159, + "loss": 0.1172, + "step": 19840 + }, + { + "epoch": 0.17222940773083567, + "grad_norm": 0.142578125, + "learning_rate": 0.0018309041191259185, + "loss": 0.0903, + "step": 19841 + }, + { + "epoch": 0.17223808821103984, + "grad_norm": 0.53515625, + "learning_rate": 0.0018308867524383726, + "loss": 0.1143, + "step": 19842 + }, + { + "epoch": 0.172246768691244, + "grad_norm": 0.2333984375, + "learning_rate": 0.0018308693849515396, + "loss": 0.1172, + "step": 19843 + }, + { + "epoch": 0.17225544917144817, + "grad_norm": 0.294921875, + "learning_rate": 0.0018308520166654388, + "loss": 0.1768, + "step": 19844 + }, + { + "epoch": 0.17226412965165233, + "grad_norm": 0.07373046875, + "learning_rate": 0.0018308346475800895, + "loss": 0.0967, + "step": 19845 + }, + { + "epoch": 0.1722728101318565, + "grad_norm": 0.10009765625, + "learning_rate": 0.00183081727769551, + "loss": 0.1113, + "step": 19846 + }, + { + "epoch": 0.17228149061206066, + "grad_norm": 0.427734375, + "learning_rate": 0.0018307999070117202, + "loss": 0.1177, + "step": 19847 + }, + { + "epoch": 0.17229017109226483, + "grad_norm": 0.400390625, + "learning_rate": 0.0018307825355287383, + "loss": 0.1035, + "step": 19848 + }, + { + "epoch": 0.172298851572469, + "grad_norm": 0.298828125, + "learning_rate": 0.0018307651632465838, + "loss": 0.1064, + "step": 19849 + }, + { + "epoch": 0.17230753205267316, + "grad_norm": 0.72265625, + "learning_rate": 0.0018307477901652754, + "loss": 0.1143, + "step": 19850 + }, + { + "epoch": 0.17231621253287732, + "grad_norm": 0.625, + "learning_rate": 0.001830730416284832, + "loss": 0.1094, + "step": 19851 + }, + { + "epoch": 0.1723248930130815, + "grad_norm": 0.486328125, + "learning_rate": 0.001830713041605273, + "loss": 0.0928, + "step": 19852 + }, + { + "epoch": 0.17233357349328565, + "grad_norm": 0.1513671875, + "learning_rate": 0.0018306956661266172, + "loss": 0.0786, + "step": 19853 + }, + { + "epoch": 0.17234225397348982, + "grad_norm": 0.9296875, + "learning_rate": 0.0018306782898488837, + "loss": 0.0957, + "step": 19854 + }, + { + "epoch": 0.17235093445369398, + "grad_norm": 0.63671875, + "learning_rate": 0.0018306609127720912, + "loss": 0.1045, + "step": 19855 + }, + { + "epoch": 0.17235961493389815, + "grad_norm": 0.435546875, + "learning_rate": 0.0018306435348962589, + "loss": 0.1128, + "step": 19856 + }, + { + "epoch": 0.1723682954141023, + "grad_norm": 0.095703125, + "learning_rate": 0.001830626156221406, + "loss": 0.103, + "step": 19857 + }, + { + "epoch": 0.17237697589430648, + "grad_norm": 0.146484375, + "learning_rate": 0.0018306087767475509, + "loss": 0.1367, + "step": 19858 + }, + { + "epoch": 0.17238565637451064, + "grad_norm": 0.244140625, + "learning_rate": 0.0018305913964747136, + "loss": 0.0884, + "step": 19859 + }, + { + "epoch": 0.1723943368547148, + "grad_norm": 0.1357421875, + "learning_rate": 0.0018305740154029122, + "loss": 0.0859, + "step": 19860 + }, + { + "epoch": 0.17240301733491897, + "grad_norm": 0.12255859375, + "learning_rate": 0.0018305566335321659, + "loss": 0.126, + "step": 19861 + }, + { + "epoch": 0.17241169781512314, + "grad_norm": 0.33984375, + "learning_rate": 0.0018305392508624941, + "loss": 0.1143, + "step": 19862 + }, + { + "epoch": 0.1724203782953273, + "grad_norm": 0.236328125, + 
"learning_rate": 0.0018305218673939155, + "loss": 0.1602, + "step": 19863 + }, + { + "epoch": 0.17242905877553147, + "grad_norm": 0.11669921875, + "learning_rate": 0.0018305044831264492, + "loss": 0.1035, + "step": 19864 + }, + { + "epoch": 0.17243773925573563, + "grad_norm": 0.45703125, + "learning_rate": 0.0018304870980601138, + "loss": 0.1211, + "step": 19865 + }, + { + "epoch": 0.1724464197359398, + "grad_norm": 0.1455078125, + "learning_rate": 0.0018304697121949293, + "loss": 0.1045, + "step": 19866 + }, + { + "epoch": 0.17245510021614396, + "grad_norm": 0.271484375, + "learning_rate": 0.0018304523255309136, + "loss": 0.1001, + "step": 19867 + }, + { + "epoch": 0.17246378069634813, + "grad_norm": 0.2431640625, + "learning_rate": 0.0018304349380680862, + "loss": 0.0835, + "step": 19868 + }, + { + "epoch": 0.1724724611765523, + "grad_norm": 0.1416015625, + "learning_rate": 0.0018304175498064666, + "loss": 0.127, + "step": 19869 + }, + { + "epoch": 0.17248114165675646, + "grad_norm": 0.62109375, + "learning_rate": 0.001830400160746073, + "loss": 0.1143, + "step": 19870 + }, + { + "epoch": 0.17248982213696062, + "grad_norm": 0.103515625, + "learning_rate": 0.0018303827708869248, + "loss": 0.1387, + "step": 19871 + }, + { + "epoch": 0.1724985026171648, + "grad_norm": 0.447265625, + "learning_rate": 0.001830365380229041, + "loss": 0.0952, + "step": 19872 + }, + { + "epoch": 0.17250718309736895, + "grad_norm": 0.1123046875, + "learning_rate": 0.0018303479887724406, + "loss": 0.1196, + "step": 19873 + }, + { + "epoch": 0.17251586357757312, + "grad_norm": 0.09326171875, + "learning_rate": 0.0018303305965171423, + "loss": 0.1035, + "step": 19874 + }, + { + "epoch": 0.17252454405777728, + "grad_norm": 0.75, + "learning_rate": 0.0018303132034631659, + "loss": 0.1006, + "step": 19875 + }, + { + "epoch": 0.17253322453798145, + "grad_norm": 0.3515625, + "learning_rate": 0.0018302958096105299, + "loss": 0.0811, + "step": 19876 + }, + { + "epoch": 0.17254190501818562, + "grad_norm": 0.76953125, + "learning_rate": 0.0018302784149592532, + "loss": 0.0869, + "step": 19877 + }, + { + "epoch": 0.17255058549838978, + "grad_norm": 0.423828125, + "learning_rate": 0.001830261019509355, + "loss": 0.1289, + "step": 19878 + }, + { + "epoch": 0.17255926597859395, + "grad_norm": 0.2333984375, + "learning_rate": 0.0018302436232608544, + "loss": 0.1465, + "step": 19879 + }, + { + "epoch": 0.1725679464587981, + "grad_norm": 0.275390625, + "learning_rate": 0.0018302262262137703, + "loss": 0.1216, + "step": 19880 + }, + { + "epoch": 0.17257662693900228, + "grad_norm": 0.193359375, + "learning_rate": 0.001830208828368122, + "loss": 0.1426, + "step": 19881 + }, + { + "epoch": 0.17258530741920644, + "grad_norm": 0.310546875, + "learning_rate": 0.001830191429723928, + "loss": 0.124, + "step": 19882 + }, + { + "epoch": 0.1725939878994106, + "grad_norm": 0.2138671875, + "learning_rate": 0.0018301740302812078, + "loss": 0.1196, + "step": 19883 + }, + { + "epoch": 0.17260266837961477, + "grad_norm": 0.5078125, + "learning_rate": 0.0018301566300399804, + "loss": 0.1035, + "step": 19884 + }, + { + "epoch": 0.17261134885981894, + "grad_norm": 0.6484375, + "learning_rate": 0.001830139229000264, + "loss": 0.3398, + "step": 19885 + }, + { + "epoch": 0.1726200293400231, + "grad_norm": 0.609375, + "learning_rate": 0.0018301218271620787, + "loss": 0.127, + "step": 19886 + }, + { + "epoch": 0.17262870982022727, + "grad_norm": 0.62890625, + "learning_rate": 0.0018301044245254432, + "loss": 0.1465, + "step": 19887 + }, + { + "epoch": 
0.17263739030043143, + "grad_norm": 1.359375, + "learning_rate": 0.0018300870210903766, + "loss": 0.1377, + "step": 19888 + }, + { + "epoch": 0.1726460707806356, + "grad_norm": 0.443359375, + "learning_rate": 0.0018300696168568977, + "loss": 0.1348, + "step": 19889 + }, + { + "epoch": 0.17265475126083976, + "grad_norm": 0.1396484375, + "learning_rate": 0.0018300522118250258, + "loss": 0.2197, + "step": 19890 + }, + { + "epoch": 0.17266343174104393, + "grad_norm": 0.2001953125, + "learning_rate": 0.0018300348059947793, + "loss": 0.0825, + "step": 19891 + }, + { + "epoch": 0.1726721122212481, + "grad_norm": 0.1826171875, + "learning_rate": 0.0018300173993661781, + "loss": 0.1738, + "step": 19892 + }, + { + "epoch": 0.17268079270145226, + "grad_norm": 0.18359375, + "learning_rate": 0.0018299999919392405, + "loss": 0.1138, + "step": 19893 + }, + { + "epoch": 0.17268947318165642, + "grad_norm": 0.4140625, + "learning_rate": 0.0018299825837139864, + "loss": 0.1133, + "step": 19894 + }, + { + "epoch": 0.1726981536618606, + "grad_norm": 0.064453125, + "learning_rate": 0.001829965174690434, + "loss": 0.1118, + "step": 19895 + }, + { + "epoch": 0.17270683414206475, + "grad_norm": 0.1611328125, + "learning_rate": 0.0018299477648686024, + "loss": 0.1602, + "step": 19896 + }, + { + "epoch": 0.17271551462226892, + "grad_norm": 0.08935546875, + "learning_rate": 0.0018299303542485112, + "loss": 0.123, + "step": 19897 + }, + { + "epoch": 0.17272419510247308, + "grad_norm": 0.8671875, + "learning_rate": 0.0018299129428301793, + "loss": 0.1191, + "step": 19898 + }, + { + "epoch": 0.17273287558267722, + "grad_norm": 0.546875, + "learning_rate": 0.0018298955306136253, + "loss": 0.1318, + "step": 19899 + }, + { + "epoch": 0.17274155606288139, + "grad_norm": 0.3515625, + "learning_rate": 0.0018298781175988686, + "loss": 0.1113, + "step": 19900 + }, + { + "epoch": 0.17275023654308555, + "grad_norm": 0.310546875, + "learning_rate": 0.001829860703785928, + "loss": 0.0791, + "step": 19901 + }, + { + "epoch": 0.17275891702328972, + "grad_norm": 0.11572265625, + "learning_rate": 0.0018298432891748228, + "loss": 0.1025, + "step": 19902 + }, + { + "epoch": 0.17276759750349388, + "grad_norm": 0.1669921875, + "learning_rate": 0.0018298258737655723, + "loss": 0.0996, + "step": 19903 + }, + { + "epoch": 0.17277627798369805, + "grad_norm": 0.27734375, + "learning_rate": 0.0018298084575581947, + "loss": 0.1367, + "step": 19904 + }, + { + "epoch": 0.1727849584639022, + "grad_norm": 0.2138671875, + "learning_rate": 0.0018297910405527095, + "loss": 0.0903, + "step": 19905 + }, + { + "epoch": 0.17279363894410638, + "grad_norm": 0.166015625, + "learning_rate": 0.0018297736227491361, + "loss": 0.127, + "step": 19906 + }, + { + "epoch": 0.17280231942431054, + "grad_norm": 0.59375, + "learning_rate": 0.001829756204147493, + "loss": 0.1279, + "step": 19907 + }, + { + "epoch": 0.1728109999045147, + "grad_norm": 1.0859375, + "learning_rate": 0.0018297387847477998, + "loss": 0.168, + "step": 19908 + }, + { + "epoch": 0.17281968038471887, + "grad_norm": 0.2333984375, + "learning_rate": 0.0018297213645500749, + "loss": 0.1113, + "step": 19909 + }, + { + "epoch": 0.17282836086492304, + "grad_norm": 0.25390625, + "learning_rate": 0.001829703943554338, + "loss": 0.103, + "step": 19910 + }, + { + "epoch": 0.1728370413451272, + "grad_norm": 0.296875, + "learning_rate": 0.0018296865217606077, + "loss": 0.1074, + "step": 19911 + }, + { + "epoch": 0.17284572182533137, + "grad_norm": 0.115234375, + "learning_rate": 0.0018296690991689028, + "loss": 
"loss":
0.1309, + "step": 19912 + }, + { + "epoch": 0.17285440230553553, + "grad_norm": 1.4609375, + "learning_rate": 0.0018296516757792434, + "loss": 0.1094, + "step": 19913 + }, + { + "epoch": 0.1728630827857397, + "grad_norm": 0.16015625, + "learning_rate": 0.0018296342515916474, + "loss": 0.1035, + "step": 19914 + }, + { + "epoch": 0.17287176326594386, + "grad_norm": 0.255859375, + "learning_rate": 0.0018296168266061343, + "loss": 0.1191, + "step": 19915 + }, + { + "epoch": 0.17288044374614803, + "grad_norm": 0.06298828125, + "learning_rate": 0.0018295994008227234, + "loss": 0.0884, + "step": 19916 + }, + { + "epoch": 0.1728891242263522, + "grad_norm": 0.3203125, + "learning_rate": 0.0018295819742414339, + "loss": 0.1162, + "step": 19917 + }, + { + "epoch": 0.17289780470655636, + "grad_norm": 0.330078125, + "learning_rate": 0.001829564546862284, + "loss": 0.1406, + "step": 19918 + }, + { + "epoch": 0.17290648518676052, + "grad_norm": 0.162109375, + "learning_rate": 0.0018295471186852935, + "loss": 0.0928, + "step": 19919 + }, + { + "epoch": 0.1729151656669647, + "grad_norm": 0.65234375, + "learning_rate": 0.001829529689710481, + "loss": 0.1099, + "step": 19920 + }, + { + "epoch": 0.17292384614716885, + "grad_norm": 0.173828125, + "learning_rate": 0.0018295122599378659, + "loss": 0.1113, + "step": 19921 + }, + { + "epoch": 0.17293252662737302, + "grad_norm": 0.1865234375, + "learning_rate": 0.0018294948293674674, + "loss": 0.1465, + "step": 19922 + }, + { + "epoch": 0.17294120710757718, + "grad_norm": 0.5859375, + "learning_rate": 0.001829477397999304, + "loss": 0.1641, + "step": 19923 + }, + { + "epoch": 0.17294988758778135, + "grad_norm": 0.5078125, + "learning_rate": 0.0018294599658333953, + "loss": 0.0986, + "step": 19924 + }, + { + "epoch": 0.1729585680679855, + "grad_norm": 0.07861328125, + "learning_rate": 0.0018294425328697602, + "loss": 0.1079, + "step": 19925 + }, + { + "epoch": 0.17296724854818968, + "grad_norm": 0.28515625, + "learning_rate": 0.0018294250991084176, + "loss": 0.1221, + "step": 19926 + }, + { + "epoch": 0.17297592902839384, + "grad_norm": 0.90625, + "learning_rate": 0.0018294076645493863, + "loss": 0.1465, + "step": 19927 + }, + { + "epoch": 0.172984609508598, + "grad_norm": 0.287109375, + "learning_rate": 0.0018293902291926863, + "loss": 0.126, + "step": 19928 + }, + { + "epoch": 0.17299328998880217, + "grad_norm": 0.1376953125, + "learning_rate": 0.0018293727930383359, + "loss": 0.1079, + "step": 19929 + }, + { + "epoch": 0.17300197046900634, + "grad_norm": 0.12890625, + "learning_rate": 0.0018293553560863544, + "loss": 0.1045, + "step": 19930 + }, + { + "epoch": 0.1730106509492105, + "grad_norm": 0.318359375, + "learning_rate": 0.0018293379183367605, + "loss": 0.1001, + "step": 19931 + }, + { + "epoch": 0.17301933142941467, + "grad_norm": 0.51171875, + "learning_rate": 0.001829320479789574, + "loss": 0.1084, + "step": 19932 + }, + { + "epoch": 0.17302801190961883, + "grad_norm": 0.431640625, + "learning_rate": 0.0018293030404448137, + "loss": 0.1299, + "step": 19933 + }, + { + "epoch": 0.173036692389823, + "grad_norm": 0.5625, + "learning_rate": 0.001829285600302498, + "loss": 0.1201, + "step": 19934 + }, + { + "epoch": 0.17304537287002716, + "grad_norm": 0.2734375, + "learning_rate": 0.001829268159362647, + "loss": 0.1289, + "step": 19935 + }, + { + "epoch": 0.17305405335023133, + "grad_norm": 0.11962890625, + "learning_rate": 0.001829250717625279, + "loss": 0.1475, + "step": 19936 + }, + { + "epoch": 0.1730627338304355, + "grad_norm": 0.08935546875, + 
"learning_rate": 0.0018292332750904138, + "loss": 0.1108, + "step": 19937 + }, + { + "epoch": 0.17307141431063966, + "grad_norm": 0.58984375, + "learning_rate": 0.0018292158317580697, + "loss": 0.127, + "step": 19938 + }, + { + "epoch": 0.17308009479084382, + "grad_norm": 0.224609375, + "learning_rate": 0.001829198387628266, + "loss": 0.1055, + "step": 19939 + }, + { + "epoch": 0.173088775271048, + "grad_norm": 1.125, + "learning_rate": 0.0018291809427010222, + "loss": 0.1143, + "step": 19940 + }, + { + "epoch": 0.17309745575125215, + "grad_norm": 0.10205078125, + "learning_rate": 0.001829163496976357, + "loss": 0.0869, + "step": 19941 + }, + { + "epoch": 0.17310613623145632, + "grad_norm": 0.28125, + "learning_rate": 0.0018291460504542894, + "loss": 0.0757, + "step": 19942 + }, + { + "epoch": 0.17311481671166049, + "grad_norm": 0.6328125, + "learning_rate": 0.0018291286031348391, + "loss": 0.126, + "step": 19943 + }, + { + "epoch": 0.17312349719186465, + "grad_norm": 0.1376953125, + "learning_rate": 0.001829111155018024, + "loss": 0.1011, + "step": 19944 + }, + { + "epoch": 0.17313217767206882, + "grad_norm": 0.310546875, + "learning_rate": 0.0018290937061038643, + "loss": 0.0952, + "step": 19945 + }, + { + "epoch": 0.17314085815227298, + "grad_norm": 0.208984375, + "learning_rate": 0.0018290762563923787, + "loss": 0.1045, + "step": 19946 + }, + { + "epoch": 0.17314953863247715, + "grad_norm": 0.375, + "learning_rate": 0.0018290588058835864, + "loss": 0.1094, + "step": 19947 + }, + { + "epoch": 0.1731582191126813, + "grad_norm": 0.1494140625, + "learning_rate": 0.0018290413545775062, + "loss": 0.1045, + "step": 19948 + }, + { + "epoch": 0.17316689959288548, + "grad_norm": 0.171875, + "learning_rate": 0.0018290239024741572, + "loss": 0.1035, + "step": 19949 + }, + { + "epoch": 0.17317558007308964, + "grad_norm": 0.11279296875, + "learning_rate": 0.0018290064495735585, + "loss": 0.1289, + "step": 19950 + }, + { + "epoch": 0.1731842605532938, + "grad_norm": 0.11474609375, + "learning_rate": 0.0018289889958757295, + "loss": 0.1465, + "step": 19951 + }, + { + "epoch": 0.17319294103349797, + "grad_norm": 0.345703125, + "learning_rate": 0.0018289715413806892, + "loss": 0.106, + "step": 19952 + }, + { + "epoch": 0.17320162151370214, + "grad_norm": 0.4375, + "learning_rate": 0.0018289540860884565, + "loss": 0.1211, + "step": 19953 + }, + { + "epoch": 0.1732103019939063, + "grad_norm": 0.7265625, + "learning_rate": 0.0018289366299990505, + "loss": 0.1475, + "step": 19954 + }, + { + "epoch": 0.17321898247411047, + "grad_norm": 0.1279296875, + "learning_rate": 0.0018289191731124903, + "loss": 0.1738, + "step": 19955 + }, + { + "epoch": 0.17322766295431463, + "grad_norm": 0.48046875, + "learning_rate": 0.0018289017154287952, + "loss": 0.1221, + "step": 19956 + }, + { + "epoch": 0.1732363434345188, + "grad_norm": 0.1689453125, + "learning_rate": 0.001828884256947984, + "loss": 0.1045, + "step": 19957 + }, + { + "epoch": 0.17324502391472296, + "grad_norm": 0.287109375, + "learning_rate": 0.001828866797670076, + "loss": 0.1592, + "step": 19958 + }, + { + "epoch": 0.17325370439492713, + "grad_norm": 0.875, + "learning_rate": 0.0018288493375950901, + "loss": 0.1475, + "step": 19959 + }, + { + "epoch": 0.1732623848751313, + "grad_norm": 0.08984375, + "learning_rate": 0.0018288318767230457, + "loss": 0.1187, + "step": 19960 + }, + { + "epoch": 0.17327106535533546, + "grad_norm": 0.16796875, + "learning_rate": 0.0018288144150539617, + "loss": 0.0874, + "step": 19961 + }, + { + "epoch": 0.17327974583553962, + 
"grad_norm": 0.56640625, + "learning_rate": 0.0018287969525878568, + "loss": 0.1099, + "step": 19962 + }, + { + "epoch": 0.1732884263157438, + "grad_norm": 0.50390625, + "learning_rate": 0.001828779489324751, + "loss": 0.0977, + "step": 19963 + }, + { + "epoch": 0.17329710679594795, + "grad_norm": 0.62890625, + "learning_rate": 0.0018287620252646628, + "loss": 0.1211, + "step": 19964 + }, + { + "epoch": 0.17330578727615212, + "grad_norm": 0.1083984375, + "learning_rate": 0.0018287445604076114, + "loss": 0.0894, + "step": 19965 + }, + { + "epoch": 0.17331446775635628, + "grad_norm": 0.16796875, + "learning_rate": 0.0018287270947536158, + "loss": 0.1055, + "step": 19966 + }, + { + "epoch": 0.17332314823656045, + "grad_norm": 0.1181640625, + "learning_rate": 0.0018287096283026952, + "loss": 0.0938, + "step": 19967 + }, + { + "epoch": 0.1733318287167646, + "grad_norm": 0.216796875, + "learning_rate": 0.0018286921610548687, + "loss": 0.1543, + "step": 19968 + }, + { + "epoch": 0.17334050919696878, + "grad_norm": 0.40625, + "learning_rate": 0.0018286746930101557, + "loss": 0.1025, + "step": 19969 + }, + { + "epoch": 0.17334918967717294, + "grad_norm": 0.23828125, + "learning_rate": 0.0018286572241685746, + "loss": 0.1162, + "step": 19970 + }, + { + "epoch": 0.1733578701573771, + "grad_norm": 0.26171875, + "learning_rate": 0.001828639754530145, + "loss": 0.1055, + "step": 19971 + }, + { + "epoch": 0.17336655063758127, + "grad_norm": 0.3515625, + "learning_rate": 0.001828622284094886, + "loss": 0.125, + "step": 19972 + }, + { + "epoch": 0.17337523111778544, + "grad_norm": 0.212890625, + "learning_rate": 0.0018286048128628165, + "loss": 0.1104, + "step": 19973 + }, + { + "epoch": 0.1733839115979896, + "grad_norm": 0.169921875, + "learning_rate": 0.0018285873408339557, + "loss": 0.1162, + "step": 19974 + }, + { + "epoch": 0.17339259207819377, + "grad_norm": 0.82421875, + "learning_rate": 0.001828569868008323, + "loss": 0.1104, + "step": 19975 + }, + { + "epoch": 0.17340127255839793, + "grad_norm": 0.259765625, + "learning_rate": 0.001828552394385937, + "loss": 0.0908, + "step": 19976 + }, + { + "epoch": 0.1734099530386021, + "grad_norm": 1.2578125, + "learning_rate": 0.001828534919966817, + "loss": 0.2461, + "step": 19977 + }, + { + "epoch": 0.17341863351880626, + "grad_norm": 0.1611328125, + "learning_rate": 0.0018285174447509823, + "loss": 0.1172, + "step": 19978 + }, + { + "epoch": 0.17342731399901043, + "grad_norm": 0.0703125, + "learning_rate": 0.0018284999687384518, + "loss": 0.0898, + "step": 19979 + }, + { + "epoch": 0.1734359944792146, + "grad_norm": 0.435546875, + "learning_rate": 0.001828482491929245, + "loss": 0.1162, + "step": 19980 + }, + { + "epoch": 0.17344467495941876, + "grad_norm": 0.271484375, + "learning_rate": 0.0018284650143233804, + "loss": 0.1289, + "step": 19981 + }, + { + "epoch": 0.17345335543962292, + "grad_norm": 0.4609375, + "learning_rate": 0.0018284475359208775, + "loss": 0.127, + "step": 19982 + }, + { + "epoch": 0.1734620359198271, + "grad_norm": 0.341796875, + "learning_rate": 0.0018284300567217551, + "loss": 0.1318, + "step": 19983 + }, + { + "epoch": 0.17347071640003126, + "grad_norm": 0.47265625, + "learning_rate": 0.0018284125767260323, + "loss": 0.1025, + "step": 19984 + }, + { + "epoch": 0.17347939688023542, + "grad_norm": 0.423828125, + "learning_rate": 0.001828395095933729, + "loss": 0.126, + "step": 19985 + }, + { + "epoch": 0.17348807736043959, + "grad_norm": 0.25, + "learning_rate": 0.0018283776143448638, + "loss": 0.0771, + "step": 19986 + }, + { + 
"epoch": 0.17349675784064375, + "grad_norm": 0.546875, + "learning_rate": 0.0018283601319594553, + "loss": 0.1152, + "step": 19987 + }, + { + "epoch": 0.17350543832084792, + "grad_norm": 0.232421875, + "learning_rate": 0.0018283426487775234, + "loss": 0.166, + "step": 19988 + }, + { + "epoch": 0.17351411880105208, + "grad_norm": 0.1982421875, + "learning_rate": 0.0018283251647990874, + "loss": 0.1133, + "step": 19989 + }, + { + "epoch": 0.17352279928125625, + "grad_norm": 0.5546875, + "learning_rate": 0.0018283076800241652, + "loss": 0.1133, + "step": 19990 + }, + { + "epoch": 0.1735314797614604, + "grad_norm": 0.59765625, + "learning_rate": 0.001828290194452777, + "loss": 0.1406, + "step": 19991 + }, + { + "epoch": 0.17354016024166458, + "grad_norm": 0.244140625, + "learning_rate": 0.0018282727080849413, + "loss": 0.1172, + "step": 19992 + }, + { + "epoch": 0.17354884072186874, + "grad_norm": 0.72265625, + "learning_rate": 0.0018282552209206778, + "loss": 0.1309, + "step": 19993 + }, + { + "epoch": 0.1735575212020729, + "grad_norm": 0.78125, + "learning_rate": 0.001828237732960005, + "loss": 0.1621, + "step": 19994 + }, + { + "epoch": 0.17356620168227707, + "grad_norm": 0.2041015625, + "learning_rate": 0.0018282202442029428, + "loss": 0.103, + "step": 19995 + }, + { + "epoch": 0.17357488216248124, + "grad_norm": 0.455078125, + "learning_rate": 0.0018282027546495097, + "loss": 0.126, + "step": 19996 + }, + { + "epoch": 0.1735835626426854, + "grad_norm": 0.482421875, + "learning_rate": 0.0018281852642997249, + "loss": 0.0889, + "step": 19997 + }, + { + "epoch": 0.17359224312288957, + "grad_norm": 0.4375, + "learning_rate": 0.0018281677731536077, + "loss": 0.1035, + "step": 19998 + }, + { + "epoch": 0.17360092360309373, + "grad_norm": 0.1015625, + "learning_rate": 0.0018281502812111771, + "loss": 0.0776, + "step": 19999 + }, + { + "epoch": 0.1736096040832979, + "grad_norm": 0.1337890625, + "learning_rate": 0.0018281327884724526, + "loss": 0.0977, + "step": 20000 + }, + { + "epoch": 0.17361828456350206, + "grad_norm": 0.208984375, + "learning_rate": 0.0018281152949374527, + "loss": 0.1182, + "step": 20001 + }, + { + "epoch": 0.17362696504370623, + "grad_norm": 0.35546875, + "learning_rate": 0.001828097800606197, + "loss": 0.1104, + "step": 20002 + }, + { + "epoch": 0.1736356455239104, + "grad_norm": 0.365234375, + "learning_rate": 0.0018280803054787043, + "loss": 0.1475, + "step": 20003 + }, + { + "epoch": 0.17364432600411456, + "grad_norm": 0.2021484375, + "learning_rate": 0.0018280628095549943, + "loss": 0.083, + "step": 20004 + }, + { + "epoch": 0.17365300648431872, + "grad_norm": 1.1953125, + "learning_rate": 0.0018280453128350855, + "loss": 0.125, + "step": 20005 + }, + { + "epoch": 0.1736616869645229, + "grad_norm": 0.29296875, + "learning_rate": 0.0018280278153189973, + "loss": 0.1084, + "step": 20006 + }, + { + "epoch": 0.17367036744472705, + "grad_norm": 0.3828125, + "learning_rate": 0.0018280103170067487, + "loss": 0.1055, + "step": 20007 + }, + { + "epoch": 0.17367904792493122, + "grad_norm": 0.5234375, + "learning_rate": 0.001827992817898359, + "loss": 0.123, + "step": 20008 + }, + { + "epoch": 0.17368772840513538, + "grad_norm": 0.1318359375, + "learning_rate": 0.0018279753179938473, + "loss": 0.1089, + "step": 20009 + }, + { + "epoch": 0.17369640888533955, + "grad_norm": 0.40625, + "learning_rate": 0.001827957817293233, + "loss": 0.1221, + "step": 20010 + }, + { + "epoch": 0.1737050893655437, + "grad_norm": 0.427734375, + "learning_rate": 0.0018279403157965347, + "loss": 0.1328, 
+ "step": 20011 + }, + { + "epoch": 0.17371376984574788, + "grad_norm": 0.10791015625, + "learning_rate": 0.0018279228135037718, + "loss": 0.0859, + "step": 20012 + }, + { + "epoch": 0.17372245032595204, + "grad_norm": 0.306640625, + "learning_rate": 0.0018279053104149634, + "loss": 0.1118, + "step": 20013 + }, + { + "epoch": 0.1737311308061562, + "grad_norm": 0.435546875, + "learning_rate": 0.0018278878065301288, + "loss": 0.0938, + "step": 20014 + }, + { + "epoch": 0.17373981128636037, + "grad_norm": 0.5, + "learning_rate": 0.001827870301849287, + "loss": 0.1367, + "step": 20015 + }, + { + "epoch": 0.17374849176656454, + "grad_norm": 0.58203125, + "learning_rate": 0.0018278527963724574, + "loss": 0.1504, + "step": 20016 + }, + { + "epoch": 0.1737571722467687, + "grad_norm": 0.11474609375, + "learning_rate": 0.0018278352900996586, + "loss": 0.1113, + "step": 20017 + }, + { + "epoch": 0.17376585272697287, + "grad_norm": 0.61328125, + "learning_rate": 0.0018278177830309103, + "loss": 0.1484, + "step": 20018 + }, + { + "epoch": 0.17377453320717703, + "grad_norm": 1.015625, + "learning_rate": 0.0018278002751662312, + "loss": 0.1357, + "step": 20019 + }, + { + "epoch": 0.1737832136873812, + "grad_norm": 0.2001953125, + "learning_rate": 0.0018277827665056409, + "loss": 0.0962, + "step": 20020 + }, + { + "epoch": 0.17379189416758536, + "grad_norm": 0.81640625, + "learning_rate": 0.0018277652570491579, + "loss": 0.1309, + "step": 20021 + }, + { + "epoch": 0.1738005746477895, + "grad_norm": 0.166015625, + "learning_rate": 0.0018277477467968021, + "loss": 0.1172, + "step": 20022 + }, + { + "epoch": 0.17380925512799367, + "grad_norm": 0.60546875, + "learning_rate": 0.0018277302357485924, + "loss": 0.1187, + "step": 20023 + }, + { + "epoch": 0.17381793560819783, + "grad_norm": 0.51953125, + "learning_rate": 0.0018277127239045474, + "loss": 0.1338, + "step": 20024 + }, + { + "epoch": 0.173826616088402, + "grad_norm": 0.197265625, + "learning_rate": 0.001827695211264687, + "loss": 0.1084, + "step": 20025 + }, + { + "epoch": 0.17383529656860616, + "grad_norm": 0.1005859375, + "learning_rate": 0.00182767769782903, + "loss": 0.0957, + "step": 20026 + }, + { + "epoch": 0.17384397704881033, + "grad_norm": 0.22265625, + "learning_rate": 0.0018276601835975957, + "loss": 0.1133, + "step": 20027 + }, + { + "epoch": 0.1738526575290145, + "grad_norm": 0.294921875, + "learning_rate": 0.001827642668570403, + "loss": 0.0991, + "step": 20028 + }, + { + "epoch": 0.17386133800921866, + "grad_norm": 0.26171875, + "learning_rate": 0.0018276251527474714, + "loss": 0.0981, + "step": 20029 + }, + { + "epoch": 0.17387001848942282, + "grad_norm": 0.2197265625, + "learning_rate": 0.00182760763612882, + "loss": 0.1138, + "step": 20030 + }, + { + "epoch": 0.173878698969627, + "grad_norm": 0.1357421875, + "learning_rate": 0.001827590118714467, + "loss": 0.1152, + "step": 20031 + }, + { + "epoch": 0.17388737944983115, + "grad_norm": 0.058349609375, + "learning_rate": 0.001827572600504433, + "loss": 0.082, + "step": 20032 + }, + { + "epoch": 0.17389605993003532, + "grad_norm": 0.1220703125, + "learning_rate": 0.0018275550814987366, + "loss": 0.1147, + "step": 20033 + }, + { + "epoch": 0.17390474041023948, + "grad_norm": 0.55078125, + "learning_rate": 0.0018275375616973967, + "loss": 0.1309, + "step": 20034 + }, + { + "epoch": 0.17391342089044365, + "grad_norm": 0.498046875, + "learning_rate": 0.0018275200411004327, + "loss": 0.1045, + "step": 20035 + }, + { + "epoch": 0.1739221013706478, + "grad_norm": 0.1171875, + "learning_rate": 
0.001827502519707864, + "loss": 0.1025, + "step": 20036 + }, + { + "epoch": 0.17393078185085198, + "grad_norm": 0.29296875, + "learning_rate": 0.001827484997519709, + "loss": 0.126, + "step": 20037 + }, + { + "epoch": 0.17393946233105614, + "grad_norm": 0.12451171875, + "learning_rate": 0.0018274674745359877, + "loss": 0.0908, + "step": 20038 + }, + { + "epoch": 0.1739481428112603, + "grad_norm": 0.625, + "learning_rate": 0.0018274499507567185, + "loss": 0.0879, + "step": 20039 + }, + { + "epoch": 0.17395682329146447, + "grad_norm": 0.8984375, + "learning_rate": 0.001827432426181921, + "loss": 0.1572, + "step": 20040 + }, + { + "epoch": 0.17396550377166864, + "grad_norm": 0.1533203125, + "learning_rate": 0.0018274149008116148, + "loss": 0.0967, + "step": 20041 + }, + { + "epoch": 0.1739741842518728, + "grad_norm": 0.6015625, + "learning_rate": 0.0018273973746458184, + "loss": 0.1367, + "step": 20042 + }, + { + "epoch": 0.17398286473207697, + "grad_norm": 0.404296875, + "learning_rate": 0.0018273798476845509, + "loss": 0.0908, + "step": 20043 + }, + { + "epoch": 0.17399154521228113, + "grad_norm": 0.1767578125, + "learning_rate": 0.001827362319927832, + "loss": 0.1157, + "step": 20044 + }, + { + "epoch": 0.1740002256924853, + "grad_norm": 0.2177734375, + "learning_rate": 0.0018273447913756808, + "loss": 0.0947, + "step": 20045 + }, + { + "epoch": 0.17400890617268946, + "grad_norm": 0.482421875, + "learning_rate": 0.0018273272620281157, + "loss": 0.1543, + "step": 20046 + }, + { + "epoch": 0.17401758665289363, + "grad_norm": 0.25, + "learning_rate": 0.0018273097318851567, + "loss": 0.1123, + "step": 20047 + }, + { + "epoch": 0.1740262671330978, + "grad_norm": 0.1533203125, + "learning_rate": 0.0018272922009468229, + "loss": 0.1147, + "step": 20048 + }, + { + "epoch": 0.17403494761330196, + "grad_norm": 0.1484375, + "learning_rate": 0.0018272746692131329, + "loss": 0.0996, + "step": 20049 + }, + { + "epoch": 0.17404362809350613, + "grad_norm": 0.146484375, + "learning_rate": 0.0018272571366841064, + "loss": 0.0938, + "step": 20050 + }, + { + "epoch": 0.1740523085737103, + "grad_norm": 0.333984375, + "learning_rate": 0.0018272396033597626, + "loss": 0.0957, + "step": 20051 + }, + { + "epoch": 0.17406098905391446, + "grad_norm": 0.9375, + "learning_rate": 0.0018272220692401202, + "loss": 0.1553, + "step": 20052 + }, + { + "epoch": 0.17406966953411862, + "grad_norm": 0.59765625, + "learning_rate": 0.001827204534325199, + "loss": 0.1201, + "step": 20053 + }, + { + "epoch": 0.17407835001432279, + "grad_norm": 0.1962890625, + "learning_rate": 0.0018271869986150177, + "loss": 0.126, + "step": 20054 + }, + { + "epoch": 0.17408703049452695, + "grad_norm": 0.28515625, + "learning_rate": 0.0018271694621095956, + "loss": 0.0977, + "step": 20055 + }, + { + "epoch": 0.17409571097473112, + "grad_norm": 0.1142578125, + "learning_rate": 0.0018271519248089518, + "loss": 0.1108, + "step": 20056 + }, + { + "epoch": 0.17410439145493528, + "grad_norm": 0.1279296875, + "learning_rate": 0.0018271343867131058, + "loss": 0.1377, + "step": 20057 + }, + { + "epoch": 0.17411307193513945, + "grad_norm": 0.52734375, + "learning_rate": 0.0018271168478220764, + "loss": 0.0879, + "step": 20058 + }, + { + "epoch": 0.1741217524153436, + "grad_norm": 0.5625, + "learning_rate": 0.001827099308135883, + "loss": 0.1318, + "step": 20059 + }, + { + "epoch": 0.17413043289554778, + "grad_norm": 0.50390625, + "learning_rate": 0.0018270817676545447, + "loss": 0.1055, + "step": 20060 + }, + { + "epoch": 0.17413911337575194, + "grad_norm": 
0.11962890625, + "learning_rate": 0.0018270642263780805, + "loss": 0.1348, + "step": 20061 + }, + { + "epoch": 0.1741477938559561, + "grad_norm": 0.482421875, + "learning_rate": 0.00182704668430651, + "loss": 0.1162, + "step": 20062 + }, + { + "epoch": 0.17415647433616027, + "grad_norm": 0.392578125, + "learning_rate": 0.001827029141439852, + "loss": 0.0898, + "step": 20063 + }, + { + "epoch": 0.17416515481636444, + "grad_norm": 0.2294921875, + "learning_rate": 0.0018270115977781266, + "loss": 0.1143, + "step": 20064 + }, + { + "epoch": 0.1741738352965686, + "grad_norm": 0.1962890625, + "learning_rate": 0.0018269940533213514, + "loss": 0.1445, + "step": 20065 + }, + { + "epoch": 0.17418251577677277, + "grad_norm": 0.7265625, + "learning_rate": 0.0018269765080695466, + "loss": 0.168, + "step": 20066 + }, + { + "epoch": 0.17419119625697693, + "grad_norm": 0.08544921875, + "learning_rate": 0.0018269589620227313, + "loss": 0.1045, + "step": 20067 + }, + { + "epoch": 0.1741998767371811, + "grad_norm": 0.1953125, + "learning_rate": 0.0018269414151809247, + "loss": 0.1279, + "step": 20068 + }, + { + "epoch": 0.17420855721738526, + "grad_norm": 0.2734375, + "learning_rate": 0.0018269238675441457, + "loss": 0.1641, + "step": 20069 + }, + { + "epoch": 0.17421723769758943, + "grad_norm": 0.71484375, + "learning_rate": 0.0018269063191124135, + "loss": 0.0962, + "step": 20070 + }, + { + "epoch": 0.1742259181777936, + "grad_norm": 0.302734375, + "learning_rate": 0.0018268887698857479, + "loss": 0.1055, + "step": 20071 + }, + { + "epoch": 0.17423459865799776, + "grad_norm": 0.5703125, + "learning_rate": 0.0018268712198641678, + "loss": 0.124, + "step": 20072 + }, + { + "epoch": 0.17424327913820192, + "grad_norm": 0.97265625, + "learning_rate": 0.0018268536690476918, + "loss": 0.1387, + "step": 20073 + }, + { + "epoch": 0.1742519596184061, + "grad_norm": 0.2021484375, + "learning_rate": 0.0018268361174363396, + "loss": 0.125, + "step": 20074 + }, + { + "epoch": 0.17426064009861025, + "grad_norm": 0.64453125, + "learning_rate": 0.0018268185650301306, + "loss": 0.1123, + "step": 20075 + }, + { + "epoch": 0.17426932057881442, + "grad_norm": 0.158203125, + "learning_rate": 0.0018268010118290836, + "loss": 0.1172, + "step": 20076 + }, + { + "epoch": 0.17427800105901858, + "grad_norm": 0.1748046875, + "learning_rate": 0.001826783457833218, + "loss": 0.0708, + "step": 20077 + }, + { + "epoch": 0.17428668153922275, + "grad_norm": 1.2578125, + "learning_rate": 0.0018267659030425528, + "loss": 0.1309, + "step": 20078 + }, + { + "epoch": 0.1742953620194269, + "grad_norm": 0.625, + "learning_rate": 0.0018267483474571071, + "loss": 0.1465, + "step": 20079 + }, + { + "epoch": 0.17430404249963108, + "grad_norm": 0.423828125, + "learning_rate": 0.001826730791076901, + "loss": 0.1475, + "step": 20080 + }, + { + "epoch": 0.17431272297983524, + "grad_norm": 0.2216796875, + "learning_rate": 0.0018267132339019525, + "loss": 0.1079, + "step": 20081 + }, + { + "epoch": 0.1743214034600394, + "grad_norm": 0.3203125, + "learning_rate": 0.0018266956759322814, + "loss": 0.1035, + "step": 20082 + }, + { + "epoch": 0.17433008394024357, + "grad_norm": 0.138671875, + "learning_rate": 0.001826678117167907, + "loss": 0.0986, + "step": 20083 + }, + { + "epoch": 0.17433876442044774, + "grad_norm": 0.6484375, + "learning_rate": 0.0018266605576088481, + "loss": 0.105, + "step": 20084 + }, + { + "epoch": 0.1743474449006519, + "grad_norm": 0.08251953125, + "learning_rate": 0.0018266429972551245, + "loss": 0.0933, + "step": 20085 + }, + { + 
"epoch": 0.17435612538085607, + "grad_norm": 0.328125, + "learning_rate": 0.0018266254361067546, + "loss": 0.1611, + "step": 20086 + }, + { + "epoch": 0.17436480586106023, + "grad_norm": 0.341796875, + "learning_rate": 0.0018266078741637581, + "loss": 0.106, + "step": 20087 + }, + { + "epoch": 0.1743734863412644, + "grad_norm": 0.54296875, + "learning_rate": 0.0018265903114261543, + "loss": 0.1191, + "step": 20088 + }, + { + "epoch": 0.17438216682146856, + "grad_norm": 0.71484375, + "learning_rate": 0.0018265727478939623, + "loss": 0.1318, + "step": 20089 + }, + { + "epoch": 0.17439084730167273, + "grad_norm": 0.4609375, + "learning_rate": 0.0018265551835672013, + "loss": 0.1582, + "step": 20090 + }, + { + "epoch": 0.1743995277818769, + "grad_norm": 0.154296875, + "learning_rate": 0.0018265376184458902, + "loss": 0.105, + "step": 20091 + }, + { + "epoch": 0.17440820826208106, + "grad_norm": 0.5078125, + "learning_rate": 0.0018265200525300486, + "loss": 0.1279, + "step": 20092 + }, + { + "epoch": 0.17441688874228523, + "grad_norm": 0.376953125, + "learning_rate": 0.0018265024858196955, + "loss": 0.1196, + "step": 20093 + }, + { + "epoch": 0.1744255692224894, + "grad_norm": 0.302734375, + "learning_rate": 0.0018264849183148506, + "loss": 0.126, + "step": 20094 + }, + { + "epoch": 0.17443424970269356, + "grad_norm": 0.25, + "learning_rate": 0.0018264673500155322, + "loss": 0.1348, + "step": 20095 + }, + { + "epoch": 0.17444293018289772, + "grad_norm": 0.4765625, + "learning_rate": 0.0018264497809217602, + "loss": 0.1201, + "step": 20096 + }, + { + "epoch": 0.17445161066310189, + "grad_norm": 0.53515625, + "learning_rate": 0.001826432211033554, + "loss": 0.1465, + "step": 20097 + }, + { + "epoch": 0.17446029114330605, + "grad_norm": 0.431640625, + "learning_rate": 0.001826414640350932, + "loss": 0.1045, + "step": 20098 + }, + { + "epoch": 0.17446897162351022, + "grad_norm": 0.625, + "learning_rate": 0.001826397068873914, + "loss": 0.1201, + "step": 20099 + }, + { + "epoch": 0.17447765210371438, + "grad_norm": 0.3828125, + "learning_rate": 0.0018263794966025188, + "loss": 0.0986, + "step": 20100 + }, + { + "epoch": 0.17448633258391855, + "grad_norm": 0.53515625, + "learning_rate": 0.0018263619235367662, + "loss": 0.0918, + "step": 20101 + }, + { + "epoch": 0.1744950130641227, + "grad_norm": 0.10888671875, + "learning_rate": 0.0018263443496766751, + "loss": 0.1025, + "step": 20102 + }, + { + "epoch": 0.17450369354432688, + "grad_norm": 0.46484375, + "learning_rate": 0.0018263267750222648, + "loss": 0.0967, + "step": 20103 + }, + { + "epoch": 0.17451237402453104, + "grad_norm": 0.79296875, + "learning_rate": 0.0018263091995735544, + "loss": 0.1875, + "step": 20104 + }, + { + "epoch": 0.1745210545047352, + "grad_norm": 0.2333984375, + "learning_rate": 0.001826291623330563, + "loss": 0.1016, + "step": 20105 + }, + { + "epoch": 0.17452973498493937, + "grad_norm": 0.43359375, + "learning_rate": 0.0018262740462933102, + "loss": 0.1221, + "step": 20106 + }, + { + "epoch": 0.17453841546514354, + "grad_norm": 0.173828125, + "learning_rate": 0.001826256468461815, + "loss": 0.0981, + "step": 20107 + }, + { + "epoch": 0.1745470959453477, + "grad_norm": 0.390625, + "learning_rate": 0.0018262388898360965, + "loss": 0.123, + "step": 20108 + }, + { + "epoch": 0.17455577642555187, + "grad_norm": 1.078125, + "learning_rate": 0.0018262213104161742, + "loss": 0.1123, + "step": 20109 + }, + { + "epoch": 0.17456445690575603, + "grad_norm": 0.3046875, + "learning_rate": 0.0018262037302020672, + "loss": 0.0947, + 
"step": 20110 + }, + { + "epoch": 0.1745731373859602, + "grad_norm": 0.0927734375, + "learning_rate": 0.0018261861491937945, + "loss": 0.0874, + "step": 20111 + }, + { + "epoch": 0.17458181786616436, + "grad_norm": 0.123046875, + "learning_rate": 0.0018261685673913756, + "loss": 0.1318, + "step": 20112 + }, + { + "epoch": 0.17459049834636853, + "grad_norm": 0.498046875, + "learning_rate": 0.0018261509847948298, + "loss": 0.125, + "step": 20113 + }, + { + "epoch": 0.1745991788265727, + "grad_norm": 0.3125, + "learning_rate": 0.0018261334014041761, + "loss": 0.1006, + "step": 20114 + }, + { + "epoch": 0.17460785930677686, + "grad_norm": 0.5, + "learning_rate": 0.0018261158172194342, + "loss": 0.1074, + "step": 20115 + }, + { + "epoch": 0.17461653978698102, + "grad_norm": 0.75, + "learning_rate": 0.0018260982322406223, + "loss": 0.1118, + "step": 20116 + }, + { + "epoch": 0.1746252202671852, + "grad_norm": 0.2451171875, + "learning_rate": 0.0018260806464677607, + "loss": 0.0864, + "step": 20117 + }, + { + "epoch": 0.17463390074738935, + "grad_norm": 0.07763671875, + "learning_rate": 0.0018260630599008679, + "loss": 0.0801, + "step": 20118 + }, + { + "epoch": 0.17464258122759352, + "grad_norm": 1.3359375, + "learning_rate": 0.0018260454725399639, + "loss": 0.0967, + "step": 20119 + }, + { + "epoch": 0.17465126170779768, + "grad_norm": 0.10009765625, + "learning_rate": 0.0018260278843850672, + "loss": 0.1035, + "step": 20120 + }, + { + "epoch": 0.17465994218800185, + "grad_norm": 0.59375, + "learning_rate": 0.0018260102954361975, + "loss": 0.1289, + "step": 20121 + }, + { + "epoch": 0.17466862266820601, + "grad_norm": 0.85546875, + "learning_rate": 0.0018259927056933733, + "loss": 0.1055, + "step": 20122 + }, + { + "epoch": 0.17467730314841018, + "grad_norm": 0.0927734375, + "learning_rate": 0.0018259751151566148, + "loss": 0.0708, + "step": 20123 + }, + { + "epoch": 0.17468598362861434, + "grad_norm": 0.07080078125, + "learning_rate": 0.001825957523825941, + "loss": 0.1338, + "step": 20124 + }, + { + "epoch": 0.1746946641088185, + "grad_norm": 1.6015625, + "learning_rate": 0.001825939931701371, + "loss": 0.1309, + "step": 20125 + }, + { + "epoch": 0.17470334458902267, + "grad_norm": 0.154296875, + "learning_rate": 0.0018259223387829236, + "loss": 0.1133, + "step": 20126 + }, + { + "epoch": 0.17471202506922684, + "grad_norm": 0.3125, + "learning_rate": 0.0018259047450706186, + "loss": 0.1201, + "step": 20127 + }, + { + "epoch": 0.174720705549431, + "grad_norm": 0.1337890625, + "learning_rate": 0.001825887150564475, + "loss": 0.1123, + "step": 20128 + }, + { + "epoch": 0.17472938602963517, + "grad_norm": 0.37890625, + "learning_rate": 0.0018258695552645125, + "loss": 0.1064, + "step": 20129 + }, + { + "epoch": 0.17473806650983933, + "grad_norm": 0.1357421875, + "learning_rate": 0.0018258519591707494, + "loss": 0.1055, + "step": 20130 + }, + { + "epoch": 0.1747467469900435, + "grad_norm": 0.275390625, + "learning_rate": 0.0018258343622832057, + "loss": 0.1289, + "step": 20131 + }, + { + "epoch": 0.17475542747024767, + "grad_norm": 0.109375, + "learning_rate": 0.0018258167646019005, + "loss": 0.1357, + "step": 20132 + }, + { + "epoch": 0.17476410795045183, + "grad_norm": 0.1328125, + "learning_rate": 0.0018257991661268529, + "loss": 0.1279, + "step": 20133 + }, + { + "epoch": 0.174772788430656, + "grad_norm": 0.3984375, + "learning_rate": 0.0018257815668580825, + "loss": 0.1221, + "step": 20134 + }, + { + "epoch": 0.17478146891086016, + "grad_norm": 0.2265625, + "learning_rate": 
0.001825763966795608, + "loss": 0.165, + "step": 20135 + }, + { + "epoch": 0.17479014939106433, + "grad_norm": 0.37109375, + "learning_rate": 0.0018257463659394492, + "loss": 0.1011, + "step": 20136 + }, + { + "epoch": 0.1747988298712685, + "grad_norm": 0.123046875, + "learning_rate": 0.0018257287642896248, + "loss": 0.1016, + "step": 20137 + }, + { + "epoch": 0.17480751035147266, + "grad_norm": 0.2890625, + "learning_rate": 0.0018257111618461546, + "loss": 0.1387, + "step": 20138 + }, + { + "epoch": 0.17481619083167682, + "grad_norm": 0.07763671875, + "learning_rate": 0.0018256935586090574, + "loss": 0.1069, + "step": 20139 + }, + { + "epoch": 0.17482487131188099, + "grad_norm": 0.263671875, + "learning_rate": 0.0018256759545783525, + "loss": 0.1143, + "step": 20140 + }, + { + "epoch": 0.17483355179208515, + "grad_norm": 0.232421875, + "learning_rate": 0.0018256583497540595, + "loss": 0.1206, + "step": 20141 + }, + { + "epoch": 0.17484223227228932, + "grad_norm": 0.15234375, + "learning_rate": 0.0018256407441361972, + "loss": 0.1162, + "step": 20142 + }, + { + "epoch": 0.17485091275249348, + "grad_norm": 0.2265625, + "learning_rate": 0.0018256231377247855, + "loss": 0.0947, + "step": 20143 + }, + { + "epoch": 0.17485959323269765, + "grad_norm": 0.30078125, + "learning_rate": 0.001825605530519843, + "loss": 0.1021, + "step": 20144 + }, + { + "epoch": 0.17486827371290178, + "grad_norm": 0.1513671875, + "learning_rate": 0.0018255879225213892, + "loss": 0.1094, + "step": 20145 + }, + { + "epoch": 0.17487695419310595, + "grad_norm": 0.181640625, + "learning_rate": 0.0018255703137294433, + "loss": 0.1465, + "step": 20146 + }, + { + "epoch": 0.17488563467331011, + "grad_norm": 0.2412109375, + "learning_rate": 0.0018255527041440246, + "loss": 0.0947, + "step": 20147 + }, + { + "epoch": 0.17489431515351428, + "grad_norm": 0.3046875, + "learning_rate": 0.0018255350937651524, + "loss": 0.1523, + "step": 20148 + }, + { + "epoch": 0.17490299563371844, + "grad_norm": 0.419921875, + "learning_rate": 0.001825517482592846, + "loss": 0.1016, + "step": 20149 + }, + { + "epoch": 0.1749116761139226, + "grad_norm": 0.130859375, + "learning_rate": 0.0018254998706271244, + "loss": 0.1035, + "step": 20150 + }, + { + "epoch": 0.17492035659412677, + "grad_norm": 0.0849609375, + "learning_rate": 0.0018254822578680073, + "loss": 0.0825, + "step": 20151 + }, + { + "epoch": 0.17492903707433094, + "grad_norm": 0.134765625, + "learning_rate": 0.0018254646443155134, + "loss": 0.1416, + "step": 20152 + }, + { + "epoch": 0.1749377175545351, + "grad_norm": 0.287109375, + "learning_rate": 0.0018254470299696624, + "loss": 0.1138, + "step": 20153 + }, + { + "epoch": 0.17494639803473927, + "grad_norm": 0.423828125, + "learning_rate": 0.0018254294148304735, + "loss": 0.1309, + "step": 20154 + }, + { + "epoch": 0.17495507851494344, + "grad_norm": 0.88671875, + "learning_rate": 0.0018254117988979659, + "loss": 0.1113, + "step": 20155 + }, + { + "epoch": 0.1749637589951476, + "grad_norm": 0.58984375, + "learning_rate": 0.0018253941821721586, + "loss": 0.1025, + "step": 20156 + }, + { + "epoch": 0.17497243947535177, + "grad_norm": 0.390625, + "learning_rate": 0.0018253765646530713, + "loss": 0.1133, + "step": 20157 + }, + { + "epoch": 0.17498111995555593, + "grad_norm": 0.359375, + "learning_rate": 0.0018253589463407232, + "loss": 0.1006, + "step": 20158 + }, + { + "epoch": 0.1749898004357601, + "grad_norm": 0.09619140625, + "learning_rate": 0.0018253413272351336, + "loss": 0.0967, + "step": 20159 + }, + { + "epoch": 
0.17499848091596426, + "grad_norm": 0.10791015625, + "learning_rate": 0.001825323707336321, + "loss": 0.0898, + "step": 20160 + }, + { + "epoch": 0.17500716139616843, + "grad_norm": 0.158203125, + "learning_rate": 0.0018253060866443062, + "loss": 0.0811, + "step": 20161 + }, + { + "epoch": 0.1750158418763726, + "grad_norm": 0.0869140625, + "learning_rate": 0.0018252884651591068, + "loss": 0.1133, + "step": 20162 + }, + { + "epoch": 0.17502452235657676, + "grad_norm": 0.35546875, + "learning_rate": 0.001825270842880743, + "loss": 0.1445, + "step": 20163 + }, + { + "epoch": 0.17503320283678092, + "grad_norm": 0.2275390625, + "learning_rate": 0.001825253219809234, + "loss": 0.167, + "step": 20164 + }, + { + "epoch": 0.1750418833169851, + "grad_norm": 0.97265625, + "learning_rate": 0.001825235595944599, + "loss": 0.0986, + "step": 20165 + }, + { + "epoch": 0.17505056379718925, + "grad_norm": 0.1904296875, + "learning_rate": 0.001825217971286857, + "loss": 0.0889, + "step": 20166 + }, + { + "epoch": 0.17505924427739342, + "grad_norm": 0.130859375, + "learning_rate": 0.0018252003458360277, + "loss": 0.1045, + "step": 20167 + }, + { + "epoch": 0.17506792475759758, + "grad_norm": 0.2421875, + "learning_rate": 0.0018251827195921305, + "loss": 0.1758, + "step": 20168 + }, + { + "epoch": 0.17507660523780175, + "grad_norm": 0.087890625, + "learning_rate": 0.0018251650925551841, + "loss": 0.0942, + "step": 20169 + }, + { + "epoch": 0.1750852857180059, + "grad_norm": 0.451171875, + "learning_rate": 0.001825147464725208, + "loss": 0.1089, + "step": 20170 + }, + { + "epoch": 0.17509396619821008, + "grad_norm": 0.71484375, + "learning_rate": 0.0018251298361022213, + "loss": 0.1338, + "step": 20171 + }, + { + "epoch": 0.17510264667841424, + "grad_norm": 0.173828125, + "learning_rate": 0.001825112206686244, + "loss": 0.1445, + "step": 20172 + }, + { + "epoch": 0.1751113271586184, + "grad_norm": 0.62109375, + "learning_rate": 0.001825094576477295, + "loss": 0.1367, + "step": 20173 + }, + { + "epoch": 0.17512000763882257, + "grad_norm": 0.0703125, + "learning_rate": 0.0018250769454753928, + "loss": 0.0864, + "step": 20174 + }, + { + "epoch": 0.17512868811902674, + "grad_norm": 0.09716796875, + "learning_rate": 0.0018250593136805576, + "loss": 0.0972, + "step": 20175 + }, + { + "epoch": 0.1751373685992309, + "grad_norm": 0.09375, + "learning_rate": 0.0018250416810928086, + "loss": 0.0713, + "step": 20176 + }, + { + "epoch": 0.17514604907943507, + "grad_norm": 0.1474609375, + "learning_rate": 0.0018250240477121647, + "loss": 0.085, + "step": 20177 + }, + { + "epoch": 0.17515472955963923, + "grad_norm": 0.23046875, + "learning_rate": 0.0018250064135386457, + "loss": 0.0981, + "step": 20178 + }, + { + "epoch": 0.1751634100398434, + "grad_norm": 0.10595703125, + "learning_rate": 0.0018249887785722703, + "loss": 0.0864, + "step": 20179 + }, + { + "epoch": 0.17517209052004756, + "grad_norm": 0.3203125, + "learning_rate": 0.0018249711428130581, + "loss": 0.0977, + "step": 20180 + }, + { + "epoch": 0.17518077100025173, + "grad_norm": 0.310546875, + "learning_rate": 0.0018249535062610284, + "loss": 0.0806, + "step": 20181 + }, + { + "epoch": 0.1751894514804559, + "grad_norm": 0.58984375, + "learning_rate": 0.0018249358689162002, + "loss": 0.1016, + "step": 20182 + }, + { + "epoch": 0.17519813196066006, + "grad_norm": 0.201171875, + "learning_rate": 0.0018249182307785937, + "loss": 0.0811, + "step": 20183 + }, + { + "epoch": 0.17520681244086422, + "grad_norm": 0.66796875, + "learning_rate": 0.0018249005918482269, + 
"loss": 0.1108, + "step": 20184 + }, + { + "epoch": 0.1752154929210684, + "grad_norm": 0.10107421875, + "learning_rate": 0.0018248829521251197, + "loss": 0.0933, + "step": 20185 + }, + { + "epoch": 0.17522417340127255, + "grad_norm": 0.11572265625, + "learning_rate": 0.0018248653116092917, + "loss": 0.1133, + "step": 20186 + }, + { + "epoch": 0.17523285388147672, + "grad_norm": 0.30078125, + "learning_rate": 0.0018248476703007616, + "loss": 0.1074, + "step": 20187 + }, + { + "epoch": 0.17524153436168088, + "grad_norm": 0.109375, + "learning_rate": 0.0018248300281995494, + "loss": 0.0713, + "step": 20188 + }, + { + "epoch": 0.17525021484188505, + "grad_norm": 0.3515625, + "learning_rate": 0.0018248123853056735, + "loss": 0.0967, + "step": 20189 + }, + { + "epoch": 0.17525889532208921, + "grad_norm": 0.10693359375, + "learning_rate": 0.0018247947416191541, + "loss": 0.1104, + "step": 20190 + }, + { + "epoch": 0.17526757580229338, + "grad_norm": 0.8828125, + "learning_rate": 0.0018247770971400097, + "loss": 0.1445, + "step": 20191 + }, + { + "epoch": 0.17527625628249754, + "grad_norm": 0.482421875, + "learning_rate": 0.0018247594518682603, + "loss": 0.1729, + "step": 20192 + }, + { + "epoch": 0.1752849367627017, + "grad_norm": 1.3203125, + "learning_rate": 0.0018247418058039244, + "loss": 0.1177, + "step": 20193 + }, + { + "epoch": 0.17529361724290587, + "grad_norm": 0.330078125, + "learning_rate": 0.001824724158947022, + "loss": 0.0918, + "step": 20194 + }, + { + "epoch": 0.17530229772311004, + "grad_norm": 0.578125, + "learning_rate": 0.0018247065112975724, + "loss": 0.0874, + "step": 20195 + }, + { + "epoch": 0.1753109782033142, + "grad_norm": 0.232421875, + "learning_rate": 0.0018246888628555944, + "loss": 0.0615, + "step": 20196 + }, + { + "epoch": 0.17531965868351837, + "grad_norm": 0.494140625, + "learning_rate": 0.0018246712136211075, + "loss": 0.1582, + "step": 20197 + }, + { + "epoch": 0.17532833916372254, + "grad_norm": 0.58984375, + "learning_rate": 0.001824653563594131, + "loss": 0.1455, + "step": 20198 + }, + { + "epoch": 0.1753370196439267, + "grad_norm": 0.1630859375, + "learning_rate": 0.0018246359127746846, + "loss": 0.0977, + "step": 20199 + }, + { + "epoch": 0.17534570012413087, + "grad_norm": 0.09619140625, + "learning_rate": 0.0018246182611627868, + "loss": 0.1079, + "step": 20200 + }, + { + "epoch": 0.17535438060433503, + "grad_norm": 0.09423828125, + "learning_rate": 0.0018246006087584577, + "loss": 0.1328, + "step": 20201 + }, + { + "epoch": 0.1753630610845392, + "grad_norm": 0.400390625, + "learning_rate": 0.001824582955561716, + "loss": 0.1143, + "step": 20202 + }, + { + "epoch": 0.17537174156474336, + "grad_norm": 0.189453125, + "learning_rate": 0.0018245653015725814, + "loss": 0.1069, + "step": 20203 + }, + { + "epoch": 0.17538042204494753, + "grad_norm": 0.306640625, + "learning_rate": 0.0018245476467910729, + "loss": 0.1143, + "step": 20204 + }, + { + "epoch": 0.1753891025251517, + "grad_norm": 0.1044921875, + "learning_rate": 0.0018245299912172102, + "loss": 0.1152, + "step": 20205 + }, + { + "epoch": 0.17539778300535586, + "grad_norm": 0.431640625, + "learning_rate": 0.0018245123348510125, + "loss": 0.103, + "step": 20206 + }, + { + "epoch": 0.17540646348556002, + "grad_norm": 0.84375, + "learning_rate": 0.0018244946776924988, + "loss": 0.1084, + "step": 20207 + }, + { + "epoch": 0.1754151439657642, + "grad_norm": 0.08203125, + "learning_rate": 0.0018244770197416884, + "loss": 0.0972, + "step": 20208 + }, + { + "epoch": 0.17542382444596835, + "grad_norm": 
0.349609375, + "learning_rate": 0.001824459360998601, + "loss": 0.1152, + "step": 20209 + }, + { + "epoch": 0.17543250492617252, + "grad_norm": 0.248046875, + "learning_rate": 0.0018244417014632562, + "loss": 0.0928, + "step": 20210 + }, + { + "epoch": 0.17544118540637668, + "grad_norm": 0.50390625, + "learning_rate": 0.001824424041135672, + "loss": 0.1084, + "step": 20211 + }, + { + "epoch": 0.17544986588658085, + "grad_norm": 0.1376953125, + "learning_rate": 0.0018244063800158692, + "loss": 0.0913, + "step": 20212 + }, + { + "epoch": 0.175458546366785, + "grad_norm": 0.314453125, + "learning_rate": 0.0018243887181038662, + "loss": 0.1182, + "step": 20213 + }, + { + "epoch": 0.17546722684698918, + "grad_norm": 0.236328125, + "learning_rate": 0.0018243710553996826, + "loss": 0.1064, + "step": 20214 + }, + { + "epoch": 0.17547590732719334, + "grad_norm": 0.55859375, + "learning_rate": 0.0018243533919033378, + "loss": 0.1436, + "step": 20215 + }, + { + "epoch": 0.1754845878073975, + "grad_norm": 0.263671875, + "learning_rate": 0.0018243357276148508, + "loss": 0.0757, + "step": 20216 + }, + { + "epoch": 0.17549326828760167, + "grad_norm": 0.150390625, + "learning_rate": 0.0018243180625342414, + "loss": 0.1113, + "step": 20217 + }, + { + "epoch": 0.17550194876780584, + "grad_norm": 0.28515625, + "learning_rate": 0.0018243003966615285, + "loss": 0.123, + "step": 20218 + }, + { + "epoch": 0.17551062924801, + "grad_norm": 0.8359375, + "learning_rate": 0.0018242827299967315, + "loss": 0.124, + "step": 20219 + }, + { + "epoch": 0.17551930972821417, + "grad_norm": 0.2294921875, + "learning_rate": 0.00182426506253987, + "loss": 0.1172, + "step": 20220 + }, + { + "epoch": 0.17552799020841833, + "grad_norm": 0.271484375, + "learning_rate": 0.0018242473942909627, + "loss": 0.1533, + "step": 20221 + }, + { + "epoch": 0.1755366706886225, + "grad_norm": 0.1669921875, + "learning_rate": 0.0018242297252500296, + "loss": 0.103, + "step": 20222 + }, + { + "epoch": 0.17554535116882666, + "grad_norm": 0.66015625, + "learning_rate": 0.0018242120554170897, + "loss": 0.1006, + "step": 20223 + }, + { + "epoch": 0.17555403164903083, + "grad_norm": 0.259765625, + "learning_rate": 0.0018241943847921627, + "loss": 0.1387, + "step": 20224 + }, + { + "epoch": 0.175562712129235, + "grad_norm": 0.765625, + "learning_rate": 0.0018241767133752673, + "loss": 0.1387, + "step": 20225 + }, + { + "epoch": 0.17557139260943916, + "grad_norm": 0.68359375, + "learning_rate": 0.001824159041166423, + "loss": 0.1436, + "step": 20226 + }, + { + "epoch": 0.17558007308964332, + "grad_norm": 0.140625, + "learning_rate": 0.0018241413681656493, + "loss": 0.1553, + "step": 20227 + }, + { + "epoch": 0.1755887535698475, + "grad_norm": 1.1640625, + "learning_rate": 0.0018241236943729656, + "loss": 0.1309, + "step": 20228 + }, + { + "epoch": 0.17559743405005165, + "grad_norm": 0.2294921875, + "learning_rate": 0.0018241060197883914, + "loss": 0.1328, + "step": 20229 + }, + { + "epoch": 0.17560611453025582, + "grad_norm": 0.298828125, + "learning_rate": 0.0018240883444119451, + "loss": 0.1406, + "step": 20230 + }, + { + "epoch": 0.17561479501045998, + "grad_norm": 0.134765625, + "learning_rate": 0.0018240706682436474, + "loss": 0.1484, + "step": 20231 + }, + { + "epoch": 0.17562347549066415, + "grad_norm": 0.2255859375, + "learning_rate": 0.0018240529912835163, + "loss": 0.123, + "step": 20232 + }, + { + "epoch": 0.17563215597086831, + "grad_norm": 0.2265625, + "learning_rate": 0.001824035313531572, + "loss": 0.0903, + "step": 20233 + }, + { + 
"epoch": 0.17564083645107248, + "grad_norm": 0.51953125, + "learning_rate": 0.0018240176349878334, + "loss": 0.125, + "step": 20234 + }, + { + "epoch": 0.17564951693127664, + "grad_norm": 0.109375, + "learning_rate": 0.00182399995565232, + "loss": 0.1099, + "step": 20235 + }, + { + "epoch": 0.1756581974114808, + "grad_norm": 0.11083984375, + "learning_rate": 0.0018239822755250514, + "loss": 0.1338, + "step": 20236 + }, + { + "epoch": 0.17566687789168497, + "grad_norm": 0.138671875, + "learning_rate": 0.0018239645946060464, + "loss": 0.1089, + "step": 20237 + }, + { + "epoch": 0.17567555837188914, + "grad_norm": 0.5625, + "learning_rate": 0.0018239469128953248, + "loss": 0.1836, + "step": 20238 + }, + { + "epoch": 0.1756842388520933, + "grad_norm": 0.12451171875, + "learning_rate": 0.0018239292303929057, + "loss": 0.1338, + "step": 20239 + }, + { + "epoch": 0.17569291933229747, + "grad_norm": 0.197265625, + "learning_rate": 0.0018239115470988082, + "loss": 0.0752, + "step": 20240 + }, + { + "epoch": 0.17570159981250164, + "grad_norm": 0.1552734375, + "learning_rate": 0.001823893863013052, + "loss": 0.1201, + "step": 20241 + }, + { + "epoch": 0.1757102802927058, + "grad_norm": 0.10791015625, + "learning_rate": 0.0018238761781356565, + "loss": 0.0898, + "step": 20242 + }, + { + "epoch": 0.17571896077290997, + "grad_norm": 0.2041015625, + "learning_rate": 0.0018238584924666406, + "loss": 0.1348, + "step": 20243 + }, + { + "epoch": 0.17572764125311413, + "grad_norm": 0.10791015625, + "learning_rate": 0.0018238408060060243, + "loss": 0.1196, + "step": 20244 + }, + { + "epoch": 0.1757363217333183, + "grad_norm": 0.283203125, + "learning_rate": 0.0018238231187538265, + "loss": 0.1221, + "step": 20245 + }, + { + "epoch": 0.17574500221352246, + "grad_norm": 0.11865234375, + "learning_rate": 0.0018238054307100665, + "loss": 0.1318, + "step": 20246 + }, + { + "epoch": 0.17575368269372663, + "grad_norm": 0.166015625, + "learning_rate": 0.001823787741874764, + "loss": 0.1387, + "step": 20247 + }, + { + "epoch": 0.1757623631739308, + "grad_norm": 0.08984375, + "learning_rate": 0.0018237700522479377, + "loss": 0.1367, + "step": 20248 + }, + { + "epoch": 0.17577104365413496, + "grad_norm": 0.4609375, + "learning_rate": 0.0018237523618296074, + "loss": 0.1162, + "step": 20249 + }, + { + "epoch": 0.17577972413433912, + "grad_norm": 0.322265625, + "learning_rate": 0.0018237346706197926, + "loss": 0.0679, + "step": 20250 + }, + { + "epoch": 0.1757884046145433, + "grad_norm": 1.65625, + "learning_rate": 0.0018237169786185124, + "loss": 0.1465, + "step": 20251 + }, + { + "epoch": 0.17579708509474745, + "grad_norm": 0.42578125, + "learning_rate": 0.001823699285825786, + "loss": 0.0825, + "step": 20252 + }, + { + "epoch": 0.17580576557495162, + "grad_norm": 0.3203125, + "learning_rate": 0.0018236815922416335, + "loss": 0.126, + "step": 20253 + }, + { + "epoch": 0.17581444605515578, + "grad_norm": 0.302734375, + "learning_rate": 0.0018236638978660728, + "loss": 0.1172, + "step": 20254 + }, + { + "epoch": 0.17582312653535995, + "grad_norm": 0.140625, + "learning_rate": 0.001823646202699125, + "loss": 0.126, + "step": 20255 + }, + { + "epoch": 0.1758318070155641, + "grad_norm": 0.345703125, + "learning_rate": 0.001823628506740808, + "loss": 0.0874, + "step": 20256 + }, + { + "epoch": 0.17584048749576828, + "grad_norm": 0.392578125, + "learning_rate": 0.001823610809991142, + "loss": 0.1475, + "step": 20257 + }, + { + "epoch": 0.17584916797597244, + "grad_norm": 0.09716796875, + "learning_rate": 0.0018235931124501459, 
+ "loss": 0.1094, + "step": 20258 + }, + { + "epoch": 0.1758578484561766, + "grad_norm": 0.7890625, + "learning_rate": 0.0018235754141178394, + "loss": 0.082, + "step": 20259 + }, + { + "epoch": 0.17586652893638077, + "grad_norm": 0.322265625, + "learning_rate": 0.0018235577149942415, + "loss": 0.1367, + "step": 20260 + }, + { + "epoch": 0.17587520941658494, + "grad_norm": 0.12890625, + "learning_rate": 0.001823540015079372, + "loss": 0.1191, + "step": 20261 + }, + { + "epoch": 0.1758838898967891, + "grad_norm": 0.259765625, + "learning_rate": 0.00182352231437325, + "loss": 0.126, + "step": 20262 + }, + { + "epoch": 0.17589257037699327, + "grad_norm": 0.33984375, + "learning_rate": 0.0018235046128758947, + "loss": 0.1143, + "step": 20263 + }, + { + "epoch": 0.17590125085719743, + "grad_norm": 0.33203125, + "learning_rate": 0.001823486910587326, + "loss": 0.126, + "step": 20264 + }, + { + "epoch": 0.1759099313374016, + "grad_norm": 1.515625, + "learning_rate": 0.0018234692075075622, + "loss": 0.7734, + "step": 20265 + }, + { + "epoch": 0.17591861181760576, + "grad_norm": 0.41796875, + "learning_rate": 0.0018234515036366237, + "loss": 0.082, + "step": 20266 + }, + { + "epoch": 0.17592729229780993, + "grad_norm": 0.138671875, + "learning_rate": 0.0018234337989745297, + "loss": 0.1191, + "step": 20267 + }, + { + "epoch": 0.17593597277801407, + "grad_norm": 0.1611328125, + "learning_rate": 0.0018234160935212994, + "loss": 0.0938, + "step": 20268 + }, + { + "epoch": 0.17594465325821823, + "grad_norm": 0.107421875, + "learning_rate": 0.001823398387276952, + "loss": 0.0903, + "step": 20269 + }, + { + "epoch": 0.1759533337384224, + "grad_norm": 0.09228515625, + "learning_rate": 0.0018233806802415067, + "loss": 0.0947, + "step": 20270 + }, + { + "epoch": 0.17596201421862656, + "grad_norm": 0.25390625, + "learning_rate": 0.0018233629724149832, + "loss": 0.1865, + "step": 20271 + }, + { + "epoch": 0.17597069469883073, + "grad_norm": 0.2294921875, + "learning_rate": 0.0018233452637974013, + "loss": 0.1089, + "step": 20272 + }, + { + "epoch": 0.1759793751790349, + "grad_norm": 0.4296875, + "learning_rate": 0.0018233275543887795, + "loss": 0.1191, + "step": 20273 + }, + { + "epoch": 0.17598805565923906, + "grad_norm": 0.294921875, + "learning_rate": 0.0018233098441891378, + "loss": 0.1074, + "step": 20274 + }, + { + "epoch": 0.17599673613944322, + "grad_norm": 0.310546875, + "learning_rate": 0.001823292133198495, + "loss": 0.1299, + "step": 20275 + }, + { + "epoch": 0.1760054166196474, + "grad_norm": 0.60546875, + "learning_rate": 0.001823274421416871, + "loss": 0.1157, + "step": 20276 + }, + { + "epoch": 0.17601409709985155, + "grad_norm": 0.41015625, + "learning_rate": 0.0018232567088442848, + "loss": 0.1191, + "step": 20277 + }, + { + "epoch": 0.17602277758005572, + "grad_norm": 0.11279296875, + "learning_rate": 0.0018232389954807562, + "loss": 0.1113, + "step": 20278 + }, + { + "epoch": 0.17603145806025988, + "grad_norm": 0.455078125, + "learning_rate": 0.0018232212813263044, + "loss": 0.1094, + "step": 20279 + }, + { + "epoch": 0.17604013854046405, + "grad_norm": 0.51171875, + "learning_rate": 0.0018232035663809482, + "loss": 0.1016, + "step": 20280 + }, + { + "epoch": 0.1760488190206682, + "grad_norm": 0.07421875, + "learning_rate": 0.0018231858506447078, + "loss": 0.1089, + "step": 20281 + }, + { + "epoch": 0.17605749950087238, + "grad_norm": 0.07666015625, + "learning_rate": 0.001823168134117602, + "loss": 0.1152, + "step": 20282 + }, + { + "epoch": 0.17606617998107654, + "grad_norm": 0.703125, + 
"learning_rate": 0.0018231504167996505, + "loss": 0.1182, + "step": 20283 + }, + { + "epoch": 0.1760748604612807, + "grad_norm": 0.2353515625, + "learning_rate": 0.0018231326986908724, + "loss": 0.1079, + "step": 20284 + }, + { + "epoch": 0.17608354094148487, + "grad_norm": 0.1630859375, + "learning_rate": 0.0018231149797912876, + "loss": 0.1406, + "step": 20285 + }, + { + "epoch": 0.17609222142168904, + "grad_norm": 0.10302734375, + "learning_rate": 0.001823097260100915, + "loss": 0.0796, + "step": 20286 + }, + { + "epoch": 0.1761009019018932, + "grad_norm": 0.2578125, + "learning_rate": 0.001823079539619774, + "loss": 0.1396, + "step": 20287 + }, + { + "epoch": 0.17610958238209737, + "grad_norm": 0.4609375, + "learning_rate": 0.0018230618183478842, + "loss": 0.1318, + "step": 20288 + }, + { + "epoch": 0.17611826286230153, + "grad_norm": 0.6484375, + "learning_rate": 0.0018230440962852645, + "loss": 0.166, + "step": 20289 + }, + { + "epoch": 0.1761269433425057, + "grad_norm": 0.24609375, + "learning_rate": 0.001823026373431935, + "loss": 0.1279, + "step": 20290 + }, + { + "epoch": 0.17613562382270986, + "grad_norm": 0.1337890625, + "learning_rate": 0.0018230086497879145, + "loss": 0.1226, + "step": 20291 + }, + { + "epoch": 0.17614430430291403, + "grad_norm": 0.423828125, + "learning_rate": 0.001822990925353223, + "loss": 0.1533, + "step": 20292 + }, + { + "epoch": 0.1761529847831182, + "grad_norm": 0.498046875, + "learning_rate": 0.0018229732001278792, + "loss": 0.0967, + "step": 20293 + }, + { + "epoch": 0.17616166526332236, + "grad_norm": 0.73046875, + "learning_rate": 0.0018229554741119028, + "loss": 0.1172, + "step": 20294 + }, + { + "epoch": 0.17617034574352652, + "grad_norm": 0.10791015625, + "learning_rate": 0.0018229377473053133, + "loss": 0.0874, + "step": 20295 + }, + { + "epoch": 0.1761790262237307, + "grad_norm": 0.5390625, + "learning_rate": 0.0018229200197081295, + "loss": 0.1514, + "step": 20296 + }, + { + "epoch": 0.17618770670393485, + "grad_norm": 0.73828125, + "learning_rate": 0.0018229022913203714, + "loss": 0.1182, + "step": 20297 + }, + { + "epoch": 0.17619638718413902, + "grad_norm": 0.173828125, + "learning_rate": 0.0018228845621420585, + "loss": 0.104, + "step": 20298 + }, + { + "epoch": 0.17620506766434318, + "grad_norm": 0.1455078125, + "learning_rate": 0.0018228668321732097, + "loss": 0.085, + "step": 20299 + }, + { + "epoch": 0.17621374814454735, + "grad_norm": 0.3984375, + "learning_rate": 0.0018228491014138446, + "loss": 0.1465, + "step": 20300 + }, + { + "epoch": 0.17622242862475151, + "grad_norm": 0.56640625, + "learning_rate": 0.0018228313698639828, + "loss": 0.1592, + "step": 20301 + }, + { + "epoch": 0.17623110910495568, + "grad_norm": 0.1962890625, + "learning_rate": 0.0018228136375236432, + "loss": 0.1338, + "step": 20302 + }, + { + "epoch": 0.17623978958515985, + "grad_norm": 0.77734375, + "learning_rate": 0.0018227959043928457, + "loss": 0.0991, + "step": 20303 + }, + { + "epoch": 0.176248470065364, + "grad_norm": 0.25390625, + "learning_rate": 0.0018227781704716094, + "loss": 0.1338, + "step": 20304 + }, + { + "epoch": 0.17625715054556818, + "grad_norm": 0.1533203125, + "learning_rate": 0.0018227604357599534, + "loss": 0.0913, + "step": 20305 + }, + { + "epoch": 0.17626583102577234, + "grad_norm": 0.10595703125, + "learning_rate": 0.0018227427002578977, + "loss": 0.1318, + "step": 20306 + }, + { + "epoch": 0.1762745115059765, + "grad_norm": 0.0712890625, + "learning_rate": 0.0018227249639654614, + "loss": 0.0908, + "step": 20307 + }, + { + 
"epoch": 0.17628319198618067, + "grad_norm": 0.427734375, + "learning_rate": 0.0018227072268826641, + "loss": 0.0967, + "step": 20308 + }, + { + "epoch": 0.17629187246638484, + "grad_norm": 0.66015625, + "learning_rate": 0.0018226894890095249, + "loss": 0.1133, + "step": 20309 + }, + { + "epoch": 0.176300552946589, + "grad_norm": 0.0791015625, + "learning_rate": 0.0018226717503460634, + "loss": 0.0938, + "step": 20310 + }, + { + "epoch": 0.17630923342679317, + "grad_norm": 0.2890625, + "learning_rate": 0.001822654010892299, + "loss": 0.0928, + "step": 20311 + }, + { + "epoch": 0.17631791390699733, + "grad_norm": 0.2412109375, + "learning_rate": 0.0018226362706482509, + "loss": 0.1123, + "step": 20312 + }, + { + "epoch": 0.1763265943872015, + "grad_norm": 0.78515625, + "learning_rate": 0.0018226185296139387, + "loss": 0.1465, + "step": 20313 + }, + { + "epoch": 0.17633527486740566, + "grad_norm": 0.1396484375, + "learning_rate": 0.0018226007877893816, + "loss": 0.1001, + "step": 20314 + }, + { + "epoch": 0.17634395534760983, + "grad_norm": 0.2890625, + "learning_rate": 0.0018225830451745993, + "loss": 0.127, + "step": 20315 + }, + { + "epoch": 0.176352635827814, + "grad_norm": 0.78515625, + "learning_rate": 0.001822565301769611, + "loss": 0.1113, + "step": 20316 + }, + { + "epoch": 0.17636131630801816, + "grad_norm": 0.2119140625, + "learning_rate": 0.001822547557574436, + "loss": 0.0747, + "step": 20317 + }, + { + "epoch": 0.17636999678822232, + "grad_norm": 0.09765625, + "learning_rate": 0.0018225298125890941, + "loss": 0.124, + "step": 20318 + }, + { + "epoch": 0.1763786772684265, + "grad_norm": 0.1435546875, + "learning_rate": 0.0018225120668136042, + "loss": 0.1182, + "step": 20319 + }, + { + "epoch": 0.17638735774863065, + "grad_norm": 0.203125, + "learning_rate": 0.001822494320247986, + "loss": 0.1299, + "step": 20320 + }, + { + "epoch": 0.17639603822883482, + "grad_norm": 0.2578125, + "learning_rate": 0.001822476572892259, + "loss": 0.104, + "step": 20321 + }, + { + "epoch": 0.17640471870903898, + "grad_norm": 0.30859375, + "learning_rate": 0.0018224588247464425, + "loss": 0.1406, + "step": 20322 + }, + { + "epoch": 0.17641339918924315, + "grad_norm": 0.302734375, + "learning_rate": 0.0018224410758105557, + "loss": 0.0845, + "step": 20323 + }, + { + "epoch": 0.1764220796694473, + "grad_norm": 0.423828125, + "learning_rate": 0.0018224233260846184, + "loss": 0.0879, + "step": 20324 + }, + { + "epoch": 0.17643076014965148, + "grad_norm": 0.36328125, + "learning_rate": 0.0018224055755686494, + "loss": 0.0918, + "step": 20325 + }, + { + "epoch": 0.17643944062985564, + "grad_norm": 0.2216796875, + "learning_rate": 0.0018223878242626687, + "loss": 0.1245, + "step": 20326 + }, + { + "epoch": 0.1764481211100598, + "grad_norm": 0.158203125, + "learning_rate": 0.001822370072166696, + "loss": 0.1494, + "step": 20327 + }, + { + "epoch": 0.17645680159026397, + "grad_norm": 0.3671875, + "learning_rate": 0.0018223523192807496, + "loss": 0.1396, + "step": 20328 + }, + { + "epoch": 0.17646548207046814, + "grad_norm": 0.921875, + "learning_rate": 0.0018223345656048502, + "loss": 0.1035, + "step": 20329 + }, + { + "epoch": 0.1764741625506723, + "grad_norm": 0.15234375, + "learning_rate": 0.001822316811139016, + "loss": 0.1338, + "step": 20330 + }, + { + "epoch": 0.17648284303087647, + "grad_norm": 0.56640625, + "learning_rate": 0.0018222990558832673, + "loss": 0.1172, + "step": 20331 + }, + { + "epoch": 0.17649152351108063, + "grad_norm": 0.275390625, + "learning_rate": 0.0018222812998376233, + "loss": 
0.0962, + "step": 20332 + }, + { + "epoch": 0.1765002039912848, + "grad_norm": 0.6015625, + "learning_rate": 0.0018222635430021029, + "loss": 0.0996, + "step": 20333 + }, + { + "epoch": 0.17650888447148896, + "grad_norm": 0.337890625, + "learning_rate": 0.0018222457853767262, + "loss": 0.1133, + "step": 20334 + }, + { + "epoch": 0.17651756495169313, + "grad_norm": 0.439453125, + "learning_rate": 0.0018222280269615124, + "loss": 0.0991, + "step": 20335 + }, + { + "epoch": 0.1765262454318973, + "grad_norm": 0.34375, + "learning_rate": 0.0018222102677564805, + "loss": 0.0845, + "step": 20336 + }, + { + "epoch": 0.17653492591210146, + "grad_norm": 0.0732421875, + "learning_rate": 0.0018221925077616511, + "loss": 0.1172, + "step": 20337 + }, + { + "epoch": 0.17654360639230562, + "grad_norm": 0.66015625, + "learning_rate": 0.001822174746977042, + "loss": 0.1104, + "step": 20338 + }, + { + "epoch": 0.1765522868725098, + "grad_norm": 0.181640625, + "learning_rate": 0.0018221569854026739, + "loss": 0.0977, + "step": 20339 + }, + { + "epoch": 0.17656096735271395, + "grad_norm": 0.390625, + "learning_rate": 0.0018221392230385657, + "loss": 0.0752, + "step": 20340 + }, + { + "epoch": 0.17656964783291812, + "grad_norm": 0.330078125, + "learning_rate": 0.001822121459884737, + "loss": 0.1826, + "step": 20341 + }, + { + "epoch": 0.17657832831312228, + "grad_norm": 0.2060546875, + "learning_rate": 0.0018221036959412068, + "loss": 0.082, + "step": 20342 + }, + { + "epoch": 0.17658700879332645, + "grad_norm": 0.1484375, + "learning_rate": 0.0018220859312079952, + "loss": 0.1162, + "step": 20343 + }, + { + "epoch": 0.17659568927353061, + "grad_norm": 0.0888671875, + "learning_rate": 0.0018220681656851208, + "loss": 0.0767, + "step": 20344 + }, + { + "epoch": 0.17660436975373478, + "grad_norm": 0.146484375, + "learning_rate": 0.001822050399372604, + "loss": 0.1001, + "step": 20345 + }, + { + "epoch": 0.17661305023393895, + "grad_norm": 0.62109375, + "learning_rate": 0.0018220326322704635, + "loss": 0.1113, + "step": 20346 + }, + { + "epoch": 0.1766217307141431, + "grad_norm": 0.17578125, + "learning_rate": 0.0018220148643787188, + "loss": 0.1011, + "step": 20347 + }, + { + "epoch": 0.17663041119434728, + "grad_norm": 0.287109375, + "learning_rate": 0.0018219970956973898, + "loss": 0.1279, + "step": 20348 + }, + { + "epoch": 0.17663909167455144, + "grad_norm": 0.447265625, + "learning_rate": 0.0018219793262264955, + "loss": 0.1143, + "step": 20349 + }, + { + "epoch": 0.1766477721547556, + "grad_norm": 0.36328125, + "learning_rate": 0.0018219615559660557, + "loss": 0.1064, + "step": 20350 + }, + { + "epoch": 0.17665645263495977, + "grad_norm": 0.24609375, + "learning_rate": 0.001821943784916089, + "loss": 0.0947, + "step": 20351 + }, + { + "epoch": 0.17666513311516394, + "grad_norm": 0.0654296875, + "learning_rate": 0.001821926013076616, + "loss": 0.0977, + "step": 20352 + }, + { + "epoch": 0.1766738135953681, + "grad_norm": 0.53125, + "learning_rate": 0.0018219082404476552, + "loss": 0.1211, + "step": 20353 + }, + { + "epoch": 0.17668249407557227, + "grad_norm": 0.212890625, + "learning_rate": 0.0018218904670292265, + "loss": 0.0977, + "step": 20354 + }, + { + "epoch": 0.17669117455577643, + "grad_norm": 0.12255859375, + "learning_rate": 0.001821872692821349, + "loss": 0.1377, + "step": 20355 + }, + { + "epoch": 0.1766998550359806, + "grad_norm": 0.162109375, + "learning_rate": 0.0018218549178240428, + "loss": 0.1055, + "step": 20356 + }, + { + "epoch": 0.17670853551618476, + "grad_norm": 0.44921875, + 
"learning_rate": 0.001821837142037327, + "loss": 0.1611, + "step": 20357 + }, + { + "epoch": 0.17671721599638893, + "grad_norm": 0.10791015625, + "learning_rate": 0.0018218193654612207, + "loss": 0.1133, + "step": 20358 + }, + { + "epoch": 0.1767258964765931, + "grad_norm": 0.1357421875, + "learning_rate": 0.0018218015880957434, + "loss": 0.1021, + "step": 20359 + }, + { + "epoch": 0.17673457695679726, + "grad_norm": 0.3515625, + "learning_rate": 0.001821783809940915, + "loss": 0.1299, + "step": 20360 + }, + { + "epoch": 0.17674325743700142, + "grad_norm": 0.19921875, + "learning_rate": 0.0018217660309967545, + "loss": 0.1484, + "step": 20361 + }, + { + "epoch": 0.1767519379172056, + "grad_norm": 0.66015625, + "learning_rate": 0.0018217482512632814, + "loss": 0.1309, + "step": 20362 + }, + { + "epoch": 0.17676061839740975, + "grad_norm": 0.66796875, + "learning_rate": 0.0018217304707405157, + "loss": 0.1719, + "step": 20363 + }, + { + "epoch": 0.17676929887761392, + "grad_norm": 3.65625, + "learning_rate": 0.0018217126894284762, + "loss": 0.375, + "step": 20364 + }, + { + "epoch": 0.17677797935781808, + "grad_norm": 1.1328125, + "learning_rate": 0.0018216949073271824, + "loss": 0.1201, + "step": 20365 + }, + { + "epoch": 0.17678665983802225, + "grad_norm": 0.44921875, + "learning_rate": 0.001821677124436654, + "loss": 0.0967, + "step": 20366 + }, + { + "epoch": 0.1767953403182264, + "grad_norm": 0.1875, + "learning_rate": 0.00182165934075691, + "loss": 0.0957, + "step": 20367 + }, + { + "epoch": 0.17680402079843058, + "grad_norm": 0.703125, + "learning_rate": 0.0018216415562879707, + "loss": 0.291, + "step": 20368 + }, + { + "epoch": 0.17681270127863474, + "grad_norm": 0.107421875, + "learning_rate": 0.0018216237710298548, + "loss": 0.1465, + "step": 20369 + }, + { + "epoch": 0.1768213817588389, + "grad_norm": 0.95703125, + "learning_rate": 0.0018216059849825823, + "loss": 0.1348, + "step": 20370 + }, + { + "epoch": 0.17683006223904307, + "grad_norm": 0.48046875, + "learning_rate": 0.0018215881981461718, + "loss": 0.126, + "step": 20371 + }, + { + "epoch": 0.17683874271924724, + "grad_norm": 0.384765625, + "learning_rate": 0.0018215704105206437, + "loss": 0.1553, + "step": 20372 + }, + { + "epoch": 0.1768474231994514, + "grad_norm": 0.6875, + "learning_rate": 0.001821552622106017, + "loss": 0.1328, + "step": 20373 + }, + { + "epoch": 0.17685610367965557, + "grad_norm": 1.0078125, + "learning_rate": 0.0018215348329023112, + "loss": 0.1016, + "step": 20374 + }, + { + "epoch": 0.17686478415985973, + "grad_norm": 0.7421875, + "learning_rate": 0.0018215170429095454, + "loss": 0.105, + "step": 20375 + }, + { + "epoch": 0.1768734646400639, + "grad_norm": 0.169921875, + "learning_rate": 0.0018214992521277396, + "loss": 0.1289, + "step": 20376 + }, + { + "epoch": 0.17688214512026806, + "grad_norm": 0.58984375, + "learning_rate": 0.0018214814605569131, + "loss": 0.1299, + "step": 20377 + }, + { + "epoch": 0.17689082560047223, + "grad_norm": 0.1240234375, + "learning_rate": 0.0018214636681970855, + "loss": 0.1025, + "step": 20378 + }, + { + "epoch": 0.1768995060806764, + "grad_norm": 0.984375, + "learning_rate": 0.001821445875048276, + "loss": 0.106, + "step": 20379 + }, + { + "epoch": 0.17690818656088056, + "grad_norm": 0.1005859375, + "learning_rate": 0.0018214280811105039, + "loss": 0.0986, + "step": 20380 + }, + { + "epoch": 0.17691686704108472, + "grad_norm": 0.099609375, + "learning_rate": 0.001821410286383789, + "loss": 0.0986, + "step": 20381 + }, + { + "epoch": 0.1769255475212889, + 
"grad_norm": 0.3828125, + "learning_rate": 0.0018213924908681508, + "loss": 0.1113, + "step": 20382 + }, + { + "epoch": 0.17693422800149305, + "grad_norm": 0.67578125, + "learning_rate": 0.0018213746945636082, + "loss": 0.1289, + "step": 20383 + }, + { + "epoch": 0.17694290848169722, + "grad_norm": 0.1171875, + "learning_rate": 0.0018213568974701814, + "loss": 0.0957, + "step": 20384 + }, + { + "epoch": 0.17695158896190138, + "grad_norm": 0.69921875, + "learning_rate": 0.0018213390995878895, + "loss": 0.1104, + "step": 20385 + }, + { + "epoch": 0.17696026944210555, + "grad_norm": 0.1396484375, + "learning_rate": 0.001821321300916752, + "loss": 0.0776, + "step": 20386 + }, + { + "epoch": 0.17696894992230972, + "grad_norm": 0.1142578125, + "learning_rate": 0.0018213035014567882, + "loss": 0.0952, + "step": 20387 + }, + { + "epoch": 0.17697763040251388, + "grad_norm": 0.1728515625, + "learning_rate": 0.001821285701208018, + "loss": 0.106, + "step": 20388 + }, + { + "epoch": 0.17698631088271805, + "grad_norm": 0.083984375, + "learning_rate": 0.0018212679001704604, + "loss": 0.1035, + "step": 20389 + }, + { + "epoch": 0.1769949913629222, + "grad_norm": 0.10498046875, + "learning_rate": 0.0018212500983441353, + "loss": 0.1162, + "step": 20390 + }, + { + "epoch": 0.17700367184312638, + "grad_norm": 0.392578125, + "learning_rate": 0.0018212322957290618, + "loss": 0.1084, + "step": 20391 + }, + { + "epoch": 0.1770123523233305, + "grad_norm": 0.259765625, + "learning_rate": 0.0018212144923252592, + "loss": 0.1133, + "step": 20392 + }, + { + "epoch": 0.17702103280353468, + "grad_norm": 0.1884765625, + "learning_rate": 0.0018211966881327478, + "loss": 0.0859, + "step": 20393 + }, + { + "epoch": 0.17702971328373884, + "grad_norm": 0.515625, + "learning_rate": 0.0018211788831515462, + "loss": 0.127, + "step": 20394 + }, + { + "epoch": 0.177038393763943, + "grad_norm": 2.890625, + "learning_rate": 0.0018211610773816742, + "loss": 0.2129, + "step": 20395 + }, + { + "epoch": 0.17704707424414717, + "grad_norm": 0.47265625, + "learning_rate": 0.0018211432708231511, + "loss": 0.1094, + "step": 20396 + }, + { + "epoch": 0.17705575472435134, + "grad_norm": 0.337890625, + "learning_rate": 0.001821125463475997, + "loss": 0.1035, + "step": 20397 + }, + { + "epoch": 0.1770644352045555, + "grad_norm": 0.703125, + "learning_rate": 0.0018211076553402308, + "loss": 0.1387, + "step": 20398 + }, + { + "epoch": 0.17707311568475967, + "grad_norm": 0.1484375, + "learning_rate": 0.001821089846415872, + "loss": 0.1182, + "step": 20399 + }, + { + "epoch": 0.17708179616496383, + "grad_norm": 0.57421875, + "learning_rate": 0.0018210720367029401, + "loss": 0.083, + "step": 20400 + }, + { + "epoch": 0.177090476645168, + "grad_norm": 0.296875, + "learning_rate": 0.0018210542262014546, + "loss": 0.1328, + "step": 20401 + }, + { + "epoch": 0.17709915712537216, + "grad_norm": 0.0869140625, + "learning_rate": 0.0018210364149114356, + "loss": 0.083, + "step": 20402 + }, + { + "epoch": 0.17710783760557633, + "grad_norm": 0.482421875, + "learning_rate": 0.0018210186028329014, + "loss": 0.1328, + "step": 20403 + }, + { + "epoch": 0.1771165180857805, + "grad_norm": 0.0673828125, + "learning_rate": 0.0018210007899658724, + "loss": 0.0708, + "step": 20404 + }, + { + "epoch": 0.17712519856598466, + "grad_norm": 0.1748046875, + "learning_rate": 0.0018209829763103675, + "loss": 0.1152, + "step": 20405 + }, + { + "epoch": 0.17713387904618882, + "grad_norm": 0.5625, + "learning_rate": 0.0018209651618664067, + "loss": 0.1436, + "step": 20406 + }, + 
{ + "epoch": 0.177142559526393, + "grad_norm": 0.154296875, + "learning_rate": 0.001820947346634009, + "loss": 0.1553, + "step": 20407 + }, + { + "epoch": 0.17715124000659715, + "grad_norm": 0.271484375, + "learning_rate": 0.0018209295306131947, + "loss": 0.1328, + "step": 20408 + }, + { + "epoch": 0.17715992048680132, + "grad_norm": 0.28515625, + "learning_rate": 0.0018209117138039822, + "loss": 0.1201, + "step": 20409 + }, + { + "epoch": 0.17716860096700549, + "grad_norm": 0.1123046875, + "learning_rate": 0.0018208938962063913, + "loss": 0.0879, + "step": 20410 + }, + { + "epoch": 0.17717728144720965, + "grad_norm": 0.53125, + "learning_rate": 0.001820876077820442, + "loss": 0.1099, + "step": 20411 + }, + { + "epoch": 0.17718596192741382, + "grad_norm": 0.13671875, + "learning_rate": 0.0018208582586461534, + "loss": 0.0986, + "step": 20412 + }, + { + "epoch": 0.17719464240761798, + "grad_norm": 0.47265625, + "learning_rate": 0.0018208404386835448, + "loss": 0.1221, + "step": 20413 + }, + { + "epoch": 0.17720332288782215, + "grad_norm": 0.1611328125, + "learning_rate": 0.0018208226179326364, + "loss": 0.1094, + "step": 20414 + }, + { + "epoch": 0.1772120033680263, + "grad_norm": 0.380859375, + "learning_rate": 0.0018208047963934467, + "loss": 0.0933, + "step": 20415 + }, + { + "epoch": 0.17722068384823048, + "grad_norm": 0.23046875, + "learning_rate": 0.001820786974065996, + "loss": 0.1021, + "step": 20416 + }, + { + "epoch": 0.17722936432843464, + "grad_norm": 0.1806640625, + "learning_rate": 0.0018207691509503035, + "loss": 0.0679, + "step": 20417 + }, + { + "epoch": 0.1772380448086388, + "grad_norm": 0.46875, + "learning_rate": 0.0018207513270463885, + "loss": 0.0806, + "step": 20418 + }, + { + "epoch": 0.17724672528884297, + "grad_norm": 0.291015625, + "learning_rate": 0.0018207335023542708, + "loss": 0.0879, + "step": 20419 + }, + { + "epoch": 0.17725540576904714, + "grad_norm": 0.1513671875, + "learning_rate": 0.00182071567687397, + "loss": 0.1035, + "step": 20420 + }, + { + "epoch": 0.1772640862492513, + "grad_norm": 0.09619140625, + "learning_rate": 0.001820697850605505, + "loss": 0.0898, + "step": 20421 + }, + { + "epoch": 0.17727276672945547, + "grad_norm": 0.330078125, + "learning_rate": 0.001820680023548896, + "loss": 0.1074, + "step": 20422 + }, + { + "epoch": 0.17728144720965963, + "grad_norm": 0.55078125, + "learning_rate": 0.0018206621957041619, + "loss": 0.1357, + "step": 20423 + }, + { + "epoch": 0.1772901276898638, + "grad_norm": 0.2216796875, + "learning_rate": 0.0018206443670713223, + "loss": 0.1504, + "step": 20424 + }, + { + "epoch": 0.17729880817006796, + "grad_norm": 0.15234375, + "learning_rate": 0.001820626537650397, + "loss": 0.1201, + "step": 20425 + }, + { + "epoch": 0.17730748865027213, + "grad_norm": 0.29296875, + "learning_rate": 0.0018206087074414054, + "loss": 0.1201, + "step": 20426 + }, + { + "epoch": 0.1773161691304763, + "grad_norm": 0.22265625, + "learning_rate": 0.0018205908764443671, + "loss": 0.1523, + "step": 20427 + }, + { + "epoch": 0.17732484961068046, + "grad_norm": 0.34375, + "learning_rate": 0.0018205730446593012, + "loss": 0.1016, + "step": 20428 + }, + { + "epoch": 0.17733353009088462, + "grad_norm": 0.275390625, + "learning_rate": 0.0018205552120862274, + "loss": 0.1123, + "step": 20429 + }, + { + "epoch": 0.1773422105710888, + "grad_norm": 0.51953125, + "learning_rate": 0.0018205373787251653, + "loss": 0.1025, + "step": 20430 + }, + { + "epoch": 0.17735089105129295, + "grad_norm": 0.380859375, + "learning_rate": 0.0018205195445761347, 
+ "loss": 0.1357, + "step": 20431 + }, + { + "epoch": 0.17735957153149712, + "grad_norm": 0.326171875, + "learning_rate": 0.0018205017096391545, + "loss": 0.082, + "step": 20432 + }, + { + "epoch": 0.17736825201170128, + "grad_norm": 0.1552734375, + "learning_rate": 0.0018204838739142445, + "loss": 0.0957, + "step": 20433 + }, + { + "epoch": 0.17737693249190545, + "grad_norm": 0.380859375, + "learning_rate": 0.0018204660374014242, + "loss": 0.0898, + "step": 20434 + }, + { + "epoch": 0.1773856129721096, + "grad_norm": 0.3671875, + "learning_rate": 0.0018204482001007128, + "loss": 0.1191, + "step": 20435 + }, + { + "epoch": 0.17739429345231378, + "grad_norm": 0.2060546875, + "learning_rate": 0.0018204303620121302, + "loss": 0.1025, + "step": 20436 + }, + { + "epoch": 0.17740297393251794, + "grad_norm": 0.10205078125, + "learning_rate": 0.0018204125231356958, + "loss": 0.127, + "step": 20437 + }, + { + "epoch": 0.1774116544127221, + "grad_norm": 0.431640625, + "learning_rate": 0.0018203946834714291, + "loss": 0.124, + "step": 20438 + }, + { + "epoch": 0.17742033489292627, + "grad_norm": 0.1806640625, + "learning_rate": 0.0018203768430193495, + "loss": 0.1143, + "step": 20439 + }, + { + "epoch": 0.17742901537313044, + "grad_norm": 2.28125, + "learning_rate": 0.0018203590017794769, + "loss": 0.1562, + "step": 20440 + }, + { + "epoch": 0.1774376958533346, + "grad_norm": 1.0390625, + "learning_rate": 0.0018203411597518303, + "loss": 0.1289, + "step": 20441 + }, + { + "epoch": 0.17744637633353877, + "grad_norm": 0.1708984375, + "learning_rate": 0.0018203233169364296, + "loss": 0.1089, + "step": 20442 + }, + { + "epoch": 0.17745505681374293, + "grad_norm": 0.53125, + "learning_rate": 0.0018203054733332935, + "loss": 0.125, + "step": 20443 + }, + { + "epoch": 0.1774637372939471, + "grad_norm": 0.447265625, + "learning_rate": 0.001820287628942443, + "loss": 0.1113, + "step": 20444 + }, + { + "epoch": 0.17747241777415126, + "grad_norm": 0.515625, + "learning_rate": 0.0018202697837638965, + "loss": 0.0918, + "step": 20445 + }, + { + "epoch": 0.17748109825435543, + "grad_norm": 0.1728515625, + "learning_rate": 0.0018202519377976734, + "loss": 0.1079, + "step": 20446 + }, + { + "epoch": 0.1774897787345596, + "grad_norm": 0.1494140625, + "learning_rate": 0.001820234091043794, + "loss": 0.0923, + "step": 20447 + }, + { + "epoch": 0.17749845921476376, + "grad_norm": 0.1494140625, + "learning_rate": 0.0018202162435022773, + "loss": 0.0996, + "step": 20448 + }, + { + "epoch": 0.17750713969496792, + "grad_norm": 0.65625, + "learning_rate": 0.001820198395173143, + "loss": 0.126, + "step": 20449 + }, + { + "epoch": 0.1775158201751721, + "grad_norm": 0.41796875, + "learning_rate": 0.0018201805460564106, + "loss": 0.083, + "step": 20450 + }, + { + "epoch": 0.17752450065537626, + "grad_norm": 0.50390625, + "learning_rate": 0.0018201626961520997, + "loss": 0.1416, + "step": 20451 + }, + { + "epoch": 0.17753318113558042, + "grad_norm": 0.30859375, + "learning_rate": 0.0018201448454602295, + "loss": 0.1143, + "step": 20452 + }, + { + "epoch": 0.17754186161578459, + "grad_norm": 0.60546875, + "learning_rate": 0.0018201269939808196, + "loss": 0.1182, + "step": 20453 + }, + { + "epoch": 0.17755054209598875, + "grad_norm": 0.28515625, + "learning_rate": 0.0018201091417138896, + "loss": 0.125, + "step": 20454 + }, + { + "epoch": 0.17755922257619292, + "grad_norm": 0.11669921875, + "learning_rate": 0.0018200912886594592, + "loss": 0.1455, + "step": 20455 + }, + { + "epoch": 0.17756790305639708, + "grad_norm": 0.37109375, + 
"learning_rate": 0.0018200734348175478, + "loss": 0.1074, + "step": 20456 + }, + { + "epoch": 0.17757658353660125, + "grad_norm": 0.5078125, + "learning_rate": 0.0018200555801881752, + "loss": 0.1201, + "step": 20457 + }, + { + "epoch": 0.1775852640168054, + "grad_norm": 0.42578125, + "learning_rate": 0.0018200377247713602, + "loss": 0.1074, + "step": 20458 + }, + { + "epoch": 0.17759394449700958, + "grad_norm": 0.1435546875, + "learning_rate": 0.001820019868567123, + "loss": 0.1328, + "step": 20459 + }, + { + "epoch": 0.17760262497721374, + "grad_norm": 0.34765625, + "learning_rate": 0.001820002011575483, + "loss": 0.1523, + "step": 20460 + }, + { + "epoch": 0.1776113054574179, + "grad_norm": 0.19921875, + "learning_rate": 0.0018199841537964595, + "loss": 0.1533, + "step": 20461 + }, + { + "epoch": 0.17761998593762207, + "grad_norm": 0.68359375, + "learning_rate": 0.0018199662952300716, + "loss": 0.1162, + "step": 20462 + }, + { + "epoch": 0.17762866641782624, + "grad_norm": 0.57421875, + "learning_rate": 0.00181994843587634, + "loss": 0.1011, + "step": 20463 + }, + { + "epoch": 0.1776373468980304, + "grad_norm": 0.1279296875, + "learning_rate": 0.0018199305757352834, + "loss": 0.1143, + "step": 20464 + }, + { + "epoch": 0.17764602737823457, + "grad_norm": 0.482421875, + "learning_rate": 0.0018199127148069217, + "loss": 0.0947, + "step": 20465 + }, + { + "epoch": 0.17765470785843873, + "grad_norm": 1.140625, + "learning_rate": 0.0018198948530912741, + "loss": 0.1221, + "step": 20466 + }, + { + "epoch": 0.1776633883386429, + "grad_norm": 0.447265625, + "learning_rate": 0.0018198769905883603, + "loss": 0.1152, + "step": 20467 + }, + { + "epoch": 0.17767206881884706, + "grad_norm": 0.1552734375, + "learning_rate": 0.0018198591272982, + "loss": 0.0967, + "step": 20468 + }, + { + "epoch": 0.17768074929905123, + "grad_norm": 0.09375, + "learning_rate": 0.0018198412632208125, + "loss": 0.0986, + "step": 20469 + }, + { + "epoch": 0.1776894297792554, + "grad_norm": 0.0947265625, + "learning_rate": 0.0018198233983562175, + "loss": 0.1631, + "step": 20470 + }, + { + "epoch": 0.17769811025945956, + "grad_norm": 0.5, + "learning_rate": 0.0018198055327044342, + "loss": 0.1182, + "step": 20471 + }, + { + "epoch": 0.17770679073966372, + "grad_norm": 0.7265625, + "learning_rate": 0.0018197876662654825, + "loss": 0.1406, + "step": 20472 + }, + { + "epoch": 0.1777154712198679, + "grad_norm": 0.2333984375, + "learning_rate": 0.0018197697990393817, + "loss": 0.1611, + "step": 20473 + }, + { + "epoch": 0.17772415170007205, + "grad_norm": 0.119140625, + "learning_rate": 0.0018197519310261519, + "loss": 0.0928, + "step": 20474 + }, + { + "epoch": 0.17773283218027622, + "grad_norm": 0.142578125, + "learning_rate": 0.001819734062225812, + "loss": 0.0947, + "step": 20475 + }, + { + "epoch": 0.17774151266048038, + "grad_norm": 0.1806640625, + "learning_rate": 0.0018197161926383817, + "loss": 0.0903, + "step": 20476 + }, + { + "epoch": 0.17775019314068455, + "grad_norm": 0.302734375, + "learning_rate": 0.0018196983222638803, + "loss": 0.1318, + "step": 20477 + }, + { + "epoch": 0.1777588736208887, + "grad_norm": 0.40625, + "learning_rate": 0.0018196804511023282, + "loss": 0.0967, + "step": 20478 + }, + { + "epoch": 0.17776755410109288, + "grad_norm": 0.158203125, + "learning_rate": 0.0018196625791537437, + "loss": 0.1055, + "step": 20479 + }, + { + "epoch": 0.17777623458129704, + "grad_norm": 0.81640625, + "learning_rate": 0.0018196447064181475, + "loss": 0.1055, + "step": 20480 + }, + { + "epoch": 
0.1777849150615012, + "grad_norm": 0.08154296875, + "learning_rate": 0.0018196268328955583, + "loss": 0.0874, + "step": 20481 + }, + { + "epoch": 0.17779359554170537, + "grad_norm": 0.490234375, + "learning_rate": 0.0018196089585859963, + "loss": 0.1387, + "step": 20482 + }, + { + "epoch": 0.17780227602190954, + "grad_norm": 0.609375, + "learning_rate": 0.0018195910834894807, + "loss": 0.1016, + "step": 20483 + }, + { + "epoch": 0.1778109565021137, + "grad_norm": 0.50390625, + "learning_rate": 0.0018195732076060312, + "loss": 0.105, + "step": 20484 + }, + { + "epoch": 0.17781963698231787, + "grad_norm": 0.3984375, + "learning_rate": 0.001819555330935667, + "loss": 0.0908, + "step": 20485 + }, + { + "epoch": 0.17782831746252203, + "grad_norm": 0.283203125, + "learning_rate": 0.0018195374534784082, + "loss": 0.1147, + "step": 20486 + }, + { + "epoch": 0.1778369979427262, + "grad_norm": 0.10498046875, + "learning_rate": 0.0018195195752342738, + "loss": 0.1055, + "step": 20487 + }, + { + "epoch": 0.17784567842293036, + "grad_norm": 0.404296875, + "learning_rate": 0.0018195016962032835, + "loss": 0.0723, + "step": 20488 + }, + { + "epoch": 0.17785435890313453, + "grad_norm": 0.443359375, + "learning_rate": 0.0018194838163854573, + "loss": 0.1816, + "step": 20489 + }, + { + "epoch": 0.1778630393833387, + "grad_norm": 0.515625, + "learning_rate": 0.0018194659357808143, + "loss": 0.106, + "step": 20490 + }, + { + "epoch": 0.17787171986354286, + "grad_norm": 0.380859375, + "learning_rate": 0.001819448054389374, + "loss": 0.1094, + "step": 20491 + }, + { + "epoch": 0.17788040034374702, + "grad_norm": 0.484375, + "learning_rate": 0.0018194301722111563, + "loss": 0.0889, + "step": 20492 + }, + { + "epoch": 0.1778890808239512, + "grad_norm": 0.203125, + "learning_rate": 0.0018194122892461807, + "loss": 0.1074, + "step": 20493 + }, + { + "epoch": 0.17789776130415536, + "grad_norm": 0.1337890625, + "learning_rate": 0.0018193944054944661, + "loss": 0.0962, + "step": 20494 + }, + { + "epoch": 0.17790644178435952, + "grad_norm": 0.181640625, + "learning_rate": 0.0018193765209560332, + "loss": 0.124, + "step": 20495 + }, + { + "epoch": 0.17791512226456369, + "grad_norm": 0.310546875, + "learning_rate": 0.0018193586356309009, + "loss": 0.1216, + "step": 20496 + }, + { + "epoch": 0.17792380274476785, + "grad_norm": 0.373046875, + "learning_rate": 0.0018193407495190883, + "loss": 0.1084, + "step": 20497 + }, + { + "epoch": 0.17793248322497202, + "grad_norm": 0.1357421875, + "learning_rate": 0.0018193228626206157, + "loss": 0.0996, + "step": 20498 + }, + { + "epoch": 0.17794116370517618, + "grad_norm": 0.10498046875, + "learning_rate": 0.0018193049749355027, + "loss": 0.0874, + "step": 20499 + }, + { + "epoch": 0.17794984418538035, + "grad_norm": 0.318359375, + "learning_rate": 0.0018192870864637682, + "loss": 0.1172, + "step": 20500 + }, + { + "epoch": 0.1779585246655845, + "grad_norm": 0.6875, + "learning_rate": 0.0018192691972054322, + "loss": 0.0889, + "step": 20501 + }, + { + "epoch": 0.17796720514578868, + "grad_norm": 0.259765625, + "learning_rate": 0.0018192513071605143, + "loss": 0.1396, + "step": 20502 + }, + { + "epoch": 0.17797588562599284, + "grad_norm": 0.2216796875, + "learning_rate": 0.0018192334163290341, + "loss": 0.1338, + "step": 20503 + }, + { + "epoch": 0.177984566106197, + "grad_norm": 0.10546875, + "learning_rate": 0.0018192155247110109, + "loss": 0.082, + "step": 20504 + }, + { + "epoch": 0.17799324658640117, + "grad_norm": 0.2412109375, + "learning_rate": 0.0018191976323064646, + 
"loss": 0.1309, + "step": 20505 + }, + { + "epoch": 0.17800192706660534, + "grad_norm": 0.15625, + "learning_rate": 0.0018191797391154142, + "loss": 0.0801, + "step": 20506 + }, + { + "epoch": 0.1780106075468095, + "grad_norm": 0.32421875, + "learning_rate": 0.00181916184513788, + "loss": 0.1377, + "step": 20507 + }, + { + "epoch": 0.17801928802701367, + "grad_norm": 0.431640625, + "learning_rate": 0.0018191439503738812, + "loss": 0.1572, + "step": 20508 + }, + { + "epoch": 0.17802796850721783, + "grad_norm": 0.640625, + "learning_rate": 0.0018191260548234371, + "loss": 0.1084, + "step": 20509 + }, + { + "epoch": 0.178036648987422, + "grad_norm": 0.30078125, + "learning_rate": 0.0018191081584865679, + "loss": 0.1021, + "step": 20510 + }, + { + "epoch": 0.17804532946762616, + "grad_norm": 0.419921875, + "learning_rate": 0.0018190902613632925, + "loss": 0.0977, + "step": 20511 + }, + { + "epoch": 0.17805400994783033, + "grad_norm": 0.13671875, + "learning_rate": 0.001819072363453631, + "loss": 0.0977, + "step": 20512 + }, + { + "epoch": 0.1780626904280345, + "grad_norm": 0.0830078125, + "learning_rate": 0.0018190544647576025, + "loss": 0.0957, + "step": 20513 + }, + { + "epoch": 0.17807137090823866, + "grad_norm": 0.7109375, + "learning_rate": 0.0018190365652752272, + "loss": 0.1035, + "step": 20514 + }, + { + "epoch": 0.1780800513884428, + "grad_norm": 0.2236328125, + "learning_rate": 0.0018190186650065242, + "loss": 0.1104, + "step": 20515 + }, + { + "epoch": 0.17808873186864696, + "grad_norm": 0.23828125, + "learning_rate": 0.001819000763951513, + "loss": 0.1484, + "step": 20516 + }, + { + "epoch": 0.17809741234885113, + "grad_norm": 0.45703125, + "learning_rate": 0.0018189828621102135, + "loss": 0.1406, + "step": 20517 + }, + { + "epoch": 0.1781060928290553, + "grad_norm": 0.220703125, + "learning_rate": 0.0018189649594826452, + "loss": 0.0986, + "step": 20518 + }, + { + "epoch": 0.17811477330925946, + "grad_norm": 0.953125, + "learning_rate": 0.0018189470560688278, + "loss": 0.105, + "step": 20519 + }, + { + "epoch": 0.17812345378946362, + "grad_norm": 0.8203125, + "learning_rate": 0.0018189291518687805, + "loss": 0.1162, + "step": 20520 + }, + { + "epoch": 0.17813213426966779, + "grad_norm": 0.1435546875, + "learning_rate": 0.0018189112468825228, + "loss": 0.0918, + "step": 20521 + }, + { + "epoch": 0.17814081474987195, + "grad_norm": 0.1396484375, + "learning_rate": 0.0018188933411100747, + "loss": 0.1289, + "step": 20522 + }, + { + "epoch": 0.17814949523007612, + "grad_norm": 0.11767578125, + "learning_rate": 0.001818875434551456, + "loss": 0.1187, + "step": 20523 + }, + { + "epoch": 0.17815817571028028, + "grad_norm": 0.5546875, + "learning_rate": 0.0018188575272066853, + "loss": 0.123, + "step": 20524 + }, + { + "epoch": 0.17816685619048445, + "grad_norm": 0.455078125, + "learning_rate": 0.0018188396190757835, + "loss": 0.1113, + "step": 20525 + }, + { + "epoch": 0.1781755366706886, + "grad_norm": 0.2890625, + "learning_rate": 0.0018188217101587692, + "loss": 0.1357, + "step": 20526 + }, + { + "epoch": 0.17818421715089278, + "grad_norm": 0.119140625, + "learning_rate": 0.0018188038004556622, + "loss": 0.1289, + "step": 20527 + }, + { + "epoch": 0.17819289763109694, + "grad_norm": 0.54296875, + "learning_rate": 0.0018187858899664823, + "loss": 0.125, + "step": 20528 + }, + { + "epoch": 0.1782015781113011, + "grad_norm": 0.1484375, + "learning_rate": 0.001818767978691249, + "loss": 0.1523, + "step": 20529 + }, + { + "epoch": 0.17821025859150527, + "grad_norm": 0.333984375, + 
"learning_rate": 0.0018187500666299818, + "loss": 0.1191, + "step": 20530 + }, + { + "epoch": 0.17821893907170944, + "grad_norm": 0.53125, + "learning_rate": 0.0018187321537827, + "loss": 0.0908, + "step": 20531 + }, + { + "epoch": 0.1782276195519136, + "grad_norm": 0.2294921875, + "learning_rate": 0.001818714240149424, + "loss": 0.0928, + "step": 20532 + }, + { + "epoch": 0.17823630003211777, + "grad_norm": 0.09716796875, + "learning_rate": 0.0018186963257301725, + "loss": 0.1172, + "step": 20533 + }, + { + "epoch": 0.17824498051232193, + "grad_norm": 0.0966796875, + "learning_rate": 0.0018186784105249656, + "loss": 0.1445, + "step": 20534 + }, + { + "epoch": 0.1782536609925261, + "grad_norm": 0.51171875, + "learning_rate": 0.001818660494533823, + "loss": 0.1045, + "step": 20535 + }, + { + "epoch": 0.17826234147273026, + "grad_norm": 0.578125, + "learning_rate": 0.001818642577756764, + "loss": 0.0957, + "step": 20536 + }, + { + "epoch": 0.17827102195293443, + "grad_norm": 0.232421875, + "learning_rate": 0.001818624660193808, + "loss": 0.1138, + "step": 20537 + }, + { + "epoch": 0.1782797024331386, + "grad_norm": 0.251953125, + "learning_rate": 0.001818606741844975, + "loss": 0.0889, + "step": 20538 + }, + { + "epoch": 0.17828838291334276, + "grad_norm": 0.71875, + "learning_rate": 0.0018185888227102846, + "loss": 0.0996, + "step": 20539 + }, + { + "epoch": 0.17829706339354692, + "grad_norm": 0.69140625, + "learning_rate": 0.0018185709027897564, + "loss": 0.1367, + "step": 20540 + }, + { + "epoch": 0.1783057438737511, + "grad_norm": 0.10400390625, + "learning_rate": 0.0018185529820834096, + "loss": 0.0752, + "step": 20541 + }, + { + "epoch": 0.17831442435395525, + "grad_norm": 0.2890625, + "learning_rate": 0.001818535060591264, + "loss": 0.1152, + "step": 20542 + }, + { + "epoch": 0.17832310483415942, + "grad_norm": 0.302734375, + "learning_rate": 0.0018185171383133395, + "loss": 0.1455, + "step": 20543 + }, + { + "epoch": 0.17833178531436358, + "grad_norm": 0.578125, + "learning_rate": 0.001818499215249655, + "loss": 0.0977, + "step": 20544 + }, + { + "epoch": 0.17834046579456775, + "grad_norm": 0.126953125, + "learning_rate": 0.001818481291400231, + "loss": 0.1553, + "step": 20545 + }, + { + "epoch": 0.1783491462747719, + "grad_norm": 0.578125, + "learning_rate": 0.0018184633667650866, + "loss": 0.1436, + "step": 20546 + }, + { + "epoch": 0.17835782675497608, + "grad_norm": 0.201171875, + "learning_rate": 0.0018184454413442414, + "loss": 0.0898, + "step": 20547 + }, + { + "epoch": 0.17836650723518024, + "grad_norm": 0.7734375, + "learning_rate": 0.0018184275151377149, + "loss": 0.1377, + "step": 20548 + }, + { + "epoch": 0.1783751877153844, + "grad_norm": 0.099609375, + "learning_rate": 0.001818409588145527, + "loss": 0.0972, + "step": 20549 + }, + { + "epoch": 0.17838386819558857, + "grad_norm": 0.2041015625, + "learning_rate": 0.0018183916603676975, + "loss": 0.1211, + "step": 20550 + }, + { + "epoch": 0.17839254867579274, + "grad_norm": 0.39453125, + "learning_rate": 0.0018183737318042453, + "loss": 0.1177, + "step": 20551 + }, + { + "epoch": 0.1784012291559969, + "grad_norm": 0.220703125, + "learning_rate": 0.0018183558024551902, + "loss": 0.106, + "step": 20552 + }, + { + "epoch": 0.17840990963620107, + "grad_norm": 0.58984375, + "learning_rate": 0.0018183378723205523, + "loss": 0.0996, + "step": 20553 + }, + { + "epoch": 0.17841859011640523, + "grad_norm": 0.2294921875, + "learning_rate": 0.0018183199414003508, + "loss": 0.085, + "step": 20554 + }, + { + "epoch": 
0.1784272705966094, + "grad_norm": 0.52734375, + "learning_rate": 0.0018183020096946055, + "loss": 0.0684, + "step": 20555 + }, + { + "epoch": 0.17843595107681356, + "grad_norm": 0.4921875, + "learning_rate": 0.001818284077203336, + "loss": 0.1875, + "step": 20556 + }, + { + "epoch": 0.17844463155701773, + "grad_norm": 0.203125, + "learning_rate": 0.0018182661439265614, + "loss": 0.1289, + "step": 20557 + }, + { + "epoch": 0.1784533120372219, + "grad_norm": 0.2001953125, + "learning_rate": 0.001818248209864302, + "loss": 0.0981, + "step": 20558 + }, + { + "epoch": 0.17846199251742606, + "grad_norm": 0.3984375, + "learning_rate": 0.0018182302750165771, + "loss": 0.1099, + "step": 20559 + }, + { + "epoch": 0.17847067299763023, + "grad_norm": 0.37109375, + "learning_rate": 0.0018182123393834064, + "loss": 0.1025, + "step": 20560 + }, + { + "epoch": 0.1784793534778344, + "grad_norm": 0.1943359375, + "learning_rate": 0.0018181944029648097, + "loss": 0.1118, + "step": 20561 + }, + { + "epoch": 0.17848803395803856, + "grad_norm": 0.326171875, + "learning_rate": 0.0018181764657608061, + "loss": 0.1035, + "step": 20562 + }, + { + "epoch": 0.17849671443824272, + "grad_norm": 0.416015625, + "learning_rate": 0.0018181585277714156, + "loss": 0.0986, + "step": 20563 + }, + { + "epoch": 0.17850539491844689, + "grad_norm": 0.353515625, + "learning_rate": 0.0018181405889966579, + "loss": 0.1113, + "step": 20564 + }, + { + "epoch": 0.17851407539865105, + "grad_norm": 0.1572265625, + "learning_rate": 0.0018181226494365522, + "loss": 0.0996, + "step": 20565 + }, + { + "epoch": 0.17852275587885522, + "grad_norm": 0.267578125, + "learning_rate": 0.0018181047090911181, + "loss": 0.0996, + "step": 20566 + }, + { + "epoch": 0.17853143635905938, + "grad_norm": 0.1767578125, + "learning_rate": 0.0018180867679603759, + "loss": 0.0972, + "step": 20567 + }, + { + "epoch": 0.17854011683926355, + "grad_norm": 0.1806640625, + "learning_rate": 0.0018180688260443448, + "loss": 0.1309, + "step": 20568 + }, + { + "epoch": 0.1785487973194677, + "grad_norm": 0.314453125, + "learning_rate": 0.0018180508833430442, + "loss": 0.1631, + "step": 20569 + }, + { + "epoch": 0.17855747779967188, + "grad_norm": 0.47265625, + "learning_rate": 0.001818032939856494, + "loss": 0.1309, + "step": 20570 + }, + { + "epoch": 0.17856615827987604, + "grad_norm": 0.18359375, + "learning_rate": 0.0018180149955847138, + "loss": 0.0894, + "step": 20571 + }, + { + "epoch": 0.1785748387600802, + "grad_norm": 0.39453125, + "learning_rate": 0.0018179970505277233, + "loss": 0.0933, + "step": 20572 + }, + { + "epoch": 0.17858351924028437, + "grad_norm": 0.400390625, + "learning_rate": 0.001817979104685542, + "loss": 0.0942, + "step": 20573 + }, + { + "epoch": 0.17859219972048854, + "grad_norm": 0.236328125, + "learning_rate": 0.0018179611580581895, + "loss": 0.1035, + "step": 20574 + }, + { + "epoch": 0.1786008802006927, + "grad_norm": 0.09326171875, + "learning_rate": 0.0018179432106456852, + "loss": 0.0879, + "step": 20575 + }, + { + "epoch": 0.17860956068089687, + "grad_norm": 0.36328125, + "learning_rate": 0.0018179252624480494, + "loss": 0.1025, + "step": 20576 + }, + { + "epoch": 0.17861824116110103, + "grad_norm": 0.388671875, + "learning_rate": 0.001817907313465301, + "loss": 0.166, + "step": 20577 + }, + { + "epoch": 0.1786269216413052, + "grad_norm": 0.44140625, + "learning_rate": 0.0018178893636974602, + "loss": 0.1211, + "step": 20578 + }, + { + "epoch": 0.17863560212150936, + "grad_norm": 0.11328125, + "learning_rate": 0.0018178714131445461, + 
"loss": 0.0894, + "step": 20579 + }, + { + "epoch": 0.17864428260171353, + "grad_norm": 0.1279296875, + "learning_rate": 0.001817853461806579, + "loss": 0.0928, + "step": 20580 + }, + { + "epoch": 0.1786529630819177, + "grad_norm": 0.1123046875, + "learning_rate": 0.001817835509683578, + "loss": 0.1367, + "step": 20581 + }, + { + "epoch": 0.17866164356212186, + "grad_norm": 0.09814453125, + "learning_rate": 0.0018178175567755625, + "loss": 0.1123, + "step": 20582 + }, + { + "epoch": 0.17867032404232602, + "grad_norm": 0.0986328125, + "learning_rate": 0.001817799603082553, + "loss": 0.105, + "step": 20583 + }, + { + "epoch": 0.1786790045225302, + "grad_norm": 0.3828125, + "learning_rate": 0.0018177816486045684, + "loss": 0.1367, + "step": 20584 + }, + { + "epoch": 0.17868768500273435, + "grad_norm": 0.62109375, + "learning_rate": 0.0018177636933416287, + "loss": 0.1113, + "step": 20585 + }, + { + "epoch": 0.17869636548293852, + "grad_norm": 0.1591796875, + "learning_rate": 0.0018177457372937531, + "loss": 0.1108, + "step": 20586 + }, + { + "epoch": 0.17870504596314268, + "grad_norm": 0.482421875, + "learning_rate": 0.0018177277804609618, + "loss": 0.1191, + "step": 20587 + }, + { + "epoch": 0.17871372644334685, + "grad_norm": 0.2041015625, + "learning_rate": 0.0018177098228432742, + "loss": 0.1001, + "step": 20588 + }, + { + "epoch": 0.178722406923551, + "grad_norm": 0.26171875, + "learning_rate": 0.0018176918644407097, + "loss": 0.1289, + "step": 20589 + }, + { + "epoch": 0.17873108740375518, + "grad_norm": 0.546875, + "learning_rate": 0.0018176739052532885, + "loss": 0.1104, + "step": 20590 + }, + { + "epoch": 0.17873976788395934, + "grad_norm": 0.21484375, + "learning_rate": 0.0018176559452810295, + "loss": 0.1089, + "step": 20591 + }, + { + "epoch": 0.1787484483641635, + "grad_norm": 0.2265625, + "learning_rate": 0.0018176379845239532, + "loss": 0.1006, + "step": 20592 + }, + { + "epoch": 0.17875712884436767, + "grad_norm": 0.236328125, + "learning_rate": 0.0018176200229820785, + "loss": 0.1357, + "step": 20593 + }, + { + "epoch": 0.17876580932457184, + "grad_norm": 0.10693359375, + "learning_rate": 0.0018176020606554253, + "loss": 0.083, + "step": 20594 + }, + { + "epoch": 0.178774489804776, + "grad_norm": 0.25, + "learning_rate": 0.0018175840975440131, + "loss": 0.1118, + "step": 20595 + }, + { + "epoch": 0.17878317028498017, + "grad_norm": 0.3203125, + "learning_rate": 0.0018175661336478624, + "loss": 0.1016, + "step": 20596 + }, + { + "epoch": 0.17879185076518433, + "grad_norm": 0.140625, + "learning_rate": 0.0018175481689669914, + "loss": 0.1216, + "step": 20597 + }, + { + "epoch": 0.1788005312453885, + "grad_norm": 0.1513671875, + "learning_rate": 0.001817530203501421, + "loss": 0.1021, + "step": 20598 + }, + { + "epoch": 0.17880921172559266, + "grad_norm": 0.1259765625, + "learning_rate": 0.0018175122372511704, + "loss": 0.1235, + "step": 20599 + }, + { + "epoch": 0.17881789220579683, + "grad_norm": 0.408203125, + "learning_rate": 0.0018174942702162589, + "loss": 0.1016, + "step": 20600 + }, + { + "epoch": 0.178826572686001, + "grad_norm": 0.486328125, + "learning_rate": 0.0018174763023967066, + "loss": 0.1162, + "step": 20601 + }, + { + "epoch": 0.17883525316620516, + "grad_norm": 0.0810546875, + "learning_rate": 0.0018174583337925326, + "loss": 0.0923, + "step": 20602 + }, + { + "epoch": 0.17884393364640933, + "grad_norm": 0.1416015625, + "learning_rate": 0.0018174403644037572, + "loss": 0.1631, + "step": 20603 + }, + { + "epoch": 0.1788526141266135, + "grad_norm": 0.310546875, 
+ "learning_rate": 0.0018174223942303998, + "loss": 0.1045, + "step": 20604 + }, + { + "epoch": 0.17886129460681766, + "grad_norm": 0.7109375, + "learning_rate": 0.0018174044232724803, + "loss": 0.1318, + "step": 20605 + }, + { + "epoch": 0.17886997508702182, + "grad_norm": 0.11767578125, + "learning_rate": 0.0018173864515300178, + "loss": 0.0835, + "step": 20606 + }, + { + "epoch": 0.17887865556722599, + "grad_norm": 0.333984375, + "learning_rate": 0.0018173684790030323, + "loss": 0.1064, + "step": 20607 + }, + { + "epoch": 0.17888733604743015, + "grad_norm": 0.53515625, + "learning_rate": 0.0018173505056915437, + "loss": 0.1123, + "step": 20608 + }, + { + "epoch": 0.17889601652763432, + "grad_norm": 0.74609375, + "learning_rate": 0.0018173325315955708, + "loss": 0.1084, + "step": 20609 + }, + { + "epoch": 0.17890469700783848, + "grad_norm": 0.29296875, + "learning_rate": 0.0018173145567151343, + "loss": 0.1484, + "step": 20610 + }, + { + "epoch": 0.17891337748804265, + "grad_norm": 0.1611328125, + "learning_rate": 0.0018172965810502534, + "loss": 0.1123, + "step": 20611 + }, + { + "epoch": 0.1789220579682468, + "grad_norm": 0.79296875, + "learning_rate": 0.0018172786046009473, + "loss": 0.1245, + "step": 20612 + }, + { + "epoch": 0.17893073844845098, + "grad_norm": 0.4765625, + "learning_rate": 0.0018172606273672365, + "loss": 0.1348, + "step": 20613 + }, + { + "epoch": 0.17893941892865514, + "grad_norm": 0.365234375, + "learning_rate": 0.0018172426493491402, + "loss": 0.123, + "step": 20614 + }, + { + "epoch": 0.1789480994088593, + "grad_norm": 0.1708984375, + "learning_rate": 0.001817224670546678, + "loss": 0.0845, + "step": 20615 + }, + { + "epoch": 0.17895677988906347, + "grad_norm": 0.6484375, + "learning_rate": 0.0018172066909598696, + "loss": 0.1206, + "step": 20616 + }, + { + "epoch": 0.17896546036926764, + "grad_norm": 0.419921875, + "learning_rate": 0.0018171887105887347, + "loss": 0.1543, + "step": 20617 + }, + { + "epoch": 0.1789741408494718, + "grad_norm": 1.078125, + "learning_rate": 0.0018171707294332934, + "loss": 0.2754, + "step": 20618 + }, + { + "epoch": 0.17898282132967597, + "grad_norm": 0.39453125, + "learning_rate": 0.0018171527474935647, + "loss": 0.1338, + "step": 20619 + }, + { + "epoch": 0.17899150180988013, + "grad_norm": 0.40625, + "learning_rate": 0.0018171347647695684, + "loss": 0.3359, + "step": 20620 + }, + { + "epoch": 0.1790001822900843, + "grad_norm": 0.36328125, + "learning_rate": 0.0018171167812613244, + "loss": 0.1201, + "step": 20621 + }, + { + "epoch": 0.17900886277028846, + "grad_norm": 0.52734375, + "learning_rate": 0.0018170987969688523, + "loss": 0.1055, + "step": 20622 + }, + { + "epoch": 0.17901754325049263, + "grad_norm": 0.2412109375, + "learning_rate": 0.0018170808118921718, + "loss": 0.124, + "step": 20623 + }, + { + "epoch": 0.1790262237306968, + "grad_norm": 0.6328125, + "learning_rate": 0.0018170628260313025, + "loss": 0.1504, + "step": 20624 + }, + { + "epoch": 0.17903490421090096, + "grad_norm": 0.12158203125, + "learning_rate": 0.001817044839386264, + "loss": 0.1299, + "step": 20625 + }, + { + "epoch": 0.17904358469110512, + "grad_norm": 0.984375, + "learning_rate": 0.001817026851957076, + "loss": 0.1045, + "step": 20626 + }, + { + "epoch": 0.1790522651713093, + "grad_norm": 0.427734375, + "learning_rate": 0.001817008863743758, + "loss": 0.1035, + "step": 20627 + }, + { + "epoch": 0.17906094565151345, + "grad_norm": 0.66015625, + "learning_rate": 0.0018169908747463304, + "loss": 0.1504, + "step": 20628 + }, + { + "epoch": 
0.17906962613171762, + "grad_norm": 0.333984375, + "learning_rate": 0.001816972884964812, + "loss": 0.0859, + "step": 20629 + }, + { + "epoch": 0.17907830661192178, + "grad_norm": 0.177734375, + "learning_rate": 0.0018169548943992228, + "loss": 0.0781, + "step": 20630 + }, + { + "epoch": 0.17908698709212595, + "grad_norm": 0.08740234375, + "learning_rate": 0.0018169369030495825, + "loss": 0.1128, + "step": 20631 + }, + { + "epoch": 0.17909566757233011, + "grad_norm": 0.6796875, + "learning_rate": 0.0018169189109159111, + "loss": 0.1211, + "step": 20632 + }, + { + "epoch": 0.17910434805253428, + "grad_norm": 0.1337890625, + "learning_rate": 0.001816900917998228, + "loss": 0.1748, + "step": 20633 + }, + { + "epoch": 0.17911302853273844, + "grad_norm": 0.2333984375, + "learning_rate": 0.0018168829242965524, + "loss": 0.0884, + "step": 20634 + }, + { + "epoch": 0.1791217090129426, + "grad_norm": 0.09326171875, + "learning_rate": 0.0018168649298109043, + "loss": 0.1001, + "step": 20635 + }, + { + "epoch": 0.17913038949314677, + "grad_norm": 0.7578125, + "learning_rate": 0.001816846934541304, + "loss": 0.1738, + "step": 20636 + }, + { + "epoch": 0.17913906997335094, + "grad_norm": 0.2421875, + "learning_rate": 0.00181682893848777, + "loss": 0.1299, + "step": 20637 + }, + { + "epoch": 0.17914775045355508, + "grad_norm": 0.328125, + "learning_rate": 0.0018168109416503233, + "loss": 0.1104, + "step": 20638 + }, + { + "epoch": 0.17915643093375924, + "grad_norm": 0.09326171875, + "learning_rate": 0.0018167929440289826, + "loss": 0.1641, + "step": 20639 + }, + { + "epoch": 0.1791651114139634, + "grad_norm": 0.1884765625, + "learning_rate": 0.001816774945623768, + "loss": 0.1064, + "step": 20640 + }, + { + "epoch": 0.17917379189416757, + "grad_norm": 0.166015625, + "learning_rate": 0.0018167569464346992, + "loss": 0.1162, + "step": 20641 + }, + { + "epoch": 0.17918247237437174, + "grad_norm": 0.12158203125, + "learning_rate": 0.0018167389464617955, + "loss": 0.1211, + "step": 20642 + }, + { + "epoch": 0.1791911528545759, + "grad_norm": 0.142578125, + "learning_rate": 0.0018167209457050771, + "loss": 0.0986, + "step": 20643 + }, + { + "epoch": 0.17919983333478007, + "grad_norm": 0.26953125, + "learning_rate": 0.0018167029441645633, + "loss": 0.123, + "step": 20644 + }, + { + "epoch": 0.17920851381498423, + "grad_norm": 0.81640625, + "learning_rate": 0.001816684941840274, + "loss": 0.0947, + "step": 20645 + }, + { + "epoch": 0.1792171942951884, + "grad_norm": 0.1259765625, + "learning_rate": 0.0018166669387322289, + "loss": 0.1104, + "step": 20646 + }, + { + "epoch": 0.17922587477539256, + "grad_norm": 0.1708984375, + "learning_rate": 0.0018166489348404474, + "loss": 0.0879, + "step": 20647 + }, + { + "epoch": 0.17923455525559673, + "grad_norm": 0.1298828125, + "learning_rate": 0.0018166309301649493, + "loss": 0.0762, + "step": 20648 + }, + { + "epoch": 0.1792432357358009, + "grad_norm": 0.09375, + "learning_rate": 0.0018166129247057545, + "loss": 0.1016, + "step": 20649 + }, + { + "epoch": 0.17925191621600506, + "grad_norm": 0.10546875, + "learning_rate": 0.001816594918462883, + "loss": 0.1025, + "step": 20650 + }, + { + "epoch": 0.17926059669620922, + "grad_norm": 0.076171875, + "learning_rate": 0.0018165769114363536, + "loss": 0.1104, + "step": 20651 + }, + { + "epoch": 0.1792692771764134, + "grad_norm": 0.2001953125, + "learning_rate": 0.0018165589036261865, + "loss": 0.1387, + "step": 20652 + }, + { + "epoch": 0.17927795765661755, + "grad_norm": 0.75, + "learning_rate": 0.0018165408950324014, + 
"loss": 0.1001, + "step": 20653 + }, + { + "epoch": 0.17928663813682172, + "grad_norm": 0.2578125, + "learning_rate": 0.001816522885655018, + "loss": 0.0825, + "step": 20654 + }, + { + "epoch": 0.17929531861702588, + "grad_norm": 0.1611328125, + "learning_rate": 0.0018165048754940558, + "loss": 0.1191, + "step": 20655 + }, + { + "epoch": 0.17930399909723005, + "grad_norm": 0.33203125, + "learning_rate": 0.0018164868645495346, + "loss": 0.0903, + "step": 20656 + }, + { + "epoch": 0.17931267957743421, + "grad_norm": 0.08642578125, + "learning_rate": 0.0018164688528214744, + "loss": 0.1055, + "step": 20657 + }, + { + "epoch": 0.17932136005763838, + "grad_norm": 0.5, + "learning_rate": 0.0018164508403098942, + "loss": 0.1211, + "step": 20658 + }, + { + "epoch": 0.17933004053784254, + "grad_norm": 0.59765625, + "learning_rate": 0.0018164328270148145, + "loss": 0.1074, + "step": 20659 + }, + { + "epoch": 0.1793387210180467, + "grad_norm": 0.2333984375, + "learning_rate": 0.0018164148129362547, + "loss": 0.0869, + "step": 20660 + }, + { + "epoch": 0.17934740149825087, + "grad_norm": 0.765625, + "learning_rate": 0.0018163967980742342, + "loss": 0.1338, + "step": 20661 + }, + { + "epoch": 0.17935608197845504, + "grad_norm": 0.416015625, + "learning_rate": 0.001816378782428773, + "loss": 0.1289, + "step": 20662 + }, + { + "epoch": 0.1793647624586592, + "grad_norm": 0.337890625, + "learning_rate": 0.0018163607659998906, + "loss": 0.0845, + "step": 20663 + }, + { + "epoch": 0.17937344293886337, + "grad_norm": 0.1640625, + "learning_rate": 0.0018163427487876069, + "loss": 0.1465, + "step": 20664 + }, + { + "epoch": 0.17938212341906754, + "grad_norm": 0.396484375, + "learning_rate": 0.0018163247307919415, + "loss": 0.1406, + "step": 20665 + }, + { + "epoch": 0.1793908038992717, + "grad_norm": 0.216796875, + "learning_rate": 0.0018163067120129143, + "loss": 0.1157, + "step": 20666 + }, + { + "epoch": 0.17939948437947587, + "grad_norm": 0.119140625, + "learning_rate": 0.001816288692450545, + "loss": 0.0859, + "step": 20667 + }, + { + "epoch": 0.17940816485968003, + "grad_norm": 0.1591796875, + "learning_rate": 0.0018162706721048527, + "loss": 0.1006, + "step": 20668 + }, + { + "epoch": 0.1794168453398842, + "grad_norm": 0.3359375, + "learning_rate": 0.0018162526509758576, + "loss": 0.0918, + "step": 20669 + }, + { + "epoch": 0.17942552582008836, + "grad_norm": 0.11328125, + "learning_rate": 0.0018162346290635796, + "loss": 0.1191, + "step": 20670 + }, + { + "epoch": 0.17943420630029253, + "grad_norm": 0.3046875, + "learning_rate": 0.001816216606368038, + "loss": 0.0811, + "step": 20671 + }, + { + "epoch": 0.1794428867804967, + "grad_norm": 0.439453125, + "learning_rate": 0.0018161985828892531, + "loss": 0.1289, + "step": 20672 + }, + { + "epoch": 0.17945156726070086, + "grad_norm": 0.63671875, + "learning_rate": 0.0018161805586272438, + "loss": 0.123, + "step": 20673 + }, + { + "epoch": 0.17946024774090502, + "grad_norm": 0.2021484375, + "learning_rate": 0.00181616253358203, + "loss": 0.1064, + "step": 20674 + }, + { + "epoch": 0.1794689282211092, + "grad_norm": 0.11376953125, + "learning_rate": 0.001816144507753632, + "loss": 0.1436, + "step": 20675 + }, + { + "epoch": 0.17947760870131335, + "grad_norm": 0.279296875, + "learning_rate": 0.0018161264811420688, + "loss": 0.1328, + "step": 20676 + }, + { + "epoch": 0.17948628918151752, + "grad_norm": 0.0849609375, + "learning_rate": 0.001816108453747361, + "loss": 0.0879, + "step": 20677 + }, + { + "epoch": 0.17949496966172168, + "grad_norm": 0.1962890625, + 
"learning_rate": 0.001816090425569527, + "loss": 0.1133, + "step": 20678 + }, + { + "epoch": 0.17950365014192585, + "grad_norm": 0.06298828125, + "learning_rate": 0.0018160723966085877, + "loss": 0.0767, + "step": 20679 + }, + { + "epoch": 0.17951233062213, + "grad_norm": 0.71484375, + "learning_rate": 0.0018160543668645623, + "loss": 0.1211, + "step": 20680 + }, + { + "epoch": 0.17952101110233418, + "grad_norm": 0.486328125, + "learning_rate": 0.0018160363363374707, + "loss": 0.0928, + "step": 20681 + }, + { + "epoch": 0.17952969158253834, + "grad_norm": 0.1220703125, + "learning_rate": 0.0018160183050273324, + "loss": 0.0928, + "step": 20682 + }, + { + "epoch": 0.1795383720627425, + "grad_norm": 0.10986328125, + "learning_rate": 0.0018160002729341674, + "loss": 0.1406, + "step": 20683 + }, + { + "epoch": 0.17954705254294667, + "grad_norm": 0.1630859375, + "learning_rate": 0.0018159822400579952, + "loss": 0.0898, + "step": 20684 + }, + { + "epoch": 0.17955573302315084, + "grad_norm": 0.20703125, + "learning_rate": 0.0018159642063988355, + "loss": 0.0928, + "step": 20685 + }, + { + "epoch": 0.179564413503355, + "grad_norm": 0.125, + "learning_rate": 0.0018159461719567081, + "loss": 0.127, + "step": 20686 + }, + { + "epoch": 0.17957309398355917, + "grad_norm": 0.1259765625, + "learning_rate": 0.0018159281367316326, + "loss": 0.0986, + "step": 20687 + }, + { + "epoch": 0.17958177446376333, + "grad_norm": 0.1337890625, + "learning_rate": 0.0018159101007236293, + "loss": 0.1074, + "step": 20688 + }, + { + "epoch": 0.1795904549439675, + "grad_norm": 0.3984375, + "learning_rate": 0.0018158920639327171, + "loss": 0.166, + "step": 20689 + }, + { + "epoch": 0.17959913542417166, + "grad_norm": 0.1494140625, + "learning_rate": 0.0018158740263589162, + "loss": 0.1035, + "step": 20690 + }, + { + "epoch": 0.17960781590437583, + "grad_norm": 0.1279296875, + "learning_rate": 0.0018158559880022463, + "loss": 0.0938, + "step": 20691 + }, + { + "epoch": 0.17961649638458, + "grad_norm": 0.1142578125, + "learning_rate": 0.001815837948862727, + "loss": 0.1562, + "step": 20692 + }, + { + "epoch": 0.17962517686478416, + "grad_norm": 0.08154296875, + "learning_rate": 0.0018158199089403778, + "loss": 0.1201, + "step": 20693 + }, + { + "epoch": 0.17963385734498832, + "grad_norm": 0.1201171875, + "learning_rate": 0.001815801868235219, + "loss": 0.123, + "step": 20694 + }, + { + "epoch": 0.1796425378251925, + "grad_norm": 0.251953125, + "learning_rate": 0.00181578382674727, + "loss": 0.1377, + "step": 20695 + }, + { + "epoch": 0.17965121830539665, + "grad_norm": 0.10888671875, + "learning_rate": 0.0018157657844765508, + "loss": 0.1152, + "step": 20696 + }, + { + "epoch": 0.17965989878560082, + "grad_norm": 0.64453125, + "learning_rate": 0.0018157477414230802, + "loss": 0.1562, + "step": 20697 + }, + { + "epoch": 0.17966857926580498, + "grad_norm": 0.46875, + "learning_rate": 0.0018157296975868792, + "loss": 0.1328, + "step": 20698 + }, + { + "epoch": 0.17967725974600915, + "grad_norm": 0.3046875, + "learning_rate": 0.0018157116529679666, + "loss": 0.1074, + "step": 20699 + }, + { + "epoch": 0.17968594022621331, + "grad_norm": 0.2890625, + "learning_rate": 0.0018156936075663628, + "loss": 0.1152, + "step": 20700 + }, + { + "epoch": 0.17969462070641748, + "grad_norm": 0.5546875, + "learning_rate": 0.0018156755613820872, + "loss": 0.1699, + "step": 20701 + }, + { + "epoch": 0.17970330118662164, + "grad_norm": 0.388671875, + "learning_rate": 0.0018156575144151594, + "loss": 0.1104, + "step": 20702 + }, + { + "epoch": 
0.1797119816668258, + "grad_norm": 0.71484375, + "learning_rate": 0.0018156394666655996, + "loss": 0.1064, + "step": 20703 + }, + { + "epoch": 0.17972066214702997, + "grad_norm": 0.330078125, + "learning_rate": 0.0018156214181334267, + "loss": 0.0952, + "step": 20704 + }, + { + "epoch": 0.17972934262723414, + "grad_norm": 0.78515625, + "learning_rate": 0.0018156033688186614, + "loss": 0.1465, + "step": 20705 + }, + { + "epoch": 0.1797380231074383, + "grad_norm": 0.1845703125, + "learning_rate": 0.001815585318721323, + "loss": 0.104, + "step": 20706 + }, + { + "epoch": 0.17974670358764247, + "grad_norm": 0.32421875, + "learning_rate": 0.001815567267841431, + "loss": 0.1309, + "step": 20707 + }, + { + "epoch": 0.17975538406784664, + "grad_norm": 0.1552734375, + "learning_rate": 0.0018155492161790055, + "loss": 0.1484, + "step": 20708 + }, + { + "epoch": 0.1797640645480508, + "grad_norm": 0.29296875, + "learning_rate": 0.001815531163734066, + "loss": 0.1328, + "step": 20709 + }, + { + "epoch": 0.17977274502825497, + "grad_norm": 0.10107421875, + "learning_rate": 0.0018155131105066324, + "loss": 0.0986, + "step": 20710 + }, + { + "epoch": 0.17978142550845913, + "grad_norm": 0.40234375, + "learning_rate": 0.0018154950564967247, + "loss": 0.1289, + "step": 20711 + }, + { + "epoch": 0.1797901059886633, + "grad_norm": 1.0390625, + "learning_rate": 0.001815477001704362, + "loss": 0.1147, + "step": 20712 + }, + { + "epoch": 0.17979878646886746, + "grad_norm": 0.091796875, + "learning_rate": 0.0018154589461295647, + "loss": 0.1172, + "step": 20713 + }, + { + "epoch": 0.17980746694907163, + "grad_norm": 0.283203125, + "learning_rate": 0.0018154408897723521, + "loss": 0.123, + "step": 20714 + }, + { + "epoch": 0.1798161474292758, + "grad_norm": 0.41015625, + "learning_rate": 0.0018154228326327441, + "loss": 0.1001, + "step": 20715 + }, + { + "epoch": 0.17982482790947996, + "grad_norm": 0.2412109375, + "learning_rate": 0.0018154047747107604, + "loss": 0.0811, + "step": 20716 + }, + { + "epoch": 0.17983350838968412, + "grad_norm": 0.154296875, + "learning_rate": 0.001815386716006421, + "loss": 0.1143, + "step": 20717 + }, + { + "epoch": 0.1798421888698883, + "grad_norm": 0.384765625, + "learning_rate": 0.0018153686565197454, + "loss": 0.127, + "step": 20718 + }, + { + "epoch": 0.17985086935009245, + "grad_norm": 0.443359375, + "learning_rate": 0.001815350596250753, + "loss": 0.1035, + "step": 20719 + }, + { + "epoch": 0.17985954983029662, + "grad_norm": 0.33984375, + "learning_rate": 0.0018153325351994644, + "loss": 0.0977, + "step": 20720 + }, + { + "epoch": 0.17986823031050078, + "grad_norm": 0.2578125, + "learning_rate": 0.0018153144733658983, + "loss": 0.123, + "step": 20721 + }, + { + "epoch": 0.17987691079070495, + "grad_norm": 0.26953125, + "learning_rate": 0.0018152964107500758, + "loss": 0.0845, + "step": 20722 + }, + { + "epoch": 0.1798855912709091, + "grad_norm": 0.12890625, + "learning_rate": 0.0018152783473520155, + "loss": 0.1523, + "step": 20723 + }, + { + "epoch": 0.17989427175111328, + "grad_norm": 0.2001953125, + "learning_rate": 0.0018152602831717375, + "loss": 0.1006, + "step": 20724 + }, + { + "epoch": 0.17990295223131744, + "grad_norm": 0.1484375, + "learning_rate": 0.0018152422182092618, + "loss": 0.1108, + "step": 20725 + }, + { + "epoch": 0.1799116327115216, + "grad_norm": 0.1474609375, + "learning_rate": 0.0018152241524646076, + "loss": 0.1191, + "step": 20726 + }, + { + "epoch": 0.17992031319172577, + "grad_norm": 0.15625, + "learning_rate": 0.0018152060859377953, + "loss": 
0.1094, + "step": 20727 + }, + { + "epoch": 0.17992899367192994, + "grad_norm": 0.107421875, + "learning_rate": 0.0018151880186288443, + "loss": 0.0996, + "step": 20728 + }, + { + "epoch": 0.1799376741521341, + "grad_norm": 0.259765625, + "learning_rate": 0.0018151699505377743, + "loss": 0.1016, + "step": 20729 + }, + { + "epoch": 0.17994635463233827, + "grad_norm": 0.119140625, + "learning_rate": 0.0018151518816646053, + "loss": 0.1719, + "step": 20730 + }, + { + "epoch": 0.17995503511254243, + "grad_norm": 0.53515625, + "learning_rate": 0.001815133812009357, + "loss": 0.0874, + "step": 20731 + }, + { + "epoch": 0.1799637155927466, + "grad_norm": 0.51171875, + "learning_rate": 0.0018151157415720492, + "loss": 0.1572, + "step": 20732 + }, + { + "epoch": 0.17997239607295076, + "grad_norm": 0.2255859375, + "learning_rate": 0.0018150976703527016, + "loss": 0.1016, + "step": 20733 + }, + { + "epoch": 0.17998107655315493, + "grad_norm": 0.302734375, + "learning_rate": 0.0018150795983513338, + "loss": 0.1143, + "step": 20734 + }, + { + "epoch": 0.1799897570333591, + "grad_norm": 0.111328125, + "learning_rate": 0.0018150615255679654, + "loss": 0.1523, + "step": 20735 + }, + { + "epoch": 0.17999843751356326, + "grad_norm": 0.302734375, + "learning_rate": 0.0018150434520026167, + "loss": 0.1001, + "step": 20736 + }, + { + "epoch": 0.18000711799376742, + "grad_norm": 0.35546875, + "learning_rate": 0.0018150253776553073, + "loss": 0.1309, + "step": 20737 + }, + { + "epoch": 0.1800157984739716, + "grad_norm": 0.388671875, + "learning_rate": 0.001815007302526057, + "loss": 0.0908, + "step": 20738 + }, + { + "epoch": 0.18002447895417575, + "grad_norm": 0.1259765625, + "learning_rate": 0.001814989226614885, + "loss": 0.1572, + "step": 20739 + }, + { + "epoch": 0.18003315943437992, + "grad_norm": 0.54296875, + "learning_rate": 0.0018149711499218118, + "loss": 0.1235, + "step": 20740 + }, + { + "epoch": 0.18004183991458408, + "grad_norm": 0.0927734375, + "learning_rate": 0.0018149530724468571, + "loss": 0.1543, + "step": 20741 + }, + { + "epoch": 0.18005052039478825, + "grad_norm": 0.330078125, + "learning_rate": 0.0018149349941900402, + "loss": 0.1084, + "step": 20742 + }, + { + "epoch": 0.18005920087499241, + "grad_norm": 0.37109375, + "learning_rate": 0.001814916915151381, + "loss": 0.1348, + "step": 20743 + }, + { + "epoch": 0.18006788135519658, + "grad_norm": 0.275390625, + "learning_rate": 0.0018148988353309003, + "loss": 0.0596, + "step": 20744 + }, + { + "epoch": 0.18007656183540074, + "grad_norm": 0.115234375, + "learning_rate": 0.0018148807547286158, + "loss": 0.1143, + "step": 20745 + }, + { + "epoch": 0.1800852423156049, + "grad_norm": 0.357421875, + "learning_rate": 0.001814862673344549, + "loss": 0.1235, + "step": 20746 + }, + { + "epoch": 0.18009392279580907, + "grad_norm": 0.3359375, + "learning_rate": 0.001814844591178719, + "loss": 0.1089, + "step": 20747 + }, + { + "epoch": 0.18010260327601324, + "grad_norm": 0.376953125, + "learning_rate": 0.0018148265082311458, + "loss": 0.082, + "step": 20748 + }, + { + "epoch": 0.1801112837562174, + "grad_norm": 0.546875, + "learning_rate": 0.001814808424501849, + "loss": 0.0938, + "step": 20749 + }, + { + "epoch": 0.18011996423642157, + "grad_norm": 0.08544921875, + "learning_rate": 0.0018147903399908486, + "loss": 0.0894, + "step": 20750 + }, + { + "epoch": 0.18012864471662574, + "grad_norm": 0.4921875, + "learning_rate": 0.001814772254698164, + "loss": 0.0981, + "step": 20751 + }, + { + "epoch": 0.1801373251968299, + "grad_norm": 0.1806640625, + 
"learning_rate": 0.0018147541686238152, + "loss": 0.1006, + "step": 20752 + }, + { + "epoch": 0.18014600567703407, + "grad_norm": 0.171875, + "learning_rate": 0.0018147360817678223, + "loss": 0.0908, + "step": 20753 + }, + { + "epoch": 0.18015468615723823, + "grad_norm": 0.10693359375, + "learning_rate": 0.0018147179941302046, + "loss": 0.1108, + "step": 20754 + }, + { + "epoch": 0.1801633666374424, + "grad_norm": 0.890625, + "learning_rate": 0.0018146999057109816, + "loss": 0.126, + "step": 20755 + }, + { + "epoch": 0.18017204711764656, + "grad_norm": 0.11181640625, + "learning_rate": 0.0018146818165101742, + "loss": 0.1191, + "step": 20756 + }, + { + "epoch": 0.18018072759785073, + "grad_norm": 0.2373046875, + "learning_rate": 0.0018146637265278012, + "loss": 0.1025, + "step": 20757 + }, + { + "epoch": 0.1801894080780549, + "grad_norm": 0.400390625, + "learning_rate": 0.0018146456357638827, + "loss": 0.1758, + "step": 20758 + }, + { + "epoch": 0.18019808855825906, + "grad_norm": 0.53125, + "learning_rate": 0.0018146275442184386, + "loss": 0.1445, + "step": 20759 + }, + { + "epoch": 0.18020676903846322, + "grad_norm": 0.484375, + "learning_rate": 0.0018146094518914882, + "loss": 0.1367, + "step": 20760 + }, + { + "epoch": 0.18021544951866736, + "grad_norm": 0.271484375, + "learning_rate": 0.0018145913587830518, + "loss": 0.0962, + "step": 20761 + }, + { + "epoch": 0.18022412999887152, + "grad_norm": 0.158203125, + "learning_rate": 0.001814573264893149, + "loss": 0.1484, + "step": 20762 + }, + { + "epoch": 0.1802328104790757, + "grad_norm": 0.2294921875, + "learning_rate": 0.0018145551702217998, + "loss": 0.1035, + "step": 20763 + }, + { + "epoch": 0.18024149095927985, + "grad_norm": 0.1865234375, + "learning_rate": 0.0018145370747690238, + "loss": 0.1357, + "step": 20764 + }, + { + "epoch": 0.18025017143948402, + "grad_norm": 0.52734375, + "learning_rate": 0.0018145189785348406, + "loss": 0.083, + "step": 20765 + }, + { + "epoch": 0.18025885191968818, + "grad_norm": 0.80859375, + "learning_rate": 0.0018145008815192703, + "loss": 0.0928, + "step": 20766 + }, + { + "epoch": 0.18026753239989235, + "grad_norm": 0.2216796875, + "learning_rate": 0.0018144827837223325, + "loss": 0.0957, + "step": 20767 + }, + { + "epoch": 0.18027621288009651, + "grad_norm": 0.3984375, + "learning_rate": 0.0018144646851440472, + "loss": 0.0908, + "step": 20768 + }, + { + "epoch": 0.18028489336030068, + "grad_norm": 0.283203125, + "learning_rate": 0.0018144465857844341, + "loss": 0.0864, + "step": 20769 + }, + { + "epoch": 0.18029357384050484, + "grad_norm": 0.455078125, + "learning_rate": 0.0018144284856435125, + "loss": 0.082, + "step": 20770 + }, + { + "epoch": 0.180302254320709, + "grad_norm": 0.1591796875, + "learning_rate": 0.001814410384721303, + "loss": 0.1279, + "step": 20771 + }, + { + "epoch": 0.18031093480091318, + "grad_norm": 0.25390625, + "learning_rate": 0.001814392283017825, + "loss": 0.0996, + "step": 20772 + }, + { + "epoch": 0.18031961528111734, + "grad_norm": 0.328125, + "learning_rate": 0.0018143741805330981, + "loss": 0.1162, + "step": 20773 + }, + { + "epoch": 0.1803282957613215, + "grad_norm": 0.07080078125, + "learning_rate": 0.0018143560772671428, + "loss": 0.1084, + "step": 20774 + }, + { + "epoch": 0.18033697624152567, + "grad_norm": 0.10107421875, + "learning_rate": 0.001814337973219978, + "loss": 0.1191, + "step": 20775 + }, + { + "epoch": 0.18034565672172984, + "grad_norm": 0.41796875, + "learning_rate": 0.0018143198683916242, + "loss": 0.1182, + "step": 20776 + }, + { + "epoch": 
0.180354337201934, + "grad_norm": 0.41796875, + "learning_rate": 0.0018143017627821009, + "loss": 0.1133, + "step": 20777 + }, + { + "epoch": 0.18036301768213817, + "grad_norm": 0.224609375, + "learning_rate": 0.001814283656391428, + "loss": 0.1318, + "step": 20778 + }, + { + "epoch": 0.18037169816234233, + "grad_norm": 0.322265625, + "learning_rate": 0.0018142655492196249, + "loss": 0.1152, + "step": 20779 + }, + { + "epoch": 0.1803803786425465, + "grad_norm": 0.279296875, + "learning_rate": 0.001814247441266712, + "loss": 0.127, + "step": 20780 + }, + { + "epoch": 0.18038905912275066, + "grad_norm": 0.89453125, + "learning_rate": 0.0018142293325327085, + "loss": 0.1543, + "step": 20781 + }, + { + "epoch": 0.18039773960295483, + "grad_norm": 0.2734375, + "learning_rate": 0.0018142112230176348, + "loss": 0.1064, + "step": 20782 + }, + { + "epoch": 0.180406420083159, + "grad_norm": 0.318359375, + "learning_rate": 0.0018141931127215105, + "loss": 0.1387, + "step": 20783 + }, + { + "epoch": 0.18041510056336316, + "grad_norm": 0.1923828125, + "learning_rate": 0.0018141750016443551, + "loss": 0.0952, + "step": 20784 + }, + { + "epoch": 0.18042378104356732, + "grad_norm": 0.1162109375, + "learning_rate": 0.001814156889786189, + "loss": 0.1016, + "step": 20785 + }, + { + "epoch": 0.1804324615237715, + "grad_norm": 0.4453125, + "learning_rate": 0.0018141387771470315, + "loss": 0.1074, + "step": 20786 + }, + { + "epoch": 0.18044114200397565, + "grad_norm": 0.470703125, + "learning_rate": 0.0018141206637269022, + "loss": 0.1377, + "step": 20787 + }, + { + "epoch": 0.18044982248417982, + "grad_norm": 0.5859375, + "learning_rate": 0.001814102549525822, + "loss": 0.0786, + "step": 20788 + }, + { + "epoch": 0.18045850296438398, + "grad_norm": 0.20703125, + "learning_rate": 0.0018140844345438092, + "loss": 0.1699, + "step": 20789 + }, + { + "epoch": 0.18046718344458815, + "grad_norm": 0.298828125, + "learning_rate": 0.0018140663187808847, + "loss": 0.1377, + "step": 20790 + }, + { + "epoch": 0.1804758639247923, + "grad_norm": 0.10205078125, + "learning_rate": 0.0018140482022370684, + "loss": 0.1064, + "step": 20791 + }, + { + "epoch": 0.18048454440499648, + "grad_norm": 0.8828125, + "learning_rate": 0.001814030084912379, + "loss": 0.1221, + "step": 20792 + }, + { + "epoch": 0.18049322488520064, + "grad_norm": 0.294921875, + "learning_rate": 0.0018140119668068376, + "loss": 0.1001, + "step": 20793 + }, + { + "epoch": 0.1805019053654048, + "grad_norm": 0.306640625, + "learning_rate": 0.001813993847920463, + "loss": 0.1133, + "step": 20794 + }, + { + "epoch": 0.18051058584560897, + "grad_norm": 0.3671875, + "learning_rate": 0.001813975728253276, + "loss": 0.1221, + "step": 20795 + }, + { + "epoch": 0.18051926632581314, + "grad_norm": 0.18359375, + "learning_rate": 0.0018139576078052954, + "loss": 0.1191, + "step": 20796 + }, + { + "epoch": 0.1805279468060173, + "grad_norm": 0.71875, + "learning_rate": 0.0018139394865765417, + "loss": 0.0986, + "step": 20797 + }, + { + "epoch": 0.18053662728622147, + "grad_norm": 0.1416015625, + "learning_rate": 0.0018139213645670346, + "loss": 0.0947, + "step": 20798 + }, + { + "epoch": 0.18054530776642563, + "grad_norm": 0.1630859375, + "learning_rate": 0.0018139032417767939, + "loss": 0.0938, + "step": 20799 + }, + { + "epoch": 0.1805539882466298, + "grad_norm": 0.302734375, + "learning_rate": 0.0018138851182058389, + "loss": 0.1221, + "step": 20800 + }, + { + "epoch": 0.18056266872683396, + "grad_norm": 0.18359375, + "learning_rate": 0.0018138669938541903, + "loss": 
0.0986, + "step": 20801 + }, + { + "epoch": 0.18057134920703813, + "grad_norm": 0.1689453125, + "learning_rate": 0.0018138488687218673, + "loss": 0.1211, + "step": 20802 + }, + { + "epoch": 0.1805800296872423, + "grad_norm": 0.12158203125, + "learning_rate": 0.0018138307428088897, + "loss": 0.1221, + "step": 20803 + }, + { + "epoch": 0.18058871016744646, + "grad_norm": 0.1015625, + "learning_rate": 0.0018138126161152777, + "loss": 0.1377, + "step": 20804 + }, + { + "epoch": 0.18059739064765062, + "grad_norm": 0.2470703125, + "learning_rate": 0.0018137944886410509, + "loss": 0.1533, + "step": 20805 + }, + { + "epoch": 0.1806060711278548, + "grad_norm": 0.52734375, + "learning_rate": 0.0018137763603862291, + "loss": 0.1118, + "step": 20806 + }, + { + "epoch": 0.18061475160805895, + "grad_norm": 0.73046875, + "learning_rate": 0.0018137582313508323, + "loss": 0.085, + "step": 20807 + }, + { + "epoch": 0.18062343208826312, + "grad_norm": 0.33984375, + "learning_rate": 0.0018137401015348802, + "loss": 0.1021, + "step": 20808 + }, + { + "epoch": 0.18063211256846728, + "grad_norm": 0.162109375, + "learning_rate": 0.0018137219709383927, + "loss": 0.0776, + "step": 20809 + }, + { + "epoch": 0.18064079304867145, + "grad_norm": 0.08935546875, + "learning_rate": 0.0018137038395613897, + "loss": 0.1221, + "step": 20810 + }, + { + "epoch": 0.18064947352887561, + "grad_norm": 0.4140625, + "learning_rate": 0.0018136857074038909, + "loss": 0.0854, + "step": 20811 + }, + { + "epoch": 0.18065815400907978, + "grad_norm": 0.1416015625, + "learning_rate": 0.0018136675744659157, + "loss": 0.1143, + "step": 20812 + }, + { + "epoch": 0.18066683448928395, + "grad_norm": 0.2490234375, + "learning_rate": 0.0018136494407474848, + "loss": 0.1406, + "step": 20813 + }, + { + "epoch": 0.1806755149694881, + "grad_norm": 0.37890625, + "learning_rate": 0.0018136313062486174, + "loss": 0.1177, + "step": 20814 + }, + { + "epoch": 0.18068419544969228, + "grad_norm": 0.4765625, + "learning_rate": 0.0018136131709693335, + "loss": 0.0972, + "step": 20815 + }, + { + "epoch": 0.18069287592989644, + "grad_norm": 0.5, + "learning_rate": 0.001813595034909653, + "loss": 0.084, + "step": 20816 + }, + { + "epoch": 0.1807015564101006, + "grad_norm": 0.0712890625, + "learning_rate": 0.0018135768980695962, + "loss": 0.0859, + "step": 20817 + }, + { + "epoch": 0.18071023689030477, + "grad_norm": 0.75390625, + "learning_rate": 0.0018135587604491818, + "loss": 0.1611, + "step": 20818 + }, + { + "epoch": 0.18071891737050894, + "grad_norm": 0.5546875, + "learning_rate": 0.0018135406220484303, + "loss": 0.1289, + "step": 20819 + }, + { + "epoch": 0.1807275978507131, + "grad_norm": 0.48828125, + "learning_rate": 0.0018135224828673615, + "loss": 0.1572, + "step": 20820 + }, + { + "epoch": 0.18073627833091727, + "grad_norm": 0.2060546875, + "learning_rate": 0.0018135043429059951, + "loss": 0.0889, + "step": 20821 + }, + { + "epoch": 0.18074495881112143, + "grad_norm": 0.5, + "learning_rate": 0.0018134862021643517, + "loss": 0.103, + "step": 20822 + }, + { + "epoch": 0.1807536392913256, + "grad_norm": 0.87109375, + "learning_rate": 0.00181346806064245, + "loss": 0.1113, + "step": 20823 + }, + { + "epoch": 0.18076231977152976, + "grad_norm": 0.310546875, + "learning_rate": 0.0018134499183403104, + "loss": 0.168, + "step": 20824 + }, + { + "epoch": 0.18077100025173393, + "grad_norm": 0.3671875, + "learning_rate": 0.0018134317752579528, + "loss": 0.1079, + "step": 20825 + }, + { + "epoch": 0.1807796807319381, + "grad_norm": 0.302734375, + "learning_rate": 
0.001813413631395397, + "loss": 0.0986, + "step": 20826 + }, + { + "epoch": 0.18078836121214226, + "grad_norm": 0.2021484375, + "learning_rate": 0.0018133954867526627, + "loss": 0.1055, + "step": 20827 + }, + { + "epoch": 0.18079704169234642, + "grad_norm": 0.189453125, + "learning_rate": 0.0018133773413297697, + "loss": 0.1328, + "step": 20828 + }, + { + "epoch": 0.1808057221725506, + "grad_norm": 0.5234375, + "learning_rate": 0.001813359195126738, + "loss": 0.1099, + "step": 20829 + }, + { + "epoch": 0.18081440265275475, + "grad_norm": 0.4375, + "learning_rate": 0.0018133410481435874, + "loss": 0.1377, + "step": 20830 + }, + { + "epoch": 0.18082308313295892, + "grad_norm": 0.97265625, + "learning_rate": 0.001813322900380338, + "loss": 0.126, + "step": 20831 + }, + { + "epoch": 0.18083176361316308, + "grad_norm": 0.119140625, + "learning_rate": 0.0018133047518370088, + "loss": 0.1348, + "step": 20832 + }, + { + "epoch": 0.18084044409336725, + "grad_norm": 0.07958984375, + "learning_rate": 0.0018132866025136207, + "loss": 0.0864, + "step": 20833 + }, + { + "epoch": 0.1808491245735714, + "grad_norm": 0.173828125, + "learning_rate": 0.001813268452410193, + "loss": 0.0923, + "step": 20834 + }, + { + "epoch": 0.18085780505377558, + "grad_norm": 0.76953125, + "learning_rate": 0.0018132503015267458, + "loss": 0.126, + "step": 20835 + }, + { + "epoch": 0.18086648553397974, + "grad_norm": 0.341796875, + "learning_rate": 0.0018132321498632985, + "loss": 0.0581, + "step": 20836 + }, + { + "epoch": 0.1808751660141839, + "grad_norm": 0.1064453125, + "learning_rate": 0.0018132139974198717, + "loss": 0.0918, + "step": 20837 + }, + { + "epoch": 0.18088384649438807, + "grad_norm": 0.259765625, + "learning_rate": 0.0018131958441964845, + "loss": 0.1094, + "step": 20838 + }, + { + "epoch": 0.18089252697459224, + "grad_norm": 0.0830078125, + "learning_rate": 0.0018131776901931568, + "loss": 0.0977, + "step": 20839 + }, + { + "epoch": 0.1809012074547964, + "grad_norm": 0.2158203125, + "learning_rate": 0.001813159535409909, + "loss": 0.1128, + "step": 20840 + }, + { + "epoch": 0.18090988793500057, + "grad_norm": 0.396484375, + "learning_rate": 0.0018131413798467607, + "loss": 0.1523, + "step": 20841 + }, + { + "epoch": 0.18091856841520473, + "grad_norm": 0.73046875, + "learning_rate": 0.0018131232235037312, + "loss": 0.1641, + "step": 20842 + }, + { + "epoch": 0.1809272488954089, + "grad_norm": 0.0986328125, + "learning_rate": 0.0018131050663808413, + "loss": 0.1016, + "step": 20843 + }, + { + "epoch": 0.18093592937561306, + "grad_norm": 0.4453125, + "learning_rate": 0.0018130869084781102, + "loss": 0.127, + "step": 20844 + }, + { + "epoch": 0.18094460985581723, + "grad_norm": 0.11181640625, + "learning_rate": 0.001813068749795558, + "loss": 0.1006, + "step": 20845 + }, + { + "epoch": 0.1809532903360214, + "grad_norm": 0.6796875, + "learning_rate": 0.0018130505903332047, + "loss": 0.1621, + "step": 20846 + }, + { + "epoch": 0.18096197081622556, + "grad_norm": 0.1025390625, + "learning_rate": 0.00181303243009107, + "loss": 0.1201, + "step": 20847 + }, + { + "epoch": 0.18097065129642972, + "grad_norm": 0.138671875, + "learning_rate": 0.0018130142690691736, + "loss": 0.1113, + "step": 20848 + }, + { + "epoch": 0.1809793317766339, + "grad_norm": 0.10791015625, + "learning_rate": 0.0018129961072675357, + "loss": 0.104, + "step": 20849 + }, + { + "epoch": 0.18098801225683805, + "grad_norm": 0.2734375, + "learning_rate": 0.0018129779446861755, + "loss": 0.0957, + "step": 20850 + }, + { + "epoch": 0.18099669273704222, 
+ "grad_norm": 0.09765625, + "learning_rate": 0.001812959781325114, + "loss": 0.1543, + "step": 20851 + }, + { + "epoch": 0.18100537321724638, + "grad_norm": 0.3984375, + "learning_rate": 0.00181294161718437, + "loss": 0.084, + "step": 20852 + }, + { + "epoch": 0.18101405369745055, + "grad_norm": 0.3515625, + "learning_rate": 0.0018129234522639636, + "loss": 0.0942, + "step": 20853 + }, + { + "epoch": 0.18102273417765471, + "grad_norm": 0.2451171875, + "learning_rate": 0.001812905286563915, + "loss": 0.0864, + "step": 20854 + }, + { + "epoch": 0.18103141465785888, + "grad_norm": 0.8515625, + "learning_rate": 0.0018128871200842442, + "loss": 0.1143, + "step": 20855 + }, + { + "epoch": 0.18104009513806305, + "grad_norm": 0.1220703125, + "learning_rate": 0.0018128689528249704, + "loss": 0.0991, + "step": 20856 + }, + { + "epoch": 0.1810487756182672, + "grad_norm": 0.48046875, + "learning_rate": 0.0018128507847861139, + "loss": 0.1104, + "step": 20857 + }, + { + "epoch": 0.18105745609847138, + "grad_norm": 0.30078125, + "learning_rate": 0.0018128326159676945, + "loss": 0.1523, + "step": 20858 + }, + { + "epoch": 0.18106613657867554, + "grad_norm": 0.1005859375, + "learning_rate": 0.001812814446369732, + "loss": 0.1514, + "step": 20859 + }, + { + "epoch": 0.1810748170588797, + "grad_norm": 1.4375, + "learning_rate": 0.0018127962759922464, + "loss": 0.1572, + "step": 20860 + }, + { + "epoch": 0.18108349753908387, + "grad_norm": 0.400390625, + "learning_rate": 0.0018127781048352576, + "loss": 0.1211, + "step": 20861 + }, + { + "epoch": 0.18109217801928804, + "grad_norm": 0.271484375, + "learning_rate": 0.001812759932898785, + "loss": 0.1299, + "step": 20862 + }, + { + "epoch": 0.1811008584994922, + "grad_norm": 0.08837890625, + "learning_rate": 0.0018127417601828493, + "loss": 0.1367, + "step": 20863 + }, + { + "epoch": 0.18110953897969637, + "grad_norm": 0.58203125, + "learning_rate": 0.00181272358668747, + "loss": 0.0996, + "step": 20864 + }, + { + "epoch": 0.18111821945990053, + "grad_norm": 0.1875, + "learning_rate": 0.0018127054124126665, + "loss": 0.0889, + "step": 20865 + }, + { + "epoch": 0.1811268999401047, + "grad_norm": 0.06884765625, + "learning_rate": 0.0018126872373584594, + "loss": 0.0967, + "step": 20866 + }, + { + "epoch": 0.18113558042030886, + "grad_norm": 0.1875, + "learning_rate": 0.0018126690615248677, + "loss": 0.1172, + "step": 20867 + }, + { + "epoch": 0.18114426090051303, + "grad_norm": 0.255859375, + "learning_rate": 0.0018126508849119123, + "loss": 0.1182, + "step": 20868 + }, + { + "epoch": 0.1811529413807172, + "grad_norm": 0.1923828125, + "learning_rate": 0.0018126327075196126, + "loss": 0.105, + "step": 20869 + }, + { + "epoch": 0.18116162186092136, + "grad_norm": 0.87890625, + "learning_rate": 0.0018126145293479881, + "loss": 0.1357, + "step": 20870 + }, + { + "epoch": 0.18117030234112552, + "grad_norm": 0.1513671875, + "learning_rate": 0.0018125963503970593, + "loss": 0.127, + "step": 20871 + }, + { + "epoch": 0.1811789828213297, + "grad_norm": 0.484375, + "learning_rate": 0.001812578170666846, + "loss": 0.0923, + "step": 20872 + }, + { + "epoch": 0.18118766330153385, + "grad_norm": 0.10791015625, + "learning_rate": 0.0018125599901573676, + "loss": 0.1216, + "step": 20873 + }, + { + "epoch": 0.18119634378173802, + "grad_norm": 0.494140625, + "learning_rate": 0.0018125418088686446, + "loss": 0.166, + "step": 20874 + }, + { + "epoch": 0.18120502426194218, + "grad_norm": 0.478515625, + "learning_rate": 0.0018125236268006961, + "loss": 0.1055, + "step": 20875 + }, + 
{ + "epoch": 0.18121370474214635, + "grad_norm": 0.1748046875, + "learning_rate": 0.001812505443953543, + "loss": 0.123, + "step": 20876 + }, + { + "epoch": 0.1812223852223505, + "grad_norm": 0.2197265625, + "learning_rate": 0.0018124872603272045, + "loss": 0.1279, + "step": 20877 + }, + { + "epoch": 0.18123106570255468, + "grad_norm": 0.16796875, + "learning_rate": 0.0018124690759217007, + "loss": 0.1182, + "step": 20878 + }, + { + "epoch": 0.18123974618275884, + "grad_norm": 0.69921875, + "learning_rate": 0.0018124508907370513, + "loss": 0.1025, + "step": 20879 + }, + { + "epoch": 0.181248426662963, + "grad_norm": 0.1875, + "learning_rate": 0.0018124327047732765, + "loss": 0.126, + "step": 20880 + }, + { + "epoch": 0.18125710714316717, + "grad_norm": 0.73828125, + "learning_rate": 0.0018124145180303955, + "loss": 0.1104, + "step": 20881 + }, + { + "epoch": 0.18126578762337134, + "grad_norm": 0.1435546875, + "learning_rate": 0.001812396330508429, + "loss": 0.0977, + "step": 20882 + }, + { + "epoch": 0.1812744681035755, + "grad_norm": 0.0966796875, + "learning_rate": 0.0018123781422073968, + "loss": 0.2061, + "step": 20883 + }, + { + "epoch": 0.18128314858377967, + "grad_norm": 0.1943359375, + "learning_rate": 0.0018123599531273185, + "loss": 0.1475, + "step": 20884 + }, + { + "epoch": 0.1812918290639838, + "grad_norm": 0.1953125, + "learning_rate": 0.0018123417632682137, + "loss": 0.0967, + "step": 20885 + }, + { + "epoch": 0.18130050954418797, + "grad_norm": 0.29296875, + "learning_rate": 0.0018123235726301029, + "loss": 0.0996, + "step": 20886 + }, + { + "epoch": 0.18130919002439214, + "grad_norm": 0.1416015625, + "learning_rate": 0.0018123053812130058, + "loss": 0.126, + "step": 20887 + }, + { + "epoch": 0.1813178705045963, + "grad_norm": 0.375, + "learning_rate": 0.0018122871890169422, + "loss": 0.0947, + "step": 20888 + }, + { + "epoch": 0.18132655098480047, + "grad_norm": 0.75390625, + "learning_rate": 0.0018122689960419316, + "loss": 0.1064, + "step": 20889 + }, + { + "epoch": 0.18133523146500463, + "grad_norm": 0.4921875, + "learning_rate": 0.0018122508022879948, + "loss": 0.1128, + "step": 20890 + }, + { + "epoch": 0.1813439119452088, + "grad_norm": 0.10205078125, + "learning_rate": 0.001812232607755151, + "loss": 0.0889, + "step": 20891 + }, + { + "epoch": 0.18135259242541296, + "grad_norm": 0.13671875, + "learning_rate": 0.0018122144124434205, + "loss": 0.1113, + "step": 20892 + }, + { + "epoch": 0.18136127290561713, + "grad_norm": 0.0810546875, + "learning_rate": 0.001812196216352823, + "loss": 0.0879, + "step": 20893 + }, + { + "epoch": 0.1813699533858213, + "grad_norm": 0.333984375, + "learning_rate": 0.001812178019483379, + "loss": 0.0928, + "step": 20894 + }, + { + "epoch": 0.18137863386602546, + "grad_norm": 0.703125, + "learning_rate": 0.0018121598218351067, + "loss": 0.0898, + "step": 20895 + }, + { + "epoch": 0.18138731434622962, + "grad_norm": 0.5390625, + "learning_rate": 0.0018121416234080276, + "loss": 0.1504, + "step": 20896 + }, + { + "epoch": 0.1813959948264338, + "grad_norm": 0.1650390625, + "learning_rate": 0.0018121234242021611, + "loss": 0.0991, + "step": 20897 + }, + { + "epoch": 0.18140467530663795, + "grad_norm": 0.474609375, + "learning_rate": 0.0018121052242175276, + "loss": 0.1099, + "step": 20898 + }, + { + "epoch": 0.18141335578684212, + "grad_norm": 0.2314453125, + "learning_rate": 0.0018120870234541457, + "loss": 0.1162, + "step": 20899 + }, + { + "epoch": 0.18142203626704628, + "grad_norm": 0.58203125, + "learning_rate": 0.0018120688219120367, + 
"loss": 0.123, + "step": 20900 + }, + { + "epoch": 0.18143071674725045, + "grad_norm": 0.060302734375, + "learning_rate": 0.0018120506195912196, + "loss": 0.0869, + "step": 20901 + }, + { + "epoch": 0.1814393972274546, + "grad_norm": 0.11328125, + "learning_rate": 0.0018120324164917153, + "loss": 0.1064, + "step": 20902 + }, + { + "epoch": 0.18144807770765878, + "grad_norm": 0.474609375, + "learning_rate": 0.0018120142126135425, + "loss": 0.1445, + "step": 20903 + }, + { + "epoch": 0.18145675818786294, + "grad_norm": 0.1298828125, + "learning_rate": 0.0018119960079567214, + "loss": 0.1152, + "step": 20904 + }, + { + "epoch": 0.1814654386680671, + "grad_norm": 0.1640625, + "learning_rate": 0.0018119778025212725, + "loss": 0.0928, + "step": 20905 + }, + { + "epoch": 0.18147411914827127, + "grad_norm": 0.50390625, + "learning_rate": 0.0018119595963072156, + "loss": 0.103, + "step": 20906 + }, + { + "epoch": 0.18148279962847544, + "grad_norm": 0.59375, + "learning_rate": 0.00181194138931457, + "loss": 0.125, + "step": 20907 + }, + { + "epoch": 0.1814914801086796, + "grad_norm": 0.25, + "learning_rate": 0.001811923181543356, + "loss": 0.1011, + "step": 20908 + }, + { + "epoch": 0.18150016058888377, + "grad_norm": 0.1220703125, + "learning_rate": 0.0018119049729935937, + "loss": 0.1309, + "step": 20909 + }, + { + "epoch": 0.18150884106908793, + "grad_norm": 0.15625, + "learning_rate": 0.0018118867636653028, + "loss": 0.1162, + "step": 20910 + }, + { + "epoch": 0.1815175215492921, + "grad_norm": 0.283203125, + "learning_rate": 0.0018118685535585032, + "loss": 0.1553, + "step": 20911 + }, + { + "epoch": 0.18152620202949626, + "grad_norm": 0.380859375, + "learning_rate": 0.0018118503426732147, + "loss": 0.0903, + "step": 20912 + }, + { + "epoch": 0.18153488250970043, + "grad_norm": 0.69921875, + "learning_rate": 0.0018118321310094576, + "loss": 0.1357, + "step": 20913 + }, + { + "epoch": 0.1815435629899046, + "grad_norm": 0.1708984375, + "learning_rate": 0.0018118139185672514, + "loss": 0.0796, + "step": 20914 + }, + { + "epoch": 0.18155224347010876, + "grad_norm": 0.099609375, + "learning_rate": 0.0018117957053466161, + "loss": 0.1074, + "step": 20915 + }, + { + "epoch": 0.18156092395031292, + "grad_norm": 0.404296875, + "learning_rate": 0.0018117774913475719, + "loss": 0.1079, + "step": 20916 + }, + { + "epoch": 0.1815696044305171, + "grad_norm": 0.2119140625, + "learning_rate": 0.0018117592765701388, + "loss": 0.1006, + "step": 20917 + }, + { + "epoch": 0.18157828491072125, + "grad_norm": 0.236328125, + "learning_rate": 0.0018117410610143361, + "loss": 0.0952, + "step": 20918 + }, + { + "epoch": 0.18158696539092542, + "grad_norm": 0.84375, + "learning_rate": 0.0018117228446801839, + "loss": 0.1035, + "step": 20919 + }, + { + "epoch": 0.18159564587112959, + "grad_norm": 0.1533203125, + "learning_rate": 0.0018117046275677026, + "loss": 0.126, + "step": 20920 + }, + { + "epoch": 0.18160432635133375, + "grad_norm": 0.08203125, + "learning_rate": 0.0018116864096769116, + "loss": 0.1221, + "step": 20921 + }, + { + "epoch": 0.18161300683153792, + "grad_norm": 0.51953125, + "learning_rate": 0.0018116681910078313, + "loss": 0.085, + "step": 20922 + }, + { + "epoch": 0.18162168731174208, + "grad_norm": 0.12158203125, + "learning_rate": 0.0018116499715604812, + "loss": 0.1348, + "step": 20923 + }, + { + "epoch": 0.18163036779194625, + "grad_norm": 0.3125, + "learning_rate": 0.0018116317513348812, + "loss": 0.0923, + "step": 20924 + }, + { + "epoch": 0.1816390482721504, + "grad_norm": 0.357421875, + 
"learning_rate": 0.0018116135303310518, + "loss": 0.0977, + "step": 20925 + }, + { + "epoch": 0.18164772875235458, + "grad_norm": 0.13671875, + "learning_rate": 0.0018115953085490121, + "loss": 0.0903, + "step": 20926 + }, + { + "epoch": 0.18165640923255874, + "grad_norm": 0.1982421875, + "learning_rate": 0.001811577085988783, + "loss": 0.1084, + "step": 20927 + }, + { + "epoch": 0.1816650897127629, + "grad_norm": 0.39453125, + "learning_rate": 0.0018115588626503836, + "loss": 0.0986, + "step": 20928 + }, + { + "epoch": 0.18167377019296707, + "grad_norm": 0.1689453125, + "learning_rate": 0.001811540638533834, + "loss": 0.0967, + "step": 20929 + }, + { + "epoch": 0.18168245067317124, + "grad_norm": 0.41015625, + "learning_rate": 0.0018115224136391545, + "loss": 0.0986, + "step": 20930 + }, + { + "epoch": 0.1816911311533754, + "grad_norm": 0.1474609375, + "learning_rate": 0.0018115041879663647, + "loss": 0.0996, + "step": 20931 + }, + { + "epoch": 0.18169981163357957, + "grad_norm": 1.1875, + "learning_rate": 0.0018114859615154845, + "loss": 0.1455, + "step": 20932 + }, + { + "epoch": 0.18170849211378373, + "grad_norm": 0.28125, + "learning_rate": 0.001811467734286534, + "loss": 0.0903, + "step": 20933 + }, + { + "epoch": 0.1817171725939879, + "grad_norm": 0.435546875, + "learning_rate": 0.001811449506279533, + "loss": 0.1621, + "step": 20934 + }, + { + "epoch": 0.18172585307419206, + "grad_norm": 0.4921875, + "learning_rate": 0.0018114312774945014, + "loss": 0.1001, + "step": 20935 + }, + { + "epoch": 0.18173453355439623, + "grad_norm": 0.283203125, + "learning_rate": 0.0018114130479314596, + "loss": 0.1396, + "step": 20936 + }, + { + "epoch": 0.1817432140346004, + "grad_norm": 0.1376953125, + "learning_rate": 0.0018113948175904268, + "loss": 0.1191, + "step": 20937 + }, + { + "epoch": 0.18175189451480456, + "grad_norm": 0.68359375, + "learning_rate": 0.0018113765864714235, + "loss": 0.126, + "step": 20938 + }, + { + "epoch": 0.18176057499500872, + "grad_norm": 0.43359375, + "learning_rate": 0.0018113583545744697, + "loss": 0.0859, + "step": 20939 + }, + { + "epoch": 0.1817692554752129, + "grad_norm": 0.173828125, + "learning_rate": 0.0018113401218995847, + "loss": 0.1201, + "step": 20940 + }, + { + "epoch": 0.18177793595541705, + "grad_norm": 0.306640625, + "learning_rate": 0.001811321888446789, + "loss": 0.1025, + "step": 20941 + }, + { + "epoch": 0.18178661643562122, + "grad_norm": 0.412109375, + "learning_rate": 0.0018113036542161025, + "loss": 0.1279, + "step": 20942 + }, + { + "epoch": 0.18179529691582538, + "grad_norm": 0.349609375, + "learning_rate": 0.0018112854192075447, + "loss": 0.0947, + "step": 20943 + }, + { + "epoch": 0.18180397739602955, + "grad_norm": 0.10107421875, + "learning_rate": 0.001811267183421136, + "loss": 0.0771, + "step": 20944 + }, + { + "epoch": 0.1818126578762337, + "grad_norm": 0.30859375, + "learning_rate": 0.0018112489468568964, + "loss": 0.0957, + "step": 20945 + }, + { + "epoch": 0.18182133835643788, + "grad_norm": 0.0830078125, + "learning_rate": 0.0018112307095148456, + "loss": 0.1089, + "step": 20946 + }, + { + "epoch": 0.18183001883664204, + "grad_norm": 0.10498046875, + "learning_rate": 0.0018112124713950033, + "loss": 0.1465, + "step": 20947 + }, + { + "epoch": 0.1818386993168462, + "grad_norm": 1.984375, + "learning_rate": 0.0018111942324973897, + "loss": 0.1455, + "step": 20948 + }, + { + "epoch": 0.18184737979705037, + "grad_norm": 0.2255859375, + "learning_rate": 0.001811175992822025, + "loss": 0.1455, + "step": 20949 + }, + { + "epoch": 
0.18185606027725454, + "grad_norm": 0.369140625, + "learning_rate": 0.0018111577523689292, + "loss": 0.1299, + "step": 20950 + }, + { + "epoch": 0.1818647407574587, + "grad_norm": 1.25, + "learning_rate": 0.0018111395111381214, + "loss": 0.125, + "step": 20951 + }, + { + "epoch": 0.18187342123766287, + "grad_norm": 0.67578125, + "learning_rate": 0.0018111212691296222, + "loss": 0.3516, + "step": 20952 + }, + { + "epoch": 0.18188210171786703, + "grad_norm": 0.376953125, + "learning_rate": 0.0018111030263434518, + "loss": 0.1787, + "step": 20953 + }, + { + "epoch": 0.1818907821980712, + "grad_norm": 0.26171875, + "learning_rate": 0.0018110847827796294, + "loss": 0.1543, + "step": 20954 + }, + { + "epoch": 0.18189946267827536, + "grad_norm": 0.298828125, + "learning_rate": 0.0018110665384381756, + "loss": 0.0908, + "step": 20955 + }, + { + "epoch": 0.18190814315847953, + "grad_norm": 0.22265625, + "learning_rate": 0.00181104829331911, + "loss": 0.1118, + "step": 20956 + }, + { + "epoch": 0.1819168236386837, + "grad_norm": 0.74609375, + "learning_rate": 0.001811030047422453, + "loss": 0.1182, + "step": 20957 + }, + { + "epoch": 0.18192550411888786, + "grad_norm": 0.384765625, + "learning_rate": 0.001811011800748224, + "loss": 0.1426, + "step": 20958 + }, + { + "epoch": 0.18193418459909202, + "grad_norm": 0.39453125, + "learning_rate": 0.001810993553296443, + "loss": 0.0918, + "step": 20959 + }, + { + "epoch": 0.1819428650792962, + "grad_norm": 0.69921875, + "learning_rate": 0.0018109753050671307, + "loss": 0.084, + "step": 20960 + }, + { + "epoch": 0.18195154555950036, + "grad_norm": 1.0234375, + "learning_rate": 0.001810957056060306, + "loss": 0.1504, + "step": 20961 + }, + { + "epoch": 0.18196022603970452, + "grad_norm": 0.1884765625, + "learning_rate": 0.0018109388062759893, + "loss": 0.0771, + "step": 20962 + }, + { + "epoch": 0.18196890651990869, + "grad_norm": 0.365234375, + "learning_rate": 0.0018109205557142007, + "loss": 0.1118, + "step": 20963 + }, + { + "epoch": 0.18197758700011285, + "grad_norm": 0.0927734375, + "learning_rate": 0.0018109023043749602, + "loss": 0.1621, + "step": 20964 + }, + { + "epoch": 0.18198626748031702, + "grad_norm": 0.130859375, + "learning_rate": 0.0018108840522582876, + "loss": 0.0996, + "step": 20965 + }, + { + "epoch": 0.18199494796052118, + "grad_norm": 0.294921875, + "learning_rate": 0.001810865799364203, + "loss": 0.0874, + "step": 20966 + }, + { + "epoch": 0.18200362844072535, + "grad_norm": 0.1044921875, + "learning_rate": 0.0018108475456927256, + "loss": 0.1504, + "step": 20967 + }, + { + "epoch": 0.1820123089209295, + "grad_norm": 0.310546875, + "learning_rate": 0.0018108292912438768, + "loss": 0.0977, + "step": 20968 + }, + { + "epoch": 0.18202098940113368, + "grad_norm": 0.87890625, + "learning_rate": 0.0018108110360176752, + "loss": 0.1191, + "step": 20969 + }, + { + "epoch": 0.18202966988133784, + "grad_norm": 0.54296875, + "learning_rate": 0.0018107927800141415, + "loss": 0.0977, + "step": 20970 + }, + { + "epoch": 0.182038350361542, + "grad_norm": 0.59375, + "learning_rate": 0.0018107745232332957, + "loss": 0.1182, + "step": 20971 + }, + { + "epoch": 0.18204703084174617, + "grad_norm": 0.7109375, + "learning_rate": 0.0018107562656751574, + "loss": 0.1523, + "step": 20972 + }, + { + "epoch": 0.18205571132195034, + "grad_norm": 0.271484375, + "learning_rate": 0.0018107380073397468, + "loss": 0.1279, + "step": 20973 + }, + { + "epoch": 0.1820643918021545, + "grad_norm": 0.86328125, + "learning_rate": 0.0018107197482270837, + "loss": 0.1016, + 
"step": 20974 + }, + { + "epoch": 0.18207307228235867, + "grad_norm": 0.79296875, + "learning_rate": 0.0018107014883371882, + "loss": 0.127, + "step": 20975 + }, + { + "epoch": 0.18208175276256283, + "grad_norm": 0.279296875, + "learning_rate": 0.0018106832276700805, + "loss": 0.1416, + "step": 20976 + }, + { + "epoch": 0.182090433242767, + "grad_norm": 0.10546875, + "learning_rate": 0.0018106649662257797, + "loss": 0.0742, + "step": 20977 + }, + { + "epoch": 0.18209911372297116, + "grad_norm": 0.4453125, + "learning_rate": 0.0018106467040043066, + "loss": 0.0952, + "step": 20978 + }, + { + "epoch": 0.18210779420317533, + "grad_norm": 0.40234375, + "learning_rate": 0.0018106284410056813, + "loss": 0.1445, + "step": 20979 + }, + { + "epoch": 0.1821164746833795, + "grad_norm": 0.2431640625, + "learning_rate": 0.001810610177229923, + "loss": 0.0918, + "step": 20980 + }, + { + "epoch": 0.18212515516358366, + "grad_norm": 0.08251953125, + "learning_rate": 0.001810591912677052, + "loss": 0.0854, + "step": 20981 + }, + { + "epoch": 0.18213383564378782, + "grad_norm": 0.287109375, + "learning_rate": 0.001810573647347089, + "loss": 0.0796, + "step": 20982 + }, + { + "epoch": 0.182142516123992, + "grad_norm": 0.2158203125, + "learning_rate": 0.0018105553812400527, + "loss": 0.1611, + "step": 20983 + }, + { + "epoch": 0.18215119660419615, + "grad_norm": 0.31640625, + "learning_rate": 0.0018105371143559642, + "loss": 0.1299, + "step": 20984 + }, + { + "epoch": 0.18215987708440032, + "grad_norm": 0.40625, + "learning_rate": 0.0018105188466948428, + "loss": 0.1445, + "step": 20985 + }, + { + "epoch": 0.18216855756460448, + "grad_norm": 0.546875, + "learning_rate": 0.0018105005782567086, + "loss": 0.1196, + "step": 20986 + }, + { + "epoch": 0.18217723804480865, + "grad_norm": 0.56640625, + "learning_rate": 0.0018104823090415814, + "loss": 0.1104, + "step": 20987 + }, + { + "epoch": 0.1821859185250128, + "grad_norm": 0.10888671875, + "learning_rate": 0.0018104640390494817, + "loss": 0.1108, + "step": 20988 + }, + { + "epoch": 0.18219459900521698, + "grad_norm": 0.36328125, + "learning_rate": 0.0018104457682804294, + "loss": 0.0869, + "step": 20989 + }, + { + "epoch": 0.18220327948542114, + "grad_norm": 0.2119140625, + "learning_rate": 0.0018104274967344438, + "loss": 0.1172, + "step": 20990 + }, + { + "epoch": 0.1822119599656253, + "grad_norm": 0.1435546875, + "learning_rate": 0.0018104092244115456, + "loss": 0.1523, + "step": 20991 + }, + { + "epoch": 0.18222064044582947, + "grad_norm": 0.265625, + "learning_rate": 0.0018103909513117547, + "loss": 0.1143, + "step": 20992 + }, + { + "epoch": 0.18222932092603364, + "grad_norm": 0.46875, + "learning_rate": 0.0018103726774350909, + "loss": 0.1348, + "step": 20993 + }, + { + "epoch": 0.1822380014062378, + "grad_norm": 0.8515625, + "learning_rate": 0.0018103544027815738, + "loss": 0.1553, + "step": 20994 + }, + { + "epoch": 0.18224668188644197, + "grad_norm": 0.671875, + "learning_rate": 0.001810336127351224, + "loss": 0.1113, + "step": 20995 + }, + { + "epoch": 0.18225536236664613, + "grad_norm": 0.38671875, + "learning_rate": 0.0018103178511440616, + "loss": 0.082, + "step": 20996 + }, + { + "epoch": 0.1822640428468503, + "grad_norm": 0.1845703125, + "learning_rate": 0.0018102995741601058, + "loss": 0.0942, + "step": 20997 + }, + { + "epoch": 0.18227272332705446, + "grad_norm": 0.77734375, + "learning_rate": 0.0018102812963993777, + "loss": 0.1128, + "step": 20998 + }, + { + "epoch": 0.18228140380725863, + "grad_norm": 0.359375, + "learning_rate": 
0.0018102630178618959, + "loss": 0.082, + "step": 20999 + }, + { + "epoch": 0.1822900842874628, + "grad_norm": 0.2890625, + "learning_rate": 0.0018102447385476817, + "loss": 0.0967, + "step": 21000 + }, + { + "epoch": 0.18229876476766696, + "grad_norm": 0.427734375, + "learning_rate": 0.0018102264584567542, + "loss": 0.1094, + "step": 21001 + }, + { + "epoch": 0.18230744524787112, + "grad_norm": 0.73046875, + "learning_rate": 0.001810208177589134, + "loss": 0.0938, + "step": 21002 + }, + { + "epoch": 0.1823161257280753, + "grad_norm": 0.3125, + "learning_rate": 0.0018101898959448407, + "loss": 0.0928, + "step": 21003 + }, + { + "epoch": 0.18232480620827946, + "grad_norm": 0.1015625, + "learning_rate": 0.0018101716135238944, + "loss": 0.1113, + "step": 21004 + }, + { + "epoch": 0.18233348668848362, + "grad_norm": 0.10400390625, + "learning_rate": 0.0018101533303263148, + "loss": 0.0972, + "step": 21005 + }, + { + "epoch": 0.18234216716868779, + "grad_norm": 0.1201171875, + "learning_rate": 0.0018101350463521225, + "loss": 0.1475, + "step": 21006 + }, + { + "epoch": 0.18235084764889195, + "grad_norm": 0.1484375, + "learning_rate": 0.001810116761601337, + "loss": 0.1089, + "step": 21007 + }, + { + "epoch": 0.1823595281290961, + "grad_norm": 0.8203125, + "learning_rate": 0.0018100984760739786, + "loss": 0.1172, + "step": 21008 + }, + { + "epoch": 0.18236820860930025, + "grad_norm": 0.279296875, + "learning_rate": 0.0018100801897700674, + "loss": 0.1279, + "step": 21009 + }, + { + "epoch": 0.18237688908950442, + "grad_norm": 0.431640625, + "learning_rate": 0.0018100619026896227, + "loss": 0.1602, + "step": 21010 + }, + { + "epoch": 0.18238556956970858, + "grad_norm": 0.12451171875, + "learning_rate": 0.0018100436148326654, + "loss": 0.1221, + "step": 21011 + }, + { + "epoch": 0.18239425004991275, + "grad_norm": 0.171875, + "learning_rate": 0.0018100253261992148, + "loss": 0.1387, + "step": 21012 + }, + { + "epoch": 0.1824029305301169, + "grad_norm": 0.6640625, + "learning_rate": 0.0018100070367892912, + "loss": 0.0845, + "step": 21013 + }, + { + "epoch": 0.18241161101032108, + "grad_norm": 0.2060546875, + "learning_rate": 0.0018099887466029145, + "loss": 0.0879, + "step": 21014 + }, + { + "epoch": 0.18242029149052524, + "grad_norm": 0.197265625, + "learning_rate": 0.0018099704556401048, + "loss": 0.0933, + "step": 21015 + }, + { + "epoch": 0.1824289719707294, + "grad_norm": 0.60546875, + "learning_rate": 0.0018099521639008825, + "loss": 0.0723, + "step": 21016 + }, + { + "epoch": 0.18243765245093357, + "grad_norm": 0.4921875, + "learning_rate": 0.0018099338713852666, + "loss": 0.1143, + "step": 21017 + }, + { + "epoch": 0.18244633293113774, + "grad_norm": 0.349609375, + "learning_rate": 0.0018099155780932782, + "loss": 0.1133, + "step": 21018 + }, + { + "epoch": 0.1824550134113419, + "grad_norm": 0.765625, + "learning_rate": 0.0018098972840249364, + "loss": 0.1182, + "step": 21019 + }, + { + "epoch": 0.18246369389154607, + "grad_norm": 0.341796875, + "learning_rate": 0.0018098789891802616, + "loss": 0.1152, + "step": 21020 + }, + { + "epoch": 0.18247237437175023, + "grad_norm": 0.373046875, + "learning_rate": 0.001809860693559274, + "loss": 0.1162, + "step": 21021 + }, + { + "epoch": 0.1824810548519544, + "grad_norm": 1.7734375, + "learning_rate": 0.0018098423971619933, + "loss": 0.0771, + "step": 21022 + }, + { + "epoch": 0.18248973533215856, + "grad_norm": 0.2255859375, + "learning_rate": 0.0018098240999884399, + "loss": 0.0918, + "step": 21023 + }, + { + "epoch": 0.18249841581236273, + 
"grad_norm": 1.1484375, + "learning_rate": 0.0018098058020386333, + "loss": 0.1104, + "step": 21024 + }, + { + "epoch": 0.1825070962925669, + "grad_norm": 0.79296875, + "learning_rate": 0.0018097875033125934, + "loss": 0.1348, + "step": 21025 + }, + { + "epoch": 0.18251577677277106, + "grad_norm": 0.201171875, + "learning_rate": 0.0018097692038103411, + "loss": 0.085, + "step": 21026 + }, + { + "epoch": 0.18252445725297523, + "grad_norm": 0.240234375, + "learning_rate": 0.0018097509035318955, + "loss": 0.1387, + "step": 21027 + }, + { + "epoch": 0.1825331377331794, + "grad_norm": 0.166015625, + "learning_rate": 0.0018097326024772771, + "loss": 0.1406, + "step": 21028 + }, + { + "epoch": 0.18254181821338356, + "grad_norm": 1.0234375, + "learning_rate": 0.001809714300646506, + "loss": 0.167, + "step": 21029 + }, + { + "epoch": 0.18255049869358772, + "grad_norm": 0.298828125, + "learning_rate": 0.0018096959980396018, + "loss": 0.0728, + "step": 21030 + }, + { + "epoch": 0.18255917917379189, + "grad_norm": 0.64453125, + "learning_rate": 0.0018096776946565846, + "loss": 0.1309, + "step": 21031 + }, + { + "epoch": 0.18256785965399605, + "grad_norm": 0.205078125, + "learning_rate": 0.0018096593904974748, + "loss": 0.1328, + "step": 21032 + }, + { + "epoch": 0.18257654013420022, + "grad_norm": 0.158203125, + "learning_rate": 0.001809641085562292, + "loss": 0.0869, + "step": 21033 + }, + { + "epoch": 0.18258522061440438, + "grad_norm": 0.11474609375, + "learning_rate": 0.0018096227798510562, + "loss": 0.1167, + "step": 21034 + }, + { + "epoch": 0.18259390109460855, + "grad_norm": 0.1923828125, + "learning_rate": 0.0018096044733637879, + "loss": 0.1021, + "step": 21035 + }, + { + "epoch": 0.1826025815748127, + "grad_norm": 0.2392578125, + "learning_rate": 0.0018095861661005066, + "loss": 0.1025, + "step": 21036 + }, + { + "epoch": 0.18261126205501688, + "grad_norm": 0.08837890625, + "learning_rate": 0.0018095678580612327, + "loss": 0.1543, + "step": 21037 + }, + { + "epoch": 0.18261994253522104, + "grad_norm": 0.1875, + "learning_rate": 0.001809549549245986, + "loss": 0.0957, + "step": 21038 + }, + { + "epoch": 0.1826286230154252, + "grad_norm": 0.30859375, + "learning_rate": 0.0018095312396547865, + "loss": 0.123, + "step": 21039 + }, + { + "epoch": 0.18263730349562937, + "grad_norm": 0.83203125, + "learning_rate": 0.0018095129292876545, + "loss": 0.1279, + "step": 21040 + }, + { + "epoch": 0.18264598397583354, + "grad_norm": 0.09716796875, + "learning_rate": 0.0018094946181446098, + "loss": 0.0957, + "step": 21041 + }, + { + "epoch": 0.1826546644560377, + "grad_norm": 0.146484375, + "learning_rate": 0.001809476306225672, + "loss": 0.127, + "step": 21042 + }, + { + "epoch": 0.18266334493624187, + "grad_norm": 0.083984375, + "learning_rate": 0.001809457993530862, + "loss": 0.1001, + "step": 21043 + }, + { + "epoch": 0.18267202541644603, + "grad_norm": 0.169921875, + "learning_rate": 0.001809439680060199, + "loss": 0.1055, + "step": 21044 + }, + { + "epoch": 0.1826807058966502, + "grad_norm": 0.07861328125, + "learning_rate": 0.0018094213658137038, + "loss": 0.103, + "step": 21045 + }, + { + "epoch": 0.18268938637685436, + "grad_norm": 0.94140625, + "learning_rate": 0.001809403050791396, + "loss": 0.0942, + "step": 21046 + }, + { + "epoch": 0.18269806685705853, + "grad_norm": 0.375, + "learning_rate": 0.0018093847349932956, + "loss": 0.1182, + "step": 21047 + }, + { + "epoch": 0.1827067473372627, + "grad_norm": 0.18359375, + "learning_rate": 0.001809366418419423, + "loss": 0.1299, + "step": 21048 + 
}, + { + "epoch": 0.18271542781746686, + "grad_norm": 0.466796875, + "learning_rate": 0.0018093481010697976, + "loss": 0.1006, + "step": 21049 + }, + { + "epoch": 0.18272410829767102, + "grad_norm": 0.359375, + "learning_rate": 0.0018093297829444397, + "loss": 0.1309, + "step": 21050 + }, + { + "epoch": 0.1827327887778752, + "grad_norm": 0.26171875, + "learning_rate": 0.0018093114640433696, + "loss": 0.1211, + "step": 21051 + }, + { + "epoch": 0.18274146925807935, + "grad_norm": 0.09130859375, + "learning_rate": 0.0018092931443666072, + "loss": 0.1016, + "step": 21052 + }, + { + "epoch": 0.18275014973828352, + "grad_norm": 0.09423828125, + "learning_rate": 0.0018092748239141726, + "loss": 0.0938, + "step": 21053 + }, + { + "epoch": 0.18275883021848768, + "grad_norm": 0.25, + "learning_rate": 0.0018092565026860854, + "loss": 0.1084, + "step": 21054 + }, + { + "epoch": 0.18276751069869185, + "grad_norm": 0.322265625, + "learning_rate": 0.001809238180682366, + "loss": 0.0806, + "step": 21055 + }, + { + "epoch": 0.182776191178896, + "grad_norm": 0.85546875, + "learning_rate": 0.0018092198579030344, + "loss": 0.1201, + "step": 21056 + }, + { + "epoch": 0.18278487165910018, + "grad_norm": 0.2158203125, + "learning_rate": 0.0018092015343481108, + "loss": 0.0996, + "step": 21057 + }, + { + "epoch": 0.18279355213930434, + "grad_norm": 0.32421875, + "learning_rate": 0.0018091832100176146, + "loss": 0.1128, + "step": 21058 + }, + { + "epoch": 0.1828022326195085, + "grad_norm": 0.126953125, + "learning_rate": 0.001809164884911567, + "loss": 0.0952, + "step": 21059 + }, + { + "epoch": 0.18281091309971267, + "grad_norm": 0.419921875, + "learning_rate": 0.0018091465590299868, + "loss": 0.126, + "step": 21060 + }, + { + "epoch": 0.18281959357991684, + "grad_norm": 0.111328125, + "learning_rate": 0.0018091282323728947, + "loss": 0.084, + "step": 21061 + }, + { + "epoch": 0.182828274060121, + "grad_norm": 0.12451171875, + "learning_rate": 0.0018091099049403106, + "loss": 0.0981, + "step": 21062 + }, + { + "epoch": 0.18283695454032517, + "grad_norm": 0.17578125, + "learning_rate": 0.0018090915767322545, + "loss": 0.0684, + "step": 21063 + }, + { + "epoch": 0.18284563502052933, + "grad_norm": 0.34765625, + "learning_rate": 0.0018090732477487467, + "loss": 0.0947, + "step": 21064 + }, + { + "epoch": 0.1828543155007335, + "grad_norm": 0.38671875, + "learning_rate": 0.0018090549179898068, + "loss": 0.1138, + "step": 21065 + }, + { + "epoch": 0.18286299598093766, + "grad_norm": 0.2060546875, + "learning_rate": 0.0018090365874554556, + "loss": 0.106, + "step": 21066 + }, + { + "epoch": 0.18287167646114183, + "grad_norm": 0.25390625, + "learning_rate": 0.001809018256145712, + "loss": 0.1377, + "step": 21067 + }, + { + "epoch": 0.182880356941346, + "grad_norm": 0.318359375, + "learning_rate": 0.001808999924060597, + "loss": 0.1104, + "step": 21068 + }, + { + "epoch": 0.18288903742155016, + "grad_norm": 0.130859375, + "learning_rate": 0.0018089815912001302, + "loss": 0.1533, + "step": 21069 + }, + { + "epoch": 0.18289771790175433, + "grad_norm": 0.52734375, + "learning_rate": 0.0018089632575643323, + "loss": 0.1377, + "step": 21070 + }, + { + "epoch": 0.1829063983819585, + "grad_norm": 0.1767578125, + "learning_rate": 0.001808944923153222, + "loss": 0.1387, + "step": 21071 + }, + { + "epoch": 0.18291507886216266, + "grad_norm": 0.2275390625, + "learning_rate": 0.0018089265879668208, + "loss": 0.1299, + "step": 21072 + }, + { + "epoch": 0.18292375934236682, + "grad_norm": 0.380859375, + "learning_rate": 
0.0018089082520051478, + "loss": 0.0864, + "step": 21073 + }, + { + "epoch": 0.18293243982257099, + "grad_norm": 0.197265625, + "learning_rate": 0.0018088899152682236, + "loss": 0.124, + "step": 21074 + }, + { + "epoch": 0.18294112030277515, + "grad_norm": 0.52734375, + "learning_rate": 0.001808871577756068, + "loss": 0.1172, + "step": 21075 + }, + { + "epoch": 0.18294980078297932, + "grad_norm": 0.265625, + "learning_rate": 0.0018088532394687012, + "loss": 0.1045, + "step": 21076 + }, + { + "epoch": 0.18295848126318348, + "grad_norm": 0.53125, + "learning_rate": 0.001808834900406143, + "loss": 0.0967, + "step": 21077 + }, + { + "epoch": 0.18296716174338765, + "grad_norm": 0.1884765625, + "learning_rate": 0.0018088165605684135, + "loss": 0.127, + "step": 21078 + }, + { + "epoch": 0.1829758422235918, + "grad_norm": 0.2275390625, + "learning_rate": 0.001808798219955533, + "loss": 0.0938, + "step": 21079 + }, + { + "epoch": 0.18298452270379598, + "grad_norm": 0.33984375, + "learning_rate": 0.0018087798785675214, + "loss": 0.0933, + "step": 21080 + }, + { + "epoch": 0.18299320318400014, + "grad_norm": 0.359375, + "learning_rate": 0.0018087615364043989, + "loss": 0.1162, + "step": 21081 + }, + { + "epoch": 0.1830018836642043, + "grad_norm": 0.302734375, + "learning_rate": 0.0018087431934661852, + "loss": 0.1318, + "step": 21082 + }, + { + "epoch": 0.18301056414440847, + "grad_norm": 0.0849609375, + "learning_rate": 0.0018087248497529007, + "loss": 0.1123, + "step": 21083 + }, + { + "epoch": 0.18301924462461264, + "grad_norm": 0.423828125, + "learning_rate": 0.0018087065052645652, + "loss": 0.166, + "step": 21084 + }, + { + "epoch": 0.1830279251048168, + "grad_norm": 0.234375, + "learning_rate": 0.0018086881600011993, + "loss": 0.1162, + "step": 21085 + }, + { + "epoch": 0.18303660558502097, + "grad_norm": 0.333984375, + "learning_rate": 0.0018086698139628224, + "loss": 0.0947, + "step": 21086 + }, + { + "epoch": 0.18304528606522513, + "grad_norm": 0.12109375, + "learning_rate": 0.0018086514671494546, + "loss": 0.1182, + "step": 21087 + }, + { + "epoch": 0.1830539665454293, + "grad_norm": 0.671875, + "learning_rate": 0.0018086331195611167, + "loss": 0.104, + "step": 21088 + }, + { + "epoch": 0.18306264702563346, + "grad_norm": 0.71875, + "learning_rate": 0.001808614771197828, + "loss": 0.1064, + "step": 21089 + }, + { + "epoch": 0.18307132750583763, + "grad_norm": 0.423828125, + "learning_rate": 0.001808596422059609, + "loss": 0.1279, + "step": 21090 + }, + { + "epoch": 0.1830800079860418, + "grad_norm": 0.7578125, + "learning_rate": 0.0018085780721464793, + "loss": 0.1484, + "step": 21091 + }, + { + "epoch": 0.18308868846624596, + "grad_norm": 1.171875, + "learning_rate": 0.0018085597214584595, + "loss": 0.1074, + "step": 21092 + }, + { + "epoch": 0.18309736894645012, + "grad_norm": 0.40625, + "learning_rate": 0.0018085413699955692, + "loss": 0.0962, + "step": 21093 + }, + { + "epoch": 0.1831060494266543, + "grad_norm": 0.19921875, + "learning_rate": 0.001808523017757829, + "loss": 0.0918, + "step": 21094 + }, + { + "epoch": 0.18311472990685845, + "grad_norm": 0.5546875, + "learning_rate": 0.0018085046647452587, + "loss": 0.1089, + "step": 21095 + }, + { + "epoch": 0.18312341038706262, + "grad_norm": 0.49609375, + "learning_rate": 0.0018084863109578781, + "loss": 0.0801, + "step": 21096 + }, + { + "epoch": 0.18313209086726678, + "grad_norm": 0.2041015625, + "learning_rate": 0.0018084679563957077, + "loss": 0.1289, + "step": 21097 + }, + { + "epoch": 0.18314077134747095, + "grad_norm": 0.109375, 
+ "learning_rate": 0.0018084496010587672, + "loss": 0.0889, + "step": 21098 + }, + { + "epoch": 0.1831494518276751, + "grad_norm": 0.279296875, + "learning_rate": 0.001808431244947077, + "loss": 0.1172, + "step": 21099 + }, + { + "epoch": 0.18315813230787928, + "grad_norm": 0.484375, + "learning_rate": 0.0018084128880606573, + "loss": 0.1123, + "step": 21100 + }, + { + "epoch": 0.18316681278808344, + "grad_norm": 0.0654296875, + "learning_rate": 0.0018083945303995273, + "loss": 0.1001, + "step": 21101 + }, + { + "epoch": 0.1831754932682876, + "grad_norm": 0.29296875, + "learning_rate": 0.0018083761719637082, + "loss": 0.1187, + "step": 21102 + }, + { + "epoch": 0.18318417374849177, + "grad_norm": 0.130859375, + "learning_rate": 0.0018083578127532191, + "loss": 0.1235, + "step": 21103 + }, + { + "epoch": 0.18319285422869594, + "grad_norm": 0.2236328125, + "learning_rate": 0.0018083394527680808, + "loss": 0.0938, + "step": 21104 + }, + { + "epoch": 0.1832015347089001, + "grad_norm": 0.283203125, + "learning_rate": 0.0018083210920083134, + "loss": 0.1289, + "step": 21105 + }, + { + "epoch": 0.18321021518910427, + "grad_norm": 0.47265625, + "learning_rate": 0.0018083027304739362, + "loss": 0.104, + "step": 21106 + }, + { + "epoch": 0.18321889566930843, + "grad_norm": 0.2314453125, + "learning_rate": 0.00180828436816497, + "loss": 0.083, + "step": 21107 + }, + { + "epoch": 0.1832275761495126, + "grad_norm": 0.326171875, + "learning_rate": 0.0018082660050814346, + "loss": 0.0684, + "step": 21108 + }, + { + "epoch": 0.18323625662971676, + "grad_norm": 0.53515625, + "learning_rate": 0.0018082476412233505, + "loss": 0.0928, + "step": 21109 + }, + { + "epoch": 0.18324493710992093, + "grad_norm": 0.73046875, + "learning_rate": 0.001808229276590737, + "loss": 0.1465, + "step": 21110 + }, + { + "epoch": 0.1832536175901251, + "grad_norm": 2.59375, + "learning_rate": 0.0018082109111836147, + "loss": 0.2188, + "step": 21111 + }, + { + "epoch": 0.18326229807032926, + "grad_norm": 0.185546875, + "learning_rate": 0.0018081925450020035, + "loss": 0.1406, + "step": 21112 + }, + { + "epoch": 0.18327097855053343, + "grad_norm": 0.1796875, + "learning_rate": 0.0018081741780459238, + "loss": 0.0923, + "step": 21113 + }, + { + "epoch": 0.1832796590307376, + "grad_norm": 0.478515625, + "learning_rate": 0.0018081558103153952, + "loss": 0.1611, + "step": 21114 + }, + { + "epoch": 0.18328833951094176, + "grad_norm": 0.1396484375, + "learning_rate": 0.001808137441810438, + "loss": 0.0913, + "step": 21115 + }, + { + "epoch": 0.18329701999114592, + "grad_norm": 0.2177734375, + "learning_rate": 0.0018081190725310727, + "loss": 0.0967, + "step": 21116 + }, + { + "epoch": 0.18330570047135009, + "grad_norm": 0.390625, + "learning_rate": 0.0018081007024773191, + "loss": 0.165, + "step": 21117 + }, + { + "epoch": 0.18331438095155425, + "grad_norm": 0.279296875, + "learning_rate": 0.0018080823316491965, + "loss": 0.1338, + "step": 21118 + }, + { + "epoch": 0.18332306143175842, + "grad_norm": 0.16015625, + "learning_rate": 0.0018080639600467263, + "loss": 0.1069, + "step": 21119 + }, + { + "epoch": 0.18333174191196258, + "grad_norm": 0.330078125, + "learning_rate": 0.0018080455876699278, + "loss": 0.1045, + "step": 21120 + }, + { + "epoch": 0.18334042239216675, + "grad_norm": 0.76953125, + "learning_rate": 0.0018080272145188211, + "loss": 0.1318, + "step": 21121 + }, + { + "epoch": 0.1833491028723709, + "grad_norm": 0.71875, + "learning_rate": 0.001808008840593427, + "loss": 0.1016, + "step": 21122 + }, + { + "epoch": 
0.18335778335257508, + "grad_norm": 0.60546875, + "learning_rate": 0.0018079904658937647, + "loss": 0.085, + "step": 21123 + }, + { + "epoch": 0.18336646383277924, + "grad_norm": 0.1962890625, + "learning_rate": 0.001807972090419855, + "loss": 0.1138, + "step": 21124 + }, + { + "epoch": 0.1833751443129834, + "grad_norm": 0.216796875, + "learning_rate": 0.0018079537141717172, + "loss": 0.124, + "step": 21125 + }, + { + "epoch": 0.18338382479318757, + "grad_norm": 0.1201171875, + "learning_rate": 0.001807935337149372, + "loss": 0.1289, + "step": 21126 + }, + { + "epoch": 0.18339250527339174, + "grad_norm": 0.154296875, + "learning_rate": 0.0018079169593528395, + "loss": 0.1602, + "step": 21127 + }, + { + "epoch": 0.1834011857535959, + "grad_norm": 0.1943359375, + "learning_rate": 0.0018078985807821392, + "loss": 0.1006, + "step": 21128 + }, + { + "epoch": 0.18340986623380007, + "grad_norm": 0.1376953125, + "learning_rate": 0.0018078802014372922, + "loss": 0.0923, + "step": 21129 + }, + { + "epoch": 0.18341854671400423, + "grad_norm": 0.220703125, + "learning_rate": 0.0018078618213183178, + "loss": 0.1069, + "step": 21130 + }, + { + "epoch": 0.18342722719420837, + "grad_norm": 0.458984375, + "learning_rate": 0.0018078434404252364, + "loss": 0.1299, + "step": 21131 + }, + { + "epoch": 0.18343590767441253, + "grad_norm": 0.271484375, + "learning_rate": 0.001807825058758068, + "loss": 0.1201, + "step": 21132 + }, + { + "epoch": 0.1834445881546167, + "grad_norm": 0.2119140625, + "learning_rate": 0.0018078066763168327, + "loss": 0.1377, + "step": 21133 + }, + { + "epoch": 0.18345326863482087, + "grad_norm": 0.1904296875, + "learning_rate": 0.001807788293101551, + "loss": 0.0977, + "step": 21134 + }, + { + "epoch": 0.18346194911502503, + "grad_norm": 0.2060546875, + "learning_rate": 0.0018077699091122424, + "loss": 0.0957, + "step": 21135 + }, + { + "epoch": 0.1834706295952292, + "grad_norm": 0.28125, + "learning_rate": 0.0018077515243489271, + "loss": 0.1455, + "step": 21136 + }, + { + "epoch": 0.18347931007543336, + "grad_norm": 0.296875, + "learning_rate": 0.0018077331388116256, + "loss": 0.1055, + "step": 21137 + }, + { + "epoch": 0.18348799055563753, + "grad_norm": 0.2119140625, + "learning_rate": 0.0018077147525003576, + "loss": 0.123, + "step": 21138 + }, + { + "epoch": 0.1834966710358417, + "grad_norm": 0.06005859375, + "learning_rate": 0.0018076963654151436, + "loss": 0.1084, + "step": 21139 + }, + { + "epoch": 0.18350535151604586, + "grad_norm": 0.142578125, + "learning_rate": 0.0018076779775560033, + "loss": 0.1543, + "step": 21140 + }, + { + "epoch": 0.18351403199625002, + "grad_norm": 0.2255859375, + "learning_rate": 0.001807659588922957, + "loss": 0.0967, + "step": 21141 + }, + { + "epoch": 0.1835227124764542, + "grad_norm": 0.98046875, + "learning_rate": 0.0018076411995160248, + "loss": 0.1309, + "step": 21142 + }, + { + "epoch": 0.18353139295665835, + "grad_norm": 0.185546875, + "learning_rate": 0.0018076228093352271, + "loss": 0.1201, + "step": 21143 + }, + { + "epoch": 0.18354007343686252, + "grad_norm": 0.1982421875, + "learning_rate": 0.0018076044183805836, + "loss": 0.1328, + "step": 21144 + }, + { + "epoch": 0.18354875391706668, + "grad_norm": 0.134765625, + "learning_rate": 0.0018075860266521142, + "loss": 0.0933, + "step": 21145 + }, + { + "epoch": 0.18355743439727085, + "grad_norm": 0.60546875, + "learning_rate": 0.0018075676341498396, + "loss": 0.1602, + "step": 21146 + }, + { + "epoch": 0.183566114877475, + "grad_norm": 0.1494140625, + "learning_rate": 
0.0018075492408737798, + "loss": 0.1182, + "step": 21147 + }, + { + "epoch": 0.18357479535767918, + "grad_norm": 0.2412109375, + "learning_rate": 0.0018075308468239546, + "loss": 0.1045, + "step": 21148 + }, + { + "epoch": 0.18358347583788334, + "grad_norm": 0.283203125, + "learning_rate": 0.0018075124520003842, + "loss": 0.1074, + "step": 21149 + }, + { + "epoch": 0.1835921563180875, + "grad_norm": 0.2080078125, + "learning_rate": 0.001807494056403089, + "loss": 0.0723, + "step": 21150 + }, + { + "epoch": 0.18360083679829167, + "grad_norm": 0.39453125, + "learning_rate": 0.001807475660032089, + "loss": 0.1338, + "step": 21151 + }, + { + "epoch": 0.18360951727849584, + "grad_norm": 0.57421875, + "learning_rate": 0.0018074572628874043, + "loss": 0.084, + "step": 21152 + }, + { + "epoch": 0.1836181977587, + "grad_norm": 0.7421875, + "learning_rate": 0.0018074388649690547, + "loss": 0.1172, + "step": 21153 + }, + { + "epoch": 0.18362687823890417, + "grad_norm": 0.0859375, + "learning_rate": 0.0018074204662770607, + "loss": 0.1309, + "step": 21154 + }, + { + "epoch": 0.18363555871910833, + "grad_norm": 0.1162109375, + "learning_rate": 0.001807402066811442, + "loss": 0.123, + "step": 21155 + }, + { + "epoch": 0.1836442391993125, + "grad_norm": 0.08056640625, + "learning_rate": 0.0018073836665722196, + "loss": 0.1074, + "step": 21156 + }, + { + "epoch": 0.18365291967951666, + "grad_norm": 0.32421875, + "learning_rate": 0.0018073652655594126, + "loss": 0.1289, + "step": 21157 + }, + { + "epoch": 0.18366160015972083, + "grad_norm": 1.4453125, + "learning_rate": 0.0018073468637730418, + "loss": 0.0947, + "step": 21158 + }, + { + "epoch": 0.183670280639925, + "grad_norm": 0.421875, + "learning_rate": 0.0018073284612131271, + "loss": 0.1025, + "step": 21159 + }, + { + "epoch": 0.18367896112012916, + "grad_norm": 1.0078125, + "learning_rate": 0.0018073100578796886, + "loss": 0.2109, + "step": 21160 + }, + { + "epoch": 0.18368764160033332, + "grad_norm": 0.09765625, + "learning_rate": 0.0018072916537727464, + "loss": 0.083, + "step": 21161 + }, + { + "epoch": 0.1836963220805375, + "grad_norm": 0.6484375, + "learning_rate": 0.0018072732488923207, + "loss": 0.1543, + "step": 21162 + }, + { + "epoch": 0.18370500256074165, + "grad_norm": 0.37109375, + "learning_rate": 0.0018072548432384314, + "loss": 0.0869, + "step": 21163 + }, + { + "epoch": 0.18371368304094582, + "grad_norm": 0.8828125, + "learning_rate": 0.0018072364368110993, + "loss": 0.085, + "step": 21164 + }, + { + "epoch": 0.18372236352114998, + "grad_norm": 0.6640625, + "learning_rate": 0.0018072180296103437, + "loss": 0.0996, + "step": 21165 + }, + { + "epoch": 0.18373104400135415, + "grad_norm": 0.08154296875, + "learning_rate": 0.0018071996216361852, + "loss": 0.0898, + "step": 21166 + }, + { + "epoch": 0.18373972448155831, + "grad_norm": 0.85546875, + "learning_rate": 0.0018071812128886437, + "loss": 0.1152, + "step": 21167 + }, + { + "epoch": 0.18374840496176248, + "grad_norm": 0.271484375, + "learning_rate": 0.0018071628033677397, + "loss": 0.1455, + "step": 21168 + }, + { + "epoch": 0.18375708544196664, + "grad_norm": 0.484375, + "learning_rate": 0.001807144393073493, + "loss": 0.1143, + "step": 21169 + }, + { + "epoch": 0.1837657659221708, + "grad_norm": 0.6015625, + "learning_rate": 0.0018071259820059235, + "loss": 0.1064, + "step": 21170 + }, + { + "epoch": 0.18377444640237497, + "grad_norm": 0.337890625, + "learning_rate": 0.0018071075701650522, + "loss": 0.1216, + "step": 21171 + }, + { + "epoch": 0.18378312688257914, + "grad_norm": 
0.109375, + "learning_rate": 0.0018070891575508981, + "loss": 0.1436, + "step": 21172 + }, + { + "epoch": 0.1837918073627833, + "grad_norm": 0.07666015625, + "learning_rate": 0.0018070707441634824, + "loss": 0.0791, + "step": 21173 + }, + { + "epoch": 0.18380048784298747, + "grad_norm": 0.55859375, + "learning_rate": 0.0018070523300028245, + "loss": 0.1807, + "step": 21174 + }, + { + "epoch": 0.18380916832319164, + "grad_norm": 0.7890625, + "learning_rate": 0.0018070339150689449, + "loss": 0.1289, + "step": 21175 + }, + { + "epoch": 0.1838178488033958, + "grad_norm": 0.75390625, + "learning_rate": 0.0018070154993618634, + "loss": 0.1455, + "step": 21176 + }, + { + "epoch": 0.18382652928359997, + "grad_norm": 0.3203125, + "learning_rate": 0.0018069970828816007, + "loss": 0.1377, + "step": 21177 + }, + { + "epoch": 0.18383520976380413, + "grad_norm": 0.1875, + "learning_rate": 0.0018069786656281764, + "loss": 0.1338, + "step": 21178 + }, + { + "epoch": 0.1838438902440083, + "grad_norm": 0.318359375, + "learning_rate": 0.0018069602476016107, + "loss": 0.1426, + "step": 21179 + }, + { + "epoch": 0.18385257072421246, + "grad_norm": 0.2314453125, + "learning_rate": 0.0018069418288019245, + "loss": 0.083, + "step": 21180 + }, + { + "epoch": 0.18386125120441663, + "grad_norm": 0.1806640625, + "learning_rate": 0.0018069234092291367, + "loss": 0.1416, + "step": 21181 + }, + { + "epoch": 0.1838699316846208, + "grad_norm": 0.24609375, + "learning_rate": 0.0018069049888832685, + "loss": 0.1221, + "step": 21182 + }, + { + "epoch": 0.18387861216482496, + "grad_norm": 0.326171875, + "learning_rate": 0.0018068865677643394, + "loss": 0.1455, + "step": 21183 + }, + { + "epoch": 0.18388729264502912, + "grad_norm": 0.76171875, + "learning_rate": 0.0018068681458723698, + "loss": 0.1318, + "step": 21184 + }, + { + "epoch": 0.1838959731252333, + "grad_norm": 0.37890625, + "learning_rate": 0.00180684972320738, + "loss": 0.105, + "step": 21185 + }, + { + "epoch": 0.18390465360543745, + "grad_norm": 0.1142578125, + "learning_rate": 0.0018068312997693897, + "loss": 0.0859, + "step": 21186 + }, + { + "epoch": 0.18391333408564162, + "grad_norm": 0.458984375, + "learning_rate": 0.0018068128755584197, + "loss": 0.123, + "step": 21187 + }, + { + "epoch": 0.18392201456584578, + "grad_norm": 0.3125, + "learning_rate": 0.0018067944505744892, + "loss": 0.1084, + "step": 21188 + }, + { + "epoch": 0.18393069504604995, + "grad_norm": 0.412109375, + "learning_rate": 0.0018067760248176193, + "loss": 0.1143, + "step": 21189 + }, + { + "epoch": 0.1839393755262541, + "grad_norm": 0.1748046875, + "learning_rate": 0.0018067575982878294, + "loss": 0.0913, + "step": 21190 + }, + { + "epoch": 0.18394805600645828, + "grad_norm": 0.2392578125, + "learning_rate": 0.0018067391709851407, + "loss": 0.0889, + "step": 21191 + }, + { + "epoch": 0.18395673648666244, + "grad_norm": 0.40234375, + "learning_rate": 0.001806720742909572, + "loss": 0.1182, + "step": 21192 + }, + { + "epoch": 0.1839654169668666, + "grad_norm": 0.306640625, + "learning_rate": 0.0018067023140611444, + "loss": 0.1191, + "step": 21193 + }, + { + "epoch": 0.18397409744707077, + "grad_norm": 0.1494140625, + "learning_rate": 0.0018066838844398774, + "loss": 0.084, + "step": 21194 + }, + { + "epoch": 0.18398277792727494, + "grad_norm": 0.7890625, + "learning_rate": 0.0018066654540457923, + "loss": 0.207, + "step": 21195 + }, + { + "epoch": 0.1839914584074791, + "grad_norm": 0.23828125, + "learning_rate": 0.0018066470228789077, + "loss": 0.1309, + "step": 21196 + }, + { + "epoch": 
0.18400013888768327, + "grad_norm": 0.265625, + "learning_rate": 0.001806628590939245, + "loss": 0.1187, + "step": 21197 + }, + { + "epoch": 0.18400881936788743, + "grad_norm": 0.66015625, + "learning_rate": 0.0018066101582268237, + "loss": 0.1836, + "step": 21198 + }, + { + "epoch": 0.1840174998480916, + "grad_norm": 0.41015625, + "learning_rate": 0.0018065917247416644, + "loss": 0.125, + "step": 21199 + }, + { + "epoch": 0.18402618032829576, + "grad_norm": 0.25390625, + "learning_rate": 0.0018065732904837865, + "loss": 0.0938, + "step": 21200 + }, + { + "epoch": 0.18403486080849993, + "grad_norm": 0.07080078125, + "learning_rate": 0.001806554855453211, + "loss": 0.0938, + "step": 21201 + }, + { + "epoch": 0.1840435412887041, + "grad_norm": 0.1142578125, + "learning_rate": 0.0018065364196499577, + "loss": 0.0869, + "step": 21202 + }, + { + "epoch": 0.18405222176890826, + "grad_norm": 0.302734375, + "learning_rate": 0.0018065179830740465, + "loss": 0.1006, + "step": 21203 + }, + { + "epoch": 0.18406090224911242, + "grad_norm": 0.09326171875, + "learning_rate": 0.0018064995457254982, + "loss": 0.1221, + "step": 21204 + }, + { + "epoch": 0.1840695827293166, + "grad_norm": 0.314453125, + "learning_rate": 0.0018064811076043325, + "loss": 0.0825, + "step": 21205 + }, + { + "epoch": 0.18407826320952075, + "grad_norm": 0.1142578125, + "learning_rate": 0.0018064626687105698, + "loss": 0.1133, + "step": 21206 + }, + { + "epoch": 0.18408694368972492, + "grad_norm": 0.251953125, + "learning_rate": 0.00180644422904423, + "loss": 0.1226, + "step": 21207 + }, + { + "epoch": 0.18409562416992908, + "grad_norm": 0.2470703125, + "learning_rate": 0.0018064257886053336, + "loss": 0.0947, + "step": 21208 + }, + { + "epoch": 0.18410430465013325, + "grad_norm": 0.0634765625, + "learning_rate": 0.0018064073473939004, + "loss": 0.1064, + "step": 21209 + }, + { + "epoch": 0.18411298513033741, + "grad_norm": 0.2353515625, + "learning_rate": 0.0018063889054099507, + "loss": 0.1318, + "step": 21210 + }, + { + "epoch": 0.18412166561054158, + "grad_norm": 1.28125, + "learning_rate": 0.001806370462653505, + "loss": 0.1367, + "step": 21211 + }, + { + "epoch": 0.18413034609074574, + "grad_norm": 0.5859375, + "learning_rate": 0.0018063520191245828, + "loss": 0.1572, + "step": 21212 + }, + { + "epoch": 0.1841390265709499, + "grad_norm": 0.1259765625, + "learning_rate": 0.0018063335748232047, + "loss": 0.1006, + "step": 21213 + }, + { + "epoch": 0.18414770705115407, + "grad_norm": 0.244140625, + "learning_rate": 0.0018063151297493907, + "loss": 0.0854, + "step": 21214 + }, + { + "epoch": 0.18415638753135824, + "grad_norm": 0.42578125, + "learning_rate": 0.0018062966839031613, + "loss": 0.0879, + "step": 21215 + }, + { + "epoch": 0.1841650680115624, + "grad_norm": 0.578125, + "learning_rate": 0.0018062782372845366, + "loss": 0.1084, + "step": 21216 + }, + { + "epoch": 0.18417374849176657, + "grad_norm": 0.5859375, + "learning_rate": 0.0018062597898935364, + "loss": 0.1074, + "step": 21217 + }, + { + "epoch": 0.18418242897197074, + "grad_norm": 0.65625, + "learning_rate": 0.0018062413417301811, + "loss": 0.1113, + "step": 21218 + }, + { + "epoch": 0.1841911094521749, + "grad_norm": 0.26171875, + "learning_rate": 0.001806222892794491, + "loss": 0.1089, + "step": 21219 + }, + { + "epoch": 0.18419978993237907, + "grad_norm": 0.130859375, + "learning_rate": 0.0018062044430864862, + "loss": 0.125, + "step": 21220 + }, + { + "epoch": 0.18420847041258323, + "grad_norm": 0.8359375, + "learning_rate": 0.0018061859926061866, + "loss": 
0.0908, + "step": 21221 + }, + { + "epoch": 0.1842171508927874, + "grad_norm": 0.484375, + "learning_rate": 0.0018061675413536128, + "loss": 0.1011, + "step": 21222 + }, + { + "epoch": 0.18422583137299156, + "grad_norm": 0.07958984375, + "learning_rate": 0.0018061490893287848, + "loss": 0.083, + "step": 21223 + }, + { + "epoch": 0.18423451185319573, + "grad_norm": 0.09912109375, + "learning_rate": 0.0018061306365317227, + "loss": 0.0986, + "step": 21224 + }, + { + "epoch": 0.1842431923333999, + "grad_norm": 0.318359375, + "learning_rate": 0.0018061121829624468, + "loss": 0.0732, + "step": 21225 + }, + { + "epoch": 0.18425187281360406, + "grad_norm": 0.51171875, + "learning_rate": 0.001806093728620977, + "loss": 0.1484, + "step": 21226 + }, + { + "epoch": 0.18426055329380822, + "grad_norm": 0.095703125, + "learning_rate": 0.001806075273507334, + "loss": 0.0864, + "step": 21227 + }, + { + "epoch": 0.1842692337740124, + "grad_norm": 0.279296875, + "learning_rate": 0.0018060568176215376, + "loss": 0.1118, + "step": 21228 + }, + { + "epoch": 0.18427791425421655, + "grad_norm": 0.1669921875, + "learning_rate": 0.0018060383609636082, + "loss": 0.0791, + "step": 21229 + }, + { + "epoch": 0.18428659473442072, + "grad_norm": 1.265625, + "learning_rate": 0.0018060199035335653, + "loss": 0.127, + "step": 21230 + }, + { + "epoch": 0.18429527521462488, + "grad_norm": 0.263671875, + "learning_rate": 0.0018060014453314302, + "loss": 0.0752, + "step": 21231 + }, + { + "epoch": 0.18430395569482905, + "grad_norm": 0.482421875, + "learning_rate": 0.0018059829863572223, + "loss": 0.1055, + "step": 21232 + }, + { + "epoch": 0.1843126361750332, + "grad_norm": 0.099609375, + "learning_rate": 0.0018059645266109623, + "loss": 0.1104, + "step": 21233 + }, + { + "epoch": 0.18432131665523738, + "grad_norm": 0.09619140625, + "learning_rate": 0.0018059460660926699, + "loss": 0.1367, + "step": 21234 + }, + { + "epoch": 0.18432999713544154, + "grad_norm": 0.0908203125, + "learning_rate": 0.0018059276048023654, + "loss": 0.1045, + "step": 21235 + }, + { + "epoch": 0.1843386776156457, + "grad_norm": 0.3046875, + "learning_rate": 0.0018059091427400692, + "loss": 0.1279, + "step": 21236 + }, + { + "epoch": 0.18434735809584987, + "grad_norm": 0.24609375, + "learning_rate": 0.0018058906799058012, + "loss": 0.1318, + "step": 21237 + }, + { + "epoch": 0.18435603857605404, + "grad_norm": 0.30078125, + "learning_rate": 0.001805872216299582, + "loss": 0.0889, + "step": 21238 + }, + { + "epoch": 0.1843647190562582, + "grad_norm": 0.306640625, + "learning_rate": 0.0018058537519214314, + "loss": 0.1348, + "step": 21239 + }, + { + "epoch": 0.18437339953646237, + "grad_norm": 0.09033203125, + "learning_rate": 0.0018058352867713697, + "loss": 0.0996, + "step": 21240 + }, + { + "epoch": 0.18438208001666653, + "grad_norm": 0.1533203125, + "learning_rate": 0.0018058168208494173, + "loss": 0.1157, + "step": 21241 + }, + { + "epoch": 0.1843907604968707, + "grad_norm": 0.490234375, + "learning_rate": 0.001805798354155594, + "loss": 0.103, + "step": 21242 + }, + { + "epoch": 0.18439944097707486, + "grad_norm": 0.4765625, + "learning_rate": 0.0018057798866899205, + "loss": 0.0708, + "step": 21243 + }, + { + "epoch": 0.18440812145727903, + "grad_norm": 0.1357421875, + "learning_rate": 0.0018057614184524165, + "loss": 0.1016, + "step": 21244 + }, + { + "epoch": 0.1844168019374832, + "grad_norm": 0.2490234375, + "learning_rate": 0.0018057429494431026, + "loss": 0.1299, + "step": 21245 + }, + { + "epoch": 0.18442548241768736, + "grad_norm": 
0.333984375, + "learning_rate": 0.0018057244796619986, + "loss": 0.1348, + "step": 21246 + }, + { + "epoch": 0.18443416289789152, + "grad_norm": 0.10009765625, + "learning_rate": 0.0018057060091091252, + "loss": 0.1387, + "step": 21247 + }, + { + "epoch": 0.1844428433780957, + "grad_norm": 0.0947265625, + "learning_rate": 0.0018056875377845019, + "loss": 0.1045, + "step": 21248 + }, + { + "epoch": 0.18445152385829985, + "grad_norm": 0.13671875, + "learning_rate": 0.0018056690656881496, + "loss": 0.0864, + "step": 21249 + }, + { + "epoch": 0.18446020433850402, + "grad_norm": 0.10400390625, + "learning_rate": 0.001805650592820088, + "loss": 0.1299, + "step": 21250 + }, + { + "epoch": 0.18446888481870818, + "grad_norm": 0.91796875, + "learning_rate": 0.0018056321191803378, + "loss": 0.1719, + "step": 21251 + }, + { + "epoch": 0.18447756529891235, + "grad_norm": 0.177734375, + "learning_rate": 0.0018056136447689188, + "loss": 0.1045, + "step": 21252 + }, + { + "epoch": 0.18448624577911651, + "grad_norm": 0.11669921875, + "learning_rate": 0.0018055951695858513, + "loss": 0.1436, + "step": 21253 + }, + { + "epoch": 0.18449492625932065, + "grad_norm": 0.49609375, + "learning_rate": 0.0018055766936311555, + "loss": 0.127, + "step": 21254 + }, + { + "epoch": 0.18450360673952482, + "grad_norm": 0.255859375, + "learning_rate": 0.0018055582169048516, + "loss": 0.1079, + "step": 21255 + }, + { + "epoch": 0.18451228721972898, + "grad_norm": 0.404296875, + "learning_rate": 0.00180553973940696, + "loss": 0.1396, + "step": 21256 + }, + { + "epoch": 0.18452096769993315, + "grad_norm": 0.96484375, + "learning_rate": 0.0018055212611375005, + "loss": 0.166, + "step": 21257 + }, + { + "epoch": 0.1845296481801373, + "grad_norm": 0.33203125, + "learning_rate": 0.001805502782096494, + "loss": 0.0815, + "step": 21258 + }, + { + "epoch": 0.18453832866034148, + "grad_norm": 0.55078125, + "learning_rate": 0.0018054843022839598, + "loss": 0.0786, + "step": 21259 + }, + { + "epoch": 0.18454700914054564, + "grad_norm": 0.32421875, + "learning_rate": 0.0018054658216999186, + "loss": 0.1064, + "step": 21260 + }, + { + "epoch": 0.1845556896207498, + "grad_norm": 0.2255859375, + "learning_rate": 0.0018054473403443906, + "loss": 0.1025, + "step": 21261 + }, + { + "epoch": 0.18456437010095397, + "grad_norm": 0.31640625, + "learning_rate": 0.0018054288582173964, + "loss": 0.1367, + "step": 21262 + }, + { + "epoch": 0.18457305058115814, + "grad_norm": 0.07421875, + "learning_rate": 0.0018054103753189553, + "loss": 0.1226, + "step": 21263 + }, + { + "epoch": 0.1845817310613623, + "grad_norm": 0.482421875, + "learning_rate": 0.0018053918916490883, + "loss": 0.0864, + "step": 21264 + }, + { + "epoch": 0.18459041154156647, + "grad_norm": 0.72265625, + "learning_rate": 0.0018053734072078149, + "loss": 0.1055, + "step": 21265 + }, + { + "epoch": 0.18459909202177063, + "grad_norm": 0.85546875, + "learning_rate": 0.0018053549219951561, + "loss": 0.1836, + "step": 21266 + }, + { + "epoch": 0.1846077725019748, + "grad_norm": 0.1943359375, + "learning_rate": 0.0018053364360111318, + "loss": 0.1055, + "step": 21267 + }, + { + "epoch": 0.18461645298217896, + "grad_norm": 0.322265625, + "learning_rate": 0.0018053179492557622, + "loss": 0.105, + "step": 21268 + }, + { + "epoch": 0.18462513346238313, + "grad_norm": 0.39453125, + "learning_rate": 0.0018052994617290673, + "loss": 0.1025, + "step": 21269 + }, + { + "epoch": 0.1846338139425873, + "grad_norm": 1.0078125, + "learning_rate": 0.0018052809734310676, + "loss": 0.125, + "step": 21270 + }, 
+ { + "epoch": 0.18464249442279146, + "grad_norm": 0.33984375, + "learning_rate": 0.001805262484361783, + "loss": 0.1182, + "step": 21271 + }, + { + "epoch": 0.18465117490299562, + "grad_norm": 0.234375, + "learning_rate": 0.0018052439945212345, + "loss": 0.1055, + "step": 21272 + }, + { + "epoch": 0.1846598553831998, + "grad_norm": 0.1279296875, + "learning_rate": 0.0018052255039094412, + "loss": 0.1133, + "step": 21273 + }, + { + "epoch": 0.18466853586340395, + "grad_norm": 0.244140625, + "learning_rate": 0.001805207012526424, + "loss": 0.0859, + "step": 21274 + }, + { + "epoch": 0.18467721634360812, + "grad_norm": 0.1201171875, + "learning_rate": 0.0018051885203722033, + "loss": 0.2168, + "step": 21275 + }, + { + "epoch": 0.18468589682381228, + "grad_norm": 0.51953125, + "learning_rate": 0.0018051700274467986, + "loss": 0.1084, + "step": 21276 + }, + { + "epoch": 0.18469457730401645, + "grad_norm": 0.2216796875, + "learning_rate": 0.001805151533750231, + "loss": 0.1504, + "step": 21277 + }, + { + "epoch": 0.18470325778422061, + "grad_norm": 0.8046875, + "learning_rate": 0.00180513303928252, + "loss": 0.1289, + "step": 21278 + }, + { + "epoch": 0.18471193826442478, + "grad_norm": 0.1328125, + "learning_rate": 0.0018051145440436862, + "loss": 0.127, + "step": 21279 + }, + { + "epoch": 0.18472061874462894, + "grad_norm": 0.2490234375, + "learning_rate": 0.0018050960480337495, + "loss": 0.104, + "step": 21280 + }, + { + "epoch": 0.1847292992248331, + "grad_norm": 0.83984375, + "learning_rate": 0.0018050775512527307, + "loss": 0.1484, + "step": 21281 + }, + { + "epoch": 0.18473797970503728, + "grad_norm": 1.375, + "learning_rate": 0.0018050590537006495, + "loss": 0.1523, + "step": 21282 + }, + { + "epoch": 0.18474666018524144, + "grad_norm": 0.7109375, + "learning_rate": 0.0018050405553775265, + "loss": 0.1221, + "step": 21283 + }, + { + "epoch": 0.1847553406654456, + "grad_norm": 0.357421875, + "learning_rate": 0.0018050220562833815, + "loss": 0.1021, + "step": 21284 + }, + { + "epoch": 0.18476402114564977, + "grad_norm": 0.09716796875, + "learning_rate": 0.001805003556418235, + "loss": 0.1396, + "step": 21285 + }, + { + "epoch": 0.18477270162585394, + "grad_norm": 0.2119140625, + "learning_rate": 0.0018049850557821073, + "loss": 0.0928, + "step": 21286 + }, + { + "epoch": 0.1847813821060581, + "grad_norm": 0.2138671875, + "learning_rate": 0.0018049665543750184, + "loss": 0.1465, + "step": 21287 + }, + { + "epoch": 0.18479006258626227, + "grad_norm": 0.1044921875, + "learning_rate": 0.0018049480521969887, + "loss": 0.1162, + "step": 21288 + }, + { + "epoch": 0.18479874306646643, + "grad_norm": 0.142578125, + "learning_rate": 0.0018049295492480384, + "loss": 0.084, + "step": 21289 + }, + { + "epoch": 0.1848074235466706, + "grad_norm": 0.65234375, + "learning_rate": 0.001804911045528188, + "loss": 0.1426, + "step": 21290 + }, + { + "epoch": 0.18481610402687476, + "grad_norm": 0.12060546875, + "learning_rate": 0.0018048925410374572, + "loss": 0.0879, + "step": 21291 + }, + { + "epoch": 0.18482478450707893, + "grad_norm": 0.474609375, + "learning_rate": 0.0018048740357758666, + "loss": 0.0996, + "step": 21292 + }, + { + "epoch": 0.1848334649872831, + "grad_norm": 0.5859375, + "learning_rate": 0.0018048555297434366, + "loss": 0.0977, + "step": 21293 + }, + { + "epoch": 0.18484214546748726, + "grad_norm": 0.11669921875, + "learning_rate": 0.0018048370229401868, + "loss": 0.0698, + "step": 21294 + }, + { + "epoch": 0.18485082594769142, + "grad_norm": 0.431640625, + "learning_rate": 
0.0018048185153661377, + "loss": 0.1484, + "step": 21295 + }, + { + "epoch": 0.1848595064278956, + "grad_norm": 0.1669921875, + "learning_rate": 0.0018048000070213102, + "loss": 0.085, + "step": 21296 + }, + { + "epoch": 0.18486818690809975, + "grad_norm": 0.35546875, + "learning_rate": 0.0018047814979057235, + "loss": 0.1025, + "step": 21297 + }, + { + "epoch": 0.18487686738830392, + "grad_norm": 0.208984375, + "learning_rate": 0.0018047629880193986, + "loss": 0.1523, + "step": 21298 + }, + { + "epoch": 0.18488554786850808, + "grad_norm": 0.1005859375, + "learning_rate": 0.0018047444773623556, + "loss": 0.1406, + "step": 21299 + }, + { + "epoch": 0.18489422834871225, + "grad_norm": 0.1796875, + "learning_rate": 0.0018047259659346143, + "loss": 0.1279, + "step": 21300 + }, + { + "epoch": 0.1849029088289164, + "grad_norm": 0.251953125, + "learning_rate": 0.0018047074537361953, + "loss": 0.1348, + "step": 21301 + }, + { + "epoch": 0.18491158930912058, + "grad_norm": 0.43359375, + "learning_rate": 0.001804688940767119, + "loss": 0.0815, + "step": 21302 + }, + { + "epoch": 0.18492026978932474, + "grad_norm": 0.48828125, + "learning_rate": 0.0018046704270274054, + "loss": 0.084, + "step": 21303 + }, + { + "epoch": 0.1849289502695289, + "grad_norm": 0.54296875, + "learning_rate": 0.001804651912517075, + "loss": 0.123, + "step": 21304 + }, + { + "epoch": 0.18493763074973307, + "grad_norm": 0.490234375, + "learning_rate": 0.0018046333972361476, + "loss": 0.085, + "step": 21305 + }, + { + "epoch": 0.18494631122993724, + "grad_norm": 0.09033203125, + "learning_rate": 0.0018046148811846437, + "loss": 0.1338, + "step": 21306 + }, + { + "epoch": 0.1849549917101414, + "grad_norm": 0.58203125, + "learning_rate": 0.0018045963643625836, + "loss": 0.1152, + "step": 21307 + }, + { + "epoch": 0.18496367219034557, + "grad_norm": 0.58203125, + "learning_rate": 0.0018045778467699874, + "loss": 0.1377, + "step": 21308 + }, + { + "epoch": 0.18497235267054973, + "grad_norm": 0.79296875, + "learning_rate": 0.0018045593284068755, + "loss": 0.1133, + "step": 21309 + }, + { + "epoch": 0.1849810331507539, + "grad_norm": 0.08349609375, + "learning_rate": 0.0018045408092732683, + "loss": 0.1113, + "step": 21310 + }, + { + "epoch": 0.18498971363095806, + "grad_norm": 0.55859375, + "learning_rate": 0.0018045222893691858, + "loss": 0.0938, + "step": 21311 + }, + { + "epoch": 0.18499839411116223, + "grad_norm": 0.26953125, + "learning_rate": 0.0018045037686946481, + "loss": 0.0771, + "step": 21312 + }, + { + "epoch": 0.1850070745913664, + "grad_norm": 0.34375, + "learning_rate": 0.0018044852472496758, + "loss": 0.125, + "step": 21313 + }, + { + "epoch": 0.18501575507157056, + "grad_norm": 0.181640625, + "learning_rate": 0.001804466725034289, + "loss": 0.1582, + "step": 21314 + }, + { + "epoch": 0.18502443555177472, + "grad_norm": 0.130859375, + "learning_rate": 0.0018044482020485083, + "loss": 0.1348, + "step": 21315 + }, + { + "epoch": 0.1850331160319789, + "grad_norm": 0.90234375, + "learning_rate": 0.0018044296782923532, + "loss": 0.208, + "step": 21316 + }, + { + "epoch": 0.18504179651218305, + "grad_norm": 0.28125, + "learning_rate": 0.0018044111537658446, + "loss": 0.0962, + "step": 21317 + }, + { + "epoch": 0.18505047699238722, + "grad_norm": 0.515625, + "learning_rate": 0.0018043926284690025, + "loss": 0.063, + "step": 21318 + }, + { + "epoch": 0.18505915747259138, + "grad_norm": 0.0869140625, + "learning_rate": 0.001804374102401847, + "loss": 0.0791, + "step": 21319 + }, + { + "epoch": 0.18506783795279555, + 
"grad_norm": 0.330078125, + "learning_rate": 0.0018043555755643988, + "loss": 0.1084, + "step": 21320 + }, + { + "epoch": 0.18507651843299971, + "grad_norm": 0.765625, + "learning_rate": 0.001804337047956678, + "loss": 0.1064, + "step": 21321 + }, + { + "epoch": 0.18508519891320388, + "grad_norm": 0.173828125, + "learning_rate": 0.0018043185195787043, + "loss": 0.1357, + "step": 21322 + }, + { + "epoch": 0.18509387939340805, + "grad_norm": 0.10986328125, + "learning_rate": 0.0018042999904304988, + "loss": 0.1172, + "step": 21323 + }, + { + "epoch": 0.1851025598736122, + "grad_norm": 0.0859375, + "learning_rate": 0.0018042814605120816, + "loss": 0.1055, + "step": 21324 + }, + { + "epoch": 0.18511124035381638, + "grad_norm": 0.1416015625, + "learning_rate": 0.0018042629298234728, + "loss": 0.0967, + "step": 21325 + }, + { + "epoch": 0.18511992083402054, + "grad_norm": 0.92578125, + "learning_rate": 0.0018042443983646921, + "loss": 0.1113, + "step": 21326 + }, + { + "epoch": 0.1851286013142247, + "grad_norm": 0.546875, + "learning_rate": 0.0018042258661357608, + "loss": 0.1406, + "step": 21327 + }, + { + "epoch": 0.18513728179442887, + "grad_norm": 0.2275390625, + "learning_rate": 0.0018042073331366985, + "loss": 0.1021, + "step": 21328 + }, + { + "epoch": 0.18514596227463304, + "grad_norm": 0.2177734375, + "learning_rate": 0.0018041887993675257, + "loss": 0.0898, + "step": 21329 + }, + { + "epoch": 0.1851546427548372, + "grad_norm": 0.0791015625, + "learning_rate": 0.0018041702648282625, + "loss": 0.0859, + "step": 21330 + }, + { + "epoch": 0.18516332323504137, + "grad_norm": 0.0986328125, + "learning_rate": 0.0018041517295189291, + "loss": 0.0942, + "step": 21331 + }, + { + "epoch": 0.18517200371524553, + "grad_norm": 0.3046875, + "learning_rate": 0.0018041331934395464, + "loss": 0.1201, + "step": 21332 + }, + { + "epoch": 0.1851806841954497, + "grad_norm": 0.27734375, + "learning_rate": 0.0018041146565901342, + "loss": 0.1152, + "step": 21333 + }, + { + "epoch": 0.18518936467565386, + "grad_norm": 0.2470703125, + "learning_rate": 0.0018040961189707127, + "loss": 0.0869, + "step": 21334 + }, + { + "epoch": 0.18519804515585803, + "grad_norm": 0.43359375, + "learning_rate": 0.0018040775805813019, + "loss": 0.1099, + "step": 21335 + }, + { + "epoch": 0.1852067256360622, + "grad_norm": 0.07373046875, + "learning_rate": 0.001804059041421923, + "loss": 0.0869, + "step": 21336 + }, + { + "epoch": 0.18521540611626636, + "grad_norm": 0.1201171875, + "learning_rate": 0.001804040501492595, + "loss": 0.1289, + "step": 21337 + }, + { + "epoch": 0.18522408659647052, + "grad_norm": 0.30078125, + "learning_rate": 0.0018040219607933395, + "loss": 0.0986, + "step": 21338 + }, + { + "epoch": 0.1852327670766747, + "grad_norm": 0.3046875, + "learning_rate": 0.0018040034193241758, + "loss": 0.1582, + "step": 21339 + }, + { + "epoch": 0.18524144755687885, + "grad_norm": 0.0869140625, + "learning_rate": 0.0018039848770851248, + "loss": 0.1113, + "step": 21340 + }, + { + "epoch": 0.18525012803708302, + "grad_norm": 0.13671875, + "learning_rate": 0.0018039663340762063, + "loss": 0.0801, + "step": 21341 + }, + { + "epoch": 0.18525880851728718, + "grad_norm": 0.1318359375, + "learning_rate": 0.0018039477902974407, + "loss": 0.1152, + "step": 21342 + }, + { + "epoch": 0.18526748899749135, + "grad_norm": 0.11328125, + "learning_rate": 0.001803929245748849, + "loss": 0.1235, + "step": 21343 + }, + { + "epoch": 0.1852761694776955, + "grad_norm": 0.34765625, + "learning_rate": 0.0018039107004304506, + "loss": 0.1182, + 
"step": 21344 + }, + { + "epoch": 0.18528484995789968, + "grad_norm": 0.115234375, + "learning_rate": 0.0018038921543422658, + "loss": 0.1064, + "step": 21345 + }, + { + "epoch": 0.18529353043810384, + "grad_norm": 0.087890625, + "learning_rate": 0.001803873607484315, + "loss": 0.0918, + "step": 21346 + }, + { + "epoch": 0.185302210918308, + "grad_norm": 0.083984375, + "learning_rate": 0.0018038550598566186, + "loss": 0.1196, + "step": 21347 + }, + { + "epoch": 0.18531089139851217, + "grad_norm": 0.470703125, + "learning_rate": 0.0018038365114591973, + "loss": 0.103, + "step": 21348 + }, + { + "epoch": 0.18531957187871634, + "grad_norm": 0.384765625, + "learning_rate": 0.0018038179622920706, + "loss": 0.127, + "step": 21349 + }, + { + "epoch": 0.1853282523589205, + "grad_norm": 0.3671875, + "learning_rate": 0.0018037994123552593, + "loss": 0.0742, + "step": 21350 + }, + { + "epoch": 0.18533693283912467, + "grad_norm": 0.15234375, + "learning_rate": 0.0018037808616487834, + "loss": 0.126, + "step": 21351 + }, + { + "epoch": 0.18534561331932883, + "grad_norm": 0.6015625, + "learning_rate": 0.0018037623101726634, + "loss": 0.1514, + "step": 21352 + }, + { + "epoch": 0.185354293799533, + "grad_norm": 0.1083984375, + "learning_rate": 0.0018037437579269198, + "loss": 0.1211, + "step": 21353 + }, + { + "epoch": 0.18536297427973716, + "grad_norm": 0.486328125, + "learning_rate": 0.0018037252049115722, + "loss": 0.1064, + "step": 21354 + }, + { + "epoch": 0.18537165475994133, + "grad_norm": 0.6640625, + "learning_rate": 0.0018037066511266413, + "loss": 0.1162, + "step": 21355 + }, + { + "epoch": 0.1853803352401455, + "grad_norm": 0.447265625, + "learning_rate": 0.0018036880965721475, + "loss": 0.0879, + "step": 21356 + }, + { + "epoch": 0.18538901572034966, + "grad_norm": 0.2275390625, + "learning_rate": 0.0018036695412481111, + "loss": 0.1357, + "step": 21357 + }, + { + "epoch": 0.18539769620055382, + "grad_norm": 0.1201171875, + "learning_rate": 0.001803650985154552, + "loss": 0.103, + "step": 21358 + }, + { + "epoch": 0.185406376680758, + "grad_norm": 0.80078125, + "learning_rate": 0.0018036324282914914, + "loss": 0.1221, + "step": 21359 + }, + { + "epoch": 0.18541505716096215, + "grad_norm": 0.3046875, + "learning_rate": 0.0018036138706589483, + "loss": 0.1187, + "step": 21360 + }, + { + "epoch": 0.18542373764116632, + "grad_norm": 0.1533203125, + "learning_rate": 0.0018035953122569437, + "loss": 0.1138, + "step": 21361 + }, + { + "epoch": 0.18543241812137048, + "grad_norm": 0.345703125, + "learning_rate": 0.0018035767530854981, + "loss": 0.1406, + "step": 21362 + }, + { + "epoch": 0.18544109860157465, + "grad_norm": 0.1513671875, + "learning_rate": 0.0018035581931446312, + "loss": 0.084, + "step": 21363 + }, + { + "epoch": 0.18544977908177881, + "grad_norm": 0.1279296875, + "learning_rate": 0.001803539632434364, + "loss": 0.1055, + "step": 21364 + }, + { + "epoch": 0.18545845956198298, + "grad_norm": 1.328125, + "learning_rate": 0.0018035210709547158, + "loss": 0.127, + "step": 21365 + }, + { + "epoch": 0.18546714004218715, + "grad_norm": 0.216796875, + "learning_rate": 0.0018035025087057082, + "loss": 0.0942, + "step": 21366 + }, + { + "epoch": 0.1854758205223913, + "grad_norm": 0.154296875, + "learning_rate": 0.0018034839456873607, + "loss": 0.1025, + "step": 21367 + }, + { + "epoch": 0.18548450100259548, + "grad_norm": 0.43359375, + "learning_rate": 0.0018034653818996935, + "loss": 0.1064, + "step": 21368 + }, + { + "epoch": 0.18549318148279964, + "grad_norm": 0.220703125, + 
"learning_rate": 0.0018034468173427271, + "loss": 0.1104, + "step": 21369 + }, + { + "epoch": 0.1855018619630038, + "grad_norm": 0.228515625, + "learning_rate": 0.0018034282520164823, + "loss": 0.1152, + "step": 21370 + }, + { + "epoch": 0.18551054244320797, + "grad_norm": 0.275390625, + "learning_rate": 0.0018034096859209784, + "loss": 0.0708, + "step": 21371 + }, + { + "epoch": 0.18551922292341214, + "grad_norm": 0.32421875, + "learning_rate": 0.0018033911190562365, + "loss": 0.0908, + "step": 21372 + }, + { + "epoch": 0.1855279034036163, + "grad_norm": 0.396484375, + "learning_rate": 0.0018033725514222767, + "loss": 0.1016, + "step": 21373 + }, + { + "epoch": 0.18553658388382047, + "grad_norm": 0.275390625, + "learning_rate": 0.001803353983019119, + "loss": 0.1533, + "step": 21374 + }, + { + "epoch": 0.18554526436402463, + "grad_norm": 0.2197265625, + "learning_rate": 0.0018033354138467842, + "loss": 0.1387, + "step": 21375 + }, + { + "epoch": 0.1855539448442288, + "grad_norm": 0.2041015625, + "learning_rate": 0.0018033168439052921, + "loss": 0.0962, + "step": 21376 + }, + { + "epoch": 0.18556262532443293, + "grad_norm": 0.09521484375, + "learning_rate": 0.0018032982731946633, + "loss": 0.1299, + "step": 21377 + }, + { + "epoch": 0.1855713058046371, + "grad_norm": 0.2353515625, + "learning_rate": 0.0018032797017149185, + "loss": 0.1309, + "step": 21378 + }, + { + "epoch": 0.18557998628484126, + "grad_norm": 0.119140625, + "learning_rate": 0.0018032611294660772, + "loss": 0.1357, + "step": 21379 + }, + { + "epoch": 0.18558866676504543, + "grad_norm": 0.1748046875, + "learning_rate": 0.00180324255644816, + "loss": 0.1162, + "step": 21380 + }, + { + "epoch": 0.1855973472452496, + "grad_norm": 0.3046875, + "learning_rate": 0.0018032239826611875, + "loss": 0.124, + "step": 21381 + }, + { + "epoch": 0.18560602772545376, + "grad_norm": 0.361328125, + "learning_rate": 0.0018032054081051799, + "loss": 0.1221, + "step": 21382 + }, + { + "epoch": 0.18561470820565792, + "grad_norm": 0.453125, + "learning_rate": 0.0018031868327801574, + "loss": 0.082, + "step": 21383 + }, + { + "epoch": 0.1856233886858621, + "grad_norm": 0.283203125, + "learning_rate": 0.00180316825668614, + "loss": 0.1084, + "step": 21384 + }, + { + "epoch": 0.18563206916606625, + "grad_norm": 0.15234375, + "learning_rate": 0.0018031496798231487, + "loss": 0.1216, + "step": 21385 + }, + { + "epoch": 0.18564074964627042, + "grad_norm": 0.11328125, + "learning_rate": 0.0018031311021912031, + "loss": 0.0889, + "step": 21386 + }, + { + "epoch": 0.18564943012647458, + "grad_norm": 0.142578125, + "learning_rate": 0.0018031125237903244, + "loss": 0.0908, + "step": 21387 + }, + { + "epoch": 0.18565811060667875, + "grad_norm": 0.296875, + "learning_rate": 0.0018030939446205322, + "loss": 0.1221, + "step": 21388 + }, + { + "epoch": 0.18566679108688292, + "grad_norm": 0.77734375, + "learning_rate": 0.001803075364681847, + "loss": 0.0996, + "step": 21389 + }, + { + "epoch": 0.18567547156708708, + "grad_norm": 0.4375, + "learning_rate": 0.001803056783974289, + "loss": 0.1069, + "step": 21390 + }, + { + "epoch": 0.18568415204729125, + "grad_norm": 0.578125, + "learning_rate": 0.001803038202497879, + "loss": 0.1074, + "step": 21391 + }, + { + "epoch": 0.1856928325274954, + "grad_norm": 0.34765625, + "learning_rate": 0.001803019620252637, + "loss": 0.1162, + "step": 21392 + }, + { + "epoch": 0.18570151300769958, + "grad_norm": 0.30078125, + "learning_rate": 0.001803001037238583, + "loss": 0.1094, + "step": 21393 + }, + { + "epoch": 
0.18571019348790374, + "grad_norm": 0.79296875, + "learning_rate": 0.0018029824534557377, + "loss": 0.0981, + "step": 21394 + }, + { + "epoch": 0.1857188739681079, + "grad_norm": 0.1806640625, + "learning_rate": 0.0018029638689041215, + "loss": 0.1201, + "step": 21395 + }, + { + "epoch": 0.18572755444831207, + "grad_norm": 0.298828125, + "learning_rate": 0.0018029452835837546, + "loss": 0.1162, + "step": 21396 + }, + { + "epoch": 0.18573623492851624, + "grad_norm": 0.6875, + "learning_rate": 0.001802926697494657, + "loss": 0.1084, + "step": 21397 + }, + { + "epoch": 0.1857449154087204, + "grad_norm": 0.306640625, + "learning_rate": 0.0018029081106368495, + "loss": 0.1152, + "step": 21398 + }, + { + "epoch": 0.18575359588892457, + "grad_norm": 0.259765625, + "learning_rate": 0.001802889523010352, + "loss": 0.1045, + "step": 21399 + }, + { + "epoch": 0.18576227636912873, + "grad_norm": 0.48828125, + "learning_rate": 0.0018028709346151856, + "loss": 0.1074, + "step": 21400 + }, + { + "epoch": 0.1857709568493329, + "grad_norm": 0.1279296875, + "learning_rate": 0.0018028523454513697, + "loss": 0.1416, + "step": 21401 + }, + { + "epoch": 0.18577963732953706, + "grad_norm": 0.6328125, + "learning_rate": 0.0018028337555189252, + "loss": 0.1309, + "step": 21402 + }, + { + "epoch": 0.18578831780974123, + "grad_norm": 0.2099609375, + "learning_rate": 0.0018028151648178722, + "loss": 0.0947, + "step": 21403 + }, + { + "epoch": 0.1857969982899454, + "grad_norm": 0.1962890625, + "learning_rate": 0.0018027965733482313, + "loss": 0.0908, + "step": 21404 + }, + { + "epoch": 0.18580567877014956, + "grad_norm": 0.126953125, + "learning_rate": 0.0018027779811100224, + "loss": 0.125, + "step": 21405 + }, + { + "epoch": 0.18581435925035372, + "grad_norm": 0.6796875, + "learning_rate": 0.001802759388103266, + "loss": 0.1553, + "step": 21406 + }, + { + "epoch": 0.1858230397305579, + "grad_norm": 0.26953125, + "learning_rate": 0.0018027407943279827, + "loss": 0.1221, + "step": 21407 + }, + { + "epoch": 0.18583172021076205, + "grad_norm": 0.2578125, + "learning_rate": 0.0018027221997841925, + "loss": 0.1069, + "step": 21408 + }, + { + "epoch": 0.18584040069096622, + "grad_norm": 0.490234375, + "learning_rate": 0.0018027036044719158, + "loss": 0.166, + "step": 21409 + }, + { + "epoch": 0.18584908117117038, + "grad_norm": 0.66015625, + "learning_rate": 0.0018026850083911732, + "loss": 0.0981, + "step": 21410 + }, + { + "epoch": 0.18585776165137455, + "grad_norm": 0.298828125, + "learning_rate": 0.0018026664115419846, + "loss": 0.106, + "step": 21411 + }, + { + "epoch": 0.1858664421315787, + "grad_norm": 0.1357421875, + "learning_rate": 0.0018026478139243707, + "loss": 0.0898, + "step": 21412 + }, + { + "epoch": 0.18587512261178288, + "grad_norm": 0.75, + "learning_rate": 0.001802629215538352, + "loss": 0.1836, + "step": 21413 + }, + { + "epoch": 0.18588380309198704, + "grad_norm": 0.158203125, + "learning_rate": 0.0018026106163839483, + "loss": 0.1191, + "step": 21414 + }, + { + "epoch": 0.1858924835721912, + "grad_norm": 0.12890625, + "learning_rate": 0.0018025920164611799, + "loss": 0.1426, + "step": 21415 + }, + { + "epoch": 0.18590116405239537, + "grad_norm": 0.4765625, + "learning_rate": 0.0018025734157700675, + "loss": 0.0703, + "step": 21416 + }, + { + "epoch": 0.18590984453259954, + "grad_norm": 0.5078125, + "learning_rate": 0.0018025548143106316, + "loss": 0.125, + "step": 21417 + }, + { + "epoch": 0.1859185250128037, + "grad_norm": 0.84765625, + "learning_rate": 0.0018025362120828922, + "loss": 0.083, + 
"step": 21418 + }, + { + "epoch": 0.18592720549300787, + "grad_norm": 0.65234375, + "learning_rate": 0.0018025176090868697, + "loss": 0.1152, + "step": 21419 + }, + { + "epoch": 0.18593588597321203, + "grad_norm": 0.3515625, + "learning_rate": 0.0018024990053225848, + "loss": 0.1104, + "step": 21420 + }, + { + "epoch": 0.1859445664534162, + "grad_norm": 0.38671875, + "learning_rate": 0.0018024804007900574, + "loss": 0.1055, + "step": 21421 + }, + { + "epoch": 0.18595324693362036, + "grad_norm": 0.123046875, + "learning_rate": 0.0018024617954893079, + "loss": 0.1201, + "step": 21422 + }, + { + "epoch": 0.18596192741382453, + "grad_norm": 0.330078125, + "learning_rate": 0.0018024431894203565, + "loss": 0.0913, + "step": 21423 + }, + { + "epoch": 0.1859706078940287, + "grad_norm": 0.12158203125, + "learning_rate": 0.0018024245825832245, + "loss": 0.0986, + "step": 21424 + }, + { + "epoch": 0.18597928837423286, + "grad_norm": 0.173828125, + "learning_rate": 0.0018024059749779308, + "loss": 0.1289, + "step": 21425 + }, + { + "epoch": 0.18598796885443702, + "grad_norm": 0.06298828125, + "learning_rate": 0.0018023873666044967, + "loss": 0.0811, + "step": 21426 + }, + { + "epoch": 0.1859966493346412, + "grad_norm": 0.3125, + "learning_rate": 0.0018023687574629426, + "loss": 0.1108, + "step": 21427 + }, + { + "epoch": 0.18600532981484535, + "grad_norm": 0.435546875, + "learning_rate": 0.0018023501475532883, + "loss": 0.084, + "step": 21428 + }, + { + "epoch": 0.18601401029504952, + "grad_norm": 0.44140625, + "learning_rate": 0.0018023315368755543, + "loss": 0.1318, + "step": 21429 + }, + { + "epoch": 0.18602269077525369, + "grad_norm": 0.205078125, + "learning_rate": 0.0018023129254297614, + "loss": 0.1377, + "step": 21430 + }, + { + "epoch": 0.18603137125545785, + "grad_norm": 0.251953125, + "learning_rate": 0.0018022943132159296, + "loss": 0.1006, + "step": 21431 + }, + { + "epoch": 0.18604005173566202, + "grad_norm": 0.2353515625, + "learning_rate": 0.0018022757002340794, + "loss": 0.0957, + "step": 21432 + }, + { + "epoch": 0.18604873221586618, + "grad_norm": 0.44140625, + "learning_rate": 0.0018022570864842304, + "loss": 0.124, + "step": 21433 + }, + { + "epoch": 0.18605741269607035, + "grad_norm": 0.2265625, + "learning_rate": 0.0018022384719664042, + "loss": 0.0967, + "step": 21434 + }, + { + "epoch": 0.1860660931762745, + "grad_norm": 0.51171875, + "learning_rate": 0.0018022198566806204, + "loss": 0.1045, + "step": 21435 + }, + { + "epoch": 0.18607477365647868, + "grad_norm": 0.40625, + "learning_rate": 0.0018022012406268995, + "loss": 0.0962, + "step": 21436 + }, + { + "epoch": 0.18608345413668284, + "grad_norm": 0.12158203125, + "learning_rate": 0.001802182623805262, + "loss": 0.1094, + "step": 21437 + }, + { + "epoch": 0.186092134616887, + "grad_norm": 0.3203125, + "learning_rate": 0.001802164006215728, + "loss": 0.1001, + "step": 21438 + }, + { + "epoch": 0.18610081509709117, + "grad_norm": 0.267578125, + "learning_rate": 0.001802145387858318, + "loss": 0.0742, + "step": 21439 + }, + { + "epoch": 0.18610949557729534, + "grad_norm": 0.5234375, + "learning_rate": 0.0018021267687330523, + "loss": 0.0908, + "step": 21440 + }, + { + "epoch": 0.1861181760574995, + "grad_norm": 0.330078125, + "learning_rate": 0.0018021081488399516, + "loss": 0.1104, + "step": 21441 + }, + { + "epoch": 0.18612685653770367, + "grad_norm": 0.34375, + "learning_rate": 0.0018020895281790357, + "loss": 0.127, + "step": 21442 + }, + { + "epoch": 0.18613553701790783, + "grad_norm": 0.8828125, + "learning_rate": 
0.0018020709067503252, + "loss": 0.0967, + "step": 21443 + }, + { + "epoch": 0.186144217498112, + "grad_norm": 0.54296875, + "learning_rate": 0.0018020522845538407, + "loss": 0.1006, + "step": 21444 + }, + { + "epoch": 0.18615289797831616, + "grad_norm": 0.298828125, + "learning_rate": 0.0018020336615896025, + "loss": 0.0908, + "step": 21445 + }, + { + "epoch": 0.18616157845852033, + "grad_norm": 0.2021484375, + "learning_rate": 0.0018020150378576304, + "loss": 0.1387, + "step": 21446 + }, + { + "epoch": 0.1861702589387245, + "grad_norm": 0.36328125, + "learning_rate": 0.0018019964133579455, + "loss": 0.1143, + "step": 21447 + }, + { + "epoch": 0.18617893941892866, + "grad_norm": 0.24609375, + "learning_rate": 0.0018019777880905676, + "loss": 0.0933, + "step": 21448 + }, + { + "epoch": 0.18618761989913282, + "grad_norm": 0.765625, + "learning_rate": 0.0018019591620555176, + "loss": 0.1025, + "step": 21449 + }, + { + "epoch": 0.186196300379337, + "grad_norm": 0.56640625, + "learning_rate": 0.0018019405352528155, + "loss": 0.1289, + "step": 21450 + }, + { + "epoch": 0.18620498085954115, + "grad_norm": 0.69921875, + "learning_rate": 0.001801921907682482, + "loss": 0.1035, + "step": 21451 + }, + { + "epoch": 0.18621366133974532, + "grad_norm": 0.28515625, + "learning_rate": 0.001801903279344537, + "loss": 0.0918, + "step": 21452 + }, + { + "epoch": 0.18622234181994948, + "grad_norm": 0.357421875, + "learning_rate": 0.0018018846502390012, + "loss": 0.1289, + "step": 21453 + }, + { + "epoch": 0.18623102230015365, + "grad_norm": 0.40625, + "learning_rate": 0.0018018660203658948, + "loss": 0.0947, + "step": 21454 + }, + { + "epoch": 0.1862397027803578, + "grad_norm": 0.11474609375, + "learning_rate": 0.0018018473897252383, + "loss": 0.125, + "step": 21455 + }, + { + "epoch": 0.18624838326056198, + "grad_norm": 0.10986328125, + "learning_rate": 0.001801828758317052, + "loss": 0.0679, + "step": 21456 + }, + { + "epoch": 0.18625706374076614, + "grad_norm": 0.380859375, + "learning_rate": 0.0018018101261413567, + "loss": 0.1104, + "step": 21457 + }, + { + "epoch": 0.1862657442209703, + "grad_norm": 0.66015625, + "learning_rate": 0.001801791493198172, + "loss": 0.1562, + "step": 21458 + }, + { + "epoch": 0.18627442470117447, + "grad_norm": 0.498046875, + "learning_rate": 0.0018017728594875186, + "loss": 0.1055, + "step": 21459 + }, + { + "epoch": 0.18628310518137864, + "grad_norm": 0.1923828125, + "learning_rate": 0.0018017542250094174, + "loss": 0.1128, + "step": 21460 + }, + { + "epoch": 0.1862917856615828, + "grad_norm": 0.2314453125, + "learning_rate": 0.001801735589763888, + "loss": 0.1162, + "step": 21461 + }, + { + "epoch": 0.18630046614178697, + "grad_norm": 0.1650390625, + "learning_rate": 0.001801716953750951, + "loss": 0.1592, + "step": 21462 + }, + { + "epoch": 0.18630914662199113, + "grad_norm": 0.376953125, + "learning_rate": 0.0018016983169706268, + "loss": 0.1621, + "step": 21463 + }, + { + "epoch": 0.1863178271021953, + "grad_norm": 0.2578125, + "learning_rate": 0.001801679679422936, + "loss": 0.125, + "step": 21464 + }, + { + "epoch": 0.18632650758239946, + "grad_norm": 0.373046875, + "learning_rate": 0.0018016610411078992, + "loss": 0.1206, + "step": 21465 + }, + { + "epoch": 0.18633518806260363, + "grad_norm": 0.09228515625, + "learning_rate": 0.001801642402025536, + "loss": 0.1016, + "step": 21466 + }, + { + "epoch": 0.1863438685428078, + "grad_norm": 0.375, + "learning_rate": 0.0018016237621758674, + "loss": 0.0825, + "step": 21467 + }, + { + "epoch": 0.18635254902301196, + 
"grad_norm": 0.61328125, + "learning_rate": 0.0018016051215589133, + "loss": 0.1504, + "step": 21468 + }, + { + "epoch": 0.18636122950321612, + "grad_norm": 0.107421875, + "learning_rate": 0.001801586480174695, + "loss": 0.0918, + "step": 21469 + }, + { + "epoch": 0.1863699099834203, + "grad_norm": 0.0791015625, + "learning_rate": 0.001801567838023232, + "loss": 0.1172, + "step": 21470 + }, + { + "epoch": 0.18637859046362446, + "grad_norm": 0.08837890625, + "learning_rate": 0.0018015491951045447, + "loss": 0.1152, + "step": 21471 + }, + { + "epoch": 0.18638727094382862, + "grad_norm": 0.80859375, + "learning_rate": 0.001801530551418654, + "loss": 0.1318, + "step": 21472 + }, + { + "epoch": 0.18639595142403279, + "grad_norm": 0.46875, + "learning_rate": 0.0018015119069655797, + "loss": 0.1221, + "step": 21473 + }, + { + "epoch": 0.18640463190423695, + "grad_norm": 0.1640625, + "learning_rate": 0.0018014932617453428, + "loss": 0.1191, + "step": 21474 + }, + { + "epoch": 0.18641331238444112, + "grad_norm": 0.484375, + "learning_rate": 0.0018014746157579632, + "loss": 0.0938, + "step": 21475 + }, + { + "epoch": 0.18642199286464528, + "grad_norm": 0.498046875, + "learning_rate": 0.0018014559690034617, + "loss": 0.0728, + "step": 21476 + }, + { + "epoch": 0.18643067334484945, + "grad_norm": 0.365234375, + "learning_rate": 0.0018014373214818584, + "loss": 0.0996, + "step": 21477 + }, + { + "epoch": 0.1864393538250536, + "grad_norm": 0.1513671875, + "learning_rate": 0.0018014186731931741, + "loss": 0.1543, + "step": 21478 + }, + { + "epoch": 0.18644803430525778, + "grad_norm": 0.63671875, + "learning_rate": 0.0018014000241374285, + "loss": 0.1172, + "step": 21479 + }, + { + "epoch": 0.18645671478546194, + "grad_norm": 0.1591796875, + "learning_rate": 0.0018013813743146426, + "loss": 0.105, + "step": 21480 + }, + { + "epoch": 0.1864653952656661, + "grad_norm": 0.11865234375, + "learning_rate": 0.0018013627237248363, + "loss": 0.168, + "step": 21481 + }, + { + "epoch": 0.18647407574587027, + "grad_norm": 1.1328125, + "learning_rate": 0.0018013440723680303, + "loss": 0.1245, + "step": 21482 + }, + { + "epoch": 0.18648275622607444, + "grad_norm": 0.294921875, + "learning_rate": 0.0018013254202442453, + "loss": 0.1338, + "step": 21483 + }, + { + "epoch": 0.1864914367062786, + "grad_norm": 0.271484375, + "learning_rate": 0.001801306767353501, + "loss": 0.1201, + "step": 21484 + }, + { + "epoch": 0.18650011718648277, + "grad_norm": 0.51953125, + "learning_rate": 0.0018012881136958184, + "loss": 0.1377, + "step": 21485 + }, + { + "epoch": 0.18650879766668693, + "grad_norm": 0.1474609375, + "learning_rate": 0.0018012694592712173, + "loss": 0.1631, + "step": 21486 + }, + { + "epoch": 0.1865174781468911, + "grad_norm": 0.1875, + "learning_rate": 0.001801250804079719, + "loss": 0.0977, + "step": 21487 + }, + { + "epoch": 0.18652615862709526, + "grad_norm": 0.23828125, + "learning_rate": 0.0018012321481213427, + "loss": 0.1377, + "step": 21488 + }, + { + "epoch": 0.18653483910729943, + "grad_norm": 0.81640625, + "learning_rate": 0.00180121349139611, + "loss": 0.1777, + "step": 21489 + }, + { + "epoch": 0.1865435195875036, + "grad_norm": 0.33984375, + "learning_rate": 0.0018011948339040405, + "loss": 0.0879, + "step": 21490 + }, + { + "epoch": 0.18655220006770776, + "grad_norm": 0.490234375, + "learning_rate": 0.0018011761756451547, + "loss": 0.0889, + "step": 21491 + }, + { + "epoch": 0.18656088054791192, + "grad_norm": 0.353515625, + "learning_rate": 0.0018011575166194736, + "loss": 0.1133, + "step": 21492 + 
}, + { + "epoch": 0.1865695610281161, + "grad_norm": 0.189453125, + "learning_rate": 0.001801138856827017, + "loss": 0.1069, + "step": 21493 + }, + { + "epoch": 0.18657824150832025, + "grad_norm": 0.310546875, + "learning_rate": 0.0018011201962678054, + "loss": 0.0967, + "step": 21494 + }, + { + "epoch": 0.18658692198852442, + "grad_norm": 0.91015625, + "learning_rate": 0.0018011015349418595, + "loss": 0.0903, + "step": 21495 + }, + { + "epoch": 0.18659560246872858, + "grad_norm": 0.240234375, + "learning_rate": 0.0018010828728491992, + "loss": 0.0957, + "step": 21496 + }, + { + "epoch": 0.18660428294893275, + "grad_norm": 0.318359375, + "learning_rate": 0.0018010642099898454, + "loss": 0.1406, + "step": 21497 + }, + { + "epoch": 0.1866129634291369, + "grad_norm": 0.53515625, + "learning_rate": 0.0018010455463638182, + "loss": 0.1133, + "step": 21498 + }, + { + "epoch": 0.18662164390934108, + "grad_norm": 0.1494140625, + "learning_rate": 0.0018010268819711383, + "loss": 0.0962, + "step": 21499 + }, + { + "epoch": 0.18663032438954524, + "grad_norm": 0.078125, + "learning_rate": 0.0018010082168118259, + "loss": 0.0947, + "step": 21500 + }, + { + "epoch": 0.18663900486974938, + "grad_norm": 0.56640625, + "learning_rate": 0.0018009895508859014, + "loss": 0.085, + "step": 21501 + }, + { + "epoch": 0.18664768534995355, + "grad_norm": 0.154296875, + "learning_rate": 0.0018009708841933852, + "loss": 0.125, + "step": 21502 + }, + { + "epoch": 0.1866563658301577, + "grad_norm": 0.064453125, + "learning_rate": 0.0018009522167342978, + "loss": 0.1055, + "step": 21503 + }, + { + "epoch": 0.18666504631036188, + "grad_norm": 0.1962890625, + "learning_rate": 0.0018009335485086593, + "loss": 0.0996, + "step": 21504 + }, + { + "epoch": 0.18667372679056604, + "grad_norm": 0.1015625, + "learning_rate": 0.0018009148795164907, + "loss": 0.125, + "step": 21505 + }, + { + "epoch": 0.1866824072707702, + "grad_norm": 0.6640625, + "learning_rate": 0.001800896209757812, + "loss": 0.1416, + "step": 21506 + }, + { + "epoch": 0.18669108775097437, + "grad_norm": 0.142578125, + "learning_rate": 0.001800877539232644, + "loss": 0.0977, + "step": 21507 + }, + { + "epoch": 0.18669976823117854, + "grad_norm": 0.12255859375, + "learning_rate": 0.0018008588679410066, + "loss": 0.1016, + "step": 21508 + }, + { + "epoch": 0.1867084487113827, + "grad_norm": 0.796875, + "learning_rate": 0.0018008401958829207, + "loss": 0.1191, + "step": 21509 + }, + { + "epoch": 0.18671712919158687, + "grad_norm": 0.23828125, + "learning_rate": 0.0018008215230584065, + "loss": 0.1133, + "step": 21510 + }, + { + "epoch": 0.18672580967179103, + "grad_norm": 0.2294921875, + "learning_rate": 0.0018008028494674842, + "loss": 0.1309, + "step": 21511 + }, + { + "epoch": 0.1867344901519952, + "grad_norm": 0.44921875, + "learning_rate": 0.0018007841751101743, + "loss": 0.1055, + "step": 21512 + }, + { + "epoch": 0.18674317063219936, + "grad_norm": 0.142578125, + "learning_rate": 0.0018007654999864977, + "loss": 0.0977, + "step": 21513 + }, + { + "epoch": 0.18675185111240353, + "grad_norm": 0.10595703125, + "learning_rate": 0.0018007468240964744, + "loss": 0.1211, + "step": 21514 + }, + { + "epoch": 0.1867605315926077, + "grad_norm": 0.39453125, + "learning_rate": 0.001800728147440125, + "loss": 0.1318, + "step": 21515 + }, + { + "epoch": 0.18676921207281186, + "grad_norm": 0.2373046875, + "learning_rate": 0.0018007094700174696, + "loss": 0.0864, + "step": 21516 + }, + { + "epoch": 0.18677789255301602, + "grad_norm": 0.92578125, + "learning_rate": 
0.001800690791828529, + "loss": 0.1201, + "step": 21517 + }, + { + "epoch": 0.1867865730332202, + "grad_norm": 0.443359375, + "learning_rate": 0.0018006721128733232, + "loss": 0.1035, + "step": 21518 + }, + { + "epoch": 0.18679525351342435, + "grad_norm": 0.392578125, + "learning_rate": 0.0018006534331518728, + "loss": 0.0967, + "step": 21519 + }, + { + "epoch": 0.18680393399362852, + "grad_norm": 0.1767578125, + "learning_rate": 0.001800634752664199, + "loss": 0.1387, + "step": 21520 + }, + { + "epoch": 0.18681261447383268, + "grad_norm": 0.3203125, + "learning_rate": 0.0018006160714103213, + "loss": 0.0938, + "step": 21521 + }, + { + "epoch": 0.18682129495403685, + "grad_norm": 0.267578125, + "learning_rate": 0.00180059738939026, + "loss": 0.126, + "step": 21522 + }, + { + "epoch": 0.186829975434241, + "grad_norm": 0.09716796875, + "learning_rate": 0.0018005787066040363, + "loss": 0.1133, + "step": 21523 + }, + { + "epoch": 0.18683865591444518, + "grad_norm": 0.56640625, + "learning_rate": 0.00180056002305167, + "loss": 0.0845, + "step": 21524 + }, + { + "epoch": 0.18684733639464934, + "grad_norm": 0.2890625, + "learning_rate": 0.0018005413387331822, + "loss": 0.1016, + "step": 21525 + }, + { + "epoch": 0.1868560168748535, + "grad_norm": 0.0966796875, + "learning_rate": 0.0018005226536485926, + "loss": 0.1191, + "step": 21526 + }, + { + "epoch": 0.18686469735505767, + "grad_norm": 0.390625, + "learning_rate": 0.001800503967797922, + "loss": 0.1196, + "step": 21527 + }, + { + "epoch": 0.18687337783526184, + "grad_norm": 0.171875, + "learning_rate": 0.0018004852811811906, + "loss": 0.0884, + "step": 21528 + }, + { + "epoch": 0.186882058315466, + "grad_norm": 0.12060546875, + "learning_rate": 0.0018004665937984194, + "loss": 0.1025, + "step": 21529 + }, + { + "epoch": 0.18689073879567017, + "grad_norm": 0.296875, + "learning_rate": 0.0018004479056496281, + "loss": 0.1021, + "step": 21530 + }, + { + "epoch": 0.18689941927587433, + "grad_norm": 0.16015625, + "learning_rate": 0.0018004292167348376, + "loss": 0.1147, + "step": 21531 + }, + { + "epoch": 0.1869080997560785, + "grad_norm": 0.451171875, + "learning_rate": 0.0018004105270540687, + "loss": 0.0928, + "step": 21532 + }, + { + "epoch": 0.18691678023628266, + "grad_norm": 0.2041015625, + "learning_rate": 0.0018003918366073408, + "loss": 0.103, + "step": 21533 + }, + { + "epoch": 0.18692546071648683, + "grad_norm": 0.73046875, + "learning_rate": 0.0018003731453946751, + "loss": 0.1367, + "step": 21534 + }, + { + "epoch": 0.186934141196691, + "grad_norm": 0.11865234375, + "learning_rate": 0.0018003544534160919, + "loss": 0.1143, + "step": 21535 + }, + { + "epoch": 0.18694282167689516, + "grad_norm": 0.189453125, + "learning_rate": 0.0018003357606716115, + "loss": 0.1074, + "step": 21536 + }, + { + "epoch": 0.18695150215709933, + "grad_norm": 0.1962890625, + "learning_rate": 0.0018003170671612545, + "loss": 0.1172, + "step": 21537 + }, + { + "epoch": 0.1869601826373035, + "grad_norm": 0.58203125, + "learning_rate": 0.001800298372885041, + "loss": 0.1064, + "step": 21538 + }, + { + "epoch": 0.18696886311750766, + "grad_norm": 0.427734375, + "learning_rate": 0.0018002796778429924, + "loss": 0.1191, + "step": 21539 + }, + { + "epoch": 0.18697754359771182, + "grad_norm": 0.1513671875, + "learning_rate": 0.0018002609820351276, + "loss": 0.1152, + "step": 21540 + }, + { + "epoch": 0.18698622407791599, + "grad_norm": 0.169921875, + "learning_rate": 0.0018002422854614686, + "loss": 0.1089, + "step": 21541 + }, + { + "epoch": 0.18699490455812015, + 
"grad_norm": 0.1220703125, + "learning_rate": 0.001800223588122035, + "loss": 0.1416, + "step": 21542 + }, + { + "epoch": 0.18700358503832432, + "grad_norm": 0.345703125, + "learning_rate": 0.001800204890016847, + "loss": 0.0938, + "step": 21543 + }, + { + "epoch": 0.18701226551852848, + "grad_norm": 0.462890625, + "learning_rate": 0.0018001861911459258, + "loss": 0.1182, + "step": 21544 + }, + { + "epoch": 0.18702094599873265, + "grad_norm": 0.478515625, + "learning_rate": 0.0018001674915092917, + "loss": 0.1074, + "step": 21545 + }, + { + "epoch": 0.1870296264789368, + "grad_norm": 0.34375, + "learning_rate": 0.0018001487911069646, + "loss": 0.1426, + "step": 21546 + }, + { + "epoch": 0.18703830695914098, + "grad_norm": 0.06787109375, + "learning_rate": 0.0018001300899389654, + "loss": 0.0703, + "step": 21547 + }, + { + "epoch": 0.18704698743934514, + "grad_norm": 0.6640625, + "learning_rate": 0.0018001113880053146, + "loss": 0.1118, + "step": 21548 + }, + { + "epoch": 0.1870556679195493, + "grad_norm": 0.0908203125, + "learning_rate": 0.0018000926853060323, + "loss": 0.1143, + "step": 21549 + }, + { + "epoch": 0.18706434839975347, + "grad_norm": 0.171875, + "learning_rate": 0.0018000739818411391, + "loss": 0.0869, + "step": 21550 + }, + { + "epoch": 0.18707302887995764, + "grad_norm": 0.5390625, + "learning_rate": 0.0018000552776106557, + "loss": 0.1104, + "step": 21551 + }, + { + "epoch": 0.1870817093601618, + "grad_norm": 0.466796875, + "learning_rate": 0.0018000365726146023, + "loss": 0.1064, + "step": 21552 + }, + { + "epoch": 0.18709038984036597, + "grad_norm": 0.08544921875, + "learning_rate": 0.0018000178668529994, + "loss": 0.084, + "step": 21553 + }, + { + "epoch": 0.18709907032057013, + "grad_norm": 1.2109375, + "learning_rate": 0.0017999991603258674, + "loss": 0.0986, + "step": 21554 + }, + { + "epoch": 0.1871077508007743, + "grad_norm": 0.1689453125, + "learning_rate": 0.001799980453033227, + "loss": 0.1895, + "step": 21555 + }, + { + "epoch": 0.18711643128097846, + "grad_norm": 0.1357421875, + "learning_rate": 0.0017999617449750984, + "loss": 0.1152, + "step": 21556 + }, + { + "epoch": 0.18712511176118263, + "grad_norm": 0.09716796875, + "learning_rate": 0.0017999430361515022, + "loss": 0.1182, + "step": 21557 + }, + { + "epoch": 0.1871337922413868, + "grad_norm": 0.29296875, + "learning_rate": 0.001799924326562459, + "loss": 0.1133, + "step": 21558 + }, + { + "epoch": 0.18714247272159096, + "grad_norm": 0.333984375, + "learning_rate": 0.0017999056162079888, + "loss": 0.125, + "step": 21559 + }, + { + "epoch": 0.18715115320179512, + "grad_norm": 0.1630859375, + "learning_rate": 0.0017998869050881124, + "loss": 0.1055, + "step": 21560 + }, + { + "epoch": 0.1871598336819993, + "grad_norm": 0.478515625, + "learning_rate": 0.0017998681932028502, + "loss": 0.084, + "step": 21561 + }, + { + "epoch": 0.18716851416220345, + "grad_norm": 0.228515625, + "learning_rate": 0.0017998494805522224, + "loss": 0.0991, + "step": 21562 + }, + { + "epoch": 0.18717719464240762, + "grad_norm": 0.224609375, + "learning_rate": 0.0017998307671362502, + "loss": 0.1196, + "step": 21563 + }, + { + "epoch": 0.18718587512261178, + "grad_norm": 0.158203125, + "learning_rate": 0.0017998120529549536, + "loss": 0.1016, + "step": 21564 + }, + { + "epoch": 0.18719455560281595, + "grad_norm": 0.373046875, + "learning_rate": 0.0017997933380083527, + "loss": 0.1074, + "step": 21565 + }, + { + "epoch": 0.1872032360830201, + "grad_norm": 1.0, + "learning_rate": 0.0017997746222964683, + "loss": 0.1309, + "step": 
21566 + }, + { + "epoch": 0.18721191656322428, + "grad_norm": 0.10205078125, + "learning_rate": 0.0017997559058193211, + "loss": 0.085, + "step": 21567 + }, + { + "epoch": 0.18722059704342844, + "grad_norm": 0.546875, + "learning_rate": 0.0017997371885769315, + "loss": 0.1143, + "step": 21568 + }, + { + "epoch": 0.1872292775236326, + "grad_norm": 0.90625, + "learning_rate": 0.0017997184705693197, + "loss": 0.127, + "step": 21569 + }, + { + "epoch": 0.18723795800383677, + "grad_norm": 0.2255859375, + "learning_rate": 0.001799699751796506, + "loss": 0.1152, + "step": 21570 + }, + { + "epoch": 0.18724663848404094, + "grad_norm": 0.1552734375, + "learning_rate": 0.0017996810322585114, + "loss": 0.127, + "step": 21571 + }, + { + "epoch": 0.1872553189642451, + "grad_norm": 0.359375, + "learning_rate": 0.0017996623119553561, + "loss": 0.1143, + "step": 21572 + }, + { + "epoch": 0.18726399944444927, + "grad_norm": 0.1298828125, + "learning_rate": 0.0017996435908870605, + "loss": 0.0864, + "step": 21573 + }, + { + "epoch": 0.18727267992465343, + "grad_norm": 0.44140625, + "learning_rate": 0.0017996248690536453, + "loss": 0.1104, + "step": 21574 + }, + { + "epoch": 0.1872813604048576, + "grad_norm": 0.1142578125, + "learning_rate": 0.001799606146455131, + "loss": 0.1152, + "step": 21575 + }, + { + "epoch": 0.18729004088506176, + "grad_norm": 0.130859375, + "learning_rate": 0.0017995874230915376, + "loss": 0.1582, + "step": 21576 + }, + { + "epoch": 0.18729872136526593, + "grad_norm": 0.333984375, + "learning_rate": 0.0017995686989628864, + "loss": 0.082, + "step": 21577 + }, + { + "epoch": 0.1873074018454701, + "grad_norm": 0.703125, + "learning_rate": 0.001799549974069197, + "loss": 0.0801, + "step": 21578 + }, + { + "epoch": 0.18731608232567426, + "grad_norm": 0.84375, + "learning_rate": 0.0017995312484104901, + "loss": 0.1108, + "step": 21579 + }, + { + "epoch": 0.18732476280587843, + "grad_norm": 0.130859375, + "learning_rate": 0.0017995125219867866, + "loss": 0.0986, + "step": 21580 + }, + { + "epoch": 0.1873334432860826, + "grad_norm": 0.578125, + "learning_rate": 0.0017994937947981068, + "loss": 0.1226, + "step": 21581 + }, + { + "epoch": 0.18734212376628676, + "grad_norm": 0.271484375, + "learning_rate": 0.0017994750668444707, + "loss": 0.0991, + "step": 21582 + }, + { + "epoch": 0.18735080424649092, + "grad_norm": 0.4609375, + "learning_rate": 0.0017994563381258997, + "loss": 0.1377, + "step": 21583 + }, + { + "epoch": 0.18735948472669509, + "grad_norm": 0.2421875, + "learning_rate": 0.0017994376086424135, + "loss": 0.1445, + "step": 21584 + }, + { + "epoch": 0.18736816520689925, + "grad_norm": 0.44921875, + "learning_rate": 0.0017994188783940328, + "loss": 0.1138, + "step": 21585 + }, + { + "epoch": 0.18737684568710342, + "grad_norm": 0.091796875, + "learning_rate": 0.0017994001473807782, + "loss": 0.0981, + "step": 21586 + }, + { + "epoch": 0.18738552616730758, + "grad_norm": 0.43359375, + "learning_rate": 0.0017993814156026702, + "loss": 0.1367, + "step": 21587 + }, + { + "epoch": 0.18739420664751175, + "grad_norm": 0.2265625, + "learning_rate": 0.0017993626830597289, + "loss": 0.1221, + "step": 21588 + }, + { + "epoch": 0.1874028871277159, + "grad_norm": 0.189453125, + "learning_rate": 0.0017993439497519753, + "loss": 0.1816, + "step": 21589 + }, + { + "epoch": 0.18741156760792008, + "grad_norm": 0.10888671875, + "learning_rate": 0.0017993252156794295, + "loss": 0.0913, + "step": 21590 + }, + { + "epoch": 0.18742024808812424, + "grad_norm": 0.1298828125, + "learning_rate": 
0.0017993064808421123, + "loss": 0.1191, + "step": 21591 + }, + { + "epoch": 0.1874289285683284, + "grad_norm": 0.1884765625, + "learning_rate": 0.0017992877452400443, + "loss": 0.0889, + "step": 21592 + }, + { + "epoch": 0.18743760904853257, + "grad_norm": 0.10888671875, + "learning_rate": 0.0017992690088732453, + "loss": 0.1152, + "step": 21593 + }, + { + "epoch": 0.18744628952873674, + "grad_norm": 0.1923828125, + "learning_rate": 0.0017992502717417364, + "loss": 0.0928, + "step": 21594 + }, + { + "epoch": 0.1874549700089409, + "grad_norm": 0.72265625, + "learning_rate": 0.0017992315338455377, + "loss": 0.105, + "step": 21595 + }, + { + "epoch": 0.18746365048914507, + "grad_norm": 0.12255859375, + "learning_rate": 0.0017992127951846702, + "loss": 0.1055, + "step": 21596 + }, + { + "epoch": 0.18747233096934923, + "grad_norm": 0.4765625, + "learning_rate": 0.0017991940557591539, + "loss": 0.0938, + "step": 21597 + }, + { + "epoch": 0.1874810114495534, + "grad_norm": 0.333984375, + "learning_rate": 0.0017991753155690094, + "loss": 0.1245, + "step": 21598 + }, + { + "epoch": 0.18748969192975756, + "grad_norm": 0.169921875, + "learning_rate": 0.0017991565746142575, + "loss": 0.0762, + "step": 21599 + }, + { + "epoch": 0.18749837240996173, + "grad_norm": 0.18359375, + "learning_rate": 0.0017991378328949185, + "loss": 0.124, + "step": 21600 + }, + { + "epoch": 0.1875070528901659, + "grad_norm": 0.18359375, + "learning_rate": 0.0017991190904110127, + "loss": 0.1167, + "step": 21601 + }, + { + "epoch": 0.18751573337037006, + "grad_norm": 0.17578125, + "learning_rate": 0.0017991003471625607, + "loss": 0.1025, + "step": 21602 + }, + { + "epoch": 0.18752441385057422, + "grad_norm": 0.474609375, + "learning_rate": 0.0017990816031495833, + "loss": 0.127, + "step": 21603 + }, + { + "epoch": 0.1875330943307784, + "grad_norm": 0.369140625, + "learning_rate": 0.0017990628583721008, + "loss": 0.1143, + "step": 21604 + }, + { + "epoch": 0.18754177481098255, + "grad_norm": 0.4609375, + "learning_rate": 0.0017990441128301333, + "loss": 0.1885, + "step": 21605 + }, + { + "epoch": 0.18755045529118672, + "grad_norm": 0.390625, + "learning_rate": 0.0017990253665237021, + "loss": 0.1196, + "step": 21606 + }, + { + "epoch": 0.18755913577139088, + "grad_norm": 0.2451171875, + "learning_rate": 0.001799006619452827, + "loss": 0.127, + "step": 21607 + }, + { + "epoch": 0.18756781625159505, + "grad_norm": 0.86328125, + "learning_rate": 0.0017989878716175287, + "loss": 0.1152, + "step": 21608 + }, + { + "epoch": 0.1875764967317992, + "grad_norm": 0.123046875, + "learning_rate": 0.001798969123017828, + "loss": 0.1064, + "step": 21609 + }, + { + "epoch": 0.18758517721200338, + "grad_norm": 0.51171875, + "learning_rate": 0.001798950373653745, + "loss": 0.1748, + "step": 21610 + }, + { + "epoch": 0.18759385769220754, + "grad_norm": 0.498046875, + "learning_rate": 0.0017989316235253003, + "loss": 0.0986, + "step": 21611 + }, + { + "epoch": 0.1876025381724117, + "grad_norm": 0.13671875, + "learning_rate": 0.0017989128726325146, + "loss": 0.1084, + "step": 21612 + }, + { + "epoch": 0.18761121865261587, + "grad_norm": 0.0927734375, + "learning_rate": 0.0017988941209754084, + "loss": 0.1182, + "step": 21613 + }, + { + "epoch": 0.18761989913282004, + "grad_norm": 0.12109375, + "learning_rate": 0.001798875368554002, + "loss": 0.1348, + "step": 21614 + }, + { + "epoch": 0.1876285796130242, + "grad_norm": 0.400390625, + "learning_rate": 0.001798856615368316, + "loss": 0.0986, + "step": 21615 + }, + { + "epoch": 0.18763726009322837, 
+ "grad_norm": 0.1865234375, + "learning_rate": 0.0017988378614183706, + "loss": 0.1064, + "step": 21616 + }, + { + "epoch": 0.18764594057343253, + "grad_norm": 0.48828125, + "learning_rate": 0.001798819106704187, + "loss": 0.1621, + "step": 21617 + }, + { + "epoch": 0.1876546210536367, + "grad_norm": 0.263671875, + "learning_rate": 0.0017988003512257854, + "loss": 0.0679, + "step": 21618 + }, + { + "epoch": 0.18766330153384086, + "grad_norm": 0.232421875, + "learning_rate": 0.001798781594983186, + "loss": 0.0972, + "step": 21619 + }, + { + "epoch": 0.18767198201404503, + "grad_norm": 0.2255859375, + "learning_rate": 0.0017987628379764094, + "loss": 0.1338, + "step": 21620 + }, + { + "epoch": 0.1876806624942492, + "grad_norm": 0.51953125, + "learning_rate": 0.0017987440802054767, + "loss": 0.0854, + "step": 21621 + }, + { + "epoch": 0.18768934297445336, + "grad_norm": 0.60546875, + "learning_rate": 0.0017987253216704077, + "loss": 0.1016, + "step": 21622 + }, + { + "epoch": 0.18769802345465753, + "grad_norm": 0.10595703125, + "learning_rate": 0.0017987065623712231, + "loss": 0.0957, + "step": 21623 + }, + { + "epoch": 0.18770670393486166, + "grad_norm": 0.4296875, + "learning_rate": 0.0017986878023079436, + "loss": 0.1367, + "step": 21624 + }, + { + "epoch": 0.18771538441506583, + "grad_norm": 0.3203125, + "learning_rate": 0.0017986690414805897, + "loss": 0.082, + "step": 21625 + }, + { + "epoch": 0.18772406489527, + "grad_norm": 0.2490234375, + "learning_rate": 0.0017986502798891817, + "loss": 0.1387, + "step": 21626 + }, + { + "epoch": 0.18773274537547416, + "grad_norm": 0.46875, + "learning_rate": 0.0017986315175337404, + "loss": 0.0942, + "step": 21627 + }, + { + "epoch": 0.18774142585567832, + "grad_norm": 0.294921875, + "learning_rate": 0.0017986127544142858, + "loss": 0.1064, + "step": 21628 + }, + { + "epoch": 0.1877501063358825, + "grad_norm": 0.32421875, + "learning_rate": 0.0017985939905308392, + "loss": 0.1055, + "step": 21629 + }, + { + "epoch": 0.18775878681608665, + "grad_norm": 0.443359375, + "learning_rate": 0.0017985752258834204, + "loss": 0.0977, + "step": 21630 + }, + { + "epoch": 0.18776746729629082, + "grad_norm": 1.2421875, + "learning_rate": 0.0017985564604720507, + "loss": 0.1738, + "step": 21631 + }, + { + "epoch": 0.18777614777649498, + "grad_norm": 0.24609375, + "learning_rate": 0.0017985376942967497, + "loss": 0.1055, + "step": 21632 + }, + { + "epoch": 0.18778482825669915, + "grad_norm": 0.546875, + "learning_rate": 0.0017985189273575387, + "loss": 0.0742, + "step": 21633 + }, + { + "epoch": 0.18779350873690331, + "grad_norm": 0.318359375, + "learning_rate": 0.0017985001596544376, + "loss": 0.1348, + "step": 21634 + }, + { + "epoch": 0.18780218921710748, + "grad_norm": 0.11279296875, + "learning_rate": 0.0017984813911874674, + "loss": 0.1416, + "step": 21635 + }, + { + "epoch": 0.18781086969731164, + "grad_norm": 0.359375, + "learning_rate": 0.0017984626219566485, + "loss": 0.124, + "step": 21636 + }, + { + "epoch": 0.1878195501775158, + "grad_norm": 0.1708984375, + "learning_rate": 0.001798443851962001, + "loss": 0.0869, + "step": 21637 + }, + { + "epoch": 0.18782823065771997, + "grad_norm": 0.2353515625, + "learning_rate": 0.0017984250812035463, + "loss": 0.082, + "step": 21638 + }, + { + "epoch": 0.18783691113792414, + "grad_norm": 0.50390625, + "learning_rate": 0.001798406309681304, + "loss": 0.104, + "step": 21639 + }, + { + "epoch": 0.1878455916181283, + "grad_norm": 0.2470703125, + "learning_rate": 0.0017983875373952954, + "loss": 0.1094, + "step": 
21640 + }, + { + "epoch": 0.18785427209833247, + "grad_norm": 0.375, + "learning_rate": 0.0017983687643455405, + "loss": 0.1084, + "step": 21641 + }, + { + "epoch": 0.18786295257853663, + "grad_norm": 0.25, + "learning_rate": 0.00179834999053206, + "loss": 0.1465, + "step": 21642 + }, + { + "epoch": 0.1878716330587408, + "grad_norm": 0.435546875, + "learning_rate": 0.0017983312159548747, + "loss": 0.127, + "step": 21643 + }, + { + "epoch": 0.18788031353894497, + "grad_norm": 0.11376953125, + "learning_rate": 0.0017983124406140045, + "loss": 0.1035, + "step": 21644 + }, + { + "epoch": 0.18788899401914913, + "grad_norm": 0.38671875, + "learning_rate": 0.0017982936645094705, + "loss": 0.1387, + "step": 21645 + }, + { + "epoch": 0.1878976744993533, + "grad_norm": 0.76953125, + "learning_rate": 0.001798274887641293, + "loss": 0.0903, + "step": 21646 + }, + { + "epoch": 0.18790635497955746, + "grad_norm": 0.271484375, + "learning_rate": 0.0017982561100094924, + "loss": 0.1143, + "step": 21647 + }, + { + "epoch": 0.18791503545976163, + "grad_norm": 0.66015625, + "learning_rate": 0.0017982373316140898, + "loss": 0.1445, + "step": 21648 + }, + { + "epoch": 0.1879237159399658, + "grad_norm": 0.765625, + "learning_rate": 0.0017982185524551052, + "loss": 0.082, + "step": 21649 + }, + { + "epoch": 0.18793239642016996, + "grad_norm": 0.3671875, + "learning_rate": 0.0017981997725325589, + "loss": 0.1289, + "step": 21650 + }, + { + "epoch": 0.18794107690037412, + "grad_norm": 0.80078125, + "learning_rate": 0.0017981809918464725, + "loss": 0.1104, + "step": 21651 + }, + { + "epoch": 0.1879497573805783, + "grad_norm": 0.466796875, + "learning_rate": 0.0017981622103968654, + "loss": 0.1118, + "step": 21652 + }, + { + "epoch": 0.18795843786078245, + "grad_norm": 0.56640625, + "learning_rate": 0.0017981434281837585, + "loss": 0.105, + "step": 21653 + }, + { + "epoch": 0.18796711834098662, + "grad_norm": 0.80859375, + "learning_rate": 0.0017981246452071725, + "loss": 0.1094, + "step": 21654 + }, + { + "epoch": 0.18797579882119078, + "grad_norm": 0.0859375, + "learning_rate": 0.0017981058614671285, + "loss": 0.0835, + "step": 21655 + }, + { + "epoch": 0.18798447930139495, + "grad_norm": 0.33984375, + "learning_rate": 0.0017980870769636456, + "loss": 0.1104, + "step": 21656 + }, + { + "epoch": 0.1879931597815991, + "grad_norm": 0.26953125, + "learning_rate": 0.0017980682916967456, + "loss": 0.0986, + "step": 21657 + }, + { + "epoch": 0.18800184026180328, + "grad_norm": 0.1337890625, + "learning_rate": 0.0017980495056664483, + "loss": 0.1191, + "step": 21658 + }, + { + "epoch": 0.18801052074200744, + "grad_norm": 0.20703125, + "learning_rate": 0.0017980307188727746, + "loss": 0.1309, + "step": 21659 + }, + { + "epoch": 0.1880192012222116, + "grad_norm": 0.6640625, + "learning_rate": 0.0017980119313157454, + "loss": 0.1074, + "step": 21660 + }, + { + "epoch": 0.18802788170241577, + "grad_norm": 0.58984375, + "learning_rate": 0.0017979931429953804, + "loss": 0.1133, + "step": 21661 + }, + { + "epoch": 0.18803656218261994, + "grad_norm": 0.33203125, + "learning_rate": 0.0017979743539117009, + "loss": 0.1187, + "step": 21662 + }, + { + "epoch": 0.1880452426628241, + "grad_norm": 0.404296875, + "learning_rate": 0.0017979555640647272, + "loss": 0.1074, + "step": 21663 + }, + { + "epoch": 0.18805392314302827, + "grad_norm": 0.369140625, + "learning_rate": 0.0017979367734544793, + "loss": 0.0889, + "step": 21664 + }, + { + "epoch": 0.18806260362323243, + "grad_norm": 0.25390625, + "learning_rate": 0.0017979179820809784, + 
"loss": 0.1016, + "step": 21665 + }, + { + "epoch": 0.1880712841034366, + "grad_norm": 1.2734375, + "learning_rate": 0.0017978991899442452, + "loss": 0.1162, + "step": 21666 + }, + { + "epoch": 0.18807996458364076, + "grad_norm": 0.181640625, + "learning_rate": 0.0017978803970442993, + "loss": 0.0928, + "step": 21667 + }, + { + "epoch": 0.18808864506384493, + "grad_norm": 0.71875, + "learning_rate": 0.0017978616033811625, + "loss": 0.0938, + "step": 21668 + }, + { + "epoch": 0.1880973255440491, + "grad_norm": 0.412109375, + "learning_rate": 0.0017978428089548545, + "loss": 0.0898, + "step": 21669 + }, + { + "epoch": 0.18810600602425326, + "grad_norm": 0.2451171875, + "learning_rate": 0.0017978240137653963, + "loss": 0.1201, + "step": 21670 + }, + { + "epoch": 0.18811468650445742, + "grad_norm": 0.1103515625, + "learning_rate": 0.0017978052178128081, + "loss": 0.1465, + "step": 21671 + }, + { + "epoch": 0.1881233669846616, + "grad_norm": 0.07470703125, + "learning_rate": 0.0017977864210971105, + "loss": 0.0986, + "step": 21672 + }, + { + "epoch": 0.18813204746486575, + "grad_norm": 0.1806640625, + "learning_rate": 0.0017977676236183243, + "loss": 0.1055, + "step": 21673 + }, + { + "epoch": 0.18814072794506992, + "grad_norm": 0.09033203125, + "learning_rate": 0.0017977488253764697, + "loss": 0.1079, + "step": 21674 + }, + { + "epoch": 0.18814940842527408, + "grad_norm": 0.185546875, + "learning_rate": 0.0017977300263715677, + "loss": 0.103, + "step": 21675 + }, + { + "epoch": 0.18815808890547825, + "grad_norm": 0.77734375, + "learning_rate": 0.0017977112266036384, + "loss": 0.0996, + "step": 21676 + }, + { + "epoch": 0.18816676938568241, + "grad_norm": 0.09716796875, + "learning_rate": 0.0017976924260727028, + "loss": 0.1118, + "step": 21677 + }, + { + "epoch": 0.18817544986588658, + "grad_norm": 0.462890625, + "learning_rate": 0.001797673624778781, + "loss": 0.1162, + "step": 21678 + }, + { + "epoch": 0.18818413034609074, + "grad_norm": 0.408203125, + "learning_rate": 0.0017976548227218942, + "loss": 0.1118, + "step": 21679 + }, + { + "epoch": 0.1881928108262949, + "grad_norm": 0.212890625, + "learning_rate": 0.001797636019902062, + "loss": 0.105, + "step": 21680 + }, + { + "epoch": 0.18820149130649907, + "grad_norm": 0.228515625, + "learning_rate": 0.001797617216319306, + "loss": 0.0913, + "step": 21681 + }, + { + "epoch": 0.18821017178670324, + "grad_norm": 0.185546875, + "learning_rate": 0.0017975984119736462, + "loss": 0.127, + "step": 21682 + }, + { + "epoch": 0.1882188522669074, + "grad_norm": 0.3359375, + "learning_rate": 0.0017975796068651032, + "loss": 0.1221, + "step": 21683 + }, + { + "epoch": 0.18822753274711157, + "grad_norm": 0.478515625, + "learning_rate": 0.0017975608009936977, + "loss": 0.0996, + "step": 21684 + }, + { + "epoch": 0.18823621322731574, + "grad_norm": 1.734375, + "learning_rate": 0.0017975419943594502, + "loss": 0.1562, + "step": 21685 + }, + { + "epoch": 0.1882448937075199, + "grad_norm": 0.203125, + "learning_rate": 0.001797523186962381, + "loss": 0.125, + "step": 21686 + }, + { + "epoch": 0.18825357418772407, + "grad_norm": 0.142578125, + "learning_rate": 0.0017975043788025112, + "loss": 0.0869, + "step": 21687 + }, + { + "epoch": 0.18826225466792823, + "grad_norm": 0.423828125, + "learning_rate": 0.0017974855698798609, + "loss": 0.1387, + "step": 21688 + }, + { + "epoch": 0.1882709351481324, + "grad_norm": 0.2255859375, + "learning_rate": 0.001797466760194451, + "loss": 0.1182, + "step": 21689 + }, + { + "epoch": 0.18827961562833656, + "grad_norm": 
0.79296875, + "learning_rate": 0.0017974479497463022, + "loss": 0.0884, + "step": 21690 + }, + { + "epoch": 0.18828829610854073, + "grad_norm": 0.34375, + "learning_rate": 0.0017974291385354341, + "loss": 0.1123, + "step": 21691 + }, + { + "epoch": 0.1882969765887449, + "grad_norm": 0.2138671875, + "learning_rate": 0.0017974103265618684, + "loss": 0.0889, + "step": 21692 + }, + { + "epoch": 0.18830565706894906, + "grad_norm": 0.51171875, + "learning_rate": 0.0017973915138256255, + "loss": 0.1201, + "step": 21693 + }, + { + "epoch": 0.18831433754915322, + "grad_norm": 0.10693359375, + "learning_rate": 0.001797372700326725, + "loss": 0.0889, + "step": 21694 + }, + { + "epoch": 0.1883230180293574, + "grad_norm": 0.21484375, + "learning_rate": 0.0017973538860651885, + "loss": 0.0845, + "step": 21695 + }, + { + "epoch": 0.18833169850956155, + "grad_norm": 0.10205078125, + "learning_rate": 0.0017973350710410365, + "loss": 0.1357, + "step": 21696 + }, + { + "epoch": 0.18834037898976572, + "grad_norm": 0.33203125, + "learning_rate": 0.0017973162552542893, + "loss": 0.1235, + "step": 21697 + }, + { + "epoch": 0.18834905946996988, + "grad_norm": 0.1044921875, + "learning_rate": 0.0017972974387049673, + "loss": 0.1074, + "step": 21698 + }, + { + "epoch": 0.18835773995017405, + "grad_norm": 0.2060546875, + "learning_rate": 0.0017972786213930915, + "loss": 0.1416, + "step": 21699 + }, + { + "epoch": 0.1883664204303782, + "grad_norm": 0.33203125, + "learning_rate": 0.0017972598033186825, + "loss": 0.1289, + "step": 21700 + }, + { + "epoch": 0.18837510091058238, + "grad_norm": 0.435546875, + "learning_rate": 0.00179724098448176, + "loss": 0.1123, + "step": 21701 + }, + { + "epoch": 0.18838378139078654, + "grad_norm": 0.322265625, + "learning_rate": 0.0017972221648823459, + "loss": 0.1426, + "step": 21702 + }, + { + "epoch": 0.1883924618709907, + "grad_norm": 0.09716796875, + "learning_rate": 0.00179720334452046, + "loss": 0.125, + "step": 21703 + }, + { + "epoch": 0.18840114235119487, + "grad_norm": 1.3359375, + "learning_rate": 0.0017971845233961225, + "loss": 0.1426, + "step": 21704 + }, + { + "epoch": 0.18840982283139904, + "grad_norm": 0.296875, + "learning_rate": 0.0017971657015093548, + "loss": 0.106, + "step": 21705 + }, + { + "epoch": 0.1884185033116032, + "grad_norm": 1.8828125, + "learning_rate": 0.001797146878860177, + "loss": 0.124, + "step": 21706 + }, + { + "epoch": 0.18842718379180737, + "grad_norm": 0.21875, + "learning_rate": 0.0017971280554486102, + "loss": 0.0962, + "step": 21707 + }, + { + "epoch": 0.18843586427201153, + "grad_norm": 0.59375, + "learning_rate": 0.0017971092312746744, + "loss": 0.127, + "step": 21708 + }, + { + "epoch": 0.1884445447522157, + "grad_norm": 0.4609375, + "learning_rate": 0.0017970904063383906, + "loss": 0.207, + "step": 21709 + }, + { + "epoch": 0.18845322523241986, + "grad_norm": 0.216796875, + "learning_rate": 0.001797071580639779, + "loss": 0.0894, + "step": 21710 + }, + { + "epoch": 0.18846190571262403, + "grad_norm": 0.140625, + "learning_rate": 0.0017970527541788604, + "loss": 0.082, + "step": 21711 + }, + { + "epoch": 0.1884705861928282, + "grad_norm": 0.1533203125, + "learning_rate": 0.0017970339269556554, + "loss": 0.123, + "step": 21712 + }, + { + "epoch": 0.18847926667303236, + "grad_norm": 1.5, + "learning_rate": 0.0017970150989701848, + "loss": 0.207, + "step": 21713 + }, + { + "epoch": 0.18848794715323652, + "grad_norm": 0.3046875, + "learning_rate": 0.0017969962702224687, + "loss": 0.0835, + "step": 21714 + }, + { + "epoch": 
0.1884966276334407, + "grad_norm": 0.0693359375, + "learning_rate": 0.001796977440712528, + "loss": 0.0874, + "step": 21715 + }, + { + "epoch": 0.18850530811364485, + "grad_norm": 0.08935546875, + "learning_rate": 0.0017969586104403835, + "loss": 0.1074, + "step": 21716 + }, + { + "epoch": 0.18851398859384902, + "grad_norm": 0.310546875, + "learning_rate": 0.001796939779406055, + "loss": 0.0938, + "step": 21717 + }, + { + "epoch": 0.18852266907405318, + "grad_norm": 0.62109375, + "learning_rate": 0.0017969209476095642, + "loss": 0.1221, + "step": 21718 + }, + { + "epoch": 0.18853134955425735, + "grad_norm": 0.52734375, + "learning_rate": 0.0017969021150509308, + "loss": 0.0938, + "step": 21719 + }, + { + "epoch": 0.18854003003446151, + "grad_norm": 0.12890625, + "learning_rate": 0.0017968832817301756, + "loss": 0.1465, + "step": 21720 + }, + { + "epoch": 0.18854871051466568, + "grad_norm": 0.921875, + "learning_rate": 0.0017968644476473194, + "loss": 0.0991, + "step": 21721 + }, + { + "epoch": 0.18855739099486984, + "grad_norm": 0.08544921875, + "learning_rate": 0.0017968456128023829, + "loss": 0.1143, + "step": 21722 + }, + { + "epoch": 0.188566071475074, + "grad_norm": 0.23828125, + "learning_rate": 0.0017968267771953862, + "loss": 0.0986, + "step": 21723 + }, + { + "epoch": 0.18857475195527817, + "grad_norm": 0.1611328125, + "learning_rate": 0.0017968079408263507, + "loss": 0.1328, + "step": 21724 + }, + { + "epoch": 0.18858343243548234, + "grad_norm": 0.47265625, + "learning_rate": 0.001796789103695296, + "loss": 0.1172, + "step": 21725 + }, + { + "epoch": 0.1885921129156865, + "grad_norm": 0.396484375, + "learning_rate": 0.0017967702658022438, + "loss": 0.1602, + "step": 21726 + }, + { + "epoch": 0.18860079339589067, + "grad_norm": 0.306640625, + "learning_rate": 0.0017967514271472137, + "loss": 0.1084, + "step": 21727 + }, + { + "epoch": 0.18860947387609484, + "grad_norm": 0.0888671875, + "learning_rate": 0.0017967325877302266, + "loss": 0.1084, + "step": 21728 + }, + { + "epoch": 0.188618154356299, + "grad_norm": 0.20703125, + "learning_rate": 0.0017967137475513033, + "loss": 0.1064, + "step": 21729 + }, + { + "epoch": 0.18862683483650317, + "grad_norm": 0.455078125, + "learning_rate": 0.0017966949066104644, + "loss": 0.0967, + "step": 21730 + }, + { + "epoch": 0.18863551531670733, + "grad_norm": 0.08935546875, + "learning_rate": 0.0017966760649077305, + "loss": 0.0996, + "step": 21731 + }, + { + "epoch": 0.1886441957969115, + "grad_norm": 0.431640625, + "learning_rate": 0.001796657222443122, + "loss": 0.1494, + "step": 21732 + }, + { + "epoch": 0.18865287627711566, + "grad_norm": 0.126953125, + "learning_rate": 0.0017966383792166595, + "loss": 0.1187, + "step": 21733 + }, + { + "epoch": 0.18866155675731983, + "grad_norm": 0.41796875, + "learning_rate": 0.0017966195352283637, + "loss": 0.0889, + "step": 21734 + }, + { + "epoch": 0.188670237237524, + "grad_norm": 0.57421875, + "learning_rate": 0.0017966006904782557, + "loss": 0.1221, + "step": 21735 + }, + { + "epoch": 0.18867891771772816, + "grad_norm": 0.81640625, + "learning_rate": 0.0017965818449663551, + "loss": 0.124, + "step": 21736 + }, + { + "epoch": 0.18868759819793232, + "grad_norm": 0.11767578125, + "learning_rate": 0.0017965629986926835, + "loss": 0.1357, + "step": 21737 + }, + { + "epoch": 0.1886962786781365, + "grad_norm": 0.12451171875, + "learning_rate": 0.001796544151657261, + "loss": 0.0938, + "step": 21738 + }, + { + "epoch": 0.18870495915834065, + "grad_norm": 0.3125, + "learning_rate": 0.001796525303860108, + 
"loss": 0.085, + "step": 21739 + }, + { + "epoch": 0.18871363963854482, + "grad_norm": 0.55859375, + "learning_rate": 0.0017965064553012457, + "loss": 0.0972, + "step": 21740 + }, + { + "epoch": 0.18872232011874898, + "grad_norm": 0.357421875, + "learning_rate": 0.001796487605980694, + "loss": 0.1196, + "step": 21741 + }, + { + "epoch": 0.18873100059895315, + "grad_norm": 1.8671875, + "learning_rate": 0.0017964687558984745, + "loss": 0.1699, + "step": 21742 + }, + { + "epoch": 0.1887396810791573, + "grad_norm": 0.3984375, + "learning_rate": 0.0017964499050546069, + "loss": 0.1514, + "step": 21743 + }, + { + "epoch": 0.18874836155936148, + "grad_norm": 0.1611328125, + "learning_rate": 0.0017964310534491121, + "loss": 0.1104, + "step": 21744 + }, + { + "epoch": 0.18875704203956564, + "grad_norm": 0.09423828125, + "learning_rate": 0.0017964122010820107, + "loss": 0.0703, + "step": 21745 + }, + { + "epoch": 0.1887657225197698, + "grad_norm": 0.2431640625, + "learning_rate": 0.0017963933479533236, + "loss": 0.1123, + "step": 21746 + }, + { + "epoch": 0.18877440299997394, + "grad_norm": 0.24609375, + "learning_rate": 0.0017963744940630712, + "loss": 0.1865, + "step": 21747 + }, + { + "epoch": 0.1887830834801781, + "grad_norm": 0.099609375, + "learning_rate": 0.0017963556394112738, + "loss": 0.0693, + "step": 21748 + }, + { + "epoch": 0.18879176396038228, + "grad_norm": 0.09814453125, + "learning_rate": 0.0017963367839979529, + "loss": 0.0957, + "step": 21749 + }, + { + "epoch": 0.18880044444058644, + "grad_norm": 0.66796875, + "learning_rate": 0.0017963179278231282, + "loss": 0.1245, + "step": 21750 + }, + { + "epoch": 0.1888091249207906, + "grad_norm": 0.2578125, + "learning_rate": 0.0017962990708868206, + "loss": 0.0977, + "step": 21751 + }, + { + "epoch": 0.18881780540099477, + "grad_norm": 0.8046875, + "learning_rate": 0.0017962802131890508, + "loss": 0.1035, + "step": 21752 + }, + { + "epoch": 0.18882648588119894, + "grad_norm": 1.3359375, + "learning_rate": 0.0017962613547298396, + "loss": 0.1357, + "step": 21753 + }, + { + "epoch": 0.1888351663614031, + "grad_norm": 0.2060546875, + "learning_rate": 0.0017962424955092074, + "loss": 0.0962, + "step": 21754 + }, + { + "epoch": 0.18884384684160727, + "grad_norm": 0.0673828125, + "learning_rate": 0.0017962236355271747, + "loss": 0.0796, + "step": 21755 + }, + { + "epoch": 0.18885252732181143, + "grad_norm": 0.291015625, + "learning_rate": 0.0017962047747837626, + "loss": 0.1123, + "step": 21756 + }, + { + "epoch": 0.1888612078020156, + "grad_norm": 0.19140625, + "learning_rate": 0.0017961859132789911, + "loss": 0.1001, + "step": 21757 + }, + { + "epoch": 0.18886988828221976, + "grad_norm": 0.072265625, + "learning_rate": 0.0017961670510128814, + "loss": 0.1006, + "step": 21758 + }, + { + "epoch": 0.18887856876242393, + "grad_norm": 0.333984375, + "learning_rate": 0.0017961481879854538, + "loss": 0.127, + "step": 21759 + }, + { + "epoch": 0.1888872492426281, + "grad_norm": 0.470703125, + "learning_rate": 0.001796129324196729, + "loss": 0.0918, + "step": 21760 + }, + { + "epoch": 0.18889592972283226, + "grad_norm": 0.435546875, + "learning_rate": 0.0017961104596467274, + "loss": 0.1631, + "step": 21761 + }, + { + "epoch": 0.18890461020303642, + "grad_norm": 0.5625, + "learning_rate": 0.0017960915943354698, + "loss": 0.1152, + "step": 21762 + }, + { + "epoch": 0.1889132906832406, + "grad_norm": 0.111328125, + "learning_rate": 0.0017960727282629774, + "loss": 0.0957, + "step": 21763 + }, + { + "epoch": 0.18892197116344475, + "grad_norm": 
0.11669921875, + "learning_rate": 0.00179605386142927, + "loss": 0.0845, + "step": 21764 + }, + { + "epoch": 0.18893065164364892, + "grad_norm": 0.203125, + "learning_rate": 0.0017960349938343687, + "loss": 0.0786, + "step": 21765 + }, + { + "epoch": 0.18893933212385308, + "grad_norm": 0.1552734375, + "learning_rate": 0.001796016125478294, + "loss": 0.1235, + "step": 21766 + }, + { + "epoch": 0.18894801260405725, + "grad_norm": 0.142578125, + "learning_rate": 0.0017959972563610664, + "loss": 0.085, + "step": 21767 + }, + { + "epoch": 0.1889566930842614, + "grad_norm": 0.111328125, + "learning_rate": 0.0017959783864827067, + "loss": 0.0918, + "step": 21768 + }, + { + "epoch": 0.18896537356446558, + "grad_norm": 0.091796875, + "learning_rate": 0.0017959595158432353, + "loss": 0.0664, + "step": 21769 + }, + { + "epoch": 0.18897405404466974, + "grad_norm": 0.224609375, + "learning_rate": 0.0017959406444426733, + "loss": 0.0938, + "step": 21770 + }, + { + "epoch": 0.1889827345248739, + "grad_norm": 0.37890625, + "learning_rate": 0.001795921772281041, + "loss": 0.1904, + "step": 21771 + }, + { + "epoch": 0.18899141500507807, + "grad_norm": 0.50390625, + "learning_rate": 0.0017959028993583593, + "loss": 0.1289, + "step": 21772 + }, + { + "epoch": 0.18900009548528224, + "grad_norm": 0.10595703125, + "learning_rate": 0.0017958840256746484, + "loss": 0.1016, + "step": 21773 + }, + { + "epoch": 0.1890087759654864, + "grad_norm": 0.50390625, + "learning_rate": 0.0017958651512299292, + "loss": 0.1216, + "step": 21774 + }, + { + "epoch": 0.18901745644569057, + "grad_norm": 0.2392578125, + "learning_rate": 0.0017958462760242222, + "loss": 0.1465, + "step": 21775 + }, + { + "epoch": 0.18902613692589473, + "grad_norm": 0.51953125, + "learning_rate": 0.0017958274000575483, + "loss": 0.0972, + "step": 21776 + }, + { + "epoch": 0.1890348174060989, + "grad_norm": 0.29296875, + "learning_rate": 0.0017958085233299279, + "loss": 0.1128, + "step": 21777 + }, + { + "epoch": 0.18904349788630306, + "grad_norm": 0.3828125, + "learning_rate": 0.0017957896458413818, + "loss": 0.1396, + "step": 21778 + }, + { + "epoch": 0.18905217836650723, + "grad_norm": 0.18359375, + "learning_rate": 0.001795770767591931, + "loss": 0.0908, + "step": 21779 + }, + { + "epoch": 0.1890608588467114, + "grad_norm": 0.1787109375, + "learning_rate": 0.0017957518885815952, + "loss": 0.103, + "step": 21780 + }, + { + "epoch": 0.18906953932691556, + "grad_norm": 0.12255859375, + "learning_rate": 0.001795733008810396, + "loss": 0.0811, + "step": 21781 + }, + { + "epoch": 0.18907821980711972, + "grad_norm": 0.30859375, + "learning_rate": 0.0017957141282783533, + "loss": 0.1396, + "step": 21782 + }, + { + "epoch": 0.1890869002873239, + "grad_norm": 0.091796875, + "learning_rate": 0.0017956952469854882, + "loss": 0.1001, + "step": 21783 + }, + { + "epoch": 0.18909558076752805, + "grad_norm": 0.4140625, + "learning_rate": 0.001795676364931821, + "loss": 0.1069, + "step": 21784 + }, + { + "epoch": 0.18910426124773222, + "grad_norm": 0.55078125, + "learning_rate": 0.001795657482117373, + "loss": 0.1523, + "step": 21785 + }, + { + "epoch": 0.18911294172793638, + "grad_norm": 0.09619140625, + "learning_rate": 0.0017956385985421644, + "loss": 0.1045, + "step": 21786 + }, + { + "epoch": 0.18912162220814055, + "grad_norm": 0.267578125, + "learning_rate": 0.001795619714206216, + "loss": 0.1113, + "step": 21787 + }, + { + "epoch": 0.18913030268834471, + "grad_norm": 0.2041015625, + "learning_rate": 0.0017956008291095478, + "loss": 0.1201, + "step": 21788 + }, 
+ { + "epoch": 0.18913898316854888, + "grad_norm": 0.4609375, + "learning_rate": 0.0017955819432521813, + "loss": 0.0781, + "step": 21789 + }, + { + "epoch": 0.18914766364875304, + "grad_norm": 0.859375, + "learning_rate": 0.0017955630566341366, + "loss": 0.0933, + "step": 21790 + }, + { + "epoch": 0.1891563441289572, + "grad_norm": 0.419921875, + "learning_rate": 0.0017955441692554348, + "loss": 0.1465, + "step": 21791 + }, + { + "epoch": 0.18916502460916138, + "grad_norm": 0.52734375, + "learning_rate": 0.0017955252811160964, + "loss": 0.1211, + "step": 21792 + }, + { + "epoch": 0.18917370508936554, + "grad_norm": 0.9453125, + "learning_rate": 0.001795506392216142, + "loss": 0.1016, + "step": 21793 + }, + { + "epoch": 0.1891823855695697, + "grad_norm": 0.1796875, + "learning_rate": 0.0017954875025555923, + "loss": 0.1221, + "step": 21794 + }, + { + "epoch": 0.18919106604977387, + "grad_norm": 0.2109375, + "learning_rate": 0.0017954686121344676, + "loss": 0.1533, + "step": 21795 + }, + { + "epoch": 0.18919974652997804, + "grad_norm": 0.353515625, + "learning_rate": 0.0017954497209527892, + "loss": 0.0776, + "step": 21796 + }, + { + "epoch": 0.1892084270101822, + "grad_norm": 0.625, + "learning_rate": 0.0017954308290105772, + "loss": 0.1064, + "step": 21797 + }, + { + "epoch": 0.18921710749038637, + "grad_norm": 0.365234375, + "learning_rate": 0.0017954119363078527, + "loss": 0.1006, + "step": 21798 + }, + { + "epoch": 0.18922578797059053, + "grad_norm": 0.37109375, + "learning_rate": 0.001795393042844636, + "loss": 0.1045, + "step": 21799 + }, + { + "epoch": 0.1892344684507947, + "grad_norm": 0.451171875, + "learning_rate": 0.001795374148620948, + "loss": 0.0693, + "step": 21800 + }, + { + "epoch": 0.18924314893099886, + "grad_norm": 0.365234375, + "learning_rate": 0.0017953552536368093, + "loss": 0.1309, + "step": 21801 + }, + { + "epoch": 0.18925182941120303, + "grad_norm": 1.1328125, + "learning_rate": 0.0017953363578922404, + "loss": 0.1025, + "step": 21802 + }, + { + "epoch": 0.1892605098914072, + "grad_norm": 0.41796875, + "learning_rate": 0.0017953174613872622, + "loss": 0.126, + "step": 21803 + }, + { + "epoch": 0.18926919037161136, + "grad_norm": 0.2734375, + "learning_rate": 0.0017952985641218955, + "loss": 0.0947, + "step": 21804 + }, + { + "epoch": 0.18927787085181552, + "grad_norm": 0.451171875, + "learning_rate": 0.0017952796660961603, + "loss": 0.1357, + "step": 21805 + }, + { + "epoch": 0.1892865513320197, + "grad_norm": 0.6953125, + "learning_rate": 0.001795260767310078, + "loss": 0.1152, + "step": 21806 + }, + { + "epoch": 0.18929523181222385, + "grad_norm": 0.130859375, + "learning_rate": 0.0017952418677636688, + "loss": 0.0796, + "step": 21807 + }, + { + "epoch": 0.18930391229242802, + "grad_norm": 0.2236328125, + "learning_rate": 0.0017952229674569537, + "loss": 0.084, + "step": 21808 + }, + { + "epoch": 0.18931259277263218, + "grad_norm": 0.45703125, + "learning_rate": 0.0017952040663899531, + "loss": 0.1187, + "step": 21809 + }, + { + "epoch": 0.18932127325283635, + "grad_norm": 0.1005859375, + "learning_rate": 0.0017951851645626878, + "loss": 0.0986, + "step": 21810 + }, + { + "epoch": 0.1893299537330405, + "grad_norm": 0.255859375, + "learning_rate": 0.0017951662619751785, + "loss": 0.1045, + "step": 21811 + }, + { + "epoch": 0.18933863421324468, + "grad_norm": 0.275390625, + "learning_rate": 0.0017951473586274459, + "loss": 0.0933, + "step": 21812 + }, + { + "epoch": 0.18934731469344884, + "grad_norm": 0.35546875, + "learning_rate": 0.0017951284545195104, + 
"loss": 0.1143, + "step": 21813 + }, + { + "epoch": 0.189355995173653, + "grad_norm": 0.09130859375, + "learning_rate": 0.0017951095496513932, + "loss": 0.0952, + "step": 21814 + }, + { + "epoch": 0.18936467565385717, + "grad_norm": 0.1806640625, + "learning_rate": 0.0017950906440231142, + "loss": 0.1025, + "step": 21815 + }, + { + "epoch": 0.18937335613406134, + "grad_norm": 1.03125, + "learning_rate": 0.0017950717376346947, + "loss": 0.1084, + "step": 21816 + }, + { + "epoch": 0.1893820366142655, + "grad_norm": 0.2109375, + "learning_rate": 0.0017950528304861552, + "loss": 0.0938, + "step": 21817 + }, + { + "epoch": 0.18939071709446967, + "grad_norm": 0.07080078125, + "learning_rate": 0.0017950339225775163, + "loss": 0.0938, + "step": 21818 + }, + { + "epoch": 0.18939939757467383, + "grad_norm": 0.314453125, + "learning_rate": 0.001795015013908799, + "loss": 0.1143, + "step": 21819 + }, + { + "epoch": 0.189408078054878, + "grad_norm": 0.095703125, + "learning_rate": 0.0017949961044800235, + "loss": 0.0986, + "step": 21820 + }, + { + "epoch": 0.18941675853508216, + "grad_norm": 0.97265625, + "learning_rate": 0.0017949771942912106, + "loss": 0.1729, + "step": 21821 + }, + { + "epoch": 0.18942543901528633, + "grad_norm": 0.4375, + "learning_rate": 0.0017949582833423812, + "loss": 0.1211, + "step": 21822 + }, + { + "epoch": 0.1894341194954905, + "grad_norm": 0.3046875, + "learning_rate": 0.001794939371633556, + "loss": 0.1206, + "step": 21823 + }, + { + "epoch": 0.18944279997569466, + "grad_norm": 0.09228515625, + "learning_rate": 0.0017949204591647555, + "loss": 0.1064, + "step": 21824 + }, + { + "epoch": 0.18945148045589882, + "grad_norm": 0.2138671875, + "learning_rate": 0.0017949015459360004, + "loss": 0.1001, + "step": 21825 + }, + { + "epoch": 0.189460160936103, + "grad_norm": 0.12353515625, + "learning_rate": 0.0017948826319473114, + "loss": 0.1699, + "step": 21826 + }, + { + "epoch": 0.18946884141630715, + "grad_norm": 0.267578125, + "learning_rate": 0.001794863717198709, + "loss": 0.1338, + "step": 21827 + }, + { + "epoch": 0.18947752189651132, + "grad_norm": 0.38671875, + "learning_rate": 0.0017948448016902143, + "loss": 0.125, + "step": 21828 + }, + { + "epoch": 0.18948620237671548, + "grad_norm": 0.2080078125, + "learning_rate": 0.0017948258854218475, + "loss": 0.0786, + "step": 21829 + }, + { + "epoch": 0.18949488285691965, + "grad_norm": 0.546875, + "learning_rate": 0.00179480696839363, + "loss": 0.1191, + "step": 21830 + }, + { + "epoch": 0.18950356333712381, + "grad_norm": 0.275390625, + "learning_rate": 0.0017947880506055816, + "loss": 0.1021, + "step": 21831 + }, + { + "epoch": 0.18951224381732798, + "grad_norm": 0.58984375, + "learning_rate": 0.0017947691320577236, + "loss": 0.1216, + "step": 21832 + }, + { + "epoch": 0.18952092429753215, + "grad_norm": 0.07568359375, + "learning_rate": 0.0017947502127500768, + "loss": 0.1279, + "step": 21833 + }, + { + "epoch": 0.1895296047777363, + "grad_norm": 0.1767578125, + "learning_rate": 0.001794731292682661, + "loss": 0.1406, + "step": 21834 + }, + { + "epoch": 0.18953828525794048, + "grad_norm": 0.470703125, + "learning_rate": 0.0017947123718554982, + "loss": 0.1279, + "step": 21835 + }, + { + "epoch": 0.18954696573814464, + "grad_norm": 0.263671875, + "learning_rate": 0.001794693450268608, + "loss": 0.0957, + "step": 21836 + }, + { + "epoch": 0.1895556462183488, + "grad_norm": 0.23046875, + "learning_rate": 0.0017946745279220115, + "loss": 0.1089, + "step": 21837 + }, + { + "epoch": 0.18956432669855297, + "grad_norm": 
0.10009765625, + "learning_rate": 0.0017946556048157292, + "loss": 0.1094, + "step": 21838 + }, + { + "epoch": 0.18957300717875714, + "grad_norm": 1.046875, + "learning_rate": 0.001794636680949782, + "loss": 0.105, + "step": 21839 + }, + { + "epoch": 0.1895816876589613, + "grad_norm": 0.21875, + "learning_rate": 0.001794617756324191, + "loss": 0.1279, + "step": 21840 + }, + { + "epoch": 0.18959036813916547, + "grad_norm": 0.26171875, + "learning_rate": 0.0017945988309389762, + "loss": 0.0996, + "step": 21841 + }, + { + "epoch": 0.18959904861936963, + "grad_norm": 0.1064453125, + "learning_rate": 0.0017945799047941584, + "loss": 0.0957, + "step": 21842 + }, + { + "epoch": 0.1896077290995738, + "grad_norm": 0.68359375, + "learning_rate": 0.0017945609778897587, + "loss": 0.0972, + "step": 21843 + }, + { + "epoch": 0.18961640957977796, + "grad_norm": 0.57421875, + "learning_rate": 0.0017945420502257971, + "loss": 0.1396, + "step": 21844 + }, + { + "epoch": 0.18962509005998213, + "grad_norm": 0.51953125, + "learning_rate": 0.0017945231218022954, + "loss": 0.1289, + "step": 21845 + }, + { + "epoch": 0.1896337705401863, + "grad_norm": 0.1884765625, + "learning_rate": 0.0017945041926192733, + "loss": 0.0615, + "step": 21846 + }, + { + "epoch": 0.18964245102039046, + "grad_norm": 0.1953125, + "learning_rate": 0.0017944852626767518, + "loss": 0.103, + "step": 21847 + }, + { + "epoch": 0.18965113150059462, + "grad_norm": 0.1103515625, + "learning_rate": 0.001794466331974752, + "loss": 0.103, + "step": 21848 + }, + { + "epoch": 0.1896598119807988, + "grad_norm": 0.146484375, + "learning_rate": 0.0017944474005132935, + "loss": 0.0898, + "step": 21849 + }, + { + "epoch": 0.18966849246100295, + "grad_norm": 0.26953125, + "learning_rate": 0.0017944284682923984, + "loss": 0.1138, + "step": 21850 + }, + { + "epoch": 0.18967717294120712, + "grad_norm": 0.34375, + "learning_rate": 0.0017944095353120866, + "loss": 0.1172, + "step": 21851 + }, + { + "epoch": 0.18968585342141128, + "grad_norm": 0.34375, + "learning_rate": 0.001794390601572379, + "loss": 0.0859, + "step": 21852 + }, + { + "epoch": 0.18969453390161545, + "grad_norm": 0.1982421875, + "learning_rate": 0.0017943716670732962, + "loss": 0.1104, + "step": 21853 + }, + { + "epoch": 0.1897032143818196, + "grad_norm": 0.3125, + "learning_rate": 0.0017943527318148589, + "loss": 0.0923, + "step": 21854 + }, + { + "epoch": 0.18971189486202378, + "grad_norm": 0.095703125, + "learning_rate": 0.0017943337957970881, + "loss": 0.1289, + "step": 21855 + }, + { + "epoch": 0.18972057534222794, + "grad_norm": 0.162109375, + "learning_rate": 0.0017943148590200042, + "loss": 0.0889, + "step": 21856 + }, + { + "epoch": 0.1897292558224321, + "grad_norm": 0.412109375, + "learning_rate": 0.001794295921483628, + "loss": 0.1562, + "step": 21857 + }, + { + "epoch": 0.18973793630263627, + "grad_norm": 0.640625, + "learning_rate": 0.00179427698318798, + "loss": 0.1064, + "step": 21858 + }, + { + "epoch": 0.18974661678284044, + "grad_norm": 0.1552734375, + "learning_rate": 0.0017942580441330815, + "loss": 0.1055, + "step": 21859 + }, + { + "epoch": 0.1897552972630446, + "grad_norm": 0.48828125, + "learning_rate": 0.0017942391043189526, + "loss": 0.0957, + "step": 21860 + }, + { + "epoch": 0.18976397774324877, + "grad_norm": 0.11474609375, + "learning_rate": 0.0017942201637456143, + "loss": 0.1357, + "step": 21861 + }, + { + "epoch": 0.18977265822345293, + "grad_norm": 0.2060546875, + "learning_rate": 0.0017942012224130876, + "loss": 0.0996, + "step": 21862 + }, + { + "epoch": 
0.1897813387036571, + "grad_norm": 0.0712890625, + "learning_rate": 0.0017941822803213923, + "loss": 0.0996, + "step": 21863 + }, + { + "epoch": 0.18979001918386126, + "grad_norm": 0.703125, + "learning_rate": 0.0017941633374705499, + "loss": 0.0889, + "step": 21864 + }, + { + "epoch": 0.18979869966406543, + "grad_norm": 0.2109375, + "learning_rate": 0.0017941443938605808, + "loss": 0.1367, + "step": 21865 + }, + { + "epoch": 0.1898073801442696, + "grad_norm": 0.1669921875, + "learning_rate": 0.0017941254494915062, + "loss": 0.1006, + "step": 21866 + }, + { + "epoch": 0.18981606062447376, + "grad_norm": 0.7734375, + "learning_rate": 0.0017941065043633462, + "loss": 0.0859, + "step": 21867 + }, + { + "epoch": 0.18982474110467792, + "grad_norm": 0.130859375, + "learning_rate": 0.0017940875584761219, + "loss": 0.0928, + "step": 21868 + }, + { + "epoch": 0.1898334215848821, + "grad_norm": 0.474609375, + "learning_rate": 0.0017940686118298535, + "loss": 0.125, + "step": 21869 + }, + { + "epoch": 0.18984210206508623, + "grad_norm": 0.4765625, + "learning_rate": 0.0017940496644245623, + "loss": 0.1504, + "step": 21870 + }, + { + "epoch": 0.1898507825452904, + "grad_norm": 0.095703125, + "learning_rate": 0.0017940307162602686, + "loss": 0.0889, + "step": 21871 + }, + { + "epoch": 0.18985946302549456, + "grad_norm": 0.0869140625, + "learning_rate": 0.0017940117673369937, + "loss": 0.1104, + "step": 21872 + }, + { + "epoch": 0.18986814350569872, + "grad_norm": 0.3046875, + "learning_rate": 0.001793992817654758, + "loss": 0.1426, + "step": 21873 + }, + { + "epoch": 0.1898768239859029, + "grad_norm": 0.1279296875, + "learning_rate": 0.001793973867213582, + "loss": 0.1113, + "step": 21874 + }, + { + "epoch": 0.18988550446610705, + "grad_norm": 0.173828125, + "learning_rate": 0.0017939549160134866, + "loss": 0.0952, + "step": 21875 + }, + { + "epoch": 0.18989418494631122, + "grad_norm": 0.37109375, + "learning_rate": 0.0017939359640544922, + "loss": 0.1641, + "step": 21876 + }, + { + "epoch": 0.18990286542651538, + "grad_norm": 0.294921875, + "learning_rate": 0.0017939170113366206, + "loss": 0.123, + "step": 21877 + }, + { + "epoch": 0.18991154590671955, + "grad_norm": 0.390625, + "learning_rate": 0.001793898057859891, + "loss": 0.0957, + "step": 21878 + }, + { + "epoch": 0.1899202263869237, + "grad_norm": 0.1552734375, + "learning_rate": 0.0017938791036243257, + "loss": 0.1221, + "step": 21879 + }, + { + "epoch": 0.18992890686712788, + "grad_norm": 0.349609375, + "learning_rate": 0.001793860148629944, + "loss": 0.1582, + "step": 21880 + }, + { + "epoch": 0.18993758734733204, + "grad_norm": 0.43359375, + "learning_rate": 0.0017938411928767677, + "loss": 0.1191, + "step": 21881 + }, + { + "epoch": 0.1899462678275362, + "grad_norm": 0.5, + "learning_rate": 0.0017938222363648166, + "loss": 0.126, + "step": 21882 + }, + { + "epoch": 0.18995494830774037, + "grad_norm": 0.07763671875, + "learning_rate": 0.0017938032790941123, + "loss": 0.0884, + "step": 21883 + }, + { + "epoch": 0.18996362878794454, + "grad_norm": 0.1865234375, + "learning_rate": 0.0017937843210646752, + "loss": 0.0977, + "step": 21884 + }, + { + "epoch": 0.1899723092681487, + "grad_norm": 0.392578125, + "learning_rate": 0.0017937653622765257, + "loss": 0.0869, + "step": 21885 + }, + { + "epoch": 0.18998098974835287, + "grad_norm": 0.451171875, + "learning_rate": 0.001793746402729685, + "loss": 0.0977, + "step": 21886 + }, + { + "epoch": 0.18998967022855703, + "grad_norm": 0.25390625, + "learning_rate": 0.0017937274424241736, + "loss": 
0.0977, + "step": 21887 + }, + { + "epoch": 0.1899983507087612, + "grad_norm": 0.416015625, + "learning_rate": 0.0017937084813600126, + "loss": 0.1211, + "step": 21888 + }, + { + "epoch": 0.19000703118896536, + "grad_norm": 0.474609375, + "learning_rate": 0.0017936895195372223, + "loss": 0.0825, + "step": 21889 + }, + { + "epoch": 0.19001571166916953, + "grad_norm": 0.255859375, + "learning_rate": 0.001793670556955823, + "loss": 0.1309, + "step": 21890 + }, + { + "epoch": 0.1900243921493737, + "grad_norm": 0.236328125, + "learning_rate": 0.0017936515936158369, + "loss": 0.1562, + "step": 21891 + }, + { + "epoch": 0.19003307262957786, + "grad_norm": 0.1875, + "learning_rate": 0.0017936326295172834, + "loss": 0.0859, + "step": 21892 + }, + { + "epoch": 0.19004175310978202, + "grad_norm": 0.255859375, + "learning_rate": 0.0017936136646601833, + "loss": 0.1523, + "step": 21893 + }, + { + "epoch": 0.1900504335899862, + "grad_norm": 0.19921875, + "learning_rate": 0.0017935946990445583, + "loss": 0.0928, + "step": 21894 + }, + { + "epoch": 0.19005911407019035, + "grad_norm": 0.59765625, + "learning_rate": 0.001793575732670428, + "loss": 0.0864, + "step": 21895 + }, + { + "epoch": 0.19006779455039452, + "grad_norm": 0.115234375, + "learning_rate": 0.0017935567655378143, + "loss": 0.0908, + "step": 21896 + }, + { + "epoch": 0.19007647503059868, + "grad_norm": 0.6328125, + "learning_rate": 0.001793537797646737, + "loss": 0.1235, + "step": 21897 + }, + { + "epoch": 0.19008515551080285, + "grad_norm": 0.6015625, + "learning_rate": 0.001793518828997217, + "loss": 0.1396, + "step": 21898 + }, + { + "epoch": 0.19009383599100702, + "grad_norm": 0.08349609375, + "learning_rate": 0.0017934998595892757, + "loss": 0.1055, + "step": 21899 + }, + { + "epoch": 0.19010251647121118, + "grad_norm": 0.427734375, + "learning_rate": 0.0017934808894229334, + "loss": 0.082, + "step": 21900 + }, + { + "epoch": 0.19011119695141535, + "grad_norm": 0.65625, + "learning_rate": 0.0017934619184982103, + "loss": 0.1113, + "step": 21901 + }, + { + "epoch": 0.1901198774316195, + "grad_norm": 0.421875, + "learning_rate": 0.0017934429468151282, + "loss": 0.1099, + "step": 21902 + }, + { + "epoch": 0.19012855791182368, + "grad_norm": 0.369140625, + "learning_rate": 0.0017934239743737069, + "loss": 0.0889, + "step": 21903 + }, + { + "epoch": 0.19013723839202784, + "grad_norm": 0.298828125, + "learning_rate": 0.0017934050011739675, + "loss": 0.126, + "step": 21904 + }, + { + "epoch": 0.190145918872232, + "grad_norm": 0.9296875, + "learning_rate": 0.0017933860272159312, + "loss": 0.0967, + "step": 21905 + }, + { + "epoch": 0.19015459935243617, + "grad_norm": 0.09033203125, + "learning_rate": 0.0017933670524996185, + "loss": 0.1016, + "step": 21906 + }, + { + "epoch": 0.19016327983264034, + "grad_norm": 0.6171875, + "learning_rate": 0.0017933480770250495, + "loss": 0.1162, + "step": 21907 + }, + { + "epoch": 0.1901719603128445, + "grad_norm": 0.12158203125, + "learning_rate": 0.0017933291007922456, + "loss": 0.0913, + "step": 21908 + }, + { + "epoch": 0.19018064079304867, + "grad_norm": 0.2451171875, + "learning_rate": 0.0017933101238012273, + "loss": 0.127, + "step": 21909 + }, + { + "epoch": 0.19018932127325283, + "grad_norm": 0.12109375, + "learning_rate": 0.0017932911460520156, + "loss": 0.1118, + "step": 21910 + }, + { + "epoch": 0.190198001753457, + "grad_norm": 0.1171875, + "learning_rate": 0.0017932721675446312, + "loss": 0.1162, + "step": 21911 + }, + { + "epoch": 0.19020668223366116, + "grad_norm": 0.375, + "learning_rate": 
0.0017932531882790949, + "loss": 0.1025, + "step": 21912 + }, + { + "epoch": 0.19021536271386533, + "grad_norm": 0.337890625, + "learning_rate": 0.001793234208255427, + "loss": 0.0894, + "step": 21913 + }, + { + "epoch": 0.1902240431940695, + "grad_norm": 0.349609375, + "learning_rate": 0.0017932152274736488, + "loss": 0.126, + "step": 21914 + }, + { + "epoch": 0.19023272367427366, + "grad_norm": 0.451171875, + "learning_rate": 0.0017931962459337807, + "loss": 0.0884, + "step": 21915 + }, + { + "epoch": 0.19024140415447782, + "grad_norm": 0.15625, + "learning_rate": 0.0017931772636358436, + "loss": 0.0996, + "step": 21916 + }, + { + "epoch": 0.190250084634682, + "grad_norm": 0.384765625, + "learning_rate": 0.0017931582805798586, + "loss": 0.1797, + "step": 21917 + }, + { + "epoch": 0.19025876511488615, + "grad_norm": 0.1845703125, + "learning_rate": 0.0017931392967658459, + "loss": 0.1133, + "step": 21918 + }, + { + "epoch": 0.19026744559509032, + "grad_norm": 0.11083984375, + "learning_rate": 0.0017931203121938262, + "loss": 0.1147, + "step": 21919 + }, + { + "epoch": 0.19027612607529448, + "grad_norm": 0.10107421875, + "learning_rate": 0.0017931013268638208, + "loss": 0.103, + "step": 21920 + }, + { + "epoch": 0.19028480655549865, + "grad_norm": 0.51953125, + "learning_rate": 0.0017930823407758501, + "loss": 0.1045, + "step": 21921 + }, + { + "epoch": 0.1902934870357028, + "grad_norm": 0.62109375, + "learning_rate": 0.001793063353929935, + "loss": 0.1216, + "step": 21922 + }, + { + "epoch": 0.19030216751590698, + "grad_norm": 0.126953125, + "learning_rate": 0.0017930443663260966, + "loss": 0.1172, + "step": 21923 + }, + { + "epoch": 0.19031084799611114, + "grad_norm": 0.306640625, + "learning_rate": 0.0017930253779643549, + "loss": 0.1377, + "step": 21924 + }, + { + "epoch": 0.1903195284763153, + "grad_norm": 0.201171875, + "learning_rate": 0.001793006388844731, + "loss": 0.0996, + "step": 21925 + }, + { + "epoch": 0.19032820895651947, + "grad_norm": 0.515625, + "learning_rate": 0.001792987398967246, + "loss": 0.1543, + "step": 21926 + }, + { + "epoch": 0.19033688943672364, + "grad_norm": 0.302734375, + "learning_rate": 0.0017929684083319202, + "loss": 0.1318, + "step": 21927 + }, + { + "epoch": 0.1903455699169278, + "grad_norm": 0.09814453125, + "learning_rate": 0.0017929494169387744, + "loss": 0.1177, + "step": 21928 + }, + { + "epoch": 0.19035425039713197, + "grad_norm": 0.345703125, + "learning_rate": 0.00179293042478783, + "loss": 0.1475, + "step": 21929 + }, + { + "epoch": 0.19036293087733613, + "grad_norm": 0.330078125, + "learning_rate": 0.0017929114318791068, + "loss": 0.125, + "step": 21930 + }, + { + "epoch": 0.1903716113575403, + "grad_norm": 0.6796875, + "learning_rate": 0.0017928924382126265, + "loss": 0.1152, + "step": 21931 + }, + { + "epoch": 0.19038029183774446, + "grad_norm": 0.1943359375, + "learning_rate": 0.0017928734437884094, + "loss": 0.1484, + "step": 21932 + }, + { + "epoch": 0.19038897231794863, + "grad_norm": 1.3828125, + "learning_rate": 0.001792854448606476, + "loss": 0.1562, + "step": 21933 + }, + { + "epoch": 0.1903976527981528, + "grad_norm": 0.78515625, + "learning_rate": 0.0017928354526668474, + "loss": 0.1055, + "step": 21934 + }, + { + "epoch": 0.19040633327835696, + "grad_norm": 0.171875, + "learning_rate": 0.0017928164559695445, + "loss": 0.0835, + "step": 21935 + }, + { + "epoch": 0.19041501375856112, + "grad_norm": 0.287109375, + "learning_rate": 0.0017927974585145881, + "loss": 0.1133, + "step": 21936 + }, + { + "epoch": 0.1904236942387653, + 
"grad_norm": 0.11181640625, + "learning_rate": 0.0017927784603019986, + "loss": 0.1475, + "step": 21937 + }, + { + "epoch": 0.19043237471896945, + "grad_norm": 0.474609375, + "learning_rate": 0.0017927594613317969, + "loss": 0.1387, + "step": 21938 + }, + { + "epoch": 0.19044105519917362, + "grad_norm": 0.6171875, + "learning_rate": 0.001792740461604004, + "loss": 0.1143, + "step": 21939 + }, + { + "epoch": 0.19044973567937779, + "grad_norm": 1.078125, + "learning_rate": 0.0017927214611186406, + "loss": 0.1113, + "step": 21940 + }, + { + "epoch": 0.19045841615958195, + "grad_norm": 0.37890625, + "learning_rate": 0.001792702459875727, + "loss": 0.1221, + "step": 21941 + }, + { + "epoch": 0.19046709663978612, + "grad_norm": 0.095703125, + "learning_rate": 0.001792683457875285, + "loss": 0.083, + "step": 21942 + }, + { + "epoch": 0.19047577711999028, + "grad_norm": 0.19140625, + "learning_rate": 0.0017926644551173345, + "loss": 0.1128, + "step": 21943 + }, + { + "epoch": 0.19048445760019445, + "grad_norm": 0.55859375, + "learning_rate": 0.0017926454516018968, + "loss": 0.0977, + "step": 21944 + }, + { + "epoch": 0.1904931380803986, + "grad_norm": 0.98046875, + "learning_rate": 0.001792626447328992, + "loss": 0.0884, + "step": 21945 + }, + { + "epoch": 0.19050181856060278, + "grad_norm": 0.345703125, + "learning_rate": 0.0017926074422986416, + "loss": 0.1914, + "step": 21946 + }, + { + "epoch": 0.19051049904080694, + "grad_norm": 0.73046875, + "learning_rate": 0.001792588436510866, + "loss": 0.1113, + "step": 21947 + }, + { + "epoch": 0.1905191795210111, + "grad_norm": 0.2099609375, + "learning_rate": 0.0017925694299656863, + "loss": 0.1104, + "step": 21948 + }, + { + "epoch": 0.19052786000121527, + "grad_norm": 0.09912109375, + "learning_rate": 0.0017925504226631228, + "loss": 0.127, + "step": 21949 + }, + { + "epoch": 0.19053654048141944, + "grad_norm": 0.2041015625, + "learning_rate": 0.0017925314146031968, + "loss": 0.1084, + "step": 21950 + }, + { + "epoch": 0.1905452209616236, + "grad_norm": 0.384765625, + "learning_rate": 0.0017925124057859287, + "loss": 0.082, + "step": 21951 + }, + { + "epoch": 0.19055390144182777, + "grad_norm": 0.58984375, + "learning_rate": 0.0017924933962113394, + "loss": 0.1602, + "step": 21952 + }, + { + "epoch": 0.19056258192203193, + "grad_norm": 0.353515625, + "learning_rate": 0.00179247438587945, + "loss": 0.1113, + "step": 21953 + }, + { + "epoch": 0.1905712624022361, + "grad_norm": 0.150390625, + "learning_rate": 0.001792455374790281, + "loss": 0.0928, + "step": 21954 + }, + { + "epoch": 0.19057994288244026, + "grad_norm": 0.83203125, + "learning_rate": 0.0017924363629438526, + "loss": 0.2812, + "step": 21955 + }, + { + "epoch": 0.19058862336264443, + "grad_norm": 1.21875, + "learning_rate": 0.0017924173503401867, + "loss": 0.1758, + "step": 21956 + }, + { + "epoch": 0.1905973038428486, + "grad_norm": 0.328125, + "learning_rate": 0.0017923983369793036, + "loss": 0.1279, + "step": 21957 + }, + { + "epoch": 0.19060598432305276, + "grad_norm": 0.431640625, + "learning_rate": 0.001792379322861224, + "loss": 0.0938, + "step": 21958 + }, + { + "epoch": 0.19061466480325692, + "grad_norm": 0.921875, + "learning_rate": 0.0017923603079859685, + "loss": 0.0986, + "step": 21959 + }, + { + "epoch": 0.1906233452834611, + "grad_norm": 0.33984375, + "learning_rate": 0.0017923412923535587, + "loss": 0.125, + "step": 21960 + }, + { + "epoch": 0.19063202576366525, + "grad_norm": 0.34375, + "learning_rate": 0.0017923222759640145, + "loss": 0.1387, + "step": 21961 + }, + { + 
"epoch": 0.19064070624386942, + "grad_norm": 0.49609375, + "learning_rate": 0.0017923032588173572, + "loss": 0.0898, + "step": 21962 + }, + { + "epoch": 0.19064938672407358, + "grad_norm": 0.490234375, + "learning_rate": 0.0017922842409136073, + "loss": 0.1064, + "step": 21963 + }, + { + "epoch": 0.19065806720427775, + "grad_norm": 0.93359375, + "learning_rate": 0.001792265222252786, + "loss": 0.1299, + "step": 21964 + }, + { + "epoch": 0.1906667476844819, + "grad_norm": 0.380859375, + "learning_rate": 0.0017922462028349137, + "loss": 0.0806, + "step": 21965 + }, + { + "epoch": 0.19067542816468608, + "grad_norm": 0.6484375, + "learning_rate": 0.0017922271826600114, + "loss": 0.1084, + "step": 21966 + }, + { + "epoch": 0.19068410864489024, + "grad_norm": 0.130859375, + "learning_rate": 0.0017922081617280999, + "loss": 0.1016, + "step": 21967 + }, + { + "epoch": 0.1906927891250944, + "grad_norm": 0.4453125, + "learning_rate": 0.0017921891400391999, + "loss": 0.0874, + "step": 21968 + }, + { + "epoch": 0.19070146960529857, + "grad_norm": 0.421875, + "learning_rate": 0.0017921701175933323, + "loss": 0.0972, + "step": 21969 + }, + { + "epoch": 0.19071015008550274, + "grad_norm": 1.390625, + "learning_rate": 0.0017921510943905176, + "loss": 0.1074, + "step": 21970 + }, + { + "epoch": 0.1907188305657069, + "grad_norm": 0.166015625, + "learning_rate": 0.001792132070430777, + "loss": 0.1084, + "step": 21971 + }, + { + "epoch": 0.19072751104591107, + "grad_norm": 0.1865234375, + "learning_rate": 0.0017921130457141314, + "loss": 0.1621, + "step": 21972 + }, + { + "epoch": 0.19073619152611523, + "grad_norm": 0.1591796875, + "learning_rate": 0.0017920940202406012, + "loss": 0.1123, + "step": 21973 + }, + { + "epoch": 0.1907448720063194, + "grad_norm": 0.37109375, + "learning_rate": 0.0017920749940102073, + "loss": 0.1289, + "step": 21974 + }, + { + "epoch": 0.19075355248652356, + "grad_norm": 0.150390625, + "learning_rate": 0.001792055967022971, + "loss": 0.1182, + "step": 21975 + }, + { + "epoch": 0.19076223296672773, + "grad_norm": 0.12353515625, + "learning_rate": 0.0017920369392789121, + "loss": 0.0918, + "step": 21976 + }, + { + "epoch": 0.1907709134469319, + "grad_norm": 0.37109375, + "learning_rate": 0.0017920179107780523, + "loss": 0.0967, + "step": 21977 + }, + { + "epoch": 0.19077959392713606, + "grad_norm": 0.23828125, + "learning_rate": 0.0017919988815204122, + "loss": 0.0986, + "step": 21978 + }, + { + "epoch": 0.19078827440734022, + "grad_norm": 0.244140625, + "learning_rate": 0.0017919798515060122, + "loss": 0.0996, + "step": 21979 + }, + { + "epoch": 0.1907969548875444, + "grad_norm": 0.1748046875, + "learning_rate": 0.0017919608207348737, + "loss": 0.1211, + "step": 21980 + }, + { + "epoch": 0.19080563536774856, + "grad_norm": 0.6328125, + "learning_rate": 0.0017919417892070172, + "loss": 0.1406, + "step": 21981 + }, + { + "epoch": 0.19081431584795272, + "grad_norm": 0.419921875, + "learning_rate": 0.0017919227569224636, + "loss": 0.0898, + "step": 21982 + }, + { + "epoch": 0.19082299632815689, + "grad_norm": 0.75, + "learning_rate": 0.0017919037238812335, + "loss": 0.1138, + "step": 21983 + }, + { + "epoch": 0.19083167680836105, + "grad_norm": 0.0732421875, + "learning_rate": 0.001791884690083348, + "loss": 0.0732, + "step": 21984 + }, + { + "epoch": 0.19084035728856522, + "grad_norm": 0.46484375, + "learning_rate": 0.0017918656555288277, + "loss": 0.1123, + "step": 21985 + }, + { + "epoch": 0.19084903776876938, + "grad_norm": 0.2578125, + "learning_rate": 0.0017918466202176938, + 
"loss": 0.126, + "step": 21986 + }, + { + "epoch": 0.19085771824897355, + "grad_norm": 0.60546875, + "learning_rate": 0.0017918275841499664, + "loss": 0.0986, + "step": 21987 + }, + { + "epoch": 0.1908663987291777, + "grad_norm": 0.8359375, + "learning_rate": 0.0017918085473256672, + "loss": 0.1338, + "step": 21988 + }, + { + "epoch": 0.19087507920938188, + "grad_norm": 0.259765625, + "learning_rate": 0.001791789509744816, + "loss": 0.123, + "step": 21989 + }, + { + "epoch": 0.19088375968958604, + "grad_norm": 0.5859375, + "learning_rate": 0.0017917704714074348, + "loss": 0.1357, + "step": 21990 + }, + { + "epoch": 0.1908924401697902, + "grad_norm": 0.5234375, + "learning_rate": 0.0017917514323135433, + "loss": 0.125, + "step": 21991 + }, + { + "epoch": 0.19090112064999437, + "grad_norm": 0.4453125, + "learning_rate": 0.0017917323924631632, + "loss": 0.1562, + "step": 21992 + }, + { + "epoch": 0.1909098011301985, + "grad_norm": 0.41796875, + "learning_rate": 0.0017917133518563148, + "loss": 0.105, + "step": 21993 + }, + { + "epoch": 0.19091848161040267, + "grad_norm": 0.2021484375, + "learning_rate": 0.0017916943104930192, + "loss": 0.1338, + "step": 21994 + }, + { + "epoch": 0.19092716209060684, + "grad_norm": 0.28125, + "learning_rate": 0.0017916752683732968, + "loss": 0.0942, + "step": 21995 + }, + { + "epoch": 0.190935842570811, + "grad_norm": 0.189453125, + "learning_rate": 0.0017916562254971689, + "loss": 0.0986, + "step": 21996 + }, + { + "epoch": 0.19094452305101517, + "grad_norm": 0.68359375, + "learning_rate": 0.0017916371818646562, + "loss": 0.127, + "step": 21997 + }, + { + "epoch": 0.19095320353121933, + "grad_norm": 0.8203125, + "learning_rate": 0.0017916181374757795, + "loss": 0.1318, + "step": 21998 + }, + { + "epoch": 0.1909618840114235, + "grad_norm": 0.197265625, + "learning_rate": 0.0017915990923305595, + "loss": 0.1719, + "step": 21999 + }, + { + "epoch": 0.19097056449162766, + "grad_norm": 0.5859375, + "learning_rate": 0.001791580046429017, + "loss": 0.1367, + "step": 22000 + }, + { + "epoch": 0.19097924497183183, + "grad_norm": 0.0771484375, + "learning_rate": 0.0017915609997711732, + "loss": 0.0952, + "step": 22001 + }, + { + "epoch": 0.190987925452036, + "grad_norm": 0.353515625, + "learning_rate": 0.0017915419523570485, + "loss": 0.105, + "step": 22002 + }, + { + "epoch": 0.19099660593224016, + "grad_norm": 0.2041015625, + "learning_rate": 0.001791522904186664, + "loss": 0.1709, + "step": 22003 + }, + { + "epoch": 0.19100528641244433, + "grad_norm": 0.2890625, + "learning_rate": 0.0017915038552600402, + "loss": 0.1084, + "step": 22004 + }, + { + "epoch": 0.1910139668926485, + "grad_norm": 0.392578125, + "learning_rate": 0.0017914848055771986, + "loss": 0.1494, + "step": 22005 + }, + { + "epoch": 0.19102264737285266, + "grad_norm": 0.1357421875, + "learning_rate": 0.0017914657551381594, + "loss": 0.1099, + "step": 22006 + }, + { + "epoch": 0.19103132785305682, + "grad_norm": 0.5703125, + "learning_rate": 0.0017914467039429436, + "loss": 0.1108, + "step": 22007 + }, + { + "epoch": 0.19104000833326099, + "grad_norm": 0.10888671875, + "learning_rate": 0.001791427651991572, + "loss": 0.1377, + "step": 22008 + }, + { + "epoch": 0.19104868881346515, + "grad_norm": 0.59375, + "learning_rate": 0.0017914085992840657, + "loss": 0.124, + "step": 22009 + }, + { + "epoch": 0.19105736929366932, + "grad_norm": 0.34375, + "learning_rate": 0.0017913895458204453, + "loss": 0.1196, + "step": 22010 + }, + { + "epoch": 0.19106604977387348, + "grad_norm": 0.23046875, + "learning_rate": 
0.0017913704916007317, + "loss": 0.0972, + "step": 22011 + }, + { + "epoch": 0.19107473025407765, + "grad_norm": 0.306640625, + "learning_rate": 0.0017913514366249457, + "loss": 0.1309, + "step": 22012 + }, + { + "epoch": 0.1910834107342818, + "grad_norm": 0.54296875, + "learning_rate": 0.001791332380893108, + "loss": 0.1143, + "step": 22013 + }, + { + "epoch": 0.19109209121448598, + "grad_norm": 0.61328125, + "learning_rate": 0.0017913133244052398, + "loss": 0.0898, + "step": 22014 + }, + { + "epoch": 0.19110077169469014, + "grad_norm": 0.0791015625, + "learning_rate": 0.0017912942671613615, + "loss": 0.1021, + "step": 22015 + }, + { + "epoch": 0.1911094521748943, + "grad_norm": 0.1728515625, + "learning_rate": 0.0017912752091614945, + "loss": 0.0928, + "step": 22016 + }, + { + "epoch": 0.19111813265509847, + "grad_norm": 0.48828125, + "learning_rate": 0.001791256150405659, + "loss": 0.6172, + "step": 22017 + }, + { + "epoch": 0.19112681313530264, + "grad_norm": 0.498046875, + "learning_rate": 0.0017912370908938762, + "loss": 0.1064, + "step": 22018 + }, + { + "epoch": 0.1911354936155068, + "grad_norm": 0.09716796875, + "learning_rate": 0.0017912180306261672, + "loss": 0.1445, + "step": 22019 + }, + { + "epoch": 0.19114417409571097, + "grad_norm": 4.21875, + "learning_rate": 0.0017911989696025522, + "loss": 0.2539, + "step": 22020 + }, + { + "epoch": 0.19115285457591513, + "grad_norm": 0.09765625, + "learning_rate": 0.0017911799078230526, + "loss": 0.1289, + "step": 22021 + }, + { + "epoch": 0.1911615350561193, + "grad_norm": 0.10400390625, + "learning_rate": 0.001791160845287689, + "loss": 0.1035, + "step": 22022 + }, + { + "epoch": 0.19117021553632346, + "grad_norm": 0.80859375, + "learning_rate": 0.0017911417819964821, + "loss": 0.1318, + "step": 22023 + }, + { + "epoch": 0.19117889601652763, + "grad_norm": 0.51171875, + "learning_rate": 0.0017911227179494532, + "loss": 0.1084, + "step": 22024 + }, + { + "epoch": 0.1911875764967318, + "grad_norm": 0.291015625, + "learning_rate": 0.0017911036531466228, + "loss": 0.1162, + "step": 22025 + }, + { + "epoch": 0.19119625697693596, + "grad_norm": 0.1279296875, + "learning_rate": 0.0017910845875880117, + "loss": 0.1172, + "step": 22026 + }, + { + "epoch": 0.19120493745714012, + "grad_norm": 0.1494140625, + "learning_rate": 0.0017910655212736413, + "loss": 0.1172, + "step": 22027 + }, + { + "epoch": 0.1912136179373443, + "grad_norm": 0.7890625, + "learning_rate": 0.0017910464542035317, + "loss": 0.0938, + "step": 22028 + }, + { + "epoch": 0.19122229841754845, + "grad_norm": 0.94140625, + "learning_rate": 0.001791027386377704, + "loss": 0.1582, + "step": 22029 + }, + { + "epoch": 0.19123097889775262, + "grad_norm": 0.212890625, + "learning_rate": 0.001791008317796179, + "loss": 0.1191, + "step": 22030 + }, + { + "epoch": 0.19123965937795678, + "grad_norm": 0.1943359375, + "learning_rate": 0.001790989248458978, + "loss": 0.1074, + "step": 22031 + }, + { + "epoch": 0.19124833985816095, + "grad_norm": 0.197265625, + "learning_rate": 0.0017909701783661214, + "loss": 0.1035, + "step": 22032 + }, + { + "epoch": 0.1912570203383651, + "grad_norm": 0.33203125, + "learning_rate": 0.00179095110751763, + "loss": 0.1465, + "step": 22033 + }, + { + "epoch": 0.19126570081856928, + "grad_norm": 0.515625, + "learning_rate": 0.0017909320359135255, + "loss": 0.105, + "step": 22034 + }, + { + "epoch": 0.19127438129877344, + "grad_norm": 0.53125, + "learning_rate": 0.0017909129635538276, + "loss": 0.1094, + "step": 22035 + }, + { + "epoch": 0.1912830617789776, + 
"grad_norm": 0.345703125, + "learning_rate": 0.001790893890438558, + "loss": 0.1147, + "step": 22036 + }, + { + "epoch": 0.19129174225918177, + "grad_norm": 0.2236328125, + "learning_rate": 0.0017908748165677366, + "loss": 0.124, + "step": 22037 + }, + { + "epoch": 0.19130042273938594, + "grad_norm": 0.1494140625, + "learning_rate": 0.0017908557419413856, + "loss": 0.1123, + "step": 22038 + }, + { + "epoch": 0.1913091032195901, + "grad_norm": 0.427734375, + "learning_rate": 0.0017908366665595247, + "loss": 0.1133, + "step": 22039 + }, + { + "epoch": 0.19131778369979427, + "grad_norm": 0.72265625, + "learning_rate": 0.0017908175904221752, + "loss": 0.0962, + "step": 22040 + }, + { + "epoch": 0.19132646417999843, + "grad_norm": 0.30859375, + "learning_rate": 0.0017907985135293583, + "loss": 0.124, + "step": 22041 + }, + { + "epoch": 0.1913351446602026, + "grad_norm": 0.140625, + "learning_rate": 0.001790779435881094, + "loss": 0.0957, + "step": 22042 + }, + { + "epoch": 0.19134382514040676, + "grad_norm": 0.09375, + "learning_rate": 0.0017907603574774042, + "loss": 0.0898, + "step": 22043 + }, + { + "epoch": 0.19135250562061093, + "grad_norm": 0.330078125, + "learning_rate": 0.0017907412783183087, + "loss": 0.0869, + "step": 22044 + }, + { + "epoch": 0.1913611861008151, + "grad_norm": 0.15625, + "learning_rate": 0.0017907221984038296, + "loss": 0.0957, + "step": 22045 + }, + { + "epoch": 0.19136986658101926, + "grad_norm": 0.2412109375, + "learning_rate": 0.0017907031177339867, + "loss": 0.1035, + "step": 22046 + }, + { + "epoch": 0.19137854706122343, + "grad_norm": 0.2080078125, + "learning_rate": 0.0017906840363088013, + "loss": 0.1001, + "step": 22047 + }, + { + "epoch": 0.1913872275414276, + "grad_norm": 0.271484375, + "learning_rate": 0.001790664954128294, + "loss": 0.0898, + "step": 22048 + }, + { + "epoch": 0.19139590802163176, + "grad_norm": 0.083984375, + "learning_rate": 0.0017906458711924864, + "loss": 0.1025, + "step": 22049 + }, + { + "epoch": 0.19140458850183592, + "grad_norm": 0.150390625, + "learning_rate": 0.0017906267875013983, + "loss": 0.1006, + "step": 22050 + }, + { + "epoch": 0.19141326898204009, + "grad_norm": 0.12060546875, + "learning_rate": 0.0017906077030550513, + "loss": 0.1553, + "step": 22051 + }, + { + "epoch": 0.19142194946224425, + "grad_norm": 0.07861328125, + "learning_rate": 0.0017905886178534662, + "loss": 0.1094, + "step": 22052 + }, + { + "epoch": 0.19143062994244842, + "grad_norm": 0.1953125, + "learning_rate": 0.0017905695318966638, + "loss": 0.1367, + "step": 22053 + }, + { + "epoch": 0.19143931042265258, + "grad_norm": 0.265625, + "learning_rate": 0.0017905504451846649, + "loss": 0.1279, + "step": 22054 + }, + { + "epoch": 0.19144799090285675, + "grad_norm": 0.06640625, + "learning_rate": 0.0017905313577174901, + "loss": 0.0762, + "step": 22055 + }, + { + "epoch": 0.1914566713830609, + "grad_norm": 0.6875, + "learning_rate": 0.0017905122694951612, + "loss": 0.1172, + "step": 22056 + }, + { + "epoch": 0.19146535186326508, + "grad_norm": 0.4765625, + "learning_rate": 0.001790493180517698, + "loss": 0.0815, + "step": 22057 + }, + { + "epoch": 0.19147403234346924, + "grad_norm": 0.08154296875, + "learning_rate": 0.0017904740907851218, + "loss": 0.0923, + "step": 22058 + }, + { + "epoch": 0.1914827128236734, + "grad_norm": 0.150390625, + "learning_rate": 0.0017904550002974537, + "loss": 0.1357, + "step": 22059 + }, + { + "epoch": 0.19149139330387757, + "grad_norm": 0.083984375, + "learning_rate": 0.0017904359090547145, + "loss": 0.0894, + "step": 22060 
+ }, + { + "epoch": 0.19150007378408174, + "grad_norm": 0.228515625, + "learning_rate": 0.0017904168170569245, + "loss": 0.1021, + "step": 22061 + }, + { + "epoch": 0.1915087542642859, + "grad_norm": 0.58203125, + "learning_rate": 0.0017903977243041054, + "loss": 0.1377, + "step": 22062 + }, + { + "epoch": 0.19151743474449007, + "grad_norm": 0.39453125, + "learning_rate": 0.0017903786307962776, + "loss": 0.1475, + "step": 22063 + }, + { + "epoch": 0.19152611522469423, + "grad_norm": 0.166015625, + "learning_rate": 0.0017903595365334625, + "loss": 0.1108, + "step": 22064 + }, + { + "epoch": 0.1915347957048984, + "grad_norm": 0.443359375, + "learning_rate": 0.00179034044151568, + "loss": 0.0918, + "step": 22065 + }, + { + "epoch": 0.19154347618510256, + "grad_norm": 0.0751953125, + "learning_rate": 0.0017903213457429517, + "loss": 0.0767, + "step": 22066 + }, + { + "epoch": 0.19155215666530673, + "grad_norm": 0.400390625, + "learning_rate": 0.0017903022492152983, + "loss": 0.0947, + "step": 22067 + }, + { + "epoch": 0.1915608371455109, + "grad_norm": 0.12451171875, + "learning_rate": 0.0017902831519327408, + "loss": 0.1299, + "step": 22068 + }, + { + "epoch": 0.19156951762571506, + "grad_norm": 0.73828125, + "learning_rate": 0.0017902640538953003, + "loss": 0.1289, + "step": 22069 + }, + { + "epoch": 0.19157819810591922, + "grad_norm": 0.41015625, + "learning_rate": 0.0017902449551029976, + "loss": 0.0884, + "step": 22070 + }, + { + "epoch": 0.1915868785861234, + "grad_norm": 0.1904296875, + "learning_rate": 0.0017902258555558526, + "loss": 0.0981, + "step": 22071 + }, + { + "epoch": 0.19159555906632755, + "grad_norm": 0.201171875, + "learning_rate": 0.0017902067552538876, + "loss": 0.085, + "step": 22072 + }, + { + "epoch": 0.19160423954653172, + "grad_norm": 0.71484375, + "learning_rate": 0.0017901876541971224, + "loss": 0.124, + "step": 22073 + }, + { + "epoch": 0.19161292002673588, + "grad_norm": 1.03125, + "learning_rate": 0.0017901685523855789, + "loss": 0.0874, + "step": 22074 + }, + { + "epoch": 0.19162160050694005, + "grad_norm": 0.28125, + "learning_rate": 0.0017901494498192767, + "loss": 0.1133, + "step": 22075 + }, + { + "epoch": 0.1916302809871442, + "grad_norm": 0.3984375, + "learning_rate": 0.001790130346498238, + "loss": 0.1177, + "step": 22076 + }, + { + "epoch": 0.19163896146734838, + "grad_norm": 0.1279296875, + "learning_rate": 0.001790111242422483, + "loss": 0.1123, + "step": 22077 + }, + { + "epoch": 0.19164764194755254, + "grad_norm": 0.07763671875, + "learning_rate": 0.0017900921375920327, + "loss": 0.0791, + "step": 22078 + }, + { + "epoch": 0.1916563224277567, + "grad_norm": 0.5703125, + "learning_rate": 0.001790073032006908, + "loss": 0.1055, + "step": 22079 + }, + { + "epoch": 0.19166500290796087, + "grad_norm": 0.09326171875, + "learning_rate": 0.0017900539256671298, + "loss": 0.1191, + "step": 22080 + }, + { + "epoch": 0.19167368338816504, + "grad_norm": 0.314453125, + "learning_rate": 0.0017900348185727192, + "loss": 0.1055, + "step": 22081 + }, + { + "epoch": 0.1916823638683692, + "grad_norm": 0.51953125, + "learning_rate": 0.0017900157107236963, + "loss": 0.1582, + "step": 22082 + }, + { + "epoch": 0.19169104434857337, + "grad_norm": 0.53125, + "learning_rate": 0.001789996602120083, + "loss": 0.0952, + "step": 22083 + }, + { + "epoch": 0.19169972482877753, + "grad_norm": 0.150390625, + "learning_rate": 0.0017899774927619, + "loss": 0.1777, + "step": 22084 + }, + { + "epoch": 0.1917084053089817, + "grad_norm": 0.2412109375, + "learning_rate": 
0.0017899583826491677, + "loss": 0.127, + "step": 22085 + }, + { + "epoch": 0.19171708578918586, + "grad_norm": 0.357421875, + "learning_rate": 0.0017899392717819072, + "loss": 0.1064, + "step": 22086 + }, + { + "epoch": 0.19172576626939003, + "grad_norm": 0.734375, + "learning_rate": 0.0017899201601601397, + "loss": 0.0835, + "step": 22087 + }, + { + "epoch": 0.1917344467495942, + "grad_norm": 0.462890625, + "learning_rate": 0.0017899010477838856, + "loss": 0.1133, + "step": 22088 + }, + { + "epoch": 0.19174312722979836, + "grad_norm": 0.64453125, + "learning_rate": 0.0017898819346531664, + "loss": 0.125, + "step": 22089 + }, + { + "epoch": 0.19175180771000253, + "grad_norm": 0.107421875, + "learning_rate": 0.0017898628207680023, + "loss": 0.1182, + "step": 22090 + }, + { + "epoch": 0.1917604881902067, + "grad_norm": 0.33984375, + "learning_rate": 0.001789843706128415, + "loss": 0.1064, + "step": 22091 + }, + { + "epoch": 0.19176916867041086, + "grad_norm": 0.1025390625, + "learning_rate": 0.0017898245907344247, + "loss": 0.0923, + "step": 22092 + }, + { + "epoch": 0.19177784915061502, + "grad_norm": 1.125, + "learning_rate": 0.0017898054745860527, + "loss": 0.1172, + "step": 22093 + }, + { + "epoch": 0.19178652963081919, + "grad_norm": 0.14453125, + "learning_rate": 0.0017897863576833198, + "loss": 0.0791, + "step": 22094 + }, + { + "epoch": 0.19179521011102335, + "grad_norm": 0.451171875, + "learning_rate": 0.001789767240026247, + "loss": 0.0938, + "step": 22095 + }, + { + "epoch": 0.19180389059122752, + "grad_norm": 0.5859375, + "learning_rate": 0.001789748121614855, + "loss": 0.1289, + "step": 22096 + }, + { + "epoch": 0.19181257107143168, + "grad_norm": 0.478515625, + "learning_rate": 0.0017897290024491647, + "loss": 0.0767, + "step": 22097 + }, + { + "epoch": 0.19182125155163585, + "grad_norm": 0.36328125, + "learning_rate": 0.0017897098825291973, + "loss": 0.1348, + "step": 22098 + }, + { + "epoch": 0.19182993203184, + "grad_norm": 0.12109375, + "learning_rate": 0.0017896907618549735, + "loss": 0.0796, + "step": 22099 + }, + { + "epoch": 0.19183861251204418, + "grad_norm": 0.306640625, + "learning_rate": 0.0017896716404265142, + "loss": 0.1426, + "step": 22100 + }, + { + "epoch": 0.19184729299224834, + "grad_norm": 0.337890625, + "learning_rate": 0.0017896525182438403, + "loss": 0.1123, + "step": 22101 + }, + { + "epoch": 0.1918559734724525, + "grad_norm": 0.24609375, + "learning_rate": 0.001789633395306973, + "loss": 0.0986, + "step": 22102 + }, + { + "epoch": 0.19186465395265667, + "grad_norm": 0.515625, + "learning_rate": 0.0017896142716159328, + "loss": 0.3125, + "step": 22103 + }, + { + "epoch": 0.19187333443286084, + "grad_norm": 0.314453125, + "learning_rate": 0.0017895951471707408, + "loss": 0.1328, + "step": 22104 + }, + { + "epoch": 0.191882014913065, + "grad_norm": 0.091796875, + "learning_rate": 0.001789576021971418, + "loss": 0.0889, + "step": 22105 + }, + { + "epoch": 0.19189069539326917, + "grad_norm": 0.439453125, + "learning_rate": 0.001789556896017985, + "loss": 0.0889, + "step": 22106 + }, + { + "epoch": 0.19189937587347333, + "grad_norm": 0.427734375, + "learning_rate": 0.001789537769310463, + "loss": 0.1279, + "step": 22107 + }, + { + "epoch": 0.1919080563536775, + "grad_norm": 0.486328125, + "learning_rate": 0.0017895186418488732, + "loss": 0.1426, + "step": 22108 + }, + { + "epoch": 0.19191673683388166, + "grad_norm": 0.181640625, + "learning_rate": 0.001789499513633236, + "loss": 0.1162, + "step": 22109 + }, + { + "epoch": 0.19192541731408583, + "grad_norm": 
0.140625, + "learning_rate": 0.001789480384663572, + "loss": 0.1094, + "step": 22110 + }, + { + "epoch": 0.19193409779429, + "grad_norm": 0.13671875, + "learning_rate": 0.0017894612549399032, + "loss": 0.0957, + "step": 22111 + }, + { + "epoch": 0.19194277827449416, + "grad_norm": 0.5390625, + "learning_rate": 0.0017894421244622496, + "loss": 0.0845, + "step": 22112 + }, + { + "epoch": 0.19195145875469832, + "grad_norm": 0.2255859375, + "learning_rate": 0.0017894229932306327, + "loss": 0.1211, + "step": 22113 + }, + { + "epoch": 0.1919601392349025, + "grad_norm": 0.296875, + "learning_rate": 0.001789403861245073, + "loss": 0.1074, + "step": 22114 + }, + { + "epoch": 0.19196881971510665, + "grad_norm": 0.2099609375, + "learning_rate": 0.0017893847285055918, + "loss": 0.1562, + "step": 22115 + }, + { + "epoch": 0.19197750019531082, + "grad_norm": 0.6640625, + "learning_rate": 0.0017893655950122097, + "loss": 0.1055, + "step": 22116 + }, + { + "epoch": 0.19198618067551496, + "grad_norm": 0.09423828125, + "learning_rate": 0.001789346460764948, + "loss": 0.1011, + "step": 22117 + }, + { + "epoch": 0.19199486115571912, + "grad_norm": 0.150390625, + "learning_rate": 0.001789327325763827, + "loss": 0.1016, + "step": 22118 + }, + { + "epoch": 0.19200354163592329, + "grad_norm": 0.306640625, + "learning_rate": 0.001789308190008868, + "loss": 0.1309, + "step": 22119 + }, + { + "epoch": 0.19201222211612745, + "grad_norm": 0.21484375, + "learning_rate": 0.001789289053500092, + "loss": 0.0972, + "step": 22120 + }, + { + "epoch": 0.19202090259633162, + "grad_norm": 0.298828125, + "learning_rate": 0.00178926991623752, + "loss": 0.1084, + "step": 22121 + }, + { + "epoch": 0.19202958307653578, + "grad_norm": 1.09375, + "learning_rate": 0.0017892507782211727, + "loss": 0.1221, + "step": 22122 + }, + { + "epoch": 0.19203826355673995, + "grad_norm": 0.578125, + "learning_rate": 0.0017892316394510707, + "loss": 0.1357, + "step": 22123 + }, + { + "epoch": 0.1920469440369441, + "grad_norm": 0.189453125, + "learning_rate": 0.001789212499927236, + "loss": 0.0918, + "step": 22124 + }, + { + "epoch": 0.19205562451714828, + "grad_norm": 0.50390625, + "learning_rate": 0.0017891933596496884, + "loss": 0.106, + "step": 22125 + }, + { + "epoch": 0.19206430499735244, + "grad_norm": 0.44140625, + "learning_rate": 0.0017891742186184494, + "loss": 0.1162, + "step": 22126 + }, + { + "epoch": 0.1920729854775566, + "grad_norm": 0.09912109375, + "learning_rate": 0.0017891550768335401, + "loss": 0.1113, + "step": 22127 + }, + { + "epoch": 0.19208166595776077, + "grad_norm": 0.6328125, + "learning_rate": 0.0017891359342949806, + "loss": 0.1064, + "step": 22128 + }, + { + "epoch": 0.19209034643796494, + "grad_norm": 0.5546875, + "learning_rate": 0.001789116791002793, + "loss": 0.1797, + "step": 22129 + }, + { + "epoch": 0.1920990269181691, + "grad_norm": 0.12890625, + "learning_rate": 0.0017890976469569972, + "loss": 0.1045, + "step": 22130 + }, + { + "epoch": 0.19210770739837327, + "grad_norm": 0.271484375, + "learning_rate": 0.001789078502157615, + "loss": 0.0718, + "step": 22131 + }, + { + "epoch": 0.19211638787857743, + "grad_norm": 0.2041015625, + "learning_rate": 0.0017890593566046665, + "loss": 0.1035, + "step": 22132 + }, + { + "epoch": 0.1921250683587816, + "grad_norm": 0.470703125, + "learning_rate": 0.001789040210298173, + "loss": 0.1504, + "step": 22133 + }, + { + "epoch": 0.19213374883898576, + "grad_norm": 1.0, + "learning_rate": 0.0017890210632381558, + "loss": 0.1348, + "step": 22134 + }, + { + "epoch": 
0.19214242931918993, + "grad_norm": 0.2333984375, + "learning_rate": 0.0017890019154246353, + "loss": 0.1108, + "step": 22135 + }, + { + "epoch": 0.1921511097993941, + "grad_norm": 0.74609375, + "learning_rate": 0.0017889827668576327, + "loss": 0.1201, + "step": 22136 + }, + { + "epoch": 0.19215979027959826, + "grad_norm": 0.1533203125, + "learning_rate": 0.0017889636175371692, + "loss": 0.1035, + "step": 22137 + }, + { + "epoch": 0.19216847075980242, + "grad_norm": 0.4296875, + "learning_rate": 0.001788944467463265, + "loss": 0.1133, + "step": 22138 + }, + { + "epoch": 0.1921771512400066, + "grad_norm": 0.1982421875, + "learning_rate": 0.0017889253166359417, + "loss": 0.125, + "step": 22139 + }, + { + "epoch": 0.19218583172021075, + "grad_norm": 0.21875, + "learning_rate": 0.0017889061650552198, + "loss": 0.1064, + "step": 22140 + }, + { + "epoch": 0.19219451220041492, + "grad_norm": 0.119140625, + "learning_rate": 0.0017888870127211207, + "loss": 0.1201, + "step": 22141 + }, + { + "epoch": 0.19220319268061908, + "grad_norm": 0.5390625, + "learning_rate": 0.0017888678596336652, + "loss": 0.1113, + "step": 22142 + }, + { + "epoch": 0.19221187316082325, + "grad_norm": 0.1142578125, + "learning_rate": 0.0017888487057928738, + "loss": 0.0874, + "step": 22143 + }, + { + "epoch": 0.19222055364102741, + "grad_norm": 0.11328125, + "learning_rate": 0.0017888295511987683, + "loss": 0.0879, + "step": 22144 + }, + { + "epoch": 0.19222923412123158, + "grad_norm": 0.40234375, + "learning_rate": 0.0017888103958513687, + "loss": 0.1328, + "step": 22145 + }, + { + "epoch": 0.19223791460143574, + "grad_norm": 0.45703125, + "learning_rate": 0.0017887912397506964, + "loss": 0.1582, + "step": 22146 + }, + { + "epoch": 0.1922465950816399, + "grad_norm": 1.109375, + "learning_rate": 0.0017887720828967727, + "loss": 0.1147, + "step": 22147 + }, + { + "epoch": 0.19225527556184407, + "grad_norm": 0.56640625, + "learning_rate": 0.001788752925289618, + "loss": 0.0918, + "step": 22148 + }, + { + "epoch": 0.19226395604204824, + "grad_norm": 0.0947265625, + "learning_rate": 0.0017887337669292534, + "loss": 0.1152, + "step": 22149 + }, + { + "epoch": 0.1922726365222524, + "grad_norm": 0.09716796875, + "learning_rate": 0.0017887146078157002, + "loss": 0.1074, + "step": 22150 + }, + { + "epoch": 0.19228131700245657, + "grad_norm": 0.279296875, + "learning_rate": 0.0017886954479489786, + "loss": 0.1118, + "step": 22151 + }, + { + "epoch": 0.19228999748266073, + "grad_norm": 0.97265625, + "learning_rate": 0.0017886762873291103, + "loss": 0.1387, + "step": 22152 + }, + { + "epoch": 0.1922986779628649, + "grad_norm": 0.09521484375, + "learning_rate": 0.001788657125956116, + "loss": 0.1396, + "step": 22153 + }, + { + "epoch": 0.19230735844306907, + "grad_norm": 0.236328125, + "learning_rate": 0.0017886379638300165, + "loss": 0.1016, + "step": 22154 + }, + { + "epoch": 0.19231603892327323, + "grad_norm": 0.1103515625, + "learning_rate": 0.0017886188009508328, + "loss": 0.125, + "step": 22155 + }, + { + "epoch": 0.1923247194034774, + "grad_norm": 0.263671875, + "learning_rate": 0.001788599637318586, + "loss": 0.1328, + "step": 22156 + }, + { + "epoch": 0.19233339988368156, + "grad_norm": 1.609375, + "learning_rate": 0.0017885804729332968, + "loss": 0.127, + "step": 22157 + }, + { + "epoch": 0.19234208036388573, + "grad_norm": 0.66015625, + "learning_rate": 0.0017885613077949867, + "loss": 0.1201, + "step": 22158 + }, + { + "epoch": 0.1923507608440899, + "grad_norm": 0.56640625, + "learning_rate": 0.001788542141903676, + "loss": 
0.0913, + "step": 22159 + }, + { + "epoch": 0.19235944132429406, + "grad_norm": 0.3125, + "learning_rate": 0.0017885229752593861, + "loss": 0.1201, + "step": 22160 + }, + { + "epoch": 0.19236812180449822, + "grad_norm": 0.50390625, + "learning_rate": 0.0017885038078621377, + "loss": 0.1147, + "step": 22161 + }, + { + "epoch": 0.1923768022847024, + "grad_norm": 0.228515625, + "learning_rate": 0.0017884846397119524, + "loss": 0.126, + "step": 22162 + }, + { + "epoch": 0.19238548276490655, + "grad_norm": 0.69921875, + "learning_rate": 0.00178846547080885, + "loss": 0.1377, + "step": 22163 + }, + { + "epoch": 0.19239416324511072, + "grad_norm": 0.1748046875, + "learning_rate": 0.0017884463011528522, + "loss": 0.1128, + "step": 22164 + }, + { + "epoch": 0.19240284372531488, + "grad_norm": 0.1728515625, + "learning_rate": 0.00178842713074398, + "loss": 0.1338, + "step": 22165 + }, + { + "epoch": 0.19241152420551905, + "grad_norm": 0.2333984375, + "learning_rate": 0.0017884079595822545, + "loss": 0.0728, + "step": 22166 + }, + { + "epoch": 0.1924202046857232, + "grad_norm": 0.236328125, + "learning_rate": 0.001788388787667696, + "loss": 0.126, + "step": 22167 + }, + { + "epoch": 0.19242888516592738, + "grad_norm": 0.87890625, + "learning_rate": 0.001788369615000326, + "loss": 0.0977, + "step": 22168 + }, + { + "epoch": 0.19243756564613154, + "grad_norm": 0.0859375, + "learning_rate": 0.0017883504415801652, + "loss": 0.0859, + "step": 22169 + }, + { + "epoch": 0.1924462461263357, + "grad_norm": 0.1455078125, + "learning_rate": 0.001788331267407235, + "loss": 0.123, + "step": 22170 + }, + { + "epoch": 0.19245492660653987, + "grad_norm": 0.16015625, + "learning_rate": 0.0017883120924815556, + "loss": 0.1201, + "step": 22171 + }, + { + "epoch": 0.19246360708674404, + "grad_norm": 0.1962890625, + "learning_rate": 0.001788292916803149, + "loss": 0.1465, + "step": 22172 + }, + { + "epoch": 0.1924722875669482, + "grad_norm": 0.60546875, + "learning_rate": 0.0017882737403720351, + "loss": 0.1016, + "step": 22173 + }, + { + "epoch": 0.19248096804715237, + "grad_norm": 0.1669921875, + "learning_rate": 0.0017882545631882358, + "loss": 0.1426, + "step": 22174 + }, + { + "epoch": 0.19248964852735653, + "grad_norm": 0.11962890625, + "learning_rate": 0.0017882353852517712, + "loss": 0.0845, + "step": 22175 + }, + { + "epoch": 0.1924983290075607, + "grad_norm": 0.23828125, + "learning_rate": 0.0017882162065626633, + "loss": 0.1079, + "step": 22176 + }, + { + "epoch": 0.19250700948776486, + "grad_norm": 0.369140625, + "learning_rate": 0.001788197027120932, + "loss": 0.1094, + "step": 22177 + }, + { + "epoch": 0.19251568996796903, + "grad_norm": 0.1826171875, + "learning_rate": 0.0017881778469265991, + "loss": 0.126, + "step": 22178 + }, + { + "epoch": 0.1925243704481732, + "grad_norm": 0.072265625, + "learning_rate": 0.0017881586659796852, + "loss": 0.0962, + "step": 22179 + }, + { + "epoch": 0.19253305092837736, + "grad_norm": 0.111328125, + "learning_rate": 0.001788139484280211, + "loss": 0.0713, + "step": 22180 + }, + { + "epoch": 0.19254173140858152, + "grad_norm": 0.326171875, + "learning_rate": 0.001788120301828198, + "loss": 0.1235, + "step": 22181 + }, + { + "epoch": 0.1925504118887857, + "grad_norm": 0.1552734375, + "learning_rate": 0.0017881011186236674, + "loss": 0.1045, + "step": 22182 + }, + { + "epoch": 0.19255909236898985, + "grad_norm": 0.134765625, + "learning_rate": 0.001788081934666639, + "loss": 0.1094, + "step": 22183 + }, + { + "epoch": 0.19256777284919402, + "grad_norm": 0.10791015625, + 
"learning_rate": 0.001788062749957135, + "loss": 0.085, + "step": 22184 + }, + { + "epoch": 0.19257645332939818, + "grad_norm": 0.330078125, + "learning_rate": 0.0017880435644951759, + "loss": 0.0771, + "step": 22185 + }, + { + "epoch": 0.19258513380960235, + "grad_norm": 0.0927734375, + "learning_rate": 0.0017880243782807826, + "loss": 0.0981, + "step": 22186 + }, + { + "epoch": 0.19259381428980651, + "grad_norm": 0.34765625, + "learning_rate": 0.0017880051913139758, + "loss": 0.1104, + "step": 22187 + }, + { + "epoch": 0.19260249477001068, + "grad_norm": 0.28125, + "learning_rate": 0.0017879860035947778, + "loss": 0.0908, + "step": 22188 + }, + { + "epoch": 0.19261117525021484, + "grad_norm": 0.482421875, + "learning_rate": 0.001787966815123208, + "loss": 0.1069, + "step": 22189 + }, + { + "epoch": 0.192619855730419, + "grad_norm": 0.25390625, + "learning_rate": 0.001787947625899288, + "loss": 0.0918, + "step": 22190 + }, + { + "epoch": 0.19262853621062317, + "grad_norm": 0.4921875, + "learning_rate": 0.0017879284359230388, + "loss": 0.1191, + "step": 22191 + }, + { + "epoch": 0.19263721669082734, + "grad_norm": 0.66796875, + "learning_rate": 0.0017879092451944813, + "loss": 0.124, + "step": 22192 + }, + { + "epoch": 0.1926458971710315, + "grad_norm": 0.455078125, + "learning_rate": 0.0017878900537136371, + "loss": 0.165, + "step": 22193 + }, + { + "epoch": 0.19265457765123567, + "grad_norm": 0.283203125, + "learning_rate": 0.0017878708614805265, + "loss": 0.0903, + "step": 22194 + }, + { + "epoch": 0.19266325813143984, + "grad_norm": 1.640625, + "learning_rate": 0.0017878516684951703, + "loss": 0.1484, + "step": 22195 + }, + { + "epoch": 0.192671938611644, + "grad_norm": 0.1513671875, + "learning_rate": 0.0017878324747575903, + "loss": 0.1226, + "step": 22196 + }, + { + "epoch": 0.19268061909184817, + "grad_norm": 0.1923828125, + "learning_rate": 0.0017878132802678067, + "loss": 0.1621, + "step": 22197 + }, + { + "epoch": 0.19268929957205233, + "grad_norm": 0.09375, + "learning_rate": 0.001787794085025841, + "loss": 0.0698, + "step": 22198 + }, + { + "epoch": 0.1926979800522565, + "grad_norm": 0.62109375, + "learning_rate": 0.0017877748890317141, + "loss": 0.1099, + "step": 22199 + }, + { + "epoch": 0.19270666053246066, + "grad_norm": 0.08544921875, + "learning_rate": 0.0017877556922854464, + "loss": 0.1211, + "step": 22200 + }, + { + "epoch": 0.19271534101266483, + "grad_norm": 0.1826171875, + "learning_rate": 0.0017877364947870601, + "loss": 0.1211, + "step": 22201 + }, + { + "epoch": 0.192724021492869, + "grad_norm": 0.193359375, + "learning_rate": 0.001787717296536575, + "loss": 0.0996, + "step": 22202 + }, + { + "epoch": 0.19273270197307316, + "grad_norm": 0.1103515625, + "learning_rate": 0.0017876980975340129, + "loss": 0.1089, + "step": 22203 + }, + { + "epoch": 0.19274138245327732, + "grad_norm": 0.345703125, + "learning_rate": 0.0017876788977793945, + "loss": 0.1084, + "step": 22204 + }, + { + "epoch": 0.1927500629334815, + "grad_norm": 0.59765625, + "learning_rate": 0.0017876596972727407, + "loss": 0.1494, + "step": 22205 + }, + { + "epoch": 0.19275874341368565, + "grad_norm": 0.1328125, + "learning_rate": 0.0017876404960140725, + "loss": 0.1187, + "step": 22206 + }, + { + "epoch": 0.19276742389388982, + "grad_norm": 0.115234375, + "learning_rate": 0.0017876212940034111, + "loss": 0.124, + "step": 22207 + }, + { + "epoch": 0.19277610437409398, + "grad_norm": 0.2412109375, + "learning_rate": 0.0017876020912407774, + "loss": 0.1021, + "step": 22208 + }, + { + "epoch": 
0.19278478485429815, + "grad_norm": 0.28515625, + "learning_rate": 0.0017875828877261923, + "loss": 0.1162, + "step": 22209 + }, + { + "epoch": 0.1927934653345023, + "grad_norm": 0.1162109375, + "learning_rate": 0.001787563683459677, + "loss": 0.0957, + "step": 22210 + }, + { + "epoch": 0.19280214581470648, + "grad_norm": 0.435546875, + "learning_rate": 0.0017875444784412522, + "loss": 0.166, + "step": 22211 + }, + { + "epoch": 0.19281082629491064, + "grad_norm": 0.2158203125, + "learning_rate": 0.0017875252726709393, + "loss": 0.1025, + "step": 22212 + }, + { + "epoch": 0.1928195067751148, + "grad_norm": 0.255859375, + "learning_rate": 0.001787506066148759, + "loss": 0.1216, + "step": 22213 + }, + { + "epoch": 0.19282818725531897, + "grad_norm": 0.18359375, + "learning_rate": 0.0017874868588747324, + "loss": 0.1016, + "step": 22214 + }, + { + "epoch": 0.19283686773552314, + "grad_norm": 0.1455078125, + "learning_rate": 0.0017874676508488803, + "loss": 0.1182, + "step": 22215 + }, + { + "epoch": 0.1928455482157273, + "grad_norm": 0.22265625, + "learning_rate": 0.001787448442071224, + "loss": 0.1162, + "step": 22216 + }, + { + "epoch": 0.19285422869593147, + "grad_norm": 0.431640625, + "learning_rate": 0.0017874292325417848, + "loss": 0.127, + "step": 22217 + }, + { + "epoch": 0.19286290917613563, + "grad_norm": 0.35546875, + "learning_rate": 0.0017874100222605826, + "loss": 0.127, + "step": 22218 + }, + { + "epoch": 0.1928715896563398, + "grad_norm": 0.1845703125, + "learning_rate": 0.00178739081122764, + "loss": 0.0986, + "step": 22219 + }, + { + "epoch": 0.19288027013654396, + "grad_norm": 0.3828125, + "learning_rate": 0.0017873715994429764, + "loss": 0.166, + "step": 22220 + }, + { + "epoch": 0.19288895061674813, + "grad_norm": 0.146484375, + "learning_rate": 0.001787352386906614, + "loss": 0.1069, + "step": 22221 + }, + { + "epoch": 0.1928976310969523, + "grad_norm": 0.3203125, + "learning_rate": 0.0017873331736185732, + "loss": 0.0767, + "step": 22222 + }, + { + "epoch": 0.19290631157715646, + "grad_norm": 0.0791015625, + "learning_rate": 0.0017873139595788751, + "loss": 0.0688, + "step": 22223 + }, + { + "epoch": 0.19291499205736062, + "grad_norm": 0.86328125, + "learning_rate": 0.001787294744787541, + "loss": 0.0869, + "step": 22224 + }, + { + "epoch": 0.1929236725375648, + "grad_norm": 0.51953125, + "learning_rate": 0.0017872755292445916, + "loss": 0.1367, + "step": 22225 + }, + { + "epoch": 0.19293235301776895, + "grad_norm": 0.234375, + "learning_rate": 0.001787256312950048, + "loss": 0.1055, + "step": 22226 + }, + { + "epoch": 0.19294103349797312, + "grad_norm": 0.498046875, + "learning_rate": 0.0017872370959039307, + "loss": 0.1826, + "step": 22227 + }, + { + "epoch": 0.19294971397817728, + "grad_norm": 0.2041015625, + "learning_rate": 0.0017872178781062618, + "loss": 0.1328, + "step": 22228 + }, + { + "epoch": 0.19295839445838145, + "grad_norm": 0.6328125, + "learning_rate": 0.0017871986595570618, + "loss": 0.1338, + "step": 22229 + }, + { + "epoch": 0.19296707493858561, + "grad_norm": 0.12109375, + "learning_rate": 0.0017871794402563514, + "loss": 0.0957, + "step": 22230 + }, + { + "epoch": 0.19297575541878978, + "grad_norm": 0.96484375, + "learning_rate": 0.001787160220204152, + "loss": 0.1084, + "step": 22231 + }, + { + "epoch": 0.19298443589899394, + "grad_norm": 0.5, + "learning_rate": 0.0017871409994004847, + "loss": 0.1377, + "step": 22232 + }, + { + "epoch": 0.1929931163791981, + "grad_norm": 0.68359375, + "learning_rate": 0.0017871217778453702, + "loss": 0.1289, + 
"step": 22233 + }, + { + "epoch": 0.19300179685940227, + "grad_norm": 0.1328125, + "learning_rate": 0.0017871025555388296, + "loss": 0.1025, + "step": 22234 + }, + { + "epoch": 0.19301047733960644, + "grad_norm": 0.26171875, + "learning_rate": 0.0017870833324808838, + "loss": 0.1123, + "step": 22235 + }, + { + "epoch": 0.1930191578198106, + "grad_norm": 0.671875, + "learning_rate": 0.001787064108671554, + "loss": 0.124, + "step": 22236 + }, + { + "epoch": 0.19302783830001477, + "grad_norm": 0.4609375, + "learning_rate": 0.0017870448841108616, + "loss": 0.0996, + "step": 22237 + }, + { + "epoch": 0.19303651878021894, + "grad_norm": 0.291015625, + "learning_rate": 0.001787025658798827, + "loss": 0.1328, + "step": 22238 + }, + { + "epoch": 0.1930451992604231, + "grad_norm": 0.4375, + "learning_rate": 0.0017870064327354714, + "loss": 0.0889, + "step": 22239 + }, + { + "epoch": 0.19305387974062724, + "grad_norm": 0.306640625, + "learning_rate": 0.0017869872059208159, + "loss": 0.1035, + "step": 22240 + }, + { + "epoch": 0.1930625602208314, + "grad_norm": 0.1669921875, + "learning_rate": 0.0017869679783548815, + "loss": 0.1201, + "step": 22241 + }, + { + "epoch": 0.19307124070103557, + "grad_norm": 0.9921875, + "learning_rate": 0.0017869487500376895, + "loss": 0.1182, + "step": 22242 + }, + { + "epoch": 0.19307992118123973, + "grad_norm": 0.90234375, + "learning_rate": 0.0017869295209692604, + "loss": 0.1309, + "step": 22243 + }, + { + "epoch": 0.1930886016614439, + "grad_norm": 0.11279296875, + "learning_rate": 0.0017869102911496155, + "loss": 0.1006, + "step": 22244 + }, + { + "epoch": 0.19309728214164806, + "grad_norm": 0.1357421875, + "learning_rate": 0.001786891060578776, + "loss": 0.1162, + "step": 22245 + }, + { + "epoch": 0.19310596262185223, + "grad_norm": 0.1533203125, + "learning_rate": 0.0017868718292567626, + "loss": 0.1387, + "step": 22246 + }, + { + "epoch": 0.1931146431020564, + "grad_norm": 0.373046875, + "learning_rate": 0.0017868525971835966, + "loss": 0.1138, + "step": 22247 + }, + { + "epoch": 0.19312332358226056, + "grad_norm": 0.2001953125, + "learning_rate": 0.0017868333643592986, + "loss": 0.0996, + "step": 22248 + }, + { + "epoch": 0.19313200406246472, + "grad_norm": 0.30859375, + "learning_rate": 0.0017868141307838903, + "loss": 0.1055, + "step": 22249 + }, + { + "epoch": 0.1931406845426689, + "grad_norm": 0.400390625, + "learning_rate": 0.0017867948964573927, + "loss": 0.0728, + "step": 22250 + }, + { + "epoch": 0.19314936502287305, + "grad_norm": 0.2255859375, + "learning_rate": 0.001786775661379826, + "loss": 0.1045, + "step": 22251 + }, + { + "epoch": 0.19315804550307722, + "grad_norm": 0.07763671875, + "learning_rate": 0.001786756425551212, + "loss": 0.0884, + "step": 22252 + }, + { + "epoch": 0.19316672598328138, + "grad_norm": 0.4140625, + "learning_rate": 0.0017867371889715709, + "loss": 0.0869, + "step": 22253 + }, + { + "epoch": 0.19317540646348555, + "grad_norm": 1.0234375, + "learning_rate": 0.0017867179516409249, + "loss": 0.1992, + "step": 22254 + }, + { + "epoch": 0.19318408694368971, + "grad_norm": 0.796875, + "learning_rate": 0.0017866987135592945, + "loss": 0.1104, + "step": 22255 + }, + { + "epoch": 0.19319276742389388, + "grad_norm": 0.197265625, + "learning_rate": 0.0017866794747267007, + "loss": 0.1309, + "step": 22256 + }, + { + "epoch": 0.19320144790409804, + "grad_norm": 0.255859375, + "learning_rate": 0.0017866602351431643, + "loss": 0.1074, + "step": 22257 + }, + { + "epoch": 0.1932101283843022, + "grad_norm": 0.171875, + "learning_rate": 
0.0017866409948087065, + "loss": 0.127, + "step": 22258 + }, + { + "epoch": 0.19321880886450638, + "grad_norm": 0.5078125, + "learning_rate": 0.0017866217537233487, + "loss": 0.1025, + "step": 22259 + }, + { + "epoch": 0.19322748934471054, + "grad_norm": 0.1494140625, + "learning_rate": 0.0017866025118871117, + "loss": 0.1475, + "step": 22260 + }, + { + "epoch": 0.1932361698249147, + "grad_norm": 0.4140625, + "learning_rate": 0.0017865832693000163, + "loss": 0.0947, + "step": 22261 + }, + { + "epoch": 0.19324485030511887, + "grad_norm": 0.50390625, + "learning_rate": 0.0017865640259620839, + "loss": 0.123, + "step": 22262 + }, + { + "epoch": 0.19325353078532304, + "grad_norm": 0.1640625, + "learning_rate": 0.0017865447818733353, + "loss": 0.0928, + "step": 22263 + }, + { + "epoch": 0.1932622112655272, + "grad_norm": 0.333984375, + "learning_rate": 0.0017865255370337917, + "loss": 0.1099, + "step": 22264 + }, + { + "epoch": 0.19327089174573137, + "grad_norm": 0.369140625, + "learning_rate": 0.0017865062914434742, + "loss": 0.1094, + "step": 22265 + }, + { + "epoch": 0.19327957222593553, + "grad_norm": 0.51953125, + "learning_rate": 0.0017864870451024036, + "loss": 0.1016, + "step": 22266 + }, + { + "epoch": 0.1932882527061397, + "grad_norm": 0.8828125, + "learning_rate": 0.001786467798010601, + "loss": 0.1592, + "step": 22267 + }, + { + "epoch": 0.19329693318634386, + "grad_norm": 0.75, + "learning_rate": 0.001786448550168088, + "loss": 0.1406, + "step": 22268 + }, + { + "epoch": 0.19330561366654803, + "grad_norm": 0.1611328125, + "learning_rate": 0.0017864293015748844, + "loss": 0.1182, + "step": 22269 + }, + { + "epoch": 0.1933142941467522, + "grad_norm": 0.142578125, + "learning_rate": 0.0017864100522310125, + "loss": 0.1182, + "step": 22270 + }, + { + "epoch": 0.19332297462695636, + "grad_norm": 0.07958984375, + "learning_rate": 0.0017863908021364928, + "loss": 0.0923, + "step": 22271 + }, + { + "epoch": 0.19333165510716052, + "grad_norm": 0.1435546875, + "learning_rate": 0.0017863715512913463, + "loss": 0.0938, + "step": 22272 + }, + { + "epoch": 0.1933403355873647, + "grad_norm": 0.384765625, + "learning_rate": 0.0017863522996955942, + "loss": 0.126, + "step": 22273 + }, + { + "epoch": 0.19334901606756885, + "grad_norm": 0.275390625, + "learning_rate": 0.001786333047349258, + "loss": 0.126, + "step": 22274 + }, + { + "epoch": 0.19335769654777302, + "grad_norm": 0.64453125, + "learning_rate": 0.0017863137942523582, + "loss": 0.1035, + "step": 22275 + }, + { + "epoch": 0.19336637702797718, + "grad_norm": 0.146484375, + "learning_rate": 0.0017862945404049154, + "loss": 0.1001, + "step": 22276 + }, + { + "epoch": 0.19337505750818135, + "grad_norm": 0.2041015625, + "learning_rate": 0.0017862752858069518, + "loss": 0.1133, + "step": 22277 + }, + { + "epoch": 0.1933837379883855, + "grad_norm": 0.150390625, + "learning_rate": 0.0017862560304584875, + "loss": 0.0938, + "step": 22278 + }, + { + "epoch": 0.19339241846858968, + "grad_norm": 0.408203125, + "learning_rate": 0.0017862367743595438, + "loss": 0.1152, + "step": 22279 + }, + { + "epoch": 0.19340109894879384, + "grad_norm": 0.1513671875, + "learning_rate": 0.001786217517510142, + "loss": 0.0791, + "step": 22280 + }, + { + "epoch": 0.193409779428998, + "grad_norm": 1.0625, + "learning_rate": 0.001786198259910303, + "loss": 0.105, + "step": 22281 + }, + { + "epoch": 0.19341845990920217, + "grad_norm": 0.0927734375, + "learning_rate": 0.0017861790015600482, + "loss": 0.1187, + "step": 22282 + }, + { + "epoch": 0.19342714038940634, + 
"grad_norm": 0.16796875, + "learning_rate": 0.001786159742459398, + "loss": 0.0801, + "step": 22283 + }, + { + "epoch": 0.1934358208696105, + "grad_norm": 0.259765625, + "learning_rate": 0.0017861404826083741, + "loss": 0.1201, + "step": 22284 + }, + { + "epoch": 0.19344450134981467, + "grad_norm": 0.205078125, + "learning_rate": 0.0017861212220069969, + "loss": 0.1445, + "step": 22285 + }, + { + "epoch": 0.19345318183001883, + "grad_norm": 0.287109375, + "learning_rate": 0.0017861019606552882, + "loss": 0.0786, + "step": 22286 + }, + { + "epoch": 0.193461862310223, + "grad_norm": 0.099609375, + "learning_rate": 0.0017860826985532686, + "loss": 0.1055, + "step": 22287 + }, + { + "epoch": 0.19347054279042716, + "grad_norm": 0.08984375, + "learning_rate": 0.001786063435700959, + "loss": 0.1445, + "step": 22288 + }, + { + "epoch": 0.19347922327063133, + "grad_norm": 0.34375, + "learning_rate": 0.0017860441720983812, + "loss": 0.1309, + "step": 22289 + }, + { + "epoch": 0.1934879037508355, + "grad_norm": 0.384765625, + "learning_rate": 0.0017860249077455556, + "loss": 0.0928, + "step": 22290 + }, + { + "epoch": 0.19349658423103966, + "grad_norm": 0.169921875, + "learning_rate": 0.0017860056426425032, + "loss": 0.0957, + "step": 22291 + }, + { + "epoch": 0.19350526471124382, + "grad_norm": 0.373046875, + "learning_rate": 0.001785986376789246, + "loss": 0.1216, + "step": 22292 + }, + { + "epoch": 0.193513945191448, + "grad_norm": 0.26953125, + "learning_rate": 0.0017859671101858037, + "loss": 0.1318, + "step": 22293 + }, + { + "epoch": 0.19352262567165215, + "grad_norm": 0.130859375, + "learning_rate": 0.0017859478428321985, + "loss": 0.1084, + "step": 22294 + }, + { + "epoch": 0.19353130615185632, + "grad_norm": 0.84375, + "learning_rate": 0.0017859285747284508, + "loss": 0.1299, + "step": 22295 + }, + { + "epoch": 0.19353998663206048, + "grad_norm": 0.66796875, + "learning_rate": 0.0017859093058745822, + "loss": 0.1079, + "step": 22296 + }, + { + "epoch": 0.19354866711226465, + "grad_norm": 0.11376953125, + "learning_rate": 0.0017858900362706133, + "loss": 0.1309, + "step": 22297 + }, + { + "epoch": 0.19355734759246881, + "grad_norm": 0.15234375, + "learning_rate": 0.0017858707659165651, + "loss": 0.0879, + "step": 22298 + }, + { + "epoch": 0.19356602807267298, + "grad_norm": 0.4375, + "learning_rate": 0.0017858514948124594, + "loss": 0.0952, + "step": 22299 + }, + { + "epoch": 0.19357470855287714, + "grad_norm": 0.185546875, + "learning_rate": 0.0017858322229583163, + "loss": 0.1777, + "step": 22300 + }, + { + "epoch": 0.1935833890330813, + "grad_norm": 0.84375, + "learning_rate": 0.001785812950354158, + "loss": 0.1348, + "step": 22301 + }, + { + "epoch": 0.19359206951328548, + "grad_norm": 0.9765625, + "learning_rate": 0.0017857936770000046, + "loss": 0.1611, + "step": 22302 + }, + { + "epoch": 0.19360074999348964, + "grad_norm": 0.453125, + "learning_rate": 0.0017857744028958779, + "loss": 0.0991, + "step": 22303 + }, + { + "epoch": 0.1936094304736938, + "grad_norm": 0.279296875, + "learning_rate": 0.001785755128041798, + "loss": 0.0884, + "step": 22304 + }, + { + "epoch": 0.19361811095389797, + "grad_norm": 0.5625, + "learning_rate": 0.001785735852437787, + "loss": 0.1162, + "step": 22305 + }, + { + "epoch": 0.19362679143410214, + "grad_norm": 0.25, + "learning_rate": 0.0017857165760838654, + "loss": 0.1079, + "step": 22306 + }, + { + "epoch": 0.1936354719143063, + "grad_norm": 0.5078125, + "learning_rate": 0.0017856972989800544, + "loss": 0.1084, + "step": 22307 + }, + { + "epoch": 
0.19364415239451047, + "grad_norm": 0.3984375, + "learning_rate": 0.0017856780211263754, + "loss": 0.0859, + "step": 22308 + }, + { + "epoch": 0.19365283287471463, + "grad_norm": 0.416015625, + "learning_rate": 0.0017856587425228493, + "loss": 0.0947, + "step": 22309 + }, + { + "epoch": 0.1936615133549188, + "grad_norm": 0.490234375, + "learning_rate": 0.001785639463169497, + "loss": 0.1455, + "step": 22310 + }, + { + "epoch": 0.19367019383512296, + "grad_norm": 0.8125, + "learning_rate": 0.0017856201830663393, + "loss": 0.1455, + "step": 22311 + }, + { + "epoch": 0.19367887431532713, + "grad_norm": 0.10205078125, + "learning_rate": 0.0017856009022133982, + "loss": 0.0996, + "step": 22312 + }, + { + "epoch": 0.1936875547955313, + "grad_norm": 0.134765625, + "learning_rate": 0.0017855816206106938, + "loss": 0.0947, + "step": 22313 + }, + { + "epoch": 0.19369623527573546, + "grad_norm": 0.57421875, + "learning_rate": 0.0017855623382582479, + "loss": 0.1182, + "step": 22314 + }, + { + "epoch": 0.19370491575593962, + "grad_norm": 0.419921875, + "learning_rate": 0.0017855430551560812, + "loss": 0.1484, + "step": 22315 + }, + { + "epoch": 0.1937135962361438, + "grad_norm": 0.443359375, + "learning_rate": 0.0017855237713042152, + "loss": 0.0947, + "step": 22316 + }, + { + "epoch": 0.19372227671634795, + "grad_norm": 0.40625, + "learning_rate": 0.0017855044867026704, + "loss": 0.0903, + "step": 22317 + }, + { + "epoch": 0.19373095719655212, + "grad_norm": 0.1875, + "learning_rate": 0.0017854852013514681, + "loss": 0.1455, + "step": 22318 + }, + { + "epoch": 0.19373963767675628, + "grad_norm": 0.142578125, + "learning_rate": 0.0017854659152506298, + "loss": 0.0879, + "step": 22319 + }, + { + "epoch": 0.19374831815696045, + "grad_norm": 0.271484375, + "learning_rate": 0.0017854466284001763, + "loss": 0.0942, + "step": 22320 + }, + { + "epoch": 0.1937569986371646, + "grad_norm": 1.046875, + "learning_rate": 0.0017854273408001284, + "loss": 0.1152, + "step": 22321 + }, + { + "epoch": 0.19376567911736878, + "grad_norm": 0.52734375, + "learning_rate": 0.0017854080524505077, + "loss": 0.1128, + "step": 22322 + }, + { + "epoch": 0.19377435959757294, + "grad_norm": 0.2109375, + "learning_rate": 0.0017853887633513348, + "loss": 0.1191, + "step": 22323 + }, + { + "epoch": 0.1937830400777771, + "grad_norm": 0.3671875, + "learning_rate": 0.0017853694735026312, + "loss": 0.1338, + "step": 22324 + }, + { + "epoch": 0.19379172055798127, + "grad_norm": 0.1630859375, + "learning_rate": 0.001785350182904418, + "loss": 0.1445, + "step": 22325 + }, + { + "epoch": 0.19380040103818544, + "grad_norm": 0.412109375, + "learning_rate": 0.001785330891556716, + "loss": 0.1118, + "step": 22326 + }, + { + "epoch": 0.1938090815183896, + "grad_norm": 0.0966796875, + "learning_rate": 0.0017853115994595462, + "loss": 0.1016, + "step": 22327 + }, + { + "epoch": 0.19381776199859377, + "grad_norm": 0.255859375, + "learning_rate": 0.00178529230661293, + "loss": 0.1523, + "step": 22328 + }, + { + "epoch": 0.19382644247879793, + "grad_norm": 0.83203125, + "learning_rate": 0.0017852730130168885, + "loss": 0.0791, + "step": 22329 + }, + { + "epoch": 0.1938351229590021, + "grad_norm": 0.1962890625, + "learning_rate": 0.0017852537186714428, + "loss": 0.0942, + "step": 22330 + }, + { + "epoch": 0.19384380343920626, + "grad_norm": 1.0625, + "learning_rate": 0.001785234423576614, + "loss": 0.1367, + "step": 22331 + }, + { + "epoch": 0.19385248391941043, + "grad_norm": 0.69921875, + "learning_rate": 0.001785215127732423, + "loss": 0.1084, + 
"step": 22332 + }, + { + "epoch": 0.1938611643996146, + "grad_norm": 0.57421875, + "learning_rate": 0.0017851958311388908, + "loss": 0.1523, + "step": 22333 + }, + { + "epoch": 0.19386984487981876, + "grad_norm": 0.1826171875, + "learning_rate": 0.0017851765337960394, + "loss": 0.1099, + "step": 22334 + }, + { + "epoch": 0.19387852536002292, + "grad_norm": 0.291015625, + "learning_rate": 0.0017851572357038886, + "loss": 0.0986, + "step": 22335 + }, + { + "epoch": 0.1938872058402271, + "grad_norm": 0.578125, + "learning_rate": 0.0017851379368624604, + "loss": 0.1113, + "step": 22336 + }, + { + "epoch": 0.19389588632043125, + "grad_norm": 0.1201171875, + "learning_rate": 0.0017851186372717757, + "loss": 0.1055, + "step": 22337 + }, + { + "epoch": 0.19390456680063542, + "grad_norm": 0.3125, + "learning_rate": 0.0017850993369318555, + "loss": 0.0981, + "step": 22338 + }, + { + "epoch": 0.19391324728083958, + "grad_norm": 0.14453125, + "learning_rate": 0.0017850800358427208, + "loss": 0.127, + "step": 22339 + }, + { + "epoch": 0.19392192776104375, + "grad_norm": 1.6328125, + "learning_rate": 0.0017850607340043933, + "loss": 0.1084, + "step": 22340 + }, + { + "epoch": 0.19393060824124791, + "grad_norm": 0.55859375, + "learning_rate": 0.0017850414314168933, + "loss": 0.1138, + "step": 22341 + }, + { + "epoch": 0.19393928872145208, + "grad_norm": 0.236328125, + "learning_rate": 0.0017850221280802424, + "loss": 0.0771, + "step": 22342 + }, + { + "epoch": 0.19394796920165625, + "grad_norm": 0.14453125, + "learning_rate": 0.0017850028239944616, + "loss": 0.1367, + "step": 22343 + }, + { + "epoch": 0.1939566496818604, + "grad_norm": 0.447265625, + "learning_rate": 0.0017849835191595719, + "loss": 0.0918, + "step": 22344 + }, + { + "epoch": 0.19396533016206458, + "grad_norm": 0.58203125, + "learning_rate": 0.0017849642135755945, + "loss": 0.1279, + "step": 22345 + }, + { + "epoch": 0.19397401064226874, + "grad_norm": 0.28125, + "learning_rate": 0.0017849449072425507, + "loss": 0.1836, + "step": 22346 + }, + { + "epoch": 0.1939826911224729, + "grad_norm": 0.71875, + "learning_rate": 0.0017849256001604612, + "loss": 0.1182, + "step": 22347 + }, + { + "epoch": 0.19399137160267707, + "grad_norm": 0.1259765625, + "learning_rate": 0.0017849062923293475, + "loss": 0.1104, + "step": 22348 + }, + { + "epoch": 0.19400005208288124, + "grad_norm": 0.0830078125, + "learning_rate": 0.0017848869837492306, + "loss": 0.0801, + "step": 22349 + }, + { + "epoch": 0.1940087325630854, + "grad_norm": 0.298828125, + "learning_rate": 0.0017848676744201316, + "loss": 0.126, + "step": 22350 + }, + { + "epoch": 0.19401741304328957, + "grad_norm": 1.0078125, + "learning_rate": 0.0017848483643420714, + "loss": 0.0742, + "step": 22351 + }, + { + "epoch": 0.19402609352349373, + "grad_norm": 0.19140625, + "learning_rate": 0.0017848290535150716, + "loss": 0.1367, + "step": 22352 + }, + { + "epoch": 0.1940347740036979, + "grad_norm": 0.37890625, + "learning_rate": 0.001784809741939153, + "loss": 0.0879, + "step": 22353 + }, + { + "epoch": 0.19404345448390206, + "grad_norm": 0.1083984375, + "learning_rate": 0.0017847904296143362, + "loss": 0.0933, + "step": 22354 + }, + { + "epoch": 0.19405213496410623, + "grad_norm": 0.1796875, + "learning_rate": 0.0017847711165406438, + "loss": 0.1084, + "step": 22355 + }, + { + "epoch": 0.1940608154443104, + "grad_norm": 0.1376953125, + "learning_rate": 0.0017847518027180953, + "loss": 0.1709, + "step": 22356 + }, + { + "epoch": 0.19406949592451456, + "grad_norm": 0.8046875, + "learning_rate": 
0.0017847324881467128, + "loss": 0.1104, + "step": 22357 + }, + { + "epoch": 0.19407817640471872, + "grad_norm": 0.283203125, + "learning_rate": 0.001784713172826517, + "loss": 0.1025, + "step": 22358 + }, + { + "epoch": 0.1940868568849229, + "grad_norm": 0.1806640625, + "learning_rate": 0.0017846938567575292, + "loss": 0.1172, + "step": 22359 + }, + { + "epoch": 0.19409553736512705, + "grad_norm": 0.220703125, + "learning_rate": 0.0017846745399397708, + "loss": 0.0879, + "step": 22360 + }, + { + "epoch": 0.19410421784533122, + "grad_norm": 0.1279296875, + "learning_rate": 0.0017846552223732624, + "loss": 0.1436, + "step": 22361 + }, + { + "epoch": 0.19411289832553538, + "grad_norm": 0.1220703125, + "learning_rate": 0.0017846359040580254, + "loss": 0.0718, + "step": 22362 + }, + { + "epoch": 0.19412157880573952, + "grad_norm": 0.5703125, + "learning_rate": 0.0017846165849940805, + "loss": 0.1621, + "step": 22363 + }, + { + "epoch": 0.19413025928594368, + "grad_norm": 0.314453125, + "learning_rate": 0.0017845972651814492, + "loss": 0.1221, + "step": 22364 + }, + { + "epoch": 0.19413893976614785, + "grad_norm": 0.15234375, + "learning_rate": 0.0017845779446201528, + "loss": 0.1221, + "step": 22365 + }, + { + "epoch": 0.19414762024635202, + "grad_norm": 0.55859375, + "learning_rate": 0.0017845586233102124, + "loss": 0.127, + "step": 22366 + }, + { + "epoch": 0.19415630072655618, + "grad_norm": 0.80859375, + "learning_rate": 0.0017845393012516492, + "loss": 0.1152, + "step": 22367 + }, + { + "epoch": 0.19416498120676035, + "grad_norm": 0.091796875, + "learning_rate": 0.0017845199784444835, + "loss": 0.1396, + "step": 22368 + }, + { + "epoch": 0.1941736616869645, + "grad_norm": 0.146484375, + "learning_rate": 0.0017845006548887373, + "loss": 0.1318, + "step": 22369 + }, + { + "epoch": 0.19418234216716868, + "grad_norm": 0.52734375, + "learning_rate": 0.0017844813305844317, + "loss": 0.0859, + "step": 22370 + }, + { + "epoch": 0.19419102264737284, + "grad_norm": 0.54296875, + "learning_rate": 0.0017844620055315872, + "loss": 0.125, + "step": 22371 + }, + { + "epoch": 0.194199703127577, + "grad_norm": 0.3125, + "learning_rate": 0.0017844426797302255, + "loss": 0.0728, + "step": 22372 + }, + { + "epoch": 0.19420838360778117, + "grad_norm": 1.0859375, + "learning_rate": 0.0017844233531803676, + "loss": 0.1387, + "step": 22373 + }, + { + "epoch": 0.19421706408798534, + "grad_norm": 0.3125, + "learning_rate": 0.0017844040258820346, + "loss": 0.1143, + "step": 22374 + }, + { + "epoch": 0.1942257445681895, + "grad_norm": 0.4296875, + "learning_rate": 0.0017843846978352479, + "loss": 0.1494, + "step": 22375 + }, + { + "epoch": 0.19423442504839367, + "grad_norm": 0.1640625, + "learning_rate": 0.001784365369040028, + "loss": 0.1201, + "step": 22376 + }, + { + "epoch": 0.19424310552859783, + "grad_norm": 0.279296875, + "learning_rate": 0.0017843460394963964, + "loss": 0.1035, + "step": 22377 + }, + { + "epoch": 0.194251786008802, + "grad_norm": 0.1142578125, + "learning_rate": 0.0017843267092043744, + "loss": 0.1211, + "step": 22378 + }, + { + "epoch": 0.19426046648900616, + "grad_norm": 0.24609375, + "learning_rate": 0.001784307378163983, + "loss": 0.1133, + "step": 22379 + }, + { + "epoch": 0.19426914696921033, + "grad_norm": 0.65625, + "learning_rate": 0.0017842880463752432, + "loss": 0.1445, + "step": 22380 + }, + { + "epoch": 0.1942778274494145, + "grad_norm": 0.09814453125, + "learning_rate": 0.0017842687138381766, + "loss": 0.1504, + "step": 22381 + }, + { + "epoch": 0.19428650792961866, + 
"grad_norm": 0.359375, + "learning_rate": 0.0017842493805528036, + "loss": 0.127, + "step": 22382 + }, + { + "epoch": 0.19429518840982282, + "grad_norm": 0.439453125, + "learning_rate": 0.0017842300465191459, + "loss": 0.1406, + "step": 22383 + }, + { + "epoch": 0.194303868890027, + "grad_norm": 0.08984375, + "learning_rate": 0.0017842107117372245, + "loss": 0.125, + "step": 22384 + }, + { + "epoch": 0.19431254937023115, + "grad_norm": 0.10205078125, + "learning_rate": 0.0017841913762070606, + "loss": 0.1299, + "step": 22385 + }, + { + "epoch": 0.19432122985043532, + "grad_norm": 0.19140625, + "learning_rate": 0.0017841720399286755, + "loss": 0.0957, + "step": 22386 + }, + { + "epoch": 0.19432991033063948, + "grad_norm": 0.0771484375, + "learning_rate": 0.0017841527029020898, + "loss": 0.1094, + "step": 22387 + }, + { + "epoch": 0.19433859081084365, + "grad_norm": 0.059814453125, + "learning_rate": 0.001784133365127325, + "loss": 0.084, + "step": 22388 + }, + { + "epoch": 0.1943472712910478, + "grad_norm": 0.1845703125, + "learning_rate": 0.0017841140266044025, + "loss": 0.0708, + "step": 22389 + }, + { + "epoch": 0.19435595177125198, + "grad_norm": 0.46484375, + "learning_rate": 0.001784094687333343, + "loss": 0.1182, + "step": 22390 + }, + { + "epoch": 0.19436463225145614, + "grad_norm": 0.8515625, + "learning_rate": 0.0017840753473141679, + "loss": 0.1387, + "step": 22391 + }, + { + "epoch": 0.1943733127316603, + "grad_norm": 0.76171875, + "learning_rate": 0.001784056006546898, + "loss": 0.1367, + "step": 22392 + }, + { + "epoch": 0.19438199321186447, + "grad_norm": 0.42578125, + "learning_rate": 0.0017840366650315551, + "loss": 0.105, + "step": 22393 + }, + { + "epoch": 0.19439067369206864, + "grad_norm": 0.328125, + "learning_rate": 0.0017840173227681598, + "loss": 0.1006, + "step": 22394 + }, + { + "epoch": 0.1943993541722728, + "grad_norm": 0.1455078125, + "learning_rate": 0.0017839979797567337, + "loss": 0.1001, + "step": 22395 + }, + { + "epoch": 0.19440803465247697, + "grad_norm": 0.1162109375, + "learning_rate": 0.0017839786359972974, + "loss": 0.1011, + "step": 22396 + }, + { + "epoch": 0.19441671513268113, + "grad_norm": 0.28125, + "learning_rate": 0.001783959291489872, + "loss": 0.1206, + "step": 22397 + }, + { + "epoch": 0.1944253956128853, + "grad_norm": 0.33203125, + "learning_rate": 0.0017839399462344797, + "loss": 0.0835, + "step": 22398 + }, + { + "epoch": 0.19443407609308946, + "grad_norm": 0.2236328125, + "learning_rate": 0.0017839206002311407, + "loss": 0.1436, + "step": 22399 + }, + { + "epoch": 0.19444275657329363, + "grad_norm": 0.2431640625, + "learning_rate": 0.0017839012534798763, + "loss": 0.1006, + "step": 22400 + }, + { + "epoch": 0.1944514370534978, + "grad_norm": 0.1435546875, + "learning_rate": 0.001783881905980708, + "loss": 0.0967, + "step": 22401 + }, + { + "epoch": 0.19446011753370196, + "grad_norm": 0.30859375, + "learning_rate": 0.0017838625577336563, + "loss": 0.0938, + "step": 22402 + }, + { + "epoch": 0.19446879801390612, + "grad_norm": 0.228515625, + "learning_rate": 0.001783843208738743, + "loss": 0.1631, + "step": 22403 + }, + { + "epoch": 0.1944774784941103, + "grad_norm": 0.12158203125, + "learning_rate": 0.001783823858995989, + "loss": 0.1123, + "step": 22404 + }, + { + "epoch": 0.19448615897431445, + "grad_norm": 0.53125, + "learning_rate": 0.0017838045085054155, + "loss": 0.1191, + "step": 22405 + }, + { + "epoch": 0.19449483945451862, + "grad_norm": 0.294921875, + "learning_rate": 0.0017837851572670438, + "loss": 0.1523, + "step": 22406 
+ }, + { + "epoch": 0.19450351993472279, + "grad_norm": 0.208984375, + "learning_rate": 0.001783765805280895, + "loss": 0.1201, + "step": 22407 + }, + { + "epoch": 0.19451220041492695, + "grad_norm": 0.439453125, + "learning_rate": 0.0017837464525469898, + "loss": 0.0718, + "step": 22408 + }, + { + "epoch": 0.19452088089513112, + "grad_norm": 0.10009765625, + "learning_rate": 0.0017837270990653502, + "loss": 0.1221, + "step": 22409 + }, + { + "epoch": 0.19452956137533528, + "grad_norm": 0.07177734375, + "learning_rate": 0.0017837077448359965, + "loss": 0.0854, + "step": 22410 + }, + { + "epoch": 0.19453824185553945, + "grad_norm": 0.1318359375, + "learning_rate": 0.0017836883898589505, + "loss": 0.1069, + "step": 22411 + }, + { + "epoch": 0.1945469223357436, + "grad_norm": 0.1474609375, + "learning_rate": 0.0017836690341342333, + "loss": 0.0903, + "step": 22412 + }, + { + "epoch": 0.19455560281594778, + "grad_norm": 0.11376953125, + "learning_rate": 0.0017836496776618659, + "loss": 0.1357, + "step": 22413 + }, + { + "epoch": 0.19456428329615194, + "grad_norm": 0.1455078125, + "learning_rate": 0.0017836303204418692, + "loss": 0.1426, + "step": 22414 + }, + { + "epoch": 0.1945729637763561, + "grad_norm": 1.015625, + "learning_rate": 0.0017836109624742647, + "loss": 0.1182, + "step": 22415 + }, + { + "epoch": 0.19458164425656027, + "grad_norm": 0.2578125, + "learning_rate": 0.0017835916037590735, + "loss": 0.1328, + "step": 22416 + }, + { + "epoch": 0.19459032473676444, + "grad_norm": 0.263671875, + "learning_rate": 0.0017835722442963169, + "loss": 0.1016, + "step": 22417 + }, + { + "epoch": 0.1945990052169686, + "grad_norm": 0.1083984375, + "learning_rate": 0.001783552884086016, + "loss": 0.1089, + "step": 22418 + }, + { + "epoch": 0.19460768569717277, + "grad_norm": 0.2060546875, + "learning_rate": 0.001783533523128192, + "loss": 0.1147, + "step": 22419 + }, + { + "epoch": 0.19461636617737693, + "grad_norm": 0.419921875, + "learning_rate": 0.0017835141614228661, + "loss": 0.1416, + "step": 22420 + }, + { + "epoch": 0.1946250466575811, + "grad_norm": 0.21875, + "learning_rate": 0.001783494798970059, + "loss": 0.1152, + "step": 22421 + }, + { + "epoch": 0.19463372713778526, + "grad_norm": 0.1376953125, + "learning_rate": 0.0017834754357697927, + "loss": 0.0908, + "step": 22422 + }, + { + "epoch": 0.19464240761798943, + "grad_norm": 0.396484375, + "learning_rate": 0.0017834560718220877, + "loss": 0.1074, + "step": 22423 + }, + { + "epoch": 0.1946510880981936, + "grad_norm": 0.177734375, + "learning_rate": 0.001783436707126965, + "loss": 0.0977, + "step": 22424 + }, + { + "epoch": 0.19465976857839776, + "grad_norm": 0.357421875, + "learning_rate": 0.0017834173416844468, + "loss": 0.1104, + "step": 22425 + }, + { + "epoch": 0.19466844905860192, + "grad_norm": 0.8203125, + "learning_rate": 0.0017833979754945536, + "loss": 0.0947, + "step": 22426 + }, + { + "epoch": 0.1946771295388061, + "grad_norm": 0.486328125, + "learning_rate": 0.0017833786085573063, + "loss": 0.126, + "step": 22427 + }, + { + "epoch": 0.19468581001901025, + "grad_norm": 0.2041015625, + "learning_rate": 0.0017833592408727267, + "loss": 0.1523, + "step": 22428 + }, + { + "epoch": 0.19469449049921442, + "grad_norm": 0.9140625, + "learning_rate": 0.0017833398724408356, + "loss": 0.1011, + "step": 22429 + }, + { + "epoch": 0.19470317097941858, + "grad_norm": 0.1376953125, + "learning_rate": 0.0017833205032616545, + "loss": 0.1328, + "step": 22430 + }, + { + "epoch": 0.19471185145962275, + "grad_norm": 0.59375, + "learning_rate": 
0.001783301133335204, + "loss": 0.1182, + "step": 22431 + }, + { + "epoch": 0.1947205319398269, + "grad_norm": 0.1181640625, + "learning_rate": 0.0017832817626615058, + "loss": 0.1094, + "step": 22432 + }, + { + "epoch": 0.19472921242003108, + "grad_norm": 0.28125, + "learning_rate": 0.001783262391240581, + "loss": 0.1035, + "step": 22433 + }, + { + "epoch": 0.19473789290023524, + "grad_norm": 0.232421875, + "learning_rate": 0.0017832430190724505, + "loss": 0.0781, + "step": 22434 + }, + { + "epoch": 0.1947465733804394, + "grad_norm": 0.1376953125, + "learning_rate": 0.0017832236461571358, + "loss": 0.1289, + "step": 22435 + }, + { + "epoch": 0.19475525386064357, + "grad_norm": 0.111328125, + "learning_rate": 0.0017832042724946582, + "loss": 0.1211, + "step": 22436 + }, + { + "epoch": 0.19476393434084774, + "grad_norm": 0.6796875, + "learning_rate": 0.0017831848980850386, + "loss": 0.1279, + "step": 22437 + }, + { + "epoch": 0.1947726148210519, + "grad_norm": 0.1484375, + "learning_rate": 0.001783165522928298, + "loss": 0.1016, + "step": 22438 + }, + { + "epoch": 0.19478129530125607, + "grad_norm": 0.81640625, + "learning_rate": 0.0017831461470244581, + "loss": 0.0957, + "step": 22439 + }, + { + "epoch": 0.19478997578146023, + "grad_norm": 0.1279296875, + "learning_rate": 0.0017831267703735398, + "loss": 0.1191, + "step": 22440 + }, + { + "epoch": 0.1947986562616644, + "grad_norm": 0.263671875, + "learning_rate": 0.0017831073929755641, + "loss": 0.0957, + "step": 22441 + }, + { + "epoch": 0.19480733674186856, + "grad_norm": 0.1572265625, + "learning_rate": 0.0017830880148305525, + "loss": 0.0908, + "step": 22442 + }, + { + "epoch": 0.19481601722207273, + "grad_norm": 0.28515625, + "learning_rate": 0.0017830686359385265, + "loss": 0.1016, + "step": 22443 + }, + { + "epoch": 0.1948246977022769, + "grad_norm": 0.1044921875, + "learning_rate": 0.0017830492562995067, + "loss": 0.0977, + "step": 22444 + }, + { + "epoch": 0.19483337818248106, + "grad_norm": 0.6171875, + "learning_rate": 0.0017830298759135142, + "loss": 0.1187, + "step": 22445 + }, + { + "epoch": 0.19484205866268522, + "grad_norm": 0.2255859375, + "learning_rate": 0.0017830104947805707, + "loss": 0.1104, + "step": 22446 + }, + { + "epoch": 0.1948507391428894, + "grad_norm": 0.78125, + "learning_rate": 0.0017829911129006973, + "loss": 0.1089, + "step": 22447 + }, + { + "epoch": 0.19485941962309355, + "grad_norm": 0.154296875, + "learning_rate": 0.0017829717302739149, + "loss": 0.0957, + "step": 22448 + }, + { + "epoch": 0.19486810010329772, + "grad_norm": 0.1064453125, + "learning_rate": 0.001782952346900245, + "loss": 0.0942, + "step": 22449 + }, + { + "epoch": 0.19487678058350189, + "grad_norm": 0.478515625, + "learning_rate": 0.0017829329627797087, + "loss": 0.1113, + "step": 22450 + }, + { + "epoch": 0.19488546106370605, + "grad_norm": 0.69140625, + "learning_rate": 0.001782913577912327, + "loss": 0.0933, + "step": 22451 + }, + { + "epoch": 0.19489414154391022, + "grad_norm": 0.0654296875, + "learning_rate": 0.0017828941922981214, + "loss": 0.0684, + "step": 22452 + }, + { + "epoch": 0.19490282202411438, + "grad_norm": 0.14453125, + "learning_rate": 0.001782874805937113, + "loss": 0.1357, + "step": 22453 + }, + { + "epoch": 0.19491150250431855, + "grad_norm": 0.12060546875, + "learning_rate": 0.0017828554188293226, + "loss": 0.0977, + "step": 22454 + }, + { + "epoch": 0.1949201829845227, + "grad_norm": 0.52734375, + "learning_rate": 0.0017828360309747723, + "loss": 0.1396, + "step": 22455 + }, + { + "epoch": 
0.19492886346472688, + "grad_norm": 0.75390625, + "learning_rate": 0.0017828166423734823, + "loss": 0.1152, + "step": 22456 + }, + { + "epoch": 0.19493754394493104, + "grad_norm": 0.333984375, + "learning_rate": 0.001782797253025475, + "loss": 0.1123, + "step": 22457 + }, + { + "epoch": 0.1949462244251352, + "grad_norm": 0.494140625, + "learning_rate": 0.0017827778629307702, + "loss": 0.1191, + "step": 22458 + }, + { + "epoch": 0.19495490490533937, + "grad_norm": 0.326171875, + "learning_rate": 0.00178275847208939, + "loss": 0.1299, + "step": 22459 + }, + { + "epoch": 0.19496358538554354, + "grad_norm": 0.2080078125, + "learning_rate": 0.0017827390805013556, + "loss": 0.1299, + "step": 22460 + }, + { + "epoch": 0.1949722658657477, + "grad_norm": 0.455078125, + "learning_rate": 0.0017827196881666876, + "loss": 0.1104, + "step": 22461 + }, + { + "epoch": 0.19498094634595187, + "grad_norm": 0.4609375, + "learning_rate": 0.0017827002950854078, + "loss": 0.1309, + "step": 22462 + }, + { + "epoch": 0.19498962682615603, + "grad_norm": 0.10009765625, + "learning_rate": 0.0017826809012575374, + "loss": 0.1279, + "step": 22463 + }, + { + "epoch": 0.1949983073063602, + "grad_norm": 0.50390625, + "learning_rate": 0.0017826615066830972, + "loss": 0.0913, + "step": 22464 + }, + { + "epoch": 0.19500698778656436, + "grad_norm": 0.203125, + "learning_rate": 0.0017826421113621086, + "loss": 0.0903, + "step": 22465 + }, + { + "epoch": 0.19501566826676853, + "grad_norm": 0.177734375, + "learning_rate": 0.001782622715294593, + "loss": 0.1328, + "step": 22466 + }, + { + "epoch": 0.1950243487469727, + "grad_norm": 0.27734375, + "learning_rate": 0.0017826033184805714, + "loss": 0.0889, + "step": 22467 + }, + { + "epoch": 0.19503302922717686, + "grad_norm": 0.1396484375, + "learning_rate": 0.0017825839209200652, + "loss": 0.1279, + "step": 22468 + }, + { + "epoch": 0.19504170970738102, + "grad_norm": 0.373046875, + "learning_rate": 0.0017825645226130953, + "loss": 0.103, + "step": 22469 + }, + { + "epoch": 0.1950503901875852, + "grad_norm": 0.3671875, + "learning_rate": 0.001782545123559683, + "loss": 0.0801, + "step": 22470 + }, + { + "epoch": 0.19505907066778935, + "grad_norm": 0.859375, + "learning_rate": 0.0017825257237598499, + "loss": 0.1147, + "step": 22471 + }, + { + "epoch": 0.19506775114799352, + "grad_norm": 0.6640625, + "learning_rate": 0.0017825063232136168, + "loss": 0.1025, + "step": 22472 + }, + { + "epoch": 0.19507643162819768, + "grad_norm": 0.11181640625, + "learning_rate": 0.0017824869219210046, + "loss": 0.1123, + "step": 22473 + }, + { + "epoch": 0.19508511210840185, + "grad_norm": 0.087890625, + "learning_rate": 0.0017824675198820357, + "loss": 0.0869, + "step": 22474 + }, + { + "epoch": 0.195093792588606, + "grad_norm": 0.490234375, + "learning_rate": 0.00178244811709673, + "loss": 0.1426, + "step": 22475 + }, + { + "epoch": 0.19510247306881018, + "grad_norm": 0.455078125, + "learning_rate": 0.0017824287135651095, + "loss": 0.1133, + "step": 22476 + }, + { + "epoch": 0.19511115354901434, + "grad_norm": 0.1689453125, + "learning_rate": 0.0017824093092871955, + "loss": 0.0869, + "step": 22477 + }, + { + "epoch": 0.1951198340292185, + "grad_norm": 0.1083984375, + "learning_rate": 0.0017823899042630085, + "loss": 0.1074, + "step": 22478 + }, + { + "epoch": 0.19512851450942267, + "grad_norm": 0.1259765625, + "learning_rate": 0.0017823704984925703, + "loss": 0.103, + "step": 22479 + }, + { + "epoch": 0.19513719498962684, + "grad_norm": 0.154296875, + "learning_rate": 0.0017823510919759019, + 
"loss": 0.1143, + "step": 22480 + }, + { + "epoch": 0.195145875469831, + "grad_norm": 0.478515625, + "learning_rate": 0.0017823316847130248, + "loss": 0.0747, + "step": 22481 + }, + { + "epoch": 0.19515455595003517, + "grad_norm": 0.267578125, + "learning_rate": 0.0017823122767039597, + "loss": 0.1133, + "step": 22482 + }, + { + "epoch": 0.19516323643023933, + "grad_norm": 0.6328125, + "learning_rate": 0.0017822928679487285, + "loss": 0.1016, + "step": 22483 + }, + { + "epoch": 0.1951719169104435, + "grad_norm": 0.404296875, + "learning_rate": 0.001782273458447352, + "loss": 0.0757, + "step": 22484 + }, + { + "epoch": 0.19518059739064766, + "grad_norm": 0.671875, + "learning_rate": 0.0017822540481998513, + "loss": 0.127, + "step": 22485 + }, + { + "epoch": 0.1951892778708518, + "grad_norm": 0.099609375, + "learning_rate": 0.0017822346372062482, + "loss": 0.1104, + "step": 22486 + }, + { + "epoch": 0.19519795835105597, + "grad_norm": 0.3671875, + "learning_rate": 0.0017822152254665631, + "loss": 0.0669, + "step": 22487 + }, + { + "epoch": 0.19520663883126013, + "grad_norm": 0.39453125, + "learning_rate": 0.0017821958129808177, + "loss": 0.1562, + "step": 22488 + }, + { + "epoch": 0.1952153193114643, + "grad_norm": 0.08740234375, + "learning_rate": 0.0017821763997490335, + "loss": 0.083, + "step": 22489 + }, + { + "epoch": 0.19522399979166846, + "grad_norm": 0.10546875, + "learning_rate": 0.0017821569857712314, + "loss": 0.1377, + "step": 22490 + }, + { + "epoch": 0.19523268027187263, + "grad_norm": 0.201171875, + "learning_rate": 0.0017821375710474325, + "loss": 0.0898, + "step": 22491 + }, + { + "epoch": 0.1952413607520768, + "grad_norm": 0.2412109375, + "learning_rate": 0.0017821181555776585, + "loss": 0.0889, + "step": 22492 + }, + { + "epoch": 0.19525004123228096, + "grad_norm": 0.8515625, + "learning_rate": 0.0017820987393619302, + "loss": 0.0947, + "step": 22493 + }, + { + "epoch": 0.19525872171248512, + "grad_norm": 0.146484375, + "learning_rate": 0.0017820793224002688, + "loss": 0.0952, + "step": 22494 + }, + { + "epoch": 0.1952674021926893, + "grad_norm": 0.1806640625, + "learning_rate": 0.0017820599046926959, + "loss": 0.0947, + "step": 22495 + }, + { + "epoch": 0.19527608267289345, + "grad_norm": 0.43359375, + "learning_rate": 0.0017820404862392324, + "loss": 0.1455, + "step": 22496 + }, + { + "epoch": 0.19528476315309762, + "grad_norm": 0.5625, + "learning_rate": 0.0017820210670398998, + "loss": 0.0928, + "step": 22497 + }, + { + "epoch": 0.19529344363330178, + "grad_norm": 0.220703125, + "learning_rate": 0.0017820016470947188, + "loss": 0.1143, + "step": 22498 + }, + { + "epoch": 0.19530212411350595, + "grad_norm": 0.26171875, + "learning_rate": 0.0017819822264037114, + "loss": 0.0938, + "step": 22499 + }, + { + "epoch": 0.1953108045937101, + "grad_norm": 0.08837890625, + "learning_rate": 0.0017819628049668983, + "loss": 0.0752, + "step": 22500 + }, + { + "epoch": 0.19531948507391428, + "grad_norm": 0.283203125, + "learning_rate": 0.0017819433827843015, + "loss": 0.1211, + "step": 22501 + }, + { + "epoch": 0.19532816555411844, + "grad_norm": 0.310546875, + "learning_rate": 0.001781923959855941, + "loss": 0.0991, + "step": 22502 + }, + { + "epoch": 0.1953368460343226, + "grad_norm": 0.9453125, + "learning_rate": 0.0017819045361818391, + "loss": 0.1318, + "step": 22503 + }, + { + "epoch": 0.19534552651452677, + "grad_norm": 0.6328125, + "learning_rate": 0.0017818851117620165, + "loss": 0.0864, + "step": 22504 + }, + { + "epoch": 0.19535420699473094, + "grad_norm": 0.26953125, + 
"learning_rate": 0.0017818656865964944, + "loss": 0.104, + "step": 22505 + }, + { + "epoch": 0.1953628874749351, + "grad_norm": 0.0888671875, + "learning_rate": 0.0017818462606852942, + "loss": 0.1167, + "step": 22506 + }, + { + "epoch": 0.19537156795513927, + "grad_norm": 0.37890625, + "learning_rate": 0.0017818268340284375, + "loss": 0.125, + "step": 22507 + }, + { + "epoch": 0.19538024843534343, + "grad_norm": 0.09814453125, + "learning_rate": 0.001781807406625945, + "loss": 0.1079, + "step": 22508 + }, + { + "epoch": 0.1953889289155476, + "grad_norm": 0.251953125, + "learning_rate": 0.0017817879784778386, + "loss": 0.1123, + "step": 22509 + }, + { + "epoch": 0.19539760939575176, + "grad_norm": 0.458984375, + "learning_rate": 0.0017817685495841386, + "loss": 0.1162, + "step": 22510 + }, + { + "epoch": 0.19540628987595593, + "grad_norm": 0.11572265625, + "learning_rate": 0.001781749119944867, + "loss": 0.1357, + "step": 22511 + }, + { + "epoch": 0.1954149703561601, + "grad_norm": 2.046875, + "learning_rate": 0.0017817296895600446, + "loss": 0.4629, + "step": 22512 + }, + { + "epoch": 0.19542365083636426, + "grad_norm": 0.1474609375, + "learning_rate": 0.001781710258429693, + "loss": 0.0962, + "step": 22513 + }, + { + "epoch": 0.19543233131656843, + "grad_norm": 0.2353515625, + "learning_rate": 0.0017816908265538332, + "loss": 0.1001, + "step": 22514 + }, + { + "epoch": 0.1954410117967726, + "grad_norm": 0.251953125, + "learning_rate": 0.001781671393932487, + "loss": 0.1328, + "step": 22515 + }, + { + "epoch": 0.19544969227697676, + "grad_norm": 0.408203125, + "learning_rate": 0.0017816519605656747, + "loss": 0.1084, + "step": 22516 + }, + { + "epoch": 0.19545837275718092, + "grad_norm": 1.0390625, + "learning_rate": 0.001781632526453418, + "loss": 0.106, + "step": 22517 + }, + { + "epoch": 0.19546705323738509, + "grad_norm": 0.80078125, + "learning_rate": 0.0017816130915957384, + "loss": 0.1094, + "step": 22518 + }, + { + "epoch": 0.19547573371758925, + "grad_norm": 0.71484375, + "learning_rate": 0.001781593655992657, + "loss": 0.1221, + "step": 22519 + }, + { + "epoch": 0.19548441419779342, + "grad_norm": 0.9453125, + "learning_rate": 0.0017815742196441952, + "loss": 0.1504, + "step": 22520 + }, + { + "epoch": 0.19549309467799758, + "grad_norm": 0.2734375, + "learning_rate": 0.001781554782550374, + "loss": 0.0918, + "step": 22521 + }, + { + "epoch": 0.19550177515820175, + "grad_norm": 0.419921875, + "learning_rate": 0.0017815353447112143, + "loss": 0.1104, + "step": 22522 + }, + { + "epoch": 0.1955104556384059, + "grad_norm": 0.09912109375, + "learning_rate": 0.001781515906126738, + "loss": 0.1084, + "step": 22523 + }, + { + "epoch": 0.19551913611861008, + "grad_norm": 0.11328125, + "learning_rate": 0.0017814964667969663, + "loss": 0.1309, + "step": 22524 + }, + { + "epoch": 0.19552781659881424, + "grad_norm": 0.310546875, + "learning_rate": 0.0017814770267219205, + "loss": 0.1221, + "step": 22525 + }, + { + "epoch": 0.1955364970790184, + "grad_norm": 0.1162109375, + "learning_rate": 0.001781457585901621, + "loss": 0.1182, + "step": 22526 + }, + { + "epoch": 0.19554517755922257, + "grad_norm": 0.8515625, + "learning_rate": 0.0017814381443360904, + "loss": 0.0981, + "step": 22527 + }, + { + "epoch": 0.19555385803942674, + "grad_norm": 0.08447265625, + "learning_rate": 0.001781418702025349, + "loss": 0.082, + "step": 22528 + }, + { + "epoch": 0.1955625385196309, + "grad_norm": 0.400390625, + "learning_rate": 0.0017813992589694183, + "loss": 0.1201, + "step": 22529 + }, + { + "epoch": 
0.19557121899983507, + "grad_norm": 0.09765625, + "learning_rate": 0.0017813798151683196, + "loss": 0.1328, + "step": 22530 + }, + { + "epoch": 0.19557989948003923, + "grad_norm": 0.5859375, + "learning_rate": 0.0017813603706220745, + "loss": 0.127, + "step": 22531 + }, + { + "epoch": 0.1955885799602434, + "grad_norm": 0.1552734375, + "learning_rate": 0.0017813409253307033, + "loss": 0.1387, + "step": 22532 + }, + { + "epoch": 0.19559726044044756, + "grad_norm": 0.16796875, + "learning_rate": 0.0017813214792942284, + "loss": 0.1367, + "step": 22533 + }, + { + "epoch": 0.19560594092065173, + "grad_norm": 0.1845703125, + "learning_rate": 0.0017813020325126705, + "loss": 0.1104, + "step": 22534 + }, + { + "epoch": 0.1956146214008559, + "grad_norm": 0.2353515625, + "learning_rate": 0.0017812825849860508, + "loss": 0.1504, + "step": 22535 + }, + { + "epoch": 0.19562330188106006, + "grad_norm": 0.298828125, + "learning_rate": 0.001781263136714391, + "loss": 0.0737, + "step": 22536 + }, + { + "epoch": 0.19563198236126422, + "grad_norm": 0.146484375, + "learning_rate": 0.0017812436876977118, + "loss": 0.0957, + "step": 22537 + }, + { + "epoch": 0.1956406628414684, + "grad_norm": 0.2119140625, + "learning_rate": 0.0017812242379360347, + "loss": 0.1387, + "step": 22538 + }, + { + "epoch": 0.19564934332167255, + "grad_norm": 0.275390625, + "learning_rate": 0.001781204787429381, + "loss": 0.1143, + "step": 22539 + }, + { + "epoch": 0.19565802380187672, + "grad_norm": 0.1552734375, + "learning_rate": 0.001781185336177772, + "loss": 0.1035, + "step": 22540 + }, + { + "epoch": 0.19566670428208088, + "grad_norm": 0.1982421875, + "learning_rate": 0.001781165884181229, + "loss": 0.1016, + "step": 22541 + }, + { + "epoch": 0.19567538476228505, + "grad_norm": 0.2578125, + "learning_rate": 0.0017811464314397734, + "loss": 0.1045, + "step": 22542 + }, + { + "epoch": 0.1956840652424892, + "grad_norm": 0.255859375, + "learning_rate": 0.001781126977953426, + "loss": 0.127, + "step": 22543 + }, + { + "epoch": 0.19569274572269338, + "grad_norm": 0.5390625, + "learning_rate": 0.0017811075237222085, + "loss": 0.1465, + "step": 22544 + }, + { + "epoch": 0.19570142620289754, + "grad_norm": 0.1328125, + "learning_rate": 0.0017810880687461424, + "loss": 0.0938, + "step": 22545 + }, + { + "epoch": 0.1957101066831017, + "grad_norm": 0.2353515625, + "learning_rate": 0.001781068613025248, + "loss": 0.0977, + "step": 22546 + }, + { + "epoch": 0.19571878716330587, + "grad_norm": 0.1640625, + "learning_rate": 0.0017810491565595475, + "loss": 0.1992, + "step": 22547 + }, + { + "epoch": 0.19572746764351004, + "grad_norm": 0.1103515625, + "learning_rate": 0.0017810296993490617, + "loss": 0.1289, + "step": 22548 + }, + { + "epoch": 0.1957361481237142, + "grad_norm": 0.2216796875, + "learning_rate": 0.0017810102413938122, + "loss": 0.1025, + "step": 22549 + }, + { + "epoch": 0.19574482860391837, + "grad_norm": 0.40234375, + "learning_rate": 0.0017809907826938202, + "loss": 0.1816, + "step": 22550 + }, + { + "epoch": 0.19575350908412253, + "grad_norm": 0.8671875, + "learning_rate": 0.0017809713232491069, + "loss": 0.0967, + "step": 22551 + }, + { + "epoch": 0.1957621895643267, + "grad_norm": 0.58203125, + "learning_rate": 0.0017809518630596934, + "loss": 0.1055, + "step": 22552 + }, + { + "epoch": 0.19577087004453086, + "grad_norm": 0.87109375, + "learning_rate": 0.0017809324021256013, + "loss": 0.084, + "step": 22553 + }, + { + "epoch": 0.19577955052473503, + "grad_norm": 0.07080078125, + "learning_rate": 0.001780912940446852, + 
"loss": 0.0737, + "step": 22554 + }, + { + "epoch": 0.1957882310049392, + "grad_norm": 0.546875, + "learning_rate": 0.0017808934780234663, + "loss": 0.1055, + "step": 22555 + }, + { + "epoch": 0.19579691148514336, + "grad_norm": 0.11376953125, + "learning_rate": 0.0017808740148554658, + "loss": 0.1084, + "step": 22556 + }, + { + "epoch": 0.19580559196534753, + "grad_norm": 0.478515625, + "learning_rate": 0.0017808545509428715, + "loss": 0.1387, + "step": 22557 + }, + { + "epoch": 0.1958142724455517, + "grad_norm": 0.267578125, + "learning_rate": 0.001780835086285705, + "loss": 0.1089, + "step": 22558 + }, + { + "epoch": 0.19582295292575586, + "grad_norm": 0.2158203125, + "learning_rate": 0.0017808156208839877, + "loss": 0.1084, + "step": 22559 + }, + { + "epoch": 0.19583163340596002, + "grad_norm": 0.423828125, + "learning_rate": 0.0017807961547377403, + "loss": 0.0938, + "step": 22560 + }, + { + "epoch": 0.19584031388616419, + "grad_norm": 0.1103515625, + "learning_rate": 0.0017807766878469846, + "loss": 0.1206, + "step": 22561 + }, + { + "epoch": 0.19584899436636835, + "grad_norm": 0.11376953125, + "learning_rate": 0.0017807572202117419, + "loss": 0.1187, + "step": 22562 + }, + { + "epoch": 0.19585767484657252, + "grad_norm": 0.796875, + "learning_rate": 0.0017807377518320332, + "loss": 0.1035, + "step": 22563 + }, + { + "epoch": 0.19586635532677668, + "grad_norm": 0.5234375, + "learning_rate": 0.0017807182827078798, + "loss": 0.1045, + "step": 22564 + }, + { + "epoch": 0.19587503580698085, + "grad_norm": 0.443359375, + "learning_rate": 0.0017806988128393033, + "loss": 0.1133, + "step": 22565 + }, + { + "epoch": 0.195883716287185, + "grad_norm": 0.349609375, + "learning_rate": 0.0017806793422263248, + "loss": 0.1011, + "step": 22566 + }, + { + "epoch": 0.19589239676738918, + "grad_norm": 0.70703125, + "learning_rate": 0.0017806598708689657, + "loss": 0.1279, + "step": 22567 + }, + { + "epoch": 0.19590107724759334, + "grad_norm": 0.1337890625, + "learning_rate": 0.0017806403987672469, + "loss": 0.1426, + "step": 22568 + }, + { + "epoch": 0.1959097577277975, + "grad_norm": 0.28125, + "learning_rate": 0.0017806209259211903, + "loss": 0.0996, + "step": 22569 + }, + { + "epoch": 0.19591843820800167, + "grad_norm": 0.154296875, + "learning_rate": 0.0017806014523308162, + "loss": 0.1123, + "step": 22570 + }, + { + "epoch": 0.19592711868820584, + "grad_norm": 0.58984375, + "learning_rate": 0.0017805819779961474, + "loss": 0.0854, + "step": 22571 + }, + { + "epoch": 0.19593579916841, + "grad_norm": 0.138671875, + "learning_rate": 0.0017805625029172038, + "loss": 0.1758, + "step": 22572 + }, + { + "epoch": 0.19594447964861417, + "grad_norm": 0.240234375, + "learning_rate": 0.0017805430270940077, + "loss": 0.1436, + "step": 22573 + }, + { + "epoch": 0.19595316012881833, + "grad_norm": 0.6640625, + "learning_rate": 0.0017805235505265798, + "loss": 0.1885, + "step": 22574 + }, + { + "epoch": 0.1959618406090225, + "grad_norm": 0.08740234375, + "learning_rate": 0.0017805040732149415, + "loss": 0.0986, + "step": 22575 + }, + { + "epoch": 0.19597052108922666, + "grad_norm": 0.1298828125, + "learning_rate": 0.0017804845951591144, + "loss": 0.0918, + "step": 22576 + }, + { + "epoch": 0.19597920156943083, + "grad_norm": 0.341796875, + "learning_rate": 0.0017804651163591193, + "loss": 0.1445, + "step": 22577 + }, + { + "epoch": 0.195987882049635, + "grad_norm": 0.146484375, + "learning_rate": 0.0017804456368149782, + "loss": 0.1367, + "step": 22578 + }, + { + "epoch": 0.19599656252983916, + "grad_norm": 
0.1376953125, + "learning_rate": 0.0017804261565267116, + "loss": 0.1079, + "step": 22579 + }, + { + "epoch": 0.19600524301004332, + "grad_norm": 0.8046875, + "learning_rate": 0.001780406675494341, + "loss": 0.1934, + "step": 22580 + }, + { + "epoch": 0.1960139234902475, + "grad_norm": 0.3828125, + "learning_rate": 0.0017803871937178883, + "loss": 0.1123, + "step": 22581 + }, + { + "epoch": 0.19602260397045165, + "grad_norm": 0.06396484375, + "learning_rate": 0.0017803677111973741, + "loss": 0.1123, + "step": 22582 + }, + { + "epoch": 0.19603128445065582, + "grad_norm": 0.248046875, + "learning_rate": 0.00178034822793282, + "loss": 0.1182, + "step": 22583 + }, + { + "epoch": 0.19603996493085998, + "grad_norm": 0.412109375, + "learning_rate": 0.0017803287439242476, + "loss": 0.127, + "step": 22584 + }, + { + "epoch": 0.19604864541106415, + "grad_norm": 0.119140625, + "learning_rate": 0.001780309259171678, + "loss": 0.104, + "step": 22585 + }, + { + "epoch": 0.1960573258912683, + "grad_norm": 0.365234375, + "learning_rate": 0.0017802897736751318, + "loss": 0.1172, + "step": 22586 + }, + { + "epoch": 0.19606600637147248, + "grad_norm": 0.0849609375, + "learning_rate": 0.0017802702874346312, + "loss": 0.127, + "step": 22587 + }, + { + "epoch": 0.19607468685167664, + "grad_norm": 0.205078125, + "learning_rate": 0.0017802508004501974, + "loss": 0.105, + "step": 22588 + }, + { + "epoch": 0.1960833673318808, + "grad_norm": 0.2294921875, + "learning_rate": 0.0017802313127218512, + "loss": 0.1123, + "step": 22589 + }, + { + "epoch": 0.19609204781208497, + "grad_norm": 0.734375, + "learning_rate": 0.0017802118242496148, + "loss": 0.125, + "step": 22590 + }, + { + "epoch": 0.19610072829228914, + "grad_norm": 0.97265625, + "learning_rate": 0.0017801923350335084, + "loss": 0.127, + "step": 22591 + }, + { + "epoch": 0.1961094087724933, + "grad_norm": 0.337890625, + "learning_rate": 0.0017801728450735543, + "loss": 0.0791, + "step": 22592 + }, + { + "epoch": 0.19611808925269747, + "grad_norm": 0.408203125, + "learning_rate": 0.0017801533543697734, + "loss": 0.1133, + "step": 22593 + }, + { + "epoch": 0.19612676973290163, + "grad_norm": 0.337890625, + "learning_rate": 0.0017801338629221865, + "loss": 0.1055, + "step": 22594 + }, + { + "epoch": 0.1961354502131058, + "grad_norm": 0.169921875, + "learning_rate": 0.001780114370730816, + "loss": 0.0781, + "step": 22595 + }, + { + "epoch": 0.19614413069330996, + "grad_norm": 0.349609375, + "learning_rate": 0.0017800948777956826, + "loss": 0.1611, + "step": 22596 + }, + { + "epoch": 0.19615281117351413, + "grad_norm": 0.2109375, + "learning_rate": 0.0017800753841168073, + "loss": 0.1436, + "step": 22597 + }, + { + "epoch": 0.1961614916537183, + "grad_norm": 0.2197265625, + "learning_rate": 0.0017800558896942118, + "loss": 0.0811, + "step": 22598 + }, + { + "epoch": 0.19617017213392246, + "grad_norm": 0.1611328125, + "learning_rate": 0.0017800363945279177, + "loss": 0.0593, + "step": 22599 + }, + { + "epoch": 0.19617885261412663, + "grad_norm": 0.470703125, + "learning_rate": 0.0017800168986179456, + "loss": 0.1025, + "step": 22600 + }, + { + "epoch": 0.1961875330943308, + "grad_norm": 0.12890625, + "learning_rate": 0.0017799974019643177, + "loss": 0.0786, + "step": 22601 + }, + { + "epoch": 0.19619621357453496, + "grad_norm": 0.076171875, + "learning_rate": 0.0017799779045670548, + "loss": 0.0889, + "step": 22602 + }, + { + "epoch": 0.19620489405473912, + "grad_norm": 0.11865234375, + "learning_rate": 0.001779958406426178, + "loss": 0.1045, + "step": 22603 + }, + 
{ + "epoch": 0.19621357453494329, + "grad_norm": 0.404296875, + "learning_rate": 0.0017799389075417089, + "loss": 0.2148, + "step": 22604 + }, + { + "epoch": 0.19622225501514745, + "grad_norm": 0.203125, + "learning_rate": 0.0017799194079136692, + "loss": 0.1006, + "step": 22605 + }, + { + "epoch": 0.19623093549535162, + "grad_norm": 0.27734375, + "learning_rate": 0.0017798999075420796, + "loss": 0.0811, + "step": 22606 + }, + { + "epoch": 0.19623961597555578, + "grad_norm": 0.1337890625, + "learning_rate": 0.0017798804064269617, + "loss": 0.126, + "step": 22607 + }, + { + "epoch": 0.19624829645575995, + "grad_norm": 0.64453125, + "learning_rate": 0.0017798609045683368, + "loss": 0.0938, + "step": 22608 + }, + { + "epoch": 0.19625697693596408, + "grad_norm": 0.25, + "learning_rate": 0.001779841401966226, + "loss": 0.166, + "step": 22609 + }, + { + "epoch": 0.19626565741616825, + "grad_norm": 0.2314453125, + "learning_rate": 0.0017798218986206512, + "loss": 0.106, + "step": 22610 + }, + { + "epoch": 0.19627433789637241, + "grad_norm": 0.1826171875, + "learning_rate": 0.0017798023945316333, + "loss": 0.1035, + "step": 22611 + }, + { + "epoch": 0.19628301837657658, + "grad_norm": 0.09814453125, + "learning_rate": 0.0017797828896991936, + "loss": 0.1084, + "step": 22612 + }, + { + "epoch": 0.19629169885678074, + "grad_norm": 0.1005859375, + "learning_rate": 0.0017797633841233537, + "loss": 0.1055, + "step": 22613 + }, + { + "epoch": 0.1963003793369849, + "grad_norm": 0.3359375, + "learning_rate": 0.0017797438778041346, + "loss": 0.0898, + "step": 22614 + }, + { + "epoch": 0.19630905981718907, + "grad_norm": 0.28125, + "learning_rate": 0.001779724370741558, + "loss": 0.1104, + "step": 22615 + }, + { + "epoch": 0.19631774029739324, + "grad_norm": 0.369140625, + "learning_rate": 0.0017797048629356448, + "loss": 0.1338, + "step": 22616 + }, + { + "epoch": 0.1963264207775974, + "grad_norm": 0.1630859375, + "learning_rate": 0.001779685354386417, + "loss": 0.1108, + "step": 22617 + }, + { + "epoch": 0.19633510125780157, + "grad_norm": 0.17578125, + "learning_rate": 0.0017796658450938952, + "loss": 0.1182, + "step": 22618 + }, + { + "epoch": 0.19634378173800573, + "grad_norm": 0.46484375, + "learning_rate": 0.0017796463350581012, + "loss": 0.1108, + "step": 22619 + }, + { + "epoch": 0.1963524622182099, + "grad_norm": 0.423828125, + "learning_rate": 0.001779626824279056, + "loss": 0.1221, + "step": 22620 + }, + { + "epoch": 0.19636114269841407, + "grad_norm": 0.359375, + "learning_rate": 0.001779607312756781, + "loss": 0.1514, + "step": 22621 + }, + { + "epoch": 0.19636982317861823, + "grad_norm": 0.0791015625, + "learning_rate": 0.0017795878004912983, + "loss": 0.123, + "step": 22622 + }, + { + "epoch": 0.1963785036588224, + "grad_norm": 0.2421875, + "learning_rate": 0.0017795682874826278, + "loss": 0.1201, + "step": 22623 + }, + { + "epoch": 0.19638718413902656, + "grad_norm": 0.341796875, + "learning_rate": 0.001779548773730792, + "loss": 0.125, + "step": 22624 + }, + { + "epoch": 0.19639586461923073, + "grad_norm": 0.234375, + "learning_rate": 0.0017795292592358116, + "loss": 0.0977, + "step": 22625 + }, + { + "epoch": 0.1964045450994349, + "grad_norm": 1.09375, + "learning_rate": 0.0017795097439977086, + "loss": 0.1504, + "step": 22626 + }, + { + "epoch": 0.19641322557963906, + "grad_norm": 0.78515625, + "learning_rate": 0.001779490228016504, + "loss": 0.1074, + "step": 22627 + }, + { + "epoch": 0.19642190605984322, + "grad_norm": 0.0771484375, + "learning_rate": 0.001779470711292219, + "loss": 
0.082, + "step": 22628 + }, + { + "epoch": 0.19643058654004739, + "grad_norm": 0.63671875, + "learning_rate": 0.0017794511938248746, + "loss": 0.1025, + "step": 22629 + }, + { + "epoch": 0.19643926702025155, + "grad_norm": 0.390625, + "learning_rate": 0.001779431675614493, + "loss": 0.0732, + "step": 22630 + }, + { + "epoch": 0.19644794750045572, + "grad_norm": 0.6015625, + "learning_rate": 0.001779412156661095, + "loss": 0.063, + "step": 22631 + }, + { + "epoch": 0.19645662798065988, + "grad_norm": 0.90625, + "learning_rate": 0.0017793926369647022, + "loss": 0.168, + "step": 22632 + }, + { + "epoch": 0.19646530846086405, + "grad_norm": 0.07373046875, + "learning_rate": 0.0017793731165253357, + "loss": 0.0996, + "step": 22633 + }, + { + "epoch": 0.1964739889410682, + "grad_norm": 0.53515625, + "learning_rate": 0.001779353595343017, + "loss": 0.1191, + "step": 22634 + }, + { + "epoch": 0.19648266942127238, + "grad_norm": 0.11474609375, + "learning_rate": 0.0017793340734177673, + "loss": 0.0977, + "step": 22635 + }, + { + "epoch": 0.19649134990147654, + "grad_norm": 0.9765625, + "learning_rate": 0.0017793145507496082, + "loss": 0.1484, + "step": 22636 + }, + { + "epoch": 0.1965000303816807, + "grad_norm": 0.953125, + "learning_rate": 0.001779295027338561, + "loss": 0.1025, + "step": 22637 + }, + { + "epoch": 0.19650871086188487, + "grad_norm": 0.359375, + "learning_rate": 0.0017792755031846468, + "loss": 0.1328, + "step": 22638 + }, + { + "epoch": 0.19651739134208904, + "grad_norm": 0.6796875, + "learning_rate": 0.0017792559782878873, + "loss": 0.0791, + "step": 22639 + }, + { + "epoch": 0.1965260718222932, + "grad_norm": 0.291015625, + "learning_rate": 0.0017792364526483034, + "loss": 0.1123, + "step": 22640 + }, + { + "epoch": 0.19653475230249737, + "grad_norm": 0.1416015625, + "learning_rate": 0.0017792169262659167, + "loss": 0.1099, + "step": 22641 + }, + { + "epoch": 0.19654343278270153, + "grad_norm": 1.109375, + "learning_rate": 0.001779197399140749, + "loss": 0.1553, + "step": 22642 + }, + { + "epoch": 0.1965521132629057, + "grad_norm": 0.6640625, + "learning_rate": 0.0017791778712728209, + "loss": 0.1172, + "step": 22643 + }, + { + "epoch": 0.19656079374310986, + "grad_norm": 0.6875, + "learning_rate": 0.0017791583426621542, + "loss": 0.1143, + "step": 22644 + }, + { + "epoch": 0.19656947422331403, + "grad_norm": 0.376953125, + "learning_rate": 0.0017791388133087704, + "loss": 0.0942, + "step": 22645 + }, + { + "epoch": 0.1965781547035182, + "grad_norm": 0.451171875, + "learning_rate": 0.0017791192832126899, + "loss": 0.1328, + "step": 22646 + }, + { + "epoch": 0.19658683518372236, + "grad_norm": 0.341796875, + "learning_rate": 0.0017790997523739352, + "loss": 0.1025, + "step": 22647 + }, + { + "epoch": 0.19659551566392652, + "grad_norm": 0.796875, + "learning_rate": 0.0017790802207925272, + "loss": 0.1484, + "step": 22648 + }, + { + "epoch": 0.1966041961441307, + "grad_norm": 0.1513671875, + "learning_rate": 0.0017790606884684871, + "loss": 0.1318, + "step": 22649 + }, + { + "epoch": 0.19661287662433485, + "grad_norm": 0.2451171875, + "learning_rate": 0.0017790411554018367, + "loss": 0.1426, + "step": 22650 + }, + { + "epoch": 0.19662155710453902, + "grad_norm": 0.2109375, + "learning_rate": 0.001779021621592597, + "loss": 0.1055, + "step": 22651 + }, + { + "epoch": 0.19663023758474318, + "grad_norm": 0.76171875, + "learning_rate": 0.0017790020870407895, + "loss": 0.1318, + "step": 22652 + }, + { + "epoch": 0.19663891806494735, + "grad_norm": 0.310546875, + "learning_rate": 
0.0017789825517464356, + "loss": 0.0928, + "step": 22653 + }, + { + "epoch": 0.19664759854515151, + "grad_norm": 0.396484375, + "learning_rate": 0.0017789630157095562, + "loss": 0.1377, + "step": 22654 + }, + { + "epoch": 0.19665627902535568, + "grad_norm": 0.296875, + "learning_rate": 0.0017789434789301736, + "loss": 0.0732, + "step": 22655 + }, + { + "epoch": 0.19666495950555984, + "grad_norm": 0.09423828125, + "learning_rate": 0.001778923941408308, + "loss": 0.1021, + "step": 22656 + }, + { + "epoch": 0.196673639985764, + "grad_norm": 0.08203125, + "learning_rate": 0.0017789044031439815, + "loss": 0.1138, + "step": 22657 + }, + { + "epoch": 0.19668232046596817, + "grad_norm": 0.12890625, + "learning_rate": 0.0017788848641372157, + "loss": 0.1211, + "step": 22658 + }, + { + "epoch": 0.19669100094617234, + "grad_norm": 0.205078125, + "learning_rate": 0.0017788653243880315, + "loss": 0.083, + "step": 22659 + }, + { + "epoch": 0.1966996814263765, + "grad_norm": 0.26171875, + "learning_rate": 0.0017788457838964502, + "loss": 0.1006, + "step": 22660 + }, + { + "epoch": 0.19670836190658067, + "grad_norm": 0.435546875, + "learning_rate": 0.0017788262426624935, + "loss": 0.1016, + "step": 22661 + }, + { + "epoch": 0.19671704238678484, + "grad_norm": 0.4140625, + "learning_rate": 0.0017788067006861826, + "loss": 0.1123, + "step": 22662 + }, + { + "epoch": 0.196725722866989, + "grad_norm": 0.2275390625, + "learning_rate": 0.0017787871579675388, + "loss": 0.1299, + "step": 22663 + }, + { + "epoch": 0.19673440334719317, + "grad_norm": 0.138671875, + "learning_rate": 0.0017787676145065837, + "loss": 0.1016, + "step": 22664 + }, + { + "epoch": 0.19674308382739733, + "grad_norm": 0.1845703125, + "learning_rate": 0.0017787480703033386, + "loss": 0.0962, + "step": 22665 + }, + { + "epoch": 0.1967517643076015, + "grad_norm": 0.1474609375, + "learning_rate": 0.0017787285253578246, + "loss": 0.1016, + "step": 22666 + }, + { + "epoch": 0.19676044478780566, + "grad_norm": 0.1435546875, + "learning_rate": 0.0017787089796700633, + "loss": 0.085, + "step": 22667 + }, + { + "epoch": 0.19676912526800983, + "grad_norm": 0.0966796875, + "learning_rate": 0.001778689433240076, + "loss": 0.1187, + "step": 22668 + }, + { + "epoch": 0.196777805748214, + "grad_norm": 0.9765625, + "learning_rate": 0.0017786698860678843, + "loss": 0.125, + "step": 22669 + }, + { + "epoch": 0.19678648622841816, + "grad_norm": 0.2041015625, + "learning_rate": 0.0017786503381535092, + "loss": 0.0889, + "step": 22670 + }, + { + "epoch": 0.19679516670862232, + "grad_norm": 0.2080078125, + "learning_rate": 0.0017786307894969726, + "loss": 0.0957, + "step": 22671 + }, + { + "epoch": 0.1968038471888265, + "grad_norm": 0.11962890625, + "learning_rate": 0.0017786112400982954, + "loss": 0.0967, + "step": 22672 + }, + { + "epoch": 0.19681252766903065, + "grad_norm": 0.20703125, + "learning_rate": 0.0017785916899574993, + "loss": 0.1006, + "step": 22673 + }, + { + "epoch": 0.19682120814923482, + "grad_norm": 0.17578125, + "learning_rate": 0.0017785721390746053, + "loss": 0.1465, + "step": 22674 + }, + { + "epoch": 0.19682988862943898, + "grad_norm": 0.5390625, + "learning_rate": 0.001778552587449635, + "loss": 0.1445, + "step": 22675 + }, + { + "epoch": 0.19683856910964315, + "grad_norm": 0.41015625, + "learning_rate": 0.0017785330350826099, + "loss": 0.0972, + "step": 22676 + }, + { + "epoch": 0.1968472495898473, + "grad_norm": 0.51953125, + "learning_rate": 0.0017785134819735514, + "loss": 0.1025, + "step": 22677 + }, + { + "epoch": 
0.19685593007005148, + "grad_norm": 0.97265625, + "learning_rate": 0.0017784939281224803, + "loss": 0.1348, + "step": 22678 + }, + { + "epoch": 0.19686461055025564, + "grad_norm": 0.2099609375, + "learning_rate": 0.001778474373529419, + "loss": 0.1377, + "step": 22679 + }, + { + "epoch": 0.1968732910304598, + "grad_norm": 0.34375, + "learning_rate": 0.0017784548181943877, + "loss": 0.0977, + "step": 22680 + }, + { + "epoch": 0.19688197151066397, + "grad_norm": 0.09765625, + "learning_rate": 0.0017784352621174086, + "loss": 0.1357, + "step": 22681 + }, + { + "epoch": 0.19689065199086814, + "grad_norm": 0.2109375, + "learning_rate": 0.0017784157052985032, + "loss": 0.0884, + "step": 22682 + }, + { + "epoch": 0.1968993324710723, + "grad_norm": 0.474609375, + "learning_rate": 0.0017783961477376928, + "loss": 0.1465, + "step": 22683 + }, + { + "epoch": 0.19690801295127647, + "grad_norm": 0.63671875, + "learning_rate": 0.001778376589434998, + "loss": 0.085, + "step": 22684 + }, + { + "epoch": 0.19691669343148063, + "grad_norm": 0.39453125, + "learning_rate": 0.0017783570303904408, + "loss": 0.126, + "step": 22685 + }, + { + "epoch": 0.1969253739116848, + "grad_norm": 0.2021484375, + "learning_rate": 0.001778337470604043, + "loss": 0.1113, + "step": 22686 + }, + { + "epoch": 0.19693405439188896, + "grad_norm": 0.61328125, + "learning_rate": 0.001778317910075825, + "loss": 0.0947, + "step": 22687 + }, + { + "epoch": 0.19694273487209313, + "grad_norm": 0.71484375, + "learning_rate": 0.0017782983488058089, + "loss": 0.1406, + "step": 22688 + }, + { + "epoch": 0.1969514153522973, + "grad_norm": 0.298828125, + "learning_rate": 0.0017782787867940157, + "loss": 0.1016, + "step": 22689 + }, + { + "epoch": 0.19696009583250146, + "grad_norm": 0.1591796875, + "learning_rate": 0.0017782592240404675, + "loss": 0.0859, + "step": 22690 + }, + { + "epoch": 0.19696877631270562, + "grad_norm": 0.357421875, + "learning_rate": 0.001778239660545185, + "loss": 0.1025, + "step": 22691 + }, + { + "epoch": 0.1969774567929098, + "grad_norm": 0.423828125, + "learning_rate": 0.0017782200963081898, + "loss": 0.085, + "step": 22692 + }, + { + "epoch": 0.19698613727311395, + "grad_norm": 0.349609375, + "learning_rate": 0.0017782005313295031, + "loss": 0.1196, + "step": 22693 + }, + { + "epoch": 0.19699481775331812, + "grad_norm": 0.09326171875, + "learning_rate": 0.0017781809656091468, + "loss": 0.0986, + "step": 22694 + }, + { + "epoch": 0.19700349823352228, + "grad_norm": 0.171875, + "learning_rate": 0.0017781613991471417, + "loss": 0.1211, + "step": 22695 + }, + { + "epoch": 0.19701217871372645, + "grad_norm": 0.51171875, + "learning_rate": 0.0017781418319435093, + "loss": 0.1167, + "step": 22696 + }, + { + "epoch": 0.19702085919393061, + "grad_norm": 0.1796875, + "learning_rate": 0.0017781222639982716, + "loss": 0.0977, + "step": 22697 + }, + { + "epoch": 0.19702953967413478, + "grad_norm": 0.66796875, + "learning_rate": 0.0017781026953114496, + "loss": 0.1201, + "step": 22698 + }, + { + "epoch": 0.19703822015433894, + "grad_norm": 0.78515625, + "learning_rate": 0.0017780831258830646, + "loss": 0.0977, + "step": 22699 + }, + { + "epoch": 0.1970469006345431, + "grad_norm": 0.384765625, + "learning_rate": 0.001778063555713138, + "loss": 0.1484, + "step": 22700 + }, + { + "epoch": 0.19705558111474727, + "grad_norm": 0.16015625, + "learning_rate": 0.0017780439848016914, + "loss": 0.127, + "step": 22701 + }, + { + "epoch": 0.19706426159495144, + "grad_norm": 0.640625, + "learning_rate": 0.001778024413148746, + "loss": 0.0811, + 
"step": 22702 + }, + { + "epoch": 0.1970729420751556, + "grad_norm": 0.62109375, + "learning_rate": 0.0017780048407543233, + "loss": 0.1602, + "step": 22703 + }, + { + "epoch": 0.19708162255535977, + "grad_norm": 0.5703125, + "learning_rate": 0.0017779852676184447, + "loss": 0.1035, + "step": 22704 + }, + { + "epoch": 0.19709030303556394, + "grad_norm": 0.2294921875, + "learning_rate": 0.0017779656937411317, + "loss": 0.1299, + "step": 22705 + }, + { + "epoch": 0.1970989835157681, + "grad_norm": 0.171875, + "learning_rate": 0.0017779461191224054, + "loss": 0.1113, + "step": 22706 + }, + { + "epoch": 0.19710766399597227, + "grad_norm": 0.298828125, + "learning_rate": 0.0017779265437622876, + "loss": 0.1279, + "step": 22707 + }, + { + "epoch": 0.19711634447617643, + "grad_norm": 0.328125, + "learning_rate": 0.0017779069676607992, + "loss": 0.064, + "step": 22708 + }, + { + "epoch": 0.1971250249563806, + "grad_norm": 0.2333984375, + "learning_rate": 0.0017778873908179623, + "loss": 0.1328, + "step": 22709 + }, + { + "epoch": 0.19713370543658476, + "grad_norm": 0.1806640625, + "learning_rate": 0.0017778678132337978, + "loss": 0.1128, + "step": 22710 + }, + { + "epoch": 0.19714238591678893, + "grad_norm": 0.1337890625, + "learning_rate": 0.0017778482349083272, + "loss": 0.1074, + "step": 22711 + }, + { + "epoch": 0.1971510663969931, + "grad_norm": 0.251953125, + "learning_rate": 0.001777828655841572, + "loss": 0.1777, + "step": 22712 + }, + { + "epoch": 0.19715974687719726, + "grad_norm": 0.29296875, + "learning_rate": 0.0017778090760335534, + "loss": 0.1084, + "step": 22713 + }, + { + "epoch": 0.19716842735740142, + "grad_norm": 0.259765625, + "learning_rate": 0.001777789495484293, + "loss": 0.1094, + "step": 22714 + }, + { + "epoch": 0.1971771078376056, + "grad_norm": 0.07568359375, + "learning_rate": 0.0017777699141938125, + "loss": 0.0859, + "step": 22715 + }, + { + "epoch": 0.19718578831780975, + "grad_norm": 0.458984375, + "learning_rate": 0.0017777503321621327, + "loss": 0.1562, + "step": 22716 + }, + { + "epoch": 0.19719446879801392, + "grad_norm": 0.2294921875, + "learning_rate": 0.0017777307493892756, + "loss": 0.0811, + "step": 22717 + }, + { + "epoch": 0.19720314927821808, + "grad_norm": 0.1630859375, + "learning_rate": 0.001777711165875262, + "loss": 0.1113, + "step": 22718 + }, + { + "epoch": 0.19721182975842225, + "grad_norm": 0.12890625, + "learning_rate": 0.0017776915816201137, + "loss": 0.1133, + "step": 22719 + }, + { + "epoch": 0.1972205102386264, + "grad_norm": 0.5078125, + "learning_rate": 0.0017776719966238524, + "loss": 0.0977, + "step": 22720 + }, + { + "epoch": 0.19722919071883058, + "grad_norm": 0.0859375, + "learning_rate": 0.001777652410886499, + "loss": 0.1123, + "step": 22721 + }, + { + "epoch": 0.19723787119903474, + "grad_norm": 0.35546875, + "learning_rate": 0.0017776328244080751, + "loss": 0.0967, + "step": 22722 + }, + { + "epoch": 0.1972465516792389, + "grad_norm": 0.10107421875, + "learning_rate": 0.001777613237188602, + "loss": 0.1182, + "step": 22723 + }, + { + "epoch": 0.19725523215944307, + "grad_norm": 0.53125, + "learning_rate": 0.0017775936492281015, + "loss": 0.0767, + "step": 22724 + }, + { + "epoch": 0.19726391263964724, + "grad_norm": 0.404296875, + "learning_rate": 0.0017775740605265947, + "loss": 0.1143, + "step": 22725 + }, + { + "epoch": 0.1972725931198514, + "grad_norm": 0.48828125, + "learning_rate": 0.001777554471084103, + "loss": 0.1187, + "step": 22726 + }, + { + "epoch": 0.19728127360005557, + "grad_norm": 0.30859375, + "learning_rate": 
0.0017775348809006479, + "loss": 0.1035, + "step": 22727 + }, + { + "epoch": 0.19728995408025973, + "grad_norm": 0.119140625, + "learning_rate": 0.0017775152899762508, + "loss": 0.0854, + "step": 22728 + }, + { + "epoch": 0.1972986345604639, + "grad_norm": 0.2333984375, + "learning_rate": 0.0017774956983109332, + "loss": 0.0967, + "step": 22729 + }, + { + "epoch": 0.19730731504066806, + "grad_norm": 0.1337890625, + "learning_rate": 0.0017774761059047165, + "loss": 0.0693, + "step": 22730 + }, + { + "epoch": 0.19731599552087223, + "grad_norm": 0.408203125, + "learning_rate": 0.0017774565127576221, + "loss": 0.0747, + "step": 22731 + }, + { + "epoch": 0.1973246760010764, + "grad_norm": 0.82421875, + "learning_rate": 0.0017774369188696714, + "loss": 0.0938, + "step": 22732 + }, + { + "epoch": 0.19733335648128053, + "grad_norm": 0.2001953125, + "learning_rate": 0.001777417324240886, + "loss": 0.0859, + "step": 22733 + }, + { + "epoch": 0.1973420369614847, + "grad_norm": 0.2265625, + "learning_rate": 0.0017773977288712871, + "loss": 0.1094, + "step": 22734 + }, + { + "epoch": 0.19735071744168886, + "grad_norm": 0.291015625, + "learning_rate": 0.001777378132760896, + "loss": 0.0938, + "step": 22735 + }, + { + "epoch": 0.19735939792189303, + "grad_norm": 0.5390625, + "learning_rate": 0.001777358535909735, + "loss": 0.1177, + "step": 22736 + }, + { + "epoch": 0.1973680784020972, + "grad_norm": 0.1279296875, + "learning_rate": 0.0017773389383178243, + "loss": 0.0996, + "step": 22737 + }, + { + "epoch": 0.19737675888230136, + "grad_norm": 0.12060546875, + "learning_rate": 0.0017773193399851861, + "loss": 0.0859, + "step": 22738 + }, + { + "epoch": 0.19738543936250552, + "grad_norm": 0.138671875, + "learning_rate": 0.0017772997409118414, + "loss": 0.0742, + "step": 22739 + }, + { + "epoch": 0.1973941198427097, + "grad_norm": 0.1923828125, + "learning_rate": 0.001777280141097812, + "loss": 0.0796, + "step": 22740 + }, + { + "epoch": 0.19740280032291385, + "grad_norm": 0.1728515625, + "learning_rate": 0.0017772605405431195, + "loss": 0.0708, + "step": 22741 + }, + { + "epoch": 0.19741148080311802, + "grad_norm": 0.154296875, + "learning_rate": 0.0017772409392477848, + "loss": 0.1475, + "step": 22742 + }, + { + "epoch": 0.19742016128332218, + "grad_norm": 0.181640625, + "learning_rate": 0.0017772213372118295, + "loss": 0.0806, + "step": 22743 + }, + { + "epoch": 0.19742884176352635, + "grad_norm": 0.498046875, + "learning_rate": 0.0017772017344352756, + "loss": 0.125, + "step": 22744 + }, + { + "epoch": 0.1974375222437305, + "grad_norm": 0.314453125, + "learning_rate": 0.0017771821309181435, + "loss": 0.1406, + "step": 22745 + }, + { + "epoch": 0.19744620272393468, + "grad_norm": 0.2333984375, + "learning_rate": 0.0017771625266604554, + "loss": 0.1064, + "step": 22746 + }, + { + "epoch": 0.19745488320413884, + "grad_norm": 0.73828125, + "learning_rate": 0.0017771429216622326, + "loss": 0.1387, + "step": 22747 + }, + { + "epoch": 0.197463563684343, + "grad_norm": 0.09716796875, + "learning_rate": 0.0017771233159234966, + "loss": 0.0967, + "step": 22748 + }, + { + "epoch": 0.19747224416454717, + "grad_norm": 0.4609375, + "learning_rate": 0.0017771037094442685, + "loss": 0.1216, + "step": 22749 + }, + { + "epoch": 0.19748092464475134, + "grad_norm": 0.283203125, + "learning_rate": 0.0017770841022245697, + "loss": 0.0791, + "step": 22750 + }, + { + "epoch": 0.1974896051249555, + "grad_norm": 0.77734375, + "learning_rate": 0.0017770644942644222, + "loss": 0.1621, + "step": 22751 + }, + { + "epoch": 
0.19749828560515967, + "grad_norm": 0.2080078125, + "learning_rate": 0.0017770448855638473, + "loss": 0.1045, + "step": 22752 + }, + { + "epoch": 0.19750696608536383, + "grad_norm": 0.77734375, + "learning_rate": 0.0017770252761228661, + "loss": 0.1367, + "step": 22753 + }, + { + "epoch": 0.197515646565568, + "grad_norm": 0.70703125, + "learning_rate": 0.0017770056659415004, + "loss": 0.123, + "step": 22754 + }, + { + "epoch": 0.19752432704577216, + "grad_norm": 0.478515625, + "learning_rate": 0.0017769860550197712, + "loss": 0.106, + "step": 22755 + }, + { + "epoch": 0.19753300752597633, + "grad_norm": 0.48828125, + "learning_rate": 0.0017769664433577002, + "loss": 0.0815, + "step": 22756 + }, + { + "epoch": 0.1975416880061805, + "grad_norm": 0.255859375, + "learning_rate": 0.0017769468309553093, + "loss": 0.1279, + "step": 22757 + }, + { + "epoch": 0.19755036848638466, + "grad_norm": 0.2001953125, + "learning_rate": 0.001776927217812619, + "loss": 0.0923, + "step": 22758 + }, + { + "epoch": 0.19755904896658882, + "grad_norm": 0.2177734375, + "learning_rate": 0.0017769076039296516, + "loss": 0.0977, + "step": 22759 + }, + { + "epoch": 0.197567729446793, + "grad_norm": 0.99609375, + "learning_rate": 0.001776887989306428, + "loss": 0.1875, + "step": 22760 + }, + { + "epoch": 0.19757640992699715, + "grad_norm": 0.162109375, + "learning_rate": 0.0017768683739429701, + "loss": 0.0908, + "step": 22761 + }, + { + "epoch": 0.19758509040720132, + "grad_norm": 0.6796875, + "learning_rate": 0.001776848757839299, + "loss": 0.1104, + "step": 22762 + }, + { + "epoch": 0.19759377088740548, + "grad_norm": 0.4765625, + "learning_rate": 0.0017768291409954362, + "loss": 0.1191, + "step": 22763 + }, + { + "epoch": 0.19760245136760965, + "grad_norm": 0.55859375, + "learning_rate": 0.0017768095234114033, + "loss": 0.1055, + "step": 22764 + }, + { + "epoch": 0.19761113184781381, + "grad_norm": 0.158203125, + "learning_rate": 0.0017767899050872217, + "loss": 0.1357, + "step": 22765 + }, + { + "epoch": 0.19761981232801798, + "grad_norm": 0.4453125, + "learning_rate": 0.0017767702860229126, + "loss": 0.0781, + "step": 22766 + }, + { + "epoch": 0.19762849280822214, + "grad_norm": 0.294921875, + "learning_rate": 0.0017767506662184978, + "loss": 0.1045, + "step": 22767 + }, + { + "epoch": 0.1976371732884263, + "grad_norm": 0.1826171875, + "learning_rate": 0.0017767310456739988, + "loss": 0.0874, + "step": 22768 + }, + { + "epoch": 0.19764585376863048, + "grad_norm": 0.2333984375, + "learning_rate": 0.0017767114243894368, + "loss": 0.0693, + "step": 22769 + }, + { + "epoch": 0.19765453424883464, + "grad_norm": 0.39453125, + "learning_rate": 0.0017766918023648338, + "loss": 0.0957, + "step": 22770 + }, + { + "epoch": 0.1976632147290388, + "grad_norm": 1.0078125, + "learning_rate": 0.0017766721796002101, + "loss": 0.0967, + "step": 22771 + }, + { + "epoch": 0.19767189520924297, + "grad_norm": 0.96484375, + "learning_rate": 0.0017766525560955879, + "loss": 0.1006, + "step": 22772 + }, + { + "epoch": 0.19768057568944714, + "grad_norm": 0.55078125, + "learning_rate": 0.0017766329318509892, + "loss": 0.1162, + "step": 22773 + }, + { + "epoch": 0.1976892561696513, + "grad_norm": 0.2490234375, + "learning_rate": 0.0017766133068664346, + "loss": 0.1113, + "step": 22774 + }, + { + "epoch": 0.19769793664985547, + "grad_norm": 0.388671875, + "learning_rate": 0.0017765936811419457, + "loss": 0.1279, + "step": 22775 + }, + { + "epoch": 0.19770661713005963, + "grad_norm": 0.26953125, + "learning_rate": 0.0017765740546775441, + 
"loss": 0.1182, + "step": 22776 + }, + { + "epoch": 0.1977152976102638, + "grad_norm": 0.46484375, + "learning_rate": 0.0017765544274732517, + "loss": 0.0796, + "step": 22777 + }, + { + "epoch": 0.19772397809046796, + "grad_norm": 0.28125, + "learning_rate": 0.001776534799529089, + "loss": 0.1084, + "step": 22778 + }, + { + "epoch": 0.19773265857067213, + "grad_norm": 0.15625, + "learning_rate": 0.0017765151708450783, + "loss": 0.1328, + "step": 22779 + }, + { + "epoch": 0.1977413390508763, + "grad_norm": 0.482421875, + "learning_rate": 0.0017764955414212409, + "loss": 0.1152, + "step": 22780 + }, + { + "epoch": 0.19775001953108046, + "grad_norm": 0.64453125, + "learning_rate": 0.0017764759112575978, + "loss": 0.1118, + "step": 22781 + }, + { + "epoch": 0.19775870001128462, + "grad_norm": 0.296875, + "learning_rate": 0.0017764562803541711, + "loss": 0.1523, + "step": 22782 + }, + { + "epoch": 0.1977673804914888, + "grad_norm": 0.416015625, + "learning_rate": 0.001776436648710982, + "loss": 0.1104, + "step": 22783 + }, + { + "epoch": 0.19777606097169295, + "grad_norm": 0.1982421875, + "learning_rate": 0.0017764170163280517, + "loss": 0.0688, + "step": 22784 + }, + { + "epoch": 0.19778474145189712, + "grad_norm": 0.66796875, + "learning_rate": 0.0017763973832054022, + "loss": 0.1416, + "step": 22785 + }, + { + "epoch": 0.19779342193210128, + "grad_norm": 0.14453125, + "learning_rate": 0.0017763777493430542, + "loss": 0.1416, + "step": 22786 + }, + { + "epoch": 0.19780210241230545, + "grad_norm": 0.50390625, + "learning_rate": 0.0017763581147410298, + "loss": 0.1367, + "step": 22787 + }, + { + "epoch": 0.1978107828925096, + "grad_norm": 0.1259765625, + "learning_rate": 0.0017763384793993509, + "loss": 0.1074, + "step": 22788 + }, + { + "epoch": 0.19781946337271378, + "grad_norm": 0.6640625, + "learning_rate": 0.0017763188433180383, + "loss": 0.1045, + "step": 22789 + }, + { + "epoch": 0.19782814385291794, + "grad_norm": 0.1376953125, + "learning_rate": 0.0017762992064971129, + "loss": 0.1523, + "step": 22790 + }, + { + "epoch": 0.1978368243331221, + "grad_norm": 0.15625, + "learning_rate": 0.0017762795689365972, + "loss": 0.1045, + "step": 22791 + }, + { + "epoch": 0.19784550481332627, + "grad_norm": 0.251953125, + "learning_rate": 0.0017762599306365124, + "loss": 0.0884, + "step": 22792 + }, + { + "epoch": 0.19785418529353044, + "grad_norm": 0.23828125, + "learning_rate": 0.0017762402915968798, + "loss": 0.124, + "step": 22793 + }, + { + "epoch": 0.1978628657737346, + "grad_norm": 0.515625, + "learning_rate": 0.0017762206518177211, + "loss": 0.0859, + "step": 22794 + }, + { + "epoch": 0.19787154625393877, + "grad_norm": 0.58984375, + "learning_rate": 0.0017762010112990576, + "loss": 0.0996, + "step": 22795 + }, + { + "epoch": 0.19788022673414293, + "grad_norm": 0.41796875, + "learning_rate": 0.0017761813700409106, + "loss": 0.1279, + "step": 22796 + }, + { + "epoch": 0.1978889072143471, + "grad_norm": 0.376953125, + "learning_rate": 0.001776161728043302, + "loss": 0.1079, + "step": 22797 + }, + { + "epoch": 0.19789758769455126, + "grad_norm": 0.408203125, + "learning_rate": 0.001776142085306253, + "loss": 0.1299, + "step": 22798 + }, + { + "epoch": 0.19790626817475543, + "grad_norm": 0.54296875, + "learning_rate": 0.0017761224418297854, + "loss": 0.1289, + "step": 22799 + }, + { + "epoch": 0.1979149486549596, + "grad_norm": 0.416015625, + "learning_rate": 0.0017761027976139205, + "loss": 0.1592, + "step": 22800 + }, + { + "epoch": 0.19792362913516376, + "grad_norm": 0.0791015625, + 
"learning_rate": 0.0017760831526586797, + "loss": 0.1104, + "step": 22801 + }, + { + "epoch": 0.19793230961536792, + "grad_norm": 0.154296875, + "learning_rate": 0.0017760635069640842, + "loss": 0.1426, + "step": 22802 + }, + { + "epoch": 0.1979409900955721, + "grad_norm": 0.90234375, + "learning_rate": 0.0017760438605301562, + "loss": 0.0869, + "step": 22803 + }, + { + "epoch": 0.19794967057577625, + "grad_norm": 0.3984375, + "learning_rate": 0.0017760242133569165, + "loss": 0.1021, + "step": 22804 + }, + { + "epoch": 0.19795835105598042, + "grad_norm": 0.1572265625, + "learning_rate": 0.0017760045654443871, + "loss": 0.1201, + "step": 22805 + }, + { + "epoch": 0.19796703153618458, + "grad_norm": 0.9765625, + "learning_rate": 0.0017759849167925893, + "loss": 0.1797, + "step": 22806 + }, + { + "epoch": 0.19797571201638875, + "grad_norm": 0.609375, + "learning_rate": 0.001775965267401544, + "loss": 0.1016, + "step": 22807 + }, + { + "epoch": 0.19798439249659291, + "grad_norm": 0.2041015625, + "learning_rate": 0.001775945617271274, + "loss": 0.0801, + "step": 22808 + }, + { + "epoch": 0.19799307297679708, + "grad_norm": 0.07763671875, + "learning_rate": 0.0017759259664017995, + "loss": 0.1094, + "step": 22809 + }, + { + "epoch": 0.19800175345700124, + "grad_norm": 0.1552734375, + "learning_rate": 0.0017759063147931428, + "loss": 0.0854, + "step": 22810 + }, + { + "epoch": 0.1980104339372054, + "grad_norm": 0.1455078125, + "learning_rate": 0.001775886662445325, + "loss": 0.1045, + "step": 22811 + }, + { + "epoch": 0.19801911441740958, + "grad_norm": 0.201171875, + "learning_rate": 0.0017758670093583674, + "loss": 0.0957, + "step": 22812 + }, + { + "epoch": 0.19802779489761374, + "grad_norm": 0.2412109375, + "learning_rate": 0.0017758473555322925, + "loss": 0.1162, + "step": 22813 + }, + { + "epoch": 0.1980364753778179, + "grad_norm": 0.296875, + "learning_rate": 0.0017758277009671205, + "loss": 0.1055, + "step": 22814 + }, + { + "epoch": 0.19804515585802207, + "grad_norm": 0.578125, + "learning_rate": 0.0017758080456628735, + "loss": 0.1021, + "step": 22815 + }, + { + "epoch": 0.19805383633822624, + "grad_norm": 0.82421875, + "learning_rate": 0.0017757883896195733, + "loss": 0.1787, + "step": 22816 + }, + { + "epoch": 0.1980625168184304, + "grad_norm": 0.259765625, + "learning_rate": 0.0017757687328372408, + "loss": 0.0796, + "step": 22817 + }, + { + "epoch": 0.19807119729863457, + "grad_norm": 0.0859375, + "learning_rate": 0.0017757490753158981, + "loss": 0.1055, + "step": 22818 + }, + { + "epoch": 0.19807987777883873, + "grad_norm": 0.287109375, + "learning_rate": 0.001775729417055566, + "loss": 0.1641, + "step": 22819 + }, + { + "epoch": 0.1980885582590429, + "grad_norm": 0.474609375, + "learning_rate": 0.0017757097580562665, + "loss": 0.0996, + "step": 22820 + }, + { + "epoch": 0.19809723873924706, + "grad_norm": 0.2216796875, + "learning_rate": 0.001775690098318021, + "loss": 0.0938, + "step": 22821 + }, + { + "epoch": 0.19810591921945123, + "grad_norm": 0.8515625, + "learning_rate": 0.0017756704378408508, + "loss": 0.127, + "step": 22822 + }, + { + "epoch": 0.1981145996996554, + "grad_norm": 0.30859375, + "learning_rate": 0.0017756507766247777, + "loss": 0.1006, + "step": 22823 + }, + { + "epoch": 0.19812328017985956, + "grad_norm": 0.345703125, + "learning_rate": 0.001775631114669823, + "loss": 0.1025, + "step": 22824 + }, + { + "epoch": 0.19813196066006372, + "grad_norm": 0.451171875, + "learning_rate": 0.0017756114519760083, + "loss": 0.0981, + "step": 22825 + }, + { + "epoch": 
0.1981406411402679, + "grad_norm": 0.1455078125, + "learning_rate": 0.001775591788543355, + "loss": 0.1094, + "step": 22826 + }, + { + "epoch": 0.19814932162047205, + "grad_norm": 0.15234375, + "learning_rate": 0.001775572124371885, + "loss": 0.1079, + "step": 22827 + }, + { + "epoch": 0.19815800210067622, + "grad_norm": 0.279296875, + "learning_rate": 0.0017755524594616193, + "loss": 0.125, + "step": 22828 + }, + { + "epoch": 0.19816668258088038, + "grad_norm": 0.8359375, + "learning_rate": 0.001775532793812579, + "loss": 0.5391, + "step": 22829 + }, + { + "epoch": 0.19817536306108455, + "grad_norm": 0.34375, + "learning_rate": 0.001775513127424787, + "loss": 0.1172, + "step": 22830 + }, + { + "epoch": 0.1981840435412887, + "grad_norm": 0.09228515625, + "learning_rate": 0.0017754934602982634, + "loss": 0.1006, + "step": 22831 + }, + { + "epoch": 0.19819272402149288, + "grad_norm": 0.2314453125, + "learning_rate": 0.0017754737924330307, + "loss": 0.106, + "step": 22832 + }, + { + "epoch": 0.19820140450169704, + "grad_norm": 0.306640625, + "learning_rate": 0.00177545412382911, + "loss": 0.1084, + "step": 22833 + }, + { + "epoch": 0.1982100849819012, + "grad_norm": 0.1103515625, + "learning_rate": 0.0017754344544865226, + "loss": 0.106, + "step": 22834 + }, + { + "epoch": 0.19821876546210537, + "grad_norm": 0.51171875, + "learning_rate": 0.0017754147844052904, + "loss": 0.0879, + "step": 22835 + }, + { + "epoch": 0.19822744594230954, + "grad_norm": 0.2138671875, + "learning_rate": 0.001775395113585435, + "loss": 0.1191, + "step": 22836 + }, + { + "epoch": 0.1982361264225137, + "grad_norm": 0.625, + "learning_rate": 0.0017753754420269771, + "loss": 0.0884, + "step": 22837 + }, + { + "epoch": 0.19824480690271787, + "grad_norm": 0.1669921875, + "learning_rate": 0.001775355769729939, + "loss": 0.1025, + "step": 22838 + }, + { + "epoch": 0.19825348738292203, + "grad_norm": 0.4453125, + "learning_rate": 0.001775336096694342, + "loss": 0.1123, + "step": 22839 + }, + { + "epoch": 0.1982621678631262, + "grad_norm": 0.1533203125, + "learning_rate": 0.0017753164229202074, + "loss": 0.0889, + "step": 22840 + }, + { + "epoch": 0.19827084834333036, + "grad_norm": 0.091796875, + "learning_rate": 0.001775296748407557, + "loss": 0.1348, + "step": 22841 + }, + { + "epoch": 0.19827952882353453, + "grad_norm": 0.1025390625, + "learning_rate": 0.0017752770731564126, + "loss": 0.104, + "step": 22842 + }, + { + "epoch": 0.1982882093037387, + "grad_norm": 0.400390625, + "learning_rate": 0.0017752573971667948, + "loss": 0.1445, + "step": 22843 + }, + { + "epoch": 0.19829688978394286, + "grad_norm": 0.419921875, + "learning_rate": 0.001775237720438726, + "loss": 0.1475, + "step": 22844 + }, + { + "epoch": 0.19830557026414702, + "grad_norm": 0.294921875, + "learning_rate": 0.0017752180429722274, + "loss": 0.0742, + "step": 22845 + }, + { + "epoch": 0.1983142507443512, + "grad_norm": 0.298828125, + "learning_rate": 0.0017751983647673203, + "loss": 0.1006, + "step": 22846 + }, + { + "epoch": 0.19832293122455535, + "grad_norm": 0.2431640625, + "learning_rate": 0.0017751786858240263, + "loss": 0.1357, + "step": 22847 + }, + { + "epoch": 0.19833161170475952, + "grad_norm": 0.283203125, + "learning_rate": 0.0017751590061423673, + "loss": 0.083, + "step": 22848 + }, + { + "epoch": 0.19834029218496368, + "grad_norm": 0.341796875, + "learning_rate": 0.0017751393257223645, + "loss": 0.0913, + "step": 22849 + }, + { + "epoch": 0.19834897266516785, + "grad_norm": 0.322265625, + "learning_rate": 0.0017751196445640394, + "loss": 
"loss":
0.1182, + "step": 22850 + }, + { + "epoch": 0.19835765314537201, + "grad_norm": 0.328125, + "learning_rate": 0.0017750999626674135, + "loss": 0.1562, + "step": 22851 + }, + { + "epoch": 0.19836633362557618, + "grad_norm": 0.1484375, + "learning_rate": 0.0017750802800325087, + "loss": 0.1025, + "step": 22852 + }, + { + "epoch": 0.19837501410578035, + "grad_norm": 0.62109375, + "learning_rate": 0.001775060596659346, + "loss": 0.1328, + "step": 22853 + }, + { + "epoch": 0.1983836945859845, + "grad_norm": 0.419921875, + "learning_rate": 0.0017750409125479474, + "loss": 0.0928, + "step": 22854 + }, + { + "epoch": 0.19839237506618868, + "grad_norm": 0.271484375, + "learning_rate": 0.001775021227698334, + "loss": 0.1035, + "step": 22855 + }, + { + "epoch": 0.1984010555463928, + "grad_norm": 0.267578125, + "learning_rate": 0.0017750015421105277, + "loss": 0.1206, + "step": 22856 + }, + { + "epoch": 0.19840973602659698, + "grad_norm": 0.150390625, + "learning_rate": 0.00177498185578455, + "loss": 0.1162, + "step": 22857 + }, + { + "epoch": 0.19841841650680114, + "grad_norm": 0.46875, + "learning_rate": 0.001774962168720422, + "loss": 0.1128, + "step": 22858 + }, + { + "epoch": 0.1984270969870053, + "grad_norm": 0.1162109375, + "learning_rate": 0.0017749424809181654, + "loss": 0.1279, + "step": 22859 + }, + { + "epoch": 0.19843577746720947, + "grad_norm": 0.1923828125, + "learning_rate": 0.0017749227923778022, + "loss": 0.1182, + "step": 22860 + }, + { + "epoch": 0.19844445794741364, + "grad_norm": 0.29296875, + "learning_rate": 0.0017749031030993533, + "loss": 0.0874, + "step": 22861 + }, + { + "epoch": 0.1984531384276178, + "grad_norm": 0.30859375, + "learning_rate": 0.0017748834130828404, + "loss": 0.0737, + "step": 22862 + }, + { + "epoch": 0.19846181890782197, + "grad_norm": 0.359375, + "learning_rate": 0.0017748637223282853, + "loss": 0.1226, + "step": 22863 + }, + { + "epoch": 0.19847049938802613, + "grad_norm": 0.46875, + "learning_rate": 0.0017748440308357095, + "loss": 0.1099, + "step": 22864 + }, + { + "epoch": 0.1984791798682303, + "grad_norm": 0.091796875, + "learning_rate": 0.0017748243386051343, + "loss": 0.1348, + "step": 22865 + }, + { + "epoch": 0.19848786034843446, + "grad_norm": 0.359375, + "learning_rate": 0.0017748046456365814, + "loss": 0.1465, + "step": 22866 + }, + { + "epoch": 0.19849654082863863, + "grad_norm": 0.14453125, + "learning_rate": 0.0017747849519300723, + "loss": 0.1108, + "step": 22867 + }, + { + "epoch": 0.1985052213088428, + "grad_norm": 1.09375, + "learning_rate": 0.001774765257485628, + "loss": 0.1377, + "step": 22868 + }, + { + "epoch": 0.19851390178904696, + "grad_norm": 0.55078125, + "learning_rate": 0.0017747455623032713, + "loss": 0.1064, + "step": 22869 + }, + { + "epoch": 0.19852258226925112, + "grad_norm": 0.21484375, + "learning_rate": 0.0017747258663830228, + "loss": 0.1162, + "step": 22870 + }, + { + "epoch": 0.1985312627494553, + "grad_norm": 0.61328125, + "learning_rate": 0.001774706169724904, + "loss": 0.1055, + "step": 22871 + }, + { + "epoch": 0.19853994322965945, + "grad_norm": 0.6484375, + "learning_rate": 0.0017746864723289367, + "loss": 0.1074, + "step": 22872 + }, + { + "epoch": 0.19854862370986362, + "grad_norm": 0.349609375, + "learning_rate": 0.0017746667741951425, + "loss": 0.105, + "step": 22873 + }, + { + "epoch": 0.19855730419006778, + "grad_norm": 1.1171875, + "learning_rate": 0.0017746470753235428, + "loss": 0.1084, + "step": 22874 + }, + { + "epoch": 0.19856598467027195, + "grad_norm": 0.515625, + "learning_rate": 
0.0017746273757141594, + "loss": 0.0771, + "step": 22875 + }, + { + "epoch": 0.19857466515047612, + "grad_norm": 0.29296875, + "learning_rate": 0.0017746076753670135, + "loss": 0.0854, + "step": 22876 + }, + { + "epoch": 0.19858334563068028, + "grad_norm": 0.53515625, + "learning_rate": 0.0017745879742821268, + "loss": 0.1016, + "step": 22877 + }, + { + "epoch": 0.19859202611088445, + "grad_norm": 0.68359375, + "learning_rate": 0.0017745682724595207, + "loss": 0.123, + "step": 22878 + }, + { + "epoch": 0.1986007065910886, + "grad_norm": 0.515625, + "learning_rate": 0.001774548569899217, + "loss": 0.1094, + "step": 22879 + }, + { + "epoch": 0.19860938707129278, + "grad_norm": 0.08056640625, + "learning_rate": 0.0017745288666012372, + "loss": 0.0933, + "step": 22880 + }, + { + "epoch": 0.19861806755149694, + "grad_norm": 0.1435546875, + "learning_rate": 0.0017745091625656028, + "loss": 0.0708, + "step": 22881 + }, + { + "epoch": 0.1986267480317011, + "grad_norm": 0.1884765625, + "learning_rate": 0.001774489457792335, + "loss": 0.1104, + "step": 22882 + }, + { + "epoch": 0.19863542851190527, + "grad_norm": 0.10888671875, + "learning_rate": 0.0017744697522814558, + "loss": 0.1504, + "step": 22883 + }, + { + "epoch": 0.19864410899210944, + "grad_norm": 0.59765625, + "learning_rate": 0.0017744500460329866, + "loss": 0.1045, + "step": 22884 + }, + { + "epoch": 0.1986527894723136, + "grad_norm": 0.1435546875, + "learning_rate": 0.001774430339046949, + "loss": 0.0898, + "step": 22885 + }, + { + "epoch": 0.19866146995251777, + "grad_norm": 0.197265625, + "learning_rate": 0.0017744106313233646, + "loss": 0.0767, + "step": 22886 + }, + { + "epoch": 0.19867015043272193, + "grad_norm": 0.1904296875, + "learning_rate": 0.0017743909228622545, + "loss": 0.1436, + "step": 22887 + }, + { + "epoch": 0.1986788309129261, + "grad_norm": 0.1650390625, + "learning_rate": 0.001774371213663641, + "loss": 0.1021, + "step": 22888 + }, + { + "epoch": 0.19868751139313026, + "grad_norm": 0.3359375, + "learning_rate": 0.0017743515037275455, + "loss": 0.1074, + "step": 22889 + }, + { + "epoch": 0.19869619187333443, + "grad_norm": 0.251953125, + "learning_rate": 0.0017743317930539889, + "loss": 0.1328, + "step": 22890 + }, + { + "epoch": 0.1987048723535386, + "grad_norm": 0.447265625, + "learning_rate": 0.0017743120816429933, + "loss": 0.1562, + "step": 22891 + }, + { + "epoch": 0.19871355283374276, + "grad_norm": 0.44140625, + "learning_rate": 0.00177429236949458, + "loss": 0.0859, + "step": 22892 + }, + { + "epoch": 0.19872223331394692, + "grad_norm": 0.33203125, + "learning_rate": 0.0017742726566087707, + "loss": 0.0889, + "step": 22893 + }, + { + "epoch": 0.1987309137941511, + "grad_norm": 0.12255859375, + "learning_rate": 0.0017742529429855872, + "loss": 0.1475, + "step": 22894 + }, + { + "epoch": 0.19873959427435525, + "grad_norm": 0.84375, + "learning_rate": 0.0017742332286250506, + "loss": 0.1108, + "step": 22895 + }, + { + "epoch": 0.19874827475455942, + "grad_norm": 0.1962890625, + "learning_rate": 0.0017742135135271828, + "loss": 0.1128, + "step": 22896 + }, + { + "epoch": 0.19875695523476358, + "grad_norm": 0.39453125, + "learning_rate": 0.0017741937976920052, + "loss": 0.1279, + "step": 22897 + }, + { + "epoch": 0.19876563571496775, + "grad_norm": 0.30859375, + "learning_rate": 0.0017741740811195396, + "loss": 0.1045, + "step": 22898 + }, + { + "epoch": 0.1987743161951719, + "grad_norm": 0.11083984375, + "learning_rate": 0.001774154363809807, + "loss": 0.1377, + "step": 22899 + }, + { + "epoch": 
0.19878299667537608, + "grad_norm": 0.31640625, + "learning_rate": 0.0017741346457628296, + "loss": 0.1289, + "step": 22900 + }, + { + "epoch": 0.19879167715558024, + "grad_norm": 0.267578125, + "learning_rate": 0.0017741149269786284, + "loss": 0.0957, + "step": 22901 + }, + { + "epoch": 0.1988003576357844, + "grad_norm": 0.32421875, + "learning_rate": 0.0017740952074572255, + "loss": 0.0894, + "step": 22902 + }, + { + "epoch": 0.19880903811598857, + "grad_norm": 0.5546875, + "learning_rate": 0.0017740754871986418, + "loss": 0.1406, + "step": 22903 + }, + { + "epoch": 0.19881771859619274, + "grad_norm": 0.13671875, + "learning_rate": 0.0017740557662028998, + "loss": 0.0693, + "step": 22904 + }, + { + "epoch": 0.1988263990763969, + "grad_norm": 0.1025390625, + "learning_rate": 0.00177403604447002, + "loss": 0.1318, + "step": 22905 + }, + { + "epoch": 0.19883507955660107, + "grad_norm": 0.083984375, + "learning_rate": 0.001774016322000025, + "loss": 0.1177, + "step": 22906 + }, + { + "epoch": 0.19884376003680523, + "grad_norm": 0.306640625, + "learning_rate": 0.0017739965987929357, + "loss": 0.1011, + "step": 22907 + }, + { + "epoch": 0.1988524405170094, + "grad_norm": 0.291015625, + "learning_rate": 0.001773976874848774, + "loss": 0.3203, + "step": 22908 + }, + { + "epoch": 0.19886112099721356, + "grad_norm": 0.494140625, + "learning_rate": 0.0017739571501675612, + "loss": 0.0737, + "step": 22909 + }, + { + "epoch": 0.19886980147741773, + "grad_norm": 0.1552734375, + "learning_rate": 0.0017739374247493188, + "loss": 0.1123, + "step": 22910 + }, + { + "epoch": 0.1988784819576219, + "grad_norm": 0.244140625, + "learning_rate": 0.0017739176985940687, + "loss": 0.1357, + "step": 22911 + }, + { + "epoch": 0.19888716243782606, + "grad_norm": 0.2275390625, + "learning_rate": 0.0017738979717018323, + "loss": 0.1094, + "step": 22912 + }, + { + "epoch": 0.19889584291803022, + "grad_norm": 0.265625, + "learning_rate": 0.001773878244072631, + "loss": 0.1089, + "step": 22913 + }, + { + "epoch": 0.1989045233982344, + "grad_norm": 0.201171875, + "learning_rate": 0.001773858515706487, + "loss": 0.063, + "step": 22914 + }, + { + "epoch": 0.19891320387843855, + "grad_norm": 1.3359375, + "learning_rate": 0.001773838786603421, + "loss": 0.1128, + "step": 22915 + }, + { + "epoch": 0.19892188435864272, + "grad_norm": 0.41796875, + "learning_rate": 0.0017738190567634555, + "loss": 0.1396, + "step": 22916 + }, + { + "epoch": 0.19893056483884689, + "grad_norm": 0.416015625, + "learning_rate": 0.0017737993261866115, + "loss": 0.0942, + "step": 22917 + }, + { + "epoch": 0.19893924531905105, + "grad_norm": 0.345703125, + "learning_rate": 0.0017737795948729105, + "loss": 0.0938, + "step": 22918 + }, + { + "epoch": 0.19894792579925522, + "grad_norm": 0.09765625, + "learning_rate": 0.0017737598628223743, + "loss": 0.1055, + "step": 22919 + }, + { + "epoch": 0.19895660627945938, + "grad_norm": 0.8671875, + "learning_rate": 0.0017737401300350244, + "loss": 0.1133, + "step": 22920 + }, + { + "epoch": 0.19896528675966355, + "grad_norm": 0.130859375, + "learning_rate": 0.0017737203965108826, + "loss": 0.127, + "step": 22921 + }, + { + "epoch": 0.1989739672398677, + "grad_norm": 0.1171875, + "learning_rate": 0.0017737006622499704, + "loss": 0.1152, + "step": 22922 + }, + { + "epoch": 0.19898264772007188, + "grad_norm": 0.65234375, + "learning_rate": 0.001773680927252309, + "loss": 0.1162, + "step": 22923 + }, + { + "epoch": 0.19899132820027604, + "grad_norm": 0.953125, + "learning_rate": 0.0017736611915179204, + "loss": 
0.1079, + "step": 22924 + }, + { + "epoch": 0.1990000086804802, + "grad_norm": 0.62890625, + "learning_rate": 0.0017736414550468258, + "loss": 0.1064, + "step": 22925 + }, + { + "epoch": 0.19900868916068437, + "grad_norm": 0.96875, + "learning_rate": 0.0017736217178390473, + "loss": 0.1475, + "step": 22926 + }, + { + "epoch": 0.19901736964088854, + "grad_norm": 0.365234375, + "learning_rate": 0.0017736019798946059, + "loss": 0.1475, + "step": 22927 + }, + { + "epoch": 0.1990260501210927, + "grad_norm": 0.0986328125, + "learning_rate": 0.0017735822412135239, + "loss": 0.0952, + "step": 22928 + }, + { + "epoch": 0.19903473060129687, + "grad_norm": 0.458984375, + "learning_rate": 0.0017735625017958224, + "loss": 0.1055, + "step": 22929 + }, + { + "epoch": 0.19904341108150103, + "grad_norm": 0.349609375, + "learning_rate": 0.0017735427616415228, + "loss": 0.1016, + "step": 22930 + }, + { + "epoch": 0.1990520915617052, + "grad_norm": 0.75390625, + "learning_rate": 0.0017735230207506472, + "loss": 0.0728, + "step": 22931 + }, + { + "epoch": 0.19906077204190936, + "grad_norm": 0.1572265625, + "learning_rate": 0.0017735032791232168, + "loss": 0.1201, + "step": 22932 + }, + { + "epoch": 0.19906945252211353, + "grad_norm": 0.6875, + "learning_rate": 0.0017734835367592533, + "loss": 0.1113, + "step": 22933 + }, + { + "epoch": 0.1990781330023177, + "grad_norm": 0.1064453125, + "learning_rate": 0.0017734637936587785, + "loss": 0.1216, + "step": 22934 + }, + { + "epoch": 0.19908681348252186, + "grad_norm": 0.23828125, + "learning_rate": 0.0017734440498218137, + "loss": 0.1084, + "step": 22935 + }, + { + "epoch": 0.19909549396272602, + "grad_norm": 0.625, + "learning_rate": 0.0017734243052483806, + "loss": 0.1426, + "step": 22936 + }, + { + "epoch": 0.1991041744429302, + "grad_norm": 0.2138671875, + "learning_rate": 0.0017734045599385006, + "loss": 0.1079, + "step": 22937 + }, + { + "epoch": 0.19911285492313435, + "grad_norm": 0.498046875, + "learning_rate": 0.001773384813892196, + "loss": 0.0908, + "step": 22938 + }, + { + "epoch": 0.19912153540333852, + "grad_norm": 0.419921875, + "learning_rate": 0.0017733650671094875, + "loss": 0.0908, + "step": 22939 + }, + { + "epoch": 0.19913021588354268, + "grad_norm": 0.6875, + "learning_rate": 0.001773345319590397, + "loss": 0.1211, + "step": 22940 + }, + { + "epoch": 0.19913889636374685, + "grad_norm": 0.1337890625, + "learning_rate": 0.0017733255713349462, + "loss": 0.125, + "step": 22941 + }, + { + "epoch": 0.199147576843951, + "grad_norm": 0.34375, + "learning_rate": 0.001773305822343157, + "loss": 0.1416, + "step": 22942 + }, + { + "epoch": 0.19915625732415518, + "grad_norm": 0.11962890625, + "learning_rate": 0.001773286072615051, + "loss": 0.1128, + "step": 22943 + }, + { + "epoch": 0.19916493780435934, + "grad_norm": 0.12353515625, + "learning_rate": 0.0017732663221506487, + "loss": 0.1318, + "step": 22944 + }, + { + "epoch": 0.1991736182845635, + "grad_norm": 0.3203125, + "learning_rate": 0.0017732465709499727, + "loss": 0.1064, + "step": 22945 + }, + { + "epoch": 0.19918229876476767, + "grad_norm": 0.07080078125, + "learning_rate": 0.0017732268190130444, + "loss": 0.083, + "step": 22946 + }, + { + "epoch": 0.19919097924497184, + "grad_norm": 0.9609375, + "learning_rate": 0.0017732070663398851, + "loss": 0.1826, + "step": 22947 + }, + { + "epoch": 0.199199659725176, + "grad_norm": 0.1708984375, + "learning_rate": 0.0017731873129305169, + "loss": 0.0718, + "step": 22948 + }, + { + "epoch": 0.19920834020538017, + "grad_norm": 0.416015625, + 
"learning_rate": 0.0017731675587849614, + "loss": 0.1099, + "step": 22949 + }, + { + "epoch": 0.19921702068558433, + "grad_norm": 0.58203125, + "learning_rate": 0.0017731478039032399, + "loss": 0.0986, + "step": 22950 + }, + { + "epoch": 0.1992257011657885, + "grad_norm": 0.71875, + "learning_rate": 0.0017731280482853739, + "loss": 0.1299, + "step": 22951 + }, + { + "epoch": 0.19923438164599266, + "grad_norm": 0.2060546875, + "learning_rate": 0.001773108291931385, + "loss": 0.0791, + "step": 22952 + }, + { + "epoch": 0.19924306212619683, + "grad_norm": 0.0927734375, + "learning_rate": 0.0017730885348412953, + "loss": 0.0957, + "step": 22953 + }, + { + "epoch": 0.199251742606401, + "grad_norm": 0.51953125, + "learning_rate": 0.001773068777015126, + "loss": 0.1094, + "step": 22954 + }, + { + "epoch": 0.19926042308660516, + "grad_norm": 0.283203125, + "learning_rate": 0.001773049018452899, + "loss": 0.1299, + "step": 22955 + }, + { + "epoch": 0.19926910356680932, + "grad_norm": 0.6484375, + "learning_rate": 0.0017730292591546352, + "loss": 0.1914, + "step": 22956 + }, + { + "epoch": 0.1992777840470135, + "grad_norm": 0.2255859375, + "learning_rate": 0.0017730094991203571, + "loss": 0.1475, + "step": 22957 + }, + { + "epoch": 0.19928646452721765, + "grad_norm": 0.640625, + "learning_rate": 0.001772989738350086, + "loss": 0.125, + "step": 22958 + }, + { + "epoch": 0.19929514500742182, + "grad_norm": 0.8125, + "learning_rate": 0.0017729699768438431, + "loss": 0.0947, + "step": 22959 + }, + { + "epoch": 0.19930382548762599, + "grad_norm": 0.2060546875, + "learning_rate": 0.0017729502146016508, + "loss": 0.1206, + "step": 22960 + }, + { + "epoch": 0.19931250596783015, + "grad_norm": 0.08544921875, + "learning_rate": 0.0017729304516235298, + "loss": 0.1123, + "step": 22961 + }, + { + "epoch": 0.19932118644803432, + "grad_norm": 0.375, + "learning_rate": 0.0017729106879095023, + "loss": 0.1133, + "step": 22962 + }, + { + "epoch": 0.19932986692823848, + "grad_norm": 0.453125, + "learning_rate": 0.0017728909234595899, + "loss": 0.1504, + "step": 22963 + }, + { + "epoch": 0.19933854740844265, + "grad_norm": 0.1259765625, + "learning_rate": 0.001772871158273814, + "loss": 0.0762, + "step": 22964 + }, + { + "epoch": 0.1993472278886468, + "grad_norm": 0.400390625, + "learning_rate": 0.0017728513923521963, + "loss": 0.0806, + "step": 22965 + }, + { + "epoch": 0.19935590836885098, + "grad_norm": 0.13671875, + "learning_rate": 0.0017728316256947583, + "loss": 0.1045, + "step": 22966 + }, + { + "epoch": 0.19936458884905514, + "grad_norm": 0.115234375, + "learning_rate": 0.001772811858301522, + "loss": 0.1221, + "step": 22967 + }, + { + "epoch": 0.1993732693292593, + "grad_norm": 0.33984375, + "learning_rate": 0.0017727920901725086, + "loss": 0.1045, + "step": 22968 + }, + { + "epoch": 0.19938194980946347, + "grad_norm": 0.353515625, + "learning_rate": 0.0017727723213077401, + "loss": 0.0977, + "step": 22969 + }, + { + "epoch": 0.19939063028966764, + "grad_norm": 0.451171875, + "learning_rate": 0.0017727525517072377, + "loss": 0.0918, + "step": 22970 + }, + { + "epoch": 0.1993993107698718, + "grad_norm": 0.6640625, + "learning_rate": 0.0017727327813710232, + "loss": 0.0981, + "step": 22971 + }, + { + "epoch": 0.19940799125007597, + "grad_norm": 0.97265625, + "learning_rate": 0.0017727130102991185, + "loss": 0.1641, + "step": 22972 + }, + { + "epoch": 0.19941667173028013, + "grad_norm": 0.396484375, + "learning_rate": 0.0017726932384915445, + "loss": 0.0996, + "step": 22973 + }, + { + "epoch": 
0.1994253522104843, + "grad_norm": 0.58203125, + "learning_rate": 0.0017726734659483235, + "loss": 0.1182, + "step": 22974 + }, + { + "epoch": 0.19943403269068846, + "grad_norm": 0.22265625, + "learning_rate": 0.0017726536926694772, + "loss": 0.0928, + "step": 22975 + }, + { + "epoch": 0.19944271317089263, + "grad_norm": 0.1630859375, + "learning_rate": 0.0017726339186550266, + "loss": 0.1533, + "step": 22976 + }, + { + "epoch": 0.1994513936510968, + "grad_norm": 0.09814453125, + "learning_rate": 0.0017726141439049937, + "loss": 0.1021, + "step": 22977 + }, + { + "epoch": 0.19946007413130096, + "grad_norm": 0.10009765625, + "learning_rate": 0.0017725943684194002, + "loss": 0.1016, + "step": 22978 + }, + { + "epoch": 0.1994687546115051, + "grad_norm": 0.314453125, + "learning_rate": 0.0017725745921982675, + "loss": 0.1187, + "step": 22979 + }, + { + "epoch": 0.19947743509170926, + "grad_norm": 0.248046875, + "learning_rate": 0.0017725548152416172, + "loss": 0.085, + "step": 22980 + }, + { + "epoch": 0.19948611557191342, + "grad_norm": 0.3671875, + "learning_rate": 0.0017725350375494711, + "loss": 0.0923, + "step": 22981 + }, + { + "epoch": 0.1994947960521176, + "grad_norm": 0.75, + "learning_rate": 0.001772515259121851, + "loss": 0.1436, + "step": 22982 + }, + { + "epoch": 0.19950347653232176, + "grad_norm": 0.65625, + "learning_rate": 0.0017724954799587783, + "loss": 0.1211, + "step": 22983 + }, + { + "epoch": 0.19951215701252592, + "grad_norm": 0.326171875, + "learning_rate": 0.0017724757000602744, + "loss": 0.1396, + "step": 22984 + }, + { + "epoch": 0.19952083749273009, + "grad_norm": 0.671875, + "learning_rate": 0.0017724559194263615, + "loss": 0.0972, + "step": 22985 + }, + { + "epoch": 0.19952951797293425, + "grad_norm": 0.302734375, + "learning_rate": 0.0017724361380570606, + "loss": 0.0762, + "step": 22986 + }, + { + "epoch": 0.19953819845313842, + "grad_norm": 0.59375, + "learning_rate": 0.0017724163559523938, + "loss": 0.127, + "step": 22987 + }, + { + "epoch": 0.19954687893334258, + "grad_norm": 0.6640625, + "learning_rate": 0.0017723965731123825, + "loss": 0.1133, + "step": 22988 + }, + { + "epoch": 0.19955555941354675, + "grad_norm": 0.361328125, + "learning_rate": 0.0017723767895370486, + "loss": 0.1143, + "step": 22989 + }, + { + "epoch": 0.1995642398937509, + "grad_norm": 0.2333984375, + "learning_rate": 0.0017723570052264133, + "loss": 0.1162, + "step": 22990 + }, + { + "epoch": 0.19957292037395508, + "grad_norm": 0.1884765625, + "learning_rate": 0.0017723372201804987, + "loss": 0.1104, + "step": 22991 + }, + { + "epoch": 0.19958160085415924, + "grad_norm": 0.56640625, + "learning_rate": 0.0017723174343993262, + "loss": 0.0938, + "step": 22992 + }, + { + "epoch": 0.1995902813343634, + "grad_norm": 0.234375, + "learning_rate": 0.0017722976478829172, + "loss": 0.1016, + "step": 22993 + }, + { + "epoch": 0.19959896181456757, + "grad_norm": 0.1982421875, + "learning_rate": 0.0017722778606312937, + "loss": 0.1885, + "step": 22994 + }, + { + "epoch": 0.19960764229477174, + "grad_norm": 0.9609375, + "learning_rate": 0.0017722580726444773, + "loss": 0.1133, + "step": 22995 + }, + { + "epoch": 0.1996163227749759, + "grad_norm": 0.5625, + "learning_rate": 0.0017722382839224895, + "loss": 0.1079, + "step": 22996 + }, + { + "epoch": 0.19962500325518007, + "grad_norm": 0.330078125, + "learning_rate": 0.0017722184944653522, + "loss": 0.0981, + "step": 22997 + }, + { + "epoch": 0.19963368373538423, + "grad_norm": 0.416015625, + "learning_rate": 0.0017721987042730867, + "loss": 0.1309, + 
"step": 22998 + }, + { + "epoch": 0.1996423642155884, + "grad_norm": 0.1435546875, + "learning_rate": 0.001772178913345715, + "loss": 0.1177, + "step": 22999 + }, + { + "epoch": 0.19965104469579256, + "grad_norm": 0.1943359375, + "learning_rate": 0.0017721591216832582, + "loss": 0.1172, + "step": 23000 + }, + { + "epoch": 0.19965972517599673, + "grad_norm": 0.12255859375, + "learning_rate": 0.0017721393292857387, + "loss": 0.1152, + "step": 23001 + }, + { + "epoch": 0.1996684056562009, + "grad_norm": 0.84375, + "learning_rate": 0.0017721195361531771, + "loss": 0.1406, + "step": 23002 + }, + { + "epoch": 0.19967708613640506, + "grad_norm": 0.19921875, + "learning_rate": 0.0017720997422855961, + "loss": 0.1011, + "step": 23003 + }, + { + "epoch": 0.19968576661660922, + "grad_norm": 0.32421875, + "learning_rate": 0.0017720799476830169, + "loss": 0.1299, + "step": 23004 + }, + { + "epoch": 0.1996944470968134, + "grad_norm": 0.1689453125, + "learning_rate": 0.0017720601523454613, + "loss": 0.0962, + "step": 23005 + }, + { + "epoch": 0.19970312757701755, + "grad_norm": 0.2265625, + "learning_rate": 0.0017720403562729505, + "loss": 0.0879, + "step": 23006 + }, + { + "epoch": 0.19971180805722172, + "grad_norm": 0.248046875, + "learning_rate": 0.0017720205594655067, + "loss": 0.085, + "step": 23007 + }, + { + "epoch": 0.19972048853742588, + "grad_norm": 0.123046875, + "learning_rate": 0.0017720007619231512, + "loss": 0.0889, + "step": 23008 + }, + { + "epoch": 0.19972916901763005, + "grad_norm": 0.310546875, + "learning_rate": 0.001771980963645906, + "loss": 0.1074, + "step": 23009 + }, + { + "epoch": 0.1997378494978342, + "grad_norm": 0.130859375, + "learning_rate": 0.0017719611646337923, + "loss": 0.123, + "step": 23010 + }, + { + "epoch": 0.19974652997803838, + "grad_norm": 2.34375, + "learning_rate": 0.0017719413648868319, + "loss": 0.6016, + "step": 23011 + }, + { + "epoch": 0.19975521045824254, + "grad_norm": 0.06982421875, + "learning_rate": 0.0017719215644050467, + "loss": 0.0918, + "step": 23012 + }, + { + "epoch": 0.1997638909384467, + "grad_norm": 0.279296875, + "learning_rate": 0.001771901763188458, + "loss": 0.1167, + "step": 23013 + }, + { + "epoch": 0.19977257141865087, + "grad_norm": 0.404296875, + "learning_rate": 0.0017718819612370876, + "loss": 0.1387, + "step": 23014 + }, + { + "epoch": 0.19978125189885504, + "grad_norm": 0.171875, + "learning_rate": 0.0017718621585509574, + "loss": 0.1104, + "step": 23015 + }, + { + "epoch": 0.1997899323790592, + "grad_norm": 0.52734375, + "learning_rate": 0.0017718423551300886, + "loss": 0.0986, + "step": 23016 + }, + { + "epoch": 0.19979861285926337, + "grad_norm": 0.2431640625, + "learning_rate": 0.0017718225509745032, + "loss": 0.1475, + "step": 23017 + }, + { + "epoch": 0.19980729333946753, + "grad_norm": 0.44921875, + "learning_rate": 0.0017718027460842227, + "loss": 0.1035, + "step": 23018 + }, + { + "epoch": 0.1998159738196717, + "grad_norm": 0.1826171875, + "learning_rate": 0.001771782940459269, + "loss": 0.0947, + "step": 23019 + }, + { + "epoch": 0.19982465429987586, + "grad_norm": 0.142578125, + "learning_rate": 0.0017717631340996634, + "loss": 0.1064, + "step": 23020 + }, + { + "epoch": 0.19983333478008003, + "grad_norm": 0.21484375, + "learning_rate": 0.0017717433270054278, + "loss": 0.0835, + "step": 23021 + }, + { + "epoch": 0.1998420152602842, + "grad_norm": 0.1279296875, + "learning_rate": 0.0017717235191765839, + "loss": 0.1162, + "step": 23022 + }, + { + "epoch": 0.19985069574048836, + "grad_norm": 0.369140625, + 
"learning_rate": 0.001771703710613153, + "loss": 0.1289, + "step": 23023 + }, + { + "epoch": 0.19985937622069253, + "grad_norm": 0.4453125, + "learning_rate": 0.001771683901315157, + "loss": 0.1235, + "step": 23024 + }, + { + "epoch": 0.1998680567008967, + "grad_norm": 0.1787109375, + "learning_rate": 0.0017716640912826178, + "loss": 0.0908, + "step": 23025 + }, + { + "epoch": 0.19987673718110086, + "grad_norm": 0.107421875, + "learning_rate": 0.001771644280515557, + "loss": 0.0669, + "step": 23026 + }, + { + "epoch": 0.19988541766130502, + "grad_norm": 0.361328125, + "learning_rate": 0.0017716244690139955, + "loss": 0.0933, + "step": 23027 + }, + { + "epoch": 0.19989409814150919, + "grad_norm": 0.07861328125, + "learning_rate": 0.0017716046567779562, + "loss": 0.0947, + "step": 23028 + }, + { + "epoch": 0.19990277862171335, + "grad_norm": 0.41796875, + "learning_rate": 0.0017715848438074598, + "loss": 0.0762, + "step": 23029 + }, + { + "epoch": 0.19991145910191752, + "grad_norm": 0.466796875, + "learning_rate": 0.0017715650301025284, + "loss": 0.1152, + "step": 23030 + }, + { + "epoch": 0.19992013958212168, + "grad_norm": 0.28125, + "learning_rate": 0.0017715452156631835, + "loss": 0.1592, + "step": 23031 + }, + { + "epoch": 0.19992882006232585, + "grad_norm": 0.3828125, + "learning_rate": 0.0017715254004894469, + "loss": 0.1011, + "step": 23032 + }, + { + "epoch": 0.19993750054253, + "grad_norm": 0.0908203125, + "learning_rate": 0.0017715055845813402, + "loss": 0.0664, + "step": 23033 + }, + { + "epoch": 0.19994618102273418, + "grad_norm": 0.14453125, + "learning_rate": 0.0017714857679388852, + "loss": 0.0972, + "step": 23034 + }, + { + "epoch": 0.19995486150293834, + "grad_norm": 0.10888671875, + "learning_rate": 0.001771465950562103, + "loss": 0.084, + "step": 23035 + }, + { + "epoch": 0.1999635419831425, + "grad_norm": 0.6953125, + "learning_rate": 0.001771446132451016, + "loss": 0.1289, + "step": 23036 + }, + { + "epoch": 0.19997222246334667, + "grad_norm": 0.359375, + "learning_rate": 0.0017714263136056458, + "loss": 0.1016, + "step": 23037 + }, + { + "epoch": 0.19998090294355084, + "grad_norm": 0.2138671875, + "learning_rate": 0.0017714064940260138, + "loss": 0.1641, + "step": 23038 + }, + { + "epoch": 0.199989583423755, + "grad_norm": 0.2138671875, + "learning_rate": 0.0017713866737121416, + "loss": 0.1602, + "step": 23039 + }, + { + "epoch": 0.19999826390395917, + "grad_norm": 0.3671875, + "learning_rate": 0.0017713668526640513, + "loss": 0.0801, + "step": 23040 + }, + { + "epoch": 0.20000694438416333, + "grad_norm": 0.1396484375, + "learning_rate": 0.001771347030881764, + "loss": 0.126, + "step": 23041 + }, + { + "epoch": 0.2000156248643675, + "grad_norm": 0.341796875, + "learning_rate": 0.0017713272083653018, + "loss": 0.1396, + "step": 23042 + }, + { + "epoch": 0.20002430534457166, + "grad_norm": 0.41796875, + "learning_rate": 0.0017713073851146862, + "loss": 0.1182, + "step": 23043 + }, + { + "epoch": 0.20003298582477583, + "grad_norm": 0.55078125, + "learning_rate": 0.001771287561129939, + "loss": 0.084, + "step": 23044 + }, + { + "epoch": 0.20004166630498, + "grad_norm": 0.146484375, + "learning_rate": 0.0017712677364110816, + "loss": 0.0996, + "step": 23045 + }, + { + "epoch": 0.20005034678518416, + "grad_norm": 0.123046875, + "learning_rate": 0.0017712479109581358, + "loss": 0.1191, + "step": 23046 + }, + { + "epoch": 0.20005902726538832, + "grad_norm": 0.1904296875, + "learning_rate": 0.0017712280847711238, + "loss": 0.1045, + "step": 23047 + }, + { + "epoch": 
0.2000677077455925, + "grad_norm": 0.267578125, + "learning_rate": 0.0017712082578500668, + "loss": 0.0898, + "step": 23048 + }, + { + "epoch": 0.20007638822579665, + "grad_norm": 0.419921875, + "learning_rate": 0.0017711884301949866, + "loss": 0.0928, + "step": 23049 + }, + { + "epoch": 0.20008506870600082, + "grad_norm": 0.455078125, + "learning_rate": 0.0017711686018059044, + "loss": 0.1357, + "step": 23050 + }, + { + "epoch": 0.20009374918620498, + "grad_norm": 0.330078125, + "learning_rate": 0.0017711487726828426, + "loss": 0.0957, + "step": 23051 + }, + { + "epoch": 0.20010242966640915, + "grad_norm": 0.267578125, + "learning_rate": 0.0017711289428258225, + "loss": 0.1621, + "step": 23052 + }, + { + "epoch": 0.2001111101466133, + "grad_norm": 0.2119140625, + "learning_rate": 0.001771109112234866, + "loss": 0.0869, + "step": 23053 + }, + { + "epoch": 0.20011979062681748, + "grad_norm": 1.390625, + "learning_rate": 0.0017710892809099945, + "loss": 0.1387, + "step": 23054 + }, + { + "epoch": 0.20012847110702164, + "grad_norm": 0.15234375, + "learning_rate": 0.0017710694488512295, + "loss": 0.0908, + "step": 23055 + }, + { + "epoch": 0.2001371515872258, + "grad_norm": 0.52734375, + "learning_rate": 0.0017710496160585937, + "loss": 0.1387, + "step": 23056 + }, + { + "epoch": 0.20014583206742997, + "grad_norm": 0.388671875, + "learning_rate": 0.001771029782532108, + "loss": 0.1738, + "step": 23057 + }, + { + "epoch": 0.20015451254763414, + "grad_norm": 0.2119140625, + "learning_rate": 0.0017710099482717938, + "loss": 0.1216, + "step": 23058 + }, + { + "epoch": 0.2001631930278383, + "grad_norm": 1.640625, + "learning_rate": 0.0017709901132776735, + "loss": 0.1289, + "step": 23059 + }, + { + "epoch": 0.20017187350804247, + "grad_norm": 0.5390625, + "learning_rate": 0.0017709702775497685, + "loss": 0.1484, + "step": 23060 + }, + { + "epoch": 0.20018055398824663, + "grad_norm": 0.8046875, + "learning_rate": 0.0017709504410881004, + "loss": 0.1289, + "step": 23061 + }, + { + "epoch": 0.2001892344684508, + "grad_norm": 0.5078125, + "learning_rate": 0.0017709306038926908, + "loss": 0.1191, + "step": 23062 + }, + { + "epoch": 0.20019791494865496, + "grad_norm": 0.34765625, + "learning_rate": 0.001770910765963562, + "loss": 0.125, + "step": 23063 + }, + { + "epoch": 0.20020659542885913, + "grad_norm": 0.318359375, + "learning_rate": 0.001770890927300735, + "loss": 0.0859, + "step": 23064 + }, + { + "epoch": 0.2002152759090633, + "grad_norm": 0.09765625, + "learning_rate": 0.0017708710879042319, + "loss": 0.124, + "step": 23065 + }, + { + "epoch": 0.20022395638926746, + "grad_norm": 0.337890625, + "learning_rate": 0.0017708512477740742, + "loss": 0.1377, + "step": 23066 + }, + { + "epoch": 0.20023263686947163, + "grad_norm": 0.421875, + "learning_rate": 0.0017708314069102835, + "loss": 0.0962, + "step": 23067 + }, + { + "epoch": 0.2002413173496758, + "grad_norm": 0.51171875, + "learning_rate": 0.0017708115653128817, + "loss": 0.1006, + "step": 23068 + }, + { + "epoch": 0.20024999782987996, + "grad_norm": 0.66015625, + "learning_rate": 0.0017707917229818908, + "loss": 0.127, + "step": 23069 + }, + { + "epoch": 0.20025867831008412, + "grad_norm": 0.1533203125, + "learning_rate": 0.001770771879917332, + "loss": 0.1094, + "step": 23070 + }, + { + "epoch": 0.20026735879028829, + "grad_norm": 0.478515625, + "learning_rate": 0.0017707520361192268, + "loss": 0.1055, + "step": 23071 + }, + { + "epoch": 0.20027603927049245, + "grad_norm": 1.0390625, + "learning_rate": 0.0017707321915875973, + "loss": 0.1387, 
+ "step": 23072 + }, + { + "epoch": 0.20028471975069662, + "grad_norm": 0.376953125, + "learning_rate": 0.0017707123463224655, + "loss": 0.1191, + "step": 23073 + }, + { + "epoch": 0.20029340023090078, + "grad_norm": 0.515625, + "learning_rate": 0.0017706925003238526, + "loss": 0.1064, + "step": 23074 + }, + { + "epoch": 0.20030208071110495, + "grad_norm": 0.345703125, + "learning_rate": 0.0017706726535917804, + "loss": 0.0933, + "step": 23075 + }, + { + "epoch": 0.2003107611913091, + "grad_norm": 0.71484375, + "learning_rate": 0.0017706528061262707, + "loss": 0.0898, + "step": 23076 + }, + { + "epoch": 0.20031944167151328, + "grad_norm": 0.126953125, + "learning_rate": 0.0017706329579273452, + "loss": 0.1582, + "step": 23077 + }, + { + "epoch": 0.20032812215171744, + "grad_norm": 0.78515625, + "learning_rate": 0.0017706131089950256, + "loss": 0.1182, + "step": 23078 + }, + { + "epoch": 0.2003368026319216, + "grad_norm": 0.44921875, + "learning_rate": 0.0017705932593293334, + "loss": 0.0874, + "step": 23079 + }, + { + "epoch": 0.20034548311212577, + "grad_norm": 1.0234375, + "learning_rate": 0.0017705734089302908, + "loss": 0.125, + "step": 23080 + }, + { + "epoch": 0.20035416359232994, + "grad_norm": 0.361328125, + "learning_rate": 0.0017705535577979188, + "loss": 0.123, + "step": 23081 + }, + { + "epoch": 0.2003628440725341, + "grad_norm": 0.408203125, + "learning_rate": 0.0017705337059322397, + "loss": 0.0879, + "step": 23082 + }, + { + "epoch": 0.20037152455273827, + "grad_norm": 0.390625, + "learning_rate": 0.0017705138533332752, + "loss": 0.1006, + "step": 23083 + }, + { + "epoch": 0.20038020503294243, + "grad_norm": 0.2734375, + "learning_rate": 0.0017704940000010465, + "loss": 0.1113, + "step": 23084 + }, + { + "epoch": 0.2003888855131466, + "grad_norm": 0.2197265625, + "learning_rate": 0.001770474145935576, + "loss": 0.0947, + "step": 23085 + }, + { + "epoch": 0.20039756599335076, + "grad_norm": 0.173828125, + "learning_rate": 0.0017704542911368849, + "loss": 0.0859, + "step": 23086 + }, + { + "epoch": 0.20040624647355493, + "grad_norm": 0.71484375, + "learning_rate": 0.001770434435604995, + "loss": 0.1309, + "step": 23087 + }, + { + "epoch": 0.2004149269537591, + "grad_norm": 0.439453125, + "learning_rate": 0.001770414579339928, + "loss": 0.1777, + "step": 23088 + }, + { + "epoch": 0.20042360743396326, + "grad_norm": 0.271484375, + "learning_rate": 0.0017703947223417056, + "loss": 0.123, + "step": 23089 + }, + { + "epoch": 0.20043228791416742, + "grad_norm": 0.2451171875, + "learning_rate": 0.00177037486461035, + "loss": 0.1094, + "step": 23090 + }, + { + "epoch": 0.2004409683943716, + "grad_norm": 0.490234375, + "learning_rate": 0.0017703550061458821, + "loss": 0.1006, + "step": 23091 + }, + { + "epoch": 0.20044964887457575, + "grad_norm": 0.31640625, + "learning_rate": 0.0017703351469483244, + "loss": 0.0938, + "step": 23092 + }, + { + "epoch": 0.20045832935477992, + "grad_norm": 0.265625, + "learning_rate": 0.001770315287017698, + "loss": 0.1152, + "step": 23093 + }, + { + "epoch": 0.20046700983498408, + "grad_norm": 0.65234375, + "learning_rate": 0.0017702954263540249, + "loss": 0.1309, + "step": 23094 + }, + { + "epoch": 0.20047569031518825, + "grad_norm": 0.4921875, + "learning_rate": 0.0017702755649573268, + "loss": 0.1562, + "step": 23095 + }, + { + "epoch": 0.2004843707953924, + "grad_norm": 0.5078125, + "learning_rate": 0.0017702557028276254, + "loss": 0.1221, + "step": 23096 + }, + { + "epoch": 0.20049305127559658, + "grad_norm": 0.484375, + "learning_rate": 
0.0017702358399649426, + "loss": 0.0781, + "step": 23097 + }, + { + "epoch": 0.20050173175580074, + "grad_norm": 1.4296875, + "learning_rate": 0.0017702159763692998, + "loss": 0.1426, + "step": 23098 + }, + { + "epoch": 0.2005104122360049, + "grad_norm": 0.412109375, + "learning_rate": 0.001770196112040719, + "loss": 0.1123, + "step": 23099 + }, + { + "epoch": 0.20051909271620907, + "grad_norm": 0.48046875, + "learning_rate": 0.001770176246979222, + "loss": 0.0825, + "step": 23100 + }, + { + "epoch": 0.20052777319641324, + "grad_norm": 0.07666015625, + "learning_rate": 0.00177015638118483, + "loss": 0.1099, + "step": 23101 + }, + { + "epoch": 0.20053645367661738, + "grad_norm": 0.19921875, + "learning_rate": 0.0017701365146575649, + "loss": 0.1104, + "step": 23102 + }, + { + "epoch": 0.20054513415682154, + "grad_norm": 0.2490234375, + "learning_rate": 0.0017701166473974488, + "loss": 0.1309, + "step": 23103 + }, + { + "epoch": 0.2005538146370257, + "grad_norm": 0.1494140625, + "learning_rate": 0.0017700967794045033, + "loss": 0.0942, + "step": 23104 + }, + { + "epoch": 0.20056249511722987, + "grad_norm": 0.1806640625, + "learning_rate": 0.0017700769106787498, + "loss": 0.1016, + "step": 23105 + }, + { + "epoch": 0.20057117559743404, + "grad_norm": 0.197265625, + "learning_rate": 0.0017700570412202104, + "loss": 0.1182, + "step": 23106 + }, + { + "epoch": 0.2005798560776382, + "grad_norm": 0.8828125, + "learning_rate": 0.0017700371710289064, + "loss": 0.1367, + "step": 23107 + }, + { + "epoch": 0.20058853655784237, + "grad_norm": 0.443359375, + "learning_rate": 0.0017700173001048603, + "loss": 0.1367, + "step": 23108 + }, + { + "epoch": 0.20059721703804653, + "grad_norm": 0.51171875, + "learning_rate": 0.0017699974284480931, + "loss": 0.0913, + "step": 23109 + }, + { + "epoch": 0.2006058975182507, + "grad_norm": 0.2158203125, + "learning_rate": 0.001769977556058627, + "loss": 0.1357, + "step": 23110 + }, + { + "epoch": 0.20061457799845486, + "grad_norm": 0.2177734375, + "learning_rate": 0.0017699576829364831, + "loss": 0.083, + "step": 23111 + }, + { + "epoch": 0.20062325847865903, + "grad_norm": 1.0703125, + "learning_rate": 0.0017699378090816839, + "loss": 0.1094, + "step": 23112 + }, + { + "epoch": 0.2006319389588632, + "grad_norm": 0.7265625, + "learning_rate": 0.0017699179344942506, + "loss": 0.0698, + "step": 23113 + }, + { + "epoch": 0.20064061943906736, + "grad_norm": 0.2578125, + "learning_rate": 0.001769898059174205, + "loss": 0.0894, + "step": 23114 + }, + { + "epoch": 0.20064929991927152, + "grad_norm": 0.84375, + "learning_rate": 0.001769878183121569, + "loss": 0.0859, + "step": 23115 + }, + { + "epoch": 0.2006579803994757, + "grad_norm": 0.142578125, + "learning_rate": 0.0017698583063363647, + "loss": 0.1064, + "step": 23116 + }, + { + "epoch": 0.20066666087967985, + "grad_norm": 0.55078125, + "learning_rate": 0.001769838428818613, + "loss": 0.1133, + "step": 23117 + }, + { + "epoch": 0.20067534135988402, + "grad_norm": 0.53515625, + "learning_rate": 0.0017698185505683362, + "loss": 0.104, + "step": 23118 + }, + { + "epoch": 0.20068402184008818, + "grad_norm": 0.318359375, + "learning_rate": 0.0017697986715855558, + "loss": 0.1064, + "step": 23119 + }, + { + "epoch": 0.20069270232029235, + "grad_norm": 1.1484375, + "learning_rate": 0.0017697787918702936, + "loss": 0.1611, + "step": 23120 + }, + { + "epoch": 0.20070138280049651, + "grad_norm": 0.90625, + "learning_rate": 0.0017697589114225715, + "loss": 0.0903, + "step": 23121 + }, + { + "epoch": 0.20071006328070068, + 
"grad_norm": 0.2138671875, + "learning_rate": 0.001769739030242411, + "loss": 0.1055, + "step": 23122 + }, + { + "epoch": 0.20071874376090484, + "grad_norm": 0.1298828125, + "learning_rate": 0.0017697191483298342, + "loss": 0.0811, + "step": 23123 + }, + { + "epoch": 0.200727424241109, + "grad_norm": 0.67578125, + "learning_rate": 0.0017696992656848626, + "loss": 0.1445, + "step": 23124 + }, + { + "epoch": 0.20073610472131317, + "grad_norm": 0.7421875, + "learning_rate": 0.0017696793823075176, + "loss": 0.1279, + "step": 23125 + }, + { + "epoch": 0.20074478520151734, + "grad_norm": 0.16796875, + "learning_rate": 0.0017696594981978216, + "loss": 0.0972, + "step": 23126 + }, + { + "epoch": 0.2007534656817215, + "grad_norm": 0.09814453125, + "learning_rate": 0.001769639613355796, + "loss": 0.0884, + "step": 23127 + }, + { + "epoch": 0.20076214616192567, + "grad_norm": 0.51171875, + "learning_rate": 0.0017696197277814623, + "loss": 0.127, + "step": 23128 + }, + { + "epoch": 0.20077082664212983, + "grad_norm": 0.216796875, + "learning_rate": 0.0017695998414748429, + "loss": 0.0977, + "step": 23129 + }, + { + "epoch": 0.200779507122334, + "grad_norm": 0.173828125, + "learning_rate": 0.0017695799544359587, + "loss": 0.1279, + "step": 23130 + }, + { + "epoch": 0.20078818760253817, + "grad_norm": 0.86328125, + "learning_rate": 0.0017695600666648324, + "loss": 0.1318, + "step": 23131 + }, + { + "epoch": 0.20079686808274233, + "grad_norm": 0.1103515625, + "learning_rate": 0.0017695401781614851, + "loss": 0.1162, + "step": 23132 + }, + { + "epoch": 0.2008055485629465, + "grad_norm": 0.232421875, + "learning_rate": 0.001769520288925939, + "loss": 0.1309, + "step": 23133 + }, + { + "epoch": 0.20081422904315066, + "grad_norm": 0.53515625, + "learning_rate": 0.001769500398958215, + "loss": 0.0962, + "step": 23134 + }, + { + "epoch": 0.20082290952335483, + "grad_norm": 0.294921875, + "learning_rate": 0.0017694805082583355, + "loss": 0.165, + "step": 23135 + }, + { + "epoch": 0.200831590003559, + "grad_norm": 0.1708984375, + "learning_rate": 0.0017694606168263225, + "loss": 0.0908, + "step": 23136 + }, + { + "epoch": 0.20084027048376316, + "grad_norm": 0.443359375, + "learning_rate": 0.0017694407246621976, + "loss": 0.0977, + "step": 23137 + }, + { + "epoch": 0.20084895096396732, + "grad_norm": 0.78515625, + "learning_rate": 0.001769420831765982, + "loss": 0.1182, + "step": 23138 + }, + { + "epoch": 0.20085763144417149, + "grad_norm": 0.2236328125, + "learning_rate": 0.001769400938137698, + "loss": 0.1436, + "step": 23139 + }, + { + "epoch": 0.20086631192437565, + "grad_norm": 2.1875, + "learning_rate": 0.001769381043777367, + "loss": 0.3145, + "step": 23140 + }, + { + "epoch": 0.20087499240457982, + "grad_norm": 0.77734375, + "learning_rate": 0.001769361148685011, + "loss": 0.1348, + "step": 23141 + }, + { + "epoch": 0.20088367288478398, + "grad_norm": 0.6484375, + "learning_rate": 0.001769341252860652, + "loss": 0.1074, + "step": 23142 + }, + { + "epoch": 0.20089235336498815, + "grad_norm": 0.205078125, + "learning_rate": 0.001769321356304311, + "loss": 0.0898, + "step": 23143 + }, + { + "epoch": 0.2009010338451923, + "grad_norm": 0.51171875, + "learning_rate": 0.0017693014590160106, + "loss": 0.168, + "step": 23144 + }, + { + "epoch": 0.20090971432539648, + "grad_norm": 0.296875, + "learning_rate": 0.001769281560995772, + "loss": 0.127, + "step": 23145 + }, + { + "epoch": 0.20091839480560064, + "grad_norm": 0.1318359375, + "learning_rate": 0.001769261662243617, + "loss": 0.1514, + "step": 23146 + }, + { 
+ "epoch": 0.2009270752858048, + "grad_norm": 0.1796875, + "learning_rate": 0.001769241762759568, + "loss": 0.126, + "step": 23147 + }, + { + "epoch": 0.20093575576600897, + "grad_norm": 0.2158203125, + "learning_rate": 0.0017692218625436457, + "loss": 0.1631, + "step": 23148 + }, + { + "epoch": 0.20094443624621314, + "grad_norm": 0.49609375, + "learning_rate": 0.0017692019615958728, + "loss": 0.0864, + "step": 23149 + }, + { + "epoch": 0.2009531167264173, + "grad_norm": 0.70703125, + "learning_rate": 0.0017691820599162707, + "loss": 0.1094, + "step": 23150 + }, + { + "epoch": 0.20096179720662147, + "grad_norm": 0.416015625, + "learning_rate": 0.0017691621575048608, + "loss": 0.0952, + "step": 23151 + }, + { + "epoch": 0.20097047768682563, + "grad_norm": 0.212890625, + "learning_rate": 0.0017691422543616652, + "loss": 0.1021, + "step": 23152 + }, + { + "epoch": 0.2009791581670298, + "grad_norm": 0.26171875, + "learning_rate": 0.0017691223504867062, + "loss": 0.1069, + "step": 23153 + }, + { + "epoch": 0.20098783864723396, + "grad_norm": 0.52734375, + "learning_rate": 0.0017691024458800046, + "loss": 0.1328, + "step": 23154 + }, + { + "epoch": 0.20099651912743813, + "grad_norm": 0.07763671875, + "learning_rate": 0.001769082540541583, + "loss": 0.1309, + "step": 23155 + }, + { + "epoch": 0.2010051996076423, + "grad_norm": 0.2314453125, + "learning_rate": 0.0017690626344714624, + "loss": 0.0811, + "step": 23156 + }, + { + "epoch": 0.20101388008784646, + "grad_norm": 0.5625, + "learning_rate": 0.001769042727669665, + "loss": 0.1006, + "step": 23157 + }, + { + "epoch": 0.20102256056805062, + "grad_norm": 0.291015625, + "learning_rate": 0.0017690228201362132, + "loss": 0.0854, + "step": 23158 + }, + { + "epoch": 0.2010312410482548, + "grad_norm": 0.291015625, + "learning_rate": 0.0017690029118711272, + "loss": 0.1357, + "step": 23159 + }, + { + "epoch": 0.20103992152845895, + "grad_norm": 0.228515625, + "learning_rate": 0.00176898300287443, + "loss": 0.1084, + "step": 23160 + }, + { + "epoch": 0.20104860200866312, + "grad_norm": 1.203125, + "learning_rate": 0.001768963093146143, + "loss": 0.1084, + "step": 23161 + }, + { + "epoch": 0.20105728248886728, + "grad_norm": 0.158203125, + "learning_rate": 0.001768943182686288, + "loss": 0.1367, + "step": 23162 + }, + { + "epoch": 0.20106596296907145, + "grad_norm": 0.08837890625, + "learning_rate": 0.0017689232714948868, + "loss": 0.1016, + "step": 23163 + }, + { + "epoch": 0.20107464344927561, + "grad_norm": 0.19921875, + "learning_rate": 0.0017689033595719615, + "loss": 0.0806, + "step": 23164 + }, + { + "epoch": 0.20108332392947978, + "grad_norm": 0.490234375, + "learning_rate": 0.001768883446917533, + "loss": 0.1504, + "step": 23165 + }, + { + "epoch": 0.20109200440968394, + "grad_norm": 0.3046875, + "learning_rate": 0.0017688635335316241, + "loss": 0.0791, + "step": 23166 + }, + { + "epoch": 0.2011006848898881, + "grad_norm": 1.0546875, + "learning_rate": 0.0017688436194142555, + "loss": 0.1602, + "step": 23167 + }, + { + "epoch": 0.20110936537009227, + "grad_norm": 0.11279296875, + "learning_rate": 0.0017688237045654503, + "loss": 0.1309, + "step": 23168 + }, + { + "epoch": 0.20111804585029644, + "grad_norm": 0.43359375, + "learning_rate": 0.001768803788985229, + "loss": 0.1367, + "step": 23169 + }, + { + "epoch": 0.2011267263305006, + "grad_norm": 0.2216796875, + "learning_rate": 0.0017687838726736143, + "loss": 0.1016, + "step": 23170 + }, + { + "epoch": 0.20113540681070477, + "grad_norm": 0.88671875, + "learning_rate": 0.0017687639556306274, + 
"loss": 0.0879, + "step": 23171 + }, + { + "epoch": 0.20114408729090894, + "grad_norm": 0.1962890625, + "learning_rate": 0.00176874403785629, + "loss": 0.1484, + "step": 23172 + }, + { + "epoch": 0.2011527677711131, + "grad_norm": 0.330078125, + "learning_rate": 0.0017687241193506248, + "loss": 0.0869, + "step": 23173 + }, + { + "epoch": 0.20116144825131727, + "grad_norm": 0.65234375, + "learning_rate": 0.0017687042001136525, + "loss": 0.1182, + "step": 23174 + }, + { + "epoch": 0.20117012873152143, + "grad_norm": 0.19921875, + "learning_rate": 0.0017686842801453953, + "loss": 0.0938, + "step": 23175 + }, + { + "epoch": 0.2011788092117256, + "grad_norm": 0.06982421875, + "learning_rate": 0.001768664359445875, + "loss": 0.0786, + "step": 23176 + }, + { + "epoch": 0.20118748969192976, + "grad_norm": 0.828125, + "learning_rate": 0.0017686444380151138, + "loss": 0.1699, + "step": 23177 + }, + { + "epoch": 0.20119617017213393, + "grad_norm": 0.154296875, + "learning_rate": 0.0017686245158531329, + "loss": 0.1045, + "step": 23178 + }, + { + "epoch": 0.2012048506523381, + "grad_norm": 0.4140625, + "learning_rate": 0.0017686045929599542, + "loss": 0.1338, + "step": 23179 + }, + { + "epoch": 0.20121353113254226, + "grad_norm": 0.22265625, + "learning_rate": 0.0017685846693355997, + "loss": 0.1074, + "step": 23180 + }, + { + "epoch": 0.20122221161274642, + "grad_norm": 0.10693359375, + "learning_rate": 0.0017685647449800907, + "loss": 0.1235, + "step": 23181 + }, + { + "epoch": 0.2012308920929506, + "grad_norm": 0.10791015625, + "learning_rate": 0.0017685448198934497, + "loss": 0.0786, + "step": 23182 + }, + { + "epoch": 0.20123957257315475, + "grad_norm": 0.23828125, + "learning_rate": 0.0017685248940756979, + "loss": 0.1201, + "step": 23183 + }, + { + "epoch": 0.20124825305335892, + "grad_norm": 0.4765625, + "learning_rate": 0.0017685049675268574, + "loss": 0.1221, + "step": 23184 + }, + { + "epoch": 0.20125693353356308, + "grad_norm": 0.220703125, + "learning_rate": 0.0017684850402469498, + "loss": 0.0771, + "step": 23185 + }, + { + "epoch": 0.20126561401376725, + "grad_norm": 0.376953125, + "learning_rate": 0.0017684651122359973, + "loss": 0.123, + "step": 23186 + }, + { + "epoch": 0.2012742944939714, + "grad_norm": 0.158203125, + "learning_rate": 0.0017684451834940212, + "loss": 0.1157, + "step": 23187 + }, + { + "epoch": 0.20128297497417558, + "grad_norm": 0.8203125, + "learning_rate": 0.0017684252540210435, + "loss": 0.1299, + "step": 23188 + }, + { + "epoch": 0.20129165545437974, + "grad_norm": 0.07666015625, + "learning_rate": 0.0017684053238170858, + "loss": 0.0957, + "step": 23189 + }, + { + "epoch": 0.2013003359345839, + "grad_norm": 0.609375, + "learning_rate": 0.0017683853928821707, + "loss": 0.1006, + "step": 23190 + }, + { + "epoch": 0.20130901641478807, + "grad_norm": 0.515625, + "learning_rate": 0.0017683654612163189, + "loss": 0.1167, + "step": 23191 + }, + { + "epoch": 0.20131769689499224, + "grad_norm": 0.37890625, + "learning_rate": 0.0017683455288195528, + "loss": 0.1172, + "step": 23192 + }, + { + "epoch": 0.2013263773751964, + "grad_norm": 1.28125, + "learning_rate": 0.001768325595691894, + "loss": 0.1143, + "step": 23193 + }, + { + "epoch": 0.20133505785540057, + "grad_norm": 0.35546875, + "learning_rate": 0.0017683056618333641, + "loss": 0.0903, + "step": 23194 + }, + { + "epoch": 0.20134373833560473, + "grad_norm": 0.609375, + "learning_rate": 0.0017682857272439856, + "loss": 0.1045, + "step": 23195 + }, + { + "epoch": 0.2013524188158089, + "grad_norm": 0.10595703125, + 
"learning_rate": 0.0017682657919237798, + "loss": 0.082, + "step": 23196 + }, + { + "epoch": 0.20136109929601306, + "grad_norm": 0.54296875, + "learning_rate": 0.0017682458558727684, + "loss": 0.1143, + "step": 23197 + }, + { + "epoch": 0.20136977977621723, + "grad_norm": 0.64453125, + "learning_rate": 0.0017682259190909735, + "loss": 0.1348, + "step": 23198 + }, + { + "epoch": 0.2013784602564214, + "grad_norm": 0.4140625, + "learning_rate": 0.0017682059815784168, + "loss": 0.1108, + "step": 23199 + }, + { + "epoch": 0.20138714073662556, + "grad_norm": 0.515625, + "learning_rate": 0.00176818604333512, + "loss": 0.1836, + "step": 23200 + }, + { + "epoch": 0.20139582121682972, + "grad_norm": 0.51171875, + "learning_rate": 0.001768166104361105, + "loss": 0.1182, + "step": 23201 + }, + { + "epoch": 0.2014045016970339, + "grad_norm": 0.1005859375, + "learning_rate": 0.0017681461646563937, + "loss": 0.1299, + "step": 23202 + }, + { + "epoch": 0.20141318217723805, + "grad_norm": 0.609375, + "learning_rate": 0.0017681262242210075, + "loss": 0.127, + "step": 23203 + }, + { + "epoch": 0.20142186265744222, + "grad_norm": 0.1396484375, + "learning_rate": 0.0017681062830549688, + "loss": 0.1523, + "step": 23204 + }, + { + "epoch": 0.20143054313764638, + "grad_norm": 0.11962890625, + "learning_rate": 0.001768086341158299, + "loss": 0.1328, + "step": 23205 + }, + { + "epoch": 0.20143922361785055, + "grad_norm": 0.17578125, + "learning_rate": 0.0017680663985310198, + "loss": 0.1357, + "step": 23206 + }, + { + "epoch": 0.20144790409805471, + "grad_norm": 1.234375, + "learning_rate": 0.0017680464551731536, + "loss": 0.1504, + "step": 23207 + }, + { + "epoch": 0.20145658457825888, + "grad_norm": 0.91015625, + "learning_rate": 0.0017680265110847217, + "loss": 0.1084, + "step": 23208 + }, + { + "epoch": 0.20146526505846304, + "grad_norm": 0.7109375, + "learning_rate": 0.001768006566265746, + "loss": 0.1245, + "step": 23209 + }, + { + "epoch": 0.2014739455386672, + "grad_norm": 0.361328125, + "learning_rate": 0.0017679866207162484, + "loss": 0.104, + "step": 23210 + }, + { + "epoch": 0.20148262601887137, + "grad_norm": 1.1796875, + "learning_rate": 0.0017679666744362507, + "loss": 0.1064, + "step": 23211 + }, + { + "epoch": 0.20149130649907554, + "grad_norm": 0.486328125, + "learning_rate": 0.0017679467274257746, + "loss": 0.1128, + "step": 23212 + }, + { + "epoch": 0.2014999869792797, + "grad_norm": 0.236328125, + "learning_rate": 0.001767926779684842, + "loss": 0.1182, + "step": 23213 + }, + { + "epoch": 0.20150866745948387, + "grad_norm": 0.15625, + "learning_rate": 0.0017679068312134744, + "loss": 0.1094, + "step": 23214 + }, + { + "epoch": 0.20151734793968804, + "grad_norm": 0.3359375, + "learning_rate": 0.0017678868820116943, + "loss": 0.1309, + "step": 23215 + }, + { + "epoch": 0.2015260284198922, + "grad_norm": 0.8828125, + "learning_rate": 0.0017678669320795233, + "loss": 0.0928, + "step": 23216 + }, + { + "epoch": 0.20153470890009637, + "grad_norm": 0.263671875, + "learning_rate": 0.0017678469814169828, + "loss": 0.124, + "step": 23217 + }, + { + "epoch": 0.20154338938030053, + "grad_norm": 0.19921875, + "learning_rate": 0.001767827030024095, + "loss": 0.1084, + "step": 23218 + }, + { + "epoch": 0.2015520698605047, + "grad_norm": 0.11865234375, + "learning_rate": 0.0017678070779008814, + "loss": 0.1299, + "step": 23219 + }, + { + "epoch": 0.20156075034070886, + "grad_norm": 0.109375, + "learning_rate": 0.0017677871250473642, + "loss": 0.1064, + "step": 23220 + }, + { + "epoch": 0.20156943082091303, + 
"grad_norm": 0.796875, + "learning_rate": 0.001767767171463565, + "loss": 0.1211, + "step": 23221 + }, + { + "epoch": 0.2015781113011172, + "grad_norm": 0.15234375, + "learning_rate": 0.0017677472171495057, + "loss": 0.1406, + "step": 23222 + }, + { + "epoch": 0.20158679178132136, + "grad_norm": 0.12060546875, + "learning_rate": 0.001767727262105208, + "loss": 0.1118, + "step": 23223 + }, + { + "epoch": 0.20159547226152552, + "grad_norm": 0.330078125, + "learning_rate": 0.0017677073063306938, + "loss": 0.1074, + "step": 23224 + }, + { + "epoch": 0.2016041527417297, + "grad_norm": 0.205078125, + "learning_rate": 0.0017676873498259846, + "loss": 0.0552, + "step": 23225 + }, + { + "epoch": 0.20161283322193382, + "grad_norm": 0.384765625, + "learning_rate": 0.001767667392591103, + "loss": 0.1211, + "step": 23226 + }, + { + "epoch": 0.201621513702138, + "grad_norm": 0.392578125, + "learning_rate": 0.0017676474346260702, + "loss": 0.0908, + "step": 23227 + }, + { + "epoch": 0.20163019418234215, + "grad_norm": 0.318359375, + "learning_rate": 0.001767627475930908, + "loss": 0.0991, + "step": 23228 + }, + { + "epoch": 0.20163887466254632, + "grad_norm": 0.39453125, + "learning_rate": 0.0017676075165056389, + "loss": 0.1138, + "step": 23229 + }, + { + "epoch": 0.20164755514275048, + "grad_norm": 0.1728515625, + "learning_rate": 0.0017675875563502841, + "loss": 0.1006, + "step": 23230 + }, + { + "epoch": 0.20165623562295465, + "grad_norm": 0.1923828125, + "learning_rate": 0.0017675675954648654, + "loss": 0.0962, + "step": 23231 + }, + { + "epoch": 0.20166491610315881, + "grad_norm": 0.84765625, + "learning_rate": 0.001767547633849405, + "loss": 0.1001, + "step": 23232 + }, + { + "epoch": 0.20167359658336298, + "grad_norm": 0.11328125, + "learning_rate": 0.0017675276715039244, + "loss": 0.0894, + "step": 23233 + }, + { + "epoch": 0.20168227706356714, + "grad_norm": 0.1484375, + "learning_rate": 0.0017675077084284454, + "loss": 0.1172, + "step": 23234 + }, + { + "epoch": 0.2016909575437713, + "grad_norm": 0.08837890625, + "learning_rate": 0.0017674877446229903, + "loss": 0.0986, + "step": 23235 + }, + { + "epoch": 0.20169963802397547, + "grad_norm": 0.34765625, + "learning_rate": 0.0017674677800875805, + "loss": 0.1006, + "step": 23236 + }, + { + "epoch": 0.20170831850417964, + "grad_norm": 0.1083984375, + "learning_rate": 0.0017674478148222378, + "loss": 0.1201, + "step": 23237 + }, + { + "epoch": 0.2017169989843838, + "grad_norm": 0.2890625, + "learning_rate": 0.0017674278488269844, + "loss": 0.0859, + "step": 23238 + }, + { + "epoch": 0.20172567946458797, + "grad_norm": 0.490234375, + "learning_rate": 0.001767407882101842, + "loss": 0.2637, + "step": 23239 + }, + { + "epoch": 0.20173435994479214, + "grad_norm": 0.2060546875, + "learning_rate": 0.0017673879146468323, + "loss": 0.0996, + "step": 23240 + }, + { + "epoch": 0.2017430404249963, + "grad_norm": 0.328125, + "learning_rate": 0.001767367946461977, + "loss": 0.0801, + "step": 23241 + }, + { + "epoch": 0.20175172090520047, + "grad_norm": 0.2578125, + "learning_rate": 0.0017673479775472984, + "loss": 0.1074, + "step": 23242 + }, + { + "epoch": 0.20176040138540463, + "grad_norm": 0.392578125, + "learning_rate": 0.0017673280079028182, + "loss": 0.1182, + "step": 23243 + }, + { + "epoch": 0.2017690818656088, + "grad_norm": 0.69140625, + "learning_rate": 0.001767308037528558, + "loss": 0.1201, + "step": 23244 + }, + { + "epoch": 0.20177776234581296, + "grad_norm": 0.349609375, + "learning_rate": 0.0017672880664245394, + "loss": 0.1182, + "step": 
23245 + }, + { + "epoch": 0.20178644282601713, + "grad_norm": 0.404296875, + "learning_rate": 0.0017672680945907849, + "loss": 0.1211, + "step": 23246 + }, + { + "epoch": 0.2017951233062213, + "grad_norm": 1.6171875, + "learning_rate": 0.001767248122027316, + "loss": 0.1338, + "step": 23247 + }, + { + "epoch": 0.20180380378642546, + "grad_norm": 0.28125, + "learning_rate": 0.0017672281487341548, + "loss": 0.1025, + "step": 23248 + }, + { + "epoch": 0.20181248426662962, + "grad_norm": 0.390625, + "learning_rate": 0.0017672081747113226, + "loss": 0.124, + "step": 23249 + }, + { + "epoch": 0.2018211647468338, + "grad_norm": 0.60546875, + "learning_rate": 0.0017671881999588419, + "loss": 0.1172, + "step": 23250 + }, + { + "epoch": 0.20182984522703795, + "grad_norm": 0.314453125, + "learning_rate": 0.001767168224476734, + "loss": 0.1445, + "step": 23251 + }, + { + "epoch": 0.20183852570724212, + "grad_norm": 0.349609375, + "learning_rate": 0.0017671482482650208, + "loss": 0.1338, + "step": 23252 + }, + { + "epoch": 0.20184720618744628, + "grad_norm": 0.267578125, + "learning_rate": 0.0017671282713237243, + "loss": 0.0967, + "step": 23253 + }, + { + "epoch": 0.20185588666765045, + "grad_norm": 0.15234375, + "learning_rate": 0.0017671082936528667, + "loss": 0.0942, + "step": 23254 + }, + { + "epoch": 0.2018645671478546, + "grad_norm": 0.0859375, + "learning_rate": 0.001767088315252469, + "loss": 0.1045, + "step": 23255 + }, + { + "epoch": 0.20187324762805878, + "grad_norm": 0.1796875, + "learning_rate": 0.001767068336122554, + "loss": 0.1011, + "step": 23256 + }, + { + "epoch": 0.20188192810826294, + "grad_norm": 0.294921875, + "learning_rate": 0.001767048356263143, + "loss": 0.127, + "step": 23257 + }, + { + "epoch": 0.2018906085884671, + "grad_norm": 0.4453125, + "learning_rate": 0.0017670283756742577, + "loss": 0.0864, + "step": 23258 + }, + { + "epoch": 0.20189928906867127, + "grad_norm": 0.0791015625, + "learning_rate": 0.0017670083943559203, + "loss": 0.0845, + "step": 23259 + }, + { + "epoch": 0.20190796954887544, + "grad_norm": 0.57421875, + "learning_rate": 0.0017669884123081528, + "loss": 0.0889, + "step": 23260 + }, + { + "epoch": 0.2019166500290796, + "grad_norm": 0.15234375, + "learning_rate": 0.0017669684295309764, + "loss": 0.0869, + "step": 23261 + }, + { + "epoch": 0.20192533050928377, + "grad_norm": 0.08740234375, + "learning_rate": 0.0017669484460244133, + "loss": 0.1445, + "step": 23262 + }, + { + "epoch": 0.20193401098948793, + "grad_norm": 0.259765625, + "learning_rate": 0.001766928461788486, + "loss": 0.1211, + "step": 23263 + }, + { + "epoch": 0.2019426914696921, + "grad_norm": 0.337890625, + "learning_rate": 0.0017669084768232152, + "loss": 0.1143, + "step": 23264 + }, + { + "epoch": 0.20195137194989626, + "grad_norm": 0.208984375, + "learning_rate": 0.0017668884911286235, + "loss": 0.0874, + "step": 23265 + }, + { + "epoch": 0.20196005243010043, + "grad_norm": 0.91015625, + "learning_rate": 0.0017668685047047329, + "loss": 0.0732, + "step": 23266 + }, + { + "epoch": 0.2019687329103046, + "grad_norm": 0.1513671875, + "learning_rate": 0.0017668485175515642, + "loss": 0.1738, + "step": 23267 + }, + { + "epoch": 0.20197741339050876, + "grad_norm": 0.166015625, + "learning_rate": 0.0017668285296691404, + "loss": 0.1064, + "step": 23268 + }, + { + "epoch": 0.20198609387071292, + "grad_norm": 0.1650390625, + "learning_rate": 0.001766808541057483, + "loss": 0.1074, + "step": 23269 + }, + { + "epoch": 0.2019947743509171, + "grad_norm": 0.53515625, + "learning_rate": 
0.0017667885517166138, + "loss": 0.1055, + "step": 23270 + }, + { + "epoch": 0.20200345483112125, + "grad_norm": 0.1328125, + "learning_rate": 0.0017667685616465545, + "loss": 0.0962, + "step": 23271 + }, + { + "epoch": 0.20201213531132542, + "grad_norm": 0.4609375, + "learning_rate": 0.0017667485708473272, + "loss": 0.0811, + "step": 23272 + }, + { + "epoch": 0.20202081579152958, + "grad_norm": 0.322265625, + "learning_rate": 0.0017667285793189534, + "loss": 0.1104, + "step": 23273 + }, + { + "epoch": 0.20202949627173375, + "grad_norm": 0.2314453125, + "learning_rate": 0.0017667085870614557, + "loss": 0.1055, + "step": 23274 + }, + { + "epoch": 0.20203817675193791, + "grad_norm": 0.0908203125, + "learning_rate": 0.0017666885940748553, + "loss": 0.0933, + "step": 23275 + }, + { + "epoch": 0.20204685723214208, + "grad_norm": 0.1513671875, + "learning_rate": 0.0017666686003591742, + "loss": 0.1387, + "step": 23276 + }, + { + "epoch": 0.20205553771234624, + "grad_norm": 0.49609375, + "learning_rate": 0.0017666486059144343, + "loss": 0.1104, + "step": 23277 + }, + { + "epoch": 0.2020642181925504, + "grad_norm": 0.341796875, + "learning_rate": 0.0017666286107406578, + "loss": 0.1289, + "step": 23278 + }, + { + "epoch": 0.20207289867275458, + "grad_norm": 0.1748046875, + "learning_rate": 0.001766608614837866, + "loss": 0.1064, + "step": 23279 + }, + { + "epoch": 0.20208157915295874, + "grad_norm": 0.08837890625, + "learning_rate": 0.0017665886182060813, + "loss": 0.1152, + "step": 23280 + }, + { + "epoch": 0.2020902596331629, + "grad_norm": 0.146484375, + "learning_rate": 0.0017665686208453248, + "loss": 0.0742, + "step": 23281 + }, + { + "epoch": 0.20209894011336707, + "grad_norm": 1.046875, + "learning_rate": 0.0017665486227556194, + "loss": 0.5703, + "step": 23282 + }, + { + "epoch": 0.20210762059357124, + "grad_norm": 0.2578125, + "learning_rate": 0.0017665286239369861, + "loss": 0.0981, + "step": 23283 + }, + { + "epoch": 0.2021163010737754, + "grad_norm": 0.267578125, + "learning_rate": 0.0017665086243894472, + "loss": 0.1162, + "step": 23284 + }, + { + "epoch": 0.20212498155397957, + "grad_norm": 0.1630859375, + "learning_rate": 0.0017664886241130246, + "loss": 0.1162, + "step": 23285 + }, + { + "epoch": 0.20213366203418373, + "grad_norm": 0.31640625, + "learning_rate": 0.0017664686231077399, + "loss": 0.1147, + "step": 23286 + }, + { + "epoch": 0.2021423425143879, + "grad_norm": 0.373046875, + "learning_rate": 0.0017664486213736152, + "loss": 0.1074, + "step": 23287 + }, + { + "epoch": 0.20215102299459206, + "grad_norm": 0.1904296875, + "learning_rate": 0.0017664286189106727, + "loss": 0.127, + "step": 23288 + }, + { + "epoch": 0.20215970347479623, + "grad_norm": 0.09326171875, + "learning_rate": 0.0017664086157189332, + "loss": 0.0986, + "step": 23289 + }, + { + "epoch": 0.2021683839550004, + "grad_norm": 0.255859375, + "learning_rate": 0.0017663886117984197, + "loss": 0.0977, + "step": 23290 + }, + { + "epoch": 0.20217706443520456, + "grad_norm": 0.12451171875, + "learning_rate": 0.0017663686071491532, + "loss": 0.1182, + "step": 23291 + }, + { + "epoch": 0.20218574491540872, + "grad_norm": 0.109375, + "learning_rate": 0.0017663486017711564, + "loss": 0.0889, + "step": 23292 + }, + { + "epoch": 0.2021944253956129, + "grad_norm": 0.83203125, + "learning_rate": 0.0017663285956644507, + "loss": 0.124, + "step": 23293 + }, + { + "epoch": 0.20220310587581705, + "grad_norm": 0.435546875, + "learning_rate": 0.0017663085888290578, + "loss": 0.1182, + "step": 23294 + }, + { + "epoch": 
0.20221178635602122, + "grad_norm": 0.2119140625, + "learning_rate": 0.0017662885812650003, + "loss": 0.1084, + "step": 23295 + }, + { + "epoch": 0.20222046683622538, + "grad_norm": 0.1328125, + "learning_rate": 0.0017662685729722993, + "loss": 0.1348, + "step": 23296 + }, + { + "epoch": 0.20222914731642955, + "grad_norm": 0.326171875, + "learning_rate": 0.0017662485639509768, + "loss": 0.0869, + "step": 23297 + }, + { + "epoch": 0.2022378277966337, + "grad_norm": 0.0830078125, + "learning_rate": 0.0017662285542010552, + "loss": 0.1191, + "step": 23298 + }, + { + "epoch": 0.20224650827683788, + "grad_norm": 0.37890625, + "learning_rate": 0.0017662085437225561, + "loss": 0.1182, + "step": 23299 + }, + { + "epoch": 0.20225518875704204, + "grad_norm": 0.6328125, + "learning_rate": 0.0017661885325155012, + "loss": 0.1172, + "step": 23300 + }, + { + "epoch": 0.2022638692372462, + "grad_norm": 0.234375, + "learning_rate": 0.0017661685205799125, + "loss": 0.1465, + "step": 23301 + }, + { + "epoch": 0.20227254971745037, + "grad_norm": 0.71484375, + "learning_rate": 0.0017661485079158122, + "loss": 0.1855, + "step": 23302 + }, + { + "epoch": 0.20228123019765454, + "grad_norm": 0.2275390625, + "learning_rate": 0.0017661284945232217, + "loss": 0.0869, + "step": 23303 + }, + { + "epoch": 0.2022899106778587, + "grad_norm": 0.208984375, + "learning_rate": 0.0017661084804021629, + "loss": 0.1021, + "step": 23304 + }, + { + "epoch": 0.20229859115806287, + "grad_norm": 0.431640625, + "learning_rate": 0.0017660884655526581, + "loss": 0.1309, + "step": 23305 + }, + { + "epoch": 0.20230727163826703, + "grad_norm": 0.5078125, + "learning_rate": 0.0017660684499747291, + "loss": 0.0903, + "step": 23306 + }, + { + "epoch": 0.2023159521184712, + "grad_norm": 0.10302734375, + "learning_rate": 0.0017660484336683973, + "loss": 0.123, + "step": 23307 + }, + { + "epoch": 0.20232463259867536, + "grad_norm": 0.291015625, + "learning_rate": 0.001766028416633685, + "loss": 0.1338, + "step": 23308 + }, + { + "epoch": 0.20233331307887953, + "grad_norm": 0.359375, + "learning_rate": 0.0017660083988706144, + "loss": 0.0752, + "step": 23309 + }, + { + "epoch": 0.2023419935590837, + "grad_norm": 0.36328125, + "learning_rate": 0.0017659883803792067, + "loss": 0.082, + "step": 23310 + }, + { + "epoch": 0.20235067403928786, + "grad_norm": 0.1513671875, + "learning_rate": 0.001765968361159484, + "loss": 0.127, + "step": 23311 + }, + { + "epoch": 0.20235935451949202, + "grad_norm": 0.2041015625, + "learning_rate": 0.0017659483412114686, + "loss": 0.1011, + "step": 23312 + }, + { + "epoch": 0.2023680349996962, + "grad_norm": 0.400390625, + "learning_rate": 0.0017659283205351819, + "loss": 0.1309, + "step": 23313 + }, + { + "epoch": 0.20237671547990035, + "grad_norm": 0.1474609375, + "learning_rate": 0.001765908299130646, + "loss": 0.1396, + "step": 23314 + }, + { + "epoch": 0.20238539596010452, + "grad_norm": 0.1845703125, + "learning_rate": 0.001765888276997883, + "loss": 0.0908, + "step": 23315 + }, + { + "epoch": 0.20239407644030868, + "grad_norm": 0.1455078125, + "learning_rate": 0.0017658682541369145, + "loss": 0.1084, + "step": 23316 + }, + { + "epoch": 0.20240275692051285, + "grad_norm": 0.1884765625, + "learning_rate": 0.0017658482305477625, + "loss": 0.1162, + "step": 23317 + }, + { + "epoch": 0.20241143740071701, + "grad_norm": 0.64453125, + "learning_rate": 0.0017658282062304486, + "loss": 0.1143, + "step": 23318 + }, + { + "epoch": 0.20242011788092118, + "grad_norm": 0.11279296875, + "learning_rate": 0.0017658081811849952, 
+ "loss": 0.1143, + "step": 23319 + }, + { + "epoch": 0.20242879836112534, + "grad_norm": 0.2421875, + "learning_rate": 0.0017657881554114239, + "loss": 0.0938, + "step": 23320 + }, + { + "epoch": 0.2024374788413295, + "grad_norm": 0.267578125, + "learning_rate": 0.0017657681289097566, + "loss": 0.1094, + "step": 23321 + }, + { + "epoch": 0.20244615932153368, + "grad_norm": 0.09619140625, + "learning_rate": 0.001765748101680016, + "loss": 0.1318, + "step": 23322 + }, + { + "epoch": 0.20245483980173784, + "grad_norm": 0.87109375, + "learning_rate": 0.0017657280737222224, + "loss": 0.1484, + "step": 23323 + }, + { + "epoch": 0.202463520281942, + "grad_norm": 0.298828125, + "learning_rate": 0.0017657080450363988, + "loss": 0.1182, + "step": 23324 + }, + { + "epoch": 0.20247220076214617, + "grad_norm": 0.384765625, + "learning_rate": 0.001765688015622567, + "loss": 0.1074, + "step": 23325 + }, + { + "epoch": 0.20248088124235034, + "grad_norm": 0.232421875, + "learning_rate": 0.001765667985480749, + "loss": 0.1016, + "step": 23326 + }, + { + "epoch": 0.2024895617225545, + "grad_norm": 0.08837890625, + "learning_rate": 0.0017656479546109662, + "loss": 0.1289, + "step": 23327 + }, + { + "epoch": 0.20249824220275867, + "grad_norm": 0.455078125, + "learning_rate": 0.0017656279230132406, + "loss": 0.0938, + "step": 23328 + }, + { + "epoch": 0.20250692268296283, + "grad_norm": 0.10546875, + "learning_rate": 0.0017656078906875946, + "loss": 0.1191, + "step": 23329 + }, + { + "epoch": 0.202515603163167, + "grad_norm": 0.498046875, + "learning_rate": 0.0017655878576340496, + "loss": 0.0928, + "step": 23330 + }, + { + "epoch": 0.20252428364337116, + "grad_norm": 0.56640625, + "learning_rate": 0.0017655678238526278, + "loss": 0.1221, + "step": 23331 + }, + { + "epoch": 0.20253296412357533, + "grad_norm": 0.0732421875, + "learning_rate": 0.0017655477893433514, + "loss": 0.0884, + "step": 23332 + }, + { + "epoch": 0.2025416446037795, + "grad_norm": 0.1064453125, + "learning_rate": 0.0017655277541062415, + "loss": 0.1406, + "step": 23333 + }, + { + "epoch": 0.20255032508398366, + "grad_norm": 0.337890625, + "learning_rate": 0.0017655077181413206, + "loss": 0.0938, + "step": 23334 + }, + { + "epoch": 0.20255900556418782, + "grad_norm": 0.30859375, + "learning_rate": 0.0017654876814486105, + "loss": 0.126, + "step": 23335 + }, + { + "epoch": 0.202567686044392, + "grad_norm": 0.57421875, + "learning_rate": 0.0017654676440281333, + "loss": 0.1069, + "step": 23336 + }, + { + "epoch": 0.20257636652459615, + "grad_norm": 0.3671875, + "learning_rate": 0.0017654476058799101, + "loss": 0.124, + "step": 23337 + }, + { + "epoch": 0.20258504700480032, + "grad_norm": 0.1708984375, + "learning_rate": 0.001765427567003964, + "loss": 0.1157, + "step": 23338 + }, + { + "epoch": 0.20259372748500448, + "grad_norm": 0.640625, + "learning_rate": 0.0017654075274003158, + "loss": 0.1221, + "step": 23339 + }, + { + "epoch": 0.20260240796520865, + "grad_norm": 0.515625, + "learning_rate": 0.0017653874870689882, + "loss": 0.0815, + "step": 23340 + }, + { + "epoch": 0.2026110884454128, + "grad_norm": 0.265625, + "learning_rate": 0.0017653674460100027, + "loss": 0.0986, + "step": 23341 + }, + { + "epoch": 0.20261976892561698, + "grad_norm": 0.09912109375, + "learning_rate": 0.0017653474042233816, + "loss": 0.1016, + "step": 23342 + }, + { + "epoch": 0.20262844940582114, + "grad_norm": 0.06298828125, + "learning_rate": 0.0017653273617091463, + "loss": 0.0801, + "step": 23343 + }, + { + "epoch": 0.2026371298860253, + "grad_norm": 
1.1640625, + "learning_rate": 0.001765307318467319, + "loss": 0.1855, + "step": 23344 + }, + { + "epoch": 0.20264581036622947, + "grad_norm": 0.455078125, + "learning_rate": 0.0017652872744979217, + "loss": 0.1084, + "step": 23345 + }, + { + "epoch": 0.20265449084643364, + "grad_norm": 0.0849609375, + "learning_rate": 0.0017652672298009763, + "loss": 0.1006, + "step": 23346 + }, + { + "epoch": 0.2026631713266378, + "grad_norm": 0.75390625, + "learning_rate": 0.0017652471843765045, + "loss": 0.1108, + "step": 23347 + }, + { + "epoch": 0.20267185180684197, + "grad_norm": 1.015625, + "learning_rate": 0.0017652271382245287, + "loss": 0.1641, + "step": 23348 + }, + { + "epoch": 0.2026805322870461, + "grad_norm": 0.115234375, + "learning_rate": 0.0017652070913450702, + "loss": 0.1191, + "step": 23349 + }, + { + "epoch": 0.20268921276725027, + "grad_norm": 0.1533203125, + "learning_rate": 0.0017651870437381512, + "loss": 0.1406, + "step": 23350 + }, + { + "epoch": 0.20269789324745444, + "grad_norm": 0.20703125, + "learning_rate": 0.0017651669954037938, + "loss": 0.1152, + "step": 23351 + }, + { + "epoch": 0.2027065737276586, + "grad_norm": 0.4609375, + "learning_rate": 0.0017651469463420197, + "loss": 0.1162, + "step": 23352 + }, + { + "epoch": 0.20271525420786277, + "grad_norm": 0.494140625, + "learning_rate": 0.001765126896552851, + "loss": 0.1001, + "step": 23353 + }, + { + "epoch": 0.20272393468806693, + "grad_norm": 0.11865234375, + "learning_rate": 0.0017651068460363094, + "loss": 0.1182, + "step": 23354 + }, + { + "epoch": 0.2027326151682711, + "grad_norm": 0.1845703125, + "learning_rate": 0.001765086794792417, + "loss": 0.1309, + "step": 23355 + }, + { + "epoch": 0.20274129564847526, + "grad_norm": 0.12451171875, + "learning_rate": 0.0017650667428211954, + "loss": 0.1191, + "step": 23356 + }, + { + "epoch": 0.20274997612867943, + "grad_norm": 0.72265625, + "learning_rate": 0.0017650466901226668, + "loss": 0.1064, + "step": 23357 + }, + { + "epoch": 0.2027586566088836, + "grad_norm": 0.390625, + "learning_rate": 0.0017650266366968537, + "loss": 0.0854, + "step": 23358 + }, + { + "epoch": 0.20276733708908776, + "grad_norm": 0.92578125, + "learning_rate": 0.001765006582543777, + "loss": 0.127, + "step": 23359 + }, + { + "epoch": 0.20277601756929192, + "grad_norm": 0.349609375, + "learning_rate": 0.0017649865276634594, + "loss": 0.0928, + "step": 23360 + }, + { + "epoch": 0.2027846980494961, + "grad_norm": 0.373046875, + "learning_rate": 0.001764966472055922, + "loss": 0.1318, + "step": 23361 + }, + { + "epoch": 0.20279337852970025, + "grad_norm": 0.20703125, + "learning_rate": 0.001764946415721188, + "loss": 0.1113, + "step": 23362 + }, + { + "epoch": 0.20280205900990442, + "grad_norm": 0.9296875, + "learning_rate": 0.001764926358659278, + "loss": 0.1396, + "step": 23363 + }, + { + "epoch": 0.20281073949010858, + "grad_norm": 0.427734375, + "learning_rate": 0.001764906300870215, + "loss": 0.1592, + "step": 23364 + }, + { + "epoch": 0.20281941997031275, + "grad_norm": 0.10986328125, + "learning_rate": 0.00176488624235402, + "loss": 0.0938, + "step": 23365 + }, + { + "epoch": 0.2028281004505169, + "grad_norm": 0.5859375, + "learning_rate": 0.0017648661831107155, + "loss": 0.0928, + "step": 23366 + }, + { + "epoch": 0.20283678093072108, + "grad_norm": 0.71484375, + "learning_rate": 0.0017648461231403235, + "loss": 0.1699, + "step": 23367 + }, + { + "epoch": 0.20284546141092524, + "grad_norm": 0.275390625, + "learning_rate": 0.0017648260624428655, + "loss": 0.0791, + "step": 23368 + }, + { + 
"epoch": 0.2028541418911294, + "grad_norm": 0.142578125, + "learning_rate": 0.0017648060010183643, + "loss": 0.0854, + "step": 23369 + }, + { + "epoch": 0.20286282237133357, + "grad_norm": 1.2734375, + "learning_rate": 0.0017647859388668407, + "loss": 0.1108, + "step": 23370 + }, + { + "epoch": 0.20287150285153774, + "grad_norm": 0.265625, + "learning_rate": 0.0017647658759883172, + "loss": 0.1074, + "step": 23371 + }, + { + "epoch": 0.2028801833317419, + "grad_norm": 0.26953125, + "learning_rate": 0.0017647458123828158, + "loss": 0.1104, + "step": 23372 + }, + { + "epoch": 0.20288886381194607, + "grad_norm": 0.98046875, + "learning_rate": 0.0017647257480503582, + "loss": 0.1143, + "step": 23373 + }, + { + "epoch": 0.20289754429215023, + "grad_norm": 0.52734375, + "learning_rate": 0.0017647056829909668, + "loss": 0.1094, + "step": 23374 + }, + { + "epoch": 0.2029062247723544, + "grad_norm": 0.359375, + "learning_rate": 0.0017646856172046635, + "loss": 0.085, + "step": 23375 + }, + { + "epoch": 0.20291490525255856, + "grad_norm": 0.44921875, + "learning_rate": 0.0017646655506914695, + "loss": 0.1191, + "step": 23376 + }, + { + "epoch": 0.20292358573276273, + "grad_norm": 0.21875, + "learning_rate": 0.0017646454834514074, + "loss": 0.123, + "step": 23377 + }, + { + "epoch": 0.2029322662129669, + "grad_norm": 0.1005859375, + "learning_rate": 0.0017646254154844987, + "loss": 0.0972, + "step": 23378 + }, + { + "epoch": 0.20294094669317106, + "grad_norm": 0.0869140625, + "learning_rate": 0.0017646053467907663, + "loss": 0.1201, + "step": 23379 + }, + { + "epoch": 0.20294962717337522, + "grad_norm": 0.373046875, + "learning_rate": 0.001764585277370231, + "loss": 0.0879, + "step": 23380 + }, + { + "epoch": 0.2029583076535794, + "grad_norm": 0.82421875, + "learning_rate": 0.0017645652072229153, + "loss": 0.0952, + "step": 23381 + }, + { + "epoch": 0.20296698813378355, + "grad_norm": 0.330078125, + "learning_rate": 0.0017645451363488415, + "loss": 0.1436, + "step": 23382 + }, + { + "epoch": 0.20297566861398772, + "grad_norm": 0.09716796875, + "learning_rate": 0.0017645250647480301, + "loss": 0.1182, + "step": 23383 + }, + { + "epoch": 0.20298434909419188, + "grad_norm": 0.07080078125, + "learning_rate": 0.001764504992420505, + "loss": 0.1045, + "step": 23384 + }, + { + "epoch": 0.20299302957439605, + "grad_norm": 0.380859375, + "learning_rate": 0.001764484919366287, + "loss": 0.1152, + "step": 23385 + }, + { + "epoch": 0.20300171005460022, + "grad_norm": 0.2392578125, + "learning_rate": 0.001764464845585398, + "loss": 0.0645, + "step": 23386 + }, + { + "epoch": 0.20301039053480438, + "grad_norm": 0.453125, + "learning_rate": 0.0017644447710778604, + "loss": 0.0938, + "step": 23387 + }, + { + "epoch": 0.20301907101500855, + "grad_norm": 1.0625, + "learning_rate": 0.0017644246958436961, + "loss": 0.1152, + "step": 23388 + }, + { + "epoch": 0.2030277514952127, + "grad_norm": 0.64453125, + "learning_rate": 0.0017644046198829267, + "loss": 0.1367, + "step": 23389 + }, + { + "epoch": 0.20303643197541688, + "grad_norm": 0.09033203125, + "learning_rate": 0.0017643845431955748, + "loss": 0.0938, + "step": 23390 + }, + { + "epoch": 0.20304511245562104, + "grad_norm": 0.310546875, + "learning_rate": 0.0017643644657816615, + "loss": 0.0908, + "step": 23391 + }, + { + "epoch": 0.2030537929358252, + "grad_norm": 0.09912109375, + "learning_rate": 0.0017643443876412094, + "loss": 0.1211, + "step": 23392 + }, + { + "epoch": 0.20306247341602937, + "grad_norm": 0.29296875, + "learning_rate": 0.0017643243087742404, + 
"loss": 0.105, + "step": 23393 + }, + { + "epoch": 0.20307115389623354, + "grad_norm": 0.310546875, + "learning_rate": 0.0017643042291807763, + "loss": 0.1167, + "step": 23394 + }, + { + "epoch": 0.2030798343764377, + "grad_norm": 0.2890625, + "learning_rate": 0.0017642841488608384, + "loss": 0.1089, + "step": 23395 + }, + { + "epoch": 0.20308851485664187, + "grad_norm": 0.10986328125, + "learning_rate": 0.0017642640678144502, + "loss": 0.0718, + "step": 23396 + }, + { + "epoch": 0.20309719533684603, + "grad_norm": 0.50390625, + "learning_rate": 0.0017642439860416327, + "loss": 0.0977, + "step": 23397 + }, + { + "epoch": 0.2031058758170502, + "grad_norm": 0.8125, + "learning_rate": 0.0017642239035424074, + "loss": 0.0972, + "step": 23398 + }, + { + "epoch": 0.20311455629725436, + "grad_norm": 0.58984375, + "learning_rate": 0.0017642038203167973, + "loss": 0.0981, + "step": 23399 + }, + { + "epoch": 0.20312323677745853, + "grad_norm": 0.921875, + "learning_rate": 0.0017641837363648238, + "loss": 0.1104, + "step": 23400 + }, + { + "epoch": 0.2031319172576627, + "grad_norm": 0.125, + "learning_rate": 0.0017641636516865088, + "loss": 0.1299, + "step": 23401 + }, + { + "epoch": 0.20314059773786686, + "grad_norm": 1.15625, + "learning_rate": 0.0017641435662818744, + "loss": 0.1123, + "step": 23402 + }, + { + "epoch": 0.20314927821807102, + "grad_norm": 0.53515625, + "learning_rate": 0.0017641234801509427, + "loss": 0.124, + "step": 23403 + }, + { + "epoch": 0.2031579586982752, + "grad_norm": 0.609375, + "learning_rate": 0.0017641033932937354, + "loss": 0.1289, + "step": 23404 + }, + { + "epoch": 0.20316663917847935, + "grad_norm": 0.259765625, + "learning_rate": 0.0017640833057102749, + "loss": 0.1445, + "step": 23405 + }, + { + "epoch": 0.20317531965868352, + "grad_norm": 0.48828125, + "learning_rate": 0.0017640632174005825, + "loss": 0.1338, + "step": 23406 + }, + { + "epoch": 0.20318400013888768, + "grad_norm": 0.78125, + "learning_rate": 0.0017640431283646807, + "loss": 0.1157, + "step": 23407 + }, + { + "epoch": 0.20319268061909185, + "grad_norm": 0.416015625, + "learning_rate": 0.0017640230386025916, + "loss": 0.0928, + "step": 23408 + }, + { + "epoch": 0.203201361099296, + "grad_norm": 0.1728515625, + "learning_rate": 0.0017640029481143366, + "loss": 0.0845, + "step": 23409 + }, + { + "epoch": 0.20321004157950018, + "grad_norm": 0.33203125, + "learning_rate": 0.0017639828568999378, + "loss": 0.0894, + "step": 23410 + }, + { + "epoch": 0.20321872205970434, + "grad_norm": 0.380859375, + "learning_rate": 0.0017639627649594174, + "loss": 0.1006, + "step": 23411 + }, + { + "epoch": 0.2032274025399085, + "grad_norm": 0.1123046875, + "learning_rate": 0.0017639426722927975, + "loss": 0.1152, + "step": 23412 + }, + { + "epoch": 0.20323608302011267, + "grad_norm": 0.1025390625, + "learning_rate": 0.0017639225789000997, + "loss": 0.1377, + "step": 23413 + }, + { + "epoch": 0.20324476350031684, + "grad_norm": 0.1630859375, + "learning_rate": 0.0017639024847813463, + "loss": 0.0981, + "step": 23414 + }, + { + "epoch": 0.203253443980521, + "grad_norm": 0.41015625, + "learning_rate": 0.001763882389936559, + "loss": 0.1172, + "step": 23415 + }, + { + "epoch": 0.20326212446072517, + "grad_norm": 0.39453125, + "learning_rate": 0.00176386229436576, + "loss": 0.0811, + "step": 23416 + }, + { + "epoch": 0.20327080494092933, + "grad_norm": 0.375, + "learning_rate": 0.001763842198068971, + "loss": 0.1064, + "step": 23417 + }, + { + "epoch": 0.2032794854211335, + "grad_norm": 0.169921875, + "learning_rate": 
0.001763822101046214, + "loss": 0.0933, + "step": 23418 + }, + { + "epoch": 0.20328816590133766, + "grad_norm": 0.2021484375, + "learning_rate": 0.0017638020032975116, + "loss": 0.1152, + "step": 23419 + }, + { + "epoch": 0.20329684638154183, + "grad_norm": 0.18359375, + "learning_rate": 0.001763781904822885, + "loss": 0.1016, + "step": 23420 + }, + { + "epoch": 0.203305526861746, + "grad_norm": 0.5234375, + "learning_rate": 0.0017637618056223566, + "loss": 0.085, + "step": 23421 + }, + { + "epoch": 0.20331420734195016, + "grad_norm": 0.10009765625, + "learning_rate": 0.0017637417056959484, + "loss": 0.1348, + "step": 23422 + }, + { + "epoch": 0.20332288782215432, + "grad_norm": 0.34375, + "learning_rate": 0.0017637216050436818, + "loss": 0.1094, + "step": 23423 + }, + { + "epoch": 0.2033315683023585, + "grad_norm": 0.32421875, + "learning_rate": 0.0017637015036655794, + "loss": 0.1104, + "step": 23424 + }, + { + "epoch": 0.20334024878256265, + "grad_norm": 0.609375, + "learning_rate": 0.001763681401561663, + "loss": 0.1396, + "step": 23425 + }, + { + "epoch": 0.20334892926276682, + "grad_norm": 0.19921875, + "learning_rate": 0.0017636612987319549, + "loss": 0.1143, + "step": 23426 + }, + { + "epoch": 0.20335760974297099, + "grad_norm": 0.1259765625, + "learning_rate": 0.0017636411951764766, + "loss": 0.1162, + "step": 23427 + }, + { + "epoch": 0.20336629022317515, + "grad_norm": 0.1455078125, + "learning_rate": 0.0017636210908952498, + "loss": 0.1211, + "step": 23428 + }, + { + "epoch": 0.20337497070337932, + "grad_norm": 0.45703125, + "learning_rate": 0.0017636009858882976, + "loss": 0.1855, + "step": 23429 + }, + { + "epoch": 0.20338365118358348, + "grad_norm": 0.197265625, + "learning_rate": 0.0017635808801556411, + "loss": 0.1138, + "step": 23430 + }, + { + "epoch": 0.20339233166378765, + "grad_norm": 0.271484375, + "learning_rate": 0.0017635607736973026, + "loss": 0.1162, + "step": 23431 + }, + { + "epoch": 0.2034010121439918, + "grad_norm": 0.365234375, + "learning_rate": 0.0017635406665133038, + "loss": 0.1533, + "step": 23432 + }, + { + "epoch": 0.20340969262419598, + "grad_norm": 0.10009765625, + "learning_rate": 0.001763520558603667, + "loss": 0.1328, + "step": 23433 + }, + { + "epoch": 0.20341837310440014, + "grad_norm": 0.123046875, + "learning_rate": 0.001763500449968414, + "loss": 0.1172, + "step": 23434 + }, + { + "epoch": 0.2034270535846043, + "grad_norm": 0.1845703125, + "learning_rate": 0.001763480340607567, + "loss": 0.1055, + "step": 23435 + }, + { + "epoch": 0.20343573406480847, + "grad_norm": 0.08837890625, + "learning_rate": 0.001763460230521148, + "loss": 0.0957, + "step": 23436 + }, + { + "epoch": 0.20344441454501264, + "grad_norm": 0.36328125, + "learning_rate": 0.0017634401197091786, + "loss": 0.1045, + "step": 23437 + }, + { + "epoch": 0.2034530950252168, + "grad_norm": 0.11376953125, + "learning_rate": 0.0017634200081716811, + "loss": 0.1123, + "step": 23438 + }, + { + "epoch": 0.20346177550542097, + "grad_norm": 0.1728515625, + "learning_rate": 0.0017633998959086777, + "loss": 0.1084, + "step": 23439 + }, + { + "epoch": 0.20347045598562513, + "grad_norm": 0.69921875, + "learning_rate": 0.0017633797829201896, + "loss": 0.1079, + "step": 23440 + }, + { + "epoch": 0.2034791364658293, + "grad_norm": 0.0908203125, + "learning_rate": 0.0017633596692062396, + "loss": 0.084, + "step": 23441 + }, + { + "epoch": 0.20348781694603346, + "grad_norm": 0.283203125, + "learning_rate": 0.0017633395547668497, + "loss": 0.1338, + "step": 23442 + }, + { + "epoch": 
0.20349649742623763, + "grad_norm": 0.181640625, + "learning_rate": 0.0017633194396020414, + "loss": 0.0996, + "step": 23443 + }, + { + "epoch": 0.2035051779064418, + "grad_norm": 0.2021484375, + "learning_rate": 0.0017632993237118371, + "loss": 0.1074, + "step": 23444 + }, + { + "epoch": 0.20351385838664596, + "grad_norm": 0.4921875, + "learning_rate": 0.0017632792070962585, + "loss": 0.123, + "step": 23445 + }, + { + "epoch": 0.20352253886685012, + "grad_norm": 0.42578125, + "learning_rate": 0.0017632590897553276, + "loss": 0.0898, + "step": 23446 + }, + { + "epoch": 0.2035312193470543, + "grad_norm": 0.189453125, + "learning_rate": 0.0017632389716890668, + "loss": 0.1133, + "step": 23447 + }, + { + "epoch": 0.20353989982725845, + "grad_norm": 0.06884765625, + "learning_rate": 0.0017632188528974974, + "loss": 0.0815, + "step": 23448 + }, + { + "epoch": 0.20354858030746262, + "grad_norm": 0.10888671875, + "learning_rate": 0.0017631987333806423, + "loss": 0.1182, + "step": 23449 + }, + { + "epoch": 0.20355726078766678, + "grad_norm": 0.375, + "learning_rate": 0.0017631786131385228, + "loss": 0.1079, + "step": 23450 + }, + { + "epoch": 0.20356594126787095, + "grad_norm": 0.126953125, + "learning_rate": 0.001763158492171161, + "loss": 0.1338, + "step": 23451 + }, + { + "epoch": 0.2035746217480751, + "grad_norm": 0.55078125, + "learning_rate": 0.0017631383704785793, + "loss": 0.0913, + "step": 23452 + }, + { + "epoch": 0.20358330222827928, + "grad_norm": 0.53125, + "learning_rate": 0.0017631182480607993, + "loss": 0.0938, + "step": 23453 + }, + { + "epoch": 0.20359198270848344, + "grad_norm": 0.1474609375, + "learning_rate": 0.0017630981249178433, + "loss": 0.1143, + "step": 23454 + }, + { + "epoch": 0.2036006631886876, + "grad_norm": 0.625, + "learning_rate": 0.001763078001049733, + "loss": 0.1309, + "step": 23455 + }, + { + "epoch": 0.20360934366889177, + "grad_norm": 0.609375, + "learning_rate": 0.0017630578764564904, + "loss": 0.1201, + "step": 23456 + }, + { + "epoch": 0.20361802414909594, + "grad_norm": 0.1435546875, + "learning_rate": 0.001763037751138138, + "loss": 0.1289, + "step": 23457 + }, + { + "epoch": 0.2036267046293001, + "grad_norm": 0.08154296875, + "learning_rate": 0.0017630176250946971, + "loss": 0.1152, + "step": 23458 + }, + { + "epoch": 0.20363538510950427, + "grad_norm": 0.359375, + "learning_rate": 0.0017629974983261905, + "loss": 0.1367, + "step": 23459 + }, + { + "epoch": 0.20364406558970843, + "grad_norm": 0.12353515625, + "learning_rate": 0.0017629773708326398, + "loss": 0.0898, + "step": 23460 + }, + { + "epoch": 0.2036527460699126, + "grad_norm": 0.1181640625, + "learning_rate": 0.0017629572426140667, + "loss": 0.1094, + "step": 23461 + }, + { + "epoch": 0.20366142655011676, + "grad_norm": 0.1376953125, + "learning_rate": 0.0017629371136704934, + "loss": 0.084, + "step": 23462 + }, + { + "epoch": 0.20367010703032093, + "grad_norm": 0.44921875, + "learning_rate": 0.0017629169840019427, + "loss": 0.1177, + "step": 23463 + }, + { + "epoch": 0.2036787875105251, + "grad_norm": 0.13671875, + "learning_rate": 0.0017628968536084354, + "loss": 0.1475, + "step": 23464 + }, + { + "epoch": 0.20368746799072926, + "grad_norm": 0.310546875, + "learning_rate": 0.0017628767224899943, + "loss": 0.1367, + "step": 23465 + }, + { + "epoch": 0.20369614847093342, + "grad_norm": 0.10791015625, + "learning_rate": 0.001762856590646641, + "loss": 0.124, + "step": 23466 + }, + { + "epoch": 0.2037048289511376, + "grad_norm": 0.8125, + "learning_rate": 0.001762836458078398, + "loss": 0.1328, 
+ "step": 23467 + }, + { + "epoch": 0.20371350943134175, + "grad_norm": 0.21875, + "learning_rate": 0.0017628163247852865, + "loss": 0.1367, + "step": 23468 + }, + { + "epoch": 0.20372218991154592, + "grad_norm": 0.232421875, + "learning_rate": 0.001762796190767329, + "loss": 0.1108, + "step": 23469 + }, + { + "epoch": 0.20373087039175009, + "grad_norm": 0.330078125, + "learning_rate": 0.001762776056024548, + "loss": 0.1299, + "step": 23470 + }, + { + "epoch": 0.20373955087195425, + "grad_norm": 0.255859375, + "learning_rate": 0.001762755920556965, + "loss": 0.0903, + "step": 23471 + }, + { + "epoch": 0.2037482313521584, + "grad_norm": 0.28515625, + "learning_rate": 0.0017627357843646022, + "loss": 0.1504, + "step": 23472 + }, + { + "epoch": 0.20375691183236255, + "grad_norm": 0.337890625, + "learning_rate": 0.001762715647447481, + "loss": 0.1035, + "step": 23473 + }, + { + "epoch": 0.20376559231256672, + "grad_norm": 0.07421875, + "learning_rate": 0.0017626955098056243, + "loss": 0.0825, + "step": 23474 + }, + { + "epoch": 0.20377427279277088, + "grad_norm": 0.3828125, + "learning_rate": 0.0017626753714390538, + "loss": 0.1182, + "step": 23475 + }, + { + "epoch": 0.20378295327297505, + "grad_norm": 0.359375, + "learning_rate": 0.0017626552323477913, + "loss": 0.1099, + "step": 23476 + }, + { + "epoch": 0.2037916337531792, + "grad_norm": 0.255859375, + "learning_rate": 0.001762635092531859, + "loss": 0.1025, + "step": 23477 + }, + { + "epoch": 0.20380031423338338, + "grad_norm": 0.1103515625, + "learning_rate": 0.001762614951991279, + "loss": 0.1064, + "step": 23478 + }, + { + "epoch": 0.20380899471358754, + "grad_norm": 0.333984375, + "learning_rate": 0.0017625948107260732, + "loss": 0.1152, + "step": 23479 + }, + { + "epoch": 0.2038176751937917, + "grad_norm": 0.162109375, + "learning_rate": 0.0017625746687362637, + "loss": 0.1025, + "step": 23480 + }, + { + "epoch": 0.20382635567399587, + "grad_norm": 0.1669921875, + "learning_rate": 0.0017625545260218725, + "loss": 0.1162, + "step": 23481 + }, + { + "epoch": 0.20383503615420004, + "grad_norm": 0.251953125, + "learning_rate": 0.0017625343825829211, + "loss": 0.0928, + "step": 23482 + }, + { + "epoch": 0.2038437166344042, + "grad_norm": 0.15625, + "learning_rate": 0.0017625142384194328, + "loss": 0.1377, + "step": 23483 + }, + { + "epoch": 0.20385239711460837, + "grad_norm": 0.130859375, + "learning_rate": 0.0017624940935314286, + "loss": 0.1475, + "step": 23484 + }, + { + "epoch": 0.20386107759481253, + "grad_norm": 0.1826171875, + "learning_rate": 0.001762473947918931, + "loss": 0.1064, + "step": 23485 + }, + { + "epoch": 0.2038697580750167, + "grad_norm": 0.3046875, + "learning_rate": 0.0017624538015819616, + "loss": 0.124, + "step": 23486 + }, + { + "epoch": 0.20387843855522086, + "grad_norm": 0.10888671875, + "learning_rate": 0.0017624336545205427, + "loss": 0.1094, + "step": 23487 + }, + { + "epoch": 0.20388711903542503, + "grad_norm": 0.16796875, + "learning_rate": 0.0017624135067346965, + "loss": 0.1211, + "step": 23488 + }, + { + "epoch": 0.2038957995156292, + "grad_norm": 0.3671875, + "learning_rate": 0.0017623933582244446, + "loss": 0.105, + "step": 23489 + }, + { + "epoch": 0.20390447999583336, + "grad_norm": 0.91796875, + "learning_rate": 0.0017623732089898094, + "loss": 0.106, + "step": 23490 + }, + { + "epoch": 0.20391316047603752, + "grad_norm": 0.2275390625, + "learning_rate": 0.0017623530590308127, + "loss": 0.1006, + "step": 23491 + }, + { + "epoch": 0.2039218409562417, + "grad_norm": 0.357421875, + "learning_rate": 
0.0017623329083474768, + "loss": 0.126, + "step": 23492 + }, + { + "epoch": 0.20393052143644586, + "grad_norm": 0.283203125, + "learning_rate": 0.0017623127569398234, + "loss": 0.1226, + "step": 23493 + }, + { + "epoch": 0.20393920191665002, + "grad_norm": 0.7421875, + "learning_rate": 0.001762292604807875, + "loss": 0.1064, + "step": 23494 + }, + { + "epoch": 0.20394788239685419, + "grad_norm": 0.58203125, + "learning_rate": 0.001762272451951653, + "loss": 0.1279, + "step": 23495 + }, + { + "epoch": 0.20395656287705835, + "grad_norm": 1.4765625, + "learning_rate": 0.0017622522983711802, + "loss": 0.1123, + "step": 23496 + }, + { + "epoch": 0.20396524335726252, + "grad_norm": 0.390625, + "learning_rate": 0.001762232144066478, + "loss": 0.1162, + "step": 23497 + }, + { + "epoch": 0.20397392383746668, + "grad_norm": 0.625, + "learning_rate": 0.0017622119890375687, + "loss": 0.1621, + "step": 23498 + }, + { + "epoch": 0.20398260431767085, + "grad_norm": 0.30078125, + "learning_rate": 0.0017621918332844742, + "loss": 0.1172, + "step": 23499 + }, + { + "epoch": 0.203991284797875, + "grad_norm": 0.25390625, + "learning_rate": 0.001762171676807217, + "loss": 0.1201, + "step": 23500 + }, + { + "epoch": 0.20399996527807918, + "grad_norm": 0.126953125, + "learning_rate": 0.0017621515196058187, + "loss": 0.1484, + "step": 23501 + }, + { + "epoch": 0.20400864575828334, + "grad_norm": 0.66015625, + "learning_rate": 0.0017621313616803013, + "loss": 0.1074, + "step": 23502 + }, + { + "epoch": 0.2040173262384875, + "grad_norm": 0.08984375, + "learning_rate": 0.001762111203030687, + "loss": 0.1016, + "step": 23503 + }, + { + "epoch": 0.20402600671869167, + "grad_norm": 0.74609375, + "learning_rate": 0.0017620910436569979, + "loss": 0.0977, + "step": 23504 + }, + { + "epoch": 0.20403468719889584, + "grad_norm": 0.1669921875, + "learning_rate": 0.0017620708835592558, + "loss": 0.084, + "step": 23505 + }, + { + "epoch": 0.2040433676791, + "grad_norm": 0.412109375, + "learning_rate": 0.001762050722737483, + "loss": 0.0977, + "step": 23506 + }, + { + "epoch": 0.20405204815930417, + "grad_norm": 0.19921875, + "learning_rate": 0.0017620305611917018, + "loss": 0.1104, + "step": 23507 + }, + { + "epoch": 0.20406072863950833, + "grad_norm": 1.1015625, + "learning_rate": 0.0017620103989219335, + "loss": 0.1133, + "step": 23508 + }, + { + "epoch": 0.2040694091197125, + "grad_norm": 0.578125, + "learning_rate": 0.001761990235928201, + "loss": 0.1143, + "step": 23509 + }, + { + "epoch": 0.20407808959991666, + "grad_norm": 0.099609375, + "learning_rate": 0.0017619700722105254, + "loss": 0.1143, + "step": 23510 + }, + { + "epoch": 0.20408677008012083, + "grad_norm": 0.11279296875, + "learning_rate": 0.0017619499077689295, + "loss": 0.1191, + "step": 23511 + }, + { + "epoch": 0.204095450560325, + "grad_norm": 0.90625, + "learning_rate": 0.0017619297426034353, + "loss": 0.1445, + "step": 23512 + }, + { + "epoch": 0.20410413104052916, + "grad_norm": 0.6484375, + "learning_rate": 0.0017619095767140644, + "loss": 0.0967, + "step": 23513 + }, + { + "epoch": 0.20411281152073332, + "grad_norm": 0.1484375, + "learning_rate": 0.0017618894101008393, + "loss": 0.1221, + "step": 23514 + }, + { + "epoch": 0.2041214920009375, + "grad_norm": 0.1591796875, + "learning_rate": 0.0017618692427637818, + "loss": 0.0913, + "step": 23515 + }, + { + "epoch": 0.20413017248114165, + "grad_norm": 0.291015625, + "learning_rate": 0.0017618490747029147, + "loss": 0.0835, + "step": 23516 + }, + { + "epoch": 0.20413885296134582, + "grad_norm": 
0.431640625, + "learning_rate": 0.0017618289059182584, + "loss": 0.085, + "step": 23517 + }, + { + "epoch": 0.20414753344154998, + "grad_norm": 0.201171875, + "learning_rate": 0.0017618087364098364, + "loss": 0.1006, + "step": 23518 + }, + { + "epoch": 0.20415621392175415, + "grad_norm": 0.359375, + "learning_rate": 0.0017617885661776703, + "loss": 0.104, + "step": 23519 + }, + { + "epoch": 0.2041648944019583, + "grad_norm": 0.1796875, + "learning_rate": 0.0017617683952217823, + "loss": 0.1123, + "step": 23520 + }, + { + "epoch": 0.20417357488216248, + "grad_norm": 0.76953125, + "learning_rate": 0.001761748223542194, + "loss": 0.0986, + "step": 23521 + }, + { + "epoch": 0.20418225536236664, + "grad_norm": 0.390625, + "learning_rate": 0.0017617280511389281, + "loss": 0.1123, + "step": 23522 + }, + { + "epoch": 0.2041909358425708, + "grad_norm": 0.65234375, + "learning_rate": 0.001761707878012006, + "loss": 0.1348, + "step": 23523 + }, + { + "epoch": 0.20419961632277497, + "grad_norm": 0.087890625, + "learning_rate": 0.0017616877041614503, + "loss": 0.0825, + "step": 23524 + }, + { + "epoch": 0.20420829680297914, + "grad_norm": 1.03125, + "learning_rate": 0.001761667529587283, + "loss": 0.0991, + "step": 23525 + }, + { + "epoch": 0.2042169772831833, + "grad_norm": 0.265625, + "learning_rate": 0.0017616473542895258, + "loss": 0.125, + "step": 23526 + }, + { + "epoch": 0.20422565776338747, + "grad_norm": 0.384765625, + "learning_rate": 0.0017616271782682012, + "loss": 0.1211, + "step": 23527 + }, + { + "epoch": 0.20423433824359163, + "grad_norm": 0.33203125, + "learning_rate": 0.001761607001523331, + "loss": 0.1143, + "step": 23528 + }, + { + "epoch": 0.2042430187237958, + "grad_norm": 0.2060546875, + "learning_rate": 0.001761586824054937, + "loss": 0.1631, + "step": 23529 + }, + { + "epoch": 0.20425169920399996, + "grad_norm": 0.1962890625, + "learning_rate": 0.0017615666458630419, + "loss": 0.0718, + "step": 23530 + }, + { + "epoch": 0.20426037968420413, + "grad_norm": 0.59765625, + "learning_rate": 0.0017615464669476675, + "loss": 0.1157, + "step": 23531 + }, + { + "epoch": 0.2042690601644083, + "grad_norm": 0.70703125, + "learning_rate": 0.0017615262873088358, + "loss": 0.1216, + "step": 23532 + }, + { + "epoch": 0.20427774064461246, + "grad_norm": 0.421875, + "learning_rate": 0.001761506106946569, + "loss": 0.0991, + "step": 23533 + }, + { + "epoch": 0.20428642112481663, + "grad_norm": 0.11865234375, + "learning_rate": 0.0017614859258608887, + "loss": 0.1055, + "step": 23534 + }, + { + "epoch": 0.2042951016050208, + "grad_norm": 0.1806640625, + "learning_rate": 0.0017614657440518176, + "loss": 0.1201, + "step": 23535 + }, + { + "epoch": 0.20430378208522496, + "grad_norm": 0.98046875, + "learning_rate": 0.0017614455615193774, + "loss": 0.0942, + "step": 23536 + }, + { + "epoch": 0.20431246256542912, + "grad_norm": 0.302734375, + "learning_rate": 0.0017614253782635902, + "loss": 0.0918, + "step": 23537 + }, + { + "epoch": 0.20432114304563329, + "grad_norm": 0.53125, + "learning_rate": 0.0017614051942844784, + "loss": 0.0732, + "step": 23538 + }, + { + "epoch": 0.20432982352583745, + "grad_norm": 0.53125, + "learning_rate": 0.0017613850095820635, + "loss": 0.1406, + "step": 23539 + }, + { + "epoch": 0.20433850400604162, + "grad_norm": 0.421875, + "learning_rate": 0.0017613648241563682, + "loss": 0.1445, + "step": 23540 + }, + { + "epoch": 0.20434718448624578, + "grad_norm": 0.6953125, + "learning_rate": 0.0017613446380074142, + "loss": 0.0889, + "step": 23541 + }, + { + "epoch": 
0.20435586496644995, + "grad_norm": 0.240234375, + "learning_rate": 0.0017613244511352235, + "loss": 0.1426, + "step": 23542 + }, + { + "epoch": 0.2043645454466541, + "grad_norm": 0.1240234375, + "learning_rate": 0.0017613042635398187, + "loss": 0.0864, + "step": 23543 + }, + { + "epoch": 0.20437322592685828, + "grad_norm": 1.03125, + "learning_rate": 0.0017612840752212209, + "loss": 0.1914, + "step": 23544 + }, + { + "epoch": 0.20438190640706244, + "grad_norm": 0.42578125, + "learning_rate": 0.001761263886179453, + "loss": 0.1128, + "step": 23545 + }, + { + "epoch": 0.2043905868872666, + "grad_norm": 0.1396484375, + "learning_rate": 0.0017612436964145372, + "loss": 0.1387, + "step": 23546 + }, + { + "epoch": 0.20439926736747077, + "grad_norm": 0.212890625, + "learning_rate": 0.0017612235059264946, + "loss": 0.1025, + "step": 23547 + }, + { + "epoch": 0.20440794784767494, + "grad_norm": 0.4140625, + "learning_rate": 0.0017612033147153484, + "loss": 0.1357, + "step": 23548 + }, + { + "epoch": 0.2044166283278791, + "grad_norm": 0.56640625, + "learning_rate": 0.0017611831227811198, + "loss": 0.1016, + "step": 23549 + }, + { + "epoch": 0.20442530880808327, + "grad_norm": 0.091796875, + "learning_rate": 0.0017611629301238316, + "loss": 0.1143, + "step": 23550 + }, + { + "epoch": 0.20443398928828743, + "grad_norm": 0.41796875, + "learning_rate": 0.0017611427367435054, + "loss": 0.1152, + "step": 23551 + }, + { + "epoch": 0.2044426697684916, + "grad_norm": 0.1240234375, + "learning_rate": 0.0017611225426401637, + "loss": 0.1128, + "step": 23552 + }, + { + "epoch": 0.20445135024869576, + "grad_norm": 0.1953125, + "learning_rate": 0.001761102347813828, + "loss": 0.1021, + "step": 23553 + }, + { + "epoch": 0.20446003072889993, + "grad_norm": 0.462890625, + "learning_rate": 0.0017610821522645208, + "loss": 0.1055, + "step": 23554 + }, + { + "epoch": 0.2044687112091041, + "grad_norm": 0.08642578125, + "learning_rate": 0.001761061955992264, + "loss": 0.1133, + "step": 23555 + }, + { + "epoch": 0.20447739168930826, + "grad_norm": 0.5546875, + "learning_rate": 0.0017610417589970801, + "loss": 0.1396, + "step": 23556 + }, + { + "epoch": 0.20448607216951242, + "grad_norm": 0.83203125, + "learning_rate": 0.0017610215612789903, + "loss": 0.1143, + "step": 23557 + }, + { + "epoch": 0.2044947526497166, + "grad_norm": 0.8125, + "learning_rate": 0.001761001362838018, + "loss": 0.3965, + "step": 23558 + }, + { + "epoch": 0.20450343312992075, + "grad_norm": 0.2177734375, + "learning_rate": 0.001760981163674184, + "loss": 0.1299, + "step": 23559 + }, + { + "epoch": 0.20451211361012492, + "grad_norm": 0.408203125, + "learning_rate": 0.001760960963787511, + "loss": 0.0801, + "step": 23560 + }, + { + "epoch": 0.20452079409032908, + "grad_norm": 0.205078125, + "learning_rate": 0.001760940763178021, + "loss": 0.123, + "step": 23561 + }, + { + "epoch": 0.20452947457053325, + "grad_norm": 0.3515625, + "learning_rate": 0.0017609205618457364, + "loss": 0.0791, + "step": 23562 + }, + { + "epoch": 0.2045381550507374, + "grad_norm": 0.376953125, + "learning_rate": 0.0017609003597906787, + "loss": 0.0889, + "step": 23563 + }, + { + "epoch": 0.20454683553094158, + "grad_norm": 0.25, + "learning_rate": 0.0017608801570128702, + "loss": 0.1016, + "step": 23564 + }, + { + "epoch": 0.20455551601114574, + "grad_norm": 0.1376953125, + "learning_rate": 0.0017608599535123336, + "loss": 0.106, + "step": 23565 + }, + { + "epoch": 0.2045641964913499, + "grad_norm": 0.12353515625, + "learning_rate": 0.00176083974928909, + "loss": 0.1016, + 
"step": 23566 + }, + { + "epoch": 0.20457287697155407, + "grad_norm": 0.130859375, + "learning_rate": 0.0017608195443431624, + "loss": 0.1196, + "step": 23567 + }, + { + "epoch": 0.20458155745175824, + "grad_norm": 0.302734375, + "learning_rate": 0.0017607993386745723, + "loss": 0.082, + "step": 23568 + }, + { + "epoch": 0.2045902379319624, + "grad_norm": 0.330078125, + "learning_rate": 0.0017607791322833417, + "loss": 0.1338, + "step": 23569 + }, + { + "epoch": 0.20459891841216657, + "grad_norm": 0.11474609375, + "learning_rate": 0.0017607589251694934, + "loss": 0.1221, + "step": 23570 + }, + { + "epoch": 0.20460759889237073, + "grad_norm": 0.1015625, + "learning_rate": 0.0017607387173330486, + "loss": 0.0957, + "step": 23571 + }, + { + "epoch": 0.2046162793725749, + "grad_norm": 0.259765625, + "learning_rate": 0.0017607185087740302, + "loss": 0.1069, + "step": 23572 + }, + { + "epoch": 0.20462495985277906, + "grad_norm": 0.1513671875, + "learning_rate": 0.0017606982994924597, + "loss": 0.1377, + "step": 23573 + }, + { + "epoch": 0.20463364033298323, + "grad_norm": 0.126953125, + "learning_rate": 0.0017606780894883595, + "loss": 0.124, + "step": 23574 + }, + { + "epoch": 0.2046423208131874, + "grad_norm": 0.38671875, + "learning_rate": 0.001760657878761752, + "loss": 0.0869, + "step": 23575 + }, + { + "epoch": 0.20465100129339156, + "grad_norm": 0.126953125, + "learning_rate": 0.0017606376673126587, + "loss": 0.1104, + "step": 23576 + }, + { + "epoch": 0.20465968177359573, + "grad_norm": 1.3203125, + "learning_rate": 0.001760617455141102, + "loss": 0.126, + "step": 23577 + }, + { + "epoch": 0.2046683622537999, + "grad_norm": 0.1962890625, + "learning_rate": 0.0017605972422471041, + "loss": 0.123, + "step": 23578 + }, + { + "epoch": 0.20467704273400406, + "grad_norm": 0.447265625, + "learning_rate": 0.0017605770286306867, + "loss": 0.124, + "step": 23579 + }, + { + "epoch": 0.20468572321420822, + "grad_norm": 0.173828125, + "learning_rate": 0.0017605568142918726, + "loss": 0.1035, + "step": 23580 + }, + { + "epoch": 0.20469440369441239, + "grad_norm": 0.25, + "learning_rate": 0.0017605365992306833, + "loss": 0.0972, + "step": 23581 + }, + { + "epoch": 0.20470308417461655, + "grad_norm": 0.30859375, + "learning_rate": 0.0017605163834471412, + "loss": 0.1055, + "step": 23582 + }, + { + "epoch": 0.20471176465482072, + "grad_norm": 0.11572265625, + "learning_rate": 0.0017604961669412682, + "loss": 0.123, + "step": 23583 + }, + { + "epoch": 0.20472044513502488, + "grad_norm": 0.07861328125, + "learning_rate": 0.0017604759497130864, + "loss": 0.0962, + "step": 23584 + }, + { + "epoch": 0.20472912561522905, + "grad_norm": 0.3125, + "learning_rate": 0.0017604557317626181, + "loss": 0.1001, + "step": 23585 + }, + { + "epoch": 0.2047378060954332, + "grad_norm": 0.2041015625, + "learning_rate": 0.0017604355130898855, + "loss": 0.1172, + "step": 23586 + }, + { + "epoch": 0.20474648657563738, + "grad_norm": 0.37890625, + "learning_rate": 0.0017604152936949104, + "loss": 0.1152, + "step": 23587 + }, + { + "epoch": 0.20475516705584154, + "grad_norm": 0.1533203125, + "learning_rate": 0.001760395073577715, + "loss": 0.1182, + "step": 23588 + }, + { + "epoch": 0.2047638475360457, + "grad_norm": 0.1591796875, + "learning_rate": 0.0017603748527383217, + "loss": 0.0977, + "step": 23589 + }, + { + "epoch": 0.20477252801624987, + "grad_norm": 0.640625, + "learning_rate": 0.001760354631176752, + "loss": 0.127, + "step": 23590 + }, + { + "epoch": 0.20478120849645404, + "grad_norm": 0.70703125, + "learning_rate": 
0.0017603344088930287, + "loss": 0.1064, + "step": 23591 + }, + { + "epoch": 0.2047898889766582, + "grad_norm": 0.1435546875, + "learning_rate": 0.0017603141858871733, + "loss": 0.0967, + "step": 23592 + }, + { + "epoch": 0.20479856945686237, + "grad_norm": 0.34765625, + "learning_rate": 0.0017602939621592087, + "loss": 0.082, + "step": 23593 + }, + { + "epoch": 0.20480724993706653, + "grad_norm": 0.2060546875, + "learning_rate": 0.0017602737377091563, + "loss": 0.0967, + "step": 23594 + }, + { + "epoch": 0.20481593041727067, + "grad_norm": 0.1025390625, + "learning_rate": 0.0017602535125370385, + "loss": 0.1182, + "step": 23595 + }, + { + "epoch": 0.20482461089747483, + "grad_norm": 0.251953125, + "learning_rate": 0.0017602332866428773, + "loss": 0.1064, + "step": 23596 + }, + { + "epoch": 0.204833291377679, + "grad_norm": 0.388671875, + "learning_rate": 0.0017602130600266948, + "loss": 0.1074, + "step": 23597 + }, + { + "epoch": 0.20484197185788316, + "grad_norm": 0.78125, + "learning_rate": 0.0017601928326885135, + "loss": 0.1689, + "step": 23598 + }, + { + "epoch": 0.20485065233808733, + "grad_norm": 0.16015625, + "learning_rate": 0.0017601726046283551, + "loss": 0.1123, + "step": 23599 + }, + { + "epoch": 0.2048593328182915, + "grad_norm": 0.158203125, + "learning_rate": 0.001760152375846242, + "loss": 0.0928, + "step": 23600 + }, + { + "epoch": 0.20486801329849566, + "grad_norm": 0.25, + "learning_rate": 0.001760132146342196, + "loss": 0.1001, + "step": 23601 + }, + { + "epoch": 0.20487669377869983, + "grad_norm": 0.271484375, + "learning_rate": 0.0017601119161162393, + "loss": 0.1953, + "step": 23602 + }, + { + "epoch": 0.204885374258904, + "grad_norm": 0.2216796875, + "learning_rate": 0.0017600916851683943, + "loss": 0.0879, + "step": 23603 + }, + { + "epoch": 0.20489405473910816, + "grad_norm": 0.859375, + "learning_rate": 0.001760071453498683, + "loss": 0.0928, + "step": 23604 + }, + { + "epoch": 0.20490273521931232, + "grad_norm": 0.31640625, + "learning_rate": 0.0017600512211071273, + "loss": 0.0703, + "step": 23605 + }, + { + "epoch": 0.20491141569951649, + "grad_norm": 0.283203125, + "learning_rate": 0.0017600309879937494, + "loss": 0.105, + "step": 23606 + }, + { + "epoch": 0.20492009617972065, + "grad_norm": 0.2197265625, + "learning_rate": 0.0017600107541585717, + "loss": 0.0659, + "step": 23607 + }, + { + "epoch": 0.20492877665992482, + "grad_norm": 0.46875, + "learning_rate": 0.0017599905196016164, + "loss": 0.1104, + "step": 23608 + }, + { + "epoch": 0.20493745714012898, + "grad_norm": 0.244140625, + "learning_rate": 0.001759970284322905, + "loss": 0.0952, + "step": 23609 + }, + { + "epoch": 0.20494613762033315, + "grad_norm": 0.201171875, + "learning_rate": 0.0017599500483224603, + "loss": 0.0947, + "step": 23610 + }, + { + "epoch": 0.2049548181005373, + "grad_norm": 0.44921875, + "learning_rate": 0.0017599298116003042, + "loss": 0.1172, + "step": 23611 + }, + { + "epoch": 0.20496349858074148, + "grad_norm": 0.9453125, + "learning_rate": 0.0017599095741564583, + "loss": 0.0908, + "step": 23612 + }, + { + "epoch": 0.20497217906094564, + "grad_norm": 0.263671875, + "learning_rate": 0.0017598893359909455, + "loss": 0.1108, + "step": 23613 + }, + { + "epoch": 0.2049808595411498, + "grad_norm": 0.671875, + "learning_rate": 0.0017598690971037877, + "loss": 0.1045, + "step": 23614 + }, + { + "epoch": 0.20498954002135397, + "grad_norm": 0.298828125, + "learning_rate": 0.001759848857495007, + "loss": 0.1035, + "step": 23615 + }, + { + "epoch": 0.20499822050155814, + 
"grad_norm": 0.453125, + "learning_rate": 0.0017598286171646256, + "loss": 0.1143, + "step": 23616 + }, + { + "epoch": 0.2050069009817623, + "grad_norm": 0.10595703125, + "learning_rate": 0.0017598083761126652, + "loss": 0.0806, + "step": 23617 + }, + { + "epoch": 0.20501558146196647, + "grad_norm": 0.0791015625, + "learning_rate": 0.0017597881343391483, + "loss": 0.1123, + "step": 23618 + }, + { + "epoch": 0.20502426194217063, + "grad_norm": 0.318359375, + "learning_rate": 0.0017597678918440972, + "loss": 0.1279, + "step": 23619 + }, + { + "epoch": 0.2050329424223748, + "grad_norm": 0.95703125, + "learning_rate": 0.0017597476486275337, + "loss": 0.1221, + "step": 23620 + }, + { + "epoch": 0.20504162290257896, + "grad_norm": 0.1787109375, + "learning_rate": 0.0017597274046894803, + "loss": 0.1328, + "step": 23621 + }, + { + "epoch": 0.20505030338278313, + "grad_norm": 0.7421875, + "learning_rate": 0.0017597071600299586, + "loss": 0.1074, + "step": 23622 + }, + { + "epoch": 0.2050589838629873, + "grad_norm": 0.1572265625, + "learning_rate": 0.0017596869146489914, + "loss": 0.105, + "step": 23623 + }, + { + "epoch": 0.20506766434319146, + "grad_norm": 0.546875, + "learning_rate": 0.0017596666685466002, + "loss": 0.1289, + "step": 23624 + }, + { + "epoch": 0.20507634482339562, + "grad_norm": 0.359375, + "learning_rate": 0.0017596464217228076, + "loss": 0.1226, + "step": 23625 + }, + { + "epoch": 0.2050850253035998, + "grad_norm": 0.2080078125, + "learning_rate": 0.0017596261741776357, + "loss": 0.1406, + "step": 23626 + }, + { + "epoch": 0.20509370578380395, + "grad_norm": 0.1201171875, + "learning_rate": 0.0017596059259111065, + "loss": 0.1201, + "step": 23627 + }, + { + "epoch": 0.20510238626400812, + "grad_norm": 1.7578125, + "learning_rate": 0.0017595856769232422, + "loss": 0.1465, + "step": 23628 + }, + { + "epoch": 0.20511106674421228, + "grad_norm": 0.12109375, + "learning_rate": 0.0017595654272140647, + "loss": 0.1045, + "step": 23629 + }, + { + "epoch": 0.20511974722441645, + "grad_norm": 0.0859375, + "learning_rate": 0.0017595451767835962, + "loss": 0.0835, + "step": 23630 + }, + { + "epoch": 0.20512842770462061, + "grad_norm": 0.3203125, + "learning_rate": 0.0017595249256318593, + "loss": 0.0952, + "step": 23631 + }, + { + "epoch": 0.20513710818482478, + "grad_norm": 0.6328125, + "learning_rate": 0.001759504673758876, + "loss": 0.1025, + "step": 23632 + }, + { + "epoch": 0.20514578866502894, + "grad_norm": 0.59375, + "learning_rate": 0.001759484421164668, + "loss": 0.1426, + "step": 23633 + }, + { + "epoch": 0.2051544691452331, + "grad_norm": 0.13671875, + "learning_rate": 0.0017594641678492578, + "loss": 0.123, + "step": 23634 + }, + { + "epoch": 0.20516314962543727, + "grad_norm": 0.546875, + "learning_rate": 0.0017594439138126678, + "loss": 0.1348, + "step": 23635 + }, + { + "epoch": 0.20517183010564144, + "grad_norm": 0.65625, + "learning_rate": 0.0017594236590549193, + "loss": 0.2178, + "step": 23636 + }, + { + "epoch": 0.2051805105858456, + "grad_norm": 0.1884765625, + "learning_rate": 0.001759403403576035, + "loss": 0.1064, + "step": 23637 + }, + { + "epoch": 0.20518919106604977, + "grad_norm": 0.359375, + "learning_rate": 0.0017593831473760374, + "loss": 0.0903, + "step": 23638 + }, + { + "epoch": 0.20519787154625393, + "grad_norm": 0.283203125, + "learning_rate": 0.0017593628904549485, + "loss": 0.1143, + "step": 23639 + }, + { + "epoch": 0.2052065520264581, + "grad_norm": 0.462890625, + "learning_rate": 0.00175934263281279, + "loss": 0.1143, + "step": 23640 + }, + { + 
"epoch": 0.20521523250666227, + "grad_norm": 0.33984375, + "learning_rate": 0.0017593223744495842, + "loss": 0.1387, + "step": 23641 + }, + { + "epoch": 0.20522391298686643, + "grad_norm": 0.255859375, + "learning_rate": 0.0017593021153653533, + "loss": 0.1045, + "step": 23642 + }, + { + "epoch": 0.2052325934670706, + "grad_norm": 0.1728515625, + "learning_rate": 0.0017592818555601198, + "loss": 0.0933, + "step": 23643 + }, + { + "epoch": 0.20524127394727476, + "grad_norm": 0.1513671875, + "learning_rate": 0.0017592615950339054, + "loss": 0.1328, + "step": 23644 + }, + { + "epoch": 0.20524995442747893, + "grad_norm": 0.349609375, + "learning_rate": 0.0017592413337867321, + "loss": 0.0947, + "step": 23645 + }, + { + "epoch": 0.2052586349076831, + "grad_norm": 0.388671875, + "learning_rate": 0.0017592210718186227, + "loss": 0.1133, + "step": 23646 + }, + { + "epoch": 0.20526731538788726, + "grad_norm": 0.41796875, + "learning_rate": 0.0017592008091295995, + "loss": 0.1846, + "step": 23647 + }, + { + "epoch": 0.20527599586809142, + "grad_norm": 0.48828125, + "learning_rate": 0.0017591805457196834, + "loss": 0.1094, + "step": 23648 + }, + { + "epoch": 0.20528467634829559, + "grad_norm": 0.251953125, + "learning_rate": 0.001759160281588898, + "loss": 0.0967, + "step": 23649 + }, + { + "epoch": 0.20529335682849975, + "grad_norm": 0.7265625, + "learning_rate": 0.0017591400167372643, + "loss": 0.1006, + "step": 23650 + }, + { + "epoch": 0.20530203730870392, + "grad_norm": 0.21484375, + "learning_rate": 0.0017591197511648052, + "loss": 0.1426, + "step": 23651 + }, + { + "epoch": 0.20531071778890808, + "grad_norm": 0.59765625, + "learning_rate": 0.0017590994848715427, + "loss": 0.1064, + "step": 23652 + }, + { + "epoch": 0.20531939826911225, + "grad_norm": 0.65234375, + "learning_rate": 0.0017590792178574987, + "loss": 0.103, + "step": 23653 + }, + { + "epoch": 0.2053280787493164, + "grad_norm": 0.8515625, + "learning_rate": 0.0017590589501226962, + "loss": 0.127, + "step": 23654 + }, + { + "epoch": 0.20533675922952058, + "grad_norm": 0.875, + "learning_rate": 0.0017590386816671562, + "loss": 0.1123, + "step": 23655 + }, + { + "epoch": 0.20534543970972474, + "grad_norm": 0.232421875, + "learning_rate": 0.0017590184124909012, + "loss": 0.1533, + "step": 23656 + }, + { + "epoch": 0.2053541201899289, + "grad_norm": 0.3984375, + "learning_rate": 0.001758998142593954, + "loss": 0.1143, + "step": 23657 + }, + { + "epoch": 0.20536280067013307, + "grad_norm": 0.3359375, + "learning_rate": 0.001758977871976336, + "loss": 0.0898, + "step": 23658 + }, + { + "epoch": 0.20537148115033724, + "grad_norm": 0.63671875, + "learning_rate": 0.00175895760063807, + "loss": 0.1055, + "step": 23659 + }, + { + "epoch": 0.2053801616305414, + "grad_norm": 0.06494140625, + "learning_rate": 0.0017589373285791779, + "loss": 0.1025, + "step": 23660 + }, + { + "epoch": 0.20538884211074557, + "grad_norm": 0.22265625, + "learning_rate": 0.0017589170557996813, + "loss": 0.0967, + "step": 23661 + }, + { + "epoch": 0.20539752259094973, + "grad_norm": 0.357421875, + "learning_rate": 0.0017588967822996036, + "loss": 0.1133, + "step": 23662 + }, + { + "epoch": 0.2054062030711539, + "grad_norm": 0.8515625, + "learning_rate": 0.0017588765080789658, + "loss": 0.1348, + "step": 23663 + }, + { + "epoch": 0.20541488355135806, + "grad_norm": 0.09130859375, + "learning_rate": 0.0017588562331377906, + "loss": 0.2002, + "step": 23664 + }, + { + "epoch": 0.20542356403156223, + "grad_norm": 0.11181640625, + "learning_rate": 0.0017588359574761004, + 
"loss": 0.0938, + "step": 23665 + }, + { + "epoch": 0.2054322445117664, + "grad_norm": 0.365234375, + "learning_rate": 0.0017588156810939169, + "loss": 0.0938, + "step": 23666 + }, + { + "epoch": 0.20544092499197056, + "grad_norm": 0.291015625, + "learning_rate": 0.0017587954039912626, + "loss": 0.106, + "step": 23667 + }, + { + "epoch": 0.20544960547217472, + "grad_norm": 0.1337890625, + "learning_rate": 0.0017587751261681592, + "loss": 0.1123, + "step": 23668 + }, + { + "epoch": 0.2054582859523789, + "grad_norm": 0.341796875, + "learning_rate": 0.0017587548476246295, + "loss": 0.1484, + "step": 23669 + }, + { + "epoch": 0.20546696643258305, + "grad_norm": 0.1796875, + "learning_rate": 0.0017587345683606954, + "loss": 0.127, + "step": 23670 + }, + { + "epoch": 0.20547564691278722, + "grad_norm": 0.07666015625, + "learning_rate": 0.0017587142883763792, + "loss": 0.1064, + "step": 23671 + }, + { + "epoch": 0.20548432739299138, + "grad_norm": 0.173828125, + "learning_rate": 0.0017586940076717028, + "loss": 0.1152, + "step": 23672 + }, + { + "epoch": 0.20549300787319555, + "grad_norm": 0.21484375, + "learning_rate": 0.0017586737262466883, + "loss": 0.1309, + "step": 23673 + }, + { + "epoch": 0.20550168835339971, + "grad_norm": 0.2333984375, + "learning_rate": 0.0017586534441013583, + "loss": 0.1211, + "step": 23674 + }, + { + "epoch": 0.20551036883360388, + "grad_norm": 0.271484375, + "learning_rate": 0.001758633161235735, + "loss": 0.1035, + "step": 23675 + }, + { + "epoch": 0.20551904931380804, + "grad_norm": 0.2451171875, + "learning_rate": 0.0017586128776498403, + "loss": 0.1108, + "step": 23676 + }, + { + "epoch": 0.2055277297940122, + "grad_norm": 0.193359375, + "learning_rate": 0.0017585925933436962, + "loss": 0.1387, + "step": 23677 + }, + { + "epoch": 0.20553641027421637, + "grad_norm": 0.28125, + "learning_rate": 0.0017585723083173254, + "loss": 0.1147, + "step": 23678 + }, + { + "epoch": 0.20554509075442054, + "grad_norm": 0.259765625, + "learning_rate": 0.0017585520225707493, + "loss": 0.0771, + "step": 23679 + }, + { + "epoch": 0.2055537712346247, + "grad_norm": 0.248046875, + "learning_rate": 0.0017585317361039912, + "loss": 0.0854, + "step": 23680 + }, + { + "epoch": 0.20556245171482887, + "grad_norm": 0.18359375, + "learning_rate": 0.0017585114489170723, + "loss": 0.105, + "step": 23681 + }, + { + "epoch": 0.20557113219503304, + "grad_norm": 0.640625, + "learning_rate": 0.0017584911610100157, + "loss": 0.1367, + "step": 23682 + }, + { + "epoch": 0.2055798126752372, + "grad_norm": 0.263671875, + "learning_rate": 0.0017584708723828424, + "loss": 0.0781, + "step": 23683 + }, + { + "epoch": 0.20558849315544137, + "grad_norm": 0.11474609375, + "learning_rate": 0.0017584505830355757, + "loss": 0.0908, + "step": 23684 + }, + { + "epoch": 0.20559717363564553, + "grad_norm": 0.48828125, + "learning_rate": 0.001758430292968237, + "loss": 0.0898, + "step": 23685 + }, + { + "epoch": 0.2056058541158497, + "grad_norm": 0.11376953125, + "learning_rate": 0.0017584100021808492, + "loss": 0.0869, + "step": 23686 + }, + { + "epoch": 0.20561453459605386, + "grad_norm": 0.0908203125, + "learning_rate": 0.0017583897106734341, + "loss": 0.1011, + "step": 23687 + }, + { + "epoch": 0.20562321507625803, + "grad_norm": 0.41796875, + "learning_rate": 0.0017583694184460138, + "loss": 0.0811, + "step": 23688 + }, + { + "epoch": 0.2056318955564622, + "grad_norm": 0.08154296875, + "learning_rate": 0.0017583491254986104, + "loss": 0.084, + "step": 23689 + }, + { + "epoch": 0.20564057603666636, + "grad_norm": 
0.21484375, + "learning_rate": 0.0017583288318312464, + "loss": 0.1191, + "step": 23690 + }, + { + "epoch": 0.20564925651687052, + "grad_norm": 0.166015625, + "learning_rate": 0.001758308537443944, + "loss": 0.1738, + "step": 23691 + }, + { + "epoch": 0.2056579369970747, + "grad_norm": 0.4296875, + "learning_rate": 0.0017582882423367253, + "loss": 0.1094, + "step": 23692 + }, + { + "epoch": 0.20566661747727885, + "grad_norm": 0.11376953125, + "learning_rate": 0.0017582679465096122, + "loss": 0.0913, + "step": 23693 + }, + { + "epoch": 0.20567529795748302, + "grad_norm": 0.251953125, + "learning_rate": 0.0017582476499626272, + "loss": 0.1104, + "step": 23694 + }, + { + "epoch": 0.20568397843768718, + "grad_norm": 0.51953125, + "learning_rate": 0.0017582273526957925, + "loss": 0.1074, + "step": 23695 + }, + { + "epoch": 0.20569265891789135, + "grad_norm": 0.11328125, + "learning_rate": 0.0017582070547091304, + "loss": 0.1016, + "step": 23696 + }, + { + "epoch": 0.2057013393980955, + "grad_norm": 0.1396484375, + "learning_rate": 0.001758186756002663, + "loss": 0.0918, + "step": 23697 + }, + { + "epoch": 0.20571001987829968, + "grad_norm": 0.3984375, + "learning_rate": 0.0017581664565764123, + "loss": 0.1001, + "step": 23698 + }, + { + "epoch": 0.20571870035850384, + "grad_norm": 0.4453125, + "learning_rate": 0.0017581461564304007, + "loss": 0.0864, + "step": 23699 + }, + { + "epoch": 0.205727380838708, + "grad_norm": 0.88671875, + "learning_rate": 0.0017581258555646503, + "loss": 0.1426, + "step": 23700 + }, + { + "epoch": 0.20573606131891217, + "grad_norm": 0.6015625, + "learning_rate": 0.0017581055539791835, + "loss": 0.1196, + "step": 23701 + }, + { + "epoch": 0.20574474179911634, + "grad_norm": 0.412109375, + "learning_rate": 0.0017580852516740223, + "loss": 0.1006, + "step": 23702 + }, + { + "epoch": 0.2057534222793205, + "grad_norm": 0.33984375, + "learning_rate": 0.0017580649486491887, + "loss": 0.0762, + "step": 23703 + }, + { + "epoch": 0.20576210275952467, + "grad_norm": 0.2001953125, + "learning_rate": 0.0017580446449047054, + "loss": 0.0928, + "step": 23704 + }, + { + "epoch": 0.20577078323972883, + "grad_norm": 0.29296875, + "learning_rate": 0.0017580243404405945, + "loss": 0.1523, + "step": 23705 + }, + { + "epoch": 0.205779463719933, + "grad_norm": 0.181640625, + "learning_rate": 0.0017580040352568778, + "loss": 0.103, + "step": 23706 + }, + { + "epoch": 0.20578814420013716, + "grad_norm": 0.0927734375, + "learning_rate": 0.0017579837293535778, + "loss": 0.1201, + "step": 23707 + }, + { + "epoch": 0.20579682468034133, + "grad_norm": 0.1708984375, + "learning_rate": 0.001757963422730717, + "loss": 0.0874, + "step": 23708 + }, + { + "epoch": 0.2058055051605455, + "grad_norm": 0.10595703125, + "learning_rate": 0.0017579431153883168, + "loss": 0.1416, + "step": 23709 + }, + { + "epoch": 0.20581418564074966, + "grad_norm": 0.412109375, + "learning_rate": 0.0017579228073264003, + "loss": 0.0942, + "step": 23710 + }, + { + "epoch": 0.20582286612095382, + "grad_norm": 0.1279296875, + "learning_rate": 0.001757902498544989, + "loss": 0.0923, + "step": 23711 + }, + { + "epoch": 0.205831546601158, + "grad_norm": 0.255859375, + "learning_rate": 0.0017578821890441056, + "loss": 0.1191, + "step": 23712 + }, + { + "epoch": 0.20584022708136215, + "grad_norm": 0.373046875, + "learning_rate": 0.0017578618788237719, + "loss": 0.0879, + "step": 23713 + }, + { + "epoch": 0.20584890756156632, + "grad_norm": 0.466796875, + "learning_rate": 0.0017578415678840104, + "loss": 0.1245, + "step": 23714 + }, 
+ { + "epoch": 0.20585758804177048, + "grad_norm": 0.1455078125, + "learning_rate": 0.0017578212562248435, + "loss": 0.1621, + "step": 23715 + }, + { + "epoch": 0.20586626852197465, + "grad_norm": 0.10546875, + "learning_rate": 0.0017578009438462931, + "loss": 0.1426, + "step": 23716 + }, + { + "epoch": 0.20587494900217881, + "grad_norm": 0.2373046875, + "learning_rate": 0.0017577806307483815, + "loss": 0.1211, + "step": 23717 + }, + { + "epoch": 0.20588362948238295, + "grad_norm": 0.46875, + "learning_rate": 0.0017577603169311306, + "loss": 0.0781, + "step": 23718 + }, + { + "epoch": 0.20589230996258712, + "grad_norm": 0.1005859375, + "learning_rate": 0.001757740002394563, + "loss": 0.0957, + "step": 23719 + }, + { + "epoch": 0.20590099044279128, + "grad_norm": 0.09130859375, + "learning_rate": 0.0017577196871387012, + "loss": 0.0693, + "step": 23720 + }, + { + "epoch": 0.20590967092299545, + "grad_norm": 0.162109375, + "learning_rate": 0.0017576993711635664, + "loss": 0.1079, + "step": 23721 + }, + { + "epoch": 0.2059183514031996, + "grad_norm": 0.46875, + "learning_rate": 0.0017576790544691816, + "loss": 0.0864, + "step": 23722 + }, + { + "epoch": 0.20592703188340378, + "grad_norm": 0.447265625, + "learning_rate": 0.0017576587370555692, + "loss": 0.0762, + "step": 23723 + }, + { + "epoch": 0.20593571236360794, + "grad_norm": 0.50390625, + "learning_rate": 0.0017576384189227508, + "loss": 0.1182, + "step": 23724 + }, + { + "epoch": 0.2059443928438121, + "grad_norm": 0.2021484375, + "learning_rate": 0.0017576181000707493, + "loss": 0.0938, + "step": 23725 + }, + { + "epoch": 0.20595307332401627, + "grad_norm": 0.1552734375, + "learning_rate": 0.0017575977804995863, + "loss": 0.1235, + "step": 23726 + }, + { + "epoch": 0.20596175380422044, + "grad_norm": 0.349609375, + "learning_rate": 0.001757577460209284, + "loss": 0.1118, + "step": 23727 + }, + { + "epoch": 0.2059704342844246, + "grad_norm": 0.2578125, + "learning_rate": 0.0017575571391998648, + "loss": 0.1309, + "step": 23728 + }, + { + "epoch": 0.20597911476462877, + "grad_norm": 0.7265625, + "learning_rate": 0.0017575368174713515, + "loss": 0.1074, + "step": 23729 + }, + { + "epoch": 0.20598779524483293, + "grad_norm": 0.357421875, + "learning_rate": 0.0017575164950237659, + "loss": 0.123, + "step": 23730 + }, + { + "epoch": 0.2059964757250371, + "grad_norm": 1.1328125, + "learning_rate": 0.0017574961718571295, + "loss": 0.127, + "step": 23731 + }, + { + "epoch": 0.20600515620524126, + "grad_norm": 0.28125, + "learning_rate": 0.0017574758479714655, + "loss": 0.0962, + "step": 23732 + }, + { + "epoch": 0.20601383668544543, + "grad_norm": 0.09814453125, + "learning_rate": 0.0017574555233667956, + "loss": 0.0928, + "step": 23733 + }, + { + "epoch": 0.2060225171656496, + "grad_norm": 0.2490234375, + "learning_rate": 0.0017574351980431425, + "loss": 0.1016, + "step": 23734 + }, + { + "epoch": 0.20603119764585376, + "grad_norm": 0.2001953125, + "learning_rate": 0.001757414872000528, + "loss": 0.1152, + "step": 23735 + }, + { + "epoch": 0.20603987812605792, + "grad_norm": 0.8046875, + "learning_rate": 0.0017573945452389746, + "loss": 0.1025, + "step": 23736 + }, + { + "epoch": 0.2060485586062621, + "grad_norm": 0.310546875, + "learning_rate": 0.0017573742177585043, + "loss": 0.0806, + "step": 23737 + }, + { + "epoch": 0.20605723908646625, + "grad_norm": 0.310546875, + "learning_rate": 0.0017573538895591394, + "loss": 0.1279, + "step": 23738 + }, + { + "epoch": 0.20606591956667042, + "grad_norm": 0.28515625, + "learning_rate": 
0.0017573335606409022, + "loss": 0.0815, + "step": 23739 + }, + { + "epoch": 0.20607460004687458, + "grad_norm": 0.69921875, + "learning_rate": 0.001757313231003815, + "loss": 0.0889, + "step": 23740 + }, + { + "epoch": 0.20608328052707875, + "grad_norm": 1.078125, + "learning_rate": 0.0017572929006478997, + "loss": 0.2949, + "step": 23741 + }, + { + "epoch": 0.20609196100728291, + "grad_norm": 0.11572265625, + "learning_rate": 0.0017572725695731788, + "loss": 0.0825, + "step": 23742 + }, + { + "epoch": 0.20610064148748708, + "grad_norm": 0.53125, + "learning_rate": 0.001757252237779675, + "loss": 0.0918, + "step": 23743 + }, + { + "epoch": 0.20610932196769124, + "grad_norm": 0.36328125, + "learning_rate": 0.0017572319052674094, + "loss": 0.0972, + "step": 23744 + }, + { + "epoch": 0.2061180024478954, + "grad_norm": 0.099609375, + "learning_rate": 0.001757211572036405, + "loss": 0.0947, + "step": 23745 + }, + { + "epoch": 0.20612668292809957, + "grad_norm": 0.1279296875, + "learning_rate": 0.0017571912380866842, + "loss": 0.1719, + "step": 23746 + }, + { + "epoch": 0.20613536340830374, + "grad_norm": 0.333984375, + "learning_rate": 0.0017571709034182685, + "loss": 0.1055, + "step": 23747 + }, + { + "epoch": 0.2061440438885079, + "grad_norm": 0.310546875, + "learning_rate": 0.0017571505680311808, + "loss": 0.0723, + "step": 23748 + }, + { + "epoch": 0.20615272436871207, + "grad_norm": 0.5234375, + "learning_rate": 0.0017571302319254432, + "loss": 0.0957, + "step": 23749 + }, + { + "epoch": 0.20616140484891624, + "grad_norm": 0.294921875, + "learning_rate": 0.0017571098951010776, + "loss": 0.1035, + "step": 23750 + }, + { + "epoch": 0.2061700853291204, + "grad_norm": 0.67578125, + "learning_rate": 0.0017570895575581068, + "loss": 0.1377, + "step": 23751 + }, + { + "epoch": 0.20617876580932457, + "grad_norm": 0.13671875, + "learning_rate": 0.0017570692192965523, + "loss": 0.1055, + "step": 23752 + }, + { + "epoch": 0.20618744628952873, + "grad_norm": 0.08447265625, + "learning_rate": 0.0017570488803164374, + "loss": 0.0996, + "step": 23753 + }, + { + "epoch": 0.2061961267697329, + "grad_norm": 0.435546875, + "learning_rate": 0.001757028540617783, + "loss": 0.2344, + "step": 23754 + }, + { + "epoch": 0.20620480724993706, + "grad_norm": 0.193359375, + "learning_rate": 0.0017570082002006124, + "loss": 0.1016, + "step": 23755 + }, + { + "epoch": 0.20621348773014123, + "grad_norm": 0.1640625, + "learning_rate": 0.0017569878590649477, + "loss": 0.0967, + "step": 23756 + }, + { + "epoch": 0.2062221682103454, + "grad_norm": 0.287109375, + "learning_rate": 0.0017569675172108108, + "loss": 0.1147, + "step": 23757 + }, + { + "epoch": 0.20623084869054956, + "grad_norm": 0.294921875, + "learning_rate": 0.001756947174638224, + "loss": 0.1118, + "step": 23758 + }, + { + "epoch": 0.20623952917075372, + "grad_norm": 1.1640625, + "learning_rate": 0.0017569268313472098, + "loss": 0.1699, + "step": 23759 + }, + { + "epoch": 0.2062482096509579, + "grad_norm": 0.11376953125, + "learning_rate": 0.00175690648733779, + "loss": 0.1084, + "step": 23760 + }, + { + "epoch": 0.20625689013116205, + "grad_norm": 0.498046875, + "learning_rate": 0.0017568861426099877, + "loss": 0.0967, + "step": 23761 + }, + { + "epoch": 0.20626557061136622, + "grad_norm": 0.236328125, + "learning_rate": 0.001756865797163824, + "loss": 0.104, + "step": 23762 + }, + { + "epoch": 0.20627425109157038, + "grad_norm": 0.9375, + "learning_rate": 0.0017568454509993218, + "loss": 0.1543, + "step": 23763 + }, + { + "epoch": 0.20628293157177455, + 
"grad_norm": 0.2470703125, + "learning_rate": 0.0017568251041165034, + "loss": 0.0913, + "step": 23764 + }, + { + "epoch": 0.2062916120519787, + "grad_norm": 0.63671875, + "learning_rate": 0.001756804756515391, + "loss": 0.1562, + "step": 23765 + }, + { + "epoch": 0.20630029253218288, + "grad_norm": 0.10498046875, + "learning_rate": 0.0017567844081960067, + "loss": 0.0986, + "step": 23766 + }, + { + "epoch": 0.20630897301238704, + "grad_norm": 0.87890625, + "learning_rate": 0.0017567640591583731, + "loss": 0.1182, + "step": 23767 + }, + { + "epoch": 0.2063176534925912, + "grad_norm": 0.7734375, + "learning_rate": 0.0017567437094025118, + "loss": 0.0967, + "step": 23768 + }, + { + "epoch": 0.20632633397279537, + "grad_norm": 0.189453125, + "learning_rate": 0.0017567233589284455, + "loss": 0.0859, + "step": 23769 + }, + { + "epoch": 0.20633501445299954, + "grad_norm": 0.2060546875, + "learning_rate": 0.0017567030077361967, + "loss": 0.0771, + "step": 23770 + }, + { + "epoch": 0.2063436949332037, + "grad_norm": 0.6015625, + "learning_rate": 0.0017566826558257872, + "loss": 0.1035, + "step": 23771 + }, + { + "epoch": 0.20635237541340787, + "grad_norm": 0.34765625, + "learning_rate": 0.0017566623031972394, + "loss": 0.1064, + "step": 23772 + }, + { + "epoch": 0.20636105589361203, + "grad_norm": 0.390625, + "learning_rate": 0.0017566419498505758, + "loss": 0.1235, + "step": 23773 + }, + { + "epoch": 0.2063697363738162, + "grad_norm": 0.1953125, + "learning_rate": 0.0017566215957858181, + "loss": 0.1172, + "step": 23774 + }, + { + "epoch": 0.20637841685402036, + "grad_norm": 0.41015625, + "learning_rate": 0.001756601241002989, + "loss": 0.1011, + "step": 23775 + }, + { + "epoch": 0.20638709733422453, + "grad_norm": 0.12451171875, + "learning_rate": 0.0017565808855021106, + "loss": 0.0986, + "step": 23776 + }, + { + "epoch": 0.2063957778144287, + "grad_norm": 0.625, + "learning_rate": 0.0017565605292832053, + "loss": 0.1318, + "step": 23777 + }, + { + "epoch": 0.20640445829463286, + "grad_norm": 0.162109375, + "learning_rate": 0.001756540172346295, + "loss": 0.1094, + "step": 23778 + }, + { + "epoch": 0.20641313877483702, + "grad_norm": 0.46875, + "learning_rate": 0.0017565198146914025, + "loss": 0.0806, + "step": 23779 + }, + { + "epoch": 0.2064218192550412, + "grad_norm": 1.5078125, + "learning_rate": 0.0017564994563185497, + "loss": 0.2578, + "step": 23780 + }, + { + "epoch": 0.20643049973524535, + "grad_norm": 0.384765625, + "learning_rate": 0.0017564790972277593, + "loss": 0.1001, + "step": 23781 + }, + { + "epoch": 0.20643918021544952, + "grad_norm": 0.30859375, + "learning_rate": 0.0017564587374190525, + "loss": 0.127, + "step": 23782 + }, + { + "epoch": 0.20644786069565368, + "grad_norm": 0.06982421875, + "learning_rate": 0.001756438376892453, + "loss": 0.0718, + "step": 23783 + }, + { + "epoch": 0.20645654117585785, + "grad_norm": 0.1611328125, + "learning_rate": 0.0017564180156479819, + "loss": 0.0903, + "step": 23784 + }, + { + "epoch": 0.20646522165606201, + "grad_norm": 0.255859375, + "learning_rate": 0.0017563976536856623, + "loss": 0.124, + "step": 23785 + }, + { + "epoch": 0.20647390213626618, + "grad_norm": 0.4921875, + "learning_rate": 0.0017563772910055158, + "loss": 0.0977, + "step": 23786 + }, + { + "epoch": 0.20648258261647034, + "grad_norm": 0.490234375, + "learning_rate": 0.0017563569276075651, + "loss": 0.0752, + "step": 23787 + }, + { + "epoch": 0.2064912630966745, + "grad_norm": 0.67578125, + "learning_rate": 0.0017563365634918322, + "loss": 0.0957, + "step": 23788 + }, 
+ { + "epoch": 0.20649994357687868, + "grad_norm": 0.78515625, + "learning_rate": 0.0017563161986583395, + "loss": 0.1162, + "step": 23789 + }, + { + "epoch": 0.20650862405708284, + "grad_norm": 0.279296875, + "learning_rate": 0.0017562958331071095, + "loss": 0.0859, + "step": 23790 + }, + { + "epoch": 0.206517304537287, + "grad_norm": 0.1669921875, + "learning_rate": 0.0017562754668381637, + "loss": 0.0874, + "step": 23791 + }, + { + "epoch": 0.20652598501749117, + "grad_norm": 0.177734375, + "learning_rate": 0.0017562550998515257, + "loss": 0.0977, + "step": 23792 + }, + { + "epoch": 0.20653466549769534, + "grad_norm": 0.62109375, + "learning_rate": 0.0017562347321472163, + "loss": 0.1157, + "step": 23793 + }, + { + "epoch": 0.2065433459778995, + "grad_norm": 0.20703125, + "learning_rate": 0.0017562143637252588, + "loss": 0.1221, + "step": 23794 + }, + { + "epoch": 0.20655202645810367, + "grad_norm": 0.474609375, + "learning_rate": 0.001756193994585675, + "loss": 0.1099, + "step": 23795 + }, + { + "epoch": 0.20656070693830783, + "grad_norm": 0.298828125, + "learning_rate": 0.0017561736247284875, + "loss": 0.1191, + "step": 23796 + }, + { + "epoch": 0.206569387418512, + "grad_norm": 0.314453125, + "learning_rate": 0.0017561532541537181, + "loss": 0.1006, + "step": 23797 + }, + { + "epoch": 0.20657806789871616, + "grad_norm": 0.0810546875, + "learning_rate": 0.0017561328828613896, + "loss": 0.1157, + "step": 23798 + }, + { + "epoch": 0.20658674837892033, + "grad_norm": 0.11181640625, + "learning_rate": 0.0017561125108515238, + "loss": 0.085, + "step": 23799 + }, + { + "epoch": 0.2065954288591245, + "grad_norm": 0.130859375, + "learning_rate": 0.0017560921381241435, + "loss": 0.1162, + "step": 23800 + }, + { + "epoch": 0.20660410933932866, + "grad_norm": 0.46875, + "learning_rate": 0.0017560717646792703, + "loss": 0.1074, + "step": 23801 + }, + { + "epoch": 0.20661278981953282, + "grad_norm": 0.216796875, + "learning_rate": 0.0017560513905169274, + "loss": 0.1064, + "step": 23802 + }, + { + "epoch": 0.206621470299737, + "grad_norm": 0.65234375, + "learning_rate": 0.001756031015637136, + "loss": 0.1191, + "step": 23803 + }, + { + "epoch": 0.20663015077994115, + "grad_norm": 0.7421875, + "learning_rate": 0.0017560106400399194, + "loss": 0.1406, + "step": 23804 + }, + { + "epoch": 0.20663883126014532, + "grad_norm": 0.1416015625, + "learning_rate": 0.0017559902637252992, + "loss": 0.1172, + "step": 23805 + }, + { + "epoch": 0.20664751174034948, + "grad_norm": 0.1640625, + "learning_rate": 0.0017559698866932978, + "loss": 0.0859, + "step": 23806 + }, + { + "epoch": 0.20665619222055365, + "grad_norm": 0.12890625, + "learning_rate": 0.0017559495089439377, + "loss": 0.106, + "step": 23807 + }, + { + "epoch": 0.2066648727007578, + "grad_norm": 0.26953125, + "learning_rate": 0.001755929130477241, + "loss": 0.1357, + "step": 23808 + }, + { + "epoch": 0.20667355318096198, + "grad_norm": 0.15234375, + "learning_rate": 0.0017559087512932306, + "loss": 0.1182, + "step": 23809 + }, + { + "epoch": 0.20668223366116614, + "grad_norm": 0.48828125, + "learning_rate": 0.0017558883713919276, + "loss": 0.126, + "step": 23810 + }, + { + "epoch": 0.2066909141413703, + "grad_norm": 0.52734375, + "learning_rate": 0.0017558679907733552, + "loss": 0.1045, + "step": 23811 + }, + { + "epoch": 0.20669959462157447, + "grad_norm": 0.224609375, + "learning_rate": 0.0017558476094375352, + "loss": 0.124, + "step": 23812 + }, + { + "epoch": 0.20670827510177864, + "grad_norm": 0.095703125, + "learning_rate": 
0.0017558272273844903, + "loss": 0.124, + "step": 23813 + }, + { + "epoch": 0.2067169555819828, + "grad_norm": 1.5625, + "learning_rate": 0.0017558068446142426, + "loss": 0.2383, + "step": 23814 + }, + { + "epoch": 0.20672563606218697, + "grad_norm": 0.796875, + "learning_rate": 0.0017557864611268143, + "loss": 0.1309, + "step": 23815 + }, + { + "epoch": 0.20673431654239113, + "grad_norm": 0.220703125, + "learning_rate": 0.0017557660769222277, + "loss": 0.1025, + "step": 23816 + }, + { + "epoch": 0.2067429970225953, + "grad_norm": 0.236328125, + "learning_rate": 0.0017557456920005055, + "loss": 0.0825, + "step": 23817 + }, + { + "epoch": 0.20675167750279946, + "grad_norm": 0.439453125, + "learning_rate": 0.0017557253063616694, + "loss": 0.1157, + "step": 23818 + }, + { + "epoch": 0.20676035798300363, + "grad_norm": 0.26953125, + "learning_rate": 0.001755704920005742, + "loss": 0.1709, + "step": 23819 + }, + { + "epoch": 0.2067690384632078, + "grad_norm": 0.58984375, + "learning_rate": 0.0017556845329327457, + "loss": 0.1094, + "step": 23820 + }, + { + "epoch": 0.20677771894341196, + "grad_norm": 0.1962890625, + "learning_rate": 0.0017556641451427022, + "loss": 0.1177, + "step": 23821 + }, + { + "epoch": 0.20678639942361612, + "grad_norm": 0.154296875, + "learning_rate": 0.0017556437566356349, + "loss": 0.127, + "step": 23822 + }, + { + "epoch": 0.2067950799038203, + "grad_norm": 0.5703125, + "learning_rate": 0.001755623367411565, + "loss": 0.1211, + "step": 23823 + }, + { + "epoch": 0.20680376038402445, + "grad_norm": 0.10595703125, + "learning_rate": 0.0017556029774705153, + "loss": 0.1406, + "step": 23824 + }, + { + "epoch": 0.20681244086422862, + "grad_norm": 0.1484375, + "learning_rate": 0.0017555825868125082, + "loss": 0.1104, + "step": 23825 + }, + { + "epoch": 0.20682112134443278, + "grad_norm": 0.34765625, + "learning_rate": 0.0017555621954375657, + "loss": 0.1094, + "step": 23826 + }, + { + "epoch": 0.20682980182463695, + "grad_norm": 0.796875, + "learning_rate": 0.00175554180334571, + "loss": 0.1016, + "step": 23827 + }, + { + "epoch": 0.20683848230484111, + "grad_norm": 0.2490234375, + "learning_rate": 0.001755521410536964, + "loss": 0.0742, + "step": 23828 + }, + { + "epoch": 0.20684716278504528, + "grad_norm": 0.56640625, + "learning_rate": 0.0017555010170113492, + "loss": 0.1152, + "step": 23829 + }, + { + "epoch": 0.20685584326524944, + "grad_norm": 0.109375, + "learning_rate": 0.0017554806227688888, + "loss": 0.1328, + "step": 23830 + }, + { + "epoch": 0.2068645237454536, + "grad_norm": 0.1474609375, + "learning_rate": 0.0017554602278096049, + "loss": 0.1465, + "step": 23831 + }, + { + "epoch": 0.20687320422565778, + "grad_norm": 0.28515625, + "learning_rate": 0.0017554398321335187, + "loss": 0.1011, + "step": 23832 + }, + { + "epoch": 0.20688188470586194, + "grad_norm": 0.400390625, + "learning_rate": 0.0017554194357406539, + "loss": 0.0972, + "step": 23833 + }, + { + "epoch": 0.2068905651860661, + "grad_norm": 0.6953125, + "learning_rate": 0.001755399038631032, + "loss": 0.1309, + "step": 23834 + }, + { + "epoch": 0.20689924566627027, + "grad_norm": 0.58984375, + "learning_rate": 0.0017553786408046757, + "loss": 0.1289, + "step": 23835 + }, + { + "epoch": 0.20690792614647444, + "grad_norm": 0.88671875, + "learning_rate": 0.0017553582422616073, + "loss": 0.1143, + "step": 23836 + }, + { + "epoch": 0.2069166066266786, + "grad_norm": 0.34375, + "learning_rate": 0.0017553378430018487, + "loss": 0.1748, + "step": 23837 + }, + { + "epoch": 0.20692528710688277, + "grad_norm": 
0.71875, + "learning_rate": 0.0017553174430254227, + "loss": 0.1099, + "step": 23838 + }, + { + "epoch": 0.20693396758708693, + "grad_norm": 0.146484375, + "learning_rate": 0.001755297042332351, + "loss": 0.1318, + "step": 23839 + }, + { + "epoch": 0.2069426480672911, + "grad_norm": 0.1435546875, + "learning_rate": 0.0017552766409226568, + "loss": 0.0947, + "step": 23840 + }, + { + "epoch": 0.20695132854749526, + "grad_norm": 0.55078125, + "learning_rate": 0.0017552562387963618, + "loss": 0.1445, + "step": 23841 + }, + { + "epoch": 0.2069600090276994, + "grad_norm": 0.5625, + "learning_rate": 0.001755235835953488, + "loss": 0.1309, + "step": 23842 + }, + { + "epoch": 0.20696868950790356, + "grad_norm": 0.1044921875, + "learning_rate": 0.0017552154323940586, + "loss": 0.1152, + "step": 23843 + }, + { + "epoch": 0.20697736998810773, + "grad_norm": 0.55078125, + "learning_rate": 0.0017551950281180953, + "loss": 0.1582, + "step": 23844 + }, + { + "epoch": 0.2069860504683119, + "grad_norm": 0.39453125, + "learning_rate": 0.0017551746231256205, + "loss": 0.126, + "step": 23845 + }, + { + "epoch": 0.20699473094851606, + "grad_norm": 0.173828125, + "learning_rate": 0.0017551542174166566, + "loss": 0.1289, + "step": 23846 + }, + { + "epoch": 0.20700341142872022, + "grad_norm": 0.1572265625, + "learning_rate": 0.001755133810991226, + "loss": 0.0845, + "step": 23847 + }, + { + "epoch": 0.2070120919089244, + "grad_norm": 0.3125, + "learning_rate": 0.0017551134038493509, + "loss": 0.126, + "step": 23848 + }, + { + "epoch": 0.20702077238912855, + "grad_norm": 0.318359375, + "learning_rate": 0.0017550929959910535, + "loss": 0.1035, + "step": 23849 + }, + { + "epoch": 0.20702945286933272, + "grad_norm": 0.08349609375, + "learning_rate": 0.0017550725874163563, + "loss": 0.125, + "step": 23850 + }, + { + "epoch": 0.20703813334953688, + "grad_norm": 1.203125, + "learning_rate": 0.0017550521781252815, + "loss": 0.1084, + "step": 23851 + }, + { + "epoch": 0.20704681382974105, + "grad_norm": 0.486328125, + "learning_rate": 0.0017550317681178517, + "loss": 0.1201, + "step": 23852 + }, + { + "epoch": 0.20705549430994521, + "grad_norm": 0.33984375, + "learning_rate": 0.0017550113573940888, + "loss": 0.1104, + "step": 23853 + }, + { + "epoch": 0.20706417479014938, + "grad_norm": 0.73046875, + "learning_rate": 0.0017549909459540155, + "loss": 0.126, + "step": 23854 + }, + { + "epoch": 0.20707285527035355, + "grad_norm": 0.12451171875, + "learning_rate": 0.0017549705337976537, + "loss": 0.1074, + "step": 23855 + }, + { + "epoch": 0.2070815357505577, + "grad_norm": 0.51171875, + "learning_rate": 0.0017549501209250262, + "loss": 0.1104, + "step": 23856 + }, + { + "epoch": 0.20709021623076188, + "grad_norm": 0.16796875, + "learning_rate": 0.001754929707336155, + "loss": 0.0854, + "step": 23857 + }, + { + "epoch": 0.20709889671096604, + "grad_norm": 0.11279296875, + "learning_rate": 0.0017549092930310621, + "loss": 0.1201, + "step": 23858 + }, + { + "epoch": 0.2071075771911702, + "grad_norm": 0.0869140625, + "learning_rate": 0.001754888878009771, + "loss": 0.1094, + "step": 23859 + }, + { + "epoch": 0.20711625767137437, + "grad_norm": 0.1796875, + "learning_rate": 0.0017548684622723026, + "loss": 0.1377, + "step": 23860 + }, + { + "epoch": 0.20712493815157854, + "grad_norm": 0.1435546875, + "learning_rate": 0.0017548480458186805, + "loss": 0.1045, + "step": 23861 + }, + { + "epoch": 0.2071336186317827, + "grad_norm": 0.51171875, + "learning_rate": 0.001754827628648926, + "loss": 0.0742, + "step": 23862 + }, + { + 
"epoch": 0.20714229911198687, + "grad_norm": 0.31640625, + "learning_rate": 0.0017548072107630619, + "loss": 0.0918, + "step": 23863 + }, + { + "epoch": 0.20715097959219103, + "grad_norm": 0.1787109375, + "learning_rate": 0.0017547867921611105, + "loss": 0.1348, + "step": 23864 + }, + { + "epoch": 0.2071596600723952, + "grad_norm": 0.265625, + "learning_rate": 0.0017547663728430941, + "loss": 0.1377, + "step": 23865 + }, + { + "epoch": 0.20716834055259936, + "grad_norm": 0.4453125, + "learning_rate": 0.0017547459528090348, + "loss": 0.1021, + "step": 23866 + }, + { + "epoch": 0.20717702103280353, + "grad_norm": 0.2294921875, + "learning_rate": 0.0017547255320589557, + "loss": 0.0854, + "step": 23867 + }, + { + "epoch": 0.2071857015130077, + "grad_norm": 0.11279296875, + "learning_rate": 0.0017547051105928782, + "loss": 0.1089, + "step": 23868 + }, + { + "epoch": 0.20719438199321186, + "grad_norm": 0.0908203125, + "learning_rate": 0.001754684688410825, + "loss": 0.1167, + "step": 23869 + }, + { + "epoch": 0.20720306247341602, + "grad_norm": 0.1142578125, + "learning_rate": 0.0017546642655128185, + "loss": 0.1475, + "step": 23870 + }, + { + "epoch": 0.2072117429536202, + "grad_norm": 0.2578125, + "learning_rate": 0.0017546438418988813, + "loss": 0.1138, + "step": 23871 + }, + { + "epoch": 0.20722042343382435, + "grad_norm": 0.21484375, + "learning_rate": 0.001754623417569035, + "loss": 0.1089, + "step": 23872 + }, + { + "epoch": 0.20722910391402852, + "grad_norm": 0.6015625, + "learning_rate": 0.0017546029925233028, + "loss": 0.1719, + "step": 23873 + }, + { + "epoch": 0.20723778439423268, + "grad_norm": 0.357421875, + "learning_rate": 0.0017545825667617061, + "loss": 0.0688, + "step": 23874 + }, + { + "epoch": 0.20724646487443685, + "grad_norm": 0.2021484375, + "learning_rate": 0.0017545621402842682, + "loss": 0.124, + "step": 23875 + }, + { + "epoch": 0.207255145354641, + "grad_norm": 0.09619140625, + "learning_rate": 0.0017545417130910106, + "loss": 0.1177, + "step": 23876 + }, + { + "epoch": 0.20726382583484518, + "grad_norm": 0.6171875, + "learning_rate": 0.0017545212851819564, + "loss": 0.1113, + "step": 23877 + }, + { + "epoch": 0.20727250631504934, + "grad_norm": 0.0888671875, + "learning_rate": 0.0017545008565571273, + "loss": 0.1445, + "step": 23878 + }, + { + "epoch": 0.2072811867952535, + "grad_norm": 0.1455078125, + "learning_rate": 0.001754480427216546, + "loss": 0.1279, + "step": 23879 + }, + { + "epoch": 0.20728986727545767, + "grad_norm": 0.1220703125, + "learning_rate": 0.0017544599971602347, + "loss": 0.1211, + "step": 23880 + }, + { + "epoch": 0.20729854775566184, + "grad_norm": 0.193359375, + "learning_rate": 0.0017544395663882158, + "loss": 0.1074, + "step": 23881 + }, + { + "epoch": 0.207307228235866, + "grad_norm": 0.09423828125, + "learning_rate": 0.0017544191349005115, + "loss": 0.1123, + "step": 23882 + }, + { + "epoch": 0.20731590871607017, + "grad_norm": 0.625, + "learning_rate": 0.0017543987026971445, + "loss": 0.1094, + "step": 23883 + }, + { + "epoch": 0.20732458919627433, + "grad_norm": 0.4140625, + "learning_rate": 0.0017543782697781367, + "loss": 0.0977, + "step": 23884 + }, + { + "epoch": 0.2073332696764785, + "grad_norm": 0.18359375, + "learning_rate": 0.0017543578361435106, + "loss": 0.1006, + "step": 23885 + }, + { + "epoch": 0.20734195015668266, + "grad_norm": 0.416015625, + "learning_rate": 0.0017543374017932887, + "loss": 0.1123, + "step": 23886 + }, + { + "epoch": 0.20735063063688683, + "grad_norm": 0.2080078125, + "learning_rate": 
0.0017543169667274933, + "loss": 0.1084, + "step": 23887 + }, + { + "epoch": 0.207359311117091, + "grad_norm": 0.0830078125, + "learning_rate": 0.0017542965309461469, + "loss": 0.1133, + "step": 23888 + }, + { + "epoch": 0.20736799159729516, + "grad_norm": 0.455078125, + "learning_rate": 0.0017542760944492712, + "loss": 0.1074, + "step": 23889 + }, + { + "epoch": 0.20737667207749932, + "grad_norm": 0.4609375, + "learning_rate": 0.0017542556572368894, + "loss": 0.1367, + "step": 23890 + }, + { + "epoch": 0.2073853525577035, + "grad_norm": 0.58984375, + "learning_rate": 0.0017542352193090232, + "loss": 0.1035, + "step": 23891 + }, + { + "epoch": 0.20739403303790765, + "grad_norm": 0.1806640625, + "learning_rate": 0.0017542147806656953, + "loss": 0.1089, + "step": 23892 + }, + { + "epoch": 0.20740271351811182, + "grad_norm": 0.609375, + "learning_rate": 0.001754194341306928, + "loss": 0.1367, + "step": 23893 + }, + { + "epoch": 0.20741139399831598, + "grad_norm": 0.076171875, + "learning_rate": 0.0017541739012327435, + "loss": 0.1157, + "step": 23894 + }, + { + "epoch": 0.20742007447852015, + "grad_norm": 0.380859375, + "learning_rate": 0.0017541534604431641, + "loss": 0.1582, + "step": 23895 + }, + { + "epoch": 0.20742875495872432, + "grad_norm": 0.66796875, + "learning_rate": 0.0017541330189382125, + "loss": 0.1035, + "step": 23896 + }, + { + "epoch": 0.20743743543892848, + "grad_norm": 0.1875, + "learning_rate": 0.001754112576717911, + "loss": 0.1543, + "step": 23897 + }, + { + "epoch": 0.20744611591913265, + "grad_norm": 0.12255859375, + "learning_rate": 0.0017540921337822815, + "loss": 0.1118, + "step": 23898 + }, + { + "epoch": 0.2074547963993368, + "grad_norm": 0.138671875, + "learning_rate": 0.0017540716901313469, + "loss": 0.1357, + "step": 23899 + }, + { + "epoch": 0.20746347687954098, + "grad_norm": 0.12890625, + "learning_rate": 0.0017540512457651291, + "loss": 0.0918, + "step": 23900 + }, + { + "epoch": 0.20747215735974514, + "grad_norm": 0.23828125, + "learning_rate": 0.0017540308006836506, + "loss": 0.0903, + "step": 23901 + }, + { + "epoch": 0.2074808378399493, + "grad_norm": 0.1279296875, + "learning_rate": 0.0017540103548869342, + "loss": 0.1006, + "step": 23902 + }, + { + "epoch": 0.20748951832015347, + "grad_norm": 0.5078125, + "learning_rate": 0.001753989908375002, + "loss": 0.0947, + "step": 23903 + }, + { + "epoch": 0.20749819880035764, + "grad_norm": 0.291015625, + "learning_rate": 0.001753969461147876, + "loss": 0.1113, + "step": 23904 + }, + { + "epoch": 0.2075068792805618, + "grad_norm": 0.52734375, + "learning_rate": 0.0017539490132055789, + "loss": 0.1133, + "step": 23905 + }, + { + "epoch": 0.20751555976076597, + "grad_norm": 0.1337890625, + "learning_rate": 0.001753928564548133, + "loss": 0.1348, + "step": 23906 + }, + { + "epoch": 0.20752424024097013, + "grad_norm": 0.1220703125, + "learning_rate": 0.0017539081151755605, + "loss": 0.1035, + "step": 23907 + }, + { + "epoch": 0.2075329207211743, + "grad_norm": 0.1328125, + "learning_rate": 0.0017538876650878838, + "loss": 0.1035, + "step": 23908 + }, + { + "epoch": 0.20754160120137846, + "grad_norm": 0.2578125, + "learning_rate": 0.0017538672142851256, + "loss": 0.1143, + "step": 23909 + }, + { + "epoch": 0.20755028168158263, + "grad_norm": 0.47265625, + "learning_rate": 0.001753846762767308, + "loss": 0.1123, + "step": 23910 + }, + { + "epoch": 0.2075589621617868, + "grad_norm": 0.302734375, + "learning_rate": 0.0017538263105344534, + "loss": 0.127, + "step": 23911 + }, + { + "epoch": 0.20756764264199096, + 
"grad_norm": 0.134765625, + "learning_rate": 0.0017538058575865842, + "loss": 0.1084, + "step": 23912 + }, + { + "epoch": 0.20757632312219512, + "grad_norm": 0.19140625, + "learning_rate": 0.001753785403923723, + "loss": 0.1108, + "step": 23913 + }, + { + "epoch": 0.2075850036023993, + "grad_norm": 0.291015625, + "learning_rate": 0.0017537649495458915, + "loss": 0.0996, + "step": 23914 + }, + { + "epoch": 0.20759368408260345, + "grad_norm": 0.46484375, + "learning_rate": 0.0017537444944531121, + "loss": 0.1113, + "step": 23915 + }, + { + "epoch": 0.20760236456280762, + "grad_norm": 0.640625, + "learning_rate": 0.0017537240386454084, + "loss": 0.104, + "step": 23916 + }, + { + "epoch": 0.20761104504301178, + "grad_norm": 0.29296875, + "learning_rate": 0.0017537035821228015, + "loss": 0.1436, + "step": 23917 + }, + { + "epoch": 0.20761972552321595, + "grad_norm": 0.5390625, + "learning_rate": 0.0017536831248853144, + "loss": 0.082, + "step": 23918 + }, + { + "epoch": 0.2076284060034201, + "grad_norm": 0.0966796875, + "learning_rate": 0.001753662666932969, + "loss": 0.1309, + "step": 23919 + }, + { + "epoch": 0.20763708648362428, + "grad_norm": 0.162109375, + "learning_rate": 0.001753642208265788, + "loss": 0.0913, + "step": 23920 + }, + { + "epoch": 0.20764576696382844, + "grad_norm": 0.5390625, + "learning_rate": 0.0017536217488837936, + "loss": 0.1143, + "step": 23921 + }, + { + "epoch": 0.2076544474440326, + "grad_norm": 0.98046875, + "learning_rate": 0.0017536012887870084, + "loss": 0.0938, + "step": 23922 + }, + { + "epoch": 0.20766312792423677, + "grad_norm": 1.171875, + "learning_rate": 0.0017535808279754547, + "loss": 0.1348, + "step": 23923 + }, + { + "epoch": 0.20767180840444094, + "grad_norm": 0.109375, + "learning_rate": 0.0017535603664491548, + "loss": 0.1152, + "step": 23924 + }, + { + "epoch": 0.2076804888846451, + "grad_norm": 0.09521484375, + "learning_rate": 0.001753539904208131, + "loss": 0.0957, + "step": 23925 + }, + { + "epoch": 0.20768916936484927, + "grad_norm": 0.4375, + "learning_rate": 0.001753519441252406, + "loss": 0.1006, + "step": 23926 + }, + { + "epoch": 0.20769784984505343, + "grad_norm": 0.2275390625, + "learning_rate": 0.001753498977582002, + "loss": 0.1367, + "step": 23927 + }, + { + "epoch": 0.2077065303252576, + "grad_norm": 0.232421875, + "learning_rate": 0.0017534785131969408, + "loss": 0.0698, + "step": 23928 + }, + { + "epoch": 0.20771521080546176, + "grad_norm": 0.8125, + "learning_rate": 0.0017534580480972456, + "loss": 0.1211, + "step": 23929 + }, + { + "epoch": 0.20772389128566593, + "grad_norm": 0.43359375, + "learning_rate": 0.0017534375822829387, + "loss": 0.0986, + "step": 23930 + }, + { + "epoch": 0.2077325717658701, + "grad_norm": 0.376953125, + "learning_rate": 0.0017534171157540422, + "loss": 0.1133, + "step": 23931 + }, + { + "epoch": 0.20774125224607426, + "grad_norm": 0.78515625, + "learning_rate": 0.0017533966485105783, + "loss": 0.1318, + "step": 23932 + }, + { + "epoch": 0.20774993272627842, + "grad_norm": 0.333984375, + "learning_rate": 0.00175337618055257, + "loss": 0.0845, + "step": 23933 + }, + { + "epoch": 0.2077586132064826, + "grad_norm": 0.314453125, + "learning_rate": 0.0017533557118800391, + "loss": 0.1641, + "step": 23934 + }, + { + "epoch": 0.20776729368668675, + "grad_norm": 0.107421875, + "learning_rate": 0.0017533352424930085, + "loss": 0.127, + "step": 23935 + }, + { + "epoch": 0.20777597416689092, + "grad_norm": 0.158203125, + "learning_rate": 0.0017533147723915, + "loss": 0.1211, + "step": 23936 + }, + { + 
"epoch": 0.20778465464709509, + "grad_norm": 0.099609375, + "learning_rate": 0.0017532943015755363, + "loss": 0.0947, + "step": 23937 + }, + { + "epoch": 0.20779333512729925, + "grad_norm": 0.482421875, + "learning_rate": 0.00175327383004514, + "loss": 0.0908, + "step": 23938 + }, + { + "epoch": 0.20780201560750342, + "grad_norm": 0.25390625, + "learning_rate": 0.001753253357800333, + "loss": 0.0933, + "step": 23939 + }, + { + "epoch": 0.20781069608770758, + "grad_norm": 0.373046875, + "learning_rate": 0.001753232884841138, + "loss": 0.1836, + "step": 23940 + }, + { + "epoch": 0.20781937656791175, + "grad_norm": 0.15625, + "learning_rate": 0.0017532124111675773, + "loss": 0.0708, + "step": 23941 + }, + { + "epoch": 0.2078280570481159, + "grad_norm": 0.2177734375, + "learning_rate": 0.0017531919367796734, + "loss": 0.125, + "step": 23942 + }, + { + "epoch": 0.20783673752832008, + "grad_norm": 0.109375, + "learning_rate": 0.0017531714616774486, + "loss": 0.1021, + "step": 23943 + }, + { + "epoch": 0.20784541800852424, + "grad_norm": 0.1181640625, + "learning_rate": 0.0017531509858609254, + "loss": 0.1079, + "step": 23944 + }, + { + "epoch": 0.2078540984887284, + "grad_norm": 0.2412109375, + "learning_rate": 0.001753130509330126, + "loss": 0.0859, + "step": 23945 + }, + { + "epoch": 0.20786277896893257, + "grad_norm": 0.212890625, + "learning_rate": 0.0017531100320850729, + "loss": 0.0732, + "step": 23946 + }, + { + "epoch": 0.20787145944913674, + "grad_norm": 0.2236328125, + "learning_rate": 0.0017530895541257884, + "loss": 0.0977, + "step": 23947 + }, + { + "epoch": 0.2078801399293409, + "grad_norm": 0.478515625, + "learning_rate": 0.0017530690754522951, + "loss": 0.0894, + "step": 23948 + }, + { + "epoch": 0.20788882040954507, + "grad_norm": 0.43359375, + "learning_rate": 0.0017530485960646149, + "loss": 0.0947, + "step": 23949 + }, + { + "epoch": 0.20789750088974923, + "grad_norm": 0.08056640625, + "learning_rate": 0.001753028115962771, + "loss": 0.0757, + "step": 23950 + }, + { + "epoch": 0.2079061813699534, + "grad_norm": 0.1513671875, + "learning_rate": 0.0017530076351467852, + "loss": 0.1133, + "step": 23951 + }, + { + "epoch": 0.20791486185015756, + "grad_norm": 0.62890625, + "learning_rate": 0.00175298715361668, + "loss": 0.083, + "step": 23952 + }, + { + "epoch": 0.20792354233036173, + "grad_norm": 0.53125, + "learning_rate": 0.001752966671372478, + "loss": 0.124, + "step": 23953 + }, + { + "epoch": 0.2079322228105659, + "grad_norm": 0.67578125, + "learning_rate": 0.0017529461884142016, + "loss": 0.1289, + "step": 23954 + }, + { + "epoch": 0.20794090329077006, + "grad_norm": 0.65625, + "learning_rate": 0.0017529257047418727, + "loss": 0.125, + "step": 23955 + }, + { + "epoch": 0.20794958377097422, + "grad_norm": 0.1044921875, + "learning_rate": 0.0017529052203555143, + "loss": 0.1338, + "step": 23956 + }, + { + "epoch": 0.2079582642511784, + "grad_norm": 0.2275390625, + "learning_rate": 0.0017528847352551488, + "loss": 0.1133, + "step": 23957 + }, + { + "epoch": 0.20796694473138255, + "grad_norm": 0.09228515625, + "learning_rate": 0.001752864249440798, + "loss": 0.0898, + "step": 23958 + }, + { + "epoch": 0.20797562521158672, + "grad_norm": 0.146484375, + "learning_rate": 0.0017528437629124845, + "loss": 0.1201, + "step": 23959 + }, + { + "epoch": 0.20798430569179088, + "grad_norm": 0.14453125, + "learning_rate": 0.0017528232756702311, + "loss": 0.123, + "step": 23960 + }, + { + "epoch": 0.20799298617199505, + "grad_norm": 0.2451171875, + "learning_rate": 0.00175280278771406, + 
"loss": 0.126, + "step": 23961 + }, + { + "epoch": 0.2080016666521992, + "grad_norm": 0.251953125, + "learning_rate": 0.0017527822990439934, + "loss": 0.0996, + "step": 23962 + }, + { + "epoch": 0.20801034713240338, + "grad_norm": 0.31640625, + "learning_rate": 0.0017527618096600539, + "loss": 0.1338, + "step": 23963 + }, + { + "epoch": 0.20801902761260754, + "grad_norm": 0.890625, + "learning_rate": 0.0017527413195622644, + "loss": 0.1396, + "step": 23964 + }, + { + "epoch": 0.20802770809281168, + "grad_norm": 0.1328125, + "learning_rate": 0.0017527208287506464, + "loss": 0.0659, + "step": 23965 + }, + { + "epoch": 0.20803638857301585, + "grad_norm": 0.1845703125, + "learning_rate": 0.0017527003372252228, + "loss": 0.1221, + "step": 23966 + }, + { + "epoch": 0.20804506905322, + "grad_norm": 0.54296875, + "learning_rate": 0.0017526798449860157, + "loss": 0.1191, + "step": 23967 + }, + { + "epoch": 0.20805374953342418, + "grad_norm": 0.4296875, + "learning_rate": 0.0017526593520330478, + "loss": 0.1216, + "step": 23968 + }, + { + "epoch": 0.20806243001362834, + "grad_norm": 0.1416015625, + "learning_rate": 0.0017526388583663415, + "loss": 0.0981, + "step": 23969 + }, + { + "epoch": 0.2080711104938325, + "grad_norm": 0.1474609375, + "learning_rate": 0.001752618363985919, + "loss": 0.1582, + "step": 23970 + }, + { + "epoch": 0.20807979097403667, + "grad_norm": 0.1357421875, + "learning_rate": 0.0017525978688918032, + "loss": 0.1309, + "step": 23971 + }, + { + "epoch": 0.20808847145424084, + "grad_norm": 0.2099609375, + "learning_rate": 0.001752577373084016, + "loss": 0.0825, + "step": 23972 + }, + { + "epoch": 0.208097151934445, + "grad_norm": 0.373046875, + "learning_rate": 0.00175255687656258, + "loss": 0.0884, + "step": 23973 + }, + { + "epoch": 0.20810583241464917, + "grad_norm": 0.37890625, + "learning_rate": 0.0017525363793275178, + "loss": 0.1006, + "step": 23974 + }, + { + "epoch": 0.20811451289485333, + "grad_norm": 0.486328125, + "learning_rate": 0.0017525158813788513, + "loss": 0.127, + "step": 23975 + }, + { + "epoch": 0.2081231933750575, + "grad_norm": 0.20703125, + "learning_rate": 0.0017524953827166032, + "loss": 0.1865, + "step": 23976 + }, + { + "epoch": 0.20813187385526166, + "grad_norm": 0.203125, + "learning_rate": 0.001752474883340796, + "loss": 0.1235, + "step": 23977 + }, + { + "epoch": 0.20814055433546583, + "grad_norm": 0.474609375, + "learning_rate": 0.0017524543832514524, + "loss": 0.127, + "step": 23978 + }, + { + "epoch": 0.20814923481567, + "grad_norm": 0.1943359375, + "learning_rate": 0.0017524338824485943, + "loss": 0.0977, + "step": 23979 + }, + { + "epoch": 0.20815791529587416, + "grad_norm": 0.33203125, + "learning_rate": 0.0017524133809322445, + "loss": 0.1279, + "step": 23980 + }, + { + "epoch": 0.20816659577607832, + "grad_norm": 0.5703125, + "learning_rate": 0.0017523928787024248, + "loss": 0.123, + "step": 23981 + }, + { + "epoch": 0.2081752762562825, + "grad_norm": 0.50390625, + "learning_rate": 0.0017523723757591583, + "loss": 0.0957, + "step": 23982 + }, + { + "epoch": 0.20818395673648665, + "grad_norm": 0.2119140625, + "learning_rate": 0.001752351872102467, + "loss": 0.1328, + "step": 23983 + }, + { + "epoch": 0.20819263721669082, + "grad_norm": 0.232421875, + "learning_rate": 0.0017523313677323738, + "loss": 0.0713, + "step": 23984 + }, + { + "epoch": 0.20820131769689498, + "grad_norm": 0.1240234375, + "learning_rate": 0.0017523108626489006, + "loss": 0.1191, + "step": 23985 + }, + { + "epoch": 0.20820999817709915, + "grad_norm": 0.7109375, + 
"learning_rate": 0.0017522903568520701, + "loss": 0.1138, + "step": 23986 + }, + { + "epoch": 0.2082186786573033, + "grad_norm": 0.875, + "learning_rate": 0.0017522698503419048, + "loss": 0.1328, + "step": 23987 + }, + { + "epoch": 0.20822735913750748, + "grad_norm": 0.64453125, + "learning_rate": 0.0017522493431184265, + "loss": 0.1738, + "step": 23988 + }, + { + "epoch": 0.20823603961771164, + "grad_norm": 0.1015625, + "learning_rate": 0.0017522288351816586, + "loss": 0.1377, + "step": 23989 + }, + { + "epoch": 0.2082447200979158, + "grad_norm": 0.1572265625, + "learning_rate": 0.0017522083265316228, + "loss": 0.1016, + "step": 23990 + }, + { + "epoch": 0.20825340057811997, + "grad_norm": 0.296875, + "learning_rate": 0.001752187817168342, + "loss": 0.1147, + "step": 23991 + }, + { + "epoch": 0.20826208105832414, + "grad_norm": 0.11083984375, + "learning_rate": 0.0017521673070918383, + "loss": 0.1455, + "step": 23992 + }, + { + "epoch": 0.2082707615385283, + "grad_norm": 0.484375, + "learning_rate": 0.0017521467963021343, + "loss": 0.1084, + "step": 23993 + }, + { + "epoch": 0.20827944201873247, + "grad_norm": 0.232421875, + "learning_rate": 0.0017521262847992522, + "loss": 0.0903, + "step": 23994 + }, + { + "epoch": 0.20828812249893663, + "grad_norm": 0.32421875, + "learning_rate": 0.0017521057725832145, + "loss": 0.1113, + "step": 23995 + }, + { + "epoch": 0.2082968029791408, + "grad_norm": 0.09228515625, + "learning_rate": 0.001752085259654044, + "loss": 0.1079, + "step": 23996 + }, + { + "epoch": 0.20830548345934496, + "grad_norm": 0.201171875, + "learning_rate": 0.001752064746011763, + "loss": 0.1221, + "step": 23997 + }, + { + "epoch": 0.20831416393954913, + "grad_norm": 0.1748046875, + "learning_rate": 0.0017520442316563934, + "loss": 0.1157, + "step": 23998 + }, + { + "epoch": 0.2083228444197533, + "grad_norm": 0.546875, + "learning_rate": 0.0017520237165879581, + "loss": 0.1318, + "step": 23999 + }, + { + "epoch": 0.20833152489995746, + "grad_norm": 0.19921875, + "learning_rate": 0.0017520032008064795, + "loss": 0.0977, + "step": 24000 + }, + { + "epoch": 0.20834020538016162, + "grad_norm": 0.271484375, + "learning_rate": 0.0017519826843119801, + "loss": 0.1318, + "step": 24001 + }, + { + "epoch": 0.2083488858603658, + "grad_norm": 0.1025390625, + "learning_rate": 0.001751962167104482, + "loss": 0.1162, + "step": 24002 + }, + { + "epoch": 0.20835756634056996, + "grad_norm": 0.244140625, + "learning_rate": 0.0017519416491840081, + "loss": 0.1172, + "step": 24003 + }, + { + "epoch": 0.20836624682077412, + "grad_norm": 0.2451171875, + "learning_rate": 0.0017519211305505806, + "loss": 0.0879, + "step": 24004 + }, + { + "epoch": 0.20837492730097829, + "grad_norm": 0.83203125, + "learning_rate": 0.0017519006112042218, + "loss": 0.1211, + "step": 24005 + }, + { + "epoch": 0.20838360778118245, + "grad_norm": 0.2490234375, + "learning_rate": 0.0017518800911449544, + "loss": 0.083, + "step": 24006 + }, + { + "epoch": 0.20839228826138662, + "grad_norm": 0.294921875, + "learning_rate": 0.0017518595703728006, + "loss": 0.0991, + "step": 24007 + }, + { + "epoch": 0.20840096874159078, + "grad_norm": 0.17578125, + "learning_rate": 0.0017518390488877832, + "loss": 0.1108, + "step": 24008 + }, + { + "epoch": 0.20840964922179495, + "grad_norm": 0.435546875, + "learning_rate": 0.0017518185266899244, + "loss": 0.1221, + "step": 24009 + }, + { + "epoch": 0.2084183297019991, + "grad_norm": 0.09521484375, + "learning_rate": 0.0017517980037792464, + "loss": 0.0967, + "step": 24010 + }, + { + "epoch": 
0.20842701018220328, + "grad_norm": 0.19921875, + "learning_rate": 0.0017517774801557723, + "loss": 0.1016, + "step": 24011 + }, + { + "epoch": 0.20843569066240744, + "grad_norm": 0.328125, + "learning_rate": 0.0017517569558195237, + "loss": 0.1045, + "step": 24012 + }, + { + "epoch": 0.2084443711426116, + "grad_norm": 0.40625, + "learning_rate": 0.0017517364307705235, + "loss": 0.1426, + "step": 24013 + }, + { + "epoch": 0.20845305162281577, + "grad_norm": 1.0, + "learning_rate": 0.0017517159050087945, + "loss": 0.1309, + "step": 24014 + }, + { + "epoch": 0.20846173210301994, + "grad_norm": 0.10302734375, + "learning_rate": 0.0017516953785343583, + "loss": 0.0786, + "step": 24015 + }, + { + "epoch": 0.2084704125832241, + "grad_norm": 0.291015625, + "learning_rate": 0.0017516748513472383, + "loss": 0.0723, + "step": 24016 + }, + { + "epoch": 0.20847909306342827, + "grad_norm": 0.310546875, + "learning_rate": 0.0017516543234474562, + "loss": 0.1084, + "step": 24017 + }, + { + "epoch": 0.20848777354363243, + "grad_norm": 0.44921875, + "learning_rate": 0.0017516337948350346, + "loss": 0.123, + "step": 24018 + }, + { + "epoch": 0.2084964540238366, + "grad_norm": 0.14453125, + "learning_rate": 0.0017516132655099963, + "loss": 0.1367, + "step": 24019 + }, + { + "epoch": 0.20850513450404076, + "grad_norm": 0.45703125, + "learning_rate": 0.0017515927354723635, + "loss": 0.1162, + "step": 24020 + }, + { + "epoch": 0.20851381498424493, + "grad_norm": 0.37890625, + "learning_rate": 0.0017515722047221588, + "loss": 0.1309, + "step": 24021 + }, + { + "epoch": 0.2085224954644491, + "grad_norm": 0.0859375, + "learning_rate": 0.001751551673259404, + "loss": 0.084, + "step": 24022 + }, + { + "epoch": 0.20853117594465326, + "grad_norm": 0.07861328125, + "learning_rate": 0.0017515311410841229, + "loss": 0.0977, + "step": 24023 + }, + { + "epoch": 0.20853985642485742, + "grad_norm": 0.1904296875, + "learning_rate": 0.0017515106081963366, + "loss": 0.1162, + "step": 24024 + }, + { + "epoch": 0.2085485369050616, + "grad_norm": 0.369140625, + "learning_rate": 0.0017514900745960681, + "loss": 0.0986, + "step": 24025 + }, + { + "epoch": 0.20855721738526575, + "grad_norm": 0.3984375, + "learning_rate": 0.0017514695402833398, + "loss": 0.1211, + "step": 24026 + }, + { + "epoch": 0.20856589786546992, + "grad_norm": 0.2255859375, + "learning_rate": 0.0017514490052581745, + "loss": 0.1357, + "step": 24027 + }, + { + "epoch": 0.20857457834567408, + "grad_norm": 0.10302734375, + "learning_rate": 0.0017514284695205942, + "loss": 0.1094, + "step": 24028 + }, + { + "epoch": 0.20858325882587825, + "grad_norm": 0.0830078125, + "learning_rate": 0.0017514079330706214, + "loss": 0.0845, + "step": 24029 + }, + { + "epoch": 0.2085919393060824, + "grad_norm": 0.0751953125, + "learning_rate": 0.0017513873959082787, + "loss": 0.0894, + "step": 24030 + }, + { + "epoch": 0.20860061978628658, + "grad_norm": 0.74609375, + "learning_rate": 0.0017513668580335887, + "loss": 0.1172, + "step": 24031 + }, + { + "epoch": 0.20860930026649074, + "grad_norm": 0.345703125, + "learning_rate": 0.0017513463194465736, + "loss": 0.1006, + "step": 24032 + }, + { + "epoch": 0.2086179807466949, + "grad_norm": 0.09716796875, + "learning_rate": 0.001751325780147256, + "loss": 0.1104, + "step": 24033 + }, + { + "epoch": 0.20862666122689907, + "grad_norm": 1.0390625, + "learning_rate": 0.001751305240135658, + "loss": 0.1104, + "step": 24034 + }, + { + "epoch": 0.20863534170710324, + "grad_norm": 0.703125, + "learning_rate": 0.0017512846994118026, + "loss": 
0.1006, + "step": 24035 + }, + { + "epoch": 0.2086440221873074, + "grad_norm": 0.1220703125, + "learning_rate": 0.001751264157975712, + "loss": 0.106, + "step": 24036 + }, + { + "epoch": 0.20865270266751157, + "grad_norm": 0.384765625, + "learning_rate": 0.001751243615827409, + "loss": 0.1045, + "step": 24037 + }, + { + "epoch": 0.20866138314771573, + "grad_norm": 0.52734375, + "learning_rate": 0.0017512230729669154, + "loss": 0.083, + "step": 24038 + }, + { + "epoch": 0.2086700636279199, + "grad_norm": 0.0986328125, + "learning_rate": 0.0017512025293942542, + "loss": 0.1201, + "step": 24039 + }, + { + "epoch": 0.20867874410812406, + "grad_norm": 0.08154296875, + "learning_rate": 0.0017511819851094477, + "loss": 0.0908, + "step": 24040 + }, + { + "epoch": 0.20868742458832823, + "grad_norm": 0.255859375, + "learning_rate": 0.0017511614401125185, + "loss": 0.1484, + "step": 24041 + }, + { + "epoch": 0.2086961050685324, + "grad_norm": 0.1279296875, + "learning_rate": 0.0017511408944034885, + "loss": 0.1157, + "step": 24042 + }, + { + "epoch": 0.20870478554873656, + "grad_norm": 0.2265625, + "learning_rate": 0.0017511203479823809, + "loss": 0.0859, + "step": 24043 + }, + { + "epoch": 0.20871346602894073, + "grad_norm": 0.6484375, + "learning_rate": 0.0017510998008492178, + "loss": 0.0938, + "step": 24044 + }, + { + "epoch": 0.2087221465091449, + "grad_norm": 0.1240234375, + "learning_rate": 0.0017510792530040218, + "loss": 0.166, + "step": 24045 + }, + { + "epoch": 0.20873082698934906, + "grad_norm": 0.427734375, + "learning_rate": 0.0017510587044468153, + "loss": 0.0859, + "step": 24046 + }, + { + "epoch": 0.20873950746955322, + "grad_norm": 0.1025390625, + "learning_rate": 0.0017510381551776209, + "loss": 0.1099, + "step": 24047 + }, + { + "epoch": 0.20874818794975739, + "grad_norm": 0.216796875, + "learning_rate": 0.0017510176051964608, + "loss": 0.1201, + "step": 24048 + }, + { + "epoch": 0.20875686842996155, + "grad_norm": 0.21875, + "learning_rate": 0.0017509970545033578, + "loss": 0.1099, + "step": 24049 + }, + { + "epoch": 0.20876554891016572, + "grad_norm": 0.8203125, + "learning_rate": 0.001750976503098334, + "loss": 0.1035, + "step": 24050 + }, + { + "epoch": 0.20877422939036988, + "grad_norm": 0.625, + "learning_rate": 0.001750955950981412, + "loss": 0.0972, + "step": 24051 + }, + { + "epoch": 0.20878290987057405, + "grad_norm": 1.2734375, + "learning_rate": 0.0017509353981526145, + "loss": 0.1123, + "step": 24052 + }, + { + "epoch": 0.2087915903507782, + "grad_norm": 0.63671875, + "learning_rate": 0.0017509148446119636, + "loss": 0.0996, + "step": 24053 + }, + { + "epoch": 0.20880027083098238, + "grad_norm": 0.6796875, + "learning_rate": 0.0017508942903594824, + "loss": 0.0908, + "step": 24054 + }, + { + "epoch": 0.20880895131118654, + "grad_norm": 0.92578125, + "learning_rate": 0.0017508737353951927, + "loss": 0.1348, + "step": 24055 + }, + { + "epoch": 0.2088176317913907, + "grad_norm": 0.146484375, + "learning_rate": 0.0017508531797191173, + "loss": 0.1035, + "step": 24056 + }, + { + "epoch": 0.20882631227159487, + "grad_norm": 0.390625, + "learning_rate": 0.001750832623331279, + "loss": 0.1367, + "step": 24057 + }, + { + "epoch": 0.20883499275179904, + "grad_norm": 0.2470703125, + "learning_rate": 0.0017508120662316996, + "loss": 0.123, + "step": 24058 + }, + { + "epoch": 0.2088436732320032, + "grad_norm": 0.154296875, + "learning_rate": 0.0017507915084204017, + "loss": 0.1172, + "step": 24059 + }, + { + "epoch": 0.20885235371220737, + "grad_norm": 0.46875, + "learning_rate": 
0.0017507709498974084, + "loss": 0.1592, + "step": 24060 + }, + { + "epoch": 0.20886103419241153, + "grad_norm": 0.498046875, + "learning_rate": 0.0017507503906627417, + "loss": 0.1387, + "step": 24061 + }, + { + "epoch": 0.2088697146726157, + "grad_norm": 1.625, + "learning_rate": 0.0017507298307164237, + "loss": 0.1035, + "step": 24062 + }, + { + "epoch": 0.20887839515281986, + "grad_norm": 0.1357421875, + "learning_rate": 0.0017507092700584776, + "loss": 0.1143, + "step": 24063 + }, + { + "epoch": 0.20888707563302403, + "grad_norm": 0.5703125, + "learning_rate": 0.001750688708688926, + "loss": 0.0854, + "step": 24064 + }, + { + "epoch": 0.2088957561132282, + "grad_norm": 1.15625, + "learning_rate": 0.0017506681466077905, + "loss": 0.1562, + "step": 24065 + }, + { + "epoch": 0.20890443659343236, + "grad_norm": 0.53515625, + "learning_rate": 0.001750647583815094, + "loss": 0.1162, + "step": 24066 + }, + { + "epoch": 0.20891311707363652, + "grad_norm": 0.0849609375, + "learning_rate": 0.0017506270203108596, + "loss": 0.0879, + "step": 24067 + }, + { + "epoch": 0.2089217975538407, + "grad_norm": 0.462890625, + "learning_rate": 0.001750606456095109, + "loss": 0.0918, + "step": 24068 + }, + { + "epoch": 0.20893047803404485, + "grad_norm": 0.1591796875, + "learning_rate": 0.0017505858911678652, + "loss": 0.1211, + "step": 24069 + }, + { + "epoch": 0.20893915851424902, + "grad_norm": 0.8046875, + "learning_rate": 0.00175056532552915, + "loss": 0.0991, + "step": 24070 + }, + { + "epoch": 0.20894783899445318, + "grad_norm": 0.84375, + "learning_rate": 0.0017505447591789865, + "loss": 0.0933, + "step": 24071 + }, + { + "epoch": 0.20895651947465735, + "grad_norm": 0.490234375, + "learning_rate": 0.0017505241921173971, + "loss": 0.0874, + "step": 24072 + }, + { + "epoch": 0.2089651999548615, + "grad_norm": 0.13671875, + "learning_rate": 0.0017505036243444043, + "loss": 0.082, + "step": 24073 + }, + { + "epoch": 0.20897388043506568, + "grad_norm": 0.10888671875, + "learning_rate": 0.0017504830558600303, + "loss": 0.1113, + "step": 24074 + }, + { + "epoch": 0.20898256091526984, + "grad_norm": 0.71875, + "learning_rate": 0.001750462486664298, + "loss": 0.1338, + "step": 24075 + }, + { + "epoch": 0.208991241395474, + "grad_norm": 0.32421875, + "learning_rate": 0.0017504419167572297, + "loss": 0.0879, + "step": 24076 + }, + { + "epoch": 0.20899992187567817, + "grad_norm": 0.3203125, + "learning_rate": 0.0017504213461388477, + "loss": 0.0933, + "step": 24077 + }, + { + "epoch": 0.20900860235588234, + "grad_norm": 0.296875, + "learning_rate": 0.0017504007748091748, + "loss": 0.1011, + "step": 24078 + }, + { + "epoch": 0.2090172828360865, + "grad_norm": 1.265625, + "learning_rate": 0.0017503802027682334, + "loss": 0.1904, + "step": 24079 + }, + { + "epoch": 0.20902596331629067, + "grad_norm": 0.12255859375, + "learning_rate": 0.001750359630016046, + "loss": 0.1094, + "step": 24080 + }, + { + "epoch": 0.20903464379649483, + "grad_norm": 0.4140625, + "learning_rate": 0.0017503390565526348, + "loss": 0.1211, + "step": 24081 + }, + { + "epoch": 0.209043324276699, + "grad_norm": 0.087890625, + "learning_rate": 0.001750318482378023, + "loss": 0.083, + "step": 24082 + }, + { + "epoch": 0.20905200475690316, + "grad_norm": 0.14453125, + "learning_rate": 0.0017502979074922327, + "loss": 0.1016, + "step": 24083 + }, + { + "epoch": 0.20906068523710733, + "grad_norm": 0.1650390625, + "learning_rate": 0.001750277331895286, + "loss": 0.0752, + "step": 24084 + }, + { + "epoch": 0.2090693657173115, + "grad_norm": 
0.11376953125, + "learning_rate": 0.001750256755587206, + "loss": 0.1104, + "step": 24085 + }, + { + "epoch": 0.20907804619751566, + "grad_norm": 0.1904296875, + "learning_rate": 0.0017502361785680148, + "loss": 0.1221, + "step": 24086 + }, + { + "epoch": 0.20908672667771983, + "grad_norm": 0.1123046875, + "learning_rate": 0.0017502156008377353, + "loss": 0.1221, + "step": 24087 + }, + { + "epoch": 0.20909540715792396, + "grad_norm": 0.173828125, + "learning_rate": 0.0017501950223963898, + "loss": 0.1074, + "step": 24088 + }, + { + "epoch": 0.20910408763812813, + "grad_norm": 0.3671875, + "learning_rate": 0.0017501744432440005, + "loss": 0.1152, + "step": 24089 + }, + { + "epoch": 0.2091127681183323, + "grad_norm": 0.625, + "learning_rate": 0.0017501538633805905, + "loss": 0.1045, + "step": 24090 + }, + { + "epoch": 0.20912144859853646, + "grad_norm": 0.640625, + "learning_rate": 0.0017501332828061818, + "loss": 0.0967, + "step": 24091 + }, + { + "epoch": 0.20913012907874062, + "grad_norm": 0.123046875, + "learning_rate": 0.0017501127015207975, + "loss": 0.1533, + "step": 24092 + }, + { + "epoch": 0.2091388095589448, + "grad_norm": 0.35546875, + "learning_rate": 0.0017500921195244595, + "loss": 0.105, + "step": 24093 + }, + { + "epoch": 0.20914749003914895, + "grad_norm": 0.0703125, + "learning_rate": 0.0017500715368171903, + "loss": 0.0952, + "step": 24094 + }, + { + "epoch": 0.20915617051935312, + "grad_norm": 0.76953125, + "learning_rate": 0.0017500509533990127, + "loss": 0.1533, + "step": 24095 + }, + { + "epoch": 0.20916485099955728, + "grad_norm": 0.40234375, + "learning_rate": 0.0017500303692699493, + "loss": 0.2227, + "step": 24096 + }, + { + "epoch": 0.20917353147976145, + "grad_norm": 0.2392578125, + "learning_rate": 0.0017500097844300223, + "loss": 0.1074, + "step": 24097 + }, + { + "epoch": 0.2091822119599656, + "grad_norm": 0.609375, + "learning_rate": 0.0017499891988792544, + "loss": 0.0996, + "step": 24098 + }, + { + "epoch": 0.20919089244016978, + "grad_norm": 0.494140625, + "learning_rate": 0.0017499686126176684, + "loss": 0.1113, + "step": 24099 + }, + { + "epoch": 0.20919957292037394, + "grad_norm": 0.337890625, + "learning_rate": 0.001749948025645286, + "loss": 0.0972, + "step": 24100 + }, + { + "epoch": 0.2092082534005781, + "grad_norm": 0.119140625, + "learning_rate": 0.0017499274379621306, + "loss": 0.1182, + "step": 24101 + }, + { + "epoch": 0.20921693388078227, + "grad_norm": 0.18359375, + "learning_rate": 0.001749906849568224, + "loss": 0.1006, + "step": 24102 + }, + { + "epoch": 0.20922561436098644, + "grad_norm": 0.1044921875, + "learning_rate": 0.0017498862604635893, + "loss": 0.1182, + "step": 24103 + }, + { + "epoch": 0.2092342948411906, + "grad_norm": 0.6875, + "learning_rate": 0.0017498656706482485, + "loss": 0.0884, + "step": 24104 + }, + { + "epoch": 0.20924297532139477, + "grad_norm": 0.09326171875, + "learning_rate": 0.0017498450801222247, + "loss": 0.0835, + "step": 24105 + }, + { + "epoch": 0.20925165580159893, + "grad_norm": 0.19140625, + "learning_rate": 0.00174982448888554, + "loss": 0.123, + "step": 24106 + }, + { + "epoch": 0.2092603362818031, + "grad_norm": 0.12353515625, + "learning_rate": 0.0017498038969382167, + "loss": 0.0952, + "step": 24107 + }, + { + "epoch": 0.20926901676200726, + "grad_norm": 0.1171875, + "learning_rate": 0.0017497833042802778, + "loss": 0.123, + "step": 24108 + }, + { + "epoch": 0.20927769724221143, + "grad_norm": 0.5703125, + "learning_rate": 0.0017497627109117455, + "loss": 0.1133, + "step": 24109 + }, + { + "epoch": 
0.2092863777224156, + "grad_norm": 0.65234375, + "learning_rate": 0.0017497421168326428, + "loss": 0.1162, + "step": 24110 + }, + { + "epoch": 0.20929505820261976, + "grad_norm": 0.083984375, + "learning_rate": 0.0017497215220429916, + "loss": 0.1064, + "step": 24111 + }, + { + "epoch": 0.20930373868282393, + "grad_norm": 0.6171875, + "learning_rate": 0.0017497009265428148, + "loss": 0.1055, + "step": 24112 + }, + { + "epoch": 0.2093124191630281, + "grad_norm": 0.130859375, + "learning_rate": 0.001749680330332135, + "loss": 0.1318, + "step": 24113 + }, + { + "epoch": 0.20932109964323226, + "grad_norm": 0.146484375, + "learning_rate": 0.0017496597334109743, + "loss": 0.0898, + "step": 24114 + }, + { + "epoch": 0.20932978012343642, + "grad_norm": 0.212890625, + "learning_rate": 0.0017496391357793555, + "loss": 0.104, + "step": 24115 + }, + { + "epoch": 0.20933846060364059, + "grad_norm": 0.212890625, + "learning_rate": 0.0017496185374373011, + "loss": 0.0786, + "step": 24116 + }, + { + "epoch": 0.20934714108384475, + "grad_norm": 0.341796875, + "learning_rate": 0.0017495979383848337, + "loss": 0.1465, + "step": 24117 + }, + { + "epoch": 0.20935582156404892, + "grad_norm": 0.06640625, + "learning_rate": 0.0017495773386219757, + "loss": 0.0889, + "step": 24118 + }, + { + "epoch": 0.20936450204425308, + "grad_norm": 0.2578125, + "learning_rate": 0.0017495567381487498, + "loss": 0.1108, + "step": 24119 + }, + { + "epoch": 0.20937318252445725, + "grad_norm": 0.27734375, + "learning_rate": 0.0017495361369651782, + "loss": 0.0859, + "step": 24120 + }, + { + "epoch": 0.2093818630046614, + "grad_norm": 0.1318359375, + "learning_rate": 0.0017495155350712838, + "loss": 0.1162, + "step": 24121 + }, + { + "epoch": 0.20939054348486558, + "grad_norm": 0.2431640625, + "learning_rate": 0.0017494949324670893, + "loss": 0.1016, + "step": 24122 + }, + { + "epoch": 0.20939922396506974, + "grad_norm": 0.37109375, + "learning_rate": 0.0017494743291526162, + "loss": 0.0796, + "step": 24123 + }, + { + "epoch": 0.2094079044452739, + "grad_norm": 0.2578125, + "learning_rate": 0.0017494537251278883, + "loss": 0.126, + "step": 24124 + }, + { + "epoch": 0.20941658492547807, + "grad_norm": 0.443359375, + "learning_rate": 0.0017494331203929273, + "loss": 0.0718, + "step": 24125 + }, + { + "epoch": 0.20942526540568224, + "grad_norm": 0.546875, + "learning_rate": 0.001749412514947756, + "loss": 0.1123, + "step": 24126 + }, + { + "epoch": 0.2094339458858864, + "grad_norm": 0.09521484375, + "learning_rate": 0.0017493919087923971, + "loss": 0.1094, + "step": 24127 + }, + { + "epoch": 0.20944262636609057, + "grad_norm": 0.44921875, + "learning_rate": 0.0017493713019268726, + "loss": 0.1221, + "step": 24128 + }, + { + "epoch": 0.20945130684629473, + "grad_norm": 0.08154296875, + "learning_rate": 0.0017493506943512059, + "loss": 0.085, + "step": 24129 + }, + { + "epoch": 0.2094599873264989, + "grad_norm": 1.5390625, + "learning_rate": 0.0017493300860654184, + "loss": 0.1211, + "step": 24130 + }, + { + "epoch": 0.20946866780670306, + "grad_norm": 0.1220703125, + "learning_rate": 0.001749309477069534, + "loss": 0.1011, + "step": 24131 + }, + { + "epoch": 0.20947734828690723, + "grad_norm": 0.119140625, + "learning_rate": 0.001749288867363574, + "loss": 0.0903, + "step": 24132 + }, + { + "epoch": 0.2094860287671114, + "grad_norm": 0.0791015625, + "learning_rate": 0.0017492682569475618, + "loss": 0.1064, + "step": 24133 + }, + { + "epoch": 0.20949470924731556, + "grad_norm": 0.1875, + "learning_rate": 0.001749247645821519, + "loss": 
0.0996, + "step": 24134 + }, + { + "epoch": 0.20950338972751972, + "grad_norm": 0.6953125, + "learning_rate": 0.0017492270339854695, + "loss": 0.1211, + "step": 24135 + }, + { + "epoch": 0.2095120702077239, + "grad_norm": 0.109375, + "learning_rate": 0.0017492064214394347, + "loss": 0.1143, + "step": 24136 + }, + { + "epoch": 0.20952075068792805, + "grad_norm": 0.314453125, + "learning_rate": 0.0017491858081834374, + "loss": 0.0874, + "step": 24137 + }, + { + "epoch": 0.20952943116813222, + "grad_norm": 0.94921875, + "learning_rate": 0.0017491651942175004, + "loss": 0.5234, + "step": 24138 + }, + { + "epoch": 0.20953811164833638, + "grad_norm": 0.63671875, + "learning_rate": 0.001749144579541646, + "loss": 0.0811, + "step": 24139 + }, + { + "epoch": 0.20954679212854055, + "grad_norm": 0.48046875, + "learning_rate": 0.0017491239641558968, + "loss": 0.1816, + "step": 24140 + }, + { + "epoch": 0.20955547260874471, + "grad_norm": 0.41796875, + "learning_rate": 0.0017491033480602756, + "loss": 0.0752, + "step": 24141 + }, + { + "epoch": 0.20956415308894888, + "grad_norm": 0.48046875, + "learning_rate": 0.0017490827312548046, + "loss": 0.1172, + "step": 24142 + }, + { + "epoch": 0.20957283356915304, + "grad_norm": 0.6328125, + "learning_rate": 0.0017490621137395065, + "loss": 0.1172, + "step": 24143 + }, + { + "epoch": 0.2095815140493572, + "grad_norm": 0.458984375, + "learning_rate": 0.0017490414955144038, + "loss": 0.1245, + "step": 24144 + }, + { + "epoch": 0.20959019452956137, + "grad_norm": 0.671875, + "learning_rate": 0.001749020876579519, + "loss": 0.1069, + "step": 24145 + }, + { + "epoch": 0.20959887500976554, + "grad_norm": 0.71484375, + "learning_rate": 0.0017490002569348748, + "loss": 0.0957, + "step": 24146 + }, + { + "epoch": 0.2096075554899697, + "grad_norm": 0.34375, + "learning_rate": 0.001748979636580494, + "loss": 0.1201, + "step": 24147 + }, + { + "epoch": 0.20961623597017387, + "grad_norm": 0.435546875, + "learning_rate": 0.0017489590155163983, + "loss": 0.1201, + "step": 24148 + }, + { + "epoch": 0.20962491645037803, + "grad_norm": 0.2314453125, + "learning_rate": 0.001748938393742611, + "loss": 0.1416, + "step": 24149 + }, + { + "epoch": 0.2096335969305822, + "grad_norm": 0.1474609375, + "learning_rate": 0.0017489177712591544, + "loss": 0.0859, + "step": 24150 + }, + { + "epoch": 0.20964227741078637, + "grad_norm": 0.6171875, + "learning_rate": 0.001748897148066051, + "loss": 0.1318, + "step": 24151 + }, + { + "epoch": 0.20965095789099053, + "grad_norm": 0.65234375, + "learning_rate": 0.0017488765241633235, + "loss": 0.1123, + "step": 24152 + }, + { + "epoch": 0.2096596383711947, + "grad_norm": 0.396484375, + "learning_rate": 0.0017488558995509944, + "loss": 0.1357, + "step": 24153 + }, + { + "epoch": 0.20966831885139886, + "grad_norm": 0.12451171875, + "learning_rate": 0.0017488352742290863, + "loss": 0.1006, + "step": 24154 + }, + { + "epoch": 0.20967699933160303, + "grad_norm": 0.65625, + "learning_rate": 0.0017488146481976215, + "loss": 0.1309, + "step": 24155 + }, + { + "epoch": 0.2096856798118072, + "grad_norm": 0.58984375, + "learning_rate": 0.001748794021456623, + "loss": 0.1025, + "step": 24156 + }, + { + "epoch": 0.20969436029201136, + "grad_norm": 0.55859375, + "learning_rate": 0.001748773394006113, + "loss": 0.0928, + "step": 24157 + }, + { + "epoch": 0.20970304077221552, + "grad_norm": 0.08642578125, + "learning_rate": 0.001748752765846114, + "loss": 0.082, + "step": 24158 + }, + { + "epoch": 0.20971172125241969, + "grad_norm": 0.63671875, + "learning_rate": 
0.0017487321369766492, + "loss": 0.1406, + "step": 24159 + }, + { + "epoch": 0.20972040173262385, + "grad_norm": 0.62890625, + "learning_rate": 0.0017487115073977402, + "loss": 0.1387, + "step": 24160 + }, + { + "epoch": 0.20972908221282802, + "grad_norm": 0.1279296875, + "learning_rate": 0.0017486908771094103, + "loss": 0.1074, + "step": 24161 + }, + { + "epoch": 0.20973776269303218, + "grad_norm": 1.234375, + "learning_rate": 0.0017486702461116817, + "loss": 0.1035, + "step": 24162 + }, + { + "epoch": 0.20974644317323635, + "grad_norm": 0.09130859375, + "learning_rate": 0.001748649614404577, + "loss": 0.1108, + "step": 24163 + }, + { + "epoch": 0.2097551236534405, + "grad_norm": 0.2158203125, + "learning_rate": 0.0017486289819881188, + "loss": 0.1118, + "step": 24164 + }, + { + "epoch": 0.20976380413364468, + "grad_norm": 0.52734375, + "learning_rate": 0.0017486083488623297, + "loss": 0.1816, + "step": 24165 + }, + { + "epoch": 0.20977248461384884, + "grad_norm": 0.494140625, + "learning_rate": 0.0017485877150272323, + "loss": 0.0947, + "step": 24166 + }, + { + "epoch": 0.209781165094053, + "grad_norm": 0.232421875, + "learning_rate": 0.001748567080482849, + "loss": 0.1074, + "step": 24167 + }, + { + "epoch": 0.20978984557425717, + "grad_norm": 0.123046875, + "learning_rate": 0.0017485464452292028, + "loss": 0.1006, + "step": 24168 + }, + { + "epoch": 0.20979852605446134, + "grad_norm": 0.10888671875, + "learning_rate": 0.0017485258092663158, + "loss": 0.0752, + "step": 24169 + }, + { + "epoch": 0.2098072065346655, + "grad_norm": 0.150390625, + "learning_rate": 0.0017485051725942106, + "loss": 0.1914, + "step": 24170 + }, + { + "epoch": 0.20981588701486967, + "grad_norm": 0.07666015625, + "learning_rate": 0.0017484845352129103, + "loss": 0.105, + "step": 24171 + }, + { + "epoch": 0.20982456749507383, + "grad_norm": 0.234375, + "learning_rate": 0.0017484638971224365, + "loss": 0.0996, + "step": 24172 + }, + { + "epoch": 0.209833247975278, + "grad_norm": 0.5234375, + "learning_rate": 0.0017484432583228127, + "loss": 0.1211, + "step": 24173 + }, + { + "epoch": 0.20984192845548216, + "grad_norm": 0.0810546875, + "learning_rate": 0.001748422618814061, + "loss": 0.1162, + "step": 24174 + }, + { + "epoch": 0.20985060893568633, + "grad_norm": 0.640625, + "learning_rate": 0.001748401978596204, + "loss": 0.1123, + "step": 24175 + }, + { + "epoch": 0.2098592894158905, + "grad_norm": 0.4296875, + "learning_rate": 0.0017483813376692643, + "loss": 0.1045, + "step": 24176 + }, + { + "epoch": 0.20986796989609466, + "grad_norm": 0.35546875, + "learning_rate": 0.0017483606960332645, + "loss": 0.126, + "step": 24177 + }, + { + "epoch": 0.20987665037629882, + "grad_norm": 0.1142578125, + "learning_rate": 0.0017483400536882272, + "loss": 0.1064, + "step": 24178 + }, + { + "epoch": 0.209885330856503, + "grad_norm": 0.388671875, + "learning_rate": 0.001748319410634175, + "loss": 0.083, + "step": 24179 + }, + { + "epoch": 0.20989401133670715, + "grad_norm": 0.173828125, + "learning_rate": 0.0017482987668711305, + "loss": 0.1621, + "step": 24180 + }, + { + "epoch": 0.20990269181691132, + "grad_norm": 0.1103515625, + "learning_rate": 0.0017482781223991163, + "loss": 0.126, + "step": 24181 + }, + { + "epoch": 0.20991137229711548, + "grad_norm": 0.146484375, + "learning_rate": 0.0017482574772181548, + "loss": 0.1309, + "step": 24182 + }, + { + "epoch": 0.20992005277731965, + "grad_norm": 0.2421875, + "learning_rate": 0.0017482368313282684, + "loss": 0.1182, + "step": 24183 + }, + { + "epoch": 0.20992873325752381, + 
"grad_norm": 0.357421875, + "learning_rate": 0.0017482161847294801, + "loss": 0.0986, + "step": 24184 + }, + { + "epoch": 0.20993741373772798, + "grad_norm": 0.416015625, + "learning_rate": 0.0017481955374218127, + "loss": 0.106, + "step": 24185 + }, + { + "epoch": 0.20994609421793214, + "grad_norm": 0.765625, + "learning_rate": 0.001748174889405288, + "loss": 0.1025, + "step": 24186 + }, + { + "epoch": 0.2099547746981363, + "grad_norm": 0.1494140625, + "learning_rate": 0.001748154240679929, + "loss": 0.1318, + "step": 24187 + }, + { + "epoch": 0.20996345517834047, + "grad_norm": 0.49609375, + "learning_rate": 0.0017481335912457583, + "loss": 0.1367, + "step": 24188 + }, + { + "epoch": 0.20997213565854464, + "grad_norm": 0.236328125, + "learning_rate": 0.0017481129411027986, + "loss": 0.1035, + "step": 24189 + }, + { + "epoch": 0.2099808161387488, + "grad_norm": 0.6484375, + "learning_rate": 0.001748092290251072, + "loss": 0.0889, + "step": 24190 + }, + { + "epoch": 0.20998949661895297, + "grad_norm": 0.1328125, + "learning_rate": 0.0017480716386906016, + "loss": 0.1006, + "step": 24191 + }, + { + "epoch": 0.20999817709915714, + "grad_norm": 0.1904296875, + "learning_rate": 0.0017480509864214097, + "loss": 0.1318, + "step": 24192 + }, + { + "epoch": 0.2100068575793613, + "grad_norm": 0.25390625, + "learning_rate": 0.0017480303334435194, + "loss": 0.0894, + "step": 24193 + }, + { + "epoch": 0.21001553805956547, + "grad_norm": 0.40234375, + "learning_rate": 0.0017480096797569521, + "loss": 0.1035, + "step": 24194 + }, + { + "epoch": 0.21002421853976963, + "grad_norm": 0.27734375, + "learning_rate": 0.0017479890253617317, + "loss": 0.0972, + "step": 24195 + }, + { + "epoch": 0.2100328990199738, + "grad_norm": 0.111328125, + "learning_rate": 0.0017479683702578803, + "loss": 0.126, + "step": 24196 + }, + { + "epoch": 0.21004157950017796, + "grad_norm": 0.2275390625, + "learning_rate": 0.0017479477144454203, + "loss": 0.0996, + "step": 24197 + }, + { + "epoch": 0.21005025998038213, + "grad_norm": 0.140625, + "learning_rate": 0.0017479270579243743, + "loss": 0.1016, + "step": 24198 + }, + { + "epoch": 0.2100589404605863, + "grad_norm": 0.2294921875, + "learning_rate": 0.001747906400694765, + "loss": 0.1006, + "step": 24199 + }, + { + "epoch": 0.21006762094079046, + "grad_norm": 0.6015625, + "learning_rate": 0.0017478857427566151, + "loss": 0.1152, + "step": 24200 + }, + { + "epoch": 0.21007630142099462, + "grad_norm": 0.50390625, + "learning_rate": 0.0017478650841099468, + "loss": 0.1377, + "step": 24201 + }, + { + "epoch": 0.2100849819011988, + "grad_norm": 0.53515625, + "learning_rate": 0.0017478444247547833, + "loss": 0.1035, + "step": 24202 + }, + { + "epoch": 0.21009366238140295, + "grad_norm": 0.65625, + "learning_rate": 0.0017478237646911467, + "loss": 0.1309, + "step": 24203 + }, + { + "epoch": 0.21010234286160712, + "grad_norm": 0.12255859375, + "learning_rate": 0.00174780310391906, + "loss": 0.1094, + "step": 24204 + }, + { + "epoch": 0.21011102334181128, + "grad_norm": 0.1435546875, + "learning_rate": 0.0017477824424385452, + "loss": 0.0967, + "step": 24205 + }, + { + "epoch": 0.21011970382201545, + "grad_norm": 0.302734375, + "learning_rate": 0.0017477617802496258, + "loss": 0.127, + "step": 24206 + }, + { + "epoch": 0.2101283843022196, + "grad_norm": 0.11767578125, + "learning_rate": 0.0017477411173523237, + "loss": 0.1138, + "step": 24207 + }, + { + "epoch": 0.21013706478242378, + "grad_norm": 0.890625, + "learning_rate": 0.001747720453746661, + "loss": 0.1523, + "step": 24208 + }, 
+ { + "epoch": 0.21014574526262794, + "grad_norm": 0.9296875, + "learning_rate": 0.0017476997894326615, + "loss": 0.1152, + "step": 24209 + }, + { + "epoch": 0.2101544257428321, + "grad_norm": 0.267578125, + "learning_rate": 0.0017476791244103473, + "loss": 0.1104, + "step": 24210 + }, + { + "epoch": 0.21016310622303624, + "grad_norm": 0.10400390625, + "learning_rate": 0.001747658458679741, + "loss": 0.1299, + "step": 24211 + }, + { + "epoch": 0.2101717867032404, + "grad_norm": 0.267578125, + "learning_rate": 0.0017476377922408647, + "loss": 0.1064, + "step": 24212 + }, + { + "epoch": 0.21018046718344457, + "grad_norm": 0.1787109375, + "learning_rate": 0.0017476171250937415, + "loss": 0.0859, + "step": 24213 + }, + { + "epoch": 0.21018914766364874, + "grad_norm": 0.4609375, + "learning_rate": 0.0017475964572383942, + "loss": 0.125, + "step": 24214 + }, + { + "epoch": 0.2101978281438529, + "grad_norm": 0.5, + "learning_rate": 0.0017475757886748454, + "loss": 0.1084, + "step": 24215 + }, + { + "epoch": 0.21020650862405707, + "grad_norm": 0.357421875, + "learning_rate": 0.001747555119403117, + "loss": 0.1221, + "step": 24216 + }, + { + "epoch": 0.21021518910426124, + "grad_norm": 0.076171875, + "learning_rate": 0.001747534449423232, + "loss": 0.1035, + "step": 24217 + }, + { + "epoch": 0.2102238695844654, + "grad_norm": 0.68359375, + "learning_rate": 0.0017475137787352133, + "loss": 0.1289, + "step": 24218 + }, + { + "epoch": 0.21023255006466957, + "grad_norm": 0.291015625, + "learning_rate": 0.001747493107339083, + "loss": 0.1602, + "step": 24219 + }, + { + "epoch": 0.21024123054487373, + "grad_norm": 0.15234375, + "learning_rate": 0.0017474724352348643, + "loss": 0.0928, + "step": 24220 + }, + { + "epoch": 0.2102499110250779, + "grad_norm": 0.10205078125, + "learning_rate": 0.0017474517624225792, + "loss": 0.1328, + "step": 24221 + }, + { + "epoch": 0.21025859150528206, + "grad_norm": 0.177734375, + "learning_rate": 0.0017474310889022508, + "loss": 0.0933, + "step": 24222 + }, + { + "epoch": 0.21026727198548623, + "grad_norm": 0.41015625, + "learning_rate": 0.0017474104146739011, + "loss": 0.1279, + "step": 24223 + }, + { + "epoch": 0.2102759524656904, + "grad_norm": 0.70703125, + "learning_rate": 0.0017473897397375536, + "loss": 0.0806, + "step": 24224 + }, + { + "epoch": 0.21028463294589456, + "grad_norm": 0.58984375, + "learning_rate": 0.0017473690640932302, + "loss": 0.103, + "step": 24225 + }, + { + "epoch": 0.21029331342609872, + "grad_norm": 0.5546875, + "learning_rate": 0.0017473483877409536, + "loss": 0.1182, + "step": 24226 + }, + { + "epoch": 0.2103019939063029, + "grad_norm": 0.11328125, + "learning_rate": 0.0017473277106807468, + "loss": 0.0928, + "step": 24227 + }, + { + "epoch": 0.21031067438650705, + "grad_norm": 0.8359375, + "learning_rate": 0.0017473070329126318, + "loss": 0.1064, + "step": 24228 + }, + { + "epoch": 0.21031935486671122, + "grad_norm": 0.1611328125, + "learning_rate": 0.001747286354436632, + "loss": 0.105, + "step": 24229 + }, + { + "epoch": 0.21032803534691538, + "grad_norm": 0.2158203125, + "learning_rate": 0.001747265675252769, + "loss": 0.1084, + "step": 24230 + }, + { + "epoch": 0.21033671582711955, + "grad_norm": 0.5390625, + "learning_rate": 0.0017472449953610666, + "loss": 0.1133, + "step": 24231 + }, + { + "epoch": 0.2103453963073237, + "grad_norm": 0.447265625, + "learning_rate": 0.0017472243147615464, + "loss": 0.1514, + "step": 24232 + }, + { + "epoch": 0.21035407678752788, + "grad_norm": 1.2578125, + "learning_rate": 0.0017472036334542315, + 
"loss": 0.1104, + "step": 24233 + }, + { + "epoch": 0.21036275726773204, + "grad_norm": 0.2490234375, + "learning_rate": 0.0017471829514391444, + "loss": 0.1211, + "step": 24234 + }, + { + "epoch": 0.2103714377479362, + "grad_norm": 1.03125, + "learning_rate": 0.0017471622687163077, + "loss": 0.1562, + "step": 24235 + }, + { + "epoch": 0.21038011822814037, + "grad_norm": 0.1474609375, + "learning_rate": 0.001747141585285744, + "loss": 0.1133, + "step": 24236 + }, + { + "epoch": 0.21038879870834454, + "grad_norm": 0.451171875, + "learning_rate": 0.0017471209011474763, + "loss": 0.0957, + "step": 24237 + }, + { + "epoch": 0.2103974791885487, + "grad_norm": 0.55078125, + "learning_rate": 0.0017471002163015265, + "loss": 0.0874, + "step": 24238 + }, + { + "epoch": 0.21040615966875287, + "grad_norm": 0.28515625, + "learning_rate": 0.0017470795307479182, + "loss": 0.1172, + "step": 24239 + }, + { + "epoch": 0.21041484014895703, + "grad_norm": 0.255859375, + "learning_rate": 0.001747058844486673, + "loss": 0.1172, + "step": 24240 + }, + { + "epoch": 0.2104235206291612, + "grad_norm": 0.73046875, + "learning_rate": 0.001747038157517814, + "loss": 0.1143, + "step": 24241 + }, + { + "epoch": 0.21043220110936536, + "grad_norm": 0.1865234375, + "learning_rate": 0.001747017469841364, + "loss": 0.1426, + "step": 24242 + }, + { + "epoch": 0.21044088158956953, + "grad_norm": 0.28125, + "learning_rate": 0.0017469967814573452, + "loss": 0.1235, + "step": 24243 + }, + { + "epoch": 0.2104495620697737, + "grad_norm": 0.267578125, + "learning_rate": 0.0017469760923657805, + "loss": 0.1162, + "step": 24244 + }, + { + "epoch": 0.21045824254997786, + "grad_norm": 0.71484375, + "learning_rate": 0.0017469554025666926, + "loss": 0.1396, + "step": 24245 + }, + { + "epoch": 0.21046692303018202, + "grad_norm": 0.69921875, + "learning_rate": 0.0017469347120601037, + "loss": 0.1055, + "step": 24246 + }, + { + "epoch": 0.2104756035103862, + "grad_norm": 0.1318359375, + "learning_rate": 0.001746914020846037, + "loss": 0.1064, + "step": 24247 + }, + { + "epoch": 0.21048428399059035, + "grad_norm": 0.265625, + "learning_rate": 0.0017468933289245148, + "loss": 0.0986, + "step": 24248 + }, + { + "epoch": 0.21049296447079452, + "grad_norm": 0.2001953125, + "learning_rate": 0.0017468726362955597, + "loss": 0.1172, + "step": 24249 + }, + { + "epoch": 0.21050164495099868, + "grad_norm": 3.28125, + "learning_rate": 0.0017468519429591945, + "loss": 0.1455, + "step": 24250 + }, + { + "epoch": 0.21051032543120285, + "grad_norm": 0.166015625, + "learning_rate": 0.0017468312489154415, + "loss": 0.1045, + "step": 24251 + }, + { + "epoch": 0.21051900591140701, + "grad_norm": 0.1484375, + "learning_rate": 0.0017468105541643235, + "loss": 0.1416, + "step": 24252 + }, + { + "epoch": 0.21052768639161118, + "grad_norm": 0.70703125, + "learning_rate": 0.0017467898587058637, + "loss": 0.1138, + "step": 24253 + }, + { + "epoch": 0.21053636687181534, + "grad_norm": 0.40234375, + "learning_rate": 0.0017467691625400838, + "loss": 0.1006, + "step": 24254 + }, + { + "epoch": 0.2105450473520195, + "grad_norm": 0.1884765625, + "learning_rate": 0.0017467484656670071, + "loss": 0.082, + "step": 24255 + }, + { + "epoch": 0.21055372783222367, + "grad_norm": 0.1396484375, + "learning_rate": 0.001746727768086656, + "loss": 0.1445, + "step": 24256 + }, + { + "epoch": 0.21056240831242784, + "grad_norm": 0.087890625, + "learning_rate": 0.001746707069799053, + "loss": 0.0796, + "step": 24257 + }, + { + "epoch": 0.210571088792632, + "grad_norm": 0.115234375, + 
"learning_rate": 0.0017466863708042208, + "loss": 0.1143, + "step": 24258 + }, + { + "epoch": 0.21057976927283617, + "grad_norm": 0.42578125, + "learning_rate": 0.0017466656711021822, + "loss": 0.1553, + "step": 24259 + }, + { + "epoch": 0.21058844975304034, + "grad_norm": 0.474609375, + "learning_rate": 0.0017466449706929597, + "loss": 0.1328, + "step": 24260 + }, + { + "epoch": 0.2105971302332445, + "grad_norm": 0.1796875, + "learning_rate": 0.0017466242695765761, + "loss": 0.0874, + "step": 24261 + }, + { + "epoch": 0.21060581071344867, + "grad_norm": 0.19921875, + "learning_rate": 0.001746603567753054, + "loss": 0.1309, + "step": 24262 + }, + { + "epoch": 0.21061449119365283, + "grad_norm": 0.373046875, + "learning_rate": 0.0017465828652224158, + "loss": 0.0854, + "step": 24263 + }, + { + "epoch": 0.210623171673857, + "grad_norm": 0.8359375, + "learning_rate": 0.001746562161984684, + "loss": 0.1475, + "step": 24264 + }, + { + "epoch": 0.21063185215406116, + "grad_norm": 0.1259765625, + "learning_rate": 0.001746541458039882, + "loss": 0.1016, + "step": 24265 + }, + { + "epoch": 0.21064053263426533, + "grad_norm": 0.81640625, + "learning_rate": 0.0017465207533880315, + "loss": 0.124, + "step": 24266 + }, + { + "epoch": 0.2106492131144695, + "grad_norm": 0.427734375, + "learning_rate": 0.0017465000480291562, + "loss": 0.0938, + "step": 24267 + }, + { + "epoch": 0.21065789359467366, + "grad_norm": 0.61328125, + "learning_rate": 0.0017464793419632778, + "loss": 0.1123, + "step": 24268 + }, + { + "epoch": 0.21066657407487782, + "grad_norm": 0.125, + "learning_rate": 0.0017464586351904194, + "loss": 0.1006, + "step": 24269 + }, + { + "epoch": 0.210675254555082, + "grad_norm": 0.09814453125, + "learning_rate": 0.0017464379277106034, + "loss": 0.127, + "step": 24270 + }, + { + "epoch": 0.21068393503528615, + "grad_norm": 0.2734375, + "learning_rate": 0.001746417219523853, + "loss": 0.104, + "step": 24271 + }, + { + "epoch": 0.21069261551549032, + "grad_norm": 0.279296875, + "learning_rate": 0.00174639651063019, + "loss": 0.1406, + "step": 24272 + }, + { + "epoch": 0.21070129599569448, + "grad_norm": 0.2109375, + "learning_rate": 0.0017463758010296377, + "loss": 0.1123, + "step": 24273 + }, + { + "epoch": 0.21070997647589865, + "grad_norm": 0.82421875, + "learning_rate": 0.0017463550907222184, + "loss": 0.1387, + "step": 24274 + }, + { + "epoch": 0.2107186569561028, + "grad_norm": 0.451171875, + "learning_rate": 0.001746334379707955, + "loss": 0.1758, + "step": 24275 + }, + { + "epoch": 0.21072733743630698, + "grad_norm": 0.111328125, + "learning_rate": 0.0017463136679868698, + "loss": 0.0889, + "step": 24276 + }, + { + "epoch": 0.21073601791651114, + "grad_norm": 0.466796875, + "learning_rate": 0.001746292955558986, + "loss": 0.0859, + "step": 24277 + }, + { + "epoch": 0.2107446983967153, + "grad_norm": 0.2001953125, + "learning_rate": 0.0017462722424243257, + "loss": 0.1006, + "step": 24278 + }, + { + "epoch": 0.21075337887691947, + "grad_norm": 0.220703125, + "learning_rate": 0.0017462515285829121, + "loss": 0.1074, + "step": 24279 + }, + { + "epoch": 0.21076205935712364, + "grad_norm": 0.375, + "learning_rate": 0.0017462308140347674, + "loss": 0.1025, + "step": 24280 + }, + { + "epoch": 0.2107707398373278, + "grad_norm": 0.27734375, + "learning_rate": 0.0017462100987799144, + "loss": 0.1514, + "step": 24281 + }, + { + "epoch": 0.21077942031753197, + "grad_norm": 0.072265625, + "learning_rate": 0.0017461893828183755, + "loss": 0.0996, + "step": 24282 + }, + { + "epoch": 0.21078810079773613, + 
"grad_norm": 0.412109375, + "learning_rate": 0.0017461686661501739, + "loss": 0.1445, + "step": 24283 + }, + { + "epoch": 0.2107967812779403, + "grad_norm": 0.197265625, + "learning_rate": 0.0017461479487753317, + "loss": 0.1562, + "step": 24284 + }, + { + "epoch": 0.21080546175814446, + "grad_norm": 1.140625, + "learning_rate": 0.001746127230693872, + "loss": 0.1211, + "step": 24285 + }, + { + "epoch": 0.21081414223834863, + "grad_norm": 1.328125, + "learning_rate": 0.001746106511905817, + "loss": 0.1035, + "step": 24286 + }, + { + "epoch": 0.2108228227185528, + "grad_norm": 0.1875, + "learning_rate": 0.0017460857924111901, + "loss": 0.0918, + "step": 24287 + }, + { + "epoch": 0.21083150319875696, + "grad_norm": 0.421875, + "learning_rate": 0.0017460650722100131, + "loss": 0.1143, + "step": 24288 + }, + { + "epoch": 0.21084018367896112, + "grad_norm": 0.123046875, + "learning_rate": 0.0017460443513023093, + "loss": 0.083, + "step": 24289 + }, + { + "epoch": 0.2108488641591653, + "grad_norm": 0.169921875, + "learning_rate": 0.001746023629688101, + "loss": 0.1191, + "step": 24290 + }, + { + "epoch": 0.21085754463936945, + "grad_norm": 0.10888671875, + "learning_rate": 0.0017460029073674108, + "loss": 0.0835, + "step": 24291 + }, + { + "epoch": 0.21086622511957362, + "grad_norm": 0.158203125, + "learning_rate": 0.0017459821843402617, + "loss": 0.1621, + "step": 24292 + }, + { + "epoch": 0.21087490559977778, + "grad_norm": 0.6484375, + "learning_rate": 0.0017459614606066761, + "loss": 0.1172, + "step": 24293 + }, + { + "epoch": 0.21088358607998195, + "grad_norm": 0.1259765625, + "learning_rate": 0.0017459407361666767, + "loss": 0.1367, + "step": 24294 + }, + { + "epoch": 0.21089226656018611, + "grad_norm": 0.455078125, + "learning_rate": 0.0017459200110202864, + "loss": 0.1465, + "step": 24295 + }, + { + "epoch": 0.21090094704039028, + "grad_norm": 0.45703125, + "learning_rate": 0.0017458992851675278, + "loss": 0.083, + "step": 24296 + }, + { + "epoch": 0.21090962752059444, + "grad_norm": 0.2099609375, + "learning_rate": 0.0017458785586084232, + "loss": 0.1113, + "step": 24297 + }, + { + "epoch": 0.2109183080007986, + "grad_norm": 0.1640625, + "learning_rate": 0.0017458578313429953, + "loss": 0.1104, + "step": 24298 + }, + { + "epoch": 0.21092698848100278, + "grad_norm": 0.357421875, + "learning_rate": 0.0017458371033712676, + "loss": 0.1172, + "step": 24299 + }, + { + "epoch": 0.21093566896120694, + "grad_norm": 0.404296875, + "learning_rate": 0.0017458163746932615, + "loss": 0.1074, + "step": 24300 + }, + { + "epoch": 0.2109443494414111, + "grad_norm": 0.373046875, + "learning_rate": 0.0017457956453090008, + "loss": 0.0967, + "step": 24301 + }, + { + "epoch": 0.21095302992161527, + "grad_norm": 0.1728515625, + "learning_rate": 0.0017457749152185074, + "loss": 0.1328, + "step": 24302 + }, + { + "epoch": 0.21096171040181944, + "grad_norm": 0.412109375, + "learning_rate": 0.0017457541844218044, + "loss": 0.1182, + "step": 24303 + }, + { + "epoch": 0.2109703908820236, + "grad_norm": 0.2470703125, + "learning_rate": 0.0017457334529189141, + "loss": 0.0898, + "step": 24304 + }, + { + "epoch": 0.21097907136222777, + "grad_norm": 0.150390625, + "learning_rate": 0.0017457127207098598, + "loss": 0.0977, + "step": 24305 + }, + { + "epoch": 0.21098775184243193, + "grad_norm": 0.54296875, + "learning_rate": 0.0017456919877946635, + "loss": 0.0938, + "step": 24306 + }, + { + "epoch": 0.2109964323226361, + "grad_norm": 0.275390625, + "learning_rate": 0.0017456712541733482, + "loss": 0.082, + "step": 24307 
+ }, + { + "epoch": 0.21100511280284026, + "grad_norm": 0.283203125, + "learning_rate": 0.0017456505198459363, + "loss": 0.1299, + "step": 24308 + }, + { + "epoch": 0.21101379328304443, + "grad_norm": 0.90234375, + "learning_rate": 0.0017456297848124512, + "loss": 0.1099, + "step": 24309 + }, + { + "epoch": 0.2110224737632486, + "grad_norm": 0.3828125, + "learning_rate": 0.0017456090490729145, + "loss": 0.1104, + "step": 24310 + }, + { + "epoch": 0.21103115424345276, + "grad_norm": 0.078125, + "learning_rate": 0.00174558831262735, + "loss": 0.0762, + "step": 24311 + }, + { + "epoch": 0.21103983472365692, + "grad_norm": 0.111328125, + "learning_rate": 0.0017455675754757794, + "loss": 0.1187, + "step": 24312 + }, + { + "epoch": 0.2110485152038611, + "grad_norm": 0.2373046875, + "learning_rate": 0.001745546837618226, + "loss": 0.1328, + "step": 24313 + }, + { + "epoch": 0.21105719568406525, + "grad_norm": 0.44140625, + "learning_rate": 0.001745526099054712, + "loss": 0.1094, + "step": 24314 + }, + { + "epoch": 0.21106587616426942, + "grad_norm": 0.140625, + "learning_rate": 0.0017455053597852607, + "loss": 0.1035, + "step": 24315 + }, + { + "epoch": 0.21107455664447358, + "grad_norm": 0.34375, + "learning_rate": 0.0017454846198098944, + "loss": 0.1621, + "step": 24316 + }, + { + "epoch": 0.21108323712467775, + "grad_norm": 0.07861328125, + "learning_rate": 0.0017454638791286354, + "loss": 0.0908, + "step": 24317 + }, + { + "epoch": 0.2110919176048819, + "grad_norm": 0.21484375, + "learning_rate": 0.0017454431377415072, + "loss": 0.1025, + "step": 24318 + }, + { + "epoch": 0.21110059808508608, + "grad_norm": 0.2373046875, + "learning_rate": 0.001745422395648532, + "loss": 0.0938, + "step": 24319 + }, + { + "epoch": 0.21110927856529024, + "grad_norm": 0.06884765625, + "learning_rate": 0.0017454016528497328, + "loss": 0.0928, + "step": 24320 + }, + { + "epoch": 0.2111179590454944, + "grad_norm": 0.451171875, + "learning_rate": 0.0017453809093451317, + "loss": 0.1465, + "step": 24321 + }, + { + "epoch": 0.21112663952569857, + "grad_norm": 0.30078125, + "learning_rate": 0.0017453601651347516, + "loss": 0.1396, + "step": 24322 + }, + { + "epoch": 0.21113532000590274, + "grad_norm": 0.216796875, + "learning_rate": 0.0017453394202186158, + "loss": 0.0991, + "step": 24323 + }, + { + "epoch": 0.2111440004861069, + "grad_norm": 0.5625, + "learning_rate": 0.0017453186745967461, + "loss": 0.0752, + "step": 24324 + }, + { + "epoch": 0.21115268096631107, + "grad_norm": 0.34765625, + "learning_rate": 0.0017452979282691662, + "loss": 0.1592, + "step": 24325 + }, + { + "epoch": 0.21116136144651523, + "grad_norm": 0.0849609375, + "learning_rate": 0.0017452771812358973, + "loss": 0.0776, + "step": 24326 + }, + { + "epoch": 0.2111700419267194, + "grad_norm": 0.10986328125, + "learning_rate": 0.0017452564334969635, + "loss": 0.1396, + "step": 24327 + }, + { + "epoch": 0.21117872240692356, + "grad_norm": 0.10498046875, + "learning_rate": 0.001745235685052387, + "loss": 0.1299, + "step": 24328 + }, + { + "epoch": 0.21118740288712773, + "grad_norm": 0.10107421875, + "learning_rate": 0.0017452149359021903, + "loss": 0.085, + "step": 24329 + }, + { + "epoch": 0.2111960833673319, + "grad_norm": 0.357421875, + "learning_rate": 0.0017451941860463962, + "loss": 0.1221, + "step": 24330 + }, + { + "epoch": 0.21120476384753606, + "grad_norm": 0.2099609375, + "learning_rate": 0.0017451734354850274, + "loss": 0.0986, + "step": 24331 + }, + { + "epoch": 0.21121344432774022, + "grad_norm": 0.0810546875, + "learning_rate": 
0.001745152684218107, + "loss": 0.1035, + "step": 24332 + }, + { + "epoch": 0.2112221248079444, + "grad_norm": 0.73046875, + "learning_rate": 0.001745131932245657, + "loss": 0.0781, + "step": 24333 + }, + { + "epoch": 0.21123080528814853, + "grad_norm": 0.09423828125, + "learning_rate": 0.0017451111795677005, + "loss": 0.0845, + "step": 24334 + }, + { + "epoch": 0.2112394857683527, + "grad_norm": 0.10205078125, + "learning_rate": 0.00174509042618426, + "loss": 0.1523, + "step": 24335 + }, + { + "epoch": 0.21124816624855686, + "grad_norm": 0.259765625, + "learning_rate": 0.0017450696720953582, + "loss": 0.0859, + "step": 24336 + }, + { + "epoch": 0.21125684672876102, + "grad_norm": 0.345703125, + "learning_rate": 0.001745048917301018, + "loss": 0.1016, + "step": 24337 + }, + { + "epoch": 0.2112655272089652, + "grad_norm": 0.1953125, + "learning_rate": 0.0017450281618012623, + "loss": 0.0967, + "step": 24338 + }, + { + "epoch": 0.21127420768916935, + "grad_norm": 0.6953125, + "learning_rate": 0.0017450074055961134, + "loss": 0.1367, + "step": 24339 + }, + { + "epoch": 0.21128288816937352, + "grad_norm": 0.92578125, + "learning_rate": 0.0017449866486855935, + "loss": 0.1152, + "step": 24340 + }, + { + "epoch": 0.21129156864957768, + "grad_norm": 0.12109375, + "learning_rate": 0.0017449658910697265, + "loss": 0.1436, + "step": 24341 + }, + { + "epoch": 0.21130024912978185, + "grad_norm": 0.1201171875, + "learning_rate": 0.0017449451327485343, + "loss": 0.105, + "step": 24342 + }, + { + "epoch": 0.211308929609986, + "grad_norm": 0.353515625, + "learning_rate": 0.0017449243737220398, + "loss": 0.1006, + "step": 24343 + }, + { + "epoch": 0.21131761009019018, + "grad_norm": 0.24609375, + "learning_rate": 0.0017449036139902657, + "loss": 0.1001, + "step": 24344 + }, + { + "epoch": 0.21132629057039434, + "grad_norm": 0.4609375, + "learning_rate": 0.001744882853553235, + "loss": 0.0908, + "step": 24345 + }, + { + "epoch": 0.2113349710505985, + "grad_norm": 0.302734375, + "learning_rate": 0.0017448620924109695, + "loss": 0.1445, + "step": 24346 + }, + { + "epoch": 0.21134365153080267, + "grad_norm": 0.234375, + "learning_rate": 0.0017448413305634928, + "loss": 0.1074, + "step": 24347 + }, + { + "epoch": 0.21135233201100684, + "grad_norm": 0.6015625, + "learning_rate": 0.0017448205680108276, + "loss": 0.1016, + "step": 24348 + }, + { + "epoch": 0.211361012491211, + "grad_norm": 0.087890625, + "learning_rate": 0.0017447998047529958, + "loss": 0.1006, + "step": 24349 + }, + { + "epoch": 0.21136969297141517, + "grad_norm": 0.375, + "learning_rate": 0.0017447790407900208, + "loss": 0.0938, + "step": 24350 + }, + { + "epoch": 0.21137837345161933, + "grad_norm": 0.1376953125, + "learning_rate": 0.0017447582761219251, + "loss": 0.0996, + "step": 24351 + }, + { + "epoch": 0.2113870539318235, + "grad_norm": 0.52734375, + "learning_rate": 0.0017447375107487318, + "loss": 0.1133, + "step": 24352 + }, + { + "epoch": 0.21139573441202766, + "grad_norm": 0.40234375, + "learning_rate": 0.001744716744670463, + "loss": 0.0879, + "step": 24353 + }, + { + "epoch": 0.21140441489223183, + "grad_norm": 0.162109375, + "learning_rate": 0.0017446959778871416, + "loss": 0.0928, + "step": 24354 + }, + { + "epoch": 0.211413095372436, + "grad_norm": 0.1962890625, + "learning_rate": 0.0017446752103987904, + "loss": 0.1123, + "step": 24355 + }, + { + "epoch": 0.21142177585264016, + "grad_norm": 0.1640625, + "learning_rate": 0.0017446544422054322, + "loss": 0.1206, + "step": 24356 + }, + { + "epoch": 0.21143045633284432, + 
"grad_norm": 0.53515625, + "learning_rate": 0.0017446336733070893, + "loss": 0.1094, + "step": 24357 + }, + { + "epoch": 0.2114391368130485, + "grad_norm": 0.1728515625, + "learning_rate": 0.001744612903703785, + "loss": 0.1147, + "step": 24358 + }, + { + "epoch": 0.21144781729325265, + "grad_norm": 0.54296875, + "learning_rate": 0.0017445921333955415, + "loss": 0.0952, + "step": 24359 + }, + { + "epoch": 0.21145649777345682, + "grad_norm": 0.2470703125, + "learning_rate": 0.0017445713623823818, + "loss": 0.1406, + "step": 24360 + }, + { + "epoch": 0.21146517825366098, + "grad_norm": 0.5390625, + "learning_rate": 0.0017445505906643287, + "loss": 0.1367, + "step": 24361 + }, + { + "epoch": 0.21147385873386515, + "grad_norm": 0.158203125, + "learning_rate": 0.0017445298182414044, + "loss": 0.1025, + "step": 24362 + }, + { + "epoch": 0.21148253921406931, + "grad_norm": 0.07568359375, + "learning_rate": 0.001744509045113632, + "loss": 0.0942, + "step": 24363 + }, + { + "epoch": 0.21149121969427348, + "grad_norm": 0.1923828125, + "learning_rate": 0.0017444882712810343, + "loss": 0.1191, + "step": 24364 + }, + { + "epoch": 0.21149990017447765, + "grad_norm": 0.51171875, + "learning_rate": 0.001744467496743634, + "loss": 0.1221, + "step": 24365 + }, + { + "epoch": 0.2115085806546818, + "grad_norm": 0.6328125, + "learning_rate": 0.0017444467215014538, + "loss": 0.126, + "step": 24366 + }, + { + "epoch": 0.21151726113488598, + "grad_norm": 0.578125, + "learning_rate": 0.0017444259455545162, + "loss": 0.0874, + "step": 24367 + }, + { + "epoch": 0.21152594161509014, + "grad_norm": 0.384765625, + "learning_rate": 0.0017444051689028445, + "loss": 0.1211, + "step": 24368 + }, + { + "epoch": 0.2115346220952943, + "grad_norm": 0.359375, + "learning_rate": 0.0017443843915464605, + "loss": 0.1021, + "step": 24369 + }, + { + "epoch": 0.21154330257549847, + "grad_norm": 0.96875, + "learning_rate": 0.0017443636134853874, + "loss": 0.1084, + "step": 24370 + }, + { + "epoch": 0.21155198305570264, + "grad_norm": 0.185546875, + "learning_rate": 0.0017443428347196479, + "loss": 0.1162, + "step": 24371 + }, + { + "epoch": 0.2115606635359068, + "grad_norm": 0.84375, + "learning_rate": 0.0017443220552492652, + "loss": 0.1089, + "step": 24372 + }, + { + "epoch": 0.21156934401611097, + "grad_norm": 0.16796875, + "learning_rate": 0.0017443012750742613, + "loss": 0.0996, + "step": 24373 + }, + { + "epoch": 0.21157802449631513, + "grad_norm": 0.58984375, + "learning_rate": 0.0017442804941946591, + "loss": 0.1074, + "step": 24374 + }, + { + "epoch": 0.2115867049765193, + "grad_norm": 0.734375, + "learning_rate": 0.0017442597126104816, + "loss": 0.1328, + "step": 24375 + }, + { + "epoch": 0.21159538545672346, + "grad_norm": 0.138671875, + "learning_rate": 0.0017442389303217512, + "loss": 0.1016, + "step": 24376 + }, + { + "epoch": 0.21160406593692763, + "grad_norm": 0.435546875, + "learning_rate": 0.0017442181473284908, + "loss": 0.1719, + "step": 24377 + }, + { + "epoch": 0.2116127464171318, + "grad_norm": 0.408203125, + "learning_rate": 0.001744197363630723, + "loss": 0.1475, + "step": 24378 + }, + { + "epoch": 0.21162142689733596, + "grad_norm": 0.6171875, + "learning_rate": 0.0017441765792284712, + "loss": 0.1113, + "step": 24379 + }, + { + "epoch": 0.21163010737754012, + "grad_norm": 0.123046875, + "learning_rate": 0.0017441557941217571, + "loss": 0.1289, + "step": 24380 + }, + { + "epoch": 0.2116387878577443, + "grad_norm": 0.1845703125, + "learning_rate": 0.0017441350083106041, + "loss": 0.1221, + "step": 24381 + }, + 
{ + "epoch": 0.21164746833794845, + "grad_norm": 0.50390625, + "learning_rate": 0.0017441142217950348, + "loss": 0.1328, + "step": 24382 + }, + { + "epoch": 0.21165614881815262, + "grad_norm": 0.09375, + "learning_rate": 0.0017440934345750714, + "loss": 0.0991, + "step": 24383 + }, + { + "epoch": 0.21166482929835678, + "grad_norm": 0.11083984375, + "learning_rate": 0.0017440726466507375, + "loss": 0.1074, + "step": 24384 + }, + { + "epoch": 0.21167350977856095, + "grad_norm": 1.0546875, + "learning_rate": 0.0017440518580220552, + "loss": 0.4121, + "step": 24385 + }, + { + "epoch": 0.2116821902587651, + "grad_norm": 0.53515625, + "learning_rate": 0.001744031068689048, + "loss": 0.0825, + "step": 24386 + }, + { + "epoch": 0.21169087073896928, + "grad_norm": 0.349609375, + "learning_rate": 0.0017440102786517374, + "loss": 0.0933, + "step": 24387 + }, + { + "epoch": 0.21169955121917344, + "grad_norm": 0.08203125, + "learning_rate": 0.0017439894879101472, + "loss": 0.0669, + "step": 24388 + }, + { + "epoch": 0.2117082316993776, + "grad_norm": 0.11376953125, + "learning_rate": 0.0017439686964642995, + "loss": 0.1167, + "step": 24389 + }, + { + "epoch": 0.21171691217958177, + "grad_norm": 0.23828125, + "learning_rate": 0.0017439479043142175, + "loss": 0.0967, + "step": 24390 + }, + { + "epoch": 0.21172559265978594, + "grad_norm": 0.8671875, + "learning_rate": 0.001743927111459924, + "loss": 0.1816, + "step": 24391 + }, + { + "epoch": 0.2117342731399901, + "grad_norm": 0.291015625, + "learning_rate": 0.0017439063179014414, + "loss": 0.1094, + "step": 24392 + }, + { + "epoch": 0.21174295362019427, + "grad_norm": 0.19921875, + "learning_rate": 0.0017438855236387922, + "loss": 0.1045, + "step": 24393 + }, + { + "epoch": 0.21175163410039843, + "grad_norm": 0.07763671875, + "learning_rate": 0.0017438647286719996, + "loss": 0.0977, + "step": 24394 + }, + { + "epoch": 0.2117603145806026, + "grad_norm": 0.2490234375, + "learning_rate": 0.0017438439330010862, + "loss": 0.0928, + "step": 24395 + }, + { + "epoch": 0.21176899506080676, + "grad_norm": 0.83203125, + "learning_rate": 0.0017438231366260749, + "loss": 0.1216, + "step": 24396 + }, + { + "epoch": 0.21177767554101093, + "grad_norm": 0.578125, + "learning_rate": 0.001743802339546988, + "loss": 0.104, + "step": 24397 + }, + { + "epoch": 0.2117863560212151, + "grad_norm": 0.09375, + "learning_rate": 0.0017437815417638486, + "loss": 0.1025, + "step": 24398 + }, + { + "epoch": 0.21179503650141926, + "grad_norm": 0.32421875, + "learning_rate": 0.0017437607432766798, + "loss": 0.1367, + "step": 24399 + }, + { + "epoch": 0.21180371698162342, + "grad_norm": 0.416015625, + "learning_rate": 0.0017437399440855038, + "loss": 0.1055, + "step": 24400 + }, + { + "epoch": 0.2118123974618276, + "grad_norm": 0.337890625, + "learning_rate": 0.001743719144190343, + "loss": 0.1104, + "step": 24401 + }, + { + "epoch": 0.21182107794203175, + "grad_norm": 0.2470703125, + "learning_rate": 0.0017436983435912211, + "loss": 0.1206, + "step": 24402 + }, + { + "epoch": 0.21182975842223592, + "grad_norm": 0.142578125, + "learning_rate": 0.00174367754228816, + "loss": 0.0918, + "step": 24403 + }, + { + "epoch": 0.21183843890244008, + "grad_norm": 0.69140625, + "learning_rate": 0.001743656740281183, + "loss": 0.1328, + "step": 24404 + }, + { + "epoch": 0.21184711938264425, + "grad_norm": 0.263671875, + "learning_rate": 0.0017436359375703125, + "loss": 0.0815, + "step": 24405 + }, + { + "epoch": 0.21185579986284842, + "grad_norm": 0.16796875, + "learning_rate": 0.0017436151341555716, 
+ "loss": 0.105, + "step": 24406 + }, + { + "epoch": 0.21186448034305258, + "grad_norm": 0.08056640625, + "learning_rate": 0.001743594330036983, + "loss": 0.0947, + "step": 24407 + }, + { + "epoch": 0.21187316082325675, + "grad_norm": 0.515625, + "learning_rate": 0.001743573525214569, + "loss": 0.1641, + "step": 24408 + }, + { + "epoch": 0.2118818413034609, + "grad_norm": 0.4921875, + "learning_rate": 0.0017435527196883527, + "loss": 0.1445, + "step": 24409 + }, + { + "epoch": 0.21189052178366508, + "grad_norm": 0.16015625, + "learning_rate": 0.0017435319134583571, + "loss": 0.1338, + "step": 24410 + }, + { + "epoch": 0.21189920226386924, + "grad_norm": 0.439453125, + "learning_rate": 0.0017435111065246043, + "loss": 0.103, + "step": 24411 + }, + { + "epoch": 0.2119078827440734, + "grad_norm": 0.1025390625, + "learning_rate": 0.0017434902988871178, + "loss": 0.1196, + "step": 24412 + }, + { + "epoch": 0.21191656322427757, + "grad_norm": 0.259765625, + "learning_rate": 0.0017434694905459197, + "loss": 0.0908, + "step": 24413 + }, + { + "epoch": 0.21192524370448174, + "grad_norm": 0.4765625, + "learning_rate": 0.001743448681501033, + "loss": 0.0869, + "step": 24414 + }, + { + "epoch": 0.2119339241846859, + "grad_norm": 0.1337890625, + "learning_rate": 0.0017434278717524806, + "loss": 0.1064, + "step": 24415 + }, + { + "epoch": 0.21194260466489007, + "grad_norm": 0.19921875, + "learning_rate": 0.001743407061300285, + "loss": 0.1787, + "step": 24416 + }, + { + "epoch": 0.21195128514509423, + "grad_norm": 0.51953125, + "learning_rate": 0.0017433862501444696, + "loss": 0.1152, + "step": 24417 + }, + { + "epoch": 0.2119599656252984, + "grad_norm": 0.18359375, + "learning_rate": 0.001743365438285056, + "loss": 0.0977, + "step": 24418 + }, + { + "epoch": 0.21196864610550256, + "grad_norm": 0.1259765625, + "learning_rate": 0.0017433446257220679, + "loss": 0.1099, + "step": 24419 + }, + { + "epoch": 0.21197732658570673, + "grad_norm": 0.279296875, + "learning_rate": 0.001743323812455528, + "loss": 0.0845, + "step": 24420 + }, + { + "epoch": 0.2119860070659109, + "grad_norm": 0.1328125, + "learning_rate": 0.0017433029984854586, + "loss": 0.0859, + "step": 24421 + }, + { + "epoch": 0.21199468754611506, + "grad_norm": 0.2080078125, + "learning_rate": 0.001743282183811883, + "loss": 0.1367, + "step": 24422 + }, + { + "epoch": 0.21200336802631922, + "grad_norm": 0.3671875, + "learning_rate": 0.001743261368434823, + "loss": 0.0967, + "step": 24423 + }, + { + "epoch": 0.2120120485065234, + "grad_norm": 0.5390625, + "learning_rate": 0.0017432405523543026, + "loss": 0.085, + "step": 24424 + }, + { + "epoch": 0.21202072898672755, + "grad_norm": 0.240234375, + "learning_rate": 0.0017432197355703438, + "loss": 0.0859, + "step": 24425 + }, + { + "epoch": 0.21202940946693172, + "grad_norm": 0.369140625, + "learning_rate": 0.0017431989180829697, + "loss": 0.123, + "step": 24426 + }, + { + "epoch": 0.21203808994713588, + "grad_norm": 0.53515625, + "learning_rate": 0.0017431780998922028, + "loss": 0.1377, + "step": 24427 + }, + { + "epoch": 0.21204677042734005, + "grad_norm": 0.19921875, + "learning_rate": 0.0017431572809980661, + "loss": 0.1289, + "step": 24428 + }, + { + "epoch": 0.2120554509075442, + "grad_norm": 0.37109375, + "learning_rate": 0.0017431364614005822, + "loss": 0.0947, + "step": 24429 + }, + { + "epoch": 0.21206413138774838, + "grad_norm": 0.28515625, + "learning_rate": 0.001743115641099774, + "loss": 0.0928, + "step": 24430 + }, + { + "epoch": 0.21207281186795254, + "grad_norm": 0.8046875, + 
"learning_rate": 0.0017430948200956643, + "loss": 0.1768, + "step": 24431 + }, + { + "epoch": 0.2120814923481567, + "grad_norm": 0.357421875, + "learning_rate": 0.0017430739983882752, + "loss": 0.0664, + "step": 24432 + }, + { + "epoch": 0.21209017282836087, + "grad_norm": 0.201171875, + "learning_rate": 0.0017430531759776307, + "loss": 0.1396, + "step": 24433 + }, + { + "epoch": 0.21209885330856504, + "grad_norm": 0.1875, + "learning_rate": 0.0017430323528637526, + "loss": 0.1416, + "step": 24434 + }, + { + "epoch": 0.2121075337887692, + "grad_norm": 0.44140625, + "learning_rate": 0.001743011529046664, + "loss": 0.0796, + "step": 24435 + }, + { + "epoch": 0.21211621426897337, + "grad_norm": 0.1552734375, + "learning_rate": 0.0017429907045263878, + "loss": 0.0996, + "step": 24436 + }, + { + "epoch": 0.21212489474917753, + "grad_norm": 0.44921875, + "learning_rate": 0.0017429698793029462, + "loss": 0.1162, + "step": 24437 + }, + { + "epoch": 0.2121335752293817, + "grad_norm": 0.087890625, + "learning_rate": 0.0017429490533763627, + "loss": 0.1221, + "step": 24438 + }, + { + "epoch": 0.21214225570958586, + "grad_norm": 0.462890625, + "learning_rate": 0.00174292822674666, + "loss": 0.0986, + "step": 24439 + }, + { + "epoch": 0.21215093618979003, + "grad_norm": 0.22265625, + "learning_rate": 0.0017429073994138604, + "loss": 0.1348, + "step": 24440 + }, + { + "epoch": 0.2121596166699942, + "grad_norm": 0.171875, + "learning_rate": 0.001742886571377987, + "loss": 0.124, + "step": 24441 + }, + { + "epoch": 0.21216829715019836, + "grad_norm": 0.447265625, + "learning_rate": 0.0017428657426390624, + "loss": 0.1514, + "step": 24442 + }, + { + "epoch": 0.21217697763040252, + "grad_norm": 0.10400390625, + "learning_rate": 0.0017428449131971096, + "loss": 0.0996, + "step": 24443 + }, + { + "epoch": 0.2121856581106067, + "grad_norm": 0.33203125, + "learning_rate": 0.0017428240830521511, + "loss": 0.1279, + "step": 24444 + }, + { + "epoch": 0.21219433859081085, + "grad_norm": 0.59375, + "learning_rate": 0.00174280325220421, + "loss": 0.1079, + "step": 24445 + }, + { + "epoch": 0.21220301907101502, + "grad_norm": 0.341796875, + "learning_rate": 0.0017427824206533089, + "loss": 0.0952, + "step": 24446 + }, + { + "epoch": 0.21221169955121919, + "grad_norm": 0.41015625, + "learning_rate": 0.0017427615883994704, + "loss": 0.1001, + "step": 24447 + }, + { + "epoch": 0.21222038003142335, + "grad_norm": 0.07275390625, + "learning_rate": 0.0017427407554427177, + "loss": 0.0835, + "step": 24448 + }, + { + "epoch": 0.21222906051162752, + "grad_norm": 0.31640625, + "learning_rate": 0.0017427199217830735, + "loss": 0.1523, + "step": 24449 + }, + { + "epoch": 0.21223774099183168, + "grad_norm": 0.79296875, + "learning_rate": 0.00174269908742056, + "loss": 0.0923, + "step": 24450 + }, + { + "epoch": 0.21224642147203585, + "grad_norm": 0.08447265625, + "learning_rate": 0.0017426782523552009, + "loss": 0.0981, + "step": 24451 + }, + { + "epoch": 0.21225510195224, + "grad_norm": 0.15625, + "learning_rate": 0.0017426574165870182, + "loss": 0.0908, + "step": 24452 + }, + { + "epoch": 0.21226378243244418, + "grad_norm": 0.10009765625, + "learning_rate": 0.0017426365801160354, + "loss": 0.1162, + "step": 24453 + }, + { + "epoch": 0.21227246291264834, + "grad_norm": 1.40625, + "learning_rate": 0.0017426157429422746, + "loss": 0.2793, + "step": 24454 + }, + { + "epoch": 0.2122811433928525, + "grad_norm": 0.2021484375, + "learning_rate": 0.0017425949050657588, + "loss": 0.0986, + "step": 24455 + }, + { + "epoch": 
0.21228982387305667, + "grad_norm": 0.146484375, + "learning_rate": 0.001742574066486511, + "loss": 0.1104, + "step": 24456 + }, + { + "epoch": 0.21229850435326084, + "grad_norm": 0.1279296875, + "learning_rate": 0.0017425532272045538, + "loss": 0.0859, + "step": 24457 + }, + { + "epoch": 0.21230718483346497, + "grad_norm": 0.255859375, + "learning_rate": 0.0017425323872199102, + "loss": 0.1138, + "step": 24458 + }, + { + "epoch": 0.21231586531366914, + "grad_norm": 0.06982421875, + "learning_rate": 0.0017425115465326028, + "loss": 0.0889, + "step": 24459 + }, + { + "epoch": 0.2123245457938733, + "grad_norm": 0.50390625, + "learning_rate": 0.001742490705142654, + "loss": 0.1543, + "step": 24460 + }, + { + "epoch": 0.21233322627407747, + "grad_norm": 0.53515625, + "learning_rate": 0.0017424698630500873, + "loss": 0.0889, + "step": 24461 + }, + { + "epoch": 0.21234190675428163, + "grad_norm": 0.236328125, + "learning_rate": 0.0017424490202549254, + "loss": 0.1279, + "step": 24462 + }, + { + "epoch": 0.2123505872344858, + "grad_norm": 0.1171875, + "learning_rate": 0.0017424281767571908, + "loss": 0.104, + "step": 24463 + }, + { + "epoch": 0.21235926771468996, + "grad_norm": 0.404296875, + "learning_rate": 0.0017424073325569063, + "loss": 0.0972, + "step": 24464 + }, + { + "epoch": 0.21236794819489413, + "grad_norm": 0.10595703125, + "learning_rate": 0.001742386487654095, + "loss": 0.1279, + "step": 24465 + }, + { + "epoch": 0.2123766286750983, + "grad_norm": 0.3828125, + "learning_rate": 0.0017423656420487792, + "loss": 0.0957, + "step": 24466 + }, + { + "epoch": 0.21238530915530246, + "grad_norm": 0.462890625, + "learning_rate": 0.001742344795740982, + "loss": 0.252, + "step": 24467 + }, + { + "epoch": 0.21239398963550662, + "grad_norm": 0.796875, + "learning_rate": 0.0017423239487307266, + "loss": 0.1084, + "step": 24468 + }, + { + "epoch": 0.2124026701157108, + "grad_norm": 0.283203125, + "learning_rate": 0.001742303101018035, + "loss": 0.0879, + "step": 24469 + }, + { + "epoch": 0.21241135059591496, + "grad_norm": 0.154296875, + "learning_rate": 0.0017422822526029305, + "loss": 0.0928, + "step": 24470 + }, + { + "epoch": 0.21242003107611912, + "grad_norm": 0.4453125, + "learning_rate": 0.0017422614034854355, + "loss": 0.125, + "step": 24471 + }, + { + "epoch": 0.21242871155632329, + "grad_norm": 0.12353515625, + "learning_rate": 0.0017422405536655734, + "loss": 0.1226, + "step": 24472 + }, + { + "epoch": 0.21243739203652745, + "grad_norm": 0.2177734375, + "learning_rate": 0.0017422197031433666, + "loss": 0.124, + "step": 24473 + }, + { + "epoch": 0.21244607251673162, + "grad_norm": 0.1826171875, + "learning_rate": 0.0017421988519188381, + "loss": 0.127, + "step": 24474 + }, + { + "epoch": 0.21245475299693578, + "grad_norm": 0.1357421875, + "learning_rate": 0.0017421779999920103, + "loss": 0.1108, + "step": 24475 + }, + { + "epoch": 0.21246343347713995, + "grad_norm": 0.8359375, + "learning_rate": 0.0017421571473629066, + "loss": 0.1318, + "step": 24476 + }, + { + "epoch": 0.2124721139573441, + "grad_norm": 0.1376953125, + "learning_rate": 0.0017421362940315496, + "loss": 0.166, + "step": 24477 + }, + { + "epoch": 0.21248079443754828, + "grad_norm": 0.2060546875, + "learning_rate": 0.0017421154399979617, + "loss": 0.1729, + "step": 24478 + }, + { + "epoch": 0.21248947491775244, + "grad_norm": 0.1748046875, + "learning_rate": 0.001742094585262166, + "loss": 0.1182, + "step": 24479 + }, + { + "epoch": 0.2124981553979566, + "grad_norm": 0.259765625, + "learning_rate": 0.0017420737298241853, + 
"loss": 0.1484, + "step": 24480 + }, + { + "epoch": 0.21250683587816077, + "grad_norm": 0.1591796875, + "learning_rate": 0.0017420528736840424, + "loss": 0.1094, + "step": 24481 + }, + { + "epoch": 0.21251551635836494, + "grad_norm": 0.09130859375, + "learning_rate": 0.0017420320168417603, + "loss": 0.0894, + "step": 24482 + }, + { + "epoch": 0.2125241968385691, + "grad_norm": 0.43359375, + "learning_rate": 0.0017420111592973614, + "loss": 0.1172, + "step": 24483 + }, + { + "epoch": 0.21253287731877327, + "grad_norm": 0.345703125, + "learning_rate": 0.001741990301050869, + "loss": 0.1025, + "step": 24484 + }, + { + "epoch": 0.21254155779897743, + "grad_norm": 0.703125, + "learning_rate": 0.0017419694421023054, + "loss": 0.1377, + "step": 24485 + }, + { + "epoch": 0.2125502382791816, + "grad_norm": 0.1904296875, + "learning_rate": 0.001741948582451694, + "loss": 0.0786, + "step": 24486 + }, + { + "epoch": 0.21255891875938576, + "grad_norm": 0.1328125, + "learning_rate": 0.001741927722099057, + "loss": 0.1182, + "step": 24487 + }, + { + "epoch": 0.21256759923958993, + "grad_norm": 1.359375, + "learning_rate": 0.0017419068610444175, + "loss": 0.3262, + "step": 24488 + }, + { + "epoch": 0.2125762797197941, + "grad_norm": 0.115234375, + "learning_rate": 0.0017418859992877984, + "loss": 0.1084, + "step": 24489 + }, + { + "epoch": 0.21258496019999826, + "grad_norm": 0.173828125, + "learning_rate": 0.0017418651368292224, + "loss": 0.0967, + "step": 24490 + }, + { + "epoch": 0.21259364068020242, + "grad_norm": 0.11474609375, + "learning_rate": 0.0017418442736687122, + "loss": 0.1084, + "step": 24491 + }, + { + "epoch": 0.2126023211604066, + "grad_norm": 0.33984375, + "learning_rate": 0.001741823409806291, + "loss": 0.1055, + "step": 24492 + }, + { + "epoch": 0.21261100164061075, + "grad_norm": 0.1494140625, + "learning_rate": 0.0017418025452419813, + "loss": 0.1309, + "step": 24493 + }, + { + "epoch": 0.21261968212081492, + "grad_norm": 0.0830078125, + "learning_rate": 0.0017417816799758059, + "loss": 0.0918, + "step": 24494 + }, + { + "epoch": 0.21262836260101908, + "grad_norm": 0.140625, + "learning_rate": 0.0017417608140077876, + "loss": 0.1045, + "step": 24495 + }, + { + "epoch": 0.21263704308122325, + "grad_norm": 0.474609375, + "learning_rate": 0.0017417399473379494, + "loss": 0.0889, + "step": 24496 + }, + { + "epoch": 0.2126457235614274, + "grad_norm": 0.67578125, + "learning_rate": 0.0017417190799663142, + "loss": 0.0977, + "step": 24497 + }, + { + "epoch": 0.21265440404163158, + "grad_norm": 1.265625, + "learning_rate": 0.0017416982118929046, + "loss": 0.166, + "step": 24498 + }, + { + "epoch": 0.21266308452183574, + "grad_norm": 0.376953125, + "learning_rate": 0.001741677343117743, + "loss": 0.084, + "step": 24499 + }, + { + "epoch": 0.2126717650020399, + "grad_norm": 0.337890625, + "learning_rate": 0.0017416564736408532, + "loss": 0.0776, + "step": 24500 + }, + { + "epoch": 0.21268044548224407, + "grad_norm": 0.3515625, + "learning_rate": 0.0017416356034622574, + "loss": 0.1123, + "step": 24501 + }, + { + "epoch": 0.21268912596244824, + "grad_norm": 0.220703125, + "learning_rate": 0.0017416147325819786, + "loss": 0.0747, + "step": 24502 + }, + { + "epoch": 0.2126978064426524, + "grad_norm": 0.322265625, + "learning_rate": 0.0017415938610000395, + "loss": 0.0981, + "step": 24503 + }, + { + "epoch": 0.21270648692285657, + "grad_norm": 0.2353515625, + "learning_rate": 0.001741572988716463, + "loss": 0.0972, + "step": 24504 + }, + { + "epoch": 0.21271516740306073, + "grad_norm": 0.296875, + 
"learning_rate": 0.001741552115731272, + "loss": 0.0864, + "step": 24505 + }, + { + "epoch": 0.2127238478832649, + "grad_norm": 1.5078125, + "learning_rate": 0.001741531242044489, + "loss": 0.1387, + "step": 24506 + }, + { + "epoch": 0.21273252836346906, + "grad_norm": 0.353515625, + "learning_rate": 0.0017415103676561372, + "loss": 0.1055, + "step": 24507 + }, + { + "epoch": 0.21274120884367323, + "grad_norm": 0.1796875, + "learning_rate": 0.0017414894925662392, + "loss": 0.0908, + "step": 24508 + }, + { + "epoch": 0.2127498893238774, + "grad_norm": 0.10693359375, + "learning_rate": 0.0017414686167748182, + "loss": 0.1094, + "step": 24509 + }, + { + "epoch": 0.21275856980408156, + "grad_norm": 1.0, + "learning_rate": 0.001741447740281897, + "loss": 0.1162, + "step": 24510 + }, + { + "epoch": 0.21276725028428572, + "grad_norm": 0.46484375, + "learning_rate": 0.0017414268630874976, + "loss": 0.1592, + "step": 24511 + }, + { + "epoch": 0.2127759307644899, + "grad_norm": 0.220703125, + "learning_rate": 0.0017414059851916437, + "loss": 0.1426, + "step": 24512 + }, + { + "epoch": 0.21278461124469406, + "grad_norm": 0.349609375, + "learning_rate": 0.0017413851065943577, + "loss": 0.1226, + "step": 24513 + }, + { + "epoch": 0.21279329172489822, + "grad_norm": 0.388671875, + "learning_rate": 0.0017413642272956628, + "loss": 0.1035, + "step": 24514 + }, + { + "epoch": 0.21280197220510239, + "grad_norm": 0.08837890625, + "learning_rate": 0.0017413433472955812, + "loss": 0.0791, + "step": 24515 + }, + { + "epoch": 0.21281065268530655, + "grad_norm": 0.1806640625, + "learning_rate": 0.0017413224665941366, + "loss": 0.126, + "step": 24516 + }, + { + "epoch": 0.21281933316551072, + "grad_norm": 0.546875, + "learning_rate": 0.0017413015851913511, + "loss": 0.0879, + "step": 24517 + }, + { + "epoch": 0.21282801364571488, + "grad_norm": 0.072265625, + "learning_rate": 0.0017412807030872481, + "loss": 0.0801, + "step": 24518 + }, + { + "epoch": 0.21283669412591905, + "grad_norm": 0.0966796875, + "learning_rate": 0.0017412598202818502, + "loss": 0.0767, + "step": 24519 + }, + { + "epoch": 0.2128453746061232, + "grad_norm": 0.314453125, + "learning_rate": 0.0017412389367751798, + "loss": 0.1328, + "step": 24520 + }, + { + "epoch": 0.21285405508632738, + "grad_norm": 0.0859375, + "learning_rate": 0.0017412180525672603, + "loss": 0.0811, + "step": 24521 + }, + { + "epoch": 0.21286273556653154, + "grad_norm": 0.0947265625, + "learning_rate": 0.0017411971676581147, + "loss": 0.125, + "step": 24522 + }, + { + "epoch": 0.2128714160467357, + "grad_norm": 0.22265625, + "learning_rate": 0.001741176282047765, + "loss": 0.1084, + "step": 24523 + }, + { + "epoch": 0.21288009652693987, + "grad_norm": 0.259765625, + "learning_rate": 0.0017411553957362347, + "loss": 0.1143, + "step": 24524 + }, + { + "epoch": 0.21288877700714404, + "grad_norm": 0.07958984375, + "learning_rate": 0.0017411345087235467, + "loss": 0.0952, + "step": 24525 + }, + { + "epoch": 0.2128974574873482, + "grad_norm": 0.201171875, + "learning_rate": 0.0017411136210097235, + "loss": 0.0933, + "step": 24526 + }, + { + "epoch": 0.21290613796755237, + "grad_norm": 0.1240234375, + "learning_rate": 0.001741092732594788, + "loss": 0.0688, + "step": 24527 + }, + { + "epoch": 0.21291481844775653, + "grad_norm": 0.50390625, + "learning_rate": 0.001741071843478763, + "loss": 0.1045, + "step": 24528 + }, + { + "epoch": 0.2129234989279607, + "grad_norm": 0.1494140625, + "learning_rate": 0.0017410509536616718, + "loss": 0.1245, + "step": 24529 + }, + { + "epoch": 
0.21293217940816486, + "grad_norm": 0.76953125, + "learning_rate": 0.001741030063143537, + "loss": 0.1533, + "step": 24530 + }, + { + "epoch": 0.21294085988836903, + "grad_norm": 0.482421875, + "learning_rate": 0.001741009171924381, + "loss": 0.1191, + "step": 24531 + }, + { + "epoch": 0.2129495403685732, + "grad_norm": 0.130859375, + "learning_rate": 0.0017409882800042272, + "loss": 0.0991, + "step": 24532 + }, + { + "epoch": 0.21295822084877736, + "grad_norm": 0.16015625, + "learning_rate": 0.001740967387383098, + "loss": 0.0991, + "step": 24533 + }, + { + "epoch": 0.21296690132898152, + "grad_norm": 0.267578125, + "learning_rate": 0.0017409464940610169, + "loss": 0.1147, + "step": 24534 + }, + { + "epoch": 0.2129755818091857, + "grad_norm": 0.60546875, + "learning_rate": 0.0017409256000380058, + "loss": 0.1201, + "step": 24535 + }, + { + "epoch": 0.21298426228938985, + "grad_norm": 0.8203125, + "learning_rate": 0.0017409047053140885, + "loss": 0.1494, + "step": 24536 + }, + { + "epoch": 0.21299294276959402, + "grad_norm": 0.0888671875, + "learning_rate": 0.0017408838098892877, + "loss": 0.1045, + "step": 24537 + }, + { + "epoch": 0.21300162324979818, + "grad_norm": 0.35546875, + "learning_rate": 0.0017408629137636255, + "loss": 0.1172, + "step": 24538 + }, + { + "epoch": 0.21301030373000235, + "grad_norm": 0.2734375, + "learning_rate": 0.0017408420169371254, + "loss": 0.1094, + "step": 24539 + }, + { + "epoch": 0.2130189842102065, + "grad_norm": 0.1669921875, + "learning_rate": 0.00174082111940981, + "loss": 0.0977, + "step": 24540 + }, + { + "epoch": 0.21302766469041068, + "grad_norm": 0.376953125, + "learning_rate": 0.0017408002211817025, + "loss": 0.0811, + "step": 24541 + }, + { + "epoch": 0.21303634517061484, + "grad_norm": 0.224609375, + "learning_rate": 0.001740779322252825, + "loss": 0.0962, + "step": 24542 + }, + { + "epoch": 0.213045025650819, + "grad_norm": 0.6171875, + "learning_rate": 0.0017407584226232014, + "loss": 0.125, + "step": 24543 + }, + { + "epoch": 0.21305370613102317, + "grad_norm": 0.09716796875, + "learning_rate": 0.0017407375222928537, + "loss": 0.123, + "step": 24544 + }, + { + "epoch": 0.21306238661122734, + "grad_norm": 0.14453125, + "learning_rate": 0.0017407166212618052, + "loss": 0.1309, + "step": 24545 + }, + { + "epoch": 0.2130710670914315, + "grad_norm": 0.361328125, + "learning_rate": 0.0017406957195300785, + "loss": 0.0928, + "step": 24546 + }, + { + "epoch": 0.21307974757163567, + "grad_norm": 0.8515625, + "learning_rate": 0.001740674817097697, + "loss": 0.2422, + "step": 24547 + }, + { + "epoch": 0.21308842805183983, + "grad_norm": 0.1845703125, + "learning_rate": 0.0017406539139646828, + "loss": 0.1006, + "step": 24548 + }, + { + "epoch": 0.213097108532044, + "grad_norm": 0.267578125, + "learning_rate": 0.001740633010131059, + "loss": 0.1699, + "step": 24549 + }, + { + "epoch": 0.21310578901224816, + "grad_norm": 0.24609375, + "learning_rate": 0.001740612105596849, + "loss": 0.1211, + "step": 24550 + }, + { + "epoch": 0.21311446949245233, + "grad_norm": 0.6796875, + "learning_rate": 0.001740591200362075, + "loss": 0.0874, + "step": 24551 + }, + { + "epoch": 0.2131231499726565, + "grad_norm": 0.373046875, + "learning_rate": 0.0017405702944267598, + "loss": 0.0972, + "step": 24552 + }, + { + "epoch": 0.21313183045286066, + "grad_norm": 0.12158203125, + "learning_rate": 0.0017405493877909268, + "loss": 0.1338, + "step": 24553 + }, + { + "epoch": 0.21314051093306483, + "grad_norm": 0.07568359375, + "learning_rate": 0.0017405284804545987, + "loss": 
0.0879, + "step": 24554 + }, + { + "epoch": 0.213149191413269, + "grad_norm": 0.40234375, + "learning_rate": 0.0017405075724177984, + "loss": 0.1064, + "step": 24555 + }, + { + "epoch": 0.21315787189347316, + "grad_norm": 0.2021484375, + "learning_rate": 0.0017404866636805483, + "loss": 0.0781, + "step": 24556 + }, + { + "epoch": 0.21316655237367732, + "grad_norm": 0.7265625, + "learning_rate": 0.0017404657542428718, + "loss": 0.1348, + "step": 24557 + }, + { + "epoch": 0.21317523285388149, + "grad_norm": 0.1533203125, + "learning_rate": 0.0017404448441047913, + "loss": 0.1348, + "step": 24558 + }, + { + "epoch": 0.21318391333408565, + "grad_norm": 0.244140625, + "learning_rate": 0.0017404239332663304, + "loss": 0.0913, + "step": 24559 + }, + { + "epoch": 0.21319259381428982, + "grad_norm": 0.83203125, + "learning_rate": 0.001740403021727511, + "loss": 0.124, + "step": 24560 + }, + { + "epoch": 0.21320127429449398, + "grad_norm": 0.072265625, + "learning_rate": 0.001740382109488357, + "loss": 0.1035, + "step": 24561 + }, + { + "epoch": 0.21320995477469815, + "grad_norm": 0.76953125, + "learning_rate": 0.0017403611965488904, + "loss": 0.1289, + "step": 24562 + }, + { + "epoch": 0.2132186352549023, + "grad_norm": 0.466796875, + "learning_rate": 0.0017403402829091346, + "loss": 0.1045, + "step": 24563 + }, + { + "epoch": 0.21322731573510648, + "grad_norm": 0.51953125, + "learning_rate": 0.001740319368569112, + "loss": 0.1387, + "step": 24564 + }, + { + "epoch": 0.21323599621531064, + "grad_norm": 0.3203125, + "learning_rate": 0.001740298453528846, + "loss": 0.1641, + "step": 24565 + }, + { + "epoch": 0.2132446766955148, + "grad_norm": 0.31640625, + "learning_rate": 0.0017402775377883593, + "loss": 0.1128, + "step": 24566 + }, + { + "epoch": 0.21325335717571897, + "grad_norm": 0.1357421875, + "learning_rate": 0.0017402566213476742, + "loss": 0.1172, + "step": 24567 + }, + { + "epoch": 0.21326203765592314, + "grad_norm": 0.310546875, + "learning_rate": 0.0017402357042068147, + "loss": 0.1328, + "step": 24568 + }, + { + "epoch": 0.2132707181361273, + "grad_norm": 0.09228515625, + "learning_rate": 0.001740214786365803, + "loss": 0.125, + "step": 24569 + }, + { + "epoch": 0.21327939861633147, + "grad_norm": 1.046875, + "learning_rate": 0.0017401938678246619, + "loss": 0.1738, + "step": 24570 + }, + { + "epoch": 0.21328807909653563, + "grad_norm": 0.1416015625, + "learning_rate": 0.0017401729485834144, + "loss": 0.1025, + "step": 24571 + }, + { + "epoch": 0.2132967595767398, + "grad_norm": 1.859375, + "learning_rate": 0.001740152028642083, + "loss": 0.1445, + "step": 24572 + }, + { + "epoch": 0.21330544005694396, + "grad_norm": 0.5078125, + "learning_rate": 0.0017401311080006913, + "loss": 0.1387, + "step": 24573 + }, + { + "epoch": 0.21331412053714813, + "grad_norm": 0.67578125, + "learning_rate": 0.0017401101866592618, + "loss": 0.0864, + "step": 24574 + }, + { + "epoch": 0.2133228010173523, + "grad_norm": 0.59375, + "learning_rate": 0.0017400892646178177, + "loss": 0.1191, + "step": 24575 + }, + { + "epoch": 0.21333148149755646, + "grad_norm": 0.279296875, + "learning_rate": 0.0017400683418763812, + "loss": 0.1348, + "step": 24576 + }, + { + "epoch": 0.21334016197776062, + "grad_norm": 0.291015625, + "learning_rate": 0.0017400474184349757, + "loss": 0.1396, + "step": 24577 + }, + { + "epoch": 0.2133488424579648, + "grad_norm": 0.08447265625, + "learning_rate": 0.0017400264942936238, + "loss": 0.1074, + "step": 24578 + }, + { + "epoch": 0.21335752293816895, + "grad_norm": 0.240234375, + 
"learning_rate": 0.0017400055694523492, + "loss": 0.1279, + "step": 24579 + }, + { + "epoch": 0.21336620341837312, + "grad_norm": 0.396484375, + "learning_rate": 0.0017399846439111735, + "loss": 0.1206, + "step": 24580 + }, + { + "epoch": 0.21337488389857726, + "grad_norm": 0.400390625, + "learning_rate": 0.0017399637176701201, + "loss": 0.1016, + "step": 24581 + }, + { + "epoch": 0.21338356437878142, + "grad_norm": 0.30078125, + "learning_rate": 0.0017399427907292123, + "loss": 0.0859, + "step": 24582 + }, + { + "epoch": 0.21339224485898559, + "grad_norm": 0.10986328125, + "learning_rate": 0.0017399218630884728, + "loss": 0.0786, + "step": 24583 + }, + { + "epoch": 0.21340092533918975, + "grad_norm": 0.2265625, + "learning_rate": 0.0017399009347479242, + "loss": 0.1299, + "step": 24584 + }, + { + "epoch": 0.21340960581939392, + "grad_norm": 0.34375, + "learning_rate": 0.0017398800057075896, + "loss": 0.0991, + "step": 24585 + }, + { + "epoch": 0.21341828629959808, + "grad_norm": 0.392578125, + "learning_rate": 0.0017398590759674917, + "loss": 0.126, + "step": 24586 + }, + { + "epoch": 0.21342696677980225, + "grad_norm": 0.609375, + "learning_rate": 0.0017398381455276536, + "loss": 0.1113, + "step": 24587 + }, + { + "epoch": 0.2134356472600064, + "grad_norm": 0.353515625, + "learning_rate": 0.001739817214388098, + "loss": 0.1523, + "step": 24588 + }, + { + "epoch": 0.21344432774021058, + "grad_norm": 0.07763671875, + "learning_rate": 0.0017397962825488483, + "loss": 0.0898, + "step": 24589 + }, + { + "epoch": 0.21345300822041474, + "grad_norm": 0.1376953125, + "learning_rate": 0.0017397753500099267, + "loss": 0.1143, + "step": 24590 + }, + { + "epoch": 0.2134616887006189, + "grad_norm": 0.1962890625, + "learning_rate": 0.0017397544167713564, + "loss": 0.0771, + "step": 24591 + }, + { + "epoch": 0.21347036918082307, + "grad_norm": 0.298828125, + "learning_rate": 0.0017397334828331604, + "loss": 0.0991, + "step": 24592 + }, + { + "epoch": 0.21347904966102724, + "grad_norm": 0.0830078125, + "learning_rate": 0.0017397125481953613, + "loss": 0.1406, + "step": 24593 + }, + { + "epoch": 0.2134877301412314, + "grad_norm": 0.0703125, + "learning_rate": 0.0017396916128579825, + "loss": 0.0957, + "step": 24594 + }, + { + "epoch": 0.21349641062143557, + "grad_norm": 0.0986328125, + "learning_rate": 0.0017396706768210463, + "loss": 0.127, + "step": 24595 + }, + { + "epoch": 0.21350509110163973, + "grad_norm": 0.578125, + "learning_rate": 0.0017396497400845757, + "loss": 0.1289, + "step": 24596 + }, + { + "epoch": 0.2135137715818439, + "grad_norm": 0.328125, + "learning_rate": 0.0017396288026485939, + "loss": 0.0732, + "step": 24597 + }, + { + "epoch": 0.21352245206204806, + "grad_norm": 0.65625, + "learning_rate": 0.0017396078645131239, + "loss": 0.1553, + "step": 24598 + }, + { + "epoch": 0.21353113254225223, + "grad_norm": 0.11376953125, + "learning_rate": 0.001739586925678188, + "loss": 0.1221, + "step": 24599 + }, + { + "epoch": 0.2135398130224564, + "grad_norm": 0.58984375, + "learning_rate": 0.0017395659861438096, + "loss": 0.1084, + "step": 24600 + }, + { + "epoch": 0.21354849350266056, + "grad_norm": 0.42578125, + "learning_rate": 0.0017395450459100114, + "loss": 0.1357, + "step": 24601 + }, + { + "epoch": 0.21355717398286472, + "grad_norm": 0.42578125, + "learning_rate": 0.0017395241049768162, + "loss": 0.083, + "step": 24602 + }, + { + "epoch": 0.2135658544630689, + "grad_norm": 0.09716796875, + "learning_rate": 0.0017395031633442475, + "loss": 0.1143, + "step": 24603 + }, + { + "epoch": 
0.21357453494327305, + "grad_norm": 0.2236328125, + "learning_rate": 0.0017394822210123272, + "loss": 0.1406, + "step": 24604 + }, + { + "epoch": 0.21358321542347722, + "grad_norm": 0.08251953125, + "learning_rate": 0.0017394612779810792, + "loss": 0.1128, + "step": 24605 + }, + { + "epoch": 0.21359189590368138, + "grad_norm": 0.400390625, + "learning_rate": 0.0017394403342505258, + "loss": 0.1104, + "step": 24606 + }, + { + "epoch": 0.21360057638388555, + "grad_norm": 0.259765625, + "learning_rate": 0.0017394193898206901, + "loss": 0.0889, + "step": 24607 + }, + { + "epoch": 0.2136092568640897, + "grad_norm": 0.169921875, + "learning_rate": 0.0017393984446915948, + "loss": 0.1299, + "step": 24608 + }, + { + "epoch": 0.21361793734429388, + "grad_norm": 0.119140625, + "learning_rate": 0.001739377498863263, + "loss": 0.1152, + "step": 24609 + }, + { + "epoch": 0.21362661782449804, + "grad_norm": 0.2216796875, + "learning_rate": 0.001739356552335718, + "loss": 0.0811, + "step": 24610 + }, + { + "epoch": 0.2136352983047022, + "grad_norm": 0.30078125, + "learning_rate": 0.0017393356051089816, + "loss": 0.1245, + "step": 24611 + }, + { + "epoch": 0.21364397878490637, + "grad_norm": 0.36328125, + "learning_rate": 0.0017393146571830778, + "loss": 0.0781, + "step": 24612 + }, + { + "epoch": 0.21365265926511054, + "grad_norm": 0.1953125, + "learning_rate": 0.001739293708558029, + "loss": 0.0991, + "step": 24613 + }, + { + "epoch": 0.2136613397453147, + "grad_norm": 0.26953125, + "learning_rate": 0.0017392727592338583, + "loss": 0.1201, + "step": 24614 + }, + { + "epoch": 0.21367002022551887, + "grad_norm": 0.416015625, + "learning_rate": 0.0017392518092105885, + "loss": 0.1396, + "step": 24615 + }, + { + "epoch": 0.21367870070572303, + "grad_norm": 0.3125, + "learning_rate": 0.0017392308584882423, + "loss": 0.1074, + "step": 24616 + }, + { + "epoch": 0.2136873811859272, + "grad_norm": 0.33984375, + "learning_rate": 0.001739209907066843, + "loss": 0.1133, + "step": 24617 + }, + { + "epoch": 0.21369606166613136, + "grad_norm": 0.2333984375, + "learning_rate": 0.0017391889549464134, + "loss": 0.0903, + "step": 24618 + }, + { + "epoch": 0.21370474214633553, + "grad_norm": 0.115234375, + "learning_rate": 0.0017391680021269763, + "loss": 0.0806, + "step": 24619 + }, + { + "epoch": 0.2137134226265397, + "grad_norm": 0.56640625, + "learning_rate": 0.0017391470486085548, + "loss": 0.1094, + "step": 24620 + }, + { + "epoch": 0.21372210310674386, + "grad_norm": 0.421875, + "learning_rate": 0.0017391260943911714, + "loss": 0.1113, + "step": 24621 + }, + { + "epoch": 0.21373078358694803, + "grad_norm": 0.1396484375, + "learning_rate": 0.0017391051394748495, + "loss": 0.1108, + "step": 24622 + }, + { + "epoch": 0.2137394640671522, + "grad_norm": 0.7109375, + "learning_rate": 0.001739084183859612, + "loss": 0.2539, + "step": 24623 + }, + { + "epoch": 0.21374814454735636, + "grad_norm": 0.263671875, + "learning_rate": 0.0017390632275454813, + "loss": 0.0977, + "step": 24624 + }, + { + "epoch": 0.21375682502756052, + "grad_norm": 0.271484375, + "learning_rate": 0.0017390422705324806, + "loss": 0.1045, + "step": 24625 + }, + { + "epoch": 0.21376550550776469, + "grad_norm": 0.0947265625, + "learning_rate": 0.0017390213128206334, + "loss": 0.0854, + "step": 24626 + }, + { + "epoch": 0.21377418598796885, + "grad_norm": 0.251953125, + "learning_rate": 0.0017390003544099618, + "loss": 0.1523, + "step": 24627 + }, + { + "epoch": 0.21378286646817302, + "grad_norm": 0.47265625, + "learning_rate": 0.001738979395300489, + 
"loss": 0.0806, + "step": 24628 + }, + { + "epoch": 0.21379154694837718, + "grad_norm": 0.08544921875, + "learning_rate": 0.0017389584354922378, + "loss": 0.0898, + "step": 24629 + }, + { + "epoch": 0.21380022742858135, + "grad_norm": 0.294921875, + "learning_rate": 0.0017389374749852313, + "loss": 0.0977, + "step": 24630 + }, + { + "epoch": 0.2138089079087855, + "grad_norm": 0.36328125, + "learning_rate": 0.0017389165137794925, + "loss": 0.1318, + "step": 24631 + }, + { + "epoch": 0.21381758838898968, + "grad_norm": 0.255859375, + "learning_rate": 0.0017388955518750442, + "loss": 0.1206, + "step": 24632 + }, + { + "epoch": 0.21382626886919384, + "grad_norm": 0.275390625, + "learning_rate": 0.0017388745892719093, + "loss": 0.0933, + "step": 24633 + }, + { + "epoch": 0.213834949349398, + "grad_norm": 0.1865234375, + "learning_rate": 0.0017388536259701107, + "loss": 0.1348, + "step": 24634 + }, + { + "epoch": 0.21384362982960217, + "grad_norm": 0.419921875, + "learning_rate": 0.0017388326619696712, + "loss": 0.0879, + "step": 24635 + }, + { + "epoch": 0.21385231030980634, + "grad_norm": 0.1748046875, + "learning_rate": 0.0017388116972706143, + "loss": 0.1079, + "step": 24636 + }, + { + "epoch": 0.2138609907900105, + "grad_norm": 0.0859375, + "learning_rate": 0.0017387907318729622, + "loss": 0.0708, + "step": 24637 + }, + { + "epoch": 0.21386967127021467, + "grad_norm": 0.76171875, + "learning_rate": 0.0017387697657767383, + "loss": 0.1045, + "step": 24638 + }, + { + "epoch": 0.21387835175041883, + "grad_norm": 0.255859375, + "learning_rate": 0.0017387487989819653, + "loss": 0.0859, + "step": 24639 + }, + { + "epoch": 0.213887032230623, + "grad_norm": 0.51171875, + "learning_rate": 0.0017387278314886661, + "loss": 0.0957, + "step": 24640 + }, + { + "epoch": 0.21389571271082716, + "grad_norm": 0.2177734375, + "learning_rate": 0.0017387068632968641, + "loss": 0.1035, + "step": 24641 + }, + { + "epoch": 0.21390439319103133, + "grad_norm": 0.1796875, + "learning_rate": 0.0017386858944065816, + "loss": 0.1016, + "step": 24642 + }, + { + "epoch": 0.2139130736712355, + "grad_norm": 1.0, + "learning_rate": 0.0017386649248178419, + "loss": 0.1318, + "step": 24643 + }, + { + "epoch": 0.21392175415143966, + "grad_norm": 0.64453125, + "learning_rate": 0.0017386439545306678, + "loss": 0.1099, + "step": 24644 + }, + { + "epoch": 0.21393043463164382, + "grad_norm": 0.390625, + "learning_rate": 0.0017386229835450822, + "loss": 0.124, + "step": 24645 + }, + { + "epoch": 0.213939115111848, + "grad_norm": 0.1572265625, + "learning_rate": 0.0017386020118611083, + "loss": 0.124, + "step": 24646 + }, + { + "epoch": 0.21394779559205215, + "grad_norm": 0.08642578125, + "learning_rate": 0.0017385810394787687, + "loss": 0.1133, + "step": 24647 + }, + { + "epoch": 0.21395647607225632, + "grad_norm": 0.46484375, + "learning_rate": 0.0017385600663980866, + "loss": 0.0898, + "step": 24648 + }, + { + "epoch": 0.21396515655246048, + "grad_norm": 0.578125, + "learning_rate": 0.0017385390926190846, + "loss": 0.1187, + "step": 24649 + }, + { + "epoch": 0.21397383703266465, + "grad_norm": 0.1787109375, + "learning_rate": 0.001738518118141786, + "loss": 0.1094, + "step": 24650 + }, + { + "epoch": 0.21398251751286881, + "grad_norm": 0.1630859375, + "learning_rate": 0.0017384971429662136, + "loss": 0.0859, + "step": 24651 + }, + { + "epoch": 0.21399119799307298, + "grad_norm": 0.34375, + "learning_rate": 0.0017384761670923901, + "loss": 0.0918, + "step": 24652 + }, + { + "epoch": 0.21399987847327714, + "grad_norm": 0.45703125, + 
"learning_rate": 0.0017384551905203387, + "loss": 0.0913, + "step": 24653 + }, + { + "epoch": 0.2140085589534813, + "grad_norm": 0.1357421875, + "learning_rate": 0.0017384342132500824, + "loss": 0.1191, + "step": 24654 + }, + { + "epoch": 0.21401723943368547, + "grad_norm": 0.48046875, + "learning_rate": 0.001738413235281644, + "loss": 0.1182, + "step": 24655 + }, + { + "epoch": 0.21402591991388964, + "grad_norm": 0.09912109375, + "learning_rate": 0.0017383922566150465, + "loss": 0.0977, + "step": 24656 + }, + { + "epoch": 0.2140346003940938, + "grad_norm": 0.07763671875, + "learning_rate": 0.001738371277250313, + "loss": 0.0889, + "step": 24657 + }, + { + "epoch": 0.21404328087429797, + "grad_norm": 0.111328125, + "learning_rate": 0.001738350297187466, + "loss": 0.0933, + "step": 24658 + }, + { + "epoch": 0.21405196135450213, + "grad_norm": 0.48046875, + "learning_rate": 0.001738329316426529, + "loss": 0.1836, + "step": 24659 + }, + { + "epoch": 0.2140606418347063, + "grad_norm": 0.2333984375, + "learning_rate": 0.0017383083349675242, + "loss": 0.0781, + "step": 24660 + }, + { + "epoch": 0.21406932231491047, + "grad_norm": 0.9609375, + "learning_rate": 0.0017382873528104755, + "loss": 0.1211, + "step": 24661 + }, + { + "epoch": 0.21407800279511463, + "grad_norm": 0.287109375, + "learning_rate": 0.0017382663699554047, + "loss": 0.085, + "step": 24662 + }, + { + "epoch": 0.2140866832753188, + "grad_norm": 0.439453125, + "learning_rate": 0.0017382453864023357, + "loss": 0.1279, + "step": 24663 + }, + { + "epoch": 0.21409536375552296, + "grad_norm": 0.1494140625, + "learning_rate": 0.0017382244021512912, + "loss": 0.1348, + "step": 24664 + }, + { + "epoch": 0.21410404423572713, + "grad_norm": 0.4140625, + "learning_rate": 0.0017382034172022942, + "loss": 0.0957, + "step": 24665 + }, + { + "epoch": 0.2141127247159313, + "grad_norm": 1.2890625, + "learning_rate": 0.0017381824315553674, + "loss": 0.1021, + "step": 24666 + }, + { + "epoch": 0.21412140519613546, + "grad_norm": 0.439453125, + "learning_rate": 0.0017381614452105336, + "loss": 0.1025, + "step": 24667 + }, + { + "epoch": 0.21413008567633962, + "grad_norm": 0.6015625, + "learning_rate": 0.0017381404581678164, + "loss": 0.0942, + "step": 24668 + }, + { + "epoch": 0.21413876615654379, + "grad_norm": 0.138671875, + "learning_rate": 0.0017381194704272386, + "loss": 0.0874, + "step": 24669 + }, + { + "epoch": 0.21414744663674795, + "grad_norm": 0.162109375, + "learning_rate": 0.0017380984819888225, + "loss": 0.1299, + "step": 24670 + }, + { + "epoch": 0.21415612711695212, + "grad_norm": 0.11474609375, + "learning_rate": 0.0017380774928525918, + "loss": 0.0908, + "step": 24671 + }, + { + "epoch": 0.21416480759715628, + "grad_norm": 0.515625, + "learning_rate": 0.001738056503018569, + "loss": 0.1406, + "step": 24672 + }, + { + "epoch": 0.21417348807736045, + "grad_norm": 0.0888671875, + "learning_rate": 0.001738035512486777, + "loss": 0.0986, + "step": 24673 + }, + { + "epoch": 0.2141821685575646, + "grad_norm": 0.578125, + "learning_rate": 0.0017380145212572394, + "loss": 0.1299, + "step": 24674 + }, + { + "epoch": 0.21419084903776878, + "grad_norm": 0.5, + "learning_rate": 0.001737993529329978, + "loss": 0.1089, + "step": 24675 + }, + { + "epoch": 0.21419952951797294, + "grad_norm": 0.369140625, + "learning_rate": 0.001737972536705017, + "loss": 0.0786, + "step": 24676 + }, + { + "epoch": 0.2142082099981771, + "grad_norm": 0.208984375, + "learning_rate": 0.0017379515433823788, + "loss": 0.0879, + "step": 24677 + }, + { + "epoch": 
0.21421689047838127, + "grad_norm": 0.0908203125, + "learning_rate": 0.0017379305493620864, + "loss": 0.0996, + "step": 24678 + }, + { + "epoch": 0.21422557095858544, + "grad_norm": 0.23828125, + "learning_rate": 0.0017379095546441624, + "loss": 0.1104, + "step": 24679 + }, + { + "epoch": 0.2142342514387896, + "grad_norm": 0.07568359375, + "learning_rate": 0.0017378885592286304, + "loss": 0.0986, + "step": 24680 + }, + { + "epoch": 0.21424293191899377, + "grad_norm": 0.298828125, + "learning_rate": 0.0017378675631155127, + "loss": 0.1416, + "step": 24681 + }, + { + "epoch": 0.21425161239919793, + "grad_norm": 0.33203125, + "learning_rate": 0.001737846566304833, + "loss": 0.0889, + "step": 24682 + }, + { + "epoch": 0.2142602928794021, + "grad_norm": 0.1591796875, + "learning_rate": 0.001737825568796614, + "loss": 0.1631, + "step": 24683 + }, + { + "epoch": 0.21426897335960626, + "grad_norm": 0.162109375, + "learning_rate": 0.0017378045705908783, + "loss": 0.1436, + "step": 24684 + }, + { + "epoch": 0.21427765383981043, + "grad_norm": 0.98046875, + "learning_rate": 0.0017377835716876492, + "loss": 0.1201, + "step": 24685 + }, + { + "epoch": 0.2142863343200146, + "grad_norm": 0.56640625, + "learning_rate": 0.0017377625720869491, + "loss": 0.0991, + "step": 24686 + }, + { + "epoch": 0.21429501480021876, + "grad_norm": 0.197265625, + "learning_rate": 0.0017377415717888022, + "loss": 0.1934, + "step": 24687 + }, + { + "epoch": 0.21430369528042292, + "grad_norm": 0.318359375, + "learning_rate": 0.00173772057079323, + "loss": 0.1416, + "step": 24688 + }, + { + "epoch": 0.2143123757606271, + "grad_norm": 0.12451171875, + "learning_rate": 0.0017376995691002566, + "loss": 0.0811, + "step": 24689 + }, + { + "epoch": 0.21432105624083125, + "grad_norm": 0.1064453125, + "learning_rate": 0.0017376785667099044, + "loss": 0.0713, + "step": 24690 + }, + { + "epoch": 0.21432973672103542, + "grad_norm": 0.435546875, + "learning_rate": 0.0017376575636221965, + "loss": 0.1123, + "step": 24691 + }, + { + "epoch": 0.21433841720123958, + "grad_norm": 0.3984375, + "learning_rate": 0.0017376365598371557, + "loss": 0.1187, + "step": 24692 + }, + { + "epoch": 0.21434709768144375, + "grad_norm": 0.1357421875, + "learning_rate": 0.0017376155553548054, + "loss": 0.0664, + "step": 24693 + }, + { + "epoch": 0.21435577816164791, + "grad_norm": 0.1591796875, + "learning_rate": 0.0017375945501751683, + "loss": 0.1104, + "step": 24694 + }, + { + "epoch": 0.21436445864185208, + "grad_norm": 0.287109375, + "learning_rate": 0.0017375735442982671, + "loss": 0.1289, + "step": 24695 + }, + { + "epoch": 0.21437313912205624, + "grad_norm": 0.330078125, + "learning_rate": 0.0017375525377241253, + "loss": 0.1182, + "step": 24696 + }, + { + "epoch": 0.2143818196022604, + "grad_norm": 0.0859375, + "learning_rate": 0.0017375315304527658, + "loss": 0.0889, + "step": 24697 + }, + { + "epoch": 0.21439050008246457, + "grad_norm": 1.0078125, + "learning_rate": 0.0017375105224842111, + "loss": 0.1216, + "step": 24698 + }, + { + "epoch": 0.21439918056266874, + "grad_norm": 1.03125, + "learning_rate": 0.0017374895138184848, + "loss": 0.1143, + "step": 24699 + }, + { + "epoch": 0.2144078610428729, + "grad_norm": 0.0830078125, + "learning_rate": 0.001737468504455609, + "loss": 0.0796, + "step": 24700 + }, + { + "epoch": 0.21441654152307707, + "grad_norm": 0.5703125, + "learning_rate": 0.0017374474943956076, + "loss": 0.1191, + "step": 24701 + }, + { + "epoch": 0.21442522200328124, + "grad_norm": 0.357421875, + "learning_rate": 0.001737426483638503, + 
"loss": 0.062, + "step": 24702 + }, + { + "epoch": 0.2144339024834854, + "grad_norm": 0.5, + "learning_rate": 0.0017374054721843184, + "loss": 0.1055, + "step": 24703 + }, + { + "epoch": 0.21444258296368954, + "grad_norm": 0.228515625, + "learning_rate": 0.001737384460033077, + "loss": 0.0952, + "step": 24704 + }, + { + "epoch": 0.2144512634438937, + "grad_norm": 0.341796875, + "learning_rate": 0.0017373634471848011, + "loss": 0.126, + "step": 24705 + }, + { + "epoch": 0.21445994392409787, + "grad_norm": 0.302734375, + "learning_rate": 0.0017373424336395147, + "loss": 0.0786, + "step": 24706 + }, + { + "epoch": 0.21446862440430203, + "grad_norm": 0.7109375, + "learning_rate": 0.0017373214193972398, + "loss": 0.0986, + "step": 24707 + }, + { + "epoch": 0.2144773048845062, + "grad_norm": 0.875, + "learning_rate": 0.0017373004044579999, + "loss": 0.0884, + "step": 24708 + }, + { + "epoch": 0.21448598536471036, + "grad_norm": 0.08740234375, + "learning_rate": 0.0017372793888218178, + "loss": 0.0723, + "step": 24709 + }, + { + "epoch": 0.21449466584491453, + "grad_norm": 0.1669921875, + "learning_rate": 0.0017372583724887165, + "loss": 0.0933, + "step": 24710 + }, + { + "epoch": 0.2145033463251187, + "grad_norm": 0.1611328125, + "learning_rate": 0.0017372373554587189, + "loss": 0.1074, + "step": 24711 + }, + { + "epoch": 0.21451202680532286, + "grad_norm": 0.150390625, + "learning_rate": 0.0017372163377318483, + "loss": 0.1406, + "step": 24712 + }, + { + "epoch": 0.21452070728552702, + "grad_norm": 0.29296875, + "learning_rate": 0.0017371953193081277, + "loss": 0.0952, + "step": 24713 + }, + { + "epoch": 0.2145293877657312, + "grad_norm": 0.234375, + "learning_rate": 0.0017371743001875794, + "loss": 0.0923, + "step": 24714 + }, + { + "epoch": 0.21453806824593535, + "grad_norm": 0.06884765625, + "learning_rate": 0.001737153280370227, + "loss": 0.0908, + "step": 24715 + }, + { + "epoch": 0.21454674872613952, + "grad_norm": 0.318359375, + "learning_rate": 0.0017371322598560936, + "loss": 0.0908, + "step": 24716 + }, + { + "epoch": 0.21455542920634368, + "grad_norm": 0.171875, + "learning_rate": 0.0017371112386452016, + "loss": 0.1123, + "step": 24717 + }, + { + "epoch": 0.21456410968654785, + "grad_norm": 0.2412109375, + "learning_rate": 0.0017370902167375747, + "loss": 0.0801, + "step": 24718 + }, + { + "epoch": 0.21457279016675201, + "grad_norm": 0.1484375, + "learning_rate": 0.0017370691941332353, + "loss": 0.0967, + "step": 24719 + }, + { + "epoch": 0.21458147064695618, + "grad_norm": 0.314453125, + "learning_rate": 0.0017370481708322064, + "loss": 0.1035, + "step": 24720 + }, + { + "epoch": 0.21459015112716034, + "grad_norm": 0.369140625, + "learning_rate": 0.0017370271468345113, + "loss": 0.127, + "step": 24721 + }, + { + "epoch": 0.2145988316073645, + "grad_norm": 0.53125, + "learning_rate": 0.001737006122140173, + "loss": 0.1445, + "step": 24722 + }, + { + "epoch": 0.21460751208756867, + "grad_norm": 0.26953125, + "learning_rate": 0.0017369850967492145, + "loss": 0.0923, + "step": 24723 + }, + { + "epoch": 0.21461619256777284, + "grad_norm": 0.1826171875, + "learning_rate": 0.0017369640706616583, + "loss": 0.1406, + "step": 24724 + }, + { + "epoch": 0.214624873047977, + "grad_norm": 0.173828125, + "learning_rate": 0.0017369430438775283, + "loss": 0.0947, + "step": 24725 + }, + { + "epoch": 0.21463355352818117, + "grad_norm": 0.40625, + "learning_rate": 0.0017369220163968462, + "loss": 0.0957, + "step": 24726 + }, + { + "epoch": 0.21464223400838534, + "grad_norm": 0.275390625, + 
"learning_rate": 0.0017369009882196363, + "loss": 0.0898, + "step": 24727 + }, + { + "epoch": 0.2146509144885895, + "grad_norm": 0.47265625, + "learning_rate": 0.001736879959345921, + "loss": 0.1055, + "step": 24728 + }, + { + "epoch": 0.21465959496879367, + "grad_norm": 0.11083984375, + "learning_rate": 0.0017368589297757232, + "loss": 0.1104, + "step": 24729 + }, + { + "epoch": 0.21466827544899783, + "grad_norm": 0.078125, + "learning_rate": 0.0017368378995090664, + "loss": 0.0688, + "step": 24730 + }, + { + "epoch": 0.214676955929202, + "grad_norm": 0.4140625, + "learning_rate": 0.001736816868545973, + "loss": 0.1084, + "step": 24731 + }, + { + "epoch": 0.21468563640940616, + "grad_norm": 0.1318359375, + "learning_rate": 0.0017367958368864661, + "loss": 0.1162, + "step": 24732 + }, + { + "epoch": 0.21469431688961033, + "grad_norm": 0.2109375, + "learning_rate": 0.0017367748045305686, + "loss": 0.1196, + "step": 24733 + }, + { + "epoch": 0.2147029973698145, + "grad_norm": 0.22265625, + "learning_rate": 0.0017367537714783044, + "loss": 0.082, + "step": 24734 + }, + { + "epoch": 0.21471167785001866, + "grad_norm": 0.11767578125, + "learning_rate": 0.0017367327377296952, + "loss": 0.1113, + "step": 24735 + }, + { + "epoch": 0.21472035833022282, + "grad_norm": 0.23046875, + "learning_rate": 0.001736711703284765, + "loss": 0.1465, + "step": 24736 + }, + { + "epoch": 0.214729038810427, + "grad_norm": 0.1640625, + "learning_rate": 0.0017366906681435365, + "loss": 0.1445, + "step": 24737 + }, + { + "epoch": 0.21473771929063115, + "grad_norm": 0.6953125, + "learning_rate": 0.0017366696323060328, + "loss": 0.085, + "step": 24738 + }, + { + "epoch": 0.21474639977083532, + "grad_norm": 0.18359375, + "learning_rate": 0.0017366485957722763, + "loss": 0.1543, + "step": 24739 + }, + { + "epoch": 0.21475508025103948, + "grad_norm": 0.2431640625, + "learning_rate": 0.0017366275585422907, + "loss": 0.1206, + "step": 24740 + }, + { + "epoch": 0.21476376073124365, + "grad_norm": 0.3125, + "learning_rate": 0.0017366065206160988, + "loss": 0.0928, + "step": 24741 + }, + { + "epoch": 0.2147724412114478, + "grad_norm": 0.34765625, + "learning_rate": 0.0017365854819937235, + "loss": 0.1001, + "step": 24742 + }, + { + "epoch": 0.21478112169165198, + "grad_norm": 0.13671875, + "learning_rate": 0.0017365644426751879, + "loss": 0.0801, + "step": 24743 + }, + { + "epoch": 0.21478980217185614, + "grad_norm": 0.359375, + "learning_rate": 0.0017365434026605149, + "loss": 0.127, + "step": 24744 + }, + { + "epoch": 0.2147984826520603, + "grad_norm": 0.486328125, + "learning_rate": 0.0017365223619497277, + "loss": 0.1299, + "step": 24745 + }, + { + "epoch": 0.21480716313226447, + "grad_norm": 0.0888671875, + "learning_rate": 0.0017365013205428493, + "loss": 0.1025, + "step": 24746 + }, + { + "epoch": 0.21481584361246864, + "grad_norm": 0.396484375, + "learning_rate": 0.0017364802784399025, + "loss": 0.1191, + "step": 24747 + }, + { + "epoch": 0.2148245240926728, + "grad_norm": 0.1474609375, + "learning_rate": 0.0017364592356409105, + "loss": 0.1289, + "step": 24748 + }, + { + "epoch": 0.21483320457287697, + "grad_norm": 0.2734375, + "learning_rate": 0.001736438192145896, + "loss": 0.0938, + "step": 24749 + }, + { + "epoch": 0.21484188505308113, + "grad_norm": 0.45703125, + "learning_rate": 0.0017364171479548828, + "loss": 0.1074, + "step": 24750 + }, + { + "epoch": 0.2148505655332853, + "grad_norm": 0.79296875, + "learning_rate": 0.0017363961030678928, + "loss": 0.1455, + "step": 24751 + }, + { + "epoch": 
0.21485924601348946, + "grad_norm": 0.078125, + "learning_rate": 0.00173637505748495, + "loss": 0.082, + "step": 24752 + }, + { + "epoch": 0.21486792649369363, + "grad_norm": 0.1669921875, + "learning_rate": 0.0017363540112060767, + "loss": 0.1191, + "step": 24753 + }, + { + "epoch": 0.2148766069738978, + "grad_norm": 0.140625, + "learning_rate": 0.001736332964231296, + "loss": 0.1279, + "step": 24754 + }, + { + "epoch": 0.21488528745410196, + "grad_norm": 0.1416015625, + "learning_rate": 0.0017363119165606318, + "loss": 0.1064, + "step": 24755 + }, + { + "epoch": 0.21489396793430612, + "grad_norm": 0.302734375, + "learning_rate": 0.0017362908681941062, + "loss": 0.0967, + "step": 24756 + }, + { + "epoch": 0.2149026484145103, + "grad_norm": 0.52734375, + "learning_rate": 0.0017362698191317422, + "loss": 0.0703, + "step": 24757 + }, + { + "epoch": 0.21491132889471445, + "grad_norm": 0.60546875, + "learning_rate": 0.001736248769373563, + "loss": 0.1436, + "step": 24758 + }, + { + "epoch": 0.21492000937491862, + "grad_norm": 0.146484375, + "learning_rate": 0.0017362277189195922, + "loss": 0.0933, + "step": 24759 + }, + { + "epoch": 0.21492868985512278, + "grad_norm": 0.2890625, + "learning_rate": 0.001736206667769852, + "loss": 0.1104, + "step": 24760 + }, + { + "epoch": 0.21493737033532695, + "grad_norm": 0.251953125, + "learning_rate": 0.0017361856159243659, + "loss": 0.0889, + "step": 24761 + }, + { + "epoch": 0.21494605081553111, + "grad_norm": 0.1591796875, + "learning_rate": 0.0017361645633831566, + "loss": 0.0918, + "step": 24762 + }, + { + "epoch": 0.21495473129573528, + "grad_norm": 0.49609375, + "learning_rate": 0.0017361435101462472, + "loss": 0.1484, + "step": 24763 + }, + { + "epoch": 0.21496341177593944, + "grad_norm": 0.28515625, + "learning_rate": 0.0017361224562136613, + "loss": 0.105, + "step": 24764 + }, + { + "epoch": 0.2149720922561436, + "grad_norm": 0.57421875, + "learning_rate": 0.001736101401585421, + "loss": 0.1182, + "step": 24765 + }, + { + "epoch": 0.21498077273634777, + "grad_norm": 0.2236328125, + "learning_rate": 0.00173608034626155, + "loss": 0.1064, + "step": 24766 + }, + { + "epoch": 0.21498945321655194, + "grad_norm": 0.32421875, + "learning_rate": 0.0017360592902420711, + "loss": 0.0811, + "step": 24767 + }, + { + "epoch": 0.2149981336967561, + "grad_norm": 0.220703125, + "learning_rate": 0.001736038233527007, + "loss": 0.0957, + "step": 24768 + }, + { + "epoch": 0.21500681417696027, + "grad_norm": 0.17578125, + "learning_rate": 0.0017360171761163815, + "loss": 0.1221, + "step": 24769 + }, + { + "epoch": 0.21501549465716444, + "grad_norm": 0.1748046875, + "learning_rate": 0.001735996118010217, + "loss": 0.0938, + "step": 24770 + }, + { + "epoch": 0.2150241751373686, + "grad_norm": 0.33203125, + "learning_rate": 0.0017359750592085364, + "loss": 0.1348, + "step": 24771 + }, + { + "epoch": 0.21503285561757277, + "grad_norm": 0.1083984375, + "learning_rate": 0.0017359539997113633, + "loss": 0.1025, + "step": 24772 + }, + { + "epoch": 0.21504153609777693, + "grad_norm": 0.10546875, + "learning_rate": 0.0017359329395187201, + "loss": 0.125, + "step": 24773 + }, + { + "epoch": 0.2150502165779811, + "grad_norm": 0.12109375, + "learning_rate": 0.0017359118786306306, + "loss": 0.1328, + "step": 24774 + }, + { + "epoch": 0.21505889705818526, + "grad_norm": 0.2236328125, + "learning_rate": 0.0017358908170471176, + "loss": 0.1025, + "step": 24775 + }, + { + "epoch": 0.21506757753838943, + "grad_norm": 0.1083984375, + "learning_rate": 0.0017358697547682038, + "loss": 
0.127, + "step": 24776 + }, + { + "epoch": 0.2150762580185936, + "grad_norm": 1.734375, + "learning_rate": 0.0017358486917939122, + "loss": 0.1641, + "step": 24777 + }, + { + "epoch": 0.21508493849879776, + "grad_norm": 0.1005859375, + "learning_rate": 0.0017358276281242663, + "loss": 0.1191, + "step": 24778 + }, + { + "epoch": 0.21509361897900192, + "grad_norm": 0.2080078125, + "learning_rate": 0.0017358065637592885, + "loss": 0.0903, + "step": 24779 + }, + { + "epoch": 0.2151022994592061, + "grad_norm": 0.1298828125, + "learning_rate": 0.0017357854986990023, + "loss": 0.0908, + "step": 24780 + }, + { + "epoch": 0.21511097993941025, + "grad_norm": 0.111328125, + "learning_rate": 0.0017357644329434308, + "loss": 0.0864, + "step": 24781 + }, + { + "epoch": 0.21511966041961442, + "grad_norm": 0.1025390625, + "learning_rate": 0.0017357433664925966, + "loss": 0.103, + "step": 24782 + }, + { + "epoch": 0.21512834089981858, + "grad_norm": 0.2333984375, + "learning_rate": 0.0017357222993465234, + "loss": 0.1064, + "step": 24783 + }, + { + "epoch": 0.21513702138002275, + "grad_norm": 0.1259765625, + "learning_rate": 0.0017357012315052335, + "loss": 0.1133, + "step": 24784 + }, + { + "epoch": 0.2151457018602269, + "grad_norm": 0.6171875, + "learning_rate": 0.0017356801629687503, + "loss": 0.1016, + "step": 24785 + }, + { + "epoch": 0.21515438234043108, + "grad_norm": 0.8125, + "learning_rate": 0.0017356590937370972, + "loss": 0.124, + "step": 24786 + }, + { + "epoch": 0.21516306282063524, + "grad_norm": 0.51953125, + "learning_rate": 0.0017356380238102965, + "loss": 0.0967, + "step": 24787 + }, + { + "epoch": 0.2151717433008394, + "grad_norm": 0.283203125, + "learning_rate": 0.0017356169531883717, + "loss": 0.1279, + "step": 24788 + }, + { + "epoch": 0.21518042378104357, + "grad_norm": 0.07568359375, + "learning_rate": 0.0017355958818713456, + "loss": 0.0957, + "step": 24789 + }, + { + "epoch": 0.21518910426124774, + "grad_norm": 0.4140625, + "learning_rate": 0.0017355748098592417, + "loss": 0.0752, + "step": 24790 + }, + { + "epoch": 0.2151977847414519, + "grad_norm": 0.08984375, + "learning_rate": 0.0017355537371520824, + "loss": 0.1123, + "step": 24791 + }, + { + "epoch": 0.21520646522165607, + "grad_norm": 0.10009765625, + "learning_rate": 0.0017355326637498913, + "loss": 0.082, + "step": 24792 + }, + { + "epoch": 0.21521514570186023, + "grad_norm": 0.11181640625, + "learning_rate": 0.0017355115896526912, + "loss": 0.0972, + "step": 24793 + }, + { + "epoch": 0.2152238261820644, + "grad_norm": 0.1767578125, + "learning_rate": 0.001735490514860505, + "loss": 0.1426, + "step": 24794 + }, + { + "epoch": 0.21523250666226856, + "grad_norm": 0.57421875, + "learning_rate": 0.0017354694393733564, + "loss": 0.1172, + "step": 24795 + }, + { + "epoch": 0.21524118714247273, + "grad_norm": 0.2421875, + "learning_rate": 0.0017354483631912675, + "loss": 0.1191, + "step": 24796 + }, + { + "epoch": 0.2152498676226769, + "grad_norm": 0.263671875, + "learning_rate": 0.0017354272863142621, + "loss": 0.1074, + "step": 24797 + }, + { + "epoch": 0.21525854810288106, + "grad_norm": 0.255859375, + "learning_rate": 0.0017354062087423625, + "loss": 0.1348, + "step": 24798 + }, + { + "epoch": 0.21526722858308522, + "grad_norm": 0.447265625, + "learning_rate": 0.0017353851304755927, + "loss": 0.1289, + "step": 24799 + }, + { + "epoch": 0.2152759090632894, + "grad_norm": 0.314453125, + "learning_rate": 0.0017353640515139753, + "loss": 0.0923, + "step": 24800 + }, + { + "epoch": 0.21528458954349355, + "grad_norm": 0.58984375, 
+ "learning_rate": 0.001735342971857533, + "loss": 0.0952, + "step": 24801 + }, + { + "epoch": 0.21529327002369772, + "grad_norm": 0.1806640625, + "learning_rate": 0.001735321891506289, + "loss": 0.1167, + "step": 24802 + }, + { + "epoch": 0.21530195050390188, + "grad_norm": 0.1279296875, + "learning_rate": 0.0017353008104602672, + "loss": 0.1445, + "step": 24803 + }, + { + "epoch": 0.21531063098410605, + "grad_norm": 0.1337890625, + "learning_rate": 0.0017352797287194894, + "loss": 0.1309, + "step": 24804 + }, + { + "epoch": 0.21531931146431021, + "grad_norm": 0.26953125, + "learning_rate": 0.0017352586462839798, + "loss": 0.124, + "step": 24805 + }, + { + "epoch": 0.21532799194451438, + "grad_norm": 1.109375, + "learning_rate": 0.0017352375631537605, + "loss": 0.2891, + "step": 24806 + }, + { + "epoch": 0.21533667242471854, + "grad_norm": 0.095703125, + "learning_rate": 0.001735216479328855, + "loss": 0.0898, + "step": 24807 + }, + { + "epoch": 0.2153453529049227, + "grad_norm": 0.267578125, + "learning_rate": 0.0017351953948092863, + "loss": 0.0952, + "step": 24808 + }, + { + "epoch": 0.21535403338512688, + "grad_norm": 0.09716796875, + "learning_rate": 0.0017351743095950774, + "loss": 0.1172, + "step": 24809 + }, + { + "epoch": 0.21536271386533104, + "grad_norm": 0.5, + "learning_rate": 0.0017351532236862515, + "loss": 0.1055, + "step": 24810 + }, + { + "epoch": 0.2153713943455352, + "grad_norm": 0.2265625, + "learning_rate": 0.0017351321370828315, + "loss": 0.1045, + "step": 24811 + }, + { + "epoch": 0.21538007482573937, + "grad_norm": 0.71875, + "learning_rate": 0.0017351110497848406, + "loss": 0.1162, + "step": 24812 + }, + { + "epoch": 0.21538875530594354, + "grad_norm": 0.265625, + "learning_rate": 0.0017350899617923018, + "loss": 0.0928, + "step": 24813 + }, + { + "epoch": 0.2153974357861477, + "grad_norm": 0.296875, + "learning_rate": 0.001735068873105238, + "loss": 0.1187, + "step": 24814 + }, + { + "epoch": 0.21540611626635187, + "grad_norm": 0.349609375, + "learning_rate": 0.0017350477837236727, + "loss": 0.1191, + "step": 24815 + }, + { + "epoch": 0.21541479674655603, + "grad_norm": 0.28125, + "learning_rate": 0.0017350266936476285, + "loss": 0.0947, + "step": 24816 + }, + { + "epoch": 0.2154234772267602, + "grad_norm": 0.201171875, + "learning_rate": 0.0017350056028771288, + "loss": 0.0869, + "step": 24817 + }, + { + "epoch": 0.21543215770696436, + "grad_norm": 0.357421875, + "learning_rate": 0.0017349845114121962, + "loss": 0.1797, + "step": 24818 + }, + { + "epoch": 0.21544083818716853, + "grad_norm": 0.0693359375, + "learning_rate": 0.0017349634192528542, + "loss": 0.1152, + "step": 24819 + }, + { + "epoch": 0.2154495186673727, + "grad_norm": 0.38671875, + "learning_rate": 0.001734942326399126, + "loss": 0.0889, + "step": 24820 + }, + { + "epoch": 0.21545819914757686, + "grad_norm": 0.65234375, + "learning_rate": 0.0017349212328510342, + "loss": 0.165, + "step": 24821 + }, + { + "epoch": 0.21546687962778102, + "grad_norm": 0.09912109375, + "learning_rate": 0.001734900138608602, + "loss": 0.1289, + "step": 24822 + }, + { + "epoch": 0.2154755601079852, + "grad_norm": 0.609375, + "learning_rate": 0.0017348790436718527, + "loss": 0.0918, + "step": 24823 + }, + { + "epoch": 0.21548424058818935, + "grad_norm": 0.11328125, + "learning_rate": 0.0017348579480408095, + "loss": 0.1367, + "step": 24824 + }, + { + "epoch": 0.21549292106839352, + "grad_norm": 0.1513671875, + "learning_rate": 0.0017348368517154948, + "loss": 0.1074, + "step": 24825 + }, + { + "epoch": 
0.21550160154859768, + "grad_norm": 0.478515625, + "learning_rate": 0.001734815754695932, + "loss": 0.0981, + "step": 24826 + }, + { + "epoch": 0.21551028202880182, + "grad_norm": 0.515625, + "learning_rate": 0.0017347946569821443, + "loss": 0.1104, + "step": 24827 + }, + { + "epoch": 0.21551896250900598, + "grad_norm": 0.23046875, + "learning_rate": 0.0017347735585741548, + "loss": 0.0786, + "step": 24828 + }, + { + "epoch": 0.21552764298921015, + "grad_norm": 0.640625, + "learning_rate": 0.0017347524594719865, + "loss": 0.0771, + "step": 24829 + }, + { + "epoch": 0.21553632346941431, + "grad_norm": 0.16796875, + "learning_rate": 0.0017347313596756625, + "loss": 0.1699, + "step": 24830 + }, + { + "epoch": 0.21554500394961848, + "grad_norm": 1.3125, + "learning_rate": 0.0017347102591852056, + "loss": 0.1099, + "step": 24831 + }, + { + "epoch": 0.21555368442982265, + "grad_norm": 0.123046875, + "learning_rate": 0.0017346891580006392, + "loss": 0.1094, + "step": 24832 + }, + { + "epoch": 0.2155623649100268, + "grad_norm": 0.341796875, + "learning_rate": 0.0017346680561219864, + "loss": 0.1406, + "step": 24833 + }, + { + "epoch": 0.21557104539023098, + "grad_norm": 0.92578125, + "learning_rate": 0.00173464695354927, + "loss": 0.1011, + "step": 24834 + }, + { + "epoch": 0.21557972587043514, + "grad_norm": 0.2578125, + "learning_rate": 0.0017346258502825135, + "loss": 0.0908, + "step": 24835 + }, + { + "epoch": 0.2155884063506393, + "grad_norm": 0.2431640625, + "learning_rate": 0.001734604746321739, + "loss": 0.1201, + "step": 24836 + }, + { + "epoch": 0.21559708683084347, + "grad_norm": 0.435546875, + "learning_rate": 0.001734583641666971, + "loss": 0.1011, + "step": 24837 + }, + { + "epoch": 0.21560576731104764, + "grad_norm": 0.06591796875, + "learning_rate": 0.0017345625363182316, + "loss": 0.0713, + "step": 24838 + }, + { + "epoch": 0.2156144477912518, + "grad_norm": 0.376953125, + "learning_rate": 0.0017345414302755442, + "loss": 0.168, + "step": 24839 + }, + { + "epoch": 0.21562312827145597, + "grad_norm": 0.10546875, + "learning_rate": 0.0017345203235389318, + "loss": 0.1201, + "step": 24840 + }, + { + "epoch": 0.21563180875166013, + "grad_norm": 0.28515625, + "learning_rate": 0.0017344992161084172, + "loss": 0.1484, + "step": 24841 + }, + { + "epoch": 0.2156404892318643, + "grad_norm": 0.10546875, + "learning_rate": 0.001734478107984024, + "loss": 0.1309, + "step": 24842 + }, + { + "epoch": 0.21564916971206846, + "grad_norm": 0.1259765625, + "learning_rate": 0.0017344569991657754, + "loss": 0.127, + "step": 24843 + }, + { + "epoch": 0.21565785019227263, + "grad_norm": 0.8046875, + "learning_rate": 0.0017344358896536941, + "loss": 0.2402, + "step": 24844 + }, + { + "epoch": 0.2156665306724768, + "grad_norm": 0.287109375, + "learning_rate": 0.0017344147794478028, + "loss": 0.0928, + "step": 24845 + }, + { + "epoch": 0.21567521115268096, + "grad_norm": 0.26953125, + "learning_rate": 0.0017343936685481252, + "loss": 0.1138, + "step": 24846 + }, + { + "epoch": 0.21568389163288512, + "grad_norm": 0.12890625, + "learning_rate": 0.0017343725569546845, + "loss": 0.1006, + "step": 24847 + }, + { + "epoch": 0.2156925721130893, + "grad_norm": 0.07763671875, + "learning_rate": 0.0017343514446675032, + "loss": 0.0811, + "step": 24848 + }, + { + "epoch": 0.21570125259329345, + "grad_norm": 0.66796875, + "learning_rate": 0.001734330331686605, + "loss": 0.1426, + "step": 24849 + }, + { + "epoch": 0.21570993307349762, + "grad_norm": 0.1357421875, + "learning_rate": 0.0017343092180120124, + "loss": 
0.0947, + "step": 24850 + }, + { + "epoch": 0.21571861355370178, + "grad_norm": 0.65625, + "learning_rate": 0.0017342881036437488, + "loss": 0.1035, + "step": 24851 + }, + { + "epoch": 0.21572729403390595, + "grad_norm": 0.1455078125, + "learning_rate": 0.0017342669885818374, + "loss": 0.1377, + "step": 24852 + }, + { + "epoch": 0.2157359745141101, + "grad_norm": 0.45703125, + "learning_rate": 0.001734245872826301, + "loss": 0.0957, + "step": 24853 + }, + { + "epoch": 0.21574465499431428, + "grad_norm": 0.09326171875, + "learning_rate": 0.0017342247563771626, + "loss": 0.0996, + "step": 24854 + }, + { + "epoch": 0.21575333547451844, + "grad_norm": 0.09033203125, + "learning_rate": 0.001734203639234446, + "loss": 0.1748, + "step": 24855 + }, + { + "epoch": 0.2157620159547226, + "grad_norm": 0.85546875, + "learning_rate": 0.001734182521398174, + "loss": 0.1069, + "step": 24856 + }, + { + "epoch": 0.21577069643492677, + "grad_norm": 0.369140625, + "learning_rate": 0.001734161402868369, + "loss": 0.1338, + "step": 24857 + }, + { + "epoch": 0.21577937691513094, + "grad_norm": 0.75390625, + "learning_rate": 0.0017341402836450548, + "loss": 0.0952, + "step": 24858 + }, + { + "epoch": 0.2157880573953351, + "grad_norm": 0.380859375, + "learning_rate": 0.0017341191637282542, + "loss": 0.1074, + "step": 24859 + }, + { + "epoch": 0.21579673787553927, + "grad_norm": 0.1083984375, + "learning_rate": 0.0017340980431179906, + "loss": 0.0938, + "step": 24860 + }, + { + "epoch": 0.21580541835574343, + "grad_norm": 0.11669921875, + "learning_rate": 0.001734076921814287, + "loss": 0.0986, + "step": 24861 + }, + { + "epoch": 0.2158140988359476, + "grad_norm": 0.1474609375, + "learning_rate": 0.001734055799817166, + "loss": 0.0938, + "step": 24862 + }, + { + "epoch": 0.21582277931615176, + "grad_norm": 0.3046875, + "learning_rate": 0.0017340346771266514, + "loss": 0.0791, + "step": 24863 + }, + { + "epoch": 0.21583145979635593, + "grad_norm": 0.55078125, + "learning_rate": 0.0017340135537427658, + "loss": 0.1367, + "step": 24864 + }, + { + "epoch": 0.2158401402765601, + "grad_norm": 0.103515625, + "learning_rate": 0.0017339924296655327, + "loss": 0.0767, + "step": 24865 + }, + { + "epoch": 0.21584882075676426, + "grad_norm": 0.150390625, + "learning_rate": 0.0017339713048949749, + "loss": 0.0767, + "step": 24866 + }, + { + "epoch": 0.21585750123696842, + "grad_norm": 0.12451171875, + "learning_rate": 0.0017339501794311157, + "loss": 0.0791, + "step": 24867 + }, + { + "epoch": 0.2158661817171726, + "grad_norm": 0.703125, + "learning_rate": 0.0017339290532739783, + "loss": 0.1367, + "step": 24868 + }, + { + "epoch": 0.21587486219737675, + "grad_norm": 0.361328125, + "learning_rate": 0.0017339079264235852, + "loss": 0.1206, + "step": 24869 + }, + { + "epoch": 0.21588354267758092, + "grad_norm": 0.5390625, + "learning_rate": 0.0017338867988799598, + "loss": 0.083, + "step": 24870 + }, + { + "epoch": 0.21589222315778508, + "grad_norm": 0.8828125, + "learning_rate": 0.0017338656706431257, + "loss": 0.0918, + "step": 24871 + }, + { + "epoch": 0.21590090363798925, + "grad_norm": 0.4921875, + "learning_rate": 0.0017338445417131056, + "loss": 0.1055, + "step": 24872 + }, + { + "epoch": 0.21590958411819342, + "grad_norm": 0.51171875, + "learning_rate": 0.0017338234120899225, + "loss": 0.1016, + "step": 24873 + }, + { + "epoch": 0.21591826459839758, + "grad_norm": 0.326171875, + "learning_rate": 0.0017338022817736, + "loss": 0.1104, + "step": 24874 + }, + { + "epoch": 0.21592694507860175, + "grad_norm": 0.470703125, + 
"learning_rate": 0.0017337811507641605, + "loss": 0.0952, + "step": 24875 + }, + { + "epoch": 0.2159356255588059, + "grad_norm": 0.130859375, + "learning_rate": 0.0017337600190616277, + "loss": 0.0928, + "step": 24876 + }, + { + "epoch": 0.21594430603901008, + "grad_norm": 0.08447265625, + "learning_rate": 0.001733738886666024, + "loss": 0.1021, + "step": 24877 + }, + { + "epoch": 0.21595298651921424, + "grad_norm": 0.53515625, + "learning_rate": 0.0017337177535773733, + "loss": 0.0889, + "step": 24878 + }, + { + "epoch": 0.2159616669994184, + "grad_norm": 0.12060546875, + "learning_rate": 0.0017336966197956986, + "loss": 0.0742, + "step": 24879 + }, + { + "epoch": 0.21597034747962257, + "grad_norm": 0.1357421875, + "learning_rate": 0.0017336754853210224, + "loss": 0.1387, + "step": 24880 + }, + { + "epoch": 0.21597902795982674, + "grad_norm": 0.24609375, + "learning_rate": 0.0017336543501533684, + "loss": 0.0996, + "step": 24881 + }, + { + "epoch": 0.2159877084400309, + "grad_norm": 0.10400390625, + "learning_rate": 0.0017336332142927596, + "loss": 0.1113, + "step": 24882 + }, + { + "epoch": 0.21599638892023507, + "grad_norm": 0.4375, + "learning_rate": 0.0017336120777392185, + "loss": 0.0825, + "step": 24883 + }, + { + "epoch": 0.21600506940043923, + "grad_norm": 0.0947265625, + "learning_rate": 0.0017335909404927695, + "loss": 0.1001, + "step": 24884 + }, + { + "epoch": 0.2160137498806434, + "grad_norm": 0.0712890625, + "learning_rate": 0.0017335698025534346, + "loss": 0.1201, + "step": 24885 + }, + { + "epoch": 0.21602243036084756, + "grad_norm": 0.388671875, + "learning_rate": 0.0017335486639212371, + "loss": 0.105, + "step": 24886 + }, + { + "epoch": 0.21603111084105173, + "grad_norm": 0.212890625, + "learning_rate": 0.0017335275245962006, + "loss": 0.0986, + "step": 24887 + }, + { + "epoch": 0.2160397913212559, + "grad_norm": 0.2431640625, + "learning_rate": 0.0017335063845783478, + "loss": 0.1328, + "step": 24888 + }, + { + "epoch": 0.21604847180146006, + "grad_norm": 0.255859375, + "learning_rate": 0.001733485243867702, + "loss": 0.1309, + "step": 24889 + }, + { + "epoch": 0.21605715228166422, + "grad_norm": 0.29296875, + "learning_rate": 0.0017334641024642863, + "loss": 0.1191, + "step": 24890 + }, + { + "epoch": 0.2160658327618684, + "grad_norm": 0.17578125, + "learning_rate": 0.001733442960368124, + "loss": 0.1133, + "step": 24891 + }, + { + "epoch": 0.21607451324207255, + "grad_norm": 0.3203125, + "learning_rate": 0.0017334218175792373, + "loss": 0.0664, + "step": 24892 + }, + { + "epoch": 0.21608319372227672, + "grad_norm": 0.07373046875, + "learning_rate": 0.0017334006740976507, + "loss": 0.103, + "step": 24893 + }, + { + "epoch": 0.21609187420248088, + "grad_norm": 0.671875, + "learning_rate": 0.0017333795299233862, + "loss": 0.1172, + "step": 24894 + }, + { + "epoch": 0.21610055468268505, + "grad_norm": 0.1728515625, + "learning_rate": 0.0017333583850564676, + "loss": 0.1553, + "step": 24895 + }, + { + "epoch": 0.2161092351628892, + "grad_norm": 0.1650390625, + "learning_rate": 0.0017333372394969175, + "loss": 0.1436, + "step": 24896 + }, + { + "epoch": 0.21611791564309338, + "grad_norm": 0.1640625, + "learning_rate": 0.0017333160932447593, + "loss": 0.1133, + "step": 24897 + }, + { + "epoch": 0.21612659612329754, + "grad_norm": 0.349609375, + "learning_rate": 0.0017332949463000165, + "loss": 0.1055, + "step": 24898 + }, + { + "epoch": 0.2161352766035017, + "grad_norm": 0.142578125, + "learning_rate": 0.0017332737986627122, + "loss": 0.1167, + "step": 24899 + }, + { + 
"epoch": 0.21614395708370587, + "grad_norm": 0.09814453125, + "learning_rate": 0.0017332526503328685, + "loss": 0.0908, + "step": 24900 + }, + { + "epoch": 0.21615263756391004, + "grad_norm": 0.1669921875, + "learning_rate": 0.0017332315013105094, + "loss": 0.1069, + "step": 24901 + }, + { + "epoch": 0.2161613180441142, + "grad_norm": 0.25, + "learning_rate": 0.001733210351595658, + "loss": 0.1084, + "step": 24902 + }, + { + "epoch": 0.21616999852431837, + "grad_norm": 0.40234375, + "learning_rate": 0.0017331892011883368, + "loss": 0.0713, + "step": 24903 + }, + { + "epoch": 0.21617867900452253, + "grad_norm": 0.13671875, + "learning_rate": 0.0017331680500885698, + "loss": 0.0967, + "step": 24904 + }, + { + "epoch": 0.2161873594847267, + "grad_norm": 0.06982421875, + "learning_rate": 0.0017331468982963796, + "loss": 0.0752, + "step": 24905 + }, + { + "epoch": 0.21619603996493086, + "grad_norm": 0.197265625, + "learning_rate": 0.0017331257458117896, + "loss": 0.0693, + "step": 24906 + }, + { + "epoch": 0.21620472044513503, + "grad_norm": 0.31640625, + "learning_rate": 0.0017331045926348228, + "loss": 0.1416, + "step": 24907 + }, + { + "epoch": 0.2162134009253392, + "grad_norm": 0.50390625, + "learning_rate": 0.0017330834387655023, + "loss": 0.1318, + "step": 24908 + }, + { + "epoch": 0.21622208140554336, + "grad_norm": 0.85546875, + "learning_rate": 0.001733062284203851, + "loss": 0.1279, + "step": 24909 + }, + { + "epoch": 0.21623076188574752, + "grad_norm": 0.453125, + "learning_rate": 0.0017330411289498927, + "loss": 0.0947, + "step": 24910 + }, + { + "epoch": 0.2162394423659517, + "grad_norm": 0.48828125, + "learning_rate": 0.00173301997300365, + "loss": 0.0874, + "step": 24911 + }, + { + "epoch": 0.21624812284615585, + "grad_norm": 0.93359375, + "learning_rate": 0.0017329988163651462, + "loss": 0.1475, + "step": 24912 + }, + { + "epoch": 0.21625680332636002, + "grad_norm": 0.14453125, + "learning_rate": 0.0017329776590344044, + "loss": 0.1064, + "step": 24913 + }, + { + "epoch": 0.21626548380656418, + "grad_norm": 0.53125, + "learning_rate": 0.0017329565010114475, + "loss": 0.125, + "step": 24914 + }, + { + "epoch": 0.21627416428676835, + "grad_norm": 0.12158203125, + "learning_rate": 0.0017329353422962993, + "loss": 0.0859, + "step": 24915 + }, + { + "epoch": 0.21628284476697252, + "grad_norm": 0.06884765625, + "learning_rate": 0.001732914182888982, + "loss": 0.0928, + "step": 24916 + }, + { + "epoch": 0.21629152524717668, + "grad_norm": 0.76171875, + "learning_rate": 0.0017328930227895195, + "loss": 0.0977, + "step": 24917 + }, + { + "epoch": 0.21630020572738085, + "grad_norm": 0.353515625, + "learning_rate": 0.0017328718619979348, + "loss": 0.1162, + "step": 24918 + }, + { + "epoch": 0.216308886207585, + "grad_norm": 0.1865234375, + "learning_rate": 0.0017328507005142507, + "loss": 0.1099, + "step": 24919 + }, + { + "epoch": 0.21631756668778918, + "grad_norm": 0.470703125, + "learning_rate": 0.001732829538338491, + "loss": 0.1074, + "step": 24920 + }, + { + "epoch": 0.21632624716799334, + "grad_norm": 0.154296875, + "learning_rate": 0.001732808375470678, + "loss": 0.168, + "step": 24921 + }, + { + "epoch": 0.2163349276481975, + "grad_norm": 0.34375, + "learning_rate": 0.0017327872119108354, + "loss": 0.1045, + "step": 24922 + }, + { + "epoch": 0.21634360812840167, + "grad_norm": 0.279296875, + "learning_rate": 0.0017327660476589863, + "loss": 0.1025, + "step": 24923 + }, + { + "epoch": 0.21635228860860584, + "grad_norm": 0.314453125, + "learning_rate": 0.001732744882715154, + 
"loss": 0.0938, + "step": 24924 + }, + { + "epoch": 0.21636096908881, + "grad_norm": 0.42578125, + "learning_rate": 0.001732723717079361, + "loss": 0.1167, + "step": 24925 + }, + { + "epoch": 0.21636964956901417, + "grad_norm": 0.369140625, + "learning_rate": 0.001732702550751631, + "loss": 0.0723, + "step": 24926 + }, + { + "epoch": 0.21637833004921833, + "grad_norm": 0.2421875, + "learning_rate": 0.0017326813837319867, + "loss": 0.1074, + "step": 24927 + }, + { + "epoch": 0.2163870105294225, + "grad_norm": 0.306640625, + "learning_rate": 0.001732660216020452, + "loss": 0.0854, + "step": 24928 + }, + { + "epoch": 0.21639569100962666, + "grad_norm": 0.126953125, + "learning_rate": 0.0017326390476170494, + "loss": 0.0977, + "step": 24929 + }, + { + "epoch": 0.21640437148983083, + "grad_norm": 0.140625, + "learning_rate": 0.0017326178785218023, + "loss": 0.0996, + "step": 24930 + }, + { + "epoch": 0.216413051970035, + "grad_norm": 0.1982421875, + "learning_rate": 0.0017325967087347335, + "loss": 0.0913, + "step": 24931 + }, + { + "epoch": 0.21642173245023916, + "grad_norm": 0.103515625, + "learning_rate": 0.001732575538255867, + "loss": 0.0967, + "step": 24932 + }, + { + "epoch": 0.21643041293044332, + "grad_norm": 0.310546875, + "learning_rate": 0.001732554367085225, + "loss": 0.1094, + "step": 24933 + }, + { + "epoch": 0.2164390934106475, + "grad_norm": 0.380859375, + "learning_rate": 0.0017325331952228312, + "loss": 0.1357, + "step": 24934 + }, + { + "epoch": 0.21644777389085165, + "grad_norm": 0.0986328125, + "learning_rate": 0.0017325120226687083, + "loss": 0.124, + "step": 24935 + }, + { + "epoch": 0.21645645437105582, + "grad_norm": 0.46875, + "learning_rate": 0.00173249084942288, + "loss": 0.1172, + "step": 24936 + }, + { + "epoch": 0.21646513485125998, + "grad_norm": 0.263671875, + "learning_rate": 0.0017324696754853692, + "loss": 0.0928, + "step": 24937 + }, + { + "epoch": 0.21647381533146415, + "grad_norm": 0.2294921875, + "learning_rate": 0.0017324485008561991, + "loss": 0.0732, + "step": 24938 + }, + { + "epoch": 0.2164824958116683, + "grad_norm": 0.220703125, + "learning_rate": 0.0017324273255353928, + "loss": 0.104, + "step": 24939 + }, + { + "epoch": 0.21649117629187248, + "grad_norm": 0.376953125, + "learning_rate": 0.0017324061495229736, + "loss": 0.1309, + "step": 24940 + }, + { + "epoch": 0.21649985677207664, + "grad_norm": 0.2060546875, + "learning_rate": 0.0017323849728189645, + "loss": 0.1016, + "step": 24941 + }, + { + "epoch": 0.2165085372522808, + "grad_norm": 0.150390625, + "learning_rate": 0.0017323637954233884, + "loss": 0.1787, + "step": 24942 + }, + { + "epoch": 0.21651721773248497, + "grad_norm": 0.11328125, + "learning_rate": 0.0017323426173362691, + "loss": 0.1138, + "step": 24943 + }, + { + "epoch": 0.21652589821268914, + "grad_norm": 0.1787109375, + "learning_rate": 0.0017323214385576292, + "loss": 0.1279, + "step": 24944 + }, + { + "epoch": 0.2165345786928933, + "grad_norm": 0.265625, + "learning_rate": 0.0017323002590874921, + "loss": 0.1055, + "step": 24945 + }, + { + "epoch": 0.21654325917309747, + "grad_norm": 0.234375, + "learning_rate": 0.0017322790789258808, + "loss": 0.1182, + "step": 24946 + }, + { + "epoch": 0.21655193965330163, + "grad_norm": 0.37890625, + "learning_rate": 0.001732257898072819, + "loss": 0.124, + "step": 24947 + }, + { + "epoch": 0.2165606201335058, + "grad_norm": 0.1875, + "learning_rate": 0.0017322367165283291, + "loss": 0.1074, + "step": 24948 + }, + { + "epoch": 0.21656930061370996, + "grad_norm": 0.1376953125, + 
"learning_rate": 0.0017322155342924348, + "loss": 0.0845, + "step": 24949 + }, + { + "epoch": 0.2165779810939141, + "grad_norm": 0.55859375, + "learning_rate": 0.0017321943513651592, + "loss": 0.166, + "step": 24950 + }, + { + "epoch": 0.21658666157411827, + "grad_norm": 0.3203125, + "learning_rate": 0.001732173167746525, + "loss": 0.123, + "step": 24951 + }, + { + "epoch": 0.21659534205432243, + "grad_norm": 0.16796875, + "learning_rate": 0.001732151983436556, + "loss": 0.1167, + "step": 24952 + }, + { + "epoch": 0.2166040225345266, + "grad_norm": 0.07470703125, + "learning_rate": 0.001732130798435275, + "loss": 0.0879, + "step": 24953 + }, + { + "epoch": 0.21661270301473076, + "grad_norm": 0.384765625, + "learning_rate": 0.0017321096127427054, + "loss": 0.082, + "step": 24954 + }, + { + "epoch": 0.21662138349493493, + "grad_norm": 0.32421875, + "learning_rate": 0.0017320884263588702, + "loss": 0.1396, + "step": 24955 + }, + { + "epoch": 0.2166300639751391, + "grad_norm": 0.29296875, + "learning_rate": 0.0017320672392837925, + "loss": 0.105, + "step": 24956 + }, + { + "epoch": 0.21663874445534326, + "grad_norm": 0.11572265625, + "learning_rate": 0.0017320460515174955, + "loss": 0.084, + "step": 24957 + }, + { + "epoch": 0.21664742493554742, + "grad_norm": 0.06494140625, + "learning_rate": 0.0017320248630600026, + "loss": 0.0913, + "step": 24958 + }, + { + "epoch": 0.2166561054157516, + "grad_norm": 0.330078125, + "learning_rate": 0.0017320036739113364, + "loss": 0.1055, + "step": 24959 + }, + { + "epoch": 0.21666478589595575, + "grad_norm": 0.546875, + "learning_rate": 0.001731982484071521, + "loss": 0.106, + "step": 24960 + }, + { + "epoch": 0.21667346637615992, + "grad_norm": 0.07177734375, + "learning_rate": 0.0017319612935405785, + "loss": 0.0947, + "step": 24961 + }, + { + "epoch": 0.21668214685636408, + "grad_norm": 0.3125, + "learning_rate": 0.0017319401023185332, + "loss": 0.0894, + "step": 24962 + }, + { + "epoch": 0.21669082733656825, + "grad_norm": 0.455078125, + "learning_rate": 0.0017319189104054074, + "loss": 0.1104, + "step": 24963 + }, + { + "epoch": 0.2166995078167724, + "grad_norm": 0.345703125, + "learning_rate": 0.0017318977178012247, + "loss": 0.1099, + "step": 24964 + }, + { + "epoch": 0.21670818829697658, + "grad_norm": 0.11083984375, + "learning_rate": 0.0017318765245060079, + "loss": 0.1094, + "step": 24965 + }, + { + "epoch": 0.21671686877718074, + "grad_norm": 0.1494140625, + "learning_rate": 0.0017318553305197806, + "loss": 0.1045, + "step": 24966 + }, + { + "epoch": 0.2167255492573849, + "grad_norm": 0.181640625, + "learning_rate": 0.001731834135842566, + "loss": 0.1133, + "step": 24967 + }, + { + "epoch": 0.21673422973758907, + "grad_norm": 0.259765625, + "learning_rate": 0.0017318129404743868, + "loss": 0.0752, + "step": 24968 + }, + { + "epoch": 0.21674291021779324, + "grad_norm": 0.1767578125, + "learning_rate": 0.0017317917444152667, + "loss": 0.0898, + "step": 24969 + }, + { + "epoch": 0.2167515906979974, + "grad_norm": 0.125, + "learning_rate": 0.0017317705476652286, + "loss": 0.1289, + "step": 24970 + }, + { + "epoch": 0.21676027117820157, + "grad_norm": 0.48046875, + "learning_rate": 0.0017317493502242954, + "loss": 0.124, + "step": 24971 + }, + { + "epoch": 0.21676895165840573, + "grad_norm": 0.1650390625, + "learning_rate": 0.001731728152092491, + "loss": 0.1797, + "step": 24972 + }, + { + "epoch": 0.2167776321386099, + "grad_norm": 0.06396484375, + "learning_rate": 0.001731706953269838, + "loss": 0.1035, + "step": 24973 + }, + { + "epoch": 
0.21678631261881406, + "grad_norm": 0.11767578125, + "learning_rate": 0.0017316857537563598, + "loss": 0.0874, + "step": 24974 + }, + { + "epoch": 0.21679499309901823, + "grad_norm": 0.283203125, + "learning_rate": 0.0017316645535520791, + "loss": 0.0947, + "step": 24975 + }, + { + "epoch": 0.2168036735792224, + "grad_norm": 0.28515625, + "learning_rate": 0.00173164335265702, + "loss": 0.0869, + "step": 24976 + }, + { + "epoch": 0.21681235405942656, + "grad_norm": 0.5234375, + "learning_rate": 0.0017316221510712056, + "loss": 0.0986, + "step": 24977 + }, + { + "epoch": 0.21682103453963072, + "grad_norm": 0.32421875, + "learning_rate": 0.001731600948794658, + "loss": 0.1025, + "step": 24978 + }, + { + "epoch": 0.2168297150198349, + "grad_norm": 0.197265625, + "learning_rate": 0.0017315797458274016, + "loss": 0.1289, + "step": 24979 + }, + { + "epoch": 0.21683839550003906, + "grad_norm": 0.435546875, + "learning_rate": 0.0017315585421694586, + "loss": 0.126, + "step": 24980 + }, + { + "epoch": 0.21684707598024322, + "grad_norm": 0.21484375, + "learning_rate": 0.0017315373378208531, + "loss": 0.1138, + "step": 24981 + }, + { + "epoch": 0.21685575646044739, + "grad_norm": 0.47265625, + "learning_rate": 0.0017315161327816075, + "loss": 0.1064, + "step": 24982 + }, + { + "epoch": 0.21686443694065155, + "grad_norm": 0.455078125, + "learning_rate": 0.0017314949270517458, + "loss": 0.1211, + "step": 24983 + }, + { + "epoch": 0.21687311742085572, + "grad_norm": 0.091796875, + "learning_rate": 0.0017314737206312902, + "loss": 0.0928, + "step": 24984 + }, + { + "epoch": 0.21688179790105988, + "grad_norm": 0.376953125, + "learning_rate": 0.0017314525135202648, + "loss": 0.0947, + "step": 24985 + }, + { + "epoch": 0.21689047838126405, + "grad_norm": 0.42578125, + "learning_rate": 0.0017314313057186924, + "loss": 0.0762, + "step": 24986 + }, + { + "epoch": 0.2168991588614682, + "grad_norm": 0.09375, + "learning_rate": 0.0017314100972265958, + "loss": 0.1123, + "step": 24987 + }, + { + "epoch": 0.21690783934167238, + "grad_norm": 0.4375, + "learning_rate": 0.0017313888880439992, + "loss": 0.1572, + "step": 24988 + }, + { + "epoch": 0.21691651982187654, + "grad_norm": 0.166015625, + "learning_rate": 0.001731367678170925, + "loss": 0.1025, + "step": 24989 + }, + { + "epoch": 0.2169252003020807, + "grad_norm": 0.2138671875, + "learning_rate": 0.0017313464676073965, + "loss": 0.1191, + "step": 24990 + }, + { + "epoch": 0.21693388078228487, + "grad_norm": 0.90625, + "learning_rate": 0.0017313252563534367, + "loss": 0.084, + "step": 24991 + }, + { + "epoch": 0.21694256126248904, + "grad_norm": 0.40625, + "learning_rate": 0.0017313040444090694, + "loss": 0.0889, + "step": 24992 + }, + { + "epoch": 0.2169512417426932, + "grad_norm": 0.2119140625, + "learning_rate": 0.0017312828317743176, + "loss": 0.1309, + "step": 24993 + }, + { + "epoch": 0.21695992222289737, + "grad_norm": 0.94921875, + "learning_rate": 0.0017312616184492042, + "loss": 0.1006, + "step": 24994 + }, + { + "epoch": 0.21696860270310153, + "grad_norm": 0.1455078125, + "learning_rate": 0.0017312404044337528, + "loss": 0.1279, + "step": 24995 + }, + { + "epoch": 0.2169772831833057, + "grad_norm": 0.193359375, + "learning_rate": 0.001731219189727986, + "loss": 0.082, + "step": 24996 + }, + { + "epoch": 0.21698596366350986, + "grad_norm": 0.55859375, + "learning_rate": 0.0017311979743319275, + "loss": 0.3008, + "step": 24997 + }, + { + "epoch": 0.21699464414371403, + "grad_norm": 0.29296875, + "learning_rate": 0.0017311767582456007, + "loss": 0.0894, + 
"step": 24998 + }, + { + "epoch": 0.2170033246239182, + "grad_norm": 0.1982421875, + "learning_rate": 0.001731155541469028, + "loss": 0.1021, + "step": 24999 + }, + { + "epoch": 0.21701200510412236, + "grad_norm": 0.45703125, + "learning_rate": 0.0017311343240022335, + "loss": 0.1533, + "step": 25000 + }, + { + "epoch": 0.21702068558432652, + "grad_norm": 0.125, + "learning_rate": 0.0017311131058452395, + "loss": 0.1143, + "step": 25001 + }, + { + "epoch": 0.2170293660645307, + "grad_norm": 0.2734375, + "learning_rate": 0.0017310918869980704, + "loss": 0.0923, + "step": 25002 + }, + { + "epoch": 0.21703804654473485, + "grad_norm": 0.2470703125, + "learning_rate": 0.0017310706674607483, + "loss": 0.1299, + "step": 25003 + }, + { + "epoch": 0.21704672702493902, + "grad_norm": 0.294921875, + "learning_rate": 0.0017310494472332967, + "loss": 0.1235, + "step": 25004 + }, + { + "epoch": 0.21705540750514318, + "grad_norm": 0.12890625, + "learning_rate": 0.001731028226315739, + "loss": 0.1309, + "step": 25005 + }, + { + "epoch": 0.21706408798534735, + "grad_norm": 0.1416015625, + "learning_rate": 0.0017310070047080984, + "loss": 0.1074, + "step": 25006 + }, + { + "epoch": 0.2170727684655515, + "grad_norm": 0.2734375, + "learning_rate": 0.0017309857824103977, + "loss": 0.1182, + "step": 25007 + }, + { + "epoch": 0.21708144894575568, + "grad_norm": 0.8203125, + "learning_rate": 0.0017309645594226607, + "loss": 0.1445, + "step": 25008 + }, + { + "epoch": 0.21709012942595984, + "grad_norm": 0.6171875, + "learning_rate": 0.0017309433357449102, + "loss": 0.1162, + "step": 25009 + }, + { + "epoch": 0.217098809906164, + "grad_norm": 0.291015625, + "learning_rate": 0.0017309221113771698, + "loss": 0.1562, + "step": 25010 + }, + { + "epoch": 0.21710749038636817, + "grad_norm": 0.494140625, + "learning_rate": 0.0017309008863194623, + "loss": 0.0913, + "step": 25011 + }, + { + "epoch": 0.21711617086657234, + "grad_norm": 0.2294921875, + "learning_rate": 0.001730879660571811, + "loss": 0.0864, + "step": 25012 + }, + { + "epoch": 0.2171248513467765, + "grad_norm": 0.76171875, + "learning_rate": 0.0017308584341342395, + "loss": 0.1406, + "step": 25013 + }, + { + "epoch": 0.21713353182698067, + "grad_norm": 0.451171875, + "learning_rate": 0.0017308372070067705, + "loss": 0.1406, + "step": 25014 + }, + { + "epoch": 0.21714221230718483, + "grad_norm": 0.474609375, + "learning_rate": 0.0017308159791894274, + "loss": 0.1055, + "step": 25015 + }, + { + "epoch": 0.217150892787389, + "grad_norm": 0.546875, + "learning_rate": 0.0017307947506822335, + "loss": 0.1895, + "step": 25016 + }, + { + "epoch": 0.21715957326759316, + "grad_norm": 0.28125, + "learning_rate": 0.0017307735214852115, + "loss": 0.0967, + "step": 25017 + }, + { + "epoch": 0.21716825374779733, + "grad_norm": 0.1748046875, + "learning_rate": 0.0017307522915983853, + "loss": 0.125, + "step": 25018 + }, + { + "epoch": 0.2171769342280015, + "grad_norm": 0.2021484375, + "learning_rate": 0.0017307310610217782, + "loss": 0.1084, + "step": 25019 + }, + { + "epoch": 0.21718561470820566, + "grad_norm": 0.1083984375, + "learning_rate": 0.001730709829755413, + "loss": 0.0962, + "step": 25020 + }, + { + "epoch": 0.21719429518840982, + "grad_norm": 1.28125, + "learning_rate": 0.0017306885977993128, + "loss": 0.1104, + "step": 25021 + }, + { + "epoch": 0.217202975668614, + "grad_norm": 0.56640625, + "learning_rate": 0.0017306673651535007, + "loss": 0.126, + "step": 25022 + }, + { + "epoch": 0.21721165614881816, + "grad_norm": 1.3984375, + "learning_rate": 
0.0017306461318180007, + "loss": 0.1201, + "step": 25023 + }, + { + "epoch": 0.21722033662902232, + "grad_norm": 0.2119140625, + "learning_rate": 0.0017306248977928356, + "loss": 0.0845, + "step": 25024 + }, + { + "epoch": 0.21722901710922649, + "grad_norm": 0.357421875, + "learning_rate": 0.0017306036630780287, + "loss": 0.1025, + "step": 25025 + }, + { + "epoch": 0.21723769758943065, + "grad_norm": 0.4296875, + "learning_rate": 0.0017305824276736027, + "loss": 0.1543, + "step": 25026 + }, + { + "epoch": 0.21724637806963482, + "grad_norm": 0.53515625, + "learning_rate": 0.0017305611915795814, + "loss": 0.1475, + "step": 25027 + }, + { + "epoch": 0.21725505854983898, + "grad_norm": 0.5, + "learning_rate": 0.0017305399547959876, + "loss": 0.0957, + "step": 25028 + }, + { + "epoch": 0.21726373903004315, + "grad_norm": 0.255859375, + "learning_rate": 0.0017305187173228453, + "loss": 0.0957, + "step": 25029 + }, + { + "epoch": 0.2172724195102473, + "grad_norm": 0.578125, + "learning_rate": 0.0017304974791601771, + "loss": 0.127, + "step": 25030 + }, + { + "epoch": 0.21728109999045148, + "grad_norm": 0.0732421875, + "learning_rate": 0.001730476240308006, + "loss": 0.085, + "step": 25031 + }, + { + "epoch": 0.21728978047065564, + "grad_norm": 0.404296875, + "learning_rate": 0.0017304550007663558, + "loss": 0.0981, + "step": 25032 + }, + { + "epoch": 0.2172984609508598, + "grad_norm": 0.5703125, + "learning_rate": 0.0017304337605352493, + "loss": 0.0776, + "step": 25033 + }, + { + "epoch": 0.21730714143106397, + "grad_norm": 0.57421875, + "learning_rate": 0.0017304125196147102, + "loss": 0.1494, + "step": 25034 + }, + { + "epoch": 0.21731582191126814, + "grad_norm": 0.294921875, + "learning_rate": 0.0017303912780047613, + "loss": 0.0913, + "step": 25035 + }, + { + "epoch": 0.2173245023914723, + "grad_norm": 0.083984375, + "learning_rate": 0.0017303700357054258, + "loss": 0.0986, + "step": 25036 + }, + { + "epoch": 0.21733318287167647, + "grad_norm": 0.41015625, + "learning_rate": 0.0017303487927167272, + "loss": 0.0981, + "step": 25037 + }, + { + "epoch": 0.21734186335188063, + "grad_norm": 0.361328125, + "learning_rate": 0.001730327549038689, + "loss": 0.0864, + "step": 25038 + }, + { + "epoch": 0.2173505438320848, + "grad_norm": 0.177734375, + "learning_rate": 0.0017303063046713333, + "loss": 0.0957, + "step": 25039 + }, + { + "epoch": 0.21735922431228896, + "grad_norm": 0.625, + "learning_rate": 0.0017302850596146845, + "loss": 0.0796, + "step": 25040 + }, + { + "epoch": 0.21736790479249313, + "grad_norm": 0.150390625, + "learning_rate": 0.0017302638138687657, + "loss": 0.1123, + "step": 25041 + }, + { + "epoch": 0.2173765852726973, + "grad_norm": 0.125, + "learning_rate": 0.0017302425674335996, + "loss": 0.1006, + "step": 25042 + }, + { + "epoch": 0.21738526575290146, + "grad_norm": 0.515625, + "learning_rate": 0.0017302213203092096, + "loss": 0.1885, + "step": 25043 + }, + { + "epoch": 0.21739394623310562, + "grad_norm": 0.1259765625, + "learning_rate": 0.0017302000724956192, + "loss": 0.0908, + "step": 25044 + }, + { + "epoch": 0.2174026267133098, + "grad_norm": 0.310546875, + "learning_rate": 0.0017301788239928513, + "loss": 0.0771, + "step": 25045 + }, + { + "epoch": 0.21741130719351395, + "grad_norm": 0.271484375, + "learning_rate": 0.0017301575748009297, + "loss": 0.1113, + "step": 25046 + }, + { + "epoch": 0.21741998767371812, + "grad_norm": 0.33203125, + "learning_rate": 0.001730136324919877, + "loss": 0.207, + "step": 25047 + }, + { + "epoch": 0.21742866815392228, + "grad_norm": 
0.70703125, + "learning_rate": 0.0017301150743497164, + "loss": 0.1045, + "step": 25048 + }, + { + "epoch": 0.21743734863412645, + "grad_norm": 0.31640625, + "learning_rate": 0.0017300938230904717, + "loss": 0.1289, + "step": 25049 + }, + { + "epoch": 0.2174460291143306, + "grad_norm": 0.65625, + "learning_rate": 0.001730072571142166, + "loss": 0.1387, + "step": 25050 + }, + { + "epoch": 0.21745470959453478, + "grad_norm": 0.2578125, + "learning_rate": 0.0017300513185048224, + "loss": 0.0986, + "step": 25051 + }, + { + "epoch": 0.21746339007473894, + "grad_norm": 0.47265625, + "learning_rate": 0.0017300300651784636, + "loss": 0.1187, + "step": 25052 + }, + { + "epoch": 0.2174720705549431, + "grad_norm": 0.10302734375, + "learning_rate": 0.0017300088111631138, + "loss": 0.0767, + "step": 25053 + }, + { + "epoch": 0.21748075103514727, + "grad_norm": 0.23046875, + "learning_rate": 0.0017299875564587958, + "loss": 0.123, + "step": 25054 + }, + { + "epoch": 0.21748943151535144, + "grad_norm": 0.60546875, + "learning_rate": 0.001729966301065533, + "loss": 0.1162, + "step": 25055 + }, + { + "epoch": 0.2174981119955556, + "grad_norm": 0.1943359375, + "learning_rate": 0.0017299450449833485, + "loss": 0.1006, + "step": 25056 + }, + { + "epoch": 0.21750679247575977, + "grad_norm": 0.240234375, + "learning_rate": 0.0017299237882122654, + "loss": 0.0957, + "step": 25057 + }, + { + "epoch": 0.21751547295596393, + "grad_norm": 0.10009765625, + "learning_rate": 0.0017299025307523073, + "loss": 0.1064, + "step": 25058 + }, + { + "epoch": 0.2175241534361681, + "grad_norm": 0.416015625, + "learning_rate": 0.001729881272603497, + "loss": 0.0991, + "step": 25059 + }, + { + "epoch": 0.21753283391637226, + "grad_norm": 0.2734375, + "learning_rate": 0.0017298600137658582, + "loss": 0.1055, + "step": 25060 + }, + { + "epoch": 0.21754151439657643, + "grad_norm": 0.392578125, + "learning_rate": 0.0017298387542394136, + "loss": 0.1143, + "step": 25061 + }, + { + "epoch": 0.2175501948767806, + "grad_norm": 0.2392578125, + "learning_rate": 0.0017298174940241873, + "loss": 0.123, + "step": 25062 + }, + { + "epoch": 0.21755887535698476, + "grad_norm": 0.41015625, + "learning_rate": 0.0017297962331202018, + "loss": 0.1221, + "step": 25063 + }, + { + "epoch": 0.21756755583718893, + "grad_norm": 0.0703125, + "learning_rate": 0.001729774971527481, + "loss": 0.0879, + "step": 25064 + }, + { + "epoch": 0.2175762363173931, + "grad_norm": 0.78125, + "learning_rate": 0.001729753709246047, + "loss": 0.1143, + "step": 25065 + }, + { + "epoch": 0.21758491679759726, + "grad_norm": 0.2060546875, + "learning_rate": 0.0017297324462759243, + "loss": 0.0967, + "step": 25066 + }, + { + "epoch": 0.21759359727780142, + "grad_norm": 0.0888671875, + "learning_rate": 0.0017297111826171355, + "loss": 0.0898, + "step": 25067 + }, + { + "epoch": 0.21760227775800559, + "grad_norm": 0.125, + "learning_rate": 0.0017296899182697042, + "loss": 0.1211, + "step": 25068 + }, + { + "epoch": 0.21761095823820975, + "grad_norm": 1.296875, + "learning_rate": 0.0017296686532336531, + "loss": 0.1445, + "step": 25069 + }, + { + "epoch": 0.21761963871841392, + "grad_norm": 0.89453125, + "learning_rate": 0.0017296473875090062, + "loss": 0.0752, + "step": 25070 + }, + { + "epoch": 0.21762831919861808, + "grad_norm": 0.23046875, + "learning_rate": 0.0017296261210957861, + "loss": 0.104, + "step": 25071 + }, + { + "epoch": 0.21763699967882225, + "grad_norm": 0.2421875, + "learning_rate": 0.0017296048539940168, + "loss": 0.127, + "step": 25072 + }, + { + "epoch": 
0.2176456801590264, + "grad_norm": 0.255859375, + "learning_rate": 0.0017295835862037204, + "loss": 0.0752, + "step": 25073 + }, + { + "epoch": 0.21765436063923055, + "grad_norm": 0.16796875, + "learning_rate": 0.0017295623177249213, + "loss": 0.1787, + "step": 25074 + }, + { + "epoch": 0.2176630411194347, + "grad_norm": 0.279296875, + "learning_rate": 0.0017295410485576423, + "loss": 0.125, + "step": 25075 + }, + { + "epoch": 0.21767172159963888, + "grad_norm": 0.640625, + "learning_rate": 0.0017295197787019065, + "loss": 0.1094, + "step": 25076 + }, + { + "epoch": 0.21768040207984304, + "grad_norm": 0.09228515625, + "learning_rate": 0.0017294985081577378, + "loss": 0.0923, + "step": 25077 + }, + { + "epoch": 0.2176890825600472, + "grad_norm": 0.25390625, + "learning_rate": 0.0017294772369251584, + "loss": 0.0981, + "step": 25078 + }, + { + "epoch": 0.21769776304025137, + "grad_norm": 0.11572265625, + "learning_rate": 0.0017294559650041922, + "loss": 0.0977, + "step": 25079 + }, + { + "epoch": 0.21770644352045554, + "grad_norm": 0.203125, + "learning_rate": 0.0017294346923948625, + "loss": 0.1172, + "step": 25080 + }, + { + "epoch": 0.2177151240006597, + "grad_norm": 0.54296875, + "learning_rate": 0.0017294134190971925, + "loss": 0.1367, + "step": 25081 + }, + { + "epoch": 0.21772380448086387, + "grad_norm": 0.21875, + "learning_rate": 0.0017293921451112053, + "loss": 0.082, + "step": 25082 + }, + { + "epoch": 0.21773248496106803, + "grad_norm": 0.9140625, + "learning_rate": 0.0017293708704369246, + "loss": 0.126, + "step": 25083 + }, + { + "epoch": 0.2177411654412722, + "grad_norm": 0.439453125, + "learning_rate": 0.0017293495950743735, + "loss": 0.1079, + "step": 25084 + }, + { + "epoch": 0.21774984592147636, + "grad_norm": 1.078125, + "learning_rate": 0.001729328319023575, + "loss": 0.0991, + "step": 25085 + }, + { + "epoch": 0.21775852640168053, + "grad_norm": 0.7890625, + "learning_rate": 0.0017293070422845522, + "loss": 0.1201, + "step": 25086 + }, + { + "epoch": 0.2177672068818847, + "grad_norm": 0.421875, + "learning_rate": 0.001729285764857329, + "loss": 0.1084, + "step": 25087 + }, + { + "epoch": 0.21777588736208886, + "grad_norm": 0.330078125, + "learning_rate": 0.0017292644867419281, + "loss": 0.1348, + "step": 25088 + }, + { + "epoch": 0.21778456784229303, + "grad_norm": 0.400390625, + "learning_rate": 0.001729243207938373, + "loss": 0.1348, + "step": 25089 + }, + { + "epoch": 0.2177932483224972, + "grad_norm": 0.328125, + "learning_rate": 0.0017292219284466876, + "loss": 0.1172, + "step": 25090 + }, + { + "epoch": 0.21780192880270136, + "grad_norm": 0.326171875, + "learning_rate": 0.0017292006482668937, + "loss": 0.0811, + "step": 25091 + }, + { + "epoch": 0.21781060928290552, + "grad_norm": 0.365234375, + "learning_rate": 0.001729179367399016, + "loss": 0.1201, + "step": 25092 + }, + { + "epoch": 0.21781928976310969, + "grad_norm": 0.380859375, + "learning_rate": 0.001729158085843077, + "loss": 0.0962, + "step": 25093 + }, + { + "epoch": 0.21782797024331385, + "grad_norm": 0.1591796875, + "learning_rate": 0.0017291368035991003, + "loss": 0.1338, + "step": 25094 + }, + { + "epoch": 0.21783665072351802, + "grad_norm": 0.181640625, + "learning_rate": 0.0017291155206671086, + "loss": 0.0879, + "step": 25095 + }, + { + "epoch": 0.21784533120372218, + "grad_norm": 0.111328125, + "learning_rate": 0.0017290942370471259, + "loss": 0.0972, + "step": 25096 + }, + { + "epoch": 0.21785401168392635, + "grad_norm": 0.296875, + "learning_rate": 0.0017290729527391752, + "loss": 0.1387, + 
"step": 25097 + }, + { + "epoch": 0.2178626921641305, + "grad_norm": 0.396484375, + "learning_rate": 0.0017290516677432798, + "loss": 0.125, + "step": 25098 + }, + { + "epoch": 0.21787137264433468, + "grad_norm": 0.12890625, + "learning_rate": 0.0017290303820594632, + "loss": 0.124, + "step": 25099 + }, + { + "epoch": 0.21788005312453884, + "grad_norm": 0.298828125, + "learning_rate": 0.001729009095687748, + "loss": 0.0967, + "step": 25100 + }, + { + "epoch": 0.217888733604743, + "grad_norm": 0.10791015625, + "learning_rate": 0.0017289878086281581, + "loss": 0.125, + "step": 25101 + }, + { + "epoch": 0.21789741408494717, + "grad_norm": 0.34765625, + "learning_rate": 0.0017289665208807166, + "loss": 0.1133, + "step": 25102 + }, + { + "epoch": 0.21790609456515134, + "grad_norm": 0.1630859375, + "learning_rate": 0.0017289452324454467, + "loss": 0.1553, + "step": 25103 + }, + { + "epoch": 0.2179147750453555, + "grad_norm": 0.12109375, + "learning_rate": 0.0017289239433223719, + "loss": 0.0947, + "step": 25104 + }, + { + "epoch": 0.21792345552555967, + "grad_norm": 0.34765625, + "learning_rate": 0.0017289026535115151, + "loss": 0.0928, + "step": 25105 + }, + { + "epoch": 0.21793213600576383, + "grad_norm": 0.08837890625, + "learning_rate": 0.0017288813630129, + "loss": 0.0972, + "step": 25106 + }, + { + "epoch": 0.217940816485968, + "grad_norm": 0.388671875, + "learning_rate": 0.0017288600718265495, + "loss": 0.1172, + "step": 25107 + }, + { + "epoch": 0.21794949696617216, + "grad_norm": 0.1884765625, + "learning_rate": 0.0017288387799524873, + "loss": 0.1045, + "step": 25108 + }, + { + "epoch": 0.21795817744637633, + "grad_norm": 0.10888671875, + "learning_rate": 0.0017288174873907362, + "loss": 0.1094, + "step": 25109 + }, + { + "epoch": 0.2179668579265805, + "grad_norm": 0.095703125, + "learning_rate": 0.0017287961941413197, + "loss": 0.0928, + "step": 25110 + }, + { + "epoch": 0.21797553840678466, + "grad_norm": 0.07275390625, + "learning_rate": 0.0017287749002042612, + "loss": 0.0898, + "step": 25111 + }, + { + "epoch": 0.21798421888698882, + "grad_norm": 0.1630859375, + "learning_rate": 0.001728753605579584, + "loss": 0.1445, + "step": 25112 + }, + { + "epoch": 0.217992899367193, + "grad_norm": 0.51171875, + "learning_rate": 0.0017287323102673114, + "loss": 0.0913, + "step": 25113 + }, + { + "epoch": 0.21800157984739715, + "grad_norm": 0.1591796875, + "learning_rate": 0.0017287110142674665, + "loss": 0.1035, + "step": 25114 + }, + { + "epoch": 0.21801026032760132, + "grad_norm": 0.365234375, + "learning_rate": 0.0017286897175800727, + "loss": 0.1235, + "step": 25115 + }, + { + "epoch": 0.21801894080780548, + "grad_norm": 0.33203125, + "learning_rate": 0.0017286684202051533, + "loss": 0.0874, + "step": 25116 + }, + { + "epoch": 0.21802762128800965, + "grad_norm": 0.376953125, + "learning_rate": 0.0017286471221427313, + "loss": 0.1055, + "step": 25117 + }, + { + "epoch": 0.2180363017682138, + "grad_norm": 0.609375, + "learning_rate": 0.0017286258233928306, + "loss": 0.127, + "step": 25118 + }, + { + "epoch": 0.21804498224841798, + "grad_norm": 0.09375, + "learning_rate": 0.001728604523955474, + "loss": 0.0884, + "step": 25119 + }, + { + "epoch": 0.21805366272862214, + "grad_norm": 0.21875, + "learning_rate": 0.0017285832238306852, + "loss": 0.0815, + "step": 25120 + }, + { + "epoch": 0.2180623432088263, + "grad_norm": 0.1025390625, + "learning_rate": 0.0017285619230184867, + "loss": 0.1182, + "step": 25121 + }, + { + "epoch": 0.21807102368903047, + "grad_norm": 0.1240234375, + 
"learning_rate": 0.0017285406215189027, + "loss": 0.1099, + "step": 25122 + }, + { + "epoch": 0.21807970416923464, + "grad_norm": 0.08349609375, + "learning_rate": 0.0017285193193319563, + "loss": 0.0723, + "step": 25123 + }, + { + "epoch": 0.2180883846494388, + "grad_norm": 0.734375, + "learning_rate": 0.0017284980164576703, + "loss": 0.1348, + "step": 25124 + }, + { + "epoch": 0.21809706512964297, + "grad_norm": 0.3515625, + "learning_rate": 0.001728476712896068, + "loss": 0.0986, + "step": 25125 + }, + { + "epoch": 0.21810574560984713, + "grad_norm": 0.12451171875, + "learning_rate": 0.0017284554086471734, + "loss": 0.0967, + "step": 25126 + }, + { + "epoch": 0.2181144260900513, + "grad_norm": 0.107421875, + "learning_rate": 0.0017284341037110094, + "loss": 0.0928, + "step": 25127 + }, + { + "epoch": 0.21812310657025547, + "grad_norm": 0.1328125, + "learning_rate": 0.0017284127980875994, + "loss": 0.1084, + "step": 25128 + }, + { + "epoch": 0.21813178705045963, + "grad_norm": 0.287109375, + "learning_rate": 0.0017283914917769663, + "loss": 0.084, + "step": 25129 + }, + { + "epoch": 0.2181404675306638, + "grad_norm": 0.82421875, + "learning_rate": 0.001728370184779134, + "loss": 0.1216, + "step": 25130 + }, + { + "epoch": 0.21814914801086796, + "grad_norm": 0.8125, + "learning_rate": 0.0017283488770941256, + "loss": 0.1001, + "step": 25131 + }, + { + "epoch": 0.21815782849107213, + "grad_norm": 0.236328125, + "learning_rate": 0.001728327568721964, + "loss": 0.0967, + "step": 25132 + }, + { + "epoch": 0.2181665089712763, + "grad_norm": 0.189453125, + "learning_rate": 0.0017283062596626729, + "loss": 0.1387, + "step": 25133 + }, + { + "epoch": 0.21817518945148046, + "grad_norm": 0.2353515625, + "learning_rate": 0.0017282849499162755, + "loss": 0.126, + "step": 25134 + }, + { + "epoch": 0.21818386993168462, + "grad_norm": 1.0859375, + "learning_rate": 0.0017282636394827951, + "loss": 0.1143, + "step": 25135 + }, + { + "epoch": 0.21819255041188879, + "grad_norm": 1.9765625, + "learning_rate": 0.0017282423283622552, + "loss": 0.1621, + "step": 25136 + }, + { + "epoch": 0.21820123089209295, + "grad_norm": 0.4453125, + "learning_rate": 0.0017282210165546787, + "loss": 0.104, + "step": 25137 + }, + { + "epoch": 0.21820991137229712, + "grad_norm": 0.5625, + "learning_rate": 0.0017281997040600895, + "loss": 0.1484, + "step": 25138 + }, + { + "epoch": 0.21821859185250128, + "grad_norm": 0.49609375, + "learning_rate": 0.00172817839087851, + "loss": 0.0996, + "step": 25139 + }, + { + "epoch": 0.21822727233270545, + "grad_norm": 0.1123046875, + "learning_rate": 0.0017281570770099645, + "loss": 0.0947, + "step": 25140 + }, + { + "epoch": 0.2182359528129096, + "grad_norm": 0.4765625, + "learning_rate": 0.0017281357624544756, + "loss": 0.0918, + "step": 25141 + }, + { + "epoch": 0.21824463329311378, + "grad_norm": 0.232421875, + "learning_rate": 0.0017281144472120669, + "loss": 0.0996, + "step": 25142 + }, + { + "epoch": 0.21825331377331794, + "grad_norm": 0.189453125, + "learning_rate": 0.0017280931312827618, + "loss": 0.1104, + "step": 25143 + }, + { + "epoch": 0.2182619942535221, + "grad_norm": 0.423828125, + "learning_rate": 0.0017280718146665832, + "loss": 0.1396, + "step": 25144 + }, + { + "epoch": 0.21827067473372627, + "grad_norm": 0.16015625, + "learning_rate": 0.0017280504973635551, + "loss": 0.1138, + "step": 25145 + }, + { + "epoch": 0.21827935521393044, + "grad_norm": 0.34765625, + "learning_rate": 0.0017280291793737004, + "loss": 0.1055, + "step": 25146 + }, + { + "epoch": 
0.2182880356941346, + "grad_norm": 0.7109375, + "learning_rate": 0.0017280078606970422, + "loss": 0.2227, + "step": 25147 + }, + { + "epoch": 0.21829671617433877, + "grad_norm": 0.3515625, + "learning_rate": 0.0017279865413336042, + "loss": 0.1235, + "step": 25148 + }, + { + "epoch": 0.21830539665454293, + "grad_norm": 0.25, + "learning_rate": 0.001727965221283409, + "loss": 0.1348, + "step": 25149 + }, + { + "epoch": 0.2183140771347471, + "grad_norm": 0.13671875, + "learning_rate": 0.0017279439005464811, + "loss": 0.1387, + "step": 25150 + }, + { + "epoch": 0.21832275761495126, + "grad_norm": 0.169921875, + "learning_rate": 0.001727922579122843, + "loss": 0.1016, + "step": 25151 + }, + { + "epoch": 0.21833143809515543, + "grad_norm": 0.30078125, + "learning_rate": 0.0017279012570125185, + "loss": 0.1104, + "step": 25152 + }, + { + "epoch": 0.2183401185753596, + "grad_norm": 0.12353515625, + "learning_rate": 0.0017278799342155301, + "loss": 0.0957, + "step": 25153 + }, + { + "epoch": 0.21834879905556376, + "grad_norm": 0.11767578125, + "learning_rate": 0.001727858610731902, + "loss": 0.1348, + "step": 25154 + }, + { + "epoch": 0.21835747953576792, + "grad_norm": 0.26953125, + "learning_rate": 0.0017278372865616569, + "loss": 0.0918, + "step": 25155 + }, + { + "epoch": 0.2183661600159721, + "grad_norm": 0.150390625, + "learning_rate": 0.0017278159617048184, + "loss": 0.0918, + "step": 25156 + }, + { + "epoch": 0.21837484049617625, + "grad_norm": 0.10595703125, + "learning_rate": 0.0017277946361614102, + "loss": 0.1084, + "step": 25157 + }, + { + "epoch": 0.21838352097638042, + "grad_norm": 1.1640625, + "learning_rate": 0.001727773309931455, + "loss": 0.1387, + "step": 25158 + }, + { + "epoch": 0.21839220145658458, + "grad_norm": 0.72265625, + "learning_rate": 0.001727751983014976, + "loss": 0.0947, + "step": 25159 + }, + { + "epoch": 0.21840088193678875, + "grad_norm": 0.2451171875, + "learning_rate": 0.001727730655411997, + "loss": 0.0942, + "step": 25160 + }, + { + "epoch": 0.21840956241699291, + "grad_norm": 0.2255859375, + "learning_rate": 0.0017277093271225415, + "loss": 0.0903, + "step": 25161 + }, + { + "epoch": 0.21841824289719708, + "grad_norm": 0.0849609375, + "learning_rate": 0.0017276879981466325, + "loss": 0.0986, + "step": 25162 + }, + { + "epoch": 0.21842692337740124, + "grad_norm": 0.109375, + "learning_rate": 0.001727666668484293, + "loss": 0.1113, + "step": 25163 + }, + { + "epoch": 0.2184356038576054, + "grad_norm": 0.83203125, + "learning_rate": 0.0017276453381355468, + "loss": 0.1084, + "step": 25164 + }, + { + "epoch": 0.21844428433780957, + "grad_norm": 0.423828125, + "learning_rate": 0.001727624007100417, + "loss": 0.0933, + "step": 25165 + }, + { + "epoch": 0.21845296481801374, + "grad_norm": 0.640625, + "learning_rate": 0.0017276026753789274, + "loss": 0.0977, + "step": 25166 + }, + { + "epoch": 0.2184616452982179, + "grad_norm": 0.12255859375, + "learning_rate": 0.0017275813429711005, + "loss": 0.0986, + "step": 25167 + }, + { + "epoch": 0.21847032577842207, + "grad_norm": 0.115234375, + "learning_rate": 0.00172756000987696, + "loss": 0.1094, + "step": 25168 + }, + { + "epoch": 0.21847900625862623, + "grad_norm": 0.322265625, + "learning_rate": 0.0017275386760965297, + "loss": 0.1123, + "step": 25169 + }, + { + "epoch": 0.2184876867388304, + "grad_norm": 0.341796875, + "learning_rate": 0.0017275173416298325, + "loss": 0.1025, + "step": 25170 + }, + { + "epoch": 0.21849636721903457, + "grad_norm": 0.1552734375, + "learning_rate": 0.0017274960064768917, + "loss": 
0.0933, + "step": 25171 + }, + { + "epoch": 0.21850504769923873, + "grad_norm": 0.11328125, + "learning_rate": 0.0017274746706377306, + "loss": 0.0962, + "step": 25172 + }, + { + "epoch": 0.2185137281794429, + "grad_norm": 0.2158203125, + "learning_rate": 0.0017274533341123724, + "loss": 0.1123, + "step": 25173 + }, + { + "epoch": 0.21852240865964706, + "grad_norm": 0.486328125, + "learning_rate": 0.0017274319969008411, + "loss": 0.1182, + "step": 25174 + }, + { + "epoch": 0.21853108913985123, + "grad_norm": 0.12890625, + "learning_rate": 0.0017274106590031595, + "loss": 0.0747, + "step": 25175 + }, + { + "epoch": 0.2185397696200554, + "grad_norm": 0.408203125, + "learning_rate": 0.001727389320419351, + "loss": 0.1064, + "step": 25176 + }, + { + "epoch": 0.21854845010025956, + "grad_norm": 0.3359375, + "learning_rate": 0.0017273679811494387, + "loss": 0.1133, + "step": 25177 + }, + { + "epoch": 0.21855713058046372, + "grad_norm": 0.25390625, + "learning_rate": 0.0017273466411934465, + "loss": 0.1074, + "step": 25178 + }, + { + "epoch": 0.21856581106066789, + "grad_norm": 0.328125, + "learning_rate": 0.0017273253005513972, + "loss": 0.0752, + "step": 25179 + }, + { + "epoch": 0.21857449154087205, + "grad_norm": 0.78515625, + "learning_rate": 0.0017273039592233146, + "loss": 0.1279, + "step": 25180 + }, + { + "epoch": 0.21858317202107622, + "grad_norm": 0.82421875, + "learning_rate": 0.0017272826172092218, + "loss": 0.166, + "step": 25181 + }, + { + "epoch": 0.21859185250128038, + "grad_norm": 0.1923828125, + "learning_rate": 0.001727261274509142, + "loss": 0.1582, + "step": 25182 + }, + { + "epoch": 0.21860053298148455, + "grad_norm": 0.609375, + "learning_rate": 0.0017272399311230988, + "loss": 0.124, + "step": 25183 + }, + { + "epoch": 0.2186092134616887, + "grad_norm": 0.15625, + "learning_rate": 0.0017272185870511156, + "loss": 0.085, + "step": 25184 + }, + { + "epoch": 0.21861789394189288, + "grad_norm": 0.59765625, + "learning_rate": 0.0017271972422932154, + "loss": 0.0908, + "step": 25185 + }, + { + "epoch": 0.21862657442209704, + "grad_norm": 0.51953125, + "learning_rate": 0.0017271758968494216, + "loss": 0.1934, + "step": 25186 + }, + { + "epoch": 0.2186352549023012, + "grad_norm": 0.09619140625, + "learning_rate": 0.0017271545507197576, + "loss": 0.126, + "step": 25187 + }, + { + "epoch": 0.21864393538250537, + "grad_norm": 0.451171875, + "learning_rate": 0.0017271332039042473, + "loss": 0.0664, + "step": 25188 + }, + { + "epoch": 0.21865261586270954, + "grad_norm": 0.6640625, + "learning_rate": 0.0017271118564029131, + "loss": 0.0918, + "step": 25189 + }, + { + "epoch": 0.2186612963429137, + "grad_norm": 0.640625, + "learning_rate": 0.001727090508215779, + "loss": 0.0947, + "step": 25190 + }, + { + "epoch": 0.21866997682311787, + "grad_norm": 0.703125, + "learning_rate": 0.001727069159342868, + "loss": 0.1143, + "step": 25191 + }, + { + "epoch": 0.21867865730332203, + "grad_norm": 3.34375, + "learning_rate": 0.0017270478097842037, + "loss": 0.208, + "step": 25192 + }, + { + "epoch": 0.2186873377835262, + "grad_norm": 0.1923828125, + "learning_rate": 0.0017270264595398096, + "loss": 0.0854, + "step": 25193 + }, + { + "epoch": 0.21869601826373036, + "grad_norm": 0.09130859375, + "learning_rate": 0.0017270051086097082, + "loss": 0.1016, + "step": 25194 + }, + { + "epoch": 0.21870469874393453, + "grad_norm": 0.55078125, + "learning_rate": 0.001726983756993924, + "loss": 0.1289, + "step": 25195 + }, + { + "epoch": 0.2187133792241387, + "grad_norm": 0.416015625, + "learning_rate": 
0.0017269624046924795, + "loss": 0.0874, + "step": 25196 + }, + { + "epoch": 0.21872205970434283, + "grad_norm": 0.1611328125, + "learning_rate": 0.001726941051705398, + "loss": 0.0879, + "step": 25197 + }, + { + "epoch": 0.218730740184547, + "grad_norm": 0.11572265625, + "learning_rate": 0.0017269196980327038, + "loss": 0.1162, + "step": 25198 + }, + { + "epoch": 0.21873942066475116, + "grad_norm": 0.1689453125, + "learning_rate": 0.0017268983436744191, + "loss": 0.1104, + "step": 25199 + }, + { + "epoch": 0.21874810114495533, + "grad_norm": 0.2119140625, + "learning_rate": 0.0017268769886305685, + "loss": 0.1279, + "step": 25200 + }, + { + "epoch": 0.2187567816251595, + "grad_norm": 0.91015625, + "learning_rate": 0.001726855632901174, + "loss": 0.1182, + "step": 25201 + }, + { + "epoch": 0.21876546210536366, + "grad_norm": 0.6015625, + "learning_rate": 0.0017268342764862597, + "loss": 0.1025, + "step": 25202 + }, + { + "epoch": 0.21877414258556782, + "grad_norm": 0.255859375, + "learning_rate": 0.001726812919385849, + "loss": 0.0771, + "step": 25203 + }, + { + "epoch": 0.218782823065772, + "grad_norm": 0.2578125, + "learning_rate": 0.001726791561599965, + "loss": 0.1318, + "step": 25204 + }, + { + "epoch": 0.21879150354597615, + "grad_norm": 0.1669921875, + "learning_rate": 0.0017267702031286314, + "loss": 0.1484, + "step": 25205 + }, + { + "epoch": 0.21880018402618032, + "grad_norm": 0.236328125, + "learning_rate": 0.0017267488439718713, + "loss": 0.0986, + "step": 25206 + }, + { + "epoch": 0.21880886450638448, + "grad_norm": 0.08642578125, + "learning_rate": 0.0017267274841297076, + "loss": 0.1445, + "step": 25207 + }, + { + "epoch": 0.21881754498658865, + "grad_norm": 0.62109375, + "learning_rate": 0.0017267061236021647, + "loss": 0.1406, + "step": 25208 + }, + { + "epoch": 0.2188262254667928, + "grad_norm": 0.1787109375, + "learning_rate": 0.0017266847623892652, + "loss": 0.1338, + "step": 25209 + }, + { + "epoch": 0.21883490594699698, + "grad_norm": 0.6640625, + "learning_rate": 0.0017266634004910327, + "loss": 0.1367, + "step": 25210 + }, + { + "epoch": 0.21884358642720114, + "grad_norm": 0.42578125, + "learning_rate": 0.0017266420379074903, + "loss": 0.1553, + "step": 25211 + }, + { + "epoch": 0.2188522669074053, + "grad_norm": 0.5703125, + "learning_rate": 0.0017266206746386617, + "loss": 0.127, + "step": 25212 + }, + { + "epoch": 0.21886094738760947, + "grad_norm": 0.09130859375, + "learning_rate": 0.0017265993106845704, + "loss": 0.1582, + "step": 25213 + }, + { + "epoch": 0.21886962786781364, + "grad_norm": 1.328125, + "learning_rate": 0.001726577946045239, + "loss": 0.1123, + "step": 25214 + }, + { + "epoch": 0.2188783083480178, + "grad_norm": 0.123046875, + "learning_rate": 0.001726556580720692, + "loss": 0.0776, + "step": 25215 + }, + { + "epoch": 0.21888698882822197, + "grad_norm": 0.39453125, + "learning_rate": 0.0017265352147109515, + "loss": 0.083, + "step": 25216 + }, + { + "epoch": 0.21889566930842613, + "grad_norm": 0.1806640625, + "learning_rate": 0.0017265138480160418, + "loss": 0.0898, + "step": 25217 + }, + { + "epoch": 0.2189043497886303, + "grad_norm": 0.94921875, + "learning_rate": 0.0017264924806359863, + "loss": 0.0898, + "step": 25218 + }, + { + "epoch": 0.21891303026883446, + "grad_norm": 0.1328125, + "learning_rate": 0.0017264711125708073, + "loss": 0.1011, + "step": 25219 + }, + { + "epoch": 0.21892171074903863, + "grad_norm": 0.095703125, + "learning_rate": 0.0017264497438205294, + "loss": 0.1123, + "step": 25220 + }, + { + "epoch": 0.2189303912292428, 
+ "grad_norm": 0.154296875, + "learning_rate": 0.0017264283743851753, + "loss": 0.0986, + "step": 25221 + }, + { + "epoch": 0.21893907170944696, + "grad_norm": 0.1435546875, + "learning_rate": 0.0017264070042647688, + "loss": 0.1016, + "step": 25222 + }, + { + "epoch": 0.21894775218965112, + "grad_norm": 0.466796875, + "learning_rate": 0.001726385633459333, + "loss": 0.1104, + "step": 25223 + }, + { + "epoch": 0.2189564326698553, + "grad_norm": 0.37109375, + "learning_rate": 0.0017263642619688908, + "loss": 0.1226, + "step": 25224 + }, + { + "epoch": 0.21896511315005945, + "grad_norm": 0.216796875, + "learning_rate": 0.0017263428897934666, + "loss": 0.1445, + "step": 25225 + }, + { + "epoch": 0.21897379363026362, + "grad_norm": 0.09765625, + "learning_rate": 0.0017263215169330828, + "loss": 0.123, + "step": 25226 + }, + { + "epoch": 0.21898247411046778, + "grad_norm": 0.11328125, + "learning_rate": 0.0017263001433877637, + "loss": 0.1045, + "step": 25227 + }, + { + "epoch": 0.21899115459067195, + "grad_norm": 0.5546875, + "learning_rate": 0.0017262787691575319, + "loss": 0.125, + "step": 25228 + }, + { + "epoch": 0.21899983507087611, + "grad_norm": 0.140625, + "learning_rate": 0.0017262573942424108, + "loss": 0.0918, + "step": 25229 + }, + { + "epoch": 0.21900851555108028, + "grad_norm": 0.32421875, + "learning_rate": 0.0017262360186424243, + "loss": 0.0996, + "step": 25230 + }, + { + "epoch": 0.21901719603128444, + "grad_norm": 0.921875, + "learning_rate": 0.0017262146423575955, + "loss": 0.1621, + "step": 25231 + }, + { + "epoch": 0.2190258765114886, + "grad_norm": 0.51953125, + "learning_rate": 0.0017261932653879477, + "loss": 0.1328, + "step": 25232 + }, + { + "epoch": 0.21903455699169277, + "grad_norm": 0.97265625, + "learning_rate": 0.0017261718877335042, + "loss": 0.1348, + "step": 25233 + }, + { + "epoch": 0.21904323747189694, + "grad_norm": 0.224609375, + "learning_rate": 0.0017261505093942888, + "loss": 0.0874, + "step": 25234 + }, + { + "epoch": 0.2190519179521011, + "grad_norm": 0.298828125, + "learning_rate": 0.0017261291303703246, + "loss": 0.1143, + "step": 25235 + }, + { + "epoch": 0.21906059843230527, + "grad_norm": 0.2275390625, + "learning_rate": 0.001726107750661635, + "loss": 0.1514, + "step": 25236 + }, + { + "epoch": 0.21906927891250944, + "grad_norm": 0.158203125, + "learning_rate": 0.0017260863702682433, + "loss": 0.1279, + "step": 25237 + }, + { + "epoch": 0.2190779593927136, + "grad_norm": 0.162109375, + "learning_rate": 0.0017260649891901727, + "loss": 0.1045, + "step": 25238 + }, + { + "epoch": 0.21908663987291777, + "grad_norm": 0.11572265625, + "learning_rate": 0.0017260436074274471, + "loss": 0.0845, + "step": 25239 + }, + { + "epoch": 0.21909532035312193, + "grad_norm": 0.470703125, + "learning_rate": 0.0017260222249800897, + "loss": 0.1191, + "step": 25240 + }, + { + "epoch": 0.2191040008333261, + "grad_norm": 0.2158203125, + "learning_rate": 0.0017260008418481237, + "loss": 0.0874, + "step": 25241 + }, + { + "epoch": 0.21911268131353026, + "grad_norm": 0.294921875, + "learning_rate": 0.0017259794580315724, + "loss": 0.1011, + "step": 25242 + }, + { + "epoch": 0.21912136179373443, + "grad_norm": 0.390625, + "learning_rate": 0.0017259580735304596, + "loss": 0.1133, + "step": 25243 + }, + { + "epoch": 0.2191300422739386, + "grad_norm": 0.404296875, + "learning_rate": 0.0017259366883448082, + "loss": 0.1006, + "step": 25244 + }, + { + "epoch": 0.21913872275414276, + "grad_norm": 0.1533203125, + "learning_rate": 0.0017259153024746423, + "loss": 0.1235, + 
"step": 25245 + }, + { + "epoch": 0.21914740323434692, + "grad_norm": 0.376953125, + "learning_rate": 0.0017258939159199844, + "loss": 0.1201, + "step": 25246 + }, + { + "epoch": 0.2191560837145511, + "grad_norm": 0.375, + "learning_rate": 0.0017258725286808585, + "loss": 0.1465, + "step": 25247 + }, + { + "epoch": 0.21916476419475525, + "grad_norm": 0.287109375, + "learning_rate": 0.001725851140757288, + "loss": 0.0996, + "step": 25248 + }, + { + "epoch": 0.21917344467495942, + "grad_norm": 0.15625, + "learning_rate": 0.0017258297521492957, + "loss": 0.1133, + "step": 25249 + }, + { + "epoch": 0.21918212515516358, + "grad_norm": 0.1484375, + "learning_rate": 0.0017258083628569057, + "loss": 0.1069, + "step": 25250 + }, + { + "epoch": 0.21919080563536775, + "grad_norm": 0.7890625, + "learning_rate": 0.001725786972880141, + "loss": 0.1167, + "step": 25251 + }, + { + "epoch": 0.2191994861155719, + "grad_norm": 0.40625, + "learning_rate": 0.001725765582219025, + "loss": 0.0996, + "step": 25252 + }, + { + "epoch": 0.21920816659577608, + "grad_norm": 0.84765625, + "learning_rate": 0.0017257441908735812, + "loss": 0.1416, + "step": 25253 + }, + { + "epoch": 0.21921684707598024, + "grad_norm": 0.625, + "learning_rate": 0.001725722798843833, + "loss": 0.1143, + "step": 25254 + }, + { + "epoch": 0.2192255275561844, + "grad_norm": 0.1728515625, + "learning_rate": 0.0017257014061298032, + "loss": 0.0908, + "step": 25255 + }, + { + "epoch": 0.21923420803638857, + "grad_norm": 0.228515625, + "learning_rate": 0.0017256800127315164, + "loss": 0.1309, + "step": 25256 + }, + { + "epoch": 0.21924288851659274, + "grad_norm": 0.158203125, + "learning_rate": 0.0017256586186489952, + "loss": 0.0791, + "step": 25257 + }, + { + "epoch": 0.2192515689967969, + "grad_norm": 0.36328125, + "learning_rate": 0.0017256372238822631, + "loss": 0.0942, + "step": 25258 + }, + { + "epoch": 0.21926024947700107, + "grad_norm": 0.365234375, + "learning_rate": 0.0017256158284313435, + "loss": 0.1182, + "step": 25259 + }, + { + "epoch": 0.21926892995720523, + "grad_norm": 0.43359375, + "learning_rate": 0.00172559443229626, + "loss": 0.1206, + "step": 25260 + }, + { + "epoch": 0.2192776104374094, + "grad_norm": 0.275390625, + "learning_rate": 0.0017255730354770356, + "loss": 0.1113, + "step": 25261 + }, + { + "epoch": 0.21928629091761356, + "grad_norm": 0.41015625, + "learning_rate": 0.0017255516379736941, + "loss": 0.1094, + "step": 25262 + }, + { + "epoch": 0.21929497139781773, + "grad_norm": 0.08984375, + "learning_rate": 0.0017255302397862586, + "loss": 0.1123, + "step": 25263 + }, + { + "epoch": 0.2193036518780219, + "grad_norm": 0.380859375, + "learning_rate": 0.0017255088409147527, + "loss": 0.1133, + "step": 25264 + }, + { + "epoch": 0.21931233235822606, + "grad_norm": 0.859375, + "learning_rate": 0.0017254874413591998, + "loss": 0.0967, + "step": 25265 + }, + { + "epoch": 0.21932101283843022, + "grad_norm": 0.283203125, + "learning_rate": 0.0017254660411196233, + "loss": 0.083, + "step": 25266 + }, + { + "epoch": 0.2193296933186344, + "grad_norm": 0.2578125, + "learning_rate": 0.0017254446401960462, + "loss": 0.1001, + "step": 25267 + }, + { + "epoch": 0.21933837379883855, + "grad_norm": 1.046875, + "learning_rate": 0.0017254232385884923, + "loss": 0.2246, + "step": 25268 + }, + { + "epoch": 0.21934705427904272, + "grad_norm": 0.1845703125, + "learning_rate": 0.001725401836296985, + "loss": 0.1104, + "step": 25269 + }, + { + "epoch": 0.21935573475924688, + "grad_norm": 0.4921875, + "learning_rate": 0.0017253804333215478, + 
"loss": 0.105, + "step": 25270 + }, + { + "epoch": 0.21936441523945105, + "grad_norm": 0.09521484375, + "learning_rate": 0.0017253590296622042, + "loss": 0.1084, + "step": 25271 + }, + { + "epoch": 0.21937309571965521, + "grad_norm": 0.1865234375, + "learning_rate": 0.0017253376253189765, + "loss": 0.1299, + "step": 25272 + }, + { + "epoch": 0.21938177619985938, + "grad_norm": 0.416015625, + "learning_rate": 0.0017253162202918897, + "loss": 0.0977, + "step": 25273 + }, + { + "epoch": 0.21939045668006354, + "grad_norm": 0.2451171875, + "learning_rate": 0.0017252948145809663, + "loss": 0.1187, + "step": 25274 + }, + { + "epoch": 0.2193991371602677, + "grad_norm": 0.2373046875, + "learning_rate": 0.00172527340818623, + "loss": 0.1055, + "step": 25275 + }, + { + "epoch": 0.21940781764047187, + "grad_norm": 0.140625, + "learning_rate": 0.0017252520011077038, + "loss": 0.1133, + "step": 25276 + }, + { + "epoch": 0.21941649812067604, + "grad_norm": 0.5703125, + "learning_rate": 0.0017252305933454116, + "loss": 0.1001, + "step": 25277 + }, + { + "epoch": 0.2194251786008802, + "grad_norm": 0.11474609375, + "learning_rate": 0.0017252091848993765, + "loss": 0.1318, + "step": 25278 + }, + { + "epoch": 0.21943385908108437, + "grad_norm": 0.26953125, + "learning_rate": 0.001725187775769622, + "loss": 0.1211, + "step": 25279 + }, + { + "epoch": 0.21944253956128854, + "grad_norm": 0.345703125, + "learning_rate": 0.0017251663659561718, + "loss": 0.1367, + "step": 25280 + }, + { + "epoch": 0.2194512200414927, + "grad_norm": 0.287109375, + "learning_rate": 0.0017251449554590488, + "loss": 0.0991, + "step": 25281 + }, + { + "epoch": 0.21945990052169687, + "grad_norm": 0.68359375, + "learning_rate": 0.0017251235442782766, + "loss": 0.1182, + "step": 25282 + }, + { + "epoch": 0.21946858100190103, + "grad_norm": 1.296875, + "learning_rate": 0.0017251021324138792, + "loss": 0.0908, + "step": 25283 + }, + { + "epoch": 0.2194772614821052, + "grad_norm": 0.490234375, + "learning_rate": 0.001725080719865879, + "loss": 0.168, + "step": 25284 + }, + { + "epoch": 0.21948594196230936, + "grad_norm": 0.30078125, + "learning_rate": 0.0017250593066343, + "loss": 0.1035, + "step": 25285 + }, + { + "epoch": 0.21949462244251353, + "grad_norm": 1.1015625, + "learning_rate": 0.0017250378927191657, + "loss": 0.1045, + "step": 25286 + }, + { + "epoch": 0.2195033029227177, + "grad_norm": 0.37890625, + "learning_rate": 0.0017250164781204991, + "loss": 0.0972, + "step": 25287 + }, + { + "epoch": 0.21951198340292186, + "grad_norm": 0.3125, + "learning_rate": 0.0017249950628383242, + "loss": 0.0986, + "step": 25288 + }, + { + "epoch": 0.21952066388312602, + "grad_norm": 0.08349609375, + "learning_rate": 0.0017249736468726638, + "loss": 0.0889, + "step": 25289 + }, + { + "epoch": 0.2195293443633302, + "grad_norm": 0.1240234375, + "learning_rate": 0.0017249522302235417, + "loss": 0.123, + "step": 25290 + }, + { + "epoch": 0.21953802484353435, + "grad_norm": 0.34765625, + "learning_rate": 0.0017249308128909813, + "loss": 0.126, + "step": 25291 + }, + { + "epoch": 0.21954670532373852, + "grad_norm": 0.5390625, + "learning_rate": 0.001724909394875006, + "loss": 0.1699, + "step": 25292 + }, + { + "epoch": 0.21955538580394268, + "grad_norm": 0.1318359375, + "learning_rate": 0.001724887976175639, + "loss": 0.0918, + "step": 25293 + }, + { + "epoch": 0.21956406628414685, + "grad_norm": 0.10595703125, + "learning_rate": 0.001724866556792904, + "loss": 0.0718, + "step": 25294 + }, + { + "epoch": 0.219572746764351, + "grad_norm": 0.421875, + 
"learning_rate": 0.0017248451367268243, + "loss": 0.0991, + "step": 25295 + }, + { + "epoch": 0.21958142724455518, + "grad_norm": 0.181640625, + "learning_rate": 0.001724823715977423, + "loss": 0.1157, + "step": 25296 + }, + { + "epoch": 0.21959010772475934, + "grad_norm": 0.2265625, + "learning_rate": 0.0017248022945447247, + "loss": 0.0981, + "step": 25297 + }, + { + "epoch": 0.2195987882049635, + "grad_norm": 0.416015625, + "learning_rate": 0.0017247808724287512, + "loss": 0.0903, + "step": 25298 + }, + { + "epoch": 0.21960746868516767, + "grad_norm": 0.3828125, + "learning_rate": 0.0017247594496295272, + "loss": 0.0947, + "step": 25299 + }, + { + "epoch": 0.21961614916537184, + "grad_norm": 0.341796875, + "learning_rate": 0.0017247380261470754, + "loss": 0.1025, + "step": 25300 + }, + { + "epoch": 0.219624829645576, + "grad_norm": 0.337890625, + "learning_rate": 0.0017247166019814196, + "loss": 0.1152, + "step": 25301 + }, + { + "epoch": 0.21963351012578017, + "grad_norm": 0.22265625, + "learning_rate": 0.001724695177132583, + "loss": 0.0957, + "step": 25302 + }, + { + "epoch": 0.21964219060598433, + "grad_norm": 0.10595703125, + "learning_rate": 0.0017246737516005894, + "loss": 0.1426, + "step": 25303 + }, + { + "epoch": 0.2196508710861885, + "grad_norm": 0.13671875, + "learning_rate": 0.0017246523253854618, + "loss": 0.1084, + "step": 25304 + }, + { + "epoch": 0.21965955156639266, + "grad_norm": 0.1552734375, + "learning_rate": 0.0017246308984872236, + "loss": 0.1084, + "step": 25305 + }, + { + "epoch": 0.21966823204659683, + "grad_norm": 0.376953125, + "learning_rate": 0.001724609470905899, + "loss": 0.0938, + "step": 25306 + }, + { + "epoch": 0.219676912526801, + "grad_norm": 0.2138671875, + "learning_rate": 0.00172458804264151, + "loss": 0.1475, + "step": 25307 + }, + { + "epoch": 0.21968559300700516, + "grad_norm": 0.341796875, + "learning_rate": 0.0017245666136940817, + "loss": 0.1543, + "step": 25308 + }, + { + "epoch": 0.21969427348720932, + "grad_norm": 0.51953125, + "learning_rate": 0.0017245451840636364, + "loss": 0.1016, + "step": 25309 + }, + { + "epoch": 0.2197029539674135, + "grad_norm": 0.302734375, + "learning_rate": 0.0017245237537501978, + "loss": 0.126, + "step": 25310 + }, + { + "epoch": 0.21971163444761765, + "grad_norm": 0.1708984375, + "learning_rate": 0.0017245023227537898, + "loss": 0.1035, + "step": 25311 + }, + { + "epoch": 0.21972031492782182, + "grad_norm": 1.015625, + "learning_rate": 0.001724480891074435, + "loss": 0.1143, + "step": 25312 + }, + { + "epoch": 0.21972899540802598, + "grad_norm": 0.1689453125, + "learning_rate": 0.0017244594587121575, + "loss": 0.1016, + "step": 25313 + }, + { + "epoch": 0.21973767588823015, + "grad_norm": 0.80859375, + "learning_rate": 0.0017244380256669805, + "loss": 0.332, + "step": 25314 + }, + { + "epoch": 0.21974635636843431, + "grad_norm": 0.765625, + "learning_rate": 0.0017244165919389272, + "loss": 0.1064, + "step": 25315 + }, + { + "epoch": 0.21975503684863848, + "grad_norm": 0.271484375, + "learning_rate": 0.0017243951575280218, + "loss": 0.1191, + "step": 25316 + }, + { + "epoch": 0.21976371732884264, + "grad_norm": 0.322265625, + "learning_rate": 0.0017243737224342866, + "loss": 0.1055, + "step": 25317 + }, + { + "epoch": 0.2197723978090468, + "grad_norm": 0.421875, + "learning_rate": 0.0017243522866577463, + "loss": 0.084, + "step": 25318 + }, + { + "epoch": 0.21978107828925098, + "grad_norm": 0.1962890625, + "learning_rate": 0.0017243308501984232, + "loss": 0.0923, + "step": 25319 + }, + { + "epoch": 
0.2197897587694551, + "grad_norm": 0.263671875, + "learning_rate": 0.0017243094130563418, + "loss": 0.1001, + "step": 25320 + }, + { + "epoch": 0.21979843924965928, + "grad_norm": 0.19140625, + "learning_rate": 0.0017242879752315244, + "loss": 0.1309, + "step": 25321 + }, + { + "epoch": 0.21980711972986344, + "grad_norm": 0.671875, + "learning_rate": 0.0017242665367239954, + "loss": 0.1045, + "step": 25322 + }, + { + "epoch": 0.2198158002100676, + "grad_norm": 0.6171875, + "learning_rate": 0.001724245097533778, + "loss": 0.0903, + "step": 25323 + }, + { + "epoch": 0.21982448069027177, + "grad_norm": 0.09326171875, + "learning_rate": 0.0017242236576608951, + "loss": 0.1172, + "step": 25324 + }, + { + "epoch": 0.21983316117047594, + "grad_norm": 0.5625, + "learning_rate": 0.001724202217105371, + "loss": 0.1328, + "step": 25325 + }, + { + "epoch": 0.2198418416506801, + "grad_norm": 0.130859375, + "learning_rate": 0.0017241807758672288, + "loss": 0.1123, + "step": 25326 + }, + { + "epoch": 0.21985052213088427, + "grad_norm": 0.24609375, + "learning_rate": 0.0017241593339464916, + "loss": 0.124, + "step": 25327 + }, + { + "epoch": 0.21985920261108843, + "grad_norm": 0.14453125, + "learning_rate": 0.001724137891343183, + "loss": 0.1064, + "step": 25328 + }, + { + "epoch": 0.2198678830912926, + "grad_norm": 0.490234375, + "learning_rate": 0.001724116448057327, + "loss": 0.0977, + "step": 25329 + }, + { + "epoch": 0.21987656357149676, + "grad_norm": 0.7109375, + "learning_rate": 0.0017240950040889468, + "loss": 0.168, + "step": 25330 + }, + { + "epoch": 0.21988524405170093, + "grad_norm": 0.81640625, + "learning_rate": 0.0017240735594380651, + "loss": 0.1025, + "step": 25331 + }, + { + "epoch": 0.2198939245319051, + "grad_norm": 0.6484375, + "learning_rate": 0.0017240521141047062, + "loss": 0.0938, + "step": 25332 + }, + { + "epoch": 0.21990260501210926, + "grad_norm": 0.228515625, + "learning_rate": 0.0017240306680888933, + "loss": 0.1113, + "step": 25333 + }, + { + "epoch": 0.21991128549231342, + "grad_norm": 0.2177734375, + "learning_rate": 0.0017240092213906496, + "loss": 0.1104, + "step": 25334 + }, + { + "epoch": 0.2199199659725176, + "grad_norm": 0.298828125, + "learning_rate": 0.0017239877740099988, + "loss": 0.126, + "step": 25335 + }, + { + "epoch": 0.21992864645272175, + "grad_norm": 0.251953125, + "learning_rate": 0.001723966325946965, + "loss": 0.1162, + "step": 25336 + }, + { + "epoch": 0.21993732693292592, + "grad_norm": 0.220703125, + "learning_rate": 0.0017239448772015703, + "loss": 0.1328, + "step": 25337 + }, + { + "epoch": 0.21994600741313008, + "grad_norm": 0.58203125, + "learning_rate": 0.0017239234277738392, + "loss": 0.1152, + "step": 25338 + }, + { + "epoch": 0.21995468789333425, + "grad_norm": 0.365234375, + "learning_rate": 0.0017239019776637944, + "loss": 0.1201, + "step": 25339 + }, + { + "epoch": 0.21996336837353841, + "grad_norm": 0.35546875, + "learning_rate": 0.0017238805268714602, + "loss": 0.1523, + "step": 25340 + }, + { + "epoch": 0.21997204885374258, + "grad_norm": 0.1845703125, + "learning_rate": 0.0017238590753968594, + "loss": 0.0962, + "step": 25341 + }, + { + "epoch": 0.21998072933394675, + "grad_norm": 0.271484375, + "learning_rate": 0.0017238376232400158, + "loss": 0.1162, + "step": 25342 + }, + { + "epoch": 0.2199894098141509, + "grad_norm": 0.20703125, + "learning_rate": 0.0017238161704009524, + "loss": 0.126, + "step": 25343 + }, + { + "epoch": 0.21999809029435508, + "grad_norm": 0.173828125, + "learning_rate": 0.0017237947168796935, + "loss": 
0.0957, + "step": 25344 + }, + { + "epoch": 0.22000677077455924, + "grad_norm": 0.10595703125, + "learning_rate": 0.001723773262676262, + "loss": 0.1699, + "step": 25345 + }, + { + "epoch": 0.2200154512547634, + "grad_norm": 0.8515625, + "learning_rate": 0.0017237518077906813, + "loss": 0.1289, + "step": 25346 + }, + { + "epoch": 0.22002413173496757, + "grad_norm": 0.2578125, + "learning_rate": 0.001723730352222975, + "loss": 0.0938, + "step": 25347 + }, + { + "epoch": 0.22003281221517174, + "grad_norm": 0.0830078125, + "learning_rate": 0.0017237088959731665, + "loss": 0.0977, + "step": 25348 + }, + { + "epoch": 0.2200414926953759, + "grad_norm": 0.484375, + "learning_rate": 0.0017236874390412797, + "loss": 0.0898, + "step": 25349 + }, + { + "epoch": 0.22005017317558007, + "grad_norm": 0.51953125, + "learning_rate": 0.0017236659814273373, + "loss": 0.1187, + "step": 25350 + }, + { + "epoch": 0.22005885365578423, + "grad_norm": 0.6171875, + "learning_rate": 0.0017236445231313632, + "loss": 0.126, + "step": 25351 + }, + { + "epoch": 0.2200675341359884, + "grad_norm": 0.119140625, + "learning_rate": 0.001723623064153381, + "loss": 0.1084, + "step": 25352 + }, + { + "epoch": 0.22007621461619256, + "grad_norm": 0.1259765625, + "learning_rate": 0.0017236016044934138, + "loss": 0.1113, + "step": 25353 + }, + { + "epoch": 0.22008489509639673, + "grad_norm": 0.0810546875, + "learning_rate": 0.0017235801441514854, + "loss": 0.0791, + "step": 25354 + }, + { + "epoch": 0.2200935755766009, + "grad_norm": 0.2236328125, + "learning_rate": 0.001723558683127619, + "loss": 0.1235, + "step": 25355 + }, + { + "epoch": 0.22010225605680506, + "grad_norm": 0.36328125, + "learning_rate": 0.0017235372214218384, + "loss": 0.1055, + "step": 25356 + }, + { + "epoch": 0.22011093653700922, + "grad_norm": 0.2255859375, + "learning_rate": 0.0017235157590341664, + "loss": 0.1357, + "step": 25357 + }, + { + "epoch": 0.2201196170172134, + "grad_norm": 0.2275390625, + "learning_rate": 0.001723494295964627, + "loss": 0.1348, + "step": 25358 + }, + { + "epoch": 0.22012829749741755, + "grad_norm": 0.443359375, + "learning_rate": 0.001723472832213244, + "loss": 0.1035, + "step": 25359 + }, + { + "epoch": 0.22013697797762172, + "grad_norm": 0.4609375, + "learning_rate": 0.0017234513677800406, + "loss": 0.1416, + "step": 25360 + }, + { + "epoch": 0.22014565845782588, + "grad_norm": 0.79296875, + "learning_rate": 0.0017234299026650398, + "loss": 0.1172, + "step": 25361 + }, + { + "epoch": 0.22015433893803005, + "grad_norm": 0.09228515625, + "learning_rate": 0.0017234084368682654, + "loss": 0.1094, + "step": 25362 + }, + { + "epoch": 0.2201630194182342, + "grad_norm": 0.71484375, + "learning_rate": 0.001723386970389741, + "loss": 0.1348, + "step": 25363 + }, + { + "epoch": 0.22017169989843838, + "grad_norm": 0.0859375, + "learning_rate": 0.00172336550322949, + "loss": 0.0952, + "step": 25364 + }, + { + "epoch": 0.22018038037864254, + "grad_norm": 0.478515625, + "learning_rate": 0.001723344035387536, + "loss": 0.1055, + "step": 25365 + }, + { + "epoch": 0.2201890608588467, + "grad_norm": 0.37109375, + "learning_rate": 0.001723322566863902, + "loss": 0.1074, + "step": 25366 + }, + { + "epoch": 0.22019774133905087, + "grad_norm": 0.1357421875, + "learning_rate": 0.0017233010976586119, + "loss": 0.1338, + "step": 25367 + }, + { + "epoch": 0.22020642181925504, + "grad_norm": 0.0986328125, + "learning_rate": 0.001723279627771689, + "loss": 0.1226, + "step": 25368 + }, + { + "epoch": 0.2202151022994592, + "grad_norm": 0.2236328125, + 
"learning_rate": 0.0017232581572031568, + "loss": 0.1357, + "step": 25369 + }, + { + "epoch": 0.22022378277966337, + "grad_norm": 0.1123046875, + "learning_rate": 0.0017232366859530395, + "loss": 0.103, + "step": 25370 + }, + { + "epoch": 0.22023246325986753, + "grad_norm": 0.453125, + "learning_rate": 0.0017232152140213592, + "loss": 0.1084, + "step": 25371 + }, + { + "epoch": 0.2202411437400717, + "grad_norm": 0.07080078125, + "learning_rate": 0.0017231937414081404, + "loss": 0.1582, + "step": 25372 + }, + { + "epoch": 0.22024982422027586, + "grad_norm": 0.349609375, + "learning_rate": 0.001723172268113406, + "loss": 0.1211, + "step": 25373 + }, + { + "epoch": 0.22025850470048003, + "grad_norm": 0.275390625, + "learning_rate": 0.0017231507941371802, + "loss": 0.1074, + "step": 25374 + }, + { + "epoch": 0.2202671851806842, + "grad_norm": 0.439453125, + "learning_rate": 0.001723129319479486, + "loss": 0.0923, + "step": 25375 + }, + { + "epoch": 0.22027586566088836, + "grad_norm": 0.2001953125, + "learning_rate": 0.0017231078441403468, + "loss": 0.1641, + "step": 25376 + }, + { + "epoch": 0.22028454614109252, + "grad_norm": 0.64453125, + "learning_rate": 0.0017230863681197864, + "loss": 0.1318, + "step": 25377 + }, + { + "epoch": 0.2202932266212967, + "grad_norm": 0.1376953125, + "learning_rate": 0.001723064891417828, + "loss": 0.085, + "step": 25378 + }, + { + "epoch": 0.22030190710150085, + "grad_norm": 0.1767578125, + "learning_rate": 0.001723043414034495, + "loss": 0.104, + "step": 25379 + }, + { + "epoch": 0.22031058758170502, + "grad_norm": 0.33203125, + "learning_rate": 0.0017230219359698111, + "loss": 0.1523, + "step": 25380 + }, + { + "epoch": 0.22031926806190918, + "grad_norm": 0.1044921875, + "learning_rate": 0.0017230004572237998, + "loss": 0.0967, + "step": 25381 + }, + { + "epoch": 0.22032794854211335, + "grad_norm": 0.33203125, + "learning_rate": 0.0017229789777964848, + "loss": 0.0986, + "step": 25382 + }, + { + "epoch": 0.22033662902231752, + "grad_norm": 0.58203125, + "learning_rate": 0.0017229574976878892, + "loss": 0.1357, + "step": 25383 + }, + { + "epoch": 0.22034530950252168, + "grad_norm": 0.130859375, + "learning_rate": 0.0017229360168980368, + "loss": 0.0698, + "step": 25384 + }, + { + "epoch": 0.22035398998272585, + "grad_norm": 0.203125, + "learning_rate": 0.0017229145354269507, + "loss": 0.1289, + "step": 25385 + }, + { + "epoch": 0.22036267046293, + "grad_norm": 0.2421875, + "learning_rate": 0.0017228930532746547, + "loss": 0.1118, + "step": 25386 + }, + { + "epoch": 0.22037135094313418, + "grad_norm": 0.0771484375, + "learning_rate": 0.0017228715704411721, + "loss": 0.1079, + "step": 25387 + }, + { + "epoch": 0.22038003142333834, + "grad_norm": 0.1318359375, + "learning_rate": 0.001722850086926527, + "loss": 0.125, + "step": 25388 + }, + { + "epoch": 0.2203887119035425, + "grad_norm": 0.44140625, + "learning_rate": 0.0017228286027307417, + "loss": 0.0786, + "step": 25389 + }, + { + "epoch": 0.22039739238374667, + "grad_norm": 0.1591796875, + "learning_rate": 0.0017228071178538409, + "loss": 0.0869, + "step": 25390 + }, + { + "epoch": 0.22040607286395084, + "grad_norm": 0.55078125, + "learning_rate": 0.0017227856322958473, + "loss": 0.0977, + "step": 25391 + }, + { + "epoch": 0.220414753344155, + "grad_norm": 0.291015625, + "learning_rate": 0.001722764146056785, + "loss": 0.1562, + "step": 25392 + }, + { + "epoch": 0.22042343382435917, + "grad_norm": 0.19140625, + "learning_rate": 0.0017227426591366769, + "loss": 0.1001, + "step": 25393 + }, + { + "epoch": 
0.22043211430456333, + "grad_norm": 0.79296875, + "learning_rate": 0.001722721171535547, + "loss": 0.127, + "step": 25394 + }, + { + "epoch": 0.2204407947847675, + "grad_norm": 0.1572265625, + "learning_rate": 0.0017226996832534185, + "loss": 0.1035, + "step": 25395 + }, + { + "epoch": 0.22044947526497166, + "grad_norm": 0.244140625, + "learning_rate": 0.001722678194290315, + "loss": 0.1094, + "step": 25396 + }, + { + "epoch": 0.22045815574517583, + "grad_norm": 0.44921875, + "learning_rate": 0.0017226567046462599, + "loss": 0.1279, + "step": 25397 + }, + { + "epoch": 0.22046683622538, + "grad_norm": 0.11767578125, + "learning_rate": 0.0017226352143212766, + "loss": 0.1133, + "step": 25398 + }, + { + "epoch": 0.22047551670558416, + "grad_norm": 0.3515625, + "learning_rate": 0.0017226137233153891, + "loss": 0.0894, + "step": 25399 + }, + { + "epoch": 0.22048419718578832, + "grad_norm": 0.30859375, + "learning_rate": 0.0017225922316286206, + "loss": 0.124, + "step": 25400 + }, + { + "epoch": 0.2204928776659925, + "grad_norm": 0.11669921875, + "learning_rate": 0.0017225707392609943, + "loss": 0.0908, + "step": 25401 + }, + { + "epoch": 0.22050155814619665, + "grad_norm": 0.205078125, + "learning_rate": 0.0017225492462125344, + "loss": 0.1328, + "step": 25402 + }, + { + "epoch": 0.22051023862640082, + "grad_norm": 0.3671875, + "learning_rate": 0.0017225277524832636, + "loss": 0.1426, + "step": 25403 + }, + { + "epoch": 0.22051891910660498, + "grad_norm": 0.1162109375, + "learning_rate": 0.001722506258073206, + "loss": 0.1006, + "step": 25404 + }, + { + "epoch": 0.22052759958680915, + "grad_norm": 0.26953125, + "learning_rate": 0.001722484762982385, + "loss": 0.1279, + "step": 25405 + }, + { + "epoch": 0.2205362800670133, + "grad_norm": 0.1611328125, + "learning_rate": 0.0017224632672108238, + "loss": 0.0928, + "step": 25406 + }, + { + "epoch": 0.22054496054721748, + "grad_norm": 1.2265625, + "learning_rate": 0.0017224417707585463, + "loss": 0.1484, + "step": 25407 + }, + { + "epoch": 0.22055364102742164, + "grad_norm": 0.322265625, + "learning_rate": 0.0017224202736255759, + "loss": 0.0977, + "step": 25408 + }, + { + "epoch": 0.2205623215076258, + "grad_norm": 0.36328125, + "learning_rate": 0.001722398775811936, + "loss": 0.123, + "step": 25409 + }, + { + "epoch": 0.22057100198782997, + "grad_norm": 0.63671875, + "learning_rate": 0.0017223772773176497, + "loss": 0.127, + "step": 25410 + }, + { + "epoch": 0.22057968246803414, + "grad_norm": 0.314453125, + "learning_rate": 0.0017223557781427415, + "loss": 0.1367, + "step": 25411 + }, + { + "epoch": 0.2205883629482383, + "grad_norm": 0.30078125, + "learning_rate": 0.001722334278287234, + "loss": 0.0923, + "step": 25412 + }, + { + "epoch": 0.22059704342844247, + "grad_norm": 0.498046875, + "learning_rate": 0.0017223127777511514, + "loss": 0.1113, + "step": 25413 + }, + { + "epoch": 0.22060572390864663, + "grad_norm": 0.765625, + "learning_rate": 0.001722291276534517, + "loss": 0.1055, + "step": 25414 + }, + { + "epoch": 0.2206144043888508, + "grad_norm": 0.28515625, + "learning_rate": 0.0017222697746373536, + "loss": 0.0879, + "step": 25415 + }, + { + "epoch": 0.22062308486905496, + "grad_norm": 0.70703125, + "learning_rate": 0.0017222482720596858, + "loss": 0.1079, + "step": 25416 + }, + { + "epoch": 0.22063176534925913, + "grad_norm": 0.5234375, + "learning_rate": 0.0017222267688015364, + "loss": 0.1348, + "step": 25417 + }, + { + "epoch": 0.2206404458294633, + "grad_norm": 0.177734375, + "learning_rate": 0.0017222052648629294, + "loss": 0.1025, 
+ "step": 25418 + }, + { + "epoch": 0.22064912630966746, + "grad_norm": 0.2177734375, + "learning_rate": 0.001722183760243888, + "loss": 0.0918, + "step": 25419 + }, + { + "epoch": 0.22065780678987162, + "grad_norm": 0.431640625, + "learning_rate": 0.0017221622549444357, + "loss": 0.0898, + "step": 25420 + }, + { + "epoch": 0.2206664872700758, + "grad_norm": 0.75390625, + "learning_rate": 0.0017221407489645961, + "loss": 0.0908, + "step": 25421 + }, + { + "epoch": 0.22067516775027995, + "grad_norm": 0.330078125, + "learning_rate": 0.001722119242304393, + "loss": 0.0898, + "step": 25422 + }, + { + "epoch": 0.22068384823048412, + "grad_norm": 0.09326171875, + "learning_rate": 0.0017220977349638495, + "loss": 0.0601, + "step": 25423 + }, + { + "epoch": 0.22069252871068828, + "grad_norm": 0.2578125, + "learning_rate": 0.001722076226942989, + "loss": 0.1035, + "step": 25424 + }, + { + "epoch": 0.22070120919089245, + "grad_norm": 0.455078125, + "learning_rate": 0.0017220547182418356, + "loss": 0.1348, + "step": 25425 + }, + { + "epoch": 0.22070988967109662, + "grad_norm": 0.400390625, + "learning_rate": 0.0017220332088604122, + "loss": 0.165, + "step": 25426 + }, + { + "epoch": 0.22071857015130078, + "grad_norm": 1.0234375, + "learning_rate": 0.0017220116987987431, + "loss": 0.1348, + "step": 25427 + }, + { + "epoch": 0.22072725063150495, + "grad_norm": 0.9140625, + "learning_rate": 0.0017219901880568508, + "loss": 0.0918, + "step": 25428 + }, + { + "epoch": 0.2207359311117091, + "grad_norm": 0.26171875, + "learning_rate": 0.0017219686766347597, + "loss": 0.1396, + "step": 25429 + }, + { + "epoch": 0.22074461159191328, + "grad_norm": 0.1103515625, + "learning_rate": 0.001721947164532493, + "loss": 0.1162, + "step": 25430 + }, + { + "epoch": 0.22075329207211744, + "grad_norm": 0.77734375, + "learning_rate": 0.001721925651750074, + "loss": 0.1201, + "step": 25431 + }, + { + "epoch": 0.2207619725523216, + "grad_norm": 0.6875, + "learning_rate": 0.0017219041382875264, + "loss": 0.1465, + "step": 25432 + }, + { + "epoch": 0.22077065303252577, + "grad_norm": 0.1640625, + "learning_rate": 0.0017218826241448743, + "loss": 0.1465, + "step": 25433 + }, + { + "epoch": 0.22077933351272994, + "grad_norm": 0.0888671875, + "learning_rate": 0.0017218611093221406, + "loss": 0.1016, + "step": 25434 + }, + { + "epoch": 0.2207880139929341, + "grad_norm": 0.16015625, + "learning_rate": 0.0017218395938193485, + "loss": 0.0879, + "step": 25435 + }, + { + "epoch": 0.22079669447313827, + "grad_norm": 0.150390625, + "learning_rate": 0.0017218180776365222, + "loss": 0.1245, + "step": 25436 + }, + { + "epoch": 0.22080537495334243, + "grad_norm": 0.50390625, + "learning_rate": 0.0017217965607736851, + "loss": 0.1055, + "step": 25437 + }, + { + "epoch": 0.2208140554335466, + "grad_norm": 0.66796875, + "learning_rate": 0.0017217750432308603, + "loss": 0.1309, + "step": 25438 + }, + { + "epoch": 0.22082273591375076, + "grad_norm": 0.2197265625, + "learning_rate": 0.001721753525008072, + "loss": 0.0972, + "step": 25439 + }, + { + "epoch": 0.22083141639395493, + "grad_norm": 0.08203125, + "learning_rate": 0.0017217320061053434, + "loss": 0.0898, + "step": 25440 + }, + { + "epoch": 0.2208400968741591, + "grad_norm": 0.37890625, + "learning_rate": 0.0017217104865226976, + "loss": 0.1328, + "step": 25441 + }, + { + "epoch": 0.22084877735436326, + "grad_norm": 0.314453125, + "learning_rate": 0.001721688966260159, + "loss": 0.1025, + "step": 25442 + }, + { + "epoch": 0.2208574578345674, + "grad_norm": 0.0703125, + "learning_rate": 
0.0017216674453177506, + "loss": 0.0903, + "step": 25443 + }, + { + "epoch": 0.22086613831477156, + "grad_norm": 0.2216796875, + "learning_rate": 0.001721645923695496, + "loss": 0.0854, + "step": 25444 + }, + { + "epoch": 0.22087481879497572, + "grad_norm": 0.19921875, + "learning_rate": 0.0017216244013934187, + "loss": 0.125, + "step": 25445 + }, + { + "epoch": 0.2208834992751799, + "grad_norm": 0.09912109375, + "learning_rate": 0.001721602878411542, + "loss": 0.0796, + "step": 25446 + }, + { + "epoch": 0.22089217975538405, + "grad_norm": 0.5, + "learning_rate": 0.00172158135474989, + "loss": 0.0967, + "step": 25447 + }, + { + "epoch": 0.22090086023558822, + "grad_norm": 0.1435546875, + "learning_rate": 0.001721559830408486, + "loss": 0.1455, + "step": 25448 + }, + { + "epoch": 0.22090954071579239, + "grad_norm": 0.328125, + "learning_rate": 0.0017215383053873535, + "loss": 0.1221, + "step": 25449 + }, + { + "epoch": 0.22091822119599655, + "grad_norm": 0.609375, + "learning_rate": 0.001721516779686516, + "loss": 0.1582, + "step": 25450 + }, + { + "epoch": 0.22092690167620072, + "grad_norm": 0.146484375, + "learning_rate": 0.0017214952533059972, + "loss": 0.1133, + "step": 25451 + }, + { + "epoch": 0.22093558215640488, + "grad_norm": 0.3203125, + "learning_rate": 0.0017214737262458207, + "loss": 0.0977, + "step": 25452 + }, + { + "epoch": 0.22094426263660905, + "grad_norm": 0.58203125, + "learning_rate": 0.0017214521985060096, + "loss": 0.0723, + "step": 25453 + }, + { + "epoch": 0.2209529431168132, + "grad_norm": 0.15234375, + "learning_rate": 0.0017214306700865877, + "loss": 0.085, + "step": 25454 + }, + { + "epoch": 0.22096162359701738, + "grad_norm": 0.365234375, + "learning_rate": 0.0017214091409875785, + "loss": 0.123, + "step": 25455 + }, + { + "epoch": 0.22097030407722154, + "grad_norm": 0.07421875, + "learning_rate": 0.0017213876112090057, + "loss": 0.0918, + "step": 25456 + }, + { + "epoch": 0.2209789845574257, + "grad_norm": 0.25390625, + "learning_rate": 0.001721366080750893, + "loss": 0.1045, + "step": 25457 + }, + { + "epoch": 0.22098766503762987, + "grad_norm": 0.0869140625, + "learning_rate": 0.0017213445496132632, + "loss": 0.0889, + "step": 25458 + }, + { + "epoch": 0.22099634551783404, + "grad_norm": 0.12060546875, + "learning_rate": 0.0017213230177961407, + "loss": 0.1006, + "step": 25459 + }, + { + "epoch": 0.2210050259980382, + "grad_norm": 0.1669921875, + "learning_rate": 0.0017213014852995485, + "loss": 0.0986, + "step": 25460 + }, + { + "epoch": 0.22101370647824237, + "grad_norm": 0.416015625, + "learning_rate": 0.0017212799521235103, + "loss": 0.1543, + "step": 25461 + }, + { + "epoch": 0.22102238695844653, + "grad_norm": 0.287109375, + "learning_rate": 0.00172125841826805, + "loss": 0.1846, + "step": 25462 + }, + { + "epoch": 0.2210310674386507, + "grad_norm": 0.1845703125, + "learning_rate": 0.0017212368837331905, + "loss": 0.0903, + "step": 25463 + }, + { + "epoch": 0.22103974791885486, + "grad_norm": 0.4921875, + "learning_rate": 0.0017212153485189556, + "loss": 0.1055, + "step": 25464 + }, + { + "epoch": 0.22104842839905903, + "grad_norm": 0.0849609375, + "learning_rate": 0.0017211938126253693, + "loss": 0.0986, + "step": 25465 + }, + { + "epoch": 0.2210571088792632, + "grad_norm": 0.11181640625, + "learning_rate": 0.0017211722760524543, + "loss": 0.1172, + "step": 25466 + }, + { + "epoch": 0.22106578935946736, + "grad_norm": 0.103515625, + "learning_rate": 0.0017211507388002352, + "loss": 0.0947, + "step": 25467 + }, + { + "epoch": 0.22107446983967152, + 
"grad_norm": 0.478515625, + "learning_rate": 0.0017211292008687345, + "loss": 0.0767, + "step": 25468 + }, + { + "epoch": 0.2210831503198757, + "grad_norm": 0.259765625, + "learning_rate": 0.0017211076622579765, + "loss": 0.0928, + "step": 25469 + }, + { + "epoch": 0.22109183080007985, + "grad_norm": 1.171875, + "learning_rate": 0.0017210861229679845, + "loss": 0.1006, + "step": 25470 + }, + { + "epoch": 0.22110051128028402, + "grad_norm": 0.1240234375, + "learning_rate": 0.0017210645829987819, + "loss": 0.1128, + "step": 25471 + }, + { + "epoch": 0.22110919176048818, + "grad_norm": 0.11328125, + "learning_rate": 0.0017210430423503925, + "loss": 0.126, + "step": 25472 + }, + { + "epoch": 0.22111787224069235, + "grad_norm": 0.11474609375, + "learning_rate": 0.0017210215010228398, + "loss": 0.0938, + "step": 25473 + }, + { + "epoch": 0.2211265527208965, + "grad_norm": 0.208984375, + "learning_rate": 0.0017209999590161473, + "loss": 0.127, + "step": 25474 + }, + { + "epoch": 0.22113523320110068, + "grad_norm": 0.3046875, + "learning_rate": 0.0017209784163303384, + "loss": 0.1299, + "step": 25475 + }, + { + "epoch": 0.22114391368130484, + "grad_norm": 0.2158203125, + "learning_rate": 0.0017209568729654367, + "loss": 0.0938, + "step": 25476 + }, + { + "epoch": 0.221152594161509, + "grad_norm": 0.93359375, + "learning_rate": 0.0017209353289214665, + "loss": 0.1035, + "step": 25477 + }, + { + "epoch": 0.22116127464171317, + "grad_norm": 0.83203125, + "learning_rate": 0.0017209137841984504, + "loss": 0.1895, + "step": 25478 + }, + { + "epoch": 0.22116995512191734, + "grad_norm": 0.240234375, + "learning_rate": 0.0017208922387964122, + "loss": 0.0986, + "step": 25479 + }, + { + "epoch": 0.2211786356021215, + "grad_norm": 0.734375, + "learning_rate": 0.001720870692715376, + "loss": 0.1426, + "step": 25480 + }, + { + "epoch": 0.22118731608232567, + "grad_norm": 0.5390625, + "learning_rate": 0.0017208491459553643, + "loss": 0.1221, + "step": 25481 + }, + { + "epoch": 0.22119599656252983, + "grad_norm": 0.7734375, + "learning_rate": 0.0017208275985164019, + "loss": 0.1182, + "step": 25482 + }, + { + "epoch": 0.221204677042734, + "grad_norm": 0.234375, + "learning_rate": 0.0017208060503985114, + "loss": 0.0928, + "step": 25483 + }, + { + "epoch": 0.22121335752293816, + "grad_norm": 0.20703125, + "learning_rate": 0.0017207845016017167, + "loss": 0.1074, + "step": 25484 + }, + { + "epoch": 0.22122203800314233, + "grad_norm": 0.208984375, + "learning_rate": 0.0017207629521260417, + "loss": 0.1025, + "step": 25485 + }, + { + "epoch": 0.2212307184833465, + "grad_norm": 0.51171875, + "learning_rate": 0.0017207414019715095, + "loss": 0.1172, + "step": 25486 + }, + { + "epoch": 0.22123939896355066, + "grad_norm": 0.12353515625, + "learning_rate": 0.0017207198511381438, + "loss": 0.1006, + "step": 25487 + }, + { + "epoch": 0.22124807944375482, + "grad_norm": 0.3046875, + "learning_rate": 0.0017206982996259682, + "loss": 0.1387, + "step": 25488 + }, + { + "epoch": 0.221256759923959, + "grad_norm": 0.23828125, + "learning_rate": 0.0017206767474350062, + "loss": 0.0801, + "step": 25489 + }, + { + "epoch": 0.22126544040416316, + "grad_norm": 0.11279296875, + "learning_rate": 0.0017206551945652816, + "loss": 0.0908, + "step": 25490 + }, + { + "epoch": 0.22127412088436732, + "grad_norm": 0.09814453125, + "learning_rate": 0.001720633641016818, + "loss": 0.0981, + "step": 25491 + }, + { + "epoch": 0.22128280136457149, + "grad_norm": 0.2734375, + "learning_rate": 0.0017206120867896385, + "loss": 0.1113, + "step": 25492 + 
}, + { + "epoch": 0.22129148184477565, + "grad_norm": 0.0888671875, + "learning_rate": 0.0017205905318837668, + "loss": 0.0801, + "step": 25493 + }, + { + "epoch": 0.22130016232497982, + "grad_norm": 0.19921875, + "learning_rate": 0.0017205689762992268, + "loss": 0.166, + "step": 25494 + }, + { + "epoch": 0.22130884280518398, + "grad_norm": 0.443359375, + "learning_rate": 0.0017205474200360418, + "loss": 0.1094, + "step": 25495 + }, + { + "epoch": 0.22131752328538815, + "grad_norm": 0.1787109375, + "learning_rate": 0.0017205258630942357, + "loss": 0.1299, + "step": 25496 + }, + { + "epoch": 0.2213262037655923, + "grad_norm": 0.412109375, + "learning_rate": 0.0017205043054738316, + "loss": 0.0957, + "step": 25497 + }, + { + "epoch": 0.22133488424579648, + "grad_norm": 0.11572265625, + "learning_rate": 0.0017204827471748534, + "loss": 0.1133, + "step": 25498 + }, + { + "epoch": 0.22134356472600064, + "grad_norm": 0.71875, + "learning_rate": 0.0017204611881973246, + "loss": 0.1206, + "step": 25499 + }, + { + "epoch": 0.2213522452062048, + "grad_norm": 0.2119140625, + "learning_rate": 0.0017204396285412688, + "loss": 0.1367, + "step": 25500 + }, + { + "epoch": 0.22136092568640897, + "grad_norm": 0.87890625, + "learning_rate": 0.0017204180682067092, + "loss": 0.1133, + "step": 25501 + }, + { + "epoch": 0.22136960616661314, + "grad_norm": 0.40625, + "learning_rate": 0.0017203965071936702, + "loss": 0.1328, + "step": 25502 + }, + { + "epoch": 0.2213782866468173, + "grad_norm": 0.333984375, + "learning_rate": 0.0017203749455021746, + "loss": 0.0898, + "step": 25503 + }, + { + "epoch": 0.22138696712702147, + "grad_norm": 0.330078125, + "learning_rate": 0.0017203533831322464, + "loss": 0.1035, + "step": 25504 + }, + { + "epoch": 0.22139564760722563, + "grad_norm": 0.796875, + "learning_rate": 0.001720331820083909, + "loss": 0.1396, + "step": 25505 + }, + { + "epoch": 0.2214043280874298, + "grad_norm": 0.1767578125, + "learning_rate": 0.0017203102563571862, + "loss": 0.0869, + "step": 25506 + }, + { + "epoch": 0.22141300856763396, + "grad_norm": 0.0830078125, + "learning_rate": 0.0017202886919521012, + "loss": 0.0894, + "step": 25507 + }, + { + "epoch": 0.22142168904783813, + "grad_norm": 0.185546875, + "learning_rate": 0.001720267126868678, + "loss": 0.0972, + "step": 25508 + }, + { + "epoch": 0.2214303695280423, + "grad_norm": 0.2490234375, + "learning_rate": 0.0017202455611069398, + "loss": 0.1157, + "step": 25509 + }, + { + "epoch": 0.22143905000824646, + "grad_norm": 0.08544921875, + "learning_rate": 0.0017202239946669106, + "loss": 0.0732, + "step": 25510 + }, + { + "epoch": 0.22144773048845062, + "grad_norm": 0.0859375, + "learning_rate": 0.0017202024275486136, + "loss": 0.0947, + "step": 25511 + }, + { + "epoch": 0.2214564109686548, + "grad_norm": 0.5078125, + "learning_rate": 0.0017201808597520725, + "loss": 0.1504, + "step": 25512 + }, + { + "epoch": 0.22146509144885895, + "grad_norm": 0.248046875, + "learning_rate": 0.0017201592912773113, + "loss": 0.0864, + "step": 25513 + }, + { + "epoch": 0.22147377192906312, + "grad_norm": 0.10546875, + "learning_rate": 0.0017201377221243526, + "loss": 0.0771, + "step": 25514 + }, + { + "epoch": 0.22148245240926728, + "grad_norm": 0.248046875, + "learning_rate": 0.0017201161522932208, + "loss": 0.1074, + "step": 25515 + }, + { + "epoch": 0.22149113288947145, + "grad_norm": 0.291015625, + "learning_rate": 0.00172009458178394, + "loss": 0.1162, + "step": 25516 + }, + { + "epoch": 0.2214998133696756, + "grad_norm": 0.447265625, + "learning_rate": 
0.0017200730105965322, + "loss": 0.1396, + "step": 25517 + }, + { + "epoch": 0.22150849384987978, + "grad_norm": 0.34375, + "learning_rate": 0.0017200514387310224, + "loss": 0.1104, + "step": 25518 + }, + { + "epoch": 0.22151717433008394, + "grad_norm": 0.2119140625, + "learning_rate": 0.0017200298661874335, + "loss": 0.1074, + "step": 25519 + }, + { + "epoch": 0.2215258548102881, + "grad_norm": 0.236328125, + "learning_rate": 0.0017200082929657893, + "loss": 0.1152, + "step": 25520 + }, + { + "epoch": 0.22153453529049227, + "grad_norm": 0.1748046875, + "learning_rate": 0.0017199867190661132, + "loss": 0.106, + "step": 25521 + }, + { + "epoch": 0.22154321577069644, + "grad_norm": 0.119140625, + "learning_rate": 0.0017199651444884288, + "loss": 0.0811, + "step": 25522 + }, + { + "epoch": 0.2215518962509006, + "grad_norm": 0.12158203125, + "learning_rate": 0.0017199435692327601, + "loss": 0.1152, + "step": 25523 + }, + { + "epoch": 0.22156057673110477, + "grad_norm": 0.3046875, + "learning_rate": 0.0017199219932991304, + "loss": 0.1191, + "step": 25524 + }, + { + "epoch": 0.22156925721130893, + "grad_norm": 0.2197265625, + "learning_rate": 0.001719900416687563, + "loss": 0.1621, + "step": 25525 + }, + { + "epoch": 0.2215779376915131, + "grad_norm": 1.265625, + "learning_rate": 0.0017198788393980823, + "loss": 0.1982, + "step": 25526 + }, + { + "epoch": 0.22158661817171726, + "grad_norm": 0.1484375, + "learning_rate": 0.001719857261430711, + "loss": 0.1133, + "step": 25527 + }, + { + "epoch": 0.22159529865192143, + "grad_norm": 0.71484375, + "learning_rate": 0.0017198356827854734, + "loss": 0.0752, + "step": 25528 + }, + { + "epoch": 0.2216039791321256, + "grad_norm": 0.60546875, + "learning_rate": 0.0017198141034623928, + "loss": 0.1064, + "step": 25529 + }, + { + "epoch": 0.22161265961232976, + "grad_norm": 0.11376953125, + "learning_rate": 0.0017197925234614928, + "loss": 0.123, + "step": 25530 + }, + { + "epoch": 0.22162134009253392, + "grad_norm": 1.4453125, + "learning_rate": 0.001719770942782797, + "loss": 0.1816, + "step": 25531 + }, + { + "epoch": 0.2216300205727381, + "grad_norm": 0.53125, + "learning_rate": 0.0017197493614263286, + "loss": 0.0977, + "step": 25532 + }, + { + "epoch": 0.22163870105294226, + "grad_norm": 0.419921875, + "learning_rate": 0.001719727779392112, + "loss": 0.1709, + "step": 25533 + }, + { + "epoch": 0.22164738153314642, + "grad_norm": 0.373046875, + "learning_rate": 0.0017197061966801705, + "loss": 0.1201, + "step": 25534 + }, + { + "epoch": 0.22165606201335059, + "grad_norm": 0.150390625, + "learning_rate": 0.0017196846132905277, + "loss": 0.1182, + "step": 25535 + }, + { + "epoch": 0.22166474249355475, + "grad_norm": 0.16015625, + "learning_rate": 0.0017196630292232067, + "loss": 0.1035, + "step": 25536 + }, + { + "epoch": 0.22167342297375892, + "grad_norm": 0.1201171875, + "learning_rate": 0.0017196414444782318, + "loss": 0.1133, + "step": 25537 + }, + { + "epoch": 0.22168210345396308, + "grad_norm": 0.1298828125, + "learning_rate": 0.0017196198590556265, + "loss": 0.0791, + "step": 25538 + }, + { + "epoch": 0.22169078393416725, + "grad_norm": 0.384765625, + "learning_rate": 0.0017195982729554138, + "loss": 0.0928, + "step": 25539 + }, + { + "epoch": 0.2216994644143714, + "grad_norm": 0.73046875, + "learning_rate": 0.0017195766861776178, + "loss": 0.1279, + "step": 25540 + }, + { + "epoch": 0.22170814489457558, + "grad_norm": 1.40625, + "learning_rate": 0.0017195550987222621, + "loss": 0.1465, + "step": 25541 + }, + { + "epoch": 0.22171682537477974, + 
"grad_norm": 0.267578125, + "learning_rate": 0.0017195335105893703, + "loss": 0.1016, + "step": 25542 + }, + { + "epoch": 0.2217255058549839, + "grad_norm": 0.09814453125, + "learning_rate": 0.001719511921778966, + "loss": 0.0957, + "step": 25543 + }, + { + "epoch": 0.22173418633518807, + "grad_norm": 0.640625, + "learning_rate": 0.0017194903322910726, + "loss": 0.1426, + "step": 25544 + }, + { + "epoch": 0.22174286681539224, + "grad_norm": 0.1640625, + "learning_rate": 0.0017194687421257144, + "loss": 0.1045, + "step": 25545 + }, + { + "epoch": 0.2217515472955964, + "grad_norm": 0.34765625, + "learning_rate": 0.001719447151282914, + "loss": 0.1416, + "step": 25546 + }, + { + "epoch": 0.22176022777580057, + "grad_norm": 0.419921875, + "learning_rate": 0.0017194255597626957, + "loss": 0.1104, + "step": 25547 + }, + { + "epoch": 0.22176890825600473, + "grad_norm": 0.306640625, + "learning_rate": 0.001719403967565083, + "loss": 0.123, + "step": 25548 + }, + { + "epoch": 0.2217775887362089, + "grad_norm": 0.515625, + "learning_rate": 0.0017193823746900992, + "loss": 0.1089, + "step": 25549 + }, + { + "epoch": 0.22178626921641306, + "grad_norm": 0.349609375, + "learning_rate": 0.001719360781137768, + "loss": 0.1143, + "step": 25550 + }, + { + "epoch": 0.22179494969661723, + "grad_norm": 0.1474609375, + "learning_rate": 0.0017193391869081134, + "loss": 0.1191, + "step": 25551 + }, + { + "epoch": 0.2218036301768214, + "grad_norm": 0.2314453125, + "learning_rate": 0.0017193175920011588, + "loss": 0.1211, + "step": 25552 + }, + { + "epoch": 0.22181231065702556, + "grad_norm": 0.349609375, + "learning_rate": 0.0017192959964169277, + "loss": 0.1064, + "step": 25553 + }, + { + "epoch": 0.22182099113722972, + "grad_norm": 0.12451171875, + "learning_rate": 0.001719274400155444, + "loss": 0.1147, + "step": 25554 + }, + { + "epoch": 0.2218296716174339, + "grad_norm": 0.73828125, + "learning_rate": 0.0017192528032167309, + "loss": 0.0854, + "step": 25555 + }, + { + "epoch": 0.22183835209763805, + "grad_norm": 0.67578125, + "learning_rate": 0.0017192312056008123, + "loss": 0.0981, + "step": 25556 + }, + { + "epoch": 0.22184703257784222, + "grad_norm": 0.447265625, + "learning_rate": 0.0017192096073077115, + "loss": 0.127, + "step": 25557 + }, + { + "epoch": 0.22185571305804638, + "grad_norm": 0.4765625, + "learning_rate": 0.0017191880083374529, + "loss": 0.1299, + "step": 25558 + }, + { + "epoch": 0.22186439353825055, + "grad_norm": 0.361328125, + "learning_rate": 0.0017191664086900593, + "loss": 0.1279, + "step": 25559 + }, + { + "epoch": 0.2218730740184547, + "grad_norm": 0.1064453125, + "learning_rate": 0.0017191448083655548, + "loss": 0.1289, + "step": 25560 + }, + { + "epoch": 0.22188175449865888, + "grad_norm": 0.138671875, + "learning_rate": 0.0017191232073639625, + "loss": 0.0806, + "step": 25561 + }, + { + "epoch": 0.22189043497886304, + "grad_norm": 0.20703125, + "learning_rate": 0.0017191016056853064, + "loss": 0.1167, + "step": 25562 + }, + { + "epoch": 0.2218991154590672, + "grad_norm": 0.302734375, + "learning_rate": 0.0017190800033296102, + "loss": 0.0879, + "step": 25563 + }, + { + "epoch": 0.22190779593927137, + "grad_norm": 0.80859375, + "learning_rate": 0.0017190584002968977, + "loss": 0.2031, + "step": 25564 + }, + { + "epoch": 0.22191647641947554, + "grad_norm": 0.482421875, + "learning_rate": 0.0017190367965871918, + "loss": 0.1367, + "step": 25565 + }, + { + "epoch": 0.22192515689967968, + "grad_norm": 0.271484375, + "learning_rate": 0.0017190151922005165, + "loss": 0.0869, + "step": 
25566 + }, + { + "epoch": 0.22193383737988384, + "grad_norm": 0.5859375, + "learning_rate": 0.001718993587136896, + "loss": 0.1143, + "step": 25567 + }, + { + "epoch": 0.221942517860088, + "grad_norm": 0.953125, + "learning_rate": 0.0017189719813963534, + "loss": 0.0986, + "step": 25568 + }, + { + "epoch": 0.22195119834029217, + "grad_norm": 0.1884765625, + "learning_rate": 0.0017189503749789118, + "loss": 0.0938, + "step": 25569 + }, + { + "epoch": 0.22195987882049634, + "grad_norm": 0.8046875, + "learning_rate": 0.0017189287678845955, + "loss": 0.1309, + "step": 25570 + }, + { + "epoch": 0.2219685593007005, + "grad_norm": 0.310546875, + "learning_rate": 0.0017189071601134284, + "loss": 0.1338, + "step": 25571 + }, + { + "epoch": 0.22197723978090467, + "grad_norm": 0.3828125, + "learning_rate": 0.0017188855516654335, + "loss": 0.1006, + "step": 25572 + }, + { + "epoch": 0.22198592026110883, + "grad_norm": 0.1416015625, + "learning_rate": 0.0017188639425406345, + "loss": 0.0889, + "step": 25573 + }, + { + "epoch": 0.221994600741313, + "grad_norm": 0.2421875, + "learning_rate": 0.0017188423327390554, + "loss": 0.0811, + "step": 25574 + }, + { + "epoch": 0.22200328122151716, + "grad_norm": 0.2294921875, + "learning_rate": 0.0017188207222607194, + "loss": 0.1396, + "step": 25575 + }, + { + "epoch": 0.22201196170172133, + "grad_norm": 0.52734375, + "learning_rate": 0.0017187991111056507, + "loss": 0.1455, + "step": 25576 + }, + { + "epoch": 0.2220206421819255, + "grad_norm": 0.162109375, + "learning_rate": 0.0017187774992738724, + "loss": 0.0991, + "step": 25577 + }, + { + "epoch": 0.22202932266212966, + "grad_norm": 0.154296875, + "learning_rate": 0.0017187558867654083, + "loss": 0.126, + "step": 25578 + }, + { + "epoch": 0.22203800314233382, + "grad_norm": 0.154296875, + "learning_rate": 0.001718734273580282, + "loss": 0.1436, + "step": 25579 + }, + { + "epoch": 0.222046683622538, + "grad_norm": 0.1328125, + "learning_rate": 0.0017187126597185173, + "loss": 0.0952, + "step": 25580 + }, + { + "epoch": 0.22205536410274215, + "grad_norm": 0.451171875, + "learning_rate": 0.0017186910451801377, + "loss": 0.0928, + "step": 25581 + }, + { + "epoch": 0.22206404458294632, + "grad_norm": 0.484375, + "learning_rate": 0.001718669429965167, + "loss": 0.1074, + "step": 25582 + }, + { + "epoch": 0.22207272506315048, + "grad_norm": 0.205078125, + "learning_rate": 0.0017186478140736284, + "loss": 0.0903, + "step": 25583 + }, + { + "epoch": 0.22208140554335465, + "grad_norm": 0.9375, + "learning_rate": 0.001718626197505546, + "loss": 0.106, + "step": 25584 + }, + { + "epoch": 0.2220900860235588, + "grad_norm": 0.90234375, + "learning_rate": 0.0017186045802609431, + "loss": 0.1328, + "step": 25585 + }, + { + "epoch": 0.22209876650376298, + "grad_norm": 0.224609375, + "learning_rate": 0.0017185829623398438, + "loss": 0.1416, + "step": 25586 + }, + { + "epoch": 0.22210744698396714, + "grad_norm": 0.1513671875, + "learning_rate": 0.0017185613437422714, + "loss": 0.0996, + "step": 25587 + }, + { + "epoch": 0.2221161274641713, + "grad_norm": 0.1689453125, + "learning_rate": 0.0017185397244682498, + "loss": 0.125, + "step": 25588 + }, + { + "epoch": 0.22212480794437547, + "grad_norm": 0.41796875, + "learning_rate": 0.0017185181045178022, + "loss": 0.0938, + "step": 25589 + }, + { + "epoch": 0.22213348842457964, + "grad_norm": 0.48046875, + "learning_rate": 0.0017184964838909524, + "loss": 0.105, + "step": 25590 + }, + { + "epoch": 0.2221421689047838, + "grad_norm": 0.439453125, + "learning_rate": 
0.0017184748625877246, + "loss": 0.125, + "step": 25591 + }, + { + "epoch": 0.22215084938498797, + "grad_norm": 0.466796875, + "learning_rate": 0.0017184532406081415, + "loss": 0.1216, + "step": 25592 + }, + { + "epoch": 0.22215952986519213, + "grad_norm": 0.310546875, + "learning_rate": 0.0017184316179522272, + "loss": 0.1191, + "step": 25593 + }, + { + "epoch": 0.2221682103453963, + "grad_norm": 0.2236328125, + "learning_rate": 0.0017184099946200059, + "loss": 0.0918, + "step": 25594 + }, + { + "epoch": 0.22217689082560046, + "grad_norm": 0.0869140625, + "learning_rate": 0.0017183883706115004, + "loss": 0.1206, + "step": 25595 + }, + { + "epoch": 0.22218557130580463, + "grad_norm": 0.44921875, + "learning_rate": 0.0017183667459267343, + "loss": 0.0898, + "step": 25596 + }, + { + "epoch": 0.2221942517860088, + "grad_norm": 0.1904296875, + "learning_rate": 0.0017183451205657321, + "loss": 0.0972, + "step": 25597 + }, + { + "epoch": 0.22220293226621296, + "grad_norm": 0.1875, + "learning_rate": 0.0017183234945285168, + "loss": 0.0801, + "step": 25598 + }, + { + "epoch": 0.22221161274641713, + "grad_norm": 0.09130859375, + "learning_rate": 0.0017183018678151125, + "loss": 0.1191, + "step": 25599 + }, + { + "epoch": 0.2222202932266213, + "grad_norm": 0.369140625, + "learning_rate": 0.0017182802404255425, + "loss": 0.1182, + "step": 25600 + }, + { + "epoch": 0.22222897370682546, + "grad_norm": 0.48046875, + "learning_rate": 0.00171825861235983, + "loss": 0.1128, + "step": 25601 + }, + { + "epoch": 0.22223765418702962, + "grad_norm": 0.1318359375, + "learning_rate": 0.0017182369836179997, + "loss": 0.1523, + "step": 25602 + }, + { + "epoch": 0.22224633466723379, + "grad_norm": 0.9296875, + "learning_rate": 0.0017182153542000744, + "loss": 0.1084, + "step": 25603 + }, + { + "epoch": 0.22225501514743795, + "grad_norm": 0.1318359375, + "learning_rate": 0.0017181937241060783, + "loss": 0.2246, + "step": 25604 + }, + { + "epoch": 0.22226369562764212, + "grad_norm": 0.373046875, + "learning_rate": 0.0017181720933360347, + "loss": 0.0815, + "step": 25605 + }, + { + "epoch": 0.22227237610784628, + "grad_norm": 0.263671875, + "learning_rate": 0.0017181504618899676, + "loss": 0.0957, + "step": 25606 + }, + { + "epoch": 0.22228105658805045, + "grad_norm": 0.392578125, + "learning_rate": 0.0017181288297679002, + "loss": 0.1123, + "step": 25607 + }, + { + "epoch": 0.2222897370682546, + "grad_norm": 0.30078125, + "learning_rate": 0.0017181071969698564, + "loss": 0.1035, + "step": 25608 + }, + { + "epoch": 0.22229841754845878, + "grad_norm": 0.08251953125, + "learning_rate": 0.00171808556349586, + "loss": 0.1279, + "step": 25609 + }, + { + "epoch": 0.22230709802866294, + "grad_norm": 0.34765625, + "learning_rate": 0.0017180639293459346, + "loss": 0.0977, + "step": 25610 + }, + { + "epoch": 0.2223157785088671, + "grad_norm": 0.11669921875, + "learning_rate": 0.0017180422945201035, + "loss": 0.1094, + "step": 25611 + }, + { + "epoch": 0.22232445898907127, + "grad_norm": 0.4921875, + "learning_rate": 0.0017180206590183906, + "loss": 0.1011, + "step": 25612 + }, + { + "epoch": 0.22233313946927544, + "grad_norm": 0.59375, + "learning_rate": 0.00171799902284082, + "loss": 0.125, + "step": 25613 + }, + { + "epoch": 0.2223418199494796, + "grad_norm": 0.2294921875, + "learning_rate": 0.0017179773859874146, + "loss": 0.1123, + "step": 25614 + }, + { + "epoch": 0.22235050042968377, + "grad_norm": 0.29296875, + "learning_rate": 0.0017179557484581986, + "loss": 0.1279, + "step": 25615 + }, + { + "epoch": 
0.22235918090988793, + "grad_norm": 0.41015625, + "learning_rate": 0.0017179341102531954, + "loss": 0.0723, + "step": 25616 + }, + { + "epoch": 0.2223678613900921, + "grad_norm": 0.130859375, + "learning_rate": 0.0017179124713724285, + "loss": 0.124, + "step": 25617 + }, + { + "epoch": 0.22237654187029626, + "grad_norm": 0.486328125, + "learning_rate": 0.001717890831815922, + "loss": 0.1245, + "step": 25618 + }, + { + "epoch": 0.22238522235050043, + "grad_norm": 0.10009765625, + "learning_rate": 0.0017178691915836995, + "loss": 0.0879, + "step": 25619 + }, + { + "epoch": 0.2223939028307046, + "grad_norm": 0.1591796875, + "learning_rate": 0.0017178475506757847, + "loss": 0.1084, + "step": 25620 + }, + { + "epoch": 0.22240258331090876, + "grad_norm": 0.294921875, + "learning_rate": 0.0017178259090922007, + "loss": 0.0674, + "step": 25621 + }, + { + "epoch": 0.22241126379111292, + "grad_norm": 0.466796875, + "learning_rate": 0.0017178042668329716, + "loss": 0.0776, + "step": 25622 + }, + { + "epoch": 0.2224199442713171, + "grad_norm": 1.125, + "learning_rate": 0.0017177826238981213, + "loss": 0.1143, + "step": 25623 + }, + { + "epoch": 0.22242862475152125, + "grad_norm": 0.267578125, + "learning_rate": 0.0017177609802876731, + "loss": 0.1543, + "step": 25624 + }, + { + "epoch": 0.22243730523172542, + "grad_norm": 0.12109375, + "learning_rate": 0.0017177393360016506, + "loss": 0.1396, + "step": 25625 + }, + { + "epoch": 0.22244598571192958, + "grad_norm": 0.236328125, + "learning_rate": 0.001717717691040078, + "loss": 0.1011, + "step": 25626 + }, + { + "epoch": 0.22245466619213375, + "grad_norm": 0.51171875, + "learning_rate": 0.0017176960454029781, + "loss": 0.0928, + "step": 25627 + }, + { + "epoch": 0.2224633466723379, + "grad_norm": 0.69921875, + "learning_rate": 0.0017176743990903755, + "loss": 0.105, + "step": 25628 + }, + { + "epoch": 0.22247202715254208, + "grad_norm": 0.1884765625, + "learning_rate": 0.0017176527521022935, + "loss": 0.1157, + "step": 25629 + }, + { + "epoch": 0.22248070763274624, + "grad_norm": 0.12158203125, + "learning_rate": 0.0017176311044387557, + "loss": 0.0898, + "step": 25630 + }, + { + "epoch": 0.2224893881129504, + "grad_norm": 0.1201171875, + "learning_rate": 0.0017176094560997853, + "loss": 0.1152, + "step": 25631 + }, + { + "epoch": 0.22249806859315457, + "grad_norm": 0.306640625, + "learning_rate": 0.001717587807085407, + "loss": 0.1025, + "step": 25632 + }, + { + "epoch": 0.22250674907335874, + "grad_norm": 0.10791015625, + "learning_rate": 0.0017175661573956438, + "loss": 0.0854, + "step": 25633 + }, + { + "epoch": 0.2225154295535629, + "grad_norm": 0.333984375, + "learning_rate": 0.0017175445070305199, + "loss": 0.1001, + "step": 25634 + }, + { + "epoch": 0.22252411003376707, + "grad_norm": 0.10302734375, + "learning_rate": 0.0017175228559900577, + "loss": 0.1689, + "step": 25635 + }, + { + "epoch": 0.22253279051397123, + "grad_norm": 0.41796875, + "learning_rate": 0.0017175012042742826, + "loss": 0.0659, + "step": 25636 + }, + { + "epoch": 0.2225414709941754, + "grad_norm": 0.2216796875, + "learning_rate": 0.0017174795518832171, + "loss": 0.0977, + "step": 25637 + }, + { + "epoch": 0.22255015147437957, + "grad_norm": 0.08935546875, + "learning_rate": 0.0017174578988168852, + "loss": 0.0918, + "step": 25638 + }, + { + "epoch": 0.22255883195458373, + "grad_norm": 0.298828125, + "learning_rate": 0.001717436245075311, + "loss": 0.0806, + "step": 25639 + }, + { + "epoch": 0.2225675124347879, + "grad_norm": 0.373046875, + "learning_rate": 
0.0017174145906585175, + "loss": 0.1094, + "step": 25640 + }, + { + "epoch": 0.22257619291499206, + "grad_norm": 0.70703125, + "learning_rate": 0.0017173929355665286, + "loss": 0.1113, + "step": 25641 + }, + { + "epoch": 0.22258487339519623, + "grad_norm": 0.296875, + "learning_rate": 0.0017173712797993684, + "loss": 0.1514, + "step": 25642 + }, + { + "epoch": 0.2225935538754004, + "grad_norm": 0.126953125, + "learning_rate": 0.00171734962335706, + "loss": 0.166, + "step": 25643 + }, + { + "epoch": 0.22260223435560456, + "grad_norm": 0.53515625, + "learning_rate": 0.001717327966239627, + "loss": 0.0845, + "step": 25644 + }, + { + "epoch": 0.22261091483580872, + "grad_norm": 0.267578125, + "learning_rate": 0.001717306308447094, + "loss": 0.1758, + "step": 25645 + }, + { + "epoch": 0.22261959531601289, + "grad_norm": 0.453125, + "learning_rate": 0.0017172846499794838, + "loss": 0.0884, + "step": 25646 + }, + { + "epoch": 0.22262827579621705, + "grad_norm": 0.68359375, + "learning_rate": 0.0017172629908368206, + "loss": 0.0845, + "step": 25647 + }, + { + "epoch": 0.22263695627642122, + "grad_norm": 0.890625, + "learning_rate": 0.0017172413310191275, + "loss": 0.123, + "step": 25648 + }, + { + "epoch": 0.22264563675662538, + "grad_norm": 0.6484375, + "learning_rate": 0.0017172196705264287, + "loss": 0.1074, + "step": 25649 + }, + { + "epoch": 0.22265431723682955, + "grad_norm": 0.28125, + "learning_rate": 0.001717198009358748, + "loss": 0.0859, + "step": 25650 + }, + { + "epoch": 0.2226629977170337, + "grad_norm": 0.11328125, + "learning_rate": 0.0017171763475161087, + "loss": 0.123, + "step": 25651 + }, + { + "epoch": 0.22267167819723788, + "grad_norm": 1.3828125, + "learning_rate": 0.0017171546849985345, + "loss": 0.1367, + "step": 25652 + }, + { + "epoch": 0.22268035867744204, + "grad_norm": 0.6953125, + "learning_rate": 0.0017171330218060492, + "loss": 0.0776, + "step": 25653 + }, + { + "epoch": 0.2226890391576462, + "grad_norm": 0.400390625, + "learning_rate": 0.0017171113579386766, + "loss": 0.0806, + "step": 25654 + }, + { + "epoch": 0.22269771963785037, + "grad_norm": 0.1376953125, + "learning_rate": 0.00171708969339644, + "loss": 0.1182, + "step": 25655 + }, + { + "epoch": 0.22270640011805454, + "grad_norm": 0.1689453125, + "learning_rate": 0.0017170680281793638, + "loss": 0.1084, + "step": 25656 + }, + { + "epoch": 0.2227150805982587, + "grad_norm": 0.322265625, + "learning_rate": 0.0017170463622874711, + "loss": 0.1084, + "step": 25657 + }, + { + "epoch": 0.22272376107846287, + "grad_norm": 0.0693359375, + "learning_rate": 0.0017170246957207858, + "loss": 0.0835, + "step": 25658 + }, + { + "epoch": 0.22273244155866703, + "grad_norm": 0.2294921875, + "learning_rate": 0.0017170030284793317, + "loss": 0.106, + "step": 25659 + }, + { + "epoch": 0.2227411220388712, + "grad_norm": 0.30078125, + "learning_rate": 0.001716981360563132, + "loss": 0.0918, + "step": 25660 + }, + { + "epoch": 0.22274980251907536, + "grad_norm": 0.1318359375, + "learning_rate": 0.001716959691972211, + "loss": 0.1235, + "step": 25661 + }, + { + "epoch": 0.22275848299927953, + "grad_norm": 0.09521484375, + "learning_rate": 0.0017169380227065924, + "loss": 0.105, + "step": 25662 + }, + { + "epoch": 0.2227671634794837, + "grad_norm": 0.470703125, + "learning_rate": 0.0017169163527662992, + "loss": 0.1211, + "step": 25663 + }, + { + "epoch": 0.22277584395968786, + "grad_norm": 0.28515625, + "learning_rate": 0.0017168946821513556, + "loss": 0.1045, + "step": 25664 + }, + { + "epoch": 0.22278452443989202, + "grad_norm": 
0.3515625, + "learning_rate": 0.0017168730108617855, + "loss": 0.085, + "step": 25665 + }, + { + "epoch": 0.2227932049200962, + "grad_norm": 0.322265625, + "learning_rate": 0.0017168513388976122, + "loss": 0.1719, + "step": 25666 + }, + { + "epoch": 0.22280188540030035, + "grad_norm": 0.2734375, + "learning_rate": 0.0017168296662588595, + "loss": 0.1045, + "step": 25667 + }, + { + "epoch": 0.22281056588050452, + "grad_norm": 0.20703125, + "learning_rate": 0.0017168079929455514, + "loss": 0.0864, + "step": 25668 + }, + { + "epoch": 0.22281924636070868, + "grad_norm": 0.162109375, + "learning_rate": 0.001716786318957711, + "loss": 0.1445, + "step": 25669 + }, + { + "epoch": 0.22282792684091285, + "grad_norm": 0.79296875, + "learning_rate": 0.0017167646442953625, + "loss": 0.1699, + "step": 25670 + }, + { + "epoch": 0.22283660732111701, + "grad_norm": 0.1142578125, + "learning_rate": 0.0017167429689585291, + "loss": 0.0996, + "step": 25671 + }, + { + "epoch": 0.22284528780132118, + "grad_norm": 0.3359375, + "learning_rate": 0.0017167212929472353, + "loss": 0.1216, + "step": 25672 + }, + { + "epoch": 0.22285396828152534, + "grad_norm": 0.8515625, + "learning_rate": 0.0017166996162615044, + "loss": 0.1133, + "step": 25673 + }, + { + "epoch": 0.2228626487617295, + "grad_norm": 0.54296875, + "learning_rate": 0.0017166779389013599, + "loss": 0.1348, + "step": 25674 + }, + { + "epoch": 0.22287132924193367, + "grad_norm": 0.2431640625, + "learning_rate": 0.0017166562608668258, + "loss": 0.125, + "step": 25675 + }, + { + "epoch": 0.22288000972213784, + "grad_norm": 0.169921875, + "learning_rate": 0.0017166345821579255, + "loss": 0.1436, + "step": 25676 + }, + { + "epoch": 0.222888690202342, + "grad_norm": 0.248046875, + "learning_rate": 0.001716612902774683, + "loss": 0.0938, + "step": 25677 + }, + { + "epoch": 0.22289737068254617, + "grad_norm": 0.283203125, + "learning_rate": 0.0017165912227171216, + "loss": 0.1006, + "step": 25678 + }, + { + "epoch": 0.22290605116275033, + "grad_norm": 0.5546875, + "learning_rate": 0.0017165695419852655, + "loss": 0.0977, + "step": 25679 + }, + { + "epoch": 0.2229147316429545, + "grad_norm": 0.515625, + "learning_rate": 0.0017165478605791382, + "loss": 0.0796, + "step": 25680 + }, + { + "epoch": 0.22292341212315867, + "grad_norm": 0.435546875, + "learning_rate": 0.0017165261784987636, + "loss": 0.0981, + "step": 25681 + }, + { + "epoch": 0.22293209260336283, + "grad_norm": 0.52734375, + "learning_rate": 0.0017165044957441653, + "loss": 0.1133, + "step": 25682 + }, + { + "epoch": 0.222940773083567, + "grad_norm": 0.48828125, + "learning_rate": 0.0017164828123153665, + "loss": 0.1279, + "step": 25683 + }, + { + "epoch": 0.22294945356377116, + "grad_norm": 0.31640625, + "learning_rate": 0.0017164611282123917, + "loss": 0.0986, + "step": 25684 + }, + { + "epoch": 0.22295813404397533, + "grad_norm": 0.2060546875, + "learning_rate": 0.0017164394434352641, + "loss": 0.1025, + "step": 25685 + }, + { + "epoch": 0.2229668145241795, + "grad_norm": 0.484375, + "learning_rate": 0.0017164177579840078, + "loss": 0.0972, + "step": 25686 + }, + { + "epoch": 0.22297549500438366, + "grad_norm": 0.2041015625, + "learning_rate": 0.0017163960718586462, + "loss": 0.1182, + "step": 25687 + }, + { + "epoch": 0.22298417548458782, + "grad_norm": 0.11279296875, + "learning_rate": 0.0017163743850592032, + "loss": 0.1064, + "step": 25688 + }, + { + "epoch": 0.222992855964792, + "grad_norm": 0.34375, + "learning_rate": 0.0017163526975857023, + "loss": 0.082, + "step": 25689 + }, + { + "epoch": 
0.22300153644499612, + "grad_norm": 0.6953125, + "learning_rate": 0.0017163310094381675, + "loss": 0.0933, + "step": 25690 + }, + { + "epoch": 0.2230102169252003, + "grad_norm": 0.25390625, + "learning_rate": 0.001716309320616622, + "loss": 0.1011, + "step": 25691 + }, + { + "epoch": 0.22301889740540445, + "grad_norm": 0.439453125, + "learning_rate": 0.0017162876311210902, + "loss": 0.1025, + "step": 25692 + }, + { + "epoch": 0.22302757788560862, + "grad_norm": 0.10400390625, + "learning_rate": 0.0017162659409515955, + "loss": 0.125, + "step": 25693 + }, + { + "epoch": 0.22303625836581278, + "grad_norm": 0.349609375, + "learning_rate": 0.0017162442501081616, + "loss": 0.1367, + "step": 25694 + }, + { + "epoch": 0.22304493884601695, + "grad_norm": 0.79296875, + "learning_rate": 0.0017162225585908122, + "loss": 0.105, + "step": 25695 + }, + { + "epoch": 0.22305361932622111, + "grad_norm": 0.07763671875, + "learning_rate": 0.001716200866399571, + "loss": 0.105, + "step": 25696 + }, + { + "epoch": 0.22306229980642528, + "grad_norm": 0.8671875, + "learning_rate": 0.0017161791735344622, + "loss": 0.1006, + "step": 25697 + }, + { + "epoch": 0.22307098028662944, + "grad_norm": 1.0390625, + "learning_rate": 0.0017161574799955087, + "loss": 0.1777, + "step": 25698 + }, + { + "epoch": 0.2230796607668336, + "grad_norm": 0.255859375, + "learning_rate": 0.0017161357857827346, + "loss": 0.1055, + "step": 25699 + }, + { + "epoch": 0.22308834124703777, + "grad_norm": 0.068359375, + "learning_rate": 0.0017161140908961636, + "loss": 0.0874, + "step": 25700 + }, + { + "epoch": 0.22309702172724194, + "grad_norm": 0.1875, + "learning_rate": 0.0017160923953358199, + "loss": 0.1016, + "step": 25701 + }, + { + "epoch": 0.2231057022074461, + "grad_norm": 0.37890625, + "learning_rate": 0.0017160706991017265, + "loss": 0.1387, + "step": 25702 + }, + { + "epoch": 0.22311438268765027, + "grad_norm": 0.146484375, + "learning_rate": 0.0017160490021939077, + "loss": 0.1235, + "step": 25703 + }, + { + "epoch": 0.22312306316785444, + "grad_norm": 0.2578125, + "learning_rate": 0.0017160273046123867, + "loss": 0.0991, + "step": 25704 + }, + { + "epoch": 0.2231317436480586, + "grad_norm": 0.125, + "learning_rate": 0.0017160056063571874, + "loss": 0.1406, + "step": 25705 + }, + { + "epoch": 0.22314042412826277, + "grad_norm": 0.21484375, + "learning_rate": 0.0017159839074283338, + "loss": 0.123, + "step": 25706 + }, + { + "epoch": 0.22314910460846693, + "grad_norm": 0.54296875, + "learning_rate": 0.0017159622078258498, + "loss": 0.0845, + "step": 25707 + }, + { + "epoch": 0.2231577850886711, + "grad_norm": 0.2041015625, + "learning_rate": 0.0017159405075497583, + "loss": 0.0918, + "step": 25708 + }, + { + "epoch": 0.22316646556887526, + "grad_norm": 0.17578125, + "learning_rate": 0.0017159188066000837, + "loss": 0.123, + "step": 25709 + }, + { + "epoch": 0.22317514604907943, + "grad_norm": 0.173828125, + "learning_rate": 0.0017158971049768493, + "loss": 0.0972, + "step": 25710 + }, + { + "epoch": 0.2231838265292836, + "grad_norm": 0.44140625, + "learning_rate": 0.0017158754026800796, + "loss": 0.1465, + "step": 25711 + }, + { + "epoch": 0.22319250700948776, + "grad_norm": 0.1328125, + "learning_rate": 0.0017158536997097975, + "loss": 0.0732, + "step": 25712 + }, + { + "epoch": 0.22320118748969192, + "grad_norm": 0.10009765625, + "learning_rate": 0.0017158319960660269, + "loss": 0.0918, + "step": 25713 + }, + { + "epoch": 0.2232098679698961, + "grad_norm": 0.1484375, + "learning_rate": 0.001715810291748792, + "loss": 0.1602, + 
"step": 25714 + }, + { + "epoch": 0.22321854845010025, + "grad_norm": 0.640625, + "learning_rate": 0.001715788586758116, + "loss": 0.1035, + "step": 25715 + }, + { + "epoch": 0.22322722893030442, + "grad_norm": 0.0966796875, + "learning_rate": 0.001715766881094023, + "loss": 0.085, + "step": 25716 + }, + { + "epoch": 0.22323590941050858, + "grad_norm": 0.26953125, + "learning_rate": 0.0017157451747565366, + "loss": 0.1025, + "step": 25717 + }, + { + "epoch": 0.22324458989071275, + "grad_norm": 0.1982421875, + "learning_rate": 0.0017157234677456802, + "loss": 0.123, + "step": 25718 + }, + { + "epoch": 0.2232532703709169, + "grad_norm": 0.1083984375, + "learning_rate": 0.0017157017600614782, + "loss": 0.0928, + "step": 25719 + }, + { + "epoch": 0.22326195085112108, + "grad_norm": 0.0869140625, + "learning_rate": 0.0017156800517039539, + "loss": 0.082, + "step": 25720 + }, + { + "epoch": 0.22327063133132524, + "grad_norm": 0.1640625, + "learning_rate": 0.001715658342673131, + "loss": 0.0972, + "step": 25721 + }, + { + "epoch": 0.2232793118115294, + "grad_norm": 0.07568359375, + "learning_rate": 0.0017156366329690335, + "loss": 0.1045, + "step": 25722 + }, + { + "epoch": 0.22328799229173357, + "grad_norm": 0.224609375, + "learning_rate": 0.0017156149225916852, + "loss": 0.1191, + "step": 25723 + }, + { + "epoch": 0.22329667277193774, + "grad_norm": 0.11962890625, + "learning_rate": 0.0017155932115411096, + "loss": 0.1016, + "step": 25724 + }, + { + "epoch": 0.2233053532521419, + "grad_norm": 0.15625, + "learning_rate": 0.0017155714998173305, + "loss": 0.1143, + "step": 25725 + }, + { + "epoch": 0.22331403373234607, + "grad_norm": 0.3984375, + "learning_rate": 0.0017155497874203716, + "loss": 0.0898, + "step": 25726 + }, + { + "epoch": 0.22332271421255023, + "grad_norm": 0.349609375, + "learning_rate": 0.0017155280743502564, + "loss": 0.1201, + "step": 25727 + }, + { + "epoch": 0.2233313946927544, + "grad_norm": 0.2119140625, + "learning_rate": 0.0017155063606070094, + "loss": 0.1289, + "step": 25728 + }, + { + "epoch": 0.22334007517295856, + "grad_norm": 0.91015625, + "learning_rate": 0.0017154846461906537, + "loss": 0.1562, + "step": 25729 + }, + { + "epoch": 0.22334875565316273, + "grad_norm": 0.11328125, + "learning_rate": 0.0017154629311012133, + "loss": 0.1201, + "step": 25730 + }, + { + "epoch": 0.2233574361333669, + "grad_norm": 0.2216796875, + "learning_rate": 0.001715441215338712, + "loss": 0.1123, + "step": 25731 + }, + { + "epoch": 0.22336611661357106, + "grad_norm": 0.130859375, + "learning_rate": 0.0017154194989031733, + "loss": 0.0864, + "step": 25732 + }, + { + "epoch": 0.22337479709377522, + "grad_norm": 0.09521484375, + "learning_rate": 0.0017153977817946212, + "loss": 0.1406, + "step": 25733 + }, + { + "epoch": 0.2233834775739794, + "grad_norm": 0.330078125, + "learning_rate": 0.0017153760640130791, + "loss": 0.0996, + "step": 25734 + }, + { + "epoch": 0.22339215805418355, + "grad_norm": 0.48046875, + "learning_rate": 0.0017153543455585712, + "loss": 0.1006, + "step": 25735 + }, + { + "epoch": 0.22340083853438772, + "grad_norm": 0.53125, + "learning_rate": 0.001715332626431121, + "loss": 0.0679, + "step": 25736 + }, + { + "epoch": 0.22340951901459188, + "grad_norm": 0.23828125, + "learning_rate": 0.0017153109066307519, + "loss": 0.1523, + "step": 25737 + }, + { + "epoch": 0.22341819949479605, + "grad_norm": 0.32421875, + "learning_rate": 0.0017152891861574883, + "loss": 0.0933, + "step": 25738 + }, + { + "epoch": 0.22342687997500021, + "grad_norm": 1.109375, + 
"learning_rate": 0.0017152674650113538, + "loss": 0.1133, + "step": 25739 + }, + { + "epoch": 0.22343556045520438, + "grad_norm": 0.3203125, + "learning_rate": 0.001715245743192372, + "loss": 0.1309, + "step": 25740 + }, + { + "epoch": 0.22344424093540854, + "grad_norm": 0.1943359375, + "learning_rate": 0.0017152240207005667, + "loss": 0.0947, + "step": 25741 + }, + { + "epoch": 0.2234529214156127, + "grad_norm": 0.08349609375, + "learning_rate": 0.0017152022975359619, + "loss": 0.0757, + "step": 25742 + }, + { + "epoch": 0.22346160189581687, + "grad_norm": 0.2236328125, + "learning_rate": 0.0017151805736985806, + "loss": 0.124, + "step": 25743 + }, + { + "epoch": 0.22347028237602104, + "grad_norm": 0.8359375, + "learning_rate": 0.0017151588491884473, + "loss": 0.1475, + "step": 25744 + }, + { + "epoch": 0.2234789628562252, + "grad_norm": 0.10693359375, + "learning_rate": 0.0017151371240055858, + "loss": 0.1128, + "step": 25745 + }, + { + "epoch": 0.22348764333642937, + "grad_norm": 0.416015625, + "learning_rate": 0.0017151153981500192, + "loss": 0.1201, + "step": 25746 + }, + { + "epoch": 0.22349632381663354, + "grad_norm": 0.365234375, + "learning_rate": 0.0017150936716217719, + "loss": 0.1416, + "step": 25747 + }, + { + "epoch": 0.2235050042968377, + "grad_norm": 0.1767578125, + "learning_rate": 0.0017150719444208671, + "loss": 0.1187, + "step": 25748 + }, + { + "epoch": 0.22351368477704187, + "grad_norm": 0.1904296875, + "learning_rate": 0.001715050216547329, + "loss": 0.1436, + "step": 25749 + }, + { + "epoch": 0.22352236525724603, + "grad_norm": 0.9921875, + "learning_rate": 0.0017150284880011815, + "loss": 0.1602, + "step": 25750 + }, + { + "epoch": 0.2235310457374502, + "grad_norm": 0.53515625, + "learning_rate": 0.0017150067587824475, + "loss": 0.1201, + "step": 25751 + }, + { + "epoch": 0.22353972621765436, + "grad_norm": 1.1015625, + "learning_rate": 0.0017149850288911519, + "loss": 0.1367, + "step": 25752 + }, + { + "epoch": 0.22354840669785853, + "grad_norm": 0.3671875, + "learning_rate": 0.0017149632983273176, + "loss": 0.0781, + "step": 25753 + }, + { + "epoch": 0.2235570871780627, + "grad_norm": 0.84375, + "learning_rate": 0.001714941567090969, + "loss": 0.0991, + "step": 25754 + }, + { + "epoch": 0.22356576765826686, + "grad_norm": 0.5859375, + "learning_rate": 0.001714919835182129, + "loss": 0.1074, + "step": 25755 + }, + { + "epoch": 0.22357444813847102, + "grad_norm": 0.392578125, + "learning_rate": 0.0017148981026008224, + "loss": 0.1201, + "step": 25756 + }, + { + "epoch": 0.2235831286186752, + "grad_norm": 0.53125, + "learning_rate": 0.0017148763693470724, + "loss": 0.0918, + "step": 25757 + }, + { + "epoch": 0.22359180909887935, + "grad_norm": 0.388671875, + "learning_rate": 0.0017148546354209028, + "loss": 0.0918, + "step": 25758 + }, + { + "epoch": 0.22360048957908352, + "grad_norm": 0.69921875, + "learning_rate": 0.0017148329008223374, + "loss": 0.0996, + "step": 25759 + }, + { + "epoch": 0.22360917005928768, + "grad_norm": 0.1865234375, + "learning_rate": 0.0017148111655514, + "loss": 0.1309, + "step": 25760 + }, + { + "epoch": 0.22361785053949185, + "grad_norm": 0.2236328125, + "learning_rate": 0.0017147894296081138, + "loss": 0.1436, + "step": 25761 + }, + { + "epoch": 0.223626531019696, + "grad_norm": 0.0849609375, + "learning_rate": 0.0017147676929925037, + "loss": 0.0859, + "step": 25762 + }, + { + "epoch": 0.22363521149990018, + "grad_norm": 0.2470703125, + "learning_rate": 0.001714745955704593, + "loss": 0.1006, + "step": 25763 + }, + { + "epoch": 
0.22364389198010434, + "grad_norm": 0.416015625, + "learning_rate": 0.0017147242177444053, + "loss": 0.1074, + "step": 25764 + }, + { + "epoch": 0.2236525724603085, + "grad_norm": 0.251953125, + "learning_rate": 0.001714702479111964, + "loss": 0.0806, + "step": 25765 + }, + { + "epoch": 0.22366125294051267, + "grad_norm": 0.15625, + "learning_rate": 0.0017146807398072937, + "loss": 0.0762, + "step": 25766 + }, + { + "epoch": 0.22366993342071684, + "grad_norm": 0.80078125, + "learning_rate": 0.0017146589998304177, + "loss": 0.1289, + "step": 25767 + }, + { + "epoch": 0.223678613900921, + "grad_norm": 0.2353515625, + "learning_rate": 0.00171463725918136, + "loss": 0.1045, + "step": 25768 + }, + { + "epoch": 0.22368729438112517, + "grad_norm": 0.251953125, + "learning_rate": 0.0017146155178601437, + "loss": 0.1484, + "step": 25769 + }, + { + "epoch": 0.22369597486132933, + "grad_norm": 0.177734375, + "learning_rate": 0.0017145937758667935, + "loss": 0.1309, + "step": 25770 + }, + { + "epoch": 0.2237046553415335, + "grad_norm": 0.61328125, + "learning_rate": 0.0017145720332013329, + "loss": 0.1006, + "step": 25771 + }, + { + "epoch": 0.22371333582173766, + "grad_norm": 0.25390625, + "learning_rate": 0.0017145502898637855, + "loss": 0.1011, + "step": 25772 + }, + { + "epoch": 0.22372201630194183, + "grad_norm": 0.2421875, + "learning_rate": 0.0017145285458541747, + "loss": 0.1162, + "step": 25773 + }, + { + "epoch": 0.223730696782146, + "grad_norm": 0.1630859375, + "learning_rate": 0.0017145068011725252, + "loss": 0.1006, + "step": 25774 + }, + { + "epoch": 0.22373937726235016, + "grad_norm": 0.228515625, + "learning_rate": 0.0017144850558188601, + "loss": 0.123, + "step": 25775 + }, + { + "epoch": 0.22374805774255432, + "grad_norm": 0.1689453125, + "learning_rate": 0.0017144633097932035, + "loss": 0.1865, + "step": 25776 + }, + { + "epoch": 0.2237567382227585, + "grad_norm": 0.498046875, + "learning_rate": 0.001714441563095579, + "loss": 0.1211, + "step": 25777 + }, + { + "epoch": 0.22376541870296265, + "grad_norm": 0.392578125, + "learning_rate": 0.0017144198157260104, + "loss": 0.1094, + "step": 25778 + }, + { + "epoch": 0.22377409918316682, + "grad_norm": 0.6015625, + "learning_rate": 0.0017143980676845214, + "loss": 0.1133, + "step": 25779 + }, + { + "epoch": 0.22378277966337098, + "grad_norm": 0.109375, + "learning_rate": 0.001714376318971136, + "loss": 0.1123, + "step": 25780 + }, + { + "epoch": 0.22379146014357515, + "grad_norm": 0.31640625, + "learning_rate": 0.001714354569585878, + "loss": 0.1221, + "step": 25781 + }, + { + "epoch": 0.22380014062377931, + "grad_norm": 0.3046875, + "learning_rate": 0.0017143328195287709, + "loss": 0.1523, + "step": 25782 + }, + { + "epoch": 0.22380882110398348, + "grad_norm": 0.84375, + "learning_rate": 0.0017143110687998387, + "loss": 0.1357, + "step": 25783 + }, + { + "epoch": 0.22381750158418764, + "grad_norm": 0.154296875, + "learning_rate": 0.0017142893173991054, + "loss": 0.1089, + "step": 25784 + }, + { + "epoch": 0.2238261820643918, + "grad_norm": 0.490234375, + "learning_rate": 0.0017142675653265941, + "loss": 0.0864, + "step": 25785 + }, + { + "epoch": 0.22383486254459597, + "grad_norm": 0.30859375, + "learning_rate": 0.0017142458125823294, + "loss": 0.1582, + "step": 25786 + }, + { + "epoch": 0.22384354302480014, + "grad_norm": 0.0810546875, + "learning_rate": 0.0017142240591663345, + "loss": 0.0884, + "step": 25787 + }, + { + "epoch": 0.2238522235050043, + "grad_norm": 0.455078125, + "learning_rate": 0.0017142023050786335, + "loss": 0.1157, 
+ "step": 25788 + }, + { + "epoch": 0.22386090398520847, + "grad_norm": 0.07080078125, + "learning_rate": 0.0017141805503192503, + "loss": 0.1309, + "step": 25789 + }, + { + "epoch": 0.22386958446541264, + "grad_norm": 0.2099609375, + "learning_rate": 0.0017141587948882081, + "loss": 0.0879, + "step": 25790 + }, + { + "epoch": 0.2238782649456168, + "grad_norm": 0.16015625, + "learning_rate": 0.0017141370387855311, + "loss": 0.1309, + "step": 25791 + }, + { + "epoch": 0.22388694542582097, + "grad_norm": 0.1767578125, + "learning_rate": 0.0017141152820112431, + "loss": 0.125, + "step": 25792 + }, + { + "epoch": 0.22389562590602513, + "grad_norm": 0.24609375, + "learning_rate": 0.001714093524565368, + "loss": 0.124, + "step": 25793 + }, + { + "epoch": 0.2239043063862293, + "grad_norm": 0.26171875, + "learning_rate": 0.0017140717664479294, + "loss": 0.1045, + "step": 25794 + }, + { + "epoch": 0.22391298686643346, + "grad_norm": 0.482421875, + "learning_rate": 0.0017140500076589511, + "loss": 0.1143, + "step": 25795 + }, + { + "epoch": 0.22392166734663763, + "grad_norm": 0.5390625, + "learning_rate": 0.0017140282481984568, + "loss": 0.106, + "step": 25796 + }, + { + "epoch": 0.2239303478268418, + "grad_norm": 0.4375, + "learning_rate": 0.0017140064880664706, + "loss": 0.0918, + "step": 25797 + }, + { + "epoch": 0.22393902830704596, + "grad_norm": 0.1455078125, + "learning_rate": 0.0017139847272630163, + "loss": 0.1221, + "step": 25798 + }, + { + "epoch": 0.22394770878725012, + "grad_norm": 0.53125, + "learning_rate": 0.001713962965788117, + "loss": 0.0918, + "step": 25799 + }, + { + "epoch": 0.2239563892674543, + "grad_norm": 0.07275390625, + "learning_rate": 0.0017139412036417977, + "loss": 0.1074, + "step": 25800 + }, + { + "epoch": 0.22396506974765845, + "grad_norm": 0.169921875, + "learning_rate": 0.001713919440824081, + "loss": 0.1094, + "step": 25801 + }, + { + "epoch": 0.22397375022786262, + "grad_norm": 0.375, + "learning_rate": 0.0017138976773349915, + "loss": 0.1133, + "step": 25802 + }, + { + "epoch": 0.22398243070806678, + "grad_norm": 0.154296875, + "learning_rate": 0.0017138759131745526, + "loss": 0.123, + "step": 25803 + }, + { + "epoch": 0.22399111118827095, + "grad_norm": 0.09765625, + "learning_rate": 0.001713854148342788, + "loss": 0.123, + "step": 25804 + }, + { + "epoch": 0.2239997916684751, + "grad_norm": 0.625, + "learning_rate": 0.0017138323828397221, + "loss": 0.1465, + "step": 25805 + }, + { + "epoch": 0.22400847214867928, + "grad_norm": 0.443359375, + "learning_rate": 0.0017138106166653781, + "loss": 0.0732, + "step": 25806 + }, + { + "epoch": 0.22401715262888344, + "grad_norm": 0.3828125, + "learning_rate": 0.0017137888498197805, + "loss": 0.0981, + "step": 25807 + }, + { + "epoch": 0.2240258331090876, + "grad_norm": 0.310546875, + "learning_rate": 0.0017137670823029522, + "loss": 0.0698, + "step": 25808 + }, + { + "epoch": 0.22403451358929177, + "grad_norm": 0.0869140625, + "learning_rate": 0.0017137453141149173, + "loss": 0.0815, + "step": 25809 + }, + { + "epoch": 0.22404319406949594, + "grad_norm": 0.2060546875, + "learning_rate": 0.0017137235452557, + "loss": 0.0928, + "step": 25810 + }, + { + "epoch": 0.2240518745497001, + "grad_norm": 0.20703125, + "learning_rate": 0.0017137017757253238, + "loss": 0.1143, + "step": 25811 + }, + { + "epoch": 0.22406055502990427, + "grad_norm": 0.2353515625, + "learning_rate": 0.0017136800055238127, + "loss": 0.1309, + "step": 25812 + }, + { + "epoch": 0.2240692355101084, + "grad_norm": 0.1484375, + "learning_rate": 
0.0017136582346511903, + "loss": 0.1299, + "step": 25813 + }, + { + "epoch": 0.22407791599031257, + "grad_norm": 0.341796875, + "learning_rate": 0.0017136364631074803, + "loss": 0.1021, + "step": 25814 + }, + { + "epoch": 0.22408659647051674, + "grad_norm": 0.1572265625, + "learning_rate": 0.0017136146908927066, + "loss": 0.1069, + "step": 25815 + }, + { + "epoch": 0.2240952769507209, + "grad_norm": 0.162109375, + "learning_rate": 0.0017135929180068935, + "loss": 0.1445, + "step": 25816 + }, + { + "epoch": 0.22410395743092507, + "grad_norm": 0.53125, + "learning_rate": 0.001713571144450064, + "loss": 0.1113, + "step": 25817 + }, + { + "epoch": 0.22411263791112923, + "grad_norm": 0.1484375, + "learning_rate": 0.0017135493702222426, + "loss": 0.1348, + "step": 25818 + }, + { + "epoch": 0.2241213183913334, + "grad_norm": 0.35546875, + "learning_rate": 0.001713527595323453, + "loss": 0.1099, + "step": 25819 + }, + { + "epoch": 0.22412999887153756, + "grad_norm": 0.296875, + "learning_rate": 0.0017135058197537184, + "loss": 0.1016, + "step": 25820 + }, + { + "epoch": 0.22413867935174173, + "grad_norm": 0.4140625, + "learning_rate": 0.0017134840435130629, + "loss": 0.1128, + "step": 25821 + }, + { + "epoch": 0.2241473598319459, + "grad_norm": 0.1435546875, + "learning_rate": 0.0017134622666015108, + "loss": 0.0879, + "step": 25822 + }, + { + "epoch": 0.22415604031215006, + "grad_norm": 0.1875, + "learning_rate": 0.0017134404890190857, + "loss": 0.0757, + "step": 25823 + }, + { + "epoch": 0.22416472079235422, + "grad_norm": 0.53515625, + "learning_rate": 0.0017134187107658112, + "loss": 0.124, + "step": 25824 + }, + { + "epoch": 0.2241734012725584, + "grad_norm": 0.1279296875, + "learning_rate": 0.001713396931841711, + "loss": 0.1816, + "step": 25825 + }, + { + "epoch": 0.22418208175276255, + "grad_norm": 0.08203125, + "learning_rate": 0.0017133751522468096, + "loss": 0.1089, + "step": 25826 + }, + { + "epoch": 0.22419076223296672, + "grad_norm": 0.447265625, + "learning_rate": 0.00171335337198113, + "loss": 0.0996, + "step": 25827 + }, + { + "epoch": 0.22419944271317088, + "grad_norm": 0.361328125, + "learning_rate": 0.0017133315910446965, + "loss": 0.083, + "step": 25828 + }, + { + "epoch": 0.22420812319337505, + "grad_norm": 0.125, + "learning_rate": 0.0017133098094375327, + "loss": 0.0718, + "step": 25829 + }, + { + "epoch": 0.2242168036735792, + "grad_norm": 0.3046875, + "learning_rate": 0.0017132880271596624, + "loss": 0.1084, + "step": 25830 + }, + { + "epoch": 0.22422548415378338, + "grad_norm": 0.32421875, + "learning_rate": 0.0017132662442111098, + "loss": 0.1074, + "step": 25831 + }, + { + "epoch": 0.22423416463398754, + "grad_norm": 0.4296875, + "learning_rate": 0.0017132444605918987, + "loss": 0.1104, + "step": 25832 + }, + { + "epoch": 0.2242428451141917, + "grad_norm": 0.265625, + "learning_rate": 0.001713222676302052, + "loss": 0.1016, + "step": 25833 + }, + { + "epoch": 0.22425152559439587, + "grad_norm": 0.1845703125, + "learning_rate": 0.0017132008913415945, + "loss": 0.103, + "step": 25834 + }, + { + "epoch": 0.22426020607460004, + "grad_norm": 0.291015625, + "learning_rate": 0.0017131791057105497, + "loss": 0.1123, + "step": 25835 + }, + { + "epoch": 0.2242688865548042, + "grad_norm": 0.447265625, + "learning_rate": 0.0017131573194089414, + "loss": 0.1133, + "step": 25836 + }, + { + "epoch": 0.22427756703500837, + "grad_norm": 0.0771484375, + "learning_rate": 0.0017131355324367934, + "loss": 0.0884, + "step": 25837 + }, + { + "epoch": 0.22428624751521253, + "grad_norm": 
0.47265625, + "learning_rate": 0.0017131137447941298, + "loss": 0.084, + "step": 25838 + }, + { + "epoch": 0.2242949279954167, + "grad_norm": 0.189453125, + "learning_rate": 0.0017130919564809742, + "loss": 0.0913, + "step": 25839 + }, + { + "epoch": 0.22430360847562086, + "grad_norm": 0.1357421875, + "learning_rate": 0.0017130701674973503, + "loss": 0.1152, + "step": 25840 + }, + { + "epoch": 0.22431228895582503, + "grad_norm": 0.98828125, + "learning_rate": 0.001713048377843282, + "loss": 0.1172, + "step": 25841 + }, + { + "epoch": 0.2243209694360292, + "grad_norm": 0.390625, + "learning_rate": 0.001713026587518794, + "loss": 0.1064, + "step": 25842 + }, + { + "epoch": 0.22432964991623336, + "grad_norm": 0.06982421875, + "learning_rate": 0.0017130047965239082, + "loss": 0.0889, + "step": 25843 + }, + { + "epoch": 0.22433833039643752, + "grad_norm": 0.357421875, + "learning_rate": 0.0017129830048586501, + "loss": 0.0918, + "step": 25844 + }, + { + "epoch": 0.2243470108766417, + "grad_norm": 0.1904296875, + "learning_rate": 0.001712961212523043, + "loss": 0.1309, + "step": 25845 + }, + { + "epoch": 0.22435569135684585, + "grad_norm": 0.431640625, + "learning_rate": 0.0017129394195171106, + "loss": 0.1279, + "step": 25846 + }, + { + "epoch": 0.22436437183705002, + "grad_norm": 0.06787109375, + "learning_rate": 0.001712917625840877, + "loss": 0.0986, + "step": 25847 + }, + { + "epoch": 0.22437305231725418, + "grad_norm": 0.890625, + "learning_rate": 0.0017128958314943659, + "loss": 0.1807, + "step": 25848 + }, + { + "epoch": 0.22438173279745835, + "grad_norm": 0.44140625, + "learning_rate": 0.0017128740364776008, + "loss": 0.1045, + "step": 25849 + }, + { + "epoch": 0.22439041327766251, + "grad_norm": 0.1591796875, + "learning_rate": 0.0017128522407906062, + "loss": 0.1016, + "step": 25850 + }, + { + "epoch": 0.22439909375786668, + "grad_norm": 0.220703125, + "learning_rate": 0.0017128304444334053, + "loss": 0.126, + "step": 25851 + }, + { + "epoch": 0.22440777423807085, + "grad_norm": 0.412109375, + "learning_rate": 0.0017128086474060228, + "loss": 0.1055, + "step": 25852 + }, + { + "epoch": 0.224416454718275, + "grad_norm": 0.197265625, + "learning_rate": 0.0017127868497084813, + "loss": 0.1143, + "step": 25853 + }, + { + "epoch": 0.22442513519847918, + "grad_norm": 0.119140625, + "learning_rate": 0.0017127650513408057, + "loss": 0.1133, + "step": 25854 + }, + { + "epoch": 0.22443381567868334, + "grad_norm": 0.158203125, + "learning_rate": 0.0017127432523030195, + "loss": 0.1089, + "step": 25855 + }, + { + "epoch": 0.2244424961588875, + "grad_norm": 0.59765625, + "learning_rate": 0.0017127214525951462, + "loss": 0.1172, + "step": 25856 + }, + { + "epoch": 0.22445117663909167, + "grad_norm": 0.294921875, + "learning_rate": 0.0017126996522172102, + "loss": 0.1177, + "step": 25857 + }, + { + "epoch": 0.22445985711929584, + "grad_norm": 0.302734375, + "learning_rate": 0.0017126778511692348, + "loss": 0.105, + "step": 25858 + }, + { + "epoch": 0.2244685375995, + "grad_norm": 0.23828125, + "learning_rate": 0.0017126560494512444, + "loss": 0.0981, + "step": 25859 + }, + { + "epoch": 0.22447721807970417, + "grad_norm": 0.22265625, + "learning_rate": 0.0017126342470632624, + "loss": 0.0786, + "step": 25860 + }, + { + "epoch": 0.22448589855990833, + "grad_norm": 0.0791015625, + "learning_rate": 0.0017126124440053128, + "loss": 0.1006, + "step": 25861 + }, + { + "epoch": 0.2244945790401125, + "grad_norm": 0.1083984375, + "learning_rate": 0.0017125906402774196, + "loss": 0.1025, + "step": 25862 + }, + 
{ + "epoch": 0.22450325952031666, + "grad_norm": 0.640625, + "learning_rate": 0.0017125688358796062, + "loss": 0.0835, + "step": 25863 + }, + { + "epoch": 0.22451194000052083, + "grad_norm": 0.251953125, + "learning_rate": 0.001712547030811897, + "loss": 0.0889, + "step": 25864 + }, + { + "epoch": 0.224520620480725, + "grad_norm": 0.40234375, + "learning_rate": 0.0017125252250743152, + "loss": 0.1309, + "step": 25865 + }, + { + "epoch": 0.22452930096092916, + "grad_norm": 0.1923828125, + "learning_rate": 0.0017125034186668854, + "loss": 0.168, + "step": 25866 + }, + { + "epoch": 0.22453798144113332, + "grad_norm": 0.30078125, + "learning_rate": 0.0017124816115896306, + "loss": 0.1387, + "step": 25867 + }, + { + "epoch": 0.2245466619213375, + "grad_norm": 0.369140625, + "learning_rate": 0.0017124598038425756, + "loss": 0.0688, + "step": 25868 + }, + { + "epoch": 0.22455534240154165, + "grad_norm": 0.322265625, + "learning_rate": 0.0017124379954257434, + "loss": 0.0952, + "step": 25869 + }, + { + "epoch": 0.22456402288174582, + "grad_norm": 0.431640625, + "learning_rate": 0.0017124161863391585, + "loss": 0.1187, + "step": 25870 + }, + { + "epoch": 0.22457270336194998, + "grad_norm": 0.4921875, + "learning_rate": 0.0017123943765828444, + "loss": 0.0884, + "step": 25871 + }, + { + "epoch": 0.22458138384215415, + "grad_norm": 0.248046875, + "learning_rate": 0.0017123725661568247, + "loss": 0.1016, + "step": 25872 + }, + { + "epoch": 0.2245900643223583, + "grad_norm": 0.1015625, + "learning_rate": 0.001712350755061124, + "loss": 0.1001, + "step": 25873 + }, + { + "epoch": 0.22459874480256248, + "grad_norm": 0.185546875, + "learning_rate": 0.0017123289432957654, + "loss": 0.0947, + "step": 25874 + }, + { + "epoch": 0.22460742528276664, + "grad_norm": 0.1962890625, + "learning_rate": 0.0017123071308607733, + "loss": 0.126, + "step": 25875 + }, + { + "epoch": 0.2246161057629708, + "grad_norm": 0.1865234375, + "learning_rate": 0.0017122853177561712, + "loss": 0.2168, + "step": 25876 + }, + { + "epoch": 0.22462478624317497, + "grad_norm": 0.08740234375, + "learning_rate": 0.0017122635039819832, + "loss": 0.1025, + "step": 25877 + }, + { + "epoch": 0.22463346672337914, + "grad_norm": 0.2177734375, + "learning_rate": 0.0017122416895382327, + "loss": 0.0908, + "step": 25878 + }, + { + "epoch": 0.2246421472035833, + "grad_norm": 0.1572265625, + "learning_rate": 0.0017122198744249442, + "loss": 0.0903, + "step": 25879 + }, + { + "epoch": 0.22465082768378747, + "grad_norm": 0.734375, + "learning_rate": 0.0017121980586421416, + "loss": 0.1182, + "step": 25880 + }, + { + "epoch": 0.22465950816399163, + "grad_norm": 0.130859375, + "learning_rate": 0.0017121762421898475, + "loss": 0.0786, + "step": 25881 + }, + { + "epoch": 0.2246681886441958, + "grad_norm": 0.353515625, + "learning_rate": 0.001712154425068087, + "loss": 0.126, + "step": 25882 + }, + { + "epoch": 0.22467686912439996, + "grad_norm": 0.087890625, + "learning_rate": 0.0017121326072768838, + "loss": 0.1226, + "step": 25883 + }, + { + "epoch": 0.22468554960460413, + "grad_norm": 0.62890625, + "learning_rate": 0.0017121107888162616, + "loss": 0.0879, + "step": 25884 + }, + { + "epoch": 0.2246942300848083, + "grad_norm": 0.86328125, + "learning_rate": 0.0017120889696862443, + "loss": 0.1348, + "step": 25885 + }, + { + "epoch": 0.22470291056501246, + "grad_norm": 0.1259765625, + "learning_rate": 0.0017120671498868556, + "loss": 0.0854, + "step": 25886 + }, + { + "epoch": 0.22471159104521662, + "grad_norm": 0.3125, + "learning_rate": 
0.0017120453294181192, + "loss": 0.1621, + "step": 25887 + }, + { + "epoch": 0.2247202715254208, + "grad_norm": 0.287109375, + "learning_rate": 0.0017120235082800596, + "loss": 0.0967, + "step": 25888 + }, + { + "epoch": 0.22472895200562495, + "grad_norm": 0.404296875, + "learning_rate": 0.0017120016864727002, + "loss": 0.1621, + "step": 25889 + }, + { + "epoch": 0.22473763248582912, + "grad_norm": 0.1279296875, + "learning_rate": 0.0017119798639960646, + "loss": 0.0869, + "step": 25890 + }, + { + "epoch": 0.22474631296603328, + "grad_norm": 0.17578125, + "learning_rate": 0.0017119580408501774, + "loss": 0.1143, + "step": 25891 + }, + { + "epoch": 0.22475499344623745, + "grad_norm": 0.08056640625, + "learning_rate": 0.0017119362170350619, + "loss": 0.0703, + "step": 25892 + }, + { + "epoch": 0.22476367392644162, + "grad_norm": 0.16015625, + "learning_rate": 0.0017119143925507423, + "loss": 0.2383, + "step": 25893 + }, + { + "epoch": 0.22477235440664578, + "grad_norm": 0.390625, + "learning_rate": 0.001711892567397242, + "loss": 0.1211, + "step": 25894 + }, + { + "epoch": 0.22478103488684995, + "grad_norm": 0.4921875, + "learning_rate": 0.0017118707415745855, + "loss": 0.1045, + "step": 25895 + }, + { + "epoch": 0.2247897153670541, + "grad_norm": 0.3671875, + "learning_rate": 0.0017118489150827962, + "loss": 0.0918, + "step": 25896 + }, + { + "epoch": 0.22479839584725828, + "grad_norm": 0.5078125, + "learning_rate": 0.0017118270879218985, + "loss": 0.1021, + "step": 25897 + }, + { + "epoch": 0.22480707632746244, + "grad_norm": 0.66796875, + "learning_rate": 0.0017118052600919153, + "loss": 0.1021, + "step": 25898 + }, + { + "epoch": 0.2248157568076666, + "grad_norm": 0.0888671875, + "learning_rate": 0.0017117834315928714, + "loss": 0.1079, + "step": 25899 + }, + { + "epoch": 0.22482443728787077, + "grad_norm": 0.71875, + "learning_rate": 0.0017117616024247902, + "loss": 0.1299, + "step": 25900 + }, + { + "epoch": 0.22483311776807494, + "grad_norm": 0.671875, + "learning_rate": 0.001711739772587696, + "loss": 0.1475, + "step": 25901 + }, + { + "epoch": 0.2248417982482791, + "grad_norm": 0.97265625, + "learning_rate": 0.0017117179420816121, + "loss": 0.125, + "step": 25902 + }, + { + "epoch": 0.22485047872848327, + "grad_norm": 0.1845703125, + "learning_rate": 0.0017116961109065625, + "loss": 0.1426, + "step": 25903 + }, + { + "epoch": 0.22485915920868743, + "grad_norm": 0.232421875, + "learning_rate": 0.0017116742790625715, + "loss": 0.1089, + "step": 25904 + }, + { + "epoch": 0.2248678396888916, + "grad_norm": 0.375, + "learning_rate": 0.0017116524465496628, + "loss": 0.124, + "step": 25905 + }, + { + "epoch": 0.22487652016909576, + "grad_norm": 0.353515625, + "learning_rate": 0.0017116306133678598, + "loss": 0.0986, + "step": 25906 + }, + { + "epoch": 0.22488520064929993, + "grad_norm": 0.337890625, + "learning_rate": 0.0017116087795171868, + "loss": 0.1182, + "step": 25907 + }, + { + "epoch": 0.2248938811295041, + "grad_norm": 0.3515625, + "learning_rate": 0.0017115869449976678, + "loss": 0.1299, + "step": 25908 + }, + { + "epoch": 0.22490256160970826, + "grad_norm": 0.2431640625, + "learning_rate": 0.0017115651098093265, + "loss": 0.0776, + "step": 25909 + }, + { + "epoch": 0.22491124208991242, + "grad_norm": 0.19140625, + "learning_rate": 0.001711543273952187, + "loss": 0.1016, + "step": 25910 + }, + { + "epoch": 0.2249199225701166, + "grad_norm": 0.4609375, + "learning_rate": 0.0017115214374262726, + "loss": 0.2129, + "step": 25911 + }, + { + "epoch": 0.22492860305032075, + 
"grad_norm": 0.0888671875, + "learning_rate": 0.0017114996002316075, + "loss": 0.1172, + "step": 25912 + }, + { + "epoch": 0.22493728353052492, + "grad_norm": 0.19140625, + "learning_rate": 0.0017114777623682155, + "loss": 0.0962, + "step": 25913 + }, + { + "epoch": 0.22494596401072908, + "grad_norm": 0.3359375, + "learning_rate": 0.0017114559238361213, + "loss": 0.0889, + "step": 25914 + }, + { + "epoch": 0.22495464449093325, + "grad_norm": 0.1865234375, + "learning_rate": 0.0017114340846353475, + "loss": 0.0806, + "step": 25915 + }, + { + "epoch": 0.2249633249711374, + "grad_norm": 0.189453125, + "learning_rate": 0.0017114122447659187, + "loss": 0.1064, + "step": 25916 + }, + { + "epoch": 0.22497200545134158, + "grad_norm": 0.326171875, + "learning_rate": 0.001711390404227859, + "loss": 0.1128, + "step": 25917 + }, + { + "epoch": 0.22498068593154574, + "grad_norm": 0.1240234375, + "learning_rate": 0.0017113685630211917, + "loss": 0.1035, + "step": 25918 + }, + { + "epoch": 0.2249893664117499, + "grad_norm": 0.333984375, + "learning_rate": 0.0017113467211459408, + "loss": 0.1416, + "step": 25919 + }, + { + "epoch": 0.22499804689195407, + "grad_norm": 0.28515625, + "learning_rate": 0.0017113248786021305, + "loss": 0.1006, + "step": 25920 + }, + { + "epoch": 0.22500672737215824, + "grad_norm": 0.474609375, + "learning_rate": 0.001711303035389784, + "loss": 0.1206, + "step": 25921 + }, + { + "epoch": 0.2250154078523624, + "grad_norm": 0.296875, + "learning_rate": 0.0017112811915089262, + "loss": 0.0859, + "step": 25922 + }, + { + "epoch": 0.22502408833256657, + "grad_norm": 0.53515625, + "learning_rate": 0.0017112593469595803, + "loss": 0.1128, + "step": 25923 + }, + { + "epoch": 0.22503276881277073, + "grad_norm": 0.0947265625, + "learning_rate": 0.0017112375017417705, + "loss": 0.1157, + "step": 25924 + }, + { + "epoch": 0.2250414492929749, + "grad_norm": 0.25, + "learning_rate": 0.0017112156558555206, + "loss": 0.0898, + "step": 25925 + }, + { + "epoch": 0.22505012977317906, + "grad_norm": 0.255859375, + "learning_rate": 0.0017111938093008542, + "loss": 0.106, + "step": 25926 + }, + { + "epoch": 0.22505881025338323, + "grad_norm": 0.328125, + "learning_rate": 0.0017111719620777955, + "loss": 0.0767, + "step": 25927 + }, + { + "epoch": 0.2250674907335874, + "grad_norm": 0.296875, + "learning_rate": 0.0017111501141863685, + "loss": 0.1025, + "step": 25928 + }, + { + "epoch": 0.22507617121379156, + "grad_norm": 0.15625, + "learning_rate": 0.0017111282656265966, + "loss": 0.0918, + "step": 25929 + }, + { + "epoch": 0.22508485169399572, + "grad_norm": 0.57421875, + "learning_rate": 0.0017111064163985042, + "loss": 0.1396, + "step": 25930 + }, + { + "epoch": 0.2250935321741999, + "grad_norm": 0.44921875, + "learning_rate": 0.001711084566502115, + "loss": 0.1211, + "step": 25931 + }, + { + "epoch": 0.22510221265440405, + "grad_norm": 0.41015625, + "learning_rate": 0.001711062715937453, + "loss": 0.1553, + "step": 25932 + }, + { + "epoch": 0.22511089313460822, + "grad_norm": 0.259765625, + "learning_rate": 0.001711040864704542, + "loss": 0.1069, + "step": 25933 + }, + { + "epoch": 0.22511957361481238, + "grad_norm": 0.4375, + "learning_rate": 0.0017110190128034057, + "loss": 0.1113, + "step": 25934 + }, + { + "epoch": 0.22512825409501655, + "grad_norm": 0.2138671875, + "learning_rate": 0.001710997160234068, + "loss": 0.1309, + "step": 25935 + }, + { + "epoch": 0.2251369345752207, + "grad_norm": 0.365234375, + "learning_rate": 0.0017109753069965534, + "loss": 0.0811, + "step": 25936 + }, + { + 
"epoch": 0.22514561505542485, + "grad_norm": 0.55078125, + "learning_rate": 0.0017109534530908852, + "loss": 0.0869, + "step": 25937 + }, + { + "epoch": 0.22515429553562902, + "grad_norm": 0.208984375, + "learning_rate": 0.0017109315985170874, + "loss": 0.106, + "step": 25938 + }, + { + "epoch": 0.22516297601583318, + "grad_norm": 0.2421875, + "learning_rate": 0.0017109097432751843, + "loss": 0.1152, + "step": 25939 + }, + { + "epoch": 0.22517165649603735, + "grad_norm": 0.1484375, + "learning_rate": 0.0017108878873651995, + "loss": 0.0986, + "step": 25940 + }, + { + "epoch": 0.2251803369762415, + "grad_norm": 0.0927734375, + "learning_rate": 0.0017108660307871564, + "loss": 0.1152, + "step": 25941 + }, + { + "epoch": 0.22518901745644568, + "grad_norm": 0.1435546875, + "learning_rate": 0.0017108441735410797, + "loss": 0.1475, + "step": 25942 + }, + { + "epoch": 0.22519769793664984, + "grad_norm": 0.1484375, + "learning_rate": 0.0017108223156269932, + "loss": 0.103, + "step": 25943 + }, + { + "epoch": 0.225206378416854, + "grad_norm": 0.953125, + "learning_rate": 0.00171080045704492, + "loss": 0.1035, + "step": 25944 + }, + { + "epoch": 0.22521505889705817, + "grad_norm": 0.67578125, + "learning_rate": 0.0017107785977948848, + "loss": 0.127, + "step": 25945 + }, + { + "epoch": 0.22522373937726234, + "grad_norm": 0.2060546875, + "learning_rate": 0.0017107567378769116, + "loss": 0.085, + "step": 25946 + }, + { + "epoch": 0.2252324198574665, + "grad_norm": 0.76953125, + "learning_rate": 0.001710734877291024, + "loss": 0.0654, + "step": 25947 + }, + { + "epoch": 0.22524110033767067, + "grad_norm": 0.5, + "learning_rate": 0.0017107130160372457, + "loss": 0.0845, + "step": 25948 + }, + { + "epoch": 0.22524978081787483, + "grad_norm": 0.162109375, + "learning_rate": 0.001710691154115601, + "loss": 0.1045, + "step": 25949 + }, + { + "epoch": 0.225258461298079, + "grad_norm": 0.609375, + "learning_rate": 0.0017106692915261135, + "loss": 0.1016, + "step": 25950 + }, + { + "epoch": 0.22526714177828316, + "grad_norm": 0.1591796875, + "learning_rate": 0.001710647428268807, + "loss": 0.0986, + "step": 25951 + }, + { + "epoch": 0.22527582225848733, + "grad_norm": 1.0, + "learning_rate": 0.001710625564343706, + "loss": 0.1309, + "step": 25952 + }, + { + "epoch": 0.2252845027386915, + "grad_norm": 0.384765625, + "learning_rate": 0.0017106036997508339, + "loss": 0.0811, + "step": 25953 + }, + { + "epoch": 0.22529318321889566, + "grad_norm": 0.294921875, + "learning_rate": 0.001710581834490215, + "loss": 0.1279, + "step": 25954 + }, + { + "epoch": 0.22530186369909982, + "grad_norm": 0.09130859375, + "learning_rate": 0.0017105599685618726, + "loss": 0.0879, + "step": 25955 + }, + { + "epoch": 0.225310544179304, + "grad_norm": 0.298828125, + "learning_rate": 0.0017105381019658313, + "loss": 0.083, + "step": 25956 + }, + { + "epoch": 0.22531922465950815, + "grad_norm": 0.1533203125, + "learning_rate": 0.0017105162347021145, + "loss": 0.1484, + "step": 25957 + }, + { + "epoch": 0.22532790513971232, + "grad_norm": 0.26953125, + "learning_rate": 0.0017104943667707465, + "loss": 0.1309, + "step": 25958 + }, + { + "epoch": 0.22533658561991649, + "grad_norm": 0.16015625, + "learning_rate": 0.0017104724981717508, + "loss": 0.1196, + "step": 25959 + }, + { + "epoch": 0.22534526610012065, + "grad_norm": 0.5859375, + "learning_rate": 0.001710450628905152, + "loss": 0.1035, + "step": 25960 + }, + { + "epoch": 0.22535394658032482, + "grad_norm": 0.2216796875, + "learning_rate": 0.001710428758970973, + "loss": 0.1494, + 
"step": 25961 + }, + { + "epoch": 0.22536262706052898, + "grad_norm": 0.1943359375, + "learning_rate": 0.0017104068883692387, + "loss": 0.1064, + "step": 25962 + }, + { + "epoch": 0.22537130754073315, + "grad_norm": 0.345703125, + "learning_rate": 0.0017103850170999725, + "loss": 0.1235, + "step": 25963 + }, + { + "epoch": 0.2253799880209373, + "grad_norm": 0.154296875, + "learning_rate": 0.001710363145163198, + "loss": 0.1025, + "step": 25964 + }, + { + "epoch": 0.22538866850114148, + "grad_norm": 0.109375, + "learning_rate": 0.00171034127255894, + "loss": 0.1406, + "step": 25965 + }, + { + "epoch": 0.22539734898134564, + "grad_norm": 0.4453125, + "learning_rate": 0.001710319399287222, + "loss": 0.085, + "step": 25966 + }, + { + "epoch": 0.2254060294615498, + "grad_norm": 0.14453125, + "learning_rate": 0.0017102975253480677, + "loss": 0.1299, + "step": 25967 + }, + { + "epoch": 0.22541470994175397, + "grad_norm": 0.083984375, + "learning_rate": 0.001710275650741501, + "loss": 0.126, + "step": 25968 + }, + { + "epoch": 0.22542339042195814, + "grad_norm": 0.62890625, + "learning_rate": 0.0017102537754675464, + "loss": 0.1055, + "step": 25969 + }, + { + "epoch": 0.2254320709021623, + "grad_norm": 0.1220703125, + "learning_rate": 0.0017102318995262274, + "loss": 0.1338, + "step": 25970 + }, + { + "epoch": 0.22544075138236647, + "grad_norm": 0.384765625, + "learning_rate": 0.0017102100229175678, + "loss": 0.1226, + "step": 25971 + }, + { + "epoch": 0.22544943186257063, + "grad_norm": 0.111328125, + "learning_rate": 0.0017101881456415917, + "loss": 0.1104, + "step": 25972 + }, + { + "epoch": 0.2254581123427748, + "grad_norm": 0.44921875, + "learning_rate": 0.001710166267698323, + "loss": 0.1211, + "step": 25973 + }, + { + "epoch": 0.22546679282297896, + "grad_norm": 0.46484375, + "learning_rate": 0.0017101443890877857, + "loss": 0.1318, + "step": 25974 + }, + { + "epoch": 0.22547547330318313, + "grad_norm": 0.466796875, + "learning_rate": 0.0017101225098100038, + "loss": 0.0801, + "step": 25975 + }, + { + "epoch": 0.2254841537833873, + "grad_norm": 0.322265625, + "learning_rate": 0.001710100629865001, + "loss": 0.1201, + "step": 25976 + }, + { + "epoch": 0.22549283426359146, + "grad_norm": 0.10986328125, + "learning_rate": 0.001710078749252801, + "loss": 0.125, + "step": 25977 + }, + { + "epoch": 0.22550151474379562, + "grad_norm": 0.09375, + "learning_rate": 0.0017100568679734283, + "loss": 0.0933, + "step": 25978 + }, + { + "epoch": 0.2255101952239998, + "grad_norm": 0.451171875, + "learning_rate": 0.0017100349860269066, + "loss": 0.1006, + "step": 25979 + }, + { + "epoch": 0.22551887570420395, + "grad_norm": 0.1015625, + "learning_rate": 0.00171001310341326, + "loss": 0.1221, + "step": 25980 + }, + { + "epoch": 0.22552755618440812, + "grad_norm": 0.1181640625, + "learning_rate": 0.001709991220132512, + "loss": 0.0972, + "step": 25981 + }, + { + "epoch": 0.22553623666461228, + "grad_norm": 0.1845703125, + "learning_rate": 0.0017099693361846866, + "loss": 0.1396, + "step": 25982 + }, + { + "epoch": 0.22554491714481645, + "grad_norm": 0.2412109375, + "learning_rate": 0.0017099474515698083, + "loss": 0.1143, + "step": 25983 + }, + { + "epoch": 0.2255535976250206, + "grad_norm": 0.1640625, + "learning_rate": 0.0017099255662879002, + "loss": 0.1074, + "step": 25984 + }, + { + "epoch": 0.22556227810522478, + "grad_norm": 0.0791015625, + "learning_rate": 0.0017099036803389868, + "loss": 0.1001, + "step": 25985 + }, + { + "epoch": 0.22557095858542894, + "grad_norm": 0.32421875, + "learning_rate": 
0.0017098817937230921, + "loss": 0.1016, + "step": 25986 + }, + { + "epoch": 0.2255796390656331, + "grad_norm": 0.25, + "learning_rate": 0.00170985990644024, + "loss": 0.168, + "step": 25987 + }, + { + "epoch": 0.22558831954583727, + "grad_norm": 0.341796875, + "learning_rate": 0.0017098380184904537, + "loss": 0.1445, + "step": 25988 + }, + { + "epoch": 0.22559700002604144, + "grad_norm": 0.09765625, + "learning_rate": 0.001709816129873758, + "loss": 0.1001, + "step": 25989 + }, + { + "epoch": 0.2256056805062456, + "grad_norm": 0.341796875, + "learning_rate": 0.0017097942405901766, + "loss": 0.0996, + "step": 25990 + }, + { + "epoch": 0.22561436098644977, + "grad_norm": 0.37109375, + "learning_rate": 0.0017097723506397332, + "loss": 0.1367, + "step": 25991 + }, + { + "epoch": 0.22562304146665393, + "grad_norm": 0.267578125, + "learning_rate": 0.001709750460022452, + "loss": 0.0767, + "step": 25992 + }, + { + "epoch": 0.2256317219468581, + "grad_norm": 0.1904296875, + "learning_rate": 0.0017097285687383567, + "loss": 0.1201, + "step": 25993 + }, + { + "epoch": 0.22564040242706226, + "grad_norm": 0.1513671875, + "learning_rate": 0.0017097066767874716, + "loss": 0.0981, + "step": 25994 + }, + { + "epoch": 0.22564908290726643, + "grad_norm": 0.17578125, + "learning_rate": 0.0017096847841698204, + "loss": 0.0947, + "step": 25995 + }, + { + "epoch": 0.2256577633874706, + "grad_norm": 0.1328125, + "learning_rate": 0.0017096628908854275, + "loss": 0.1152, + "step": 25996 + }, + { + "epoch": 0.22566644386767476, + "grad_norm": 0.3125, + "learning_rate": 0.0017096409969343158, + "loss": 0.0811, + "step": 25997 + }, + { + "epoch": 0.22567512434787892, + "grad_norm": 0.81640625, + "learning_rate": 0.00170961910231651, + "loss": 0.1143, + "step": 25998 + }, + { + "epoch": 0.2256838048280831, + "grad_norm": 0.1572265625, + "learning_rate": 0.001709597207032034, + "loss": 0.1475, + "step": 25999 + }, + { + "epoch": 0.22569248530828726, + "grad_norm": 0.1552734375, + "learning_rate": 0.0017095753110809118, + "loss": 0.0879, + "step": 26000 + }, + { + "epoch": 0.22570116578849142, + "grad_norm": 0.267578125, + "learning_rate": 0.001709553414463167, + "loss": 0.1064, + "step": 26001 + }, + { + "epoch": 0.22570984626869559, + "grad_norm": 0.255859375, + "learning_rate": 0.0017095315171788237, + "loss": 0.1484, + "step": 26002 + }, + { + "epoch": 0.22571852674889975, + "grad_norm": 0.240234375, + "learning_rate": 0.001709509619227906, + "loss": 0.1211, + "step": 26003 + }, + { + "epoch": 0.22572720722910392, + "grad_norm": 0.17578125, + "learning_rate": 0.0017094877206104378, + "loss": 0.1221, + "step": 26004 + }, + { + "epoch": 0.22573588770930808, + "grad_norm": 0.53125, + "learning_rate": 0.001709465821326443, + "loss": 0.0923, + "step": 26005 + }, + { + "epoch": 0.22574456818951225, + "grad_norm": 0.1943359375, + "learning_rate": 0.0017094439213759453, + "loss": 0.085, + "step": 26006 + }, + { + "epoch": 0.2257532486697164, + "grad_norm": 0.251953125, + "learning_rate": 0.001709422020758969, + "loss": 0.1201, + "step": 26007 + }, + { + "epoch": 0.22576192914992058, + "grad_norm": 0.33203125, + "learning_rate": 0.0017094001194755382, + "loss": 0.1216, + "step": 26008 + }, + { + "epoch": 0.22577060963012474, + "grad_norm": 0.09814453125, + "learning_rate": 0.0017093782175256762, + "loss": 0.085, + "step": 26009 + }, + { + "epoch": 0.2257792901103289, + "grad_norm": 0.3359375, + "learning_rate": 0.0017093563149094076, + "loss": 0.0781, + "step": 26010 + }, + { + "epoch": 0.22578797059053307, + "grad_norm": 
0.09326171875, + "learning_rate": 0.0017093344116267558, + "loss": 0.0977, + "step": 26011 + }, + { + "epoch": 0.22579665107073724, + "grad_norm": 0.30859375, + "learning_rate": 0.001709312507677745, + "loss": 0.1006, + "step": 26012 + }, + { + "epoch": 0.2258053315509414, + "grad_norm": 0.11767578125, + "learning_rate": 0.0017092906030623995, + "loss": 0.0728, + "step": 26013 + }, + { + "epoch": 0.22581401203114557, + "grad_norm": 0.48046875, + "learning_rate": 0.001709268697780743, + "loss": 0.1064, + "step": 26014 + }, + { + "epoch": 0.22582269251134973, + "grad_norm": 0.134765625, + "learning_rate": 0.0017092467918327994, + "loss": 0.1348, + "step": 26015 + }, + { + "epoch": 0.2258313729915539, + "grad_norm": 0.365234375, + "learning_rate": 0.0017092248852185923, + "loss": 0.0918, + "step": 26016 + }, + { + "epoch": 0.22584005347175806, + "grad_norm": 0.7578125, + "learning_rate": 0.0017092029779381464, + "loss": 0.106, + "step": 26017 + }, + { + "epoch": 0.22584873395196223, + "grad_norm": 0.56640625, + "learning_rate": 0.0017091810699914852, + "loss": 0.0996, + "step": 26018 + }, + { + "epoch": 0.2258574144321664, + "grad_norm": 0.80859375, + "learning_rate": 0.0017091591613786327, + "loss": 0.1069, + "step": 26019 + }, + { + "epoch": 0.22586609491237056, + "grad_norm": 0.1533203125, + "learning_rate": 0.0017091372520996128, + "loss": 0.0923, + "step": 26020 + }, + { + "epoch": 0.22587477539257472, + "grad_norm": 0.70703125, + "learning_rate": 0.0017091153421544497, + "loss": 0.1064, + "step": 26021 + }, + { + "epoch": 0.2258834558727789, + "grad_norm": 0.52734375, + "learning_rate": 0.0017090934315431672, + "loss": 0.1187, + "step": 26022 + }, + { + "epoch": 0.22589213635298305, + "grad_norm": 0.470703125, + "learning_rate": 0.001709071520265789, + "loss": 0.084, + "step": 26023 + }, + { + "epoch": 0.22590081683318722, + "grad_norm": 0.470703125, + "learning_rate": 0.0017090496083223394, + "loss": 0.1377, + "step": 26024 + }, + { + "epoch": 0.22590949731339138, + "grad_norm": 0.248046875, + "learning_rate": 0.0017090276957128425, + "loss": 0.168, + "step": 26025 + }, + { + "epoch": 0.22591817779359555, + "grad_norm": 0.341796875, + "learning_rate": 0.001709005782437322, + "loss": 0.0869, + "step": 26026 + }, + { + "epoch": 0.2259268582737997, + "grad_norm": 0.12890625, + "learning_rate": 0.001708983868495802, + "loss": 0.1289, + "step": 26027 + }, + { + "epoch": 0.22593553875400388, + "grad_norm": 0.2490234375, + "learning_rate": 0.0017089619538883062, + "loss": 0.1348, + "step": 26028 + }, + { + "epoch": 0.22594421923420804, + "grad_norm": 0.32421875, + "learning_rate": 0.001708940038614859, + "loss": 0.106, + "step": 26029 + }, + { + "epoch": 0.2259528997144122, + "grad_norm": 0.2294921875, + "learning_rate": 0.0017089181226754842, + "loss": 0.0786, + "step": 26030 + }, + { + "epoch": 0.22596158019461637, + "grad_norm": 0.275390625, + "learning_rate": 0.0017088962060702053, + "loss": 0.0737, + "step": 26031 + }, + { + "epoch": 0.22597026067482054, + "grad_norm": 0.21875, + "learning_rate": 0.001708874288799047, + "loss": 0.1191, + "step": 26032 + }, + { + "epoch": 0.2259789411550247, + "grad_norm": 0.294921875, + "learning_rate": 0.0017088523708620328, + "loss": 0.0938, + "step": 26033 + }, + { + "epoch": 0.22598762163522887, + "grad_norm": 0.671875, + "learning_rate": 0.0017088304522591866, + "loss": 0.0815, + "step": 26034 + }, + { + "epoch": 0.22599630211543303, + "grad_norm": 0.228515625, + "learning_rate": 0.0017088085329905328, + "loss": 0.0703, + "step": 26035 + }, + { + 
"epoch": 0.2260049825956372, + "grad_norm": 0.2021484375, + "learning_rate": 0.001708786613056095, + "loss": 0.0942, + "step": 26036 + }, + { + "epoch": 0.22601366307584136, + "grad_norm": 0.1845703125, + "learning_rate": 0.0017087646924558973, + "loss": 0.0928, + "step": 26037 + }, + { + "epoch": 0.22602234355604553, + "grad_norm": 0.1533203125, + "learning_rate": 0.001708742771189964, + "loss": 0.1221, + "step": 26038 + }, + { + "epoch": 0.2260310240362497, + "grad_norm": 0.158203125, + "learning_rate": 0.0017087208492583183, + "loss": 0.1621, + "step": 26039 + }, + { + "epoch": 0.22603970451645386, + "grad_norm": 0.1240234375, + "learning_rate": 0.001708698926660985, + "loss": 0.1016, + "step": 26040 + }, + { + "epoch": 0.22604838499665802, + "grad_norm": 0.328125, + "learning_rate": 0.0017086770033979872, + "loss": 0.105, + "step": 26041 + }, + { + "epoch": 0.2260570654768622, + "grad_norm": 0.5078125, + "learning_rate": 0.0017086550794693498, + "loss": 0.1011, + "step": 26042 + }, + { + "epoch": 0.22606574595706636, + "grad_norm": 0.4453125, + "learning_rate": 0.0017086331548750965, + "loss": 0.0786, + "step": 26043 + }, + { + "epoch": 0.22607442643727052, + "grad_norm": 0.228515625, + "learning_rate": 0.0017086112296152507, + "loss": 0.0972, + "step": 26044 + }, + { + "epoch": 0.22608310691747469, + "grad_norm": 0.1650390625, + "learning_rate": 0.001708589303689837, + "loss": 0.0947, + "step": 26045 + }, + { + "epoch": 0.22609178739767885, + "grad_norm": 0.84375, + "learning_rate": 0.0017085673770988793, + "loss": 0.0908, + "step": 26046 + }, + { + "epoch": 0.22610046787788302, + "grad_norm": 0.30078125, + "learning_rate": 0.0017085454498424013, + "loss": 0.1025, + "step": 26047 + }, + { + "epoch": 0.22610914835808718, + "grad_norm": 0.83203125, + "learning_rate": 0.0017085235219204272, + "loss": 0.1152, + "step": 26048 + }, + { + "epoch": 0.22611782883829135, + "grad_norm": 0.57421875, + "learning_rate": 0.001708501593332981, + "loss": 0.1128, + "step": 26049 + }, + { + "epoch": 0.2261265093184955, + "grad_norm": 0.47265625, + "learning_rate": 0.0017084796640800862, + "loss": 0.1182, + "step": 26050 + }, + { + "epoch": 0.22613518979869968, + "grad_norm": 1.03125, + "learning_rate": 0.0017084577341617674, + "loss": 0.1406, + "step": 26051 + }, + { + "epoch": 0.22614387027890384, + "grad_norm": 0.1337890625, + "learning_rate": 0.0017084358035780484, + "loss": 0.1133, + "step": 26052 + }, + { + "epoch": 0.226152550759108, + "grad_norm": 0.267578125, + "learning_rate": 0.0017084138723289532, + "loss": 0.0742, + "step": 26053 + }, + { + "epoch": 0.22616123123931217, + "grad_norm": 0.88671875, + "learning_rate": 0.001708391940414506, + "loss": 0.1279, + "step": 26054 + }, + { + "epoch": 0.22616991171951634, + "grad_norm": 0.181640625, + "learning_rate": 0.00170837000783473, + "loss": 0.1387, + "step": 26055 + }, + { + "epoch": 0.2261785921997205, + "grad_norm": 0.50390625, + "learning_rate": 0.0017083480745896497, + "loss": 0.1006, + "step": 26056 + }, + { + "epoch": 0.22618727267992467, + "grad_norm": 0.2578125, + "learning_rate": 0.0017083261406792896, + "loss": 0.1074, + "step": 26057 + }, + { + "epoch": 0.22619595316012883, + "grad_norm": 0.52734375, + "learning_rate": 0.0017083042061036729, + "loss": 0.123, + "step": 26058 + }, + { + "epoch": 0.22620463364033297, + "grad_norm": 0.2470703125, + "learning_rate": 0.001708282270862824, + "loss": 0.1445, + "step": 26059 + }, + { + "epoch": 0.22621331412053713, + "grad_norm": 0.15625, + "learning_rate": 0.0017082603349567664, + "loss": 
0.0825, + "step": 26060 + }, + { + "epoch": 0.2262219946007413, + "grad_norm": 0.83203125, + "learning_rate": 0.0017082383983855248, + "loss": 0.1289, + "step": 26061 + }, + { + "epoch": 0.22623067508094546, + "grad_norm": 0.29296875, + "learning_rate": 0.0017082164611491226, + "loss": 0.1167, + "step": 26062 + }, + { + "epoch": 0.22623935556114963, + "grad_norm": 0.216796875, + "learning_rate": 0.0017081945232475843, + "loss": 0.0986, + "step": 26063 + }, + { + "epoch": 0.2262480360413538, + "grad_norm": 0.75390625, + "learning_rate": 0.001708172584680933, + "loss": 0.0967, + "step": 26064 + }, + { + "epoch": 0.22625671652155796, + "grad_norm": 0.115234375, + "learning_rate": 0.0017081506454491937, + "loss": 0.1045, + "step": 26065 + }, + { + "epoch": 0.22626539700176213, + "grad_norm": 1.0234375, + "learning_rate": 0.0017081287055523901, + "loss": 0.0986, + "step": 26066 + }, + { + "epoch": 0.2262740774819663, + "grad_norm": 0.2353515625, + "learning_rate": 0.0017081067649905458, + "loss": 0.1177, + "step": 26067 + }, + { + "epoch": 0.22628275796217046, + "grad_norm": 0.51953125, + "learning_rate": 0.0017080848237636854, + "loss": 0.0991, + "step": 26068 + }, + { + "epoch": 0.22629143844237462, + "grad_norm": 0.486328125, + "learning_rate": 0.0017080628818718323, + "loss": 0.0967, + "step": 26069 + }, + { + "epoch": 0.22630011892257879, + "grad_norm": 0.244140625, + "learning_rate": 0.0017080409393150106, + "loss": 0.1621, + "step": 26070 + }, + { + "epoch": 0.22630879940278295, + "grad_norm": 0.216796875, + "learning_rate": 0.0017080189960932452, + "loss": 0.1055, + "step": 26071 + }, + { + "epoch": 0.22631747988298712, + "grad_norm": 0.1318359375, + "learning_rate": 0.0017079970522065587, + "loss": 0.106, + "step": 26072 + }, + { + "epoch": 0.22632616036319128, + "grad_norm": 0.87890625, + "learning_rate": 0.001707975107654976, + "loss": 0.1094, + "step": 26073 + }, + { + "epoch": 0.22633484084339545, + "grad_norm": 0.30859375, + "learning_rate": 0.0017079531624385207, + "loss": 0.0933, + "step": 26074 + }, + { + "epoch": 0.2263435213235996, + "grad_norm": 0.1005859375, + "learning_rate": 0.0017079312165572172, + "loss": 0.1221, + "step": 26075 + }, + { + "epoch": 0.22635220180380378, + "grad_norm": 0.201171875, + "learning_rate": 0.001707909270011089, + "loss": 0.0938, + "step": 26076 + }, + { + "epoch": 0.22636088228400794, + "grad_norm": 0.123046875, + "learning_rate": 0.0017078873228001605, + "loss": 0.1045, + "step": 26077 + }, + { + "epoch": 0.2263695627642121, + "grad_norm": 0.55859375, + "learning_rate": 0.0017078653749244557, + "loss": 0.1045, + "step": 26078 + }, + { + "epoch": 0.22637824324441627, + "grad_norm": 0.59765625, + "learning_rate": 0.0017078434263839981, + "loss": 0.1021, + "step": 26079 + }, + { + "epoch": 0.22638692372462044, + "grad_norm": 0.208984375, + "learning_rate": 0.0017078214771788123, + "loss": 0.1172, + "step": 26080 + }, + { + "epoch": 0.2263956042048246, + "grad_norm": 0.087890625, + "learning_rate": 0.0017077995273089219, + "loss": 0.1064, + "step": 26081 + }, + { + "epoch": 0.22640428468502877, + "grad_norm": 0.65625, + "learning_rate": 0.0017077775767743513, + "loss": 0.1279, + "step": 26082 + }, + { + "epoch": 0.22641296516523293, + "grad_norm": 0.20703125, + "learning_rate": 0.0017077556255751243, + "loss": 0.1113, + "step": 26083 + }, + { + "epoch": 0.2264216456454371, + "grad_norm": 0.2177734375, + "learning_rate": 0.0017077336737112646, + "loss": 0.126, + "step": 26084 + }, + { + "epoch": 0.22643032612564126, + "grad_norm": 0.5390625, + 
"learning_rate": 0.0017077117211827969, + "loss": 0.0928, + "step": 26085 + }, + { + "epoch": 0.22643900660584543, + "grad_norm": 0.1279296875, + "learning_rate": 0.0017076897679897444, + "loss": 0.0903, + "step": 26086 + }, + { + "epoch": 0.2264476870860496, + "grad_norm": 0.2578125, + "learning_rate": 0.0017076678141321317, + "loss": 0.0898, + "step": 26087 + }, + { + "epoch": 0.22645636756625376, + "grad_norm": 0.0966796875, + "learning_rate": 0.0017076458596099824, + "loss": 0.0835, + "step": 26088 + }, + { + "epoch": 0.22646504804645792, + "grad_norm": 0.130859375, + "learning_rate": 0.0017076239044233207, + "loss": 0.0967, + "step": 26089 + }, + { + "epoch": 0.2264737285266621, + "grad_norm": 0.13671875, + "learning_rate": 0.001707601948572171, + "loss": 0.1816, + "step": 26090 + }, + { + "epoch": 0.22648240900686625, + "grad_norm": 0.12890625, + "learning_rate": 0.0017075799920565568, + "loss": 0.1211, + "step": 26091 + }, + { + "epoch": 0.22649108948707042, + "grad_norm": 0.431640625, + "learning_rate": 0.001707558034876502, + "loss": 0.0977, + "step": 26092 + }, + { + "epoch": 0.22649976996727458, + "grad_norm": 0.443359375, + "learning_rate": 0.0017075360770320309, + "loss": 0.1006, + "step": 26093 + }, + { + "epoch": 0.22650845044747875, + "grad_norm": 1.859375, + "learning_rate": 0.0017075141185231677, + "loss": 0.4004, + "step": 26094 + }, + { + "epoch": 0.2265171309276829, + "grad_norm": 0.291015625, + "learning_rate": 0.001707492159349936, + "loss": 0.1084, + "step": 26095 + }, + { + "epoch": 0.22652581140788708, + "grad_norm": 0.314453125, + "learning_rate": 0.0017074701995123602, + "loss": 0.1104, + "step": 26096 + }, + { + "epoch": 0.22653449188809124, + "grad_norm": 0.99609375, + "learning_rate": 0.0017074482390104643, + "loss": 0.124, + "step": 26097 + }, + { + "epoch": 0.2265431723682954, + "grad_norm": 0.80078125, + "learning_rate": 0.0017074262778442712, + "loss": 0.0967, + "step": 26098 + }, + { + "epoch": 0.22655185284849957, + "grad_norm": 0.625, + "learning_rate": 0.001707404316013807, + "loss": 0.1074, + "step": 26099 + }, + { + "epoch": 0.22656053332870374, + "grad_norm": 0.0947265625, + "learning_rate": 0.001707382353519094, + "loss": 0.1011, + "step": 26100 + }, + { + "epoch": 0.2265692138089079, + "grad_norm": 0.333984375, + "learning_rate": 0.0017073603903601567, + "loss": 0.1118, + "step": 26101 + }, + { + "epoch": 0.22657789428911207, + "grad_norm": 0.50390625, + "learning_rate": 0.0017073384265370193, + "loss": 0.1787, + "step": 26102 + }, + { + "epoch": 0.22658657476931623, + "grad_norm": 0.412109375, + "learning_rate": 0.001707316462049706, + "loss": 0.0903, + "step": 26103 + }, + { + "epoch": 0.2265952552495204, + "grad_norm": 0.1689453125, + "learning_rate": 0.0017072944968982403, + "loss": 0.0859, + "step": 26104 + }, + { + "epoch": 0.22660393572972456, + "grad_norm": 0.6484375, + "learning_rate": 0.0017072725310826465, + "loss": 0.0967, + "step": 26105 + }, + { + "epoch": 0.22661261620992873, + "grad_norm": 0.228515625, + "learning_rate": 0.0017072505646029482, + "loss": 0.0825, + "step": 26106 + }, + { + "epoch": 0.2266212966901329, + "grad_norm": 0.18359375, + "learning_rate": 0.0017072285974591701, + "loss": 0.0693, + "step": 26107 + }, + { + "epoch": 0.22662997717033706, + "grad_norm": 0.400390625, + "learning_rate": 0.0017072066296513362, + "loss": 0.0708, + "step": 26108 + }, + { + "epoch": 0.22663865765054123, + "grad_norm": 0.17578125, + "learning_rate": 0.0017071846611794702, + "loss": 0.0703, + "step": 26109 + }, + { + "epoch": 
0.2266473381307454, + "grad_norm": 0.56640625, + "learning_rate": 0.0017071626920435962, + "loss": 0.1035, + "step": 26110 + }, + { + "epoch": 0.22665601861094956, + "grad_norm": 0.3828125, + "learning_rate": 0.0017071407222437377, + "loss": 0.1533, + "step": 26111 + }, + { + "epoch": 0.22666469909115372, + "grad_norm": 0.1513671875, + "learning_rate": 0.0017071187517799197, + "loss": 0.1074, + "step": 26112 + }, + { + "epoch": 0.22667337957135789, + "grad_norm": 0.07666015625, + "learning_rate": 0.0017070967806521655, + "loss": 0.0957, + "step": 26113 + }, + { + "epoch": 0.22668206005156205, + "grad_norm": 0.1337890625, + "learning_rate": 0.0017070748088604997, + "loss": 0.0791, + "step": 26114 + }, + { + "epoch": 0.22669074053176622, + "grad_norm": 0.271484375, + "learning_rate": 0.0017070528364049454, + "loss": 0.1074, + "step": 26115 + }, + { + "epoch": 0.22669942101197038, + "grad_norm": 0.37109375, + "learning_rate": 0.0017070308632855277, + "loss": 0.0947, + "step": 26116 + }, + { + "epoch": 0.22670810149217455, + "grad_norm": 0.1591796875, + "learning_rate": 0.0017070088895022701, + "loss": 0.1045, + "step": 26117 + }, + { + "epoch": 0.2267167819723787, + "grad_norm": 0.2236328125, + "learning_rate": 0.001706986915055197, + "loss": 0.0801, + "step": 26118 + }, + { + "epoch": 0.22672546245258288, + "grad_norm": 0.1865234375, + "learning_rate": 0.0017069649399443314, + "loss": 0.1143, + "step": 26119 + }, + { + "epoch": 0.22673414293278704, + "grad_norm": 0.19921875, + "learning_rate": 0.0017069429641696986, + "loss": 0.0981, + "step": 26120 + }, + { + "epoch": 0.2267428234129912, + "grad_norm": 0.53125, + "learning_rate": 0.001706920987731322, + "loss": 0.1074, + "step": 26121 + }, + { + "epoch": 0.22675150389319537, + "grad_norm": 0.126953125, + "learning_rate": 0.001706899010629226, + "loss": 0.0693, + "step": 26122 + }, + { + "epoch": 0.22676018437339954, + "grad_norm": 0.2275390625, + "learning_rate": 0.0017068770328634343, + "loss": 0.125, + "step": 26123 + }, + { + "epoch": 0.2267688648536037, + "grad_norm": 0.126953125, + "learning_rate": 0.0017068550544339703, + "loss": 0.124, + "step": 26124 + }, + { + "epoch": 0.22677754533380787, + "grad_norm": 0.1630859375, + "learning_rate": 0.0017068330753408595, + "loss": 0.125, + "step": 26125 + }, + { + "epoch": 0.22678622581401203, + "grad_norm": 0.13671875, + "learning_rate": 0.0017068110955841247, + "loss": 0.0967, + "step": 26126 + }, + { + "epoch": 0.2267949062942162, + "grad_norm": 1.078125, + "learning_rate": 0.0017067891151637904, + "loss": 0.1904, + "step": 26127 + }, + { + "epoch": 0.22680358677442036, + "grad_norm": 0.55859375, + "learning_rate": 0.0017067671340798812, + "loss": 0.1602, + "step": 26128 + }, + { + "epoch": 0.22681226725462453, + "grad_norm": 0.103515625, + "learning_rate": 0.00170674515233242, + "loss": 0.1064, + "step": 26129 + }, + { + "epoch": 0.2268209477348287, + "grad_norm": 0.1171875, + "learning_rate": 0.0017067231699214316, + "loss": 0.125, + "step": 26130 + }, + { + "epoch": 0.22682962821503286, + "grad_norm": 0.1025390625, + "learning_rate": 0.0017067011868469398, + "loss": 0.0986, + "step": 26131 + }, + { + "epoch": 0.22683830869523702, + "grad_norm": 0.357421875, + "learning_rate": 0.0017066792031089687, + "loss": 0.1094, + "step": 26132 + }, + { + "epoch": 0.2268469891754412, + "grad_norm": 0.5859375, + "learning_rate": 0.0017066572187075424, + "loss": 0.125, + "step": 26133 + }, + { + "epoch": 0.22685566965564535, + "grad_norm": 0.66796875, + "learning_rate": 0.0017066352336426848, + "loss": 
0.1025, + "step": 26134 + }, + { + "epoch": 0.22686435013584952, + "grad_norm": 0.171875, + "learning_rate": 0.0017066132479144204, + "loss": 0.1162, + "step": 26135 + }, + { + "epoch": 0.22687303061605368, + "grad_norm": 0.2353515625, + "learning_rate": 0.0017065912615227725, + "loss": 0.0938, + "step": 26136 + }, + { + "epoch": 0.22688171109625785, + "grad_norm": 1.015625, + "learning_rate": 0.0017065692744677654, + "loss": 0.0986, + "step": 26137 + }, + { + "epoch": 0.226890391576462, + "grad_norm": 0.55078125, + "learning_rate": 0.0017065472867494236, + "loss": 0.0806, + "step": 26138 + }, + { + "epoch": 0.22689907205666618, + "grad_norm": 0.333984375, + "learning_rate": 0.0017065252983677708, + "loss": 0.1055, + "step": 26139 + }, + { + "epoch": 0.22690775253687034, + "grad_norm": 0.11767578125, + "learning_rate": 0.0017065033093228306, + "loss": 0.1221, + "step": 26140 + }, + { + "epoch": 0.2269164330170745, + "grad_norm": 0.1982421875, + "learning_rate": 0.001706481319614628, + "loss": 0.0996, + "step": 26141 + }, + { + "epoch": 0.22692511349727867, + "grad_norm": 0.42578125, + "learning_rate": 0.0017064593292431863, + "loss": 0.0903, + "step": 26142 + }, + { + "epoch": 0.22693379397748284, + "grad_norm": 0.380859375, + "learning_rate": 0.0017064373382085296, + "loss": 0.0986, + "step": 26143 + }, + { + "epoch": 0.226942474457687, + "grad_norm": 0.2041015625, + "learning_rate": 0.0017064153465106823, + "loss": 0.085, + "step": 26144 + }, + { + "epoch": 0.22695115493789117, + "grad_norm": 0.59375, + "learning_rate": 0.0017063933541496686, + "loss": 0.1387, + "step": 26145 + }, + { + "epoch": 0.22695983541809533, + "grad_norm": 1.0078125, + "learning_rate": 0.001706371361125512, + "loss": 0.1699, + "step": 26146 + }, + { + "epoch": 0.2269685158982995, + "grad_norm": 0.2451171875, + "learning_rate": 0.0017063493674382368, + "loss": 0.0918, + "step": 26147 + }, + { + "epoch": 0.22697719637850367, + "grad_norm": 0.140625, + "learning_rate": 0.001706327373087867, + "loss": 0.1396, + "step": 26148 + }, + { + "epoch": 0.22698587685870783, + "grad_norm": 0.134765625, + "learning_rate": 0.0017063053780744266, + "loss": 0.1377, + "step": 26149 + }, + { + "epoch": 0.226994557338912, + "grad_norm": 0.255859375, + "learning_rate": 0.00170628338239794, + "loss": 0.1631, + "step": 26150 + }, + { + "epoch": 0.22700323781911616, + "grad_norm": 0.2099609375, + "learning_rate": 0.0017062613860584306, + "loss": 0.1206, + "step": 26151 + }, + { + "epoch": 0.22701191829932033, + "grad_norm": 0.279296875, + "learning_rate": 0.0017062393890559233, + "loss": 0.1641, + "step": 26152 + }, + { + "epoch": 0.2270205987795245, + "grad_norm": 0.322265625, + "learning_rate": 0.0017062173913904414, + "loss": 0.1021, + "step": 26153 + }, + { + "epoch": 0.22702927925972866, + "grad_norm": 0.66796875, + "learning_rate": 0.0017061953930620096, + "loss": 0.1621, + "step": 26154 + }, + { + "epoch": 0.22703795973993282, + "grad_norm": 0.162109375, + "learning_rate": 0.0017061733940706512, + "loss": 0.1318, + "step": 26155 + }, + { + "epoch": 0.22704664022013699, + "grad_norm": 0.5078125, + "learning_rate": 0.001706151394416391, + "loss": 0.1064, + "step": 26156 + }, + { + "epoch": 0.22705532070034115, + "grad_norm": 0.251953125, + "learning_rate": 0.0017061293940992526, + "loss": 0.1426, + "step": 26157 + }, + { + "epoch": 0.22706400118054532, + "grad_norm": 0.462890625, + "learning_rate": 0.0017061073931192603, + "loss": 0.1016, + "step": 26158 + }, + { + "epoch": 0.22707268166074948, + "grad_norm": 0.46484375, + 
"learning_rate": 0.0017060853914764378, + "loss": 0.1221, + "step": 26159 + }, + { + "epoch": 0.22708136214095365, + "grad_norm": 0.224609375, + "learning_rate": 0.0017060633891708098, + "loss": 0.0913, + "step": 26160 + }, + { + "epoch": 0.2270900426211578, + "grad_norm": 0.080078125, + "learning_rate": 0.0017060413862024, + "loss": 0.0913, + "step": 26161 + }, + { + "epoch": 0.22709872310136198, + "grad_norm": 0.361328125, + "learning_rate": 0.001706019382571232, + "loss": 0.1021, + "step": 26162 + }, + { + "epoch": 0.22710740358156614, + "grad_norm": 0.1982421875, + "learning_rate": 0.0017059973782773306, + "loss": 0.1367, + "step": 26163 + }, + { + "epoch": 0.2271160840617703, + "grad_norm": 0.427734375, + "learning_rate": 0.0017059753733207196, + "loss": 0.1289, + "step": 26164 + }, + { + "epoch": 0.22712476454197447, + "grad_norm": 0.173828125, + "learning_rate": 0.001705953367701423, + "loss": 0.1338, + "step": 26165 + }, + { + "epoch": 0.22713344502217864, + "grad_norm": 0.11376953125, + "learning_rate": 0.0017059313614194648, + "loss": 0.1328, + "step": 26166 + }, + { + "epoch": 0.2271421255023828, + "grad_norm": 0.1416015625, + "learning_rate": 0.0017059093544748694, + "loss": 0.1162, + "step": 26167 + }, + { + "epoch": 0.22715080598258697, + "grad_norm": 0.2578125, + "learning_rate": 0.0017058873468676604, + "loss": 0.1006, + "step": 26168 + }, + { + "epoch": 0.22715948646279113, + "grad_norm": 0.484375, + "learning_rate": 0.0017058653385978622, + "loss": 0.1064, + "step": 26169 + }, + { + "epoch": 0.2271681669429953, + "grad_norm": 0.61328125, + "learning_rate": 0.0017058433296654987, + "loss": 0.1279, + "step": 26170 + }, + { + "epoch": 0.22717684742319946, + "grad_norm": 0.1962890625, + "learning_rate": 0.001705821320070594, + "loss": 0.1162, + "step": 26171 + }, + { + "epoch": 0.22718552790340363, + "grad_norm": 0.1708984375, + "learning_rate": 0.0017057993098131723, + "loss": 0.0879, + "step": 26172 + }, + { + "epoch": 0.2271942083836078, + "grad_norm": 0.12060546875, + "learning_rate": 0.0017057772988932577, + "loss": 0.0928, + "step": 26173 + }, + { + "epoch": 0.22720288886381196, + "grad_norm": 0.291015625, + "learning_rate": 0.0017057552873108739, + "loss": 0.1045, + "step": 26174 + }, + { + "epoch": 0.22721156934401612, + "grad_norm": 0.201171875, + "learning_rate": 0.0017057332750660455, + "loss": 0.123, + "step": 26175 + }, + { + "epoch": 0.2272202498242203, + "grad_norm": 0.3984375, + "learning_rate": 0.001705711262158796, + "loss": 0.1562, + "step": 26176 + }, + { + "epoch": 0.22722893030442445, + "grad_norm": 0.443359375, + "learning_rate": 0.00170568924858915, + "loss": 0.1084, + "step": 26177 + }, + { + "epoch": 0.22723761078462862, + "grad_norm": 0.55078125, + "learning_rate": 0.001705667234357131, + "loss": 0.082, + "step": 26178 + }, + { + "epoch": 0.22724629126483278, + "grad_norm": 0.103515625, + "learning_rate": 0.001705645219462764, + "loss": 0.1348, + "step": 26179 + }, + { + "epoch": 0.22725497174503695, + "grad_norm": 0.6328125, + "learning_rate": 0.001705623203906072, + "loss": 0.1592, + "step": 26180 + }, + { + "epoch": 0.22726365222524111, + "grad_norm": 0.39453125, + "learning_rate": 0.0017056011876870796, + "loss": 0.1104, + "step": 26181 + }, + { + "epoch": 0.22727233270544528, + "grad_norm": 0.177734375, + "learning_rate": 0.0017055791708058114, + "loss": 0.0747, + "step": 26182 + }, + { + "epoch": 0.22728101318564942, + "grad_norm": 0.373046875, + "learning_rate": 0.00170555715326229, + "loss": 0.103, + "step": 26183 + }, + { + "epoch": 
0.22728969366585358, + "grad_norm": 0.28125, + "learning_rate": 0.0017055351350565408, + "loss": 0.0908, + "step": 26184 + }, + { + "epoch": 0.22729837414605775, + "grad_norm": 0.08837890625, + "learning_rate": 0.0017055131161885875, + "loss": 0.1484, + "step": 26185 + }, + { + "epoch": 0.2273070546262619, + "grad_norm": 0.375, + "learning_rate": 0.0017054910966584542, + "loss": 0.0864, + "step": 26186 + }, + { + "epoch": 0.22731573510646608, + "grad_norm": 0.236328125, + "learning_rate": 0.001705469076466165, + "loss": 0.0996, + "step": 26187 + }, + { + "epoch": 0.22732441558667024, + "grad_norm": 0.34765625, + "learning_rate": 0.0017054470556117439, + "loss": 0.166, + "step": 26188 + }, + { + "epoch": 0.2273330960668744, + "grad_norm": 0.09375, + "learning_rate": 0.001705425034095215, + "loss": 0.1045, + "step": 26189 + }, + { + "epoch": 0.22734177654707857, + "grad_norm": 0.1904296875, + "learning_rate": 0.0017054030119166021, + "loss": 0.1167, + "step": 26190 + }, + { + "epoch": 0.22735045702728274, + "grad_norm": 0.46875, + "learning_rate": 0.00170538098907593, + "loss": 0.0938, + "step": 26191 + }, + { + "epoch": 0.2273591375074869, + "grad_norm": 0.15625, + "learning_rate": 0.001705358965573222, + "loss": 0.1084, + "step": 26192 + }, + { + "epoch": 0.22736781798769107, + "grad_norm": 0.193359375, + "learning_rate": 0.0017053369414085026, + "loss": 0.0835, + "step": 26193 + }, + { + "epoch": 0.22737649846789523, + "grad_norm": 1.0703125, + "learning_rate": 0.0017053149165817958, + "loss": 0.125, + "step": 26194 + }, + { + "epoch": 0.2273851789480994, + "grad_norm": 0.2265625, + "learning_rate": 0.001705292891093126, + "loss": 0.1089, + "step": 26195 + }, + { + "epoch": 0.22739385942830356, + "grad_norm": 0.224609375, + "learning_rate": 0.0017052708649425165, + "loss": 0.0649, + "step": 26196 + }, + { + "epoch": 0.22740253990850773, + "grad_norm": 0.08203125, + "learning_rate": 0.0017052488381299922, + "loss": 0.084, + "step": 26197 + }, + { + "epoch": 0.2274112203887119, + "grad_norm": 0.10205078125, + "learning_rate": 0.0017052268106555767, + "loss": 0.1309, + "step": 26198 + }, + { + "epoch": 0.22741990086891606, + "grad_norm": 0.1337890625, + "learning_rate": 0.0017052047825192945, + "loss": 0.125, + "step": 26199 + }, + { + "epoch": 0.22742858134912022, + "grad_norm": 0.2119140625, + "learning_rate": 0.0017051827537211692, + "loss": 0.1104, + "step": 26200 + }, + { + "epoch": 0.2274372618293244, + "grad_norm": 0.39453125, + "learning_rate": 0.001705160724261225, + "loss": 0.0957, + "step": 26201 + }, + { + "epoch": 0.22744594230952855, + "grad_norm": 0.0908203125, + "learning_rate": 0.0017051386941394867, + "loss": 0.1045, + "step": 26202 + }, + { + "epoch": 0.22745462278973272, + "grad_norm": 0.2216796875, + "learning_rate": 0.0017051166633559771, + "loss": 0.1035, + "step": 26203 + }, + { + "epoch": 0.22746330326993688, + "grad_norm": 0.474609375, + "learning_rate": 0.0017050946319107218, + "loss": 0.0957, + "step": 26204 + }, + { + "epoch": 0.22747198375014105, + "grad_norm": 0.294921875, + "learning_rate": 0.0017050725998037435, + "loss": 0.1055, + "step": 26205 + }, + { + "epoch": 0.22748066423034521, + "grad_norm": 0.353515625, + "learning_rate": 0.0017050505670350672, + "loss": 0.1455, + "step": 26206 + }, + { + "epoch": 0.22748934471054938, + "grad_norm": 0.15625, + "learning_rate": 0.0017050285336047163, + "loss": 0.0698, + "step": 26207 + }, + { + "epoch": 0.22749802519075354, + "grad_norm": 0.140625, + "learning_rate": 0.0017050064995127155, + "loss": 0.1904, + "step": 
26208 + }, + { + "epoch": 0.2275067056709577, + "grad_norm": 0.189453125, + "learning_rate": 0.0017049844647590892, + "loss": 0.0981, + "step": 26209 + }, + { + "epoch": 0.22751538615116187, + "grad_norm": 0.255859375, + "learning_rate": 0.0017049624293438601, + "loss": 0.1162, + "step": 26210 + }, + { + "epoch": 0.22752406663136604, + "grad_norm": 0.291015625, + "learning_rate": 0.0017049403932670535, + "loss": 0.1011, + "step": 26211 + }, + { + "epoch": 0.2275327471115702, + "grad_norm": 0.578125, + "learning_rate": 0.0017049183565286934, + "loss": 0.1006, + "step": 26212 + }, + { + "epoch": 0.22754142759177437, + "grad_norm": 1.2421875, + "learning_rate": 0.0017048963191288037, + "loss": 0.1816, + "step": 26213 + }, + { + "epoch": 0.22755010807197854, + "grad_norm": 0.310546875, + "learning_rate": 0.001704874281067408, + "loss": 0.2539, + "step": 26214 + }, + { + "epoch": 0.2275587885521827, + "grad_norm": 0.138671875, + "learning_rate": 0.0017048522423445314, + "loss": 0.1221, + "step": 26215 + }, + { + "epoch": 0.22756746903238687, + "grad_norm": 0.0673828125, + "learning_rate": 0.001704830202960197, + "loss": 0.0615, + "step": 26216 + }, + { + "epoch": 0.22757614951259103, + "grad_norm": 0.8046875, + "learning_rate": 0.00170480816291443, + "loss": 0.1113, + "step": 26217 + }, + { + "epoch": 0.2275848299927952, + "grad_norm": 0.2412109375, + "learning_rate": 0.0017047861222072534, + "loss": 0.1196, + "step": 26218 + }, + { + "epoch": 0.22759351047299936, + "grad_norm": 0.111328125, + "learning_rate": 0.0017047640808386917, + "loss": 0.1445, + "step": 26219 + }, + { + "epoch": 0.22760219095320353, + "grad_norm": 0.67578125, + "learning_rate": 0.0017047420388087693, + "loss": 0.1133, + "step": 26220 + }, + { + "epoch": 0.2276108714334077, + "grad_norm": 0.12890625, + "learning_rate": 0.0017047199961175102, + "loss": 0.0879, + "step": 26221 + }, + { + "epoch": 0.22761955191361186, + "grad_norm": 0.08544921875, + "learning_rate": 0.0017046979527649384, + "loss": 0.1299, + "step": 26222 + }, + { + "epoch": 0.22762823239381602, + "grad_norm": 0.146484375, + "learning_rate": 0.0017046759087510778, + "loss": 0.1221, + "step": 26223 + }, + { + "epoch": 0.2276369128740202, + "grad_norm": 0.267578125, + "learning_rate": 0.0017046538640759527, + "loss": 0.1436, + "step": 26224 + }, + { + "epoch": 0.22764559335422435, + "grad_norm": 0.201171875, + "learning_rate": 0.0017046318187395873, + "loss": 0.0928, + "step": 26225 + }, + { + "epoch": 0.22765427383442852, + "grad_norm": 0.150390625, + "learning_rate": 0.0017046097727420057, + "loss": 0.1084, + "step": 26226 + }, + { + "epoch": 0.22766295431463268, + "grad_norm": 0.2451171875, + "learning_rate": 0.0017045877260832318, + "loss": 0.1348, + "step": 26227 + }, + { + "epoch": 0.22767163479483685, + "grad_norm": 0.1689453125, + "learning_rate": 0.0017045656787632896, + "loss": 0.1377, + "step": 26228 + }, + { + "epoch": 0.227680315275041, + "grad_norm": 0.0869140625, + "learning_rate": 0.0017045436307822042, + "loss": 0.0854, + "step": 26229 + }, + { + "epoch": 0.22768899575524518, + "grad_norm": 0.208984375, + "learning_rate": 0.0017045215821399986, + "loss": 0.2773, + "step": 26230 + }, + { + "epoch": 0.22769767623544934, + "grad_norm": 0.1396484375, + "learning_rate": 0.0017044995328366972, + "loss": 0.1533, + "step": 26231 + }, + { + "epoch": 0.2277063567156535, + "grad_norm": 0.203125, + "learning_rate": 0.0017044774828723242, + "loss": 0.0796, + "step": 26232 + }, + { + "epoch": 0.22771503719585767, + "grad_norm": 0.76171875, + 
"learning_rate": 0.0017044554322469038, + "loss": 0.166, + "step": 26233 + }, + { + "epoch": 0.22772371767606184, + "grad_norm": 0.1552734375, + "learning_rate": 0.0017044333809604598, + "loss": 0.0752, + "step": 26234 + }, + { + "epoch": 0.227732398156266, + "grad_norm": 0.2490234375, + "learning_rate": 0.0017044113290130168, + "loss": 0.1406, + "step": 26235 + }, + { + "epoch": 0.22774107863647017, + "grad_norm": 0.365234375, + "learning_rate": 0.0017043892764045986, + "loss": 0.104, + "step": 26236 + }, + { + "epoch": 0.22774975911667433, + "grad_norm": 0.333984375, + "learning_rate": 0.001704367223135229, + "loss": 0.1152, + "step": 26237 + }, + { + "epoch": 0.2277584395968785, + "grad_norm": 0.1298828125, + "learning_rate": 0.0017043451692049327, + "loss": 0.0928, + "step": 26238 + }, + { + "epoch": 0.22776712007708266, + "grad_norm": 0.333984375, + "learning_rate": 0.0017043231146137339, + "loss": 0.1338, + "step": 26239 + }, + { + "epoch": 0.22777580055728683, + "grad_norm": 0.609375, + "learning_rate": 0.001704301059361656, + "loss": 0.0986, + "step": 26240 + }, + { + "epoch": 0.227784481037491, + "grad_norm": 0.099609375, + "learning_rate": 0.0017042790034487241, + "loss": 0.0864, + "step": 26241 + }, + { + "epoch": 0.22779316151769516, + "grad_norm": 0.87109375, + "learning_rate": 0.0017042569468749614, + "loss": 0.1201, + "step": 26242 + }, + { + "epoch": 0.22780184199789932, + "grad_norm": 0.1982421875, + "learning_rate": 0.0017042348896403924, + "loss": 0.127, + "step": 26243 + }, + { + "epoch": 0.2278105224781035, + "grad_norm": 0.25390625, + "learning_rate": 0.0017042128317450412, + "loss": 0.0869, + "step": 26244 + }, + { + "epoch": 0.22781920295830765, + "grad_norm": 0.25390625, + "learning_rate": 0.001704190773188932, + "loss": 0.1387, + "step": 26245 + }, + { + "epoch": 0.22782788343851182, + "grad_norm": 0.640625, + "learning_rate": 0.0017041687139720887, + "loss": 0.1162, + "step": 26246 + }, + { + "epoch": 0.22783656391871598, + "grad_norm": 0.123046875, + "learning_rate": 0.0017041466540945356, + "loss": 0.1074, + "step": 26247 + }, + { + "epoch": 0.22784524439892015, + "grad_norm": 0.224609375, + "learning_rate": 0.001704124593556297, + "loss": 0.0693, + "step": 26248 + }, + { + "epoch": 0.22785392487912431, + "grad_norm": 0.53515625, + "learning_rate": 0.0017041025323573968, + "loss": 0.1084, + "step": 26249 + }, + { + "epoch": 0.22786260535932848, + "grad_norm": 0.0986328125, + "learning_rate": 0.0017040804704978587, + "loss": 0.0923, + "step": 26250 + }, + { + "epoch": 0.22787128583953264, + "grad_norm": 0.490234375, + "learning_rate": 0.0017040584079777077, + "loss": 0.0869, + "step": 26251 + }, + { + "epoch": 0.2278799663197368, + "grad_norm": 0.30078125, + "learning_rate": 0.0017040363447969674, + "loss": 0.0957, + "step": 26252 + }, + { + "epoch": 0.22788864679994097, + "grad_norm": 0.35546875, + "learning_rate": 0.0017040142809556617, + "loss": 0.0918, + "step": 26253 + }, + { + "epoch": 0.22789732728014514, + "grad_norm": 0.2333984375, + "learning_rate": 0.0017039922164538155, + "loss": 0.0742, + "step": 26254 + }, + { + "epoch": 0.2279060077603493, + "grad_norm": 0.62109375, + "learning_rate": 0.0017039701512914524, + "loss": 0.1094, + "step": 26255 + }, + { + "epoch": 0.22791468824055347, + "grad_norm": 0.375, + "learning_rate": 0.0017039480854685967, + "loss": 0.1396, + "step": 26256 + }, + { + "epoch": 0.22792336872075764, + "grad_norm": 0.1640625, + "learning_rate": 0.001703926018985272, + "loss": 0.0903, + "step": 26257 + }, + { + "epoch": 
0.2279320492009618, + "grad_norm": 0.08544921875, + "learning_rate": 0.0017039039518415035, + "loss": 0.1191, + "step": 26258 + }, + { + "epoch": 0.22794072968116597, + "grad_norm": 0.248046875, + "learning_rate": 0.0017038818840373142, + "loss": 0.1055, + "step": 26259 + }, + { + "epoch": 0.22794941016137013, + "grad_norm": 0.267578125, + "learning_rate": 0.0017038598155727294, + "loss": 0.0854, + "step": 26260 + }, + { + "epoch": 0.2279580906415743, + "grad_norm": 0.2080078125, + "learning_rate": 0.0017038377464477718, + "loss": 0.082, + "step": 26261 + }, + { + "epoch": 0.22796677112177846, + "grad_norm": 0.0859375, + "learning_rate": 0.0017038156766624665, + "loss": 0.0977, + "step": 26262 + }, + { + "epoch": 0.22797545160198263, + "grad_norm": 0.2314453125, + "learning_rate": 0.001703793606216838, + "loss": 0.1543, + "step": 26263 + }, + { + "epoch": 0.2279841320821868, + "grad_norm": 0.314453125, + "learning_rate": 0.0017037715351109093, + "loss": 0.1182, + "step": 26264 + }, + { + "epoch": 0.22799281256239096, + "grad_norm": 0.205078125, + "learning_rate": 0.0017037494633447055, + "loss": 0.0957, + "step": 26265 + }, + { + "epoch": 0.22800149304259512, + "grad_norm": 0.1494140625, + "learning_rate": 0.00170372739091825, + "loss": 0.124, + "step": 26266 + }, + { + "epoch": 0.2280101735227993, + "grad_norm": 0.25390625, + "learning_rate": 0.0017037053178315673, + "loss": 0.1172, + "step": 26267 + }, + { + "epoch": 0.22801885400300345, + "grad_norm": 0.44921875, + "learning_rate": 0.0017036832440846818, + "loss": 0.1143, + "step": 26268 + }, + { + "epoch": 0.22802753448320762, + "grad_norm": 0.62109375, + "learning_rate": 0.0017036611696776174, + "loss": 0.1177, + "step": 26269 + }, + { + "epoch": 0.22803621496341178, + "grad_norm": 0.1298828125, + "learning_rate": 0.0017036390946103981, + "loss": 0.0947, + "step": 26270 + }, + { + "epoch": 0.22804489544361595, + "grad_norm": 0.68359375, + "learning_rate": 0.001703617018883048, + "loss": 0.1094, + "step": 26271 + }, + { + "epoch": 0.2280535759238201, + "grad_norm": 0.287109375, + "learning_rate": 0.0017035949424955917, + "loss": 0.1426, + "step": 26272 + }, + { + "epoch": 0.22806225640402428, + "grad_norm": 0.5390625, + "learning_rate": 0.0017035728654480528, + "loss": 0.0728, + "step": 26273 + }, + { + "epoch": 0.22807093688422844, + "grad_norm": 0.4921875, + "learning_rate": 0.0017035507877404558, + "loss": 0.1001, + "step": 26274 + }, + { + "epoch": 0.2280796173644326, + "grad_norm": 0.12109375, + "learning_rate": 0.0017035287093728247, + "loss": 0.1377, + "step": 26275 + }, + { + "epoch": 0.22808829784463677, + "grad_norm": 0.2216796875, + "learning_rate": 0.0017035066303451838, + "loss": 0.0889, + "step": 26276 + }, + { + "epoch": 0.22809697832484094, + "grad_norm": 0.1513671875, + "learning_rate": 0.0017034845506575568, + "loss": 0.1631, + "step": 26277 + }, + { + "epoch": 0.2281056588050451, + "grad_norm": 0.41015625, + "learning_rate": 0.0017034624703099685, + "loss": 0.0864, + "step": 26278 + }, + { + "epoch": 0.22811433928524927, + "grad_norm": 0.203125, + "learning_rate": 0.0017034403893024424, + "loss": 0.0996, + "step": 26279 + }, + { + "epoch": 0.22812301976545343, + "grad_norm": 1.0703125, + "learning_rate": 0.001703418307635003, + "loss": 0.1035, + "step": 26280 + }, + { + "epoch": 0.2281317002456576, + "grad_norm": 0.234375, + "learning_rate": 0.0017033962253076747, + "loss": 0.1162, + "step": 26281 + }, + { + "epoch": 0.22814038072586176, + "grad_norm": 0.1318359375, + "learning_rate": 0.001703374142320481, + "loss": 
0.1309, + "step": 26282 + }, + { + "epoch": 0.22814906120606593, + "grad_norm": 0.75, + "learning_rate": 0.0017033520586734465, + "loss": 0.1543, + "step": 26283 + }, + { + "epoch": 0.2281577416862701, + "grad_norm": 0.240234375, + "learning_rate": 0.0017033299743665956, + "loss": 0.1094, + "step": 26284 + }, + { + "epoch": 0.22816642216647426, + "grad_norm": 0.345703125, + "learning_rate": 0.0017033078893999517, + "loss": 0.1104, + "step": 26285 + }, + { + "epoch": 0.22817510264667842, + "grad_norm": 0.3203125, + "learning_rate": 0.0017032858037735394, + "loss": 0.1162, + "step": 26286 + }, + { + "epoch": 0.2281837831268826, + "grad_norm": 1.03125, + "learning_rate": 0.0017032637174873833, + "loss": 0.1211, + "step": 26287 + }, + { + "epoch": 0.22819246360708675, + "grad_norm": 0.30859375, + "learning_rate": 0.0017032416305415065, + "loss": 0.1328, + "step": 26288 + }, + { + "epoch": 0.22820114408729092, + "grad_norm": 0.427734375, + "learning_rate": 0.0017032195429359336, + "loss": 0.1348, + "step": 26289 + }, + { + "epoch": 0.22820982456749508, + "grad_norm": 0.07275390625, + "learning_rate": 0.0017031974546706892, + "loss": 0.0928, + "step": 26290 + }, + { + "epoch": 0.22821850504769925, + "grad_norm": 0.34375, + "learning_rate": 0.001703175365745797, + "loss": 0.1128, + "step": 26291 + }, + { + "epoch": 0.22822718552790341, + "grad_norm": 0.11279296875, + "learning_rate": 0.0017031532761612813, + "loss": 0.1475, + "step": 26292 + }, + { + "epoch": 0.22823586600810758, + "grad_norm": 0.703125, + "learning_rate": 0.0017031311859171665, + "loss": 0.125, + "step": 26293 + }, + { + "epoch": 0.22824454648831174, + "grad_norm": 0.166015625, + "learning_rate": 0.0017031090950134762, + "loss": 0.0947, + "step": 26294 + }, + { + "epoch": 0.2282532269685159, + "grad_norm": 0.1611328125, + "learning_rate": 0.001703087003450235, + "loss": 0.1099, + "step": 26295 + }, + { + "epoch": 0.22826190744872007, + "grad_norm": 0.50390625, + "learning_rate": 0.0017030649112274668, + "loss": 0.1108, + "step": 26296 + }, + { + "epoch": 0.22827058792892424, + "grad_norm": 0.1171875, + "learning_rate": 0.0017030428183451957, + "loss": 0.1055, + "step": 26297 + }, + { + "epoch": 0.2282792684091284, + "grad_norm": 0.609375, + "learning_rate": 0.0017030207248034466, + "loss": 0.125, + "step": 26298 + }, + { + "epoch": 0.22828794888933257, + "grad_norm": 0.322265625, + "learning_rate": 0.0017029986306022428, + "loss": 0.1123, + "step": 26299 + }, + { + "epoch": 0.22829662936953674, + "grad_norm": 1.3515625, + "learning_rate": 0.0017029765357416088, + "loss": 0.1992, + "step": 26300 + }, + { + "epoch": 0.2283053098497409, + "grad_norm": 0.1748046875, + "learning_rate": 0.0017029544402215684, + "loss": 0.0742, + "step": 26301 + }, + { + "epoch": 0.22831399032994507, + "grad_norm": 0.181640625, + "learning_rate": 0.0017029323440421462, + "loss": 0.106, + "step": 26302 + }, + { + "epoch": 0.22832267081014923, + "grad_norm": 0.51171875, + "learning_rate": 0.001702910247203367, + "loss": 0.1064, + "step": 26303 + }, + { + "epoch": 0.2283313512903534, + "grad_norm": 0.357421875, + "learning_rate": 0.0017028881497052532, + "loss": 0.1758, + "step": 26304 + }, + { + "epoch": 0.22834003177055756, + "grad_norm": 0.373046875, + "learning_rate": 0.0017028660515478302, + "loss": 0.1104, + "step": 26305 + }, + { + "epoch": 0.2283487122507617, + "grad_norm": 0.482421875, + "learning_rate": 0.0017028439527311222, + "loss": 0.1211, + "step": 26306 + }, + { + "epoch": 0.22835739273096586, + "grad_norm": 0.181640625, + "learning_rate": 
0.0017028218532551531, + "loss": 0.1133, + "step": 26307 + }, + { + "epoch": 0.22836607321117003, + "grad_norm": 0.1015625, + "learning_rate": 0.0017027997531199471, + "loss": 0.0859, + "step": 26308 + }, + { + "epoch": 0.2283747536913742, + "grad_norm": 0.1630859375, + "learning_rate": 0.0017027776523255282, + "loss": 0.127, + "step": 26309 + }, + { + "epoch": 0.22838343417157836, + "grad_norm": 0.13671875, + "learning_rate": 0.0017027555508719206, + "loss": 0.1099, + "step": 26310 + }, + { + "epoch": 0.22839211465178252, + "grad_norm": 0.09814453125, + "learning_rate": 0.0017027334487591489, + "loss": 0.1045, + "step": 26311 + }, + { + "epoch": 0.2284007951319867, + "grad_norm": 0.453125, + "learning_rate": 0.0017027113459872369, + "loss": 0.0796, + "step": 26312 + }, + { + "epoch": 0.22840947561219085, + "grad_norm": 0.09814453125, + "learning_rate": 0.0017026892425562087, + "loss": 0.1289, + "step": 26313 + }, + { + "epoch": 0.22841815609239502, + "grad_norm": 1.3828125, + "learning_rate": 0.0017026671384660884, + "loss": 0.1177, + "step": 26314 + }, + { + "epoch": 0.22842683657259918, + "grad_norm": 0.11083984375, + "learning_rate": 0.0017026450337169006, + "loss": 0.1211, + "step": 26315 + }, + { + "epoch": 0.22843551705280335, + "grad_norm": 0.58984375, + "learning_rate": 0.0017026229283086693, + "loss": 0.1182, + "step": 26316 + }, + { + "epoch": 0.22844419753300751, + "grad_norm": 1.6171875, + "learning_rate": 0.0017026008222414187, + "loss": 0.1299, + "step": 26317 + }, + { + "epoch": 0.22845287801321168, + "grad_norm": 0.1572265625, + "learning_rate": 0.0017025787155151727, + "loss": 0.0732, + "step": 26318 + }, + { + "epoch": 0.22846155849341584, + "grad_norm": 0.7421875, + "learning_rate": 0.0017025566081299557, + "loss": 0.0908, + "step": 26319 + }, + { + "epoch": 0.22847023897362, + "grad_norm": 0.1884765625, + "learning_rate": 0.001702534500085792, + "loss": 0.1299, + "step": 26320 + }, + { + "epoch": 0.22847891945382418, + "grad_norm": 0.8671875, + "learning_rate": 0.0017025123913827053, + "loss": 0.1289, + "step": 26321 + }, + { + "epoch": 0.22848759993402834, + "grad_norm": 0.107421875, + "learning_rate": 0.0017024902820207204, + "loss": 0.0762, + "step": 26322 + }, + { + "epoch": 0.2284962804142325, + "grad_norm": 0.1962890625, + "learning_rate": 0.0017024681719998611, + "loss": 0.0815, + "step": 26323 + }, + { + "epoch": 0.22850496089443667, + "grad_norm": 0.1943359375, + "learning_rate": 0.0017024460613201517, + "loss": 0.0645, + "step": 26324 + }, + { + "epoch": 0.22851364137464084, + "grad_norm": 0.486328125, + "learning_rate": 0.0017024239499816164, + "loss": 0.1016, + "step": 26325 + }, + { + "epoch": 0.228522321854845, + "grad_norm": 0.21875, + "learning_rate": 0.001702401837984279, + "loss": 0.1289, + "step": 26326 + }, + { + "epoch": 0.22853100233504917, + "grad_norm": 0.271484375, + "learning_rate": 0.0017023797253281642, + "loss": 0.123, + "step": 26327 + }, + { + "epoch": 0.22853968281525333, + "grad_norm": 0.90625, + "learning_rate": 0.001702357612013296, + "loss": 0.1182, + "step": 26328 + }, + { + "epoch": 0.2285483632954575, + "grad_norm": 0.63671875, + "learning_rate": 0.0017023354980396988, + "loss": 0.1338, + "step": 26329 + }, + { + "epoch": 0.22855704377566166, + "grad_norm": 0.2451171875, + "learning_rate": 0.0017023133834073961, + "loss": 0.1621, + "step": 26330 + }, + { + "epoch": 0.22856572425586583, + "grad_norm": 0.31640625, + "learning_rate": 0.0017022912681164128, + "loss": 0.1016, + "step": 26331 + }, + { + "epoch": 0.22857440473607, + 
"grad_norm": 0.1025390625, + "learning_rate": 0.0017022691521667728, + "loss": 0.1553, + "step": 26332 + }, + { + "epoch": 0.22858308521627416, + "grad_norm": 0.2734375, + "learning_rate": 0.0017022470355585, + "loss": 0.166, + "step": 26333 + }, + { + "epoch": 0.22859176569647832, + "grad_norm": 0.39453125, + "learning_rate": 0.0017022249182916192, + "loss": 0.0781, + "step": 26334 + }, + { + "epoch": 0.2286004461766825, + "grad_norm": 0.26953125, + "learning_rate": 0.0017022028003661543, + "loss": 0.1553, + "step": 26335 + }, + { + "epoch": 0.22860912665688665, + "grad_norm": 0.5390625, + "learning_rate": 0.0017021806817821292, + "loss": 0.1445, + "step": 26336 + }, + { + "epoch": 0.22861780713709082, + "grad_norm": 0.306640625, + "learning_rate": 0.0017021585625395688, + "loss": 0.1006, + "step": 26337 + }, + { + "epoch": 0.22862648761729498, + "grad_norm": 0.62890625, + "learning_rate": 0.0017021364426384967, + "loss": 0.0815, + "step": 26338 + }, + { + "epoch": 0.22863516809749915, + "grad_norm": 0.2890625, + "learning_rate": 0.0017021143220789369, + "loss": 0.1338, + "step": 26339 + }, + { + "epoch": 0.2286438485777033, + "grad_norm": 0.07080078125, + "learning_rate": 0.0017020922008609144, + "loss": 0.085, + "step": 26340 + }, + { + "epoch": 0.22865252905790748, + "grad_norm": 0.1318359375, + "learning_rate": 0.0017020700789844526, + "loss": 0.1099, + "step": 26341 + }, + { + "epoch": 0.22866120953811164, + "grad_norm": 0.349609375, + "learning_rate": 0.0017020479564495762, + "loss": 0.1226, + "step": 26342 + }, + { + "epoch": 0.2286698900183158, + "grad_norm": 0.1455078125, + "learning_rate": 0.0017020258332563088, + "loss": 0.1445, + "step": 26343 + }, + { + "epoch": 0.22867857049851997, + "grad_norm": 0.1787109375, + "learning_rate": 0.001702003709404675, + "loss": 0.085, + "step": 26344 + }, + { + "epoch": 0.22868725097872414, + "grad_norm": 0.1455078125, + "learning_rate": 0.0017019815848946995, + "loss": 0.0991, + "step": 26345 + }, + { + "epoch": 0.2286959314589283, + "grad_norm": 0.4609375, + "learning_rate": 0.0017019594597264057, + "loss": 0.1631, + "step": 26346 + }, + { + "epoch": 0.22870461193913247, + "grad_norm": 0.27734375, + "learning_rate": 0.001701937333899818, + "loss": 0.0942, + "step": 26347 + }, + { + "epoch": 0.22871329241933663, + "grad_norm": 0.361328125, + "learning_rate": 0.0017019152074149609, + "loss": 0.0967, + "step": 26348 + }, + { + "epoch": 0.2287219728995408, + "grad_norm": 0.1630859375, + "learning_rate": 0.0017018930802718583, + "loss": 0.104, + "step": 26349 + }, + { + "epoch": 0.22873065337974496, + "grad_norm": 0.072265625, + "learning_rate": 0.0017018709524705346, + "loss": 0.0728, + "step": 26350 + }, + { + "epoch": 0.22873933385994913, + "grad_norm": 0.498046875, + "learning_rate": 0.0017018488240110137, + "loss": 0.123, + "step": 26351 + }, + { + "epoch": 0.2287480143401533, + "grad_norm": 0.3046875, + "learning_rate": 0.0017018266948933198, + "loss": 0.1226, + "step": 26352 + }, + { + "epoch": 0.22875669482035746, + "grad_norm": 0.24609375, + "learning_rate": 0.0017018045651174775, + "loss": 0.0913, + "step": 26353 + }, + { + "epoch": 0.22876537530056162, + "grad_norm": 0.1328125, + "learning_rate": 0.0017017824346835107, + "loss": 0.1006, + "step": 26354 + }, + { + "epoch": 0.2287740557807658, + "grad_norm": 0.1337890625, + "learning_rate": 0.0017017603035914437, + "loss": 0.1016, + "step": 26355 + }, + { + "epoch": 0.22878273626096995, + "grad_norm": 0.1669921875, + "learning_rate": 0.0017017381718413007, + "loss": 0.0952, + "step": 
26356 + }, + { + "epoch": 0.22879141674117412, + "grad_norm": 0.08984375, + "learning_rate": 0.001701716039433106, + "loss": 0.1006, + "step": 26357 + }, + { + "epoch": 0.22880009722137828, + "grad_norm": 0.1435546875, + "learning_rate": 0.0017016939063668832, + "loss": 0.1738, + "step": 26358 + }, + { + "epoch": 0.22880877770158245, + "grad_norm": 0.435546875, + "learning_rate": 0.0017016717726426576, + "loss": 0.1035, + "step": 26359 + }, + { + "epoch": 0.22881745818178661, + "grad_norm": 0.2158203125, + "learning_rate": 0.0017016496382604525, + "loss": 0.1562, + "step": 26360 + }, + { + "epoch": 0.22882613866199078, + "grad_norm": 0.11669921875, + "learning_rate": 0.001701627503220292, + "loss": 0.1157, + "step": 26361 + }, + { + "epoch": 0.22883481914219495, + "grad_norm": 0.0927734375, + "learning_rate": 0.0017016053675222012, + "loss": 0.0806, + "step": 26362 + }, + { + "epoch": 0.2288434996223991, + "grad_norm": 0.216796875, + "learning_rate": 0.0017015832311662036, + "loss": 0.1025, + "step": 26363 + }, + { + "epoch": 0.22885218010260328, + "grad_norm": 0.72265625, + "learning_rate": 0.001701561094152324, + "loss": 0.1465, + "step": 26364 + }, + { + "epoch": 0.22886086058280744, + "grad_norm": 0.41015625, + "learning_rate": 0.001701538956480586, + "loss": 0.0708, + "step": 26365 + }, + { + "epoch": 0.2288695410630116, + "grad_norm": 0.265625, + "learning_rate": 0.0017015168181510137, + "loss": 0.084, + "step": 26366 + }, + { + "epoch": 0.22887822154321577, + "grad_norm": 0.1279296875, + "learning_rate": 0.001701494679163632, + "loss": 0.1069, + "step": 26367 + }, + { + "epoch": 0.22888690202341994, + "grad_norm": 0.1064453125, + "learning_rate": 0.0017014725395184647, + "loss": 0.0996, + "step": 26368 + }, + { + "epoch": 0.2288955825036241, + "grad_norm": 0.1162109375, + "learning_rate": 0.001701450399215536, + "loss": 0.0889, + "step": 26369 + }, + { + "epoch": 0.22890426298382827, + "grad_norm": 0.2333984375, + "learning_rate": 0.0017014282582548705, + "loss": 0.1406, + "step": 26370 + }, + { + "epoch": 0.22891294346403243, + "grad_norm": 0.490234375, + "learning_rate": 0.0017014061166364916, + "loss": 0.1182, + "step": 26371 + }, + { + "epoch": 0.2289216239442366, + "grad_norm": 0.11279296875, + "learning_rate": 0.001701383974360424, + "loss": 0.082, + "step": 26372 + }, + { + "epoch": 0.22893030442444076, + "grad_norm": 0.142578125, + "learning_rate": 0.0017013618314266922, + "loss": 0.1021, + "step": 26373 + }, + { + "epoch": 0.22893898490464493, + "grad_norm": 0.11279296875, + "learning_rate": 0.0017013396878353202, + "loss": 0.1484, + "step": 26374 + }, + { + "epoch": 0.2289476653848491, + "grad_norm": 0.44140625, + "learning_rate": 0.0017013175435863321, + "loss": 0.123, + "step": 26375 + }, + { + "epoch": 0.22895634586505326, + "grad_norm": 0.2314453125, + "learning_rate": 0.001701295398679752, + "loss": 0.1377, + "step": 26376 + }, + { + "epoch": 0.22896502634525742, + "grad_norm": 0.1376953125, + "learning_rate": 0.0017012732531156043, + "loss": 0.1465, + "step": 26377 + }, + { + "epoch": 0.2289737068254616, + "grad_norm": 3.21875, + "learning_rate": 0.0017012511068939134, + "loss": 0.377, + "step": 26378 + }, + { + "epoch": 0.22898238730566575, + "grad_norm": 0.1416015625, + "learning_rate": 0.0017012289600147033, + "loss": 0.1011, + "step": 26379 + }, + { + "epoch": 0.22899106778586992, + "grad_norm": 0.51953125, + "learning_rate": 0.001701206812477998, + "loss": 0.1074, + "step": 26380 + }, + { + "epoch": 0.22899974826607408, + "grad_norm": 0.095703125, + 
"learning_rate": 0.0017011846642838223, + "loss": 0.0952, + "step": 26381 + }, + { + "epoch": 0.22900842874627825, + "grad_norm": 0.234375, + "learning_rate": 0.0017011625154322, + "loss": 0.0981, + "step": 26382 + }, + { + "epoch": 0.2290171092264824, + "grad_norm": 0.2890625, + "learning_rate": 0.0017011403659231553, + "loss": 0.126, + "step": 26383 + }, + { + "epoch": 0.22902578970668658, + "grad_norm": 0.06884765625, + "learning_rate": 0.0017011182157567124, + "loss": 0.0977, + "step": 26384 + }, + { + "epoch": 0.22903447018689074, + "grad_norm": 0.248046875, + "learning_rate": 0.0017010960649328957, + "loss": 0.1621, + "step": 26385 + }, + { + "epoch": 0.2290431506670949, + "grad_norm": 0.134765625, + "learning_rate": 0.0017010739134517296, + "loss": 0.1001, + "step": 26386 + }, + { + "epoch": 0.22905183114729907, + "grad_norm": 0.50390625, + "learning_rate": 0.001701051761313238, + "loss": 0.0942, + "step": 26387 + }, + { + "epoch": 0.22906051162750324, + "grad_norm": 0.2021484375, + "learning_rate": 0.001701029608517445, + "loss": 0.106, + "step": 26388 + }, + { + "epoch": 0.2290691921077074, + "grad_norm": 0.37109375, + "learning_rate": 0.0017010074550643755, + "loss": 0.0645, + "step": 26389 + }, + { + "epoch": 0.22907787258791157, + "grad_norm": 0.453125, + "learning_rate": 0.0017009853009540528, + "loss": 0.1206, + "step": 26390 + }, + { + "epoch": 0.22908655306811573, + "grad_norm": 0.09326171875, + "learning_rate": 0.001700963146186502, + "loss": 0.1162, + "step": 26391 + }, + { + "epoch": 0.2290952335483199, + "grad_norm": 0.189453125, + "learning_rate": 0.0017009409907617467, + "loss": 0.1006, + "step": 26392 + }, + { + "epoch": 0.22910391402852406, + "grad_norm": 0.1884765625, + "learning_rate": 0.0017009188346798114, + "loss": 0.1021, + "step": 26393 + }, + { + "epoch": 0.22911259450872823, + "grad_norm": 0.1416015625, + "learning_rate": 0.0017008966779407202, + "loss": 0.1025, + "step": 26394 + }, + { + "epoch": 0.2291212749889324, + "grad_norm": 0.384765625, + "learning_rate": 0.0017008745205444975, + "loss": 0.0898, + "step": 26395 + }, + { + "epoch": 0.22912995546913656, + "grad_norm": 0.1923828125, + "learning_rate": 0.0017008523624911674, + "loss": 0.1436, + "step": 26396 + }, + { + "epoch": 0.22913863594934072, + "grad_norm": 0.08203125, + "learning_rate": 0.0017008302037807544, + "loss": 0.0796, + "step": 26397 + }, + { + "epoch": 0.2291473164295449, + "grad_norm": 0.146484375, + "learning_rate": 0.0017008080444132826, + "loss": 0.1211, + "step": 26398 + }, + { + "epoch": 0.22915599690974905, + "grad_norm": 0.1298828125, + "learning_rate": 0.0017007858843887758, + "loss": 0.0859, + "step": 26399 + }, + { + "epoch": 0.22916467738995322, + "grad_norm": 0.80859375, + "learning_rate": 0.0017007637237072587, + "loss": 0.123, + "step": 26400 + }, + { + "epoch": 0.22917335787015738, + "grad_norm": 0.287109375, + "learning_rate": 0.0017007415623687553, + "loss": 0.1045, + "step": 26401 + }, + { + "epoch": 0.22918203835036155, + "grad_norm": 0.298828125, + "learning_rate": 0.0017007194003732901, + "loss": 0.0908, + "step": 26402 + }, + { + "epoch": 0.22919071883056572, + "grad_norm": 0.267578125, + "learning_rate": 0.0017006972377208874, + "loss": 0.085, + "step": 26403 + }, + { + "epoch": 0.22919939931076988, + "grad_norm": 0.69140625, + "learning_rate": 0.0017006750744115709, + "loss": 0.1318, + "step": 26404 + }, + { + "epoch": 0.22920807979097405, + "grad_norm": 0.326171875, + "learning_rate": 0.0017006529104453651, + "loss": 0.1143, + "step": 26405 + }, + { + "epoch": 
0.2292167602711782, + "grad_norm": 0.439453125, + "learning_rate": 0.0017006307458222944, + "loss": 0.1338, + "step": 26406 + }, + { + "epoch": 0.22922544075138238, + "grad_norm": 0.203125, + "learning_rate": 0.0017006085805423832, + "loss": 0.1172, + "step": 26407 + }, + { + "epoch": 0.22923412123158654, + "grad_norm": 0.55859375, + "learning_rate": 0.0017005864146056554, + "loss": 0.1108, + "step": 26408 + }, + { + "epoch": 0.2292428017117907, + "grad_norm": 0.181640625, + "learning_rate": 0.001700564248012135, + "loss": 0.0918, + "step": 26409 + }, + { + "epoch": 0.22925148219199487, + "grad_norm": 0.1796875, + "learning_rate": 0.0017005420807618468, + "loss": 0.1045, + "step": 26410 + }, + { + "epoch": 0.22926016267219904, + "grad_norm": 0.166015625, + "learning_rate": 0.0017005199128548147, + "loss": 0.1465, + "step": 26411 + }, + { + "epoch": 0.2292688431524032, + "grad_norm": 0.205078125, + "learning_rate": 0.0017004977442910635, + "loss": 0.1055, + "step": 26412 + }, + { + "epoch": 0.22927752363260737, + "grad_norm": 0.0927734375, + "learning_rate": 0.0017004755750706162, + "loss": 0.1045, + "step": 26413 + }, + { + "epoch": 0.22928620411281153, + "grad_norm": 0.5859375, + "learning_rate": 0.0017004534051934983, + "loss": 0.1719, + "step": 26414 + }, + { + "epoch": 0.2292948845930157, + "grad_norm": 0.5078125, + "learning_rate": 0.0017004312346597336, + "loss": 0.1621, + "step": 26415 + }, + { + "epoch": 0.22930356507321986, + "grad_norm": 0.68359375, + "learning_rate": 0.0017004090634693463, + "loss": 0.0957, + "step": 26416 + }, + { + "epoch": 0.22931224555342403, + "grad_norm": 0.154296875, + "learning_rate": 0.0017003868916223603, + "loss": 0.1211, + "step": 26417 + }, + { + "epoch": 0.2293209260336282, + "grad_norm": 0.111328125, + "learning_rate": 0.0017003647191188007, + "loss": 0.0859, + "step": 26418 + }, + { + "epoch": 0.22932960651383236, + "grad_norm": 0.15625, + "learning_rate": 0.001700342545958691, + "loss": 0.1113, + "step": 26419 + }, + { + "epoch": 0.22933828699403652, + "grad_norm": 0.08984375, + "learning_rate": 0.0017003203721420558, + "loss": 0.1143, + "step": 26420 + }, + { + "epoch": 0.2293469674742407, + "grad_norm": 0.11328125, + "learning_rate": 0.0017002981976689195, + "loss": 0.1133, + "step": 26421 + }, + { + "epoch": 0.22935564795444485, + "grad_norm": 0.63671875, + "learning_rate": 0.0017002760225393056, + "loss": 0.1367, + "step": 26422 + }, + { + "epoch": 0.22936432843464902, + "grad_norm": 1.21875, + "learning_rate": 0.0017002538467532397, + "loss": 0.0825, + "step": 26423 + }, + { + "epoch": 0.22937300891485318, + "grad_norm": 0.212890625, + "learning_rate": 0.0017002316703107444, + "loss": 0.0835, + "step": 26424 + }, + { + "epoch": 0.22938168939505735, + "grad_norm": 0.248046875, + "learning_rate": 0.0017002094932118451, + "loss": 0.0869, + "step": 26425 + }, + { + "epoch": 0.2293903698752615, + "grad_norm": 0.2158203125, + "learning_rate": 0.0017001873154565655, + "loss": 0.0947, + "step": 26426 + }, + { + "epoch": 0.22939905035546568, + "grad_norm": 0.091796875, + "learning_rate": 0.0017001651370449303, + "loss": 0.0913, + "step": 26427 + }, + { + "epoch": 0.22940773083566984, + "grad_norm": 0.330078125, + "learning_rate": 0.0017001429579769634, + "loss": 0.0947, + "step": 26428 + }, + { + "epoch": 0.22941641131587398, + "grad_norm": 0.478515625, + "learning_rate": 0.0017001207782526891, + "loss": 0.1289, + "step": 26429 + }, + { + "epoch": 0.22942509179607815, + "grad_norm": 0.419921875, + "learning_rate": 0.0017000985978721317, + "loss": 
0.085, + "step": 26430 + }, + { + "epoch": 0.2294337722762823, + "grad_norm": 0.3046875, + "learning_rate": 0.0017000764168353158, + "loss": 0.124, + "step": 26431 + }, + { + "epoch": 0.22944245275648648, + "grad_norm": 0.4453125, + "learning_rate": 0.0017000542351422651, + "loss": 0.1055, + "step": 26432 + }, + { + "epoch": 0.22945113323669064, + "grad_norm": 0.34765625, + "learning_rate": 0.0017000320527930039, + "loss": 0.1543, + "step": 26433 + }, + { + "epoch": 0.2294598137168948, + "grad_norm": 0.2255859375, + "learning_rate": 0.0017000098697875572, + "loss": 0.1113, + "step": 26434 + }, + { + "epoch": 0.22946849419709897, + "grad_norm": 0.1279296875, + "learning_rate": 0.0016999876861259484, + "loss": 0.1436, + "step": 26435 + }, + { + "epoch": 0.22947717467730314, + "grad_norm": 0.10546875, + "learning_rate": 0.0016999655018082023, + "loss": 0.1318, + "step": 26436 + }, + { + "epoch": 0.2294858551575073, + "grad_norm": 0.298828125, + "learning_rate": 0.0016999433168343428, + "loss": 0.1348, + "step": 26437 + }, + { + "epoch": 0.22949453563771147, + "grad_norm": 0.255859375, + "learning_rate": 0.0016999211312043942, + "loss": 0.1484, + "step": 26438 + }, + { + "epoch": 0.22950321611791563, + "grad_norm": 0.376953125, + "learning_rate": 0.0016998989449183808, + "loss": 0.0815, + "step": 26439 + }, + { + "epoch": 0.2295118965981198, + "grad_norm": 0.10595703125, + "learning_rate": 0.0016998767579763272, + "loss": 0.1104, + "step": 26440 + }, + { + "epoch": 0.22952057707832396, + "grad_norm": 0.51953125, + "learning_rate": 0.0016998545703782572, + "loss": 0.0698, + "step": 26441 + }, + { + "epoch": 0.22952925755852813, + "grad_norm": 0.337890625, + "learning_rate": 0.0016998323821241954, + "loss": 0.1162, + "step": 26442 + }, + { + "epoch": 0.2295379380387323, + "grad_norm": 1.0078125, + "learning_rate": 0.0016998101932141655, + "loss": 0.418, + "step": 26443 + }, + { + "epoch": 0.22954661851893646, + "grad_norm": 0.09228515625, + "learning_rate": 0.0016997880036481927, + "loss": 0.0806, + "step": 26444 + }, + { + "epoch": 0.22955529899914062, + "grad_norm": 0.162109375, + "learning_rate": 0.0016997658134263007, + "loss": 0.1494, + "step": 26445 + }, + { + "epoch": 0.2295639794793448, + "grad_norm": 0.25, + "learning_rate": 0.0016997436225485137, + "loss": 0.1084, + "step": 26446 + }, + { + "epoch": 0.22957265995954895, + "grad_norm": 0.37109375, + "learning_rate": 0.0016997214310148564, + "loss": 0.1416, + "step": 26447 + }, + { + "epoch": 0.22958134043975312, + "grad_norm": 0.232421875, + "learning_rate": 0.0016996992388253523, + "loss": 0.0767, + "step": 26448 + }, + { + "epoch": 0.22959002091995728, + "grad_norm": 0.1298828125, + "learning_rate": 0.001699677045980026, + "loss": 0.085, + "step": 26449 + }, + { + "epoch": 0.22959870140016145, + "grad_norm": 0.416015625, + "learning_rate": 0.0016996548524789025, + "loss": 0.0933, + "step": 26450 + }, + { + "epoch": 0.2296073818803656, + "grad_norm": 0.4296875, + "learning_rate": 0.001699632658322005, + "loss": 0.0693, + "step": 26451 + }, + { + "epoch": 0.22961606236056978, + "grad_norm": 0.111328125, + "learning_rate": 0.0016996104635093583, + "loss": 0.1309, + "step": 26452 + }, + { + "epoch": 0.22962474284077394, + "grad_norm": 1.140625, + "learning_rate": 0.0016995882680409867, + "loss": 0.1094, + "step": 26453 + }, + { + "epoch": 0.2296334233209781, + "grad_norm": 0.609375, + "learning_rate": 0.0016995660719169143, + "loss": 0.124, + "step": 26454 + }, + { + "epoch": 0.22964210380118227, + "grad_norm": 0.703125, + 
"learning_rate": 0.0016995438751371654, + "loss": 0.1299, + "step": 26455 + }, + { + "epoch": 0.22965078428138644, + "grad_norm": 0.6796875, + "learning_rate": 0.0016995216777017643, + "loss": 0.1289, + "step": 26456 + }, + { + "epoch": 0.2296594647615906, + "grad_norm": 0.291015625, + "learning_rate": 0.0016994994796107354, + "loss": 0.0879, + "step": 26457 + }, + { + "epoch": 0.22966814524179477, + "grad_norm": 0.53515625, + "learning_rate": 0.001699477280864103, + "loss": 0.0898, + "step": 26458 + }, + { + "epoch": 0.22967682572199893, + "grad_norm": 0.30859375, + "learning_rate": 0.0016994550814618912, + "loss": 0.0889, + "step": 26459 + }, + { + "epoch": 0.2296855062022031, + "grad_norm": 0.19140625, + "learning_rate": 0.0016994328814041242, + "loss": 0.0913, + "step": 26460 + }, + { + "epoch": 0.22969418668240726, + "grad_norm": 0.103515625, + "learning_rate": 0.0016994106806908266, + "loss": 0.1641, + "step": 26461 + }, + { + "epoch": 0.22970286716261143, + "grad_norm": 0.3046875, + "learning_rate": 0.0016993884793220224, + "loss": 0.0928, + "step": 26462 + }, + { + "epoch": 0.2297115476428156, + "grad_norm": 0.12451171875, + "learning_rate": 0.0016993662772977357, + "loss": 0.1309, + "step": 26463 + }, + { + "epoch": 0.22972022812301976, + "grad_norm": 0.3515625, + "learning_rate": 0.0016993440746179912, + "loss": 0.1143, + "step": 26464 + }, + { + "epoch": 0.22972890860322392, + "grad_norm": 0.2119140625, + "learning_rate": 0.0016993218712828134, + "loss": 0.0938, + "step": 26465 + }, + { + "epoch": 0.2297375890834281, + "grad_norm": 0.10986328125, + "learning_rate": 0.0016992996672922259, + "loss": 0.1543, + "step": 26466 + }, + { + "epoch": 0.22974626956363225, + "grad_norm": 0.28125, + "learning_rate": 0.0016992774626462532, + "loss": 0.0679, + "step": 26467 + }, + { + "epoch": 0.22975495004383642, + "grad_norm": 0.2255859375, + "learning_rate": 0.0016992552573449199, + "loss": 0.0889, + "step": 26468 + }, + { + "epoch": 0.22976363052404059, + "grad_norm": 0.1845703125, + "learning_rate": 0.0016992330513882496, + "loss": 0.105, + "step": 26469 + }, + { + "epoch": 0.22977231100424475, + "grad_norm": 0.44140625, + "learning_rate": 0.0016992108447762674, + "loss": 0.1045, + "step": 26470 + }, + { + "epoch": 0.22978099148444892, + "grad_norm": 0.0791015625, + "learning_rate": 0.0016991886375089973, + "loss": 0.0903, + "step": 26471 + }, + { + "epoch": 0.22978967196465308, + "grad_norm": 0.25390625, + "learning_rate": 0.0016991664295864632, + "loss": 0.1025, + "step": 26472 + }, + { + "epoch": 0.22979835244485725, + "grad_norm": 0.458984375, + "learning_rate": 0.00169914422100869, + "loss": 0.1084, + "step": 26473 + }, + { + "epoch": 0.2298070329250614, + "grad_norm": 0.072265625, + "learning_rate": 0.0016991220117757017, + "loss": 0.0938, + "step": 26474 + }, + { + "epoch": 0.22981571340526558, + "grad_norm": 0.412109375, + "learning_rate": 0.0016990998018875222, + "loss": 0.1465, + "step": 26475 + }, + { + "epoch": 0.22982439388546974, + "grad_norm": 0.126953125, + "learning_rate": 0.0016990775913441768, + "loss": 0.0991, + "step": 26476 + }, + { + "epoch": 0.2298330743656739, + "grad_norm": 2.65625, + "learning_rate": 0.0016990553801456887, + "loss": 0.1279, + "step": 26477 + }, + { + "epoch": 0.22984175484587807, + "grad_norm": 0.7421875, + "learning_rate": 0.0016990331682920823, + "loss": 0.1367, + "step": 26478 + }, + { + "epoch": 0.22985043532608224, + "grad_norm": 0.29296875, + "learning_rate": 0.0016990109557833827, + "loss": 0.085, + "step": 26479 + }, + { + "epoch": 
0.2298591158062864, + "grad_norm": 0.1650390625, + "learning_rate": 0.0016989887426196138, + "loss": 0.0869, + "step": 26480 + }, + { + "epoch": 0.22986779628649057, + "grad_norm": 0.11376953125, + "learning_rate": 0.0016989665288007995, + "loss": 0.1006, + "step": 26481 + }, + { + "epoch": 0.22987647676669473, + "grad_norm": 0.3671875, + "learning_rate": 0.0016989443143269648, + "loss": 0.0918, + "step": 26482 + }, + { + "epoch": 0.2298851572468989, + "grad_norm": 0.212890625, + "learning_rate": 0.0016989220991981328, + "loss": 0.0938, + "step": 26483 + }, + { + "epoch": 0.22989383772710306, + "grad_norm": 0.2236328125, + "learning_rate": 0.0016988998834143296, + "loss": 0.0933, + "step": 26484 + }, + { + "epoch": 0.22990251820730723, + "grad_norm": 0.1923828125, + "learning_rate": 0.001698877666975578, + "loss": 0.082, + "step": 26485 + }, + { + "epoch": 0.2299111986875114, + "grad_norm": 0.4921875, + "learning_rate": 0.0016988554498819023, + "loss": 0.0903, + "step": 26486 + }, + { + "epoch": 0.22991987916771556, + "grad_norm": 0.65234375, + "learning_rate": 0.001698833232133328, + "loss": 0.1328, + "step": 26487 + }, + { + "epoch": 0.22992855964791972, + "grad_norm": 0.15625, + "learning_rate": 0.0016988110137298785, + "loss": 0.1387, + "step": 26488 + }, + { + "epoch": 0.2299372401281239, + "grad_norm": 0.17578125, + "learning_rate": 0.0016987887946715782, + "loss": 0.082, + "step": 26489 + }, + { + "epoch": 0.22994592060832805, + "grad_norm": 0.5859375, + "learning_rate": 0.0016987665749584515, + "loss": 0.0903, + "step": 26490 + }, + { + "epoch": 0.22995460108853222, + "grad_norm": 0.458984375, + "learning_rate": 0.0016987443545905226, + "loss": 0.0815, + "step": 26491 + }, + { + "epoch": 0.22996328156873638, + "grad_norm": 0.2470703125, + "learning_rate": 0.001698722133567816, + "loss": 0.1191, + "step": 26492 + }, + { + "epoch": 0.22997196204894055, + "grad_norm": 0.236328125, + "learning_rate": 0.0016986999118903559, + "loss": 0.0967, + "step": 26493 + }, + { + "epoch": 0.2299806425291447, + "grad_norm": 0.8515625, + "learning_rate": 0.0016986776895581666, + "loss": 0.0996, + "step": 26494 + }, + { + "epoch": 0.22998932300934888, + "grad_norm": 1.171875, + "learning_rate": 0.001698655466571272, + "loss": 0.2344, + "step": 26495 + }, + { + "epoch": 0.22999800348955304, + "grad_norm": 0.10302734375, + "learning_rate": 0.0016986332429296971, + "loss": 0.0957, + "step": 26496 + }, + { + "epoch": 0.2300066839697572, + "grad_norm": 0.166015625, + "learning_rate": 0.0016986110186334657, + "loss": 0.1025, + "step": 26497 + }, + { + "epoch": 0.23001536444996137, + "grad_norm": 0.984375, + "learning_rate": 0.0016985887936826025, + "loss": 0.2295, + "step": 26498 + }, + { + "epoch": 0.23002404493016554, + "grad_norm": 0.1083984375, + "learning_rate": 0.0016985665680771314, + "loss": 0.1147, + "step": 26499 + }, + { + "epoch": 0.2300327254103697, + "grad_norm": 0.08740234375, + "learning_rate": 0.001698544341817077, + "loss": 0.0879, + "step": 26500 + }, + { + "epoch": 0.23004140589057387, + "grad_norm": 0.466796875, + "learning_rate": 0.0016985221149024635, + "loss": 0.123, + "step": 26501 + }, + { + "epoch": 0.23005008637077803, + "grad_norm": 0.75390625, + "learning_rate": 0.0016984998873333153, + "loss": 0.1055, + "step": 26502 + }, + { + "epoch": 0.2300587668509822, + "grad_norm": 0.1376953125, + "learning_rate": 0.0016984776591096565, + "loss": 0.1089, + "step": 26503 + }, + { + "epoch": 0.23006744733118636, + "grad_norm": 0.921875, + "learning_rate": 0.0016984554302315114, + "loss": 
0.126, + "step": 26504 + }, + { + "epoch": 0.23007612781139053, + "grad_norm": 0.421875, + "learning_rate": 0.0016984332006989046, + "loss": 0.1104, + "step": 26505 + }, + { + "epoch": 0.2300848082915947, + "grad_norm": 0.2734375, + "learning_rate": 0.0016984109705118601, + "loss": 0.1436, + "step": 26506 + }, + { + "epoch": 0.23009348877179886, + "grad_norm": 0.5859375, + "learning_rate": 0.0016983887396704025, + "loss": 0.1172, + "step": 26507 + }, + { + "epoch": 0.23010216925200302, + "grad_norm": 0.3515625, + "learning_rate": 0.0016983665081745558, + "loss": 0.1602, + "step": 26508 + }, + { + "epoch": 0.2301108497322072, + "grad_norm": 0.21484375, + "learning_rate": 0.0016983442760243445, + "loss": 0.1006, + "step": 26509 + }, + { + "epoch": 0.23011953021241136, + "grad_norm": 0.58984375, + "learning_rate": 0.001698322043219793, + "loss": 0.1377, + "step": 26510 + }, + { + "epoch": 0.23012821069261552, + "grad_norm": 0.12060546875, + "learning_rate": 0.0016982998097609257, + "loss": 0.1104, + "step": 26511 + }, + { + "epoch": 0.23013689117281969, + "grad_norm": 0.07861328125, + "learning_rate": 0.0016982775756477664, + "loss": 0.0845, + "step": 26512 + }, + { + "epoch": 0.23014557165302385, + "grad_norm": 0.10595703125, + "learning_rate": 0.0016982553408803399, + "loss": 0.1562, + "step": 26513 + }, + { + "epoch": 0.23015425213322802, + "grad_norm": 0.34765625, + "learning_rate": 0.0016982331054586704, + "loss": 0.1143, + "step": 26514 + }, + { + "epoch": 0.23016293261343218, + "grad_norm": 0.07373046875, + "learning_rate": 0.001698210869382782, + "loss": 0.0825, + "step": 26515 + }, + { + "epoch": 0.23017161309363635, + "grad_norm": 0.82421875, + "learning_rate": 0.0016981886326526989, + "loss": 0.1465, + "step": 26516 + }, + { + "epoch": 0.2301802935738405, + "grad_norm": 0.30078125, + "learning_rate": 0.0016981663952684463, + "loss": 0.1445, + "step": 26517 + }, + { + "epoch": 0.23018897405404468, + "grad_norm": 0.3828125, + "learning_rate": 0.0016981441572300475, + "loss": 0.1138, + "step": 26518 + }, + { + "epoch": 0.23019765453424884, + "grad_norm": 0.83984375, + "learning_rate": 0.0016981219185375276, + "loss": 0.1611, + "step": 26519 + }, + { + "epoch": 0.230206335014453, + "grad_norm": 0.2080078125, + "learning_rate": 0.0016980996791909103, + "loss": 0.1162, + "step": 26520 + }, + { + "epoch": 0.23021501549465717, + "grad_norm": 0.3359375, + "learning_rate": 0.0016980774391902199, + "loss": 0.1074, + "step": 26521 + }, + { + "epoch": 0.23022369597486134, + "grad_norm": 0.51171875, + "learning_rate": 0.0016980551985354814, + "loss": 0.2461, + "step": 26522 + }, + { + "epoch": 0.2302323764550655, + "grad_norm": 0.2236328125, + "learning_rate": 0.0016980329572267188, + "loss": 0.1104, + "step": 26523 + }, + { + "epoch": 0.23024105693526967, + "grad_norm": 1.03125, + "learning_rate": 0.0016980107152639562, + "loss": 0.2695, + "step": 26524 + }, + { + "epoch": 0.23024973741547383, + "grad_norm": 0.51953125, + "learning_rate": 0.0016979884726472181, + "loss": 0.1104, + "step": 26525 + }, + { + "epoch": 0.230258417895678, + "grad_norm": 0.32421875, + "learning_rate": 0.0016979662293765286, + "loss": 0.1235, + "step": 26526 + }, + { + "epoch": 0.23026709837588216, + "grad_norm": 0.78515625, + "learning_rate": 0.0016979439854519125, + "loss": 0.1631, + "step": 26527 + }, + { + "epoch": 0.23027577885608633, + "grad_norm": 0.2001953125, + "learning_rate": 0.0016979217408733937, + "loss": 0.0811, + "step": 26528 + }, + { + "epoch": 0.2302844593362905, + "grad_norm": 1.1171875, + 
"learning_rate": 0.0016978994956409965, + "loss": 0.1289, + "step": 26529 + }, + { + "epoch": 0.23029313981649466, + "grad_norm": 1.4296875, + "learning_rate": 0.0016978772497547457, + "loss": 0.1191, + "step": 26530 + }, + { + "epoch": 0.23030182029669882, + "grad_norm": 0.392578125, + "learning_rate": 0.001697855003214665, + "loss": 0.123, + "step": 26531 + }, + { + "epoch": 0.230310500776903, + "grad_norm": 0.283203125, + "learning_rate": 0.0016978327560207794, + "loss": 0.1089, + "step": 26532 + }, + { + "epoch": 0.23031918125710715, + "grad_norm": 1.2734375, + "learning_rate": 0.0016978105081731129, + "loss": 0.1069, + "step": 26533 + }, + { + "epoch": 0.23032786173731132, + "grad_norm": 0.140625, + "learning_rate": 0.001697788259671689, + "loss": 0.1318, + "step": 26534 + }, + { + "epoch": 0.23033654221751548, + "grad_norm": 1.0390625, + "learning_rate": 0.0016977660105165337, + "loss": 0.103, + "step": 26535 + }, + { + "epoch": 0.23034522269771965, + "grad_norm": 0.189453125, + "learning_rate": 0.0016977437607076705, + "loss": 0.0718, + "step": 26536 + }, + { + "epoch": 0.2303539031779238, + "grad_norm": 0.18359375, + "learning_rate": 0.0016977215102451234, + "loss": 0.1738, + "step": 26537 + }, + { + "epoch": 0.23036258365812798, + "grad_norm": 0.57421875, + "learning_rate": 0.0016976992591289167, + "loss": 0.0952, + "step": 26538 + }, + { + "epoch": 0.23037126413833214, + "grad_norm": 0.140625, + "learning_rate": 0.0016976770073590757, + "loss": 0.1055, + "step": 26539 + }, + { + "epoch": 0.2303799446185363, + "grad_norm": 0.77734375, + "learning_rate": 0.0016976547549356238, + "loss": 0.0811, + "step": 26540 + }, + { + "epoch": 0.23038862509874047, + "grad_norm": 0.83984375, + "learning_rate": 0.0016976325018585856, + "loss": 0.0942, + "step": 26541 + }, + { + "epoch": 0.23039730557894464, + "grad_norm": 0.6953125, + "learning_rate": 0.0016976102481279856, + "loss": 0.123, + "step": 26542 + }, + { + "epoch": 0.2304059860591488, + "grad_norm": 0.74609375, + "learning_rate": 0.001697587993743848, + "loss": 0.1494, + "step": 26543 + }, + { + "epoch": 0.23041466653935297, + "grad_norm": 0.58203125, + "learning_rate": 0.001697565738706197, + "loss": 0.1074, + "step": 26544 + }, + { + "epoch": 0.23042334701955713, + "grad_norm": 0.1796875, + "learning_rate": 0.0016975434830150573, + "loss": 0.0996, + "step": 26545 + }, + { + "epoch": 0.2304320274997613, + "grad_norm": 0.130859375, + "learning_rate": 0.001697521226670453, + "loss": 0.1357, + "step": 26546 + }, + { + "epoch": 0.23044070797996546, + "grad_norm": 0.1640625, + "learning_rate": 0.0016974989696724083, + "loss": 0.0957, + "step": 26547 + }, + { + "epoch": 0.23044938846016963, + "grad_norm": 0.1240234375, + "learning_rate": 0.001697476712020948, + "loss": 0.104, + "step": 26548 + }, + { + "epoch": 0.2304580689403738, + "grad_norm": 0.1435546875, + "learning_rate": 0.0016974544537160959, + "loss": 0.1416, + "step": 26549 + }, + { + "epoch": 0.23046674942057796, + "grad_norm": 0.6484375, + "learning_rate": 0.0016974321947578765, + "loss": 0.0879, + "step": 26550 + }, + { + "epoch": 0.23047542990078212, + "grad_norm": 0.50390625, + "learning_rate": 0.0016974099351463145, + "loss": 0.1641, + "step": 26551 + }, + { + "epoch": 0.23048411038098626, + "grad_norm": 0.130859375, + "learning_rate": 0.0016973876748814336, + "loss": 0.1387, + "step": 26552 + }, + { + "epoch": 0.23049279086119043, + "grad_norm": 0.58203125, + "learning_rate": 0.001697365413963259, + "loss": 0.1562, + "step": 26553 + }, + { + "epoch": 0.2305014713413946, + 
"grad_norm": 0.421875, + "learning_rate": 0.001697343152391814, + "loss": 0.1084, + "step": 26554 + }, + { + "epoch": 0.23051015182159876, + "grad_norm": 0.21484375, + "learning_rate": 0.0016973208901671242, + "loss": 0.0972, + "step": 26555 + }, + { + "epoch": 0.23051883230180292, + "grad_norm": 0.373046875, + "learning_rate": 0.0016972986272892128, + "loss": 0.1089, + "step": 26556 + }, + { + "epoch": 0.2305275127820071, + "grad_norm": 0.50390625, + "learning_rate": 0.0016972763637581045, + "loss": 0.1523, + "step": 26557 + }, + { + "epoch": 0.23053619326221125, + "grad_norm": 0.1640625, + "learning_rate": 0.0016972540995738238, + "loss": 0.0898, + "step": 26558 + }, + { + "epoch": 0.23054487374241542, + "grad_norm": 0.203125, + "learning_rate": 0.0016972318347363952, + "loss": 0.0806, + "step": 26559 + }, + { + "epoch": 0.23055355422261958, + "grad_norm": 0.490234375, + "learning_rate": 0.0016972095692458426, + "loss": 0.0801, + "step": 26560 + }, + { + "epoch": 0.23056223470282375, + "grad_norm": 0.7109375, + "learning_rate": 0.0016971873031021907, + "loss": 0.1182, + "step": 26561 + }, + { + "epoch": 0.2305709151830279, + "grad_norm": 0.17578125, + "learning_rate": 0.001697165036305464, + "loss": 0.106, + "step": 26562 + }, + { + "epoch": 0.23057959566323208, + "grad_norm": 1.0859375, + "learning_rate": 0.0016971427688556862, + "loss": 0.127, + "step": 26563 + }, + { + "epoch": 0.23058827614343624, + "grad_norm": 0.408203125, + "learning_rate": 0.0016971205007528822, + "loss": 0.1123, + "step": 26564 + }, + { + "epoch": 0.2305969566236404, + "grad_norm": 0.07275390625, + "learning_rate": 0.0016970982319970762, + "loss": 0.0894, + "step": 26565 + }, + { + "epoch": 0.23060563710384457, + "grad_norm": 0.2080078125, + "learning_rate": 0.0016970759625882926, + "loss": 0.1729, + "step": 26566 + }, + { + "epoch": 0.23061431758404874, + "grad_norm": 0.42578125, + "learning_rate": 0.0016970536925265556, + "loss": 0.0903, + "step": 26567 + }, + { + "epoch": 0.2306229980642529, + "grad_norm": 0.1044921875, + "learning_rate": 0.0016970314218118896, + "loss": 0.1064, + "step": 26568 + }, + { + "epoch": 0.23063167854445707, + "grad_norm": 0.09765625, + "learning_rate": 0.001697009150444319, + "loss": 0.0947, + "step": 26569 + }, + { + "epoch": 0.23064035902466123, + "grad_norm": 0.37109375, + "learning_rate": 0.0016969868784238685, + "loss": 0.1211, + "step": 26570 + }, + { + "epoch": 0.2306490395048654, + "grad_norm": 0.8125, + "learning_rate": 0.0016969646057505617, + "loss": 0.1162, + "step": 26571 + }, + { + "epoch": 0.23065771998506956, + "grad_norm": 0.36328125, + "learning_rate": 0.0016969423324244236, + "loss": 0.1299, + "step": 26572 + }, + { + "epoch": 0.23066640046527373, + "grad_norm": 0.2099609375, + "learning_rate": 0.0016969200584454783, + "loss": 0.1104, + "step": 26573 + }, + { + "epoch": 0.2306750809454779, + "grad_norm": 0.29296875, + "learning_rate": 0.00169689778381375, + "loss": 0.1562, + "step": 26574 + }, + { + "epoch": 0.23068376142568206, + "grad_norm": 0.1630859375, + "learning_rate": 0.0016968755085292634, + "loss": 0.1221, + "step": 26575 + }, + { + "epoch": 0.23069244190588623, + "grad_norm": 0.255859375, + "learning_rate": 0.001696853232592043, + "loss": 0.1084, + "step": 26576 + }, + { + "epoch": 0.2307011223860904, + "grad_norm": 0.291015625, + "learning_rate": 0.0016968309560021124, + "loss": 0.1191, + "step": 26577 + }, + { + "epoch": 0.23070980286629456, + "grad_norm": 0.189453125, + "learning_rate": 0.0016968086787594965, + "loss": 0.0835, + "step": 26578 + }, + 
{ + "epoch": 0.23071848334649872, + "grad_norm": 0.51953125, + "learning_rate": 0.0016967864008642197, + "loss": 0.1123, + "step": 26579 + }, + { + "epoch": 0.23072716382670289, + "grad_norm": 0.42578125, + "learning_rate": 0.0016967641223163065, + "loss": 0.0864, + "step": 26580 + }, + { + "epoch": 0.23073584430690705, + "grad_norm": 0.06103515625, + "learning_rate": 0.0016967418431157808, + "loss": 0.0796, + "step": 26581 + }, + { + "epoch": 0.23074452478711122, + "grad_norm": 0.080078125, + "learning_rate": 0.0016967195632626674, + "loss": 0.082, + "step": 26582 + }, + { + "epoch": 0.23075320526731538, + "grad_norm": 0.32421875, + "learning_rate": 0.00169669728275699, + "loss": 0.1138, + "step": 26583 + }, + { + "epoch": 0.23076188574751955, + "grad_norm": 0.59765625, + "learning_rate": 0.0016966750015987736, + "loss": 0.0957, + "step": 26584 + }, + { + "epoch": 0.2307705662277237, + "grad_norm": 0.1376953125, + "learning_rate": 0.0016966527197880429, + "loss": 0.0762, + "step": 26585 + }, + { + "epoch": 0.23077924670792788, + "grad_norm": 0.11279296875, + "learning_rate": 0.0016966304373248209, + "loss": 0.1348, + "step": 26586 + }, + { + "epoch": 0.23078792718813204, + "grad_norm": 0.255859375, + "learning_rate": 0.0016966081542091334, + "loss": 0.0918, + "step": 26587 + }, + { + "epoch": 0.2307966076683362, + "grad_norm": 0.28515625, + "learning_rate": 0.001696585870441004, + "loss": 0.1157, + "step": 26588 + }, + { + "epoch": 0.23080528814854037, + "grad_norm": 0.283203125, + "learning_rate": 0.001696563586020457, + "loss": 0.1436, + "step": 26589 + }, + { + "epoch": 0.23081396862874454, + "grad_norm": 0.45703125, + "learning_rate": 0.0016965413009475173, + "loss": 0.106, + "step": 26590 + }, + { + "epoch": 0.2308226491089487, + "grad_norm": 0.337890625, + "learning_rate": 0.001696519015222209, + "loss": 0.1138, + "step": 26591 + }, + { + "epoch": 0.23083132958915287, + "grad_norm": 0.8125, + "learning_rate": 0.0016964967288445566, + "loss": 0.0879, + "step": 26592 + }, + { + "epoch": 0.23084001006935703, + "grad_norm": 0.08251953125, + "learning_rate": 0.0016964744418145842, + "loss": 0.1035, + "step": 26593 + }, + { + "epoch": 0.2308486905495612, + "grad_norm": 0.65625, + "learning_rate": 0.0016964521541323163, + "loss": 0.1514, + "step": 26594 + }, + { + "epoch": 0.23085737102976536, + "grad_norm": 0.5703125, + "learning_rate": 0.0016964298657977772, + "loss": 0.1138, + "step": 26595 + }, + { + "epoch": 0.23086605150996953, + "grad_norm": 0.251953125, + "learning_rate": 0.0016964075768109914, + "loss": 0.1289, + "step": 26596 + }, + { + "epoch": 0.2308747319901737, + "grad_norm": 0.12890625, + "learning_rate": 0.0016963852871719831, + "loss": 0.0938, + "step": 26597 + }, + { + "epoch": 0.23088341247037786, + "grad_norm": 0.6796875, + "learning_rate": 0.0016963629968807772, + "loss": 0.1147, + "step": 26598 + }, + { + "epoch": 0.23089209295058202, + "grad_norm": 0.08837890625, + "learning_rate": 0.0016963407059373975, + "loss": 0.0908, + "step": 26599 + }, + { + "epoch": 0.2309007734307862, + "grad_norm": 0.3125, + "learning_rate": 0.0016963184143418686, + "loss": 0.0962, + "step": 26600 + }, + { + "epoch": 0.23090945391099035, + "grad_norm": 0.19140625, + "learning_rate": 0.0016962961220942149, + "loss": 0.1084, + "step": 26601 + }, + { + "epoch": 0.23091813439119452, + "grad_norm": 0.07568359375, + "learning_rate": 0.0016962738291944606, + "loss": 0.1025, + "step": 26602 + }, + { + "epoch": 0.23092681487139868, + "grad_norm": 0.515625, + "learning_rate": 0.00169625153564263, + 
"loss": 0.1396, + "step": 26603 + }, + { + "epoch": 0.23093549535160285, + "grad_norm": 0.302734375, + "learning_rate": 0.0016962292414387477, + "loss": 0.0918, + "step": 26604 + }, + { + "epoch": 0.230944175831807, + "grad_norm": 0.45703125, + "learning_rate": 0.0016962069465828384, + "loss": 0.1211, + "step": 26605 + }, + { + "epoch": 0.23095285631201118, + "grad_norm": 0.2265625, + "learning_rate": 0.001696184651074926, + "loss": 0.1133, + "step": 26606 + }, + { + "epoch": 0.23096153679221534, + "grad_norm": 0.2109375, + "learning_rate": 0.001696162354915035, + "loss": 0.1289, + "step": 26607 + }, + { + "epoch": 0.2309702172724195, + "grad_norm": 0.1611328125, + "learning_rate": 0.0016961400581031899, + "loss": 0.1011, + "step": 26608 + }, + { + "epoch": 0.23097889775262367, + "grad_norm": 0.27734375, + "learning_rate": 0.001696117760639415, + "loss": 0.0859, + "step": 26609 + }, + { + "epoch": 0.23098757823282784, + "grad_norm": 0.287109375, + "learning_rate": 0.0016960954625237343, + "loss": 0.0859, + "step": 26610 + }, + { + "epoch": 0.230996258713032, + "grad_norm": 0.2275390625, + "learning_rate": 0.001696073163756173, + "loss": 0.1191, + "step": 26611 + }, + { + "epoch": 0.23100493919323617, + "grad_norm": 0.0810546875, + "learning_rate": 0.0016960508643367547, + "loss": 0.0854, + "step": 26612 + }, + { + "epoch": 0.23101361967344033, + "grad_norm": 0.216796875, + "learning_rate": 0.0016960285642655046, + "loss": 0.0918, + "step": 26613 + }, + { + "epoch": 0.2310223001536445, + "grad_norm": 2.125, + "learning_rate": 0.0016960062635424464, + "loss": 0.2217, + "step": 26614 + }, + { + "epoch": 0.23103098063384866, + "grad_norm": 0.119140625, + "learning_rate": 0.0016959839621676046, + "loss": 0.1172, + "step": 26615 + }, + { + "epoch": 0.23103966111405283, + "grad_norm": 0.1494140625, + "learning_rate": 0.0016959616601410037, + "loss": 0.1348, + "step": 26616 + }, + { + "epoch": 0.231048341594257, + "grad_norm": 0.1103515625, + "learning_rate": 0.001695939357462668, + "loss": 0.1138, + "step": 26617 + }, + { + "epoch": 0.23105702207446116, + "grad_norm": 0.17578125, + "learning_rate": 0.001695917054132622, + "loss": 0.1016, + "step": 26618 + }, + { + "epoch": 0.23106570255466533, + "grad_norm": 0.71875, + "learning_rate": 0.0016958947501508903, + "loss": 0.1348, + "step": 26619 + }, + { + "epoch": 0.2310743830348695, + "grad_norm": 0.4140625, + "learning_rate": 0.0016958724455174968, + "loss": 0.0977, + "step": 26620 + }, + { + "epoch": 0.23108306351507366, + "grad_norm": 0.52734375, + "learning_rate": 0.0016958501402324663, + "loss": 0.0845, + "step": 26621 + }, + { + "epoch": 0.23109174399527782, + "grad_norm": 1.0, + "learning_rate": 0.001695827834295823, + "loss": 0.1367, + "step": 26622 + }, + { + "epoch": 0.23110042447548199, + "grad_norm": 1.015625, + "learning_rate": 0.0016958055277075913, + "loss": 0.1514, + "step": 26623 + }, + { + "epoch": 0.23110910495568615, + "grad_norm": 0.298828125, + "learning_rate": 0.0016957832204677957, + "loss": 0.1064, + "step": 26624 + }, + { + "epoch": 0.23111778543589032, + "grad_norm": 0.859375, + "learning_rate": 0.0016957609125764603, + "loss": 0.1162, + "step": 26625 + }, + { + "epoch": 0.23112646591609448, + "grad_norm": 0.08642578125, + "learning_rate": 0.0016957386040336098, + "loss": 0.1201, + "step": 26626 + }, + { + "epoch": 0.23113514639629865, + "grad_norm": 0.26171875, + "learning_rate": 0.0016957162948392688, + "loss": 0.1143, + "step": 26627 + }, + { + "epoch": 0.2311438268765028, + "grad_norm": 0.14453125, + "learning_rate": 
0.0016956939849934612, + "loss": 0.083, + "step": 26628 + }, + { + "epoch": 0.23115250735670698, + "grad_norm": 0.265625, + "learning_rate": 0.0016956716744962114, + "loss": 0.1006, + "step": 26629 + }, + { + "epoch": 0.23116118783691114, + "grad_norm": 0.2255859375, + "learning_rate": 0.0016956493633475446, + "loss": 0.125, + "step": 26630 + }, + { + "epoch": 0.2311698683171153, + "grad_norm": 0.98828125, + "learning_rate": 0.0016956270515474838, + "loss": 0.1113, + "step": 26631 + }, + { + "epoch": 0.23117854879731947, + "grad_norm": 0.2734375, + "learning_rate": 0.0016956047390960546, + "loss": 0.103, + "step": 26632 + }, + { + "epoch": 0.23118722927752364, + "grad_norm": 0.1748046875, + "learning_rate": 0.0016955824259932812, + "loss": 0.0869, + "step": 26633 + }, + { + "epoch": 0.2311959097577278, + "grad_norm": 0.09912109375, + "learning_rate": 0.0016955601122391878, + "loss": 0.1016, + "step": 26634 + }, + { + "epoch": 0.23120459023793197, + "grad_norm": 0.4453125, + "learning_rate": 0.0016955377978337984, + "loss": 0.1089, + "step": 26635 + }, + { + "epoch": 0.23121327071813613, + "grad_norm": 0.55859375, + "learning_rate": 0.0016955154827771384, + "loss": 0.0957, + "step": 26636 + }, + { + "epoch": 0.2312219511983403, + "grad_norm": 0.5, + "learning_rate": 0.0016954931670692312, + "loss": 0.126, + "step": 26637 + }, + { + "epoch": 0.23123063167854446, + "grad_norm": 0.1552734375, + "learning_rate": 0.0016954708507101015, + "loss": 0.1035, + "step": 26638 + }, + { + "epoch": 0.23123931215874863, + "grad_norm": 0.10888671875, + "learning_rate": 0.001695448533699774, + "loss": 0.1006, + "step": 26639 + }, + { + "epoch": 0.2312479926389528, + "grad_norm": 0.07568359375, + "learning_rate": 0.001695426216038273, + "loss": 0.0845, + "step": 26640 + }, + { + "epoch": 0.23125667311915696, + "grad_norm": 0.220703125, + "learning_rate": 0.0016954038977256228, + "loss": 0.1006, + "step": 26641 + }, + { + "epoch": 0.23126535359936112, + "grad_norm": 0.392578125, + "learning_rate": 0.0016953815787618478, + "loss": 0.0991, + "step": 26642 + }, + { + "epoch": 0.2312740340795653, + "grad_norm": 0.0654296875, + "learning_rate": 0.0016953592591469726, + "loss": 0.0815, + "step": 26643 + }, + { + "epoch": 0.23128271455976945, + "grad_norm": 0.0908203125, + "learning_rate": 0.0016953369388810213, + "loss": 0.1104, + "step": 26644 + }, + { + "epoch": 0.23129139503997362, + "grad_norm": 0.21875, + "learning_rate": 0.0016953146179640186, + "loss": 0.0962, + "step": 26645 + }, + { + "epoch": 0.23130007552017778, + "grad_norm": 0.1416015625, + "learning_rate": 0.0016952922963959884, + "loss": 0.1221, + "step": 26646 + }, + { + "epoch": 0.23130875600038195, + "grad_norm": 0.4140625, + "learning_rate": 0.001695269974176956, + "loss": 0.1484, + "step": 26647 + }, + { + "epoch": 0.2313174364805861, + "grad_norm": 1.234375, + "learning_rate": 0.001695247651306945, + "loss": 0.0933, + "step": 26648 + }, + { + "epoch": 0.23132611696079028, + "grad_norm": 0.1162109375, + "learning_rate": 0.0016952253277859803, + "loss": 0.0791, + "step": 26649 + }, + { + "epoch": 0.23133479744099444, + "grad_norm": 0.1044921875, + "learning_rate": 0.001695203003614086, + "loss": 0.165, + "step": 26650 + }, + { + "epoch": 0.2313434779211986, + "grad_norm": 0.1640625, + "learning_rate": 0.0016951806787912868, + "loss": 0.0815, + "step": 26651 + }, + { + "epoch": 0.23135215840140277, + "grad_norm": 0.53515625, + "learning_rate": 0.0016951583533176067, + "loss": 0.1191, + "step": 26652 + }, + { + "epoch": 0.23136083888160694, + 
"grad_norm": 0.103515625, + "learning_rate": 0.0016951360271930708, + "loss": 0.1157, + "step": 26653 + }, + { + "epoch": 0.2313695193618111, + "grad_norm": 0.55078125, + "learning_rate": 0.0016951137004177024, + "loss": 0.1289, + "step": 26654 + }, + { + "epoch": 0.23137819984201527, + "grad_norm": 0.5234375, + "learning_rate": 0.001695091372991527, + "loss": 0.123, + "step": 26655 + }, + { + "epoch": 0.23138688032221943, + "grad_norm": 1.1328125, + "learning_rate": 0.0016950690449145686, + "loss": 0.0981, + "step": 26656 + }, + { + "epoch": 0.2313955608024236, + "grad_norm": 0.10791015625, + "learning_rate": 0.0016950467161868518, + "loss": 0.1123, + "step": 26657 + }, + { + "epoch": 0.23140424128262777, + "grad_norm": 0.43359375, + "learning_rate": 0.0016950243868084005, + "loss": 0.127, + "step": 26658 + }, + { + "epoch": 0.23141292176283193, + "grad_norm": 0.65234375, + "learning_rate": 0.0016950020567792397, + "loss": 0.0869, + "step": 26659 + }, + { + "epoch": 0.2314216022430361, + "grad_norm": 0.29296875, + "learning_rate": 0.0016949797260993934, + "loss": 0.123, + "step": 26660 + }, + { + "epoch": 0.23143028272324026, + "grad_norm": 0.3203125, + "learning_rate": 0.0016949573947688864, + "loss": 0.123, + "step": 26661 + }, + { + "epoch": 0.23143896320344443, + "grad_norm": 1.125, + "learning_rate": 0.001694935062787743, + "loss": 0.1133, + "step": 26662 + }, + { + "epoch": 0.2314476436836486, + "grad_norm": 0.25390625, + "learning_rate": 0.0016949127301559876, + "loss": 0.0996, + "step": 26663 + }, + { + "epoch": 0.23145632416385276, + "grad_norm": 0.34765625, + "learning_rate": 0.001694890396873644, + "loss": 0.1113, + "step": 26664 + }, + { + "epoch": 0.23146500464405692, + "grad_norm": 0.166015625, + "learning_rate": 0.0016948680629407377, + "loss": 0.1055, + "step": 26665 + }, + { + "epoch": 0.23147368512426109, + "grad_norm": 0.216796875, + "learning_rate": 0.0016948457283572926, + "loss": 0.1133, + "step": 26666 + }, + { + "epoch": 0.23148236560446525, + "grad_norm": 0.16796875, + "learning_rate": 0.001694823393123333, + "loss": 0.0996, + "step": 26667 + }, + { + "epoch": 0.23149104608466942, + "grad_norm": 0.451171875, + "learning_rate": 0.0016948010572388835, + "loss": 0.0938, + "step": 26668 + }, + { + "epoch": 0.23149972656487358, + "grad_norm": 0.5078125, + "learning_rate": 0.0016947787207039686, + "loss": 0.1104, + "step": 26669 + }, + { + "epoch": 0.23150840704507775, + "grad_norm": 0.259765625, + "learning_rate": 0.0016947563835186124, + "loss": 0.0874, + "step": 26670 + }, + { + "epoch": 0.2315170875252819, + "grad_norm": 0.068359375, + "learning_rate": 0.00169473404568284, + "loss": 0.0698, + "step": 26671 + }, + { + "epoch": 0.23152576800548608, + "grad_norm": 0.59765625, + "learning_rate": 0.001694711707196675, + "loss": 0.2002, + "step": 26672 + }, + { + "epoch": 0.23153444848569024, + "grad_norm": 0.30078125, + "learning_rate": 0.0016946893680601424, + "loss": 0.0864, + "step": 26673 + }, + { + "epoch": 0.2315431289658944, + "grad_norm": 0.302734375, + "learning_rate": 0.0016946670282732663, + "loss": 0.1104, + "step": 26674 + }, + { + "epoch": 0.23155180944609854, + "grad_norm": 0.203125, + "learning_rate": 0.0016946446878360712, + "loss": 0.085, + "step": 26675 + }, + { + "epoch": 0.2315604899263027, + "grad_norm": 0.11572265625, + "learning_rate": 0.001694622346748582, + "loss": 0.1128, + "step": 26676 + }, + { + "epoch": 0.23156917040650687, + "grad_norm": 0.55859375, + "learning_rate": 0.0016946000050108224, + "loss": 0.1123, + "step": 26677 + }, + { + 
"epoch": 0.23157785088671104, + "grad_norm": 0.126953125, + "learning_rate": 0.0016945776626228171, + "loss": 0.1299, + "step": 26678 + }, + { + "epoch": 0.2315865313669152, + "grad_norm": 0.51953125, + "learning_rate": 0.001694555319584591, + "loss": 0.1094, + "step": 26679 + }, + { + "epoch": 0.23159521184711937, + "grad_norm": 0.08544921875, + "learning_rate": 0.0016945329758961676, + "loss": 0.1172, + "step": 26680 + }, + { + "epoch": 0.23160389232732354, + "grad_norm": 0.43359375, + "learning_rate": 0.0016945106315575723, + "loss": 0.0835, + "step": 26681 + }, + { + "epoch": 0.2316125728075277, + "grad_norm": 0.08935546875, + "learning_rate": 0.0016944882865688289, + "loss": 0.1152, + "step": 26682 + }, + { + "epoch": 0.23162125328773187, + "grad_norm": 0.0986328125, + "learning_rate": 0.0016944659409299618, + "loss": 0.1025, + "step": 26683 + }, + { + "epoch": 0.23162993376793603, + "grad_norm": 0.51953125, + "learning_rate": 0.0016944435946409965, + "loss": 0.1182, + "step": 26684 + }, + { + "epoch": 0.2316386142481402, + "grad_norm": 0.8125, + "learning_rate": 0.0016944212477019557, + "loss": 0.123, + "step": 26685 + }, + { + "epoch": 0.23164729472834436, + "grad_norm": 0.61328125, + "learning_rate": 0.0016943989001128655, + "loss": 0.0938, + "step": 26686 + }, + { + "epoch": 0.23165597520854853, + "grad_norm": 0.2734375, + "learning_rate": 0.001694376551873749, + "loss": 0.0957, + "step": 26687 + }, + { + "epoch": 0.2316646556887527, + "grad_norm": 0.52734375, + "learning_rate": 0.0016943542029846314, + "loss": 0.1143, + "step": 26688 + }, + { + "epoch": 0.23167333616895686, + "grad_norm": 0.201171875, + "learning_rate": 0.0016943318534455374, + "loss": 0.1172, + "step": 26689 + }, + { + "epoch": 0.23168201664916102, + "grad_norm": 0.259765625, + "learning_rate": 0.0016943095032564903, + "loss": 0.1221, + "step": 26690 + }, + { + "epoch": 0.2316906971293652, + "grad_norm": 0.486328125, + "learning_rate": 0.001694287152417516, + "loss": 0.1328, + "step": 26691 + }, + { + "epoch": 0.23169937760956935, + "grad_norm": 0.365234375, + "learning_rate": 0.0016942648009286377, + "loss": 0.1016, + "step": 26692 + }, + { + "epoch": 0.23170805808977352, + "grad_norm": 0.66796875, + "learning_rate": 0.0016942424487898804, + "loss": 0.1172, + "step": 26693 + }, + { + "epoch": 0.23171673856997768, + "grad_norm": 0.7109375, + "learning_rate": 0.0016942200960012685, + "loss": 0.1143, + "step": 26694 + }, + { + "epoch": 0.23172541905018185, + "grad_norm": 0.515625, + "learning_rate": 0.0016941977425628265, + "loss": 0.0957, + "step": 26695 + }, + { + "epoch": 0.231734099530386, + "grad_norm": 0.49609375, + "learning_rate": 0.0016941753884745787, + "loss": 0.1152, + "step": 26696 + }, + { + "epoch": 0.23174278001059018, + "grad_norm": 0.31640625, + "learning_rate": 0.0016941530337365497, + "loss": 0.1465, + "step": 26697 + }, + { + "epoch": 0.23175146049079434, + "grad_norm": 0.82421875, + "learning_rate": 0.0016941306783487637, + "loss": 0.1445, + "step": 26698 + }, + { + "epoch": 0.2317601409709985, + "grad_norm": 0.279296875, + "learning_rate": 0.0016941083223112458, + "loss": 0.1855, + "step": 26699 + }, + { + "epoch": 0.23176882145120267, + "grad_norm": 0.125, + "learning_rate": 0.0016940859656240195, + "loss": 0.1426, + "step": 26700 + }, + { + "epoch": 0.23177750193140684, + "grad_norm": 0.349609375, + "learning_rate": 0.0016940636082871101, + "loss": 0.0791, + "step": 26701 + }, + { + "epoch": 0.231786182411611, + "grad_norm": 0.134765625, + "learning_rate": 0.0016940412503005412, + "loss": 
0.1084, + "step": 26702 + }, + { + "epoch": 0.23179486289181517, + "grad_norm": 1.265625, + "learning_rate": 0.0016940188916643377, + "loss": 0.1089, + "step": 26703 + }, + { + "epoch": 0.23180354337201933, + "grad_norm": 0.2490234375, + "learning_rate": 0.0016939965323785246, + "loss": 0.127, + "step": 26704 + }, + { + "epoch": 0.2318122238522235, + "grad_norm": 0.5625, + "learning_rate": 0.0016939741724431255, + "loss": 0.1221, + "step": 26705 + }, + { + "epoch": 0.23182090433242766, + "grad_norm": 0.1484375, + "learning_rate": 0.001693951811858165, + "loss": 0.0928, + "step": 26706 + }, + { + "epoch": 0.23182958481263183, + "grad_norm": 0.1533203125, + "learning_rate": 0.001693929450623668, + "loss": 0.1147, + "step": 26707 + }, + { + "epoch": 0.231838265292836, + "grad_norm": 0.228515625, + "learning_rate": 0.0016939070887396583, + "loss": 0.0986, + "step": 26708 + }, + { + "epoch": 0.23184694577304016, + "grad_norm": 0.4140625, + "learning_rate": 0.0016938847262061613, + "loss": 0.1387, + "step": 26709 + }, + { + "epoch": 0.23185562625324432, + "grad_norm": 0.1904296875, + "learning_rate": 0.0016938623630232007, + "loss": 0.0977, + "step": 26710 + }, + { + "epoch": 0.2318643067334485, + "grad_norm": 0.52734375, + "learning_rate": 0.001693839999190801, + "loss": 0.1035, + "step": 26711 + }, + { + "epoch": 0.23187298721365265, + "grad_norm": 0.15234375, + "learning_rate": 0.0016938176347089866, + "loss": 0.1328, + "step": 26712 + }, + { + "epoch": 0.23188166769385682, + "grad_norm": 0.2392578125, + "learning_rate": 0.0016937952695777825, + "loss": 0.1738, + "step": 26713 + }, + { + "epoch": 0.23189034817406098, + "grad_norm": 0.60546875, + "learning_rate": 0.0016937729037972127, + "loss": 0.1328, + "step": 26714 + }, + { + "epoch": 0.23189902865426515, + "grad_norm": 0.64453125, + "learning_rate": 0.0016937505373673019, + "loss": 0.1118, + "step": 26715 + }, + { + "epoch": 0.23190770913446931, + "grad_norm": 0.67578125, + "learning_rate": 0.0016937281702880743, + "loss": 0.1357, + "step": 26716 + }, + { + "epoch": 0.23191638961467348, + "grad_norm": 0.1435546875, + "learning_rate": 0.0016937058025595542, + "loss": 0.1123, + "step": 26717 + }, + { + "epoch": 0.23192507009487764, + "grad_norm": 1.03125, + "learning_rate": 0.001693683434181767, + "loss": 0.1035, + "step": 26718 + }, + { + "epoch": 0.2319337505750818, + "grad_norm": 0.310546875, + "learning_rate": 0.0016936610651547362, + "loss": 0.123, + "step": 26719 + }, + { + "epoch": 0.23194243105528597, + "grad_norm": 0.490234375, + "learning_rate": 0.0016936386954784863, + "loss": 0.1055, + "step": 26720 + }, + { + "epoch": 0.23195111153549014, + "grad_norm": 0.23046875, + "learning_rate": 0.0016936163251530425, + "loss": 0.1191, + "step": 26721 + }, + { + "epoch": 0.2319597920156943, + "grad_norm": 0.50390625, + "learning_rate": 0.0016935939541784287, + "loss": 0.1094, + "step": 26722 + }, + { + "epoch": 0.23196847249589847, + "grad_norm": 0.1376953125, + "learning_rate": 0.001693571582554669, + "loss": 0.0674, + "step": 26723 + }, + { + "epoch": 0.23197715297610264, + "grad_norm": 0.08642578125, + "learning_rate": 0.0016935492102817889, + "loss": 0.0981, + "step": 26724 + }, + { + "epoch": 0.2319858334563068, + "grad_norm": 0.404296875, + "learning_rate": 0.001693526837359812, + "loss": 0.1348, + "step": 26725 + }, + { + "epoch": 0.23199451393651097, + "grad_norm": 0.37890625, + "learning_rate": 0.0016935044637887632, + "loss": 0.124, + "step": 26726 + }, + { + "epoch": 0.23200319441671513, + "grad_norm": 0.1240234375, + 
"learning_rate": 0.001693482089568667, + "loss": 0.0815, + "step": 26727 + }, + { + "epoch": 0.2320118748969193, + "grad_norm": 0.703125, + "learning_rate": 0.0016934597146995476, + "loss": 0.0859, + "step": 26728 + }, + { + "epoch": 0.23202055537712346, + "grad_norm": 1.0390625, + "learning_rate": 0.0016934373391814296, + "loss": 0.1191, + "step": 26729 + }, + { + "epoch": 0.23202923585732763, + "grad_norm": 0.126953125, + "learning_rate": 0.001693414963014337, + "loss": 0.1602, + "step": 26730 + }, + { + "epoch": 0.2320379163375318, + "grad_norm": 0.236328125, + "learning_rate": 0.0016933925861982951, + "loss": 0.0977, + "step": 26731 + }, + { + "epoch": 0.23204659681773596, + "grad_norm": 0.416015625, + "learning_rate": 0.0016933702087333277, + "loss": 0.1094, + "step": 26732 + }, + { + "epoch": 0.23205527729794012, + "grad_norm": 0.45703125, + "learning_rate": 0.00169334783061946, + "loss": 0.0986, + "step": 26733 + }, + { + "epoch": 0.2320639577781443, + "grad_norm": 0.17578125, + "learning_rate": 0.0016933254518567158, + "loss": 0.0674, + "step": 26734 + }, + { + "epoch": 0.23207263825834845, + "grad_norm": 0.828125, + "learning_rate": 0.00169330307244512, + "loss": 0.1855, + "step": 26735 + }, + { + "epoch": 0.23208131873855262, + "grad_norm": 0.283203125, + "learning_rate": 0.0016932806923846966, + "loss": 0.0996, + "step": 26736 + }, + { + "epoch": 0.23208999921875678, + "grad_norm": 0.2373046875, + "learning_rate": 0.0016932583116754704, + "loss": 0.0781, + "step": 26737 + }, + { + "epoch": 0.23209867969896095, + "grad_norm": 0.1533203125, + "learning_rate": 0.0016932359303174658, + "loss": 0.1465, + "step": 26738 + }, + { + "epoch": 0.2321073601791651, + "grad_norm": 0.1708984375, + "learning_rate": 0.0016932135483107074, + "loss": 0.1318, + "step": 26739 + }, + { + "epoch": 0.23211604065936928, + "grad_norm": 0.154296875, + "learning_rate": 0.0016931911656552197, + "loss": 0.1338, + "step": 26740 + }, + { + "epoch": 0.23212472113957344, + "grad_norm": 0.486328125, + "learning_rate": 0.0016931687823510267, + "loss": 0.1123, + "step": 26741 + }, + { + "epoch": 0.2321334016197776, + "grad_norm": 1.28125, + "learning_rate": 0.0016931463983981534, + "loss": 0.126, + "step": 26742 + }, + { + "epoch": 0.23214208209998177, + "grad_norm": 0.310546875, + "learning_rate": 0.0016931240137966243, + "loss": 0.0981, + "step": 26743 + }, + { + "epoch": 0.23215076258018594, + "grad_norm": 0.734375, + "learning_rate": 0.0016931016285464636, + "loss": 0.1099, + "step": 26744 + }, + { + "epoch": 0.2321594430603901, + "grad_norm": 0.2265625, + "learning_rate": 0.0016930792426476957, + "loss": 0.0874, + "step": 26745 + }, + { + "epoch": 0.23216812354059427, + "grad_norm": 0.12890625, + "learning_rate": 0.0016930568561003454, + "loss": 0.1289, + "step": 26746 + }, + { + "epoch": 0.23217680402079843, + "grad_norm": 0.30078125, + "learning_rate": 0.0016930344689044367, + "loss": 0.1162, + "step": 26747 + }, + { + "epoch": 0.2321854845010026, + "grad_norm": 0.09765625, + "learning_rate": 0.0016930120810599947, + "loss": 0.1182, + "step": 26748 + }, + { + "epoch": 0.23219416498120676, + "grad_norm": 0.1640625, + "learning_rate": 0.0016929896925670439, + "loss": 0.0986, + "step": 26749 + }, + { + "epoch": 0.23220284546141093, + "grad_norm": 0.271484375, + "learning_rate": 0.001692967303425608, + "loss": 0.1196, + "step": 26750 + }, + { + "epoch": 0.2322115259416151, + "grad_norm": 0.2490234375, + "learning_rate": 0.001692944913635712, + "loss": 0.1006, + "step": 26751 + }, + { + "epoch": 
0.23222020642181926, + "grad_norm": 0.09228515625, + "learning_rate": 0.0016929225231973805, + "loss": 0.127, + "step": 26752 + }, + { + "epoch": 0.23222888690202342, + "grad_norm": 0.3828125, + "learning_rate": 0.0016929001321106382, + "loss": 0.1123, + "step": 26753 + }, + { + "epoch": 0.2322375673822276, + "grad_norm": 0.1376953125, + "learning_rate": 0.0016928777403755085, + "loss": 0.1084, + "step": 26754 + }, + { + "epoch": 0.23224624786243175, + "grad_norm": 0.1630859375, + "learning_rate": 0.001692855347992017, + "loss": 0.1084, + "step": 26755 + }, + { + "epoch": 0.23225492834263592, + "grad_norm": 0.2109375, + "learning_rate": 0.0016928329549601878, + "loss": 0.1025, + "step": 26756 + }, + { + "epoch": 0.23226360882284008, + "grad_norm": 0.30078125, + "learning_rate": 0.0016928105612800453, + "loss": 0.0938, + "step": 26757 + }, + { + "epoch": 0.23227228930304425, + "grad_norm": 0.2392578125, + "learning_rate": 0.0016927881669516144, + "loss": 0.1328, + "step": 26758 + }, + { + "epoch": 0.23228096978324841, + "grad_norm": 0.1220703125, + "learning_rate": 0.0016927657719749188, + "loss": 0.1299, + "step": 26759 + }, + { + "epoch": 0.23228965026345258, + "grad_norm": 0.36328125, + "learning_rate": 0.0016927433763499838, + "loss": 0.0967, + "step": 26760 + }, + { + "epoch": 0.23229833074365674, + "grad_norm": 0.0859375, + "learning_rate": 0.0016927209800768333, + "loss": 0.0894, + "step": 26761 + }, + { + "epoch": 0.2323070112238609, + "grad_norm": 0.34765625, + "learning_rate": 0.0016926985831554925, + "loss": 0.1289, + "step": 26762 + }, + { + "epoch": 0.23231569170406507, + "grad_norm": 0.2265625, + "learning_rate": 0.0016926761855859847, + "loss": 0.1211, + "step": 26763 + }, + { + "epoch": 0.23232437218426924, + "grad_norm": 0.0986328125, + "learning_rate": 0.0016926537873683354, + "loss": 0.1021, + "step": 26764 + }, + { + "epoch": 0.2323330526644734, + "grad_norm": 0.30078125, + "learning_rate": 0.0016926313885025689, + "loss": 0.1406, + "step": 26765 + }, + { + "epoch": 0.23234173314467757, + "grad_norm": 0.5, + "learning_rate": 0.00169260898898871, + "loss": 0.1494, + "step": 26766 + }, + { + "epoch": 0.23235041362488174, + "grad_norm": 0.2578125, + "learning_rate": 0.001692586588826782, + "loss": 0.168, + "step": 26767 + }, + { + "epoch": 0.2323590941050859, + "grad_norm": 0.14453125, + "learning_rate": 0.0016925641880168108, + "loss": 0.1543, + "step": 26768 + }, + { + "epoch": 0.23236777458529007, + "grad_norm": 0.10302734375, + "learning_rate": 0.0016925417865588201, + "loss": 0.0957, + "step": 26769 + }, + { + "epoch": 0.23237645506549423, + "grad_norm": 0.10498046875, + "learning_rate": 0.0016925193844528348, + "loss": 0.1064, + "step": 26770 + }, + { + "epoch": 0.2323851355456984, + "grad_norm": 0.337890625, + "learning_rate": 0.0016924969816988792, + "loss": 0.1104, + "step": 26771 + }, + { + "epoch": 0.23239381602590256, + "grad_norm": 0.37109375, + "learning_rate": 0.0016924745782969778, + "loss": 0.1074, + "step": 26772 + }, + { + "epoch": 0.23240249650610673, + "grad_norm": 0.466796875, + "learning_rate": 0.0016924521742471548, + "loss": 0.0928, + "step": 26773 + }, + { + "epoch": 0.2324111769863109, + "grad_norm": 0.318359375, + "learning_rate": 0.0016924297695494353, + "loss": 0.1074, + "step": 26774 + }, + { + "epoch": 0.23241985746651506, + "grad_norm": 0.259765625, + "learning_rate": 0.0016924073642038432, + "loss": 0.0835, + "step": 26775 + }, + { + "epoch": 0.23242853794671922, + "grad_norm": 0.3046875, + "learning_rate": 0.0016923849582104037, + "loss": 
0.1211, + "step": 26776 + }, + { + "epoch": 0.2324372184269234, + "grad_norm": 0.11962890625, + "learning_rate": 0.0016923625515691406, + "loss": 0.0791, + "step": 26777 + }, + { + "epoch": 0.23244589890712755, + "grad_norm": 2.625, + "learning_rate": 0.001692340144280079, + "loss": 0.2637, + "step": 26778 + }, + { + "epoch": 0.23245457938733172, + "grad_norm": 0.1943359375, + "learning_rate": 0.001692317736343243, + "loss": 0.1143, + "step": 26779 + }, + { + "epoch": 0.23246325986753588, + "grad_norm": 0.07275390625, + "learning_rate": 0.001692295327758657, + "loss": 0.1035, + "step": 26780 + }, + { + "epoch": 0.23247194034774005, + "grad_norm": 0.2734375, + "learning_rate": 0.001692272918526346, + "loss": 0.1602, + "step": 26781 + }, + { + "epoch": 0.2324806208279442, + "grad_norm": 0.369140625, + "learning_rate": 0.0016922505086463342, + "loss": 0.0771, + "step": 26782 + }, + { + "epoch": 0.23248930130814838, + "grad_norm": 0.90625, + "learning_rate": 0.0016922280981186463, + "loss": 0.127, + "step": 26783 + }, + { + "epoch": 0.23249798178835254, + "grad_norm": 0.11279296875, + "learning_rate": 0.0016922056869433063, + "loss": 0.1338, + "step": 26784 + }, + { + "epoch": 0.2325066622685567, + "grad_norm": 1.109375, + "learning_rate": 0.001692183275120339, + "loss": 0.084, + "step": 26785 + }, + { + "epoch": 0.23251534274876087, + "grad_norm": 0.4453125, + "learning_rate": 0.0016921608626497694, + "loss": 0.0957, + "step": 26786 + }, + { + "epoch": 0.23252402322896504, + "grad_norm": 0.44921875, + "learning_rate": 0.001692138449531621, + "loss": 0.0938, + "step": 26787 + }, + { + "epoch": 0.2325327037091692, + "grad_norm": 0.44921875, + "learning_rate": 0.0016921160357659193, + "loss": 0.1602, + "step": 26788 + }, + { + "epoch": 0.23254138418937337, + "grad_norm": 0.1435546875, + "learning_rate": 0.0016920936213526883, + "loss": 0.1025, + "step": 26789 + }, + { + "epoch": 0.23255006466957753, + "grad_norm": 0.6171875, + "learning_rate": 0.0016920712062919526, + "loss": 0.1543, + "step": 26790 + }, + { + "epoch": 0.2325587451497817, + "grad_norm": 1.0, + "learning_rate": 0.0016920487905837367, + "loss": 0.1182, + "step": 26791 + }, + { + "epoch": 0.23256742562998586, + "grad_norm": 0.173828125, + "learning_rate": 0.0016920263742280653, + "loss": 0.1152, + "step": 26792 + }, + { + "epoch": 0.23257610611019003, + "grad_norm": 0.1435546875, + "learning_rate": 0.0016920039572249628, + "loss": 0.1348, + "step": 26793 + }, + { + "epoch": 0.2325847865903942, + "grad_norm": 0.2578125, + "learning_rate": 0.0016919815395744534, + "loss": 0.1196, + "step": 26794 + }, + { + "epoch": 0.23259346707059836, + "grad_norm": 0.51171875, + "learning_rate": 0.001691959121276562, + "loss": 0.1631, + "step": 26795 + }, + { + "epoch": 0.23260214755080252, + "grad_norm": 0.447265625, + "learning_rate": 0.0016919367023313131, + "loss": 0.124, + "step": 26796 + }, + { + "epoch": 0.2326108280310067, + "grad_norm": 0.2041015625, + "learning_rate": 0.0016919142827387307, + "loss": 0.124, + "step": 26797 + }, + { + "epoch": 0.23261950851121085, + "grad_norm": 0.08251953125, + "learning_rate": 0.0016918918624988402, + "loss": 0.082, + "step": 26798 + }, + { + "epoch": 0.232628188991415, + "grad_norm": 0.478515625, + "learning_rate": 0.0016918694416116653, + "loss": 0.1084, + "step": 26799 + }, + { + "epoch": 0.23263686947161916, + "grad_norm": 0.2470703125, + "learning_rate": 0.0016918470200772316, + "loss": 0.1523, + "step": 26800 + }, + { + "epoch": 0.23264554995182332, + "grad_norm": 0.07763671875, + "learning_rate": 
0.0016918245978955622, + "loss": 0.1245, + "step": 26801 + }, + { + "epoch": 0.2326542304320275, + "grad_norm": 0.1083984375, + "learning_rate": 0.0016918021750666826, + "loss": 0.0762, + "step": 26802 + }, + { + "epoch": 0.23266291091223165, + "grad_norm": 0.59375, + "learning_rate": 0.0016917797515906168, + "loss": 0.1406, + "step": 26803 + }, + { + "epoch": 0.23267159139243582, + "grad_norm": 0.33203125, + "learning_rate": 0.0016917573274673895, + "loss": 0.123, + "step": 26804 + }, + { + "epoch": 0.23268027187263998, + "grad_norm": 0.28515625, + "learning_rate": 0.0016917349026970257, + "loss": 0.1035, + "step": 26805 + }, + { + "epoch": 0.23268895235284415, + "grad_norm": 0.59765625, + "learning_rate": 0.0016917124772795492, + "loss": 0.1035, + "step": 26806 + }, + { + "epoch": 0.2326976328330483, + "grad_norm": 0.09423828125, + "learning_rate": 0.001691690051214985, + "loss": 0.1006, + "step": 26807 + }, + { + "epoch": 0.23270631331325248, + "grad_norm": 0.23046875, + "learning_rate": 0.001691667624503357, + "loss": 0.1025, + "step": 26808 + }, + { + "epoch": 0.23271499379345664, + "grad_norm": 0.423828125, + "learning_rate": 0.0016916451971446904, + "loss": 0.0811, + "step": 26809 + }, + { + "epoch": 0.2327236742736608, + "grad_norm": 0.69140625, + "learning_rate": 0.00169162276913901, + "loss": 0.127, + "step": 26810 + }, + { + "epoch": 0.23273235475386497, + "grad_norm": 0.1708984375, + "learning_rate": 0.0016916003404863391, + "loss": 0.0854, + "step": 26811 + }, + { + "epoch": 0.23274103523406914, + "grad_norm": 0.796875, + "learning_rate": 0.0016915779111867034, + "loss": 0.0879, + "step": 26812 + }, + { + "epoch": 0.2327497157142733, + "grad_norm": 0.26953125, + "learning_rate": 0.0016915554812401267, + "loss": 0.0894, + "step": 26813 + }, + { + "epoch": 0.23275839619447747, + "grad_norm": 0.12353515625, + "learning_rate": 0.001691533050646634, + "loss": 0.0977, + "step": 26814 + }, + { + "epoch": 0.23276707667468163, + "grad_norm": 0.1357421875, + "learning_rate": 0.0016915106194062498, + "loss": 0.1035, + "step": 26815 + }, + { + "epoch": 0.2327757571548858, + "grad_norm": 0.8671875, + "learning_rate": 0.0016914881875189983, + "loss": 0.1445, + "step": 26816 + }, + { + "epoch": 0.23278443763508996, + "grad_norm": 0.33203125, + "learning_rate": 0.001691465754984904, + "loss": 0.0986, + "step": 26817 + }, + { + "epoch": 0.23279311811529413, + "grad_norm": 0.421875, + "learning_rate": 0.001691443321803992, + "loss": 0.1196, + "step": 26818 + }, + { + "epoch": 0.2328017985954983, + "grad_norm": 0.30859375, + "learning_rate": 0.0016914208879762862, + "loss": 0.1289, + "step": 26819 + }, + { + "epoch": 0.23281047907570246, + "grad_norm": 0.54296875, + "learning_rate": 0.0016913984535018116, + "loss": 0.124, + "step": 26820 + }, + { + "epoch": 0.23281915955590662, + "grad_norm": 0.353515625, + "learning_rate": 0.0016913760183805925, + "loss": 0.1113, + "step": 26821 + }, + { + "epoch": 0.2328278400361108, + "grad_norm": 0.4765625, + "learning_rate": 0.0016913535826126535, + "loss": 0.1245, + "step": 26822 + }, + { + "epoch": 0.23283652051631495, + "grad_norm": 0.97265625, + "learning_rate": 0.0016913311461980189, + "loss": 0.1025, + "step": 26823 + }, + { + "epoch": 0.23284520099651912, + "grad_norm": 0.28515625, + "learning_rate": 0.0016913087091367134, + "loss": 0.0928, + "step": 26824 + }, + { + "epoch": 0.23285388147672328, + "grad_norm": 0.12158203125, + "learning_rate": 0.0016912862714287618, + "loss": 0.1211, + "step": 26825 + }, + { + "epoch": 0.23286256195692745, + 
"grad_norm": 0.5546875, + "learning_rate": 0.0016912638330741888, + "loss": 0.1445, + "step": 26826 + }, + { + "epoch": 0.23287124243713161, + "grad_norm": 0.0908203125, + "learning_rate": 0.0016912413940730177, + "loss": 0.1064, + "step": 26827 + }, + { + "epoch": 0.23287992291733578, + "grad_norm": 0.3671875, + "learning_rate": 0.0016912189544252742, + "loss": 0.0801, + "step": 26828 + }, + { + "epoch": 0.23288860339753994, + "grad_norm": 0.08740234375, + "learning_rate": 0.0016911965141309829, + "loss": 0.1465, + "step": 26829 + }, + { + "epoch": 0.2328972838777441, + "grad_norm": 0.8203125, + "learning_rate": 0.0016911740731901676, + "loss": 0.1523, + "step": 26830 + }, + { + "epoch": 0.23290596435794828, + "grad_norm": 0.76953125, + "learning_rate": 0.0016911516316028535, + "loss": 0.1206, + "step": 26831 + }, + { + "epoch": 0.23291464483815244, + "grad_norm": 0.125, + "learning_rate": 0.0016911291893690645, + "loss": 0.1123, + "step": 26832 + }, + { + "epoch": 0.2329233253183566, + "grad_norm": 0.65625, + "learning_rate": 0.001691106746488826, + "loss": 0.0908, + "step": 26833 + }, + { + "epoch": 0.23293200579856077, + "grad_norm": 0.173828125, + "learning_rate": 0.0016910843029621617, + "loss": 0.0879, + "step": 26834 + }, + { + "epoch": 0.23294068627876494, + "grad_norm": 0.166015625, + "learning_rate": 0.0016910618587890965, + "loss": 0.0762, + "step": 26835 + }, + { + "epoch": 0.2329493667589691, + "grad_norm": 0.283203125, + "learning_rate": 0.001691039413969655, + "loss": 0.1182, + "step": 26836 + }, + { + "epoch": 0.23295804723917327, + "grad_norm": 0.55859375, + "learning_rate": 0.0016910169685038616, + "loss": 0.1191, + "step": 26837 + }, + { + "epoch": 0.23296672771937743, + "grad_norm": 0.1650390625, + "learning_rate": 0.001690994522391741, + "loss": 0.0903, + "step": 26838 + }, + { + "epoch": 0.2329754081995816, + "grad_norm": 0.55859375, + "learning_rate": 0.0016909720756333175, + "loss": 0.0947, + "step": 26839 + }, + { + "epoch": 0.23298408867978576, + "grad_norm": 0.5625, + "learning_rate": 0.001690949628228616, + "loss": 0.1445, + "step": 26840 + }, + { + "epoch": 0.23299276915998993, + "grad_norm": 0.232421875, + "learning_rate": 0.0016909271801776608, + "loss": 0.085, + "step": 26841 + }, + { + "epoch": 0.2330014496401941, + "grad_norm": 0.1728515625, + "learning_rate": 0.0016909047314804765, + "loss": 0.125, + "step": 26842 + }, + { + "epoch": 0.23301013012039826, + "grad_norm": 0.1298828125, + "learning_rate": 0.0016908822821370877, + "loss": 0.1172, + "step": 26843 + }, + { + "epoch": 0.23301881060060242, + "grad_norm": 0.21484375, + "learning_rate": 0.0016908598321475191, + "loss": 0.1172, + "step": 26844 + }, + { + "epoch": 0.2330274910808066, + "grad_norm": 0.2490234375, + "learning_rate": 0.0016908373815117948, + "loss": 0.1172, + "step": 26845 + }, + { + "epoch": 0.23303617156101075, + "grad_norm": 0.63671875, + "learning_rate": 0.0016908149302299396, + "loss": 0.1328, + "step": 26846 + }, + { + "epoch": 0.23304485204121492, + "grad_norm": 0.1025390625, + "learning_rate": 0.0016907924783019783, + "loss": 0.1118, + "step": 26847 + }, + { + "epoch": 0.23305353252141908, + "grad_norm": 0.396484375, + "learning_rate": 0.0016907700257279353, + "loss": 0.1338, + "step": 26848 + }, + { + "epoch": 0.23306221300162325, + "grad_norm": 0.435546875, + "learning_rate": 0.0016907475725078348, + "loss": 0.0928, + "step": 26849 + }, + { + "epoch": 0.2330708934818274, + "grad_norm": 0.06982421875, + "learning_rate": 0.0016907251186417016, + "loss": 0.0972, + "step": 26850 + 
}, + { + "epoch": 0.23307957396203158, + "grad_norm": 0.2236328125, + "learning_rate": 0.0016907026641295602, + "loss": 0.1113, + "step": 26851 + }, + { + "epoch": 0.23308825444223574, + "grad_norm": 0.162109375, + "learning_rate": 0.0016906802089714352, + "loss": 0.0757, + "step": 26852 + }, + { + "epoch": 0.2330969349224399, + "grad_norm": 0.142578125, + "learning_rate": 0.0016906577531673518, + "loss": 0.0747, + "step": 26853 + }, + { + "epoch": 0.23310561540264407, + "grad_norm": 0.10009765625, + "learning_rate": 0.0016906352967173332, + "loss": 0.0952, + "step": 26854 + }, + { + "epoch": 0.23311429588284824, + "grad_norm": 0.361328125, + "learning_rate": 0.0016906128396214054, + "loss": 0.0786, + "step": 26855 + }, + { + "epoch": 0.2331229763630524, + "grad_norm": 0.3203125, + "learning_rate": 0.0016905903818795918, + "loss": 0.1001, + "step": 26856 + }, + { + "epoch": 0.23313165684325657, + "grad_norm": 0.09619140625, + "learning_rate": 0.0016905679234919177, + "loss": 0.1055, + "step": 26857 + }, + { + "epoch": 0.23314033732346073, + "grad_norm": 0.41796875, + "learning_rate": 0.0016905454644584074, + "loss": 0.0938, + "step": 26858 + }, + { + "epoch": 0.2331490178036649, + "grad_norm": 0.390625, + "learning_rate": 0.0016905230047790853, + "loss": 0.0688, + "step": 26859 + }, + { + "epoch": 0.23315769828386906, + "grad_norm": 0.390625, + "learning_rate": 0.001690500544453976, + "loss": 0.0791, + "step": 26860 + }, + { + "epoch": 0.23316637876407323, + "grad_norm": 0.6015625, + "learning_rate": 0.0016904780834831044, + "loss": 0.1318, + "step": 26861 + }, + { + "epoch": 0.2331750592442774, + "grad_norm": 0.140625, + "learning_rate": 0.001690455621866495, + "loss": 0.1133, + "step": 26862 + }, + { + "epoch": 0.23318373972448156, + "grad_norm": 0.34375, + "learning_rate": 0.0016904331596041715, + "loss": 0.1113, + "step": 26863 + }, + { + "epoch": 0.23319242020468572, + "grad_norm": 0.32421875, + "learning_rate": 0.0016904106966961598, + "loss": 0.1016, + "step": 26864 + }, + { + "epoch": 0.2332011006848899, + "grad_norm": 0.470703125, + "learning_rate": 0.0016903882331424838, + "loss": 0.1064, + "step": 26865 + }, + { + "epoch": 0.23320978116509405, + "grad_norm": 0.0908203125, + "learning_rate": 0.001690365768943168, + "loss": 0.082, + "step": 26866 + }, + { + "epoch": 0.23321846164529822, + "grad_norm": 0.2265625, + "learning_rate": 0.0016903433040982368, + "loss": 0.1289, + "step": 26867 + }, + { + "epoch": 0.23322714212550238, + "grad_norm": 0.2158203125, + "learning_rate": 0.0016903208386077153, + "loss": 0.0952, + "step": 26868 + }, + { + "epoch": 0.23323582260570655, + "grad_norm": 0.302734375, + "learning_rate": 0.0016902983724716278, + "loss": 0.0938, + "step": 26869 + }, + { + "epoch": 0.23324450308591071, + "grad_norm": 0.2890625, + "learning_rate": 0.001690275905689999, + "loss": 0.0718, + "step": 26870 + }, + { + "epoch": 0.23325318356611488, + "grad_norm": 0.2138671875, + "learning_rate": 0.0016902534382628532, + "loss": 0.104, + "step": 26871 + }, + { + "epoch": 0.23326186404631905, + "grad_norm": 0.5859375, + "learning_rate": 0.0016902309701902149, + "loss": 0.1504, + "step": 26872 + }, + { + "epoch": 0.2332705445265232, + "grad_norm": 0.142578125, + "learning_rate": 0.0016902085014721092, + "loss": 0.0684, + "step": 26873 + }, + { + "epoch": 0.23327922500672738, + "grad_norm": 0.169921875, + "learning_rate": 0.0016901860321085602, + "loss": 0.1299, + "step": 26874 + }, + { + "epoch": 0.23328790548693154, + "grad_norm": 0.609375, + "learning_rate": 
0.0016901635620995927, + "loss": 0.1226, + "step": 26875 + }, + { + "epoch": 0.2332965859671357, + "grad_norm": 0.169921875, + "learning_rate": 0.0016901410914452312, + "loss": 0.127, + "step": 26876 + }, + { + "epoch": 0.23330526644733987, + "grad_norm": 0.0888671875, + "learning_rate": 0.0016901186201455, + "loss": 0.0889, + "step": 26877 + }, + { + "epoch": 0.23331394692754404, + "grad_norm": 0.1318359375, + "learning_rate": 0.001690096148200424, + "loss": 0.1064, + "step": 26878 + }, + { + "epoch": 0.2333226274077482, + "grad_norm": 0.306640625, + "learning_rate": 0.0016900736756100278, + "loss": 0.0952, + "step": 26879 + }, + { + "epoch": 0.23333130788795237, + "grad_norm": 0.609375, + "learning_rate": 0.0016900512023743362, + "loss": 0.1729, + "step": 26880 + }, + { + "epoch": 0.23333998836815653, + "grad_norm": 0.08935546875, + "learning_rate": 0.0016900287284933734, + "loss": 0.1445, + "step": 26881 + }, + { + "epoch": 0.2333486688483607, + "grad_norm": 0.06640625, + "learning_rate": 0.0016900062539671637, + "loss": 0.0781, + "step": 26882 + }, + { + "epoch": 0.23335734932856486, + "grad_norm": 0.359375, + "learning_rate": 0.0016899837787957325, + "loss": 0.123, + "step": 26883 + }, + { + "epoch": 0.23336602980876903, + "grad_norm": 0.2158203125, + "learning_rate": 0.0016899613029791032, + "loss": 0.1016, + "step": 26884 + }, + { + "epoch": 0.2333747102889732, + "grad_norm": 0.333984375, + "learning_rate": 0.0016899388265173014, + "loss": 0.0923, + "step": 26885 + }, + { + "epoch": 0.23338339076917736, + "grad_norm": 0.333984375, + "learning_rate": 0.0016899163494103516, + "loss": 0.1299, + "step": 26886 + }, + { + "epoch": 0.23339207124938152, + "grad_norm": 0.14453125, + "learning_rate": 0.001689893871658278, + "loss": 0.1553, + "step": 26887 + }, + { + "epoch": 0.2334007517295857, + "grad_norm": 0.11376953125, + "learning_rate": 0.0016898713932611054, + "loss": 0.104, + "step": 26888 + }, + { + "epoch": 0.23340943220978985, + "grad_norm": 0.458984375, + "learning_rate": 0.0016898489142188585, + "loss": 0.1279, + "step": 26889 + }, + { + "epoch": 0.23341811268999402, + "grad_norm": 0.22265625, + "learning_rate": 0.001689826434531561, + "loss": 0.0903, + "step": 26890 + }, + { + "epoch": 0.23342679317019818, + "grad_norm": 0.3046875, + "learning_rate": 0.0016898039541992386, + "loss": 0.1104, + "step": 26891 + }, + { + "epoch": 0.23343547365040235, + "grad_norm": 0.50390625, + "learning_rate": 0.0016897814732219157, + "loss": 0.1582, + "step": 26892 + }, + { + "epoch": 0.2334441541306065, + "grad_norm": 0.08154296875, + "learning_rate": 0.0016897589915996164, + "loss": 0.0806, + "step": 26893 + }, + { + "epoch": 0.23345283461081068, + "grad_norm": 0.185546875, + "learning_rate": 0.0016897365093323654, + "loss": 0.0742, + "step": 26894 + }, + { + "epoch": 0.23346151509101484, + "grad_norm": 0.14453125, + "learning_rate": 0.0016897140264201876, + "loss": 0.061, + "step": 26895 + }, + { + "epoch": 0.233470195571219, + "grad_norm": 0.251953125, + "learning_rate": 0.0016896915428631072, + "loss": 0.1113, + "step": 26896 + }, + { + "epoch": 0.23347887605142317, + "grad_norm": 0.400390625, + "learning_rate": 0.0016896690586611493, + "loss": 0.1113, + "step": 26897 + }, + { + "epoch": 0.23348755653162734, + "grad_norm": 0.08349609375, + "learning_rate": 0.0016896465738143378, + "loss": 0.0874, + "step": 26898 + }, + { + "epoch": 0.2334962370118315, + "grad_norm": 0.29296875, + "learning_rate": 0.001689624088322698, + "loss": 0.1348, + "step": 26899 + }, + { + "epoch": 0.23350491749203567, 
+ "grad_norm": 0.8046875, + "learning_rate": 0.001689601602186254, + "loss": 0.1147, + "step": 26900 + }, + { + "epoch": 0.23351359797223983, + "grad_norm": 0.275390625, + "learning_rate": 0.0016895791154050304, + "loss": 0.0825, + "step": 26901 + }, + { + "epoch": 0.233522278452444, + "grad_norm": 0.181640625, + "learning_rate": 0.0016895566279790523, + "loss": 0.126, + "step": 26902 + }, + { + "epoch": 0.23353095893264816, + "grad_norm": 0.2275390625, + "learning_rate": 0.0016895341399083437, + "loss": 0.1104, + "step": 26903 + }, + { + "epoch": 0.23353963941285233, + "grad_norm": 0.0947265625, + "learning_rate": 0.0016895116511929295, + "loss": 0.1104, + "step": 26904 + }, + { + "epoch": 0.2335483198930565, + "grad_norm": 0.353515625, + "learning_rate": 0.001689489161832834, + "loss": 0.1094, + "step": 26905 + }, + { + "epoch": 0.23355700037326066, + "grad_norm": 0.1982421875, + "learning_rate": 0.0016894666718280822, + "loss": 0.123, + "step": 26906 + }, + { + "epoch": 0.23356568085346482, + "grad_norm": 0.06787109375, + "learning_rate": 0.0016894441811786988, + "loss": 0.0811, + "step": 26907 + }, + { + "epoch": 0.233574361333669, + "grad_norm": 0.47265625, + "learning_rate": 0.0016894216898847075, + "loss": 0.1216, + "step": 26908 + }, + { + "epoch": 0.23358304181387315, + "grad_norm": 0.130859375, + "learning_rate": 0.001689399197946134, + "loss": 0.125, + "step": 26909 + }, + { + "epoch": 0.23359172229407732, + "grad_norm": 0.2275390625, + "learning_rate": 0.0016893767053630018, + "loss": 0.0938, + "step": 26910 + }, + { + "epoch": 0.23360040277428148, + "grad_norm": 0.130859375, + "learning_rate": 0.0016893542121353368, + "loss": 0.085, + "step": 26911 + }, + { + "epoch": 0.23360908325448565, + "grad_norm": 0.625, + "learning_rate": 0.0016893317182631624, + "loss": 0.1006, + "step": 26912 + }, + { + "epoch": 0.23361776373468982, + "grad_norm": 0.39453125, + "learning_rate": 0.001689309223746504, + "loss": 0.1211, + "step": 26913 + }, + { + "epoch": 0.23362644421489398, + "grad_norm": 0.119140625, + "learning_rate": 0.0016892867285853857, + "loss": 0.0967, + "step": 26914 + }, + { + "epoch": 0.23363512469509815, + "grad_norm": 0.267578125, + "learning_rate": 0.0016892642327798323, + "loss": 0.1016, + "step": 26915 + }, + { + "epoch": 0.2336438051753023, + "grad_norm": 0.2021484375, + "learning_rate": 0.0016892417363298684, + "loss": 0.1357, + "step": 26916 + }, + { + "epoch": 0.23365248565550648, + "grad_norm": 0.921875, + "learning_rate": 0.0016892192392355186, + "loss": 0.1001, + "step": 26917 + }, + { + "epoch": 0.23366116613571064, + "grad_norm": 0.2158203125, + "learning_rate": 0.0016891967414968075, + "loss": 0.1484, + "step": 26918 + }, + { + "epoch": 0.2336698466159148, + "grad_norm": 0.2041015625, + "learning_rate": 0.0016891742431137595, + "loss": 0.0781, + "step": 26919 + }, + { + "epoch": 0.23367852709611897, + "grad_norm": 0.30078125, + "learning_rate": 0.0016891517440863998, + "loss": 0.1113, + "step": 26920 + }, + { + "epoch": 0.23368720757632314, + "grad_norm": 1.0, + "learning_rate": 0.0016891292444147523, + "loss": 0.1338, + "step": 26921 + }, + { + "epoch": 0.23369588805652727, + "grad_norm": 0.353515625, + "learning_rate": 0.0016891067440988421, + "loss": 0.083, + "step": 26922 + }, + { + "epoch": 0.23370456853673144, + "grad_norm": 0.10546875, + "learning_rate": 0.0016890842431386934, + "loss": 0.123, + "step": 26923 + }, + { + "epoch": 0.2337132490169356, + "grad_norm": 1.1953125, + "learning_rate": 0.001689061741534331, + "loss": 0.1367, + "step": 26924 + }, 
+ { + "epoch": 0.23372192949713977, + "grad_norm": 0.125, + "learning_rate": 0.0016890392392857798, + "loss": 0.1045, + "step": 26925 + }, + { + "epoch": 0.23373060997734393, + "grad_norm": 0.3125, + "learning_rate": 0.001689016736393064, + "loss": 0.0947, + "step": 26926 + }, + { + "epoch": 0.2337392904575481, + "grad_norm": 0.5546875, + "learning_rate": 0.0016889942328562084, + "loss": 0.126, + "step": 26927 + }, + { + "epoch": 0.23374797093775226, + "grad_norm": 0.90234375, + "learning_rate": 0.0016889717286752376, + "loss": 0.1143, + "step": 26928 + }, + { + "epoch": 0.23375665141795643, + "grad_norm": 0.2578125, + "learning_rate": 0.0016889492238501757, + "loss": 0.0957, + "step": 26929 + }, + { + "epoch": 0.2337653318981606, + "grad_norm": 1.59375, + "learning_rate": 0.0016889267183810483, + "loss": 0.1396, + "step": 26930 + }, + { + "epoch": 0.23377401237836476, + "grad_norm": 0.1845703125, + "learning_rate": 0.0016889042122678797, + "loss": 0.0962, + "step": 26931 + }, + { + "epoch": 0.23378269285856892, + "grad_norm": 0.66796875, + "learning_rate": 0.0016888817055106939, + "loss": 0.1445, + "step": 26932 + }, + { + "epoch": 0.2337913733387731, + "grad_norm": 0.1669921875, + "learning_rate": 0.0016888591981095156, + "loss": 0.127, + "step": 26933 + }, + { + "epoch": 0.23380005381897725, + "grad_norm": 0.177734375, + "learning_rate": 0.0016888366900643704, + "loss": 0.127, + "step": 26934 + }, + { + "epoch": 0.23380873429918142, + "grad_norm": 0.7265625, + "learning_rate": 0.0016888141813752818, + "loss": 0.1108, + "step": 26935 + }, + { + "epoch": 0.23381741477938559, + "grad_norm": 0.5, + "learning_rate": 0.0016887916720422752, + "loss": 0.1279, + "step": 26936 + }, + { + "epoch": 0.23382609525958975, + "grad_norm": 0.2451171875, + "learning_rate": 0.0016887691620653747, + "loss": 0.1113, + "step": 26937 + }, + { + "epoch": 0.23383477573979392, + "grad_norm": 0.78125, + "learning_rate": 0.0016887466514446047, + "loss": 0.1396, + "step": 26938 + }, + { + "epoch": 0.23384345621999808, + "grad_norm": 0.447265625, + "learning_rate": 0.0016887241401799908, + "loss": 0.1196, + "step": 26939 + }, + { + "epoch": 0.23385213670020225, + "grad_norm": 0.11572265625, + "learning_rate": 0.0016887016282715567, + "loss": 0.124, + "step": 26940 + }, + { + "epoch": 0.2338608171804064, + "grad_norm": 0.2060546875, + "learning_rate": 0.0016886791157193274, + "loss": 0.124, + "step": 26941 + }, + { + "epoch": 0.23386949766061058, + "grad_norm": 0.73828125, + "learning_rate": 0.0016886566025233276, + "loss": 0.0918, + "step": 26942 + }, + { + "epoch": 0.23387817814081474, + "grad_norm": 0.220703125, + "learning_rate": 0.0016886340886835816, + "loss": 0.1338, + "step": 26943 + }, + { + "epoch": 0.2338868586210189, + "grad_norm": 0.87109375, + "learning_rate": 0.0016886115742001143, + "loss": 0.1289, + "step": 26944 + }, + { + "epoch": 0.23389553910122307, + "grad_norm": 0.1630859375, + "learning_rate": 0.0016885890590729502, + "loss": 0.0986, + "step": 26945 + }, + { + "epoch": 0.23390421958142724, + "grad_norm": 0.076171875, + "learning_rate": 0.001688566543302114, + "loss": 0.0796, + "step": 26946 + }, + { + "epoch": 0.2339129000616314, + "grad_norm": 0.12353515625, + "learning_rate": 0.0016885440268876303, + "loss": 0.1279, + "step": 26947 + }, + { + "epoch": 0.23392158054183557, + "grad_norm": 0.09130859375, + "learning_rate": 0.0016885215098295238, + "loss": 0.0957, + "step": 26948 + }, + { + "epoch": 0.23393026102203973, + "grad_norm": 0.18359375, + "learning_rate": 0.0016884989921278184, + 
"loss": 0.0835, + "step": 26949 + }, + { + "epoch": 0.2339389415022439, + "grad_norm": 0.216796875, + "learning_rate": 0.00168847647378254, + "loss": 0.1455, + "step": 26950 + }, + { + "epoch": 0.23394762198244806, + "grad_norm": 0.33984375, + "learning_rate": 0.0016884539547937123, + "loss": 0.1396, + "step": 26951 + }, + { + "epoch": 0.23395630246265223, + "grad_norm": 0.251953125, + "learning_rate": 0.00168843143516136, + "loss": 0.0942, + "step": 26952 + }, + { + "epoch": 0.2339649829428564, + "grad_norm": 0.1162109375, + "learning_rate": 0.0016884089148855085, + "loss": 0.1094, + "step": 26953 + }, + { + "epoch": 0.23397366342306056, + "grad_norm": 0.451171875, + "learning_rate": 0.0016883863939661813, + "loss": 0.0977, + "step": 26954 + }, + { + "epoch": 0.23398234390326472, + "grad_norm": 0.75390625, + "learning_rate": 0.0016883638724034039, + "loss": 0.0928, + "step": 26955 + }, + { + "epoch": 0.2339910243834689, + "grad_norm": 0.248046875, + "learning_rate": 0.0016883413501972008, + "loss": 0.1133, + "step": 26956 + }, + { + "epoch": 0.23399970486367305, + "grad_norm": 0.1474609375, + "learning_rate": 0.0016883188273475963, + "loss": 0.0845, + "step": 26957 + }, + { + "epoch": 0.23400838534387722, + "grad_norm": 0.2177734375, + "learning_rate": 0.0016882963038546146, + "loss": 0.127, + "step": 26958 + }, + { + "epoch": 0.23401706582408138, + "grad_norm": 0.7421875, + "learning_rate": 0.0016882737797182814, + "loss": 0.1104, + "step": 26959 + }, + { + "epoch": 0.23402574630428555, + "grad_norm": 0.11083984375, + "learning_rate": 0.0016882512549386208, + "loss": 0.0977, + "step": 26960 + }, + { + "epoch": 0.2340344267844897, + "grad_norm": 0.39453125, + "learning_rate": 0.0016882287295156573, + "loss": 0.1006, + "step": 26961 + }, + { + "epoch": 0.23404310726469388, + "grad_norm": 0.3046875, + "learning_rate": 0.001688206203449416, + "loss": 0.1221, + "step": 26962 + }, + { + "epoch": 0.23405178774489804, + "grad_norm": 0.10791015625, + "learning_rate": 0.001688183676739921, + "loss": 0.1123, + "step": 26963 + }, + { + "epoch": 0.2340604682251022, + "grad_norm": 0.375, + "learning_rate": 0.0016881611493871974, + "loss": 0.0786, + "step": 26964 + }, + { + "epoch": 0.23406914870530637, + "grad_norm": 0.6796875, + "learning_rate": 0.0016881386213912692, + "loss": 0.1289, + "step": 26965 + }, + { + "epoch": 0.23407782918551054, + "grad_norm": 0.1259765625, + "learning_rate": 0.0016881160927521618, + "loss": 0.1138, + "step": 26966 + }, + { + "epoch": 0.2340865096657147, + "grad_norm": 0.2119140625, + "learning_rate": 0.0016880935634698993, + "loss": 0.125, + "step": 26967 + }, + { + "epoch": 0.23409519014591887, + "grad_norm": 0.2197265625, + "learning_rate": 0.0016880710335445066, + "loss": 0.085, + "step": 26968 + }, + { + "epoch": 0.23410387062612303, + "grad_norm": 0.115234375, + "learning_rate": 0.0016880485029760084, + "loss": 0.1089, + "step": 26969 + }, + { + "epoch": 0.2341125511063272, + "grad_norm": 0.1396484375, + "learning_rate": 0.0016880259717644288, + "loss": 0.1035, + "step": 26970 + }, + { + "epoch": 0.23412123158653136, + "grad_norm": 0.349609375, + "learning_rate": 0.001688003439909793, + "loss": 0.1094, + "step": 26971 + }, + { + "epoch": 0.23412991206673553, + "grad_norm": 0.470703125, + "learning_rate": 0.0016879809074121256, + "loss": 0.123, + "step": 26972 + }, + { + "epoch": 0.2341385925469397, + "grad_norm": 0.1943359375, + "learning_rate": 0.0016879583742714513, + "loss": 0.1064, + "step": 26973 + }, + { + "epoch": 0.23414727302714386, + "grad_norm": 
0.53515625, + "learning_rate": 0.0016879358404877942, + "loss": 0.127, + "step": 26974 + }, + { + "epoch": 0.23415595350734802, + "grad_norm": 0.1298828125, + "learning_rate": 0.0016879133060611793, + "loss": 0.1045, + "step": 26975 + }, + { + "epoch": 0.2341646339875522, + "grad_norm": 0.09521484375, + "learning_rate": 0.001687890770991631, + "loss": 0.1079, + "step": 26976 + }, + { + "epoch": 0.23417331446775635, + "grad_norm": 0.318359375, + "learning_rate": 0.001687868235279175, + "loss": 0.0933, + "step": 26977 + }, + { + "epoch": 0.23418199494796052, + "grad_norm": 0.1806640625, + "learning_rate": 0.0016878456989238347, + "loss": 0.0898, + "step": 26978 + }, + { + "epoch": 0.23419067542816469, + "grad_norm": 0.1572265625, + "learning_rate": 0.001687823161925635, + "loss": 0.0874, + "step": 26979 + }, + { + "epoch": 0.23419935590836885, + "grad_norm": 0.359375, + "learning_rate": 0.0016878006242846007, + "loss": 0.1104, + "step": 26980 + }, + { + "epoch": 0.23420803638857302, + "grad_norm": 0.10986328125, + "learning_rate": 0.0016877780860007567, + "loss": 0.1113, + "step": 26981 + }, + { + "epoch": 0.23421671686877718, + "grad_norm": 0.130859375, + "learning_rate": 0.0016877555470741273, + "loss": 0.0737, + "step": 26982 + }, + { + "epoch": 0.23422539734898135, + "grad_norm": 0.6484375, + "learning_rate": 0.001687733007504737, + "loss": 0.1484, + "step": 26983 + }, + { + "epoch": 0.2342340778291855, + "grad_norm": 0.78125, + "learning_rate": 0.001687710467292611, + "loss": 0.1099, + "step": 26984 + }, + { + "epoch": 0.23424275830938968, + "grad_norm": 0.400390625, + "learning_rate": 0.001687687926437774, + "loss": 0.085, + "step": 26985 + }, + { + "epoch": 0.23425143878959384, + "grad_norm": 0.32421875, + "learning_rate": 0.00168766538494025, + "loss": 0.0859, + "step": 26986 + }, + { + "epoch": 0.234260119269798, + "grad_norm": 0.142578125, + "learning_rate": 0.001687642842800064, + "loss": 0.123, + "step": 26987 + }, + { + "epoch": 0.23426879975000217, + "grad_norm": 0.166015625, + "learning_rate": 0.0016876203000172404, + "loss": 0.1387, + "step": 26988 + }, + { + "epoch": 0.23427748023020634, + "grad_norm": 0.435546875, + "learning_rate": 0.001687597756591804, + "loss": 0.0757, + "step": 26989 + }, + { + "epoch": 0.2342861607104105, + "grad_norm": 0.546875, + "learning_rate": 0.00168757521252378, + "loss": 0.1089, + "step": 26990 + }, + { + "epoch": 0.23429484119061467, + "grad_norm": 0.7578125, + "learning_rate": 0.0016875526678131925, + "loss": 0.0898, + "step": 26991 + }, + { + "epoch": 0.23430352167081883, + "grad_norm": 0.09716796875, + "learning_rate": 0.001687530122460066, + "loss": 0.1045, + "step": 26992 + }, + { + "epoch": 0.234312202151023, + "grad_norm": 0.44140625, + "learning_rate": 0.0016875075764644254, + "loss": 0.1016, + "step": 26993 + }, + { + "epoch": 0.23432088263122716, + "grad_norm": 0.35546875, + "learning_rate": 0.0016874850298262954, + "loss": 0.1113, + "step": 26994 + }, + { + "epoch": 0.23432956311143133, + "grad_norm": 0.453125, + "learning_rate": 0.0016874624825457007, + "loss": 0.1016, + "step": 26995 + }, + { + "epoch": 0.2343382435916355, + "grad_norm": 0.1806640625, + "learning_rate": 0.0016874399346226657, + "loss": 0.127, + "step": 26996 + }, + { + "epoch": 0.23434692407183966, + "grad_norm": 0.51171875, + "learning_rate": 0.0016874173860572154, + "loss": 0.1035, + "step": 26997 + }, + { + "epoch": 0.23435560455204382, + "grad_norm": 0.32421875, + "learning_rate": 0.001687394836849374, + "loss": 0.1094, + "step": 26998 + }, + { + "epoch": 
0.234364285032248, + "grad_norm": 0.1865234375, + "learning_rate": 0.0016873722869991665, + "loss": 0.1406, + "step": 26999 + }, + { + "epoch": 0.23437296551245215, + "grad_norm": 0.63671875, + "learning_rate": 0.0016873497365066177, + "loss": 0.1196, + "step": 27000 + }, + { + "epoch": 0.23438164599265632, + "grad_norm": 0.65234375, + "learning_rate": 0.0016873271853717518, + "loss": 0.125, + "step": 27001 + }, + { + "epoch": 0.23439032647286048, + "grad_norm": 0.6171875, + "learning_rate": 0.0016873046335945938, + "loss": 0.2041, + "step": 27002 + }, + { + "epoch": 0.23439900695306465, + "grad_norm": 0.248046875, + "learning_rate": 0.0016872820811751684, + "loss": 0.0825, + "step": 27003 + }, + { + "epoch": 0.2344076874332688, + "grad_norm": 0.26171875, + "learning_rate": 0.0016872595281135002, + "loss": 0.0957, + "step": 27004 + }, + { + "epoch": 0.23441636791347298, + "grad_norm": 0.51953125, + "learning_rate": 0.0016872369744096135, + "loss": 0.1289, + "step": 27005 + }, + { + "epoch": 0.23442504839367714, + "grad_norm": 0.298828125, + "learning_rate": 0.0016872144200635333, + "loss": 0.1104, + "step": 27006 + }, + { + "epoch": 0.2344337288738813, + "grad_norm": 0.134765625, + "learning_rate": 0.0016871918650752842, + "loss": 0.126, + "step": 27007 + }, + { + "epoch": 0.23444240935408547, + "grad_norm": 0.1171875, + "learning_rate": 0.0016871693094448912, + "loss": 0.1162, + "step": 27008 + }, + { + "epoch": 0.23445108983428964, + "grad_norm": 0.310546875, + "learning_rate": 0.0016871467531723785, + "loss": 0.0947, + "step": 27009 + }, + { + "epoch": 0.2344597703144938, + "grad_norm": 0.140625, + "learning_rate": 0.0016871241962577708, + "loss": 0.1025, + "step": 27010 + }, + { + "epoch": 0.23446845079469797, + "grad_norm": 0.6328125, + "learning_rate": 0.0016871016387010932, + "loss": 0.0664, + "step": 27011 + }, + { + "epoch": 0.23447713127490213, + "grad_norm": 0.1865234375, + "learning_rate": 0.00168707908050237, + "loss": 0.085, + "step": 27012 + }, + { + "epoch": 0.2344858117551063, + "grad_norm": 0.11865234375, + "learning_rate": 0.0016870565216616257, + "loss": 0.0737, + "step": 27013 + }, + { + "epoch": 0.23449449223531046, + "grad_norm": 0.123046875, + "learning_rate": 0.0016870339621788855, + "loss": 0.1123, + "step": 27014 + }, + { + "epoch": 0.23450317271551463, + "grad_norm": 0.1953125, + "learning_rate": 0.0016870114020541736, + "loss": 0.1133, + "step": 27015 + }, + { + "epoch": 0.2345118531957188, + "grad_norm": 0.384765625, + "learning_rate": 0.0016869888412875146, + "loss": 0.0864, + "step": 27016 + }, + { + "epoch": 0.23452053367592296, + "grad_norm": 0.3046875, + "learning_rate": 0.001686966279878934, + "loss": 0.0938, + "step": 27017 + }, + { + "epoch": 0.23452921415612712, + "grad_norm": 0.2109375, + "learning_rate": 0.0016869437178284555, + "loss": 0.1079, + "step": 27018 + }, + { + "epoch": 0.2345378946363313, + "grad_norm": 0.50390625, + "learning_rate": 0.0016869211551361043, + "loss": 0.1572, + "step": 27019 + }, + { + "epoch": 0.23454657511653546, + "grad_norm": 0.1455078125, + "learning_rate": 0.001686898591801905, + "loss": 0.084, + "step": 27020 + }, + { + "epoch": 0.23455525559673962, + "grad_norm": 0.2138671875, + "learning_rate": 0.0016868760278258824, + "loss": 0.1641, + "step": 27021 + }, + { + "epoch": 0.23456393607694379, + "grad_norm": 0.140625, + "learning_rate": 0.0016868534632080603, + "loss": 0.1157, + "step": 27022 + }, + { + "epoch": 0.23457261655714795, + "grad_norm": 0.57421875, + "learning_rate": 0.0016868308979484648, + "loss": 0.0972, 
+ "step": 27023 + }, + { + "epoch": 0.23458129703735212, + "grad_norm": 0.734375, + "learning_rate": 0.0016868083320471193, + "loss": 0.1211, + "step": 27024 + }, + { + "epoch": 0.23458997751755628, + "grad_norm": 0.10205078125, + "learning_rate": 0.0016867857655040496, + "loss": 0.0996, + "step": 27025 + }, + { + "epoch": 0.23459865799776045, + "grad_norm": 0.431640625, + "learning_rate": 0.0016867631983192792, + "loss": 0.0728, + "step": 27026 + }, + { + "epoch": 0.2346073384779646, + "grad_norm": 0.09814453125, + "learning_rate": 0.0016867406304928337, + "loss": 0.0933, + "step": 27027 + }, + { + "epoch": 0.23461601895816878, + "grad_norm": 0.125, + "learning_rate": 0.0016867180620247374, + "loss": 0.1045, + "step": 27028 + }, + { + "epoch": 0.23462469943837294, + "grad_norm": 0.400390625, + "learning_rate": 0.0016866954929150152, + "loss": 0.0996, + "step": 27029 + }, + { + "epoch": 0.2346333799185771, + "grad_norm": 0.140625, + "learning_rate": 0.0016866729231636915, + "loss": 0.124, + "step": 27030 + }, + { + "epoch": 0.23464206039878127, + "grad_norm": 0.1787109375, + "learning_rate": 0.0016866503527707911, + "loss": 0.1211, + "step": 27031 + }, + { + "epoch": 0.23465074087898544, + "grad_norm": 0.99609375, + "learning_rate": 0.0016866277817363384, + "loss": 0.1011, + "step": 27032 + }, + { + "epoch": 0.2346594213591896, + "grad_norm": 0.0869140625, + "learning_rate": 0.0016866052100603587, + "loss": 0.123, + "step": 27033 + }, + { + "epoch": 0.23466810183939377, + "grad_norm": 0.498046875, + "learning_rate": 0.0016865826377428764, + "loss": 0.1396, + "step": 27034 + }, + { + "epoch": 0.23467678231959793, + "grad_norm": 0.484375, + "learning_rate": 0.001686560064783916, + "loss": 0.1172, + "step": 27035 + }, + { + "epoch": 0.2346854627998021, + "grad_norm": 0.70703125, + "learning_rate": 0.0016865374911835024, + "loss": 0.1025, + "step": 27036 + }, + { + "epoch": 0.23469414328000626, + "grad_norm": 0.16796875, + "learning_rate": 0.00168651491694166, + "loss": 0.1074, + "step": 27037 + }, + { + "epoch": 0.23470282376021043, + "grad_norm": 0.271484375, + "learning_rate": 0.001686492342058414, + "loss": 0.084, + "step": 27038 + }, + { + "epoch": 0.2347115042404146, + "grad_norm": 0.1396484375, + "learning_rate": 0.0016864697665337888, + "loss": 0.0996, + "step": 27039 + }, + { + "epoch": 0.23472018472061876, + "grad_norm": 0.82421875, + "learning_rate": 0.001686447190367809, + "loss": 0.1338, + "step": 27040 + }, + { + "epoch": 0.23472886520082292, + "grad_norm": 0.36328125, + "learning_rate": 0.0016864246135604992, + "loss": 0.1172, + "step": 27041 + }, + { + "epoch": 0.2347375456810271, + "grad_norm": 0.2236328125, + "learning_rate": 0.0016864020361118844, + "loss": 0.1152, + "step": 27042 + }, + { + "epoch": 0.23474622616123125, + "grad_norm": 0.26953125, + "learning_rate": 0.001686379458021989, + "loss": 0.1602, + "step": 27043 + }, + { + "epoch": 0.23475490664143542, + "grad_norm": 0.546875, + "learning_rate": 0.001686356879290838, + "loss": 0.1211, + "step": 27044 + }, + { + "epoch": 0.23476358712163956, + "grad_norm": 0.173828125, + "learning_rate": 0.0016863342999184558, + "loss": 0.1108, + "step": 27045 + }, + { + "epoch": 0.23477226760184372, + "grad_norm": 0.0869140625, + "learning_rate": 0.0016863117199048675, + "loss": 0.124, + "step": 27046 + }, + { + "epoch": 0.23478094808204789, + "grad_norm": 0.28125, + "learning_rate": 0.0016862891392500972, + "loss": 0.1064, + "step": 27047 + }, + { + "epoch": 0.23478962856225205, + "grad_norm": 0.5234375, + "learning_rate": 
0.00168626655795417, + "loss": 0.0962, + "step": 27048 + }, + { + "epoch": 0.23479830904245622, + "grad_norm": 0.1357421875, + "learning_rate": 0.0016862439760171105, + "loss": 0.1055, + "step": 27049 + }, + { + "epoch": 0.23480698952266038, + "grad_norm": 0.1494140625, + "learning_rate": 0.0016862213934389434, + "loss": 0.0952, + "step": 27050 + }, + { + "epoch": 0.23481567000286455, + "grad_norm": 0.3203125, + "learning_rate": 0.0016861988102196935, + "loss": 0.127, + "step": 27051 + }, + { + "epoch": 0.2348243504830687, + "grad_norm": 0.306640625, + "learning_rate": 0.0016861762263593854, + "loss": 0.1016, + "step": 27052 + }, + { + "epoch": 0.23483303096327288, + "grad_norm": 0.337890625, + "learning_rate": 0.0016861536418580434, + "loss": 0.1123, + "step": 27053 + }, + { + "epoch": 0.23484171144347704, + "grad_norm": 0.62890625, + "learning_rate": 0.001686131056715693, + "loss": 0.1113, + "step": 27054 + }, + { + "epoch": 0.2348503919236812, + "grad_norm": 0.54296875, + "learning_rate": 0.0016861084709323584, + "loss": 0.1035, + "step": 27055 + }, + { + "epoch": 0.23485907240388537, + "grad_norm": 0.376953125, + "learning_rate": 0.0016860858845080642, + "loss": 0.104, + "step": 27056 + }, + { + "epoch": 0.23486775288408954, + "grad_norm": 0.65234375, + "learning_rate": 0.0016860632974428354, + "loss": 0.123, + "step": 27057 + }, + { + "epoch": 0.2348764333642937, + "grad_norm": 0.134765625, + "learning_rate": 0.0016860407097366965, + "loss": 0.1211, + "step": 27058 + }, + { + "epoch": 0.23488511384449787, + "grad_norm": 0.6015625, + "learning_rate": 0.0016860181213896727, + "loss": 0.1187, + "step": 27059 + }, + { + "epoch": 0.23489379432470203, + "grad_norm": 0.419921875, + "learning_rate": 0.0016859955324017878, + "loss": 0.1172, + "step": 27060 + }, + { + "epoch": 0.2349024748049062, + "grad_norm": 0.29296875, + "learning_rate": 0.0016859729427730673, + "loss": 0.1172, + "step": 27061 + }, + { + "epoch": 0.23491115528511036, + "grad_norm": 0.87109375, + "learning_rate": 0.0016859503525035353, + "loss": 0.083, + "step": 27062 + }, + { + "epoch": 0.23491983576531453, + "grad_norm": 0.259765625, + "learning_rate": 0.001685927761593217, + "loss": 0.0859, + "step": 27063 + }, + { + "epoch": 0.2349285162455187, + "grad_norm": 0.1337890625, + "learning_rate": 0.0016859051700421365, + "loss": 0.1094, + "step": 27064 + }, + { + "epoch": 0.23493719672572286, + "grad_norm": 0.1953125, + "learning_rate": 0.0016858825778503198, + "loss": 0.124, + "step": 27065 + }, + { + "epoch": 0.23494587720592702, + "grad_norm": 0.470703125, + "learning_rate": 0.0016858599850177898, + "loss": 0.1396, + "step": 27066 + }, + { + "epoch": 0.2349545576861312, + "grad_norm": 0.0654296875, + "learning_rate": 0.0016858373915445725, + "loss": 0.0869, + "step": 27067 + }, + { + "epoch": 0.23496323816633535, + "grad_norm": 0.80859375, + "learning_rate": 0.0016858147974306923, + "loss": 0.1299, + "step": 27068 + }, + { + "epoch": 0.23497191864653952, + "grad_norm": 0.126953125, + "learning_rate": 0.0016857922026761738, + "loss": 0.0986, + "step": 27069 + }, + { + "epoch": 0.23498059912674368, + "grad_norm": 0.212890625, + "learning_rate": 0.0016857696072810416, + "loss": 0.1562, + "step": 27070 + }, + { + "epoch": 0.23498927960694785, + "grad_norm": 0.380859375, + "learning_rate": 0.0016857470112453209, + "loss": 0.0757, + "step": 27071 + }, + { + "epoch": 0.234997960087152, + "grad_norm": 0.326171875, + "learning_rate": 0.0016857244145690358, + "loss": 0.1094, + "step": 27072 + }, + { + "epoch": 0.23500664056735618, + 
"grad_norm": 0.8515625, + "learning_rate": 0.0016857018172522112, + "loss": 0.1074, + "step": 27073 + }, + { + "epoch": 0.23501532104756034, + "grad_norm": 0.22265625, + "learning_rate": 0.0016856792192948718, + "loss": 0.0586, + "step": 27074 + }, + { + "epoch": 0.2350240015277645, + "grad_norm": 0.41015625, + "learning_rate": 0.0016856566206970428, + "loss": 0.1309, + "step": 27075 + }, + { + "epoch": 0.23503268200796867, + "grad_norm": 0.1572265625, + "learning_rate": 0.0016856340214587485, + "loss": 0.0913, + "step": 27076 + }, + { + "epoch": 0.23504136248817284, + "grad_norm": 0.1953125, + "learning_rate": 0.0016856114215800134, + "loss": 0.0933, + "step": 27077 + }, + { + "epoch": 0.235050042968377, + "grad_norm": 0.1298828125, + "learning_rate": 0.0016855888210608628, + "loss": 0.125, + "step": 27078 + }, + { + "epoch": 0.23505872344858117, + "grad_norm": 0.19140625, + "learning_rate": 0.0016855662199013203, + "loss": 0.1128, + "step": 27079 + }, + { + "epoch": 0.23506740392878533, + "grad_norm": 0.87109375, + "learning_rate": 0.001685543618101412, + "loss": 0.0977, + "step": 27080 + }, + { + "epoch": 0.2350760844089895, + "grad_norm": 0.189453125, + "learning_rate": 0.001685521015661162, + "loss": 0.0933, + "step": 27081 + }, + { + "epoch": 0.23508476488919366, + "grad_norm": 0.8671875, + "learning_rate": 0.001685498412580595, + "loss": 0.0923, + "step": 27082 + }, + { + "epoch": 0.23509344536939783, + "grad_norm": 0.30078125, + "learning_rate": 0.0016854758088597356, + "loss": 0.1025, + "step": 27083 + }, + { + "epoch": 0.235102125849602, + "grad_norm": 0.125, + "learning_rate": 0.0016854532044986085, + "loss": 0.0957, + "step": 27084 + }, + { + "epoch": 0.23511080632980616, + "grad_norm": 0.130859375, + "learning_rate": 0.0016854305994972384, + "loss": 0.1377, + "step": 27085 + }, + { + "epoch": 0.23511948681001033, + "grad_norm": 0.1826171875, + "learning_rate": 0.0016854079938556509, + "loss": 0.1064, + "step": 27086 + }, + { + "epoch": 0.2351281672902145, + "grad_norm": 0.11328125, + "learning_rate": 0.0016853853875738694, + "loss": 0.0972, + "step": 27087 + }, + { + "epoch": 0.23513684777041866, + "grad_norm": 0.244140625, + "learning_rate": 0.0016853627806519196, + "loss": 0.1147, + "step": 27088 + }, + { + "epoch": 0.23514552825062282, + "grad_norm": 2.5, + "learning_rate": 0.001685340173089826, + "loss": 0.1475, + "step": 27089 + }, + { + "epoch": 0.23515420873082699, + "grad_norm": 0.10546875, + "learning_rate": 0.0016853175648876126, + "loss": 0.1035, + "step": 27090 + }, + { + "epoch": 0.23516288921103115, + "grad_norm": 8.8125, + "learning_rate": 0.0016852949560453051, + "loss": 0.1348, + "step": 27091 + }, + { + "epoch": 0.23517156969123532, + "grad_norm": 0.1279296875, + "learning_rate": 0.0016852723465629277, + "loss": 0.1104, + "step": 27092 + }, + { + "epoch": 0.23518025017143948, + "grad_norm": 0.5703125, + "learning_rate": 0.0016852497364405054, + "loss": 0.0732, + "step": 27093 + }, + { + "epoch": 0.23518893065164365, + "grad_norm": 0.333984375, + "learning_rate": 0.0016852271256780627, + "loss": 0.1416, + "step": 27094 + }, + { + "epoch": 0.2351976111318478, + "grad_norm": 0.306640625, + "learning_rate": 0.0016852045142756243, + "loss": 0.1084, + "step": 27095 + }, + { + "epoch": 0.23520629161205198, + "grad_norm": 0.11669921875, + "learning_rate": 0.0016851819022332152, + "loss": 0.1133, + "step": 27096 + }, + { + "epoch": 0.23521497209225614, + "grad_norm": 0.333984375, + "learning_rate": 0.00168515928955086, + "loss": 0.1035, + "step": 27097 + }, + { + 
"epoch": 0.2352236525724603, + "grad_norm": 0.296875, + "learning_rate": 0.0016851366762285832, + "loss": 0.1289, + "step": 27098 + }, + { + "epoch": 0.23523233305266447, + "grad_norm": 0.9296875, + "learning_rate": 0.0016851140622664098, + "loss": 0.1025, + "step": 27099 + }, + { + "epoch": 0.23524101353286864, + "grad_norm": 0.09033203125, + "learning_rate": 0.0016850914476643647, + "loss": 0.0859, + "step": 27100 + }, + { + "epoch": 0.2352496940130728, + "grad_norm": 0.2138671875, + "learning_rate": 0.0016850688324224717, + "loss": 0.0781, + "step": 27101 + }, + { + "epoch": 0.23525837449327697, + "grad_norm": 0.083984375, + "learning_rate": 0.0016850462165407568, + "loss": 0.0918, + "step": 27102 + }, + { + "epoch": 0.23526705497348113, + "grad_norm": 0.431640625, + "learning_rate": 0.0016850236000192443, + "loss": 0.0923, + "step": 27103 + }, + { + "epoch": 0.2352757354536853, + "grad_norm": 0.1767578125, + "learning_rate": 0.0016850009828579583, + "loss": 0.0947, + "step": 27104 + }, + { + "epoch": 0.23528441593388946, + "grad_norm": 1.015625, + "learning_rate": 0.001684978365056924, + "loss": 0.1484, + "step": 27105 + }, + { + "epoch": 0.23529309641409363, + "grad_norm": 0.1376953125, + "learning_rate": 0.0016849557466161664, + "loss": 0.0801, + "step": 27106 + }, + { + "epoch": 0.2353017768942978, + "grad_norm": 0.259765625, + "learning_rate": 0.00168493312753571, + "loss": 0.0918, + "step": 27107 + }, + { + "epoch": 0.23531045737450196, + "grad_norm": 0.546875, + "learning_rate": 0.0016849105078155797, + "loss": 0.1226, + "step": 27108 + }, + { + "epoch": 0.23531913785470612, + "grad_norm": 0.37109375, + "learning_rate": 0.0016848878874557995, + "loss": 0.0938, + "step": 27109 + }, + { + "epoch": 0.2353278183349103, + "grad_norm": 0.1904296875, + "learning_rate": 0.0016848652664563952, + "loss": 0.1572, + "step": 27110 + }, + { + "epoch": 0.23533649881511445, + "grad_norm": 0.392578125, + "learning_rate": 0.0016848426448173909, + "loss": 0.126, + "step": 27111 + }, + { + "epoch": 0.23534517929531862, + "grad_norm": 0.53125, + "learning_rate": 0.0016848200225388114, + "loss": 0.1621, + "step": 27112 + }, + { + "epoch": 0.23535385977552278, + "grad_norm": 0.40234375, + "learning_rate": 0.0016847973996206816, + "loss": 0.1055, + "step": 27113 + }, + { + "epoch": 0.23536254025572695, + "grad_norm": 0.359375, + "learning_rate": 0.001684774776063026, + "loss": 0.0728, + "step": 27114 + }, + { + "epoch": 0.2353712207359311, + "grad_norm": 0.41796875, + "learning_rate": 0.0016847521518658696, + "loss": 0.1348, + "step": 27115 + }, + { + "epoch": 0.23537990121613528, + "grad_norm": 0.2734375, + "learning_rate": 0.001684729527029237, + "loss": 0.0815, + "step": 27116 + }, + { + "epoch": 0.23538858169633944, + "grad_norm": 0.0947265625, + "learning_rate": 0.0016847069015531529, + "loss": 0.1123, + "step": 27117 + }, + { + "epoch": 0.2353972621765436, + "grad_norm": 0.1455078125, + "learning_rate": 0.0016846842754376422, + "loss": 0.1309, + "step": 27118 + }, + { + "epoch": 0.23540594265674777, + "grad_norm": 0.2275390625, + "learning_rate": 0.0016846616486827298, + "loss": 0.1201, + "step": 27119 + }, + { + "epoch": 0.23541462313695194, + "grad_norm": 0.11328125, + "learning_rate": 0.0016846390212884397, + "loss": 0.1504, + "step": 27120 + }, + { + "epoch": 0.2354233036171561, + "grad_norm": 0.46875, + "learning_rate": 0.0016846163932547973, + "loss": 0.0742, + "step": 27121 + }, + { + "epoch": 0.23543198409736027, + "grad_norm": 0.1083984375, + "learning_rate": 0.0016845937645818272, + 
"loss": 0.1572, + "step": 27122 + }, + { + "epoch": 0.23544066457756443, + "grad_norm": 0.109375, + "learning_rate": 0.0016845711352695546, + "loss": 0.0928, + "step": 27123 + }, + { + "epoch": 0.2354493450577686, + "grad_norm": 0.1279296875, + "learning_rate": 0.0016845485053180033, + "loss": 0.1035, + "step": 27124 + }, + { + "epoch": 0.23545802553797276, + "grad_norm": 0.1591796875, + "learning_rate": 0.0016845258747271987, + "loss": 0.1021, + "step": 27125 + }, + { + "epoch": 0.23546670601817693, + "grad_norm": 0.5703125, + "learning_rate": 0.0016845032434971653, + "loss": 0.1348, + "step": 27126 + }, + { + "epoch": 0.2354753864983811, + "grad_norm": 0.39453125, + "learning_rate": 0.0016844806116279281, + "loss": 0.1226, + "step": 27127 + }, + { + "epoch": 0.23548406697858526, + "grad_norm": 0.1513671875, + "learning_rate": 0.0016844579791195115, + "loss": 0.123, + "step": 27128 + }, + { + "epoch": 0.23549274745878943, + "grad_norm": 0.4296875, + "learning_rate": 0.0016844353459719405, + "loss": 0.0996, + "step": 27129 + }, + { + "epoch": 0.2355014279389936, + "grad_norm": 0.55078125, + "learning_rate": 0.0016844127121852398, + "loss": 0.1963, + "step": 27130 + }, + { + "epoch": 0.23551010841919776, + "grad_norm": 0.484375, + "learning_rate": 0.0016843900777594341, + "loss": 0.0684, + "step": 27131 + }, + { + "epoch": 0.23551878889940192, + "grad_norm": 0.1806640625, + "learning_rate": 0.0016843674426945483, + "loss": 0.1191, + "step": 27132 + }, + { + "epoch": 0.23552746937960609, + "grad_norm": 0.0888671875, + "learning_rate": 0.001684344806990607, + "loss": 0.0981, + "step": 27133 + }, + { + "epoch": 0.23553614985981025, + "grad_norm": 0.11962890625, + "learning_rate": 0.001684322170647635, + "loss": 0.123, + "step": 27134 + }, + { + "epoch": 0.23554483034001442, + "grad_norm": 0.50390625, + "learning_rate": 0.001684299533665657, + "loss": 0.124, + "step": 27135 + }, + { + "epoch": 0.23555351082021858, + "grad_norm": 0.11279296875, + "learning_rate": 0.0016842768960446978, + "loss": 0.1104, + "step": 27136 + }, + { + "epoch": 0.23556219130042275, + "grad_norm": 0.640625, + "learning_rate": 0.001684254257784782, + "loss": 0.1299, + "step": 27137 + }, + { + "epoch": 0.2355708717806269, + "grad_norm": 0.1318359375, + "learning_rate": 0.001684231618885935, + "loss": 0.0811, + "step": 27138 + }, + { + "epoch": 0.23557955226083108, + "grad_norm": 0.1259765625, + "learning_rate": 0.0016842089793481806, + "loss": 0.1138, + "step": 27139 + }, + { + "epoch": 0.23558823274103524, + "grad_norm": 0.11474609375, + "learning_rate": 0.0016841863391715442, + "loss": 0.1182, + "step": 27140 + }, + { + "epoch": 0.2355969132212394, + "grad_norm": 0.640625, + "learning_rate": 0.0016841636983560507, + "loss": 0.1094, + "step": 27141 + }, + { + "epoch": 0.23560559370144357, + "grad_norm": 0.212890625, + "learning_rate": 0.0016841410569017243, + "loss": 0.1201, + "step": 27142 + }, + { + "epoch": 0.23561427418164774, + "grad_norm": 0.07470703125, + "learning_rate": 0.00168411841480859, + "loss": 0.0806, + "step": 27143 + }, + { + "epoch": 0.2356229546618519, + "grad_norm": 0.12060546875, + "learning_rate": 0.0016840957720766726, + "loss": 0.1279, + "step": 27144 + }, + { + "epoch": 0.23563163514205607, + "grad_norm": 0.52734375, + "learning_rate": 0.0016840731287059967, + "loss": 0.1426, + "step": 27145 + }, + { + "epoch": 0.23564031562226023, + "grad_norm": 0.2412109375, + "learning_rate": 0.0016840504846965874, + "loss": 0.0957, + "step": 27146 + }, + { + "epoch": 0.2356489961024644, + "grad_norm": 
0.2353515625, + "learning_rate": 0.0016840278400484694, + "loss": 0.1084, + "step": 27147 + }, + { + "epoch": 0.23565767658266856, + "grad_norm": 0.490234375, + "learning_rate": 0.001684005194761667, + "loss": 0.123, + "step": 27148 + }, + { + "epoch": 0.23566635706287273, + "grad_norm": 0.30859375, + "learning_rate": 0.0016839825488362053, + "loss": 0.083, + "step": 27149 + }, + { + "epoch": 0.2356750375430769, + "grad_norm": 0.09326171875, + "learning_rate": 0.0016839599022721093, + "loss": 0.085, + "step": 27150 + }, + { + "epoch": 0.23568371802328106, + "grad_norm": 0.1845703125, + "learning_rate": 0.0016839372550694035, + "loss": 0.0928, + "step": 27151 + }, + { + "epoch": 0.23569239850348522, + "grad_norm": 0.185546875, + "learning_rate": 0.0016839146072281123, + "loss": 0.0703, + "step": 27152 + }, + { + "epoch": 0.2357010789836894, + "grad_norm": 0.2392578125, + "learning_rate": 0.0016838919587482614, + "loss": 0.103, + "step": 27153 + }, + { + "epoch": 0.23570975946389355, + "grad_norm": 0.69921875, + "learning_rate": 0.0016838693096298746, + "loss": 0.0957, + "step": 27154 + }, + { + "epoch": 0.23571843994409772, + "grad_norm": 0.251953125, + "learning_rate": 0.0016838466598729774, + "loss": 0.0864, + "step": 27155 + }, + { + "epoch": 0.23572712042430188, + "grad_norm": 0.78515625, + "learning_rate": 0.001683824009477594, + "loss": 0.1055, + "step": 27156 + }, + { + "epoch": 0.23573580090450605, + "grad_norm": 0.6171875, + "learning_rate": 0.0016838013584437495, + "loss": 0.0938, + "step": 27157 + }, + { + "epoch": 0.2357444813847102, + "grad_norm": 0.2890625, + "learning_rate": 0.001683778706771469, + "loss": 0.1553, + "step": 27158 + }, + { + "epoch": 0.23575316186491438, + "grad_norm": 0.4921875, + "learning_rate": 0.0016837560544607764, + "loss": 0.0879, + "step": 27159 + }, + { + "epoch": 0.23576184234511854, + "grad_norm": 0.609375, + "learning_rate": 0.0016837334015116971, + "loss": 0.1318, + "step": 27160 + }, + { + "epoch": 0.2357705228253227, + "grad_norm": 0.1640625, + "learning_rate": 0.0016837107479242557, + "loss": 0.1235, + "step": 27161 + }, + { + "epoch": 0.23577920330552687, + "grad_norm": 0.9453125, + "learning_rate": 0.0016836880936984772, + "loss": 0.1099, + "step": 27162 + }, + { + "epoch": 0.23578788378573104, + "grad_norm": 1.421875, + "learning_rate": 0.001683665438834386, + "loss": 0.1387, + "step": 27163 + }, + { + "epoch": 0.2357965642659352, + "grad_norm": 0.8046875, + "learning_rate": 0.001683642783332007, + "loss": 0.1309, + "step": 27164 + }, + { + "epoch": 0.23580524474613937, + "grad_norm": 0.33984375, + "learning_rate": 0.001683620127191365, + "loss": 0.1191, + "step": 27165 + }, + { + "epoch": 0.23581392522634353, + "grad_norm": 0.44921875, + "learning_rate": 0.001683597470412485, + "loss": 0.0947, + "step": 27166 + }, + { + "epoch": 0.2358226057065477, + "grad_norm": 0.1005859375, + "learning_rate": 0.0016835748129953915, + "loss": 0.0933, + "step": 27167 + }, + { + "epoch": 0.23583128618675184, + "grad_norm": 0.130859375, + "learning_rate": 0.0016835521549401092, + "loss": 0.0933, + "step": 27168 + }, + { + "epoch": 0.235839966666956, + "grad_norm": 0.1328125, + "learning_rate": 0.001683529496246663, + "loss": 0.1055, + "step": 27169 + }, + { + "epoch": 0.23584864714716017, + "grad_norm": 0.10400390625, + "learning_rate": 0.0016835068369150778, + "loss": 0.0923, + "step": 27170 + }, + { + "epoch": 0.23585732762736433, + "grad_norm": 0.115234375, + "learning_rate": 0.0016834841769453783, + "loss": 0.1187, + "step": 27171 + }, + { + "epoch": 
0.2358660081075685, + "grad_norm": 0.98046875, + "learning_rate": 0.0016834615163375892, + "loss": 0.0967, + "step": 27172 + }, + { + "epoch": 0.23587468858777266, + "grad_norm": 0.3515625, + "learning_rate": 0.0016834388550917357, + "loss": 0.1133, + "step": 27173 + }, + { + "epoch": 0.23588336906797683, + "grad_norm": 0.1025390625, + "learning_rate": 0.0016834161932078417, + "loss": 0.0957, + "step": 27174 + }, + { + "epoch": 0.235892049548181, + "grad_norm": 0.37890625, + "learning_rate": 0.001683393530685933, + "loss": 0.0859, + "step": 27175 + }, + { + "epoch": 0.23590073002838516, + "grad_norm": 0.158203125, + "learning_rate": 0.0016833708675260339, + "loss": 0.0688, + "step": 27176 + }, + { + "epoch": 0.23590941050858932, + "grad_norm": 0.220703125, + "learning_rate": 0.001683348203728169, + "loss": 0.0908, + "step": 27177 + }, + { + "epoch": 0.2359180909887935, + "grad_norm": 0.18359375, + "learning_rate": 0.001683325539292363, + "loss": 0.1104, + "step": 27178 + }, + { + "epoch": 0.23592677146899765, + "grad_norm": 0.3828125, + "learning_rate": 0.0016833028742186413, + "loss": 0.1064, + "step": 27179 + }, + { + "epoch": 0.23593545194920182, + "grad_norm": 0.275390625, + "learning_rate": 0.0016832802085070282, + "loss": 0.0977, + "step": 27180 + }, + { + "epoch": 0.23594413242940598, + "grad_norm": 0.0634765625, + "learning_rate": 0.0016832575421575487, + "loss": 0.0771, + "step": 27181 + }, + { + "epoch": 0.23595281290961015, + "grad_norm": 0.265625, + "learning_rate": 0.0016832348751702276, + "loss": 0.084, + "step": 27182 + }, + { + "epoch": 0.23596149338981431, + "grad_norm": 0.2177734375, + "learning_rate": 0.0016832122075450893, + "loss": 0.0601, + "step": 27183 + }, + { + "epoch": 0.23597017387001848, + "grad_norm": 0.69140625, + "learning_rate": 0.0016831895392821592, + "loss": 0.1875, + "step": 27184 + }, + { + "epoch": 0.23597885435022264, + "grad_norm": 0.1806640625, + "learning_rate": 0.0016831668703814616, + "loss": 0.0908, + "step": 27185 + }, + { + "epoch": 0.2359875348304268, + "grad_norm": 0.1328125, + "learning_rate": 0.0016831442008430216, + "loss": 0.1426, + "step": 27186 + }, + { + "epoch": 0.23599621531063097, + "grad_norm": 0.466796875, + "learning_rate": 0.0016831215306668636, + "loss": 0.0928, + "step": 27187 + }, + { + "epoch": 0.23600489579083514, + "grad_norm": 0.1025390625, + "learning_rate": 0.001683098859853013, + "loss": 0.0874, + "step": 27188 + }, + { + "epoch": 0.2360135762710393, + "grad_norm": 0.5, + "learning_rate": 0.0016830761884014942, + "loss": 0.1094, + "step": 27189 + }, + { + "epoch": 0.23602225675124347, + "grad_norm": 0.1875, + "learning_rate": 0.0016830535163123322, + "loss": 0.1777, + "step": 27190 + }, + { + "epoch": 0.23603093723144764, + "grad_norm": 0.10986328125, + "learning_rate": 0.001683030843585551, + "loss": 0.0977, + "step": 27191 + }, + { + "epoch": 0.2360396177116518, + "grad_norm": 0.1953125, + "learning_rate": 0.0016830081702211763, + "loss": 0.0908, + "step": 27192 + }, + { + "epoch": 0.23604829819185597, + "grad_norm": 0.11865234375, + "learning_rate": 0.0016829854962192327, + "loss": 0.1074, + "step": 27193 + }, + { + "epoch": 0.23605697867206013, + "grad_norm": 0.51953125, + "learning_rate": 0.0016829628215797453, + "loss": 0.166, + "step": 27194 + }, + { + "epoch": 0.2360656591522643, + "grad_norm": 0.75390625, + "learning_rate": 0.0016829401463027381, + "loss": 0.1387, + "step": 27195 + }, + { + "epoch": 0.23607433963246846, + "grad_norm": 0.345703125, + "learning_rate": 0.001682917470388236, + "loss": 0.0879, + 
"step": 27196 + }, + { + "epoch": 0.23608302011267263, + "grad_norm": 0.091796875, + "learning_rate": 0.0016828947938362648, + "loss": 0.0664, + "step": 27197 + }, + { + "epoch": 0.2360917005928768, + "grad_norm": 0.09521484375, + "learning_rate": 0.0016828721166468486, + "loss": 0.127, + "step": 27198 + }, + { + "epoch": 0.23610038107308096, + "grad_norm": 0.32421875, + "learning_rate": 0.0016828494388200117, + "loss": 0.1152, + "step": 27199 + }, + { + "epoch": 0.23610906155328512, + "grad_norm": 0.65625, + "learning_rate": 0.0016828267603557795, + "loss": 0.1602, + "step": 27200 + }, + { + "epoch": 0.2361177420334893, + "grad_norm": 0.734375, + "learning_rate": 0.001682804081254177, + "loss": 0.1377, + "step": 27201 + }, + { + "epoch": 0.23612642251369345, + "grad_norm": 0.09326171875, + "learning_rate": 0.0016827814015152286, + "loss": 0.0908, + "step": 27202 + }, + { + "epoch": 0.23613510299389762, + "grad_norm": 0.181640625, + "learning_rate": 0.0016827587211389592, + "loss": 0.1045, + "step": 27203 + }, + { + "epoch": 0.23614378347410178, + "grad_norm": 0.84765625, + "learning_rate": 0.0016827360401253935, + "loss": 0.0977, + "step": 27204 + }, + { + "epoch": 0.23615246395430595, + "grad_norm": 0.7109375, + "learning_rate": 0.0016827133584745566, + "loss": 0.1875, + "step": 27205 + }, + { + "epoch": 0.2361611444345101, + "grad_norm": 0.984375, + "learning_rate": 0.001682690676186473, + "loss": 0.0928, + "step": 27206 + }, + { + "epoch": 0.23616982491471428, + "grad_norm": 0.08984375, + "learning_rate": 0.0016826679932611676, + "loss": 0.1206, + "step": 27207 + }, + { + "epoch": 0.23617850539491844, + "grad_norm": 0.1904296875, + "learning_rate": 0.0016826453096986651, + "loss": 0.1172, + "step": 27208 + }, + { + "epoch": 0.2361871858751226, + "grad_norm": 1.171875, + "learning_rate": 0.0016826226254989909, + "loss": 0.1416, + "step": 27209 + }, + { + "epoch": 0.23619586635532677, + "grad_norm": 0.3671875, + "learning_rate": 0.0016825999406621692, + "loss": 0.1226, + "step": 27210 + }, + { + "epoch": 0.23620454683553094, + "grad_norm": 0.4375, + "learning_rate": 0.0016825772551882247, + "loss": 0.1357, + "step": 27211 + }, + { + "epoch": 0.2362132273157351, + "grad_norm": 0.470703125, + "learning_rate": 0.0016825545690771827, + "loss": 0.1084, + "step": 27212 + }, + { + "epoch": 0.23622190779593927, + "grad_norm": 0.154296875, + "learning_rate": 0.0016825318823290678, + "loss": 0.1484, + "step": 27213 + }, + { + "epoch": 0.23623058827614343, + "grad_norm": 0.51171875, + "learning_rate": 0.0016825091949439046, + "loss": 0.1064, + "step": 27214 + }, + { + "epoch": 0.2362392687563476, + "grad_norm": 0.357421875, + "learning_rate": 0.0016824865069217185, + "loss": 0.1069, + "step": 27215 + }, + { + "epoch": 0.23624794923655176, + "grad_norm": 0.65625, + "learning_rate": 0.0016824638182625334, + "loss": 0.1367, + "step": 27216 + }, + { + "epoch": 0.23625662971675593, + "grad_norm": 1.140625, + "learning_rate": 0.0016824411289663747, + "loss": 0.1445, + "step": 27217 + }, + { + "epoch": 0.2362653101969601, + "grad_norm": 0.3984375, + "learning_rate": 0.0016824184390332674, + "loss": 0.1162, + "step": 27218 + }, + { + "epoch": 0.23627399067716426, + "grad_norm": 0.66015625, + "learning_rate": 0.001682395748463236, + "loss": 0.1221, + "step": 27219 + }, + { + "epoch": 0.23628267115736842, + "grad_norm": 0.41015625, + "learning_rate": 0.0016823730572563052, + "loss": 0.1196, + "step": 27220 + }, + { + "epoch": 0.2362913516375726, + "grad_norm": 0.220703125, + "learning_rate": 
0.0016823503654125002, + "loss": 0.125, + "step": 27221 + }, + { + "epoch": 0.23630003211777675, + "grad_norm": 0.349609375, + "learning_rate": 0.0016823276729318452, + "loss": 0.123, + "step": 27222 + }, + { + "epoch": 0.23630871259798092, + "grad_norm": 0.66015625, + "learning_rate": 0.0016823049798143654, + "loss": 0.1387, + "step": 27223 + }, + { + "epoch": 0.23631739307818508, + "grad_norm": 1.0625, + "learning_rate": 0.0016822822860600862, + "loss": 0.457, + "step": 27224 + }, + { + "epoch": 0.23632607355838925, + "grad_norm": 0.1943359375, + "learning_rate": 0.0016822595916690312, + "loss": 0.1113, + "step": 27225 + }, + { + "epoch": 0.23633475403859341, + "grad_norm": 0.40625, + "learning_rate": 0.0016822368966412263, + "loss": 0.1348, + "step": 27226 + }, + { + "epoch": 0.23634343451879758, + "grad_norm": 0.126953125, + "learning_rate": 0.0016822142009766958, + "loss": 0.1143, + "step": 27227 + }, + { + "epoch": 0.23635211499900174, + "grad_norm": 0.7890625, + "learning_rate": 0.0016821915046754646, + "loss": 0.125, + "step": 27228 + }, + { + "epoch": 0.2363607954792059, + "grad_norm": 1.2578125, + "learning_rate": 0.0016821688077375573, + "loss": 0.125, + "step": 27229 + }, + { + "epoch": 0.23636947595941007, + "grad_norm": 0.63671875, + "learning_rate": 0.0016821461101629992, + "loss": 0.0938, + "step": 27230 + }, + { + "epoch": 0.23637815643961424, + "grad_norm": 0.1806640625, + "learning_rate": 0.0016821234119518145, + "loss": 0.1006, + "step": 27231 + }, + { + "epoch": 0.2363868369198184, + "grad_norm": 0.2275390625, + "learning_rate": 0.0016821007131040287, + "loss": 0.1123, + "step": 27232 + }, + { + "epoch": 0.23639551740002257, + "grad_norm": 0.1787109375, + "learning_rate": 0.0016820780136196662, + "loss": 0.1523, + "step": 27233 + }, + { + "epoch": 0.23640419788022674, + "grad_norm": 0.2041015625, + "learning_rate": 0.001682055313498752, + "loss": 0.1221, + "step": 27234 + }, + { + "epoch": 0.2364128783604309, + "grad_norm": 0.498046875, + "learning_rate": 0.001682032612741311, + "loss": 0.0879, + "step": 27235 + }, + { + "epoch": 0.23642155884063507, + "grad_norm": 0.1962890625, + "learning_rate": 0.0016820099113473675, + "loss": 0.1934, + "step": 27236 + }, + { + "epoch": 0.23643023932083923, + "grad_norm": 0.2099609375, + "learning_rate": 0.001681987209316947, + "loss": 0.0981, + "step": 27237 + }, + { + "epoch": 0.2364389198010434, + "grad_norm": 0.19921875, + "learning_rate": 0.0016819645066500739, + "loss": 0.0986, + "step": 27238 + }, + { + "epoch": 0.23644760028124756, + "grad_norm": 0.34765625, + "learning_rate": 0.0016819418033467732, + "loss": 0.1406, + "step": 27239 + }, + { + "epoch": 0.23645628076145173, + "grad_norm": 0.1484375, + "learning_rate": 0.0016819190994070694, + "loss": 0.1445, + "step": 27240 + }, + { + "epoch": 0.2364649612416559, + "grad_norm": 0.1298828125, + "learning_rate": 0.0016818963948309882, + "loss": 0.1069, + "step": 27241 + }, + { + "epoch": 0.23647364172186006, + "grad_norm": 0.63671875, + "learning_rate": 0.0016818736896185538, + "loss": 0.1196, + "step": 27242 + }, + { + "epoch": 0.23648232220206422, + "grad_norm": 0.69140625, + "learning_rate": 0.0016818509837697906, + "loss": 0.0928, + "step": 27243 + }, + { + "epoch": 0.2364910026822684, + "grad_norm": 0.8046875, + "learning_rate": 0.001681828277284724, + "loss": 0.1367, + "step": 27244 + }, + { + "epoch": 0.23649968316247255, + "grad_norm": 0.33984375, + "learning_rate": 0.001681805570163379, + "loss": 0.0918, + "step": 27245 + }, + { + "epoch": 0.23650836364267672, + 
"grad_norm": 0.458984375, + "learning_rate": 0.00168178286240578, + "loss": 0.1582, + "step": 27246 + }, + { + "epoch": 0.23651704412288088, + "grad_norm": 0.443359375, + "learning_rate": 0.0016817601540119519, + "loss": 0.1089, + "step": 27247 + }, + { + "epoch": 0.23652572460308505, + "grad_norm": 0.1357421875, + "learning_rate": 0.0016817374449819198, + "loss": 0.1299, + "step": 27248 + }, + { + "epoch": 0.2365344050832892, + "grad_norm": 0.080078125, + "learning_rate": 0.0016817147353157082, + "loss": 0.1094, + "step": 27249 + }, + { + "epoch": 0.23654308556349338, + "grad_norm": 0.33203125, + "learning_rate": 0.0016816920250133426, + "loss": 0.1494, + "step": 27250 + }, + { + "epoch": 0.23655176604369754, + "grad_norm": 0.11083984375, + "learning_rate": 0.0016816693140748466, + "loss": 0.1318, + "step": 27251 + }, + { + "epoch": 0.2365604465239017, + "grad_norm": 0.1787109375, + "learning_rate": 0.0016816466025002463, + "loss": 0.124, + "step": 27252 + }, + { + "epoch": 0.23656912700410587, + "grad_norm": 0.12060546875, + "learning_rate": 0.001681623890289566, + "loss": 0.1221, + "step": 27253 + }, + { + "epoch": 0.23657780748431004, + "grad_norm": 0.1259765625, + "learning_rate": 0.0016816011774428306, + "loss": 0.1494, + "step": 27254 + }, + { + "epoch": 0.2365864879645142, + "grad_norm": 0.75390625, + "learning_rate": 0.0016815784639600646, + "loss": 0.1055, + "step": 27255 + }, + { + "epoch": 0.23659516844471837, + "grad_norm": 0.373046875, + "learning_rate": 0.0016815557498412932, + "loss": 0.1104, + "step": 27256 + }, + { + "epoch": 0.23660384892492253, + "grad_norm": 0.080078125, + "learning_rate": 0.0016815330350865411, + "loss": 0.1113, + "step": 27257 + }, + { + "epoch": 0.2366125294051267, + "grad_norm": 0.23828125, + "learning_rate": 0.0016815103196958336, + "loss": 0.1025, + "step": 27258 + }, + { + "epoch": 0.23662120988533086, + "grad_norm": 0.353515625, + "learning_rate": 0.0016814876036691949, + "loss": 0.124, + "step": 27259 + }, + { + "epoch": 0.23662989036553503, + "grad_norm": 0.44921875, + "learning_rate": 0.0016814648870066497, + "loss": 0.1328, + "step": 27260 + }, + { + "epoch": 0.2366385708457392, + "grad_norm": 0.1025390625, + "learning_rate": 0.0016814421697082238, + "loss": 0.0981, + "step": 27261 + }, + { + "epoch": 0.23664725132594336, + "grad_norm": 0.185546875, + "learning_rate": 0.0016814194517739416, + "loss": 0.1201, + "step": 27262 + }, + { + "epoch": 0.23665593180614752, + "grad_norm": 0.1328125, + "learning_rate": 0.0016813967332038274, + "loss": 0.0889, + "step": 27263 + }, + { + "epoch": 0.2366646122863517, + "grad_norm": 2.34375, + "learning_rate": 0.0016813740139979064, + "loss": 0.1035, + "step": 27264 + }, + { + "epoch": 0.23667329276655585, + "grad_norm": 0.1982421875, + "learning_rate": 0.001681351294156204, + "loss": 0.0913, + "step": 27265 + }, + { + "epoch": 0.23668197324676002, + "grad_norm": 0.2138671875, + "learning_rate": 0.001681328573678744, + "loss": 0.2148, + "step": 27266 + }, + { + "epoch": 0.23669065372696418, + "grad_norm": 0.380859375, + "learning_rate": 0.0016813058525655521, + "loss": 0.1064, + "step": 27267 + }, + { + "epoch": 0.23669933420716835, + "grad_norm": 0.275390625, + "learning_rate": 0.0016812831308166529, + "loss": 0.1406, + "step": 27268 + }, + { + "epoch": 0.23670801468737251, + "grad_norm": 0.302734375, + "learning_rate": 0.0016812604084320713, + "loss": 0.1328, + "step": 27269 + }, + { + "epoch": 0.23671669516757668, + "grad_norm": 0.09814453125, + "learning_rate": 0.0016812376854118316, + "loss": 0.1162, 
+ "step": 27270 + }, + { + "epoch": 0.23672537564778084, + "grad_norm": 0.47265625, + "learning_rate": 0.0016812149617559593, + "loss": 0.0796, + "step": 27271 + }, + { + "epoch": 0.236734056127985, + "grad_norm": 0.2734375, + "learning_rate": 0.0016811922374644793, + "loss": 0.1191, + "step": 27272 + }, + { + "epoch": 0.23674273660818917, + "grad_norm": 1.0859375, + "learning_rate": 0.001681169512537416, + "loss": 0.1035, + "step": 27273 + }, + { + "epoch": 0.23675141708839334, + "grad_norm": 0.37109375, + "learning_rate": 0.0016811467869747945, + "loss": 0.1025, + "step": 27274 + }, + { + "epoch": 0.2367600975685975, + "grad_norm": 0.396484375, + "learning_rate": 0.0016811240607766393, + "loss": 0.1338, + "step": 27275 + }, + { + "epoch": 0.23676877804880167, + "grad_norm": 0.52734375, + "learning_rate": 0.001681101333942976, + "loss": 0.104, + "step": 27276 + }, + { + "epoch": 0.23677745852900584, + "grad_norm": 0.2001953125, + "learning_rate": 0.001681078606473829, + "loss": 0.0933, + "step": 27277 + }, + { + "epoch": 0.23678613900921, + "grad_norm": 0.490234375, + "learning_rate": 0.0016810558783692232, + "loss": 0.1299, + "step": 27278 + }, + { + "epoch": 0.23679481948941417, + "grad_norm": 0.11669921875, + "learning_rate": 0.0016810331496291831, + "loss": 0.1611, + "step": 27279 + }, + { + "epoch": 0.23680349996961833, + "grad_norm": 0.125, + "learning_rate": 0.0016810104202537339, + "loss": 0.1182, + "step": 27280 + }, + { + "epoch": 0.2368121804498225, + "grad_norm": 0.130859375, + "learning_rate": 0.0016809876902429007, + "loss": 0.0933, + "step": 27281 + }, + { + "epoch": 0.23682086093002666, + "grad_norm": 2.0625, + "learning_rate": 0.0016809649595967077, + "loss": 0.1309, + "step": 27282 + }, + { + "epoch": 0.23682954141023083, + "grad_norm": 0.578125, + "learning_rate": 0.0016809422283151805, + "loss": 0.0962, + "step": 27283 + }, + { + "epoch": 0.236838221890435, + "grad_norm": 0.5390625, + "learning_rate": 0.0016809194963983436, + "loss": 0.1309, + "step": 27284 + }, + { + "epoch": 0.23684690237063916, + "grad_norm": 0.310546875, + "learning_rate": 0.0016808967638462218, + "loss": 0.0923, + "step": 27285 + }, + { + "epoch": 0.23685558285084332, + "grad_norm": 0.796875, + "learning_rate": 0.00168087403065884, + "loss": 0.1328, + "step": 27286 + }, + { + "epoch": 0.2368642633310475, + "grad_norm": 0.28515625, + "learning_rate": 0.0016808512968362228, + "loss": 0.1934, + "step": 27287 + }, + { + "epoch": 0.23687294381125165, + "grad_norm": 0.79296875, + "learning_rate": 0.0016808285623783957, + "loss": 0.123, + "step": 27288 + }, + { + "epoch": 0.23688162429145582, + "grad_norm": 0.279296875, + "learning_rate": 0.0016808058272853833, + "loss": 0.1133, + "step": 27289 + }, + { + "epoch": 0.23689030477165998, + "grad_norm": 0.1005859375, + "learning_rate": 0.0016807830915572103, + "loss": 0.1064, + "step": 27290 + }, + { + "epoch": 0.23689898525186412, + "grad_norm": 0.10693359375, + "learning_rate": 0.0016807603551939015, + "loss": 0.1001, + "step": 27291 + }, + { + "epoch": 0.23690766573206828, + "grad_norm": 0.34765625, + "learning_rate": 0.001680737618195482, + "loss": 0.1221, + "step": 27292 + }, + { + "epoch": 0.23691634621227245, + "grad_norm": 0.5625, + "learning_rate": 0.0016807148805619762, + "loss": 0.1079, + "step": 27293 + }, + { + "epoch": 0.23692502669247661, + "grad_norm": 0.1494140625, + "learning_rate": 0.00168069214229341, + "loss": 0.1416, + "step": 27294 + }, + { + "epoch": 0.23693370717268078, + "grad_norm": 0.1591796875, + "learning_rate": 
0.001680669403389807, + "loss": 0.1196, + "step": 27295 + }, + { + "epoch": 0.23694238765288494, + "grad_norm": 0.54296875, + "learning_rate": 0.001680646663851193, + "loss": 0.1357, + "step": 27296 + }, + { + "epoch": 0.2369510681330891, + "grad_norm": 0.158203125, + "learning_rate": 0.0016806239236775923, + "loss": 0.1099, + "step": 27297 + }, + { + "epoch": 0.23695974861329328, + "grad_norm": 0.46875, + "learning_rate": 0.0016806011828690301, + "loss": 0.1318, + "step": 27298 + }, + { + "epoch": 0.23696842909349744, + "grad_norm": 0.82421875, + "learning_rate": 0.0016805784414255314, + "loss": 0.1084, + "step": 27299 + }, + { + "epoch": 0.2369771095737016, + "grad_norm": 0.45703125, + "learning_rate": 0.0016805556993471208, + "loss": 0.1055, + "step": 27300 + }, + { + "epoch": 0.23698579005390577, + "grad_norm": 0.71875, + "learning_rate": 0.001680532956633823, + "loss": 0.0972, + "step": 27301 + }, + { + "epoch": 0.23699447053410994, + "grad_norm": 0.10791015625, + "learning_rate": 0.0016805102132856633, + "loss": 0.1367, + "step": 27302 + }, + { + "epoch": 0.2370031510143141, + "grad_norm": 0.1181640625, + "learning_rate": 0.001680487469302666, + "loss": 0.1543, + "step": 27303 + }, + { + "epoch": 0.23701183149451827, + "grad_norm": 0.55078125, + "learning_rate": 0.0016804647246848566, + "loss": 0.1904, + "step": 27304 + }, + { + "epoch": 0.23702051197472243, + "grad_norm": 0.162109375, + "learning_rate": 0.0016804419794322599, + "loss": 0.1245, + "step": 27305 + }, + { + "epoch": 0.2370291924549266, + "grad_norm": 0.271484375, + "learning_rate": 0.0016804192335449003, + "loss": 0.0894, + "step": 27306 + }, + { + "epoch": 0.23703787293513076, + "grad_norm": 0.7265625, + "learning_rate": 0.0016803964870228025, + "loss": 0.1069, + "step": 27307 + }, + { + "epoch": 0.23704655341533493, + "grad_norm": 0.10595703125, + "learning_rate": 0.0016803737398659925, + "loss": 0.1162, + "step": 27308 + }, + { + "epoch": 0.2370552338955391, + "grad_norm": 0.27734375, + "learning_rate": 0.0016803509920744941, + "loss": 0.1123, + "step": 27309 + }, + { + "epoch": 0.23706391437574326, + "grad_norm": 0.228515625, + "learning_rate": 0.0016803282436483329, + "loss": 0.1162, + "step": 27310 + }, + { + "epoch": 0.23707259485594742, + "grad_norm": 0.59765625, + "learning_rate": 0.0016803054945875332, + "loss": 0.0957, + "step": 27311 + }, + { + "epoch": 0.2370812753361516, + "grad_norm": 0.326171875, + "learning_rate": 0.0016802827448921204, + "loss": 0.1367, + "step": 27312 + }, + { + "epoch": 0.23708995581635575, + "grad_norm": 0.158203125, + "learning_rate": 0.0016802599945621186, + "loss": 0.1523, + "step": 27313 + }, + { + "epoch": 0.23709863629655992, + "grad_norm": 0.3359375, + "learning_rate": 0.0016802372435975536, + "loss": 0.1123, + "step": 27314 + }, + { + "epoch": 0.23710731677676408, + "grad_norm": 0.1064453125, + "learning_rate": 0.00168021449199845, + "loss": 0.0928, + "step": 27315 + }, + { + "epoch": 0.23711599725696825, + "grad_norm": 0.19921875, + "learning_rate": 0.0016801917397648322, + "loss": 0.0874, + "step": 27316 + }, + { + "epoch": 0.2371246777371724, + "grad_norm": 0.1240234375, + "learning_rate": 0.0016801689868967257, + "loss": 0.1338, + "step": 27317 + }, + { + "epoch": 0.23713335821737658, + "grad_norm": 0.4375, + "learning_rate": 0.001680146233394155, + "loss": 0.1787, + "step": 27318 + }, + { + "epoch": 0.23714203869758074, + "grad_norm": 0.4453125, + "learning_rate": 0.0016801234792571452, + "loss": 0.1289, + "step": 27319 + }, + { + "epoch": 0.2371507191777849, + 
"grad_norm": 0.1259765625, + "learning_rate": 0.0016801007244857209, + "loss": 0.1338, + "step": 27320 + }, + { + "epoch": 0.23715939965798907, + "grad_norm": 0.546875, + "learning_rate": 0.0016800779690799073, + "loss": 0.124, + "step": 27321 + }, + { + "epoch": 0.23716808013819324, + "grad_norm": 0.09423828125, + "learning_rate": 0.0016800552130397288, + "loss": 0.0938, + "step": 27322 + }, + { + "epoch": 0.2371767606183974, + "grad_norm": 0.48828125, + "learning_rate": 0.001680032456365211, + "loss": 0.1016, + "step": 27323 + }, + { + "epoch": 0.23718544109860157, + "grad_norm": 0.1259765625, + "learning_rate": 0.0016800096990563784, + "loss": 0.1172, + "step": 27324 + }, + { + "epoch": 0.23719412157880573, + "grad_norm": 0.19140625, + "learning_rate": 0.0016799869411132555, + "loss": 0.0894, + "step": 27325 + }, + { + "epoch": 0.2372028020590099, + "grad_norm": 0.76953125, + "learning_rate": 0.0016799641825358683, + "loss": 0.1104, + "step": 27326 + }, + { + "epoch": 0.23721148253921406, + "grad_norm": 1.1484375, + "learning_rate": 0.0016799414233242406, + "loss": 0.1143, + "step": 27327 + }, + { + "epoch": 0.23722016301941823, + "grad_norm": 0.4296875, + "learning_rate": 0.0016799186634783975, + "loss": 0.0918, + "step": 27328 + }, + { + "epoch": 0.2372288434996224, + "grad_norm": 0.69140625, + "learning_rate": 0.0016798959029983643, + "loss": 0.1992, + "step": 27329 + }, + { + "epoch": 0.23723752397982656, + "grad_norm": 0.08447265625, + "learning_rate": 0.0016798731418841656, + "loss": 0.0825, + "step": 27330 + }, + { + "epoch": 0.23724620446003072, + "grad_norm": 0.87109375, + "learning_rate": 0.0016798503801358263, + "loss": 0.1143, + "step": 27331 + }, + { + "epoch": 0.2372548849402349, + "grad_norm": 0.6328125, + "learning_rate": 0.0016798276177533715, + "loss": 0.0938, + "step": 27332 + }, + { + "epoch": 0.23726356542043905, + "grad_norm": 0.08349609375, + "learning_rate": 0.0016798048547368256, + "loss": 0.127, + "step": 27333 + }, + { + "epoch": 0.23727224590064322, + "grad_norm": 0.275390625, + "learning_rate": 0.0016797820910862142, + "loss": 0.0693, + "step": 27334 + }, + { + "epoch": 0.23728092638084738, + "grad_norm": 0.43359375, + "learning_rate": 0.0016797593268015618, + "loss": 0.0908, + "step": 27335 + }, + { + "epoch": 0.23728960686105155, + "grad_norm": 0.08154296875, + "learning_rate": 0.0016797365618828931, + "loss": 0.1021, + "step": 27336 + }, + { + "epoch": 0.23729828734125571, + "grad_norm": 0.875, + "learning_rate": 0.001679713796330233, + "loss": 0.1455, + "step": 27337 + }, + { + "epoch": 0.23730696782145988, + "grad_norm": 0.10107421875, + "learning_rate": 0.001679691030143607, + "loss": 0.1367, + "step": 27338 + }, + { + "epoch": 0.23731564830166405, + "grad_norm": 0.1650390625, + "learning_rate": 0.0016796682633230396, + "loss": 0.1172, + "step": 27339 + }, + { + "epoch": 0.2373243287818682, + "grad_norm": 0.1787109375, + "learning_rate": 0.0016796454958685554, + "loss": 0.085, + "step": 27340 + }, + { + "epoch": 0.23733300926207238, + "grad_norm": 0.33984375, + "learning_rate": 0.00167962272778018, + "loss": 0.1357, + "step": 27341 + }, + { + "epoch": 0.23734168974227654, + "grad_norm": 0.98046875, + "learning_rate": 0.0016795999590579374, + "loss": 0.1357, + "step": 27342 + }, + { + "epoch": 0.2373503702224807, + "grad_norm": 0.490234375, + "learning_rate": 0.0016795771897018534, + "loss": 0.1172, + "step": 27343 + }, + { + "epoch": 0.23735905070268487, + "grad_norm": 0.302734375, + "learning_rate": 0.0016795544197119522, + "loss": 0.1025, + "step": 
27344 + }, + { + "epoch": 0.23736773118288904, + "grad_norm": 0.1181640625, + "learning_rate": 0.0016795316490882592, + "loss": 0.0684, + "step": 27345 + }, + { + "epoch": 0.2373764116630932, + "grad_norm": 0.322265625, + "learning_rate": 0.0016795088778307993, + "loss": 0.1094, + "step": 27346 + }, + { + "epoch": 0.23738509214329737, + "grad_norm": 0.66796875, + "learning_rate": 0.0016794861059395967, + "loss": 0.1143, + "step": 27347 + }, + { + "epoch": 0.23739377262350153, + "grad_norm": 0.41015625, + "learning_rate": 0.0016794633334146771, + "loss": 0.0845, + "step": 27348 + }, + { + "epoch": 0.2374024531037057, + "grad_norm": 0.279296875, + "learning_rate": 0.001679440560256065, + "loss": 0.0771, + "step": 27349 + }, + { + "epoch": 0.23741113358390986, + "grad_norm": 0.263671875, + "learning_rate": 0.0016794177864637855, + "loss": 0.126, + "step": 27350 + }, + { + "epoch": 0.23741981406411403, + "grad_norm": 0.46875, + "learning_rate": 0.001679395012037863, + "loss": 0.0859, + "step": 27351 + }, + { + "epoch": 0.2374284945443182, + "grad_norm": 0.67578125, + "learning_rate": 0.0016793722369783234, + "loss": 0.1377, + "step": 27352 + }, + { + "epoch": 0.23743717502452236, + "grad_norm": 0.359375, + "learning_rate": 0.001679349461285191, + "loss": 0.1211, + "step": 27353 + }, + { + "epoch": 0.23744585550472652, + "grad_norm": 0.4140625, + "learning_rate": 0.0016793266849584904, + "loss": 0.1123, + "step": 27354 + }, + { + "epoch": 0.2374545359849307, + "grad_norm": 0.73828125, + "learning_rate": 0.0016793039079982472, + "loss": 0.1143, + "step": 27355 + }, + { + "epoch": 0.23746321646513485, + "grad_norm": 0.369140625, + "learning_rate": 0.0016792811304044855, + "loss": 0.0649, + "step": 27356 + }, + { + "epoch": 0.23747189694533902, + "grad_norm": 0.2578125, + "learning_rate": 0.0016792583521772308, + "loss": 0.0967, + "step": 27357 + }, + { + "epoch": 0.23748057742554318, + "grad_norm": 0.09716796875, + "learning_rate": 0.001679235573316508, + "loss": 0.0933, + "step": 27358 + }, + { + "epoch": 0.23748925790574735, + "grad_norm": 0.0947265625, + "learning_rate": 0.0016792127938223419, + "loss": 0.1387, + "step": 27359 + }, + { + "epoch": 0.2374979383859515, + "grad_norm": 0.3359375, + "learning_rate": 0.0016791900136947572, + "loss": 0.1216, + "step": 27360 + }, + { + "epoch": 0.23750661886615568, + "grad_norm": 0.3984375, + "learning_rate": 0.0016791672329337793, + "loss": 0.0854, + "step": 27361 + }, + { + "epoch": 0.23751529934635984, + "grad_norm": 0.384765625, + "learning_rate": 0.0016791444515394326, + "loss": 0.127, + "step": 27362 + }, + { + "epoch": 0.237523979826564, + "grad_norm": 1.1796875, + "learning_rate": 0.0016791216695117425, + "loss": 0.0986, + "step": 27363 + }, + { + "epoch": 0.23753266030676817, + "grad_norm": 0.1650390625, + "learning_rate": 0.0016790988868507332, + "loss": 0.0967, + "step": 27364 + }, + { + "epoch": 0.23754134078697234, + "grad_norm": 0.2431640625, + "learning_rate": 0.0016790761035564304, + "loss": 0.1348, + "step": 27365 + }, + { + "epoch": 0.2375500212671765, + "grad_norm": 0.5625, + "learning_rate": 0.0016790533196288583, + "loss": 0.0972, + "step": 27366 + }, + { + "epoch": 0.23755870174738067, + "grad_norm": 0.66015625, + "learning_rate": 0.0016790305350680426, + "loss": 0.0811, + "step": 27367 + }, + { + "epoch": 0.23756738222758483, + "grad_norm": 0.25390625, + "learning_rate": 0.0016790077498740074, + "loss": 0.1416, + "step": 27368 + }, + { + "epoch": 0.237576062707789, + "grad_norm": 0.08544921875, + "learning_rate": 
0.0016789849640467785, + "loss": 0.083, + "step": 27369 + }, + { + "epoch": 0.23758474318799316, + "grad_norm": 0.30078125, + "learning_rate": 0.00167896217758638, + "loss": 0.1133, + "step": 27370 + }, + { + "epoch": 0.23759342366819733, + "grad_norm": 0.37890625, + "learning_rate": 0.0016789393904928375, + "loss": 0.125, + "step": 27371 + }, + { + "epoch": 0.2376021041484015, + "grad_norm": 0.5625, + "learning_rate": 0.0016789166027661754, + "loss": 0.1387, + "step": 27372 + }, + { + "epoch": 0.23761078462860566, + "grad_norm": 0.251953125, + "learning_rate": 0.0016788938144064185, + "loss": 0.0996, + "step": 27373 + }, + { + "epoch": 0.23761946510880982, + "grad_norm": 0.26171875, + "learning_rate": 0.0016788710254135926, + "loss": 0.0947, + "step": 27374 + }, + { + "epoch": 0.237628145589014, + "grad_norm": 0.13671875, + "learning_rate": 0.0016788482357877217, + "loss": 0.1377, + "step": 27375 + }, + { + "epoch": 0.23763682606921815, + "grad_norm": 0.10888671875, + "learning_rate": 0.001678825445528831, + "loss": 0.1113, + "step": 27376 + }, + { + "epoch": 0.23764550654942232, + "grad_norm": 0.890625, + "learning_rate": 0.0016788026546369454, + "loss": 0.1196, + "step": 27377 + }, + { + "epoch": 0.23765418702962648, + "grad_norm": 0.37890625, + "learning_rate": 0.0016787798631120903, + "loss": 0.0996, + "step": 27378 + }, + { + "epoch": 0.23766286750983065, + "grad_norm": 0.24609375, + "learning_rate": 0.00167875707095429, + "loss": 0.0859, + "step": 27379 + }, + { + "epoch": 0.23767154799003481, + "grad_norm": 0.439453125, + "learning_rate": 0.00167873427816357, + "loss": 0.1143, + "step": 27380 + }, + { + "epoch": 0.23768022847023898, + "grad_norm": 0.2451171875, + "learning_rate": 0.0016787114847399542, + "loss": 0.0923, + "step": 27381 + }, + { + "epoch": 0.23768890895044315, + "grad_norm": 0.208984375, + "learning_rate": 0.0016786886906834688, + "loss": 0.1279, + "step": 27382 + }, + { + "epoch": 0.2376975894306473, + "grad_norm": 0.1328125, + "learning_rate": 0.001678665895994138, + "loss": 0.0996, + "step": 27383 + }, + { + "epoch": 0.23770626991085148, + "grad_norm": 0.306640625, + "learning_rate": 0.0016786431006719869, + "loss": 0.104, + "step": 27384 + }, + { + "epoch": 0.23771495039105564, + "grad_norm": 0.5234375, + "learning_rate": 0.00167862030471704, + "loss": 0.0938, + "step": 27385 + }, + { + "epoch": 0.2377236308712598, + "grad_norm": 0.54296875, + "learning_rate": 0.001678597508129323, + "loss": 0.0879, + "step": 27386 + }, + { + "epoch": 0.23773231135146397, + "grad_norm": 0.365234375, + "learning_rate": 0.0016785747109088606, + "loss": 0.1162, + "step": 27387 + }, + { + "epoch": 0.23774099183166814, + "grad_norm": 0.40234375, + "learning_rate": 0.0016785519130556774, + "loss": 0.0972, + "step": 27388 + }, + { + "epoch": 0.2377496723118723, + "grad_norm": 0.48046875, + "learning_rate": 0.0016785291145697983, + "loss": 0.0762, + "step": 27389 + }, + { + "epoch": 0.23775835279207647, + "grad_norm": 0.21484375, + "learning_rate": 0.001678506315451249, + "loss": 0.0825, + "step": 27390 + }, + { + "epoch": 0.23776703327228063, + "grad_norm": 0.408203125, + "learning_rate": 0.0016784835157000536, + "loss": 0.1055, + "step": 27391 + }, + { + "epoch": 0.2377757137524848, + "grad_norm": 0.314453125, + "learning_rate": 0.001678460715316237, + "loss": 0.0957, + "step": 27392 + }, + { + "epoch": 0.23778439423268896, + "grad_norm": 0.12158203125, + "learning_rate": 0.0016784379142998247, + "loss": 0.1094, + "step": 27393 + }, + { + "epoch": 0.23779307471289313, + "grad_norm": 
0.294921875, + "learning_rate": 0.0016784151126508413, + "loss": 0.1182, + "step": 27394 + }, + { + "epoch": 0.2378017551930973, + "grad_norm": 0.54296875, + "learning_rate": 0.0016783923103693122, + "loss": 0.1797, + "step": 27395 + }, + { + "epoch": 0.23781043567330146, + "grad_norm": 0.33984375, + "learning_rate": 0.0016783695074552617, + "loss": 0.1895, + "step": 27396 + }, + { + "epoch": 0.23781911615350562, + "grad_norm": 0.515625, + "learning_rate": 0.0016783467039087148, + "loss": 0.1016, + "step": 27397 + }, + { + "epoch": 0.2378277966337098, + "grad_norm": 0.5234375, + "learning_rate": 0.001678323899729697, + "loss": 0.1001, + "step": 27398 + }, + { + "epoch": 0.23783647711391395, + "grad_norm": 0.1953125, + "learning_rate": 0.0016783010949182324, + "loss": 0.1147, + "step": 27399 + }, + { + "epoch": 0.23784515759411812, + "grad_norm": 0.64453125, + "learning_rate": 0.001678278289474347, + "loss": 0.1138, + "step": 27400 + }, + { + "epoch": 0.23785383807432228, + "grad_norm": 0.44140625, + "learning_rate": 0.001678255483398065, + "loss": 0.1138, + "step": 27401 + }, + { + "epoch": 0.23786251855452645, + "grad_norm": 0.1826171875, + "learning_rate": 0.0016782326766894113, + "loss": 0.1182, + "step": 27402 + }, + { + "epoch": 0.2378711990347306, + "grad_norm": 0.28125, + "learning_rate": 0.0016782098693484111, + "loss": 0.1426, + "step": 27403 + }, + { + "epoch": 0.23787987951493478, + "grad_norm": 0.4140625, + "learning_rate": 0.001678187061375089, + "loss": 0.1196, + "step": 27404 + }, + { + "epoch": 0.23788855999513894, + "grad_norm": 0.197265625, + "learning_rate": 0.0016781642527694705, + "loss": 0.0903, + "step": 27405 + }, + { + "epoch": 0.2378972404753431, + "grad_norm": 0.453125, + "learning_rate": 0.0016781414435315806, + "loss": 0.1338, + "step": 27406 + }, + { + "epoch": 0.23790592095554727, + "grad_norm": 0.416015625, + "learning_rate": 0.0016781186336614433, + "loss": 0.123, + "step": 27407 + }, + { + "epoch": 0.23791460143575144, + "grad_norm": 0.1455078125, + "learning_rate": 0.0016780958231590844, + "loss": 0.1523, + "step": 27408 + }, + { + "epoch": 0.2379232819159556, + "grad_norm": 0.1337890625, + "learning_rate": 0.0016780730120245285, + "loss": 0.123, + "step": 27409 + }, + { + "epoch": 0.23793196239615977, + "grad_norm": 0.12255859375, + "learning_rate": 0.0016780502002578007, + "loss": 0.0923, + "step": 27410 + }, + { + "epoch": 0.23794064287636393, + "grad_norm": 0.412109375, + "learning_rate": 0.001678027387858926, + "loss": 0.1152, + "step": 27411 + }, + { + "epoch": 0.2379493233565681, + "grad_norm": 0.248046875, + "learning_rate": 0.001678004574827929, + "loss": 0.0898, + "step": 27412 + }, + { + "epoch": 0.23795800383677226, + "grad_norm": 0.2490234375, + "learning_rate": 0.0016779817611648352, + "loss": 0.1426, + "step": 27413 + }, + { + "epoch": 0.23796668431697643, + "grad_norm": 0.09033203125, + "learning_rate": 0.001677958946869669, + "loss": 0.1016, + "step": 27414 + }, + { + "epoch": 0.23797536479718057, + "grad_norm": 0.1787109375, + "learning_rate": 0.0016779361319424558, + "loss": 0.124, + "step": 27415 + }, + { + "epoch": 0.23798404527738473, + "grad_norm": 1.0625, + "learning_rate": 0.0016779133163832198, + "loss": 0.126, + "step": 27416 + }, + { + "epoch": 0.2379927257575889, + "grad_norm": 0.703125, + "learning_rate": 0.001677890500191987, + "loss": 0.1328, + "step": 27417 + }, + { + "epoch": 0.23800140623779306, + "grad_norm": 0.3515625, + "learning_rate": 0.001677867683368782, + "loss": 0.0703, + "step": 27418 + }, + { + "epoch": 
0.23801008671799723, + "grad_norm": 0.326171875, + "learning_rate": 0.001677844865913629, + "loss": 0.1328, + "step": 27419 + }, + { + "epoch": 0.2380187671982014, + "grad_norm": 0.91015625, + "learning_rate": 0.0016778220478265535, + "loss": 0.1025, + "step": 27420 + }, + { + "epoch": 0.23802744767840556, + "grad_norm": 0.416015625, + "learning_rate": 0.001677799229107581, + "loss": 0.1973, + "step": 27421 + }, + { + "epoch": 0.23803612815860972, + "grad_norm": 1.2890625, + "learning_rate": 0.0016777764097567359, + "loss": 0.1299, + "step": 27422 + }, + { + "epoch": 0.2380448086388139, + "grad_norm": 0.1240234375, + "learning_rate": 0.001677753589774043, + "loss": 0.1035, + "step": 27423 + }, + { + "epoch": 0.23805348911901805, + "grad_norm": 0.0830078125, + "learning_rate": 0.0016777307691595274, + "loss": 0.103, + "step": 27424 + }, + { + "epoch": 0.23806216959922222, + "grad_norm": 0.515625, + "learning_rate": 0.001677707947913214, + "loss": 0.1045, + "step": 27425 + }, + { + "epoch": 0.23807085007942638, + "grad_norm": 0.1513671875, + "learning_rate": 0.0016776851260351282, + "loss": 0.1147, + "step": 27426 + }, + { + "epoch": 0.23807953055963055, + "grad_norm": 0.275390625, + "learning_rate": 0.0016776623035252944, + "loss": 0.1113, + "step": 27427 + }, + { + "epoch": 0.2380882110398347, + "grad_norm": 0.36328125, + "learning_rate": 0.001677639480383738, + "loss": 0.0898, + "step": 27428 + }, + { + "epoch": 0.23809689152003888, + "grad_norm": 0.177734375, + "learning_rate": 0.0016776166566104835, + "loss": 0.1201, + "step": 27429 + }, + { + "epoch": 0.23810557200024304, + "grad_norm": 0.3125, + "learning_rate": 0.0016775938322055562, + "loss": 0.0791, + "step": 27430 + }, + { + "epoch": 0.2381142524804472, + "grad_norm": 0.359375, + "learning_rate": 0.0016775710071689812, + "loss": 0.0898, + "step": 27431 + }, + { + "epoch": 0.23812293296065137, + "grad_norm": 0.173828125, + "learning_rate": 0.0016775481815007832, + "loss": 0.0928, + "step": 27432 + }, + { + "epoch": 0.23813161344085554, + "grad_norm": 0.240234375, + "learning_rate": 0.001677525355200987, + "loss": 0.1006, + "step": 27433 + }, + { + "epoch": 0.2381402939210597, + "grad_norm": 0.1689453125, + "learning_rate": 0.0016775025282696177, + "loss": 0.0898, + "step": 27434 + }, + { + "epoch": 0.23814897440126387, + "grad_norm": 0.1748046875, + "learning_rate": 0.0016774797007067003, + "loss": 0.0825, + "step": 27435 + }, + { + "epoch": 0.23815765488146803, + "grad_norm": 0.126953125, + "learning_rate": 0.0016774568725122596, + "loss": 0.1338, + "step": 27436 + }, + { + "epoch": 0.2381663353616722, + "grad_norm": 0.39453125, + "learning_rate": 0.001677434043686321, + "loss": 0.1123, + "step": 27437 + }, + { + "epoch": 0.23817501584187636, + "grad_norm": 0.0673828125, + "learning_rate": 0.0016774112142289093, + "loss": 0.0742, + "step": 27438 + }, + { + "epoch": 0.23818369632208053, + "grad_norm": 0.11083984375, + "learning_rate": 0.0016773883841400493, + "loss": 0.0957, + "step": 27439 + }, + { + "epoch": 0.2381923768022847, + "grad_norm": 0.4921875, + "learning_rate": 0.0016773655534197658, + "loss": 0.1279, + "step": 27440 + }, + { + "epoch": 0.23820105728248886, + "grad_norm": 0.16015625, + "learning_rate": 0.0016773427220680844, + "loss": 0.1157, + "step": 27441 + }, + { + "epoch": 0.23820973776269302, + "grad_norm": 0.1123046875, + "learning_rate": 0.0016773198900850294, + "loss": 0.0981, + "step": 27442 + }, + { + "epoch": 0.2382184182428972, + "grad_norm": 0.27734375, + "learning_rate": 0.001677297057470626, + "loss": 
0.0928, + "step": 27443 + }, + { + "epoch": 0.23822709872310135, + "grad_norm": 0.16015625, + "learning_rate": 0.0016772742242248996, + "loss": 0.1045, + "step": 27444 + }, + { + "epoch": 0.23823577920330552, + "grad_norm": 0.130859375, + "learning_rate": 0.0016772513903478743, + "loss": 0.1582, + "step": 27445 + }, + { + "epoch": 0.23824445968350969, + "grad_norm": 0.1796875, + "learning_rate": 0.0016772285558395758, + "loss": 0.1006, + "step": 27446 + }, + { + "epoch": 0.23825314016371385, + "grad_norm": 0.99609375, + "learning_rate": 0.0016772057207000287, + "loss": 0.1318, + "step": 27447 + }, + { + "epoch": 0.23826182064391802, + "grad_norm": 0.08984375, + "learning_rate": 0.001677182884929258, + "loss": 0.0791, + "step": 27448 + }, + { + "epoch": 0.23827050112412218, + "grad_norm": 0.1533203125, + "learning_rate": 0.0016771600485272892, + "loss": 0.1348, + "step": 27449 + }, + { + "epoch": 0.23827918160432635, + "grad_norm": 0.212890625, + "learning_rate": 0.0016771372114941466, + "loss": 0.1299, + "step": 27450 + }, + { + "epoch": 0.2382878620845305, + "grad_norm": 0.578125, + "learning_rate": 0.0016771143738298553, + "loss": 0.0894, + "step": 27451 + }, + { + "epoch": 0.23829654256473468, + "grad_norm": 0.376953125, + "learning_rate": 0.0016770915355344405, + "loss": 0.1201, + "step": 27452 + }, + { + "epoch": 0.23830522304493884, + "grad_norm": 0.1640625, + "learning_rate": 0.0016770686966079275, + "loss": 0.1094, + "step": 27453 + }, + { + "epoch": 0.238313903525143, + "grad_norm": 0.318359375, + "learning_rate": 0.0016770458570503403, + "loss": 0.083, + "step": 27454 + }, + { + "epoch": 0.23832258400534717, + "grad_norm": 0.115234375, + "learning_rate": 0.0016770230168617042, + "loss": 0.0869, + "step": 27455 + }, + { + "epoch": 0.23833126448555134, + "grad_norm": 0.74609375, + "learning_rate": 0.0016770001760420448, + "loss": 0.1143, + "step": 27456 + }, + { + "epoch": 0.2383399449657555, + "grad_norm": 0.392578125, + "learning_rate": 0.0016769773345913868, + "loss": 0.0952, + "step": 27457 + }, + { + "epoch": 0.23834862544595967, + "grad_norm": 0.1806640625, + "learning_rate": 0.0016769544925097546, + "loss": 0.0903, + "step": 27458 + }, + { + "epoch": 0.23835730592616383, + "grad_norm": 0.50390625, + "learning_rate": 0.0016769316497971737, + "loss": 0.1221, + "step": 27459 + }, + { + "epoch": 0.238365986406368, + "grad_norm": 0.158203125, + "learning_rate": 0.0016769088064536694, + "loss": 0.1152, + "step": 27460 + }, + { + "epoch": 0.23837466688657216, + "grad_norm": 0.09326171875, + "learning_rate": 0.0016768859624792663, + "loss": 0.1045, + "step": 27461 + }, + { + "epoch": 0.23838334736677633, + "grad_norm": 0.109375, + "learning_rate": 0.0016768631178739892, + "loss": 0.1484, + "step": 27462 + }, + { + "epoch": 0.2383920278469805, + "grad_norm": 0.2451171875, + "learning_rate": 0.001676840272637863, + "loss": 0.1299, + "step": 27463 + }, + { + "epoch": 0.23840070832718466, + "grad_norm": 0.62109375, + "learning_rate": 0.0016768174267709132, + "loss": 0.1074, + "step": 27464 + }, + { + "epoch": 0.23840938880738882, + "grad_norm": 0.40234375, + "learning_rate": 0.0016767945802731645, + "loss": 0.0908, + "step": 27465 + }, + { + "epoch": 0.238418069287593, + "grad_norm": 0.09423828125, + "learning_rate": 0.001676771733144642, + "loss": 0.1064, + "step": 27466 + }, + { + "epoch": 0.23842674976779715, + "grad_norm": 0.63671875, + "learning_rate": 0.0016767488853853705, + "loss": 0.0933, + "step": 27467 + }, + { + "epoch": 0.23843543024800132, + "grad_norm": 0.287109375, + 
"learning_rate": 0.0016767260369953752, + "loss": 0.1055, + "step": 27468 + }, + { + "epoch": 0.23844411072820548, + "grad_norm": 0.0888671875, + "learning_rate": 0.001676703187974681, + "loss": 0.0801, + "step": 27469 + }, + { + "epoch": 0.23845279120840965, + "grad_norm": 0.1259765625, + "learning_rate": 0.0016766803383233128, + "loss": 0.1094, + "step": 27470 + }, + { + "epoch": 0.2384614716886138, + "grad_norm": 0.74609375, + "learning_rate": 0.0016766574880412955, + "loss": 0.0801, + "step": 27471 + }, + { + "epoch": 0.23847015216881798, + "grad_norm": 0.10693359375, + "learning_rate": 0.001676634637128654, + "loss": 0.0938, + "step": 27472 + }, + { + "epoch": 0.23847883264902214, + "grad_norm": 0.578125, + "learning_rate": 0.0016766117855854142, + "loss": 0.0889, + "step": 27473 + }, + { + "epoch": 0.2384875131292263, + "grad_norm": 0.224609375, + "learning_rate": 0.0016765889334116, + "loss": 0.1035, + "step": 27474 + }, + { + "epoch": 0.23849619360943047, + "grad_norm": 0.1572265625, + "learning_rate": 0.001676566080607237, + "loss": 0.1328, + "step": 27475 + }, + { + "epoch": 0.23850487408963464, + "grad_norm": 0.10791015625, + "learning_rate": 0.0016765432271723497, + "loss": 0.1055, + "step": 27476 + }, + { + "epoch": 0.2385135545698388, + "grad_norm": 0.40234375, + "learning_rate": 0.0016765203731069636, + "loss": 0.1221, + "step": 27477 + }, + { + "epoch": 0.23852223505004297, + "grad_norm": 0.2412109375, + "learning_rate": 0.0016764975184111036, + "loss": 0.1133, + "step": 27478 + }, + { + "epoch": 0.23853091553024713, + "grad_norm": 0.296875, + "learning_rate": 0.0016764746630847942, + "loss": 0.0889, + "step": 27479 + }, + { + "epoch": 0.2385395960104513, + "grad_norm": 0.26953125, + "learning_rate": 0.001676451807128061, + "loss": 0.1094, + "step": 27480 + }, + { + "epoch": 0.23854827649065546, + "grad_norm": 0.169921875, + "learning_rate": 0.001676428950540929, + "loss": 0.0967, + "step": 27481 + }, + { + "epoch": 0.23855695697085963, + "grad_norm": 0.318359375, + "learning_rate": 0.0016764060933234226, + "loss": 0.0996, + "step": 27482 + }, + { + "epoch": 0.2385656374510638, + "grad_norm": 0.412109375, + "learning_rate": 0.0016763832354755673, + "loss": 0.1172, + "step": 27483 + }, + { + "epoch": 0.23857431793126796, + "grad_norm": 0.55078125, + "learning_rate": 0.001676360376997388, + "loss": 0.1191, + "step": 27484 + }, + { + "epoch": 0.23858299841147212, + "grad_norm": 0.76953125, + "learning_rate": 0.0016763375178889095, + "loss": 0.1963, + "step": 27485 + }, + { + "epoch": 0.2385916788916763, + "grad_norm": 0.48046875, + "learning_rate": 0.001676314658150157, + "loss": 0.1211, + "step": 27486 + }, + { + "epoch": 0.23860035937188045, + "grad_norm": 0.251953125, + "learning_rate": 0.0016762917977811557, + "loss": 0.1143, + "step": 27487 + }, + { + "epoch": 0.23860903985208462, + "grad_norm": 0.33984375, + "learning_rate": 0.0016762689367819299, + "loss": 0.124, + "step": 27488 + }, + { + "epoch": 0.23861772033228879, + "grad_norm": 0.443359375, + "learning_rate": 0.0016762460751525053, + "loss": 0.1914, + "step": 27489 + }, + { + "epoch": 0.23862640081249295, + "grad_norm": 0.384765625, + "learning_rate": 0.0016762232128929066, + "loss": 0.1123, + "step": 27490 + }, + { + "epoch": 0.23863508129269712, + "grad_norm": 0.185546875, + "learning_rate": 0.001676200350003159, + "loss": 0.1182, + "step": 27491 + }, + { + "epoch": 0.23864376177290128, + "grad_norm": 0.1171875, + "learning_rate": 0.001676177486483287, + "loss": 0.1113, + "step": 27492 + }, + { + "epoch": 
0.23865244225310545, + "grad_norm": 0.1826171875, + "learning_rate": 0.0016761546223333162, + "loss": 0.0977, + "step": 27493 + }, + { + "epoch": 0.2386611227333096, + "grad_norm": 0.14453125, + "learning_rate": 0.0016761317575532716, + "loss": 0.1104, + "step": 27494 + }, + { + "epoch": 0.23866980321351378, + "grad_norm": 0.14453125, + "learning_rate": 0.0016761088921431774, + "loss": 0.1064, + "step": 27495 + }, + { + "epoch": 0.23867848369371794, + "grad_norm": 0.39453125, + "learning_rate": 0.0016760860261030596, + "loss": 0.1367, + "step": 27496 + }, + { + "epoch": 0.2386871641739221, + "grad_norm": 0.54296875, + "learning_rate": 0.0016760631594329427, + "loss": 0.1084, + "step": 27497 + }, + { + "epoch": 0.23869584465412627, + "grad_norm": 0.140625, + "learning_rate": 0.0016760402921328517, + "loss": 0.1328, + "step": 27498 + }, + { + "epoch": 0.23870452513433044, + "grad_norm": 0.365234375, + "learning_rate": 0.001676017424202812, + "loss": 0.1143, + "step": 27499 + }, + { + "epoch": 0.2387132056145346, + "grad_norm": 0.14453125, + "learning_rate": 0.0016759945556428478, + "loss": 0.0874, + "step": 27500 + }, + { + "epoch": 0.23872188609473877, + "grad_norm": 0.09326171875, + "learning_rate": 0.001675971686452985, + "loss": 0.0957, + "step": 27501 + }, + { + "epoch": 0.23873056657494293, + "grad_norm": 0.14453125, + "learning_rate": 0.001675948816633248, + "loss": 0.1455, + "step": 27502 + }, + { + "epoch": 0.2387392470551471, + "grad_norm": 0.2490234375, + "learning_rate": 0.001675925946183662, + "loss": 0.1138, + "step": 27503 + }, + { + "epoch": 0.23874792753535126, + "grad_norm": 0.125, + "learning_rate": 0.0016759030751042523, + "loss": 0.1113, + "step": 27504 + }, + { + "epoch": 0.23875660801555543, + "grad_norm": 0.6953125, + "learning_rate": 0.0016758802033950432, + "loss": 0.1484, + "step": 27505 + }, + { + "epoch": 0.2387652884957596, + "grad_norm": 0.1064453125, + "learning_rate": 0.0016758573310560606, + "loss": 0.0933, + "step": 27506 + }, + { + "epoch": 0.23877396897596376, + "grad_norm": 0.10400390625, + "learning_rate": 0.0016758344580873289, + "loss": 0.104, + "step": 27507 + }, + { + "epoch": 0.23878264945616792, + "grad_norm": 0.1181640625, + "learning_rate": 0.0016758115844888735, + "loss": 0.1055, + "step": 27508 + }, + { + "epoch": 0.2387913299363721, + "grad_norm": 0.279296875, + "learning_rate": 0.001675788710260719, + "loss": 0.0806, + "step": 27509 + }, + { + "epoch": 0.23880001041657625, + "grad_norm": 0.1337890625, + "learning_rate": 0.0016757658354028906, + "loss": 0.1436, + "step": 27510 + }, + { + "epoch": 0.23880869089678042, + "grad_norm": 0.10888671875, + "learning_rate": 0.0016757429599154132, + "loss": 0.1289, + "step": 27511 + }, + { + "epoch": 0.23881737137698458, + "grad_norm": 0.10009765625, + "learning_rate": 0.0016757200837983122, + "loss": 0.1387, + "step": 27512 + }, + { + "epoch": 0.23882605185718875, + "grad_norm": 0.31640625, + "learning_rate": 0.0016756972070516123, + "loss": 0.1719, + "step": 27513 + }, + { + "epoch": 0.2388347323373929, + "grad_norm": 0.63671875, + "learning_rate": 0.0016756743296753387, + "loss": 0.0859, + "step": 27514 + }, + { + "epoch": 0.23884341281759708, + "grad_norm": 3.28125, + "learning_rate": 0.001675651451669516, + "loss": 0.2275, + "step": 27515 + }, + { + "epoch": 0.23885209329780124, + "grad_norm": 0.1328125, + "learning_rate": 0.0016756285730341698, + "loss": 0.1172, + "step": 27516 + }, + { + "epoch": 0.2388607737780054, + "grad_norm": 0.1494140625, + "learning_rate": 0.0016756056937693247, + "loss": 
0.123, + "step": 27517 + }, + { + "epoch": 0.23886945425820957, + "grad_norm": 0.5546875, + "learning_rate": 0.0016755828138750057, + "loss": 0.167, + "step": 27518 + }, + { + "epoch": 0.23887813473841374, + "grad_norm": 0.3046875, + "learning_rate": 0.0016755599333512384, + "loss": 0.0825, + "step": 27519 + }, + { + "epoch": 0.2388868152186179, + "grad_norm": 0.09033203125, + "learning_rate": 0.001675537052198047, + "loss": 0.1338, + "step": 27520 + }, + { + "epoch": 0.23889549569882207, + "grad_norm": 0.271484375, + "learning_rate": 0.001675514170415457, + "loss": 0.1416, + "step": 27521 + }, + { + "epoch": 0.23890417617902623, + "grad_norm": 0.40625, + "learning_rate": 0.0016754912880034935, + "loss": 0.1465, + "step": 27522 + }, + { + "epoch": 0.2389128566592304, + "grad_norm": 0.74609375, + "learning_rate": 0.0016754684049621815, + "loss": 0.1348, + "step": 27523 + }, + { + "epoch": 0.23892153713943456, + "grad_norm": 0.49609375, + "learning_rate": 0.0016754455212915455, + "loss": 0.1182, + "step": 27524 + }, + { + "epoch": 0.23893021761963873, + "grad_norm": 0.56640625, + "learning_rate": 0.0016754226369916116, + "loss": 0.0923, + "step": 27525 + }, + { + "epoch": 0.2389388980998429, + "grad_norm": 0.19921875, + "learning_rate": 0.0016753997520624034, + "loss": 0.0967, + "step": 27526 + }, + { + "epoch": 0.23894757858004706, + "grad_norm": 0.1396484375, + "learning_rate": 0.001675376866503947, + "loss": 0.127, + "step": 27527 + }, + { + "epoch": 0.23895625906025122, + "grad_norm": 0.12890625, + "learning_rate": 0.001675353980316267, + "loss": 0.1211, + "step": 27528 + }, + { + "epoch": 0.2389649395404554, + "grad_norm": 0.0771484375, + "learning_rate": 0.0016753310934993887, + "loss": 0.0986, + "step": 27529 + }, + { + "epoch": 0.23897362002065956, + "grad_norm": 0.189453125, + "learning_rate": 0.0016753082060533369, + "loss": 0.0801, + "step": 27530 + }, + { + "epoch": 0.23898230050086372, + "grad_norm": 0.09521484375, + "learning_rate": 0.0016752853179781368, + "loss": 0.1152, + "step": 27531 + }, + { + "epoch": 0.23899098098106789, + "grad_norm": 0.294921875, + "learning_rate": 0.001675262429273813, + "loss": 0.0889, + "step": 27532 + }, + { + "epoch": 0.23899966146127205, + "grad_norm": 0.1767578125, + "learning_rate": 0.0016752395399403913, + "loss": 0.1582, + "step": 27533 + }, + { + "epoch": 0.23900834194147622, + "grad_norm": 0.271484375, + "learning_rate": 0.0016752166499778957, + "loss": 0.0859, + "step": 27534 + }, + { + "epoch": 0.23901702242168038, + "grad_norm": 0.44140625, + "learning_rate": 0.0016751937593863521, + "loss": 0.1133, + "step": 27535 + }, + { + "epoch": 0.23902570290188455, + "grad_norm": 0.314453125, + "learning_rate": 0.0016751708681657855, + "loss": 0.0938, + "step": 27536 + }, + { + "epoch": 0.2390343833820887, + "grad_norm": 0.220703125, + "learning_rate": 0.0016751479763162205, + "loss": 0.0996, + "step": 27537 + }, + { + "epoch": 0.23904306386229285, + "grad_norm": 0.162109375, + "learning_rate": 0.001675125083837682, + "loss": 0.0918, + "step": 27538 + }, + { + "epoch": 0.239051744342497, + "grad_norm": 0.61328125, + "learning_rate": 0.0016751021907301959, + "loss": 0.0996, + "step": 27539 + }, + { + "epoch": 0.23906042482270118, + "grad_norm": 0.1728515625, + "learning_rate": 0.0016750792969937864, + "loss": 0.1113, + "step": 27540 + }, + { + "epoch": 0.23906910530290534, + "grad_norm": 0.61328125, + "learning_rate": 0.0016750564026284786, + "loss": 0.125, + "step": 27541 + }, + { + "epoch": 0.2390777857831095, + "grad_norm": 0.212890625, + 
"learning_rate": 0.001675033507634298, + "loss": 0.0967, + "step": 27542 + }, + { + "epoch": 0.23908646626331367, + "grad_norm": 0.236328125, + "learning_rate": 0.0016750106120112694, + "loss": 0.0854, + "step": 27543 + }, + { + "epoch": 0.23909514674351784, + "grad_norm": 0.154296875, + "learning_rate": 0.001674987715759418, + "loss": 0.0918, + "step": 27544 + }, + { + "epoch": 0.239103827223722, + "grad_norm": 0.42578125, + "learning_rate": 0.0016749648188787683, + "loss": 0.1191, + "step": 27545 + }, + { + "epoch": 0.23911250770392617, + "grad_norm": 0.2216796875, + "learning_rate": 0.001674941921369346, + "loss": 0.1387, + "step": 27546 + }, + { + "epoch": 0.23912118818413033, + "grad_norm": 0.99609375, + "learning_rate": 0.0016749190232311756, + "loss": 0.1523, + "step": 27547 + }, + { + "epoch": 0.2391298686643345, + "grad_norm": 0.294921875, + "learning_rate": 0.0016748961244642826, + "loss": 0.1172, + "step": 27548 + }, + { + "epoch": 0.23913854914453866, + "grad_norm": 0.1240234375, + "learning_rate": 0.001674873225068692, + "loss": 0.124, + "step": 27549 + }, + { + "epoch": 0.23914722962474283, + "grad_norm": 0.255859375, + "learning_rate": 0.0016748503250444285, + "loss": 0.1191, + "step": 27550 + }, + { + "epoch": 0.239155910104947, + "grad_norm": 0.400390625, + "learning_rate": 0.0016748274243915172, + "loss": 0.1221, + "step": 27551 + }, + { + "epoch": 0.23916459058515116, + "grad_norm": 0.16015625, + "learning_rate": 0.0016748045231099833, + "loss": 0.123, + "step": 27552 + }, + { + "epoch": 0.23917327106535533, + "grad_norm": 0.10400390625, + "learning_rate": 0.001674781621199852, + "loss": 0.1123, + "step": 27553 + }, + { + "epoch": 0.2391819515455595, + "grad_norm": 1.40625, + "learning_rate": 0.0016747587186611477, + "loss": 0.0889, + "step": 27554 + }, + { + "epoch": 0.23919063202576366, + "grad_norm": 0.123046875, + "learning_rate": 0.0016747358154938963, + "loss": 0.0918, + "step": 27555 + }, + { + "epoch": 0.23919931250596782, + "grad_norm": 0.1826171875, + "learning_rate": 0.0016747129116981227, + "loss": 0.0776, + "step": 27556 + }, + { + "epoch": 0.23920799298617199, + "grad_norm": 0.76171875, + "learning_rate": 0.0016746900072738512, + "loss": 0.0767, + "step": 27557 + }, + { + "epoch": 0.23921667346637615, + "grad_norm": 0.8515625, + "learning_rate": 0.0016746671022211074, + "loss": 0.1807, + "step": 27558 + }, + { + "epoch": 0.23922535394658032, + "grad_norm": 0.103515625, + "learning_rate": 0.0016746441965399164, + "loss": 0.0977, + "step": 27559 + }, + { + "epoch": 0.23923403442678448, + "grad_norm": 0.30859375, + "learning_rate": 0.0016746212902303034, + "loss": 0.1113, + "step": 27560 + }, + { + "epoch": 0.23924271490698865, + "grad_norm": 0.076171875, + "learning_rate": 0.0016745983832922927, + "loss": 0.0898, + "step": 27561 + }, + { + "epoch": 0.2392513953871928, + "grad_norm": 1.03125, + "learning_rate": 0.00167457547572591, + "loss": 0.1816, + "step": 27562 + }, + { + "epoch": 0.23926007586739698, + "grad_norm": 0.26171875, + "learning_rate": 0.0016745525675311804, + "loss": 0.1348, + "step": 27563 + }, + { + "epoch": 0.23926875634760114, + "grad_norm": 0.2734375, + "learning_rate": 0.0016745296587081285, + "loss": 0.1113, + "step": 27564 + }, + { + "epoch": 0.2392774368278053, + "grad_norm": 0.283203125, + "learning_rate": 0.0016745067492567798, + "loss": 0.1426, + "step": 27565 + }, + { + "epoch": 0.23928611730800947, + "grad_norm": 0.197265625, + "learning_rate": 0.0016744838391771588, + "loss": 0.1562, + "step": 27566 + }, + { + "epoch": 
0.23929479778821364, + "grad_norm": 1.390625, + "learning_rate": 0.0016744609284692913, + "loss": 0.1504, + "step": 27567 + }, + { + "epoch": 0.2393034782684178, + "grad_norm": 0.10693359375, + "learning_rate": 0.0016744380171332019, + "loss": 0.1172, + "step": 27568 + }, + { + "epoch": 0.23931215874862197, + "grad_norm": 0.181640625, + "learning_rate": 0.0016744151051689157, + "loss": 0.0898, + "step": 27569 + }, + { + "epoch": 0.23932083922882613, + "grad_norm": 0.3125, + "learning_rate": 0.0016743921925764576, + "loss": 0.1338, + "step": 27570 + }, + { + "epoch": 0.2393295197090303, + "grad_norm": 0.0751953125, + "learning_rate": 0.0016743692793558529, + "loss": 0.1055, + "step": 27571 + }, + { + "epoch": 0.23933820018923446, + "grad_norm": 0.16015625, + "learning_rate": 0.0016743463655071267, + "loss": 0.0811, + "step": 27572 + }, + { + "epoch": 0.23934688066943863, + "grad_norm": 1.234375, + "learning_rate": 0.0016743234510303037, + "loss": 0.0986, + "step": 27573 + }, + { + "epoch": 0.2393555611496428, + "grad_norm": 0.57421875, + "learning_rate": 0.0016743005359254096, + "loss": 0.1045, + "step": 27574 + }, + { + "epoch": 0.23936424162984696, + "grad_norm": 0.10888671875, + "learning_rate": 0.0016742776201924686, + "loss": 0.0762, + "step": 27575 + }, + { + "epoch": 0.23937292211005112, + "grad_norm": 0.39453125, + "learning_rate": 0.0016742547038315066, + "loss": 0.0977, + "step": 27576 + }, + { + "epoch": 0.2393816025902553, + "grad_norm": 0.369140625, + "learning_rate": 0.0016742317868425481, + "loss": 0.1221, + "step": 27577 + }, + { + "epoch": 0.23939028307045945, + "grad_norm": 0.291015625, + "learning_rate": 0.0016742088692256183, + "loss": 0.0806, + "step": 27578 + }, + { + "epoch": 0.23939896355066362, + "grad_norm": 0.31640625, + "learning_rate": 0.0016741859509807424, + "loss": 0.1035, + "step": 27579 + }, + { + "epoch": 0.23940764403086778, + "grad_norm": 0.5234375, + "learning_rate": 0.0016741630321079452, + "loss": 0.1201, + "step": 27580 + }, + { + "epoch": 0.23941632451107195, + "grad_norm": 0.484375, + "learning_rate": 0.0016741401126072523, + "loss": 0.0938, + "step": 27581 + }, + { + "epoch": 0.2394250049912761, + "grad_norm": 0.16796875, + "learning_rate": 0.0016741171924786883, + "loss": 0.1064, + "step": 27582 + }, + { + "epoch": 0.23943368547148028, + "grad_norm": 0.294921875, + "learning_rate": 0.0016740942717222782, + "loss": 0.0728, + "step": 27583 + }, + { + "epoch": 0.23944236595168444, + "grad_norm": 0.57421875, + "learning_rate": 0.0016740713503380473, + "loss": 0.1069, + "step": 27584 + }, + { + "epoch": 0.2394510464318886, + "grad_norm": 0.64453125, + "learning_rate": 0.0016740484283260207, + "loss": 0.0986, + "step": 27585 + }, + { + "epoch": 0.23945972691209277, + "grad_norm": 0.349609375, + "learning_rate": 0.001674025505686223, + "loss": 0.0957, + "step": 27586 + }, + { + "epoch": 0.23946840739229694, + "grad_norm": 0.65234375, + "learning_rate": 0.00167400258241868, + "loss": 0.1108, + "step": 27587 + }, + { + "epoch": 0.2394770878725011, + "grad_norm": 0.244140625, + "learning_rate": 0.0016739796585234165, + "loss": 0.0703, + "step": 27588 + }, + { + "epoch": 0.23948576835270527, + "grad_norm": 0.1669921875, + "learning_rate": 0.0016739567340004569, + "loss": 0.1074, + "step": 27589 + }, + { + "epoch": 0.23949444883290943, + "grad_norm": 0.0771484375, + "learning_rate": 0.0016739338088498272, + "loss": 0.0737, + "step": 27590 + }, + { + "epoch": 0.2395031293131136, + "grad_norm": 0.2353515625, + "learning_rate": 0.0016739108830715523, + "loss": 
0.0981, + "step": 27591 + }, + { + "epoch": 0.23951180979331776, + "grad_norm": 0.30078125, + "learning_rate": 0.001673887956665657, + "loss": 0.1543, + "step": 27592 + }, + { + "epoch": 0.23952049027352193, + "grad_norm": 0.125, + "learning_rate": 0.001673865029632166, + "loss": 0.0874, + "step": 27593 + }, + { + "epoch": 0.2395291707537261, + "grad_norm": 0.318359375, + "learning_rate": 0.0016738421019711052, + "loss": 0.1021, + "step": 27594 + }, + { + "epoch": 0.23953785123393026, + "grad_norm": 0.302734375, + "learning_rate": 0.0016738191736824994, + "loss": 0.1377, + "step": 27595 + }, + { + "epoch": 0.23954653171413443, + "grad_norm": 0.08642578125, + "learning_rate": 0.0016737962447663736, + "loss": 0.1133, + "step": 27596 + }, + { + "epoch": 0.2395552121943386, + "grad_norm": 0.3359375, + "learning_rate": 0.0016737733152227523, + "loss": 0.084, + "step": 27597 + }, + { + "epoch": 0.23956389267454276, + "grad_norm": 0.208984375, + "learning_rate": 0.0016737503850516615, + "loss": 0.106, + "step": 27598 + }, + { + "epoch": 0.23957257315474692, + "grad_norm": 0.349609375, + "learning_rate": 0.0016737274542531263, + "loss": 0.1035, + "step": 27599 + }, + { + "epoch": 0.23958125363495109, + "grad_norm": 0.291015625, + "learning_rate": 0.0016737045228271705, + "loss": 0.1357, + "step": 27600 + }, + { + "epoch": 0.23958993411515525, + "grad_norm": 0.458984375, + "learning_rate": 0.0016736815907738207, + "loss": 0.0767, + "step": 27601 + }, + { + "epoch": 0.23959861459535942, + "grad_norm": 0.68359375, + "learning_rate": 0.0016736586580931012, + "loss": 0.0894, + "step": 27602 + }, + { + "epoch": 0.23960729507556358, + "grad_norm": 0.400390625, + "learning_rate": 0.001673635724785037, + "loss": 0.0967, + "step": 27603 + }, + { + "epoch": 0.23961597555576775, + "grad_norm": 0.326171875, + "learning_rate": 0.0016736127908496539, + "loss": 0.1143, + "step": 27604 + }, + { + "epoch": 0.2396246560359719, + "grad_norm": 0.21484375, + "learning_rate": 0.001673589856286976, + "loss": 0.1172, + "step": 27605 + }, + { + "epoch": 0.23963333651617608, + "grad_norm": 0.27734375, + "learning_rate": 0.0016735669210970288, + "loss": 0.0986, + "step": 27606 + }, + { + "epoch": 0.23964201699638024, + "grad_norm": 0.197265625, + "learning_rate": 0.0016735439852798377, + "loss": 0.1172, + "step": 27607 + }, + { + "epoch": 0.2396506974765844, + "grad_norm": 0.48046875, + "learning_rate": 0.0016735210488354275, + "loss": 0.1445, + "step": 27608 + }, + { + "epoch": 0.23965937795678857, + "grad_norm": 0.5, + "learning_rate": 0.0016734981117638233, + "loss": 0.1074, + "step": 27609 + }, + { + "epoch": 0.23966805843699274, + "grad_norm": 0.10791015625, + "learning_rate": 0.0016734751740650501, + "loss": 0.0928, + "step": 27610 + }, + { + "epoch": 0.2396767389171969, + "grad_norm": 1.203125, + "learning_rate": 0.001673452235739133, + "loss": 0.1602, + "step": 27611 + }, + { + "epoch": 0.23968541939740107, + "grad_norm": 0.3046875, + "learning_rate": 0.0016734292967860974, + "loss": 0.126, + "step": 27612 + }, + { + "epoch": 0.23969409987760523, + "grad_norm": 0.13671875, + "learning_rate": 0.0016734063572059678, + "loss": 0.1143, + "step": 27613 + }, + { + "epoch": 0.2397027803578094, + "grad_norm": 0.54296875, + "learning_rate": 0.0016733834169987698, + "loss": 0.1157, + "step": 27614 + }, + { + "epoch": 0.23971146083801356, + "grad_norm": 0.1328125, + "learning_rate": 0.0016733604761645282, + "loss": 0.1123, + "step": 27615 + }, + { + "epoch": 0.23972014131821773, + "grad_norm": 0.61328125, + "learning_rate": 
0.0016733375347032684, + "loss": 0.0942, + "step": 27616 + }, + { + "epoch": 0.2397288217984219, + "grad_norm": 0.2412109375, + "learning_rate": 0.0016733145926150152, + "loss": 0.0781, + "step": 27617 + }, + { + "epoch": 0.23973750227862606, + "grad_norm": 0.42578125, + "learning_rate": 0.0016732916498997938, + "loss": 0.1113, + "step": 27618 + }, + { + "epoch": 0.23974618275883022, + "grad_norm": 0.443359375, + "learning_rate": 0.0016732687065576296, + "loss": 0.1143, + "step": 27619 + }, + { + "epoch": 0.2397548632390344, + "grad_norm": 0.32421875, + "learning_rate": 0.0016732457625885469, + "loss": 0.083, + "step": 27620 + }, + { + "epoch": 0.23976354371923855, + "grad_norm": 0.1064453125, + "learning_rate": 0.0016732228179925713, + "loss": 0.1118, + "step": 27621 + }, + { + "epoch": 0.23977222419944272, + "grad_norm": 0.267578125, + "learning_rate": 0.001673199872769728, + "loss": 0.1226, + "step": 27622 + }, + { + "epoch": 0.23978090467964688, + "grad_norm": 0.08154296875, + "learning_rate": 0.0016731769269200416, + "loss": 0.0942, + "step": 27623 + }, + { + "epoch": 0.23978958515985105, + "grad_norm": 0.1474609375, + "learning_rate": 0.0016731539804435377, + "loss": 0.1289, + "step": 27624 + }, + { + "epoch": 0.2397982656400552, + "grad_norm": 0.09521484375, + "learning_rate": 0.0016731310333402412, + "loss": 0.125, + "step": 27625 + }, + { + "epoch": 0.23980694612025938, + "grad_norm": 0.2080078125, + "learning_rate": 0.0016731080856101773, + "loss": 0.0967, + "step": 27626 + }, + { + "epoch": 0.23981562660046354, + "grad_norm": 0.314453125, + "learning_rate": 0.001673085137253371, + "loss": 0.1123, + "step": 27627 + }, + { + "epoch": 0.2398243070806677, + "grad_norm": 0.83984375, + "learning_rate": 0.0016730621882698473, + "loss": 0.1001, + "step": 27628 + }, + { + "epoch": 0.23983298756087187, + "grad_norm": 0.703125, + "learning_rate": 0.0016730392386596314, + "loss": 0.1572, + "step": 27629 + }, + { + "epoch": 0.23984166804107604, + "grad_norm": 0.1494140625, + "learning_rate": 0.0016730162884227486, + "loss": 0.1777, + "step": 27630 + }, + { + "epoch": 0.2398503485212802, + "grad_norm": 0.181640625, + "learning_rate": 0.0016729933375592239, + "loss": 0.1113, + "step": 27631 + }, + { + "epoch": 0.23985902900148437, + "grad_norm": 0.39453125, + "learning_rate": 0.0016729703860690816, + "loss": 0.084, + "step": 27632 + }, + { + "epoch": 0.23986770948168853, + "grad_norm": 0.11376953125, + "learning_rate": 0.0016729474339523484, + "loss": 0.1055, + "step": 27633 + }, + { + "epoch": 0.2398763899618927, + "grad_norm": 0.1044921875, + "learning_rate": 0.0016729244812090478, + "loss": 0.0942, + "step": 27634 + }, + { + "epoch": 0.23988507044209686, + "grad_norm": 0.6796875, + "learning_rate": 0.0016729015278392056, + "loss": 0.1523, + "step": 27635 + }, + { + "epoch": 0.23989375092230103, + "grad_norm": 0.11279296875, + "learning_rate": 0.0016728785738428474, + "loss": 0.1016, + "step": 27636 + }, + { + "epoch": 0.2399024314025052, + "grad_norm": 0.353515625, + "learning_rate": 0.0016728556192199975, + "loss": 0.1074, + "step": 27637 + }, + { + "epoch": 0.23991111188270936, + "grad_norm": 0.3984375, + "learning_rate": 0.0016728326639706814, + "loss": 0.1016, + "step": 27638 + }, + { + "epoch": 0.23991979236291353, + "grad_norm": 0.412109375, + "learning_rate": 0.0016728097080949238, + "loss": 0.1328, + "step": 27639 + }, + { + "epoch": 0.2399284728431177, + "grad_norm": 0.1298828125, + "learning_rate": 0.0016727867515927504, + "loss": 0.0767, + "step": 27640 + }, + { + "epoch": 
0.23993715332332186, + "grad_norm": 0.10693359375, + "learning_rate": 0.0016727637944641863, + "loss": 0.1211, + "step": 27641 + }, + { + "epoch": 0.23994583380352602, + "grad_norm": 0.443359375, + "learning_rate": 0.0016727408367092557, + "loss": 0.0967, + "step": 27642 + }, + { + "epoch": 0.23995451428373019, + "grad_norm": 0.419921875, + "learning_rate": 0.0016727178783279848, + "loss": 0.1484, + "step": 27643 + }, + { + "epoch": 0.23996319476393435, + "grad_norm": 0.62890625, + "learning_rate": 0.001672694919320398, + "loss": 0.1328, + "step": 27644 + }, + { + "epoch": 0.23997187524413852, + "grad_norm": 0.08251953125, + "learning_rate": 0.0016726719596865205, + "loss": 0.0811, + "step": 27645 + }, + { + "epoch": 0.23998055572434268, + "grad_norm": 0.36328125, + "learning_rate": 0.001672648999426378, + "loss": 0.1318, + "step": 27646 + }, + { + "epoch": 0.23998923620454685, + "grad_norm": 0.6015625, + "learning_rate": 0.0016726260385399947, + "loss": 0.2754, + "step": 27647 + }, + { + "epoch": 0.239997916684751, + "grad_norm": 0.36328125, + "learning_rate": 0.0016726030770273966, + "loss": 0.1152, + "step": 27648 + }, + { + "epoch": 0.24000659716495518, + "grad_norm": 0.287109375, + "learning_rate": 0.0016725801148886079, + "loss": 0.0869, + "step": 27649 + }, + { + "epoch": 0.24001527764515934, + "grad_norm": 0.5390625, + "learning_rate": 0.0016725571521236545, + "loss": 0.1299, + "step": 27650 + }, + { + "epoch": 0.2400239581253635, + "grad_norm": 0.58984375, + "learning_rate": 0.0016725341887325613, + "loss": 0.1055, + "step": 27651 + }, + { + "epoch": 0.24003263860556767, + "grad_norm": 0.1640625, + "learning_rate": 0.001672511224715353, + "loss": 0.1113, + "step": 27652 + }, + { + "epoch": 0.24004131908577184, + "grad_norm": 0.384765625, + "learning_rate": 0.0016724882600720554, + "loss": 0.1191, + "step": 27653 + }, + { + "epoch": 0.240049999565976, + "grad_norm": 0.1708984375, + "learning_rate": 0.0016724652948026928, + "loss": 0.1006, + "step": 27654 + }, + { + "epoch": 0.24005868004618017, + "grad_norm": 0.1513671875, + "learning_rate": 0.001672442328907291, + "loss": 0.0859, + "step": 27655 + }, + { + "epoch": 0.24006736052638433, + "grad_norm": 0.16015625, + "learning_rate": 0.001672419362385875, + "loss": 0.1025, + "step": 27656 + }, + { + "epoch": 0.2400760410065885, + "grad_norm": 0.251953125, + "learning_rate": 0.0016723963952384698, + "loss": 0.1055, + "step": 27657 + }, + { + "epoch": 0.24008472148679266, + "grad_norm": 0.515625, + "learning_rate": 0.0016723734274651002, + "loss": 0.0972, + "step": 27658 + }, + { + "epoch": 0.24009340196699683, + "grad_norm": 0.267578125, + "learning_rate": 0.0016723504590657916, + "loss": 0.1201, + "step": 27659 + }, + { + "epoch": 0.240102082447201, + "grad_norm": 0.65625, + "learning_rate": 0.0016723274900405695, + "loss": 0.1357, + "step": 27660 + }, + { + "epoch": 0.24011076292740513, + "grad_norm": 0.349609375, + "learning_rate": 0.0016723045203894582, + "loss": 0.0859, + "step": 27661 + }, + { + "epoch": 0.2401194434076093, + "grad_norm": 0.1298828125, + "learning_rate": 0.0016722815501124836, + "loss": 0.125, + "step": 27662 + }, + { + "epoch": 0.24012812388781346, + "grad_norm": 0.17578125, + "learning_rate": 0.0016722585792096703, + "loss": 0.0879, + "step": 27663 + }, + { + "epoch": 0.24013680436801763, + "grad_norm": 1.3984375, + "learning_rate": 0.0016722356076810437, + "loss": 0.1543, + "step": 27664 + }, + { + "epoch": 0.2401454848482218, + "grad_norm": 0.19921875, + "learning_rate": 0.001672212635526629, + "loss": 
0.1128, + "step": 27665 + }, + { + "epoch": 0.24015416532842596, + "grad_norm": 0.119140625, + "learning_rate": 0.0016721896627464507, + "loss": 0.1133, + "step": 27666 + }, + { + "epoch": 0.24016284580863012, + "grad_norm": 0.388671875, + "learning_rate": 0.0016721666893405348, + "loss": 0.1309, + "step": 27667 + }, + { + "epoch": 0.24017152628883429, + "grad_norm": 0.14453125, + "learning_rate": 0.0016721437153089061, + "loss": 0.1543, + "step": 27668 + }, + { + "epoch": 0.24018020676903845, + "grad_norm": 0.275390625, + "learning_rate": 0.0016721207406515896, + "loss": 0.1045, + "step": 27669 + }, + { + "epoch": 0.24018888724924262, + "grad_norm": 0.1796875, + "learning_rate": 0.0016720977653686103, + "loss": 0.106, + "step": 27670 + }, + { + "epoch": 0.24019756772944678, + "grad_norm": 0.10546875, + "learning_rate": 0.0016720747894599934, + "loss": 0.1084, + "step": 27671 + }, + { + "epoch": 0.24020624820965095, + "grad_norm": 0.482421875, + "learning_rate": 0.0016720518129257642, + "loss": 0.1436, + "step": 27672 + }, + { + "epoch": 0.2402149286898551, + "grad_norm": 0.74609375, + "learning_rate": 0.0016720288357659477, + "loss": 0.0869, + "step": 27673 + }, + { + "epoch": 0.24022360917005928, + "grad_norm": 0.2734375, + "learning_rate": 0.001672005857980569, + "loss": 0.0938, + "step": 27674 + }, + { + "epoch": 0.24023228965026344, + "grad_norm": 0.1630859375, + "learning_rate": 0.0016719828795696532, + "loss": 0.0811, + "step": 27675 + }, + { + "epoch": 0.2402409701304676, + "grad_norm": 0.07421875, + "learning_rate": 0.001671959900533226, + "loss": 0.0869, + "step": 27676 + }, + { + "epoch": 0.24024965061067177, + "grad_norm": 0.091796875, + "learning_rate": 0.0016719369208713117, + "loss": 0.0977, + "step": 27677 + }, + { + "epoch": 0.24025833109087594, + "grad_norm": 0.1044921875, + "learning_rate": 0.0016719139405839357, + "loss": 0.1226, + "step": 27678 + }, + { + "epoch": 0.2402670115710801, + "grad_norm": 0.091796875, + "learning_rate": 0.0016718909596711233, + "loss": 0.0908, + "step": 27679 + }, + { + "epoch": 0.24027569205128427, + "grad_norm": 0.5234375, + "learning_rate": 0.0016718679781328998, + "loss": 0.209, + "step": 27680 + }, + { + "epoch": 0.24028437253148843, + "grad_norm": 0.56640625, + "learning_rate": 0.0016718449959692897, + "loss": 0.1074, + "step": 27681 + }, + { + "epoch": 0.2402930530116926, + "grad_norm": 0.365234375, + "learning_rate": 0.0016718220131803185, + "loss": 0.0864, + "step": 27682 + }, + { + "epoch": 0.24030173349189676, + "grad_norm": 0.404296875, + "learning_rate": 0.0016717990297660118, + "loss": 0.0796, + "step": 27683 + }, + { + "epoch": 0.24031041397210093, + "grad_norm": 0.306640625, + "learning_rate": 0.0016717760457263936, + "loss": 0.1025, + "step": 27684 + }, + { + "epoch": 0.2403190944523051, + "grad_norm": 0.5546875, + "learning_rate": 0.00167175306106149, + "loss": 0.1196, + "step": 27685 + }, + { + "epoch": 0.24032777493250926, + "grad_norm": 0.2734375, + "learning_rate": 0.001671730075771326, + "loss": 0.1201, + "step": 27686 + }, + { + "epoch": 0.24033645541271342, + "grad_norm": 0.09423828125, + "learning_rate": 0.0016717070898559269, + "loss": 0.1553, + "step": 27687 + }, + { + "epoch": 0.2403451358929176, + "grad_norm": 0.1328125, + "learning_rate": 0.0016716841033153165, + "loss": 0.0928, + "step": 27688 + }, + { + "epoch": 0.24035381637312175, + "grad_norm": 0.515625, + "learning_rate": 0.0016716611161495217, + "loss": 0.0957, + "step": 27689 + }, + { + "epoch": 0.24036249685332592, + "grad_norm": 0.578125, + 
"learning_rate": 0.0016716381283585669, + "loss": 0.1089, + "step": 27690 + }, + { + "epoch": 0.24037117733353008, + "grad_norm": 0.1845703125, + "learning_rate": 0.001671615139942477, + "loss": 0.1201, + "step": 27691 + }, + { + "epoch": 0.24037985781373425, + "grad_norm": 1.5234375, + "learning_rate": 0.0016715921509012773, + "loss": 0.1289, + "step": 27692 + }, + { + "epoch": 0.24038853829393841, + "grad_norm": 0.6015625, + "learning_rate": 0.0016715691612349932, + "loss": 0.1348, + "step": 27693 + }, + { + "epoch": 0.24039721877414258, + "grad_norm": 0.1533203125, + "learning_rate": 0.0016715461709436495, + "loss": 0.0962, + "step": 27694 + }, + { + "epoch": 0.24040589925434674, + "grad_norm": 0.1015625, + "learning_rate": 0.0016715231800272718, + "loss": 0.1465, + "step": 27695 + }, + { + "epoch": 0.2404145797345509, + "grad_norm": 0.1591796875, + "learning_rate": 0.0016715001884858847, + "loss": 0.1201, + "step": 27696 + }, + { + "epoch": 0.24042326021475507, + "grad_norm": 0.328125, + "learning_rate": 0.0016714771963195134, + "loss": 0.1084, + "step": 27697 + }, + { + "epoch": 0.24043194069495924, + "grad_norm": 0.08251953125, + "learning_rate": 0.0016714542035281834, + "loss": 0.166, + "step": 27698 + }, + { + "epoch": 0.2404406211751634, + "grad_norm": 0.119140625, + "learning_rate": 0.00167143121011192, + "loss": 0.1084, + "step": 27699 + }, + { + "epoch": 0.24044930165536757, + "grad_norm": 0.30078125, + "learning_rate": 0.0016714082160707478, + "loss": 0.1113, + "step": 27700 + }, + { + "epoch": 0.24045798213557174, + "grad_norm": 0.58203125, + "learning_rate": 0.001671385221404692, + "loss": 0.1167, + "step": 27701 + }, + { + "epoch": 0.2404666626157759, + "grad_norm": 0.58984375, + "learning_rate": 0.0016713622261137777, + "loss": 0.0923, + "step": 27702 + }, + { + "epoch": 0.24047534309598007, + "grad_norm": 0.5078125, + "learning_rate": 0.0016713392301980305, + "loss": 0.1309, + "step": 27703 + }, + { + "epoch": 0.24048402357618423, + "grad_norm": 0.326171875, + "learning_rate": 0.0016713162336574757, + "loss": 0.0723, + "step": 27704 + }, + { + "epoch": 0.2404927040563884, + "grad_norm": 0.306640625, + "learning_rate": 0.0016712932364921376, + "loss": 0.0752, + "step": 27705 + }, + { + "epoch": 0.24050138453659256, + "grad_norm": 0.1435546875, + "learning_rate": 0.0016712702387020417, + "loss": 0.1064, + "step": 27706 + }, + { + "epoch": 0.24051006501679673, + "grad_norm": 0.57421875, + "learning_rate": 0.0016712472402872134, + "loss": 0.1504, + "step": 27707 + }, + { + "epoch": 0.2405187454970009, + "grad_norm": 0.271484375, + "learning_rate": 0.0016712242412476779, + "loss": 0.1172, + "step": 27708 + }, + { + "epoch": 0.24052742597720506, + "grad_norm": 1.0625, + "learning_rate": 0.0016712012415834598, + "loss": 0.0859, + "step": 27709 + }, + { + "epoch": 0.24053610645740922, + "grad_norm": 0.171875, + "learning_rate": 0.0016711782412945852, + "loss": 0.0923, + "step": 27710 + }, + { + "epoch": 0.2405447869376134, + "grad_norm": 1.0390625, + "learning_rate": 0.001671155240381078, + "loss": 0.0713, + "step": 27711 + }, + { + "epoch": 0.24055346741781755, + "grad_norm": 0.1474609375, + "learning_rate": 0.0016711322388429644, + "loss": 0.1406, + "step": 27712 + }, + { + "epoch": 0.24056214789802172, + "grad_norm": 0.451171875, + "learning_rate": 0.0016711092366802692, + "loss": 0.1211, + "step": 27713 + }, + { + "epoch": 0.24057082837822588, + "grad_norm": 0.30078125, + "learning_rate": 0.0016710862338930172, + "loss": 0.0952, + "step": 27714 + }, + { + "epoch": 
0.24057950885843005, + "grad_norm": 0.65234375, + "learning_rate": 0.001671063230481234, + "loss": 0.0811, + "step": 27715 + }, + { + "epoch": 0.2405881893386342, + "grad_norm": 0.15234375, + "learning_rate": 0.0016710402264449448, + "loss": 0.0869, + "step": 27716 + }, + { + "epoch": 0.24059686981883838, + "grad_norm": 0.78515625, + "learning_rate": 0.0016710172217841743, + "loss": 0.126, + "step": 27717 + }, + { + "epoch": 0.24060555029904254, + "grad_norm": 0.142578125, + "learning_rate": 0.001670994216498948, + "loss": 0.1094, + "step": 27718 + }, + { + "epoch": 0.2406142307792467, + "grad_norm": 0.6796875, + "learning_rate": 0.001670971210589291, + "loss": 0.1152, + "step": 27719 + }, + { + "epoch": 0.24062291125945087, + "grad_norm": 0.38671875, + "learning_rate": 0.0016709482040552287, + "loss": 0.0732, + "step": 27720 + }, + { + "epoch": 0.24063159173965504, + "grad_norm": 0.49609375, + "learning_rate": 0.001670925196896786, + "loss": 0.1108, + "step": 27721 + }, + { + "epoch": 0.2406402722198592, + "grad_norm": 0.95703125, + "learning_rate": 0.0016709021891139876, + "loss": 0.1162, + "step": 27722 + }, + { + "epoch": 0.24064895270006337, + "grad_norm": 0.248046875, + "learning_rate": 0.0016708791807068597, + "loss": 0.1191, + "step": 27723 + }, + { + "epoch": 0.24065763318026753, + "grad_norm": 0.26953125, + "learning_rate": 0.001670856171675427, + "loss": 0.1123, + "step": 27724 + }, + { + "epoch": 0.2406663136604717, + "grad_norm": 1.0546875, + "learning_rate": 0.001670833162019714, + "loss": 0.1348, + "step": 27725 + }, + { + "epoch": 0.24067499414067586, + "grad_norm": 0.5234375, + "learning_rate": 0.0016708101517397469, + "loss": 0.0972, + "step": 27726 + }, + { + "epoch": 0.24068367462088003, + "grad_norm": 0.376953125, + "learning_rate": 0.00167078714083555, + "loss": 0.0981, + "step": 27727 + }, + { + "epoch": 0.2406923551010842, + "grad_norm": 0.1787109375, + "learning_rate": 0.0016707641293071491, + "loss": 0.1279, + "step": 27728 + }, + { + "epoch": 0.24070103558128836, + "grad_norm": 0.11962890625, + "learning_rate": 0.0016707411171545691, + "loss": 0.0767, + "step": 27729 + }, + { + "epoch": 0.24070971606149252, + "grad_norm": 0.2421875, + "learning_rate": 0.0016707181043778352, + "loss": 0.0718, + "step": 27730 + }, + { + "epoch": 0.2407183965416967, + "grad_norm": 0.19140625, + "learning_rate": 0.0016706950909769725, + "loss": 0.0928, + "step": 27731 + }, + { + "epoch": 0.24072707702190085, + "grad_norm": 0.337890625, + "learning_rate": 0.0016706720769520064, + "loss": 0.1562, + "step": 27732 + }, + { + "epoch": 0.24073575750210502, + "grad_norm": 0.40234375, + "learning_rate": 0.0016706490623029618, + "loss": 0.1104, + "step": 27733 + }, + { + "epoch": 0.24074443798230918, + "grad_norm": 0.2451171875, + "learning_rate": 0.0016706260470298637, + "loss": 0.1201, + "step": 27734 + }, + { + "epoch": 0.24075311846251335, + "grad_norm": 0.49609375, + "learning_rate": 0.001670603031132738, + "loss": 0.1016, + "step": 27735 + }, + { + "epoch": 0.24076179894271751, + "grad_norm": 0.3828125, + "learning_rate": 0.001670580014611609, + "loss": 0.123, + "step": 27736 + }, + { + "epoch": 0.24077047942292168, + "grad_norm": 0.23046875, + "learning_rate": 0.0016705569974665025, + "loss": 0.0684, + "step": 27737 + }, + { + "epoch": 0.24077915990312584, + "grad_norm": 0.6171875, + "learning_rate": 0.0016705339796974435, + "loss": 0.0869, + "step": 27738 + }, + { + "epoch": 0.24078784038333, + "grad_norm": 0.2734375, + "learning_rate": 0.0016705109613044567, + "loss": 0.0991, + 
"step": 27739 + }, + { + "epoch": 0.24079652086353417, + "grad_norm": 0.40234375, + "learning_rate": 0.001670487942287568, + "loss": 0.0889, + "step": 27740 + }, + { + "epoch": 0.24080520134373834, + "grad_norm": 0.486328125, + "learning_rate": 0.0016704649226468023, + "loss": 0.1201, + "step": 27741 + }, + { + "epoch": 0.2408138818239425, + "grad_norm": 0.0908203125, + "learning_rate": 0.0016704419023821845, + "loss": 0.0928, + "step": 27742 + }, + { + "epoch": 0.24082256230414667, + "grad_norm": 0.609375, + "learning_rate": 0.0016704188814937403, + "loss": 0.0771, + "step": 27743 + }, + { + "epoch": 0.24083124278435084, + "grad_norm": 0.439453125, + "learning_rate": 0.001670395859981494, + "loss": 0.1191, + "step": 27744 + }, + { + "epoch": 0.240839923264555, + "grad_norm": 1.21875, + "learning_rate": 0.001670372837845472, + "loss": 0.0952, + "step": 27745 + }, + { + "epoch": 0.24084860374475917, + "grad_norm": 0.099609375, + "learning_rate": 0.0016703498150856986, + "loss": 0.0923, + "step": 27746 + }, + { + "epoch": 0.24085728422496333, + "grad_norm": 0.11865234375, + "learning_rate": 0.0016703267917021994, + "loss": 0.1152, + "step": 27747 + }, + { + "epoch": 0.2408659647051675, + "grad_norm": 1.0, + "learning_rate": 0.0016703037676949993, + "loss": 0.1064, + "step": 27748 + }, + { + "epoch": 0.24087464518537166, + "grad_norm": 0.45703125, + "learning_rate": 0.0016702807430641234, + "loss": 0.0742, + "step": 27749 + }, + { + "epoch": 0.24088332566557583, + "grad_norm": 0.09521484375, + "learning_rate": 0.001670257717809597, + "loss": 0.0942, + "step": 27750 + }, + { + "epoch": 0.24089200614578, + "grad_norm": 0.326171875, + "learning_rate": 0.0016702346919314456, + "loss": 0.1157, + "step": 27751 + }, + { + "epoch": 0.24090068662598416, + "grad_norm": 0.33984375, + "learning_rate": 0.001670211665429694, + "loss": 0.1074, + "step": 27752 + }, + { + "epoch": 0.24090936710618832, + "grad_norm": 0.0908203125, + "learning_rate": 0.0016701886383043674, + "loss": 0.106, + "step": 27753 + }, + { + "epoch": 0.2409180475863925, + "grad_norm": 0.19140625, + "learning_rate": 0.0016701656105554909, + "loss": 0.0928, + "step": 27754 + }, + { + "epoch": 0.24092672806659665, + "grad_norm": 0.265625, + "learning_rate": 0.0016701425821830903, + "loss": 0.1055, + "step": 27755 + }, + { + "epoch": 0.24093540854680082, + "grad_norm": 0.322265625, + "learning_rate": 0.00167011955318719, + "loss": 0.1299, + "step": 27756 + }, + { + "epoch": 0.24094408902700498, + "grad_norm": 0.11474609375, + "learning_rate": 0.0016700965235678159, + "loss": 0.0894, + "step": 27757 + }, + { + "epoch": 0.24095276950720915, + "grad_norm": 0.57421875, + "learning_rate": 0.0016700734933249924, + "loss": 0.1016, + "step": 27758 + }, + { + "epoch": 0.2409614499874133, + "grad_norm": 1.1875, + "learning_rate": 0.001670050462458745, + "loss": 0.1084, + "step": 27759 + }, + { + "epoch": 0.24097013046761748, + "grad_norm": 0.43359375, + "learning_rate": 0.0016700274309690995, + "loss": 0.0903, + "step": 27760 + }, + { + "epoch": 0.24097881094782164, + "grad_norm": 0.09765625, + "learning_rate": 0.0016700043988560802, + "loss": 0.0918, + "step": 27761 + }, + { + "epoch": 0.2409874914280258, + "grad_norm": 0.62109375, + "learning_rate": 0.0016699813661197127, + "loss": 0.0693, + "step": 27762 + }, + { + "epoch": 0.24099617190822997, + "grad_norm": 0.130859375, + "learning_rate": 0.001669958332760022, + "loss": 0.1182, + "step": 27763 + }, + { + "epoch": 0.24100485238843414, + "grad_norm": 0.416015625, + "learning_rate": 
0.001669935298777034, + "loss": 0.1123, + "step": 27764 + }, + { + "epoch": 0.2410135328686383, + "grad_norm": 0.421875, + "learning_rate": 0.0016699122641707729, + "loss": 0.1045, + "step": 27765 + }, + { + "epoch": 0.24102221334884247, + "grad_norm": 0.357421875, + "learning_rate": 0.001669889228941264, + "loss": 0.1523, + "step": 27766 + }, + { + "epoch": 0.24103089382904663, + "grad_norm": 0.08740234375, + "learning_rate": 0.0016698661930885335, + "loss": 0.1016, + "step": 27767 + }, + { + "epoch": 0.2410395743092508, + "grad_norm": 0.2216796875, + "learning_rate": 0.0016698431566126055, + "loss": 0.1177, + "step": 27768 + }, + { + "epoch": 0.24104825478945496, + "grad_norm": 0.26171875, + "learning_rate": 0.0016698201195135055, + "loss": 0.1108, + "step": 27769 + }, + { + "epoch": 0.24105693526965913, + "grad_norm": 0.94140625, + "learning_rate": 0.0016697970817912589, + "loss": 0.127, + "step": 27770 + }, + { + "epoch": 0.2410656157498633, + "grad_norm": 1.078125, + "learning_rate": 0.0016697740434458907, + "loss": 0.1484, + "step": 27771 + }, + { + "epoch": 0.24107429623006746, + "grad_norm": 0.2578125, + "learning_rate": 0.0016697510044774264, + "loss": 0.1172, + "step": 27772 + }, + { + "epoch": 0.24108297671027162, + "grad_norm": 0.33203125, + "learning_rate": 0.0016697279648858908, + "loss": 0.1216, + "step": 27773 + }, + { + "epoch": 0.2410916571904758, + "grad_norm": 0.29296875, + "learning_rate": 0.0016697049246713093, + "loss": 0.0913, + "step": 27774 + }, + { + "epoch": 0.24110033767067995, + "grad_norm": 0.482421875, + "learning_rate": 0.0016696818838337069, + "loss": 0.1328, + "step": 27775 + }, + { + "epoch": 0.24110901815088412, + "grad_norm": 0.380859375, + "learning_rate": 0.001669658842373109, + "loss": 0.1016, + "step": 27776 + }, + { + "epoch": 0.24111769863108828, + "grad_norm": 0.150390625, + "learning_rate": 0.001669635800289541, + "loss": 0.1348, + "step": 27777 + }, + { + "epoch": 0.24112637911129245, + "grad_norm": 0.162109375, + "learning_rate": 0.0016696127575830278, + "loss": 0.103, + "step": 27778 + }, + { + "epoch": 0.24113505959149661, + "grad_norm": 0.1513671875, + "learning_rate": 0.0016695897142535944, + "loss": 0.1211, + "step": 27779 + }, + { + "epoch": 0.24114374007170078, + "grad_norm": 0.78515625, + "learning_rate": 0.0016695666703012665, + "loss": 0.1147, + "step": 27780 + }, + { + "epoch": 0.24115242055190494, + "grad_norm": 0.3515625, + "learning_rate": 0.0016695436257260687, + "loss": 0.0815, + "step": 27781 + }, + { + "epoch": 0.2411611010321091, + "grad_norm": 0.87109375, + "learning_rate": 0.001669520580528027, + "loss": 0.103, + "step": 27782 + }, + { + "epoch": 0.24116978151231327, + "grad_norm": 0.6484375, + "learning_rate": 0.0016694975347071657, + "loss": 0.1221, + "step": 27783 + }, + { + "epoch": 0.2411784619925174, + "grad_norm": 0.10791015625, + "learning_rate": 0.0016694744882635106, + "loss": 0.0986, + "step": 27784 + }, + { + "epoch": 0.24118714247272158, + "grad_norm": 0.255859375, + "learning_rate": 0.0016694514411970869, + "loss": 0.1484, + "step": 27785 + }, + { + "epoch": 0.24119582295292574, + "grad_norm": 0.73046875, + "learning_rate": 0.0016694283935079198, + "loss": 0.0923, + "step": 27786 + }, + { + "epoch": 0.2412045034331299, + "grad_norm": 0.2353515625, + "learning_rate": 0.0016694053451960338, + "loss": 0.0781, + "step": 27787 + }, + { + "epoch": 0.24121318391333407, + "grad_norm": 0.396484375, + "learning_rate": 0.0016693822962614551, + "loss": 0.1177, + "step": 27788 + }, + { + "epoch": 0.24122186439353824, + 
"grad_norm": 0.447265625, + "learning_rate": 0.0016693592467042085, + "loss": 0.1367, + "step": 27789 + }, + { + "epoch": 0.2412305448737424, + "grad_norm": 0.1806640625, + "learning_rate": 0.001669336196524319, + "loss": 0.1006, + "step": 27790 + }, + { + "epoch": 0.24123922535394657, + "grad_norm": 0.50390625, + "learning_rate": 0.001669313145721812, + "loss": 0.0913, + "step": 27791 + }, + { + "epoch": 0.24124790583415073, + "grad_norm": 0.41796875, + "learning_rate": 0.001669290094296713, + "loss": 0.1084, + "step": 27792 + }, + { + "epoch": 0.2412565863143549, + "grad_norm": 1.953125, + "learning_rate": 0.0016692670422490464, + "loss": 0.0986, + "step": 27793 + }, + { + "epoch": 0.24126526679455906, + "grad_norm": 2.125, + "learning_rate": 0.0016692439895788383, + "loss": 0.1709, + "step": 27794 + }, + { + "epoch": 0.24127394727476323, + "grad_norm": 0.330078125, + "learning_rate": 0.0016692209362861134, + "loss": 0.1113, + "step": 27795 + }, + { + "epoch": 0.2412826277549674, + "grad_norm": 0.88671875, + "learning_rate": 0.0016691978823708971, + "loss": 0.1138, + "step": 27796 + }, + { + "epoch": 0.24129130823517156, + "grad_norm": 0.16015625, + "learning_rate": 0.0016691748278332144, + "loss": 0.0674, + "step": 27797 + }, + { + "epoch": 0.24129998871537572, + "grad_norm": 0.109375, + "learning_rate": 0.0016691517726730907, + "loss": 0.1211, + "step": 27798 + }, + { + "epoch": 0.2413086691955799, + "grad_norm": 0.50390625, + "learning_rate": 0.001669128716890551, + "loss": 0.1064, + "step": 27799 + }, + { + "epoch": 0.24131734967578405, + "grad_norm": 0.2021484375, + "learning_rate": 0.0016691056604856212, + "loss": 0.1113, + "step": 27800 + }, + { + "epoch": 0.24132603015598822, + "grad_norm": 0.244140625, + "learning_rate": 0.0016690826034583257, + "loss": 0.0771, + "step": 27801 + }, + { + "epoch": 0.24133471063619238, + "grad_norm": 0.10986328125, + "learning_rate": 0.0016690595458086902, + "loss": 0.0732, + "step": 27802 + }, + { + "epoch": 0.24134339111639655, + "grad_norm": 0.390625, + "learning_rate": 0.0016690364875367393, + "loss": 0.0859, + "step": 27803 + }, + { + "epoch": 0.24135207159660071, + "grad_norm": 0.52734375, + "learning_rate": 0.0016690134286424989, + "loss": 0.103, + "step": 27804 + }, + { + "epoch": 0.24136075207680488, + "grad_norm": 0.3125, + "learning_rate": 0.001668990369125994, + "loss": 0.1084, + "step": 27805 + }, + { + "epoch": 0.24136943255700904, + "grad_norm": 0.140625, + "learning_rate": 0.00166896730898725, + "loss": 0.0938, + "step": 27806 + }, + { + "epoch": 0.2413781130372132, + "grad_norm": 0.26953125, + "learning_rate": 0.0016689442482262914, + "loss": 0.1055, + "step": 27807 + }, + { + "epoch": 0.24138679351741738, + "grad_norm": 0.302734375, + "learning_rate": 0.0016689211868431441, + "loss": 0.1289, + "step": 27808 + }, + { + "epoch": 0.24139547399762154, + "grad_norm": 0.09130859375, + "learning_rate": 0.0016688981248378333, + "loss": 0.1025, + "step": 27809 + }, + { + "epoch": 0.2414041544778257, + "grad_norm": 0.447265625, + "learning_rate": 0.001668875062210384, + "loss": 0.1182, + "step": 27810 + }, + { + "epoch": 0.24141283495802987, + "grad_norm": 0.3359375, + "learning_rate": 0.0016688519989608212, + "loss": 0.1104, + "step": 27811 + }, + { + "epoch": 0.24142151543823404, + "grad_norm": 0.380859375, + "learning_rate": 0.0016688289350891708, + "loss": 0.1699, + "step": 27812 + }, + { + "epoch": 0.2414301959184382, + "grad_norm": 0.125, + "learning_rate": 0.0016688058705954575, + "loss": 0.1143, + "step": 27813 + }, + { + "epoch": 
0.24143887639864237, + "grad_norm": 0.1533203125, + "learning_rate": 0.0016687828054797065, + "loss": 0.0845, + "step": 27814 + }, + { + "epoch": 0.24144755687884653, + "grad_norm": 0.478515625, + "learning_rate": 0.0016687597397419436, + "loss": 0.0845, + "step": 27815 + }, + { + "epoch": 0.2414562373590507, + "grad_norm": 0.17578125, + "learning_rate": 0.0016687366733821932, + "loss": 0.0938, + "step": 27816 + }, + { + "epoch": 0.24146491783925486, + "grad_norm": 0.4296875, + "learning_rate": 0.0016687136064004809, + "loss": 0.1191, + "step": 27817 + }, + { + "epoch": 0.24147359831945903, + "grad_norm": 0.427734375, + "learning_rate": 0.0016686905387968318, + "loss": 0.0791, + "step": 27818 + }, + { + "epoch": 0.2414822787996632, + "grad_norm": 0.07177734375, + "learning_rate": 0.0016686674705712716, + "loss": 0.0752, + "step": 27819 + }, + { + "epoch": 0.24149095927986736, + "grad_norm": 0.703125, + "learning_rate": 0.0016686444017238254, + "loss": 0.0996, + "step": 27820 + }, + { + "epoch": 0.24149963976007152, + "grad_norm": 0.326171875, + "learning_rate": 0.0016686213322545179, + "loss": 0.0908, + "step": 27821 + }, + { + "epoch": 0.2415083202402757, + "grad_norm": 0.423828125, + "learning_rate": 0.0016685982621633746, + "loss": 0.0977, + "step": 27822 + }, + { + "epoch": 0.24151700072047985, + "grad_norm": 0.2451171875, + "learning_rate": 0.0016685751914504209, + "loss": 0.0762, + "step": 27823 + }, + { + "epoch": 0.24152568120068402, + "grad_norm": 0.1181640625, + "learning_rate": 0.001668552120115682, + "loss": 0.1235, + "step": 27824 + }, + { + "epoch": 0.24153436168088818, + "grad_norm": 0.34765625, + "learning_rate": 0.0016685290481591828, + "loss": 0.1025, + "step": 27825 + }, + { + "epoch": 0.24154304216109235, + "grad_norm": 0.33984375, + "learning_rate": 0.001668505975580949, + "loss": 0.1191, + "step": 27826 + }, + { + "epoch": 0.2415517226412965, + "grad_norm": 0.1728515625, + "learning_rate": 0.0016684829023810052, + "loss": 0.1162, + "step": 27827 + }, + { + "epoch": 0.24156040312150068, + "grad_norm": 0.275390625, + "learning_rate": 0.0016684598285593775, + "loss": 0.1191, + "step": 27828 + }, + { + "epoch": 0.24156908360170484, + "grad_norm": 0.62890625, + "learning_rate": 0.0016684367541160905, + "loss": 0.1445, + "step": 27829 + }, + { + "epoch": 0.241577764081909, + "grad_norm": 0.828125, + "learning_rate": 0.0016684136790511698, + "loss": 0.1562, + "step": 27830 + }, + { + "epoch": 0.24158644456211317, + "grad_norm": 0.1591796875, + "learning_rate": 0.00166839060336464, + "loss": 0.0903, + "step": 27831 + }, + { + "epoch": 0.24159512504231734, + "grad_norm": 0.7890625, + "learning_rate": 0.0016683675270565274, + "loss": 0.1211, + "step": 27832 + }, + { + "epoch": 0.2416038055225215, + "grad_norm": 0.91796875, + "learning_rate": 0.0016683444501268561, + "loss": 0.0801, + "step": 27833 + }, + { + "epoch": 0.24161248600272567, + "grad_norm": 0.58203125, + "learning_rate": 0.0016683213725756521, + "loss": 0.1113, + "step": 27834 + }, + { + "epoch": 0.24162116648292983, + "grad_norm": 0.1083984375, + "learning_rate": 0.00166829829440294, + "loss": 0.1426, + "step": 27835 + }, + { + "epoch": 0.241629846963134, + "grad_norm": 0.314453125, + "learning_rate": 0.001668275215608746, + "loss": 0.1328, + "step": 27836 + }, + { + "epoch": 0.24163852744333816, + "grad_norm": 0.6015625, + "learning_rate": 0.0016682521361930942, + "loss": 0.1035, + "step": 27837 + }, + { + "epoch": 0.24164720792354233, + "grad_norm": 0.189453125, + "learning_rate": 0.0016682290561560109, + "loss": 
0.1436, + "step": 27838 + }, + { + "epoch": 0.2416558884037465, + "grad_norm": 0.1005859375, + "learning_rate": 0.0016682059754975205, + "loss": 0.0981, + "step": 27839 + }, + { + "epoch": 0.24166456888395066, + "grad_norm": 0.07470703125, + "learning_rate": 0.001668182894217649, + "loss": 0.0806, + "step": 27840 + }, + { + "epoch": 0.24167324936415482, + "grad_norm": 0.162109375, + "learning_rate": 0.0016681598123164205, + "loss": 0.126, + "step": 27841 + }, + { + "epoch": 0.241681929844359, + "grad_norm": 0.37109375, + "learning_rate": 0.0016681367297938614, + "loss": 0.0938, + "step": 27842 + }, + { + "epoch": 0.24169061032456315, + "grad_norm": 0.8125, + "learning_rate": 0.001668113646649997, + "loss": 0.1426, + "step": 27843 + }, + { + "epoch": 0.24169929080476732, + "grad_norm": 0.7734375, + "learning_rate": 0.0016680905628848515, + "loss": 0.1025, + "step": 27844 + }, + { + "epoch": 0.24170797128497148, + "grad_norm": 0.057373046875, + "learning_rate": 0.0016680674784984506, + "loss": 0.0618, + "step": 27845 + }, + { + "epoch": 0.24171665176517565, + "grad_norm": 0.30078125, + "learning_rate": 0.0016680443934908196, + "loss": 0.0747, + "step": 27846 + }, + { + "epoch": 0.24172533224537981, + "grad_norm": 0.65625, + "learning_rate": 0.0016680213078619842, + "loss": 0.1455, + "step": 27847 + }, + { + "epoch": 0.24173401272558398, + "grad_norm": 0.54296875, + "learning_rate": 0.001667998221611969, + "loss": 0.1167, + "step": 27848 + }, + { + "epoch": 0.24174269320578815, + "grad_norm": 0.37109375, + "learning_rate": 0.0016679751347407994, + "loss": 0.1309, + "step": 27849 + }, + { + "epoch": 0.2417513736859923, + "grad_norm": 0.1923828125, + "learning_rate": 0.0016679520472485008, + "loss": 0.124, + "step": 27850 + }, + { + "epoch": 0.24176005416619648, + "grad_norm": 0.328125, + "learning_rate": 0.0016679289591350985, + "loss": 0.126, + "step": 27851 + }, + { + "epoch": 0.24176873464640064, + "grad_norm": 0.34765625, + "learning_rate": 0.0016679058704006174, + "loss": 0.168, + "step": 27852 + }, + { + "epoch": 0.2417774151266048, + "grad_norm": 0.2412109375, + "learning_rate": 0.001667882781045083, + "loss": 0.0845, + "step": 27853 + }, + { + "epoch": 0.24178609560680897, + "grad_norm": 0.09228515625, + "learning_rate": 0.0016678596910685208, + "loss": 0.0786, + "step": 27854 + }, + { + "epoch": 0.24179477608701314, + "grad_norm": 0.1640625, + "learning_rate": 0.0016678366004709555, + "loss": 0.1025, + "step": 27855 + }, + { + "epoch": 0.2418034565672173, + "grad_norm": 0.2275390625, + "learning_rate": 0.0016678135092524128, + "loss": 0.1006, + "step": 27856 + }, + { + "epoch": 0.24181213704742147, + "grad_norm": 0.55078125, + "learning_rate": 0.0016677904174129175, + "loss": 0.1318, + "step": 27857 + }, + { + "epoch": 0.24182081752762563, + "grad_norm": 0.765625, + "learning_rate": 0.0016677673249524954, + "loss": 0.0991, + "step": 27858 + }, + { + "epoch": 0.2418294980078298, + "grad_norm": 0.1240234375, + "learning_rate": 0.0016677442318711714, + "loss": 0.1328, + "step": 27859 + }, + { + "epoch": 0.24183817848803396, + "grad_norm": 0.17578125, + "learning_rate": 0.0016677211381689709, + "loss": 0.1045, + "step": 27860 + }, + { + "epoch": 0.24184685896823813, + "grad_norm": 0.52734375, + "learning_rate": 0.001667698043845919, + "loss": 0.0703, + "step": 27861 + }, + { + "epoch": 0.2418555394484423, + "grad_norm": 0.248046875, + "learning_rate": 0.001667674948902041, + "loss": 0.1079, + "step": 27862 + }, + { + "epoch": 0.24186421992864646, + "grad_norm": 0.345703125, + 
"learning_rate": 0.0016676518533373626, + "loss": 0.1211, + "step": 27863 + }, + { + "epoch": 0.24187290040885062, + "grad_norm": 0.37109375, + "learning_rate": 0.001667628757151908, + "loss": 0.125, + "step": 27864 + }, + { + "epoch": 0.2418815808890548, + "grad_norm": 0.2255859375, + "learning_rate": 0.0016676056603457036, + "loss": 0.1328, + "step": 27865 + }, + { + "epoch": 0.24189026136925895, + "grad_norm": 0.19140625, + "learning_rate": 0.0016675825629187742, + "loss": 0.1211, + "step": 27866 + }, + { + "epoch": 0.24189894184946312, + "grad_norm": 0.203125, + "learning_rate": 0.0016675594648711447, + "loss": 0.1084, + "step": 27867 + }, + { + "epoch": 0.24190762232966728, + "grad_norm": 0.220703125, + "learning_rate": 0.001667536366202841, + "loss": 0.085, + "step": 27868 + }, + { + "epoch": 0.24191630280987145, + "grad_norm": 0.140625, + "learning_rate": 0.001667513266913888, + "loss": 0.0791, + "step": 27869 + }, + { + "epoch": 0.2419249832900756, + "grad_norm": 0.2236328125, + "learning_rate": 0.0016674901670043108, + "loss": 0.1367, + "step": 27870 + }, + { + "epoch": 0.24193366377027978, + "grad_norm": 0.1533203125, + "learning_rate": 0.0016674670664741352, + "loss": 0.1328, + "step": 27871 + }, + { + "epoch": 0.24194234425048394, + "grad_norm": 0.279296875, + "learning_rate": 0.001667443965323386, + "loss": 0.1006, + "step": 27872 + }, + { + "epoch": 0.2419510247306881, + "grad_norm": 0.50390625, + "learning_rate": 0.0016674208635520885, + "loss": 0.0806, + "step": 27873 + }, + { + "epoch": 0.24195970521089227, + "grad_norm": 0.185546875, + "learning_rate": 0.0016673977611602682, + "loss": 0.1016, + "step": 27874 + }, + { + "epoch": 0.24196838569109644, + "grad_norm": 0.2001953125, + "learning_rate": 0.00166737465814795, + "loss": 0.0947, + "step": 27875 + }, + { + "epoch": 0.2419770661713006, + "grad_norm": 0.12109375, + "learning_rate": 0.0016673515545151596, + "loss": 0.1035, + "step": 27876 + }, + { + "epoch": 0.24198574665150477, + "grad_norm": 0.484375, + "learning_rate": 0.0016673284502619222, + "loss": 0.0898, + "step": 27877 + }, + { + "epoch": 0.24199442713170893, + "grad_norm": 0.9765625, + "learning_rate": 0.0016673053453882627, + "loss": 0.1133, + "step": 27878 + }, + { + "epoch": 0.2420031076119131, + "grad_norm": 0.1923828125, + "learning_rate": 0.0016672822398942067, + "loss": 0.1543, + "step": 27879 + }, + { + "epoch": 0.24201178809211726, + "grad_norm": 0.09130859375, + "learning_rate": 0.0016672591337797793, + "loss": 0.085, + "step": 27880 + }, + { + "epoch": 0.24202046857232143, + "grad_norm": 0.34375, + "learning_rate": 0.0016672360270450059, + "loss": 0.1084, + "step": 27881 + }, + { + "epoch": 0.2420291490525256, + "grad_norm": 0.1416015625, + "learning_rate": 0.0016672129196899114, + "loss": 0.1104, + "step": 27882 + }, + { + "epoch": 0.24203782953272976, + "grad_norm": 0.16796875, + "learning_rate": 0.001667189811714522, + "loss": 0.166, + "step": 27883 + }, + { + "epoch": 0.24204651001293392, + "grad_norm": 1.15625, + "learning_rate": 0.001667166703118862, + "loss": 0.1172, + "step": 27884 + }, + { + "epoch": 0.2420551904931381, + "grad_norm": 0.1923828125, + "learning_rate": 0.0016671435939029568, + "loss": 0.0776, + "step": 27885 + }, + { + "epoch": 0.24206387097334225, + "grad_norm": 0.263671875, + "learning_rate": 0.0016671204840668324, + "loss": 0.1299, + "step": 27886 + }, + { + "epoch": 0.24207255145354642, + "grad_norm": 0.33984375, + "learning_rate": 0.001667097373610513, + "loss": 0.1387, + "step": 27887 + }, + { + "epoch": 
0.24208123193375058, + "grad_norm": 0.431640625, + "learning_rate": 0.0016670742625340246, + "loss": 0.0742, + "step": 27888 + }, + { + "epoch": 0.24208991241395475, + "grad_norm": 0.10693359375, + "learning_rate": 0.0016670511508373926, + "loss": 0.0825, + "step": 27889 + }, + { + "epoch": 0.24209859289415891, + "grad_norm": 0.2275390625, + "learning_rate": 0.001667028038520642, + "loss": 0.1465, + "step": 27890 + }, + { + "epoch": 0.24210727337436308, + "grad_norm": 0.169921875, + "learning_rate": 0.0016670049255837975, + "loss": 0.0811, + "step": 27891 + }, + { + "epoch": 0.24211595385456725, + "grad_norm": 0.09765625, + "learning_rate": 0.0016669818120268855, + "loss": 0.1396, + "step": 27892 + }, + { + "epoch": 0.2421246343347714, + "grad_norm": 0.478515625, + "learning_rate": 0.0016669586978499303, + "loss": 0.1143, + "step": 27893 + }, + { + "epoch": 0.24213331481497558, + "grad_norm": 0.10693359375, + "learning_rate": 0.0016669355830529577, + "loss": 0.0801, + "step": 27894 + }, + { + "epoch": 0.24214199529517974, + "grad_norm": 0.51953125, + "learning_rate": 0.0016669124676359929, + "loss": 0.1191, + "step": 27895 + }, + { + "epoch": 0.2421506757753839, + "grad_norm": 0.330078125, + "learning_rate": 0.0016668893515990615, + "loss": 0.1152, + "step": 27896 + }, + { + "epoch": 0.24215935625558807, + "grad_norm": 0.66015625, + "learning_rate": 0.001666866234942188, + "loss": 0.1006, + "step": 27897 + }, + { + "epoch": 0.24216803673579224, + "grad_norm": 0.63671875, + "learning_rate": 0.0016668431176653981, + "loss": 0.1113, + "step": 27898 + }, + { + "epoch": 0.2421767172159964, + "grad_norm": 0.61328125, + "learning_rate": 0.0016668199997687174, + "loss": 0.1045, + "step": 27899 + }, + { + "epoch": 0.24218539769620057, + "grad_norm": 0.08837890625, + "learning_rate": 0.0016667968812521703, + "loss": 0.0991, + "step": 27900 + }, + { + "epoch": 0.24219407817640473, + "grad_norm": 0.5078125, + "learning_rate": 0.0016667737621157833, + "loss": 0.1123, + "step": 27901 + }, + { + "epoch": 0.2422027586566089, + "grad_norm": 0.28125, + "learning_rate": 0.0016667506423595807, + "loss": 0.1177, + "step": 27902 + }, + { + "epoch": 0.24221143913681306, + "grad_norm": 0.75390625, + "learning_rate": 0.001666727521983588, + "loss": 0.0903, + "step": 27903 + }, + { + "epoch": 0.24222011961701723, + "grad_norm": 0.431640625, + "learning_rate": 0.0016667044009878308, + "loss": 0.1016, + "step": 27904 + }, + { + "epoch": 0.2422288000972214, + "grad_norm": 0.8828125, + "learning_rate": 0.001666681279372334, + "loss": 0.0825, + "step": 27905 + }, + { + "epoch": 0.24223748057742556, + "grad_norm": 0.5390625, + "learning_rate": 0.0016666581571371233, + "loss": 0.1162, + "step": 27906 + }, + { + "epoch": 0.2422461610576297, + "grad_norm": 0.33984375, + "learning_rate": 0.0016666350342822236, + "loss": 0.0869, + "step": 27907 + }, + { + "epoch": 0.24225484153783386, + "grad_norm": 0.21484375, + "learning_rate": 0.0016666119108076603, + "loss": 0.1191, + "step": 27908 + }, + { + "epoch": 0.24226352201803802, + "grad_norm": 0.50390625, + "learning_rate": 0.0016665887867134588, + "loss": 0.1035, + "step": 27909 + }, + { + "epoch": 0.2422722024982422, + "grad_norm": 0.63671875, + "learning_rate": 0.0016665656619996444, + "loss": 0.0942, + "step": 27910 + }, + { + "epoch": 0.24228088297844635, + "grad_norm": 0.275390625, + "learning_rate": 0.0016665425366662423, + "loss": 0.1426, + "step": 27911 + }, + { + "epoch": 0.24228956345865052, + "grad_norm": 0.388671875, + "learning_rate": 0.0016665194107132778, + 
"loss": 0.1689, + "step": 27912 + }, + { + "epoch": 0.24229824393885468, + "grad_norm": 0.88671875, + "learning_rate": 0.001666496284140776, + "loss": 0.1113, + "step": 27913 + }, + { + "epoch": 0.24230692441905885, + "grad_norm": 0.318359375, + "learning_rate": 0.0016664731569487625, + "loss": 0.0952, + "step": 27914 + }, + { + "epoch": 0.24231560489926302, + "grad_norm": 0.359375, + "learning_rate": 0.0016664500291372626, + "loss": 0.0845, + "step": 27915 + }, + { + "epoch": 0.24232428537946718, + "grad_norm": 1.0546875, + "learning_rate": 0.0016664269007063011, + "loss": 0.1816, + "step": 27916 + }, + { + "epoch": 0.24233296585967135, + "grad_norm": 0.45703125, + "learning_rate": 0.0016664037716559038, + "loss": 0.0723, + "step": 27917 + }, + { + "epoch": 0.2423416463398755, + "grad_norm": 0.13671875, + "learning_rate": 0.0016663806419860959, + "loss": 0.0962, + "step": 27918 + }, + { + "epoch": 0.24235032682007968, + "grad_norm": 0.1533203125, + "learning_rate": 0.0016663575116969026, + "loss": 0.1035, + "step": 27919 + }, + { + "epoch": 0.24235900730028384, + "grad_norm": 0.134765625, + "learning_rate": 0.0016663343807883494, + "loss": 0.1357, + "step": 27920 + }, + { + "epoch": 0.242367687780488, + "grad_norm": 0.2373046875, + "learning_rate": 0.0016663112492604612, + "loss": 0.1582, + "step": 27921 + }, + { + "epoch": 0.24237636826069217, + "grad_norm": 0.453125, + "learning_rate": 0.0016662881171132635, + "loss": 0.1328, + "step": 27922 + }, + { + "epoch": 0.24238504874089634, + "grad_norm": 0.240234375, + "learning_rate": 0.001666264984346782, + "loss": 0.1021, + "step": 27923 + }, + { + "epoch": 0.2423937292211005, + "grad_norm": 0.19140625, + "learning_rate": 0.0016662418509610414, + "loss": 0.105, + "step": 27924 + }, + { + "epoch": 0.24240240970130467, + "grad_norm": 0.447265625, + "learning_rate": 0.0016662187169560673, + "loss": 0.1367, + "step": 27925 + }, + { + "epoch": 0.24241109018150883, + "grad_norm": 0.470703125, + "learning_rate": 0.0016661955823318846, + "loss": 0.0991, + "step": 27926 + }, + { + "epoch": 0.242419770661713, + "grad_norm": 0.11669921875, + "learning_rate": 0.001666172447088519, + "loss": 0.127, + "step": 27927 + }, + { + "epoch": 0.24242845114191716, + "grad_norm": 0.6953125, + "learning_rate": 0.0016661493112259962, + "loss": 0.1084, + "step": 27928 + }, + { + "epoch": 0.24243713162212133, + "grad_norm": 0.26171875, + "learning_rate": 0.0016661261747443405, + "loss": 0.085, + "step": 27929 + }, + { + "epoch": 0.2424458121023255, + "grad_norm": 0.326171875, + "learning_rate": 0.0016661030376435779, + "loss": 0.0815, + "step": 27930 + }, + { + "epoch": 0.24245449258252966, + "grad_norm": 0.259765625, + "learning_rate": 0.0016660798999237334, + "loss": 0.1211, + "step": 27931 + }, + { + "epoch": 0.24246317306273382, + "grad_norm": 0.26953125, + "learning_rate": 0.0016660567615848325, + "loss": 0.1328, + "step": 27932 + }, + { + "epoch": 0.242471853542938, + "grad_norm": 0.5078125, + "learning_rate": 0.001666033622626901, + "loss": 0.1602, + "step": 27933 + }, + { + "epoch": 0.24248053402314215, + "grad_norm": 0.19140625, + "learning_rate": 0.0016660104830499628, + "loss": 0.1328, + "step": 27934 + }, + { + "epoch": 0.24248921450334632, + "grad_norm": 0.373046875, + "learning_rate": 0.0016659873428540445, + "loss": 0.1338, + "step": 27935 + }, + { + "epoch": 0.24249789498355048, + "grad_norm": 0.12109375, + "learning_rate": 0.0016659642020391706, + "loss": 0.1143, + "step": 27936 + }, + { + "epoch": 0.24250657546375465, + "grad_norm": 0.203125, + 
"learning_rate": 0.0016659410606053669, + "loss": 0.0742, + "step": 27937 + }, + { + "epoch": 0.2425152559439588, + "grad_norm": 0.12158203125, + "learning_rate": 0.0016659179185526585, + "loss": 0.1016, + "step": 27938 + }, + { + "epoch": 0.24252393642416298, + "grad_norm": 0.1064453125, + "learning_rate": 0.001665894775881071, + "loss": 0.123, + "step": 27939 + }, + { + "epoch": 0.24253261690436714, + "grad_norm": 0.7734375, + "learning_rate": 0.0016658716325906294, + "loss": 0.0703, + "step": 27940 + }, + { + "epoch": 0.2425412973845713, + "grad_norm": 0.30859375, + "learning_rate": 0.001665848488681359, + "loss": 0.0864, + "step": 27941 + }, + { + "epoch": 0.24254997786477547, + "grad_norm": 0.7265625, + "learning_rate": 0.0016658253441532854, + "loss": 0.1133, + "step": 27942 + }, + { + "epoch": 0.24255865834497964, + "grad_norm": 0.63671875, + "learning_rate": 0.0016658021990064334, + "loss": 0.0933, + "step": 27943 + }, + { + "epoch": 0.2425673388251838, + "grad_norm": 0.10693359375, + "learning_rate": 0.0016657790532408289, + "loss": 0.0771, + "step": 27944 + }, + { + "epoch": 0.24257601930538797, + "grad_norm": 0.10546875, + "learning_rate": 0.0016657559068564967, + "loss": 0.1016, + "step": 27945 + }, + { + "epoch": 0.24258469978559213, + "grad_norm": 0.193359375, + "learning_rate": 0.0016657327598534623, + "loss": 0.0962, + "step": 27946 + }, + { + "epoch": 0.2425933802657963, + "grad_norm": 0.310546875, + "learning_rate": 0.0016657096122317512, + "loss": 0.0825, + "step": 27947 + }, + { + "epoch": 0.24260206074600046, + "grad_norm": 0.107421875, + "learning_rate": 0.0016656864639913887, + "loss": 0.1309, + "step": 27948 + }, + { + "epoch": 0.24261074122620463, + "grad_norm": 0.14453125, + "learning_rate": 0.0016656633151323997, + "loss": 0.0806, + "step": 27949 + }, + { + "epoch": 0.2426194217064088, + "grad_norm": 0.59375, + "learning_rate": 0.0016656401656548099, + "loss": 0.1338, + "step": 27950 + }, + { + "epoch": 0.24262810218661296, + "grad_norm": 0.251953125, + "learning_rate": 0.0016656170155586445, + "loss": 0.0967, + "step": 27951 + }, + { + "epoch": 0.24263678266681712, + "grad_norm": 0.703125, + "learning_rate": 0.0016655938648439288, + "loss": 0.0991, + "step": 27952 + }, + { + "epoch": 0.2426454631470213, + "grad_norm": 0.93359375, + "learning_rate": 0.0016655707135106884, + "loss": 0.0791, + "step": 27953 + }, + { + "epoch": 0.24265414362722545, + "grad_norm": 0.2353515625, + "learning_rate": 0.001665547561558948, + "loss": 0.125, + "step": 27954 + }, + { + "epoch": 0.24266282410742962, + "grad_norm": 0.16015625, + "learning_rate": 0.0016655244089887335, + "loss": 0.0781, + "step": 27955 + }, + { + "epoch": 0.24267150458763379, + "grad_norm": 0.609375, + "learning_rate": 0.00166550125580007, + "loss": 0.084, + "step": 27956 + }, + { + "epoch": 0.24268018506783795, + "grad_norm": 0.28125, + "learning_rate": 0.0016654781019929828, + "loss": 0.1289, + "step": 27957 + }, + { + "epoch": 0.24268886554804212, + "grad_norm": 0.322265625, + "learning_rate": 0.0016654549475674971, + "loss": 0.0923, + "step": 27958 + }, + { + "epoch": 0.24269754602824628, + "grad_norm": 0.107421875, + "learning_rate": 0.001665431792523639, + "loss": 0.0806, + "step": 27959 + }, + { + "epoch": 0.24270622650845045, + "grad_norm": 0.3828125, + "learning_rate": 0.0016654086368614322, + "loss": 0.105, + "step": 27960 + }, + { + "epoch": 0.2427149069886546, + "grad_norm": 0.67578125, + "learning_rate": 0.0016653854805809038, + "loss": 0.1045, + "step": 27961 + }, + { + "epoch": 
0.24272358746885878, + "grad_norm": 0.4921875, + "learning_rate": 0.0016653623236820778, + "loss": 0.1416, + "step": 27962 + }, + { + "epoch": 0.24273226794906294, + "grad_norm": 0.16796875, + "learning_rate": 0.0016653391661649805, + "loss": 0.1133, + "step": 27963 + }, + { + "epoch": 0.2427409484292671, + "grad_norm": 0.2734375, + "learning_rate": 0.0016653160080296364, + "loss": 0.1484, + "step": 27964 + }, + { + "epoch": 0.24274962890947127, + "grad_norm": 0.3046875, + "learning_rate": 0.0016652928492760714, + "loss": 0.1025, + "step": 27965 + }, + { + "epoch": 0.24275830938967544, + "grad_norm": 0.62890625, + "learning_rate": 0.0016652696899043108, + "loss": 0.0991, + "step": 27966 + }, + { + "epoch": 0.2427669898698796, + "grad_norm": 0.1279296875, + "learning_rate": 0.0016652465299143793, + "loss": 0.123, + "step": 27967 + }, + { + "epoch": 0.24277567035008377, + "grad_norm": 0.44921875, + "learning_rate": 0.001665223369306303, + "loss": 0.0889, + "step": 27968 + }, + { + "epoch": 0.24278435083028793, + "grad_norm": 0.337890625, + "learning_rate": 0.0016652002080801069, + "loss": 0.123, + "step": 27969 + }, + { + "epoch": 0.2427930313104921, + "grad_norm": 0.173828125, + "learning_rate": 0.0016651770462358165, + "loss": 0.1211, + "step": 27970 + }, + { + "epoch": 0.24280171179069626, + "grad_norm": 0.09912109375, + "learning_rate": 0.0016651538837734565, + "loss": 0.1123, + "step": 27971 + }, + { + "epoch": 0.24281039227090043, + "grad_norm": 0.296875, + "learning_rate": 0.001665130720693053, + "loss": 0.083, + "step": 27972 + }, + { + "epoch": 0.2428190727511046, + "grad_norm": 0.427734375, + "learning_rate": 0.0016651075569946313, + "loss": 0.0645, + "step": 27973 + }, + { + "epoch": 0.24282775323130876, + "grad_norm": 0.75, + "learning_rate": 0.0016650843926782162, + "loss": 0.1299, + "step": 27974 + }, + { + "epoch": 0.24283643371151292, + "grad_norm": 0.251953125, + "learning_rate": 0.0016650612277438331, + "loss": 0.1133, + "step": 27975 + }, + { + "epoch": 0.2428451141917171, + "grad_norm": 0.96484375, + "learning_rate": 0.0016650380621915078, + "loss": 0.1211, + "step": 27976 + }, + { + "epoch": 0.24285379467192125, + "grad_norm": 0.19921875, + "learning_rate": 0.001665014896021265, + "loss": 0.0986, + "step": 27977 + }, + { + "epoch": 0.24286247515212542, + "grad_norm": 0.42578125, + "learning_rate": 0.001664991729233131, + "loss": 0.1211, + "step": 27978 + }, + { + "epoch": 0.24287115563232958, + "grad_norm": 0.287109375, + "learning_rate": 0.00166496856182713, + "loss": 0.1035, + "step": 27979 + }, + { + "epoch": 0.24287983611253375, + "grad_norm": 0.14453125, + "learning_rate": 0.001664945393803288, + "loss": 0.0923, + "step": 27980 + }, + { + "epoch": 0.2428885165927379, + "grad_norm": 0.1455078125, + "learning_rate": 0.0016649222251616305, + "loss": 0.0874, + "step": 27981 + }, + { + "epoch": 0.24289719707294208, + "grad_norm": 0.0849609375, + "learning_rate": 0.0016648990559021823, + "loss": 0.1074, + "step": 27982 + }, + { + "epoch": 0.24290587755314624, + "grad_norm": 0.57421875, + "learning_rate": 0.001664875886024969, + "loss": 0.1221, + "step": 27983 + }, + { + "epoch": 0.2429145580333504, + "grad_norm": 0.078125, + "learning_rate": 0.001664852715530016, + "loss": 0.0806, + "step": 27984 + }, + { + "epoch": 0.24292323851355457, + "grad_norm": 0.4453125, + "learning_rate": 0.0016648295444173485, + "loss": 0.0972, + "step": 27985 + }, + { + "epoch": 0.24293191899375874, + "grad_norm": 0.150390625, + "learning_rate": 0.0016648063726869917, + "loss": 0.0938, + 
"step": 27986 + }, + { + "epoch": 0.2429405994739629, + "grad_norm": 0.349609375, + "learning_rate": 0.0016647832003389715, + "loss": 0.1118, + "step": 27987 + }, + { + "epoch": 0.24294927995416707, + "grad_norm": 0.345703125, + "learning_rate": 0.0016647600273733128, + "loss": 0.0952, + "step": 27988 + }, + { + "epoch": 0.24295796043437123, + "grad_norm": 0.162109375, + "learning_rate": 0.0016647368537900408, + "loss": 0.0942, + "step": 27989 + }, + { + "epoch": 0.2429666409145754, + "grad_norm": 0.7890625, + "learning_rate": 0.0016647136795891811, + "loss": 0.1172, + "step": 27990 + }, + { + "epoch": 0.24297532139477956, + "grad_norm": 0.158203125, + "learning_rate": 0.0016646905047707592, + "loss": 0.0928, + "step": 27991 + }, + { + "epoch": 0.24298400187498373, + "grad_norm": 0.169921875, + "learning_rate": 0.0016646673293348, + "loss": 0.1084, + "step": 27992 + }, + { + "epoch": 0.2429926823551879, + "grad_norm": 0.6328125, + "learning_rate": 0.0016646441532813293, + "loss": 0.1436, + "step": 27993 + }, + { + "epoch": 0.24300136283539206, + "grad_norm": 0.201171875, + "learning_rate": 0.0016646209766103723, + "loss": 0.0918, + "step": 27994 + }, + { + "epoch": 0.24301004331559622, + "grad_norm": 0.408203125, + "learning_rate": 0.0016645977993219542, + "loss": 0.106, + "step": 27995 + }, + { + "epoch": 0.2430187237958004, + "grad_norm": 0.1982421875, + "learning_rate": 0.0016645746214161003, + "loss": 0.1094, + "step": 27996 + }, + { + "epoch": 0.24302740427600455, + "grad_norm": 0.10498046875, + "learning_rate": 0.0016645514428928363, + "loss": 0.123, + "step": 27997 + }, + { + "epoch": 0.24303608475620872, + "grad_norm": 0.10009765625, + "learning_rate": 0.001664528263752187, + "loss": 0.1045, + "step": 27998 + }, + { + "epoch": 0.24304476523641289, + "grad_norm": 0.185546875, + "learning_rate": 0.0016645050839941783, + "loss": 0.1191, + "step": 27999 + }, + { + "epoch": 0.24305344571661705, + "grad_norm": 0.248046875, + "learning_rate": 0.0016644819036188351, + "loss": 0.0986, + "step": 28000 + }, + { + "epoch": 0.24306212619682122, + "grad_norm": 0.31640625, + "learning_rate": 0.0016644587226261835, + "loss": 0.1182, + "step": 28001 + }, + { + "epoch": 0.24307080667702538, + "grad_norm": 0.0888671875, + "learning_rate": 0.001664435541016248, + "loss": 0.127, + "step": 28002 + }, + { + "epoch": 0.24307948715722955, + "grad_norm": 0.50390625, + "learning_rate": 0.0016644123587890541, + "loss": 0.1016, + "step": 28003 + }, + { + "epoch": 0.2430881676374337, + "grad_norm": 0.130859375, + "learning_rate": 0.0016643891759446272, + "loss": 0.1348, + "step": 28004 + }, + { + "epoch": 0.24309684811763788, + "grad_norm": 0.166015625, + "learning_rate": 0.0016643659924829933, + "loss": 0.1367, + "step": 28005 + }, + { + "epoch": 0.24310552859784204, + "grad_norm": 0.07958984375, + "learning_rate": 0.001664342808404177, + "loss": 0.0962, + "step": 28006 + }, + { + "epoch": 0.2431142090780462, + "grad_norm": 0.9765625, + "learning_rate": 0.0016643196237082038, + "loss": 0.1143, + "step": 28007 + }, + { + "epoch": 0.24312288955825037, + "grad_norm": 0.48828125, + "learning_rate": 0.001664296438395099, + "loss": 0.0762, + "step": 28008 + }, + { + "epoch": 0.24313157003845454, + "grad_norm": 0.470703125, + "learning_rate": 0.0016642732524648888, + "loss": 0.0898, + "step": 28009 + }, + { + "epoch": 0.2431402505186587, + "grad_norm": 0.1484375, + "learning_rate": 0.001664250065917597, + "loss": 0.0791, + "step": 28010 + }, + { + "epoch": 0.24314893099886287, + "grad_norm": 0.1884765625, + 
"learning_rate": 0.0016642268787532505, + "loss": 0.0898, + "step": 28011 + }, + { + "epoch": 0.24315761147906703, + "grad_norm": 0.90625, + "learning_rate": 0.0016642036909718734, + "loss": 0.1514, + "step": 28012 + }, + { + "epoch": 0.2431662919592712, + "grad_norm": 0.71875, + "learning_rate": 0.001664180502573492, + "loss": 0.1279, + "step": 28013 + }, + { + "epoch": 0.24317497243947536, + "grad_norm": 2.421875, + "learning_rate": 0.0016641573135581307, + "loss": 0.209, + "step": 28014 + }, + { + "epoch": 0.24318365291967953, + "grad_norm": 0.33203125, + "learning_rate": 0.0016641341239258158, + "loss": 0.1064, + "step": 28015 + }, + { + "epoch": 0.2431923333998837, + "grad_norm": 0.9140625, + "learning_rate": 0.0016641109336765728, + "loss": 0.0986, + "step": 28016 + }, + { + "epoch": 0.24320101388008786, + "grad_norm": 0.2353515625, + "learning_rate": 0.0016640877428104259, + "loss": 0.1299, + "step": 28017 + }, + { + "epoch": 0.24320969436029202, + "grad_norm": 0.125, + "learning_rate": 0.0016640645513274013, + "loss": 0.0952, + "step": 28018 + }, + { + "epoch": 0.2432183748404962, + "grad_norm": 0.890625, + "learning_rate": 0.0016640413592275242, + "loss": 0.1055, + "step": 28019 + }, + { + "epoch": 0.24322705532070035, + "grad_norm": 0.2333984375, + "learning_rate": 0.0016640181665108199, + "loss": 0.0972, + "step": 28020 + }, + { + "epoch": 0.24323573580090452, + "grad_norm": 0.8046875, + "learning_rate": 0.001663994973177314, + "loss": 0.2461, + "step": 28021 + }, + { + "epoch": 0.24324441628110868, + "grad_norm": 0.11962890625, + "learning_rate": 0.0016639717792270314, + "loss": 0.084, + "step": 28022 + }, + { + "epoch": 0.24325309676131285, + "grad_norm": 0.2041015625, + "learning_rate": 0.001663948584659998, + "loss": 0.123, + "step": 28023 + }, + { + "epoch": 0.243261777241517, + "grad_norm": 0.330078125, + "learning_rate": 0.0016639253894762387, + "loss": 0.0928, + "step": 28024 + }, + { + "epoch": 0.24327045772172118, + "grad_norm": 0.3046875, + "learning_rate": 0.001663902193675779, + "loss": 0.1094, + "step": 28025 + }, + { + "epoch": 0.24327913820192534, + "grad_norm": 0.462890625, + "learning_rate": 0.0016638789972586447, + "loss": 0.1021, + "step": 28026 + }, + { + "epoch": 0.2432878186821295, + "grad_norm": 0.146484375, + "learning_rate": 0.0016638558002248604, + "loss": 0.1084, + "step": 28027 + }, + { + "epoch": 0.24329649916233367, + "grad_norm": 0.1962890625, + "learning_rate": 0.0016638326025744524, + "loss": 0.1094, + "step": 28028 + }, + { + "epoch": 0.24330517964253784, + "grad_norm": 0.6875, + "learning_rate": 0.001663809404307445, + "loss": 0.1309, + "step": 28029 + }, + { + "epoch": 0.243313860122742, + "grad_norm": 0.115234375, + "learning_rate": 0.0016637862054238641, + "loss": 0.1074, + "step": 28030 + }, + { + "epoch": 0.24332254060294614, + "grad_norm": 0.46484375, + "learning_rate": 0.0016637630059237354, + "loss": 0.0928, + "step": 28031 + }, + { + "epoch": 0.2433312210831503, + "grad_norm": 0.053466796875, + "learning_rate": 0.0016637398058070838, + "loss": 0.0752, + "step": 28032 + }, + { + "epoch": 0.24333990156335447, + "grad_norm": 0.1513671875, + "learning_rate": 0.001663716605073935, + "loss": 0.1133, + "step": 28033 + }, + { + "epoch": 0.24334858204355864, + "grad_norm": 0.1767578125, + "learning_rate": 0.001663693403724314, + "loss": 0.1172, + "step": 28034 + }, + { + "epoch": 0.2433572625237628, + "grad_norm": 0.3203125, + "learning_rate": 0.001663670201758246, + "loss": 0.1162, + "step": 28035 + }, + { + "epoch": 0.24336594300396697, + 
"grad_norm": 0.69921875, + "learning_rate": 0.0016636469991757573, + "loss": 0.1201, + "step": 28036 + }, + { + "epoch": 0.24337462348417113, + "grad_norm": 0.10009765625, + "learning_rate": 0.0016636237959768724, + "loss": 0.1016, + "step": 28037 + }, + { + "epoch": 0.2433833039643753, + "grad_norm": 0.412109375, + "learning_rate": 0.0016636005921616172, + "loss": 0.106, + "step": 28038 + }, + { + "epoch": 0.24339198444457946, + "grad_norm": 0.294921875, + "learning_rate": 0.0016635773877300168, + "loss": 0.0938, + "step": 28039 + }, + { + "epoch": 0.24340066492478363, + "grad_norm": 0.259765625, + "learning_rate": 0.0016635541826820967, + "loss": 0.106, + "step": 28040 + }, + { + "epoch": 0.2434093454049878, + "grad_norm": 0.390625, + "learning_rate": 0.001663530977017882, + "loss": 0.124, + "step": 28041 + }, + { + "epoch": 0.24341802588519196, + "grad_norm": 0.345703125, + "learning_rate": 0.0016635077707373984, + "loss": 0.1196, + "step": 28042 + }, + { + "epoch": 0.24342670636539612, + "grad_norm": 0.244140625, + "learning_rate": 0.0016634845638406712, + "loss": 0.1064, + "step": 28043 + }, + { + "epoch": 0.2434353868456003, + "grad_norm": 0.416015625, + "learning_rate": 0.0016634613563277258, + "loss": 0.1084, + "step": 28044 + }, + { + "epoch": 0.24344406732580445, + "grad_norm": 0.181640625, + "learning_rate": 0.0016634381481985877, + "loss": 0.1079, + "step": 28045 + }, + { + "epoch": 0.24345274780600862, + "grad_norm": 0.19140625, + "learning_rate": 0.0016634149394532817, + "loss": 0.0854, + "step": 28046 + }, + { + "epoch": 0.24346142828621278, + "grad_norm": 0.2236328125, + "learning_rate": 0.0016633917300918337, + "loss": 0.0981, + "step": 28047 + }, + { + "epoch": 0.24347010876641695, + "grad_norm": 0.306640625, + "learning_rate": 0.001663368520114269, + "loss": 0.0884, + "step": 28048 + }, + { + "epoch": 0.2434787892466211, + "grad_norm": 0.42578125, + "learning_rate": 0.0016633453095206128, + "loss": 0.1045, + "step": 28049 + }, + { + "epoch": 0.24348746972682528, + "grad_norm": 0.25, + "learning_rate": 0.0016633220983108907, + "loss": 0.1035, + "step": 28050 + }, + { + "epoch": 0.24349615020702944, + "grad_norm": 0.2080078125, + "learning_rate": 0.0016632988864851283, + "loss": 0.1133, + "step": 28051 + }, + { + "epoch": 0.2435048306872336, + "grad_norm": 0.107421875, + "learning_rate": 0.0016632756740433505, + "loss": 0.1084, + "step": 28052 + }, + { + "epoch": 0.24351351116743777, + "grad_norm": 0.087890625, + "learning_rate": 0.001663252460985583, + "loss": 0.1128, + "step": 28053 + }, + { + "epoch": 0.24352219164764194, + "grad_norm": 0.5703125, + "learning_rate": 0.0016632292473118511, + "loss": 0.0854, + "step": 28054 + }, + { + "epoch": 0.2435308721278461, + "grad_norm": 0.10595703125, + "learning_rate": 0.0016632060330221798, + "loss": 0.1582, + "step": 28055 + }, + { + "epoch": 0.24353955260805027, + "grad_norm": 0.1357421875, + "learning_rate": 0.0016631828181165951, + "loss": 0.0776, + "step": 28056 + }, + { + "epoch": 0.24354823308825443, + "grad_norm": 0.375, + "learning_rate": 0.001663159602595122, + "loss": 0.1289, + "step": 28057 + }, + { + "epoch": 0.2435569135684586, + "grad_norm": 0.431640625, + "learning_rate": 0.0016631363864577865, + "loss": 0.1035, + "step": 28058 + }, + { + "epoch": 0.24356559404866276, + "grad_norm": 0.14453125, + "learning_rate": 0.001663113169704613, + "loss": 0.1211, + "step": 28059 + }, + { + "epoch": 0.24357427452886693, + "grad_norm": 0.0791015625, + "learning_rate": 0.0016630899523356276, + "loss": 0.0903, + "step": 28060 + 
}, + { + "epoch": 0.2435829550090711, + "grad_norm": 0.435546875, + "learning_rate": 0.0016630667343508552, + "loss": 0.1064, + "step": 28061 + }, + { + "epoch": 0.24359163548927526, + "grad_norm": 0.09814453125, + "learning_rate": 0.001663043515750322, + "loss": 0.0996, + "step": 28062 + }, + { + "epoch": 0.24360031596947943, + "grad_norm": 0.60546875, + "learning_rate": 0.0016630202965340524, + "loss": 0.1138, + "step": 28063 + }, + { + "epoch": 0.2436089964496836, + "grad_norm": 0.115234375, + "learning_rate": 0.0016629970767020723, + "loss": 0.0796, + "step": 28064 + }, + { + "epoch": 0.24361767692988776, + "grad_norm": 0.1865234375, + "learning_rate": 0.0016629738562544073, + "loss": 0.1162, + "step": 28065 + }, + { + "epoch": 0.24362635741009192, + "grad_norm": 0.259765625, + "learning_rate": 0.0016629506351910823, + "loss": 0.0776, + "step": 28066 + }, + { + "epoch": 0.24363503789029609, + "grad_norm": 0.1552734375, + "learning_rate": 0.0016629274135121232, + "loss": 0.1025, + "step": 28067 + }, + { + "epoch": 0.24364371837050025, + "grad_norm": 0.365234375, + "learning_rate": 0.001662904191217555, + "loss": 0.1816, + "step": 28068 + }, + { + "epoch": 0.24365239885070442, + "grad_norm": 0.6171875, + "learning_rate": 0.0016628809683074033, + "loss": 0.1436, + "step": 28069 + }, + { + "epoch": 0.24366107933090858, + "grad_norm": 0.1064453125, + "learning_rate": 0.0016628577447816936, + "loss": 0.1006, + "step": 28070 + }, + { + "epoch": 0.24366975981111275, + "grad_norm": 0.08984375, + "learning_rate": 0.0016628345206404506, + "loss": 0.0645, + "step": 28071 + }, + { + "epoch": 0.2436784402913169, + "grad_norm": 0.283203125, + "learning_rate": 0.0016628112958837005, + "loss": 0.1006, + "step": 28072 + }, + { + "epoch": 0.24368712077152108, + "grad_norm": 0.3515625, + "learning_rate": 0.0016627880705114687, + "loss": 0.1514, + "step": 28073 + }, + { + "epoch": 0.24369580125172524, + "grad_norm": 0.6328125, + "learning_rate": 0.00166276484452378, + "loss": 0.1387, + "step": 28074 + }, + { + "epoch": 0.2437044817319294, + "grad_norm": 0.61328125, + "learning_rate": 0.0016627416179206603, + "loss": 0.0977, + "step": 28075 + }, + { + "epoch": 0.24371316221213357, + "grad_norm": 0.2314453125, + "learning_rate": 0.0016627183907021348, + "loss": 0.1084, + "step": 28076 + }, + { + "epoch": 0.24372184269233774, + "grad_norm": 0.578125, + "learning_rate": 0.0016626951628682285, + "loss": 0.0889, + "step": 28077 + }, + { + "epoch": 0.2437305231725419, + "grad_norm": 0.11767578125, + "learning_rate": 0.0016626719344189679, + "loss": 0.1006, + "step": 28078 + }, + { + "epoch": 0.24373920365274607, + "grad_norm": 1.0078125, + "learning_rate": 0.0016626487053543774, + "loss": 0.1855, + "step": 28079 + }, + { + "epoch": 0.24374788413295023, + "grad_norm": 0.140625, + "learning_rate": 0.0016626254756744827, + "loss": 0.1177, + "step": 28080 + }, + { + "epoch": 0.2437565646131544, + "grad_norm": 0.328125, + "learning_rate": 0.0016626022453793093, + "loss": 0.1367, + "step": 28081 + }, + { + "epoch": 0.24376524509335856, + "grad_norm": 0.9375, + "learning_rate": 0.0016625790144688824, + "loss": 0.0952, + "step": 28082 + }, + { + "epoch": 0.24377392557356273, + "grad_norm": 0.416015625, + "learning_rate": 0.0016625557829432279, + "loss": 0.0835, + "step": 28083 + }, + { + "epoch": 0.2437826060537669, + "grad_norm": 0.171875, + "learning_rate": 0.0016625325508023703, + "loss": 0.127, + "step": 28084 + }, + { + "epoch": 0.24379128653397106, + "grad_norm": 0.1455078125, + "learning_rate": 
0.001662509318046336, + "loss": 0.0996, + "step": 28085 + }, + { + "epoch": 0.24379996701417522, + "grad_norm": 0.73828125, + "learning_rate": 0.0016624860846751496, + "loss": 0.1318, + "step": 28086 + }, + { + "epoch": 0.2438086474943794, + "grad_norm": 0.07763671875, + "learning_rate": 0.001662462850688837, + "loss": 0.084, + "step": 28087 + }, + { + "epoch": 0.24381732797458355, + "grad_norm": 0.23046875, + "learning_rate": 0.001662439616087424, + "loss": 0.0869, + "step": 28088 + }, + { + "epoch": 0.24382600845478772, + "grad_norm": 0.0810546875, + "learning_rate": 0.001662416380870935, + "loss": 0.1045, + "step": 28089 + }, + { + "epoch": 0.24383468893499188, + "grad_norm": 0.443359375, + "learning_rate": 0.001662393145039396, + "loss": 0.1309, + "step": 28090 + }, + { + "epoch": 0.24384336941519605, + "grad_norm": 0.115234375, + "learning_rate": 0.0016623699085928322, + "loss": 0.0894, + "step": 28091 + }, + { + "epoch": 0.2438520498954002, + "grad_norm": 0.08349609375, + "learning_rate": 0.001662346671531269, + "loss": 0.1016, + "step": 28092 + }, + { + "epoch": 0.24386073037560438, + "grad_norm": 0.263671875, + "learning_rate": 0.0016623234338547321, + "loss": 0.0996, + "step": 28093 + }, + { + "epoch": 0.24386941085580854, + "grad_norm": 0.328125, + "learning_rate": 0.0016623001955632466, + "loss": 0.1289, + "step": 28094 + }, + { + "epoch": 0.2438780913360127, + "grad_norm": 0.50390625, + "learning_rate": 0.0016622769566568382, + "loss": 0.1001, + "step": 28095 + }, + { + "epoch": 0.24388677181621687, + "grad_norm": 0.11474609375, + "learning_rate": 0.001662253717135532, + "loss": 0.1523, + "step": 28096 + }, + { + "epoch": 0.24389545229642104, + "grad_norm": 0.333984375, + "learning_rate": 0.001662230476999354, + "loss": 0.1045, + "step": 28097 + }, + { + "epoch": 0.2439041327766252, + "grad_norm": 0.6640625, + "learning_rate": 0.001662207236248329, + "loss": 0.1172, + "step": 28098 + }, + { + "epoch": 0.24391281325682937, + "grad_norm": 0.2197265625, + "learning_rate": 0.0016621839948824826, + "loss": 0.1025, + "step": 28099 + }, + { + "epoch": 0.24392149373703353, + "grad_norm": 0.1474609375, + "learning_rate": 0.00166216075290184, + "loss": 0.1025, + "step": 28100 + }, + { + "epoch": 0.2439301742172377, + "grad_norm": 0.1103515625, + "learning_rate": 0.001662137510306427, + "loss": 0.0898, + "step": 28101 + }, + { + "epoch": 0.24393885469744186, + "grad_norm": 0.31640625, + "learning_rate": 0.001662114267096269, + "loss": 0.0879, + "step": 28102 + }, + { + "epoch": 0.24394753517764603, + "grad_norm": 0.44921875, + "learning_rate": 0.0016620910232713912, + "loss": 0.085, + "step": 28103 + }, + { + "epoch": 0.2439562156578502, + "grad_norm": 0.2373046875, + "learning_rate": 0.001662067778831819, + "loss": 0.1113, + "step": 28104 + }, + { + "epoch": 0.24396489613805436, + "grad_norm": 0.3515625, + "learning_rate": 0.0016620445337775782, + "loss": 0.1006, + "step": 28105 + }, + { + "epoch": 0.24397357661825853, + "grad_norm": 0.1083984375, + "learning_rate": 0.0016620212881086939, + "loss": 0.1094, + "step": 28106 + }, + { + "epoch": 0.2439822570984627, + "grad_norm": 0.9609375, + "learning_rate": 0.0016619980418251911, + "loss": 0.0864, + "step": 28107 + }, + { + "epoch": 0.24399093757866686, + "grad_norm": 0.7265625, + "learning_rate": 0.001661974794927096, + "loss": 0.1143, + "step": 28108 + }, + { + "epoch": 0.24399961805887102, + "grad_norm": 0.466796875, + "learning_rate": 0.0016619515474144337, + "loss": 0.0898, + "step": 28109 + }, + { + "epoch": 0.24400829853907519, + 
"grad_norm": 0.89453125, + "learning_rate": 0.0016619282992872298, + "loss": 0.1377, + "step": 28110 + }, + { + "epoch": 0.24401697901927935, + "grad_norm": 0.16015625, + "learning_rate": 0.0016619050505455092, + "loss": 0.0801, + "step": 28111 + }, + { + "epoch": 0.24402565949948352, + "grad_norm": 0.369140625, + "learning_rate": 0.001661881801189298, + "loss": 0.125, + "step": 28112 + }, + { + "epoch": 0.24403433997968768, + "grad_norm": 0.388671875, + "learning_rate": 0.001661858551218621, + "loss": 0.1299, + "step": 28113 + }, + { + "epoch": 0.24404302045989185, + "grad_norm": 0.1162109375, + "learning_rate": 0.0016618353006335042, + "loss": 0.0913, + "step": 28114 + }, + { + "epoch": 0.244051700940096, + "grad_norm": 0.5390625, + "learning_rate": 0.001661812049433973, + "loss": 0.0874, + "step": 28115 + }, + { + "epoch": 0.24406038142030018, + "grad_norm": 0.1328125, + "learning_rate": 0.0016617887976200518, + "loss": 0.1221, + "step": 28116 + }, + { + "epoch": 0.24406906190050434, + "grad_norm": 0.52734375, + "learning_rate": 0.0016617655451917673, + "loss": 0.1396, + "step": 28117 + }, + { + "epoch": 0.2440777423807085, + "grad_norm": 0.10400390625, + "learning_rate": 0.0016617422921491446, + "loss": 0.0957, + "step": 28118 + }, + { + "epoch": 0.24408642286091267, + "grad_norm": 0.20703125, + "learning_rate": 0.0016617190384922086, + "loss": 0.1357, + "step": 28119 + }, + { + "epoch": 0.24409510334111684, + "grad_norm": 0.154296875, + "learning_rate": 0.0016616957842209852, + "loss": 0.1309, + "step": 28120 + }, + { + "epoch": 0.244103783821321, + "grad_norm": 0.10400390625, + "learning_rate": 0.0016616725293355, + "loss": 0.0938, + "step": 28121 + }, + { + "epoch": 0.24411246430152517, + "grad_norm": 0.07373046875, + "learning_rate": 0.0016616492738357777, + "loss": 0.103, + "step": 28122 + }, + { + "epoch": 0.24412114478172933, + "grad_norm": 0.57421875, + "learning_rate": 0.0016616260177218446, + "loss": 0.1025, + "step": 28123 + }, + { + "epoch": 0.2441298252619335, + "grad_norm": 0.326171875, + "learning_rate": 0.0016616027609937254, + "loss": 0.0898, + "step": 28124 + }, + { + "epoch": 0.24413850574213766, + "grad_norm": 0.09765625, + "learning_rate": 0.0016615795036514461, + "loss": 0.062, + "step": 28125 + }, + { + "epoch": 0.24414718622234183, + "grad_norm": 0.11767578125, + "learning_rate": 0.001661556245695032, + "loss": 0.0815, + "step": 28126 + }, + { + "epoch": 0.244155866702546, + "grad_norm": 0.10595703125, + "learning_rate": 0.001661532987124508, + "loss": 0.0811, + "step": 28127 + }, + { + "epoch": 0.24416454718275016, + "grad_norm": 0.12353515625, + "learning_rate": 0.0016615097279399004, + "loss": 0.084, + "step": 28128 + }, + { + "epoch": 0.24417322766295432, + "grad_norm": 0.578125, + "learning_rate": 0.001661486468141234, + "loss": 0.125, + "step": 28129 + }, + { + "epoch": 0.2441819081431585, + "grad_norm": 0.671875, + "learning_rate": 0.0016614632077285344, + "loss": 0.1289, + "step": 28130 + }, + { + "epoch": 0.24419058862336265, + "grad_norm": 0.44140625, + "learning_rate": 0.0016614399467018274, + "loss": 0.0879, + "step": 28131 + }, + { + "epoch": 0.24419926910356682, + "grad_norm": 0.177734375, + "learning_rate": 0.0016614166850611376, + "loss": 0.1309, + "step": 28132 + }, + { + "epoch": 0.24420794958377098, + "grad_norm": 0.296875, + "learning_rate": 0.0016613934228064911, + "loss": 0.1045, + "step": 28133 + }, + { + "epoch": 0.24421663006397515, + "grad_norm": 0.7734375, + "learning_rate": 0.0016613701599379135, + "loss": 0.0645, + "step": 28134 + }, 
+ { + "epoch": 0.2442253105441793, + "grad_norm": 0.10791015625, + "learning_rate": 0.0016613468964554294, + "loss": 0.1133, + "step": 28135 + }, + { + "epoch": 0.24423399102438348, + "grad_norm": 0.54296875, + "learning_rate": 0.0016613236323590654, + "loss": 0.0684, + "step": 28136 + }, + { + "epoch": 0.24424267150458764, + "grad_norm": 0.60546875, + "learning_rate": 0.0016613003676488457, + "loss": 0.1172, + "step": 28137 + }, + { + "epoch": 0.2442513519847918, + "grad_norm": 0.462890625, + "learning_rate": 0.0016612771023247965, + "loss": 0.127, + "step": 28138 + }, + { + "epoch": 0.24426003246499597, + "grad_norm": 0.5234375, + "learning_rate": 0.0016612538363869435, + "loss": 0.1201, + "step": 28139 + }, + { + "epoch": 0.24426871294520014, + "grad_norm": 0.19140625, + "learning_rate": 0.0016612305698353114, + "loss": 0.0996, + "step": 28140 + }, + { + "epoch": 0.2442773934254043, + "grad_norm": 0.17578125, + "learning_rate": 0.0016612073026699263, + "loss": 0.1182, + "step": 28141 + }, + { + "epoch": 0.24428607390560847, + "grad_norm": 0.408203125, + "learning_rate": 0.001661184034890813, + "loss": 0.0737, + "step": 28142 + }, + { + "epoch": 0.24429475438581263, + "grad_norm": 0.185546875, + "learning_rate": 0.001661160766497997, + "loss": 0.1279, + "step": 28143 + }, + { + "epoch": 0.2443034348660168, + "grad_norm": 0.1025390625, + "learning_rate": 0.0016611374974915046, + "loss": 0.0938, + "step": 28144 + }, + { + "epoch": 0.24431211534622096, + "grad_norm": 0.462890625, + "learning_rate": 0.0016611142278713602, + "loss": 0.105, + "step": 28145 + }, + { + "epoch": 0.24432079582642513, + "grad_norm": 0.11376953125, + "learning_rate": 0.0016610909576375902, + "loss": 0.0791, + "step": 28146 + }, + { + "epoch": 0.2443294763066293, + "grad_norm": 0.390625, + "learning_rate": 0.0016610676867902191, + "loss": 0.1201, + "step": 28147 + }, + { + "epoch": 0.24433815678683346, + "grad_norm": 0.17578125, + "learning_rate": 0.001661044415329273, + "loss": 0.1074, + "step": 28148 + }, + { + "epoch": 0.24434683726703763, + "grad_norm": 0.349609375, + "learning_rate": 0.0016610211432547773, + "loss": 0.1201, + "step": 28149 + }, + { + "epoch": 0.2443555177472418, + "grad_norm": 0.1787109375, + "learning_rate": 0.0016609978705667567, + "loss": 0.0864, + "step": 28150 + }, + { + "epoch": 0.24436419822744596, + "grad_norm": 0.765625, + "learning_rate": 0.001660974597265238, + "loss": 0.0991, + "step": 28151 + }, + { + "epoch": 0.24437287870765012, + "grad_norm": 0.392578125, + "learning_rate": 0.0016609513233502456, + "loss": 0.0957, + "step": 28152 + }, + { + "epoch": 0.24438155918785429, + "grad_norm": 0.11083984375, + "learning_rate": 0.0016609280488218052, + "loss": 0.1279, + "step": 28153 + }, + { + "epoch": 0.24439023966805842, + "grad_norm": 0.2490234375, + "learning_rate": 0.0016609047736799425, + "loss": 0.1016, + "step": 28154 + }, + { + "epoch": 0.2443989201482626, + "grad_norm": 0.1689453125, + "learning_rate": 0.0016608814979246825, + "loss": 0.1245, + "step": 28155 + }, + { + "epoch": 0.24440760062846675, + "grad_norm": 0.373046875, + "learning_rate": 0.0016608582215560514, + "loss": 0.0972, + "step": 28156 + }, + { + "epoch": 0.24441628110867092, + "grad_norm": 0.2138671875, + "learning_rate": 0.0016608349445740736, + "loss": 0.1025, + "step": 28157 + }, + { + "epoch": 0.24442496158887508, + "grad_norm": 0.10546875, + "learning_rate": 0.0016608116669787757, + "loss": 0.1074, + "step": 28158 + }, + { + "epoch": 0.24443364206907925, + "grad_norm": 0.10888671875, + "learning_rate": 
0.001660788388770182, + "loss": 0.0957, + "step": 28159 + }, + { + "epoch": 0.24444232254928341, + "grad_norm": 0.115234375, + "learning_rate": 0.001660765109948319, + "loss": 0.0791, + "step": 28160 + }, + { + "epoch": 0.24445100302948758, + "grad_norm": 0.19921875, + "learning_rate": 0.0016607418305132117, + "loss": 0.1348, + "step": 28161 + }, + { + "epoch": 0.24445968350969174, + "grad_norm": 0.328125, + "learning_rate": 0.001660718550464885, + "loss": 0.1377, + "step": 28162 + }, + { + "epoch": 0.2444683639898959, + "grad_norm": 0.10302734375, + "learning_rate": 0.0016606952698033654, + "loss": 0.1377, + "step": 28163 + }, + { + "epoch": 0.24447704447010007, + "grad_norm": 0.27734375, + "learning_rate": 0.0016606719885286778, + "loss": 0.0737, + "step": 28164 + }, + { + "epoch": 0.24448572495030424, + "grad_norm": 1.0859375, + "learning_rate": 0.0016606487066408479, + "loss": 0.1133, + "step": 28165 + }, + { + "epoch": 0.2444944054305084, + "grad_norm": 0.359375, + "learning_rate": 0.0016606254241399005, + "loss": 0.1123, + "step": 28166 + }, + { + "epoch": 0.24450308591071257, + "grad_norm": 0.6171875, + "learning_rate": 0.0016606021410258622, + "loss": 0.104, + "step": 28167 + }, + { + "epoch": 0.24451176639091673, + "grad_norm": 0.384765625, + "learning_rate": 0.0016605788572987572, + "loss": 0.1055, + "step": 28168 + }, + { + "epoch": 0.2445204468711209, + "grad_norm": 0.376953125, + "learning_rate": 0.001660555572958612, + "loss": 0.126, + "step": 28169 + }, + { + "epoch": 0.24452912735132507, + "grad_norm": 0.490234375, + "learning_rate": 0.0016605322880054515, + "loss": 0.0996, + "step": 28170 + }, + { + "epoch": 0.24453780783152923, + "grad_norm": 0.2021484375, + "learning_rate": 0.0016605090024393013, + "loss": 0.0664, + "step": 28171 + }, + { + "epoch": 0.2445464883117334, + "grad_norm": 0.25390625, + "learning_rate": 0.0016604857162601868, + "loss": 0.1562, + "step": 28172 + }, + { + "epoch": 0.24455516879193756, + "grad_norm": 0.2099609375, + "learning_rate": 0.0016604624294681337, + "loss": 0.1055, + "step": 28173 + }, + { + "epoch": 0.24456384927214173, + "grad_norm": 0.08447265625, + "learning_rate": 0.0016604391420631673, + "loss": 0.1016, + "step": 28174 + }, + { + "epoch": 0.2445725297523459, + "grad_norm": 0.279296875, + "learning_rate": 0.0016604158540453131, + "loss": 0.0796, + "step": 28175 + }, + { + "epoch": 0.24458121023255006, + "grad_norm": 0.59765625, + "learning_rate": 0.0016603925654145966, + "loss": 0.1191, + "step": 28176 + }, + { + "epoch": 0.24458989071275422, + "grad_norm": 0.3046875, + "learning_rate": 0.001660369276171043, + "loss": 0.0889, + "step": 28177 + }, + { + "epoch": 0.2445985711929584, + "grad_norm": 0.51953125, + "learning_rate": 0.0016603459863146779, + "loss": 0.0815, + "step": 28178 + }, + { + "epoch": 0.24460725167316255, + "grad_norm": 0.09521484375, + "learning_rate": 0.0016603226958455273, + "loss": 0.0762, + "step": 28179 + }, + { + "epoch": 0.24461593215336672, + "grad_norm": 0.2373046875, + "learning_rate": 0.0016602994047636157, + "loss": 0.1123, + "step": 28180 + }, + { + "epoch": 0.24462461263357088, + "grad_norm": 0.0859375, + "learning_rate": 0.0016602761130689694, + "loss": 0.0854, + "step": 28181 + }, + { + "epoch": 0.24463329311377505, + "grad_norm": 0.359375, + "learning_rate": 0.0016602528207616137, + "loss": 0.1074, + "step": 28182 + }, + { + "epoch": 0.2446419735939792, + "grad_norm": 0.95703125, + "learning_rate": 0.0016602295278415734, + "loss": 0.1235, + "step": 28183 + }, + { + "epoch": 0.24465065407418338, + 
"grad_norm": 0.453125, + "learning_rate": 0.0016602062343088748, + "loss": 0.1191, + "step": 28184 + }, + { + "epoch": 0.24465933455438754, + "grad_norm": 0.48046875, + "learning_rate": 0.0016601829401635432, + "loss": 0.0957, + "step": 28185 + }, + { + "epoch": 0.2446680150345917, + "grad_norm": 0.166015625, + "learning_rate": 0.0016601596454056038, + "loss": 0.0957, + "step": 28186 + }, + { + "epoch": 0.24467669551479587, + "grad_norm": 0.1337890625, + "learning_rate": 0.0016601363500350823, + "loss": 0.1143, + "step": 28187 + }, + { + "epoch": 0.24468537599500004, + "grad_norm": 0.08203125, + "learning_rate": 0.0016601130540520041, + "loss": 0.1055, + "step": 28188 + }, + { + "epoch": 0.2446940564752042, + "grad_norm": 0.07958984375, + "learning_rate": 0.0016600897574563947, + "loss": 0.0962, + "step": 28189 + }, + { + "epoch": 0.24470273695540837, + "grad_norm": 0.1494140625, + "learning_rate": 0.0016600664602482796, + "loss": 0.0996, + "step": 28190 + }, + { + "epoch": 0.24471141743561253, + "grad_norm": 0.1513671875, + "learning_rate": 0.0016600431624276839, + "loss": 0.0684, + "step": 28191 + }, + { + "epoch": 0.2447200979158167, + "grad_norm": 0.10205078125, + "learning_rate": 0.001660019863994634, + "loss": 0.1064, + "step": 28192 + }, + { + "epoch": 0.24472877839602086, + "grad_norm": 0.392578125, + "learning_rate": 0.0016599965649491543, + "loss": 0.1025, + "step": 28193 + }, + { + "epoch": 0.24473745887622503, + "grad_norm": 0.451171875, + "learning_rate": 0.0016599732652912709, + "loss": 0.103, + "step": 28194 + }, + { + "epoch": 0.2447461393564292, + "grad_norm": 0.189453125, + "learning_rate": 0.0016599499650210093, + "loss": 0.124, + "step": 28195 + }, + { + "epoch": 0.24475481983663336, + "grad_norm": 0.181640625, + "learning_rate": 0.0016599266641383946, + "loss": 0.1001, + "step": 28196 + }, + { + "epoch": 0.24476350031683752, + "grad_norm": 0.263671875, + "learning_rate": 0.0016599033626434527, + "loss": 0.0933, + "step": 28197 + }, + { + "epoch": 0.2447721807970417, + "grad_norm": 0.240234375, + "learning_rate": 0.001659880060536209, + "loss": 0.1455, + "step": 28198 + }, + { + "epoch": 0.24478086127724585, + "grad_norm": 0.126953125, + "learning_rate": 0.0016598567578166884, + "loss": 0.0593, + "step": 28199 + }, + { + "epoch": 0.24478954175745002, + "grad_norm": 0.416015625, + "learning_rate": 0.0016598334544849175, + "loss": 0.0933, + "step": 28200 + }, + { + "epoch": 0.24479822223765418, + "grad_norm": 0.2333984375, + "learning_rate": 0.0016598101505409207, + "loss": 0.1094, + "step": 28201 + }, + { + "epoch": 0.24480690271785835, + "grad_norm": 0.97265625, + "learning_rate": 0.0016597868459847238, + "loss": 0.1113, + "step": 28202 + }, + { + "epoch": 0.24481558319806251, + "grad_norm": 0.259765625, + "learning_rate": 0.001659763540816353, + "loss": 0.1338, + "step": 28203 + }, + { + "epoch": 0.24482426367826668, + "grad_norm": 0.060302734375, + "learning_rate": 0.0016597402350358329, + "loss": 0.0947, + "step": 28204 + }, + { + "epoch": 0.24483294415847084, + "grad_norm": 0.66015625, + "learning_rate": 0.001659716928643189, + "loss": 0.1143, + "step": 28205 + }, + { + "epoch": 0.244841624638675, + "grad_norm": 0.083984375, + "learning_rate": 0.0016596936216384474, + "loss": 0.0859, + "step": 28206 + }, + { + "epoch": 0.24485030511887917, + "grad_norm": 0.42578125, + "learning_rate": 0.0016596703140216332, + "loss": 0.1348, + "step": 28207 + }, + { + "epoch": 0.24485898559908334, + "grad_norm": 0.12890625, + "learning_rate": 0.0016596470057927718, + "loss": 0.1084, 
+ "step": 28208 + }, + { + "epoch": 0.2448676660792875, + "grad_norm": 0.1728515625, + "learning_rate": 0.001659623696951889, + "loss": 0.1084, + "step": 28209 + }, + { + "epoch": 0.24487634655949167, + "grad_norm": 0.2294921875, + "learning_rate": 0.0016596003874990103, + "loss": 0.1445, + "step": 28210 + }, + { + "epoch": 0.24488502703969584, + "grad_norm": 0.546875, + "learning_rate": 0.0016595770774341608, + "loss": 0.0898, + "step": 28211 + }, + { + "epoch": 0.2448937075199, + "grad_norm": 0.1982421875, + "learning_rate": 0.0016595537667573664, + "loss": 0.1172, + "step": 28212 + }, + { + "epoch": 0.24490238800010417, + "grad_norm": 0.5078125, + "learning_rate": 0.0016595304554686521, + "loss": 0.1084, + "step": 28213 + }, + { + "epoch": 0.24491106848030833, + "grad_norm": 0.1298828125, + "learning_rate": 0.0016595071435680437, + "loss": 0.0688, + "step": 28214 + }, + { + "epoch": 0.2449197489605125, + "grad_norm": 0.111328125, + "learning_rate": 0.001659483831055567, + "loss": 0.1426, + "step": 28215 + }, + { + "epoch": 0.24492842944071666, + "grad_norm": 0.07666015625, + "learning_rate": 0.0016594605179312471, + "loss": 0.1035, + "step": 28216 + }, + { + "epoch": 0.24493710992092083, + "grad_norm": 0.162109375, + "learning_rate": 0.0016594372041951095, + "loss": 0.1016, + "step": 28217 + }, + { + "epoch": 0.244945790401125, + "grad_norm": 0.1513671875, + "learning_rate": 0.00165941388984718, + "loss": 0.125, + "step": 28218 + }, + { + "epoch": 0.24495447088132916, + "grad_norm": 0.51953125, + "learning_rate": 0.0016593905748874833, + "loss": 0.0723, + "step": 28219 + }, + { + "epoch": 0.24496315136153332, + "grad_norm": 0.349609375, + "learning_rate": 0.001659367259316046, + "loss": 0.0801, + "step": 28220 + }, + { + "epoch": 0.2449718318417375, + "grad_norm": 0.326171875, + "learning_rate": 0.0016593439431328932, + "loss": 0.1289, + "step": 28221 + }, + { + "epoch": 0.24498051232194165, + "grad_norm": 0.138671875, + "learning_rate": 0.0016593206263380497, + "loss": 0.0938, + "step": 28222 + }, + { + "epoch": 0.24498919280214582, + "grad_norm": 0.28515625, + "learning_rate": 0.001659297308931542, + "loss": 0.0908, + "step": 28223 + }, + { + "epoch": 0.24499787328234998, + "grad_norm": 0.140625, + "learning_rate": 0.001659273990913395, + "loss": 0.1084, + "step": 28224 + }, + { + "epoch": 0.24500655376255415, + "grad_norm": 0.65625, + "learning_rate": 0.0016592506722836343, + "loss": 0.1299, + "step": 28225 + }, + { + "epoch": 0.2450152342427583, + "grad_norm": 0.423828125, + "learning_rate": 0.0016592273530422856, + "loss": 0.0703, + "step": 28226 + }, + { + "epoch": 0.24502391472296248, + "grad_norm": 0.39453125, + "learning_rate": 0.0016592040331893745, + "loss": 0.105, + "step": 28227 + }, + { + "epoch": 0.24503259520316664, + "grad_norm": 0.2158203125, + "learning_rate": 0.0016591807127249256, + "loss": 0.124, + "step": 28228 + }, + { + "epoch": 0.2450412756833708, + "grad_norm": 0.21875, + "learning_rate": 0.0016591573916489655, + "loss": 0.0933, + "step": 28229 + }, + { + "epoch": 0.24504995616357497, + "grad_norm": 0.376953125, + "learning_rate": 0.0016591340699615195, + "loss": 0.1191, + "step": 28230 + }, + { + "epoch": 0.24505863664377914, + "grad_norm": 0.1064453125, + "learning_rate": 0.0016591107476626127, + "loss": 0.1055, + "step": 28231 + }, + { + "epoch": 0.2450673171239833, + "grad_norm": 0.5234375, + "learning_rate": 0.0016590874247522707, + "loss": 0.1523, + "step": 28232 + }, + { + "epoch": 0.24507599760418747, + "grad_norm": 0.53125, + "learning_rate": 
0.001659064101230519, + "loss": 0.0918, + "step": 28233 + }, + { + "epoch": 0.24508467808439163, + "grad_norm": 0.11083984375, + "learning_rate": 0.0016590407770973835, + "loss": 0.0854, + "step": 28234 + }, + { + "epoch": 0.2450933585645958, + "grad_norm": 0.61328125, + "learning_rate": 0.0016590174523528891, + "loss": 0.1436, + "step": 28235 + }, + { + "epoch": 0.24510203904479996, + "grad_norm": 0.3359375, + "learning_rate": 0.001658994126997062, + "loss": 0.1045, + "step": 28236 + }, + { + "epoch": 0.24511071952500413, + "grad_norm": 0.349609375, + "learning_rate": 0.001658970801029927, + "loss": 0.1445, + "step": 28237 + }, + { + "epoch": 0.2451194000052083, + "grad_norm": 0.15625, + "learning_rate": 0.0016589474744515098, + "loss": 0.0952, + "step": 28238 + }, + { + "epoch": 0.24512808048541246, + "grad_norm": 0.1591796875, + "learning_rate": 0.0016589241472618362, + "loss": 0.0859, + "step": 28239 + }, + { + "epoch": 0.24513676096561662, + "grad_norm": 0.2236328125, + "learning_rate": 0.0016589008194609317, + "loss": 0.1108, + "step": 28240 + }, + { + "epoch": 0.2451454414458208, + "grad_norm": 0.08642578125, + "learning_rate": 0.0016588774910488216, + "loss": 0.082, + "step": 28241 + }, + { + "epoch": 0.24515412192602495, + "grad_norm": 0.7890625, + "learning_rate": 0.001658854162025531, + "loss": 0.1152, + "step": 28242 + }, + { + "epoch": 0.24516280240622912, + "grad_norm": 0.5859375, + "learning_rate": 0.0016588308323910864, + "loss": 0.1289, + "step": 28243 + }, + { + "epoch": 0.24517148288643328, + "grad_norm": 0.294921875, + "learning_rate": 0.0016588075021455127, + "loss": 0.0996, + "step": 28244 + }, + { + "epoch": 0.24518016336663745, + "grad_norm": 0.1669921875, + "learning_rate": 0.0016587841712888357, + "loss": 0.0918, + "step": 28245 + }, + { + "epoch": 0.24518884384684161, + "grad_norm": 0.1259765625, + "learning_rate": 0.0016587608398210803, + "loss": 0.0918, + "step": 28246 + }, + { + "epoch": 0.24519752432704578, + "grad_norm": 0.51953125, + "learning_rate": 0.001658737507742273, + "loss": 0.1182, + "step": 28247 + }, + { + "epoch": 0.24520620480724994, + "grad_norm": 0.25, + "learning_rate": 0.0016587141750524382, + "loss": 0.1216, + "step": 28248 + }, + { + "epoch": 0.2452148852874541, + "grad_norm": 0.142578125, + "learning_rate": 0.0016586908417516022, + "loss": 0.1055, + "step": 28249 + }, + { + "epoch": 0.24522356576765827, + "grad_norm": 0.14453125, + "learning_rate": 0.0016586675078397904, + "loss": 0.0718, + "step": 28250 + }, + { + "epoch": 0.24523224624786244, + "grad_norm": 0.2431640625, + "learning_rate": 0.0016586441733170282, + "loss": 0.1182, + "step": 28251 + }, + { + "epoch": 0.2452409267280666, + "grad_norm": 0.228515625, + "learning_rate": 0.001658620838183341, + "loss": 0.1621, + "step": 28252 + }, + { + "epoch": 0.24524960720827077, + "grad_norm": 0.625, + "learning_rate": 0.0016585975024387544, + "loss": 0.0923, + "step": 28253 + }, + { + "epoch": 0.24525828768847494, + "grad_norm": 0.83984375, + "learning_rate": 0.0016585741660832937, + "loss": 0.1543, + "step": 28254 + }, + { + "epoch": 0.2452669681686791, + "grad_norm": 0.359375, + "learning_rate": 0.0016585508291169853, + "loss": 0.0977, + "step": 28255 + }, + { + "epoch": 0.24527564864888327, + "grad_norm": 0.1669921875, + "learning_rate": 0.001658527491539854, + "loss": 0.0742, + "step": 28256 + }, + { + "epoch": 0.24528432912908743, + "grad_norm": 0.765625, + "learning_rate": 0.001658504153351925, + "loss": 0.1064, + "step": 28257 + }, + { + "epoch": 0.2452930096092916, + "grad_norm": 
1.2421875, + "learning_rate": 0.0016584808145532247, + "loss": 0.1943, + "step": 28258 + }, + { + "epoch": 0.24530169008949576, + "grad_norm": 0.306640625, + "learning_rate": 0.0016584574751437777, + "loss": 0.0957, + "step": 28259 + }, + { + "epoch": 0.24531037056969993, + "grad_norm": 0.3515625, + "learning_rate": 0.0016584341351236105, + "loss": 0.1299, + "step": 28260 + }, + { + "epoch": 0.2453190510499041, + "grad_norm": 0.68359375, + "learning_rate": 0.001658410794492748, + "loss": 0.1348, + "step": 28261 + }, + { + "epoch": 0.24532773153010826, + "grad_norm": 0.1337890625, + "learning_rate": 0.0016583874532512155, + "loss": 0.0762, + "step": 28262 + }, + { + "epoch": 0.24533641201031242, + "grad_norm": 0.1611328125, + "learning_rate": 0.0016583641113990391, + "loss": 0.0957, + "step": 28263 + }, + { + "epoch": 0.2453450924905166, + "grad_norm": 0.32421875, + "learning_rate": 0.0016583407689362442, + "loss": 0.1514, + "step": 28264 + }, + { + "epoch": 0.24535377297072075, + "grad_norm": 0.361328125, + "learning_rate": 0.001658317425862856, + "loss": 0.1045, + "step": 28265 + }, + { + "epoch": 0.24536245345092492, + "grad_norm": 0.6953125, + "learning_rate": 0.0016582940821789004, + "loss": 0.1426, + "step": 28266 + }, + { + "epoch": 0.24537113393112908, + "grad_norm": 0.1416015625, + "learning_rate": 0.0016582707378844026, + "loss": 0.0957, + "step": 28267 + }, + { + "epoch": 0.24537981441133325, + "grad_norm": 0.498046875, + "learning_rate": 0.001658247392979389, + "loss": 0.1318, + "step": 28268 + }, + { + "epoch": 0.2453884948915374, + "grad_norm": 0.154296875, + "learning_rate": 0.0016582240474638838, + "loss": 0.0854, + "step": 28269 + }, + { + "epoch": 0.24539717537174158, + "grad_norm": 0.2890625, + "learning_rate": 0.0016582007013379132, + "loss": 0.1689, + "step": 28270 + }, + { + "epoch": 0.24540585585194574, + "grad_norm": 0.1826171875, + "learning_rate": 0.0016581773546015027, + "loss": 0.0938, + "step": 28271 + }, + { + "epoch": 0.2454145363321499, + "grad_norm": 0.7421875, + "learning_rate": 0.001658154007254678, + "loss": 0.1328, + "step": 28272 + }, + { + "epoch": 0.24542321681235407, + "grad_norm": 0.177734375, + "learning_rate": 0.0016581306592974641, + "loss": 0.0981, + "step": 28273 + }, + { + "epoch": 0.24543189729255824, + "grad_norm": 0.56640625, + "learning_rate": 0.0016581073107298874, + "loss": 0.0952, + "step": 28274 + }, + { + "epoch": 0.2454405777727624, + "grad_norm": 0.08349609375, + "learning_rate": 0.0016580839615519727, + "loss": 0.0894, + "step": 28275 + }, + { + "epoch": 0.24544925825296657, + "grad_norm": 0.361328125, + "learning_rate": 0.0016580606117637458, + "loss": 0.25, + "step": 28276 + }, + { + "epoch": 0.2454579387331707, + "grad_norm": 0.31640625, + "learning_rate": 0.001658037261365232, + "loss": 0.1016, + "step": 28277 + }, + { + "epoch": 0.24546661921337487, + "grad_norm": 0.41015625, + "learning_rate": 0.0016580139103564574, + "loss": 0.1201, + "step": 28278 + }, + { + "epoch": 0.24547529969357904, + "grad_norm": 0.66796875, + "learning_rate": 0.001657990558737447, + "loss": 0.1104, + "step": 28279 + }, + { + "epoch": 0.2454839801737832, + "grad_norm": 0.890625, + "learning_rate": 0.0016579672065082263, + "loss": 0.127, + "step": 28280 + }, + { + "epoch": 0.24549266065398737, + "grad_norm": 0.1982421875, + "learning_rate": 0.0016579438536688212, + "loss": 0.1035, + "step": 28281 + }, + { + "epoch": 0.24550134113419153, + "grad_norm": 0.1689453125, + "learning_rate": 0.0016579205002192573, + "loss": 0.1445, + "step": 28282 + }, + { + 
"epoch": 0.2455100216143957, + "grad_norm": 0.427734375, + "learning_rate": 0.0016578971461595595, + "loss": 0.0884, + "step": 28283 + }, + { + "epoch": 0.24551870209459986, + "grad_norm": 0.2197265625, + "learning_rate": 0.001657873791489754, + "loss": 0.1582, + "step": 28284 + }, + { + "epoch": 0.24552738257480403, + "grad_norm": 0.9921875, + "learning_rate": 0.0016578504362098662, + "loss": 0.125, + "step": 28285 + }, + { + "epoch": 0.2455360630550082, + "grad_norm": 0.1767578125, + "learning_rate": 0.0016578270803199215, + "loss": 0.0879, + "step": 28286 + }, + { + "epoch": 0.24554474353521236, + "grad_norm": 0.5859375, + "learning_rate": 0.0016578037238199452, + "loss": 0.1318, + "step": 28287 + }, + { + "epoch": 0.24555342401541652, + "grad_norm": 0.39453125, + "learning_rate": 0.0016577803667099636, + "loss": 0.1143, + "step": 28288 + }, + { + "epoch": 0.2455621044956207, + "grad_norm": 0.4140625, + "learning_rate": 0.0016577570089900014, + "loss": 0.0938, + "step": 28289 + }, + { + "epoch": 0.24557078497582485, + "grad_norm": 0.09375, + "learning_rate": 0.0016577336506600846, + "loss": 0.1357, + "step": 28290 + }, + { + "epoch": 0.24557946545602902, + "grad_norm": 0.357421875, + "learning_rate": 0.0016577102917202386, + "loss": 0.0874, + "step": 28291 + }, + { + "epoch": 0.24558814593623318, + "grad_norm": 0.162109375, + "learning_rate": 0.0016576869321704892, + "loss": 0.0986, + "step": 28292 + }, + { + "epoch": 0.24559682641643735, + "grad_norm": 0.478515625, + "learning_rate": 0.0016576635720108616, + "loss": 0.1143, + "step": 28293 + }, + { + "epoch": 0.2456055068966415, + "grad_norm": 0.15625, + "learning_rate": 0.0016576402112413817, + "loss": 0.1074, + "step": 28294 + }, + { + "epoch": 0.24561418737684568, + "grad_norm": 0.337890625, + "learning_rate": 0.0016576168498620748, + "loss": 0.1133, + "step": 28295 + }, + { + "epoch": 0.24562286785704984, + "grad_norm": 0.103515625, + "learning_rate": 0.0016575934878729664, + "loss": 0.0713, + "step": 28296 + }, + { + "epoch": 0.245631548337254, + "grad_norm": 0.291015625, + "learning_rate": 0.001657570125274082, + "loss": 0.0669, + "step": 28297 + }, + { + "epoch": 0.24564022881745817, + "grad_norm": 0.1005859375, + "learning_rate": 0.0016575467620654474, + "loss": 0.0938, + "step": 28298 + }, + { + "epoch": 0.24564890929766234, + "grad_norm": 0.68359375, + "learning_rate": 0.001657523398247088, + "loss": 0.1133, + "step": 28299 + }, + { + "epoch": 0.2456575897778665, + "grad_norm": 0.16796875, + "learning_rate": 0.0016575000338190295, + "loss": 0.1289, + "step": 28300 + }, + { + "epoch": 0.24566627025807067, + "grad_norm": 0.5, + "learning_rate": 0.0016574766687812974, + "loss": 0.0732, + "step": 28301 + }, + { + "epoch": 0.24567495073827483, + "grad_norm": 0.10986328125, + "learning_rate": 0.001657453303133917, + "loss": 0.084, + "step": 28302 + }, + { + "epoch": 0.245683631218479, + "grad_norm": 0.2451171875, + "learning_rate": 0.0016574299368769141, + "loss": 0.1279, + "step": 28303 + }, + { + "epoch": 0.24569231169868316, + "grad_norm": 0.11083984375, + "learning_rate": 0.0016574065700103145, + "loss": 0.0908, + "step": 28304 + }, + { + "epoch": 0.24570099217888733, + "grad_norm": 0.125, + "learning_rate": 0.0016573832025341428, + "loss": 0.0889, + "step": 28305 + }, + { + "epoch": 0.2457096726590915, + "grad_norm": 0.08935546875, + "learning_rate": 0.0016573598344484257, + "loss": 0.0918, + "step": 28306 + }, + { + "epoch": 0.24571835313929566, + "grad_norm": 0.1259765625, + "learning_rate": 0.0016573364657531881, + "loss": 
0.0732, + "step": 28307 + }, + { + "epoch": 0.24572703361949982, + "grad_norm": 0.2099609375, + "learning_rate": 0.0016573130964484557, + "loss": 0.0825, + "step": 28308 + }, + { + "epoch": 0.245735714099704, + "grad_norm": 0.38671875, + "learning_rate": 0.0016572897265342541, + "loss": 0.1143, + "step": 28309 + }, + { + "epoch": 0.24574439457990815, + "grad_norm": 0.53515625, + "learning_rate": 0.001657266356010609, + "loss": 0.1123, + "step": 28310 + }, + { + "epoch": 0.24575307506011232, + "grad_norm": 0.076171875, + "learning_rate": 0.0016572429848775454, + "loss": 0.0957, + "step": 28311 + }, + { + "epoch": 0.24576175554031648, + "grad_norm": 0.115234375, + "learning_rate": 0.0016572196131350897, + "loss": 0.0972, + "step": 28312 + }, + { + "epoch": 0.24577043602052065, + "grad_norm": 0.16015625, + "learning_rate": 0.001657196240783267, + "loss": 0.1592, + "step": 28313 + }, + { + "epoch": 0.24577911650072481, + "grad_norm": 0.1787109375, + "learning_rate": 0.0016571728678221025, + "loss": 0.0947, + "step": 28314 + }, + { + "epoch": 0.24578779698092898, + "grad_norm": 0.142578125, + "learning_rate": 0.0016571494942516222, + "loss": 0.1133, + "step": 28315 + }, + { + "epoch": 0.24579647746113314, + "grad_norm": 0.482421875, + "learning_rate": 0.0016571261200718516, + "loss": 0.1216, + "step": 28316 + }, + { + "epoch": 0.2458051579413373, + "grad_norm": 0.25390625, + "learning_rate": 0.0016571027452828163, + "loss": 0.1211, + "step": 28317 + }, + { + "epoch": 0.24581383842154148, + "grad_norm": 0.31640625, + "learning_rate": 0.001657079369884542, + "loss": 0.166, + "step": 28318 + }, + { + "epoch": 0.24582251890174564, + "grad_norm": 0.3828125, + "learning_rate": 0.0016570559938770537, + "loss": 0.1016, + "step": 28319 + }, + { + "epoch": 0.2458311993819498, + "grad_norm": 0.10205078125, + "learning_rate": 0.0016570326172603773, + "loss": 0.1045, + "step": 28320 + }, + { + "epoch": 0.24583987986215397, + "grad_norm": 3.28125, + "learning_rate": 0.0016570092400345388, + "loss": 0.1895, + "step": 28321 + }, + { + "epoch": 0.24584856034235814, + "grad_norm": 0.279296875, + "learning_rate": 0.0016569858621995632, + "loss": 0.0811, + "step": 28322 + }, + { + "epoch": 0.2458572408225623, + "grad_norm": 0.14453125, + "learning_rate": 0.001656962483755476, + "loss": 0.1689, + "step": 28323 + }, + { + "epoch": 0.24586592130276647, + "grad_norm": 0.2041015625, + "learning_rate": 0.0016569391047023035, + "loss": 0.1504, + "step": 28324 + }, + { + "epoch": 0.24587460178297063, + "grad_norm": 0.5234375, + "learning_rate": 0.0016569157250400702, + "loss": 0.1416, + "step": 28325 + }, + { + "epoch": 0.2458832822631748, + "grad_norm": 0.14453125, + "learning_rate": 0.0016568923447688028, + "loss": 0.1133, + "step": 28326 + }, + { + "epoch": 0.24589196274337896, + "grad_norm": 0.109375, + "learning_rate": 0.0016568689638885258, + "loss": 0.1152, + "step": 28327 + }, + { + "epoch": 0.24590064322358313, + "grad_norm": 0.103515625, + "learning_rate": 0.0016568455823992654, + "loss": 0.0908, + "step": 28328 + }, + { + "epoch": 0.2459093237037873, + "grad_norm": 0.2294921875, + "learning_rate": 0.001656822200301047, + "loss": 0.1738, + "step": 28329 + }, + { + "epoch": 0.24591800418399146, + "grad_norm": 0.400390625, + "learning_rate": 0.0016567988175938964, + "loss": 0.0898, + "step": 28330 + }, + { + "epoch": 0.24592668466419562, + "grad_norm": 0.154296875, + "learning_rate": 0.001656775434277839, + "loss": 0.1211, + "step": 28331 + }, + { + "epoch": 0.2459353651443998, + "grad_norm": 0.19140625, + 
"learning_rate": 0.0016567520503529, + "loss": 0.1045, + "step": 28332 + }, + { + "epoch": 0.24594404562460395, + "grad_norm": 1.046875, + "learning_rate": 0.0016567286658191057, + "loss": 0.1328, + "step": 28333 + }, + { + "epoch": 0.24595272610480812, + "grad_norm": 0.470703125, + "learning_rate": 0.0016567052806764813, + "loss": 0.1152, + "step": 28334 + }, + { + "epoch": 0.24596140658501228, + "grad_norm": 0.2080078125, + "learning_rate": 0.0016566818949250522, + "loss": 0.0874, + "step": 28335 + }, + { + "epoch": 0.24597008706521645, + "grad_norm": 0.8984375, + "learning_rate": 0.001656658508564844, + "loss": 0.104, + "step": 28336 + }, + { + "epoch": 0.2459787675454206, + "grad_norm": 0.421875, + "learning_rate": 0.0016566351215958825, + "loss": 0.1006, + "step": 28337 + }, + { + "epoch": 0.24598744802562478, + "grad_norm": 0.09716796875, + "learning_rate": 0.0016566117340181933, + "loss": 0.0869, + "step": 28338 + }, + { + "epoch": 0.24599612850582894, + "grad_norm": 0.318359375, + "learning_rate": 0.001656588345831802, + "loss": 0.0835, + "step": 28339 + }, + { + "epoch": 0.2460048089860331, + "grad_norm": 0.1884765625, + "learning_rate": 0.0016565649570367338, + "loss": 0.0747, + "step": 28340 + }, + { + "epoch": 0.24601348946623727, + "grad_norm": 0.11328125, + "learning_rate": 0.0016565415676330147, + "loss": 0.1104, + "step": 28341 + }, + { + "epoch": 0.24602216994644144, + "grad_norm": 0.1435546875, + "learning_rate": 0.0016565181776206702, + "loss": 0.1055, + "step": 28342 + }, + { + "epoch": 0.2460308504266456, + "grad_norm": 0.19140625, + "learning_rate": 0.0016564947869997255, + "loss": 0.0903, + "step": 28343 + }, + { + "epoch": 0.24603953090684977, + "grad_norm": 0.49609375, + "learning_rate": 0.0016564713957702066, + "loss": 0.1338, + "step": 28344 + }, + { + "epoch": 0.24604821138705393, + "grad_norm": 0.150390625, + "learning_rate": 0.0016564480039321387, + "loss": 0.1152, + "step": 28345 + }, + { + "epoch": 0.2460568918672581, + "grad_norm": 0.2490234375, + "learning_rate": 0.0016564246114855478, + "loss": 0.127, + "step": 28346 + }, + { + "epoch": 0.24606557234746226, + "grad_norm": 0.11767578125, + "learning_rate": 0.0016564012184304595, + "loss": 0.1094, + "step": 28347 + }, + { + "epoch": 0.24607425282766643, + "grad_norm": 0.51171875, + "learning_rate": 0.001656377824766899, + "loss": 0.1182, + "step": 28348 + }, + { + "epoch": 0.2460829333078706, + "grad_norm": 0.68359375, + "learning_rate": 0.0016563544304948924, + "loss": 0.1562, + "step": 28349 + }, + { + "epoch": 0.24609161378807476, + "grad_norm": 0.2421875, + "learning_rate": 0.0016563310356144643, + "loss": 0.105, + "step": 28350 + }, + { + "epoch": 0.24610029426827892, + "grad_norm": 0.3046875, + "learning_rate": 0.0016563076401256416, + "loss": 0.125, + "step": 28351 + }, + { + "epoch": 0.2461089747484831, + "grad_norm": 0.123046875, + "learning_rate": 0.0016562842440284488, + "loss": 0.1055, + "step": 28352 + }, + { + "epoch": 0.24611765522868725, + "grad_norm": 0.0712890625, + "learning_rate": 0.001656260847322912, + "loss": 0.0889, + "step": 28353 + }, + { + "epoch": 0.24612633570889142, + "grad_norm": 0.08740234375, + "learning_rate": 0.0016562374500090568, + "loss": 0.0923, + "step": 28354 + }, + { + "epoch": 0.24613501618909558, + "grad_norm": 0.75, + "learning_rate": 0.0016562140520869084, + "loss": 0.1133, + "step": 28355 + }, + { + "epoch": 0.24614369666929975, + "grad_norm": 0.1240234375, + "learning_rate": 0.001656190653556493, + "loss": 0.1562, + "step": 28356 + }, + { + "epoch": 
0.24615237714950391, + "grad_norm": 0.3125, + "learning_rate": 0.0016561672544178356, + "loss": 0.1094, + "step": 28357 + }, + { + "epoch": 0.24616105762970808, + "grad_norm": 0.12060546875, + "learning_rate": 0.0016561438546709624, + "loss": 0.0718, + "step": 28358 + }, + { + "epoch": 0.24616973810991225, + "grad_norm": 0.13671875, + "learning_rate": 0.0016561204543158983, + "loss": 0.0879, + "step": 28359 + }, + { + "epoch": 0.2461784185901164, + "grad_norm": 0.1875, + "learning_rate": 0.0016560970533526693, + "loss": 0.0977, + "step": 28360 + }, + { + "epoch": 0.24618709907032058, + "grad_norm": 1.0390625, + "learning_rate": 0.0016560736517813011, + "loss": 0.1299, + "step": 28361 + }, + { + "epoch": 0.24619577955052474, + "grad_norm": 0.08349609375, + "learning_rate": 0.0016560502496018189, + "loss": 0.1079, + "step": 28362 + }, + { + "epoch": 0.2462044600307289, + "grad_norm": 0.21484375, + "learning_rate": 0.0016560268468142486, + "loss": 0.0713, + "step": 28363 + }, + { + "epoch": 0.24621314051093307, + "grad_norm": 0.5390625, + "learning_rate": 0.0016560034434186157, + "loss": 0.0811, + "step": 28364 + }, + { + "epoch": 0.24622182099113724, + "grad_norm": 0.3671875, + "learning_rate": 0.0016559800394149457, + "loss": 0.104, + "step": 28365 + }, + { + "epoch": 0.2462305014713414, + "grad_norm": 0.0908203125, + "learning_rate": 0.0016559566348032643, + "loss": 0.1016, + "step": 28366 + }, + { + "epoch": 0.24623918195154557, + "grad_norm": 0.283203125, + "learning_rate": 0.0016559332295835972, + "loss": 0.0869, + "step": 28367 + }, + { + "epoch": 0.24624786243174973, + "grad_norm": 0.4921875, + "learning_rate": 0.0016559098237559696, + "loss": 0.1021, + "step": 28368 + }, + { + "epoch": 0.2462565429119539, + "grad_norm": 0.5546875, + "learning_rate": 0.0016558864173204077, + "loss": 0.1099, + "step": 28369 + }, + { + "epoch": 0.24626522339215806, + "grad_norm": 0.126953125, + "learning_rate": 0.0016558630102769365, + "loss": 0.0889, + "step": 28370 + }, + { + "epoch": 0.24627390387236223, + "grad_norm": 0.294921875, + "learning_rate": 0.001655839602625582, + "loss": 0.1348, + "step": 28371 + }, + { + "epoch": 0.2462825843525664, + "grad_norm": 0.31640625, + "learning_rate": 0.0016558161943663696, + "loss": 0.1406, + "step": 28372 + }, + { + "epoch": 0.24629126483277056, + "grad_norm": 0.1435546875, + "learning_rate": 0.0016557927854993252, + "loss": 0.1084, + "step": 28373 + }, + { + "epoch": 0.24629994531297472, + "grad_norm": 0.63671875, + "learning_rate": 0.0016557693760244738, + "loss": 0.1133, + "step": 28374 + }, + { + "epoch": 0.2463086257931789, + "grad_norm": 0.25, + "learning_rate": 0.0016557459659418414, + "loss": 0.0923, + "step": 28375 + }, + { + "epoch": 0.24631730627338305, + "grad_norm": 0.51171875, + "learning_rate": 0.0016557225552514539, + "loss": 0.1455, + "step": 28376 + }, + { + "epoch": 0.24632598675358722, + "grad_norm": 0.58984375, + "learning_rate": 0.0016556991439533363, + "loss": 0.0986, + "step": 28377 + }, + { + "epoch": 0.24633466723379138, + "grad_norm": 0.1708984375, + "learning_rate": 0.0016556757320475147, + "loss": 0.1211, + "step": 28378 + }, + { + "epoch": 0.24634334771399555, + "grad_norm": 0.5625, + "learning_rate": 0.0016556523195340141, + "loss": 0.1182, + "step": 28379 + }, + { + "epoch": 0.2463520281941997, + "grad_norm": 0.2392578125, + "learning_rate": 0.0016556289064128605, + "loss": 0.1221, + "step": 28380 + }, + { + "epoch": 0.24636070867440388, + "grad_norm": 0.140625, + "learning_rate": 0.00165560549268408, + "loss": 0.1025, + "step": 
28381 + }, + { + "epoch": 0.24636938915460804, + "grad_norm": 0.466796875, + "learning_rate": 0.0016555820783476969, + "loss": 0.0859, + "step": 28382 + }, + { + "epoch": 0.2463780696348122, + "grad_norm": 0.181640625, + "learning_rate": 0.0016555586634037383, + "loss": 0.1133, + "step": 28383 + }, + { + "epoch": 0.24638675011501637, + "grad_norm": 0.1376953125, + "learning_rate": 0.0016555352478522288, + "loss": 0.084, + "step": 28384 + }, + { + "epoch": 0.24639543059522054, + "grad_norm": 0.419921875, + "learning_rate": 0.001655511831693194, + "loss": 0.1211, + "step": 28385 + }, + { + "epoch": 0.2464041110754247, + "grad_norm": 0.12353515625, + "learning_rate": 0.0016554884149266602, + "loss": 0.0894, + "step": 28386 + }, + { + "epoch": 0.24641279155562887, + "grad_norm": 0.82421875, + "learning_rate": 0.0016554649975526528, + "loss": 0.1348, + "step": 28387 + }, + { + "epoch": 0.24642147203583303, + "grad_norm": 0.318359375, + "learning_rate": 0.0016554415795711967, + "loss": 0.1196, + "step": 28388 + }, + { + "epoch": 0.2464301525160372, + "grad_norm": 0.1796875, + "learning_rate": 0.0016554181609823183, + "loss": 0.1094, + "step": 28389 + }, + { + "epoch": 0.24643883299624136, + "grad_norm": 0.1552734375, + "learning_rate": 0.001655394741786043, + "loss": 0.1348, + "step": 28390 + }, + { + "epoch": 0.24644751347644553, + "grad_norm": 0.30859375, + "learning_rate": 0.0016553713219823966, + "loss": 0.0781, + "step": 28391 + }, + { + "epoch": 0.2464561939566497, + "grad_norm": 0.248046875, + "learning_rate": 0.0016553479015714038, + "loss": 0.1045, + "step": 28392 + }, + { + "epoch": 0.24646487443685386, + "grad_norm": 0.5234375, + "learning_rate": 0.0016553244805530911, + "loss": 0.0791, + "step": 28393 + }, + { + "epoch": 0.24647355491705802, + "grad_norm": 0.2080078125, + "learning_rate": 0.001655301058927484, + "loss": 0.1191, + "step": 28394 + }, + { + "epoch": 0.2464822353972622, + "grad_norm": 0.1552734375, + "learning_rate": 0.001655277636694608, + "loss": 0.1152, + "step": 28395 + }, + { + "epoch": 0.24649091587746635, + "grad_norm": 0.318359375, + "learning_rate": 0.0016552542138544891, + "loss": 0.0908, + "step": 28396 + }, + { + "epoch": 0.24649959635767052, + "grad_norm": 0.1005859375, + "learning_rate": 0.0016552307904071522, + "loss": 0.0957, + "step": 28397 + }, + { + "epoch": 0.24650827683787468, + "grad_norm": 0.8984375, + "learning_rate": 0.0016552073663526228, + "loss": 0.0957, + "step": 28398 + }, + { + "epoch": 0.24651695731807885, + "grad_norm": 0.255859375, + "learning_rate": 0.0016551839416909279, + "loss": 0.1069, + "step": 28399 + }, + { + "epoch": 0.246525637798283, + "grad_norm": 0.375, + "learning_rate": 0.0016551605164220912, + "loss": 0.1035, + "step": 28400 + }, + { + "epoch": 0.24653431827848715, + "grad_norm": 0.33203125, + "learning_rate": 0.0016551370905461401, + "loss": 0.0991, + "step": 28401 + }, + { + "epoch": 0.24654299875869132, + "grad_norm": 0.458984375, + "learning_rate": 0.0016551136640630987, + "loss": 0.1074, + "step": 28402 + }, + { + "epoch": 0.24655167923889548, + "grad_norm": 0.57421875, + "learning_rate": 0.0016550902369729939, + "loss": 0.1089, + "step": 28403 + }, + { + "epoch": 0.24656035971909965, + "grad_norm": 0.13671875, + "learning_rate": 0.0016550668092758507, + "loss": 0.1133, + "step": 28404 + }, + { + "epoch": 0.2465690401993038, + "grad_norm": 0.296875, + "learning_rate": 0.0016550433809716944, + "loss": 0.0996, + "step": 28405 + }, + { + "epoch": 0.24657772067950798, + "grad_norm": 0.1474609375, + "learning_rate": 
0.0016550199520605513, + "loss": 0.0771, + "step": 28406 + }, + { + "epoch": 0.24658640115971214, + "grad_norm": 0.130859375, + "learning_rate": 0.001654996522542447, + "loss": 0.0874, + "step": 28407 + }, + { + "epoch": 0.2465950816399163, + "grad_norm": 0.1865234375, + "learning_rate": 0.0016549730924174065, + "loss": 0.0918, + "step": 28408 + }, + { + "epoch": 0.24660376212012047, + "grad_norm": 0.2734375, + "learning_rate": 0.0016549496616854557, + "loss": 0.126, + "step": 28409 + }, + { + "epoch": 0.24661244260032464, + "grad_norm": 1.1328125, + "learning_rate": 0.0016549262303466202, + "loss": 0.124, + "step": 28410 + }, + { + "epoch": 0.2466211230805288, + "grad_norm": 0.1669921875, + "learning_rate": 0.0016549027984009259, + "loss": 0.0952, + "step": 28411 + }, + { + "epoch": 0.24662980356073297, + "grad_norm": 0.69140625, + "learning_rate": 0.0016548793658483984, + "loss": 0.1074, + "step": 28412 + }, + { + "epoch": 0.24663848404093713, + "grad_norm": 0.71875, + "learning_rate": 0.0016548559326890631, + "loss": 0.1133, + "step": 28413 + }, + { + "epoch": 0.2466471645211413, + "grad_norm": 0.216796875, + "learning_rate": 0.0016548324989229452, + "loss": 0.1187, + "step": 28414 + }, + { + "epoch": 0.24665584500134546, + "grad_norm": 0.2275390625, + "learning_rate": 0.0016548090645500714, + "loss": 0.1289, + "step": 28415 + }, + { + "epoch": 0.24666452548154963, + "grad_norm": 0.66796875, + "learning_rate": 0.0016547856295704665, + "loss": 0.0957, + "step": 28416 + }, + { + "epoch": 0.2466732059617538, + "grad_norm": 0.3046875, + "learning_rate": 0.0016547621939841564, + "loss": 0.1123, + "step": 28417 + }, + { + "epoch": 0.24668188644195796, + "grad_norm": 0.134765625, + "learning_rate": 0.0016547387577911666, + "loss": 0.0869, + "step": 28418 + }, + { + "epoch": 0.24669056692216212, + "grad_norm": 0.2431640625, + "learning_rate": 0.001654715320991523, + "loss": 0.1182, + "step": 28419 + }, + { + "epoch": 0.2466992474023663, + "grad_norm": 0.107421875, + "learning_rate": 0.001654691883585251, + "loss": 0.0967, + "step": 28420 + }, + { + "epoch": 0.24670792788257045, + "grad_norm": 0.20703125, + "learning_rate": 0.0016546684455723763, + "loss": 0.1094, + "step": 28421 + }, + { + "epoch": 0.24671660836277462, + "grad_norm": 0.11865234375, + "learning_rate": 0.0016546450069529245, + "loss": 0.0894, + "step": 28422 + }, + { + "epoch": 0.24672528884297878, + "grad_norm": 0.50390625, + "learning_rate": 0.0016546215677269212, + "loss": 0.0879, + "step": 28423 + }, + { + "epoch": 0.24673396932318295, + "grad_norm": 0.1796875, + "learning_rate": 0.0016545981278943923, + "loss": 0.1367, + "step": 28424 + }, + { + "epoch": 0.24674264980338712, + "grad_norm": 0.59375, + "learning_rate": 0.001654574687455363, + "loss": 0.1182, + "step": 28425 + }, + { + "epoch": 0.24675133028359128, + "grad_norm": 0.140625, + "learning_rate": 0.0016545512464098593, + "loss": 0.1318, + "step": 28426 + }, + { + "epoch": 0.24676001076379545, + "grad_norm": 0.11669921875, + "learning_rate": 0.0016545278047579064, + "loss": 0.105, + "step": 28427 + }, + { + "epoch": 0.2467686912439996, + "grad_norm": 0.33984375, + "learning_rate": 0.0016545043624995306, + "loss": 0.0977, + "step": 28428 + }, + { + "epoch": 0.24677737172420378, + "grad_norm": 0.298828125, + "learning_rate": 0.001654480919634757, + "loss": 0.104, + "step": 28429 + }, + { + "epoch": 0.24678605220440794, + "grad_norm": 0.1123046875, + "learning_rate": 0.0016544574761636116, + "loss": 0.1167, + "step": 28430 + }, + { + "epoch": 0.2467947326846121, + 
"grad_norm": 0.1640625, + "learning_rate": 0.0016544340320861193, + "loss": 0.1328, + "step": 28431 + }, + { + "epoch": 0.24680341316481627, + "grad_norm": 0.5390625, + "learning_rate": 0.0016544105874023065, + "loss": 0.0957, + "step": 28432 + }, + { + "epoch": 0.24681209364502044, + "grad_norm": 0.058837890625, + "learning_rate": 0.0016543871421121987, + "loss": 0.0659, + "step": 28433 + }, + { + "epoch": 0.2468207741252246, + "grad_norm": 1.3359375, + "learning_rate": 0.0016543636962158218, + "loss": 0.124, + "step": 28434 + }, + { + "epoch": 0.24682945460542877, + "grad_norm": 0.1142578125, + "learning_rate": 0.0016543402497132007, + "loss": 0.0757, + "step": 28435 + }, + { + "epoch": 0.24683813508563293, + "grad_norm": 0.59375, + "learning_rate": 0.0016543168026043613, + "loss": 0.1318, + "step": 28436 + }, + { + "epoch": 0.2468468155658371, + "grad_norm": 0.11328125, + "learning_rate": 0.0016542933548893298, + "loss": 0.1045, + "step": 28437 + }, + { + "epoch": 0.24685549604604126, + "grad_norm": 0.1474609375, + "learning_rate": 0.001654269906568131, + "loss": 0.0977, + "step": 28438 + }, + { + "epoch": 0.24686417652624543, + "grad_norm": 0.162109375, + "learning_rate": 0.0016542464576407912, + "loss": 0.0791, + "step": 28439 + }, + { + "epoch": 0.2468728570064496, + "grad_norm": 0.1787109375, + "learning_rate": 0.0016542230081073358, + "loss": 0.1396, + "step": 28440 + }, + { + "epoch": 0.24688153748665376, + "grad_norm": 0.263671875, + "learning_rate": 0.0016541995579677903, + "loss": 0.1084, + "step": 28441 + }, + { + "epoch": 0.24689021796685792, + "grad_norm": 0.2177734375, + "learning_rate": 0.0016541761072221806, + "loss": 0.124, + "step": 28442 + }, + { + "epoch": 0.2468988984470621, + "grad_norm": 0.1171875, + "learning_rate": 0.0016541526558705322, + "loss": 0.0972, + "step": 28443 + }, + { + "epoch": 0.24690757892726625, + "grad_norm": 0.40234375, + "learning_rate": 0.0016541292039128708, + "loss": 0.1436, + "step": 28444 + }, + { + "epoch": 0.24691625940747042, + "grad_norm": 0.12060546875, + "learning_rate": 0.0016541057513492219, + "loss": 0.1582, + "step": 28445 + }, + { + "epoch": 0.24692493988767458, + "grad_norm": 0.92578125, + "learning_rate": 0.0016540822981796115, + "loss": 0.1367, + "step": 28446 + }, + { + "epoch": 0.24693362036787875, + "grad_norm": 0.671875, + "learning_rate": 0.001654058844404065, + "loss": 0.0776, + "step": 28447 + }, + { + "epoch": 0.2469423008480829, + "grad_norm": 0.734375, + "learning_rate": 0.0016540353900226076, + "loss": 0.0894, + "step": 28448 + }, + { + "epoch": 0.24695098132828708, + "grad_norm": 0.087890625, + "learning_rate": 0.001654011935035266, + "loss": 0.1182, + "step": 28449 + }, + { + "epoch": 0.24695966180849124, + "grad_norm": 0.44921875, + "learning_rate": 0.001653988479442065, + "loss": 0.106, + "step": 28450 + }, + { + "epoch": 0.2469683422886954, + "grad_norm": 0.16796875, + "learning_rate": 0.0016539650232430308, + "loss": 0.0854, + "step": 28451 + }, + { + "epoch": 0.24697702276889957, + "grad_norm": 0.2373046875, + "learning_rate": 0.0016539415664381888, + "loss": 0.0977, + "step": 28452 + }, + { + "epoch": 0.24698570324910374, + "grad_norm": 0.6015625, + "learning_rate": 0.001653918109027564, + "loss": 0.1338, + "step": 28453 + }, + { + "epoch": 0.2469943837293079, + "grad_norm": 0.26171875, + "learning_rate": 0.0016538946510111833, + "loss": 0.1113, + "step": 28454 + }, + { + "epoch": 0.24700306420951207, + "grad_norm": 0.244140625, + "learning_rate": 0.0016538711923890712, + "loss": 0.1035, + "step": 28455 + 
}, + { + "epoch": 0.24701174468971623, + "grad_norm": 0.0888671875, + "learning_rate": 0.0016538477331612545, + "loss": 0.0713, + "step": 28456 + }, + { + "epoch": 0.2470204251699204, + "grad_norm": 0.220703125, + "learning_rate": 0.0016538242733277577, + "loss": 0.1025, + "step": 28457 + }, + { + "epoch": 0.24702910565012456, + "grad_norm": 0.2412109375, + "learning_rate": 0.0016538008128886075, + "loss": 0.0884, + "step": 28458 + }, + { + "epoch": 0.24703778613032873, + "grad_norm": 0.275390625, + "learning_rate": 0.0016537773518438285, + "loss": 0.1108, + "step": 28459 + }, + { + "epoch": 0.2470464666105329, + "grad_norm": 0.2060546875, + "learning_rate": 0.0016537538901934472, + "loss": 0.1309, + "step": 28460 + }, + { + "epoch": 0.24705514709073706, + "grad_norm": 0.5390625, + "learning_rate": 0.0016537304279374892, + "loss": 0.1074, + "step": 28461 + }, + { + "epoch": 0.24706382757094122, + "grad_norm": 0.388671875, + "learning_rate": 0.0016537069650759794, + "loss": 0.1299, + "step": 28462 + }, + { + "epoch": 0.2470725080511454, + "grad_norm": 0.400390625, + "learning_rate": 0.0016536835016089447, + "loss": 0.0708, + "step": 28463 + }, + { + "epoch": 0.24708118853134955, + "grad_norm": 0.486328125, + "learning_rate": 0.0016536600375364092, + "loss": 0.0903, + "step": 28464 + }, + { + "epoch": 0.24708986901155372, + "grad_norm": 0.1025390625, + "learning_rate": 0.0016536365728584, + "loss": 0.0957, + "step": 28465 + }, + { + "epoch": 0.24709854949175789, + "grad_norm": 0.25390625, + "learning_rate": 0.001653613107574942, + "loss": 0.1172, + "step": 28466 + }, + { + "epoch": 0.24710722997196205, + "grad_norm": 0.60546875, + "learning_rate": 0.0016535896416860613, + "loss": 0.083, + "step": 28467 + }, + { + "epoch": 0.24711591045216622, + "grad_norm": 0.259765625, + "learning_rate": 0.0016535661751917827, + "loss": 0.1475, + "step": 28468 + }, + { + "epoch": 0.24712459093237038, + "grad_norm": 0.11181640625, + "learning_rate": 0.0016535427080921325, + "loss": 0.106, + "step": 28469 + }, + { + "epoch": 0.24713327141257455, + "grad_norm": 0.333984375, + "learning_rate": 0.0016535192403871366, + "loss": 0.0913, + "step": 28470 + }, + { + "epoch": 0.2471419518927787, + "grad_norm": 0.76953125, + "learning_rate": 0.0016534957720768203, + "loss": 0.1357, + "step": 28471 + }, + { + "epoch": 0.24715063237298288, + "grad_norm": 0.4140625, + "learning_rate": 0.0016534723031612093, + "loss": 0.0801, + "step": 28472 + }, + { + "epoch": 0.24715931285318704, + "grad_norm": 0.408203125, + "learning_rate": 0.0016534488336403296, + "loss": 0.1719, + "step": 28473 + }, + { + "epoch": 0.2471679933333912, + "grad_norm": 0.10888671875, + "learning_rate": 0.0016534253635142062, + "loss": 0.1133, + "step": 28474 + }, + { + "epoch": 0.24717667381359537, + "grad_norm": 0.1513671875, + "learning_rate": 0.001653401892782865, + "loss": 0.1147, + "step": 28475 + }, + { + "epoch": 0.24718535429379954, + "grad_norm": 0.58984375, + "learning_rate": 0.0016533784214463325, + "loss": 0.1504, + "step": 28476 + }, + { + "epoch": 0.2471940347740037, + "grad_norm": 0.146484375, + "learning_rate": 0.0016533549495046332, + "loss": 0.1167, + "step": 28477 + }, + { + "epoch": 0.24720271525420787, + "grad_norm": 0.2080078125, + "learning_rate": 0.0016533314769577932, + "loss": 0.1445, + "step": 28478 + }, + { + "epoch": 0.24721139573441203, + "grad_norm": 0.0966796875, + "learning_rate": 0.001653308003805838, + "loss": 0.1357, + "step": 28479 + }, + { + "epoch": 0.2472200762146162, + "grad_norm": 0.1240234375, + "learning_rate": 
0.001653284530048794, + "loss": 0.1396, + "step": 28480 + }, + { + "epoch": 0.24722875669482036, + "grad_norm": 0.466796875, + "learning_rate": 0.0016532610556866862, + "loss": 0.1465, + "step": 28481 + }, + { + "epoch": 0.24723743717502453, + "grad_norm": 0.26171875, + "learning_rate": 0.0016532375807195405, + "loss": 0.1167, + "step": 28482 + }, + { + "epoch": 0.2472461176552287, + "grad_norm": 0.5, + "learning_rate": 0.0016532141051473824, + "loss": 0.1191, + "step": 28483 + }, + { + "epoch": 0.24725479813543286, + "grad_norm": 0.1357421875, + "learning_rate": 0.0016531906289702371, + "loss": 0.0874, + "step": 28484 + }, + { + "epoch": 0.24726347861563702, + "grad_norm": 0.31640625, + "learning_rate": 0.0016531671521881318, + "loss": 0.1416, + "step": 28485 + }, + { + "epoch": 0.2472721590958412, + "grad_norm": 0.09521484375, + "learning_rate": 0.0016531436748010908, + "loss": 0.0938, + "step": 28486 + }, + { + "epoch": 0.24728083957604535, + "grad_norm": 0.1376953125, + "learning_rate": 0.0016531201968091398, + "loss": 0.1084, + "step": 28487 + }, + { + "epoch": 0.24728952005624952, + "grad_norm": 0.1982421875, + "learning_rate": 0.0016530967182123055, + "loss": 0.0693, + "step": 28488 + }, + { + "epoch": 0.24729820053645368, + "grad_norm": 0.30859375, + "learning_rate": 0.0016530732390106126, + "loss": 0.1299, + "step": 28489 + }, + { + "epoch": 0.24730688101665785, + "grad_norm": 0.63671875, + "learning_rate": 0.0016530497592040872, + "loss": 0.0859, + "step": 28490 + }, + { + "epoch": 0.247315561496862, + "grad_norm": 0.26953125, + "learning_rate": 0.001653026278792755, + "loss": 0.0996, + "step": 28491 + }, + { + "epoch": 0.24732424197706618, + "grad_norm": 0.59765625, + "learning_rate": 0.0016530027977766414, + "loss": 0.0967, + "step": 28492 + }, + { + "epoch": 0.24733292245727034, + "grad_norm": 0.1611328125, + "learning_rate": 0.0016529793161557723, + "loss": 0.125, + "step": 28493 + }, + { + "epoch": 0.2473416029374745, + "grad_norm": 0.125, + "learning_rate": 0.0016529558339301732, + "loss": 0.1396, + "step": 28494 + }, + { + "epoch": 0.24735028341767867, + "grad_norm": 0.1591796875, + "learning_rate": 0.00165293235109987, + "loss": 0.1094, + "step": 28495 + }, + { + "epoch": 0.24735896389788284, + "grad_norm": 0.421875, + "learning_rate": 0.0016529088676648883, + "loss": 0.0913, + "step": 28496 + }, + { + "epoch": 0.247367644378087, + "grad_norm": 0.55078125, + "learning_rate": 0.0016528853836252537, + "loss": 0.1074, + "step": 28497 + }, + { + "epoch": 0.24737632485829117, + "grad_norm": 0.205078125, + "learning_rate": 0.0016528618989809921, + "loss": 0.1045, + "step": 28498 + }, + { + "epoch": 0.24738500533849533, + "grad_norm": 0.3125, + "learning_rate": 0.001652838413732129, + "loss": 0.1089, + "step": 28499 + }, + { + "epoch": 0.2473936858186995, + "grad_norm": 0.271484375, + "learning_rate": 0.00165281492787869, + "loss": 0.1162, + "step": 28500 + }, + { + "epoch": 0.24740236629890366, + "grad_norm": 0.2412109375, + "learning_rate": 0.0016527914414207012, + "loss": 0.1279, + "step": 28501 + }, + { + "epoch": 0.24741104677910783, + "grad_norm": 0.2109375, + "learning_rate": 0.0016527679543581878, + "loss": 0.1016, + "step": 28502 + }, + { + "epoch": 0.247419727259312, + "grad_norm": 0.296875, + "learning_rate": 0.0016527444666911753, + "loss": 0.0864, + "step": 28503 + }, + { + "epoch": 0.24742840773951616, + "grad_norm": 0.54296875, + "learning_rate": 0.0016527209784196904, + "loss": 0.1289, + "step": 28504 + }, + { + "epoch": 0.24743708821972032, + "grad_norm": 
0.44921875, + "learning_rate": 0.0016526974895437576, + "loss": 0.1143, + "step": 28505 + }, + { + "epoch": 0.2474457686999245, + "grad_norm": 0.416015625, + "learning_rate": 0.0016526740000634035, + "loss": 0.1167, + "step": 28506 + }, + { + "epoch": 0.24745444918012865, + "grad_norm": 0.23828125, + "learning_rate": 0.0016526505099786531, + "loss": 0.1465, + "step": 28507 + }, + { + "epoch": 0.24746312966033282, + "grad_norm": 0.291015625, + "learning_rate": 0.001652627019289533, + "loss": 0.1104, + "step": 28508 + }, + { + "epoch": 0.24747181014053699, + "grad_norm": 0.56640625, + "learning_rate": 0.0016526035279960675, + "loss": 0.1367, + "step": 28509 + }, + { + "epoch": 0.24748049062074115, + "grad_norm": 0.251953125, + "learning_rate": 0.0016525800360982837, + "loss": 0.0933, + "step": 28510 + }, + { + "epoch": 0.24748917110094532, + "grad_norm": 0.138671875, + "learning_rate": 0.0016525565435962064, + "loss": 0.1348, + "step": 28511 + }, + { + "epoch": 0.24749785158114948, + "grad_norm": 0.4375, + "learning_rate": 0.0016525330504898615, + "loss": 0.1113, + "step": 28512 + }, + { + "epoch": 0.24750653206135365, + "grad_norm": 0.5, + "learning_rate": 0.0016525095567792747, + "loss": 0.1338, + "step": 28513 + }, + { + "epoch": 0.2475152125415578, + "grad_norm": 0.10595703125, + "learning_rate": 0.0016524860624644723, + "loss": 0.082, + "step": 28514 + }, + { + "epoch": 0.24752389302176198, + "grad_norm": 1.25, + "learning_rate": 0.0016524625675454787, + "loss": 0.1099, + "step": 28515 + }, + { + "epoch": 0.24753257350196614, + "grad_norm": 0.486328125, + "learning_rate": 0.001652439072022321, + "loss": 0.0649, + "step": 28516 + }, + { + "epoch": 0.2475412539821703, + "grad_norm": 0.1142578125, + "learning_rate": 0.001652415575895024, + "loss": 0.0986, + "step": 28517 + }, + { + "epoch": 0.24754993446237447, + "grad_norm": 0.0986328125, + "learning_rate": 0.0016523920791636133, + "loss": 0.0811, + "step": 28518 + }, + { + "epoch": 0.24755861494257864, + "grad_norm": 0.76953125, + "learning_rate": 0.0016523685818281154, + "loss": 0.0879, + "step": 28519 + }, + { + "epoch": 0.2475672954227828, + "grad_norm": 0.16796875, + "learning_rate": 0.0016523450838885551, + "loss": 0.0811, + "step": 28520 + }, + { + "epoch": 0.24757597590298697, + "grad_norm": 0.171875, + "learning_rate": 0.001652321585344959, + "loss": 0.1221, + "step": 28521 + }, + { + "epoch": 0.24758465638319113, + "grad_norm": 0.19921875, + "learning_rate": 0.001652298086197352, + "loss": 0.0908, + "step": 28522 + }, + { + "epoch": 0.2475933368633953, + "grad_norm": 0.146484375, + "learning_rate": 0.00165227458644576, + "loss": 0.0996, + "step": 28523 + }, + { + "epoch": 0.24760201734359943, + "grad_norm": 0.37890625, + "learning_rate": 0.001652251086090209, + "loss": 0.123, + "step": 28524 + }, + { + "epoch": 0.2476106978238036, + "grad_norm": 0.2041015625, + "learning_rate": 0.0016522275851307245, + "loss": 0.0938, + "step": 28525 + }, + { + "epoch": 0.24761937830400776, + "grad_norm": 0.26171875, + "learning_rate": 0.001652204083567332, + "loss": 0.1245, + "step": 28526 + }, + { + "epoch": 0.24762805878421193, + "grad_norm": 0.54296875, + "learning_rate": 0.0016521805814000577, + "loss": 0.1196, + "step": 28527 + }, + { + "epoch": 0.2476367392644161, + "grad_norm": 0.1142578125, + "learning_rate": 0.0016521570786289267, + "loss": 0.0938, + "step": 28528 + }, + { + "epoch": 0.24764541974462026, + "grad_norm": 0.11181640625, + "learning_rate": 0.0016521335752539652, + "loss": 0.0889, + "step": 28529 + }, + { + "epoch": 
0.24765410022482442, + "grad_norm": 0.2890625, + "learning_rate": 0.0016521100712751989, + "loss": 0.0859, + "step": 28530 + }, + { + "epoch": 0.2476627807050286, + "grad_norm": 0.6796875, + "learning_rate": 0.0016520865666926532, + "loss": 0.1338, + "step": 28531 + }, + { + "epoch": 0.24767146118523276, + "grad_norm": 0.2578125, + "learning_rate": 0.0016520630615063537, + "loss": 0.0737, + "step": 28532 + }, + { + "epoch": 0.24768014166543692, + "grad_norm": 0.40234375, + "learning_rate": 0.0016520395557163266, + "loss": 0.085, + "step": 28533 + }, + { + "epoch": 0.24768882214564109, + "grad_norm": 0.29296875, + "learning_rate": 0.0016520160493225973, + "loss": 0.1001, + "step": 28534 + }, + { + "epoch": 0.24769750262584525, + "grad_norm": 0.10546875, + "learning_rate": 0.0016519925423251916, + "loss": 0.1226, + "step": 28535 + }, + { + "epoch": 0.24770618310604942, + "grad_norm": 0.1630859375, + "learning_rate": 0.0016519690347241347, + "loss": 0.0762, + "step": 28536 + }, + { + "epoch": 0.24771486358625358, + "grad_norm": 0.228515625, + "learning_rate": 0.0016519455265194532, + "loss": 0.1367, + "step": 28537 + }, + { + "epoch": 0.24772354406645775, + "grad_norm": 0.47265625, + "learning_rate": 0.0016519220177111725, + "loss": 0.1177, + "step": 28538 + }, + { + "epoch": 0.2477322245466619, + "grad_norm": 0.0791015625, + "learning_rate": 0.0016518985082993177, + "loss": 0.1006, + "step": 28539 + }, + { + "epoch": 0.24774090502686608, + "grad_norm": 0.0771484375, + "learning_rate": 0.0016518749982839152, + "loss": 0.0884, + "step": 28540 + }, + { + "epoch": 0.24774958550707024, + "grad_norm": 0.447265625, + "learning_rate": 0.0016518514876649906, + "loss": 0.1436, + "step": 28541 + }, + { + "epoch": 0.2477582659872744, + "grad_norm": 0.6953125, + "learning_rate": 0.0016518279764425692, + "loss": 0.1245, + "step": 28542 + }, + { + "epoch": 0.24776694646747857, + "grad_norm": 0.10498046875, + "learning_rate": 0.0016518044646166772, + "loss": 0.1152, + "step": 28543 + }, + { + "epoch": 0.24777562694768274, + "grad_norm": 0.25390625, + "learning_rate": 0.0016517809521873403, + "loss": 0.0752, + "step": 28544 + }, + { + "epoch": 0.2477843074278869, + "grad_norm": 0.1533203125, + "learning_rate": 0.001651757439154584, + "loss": 0.1089, + "step": 28545 + }, + { + "epoch": 0.24779298790809107, + "grad_norm": 0.439453125, + "learning_rate": 0.0016517339255184339, + "loss": 0.0869, + "step": 28546 + }, + { + "epoch": 0.24780166838829523, + "grad_norm": 0.6796875, + "learning_rate": 0.001651710411278916, + "loss": 0.0967, + "step": 28547 + }, + { + "epoch": 0.2478103488684994, + "grad_norm": 0.16015625, + "learning_rate": 0.0016516868964360557, + "loss": 0.1299, + "step": 28548 + }, + { + "epoch": 0.24781902934870356, + "grad_norm": 0.09375, + "learning_rate": 0.001651663380989879, + "loss": 0.1055, + "step": 28549 + }, + { + "epoch": 0.24782770982890773, + "grad_norm": 0.306640625, + "learning_rate": 0.0016516398649404115, + "loss": 0.1201, + "step": 28550 + }, + { + "epoch": 0.2478363903091119, + "grad_norm": 0.10791015625, + "learning_rate": 0.001651616348287679, + "loss": 0.1123, + "step": 28551 + }, + { + "epoch": 0.24784507078931606, + "grad_norm": 0.1650390625, + "learning_rate": 0.001651592831031707, + "loss": 0.1001, + "step": 28552 + }, + { + "epoch": 0.24785375126952022, + "grad_norm": 0.2578125, + "learning_rate": 0.0016515693131725216, + "loss": 0.1123, + "step": 28553 + }, + { + "epoch": 0.2478624317497244, + "grad_norm": 0.201171875, + "learning_rate": 0.001651545794710148, + "loss": 
0.1162, + "step": 28554 + }, + { + "epoch": 0.24787111222992855, + "grad_norm": 0.255859375, + "learning_rate": 0.0016515222756446125, + "loss": 0.1147, + "step": 28555 + }, + { + "epoch": 0.24787979271013272, + "grad_norm": 1.140625, + "learning_rate": 0.0016514987559759402, + "loss": 0.1621, + "step": 28556 + }, + { + "epoch": 0.24788847319033688, + "grad_norm": 0.5859375, + "learning_rate": 0.0016514752357041573, + "loss": 0.0928, + "step": 28557 + }, + { + "epoch": 0.24789715367054105, + "grad_norm": 0.462890625, + "learning_rate": 0.0016514517148292893, + "loss": 0.1396, + "step": 28558 + }, + { + "epoch": 0.2479058341507452, + "grad_norm": 0.416015625, + "learning_rate": 0.0016514281933513624, + "loss": 0.1172, + "step": 28559 + }, + { + "epoch": 0.24791451463094938, + "grad_norm": 8.125, + "learning_rate": 0.0016514046712704014, + "loss": 0.707, + "step": 28560 + }, + { + "epoch": 0.24792319511115354, + "grad_norm": 0.2333984375, + "learning_rate": 0.0016513811485864328, + "loss": 0.083, + "step": 28561 + }, + { + "epoch": 0.2479318755913577, + "grad_norm": 0.345703125, + "learning_rate": 0.001651357625299482, + "loss": 0.1055, + "step": 28562 + }, + { + "epoch": 0.24794055607156187, + "grad_norm": 0.1884765625, + "learning_rate": 0.001651334101409575, + "loss": 0.0796, + "step": 28563 + }, + { + "epoch": 0.24794923655176604, + "grad_norm": 0.451171875, + "learning_rate": 0.0016513105769167368, + "loss": 0.0967, + "step": 28564 + }, + { + "epoch": 0.2479579170319702, + "grad_norm": 0.42578125, + "learning_rate": 0.0016512870518209939, + "loss": 0.0894, + "step": 28565 + }, + { + "epoch": 0.24796659751217437, + "grad_norm": 0.291015625, + "learning_rate": 0.0016512635261223718, + "loss": 0.0703, + "step": 28566 + }, + { + "epoch": 0.24797527799237853, + "grad_norm": 0.43359375, + "learning_rate": 0.0016512399998208963, + "loss": 0.1035, + "step": 28567 + }, + { + "epoch": 0.2479839584725827, + "grad_norm": 1.0703125, + "learning_rate": 0.0016512164729165929, + "loss": 0.125, + "step": 28568 + }, + { + "epoch": 0.24799263895278686, + "grad_norm": 0.1640625, + "learning_rate": 0.0016511929454094874, + "loss": 0.0889, + "step": 28569 + }, + { + "epoch": 0.24800131943299103, + "grad_norm": 0.17578125, + "learning_rate": 0.0016511694172996058, + "loss": 0.0947, + "step": 28570 + }, + { + "epoch": 0.2480099999131952, + "grad_norm": 0.56640625, + "learning_rate": 0.0016511458885869732, + "loss": 0.0879, + "step": 28571 + }, + { + "epoch": 0.24801868039339936, + "grad_norm": 0.1328125, + "learning_rate": 0.001651122359271616, + "loss": 0.103, + "step": 28572 + }, + { + "epoch": 0.24802736087360353, + "grad_norm": 0.294921875, + "learning_rate": 0.0016510988293535597, + "loss": 0.0811, + "step": 28573 + }, + { + "epoch": 0.2480360413538077, + "grad_norm": 0.546875, + "learning_rate": 0.00165107529883283, + "loss": 0.1309, + "step": 28574 + }, + { + "epoch": 0.24804472183401186, + "grad_norm": 0.6171875, + "learning_rate": 0.0016510517677094526, + "loss": 0.0845, + "step": 28575 + }, + { + "epoch": 0.24805340231421602, + "grad_norm": 0.0888671875, + "learning_rate": 0.0016510282359834532, + "loss": 0.1089, + "step": 28576 + }, + { + "epoch": 0.24806208279442019, + "grad_norm": 0.1044921875, + "learning_rate": 0.0016510047036548577, + "loss": 0.1309, + "step": 28577 + }, + { + "epoch": 0.24807076327462435, + "grad_norm": 0.166015625, + "learning_rate": 0.0016509811707236914, + "loss": 0.0981, + "step": 28578 + }, + { + "epoch": 0.24807944375482852, + "grad_norm": 0.306640625, + "learning_rate": 
0.0016509576371899809, + "loss": 0.1104, + "step": 28579 + }, + { + "epoch": 0.24808812423503268, + "grad_norm": 0.17578125, + "learning_rate": 0.001650934103053751, + "loss": 0.1084, + "step": 28580 + }, + { + "epoch": 0.24809680471523685, + "grad_norm": 0.55078125, + "learning_rate": 0.001650910568315028, + "loss": 0.0986, + "step": 28581 + }, + { + "epoch": 0.248105485195441, + "grad_norm": 0.267578125, + "learning_rate": 0.0016508870329738372, + "loss": 0.1348, + "step": 28582 + }, + { + "epoch": 0.24811416567564518, + "grad_norm": 0.345703125, + "learning_rate": 0.0016508634970302052, + "loss": 0.1177, + "step": 28583 + }, + { + "epoch": 0.24812284615584934, + "grad_norm": 0.435546875, + "learning_rate": 0.001650839960484157, + "loss": 0.1016, + "step": 28584 + }, + { + "epoch": 0.2481315266360535, + "grad_norm": 0.30859375, + "learning_rate": 0.001650816423335718, + "loss": 0.1396, + "step": 28585 + }, + { + "epoch": 0.24814020711625767, + "grad_norm": 0.171875, + "learning_rate": 0.001650792885584915, + "loss": 0.1006, + "step": 28586 + }, + { + "epoch": 0.24814888759646184, + "grad_norm": 0.30859375, + "learning_rate": 0.001650769347231773, + "loss": 0.0938, + "step": 28587 + }, + { + "epoch": 0.248157568076666, + "grad_norm": 0.17578125, + "learning_rate": 0.0016507458082763175, + "loss": 0.0947, + "step": 28588 + }, + { + "epoch": 0.24816624855687017, + "grad_norm": 0.6171875, + "learning_rate": 0.0016507222687185753, + "loss": 0.1035, + "step": 28589 + }, + { + "epoch": 0.24817492903707433, + "grad_norm": 0.30859375, + "learning_rate": 0.0016506987285585713, + "loss": 0.1006, + "step": 28590 + }, + { + "epoch": 0.2481836095172785, + "grad_norm": 0.310546875, + "learning_rate": 0.001650675187796331, + "loss": 0.1582, + "step": 28591 + }, + { + "epoch": 0.24819228999748266, + "grad_norm": 0.53125, + "learning_rate": 0.0016506516464318813, + "loss": 0.1191, + "step": 28592 + }, + { + "epoch": 0.24820097047768683, + "grad_norm": 0.380859375, + "learning_rate": 0.001650628104465247, + "loss": 0.1006, + "step": 28593 + }, + { + "epoch": 0.248209650957891, + "grad_norm": 0.12158203125, + "learning_rate": 0.0016506045618964538, + "loss": 0.0928, + "step": 28594 + }, + { + "epoch": 0.24821833143809516, + "grad_norm": 0.208984375, + "learning_rate": 0.001650581018725528, + "loss": 0.0913, + "step": 28595 + }, + { + "epoch": 0.24822701191829932, + "grad_norm": 0.31640625, + "learning_rate": 0.0016505574749524948, + "loss": 0.0903, + "step": 28596 + }, + { + "epoch": 0.2482356923985035, + "grad_norm": 0.7265625, + "learning_rate": 0.0016505339305773804, + "loss": 0.1196, + "step": 28597 + }, + { + "epoch": 0.24824437287870765, + "grad_norm": 0.470703125, + "learning_rate": 0.0016505103856002104, + "loss": 0.0654, + "step": 28598 + }, + { + "epoch": 0.24825305335891182, + "grad_norm": 0.255859375, + "learning_rate": 0.0016504868400210105, + "loss": 0.1143, + "step": 28599 + }, + { + "epoch": 0.24826173383911598, + "grad_norm": 0.9765625, + "learning_rate": 0.0016504632938398062, + "loss": 0.085, + "step": 28600 + }, + { + "epoch": 0.24827041431932015, + "grad_norm": 0.28125, + "learning_rate": 0.0016504397470566239, + "loss": 0.127, + "step": 28601 + }, + { + "epoch": 0.2482790947995243, + "grad_norm": 0.2353515625, + "learning_rate": 0.001650416199671489, + "loss": 0.1221, + "step": 28602 + }, + { + "epoch": 0.24828777527972848, + "grad_norm": 0.328125, + "learning_rate": 0.0016503926516844268, + "loss": 0.1318, + "step": 28603 + }, + { + "epoch": 0.24829645575993264, + "grad_norm": 
0.16015625, + "learning_rate": 0.0016503691030954636, + "loss": 0.1406, + "step": 28604 + }, + { + "epoch": 0.2483051362401368, + "grad_norm": 1.1015625, + "learning_rate": 0.0016503455539046252, + "loss": 0.0889, + "step": 28605 + }, + { + "epoch": 0.24831381672034097, + "grad_norm": 0.375, + "learning_rate": 0.001650322004111937, + "loss": 0.0957, + "step": 28606 + }, + { + "epoch": 0.24832249720054514, + "grad_norm": 0.11474609375, + "learning_rate": 0.0016502984537174253, + "loss": 0.1201, + "step": 28607 + }, + { + "epoch": 0.2483311776807493, + "grad_norm": 0.1279296875, + "learning_rate": 0.001650274902721115, + "loss": 0.1191, + "step": 28608 + }, + { + "epoch": 0.24833985816095347, + "grad_norm": 0.42578125, + "learning_rate": 0.0016502513511230323, + "loss": 0.1328, + "step": 28609 + }, + { + "epoch": 0.24834853864115763, + "grad_norm": 0.0966796875, + "learning_rate": 0.0016502277989232032, + "loss": 0.1152, + "step": 28610 + }, + { + "epoch": 0.2483572191213618, + "grad_norm": 0.400390625, + "learning_rate": 0.0016502042461216536, + "loss": 0.0791, + "step": 28611 + }, + { + "epoch": 0.24836589960156596, + "grad_norm": 0.11865234375, + "learning_rate": 0.0016501806927184082, + "loss": 0.1299, + "step": 28612 + }, + { + "epoch": 0.24837458008177013, + "grad_norm": 0.427734375, + "learning_rate": 0.001650157138713494, + "loss": 0.1055, + "step": 28613 + }, + { + "epoch": 0.2483832605619743, + "grad_norm": 0.10986328125, + "learning_rate": 0.001650133584106936, + "loss": 0.0908, + "step": 28614 + }, + { + "epoch": 0.24839194104217846, + "grad_norm": 0.81640625, + "learning_rate": 0.0016501100288987603, + "loss": 0.1104, + "step": 28615 + }, + { + "epoch": 0.24840062152238263, + "grad_norm": 0.212890625, + "learning_rate": 0.0016500864730889925, + "loss": 0.1201, + "step": 28616 + }, + { + "epoch": 0.2484093020025868, + "grad_norm": 0.55859375, + "learning_rate": 0.0016500629166776581, + "loss": 0.0859, + "step": 28617 + }, + { + "epoch": 0.24841798248279096, + "grad_norm": 0.60546875, + "learning_rate": 0.0016500393596647835, + "loss": 0.1045, + "step": 28618 + }, + { + "epoch": 0.24842666296299512, + "grad_norm": 0.111328125, + "learning_rate": 0.0016500158020503939, + "loss": 0.1055, + "step": 28619 + }, + { + "epoch": 0.24843534344319929, + "grad_norm": 0.1630859375, + "learning_rate": 0.0016499922438345156, + "loss": 0.1201, + "step": 28620 + }, + { + "epoch": 0.24844402392340345, + "grad_norm": 0.1611328125, + "learning_rate": 0.0016499686850171735, + "loss": 0.0874, + "step": 28621 + }, + { + "epoch": 0.24845270440360762, + "grad_norm": 0.2890625, + "learning_rate": 0.0016499451255983946, + "loss": 0.1299, + "step": 28622 + }, + { + "epoch": 0.24846138488381178, + "grad_norm": 0.291015625, + "learning_rate": 0.0016499215655782034, + "loss": 0.106, + "step": 28623 + }, + { + "epoch": 0.24847006536401595, + "grad_norm": 0.365234375, + "learning_rate": 0.0016498980049566266, + "loss": 0.0889, + "step": 28624 + }, + { + "epoch": 0.2484787458442201, + "grad_norm": 1.0546875, + "learning_rate": 0.0016498744437336895, + "loss": 0.1406, + "step": 28625 + }, + { + "epoch": 0.24848742632442428, + "grad_norm": 0.09912109375, + "learning_rate": 0.0016498508819094177, + "loss": 0.0781, + "step": 28626 + }, + { + "epoch": 0.24849610680462844, + "grad_norm": 0.3046875, + "learning_rate": 0.0016498273194838378, + "loss": 0.1367, + "step": 28627 + }, + { + "epoch": 0.2485047872848326, + "grad_norm": 0.361328125, + "learning_rate": 0.0016498037564569746, + "loss": 0.0962, + "step": 28628 + }, 
+ { + "epoch": 0.24851346776503677, + "grad_norm": 0.1806640625, + "learning_rate": 0.0016497801928288542, + "loss": 0.1172, + "step": 28629 + }, + { + "epoch": 0.24852214824524094, + "grad_norm": 0.408203125, + "learning_rate": 0.0016497566285995028, + "loss": 0.1108, + "step": 28630 + }, + { + "epoch": 0.2485308287254451, + "grad_norm": 0.07568359375, + "learning_rate": 0.0016497330637689457, + "loss": 0.103, + "step": 28631 + }, + { + "epoch": 0.24853950920564927, + "grad_norm": 0.546875, + "learning_rate": 0.0016497094983372086, + "loss": 0.0972, + "step": 28632 + }, + { + "epoch": 0.24854818968585343, + "grad_norm": 0.484375, + "learning_rate": 0.0016496859323043176, + "loss": 0.1133, + "step": 28633 + }, + { + "epoch": 0.2485568701660576, + "grad_norm": 0.32421875, + "learning_rate": 0.001649662365670298, + "loss": 0.1445, + "step": 28634 + }, + { + "epoch": 0.24856555064626176, + "grad_norm": 0.56640625, + "learning_rate": 0.0016496387984351762, + "loss": 0.1543, + "step": 28635 + }, + { + "epoch": 0.24857423112646593, + "grad_norm": 0.65625, + "learning_rate": 0.0016496152305989777, + "loss": 0.1172, + "step": 28636 + }, + { + "epoch": 0.2485829116066701, + "grad_norm": 0.2373046875, + "learning_rate": 0.0016495916621617282, + "loss": 0.0884, + "step": 28637 + }, + { + "epoch": 0.24859159208687426, + "grad_norm": 0.25390625, + "learning_rate": 0.0016495680931234535, + "loss": 0.0913, + "step": 28638 + }, + { + "epoch": 0.24860027256707842, + "grad_norm": 0.1279296875, + "learning_rate": 0.0016495445234841796, + "loss": 0.0942, + "step": 28639 + }, + { + "epoch": 0.2486089530472826, + "grad_norm": 0.2373046875, + "learning_rate": 0.0016495209532439317, + "loss": 0.1055, + "step": 28640 + }, + { + "epoch": 0.24861763352748675, + "grad_norm": 0.380859375, + "learning_rate": 0.001649497382402736, + "loss": 0.1182, + "step": 28641 + }, + { + "epoch": 0.24862631400769092, + "grad_norm": 0.62890625, + "learning_rate": 0.0016494738109606184, + "loss": 0.082, + "step": 28642 + }, + { + "epoch": 0.24863499448789508, + "grad_norm": 0.66015625, + "learning_rate": 0.0016494502389176042, + "loss": 0.1235, + "step": 28643 + }, + { + "epoch": 0.24864367496809925, + "grad_norm": 0.10986328125, + "learning_rate": 0.0016494266662737196, + "loss": 0.1641, + "step": 28644 + }, + { + "epoch": 0.2486523554483034, + "grad_norm": 0.185546875, + "learning_rate": 0.0016494030930289908, + "loss": 0.082, + "step": 28645 + }, + { + "epoch": 0.24866103592850758, + "grad_norm": 0.1162109375, + "learning_rate": 0.0016493795191834422, + "loss": 0.1377, + "step": 28646 + }, + { + "epoch": 0.24866971640871172, + "grad_norm": 0.20703125, + "learning_rate": 0.0016493559447371008, + "loss": 0.106, + "step": 28647 + }, + { + "epoch": 0.24867839688891588, + "grad_norm": 0.1728515625, + "learning_rate": 0.001649332369689992, + "loss": 0.1426, + "step": 28648 + }, + { + "epoch": 0.24868707736912005, + "grad_norm": 0.34375, + "learning_rate": 0.0016493087940421417, + "loss": 0.1113, + "step": 28649 + }, + { + "epoch": 0.2486957578493242, + "grad_norm": 0.470703125, + "learning_rate": 0.0016492852177935754, + "loss": 0.1221, + "step": 28650 + }, + { + "epoch": 0.24870443832952838, + "grad_norm": 0.140625, + "learning_rate": 0.0016492616409443188, + "loss": 0.1279, + "step": 28651 + }, + { + "epoch": 0.24871311880973254, + "grad_norm": 0.1513671875, + "learning_rate": 0.0016492380634943982, + "loss": 0.084, + "step": 28652 + }, + { + "epoch": 0.2487217992899367, + "grad_norm": 0.2265625, + "learning_rate": 
0.0016492144854438394, + "loss": 0.0854, + "step": 28653 + }, + { + "epoch": 0.24873047977014087, + "grad_norm": 0.328125, + "learning_rate": 0.0016491909067926675, + "loss": 0.0942, + "step": 28654 + }, + { + "epoch": 0.24873916025034504, + "grad_norm": 0.2138671875, + "learning_rate": 0.0016491673275409087, + "loss": 0.1172, + "step": 28655 + }, + { + "epoch": 0.2487478407305492, + "grad_norm": 0.07080078125, + "learning_rate": 0.0016491437476885886, + "loss": 0.0693, + "step": 28656 + }, + { + "epoch": 0.24875652121075337, + "grad_norm": 0.15625, + "learning_rate": 0.0016491201672357335, + "loss": 0.0742, + "step": 28657 + }, + { + "epoch": 0.24876520169095753, + "grad_norm": 0.11279296875, + "learning_rate": 0.0016490965861823686, + "loss": 0.1025, + "step": 28658 + }, + { + "epoch": 0.2487738821711617, + "grad_norm": 0.330078125, + "learning_rate": 0.00164907300452852, + "loss": 0.1182, + "step": 28659 + }, + { + "epoch": 0.24878256265136586, + "grad_norm": 0.15234375, + "learning_rate": 0.0016490494222742133, + "loss": 0.082, + "step": 28660 + }, + { + "epoch": 0.24879124313157003, + "grad_norm": 0.3046875, + "learning_rate": 0.0016490258394194744, + "loss": 0.1113, + "step": 28661 + }, + { + "epoch": 0.2487999236117742, + "grad_norm": 0.341796875, + "learning_rate": 0.0016490022559643293, + "loss": 0.1279, + "step": 28662 + }, + { + "epoch": 0.24880860409197836, + "grad_norm": 0.10546875, + "learning_rate": 0.0016489786719088035, + "loss": 0.1143, + "step": 28663 + }, + { + "epoch": 0.24881728457218252, + "grad_norm": 0.578125, + "learning_rate": 0.001648955087252923, + "loss": 0.1035, + "step": 28664 + }, + { + "epoch": 0.2488259650523867, + "grad_norm": 0.1748046875, + "learning_rate": 0.0016489315019967133, + "loss": 0.1035, + "step": 28665 + }, + { + "epoch": 0.24883464553259085, + "grad_norm": 0.625, + "learning_rate": 0.0016489079161402004, + "loss": 0.1069, + "step": 28666 + }, + { + "epoch": 0.24884332601279502, + "grad_norm": 0.546875, + "learning_rate": 0.0016488843296834101, + "loss": 0.1045, + "step": 28667 + }, + { + "epoch": 0.24885200649299918, + "grad_norm": 0.75, + "learning_rate": 0.001648860742626368, + "loss": 0.1123, + "step": 28668 + }, + { + "epoch": 0.24886068697320335, + "grad_norm": 0.091796875, + "learning_rate": 0.0016488371549690998, + "loss": 0.0879, + "step": 28669 + }, + { + "epoch": 0.24886936745340751, + "grad_norm": 0.236328125, + "learning_rate": 0.001648813566711632, + "loss": 0.0908, + "step": 28670 + }, + { + "epoch": 0.24887804793361168, + "grad_norm": 0.478515625, + "learning_rate": 0.00164878997785399, + "loss": 0.0889, + "step": 28671 + }, + { + "epoch": 0.24888672841381584, + "grad_norm": 0.6328125, + "learning_rate": 0.0016487663883961992, + "loss": 0.1182, + "step": 28672 + }, + { + "epoch": 0.24889540889402, + "grad_norm": 0.388671875, + "learning_rate": 0.0016487427983382856, + "loss": 0.1465, + "step": 28673 + }, + { + "epoch": 0.24890408937422417, + "grad_norm": 0.33984375, + "learning_rate": 0.0016487192076802754, + "loss": 0.0947, + "step": 28674 + }, + { + "epoch": 0.24891276985442834, + "grad_norm": 0.24609375, + "learning_rate": 0.0016486956164221943, + "loss": 0.1216, + "step": 28675 + }, + { + "epoch": 0.2489214503346325, + "grad_norm": 0.40625, + "learning_rate": 0.001648672024564068, + "loss": 0.106, + "step": 28676 + }, + { + "epoch": 0.24893013081483667, + "grad_norm": 0.337890625, + "learning_rate": 0.0016486484321059213, + "loss": 0.125, + "step": 28677 + }, + { + "epoch": 0.24893881129504083, + "grad_norm": 0.3984375, + 
"learning_rate": 0.0016486248390477816, + "loss": 0.1348, + "step": 28678 + }, + { + "epoch": 0.248947491775245, + "grad_norm": 0.4453125, + "learning_rate": 0.001648601245389674, + "loss": 0.0879, + "step": 28679 + }, + { + "epoch": 0.24895617225544917, + "grad_norm": 0.322265625, + "learning_rate": 0.0016485776511316243, + "loss": 0.0967, + "step": 28680 + }, + { + "epoch": 0.24896485273565333, + "grad_norm": 0.287109375, + "learning_rate": 0.0016485540562736583, + "loss": 0.082, + "step": 28681 + }, + { + "epoch": 0.2489735332158575, + "grad_norm": 0.13671875, + "learning_rate": 0.0016485304608158018, + "loss": 0.0796, + "step": 28682 + }, + { + "epoch": 0.24898221369606166, + "grad_norm": 0.48828125, + "learning_rate": 0.0016485068647580806, + "loss": 0.1016, + "step": 28683 + }, + { + "epoch": 0.24899089417626583, + "grad_norm": 0.2890625, + "learning_rate": 0.0016484832681005206, + "loss": 0.123, + "step": 28684 + }, + { + "epoch": 0.24899957465647, + "grad_norm": 0.78515625, + "learning_rate": 0.0016484596708431476, + "loss": 0.0986, + "step": 28685 + }, + { + "epoch": 0.24900825513667416, + "grad_norm": 0.125, + "learning_rate": 0.0016484360729859871, + "loss": 0.1289, + "step": 28686 + }, + { + "epoch": 0.24901693561687832, + "grad_norm": 0.53125, + "learning_rate": 0.0016484124745290655, + "loss": 0.1118, + "step": 28687 + }, + { + "epoch": 0.2490256160970825, + "grad_norm": 0.2021484375, + "learning_rate": 0.001648388875472408, + "loss": 0.0967, + "step": 28688 + }, + { + "epoch": 0.24903429657728665, + "grad_norm": 0.12451171875, + "learning_rate": 0.0016483652758160405, + "loss": 0.0762, + "step": 28689 + }, + { + "epoch": 0.24904297705749082, + "grad_norm": 0.310546875, + "learning_rate": 0.0016483416755599895, + "loss": 0.1191, + "step": 28690 + }, + { + "epoch": 0.24905165753769498, + "grad_norm": 0.09765625, + "learning_rate": 0.0016483180747042798, + "loss": 0.0889, + "step": 28691 + }, + { + "epoch": 0.24906033801789915, + "grad_norm": 0.166015625, + "learning_rate": 0.001648294473248938, + "loss": 0.1152, + "step": 28692 + }, + { + "epoch": 0.2490690184981033, + "grad_norm": 0.1689453125, + "learning_rate": 0.0016482708711939896, + "loss": 0.0933, + "step": 28693 + }, + { + "epoch": 0.24907769897830748, + "grad_norm": 0.232421875, + "learning_rate": 0.00164824726853946, + "loss": 0.1162, + "step": 28694 + }, + { + "epoch": 0.24908637945851164, + "grad_norm": 0.2294921875, + "learning_rate": 0.0016482236652853757, + "loss": 0.1123, + "step": 28695 + }, + { + "epoch": 0.2490950599387158, + "grad_norm": 0.11572265625, + "learning_rate": 0.0016482000614317623, + "loss": 0.1592, + "step": 28696 + }, + { + "epoch": 0.24910374041891997, + "grad_norm": 0.5, + "learning_rate": 0.0016481764569786458, + "loss": 0.0996, + "step": 28697 + }, + { + "epoch": 0.24911242089912414, + "grad_norm": 0.4296875, + "learning_rate": 0.0016481528519260514, + "loss": 0.1094, + "step": 28698 + }, + { + "epoch": 0.2491211013793283, + "grad_norm": 0.08837890625, + "learning_rate": 0.0016481292462740052, + "loss": 0.0835, + "step": 28699 + }, + { + "epoch": 0.24912978185953247, + "grad_norm": 0.490234375, + "learning_rate": 0.001648105640022533, + "loss": 0.1055, + "step": 28700 + }, + { + "epoch": 0.24913846233973663, + "grad_norm": 0.275390625, + "learning_rate": 0.001648082033171661, + "loss": 0.1157, + "step": 28701 + }, + { + "epoch": 0.2491471428199408, + "grad_norm": 0.6953125, + "learning_rate": 0.001648058425721415, + "loss": 0.0894, + "step": 28702 + }, + { + "epoch": 0.24915582330014496, + 
"grad_norm": 0.2236328125, + "learning_rate": 0.0016480348176718198, + "loss": 0.1416, + "step": 28703 + }, + { + "epoch": 0.24916450378034913, + "grad_norm": 0.0947265625, + "learning_rate": 0.0016480112090229027, + "loss": 0.1108, + "step": 28704 + }, + { + "epoch": 0.2491731842605533, + "grad_norm": 0.76171875, + "learning_rate": 0.0016479875997746885, + "loss": 0.1133, + "step": 28705 + }, + { + "epoch": 0.24918186474075746, + "grad_norm": 0.498046875, + "learning_rate": 0.0016479639899272033, + "loss": 0.1621, + "step": 28706 + }, + { + "epoch": 0.24919054522096162, + "grad_norm": 0.578125, + "learning_rate": 0.0016479403794804727, + "loss": 0.1123, + "step": 28707 + }, + { + "epoch": 0.2491992257011658, + "grad_norm": 0.08935546875, + "learning_rate": 0.001647916768434523, + "loss": 0.0894, + "step": 28708 + }, + { + "epoch": 0.24920790618136995, + "grad_norm": 0.08935546875, + "learning_rate": 0.0016478931567893793, + "loss": 0.0825, + "step": 28709 + }, + { + "epoch": 0.24921658666157412, + "grad_norm": 0.478515625, + "learning_rate": 0.0016478695445450685, + "loss": 0.126, + "step": 28710 + }, + { + "epoch": 0.24922526714177828, + "grad_norm": 0.263671875, + "learning_rate": 0.0016478459317016154, + "loss": 0.0703, + "step": 28711 + }, + { + "epoch": 0.24923394762198245, + "grad_norm": 0.5390625, + "learning_rate": 0.0016478223182590462, + "loss": 0.126, + "step": 28712 + }, + { + "epoch": 0.24924262810218661, + "grad_norm": 0.58984375, + "learning_rate": 0.0016477987042173867, + "loss": 0.0928, + "step": 28713 + }, + { + "epoch": 0.24925130858239078, + "grad_norm": 0.08447265625, + "learning_rate": 0.0016477750895766631, + "loss": 0.0977, + "step": 28714 + }, + { + "epoch": 0.24925998906259494, + "grad_norm": 0.263671875, + "learning_rate": 0.0016477514743369007, + "loss": 0.1138, + "step": 28715 + }, + { + "epoch": 0.2492686695427991, + "grad_norm": 0.671875, + "learning_rate": 0.0016477278584981255, + "loss": 0.0762, + "step": 28716 + }, + { + "epoch": 0.24927735002300327, + "grad_norm": 0.392578125, + "learning_rate": 0.0016477042420603636, + "loss": 0.1133, + "step": 28717 + }, + { + "epoch": 0.24928603050320744, + "grad_norm": 0.404296875, + "learning_rate": 0.0016476806250236402, + "loss": 0.1016, + "step": 28718 + }, + { + "epoch": 0.2492947109834116, + "grad_norm": 0.1376953125, + "learning_rate": 0.0016476570073879818, + "loss": 0.082, + "step": 28719 + }, + { + "epoch": 0.24930339146361577, + "grad_norm": 0.4140625, + "learning_rate": 0.0016476333891534138, + "loss": 0.1348, + "step": 28720 + }, + { + "epoch": 0.24931207194381994, + "grad_norm": 0.380859375, + "learning_rate": 0.0016476097703199619, + "loss": 0.0996, + "step": 28721 + }, + { + "epoch": 0.2493207524240241, + "grad_norm": 0.37890625, + "learning_rate": 0.0016475861508876526, + "loss": 0.1006, + "step": 28722 + }, + { + "epoch": 0.24932943290422827, + "grad_norm": 0.75390625, + "learning_rate": 0.0016475625308565113, + "loss": 0.0918, + "step": 28723 + }, + { + "epoch": 0.24933811338443243, + "grad_norm": 0.47265625, + "learning_rate": 0.0016475389102265637, + "loss": 0.0967, + "step": 28724 + }, + { + "epoch": 0.2493467938646366, + "grad_norm": 0.66796875, + "learning_rate": 0.0016475152889978354, + "loss": 0.1553, + "step": 28725 + }, + { + "epoch": 0.24935547434484076, + "grad_norm": 0.8671875, + "learning_rate": 0.001647491667170353, + "loss": 0.1533, + "step": 28726 + }, + { + "epoch": 0.24936415482504493, + "grad_norm": 0.1630859375, + "learning_rate": 0.0016474680447441422, + "loss": 0.0889, + 
"step": 28727 + }, + { + "epoch": 0.2493728353052491, + "grad_norm": 0.5078125, + "learning_rate": 0.0016474444217192282, + "loss": 0.1089, + "step": 28728 + }, + { + "epoch": 0.24938151578545326, + "grad_norm": 0.625, + "learning_rate": 0.001647420798095637, + "loss": 0.0776, + "step": 28729 + }, + { + "epoch": 0.24939019626565742, + "grad_norm": 0.3984375, + "learning_rate": 0.001647397173873395, + "loss": 0.0874, + "step": 28730 + }, + { + "epoch": 0.2493988767458616, + "grad_norm": 0.92578125, + "learning_rate": 0.0016473735490525278, + "loss": 0.1357, + "step": 28731 + }, + { + "epoch": 0.24940755722606575, + "grad_norm": 0.326171875, + "learning_rate": 0.0016473499236330607, + "loss": 0.0957, + "step": 28732 + }, + { + "epoch": 0.24941623770626992, + "grad_norm": 0.302734375, + "learning_rate": 0.0016473262976150203, + "loss": 0.124, + "step": 28733 + }, + { + "epoch": 0.24942491818647408, + "grad_norm": 0.412109375, + "learning_rate": 0.0016473026709984317, + "loss": 0.063, + "step": 28734 + }, + { + "epoch": 0.24943359866667825, + "grad_norm": 0.412109375, + "learning_rate": 0.0016472790437833216, + "loss": 0.1211, + "step": 28735 + }, + { + "epoch": 0.2494422791468824, + "grad_norm": 0.396484375, + "learning_rate": 0.001647255415969715, + "loss": 0.1021, + "step": 28736 + }, + { + "epoch": 0.24945095962708658, + "grad_norm": 0.11572265625, + "learning_rate": 0.0016472317875576381, + "loss": 0.1367, + "step": 28737 + }, + { + "epoch": 0.24945964010729074, + "grad_norm": 0.216796875, + "learning_rate": 0.001647208158547117, + "loss": 0.0942, + "step": 28738 + }, + { + "epoch": 0.2494683205874949, + "grad_norm": 0.1083984375, + "learning_rate": 0.001647184528938177, + "loss": 0.1113, + "step": 28739 + }, + { + "epoch": 0.24947700106769907, + "grad_norm": 0.07958984375, + "learning_rate": 0.0016471608987308445, + "loss": 0.084, + "step": 28740 + }, + { + "epoch": 0.24948568154790324, + "grad_norm": 0.80078125, + "learning_rate": 0.001647137267925145, + "loss": 0.0869, + "step": 28741 + }, + { + "epoch": 0.2494943620281074, + "grad_norm": 0.21484375, + "learning_rate": 0.0016471136365211042, + "loss": 0.0815, + "step": 28742 + }, + { + "epoch": 0.24950304250831157, + "grad_norm": 0.1123046875, + "learning_rate": 0.0016470900045187484, + "loss": 0.0923, + "step": 28743 + }, + { + "epoch": 0.24951172298851573, + "grad_norm": 0.29296875, + "learning_rate": 0.0016470663719181029, + "loss": 0.1191, + "step": 28744 + }, + { + "epoch": 0.2495204034687199, + "grad_norm": 0.1728515625, + "learning_rate": 0.0016470427387191941, + "loss": 0.1055, + "step": 28745 + }, + { + "epoch": 0.24952908394892406, + "grad_norm": 0.4765625, + "learning_rate": 0.0016470191049220476, + "loss": 0.1445, + "step": 28746 + }, + { + "epoch": 0.24953776442912823, + "grad_norm": 0.294921875, + "learning_rate": 0.0016469954705266888, + "loss": 0.0933, + "step": 28747 + }, + { + "epoch": 0.2495464449093324, + "grad_norm": 0.439453125, + "learning_rate": 0.0016469718355331445, + "loss": 0.1055, + "step": 28748 + }, + { + "epoch": 0.24955512538953656, + "grad_norm": 0.1201171875, + "learning_rate": 0.00164694819994144, + "loss": 0.1025, + "step": 28749 + }, + { + "epoch": 0.24956380586974072, + "grad_norm": 0.236328125, + "learning_rate": 0.0016469245637516007, + "loss": 0.0933, + "step": 28750 + }, + { + "epoch": 0.2495724863499449, + "grad_norm": 0.1103515625, + "learning_rate": 0.0016469009269636532, + "loss": 0.0996, + "step": 28751 + }, + { + "epoch": 0.24958116683014905, + "grad_norm": 0.4375, + "learning_rate": 
0.0016468772895776229, + "loss": 0.0781, + "step": 28752 + }, + { + "epoch": 0.24958984731035322, + "grad_norm": 0.2373046875, + "learning_rate": 0.0016468536515935358, + "loss": 0.0972, + "step": 28753 + }, + { + "epoch": 0.24959852779055738, + "grad_norm": 0.80078125, + "learning_rate": 0.001646830013011418, + "loss": 0.1055, + "step": 28754 + }, + { + "epoch": 0.24960720827076155, + "grad_norm": 0.37890625, + "learning_rate": 0.0016468063738312951, + "loss": 0.166, + "step": 28755 + }, + { + "epoch": 0.24961588875096571, + "grad_norm": 0.11083984375, + "learning_rate": 0.0016467827340531929, + "loss": 0.0613, + "step": 28756 + }, + { + "epoch": 0.24962456923116988, + "grad_norm": 0.68359375, + "learning_rate": 0.0016467590936771373, + "loss": 0.0928, + "step": 28757 + }, + { + "epoch": 0.24963324971137404, + "grad_norm": 0.07080078125, + "learning_rate": 0.0016467354527031541, + "loss": 0.0767, + "step": 28758 + }, + { + "epoch": 0.2496419301915782, + "grad_norm": 0.08935546875, + "learning_rate": 0.0016467118111312695, + "loss": 0.1201, + "step": 28759 + }, + { + "epoch": 0.24965061067178237, + "grad_norm": 0.90625, + "learning_rate": 0.0016466881689615087, + "loss": 0.1084, + "step": 28760 + }, + { + "epoch": 0.24965929115198654, + "grad_norm": 1.21875, + "learning_rate": 0.001646664526193898, + "loss": 0.0869, + "step": 28761 + }, + { + "epoch": 0.2496679716321907, + "grad_norm": 0.62109375, + "learning_rate": 0.0016466408828284635, + "loss": 0.1172, + "step": 28762 + }, + { + "epoch": 0.24967665211239487, + "grad_norm": 0.1533203125, + "learning_rate": 0.0016466172388652307, + "loss": 0.0977, + "step": 28763 + }, + { + "epoch": 0.24968533259259904, + "grad_norm": 0.439453125, + "learning_rate": 0.0016465935943042252, + "loss": 0.0874, + "step": 28764 + }, + { + "epoch": 0.2496940130728032, + "grad_norm": 0.671875, + "learning_rate": 0.0016465699491454732, + "loss": 0.0874, + "step": 28765 + }, + { + "epoch": 0.24970269355300737, + "grad_norm": 1.0078125, + "learning_rate": 0.0016465463033890006, + "loss": 0.1973, + "step": 28766 + }, + { + "epoch": 0.24971137403321153, + "grad_norm": 0.205078125, + "learning_rate": 0.0016465226570348335, + "loss": 0.1289, + "step": 28767 + }, + { + "epoch": 0.2497200545134157, + "grad_norm": 0.12890625, + "learning_rate": 0.0016464990100829968, + "loss": 0.1138, + "step": 28768 + }, + { + "epoch": 0.24972873499361986, + "grad_norm": 0.44140625, + "learning_rate": 0.0016464753625335174, + "loss": 0.082, + "step": 28769 + }, + { + "epoch": 0.249737415473824, + "grad_norm": 0.15234375, + "learning_rate": 0.0016464517143864208, + "loss": 0.1426, + "step": 28770 + }, + { + "epoch": 0.24974609595402816, + "grad_norm": 0.2060546875, + "learning_rate": 0.0016464280656417328, + "loss": 0.1602, + "step": 28771 + }, + { + "epoch": 0.24975477643423233, + "grad_norm": 0.09814453125, + "learning_rate": 0.0016464044162994788, + "loss": 0.104, + "step": 28772 + }, + { + "epoch": 0.2497634569144365, + "grad_norm": 0.267578125, + "learning_rate": 0.0016463807663596858, + "loss": 0.0869, + "step": 28773 + }, + { + "epoch": 0.24977213739464066, + "grad_norm": 0.4765625, + "learning_rate": 0.0016463571158223786, + "loss": 0.1196, + "step": 28774 + }, + { + "epoch": 0.24978081787484482, + "grad_norm": 0.162109375, + "learning_rate": 0.0016463334646875836, + "loss": 0.0718, + "step": 28775 + }, + { + "epoch": 0.249789498355049, + "grad_norm": 0.3359375, + "learning_rate": 0.0016463098129553264, + "loss": 0.1094, + "step": 28776 + }, + { + "epoch": 0.24979817883525315, + 
"grad_norm": 0.1806640625, + "learning_rate": 0.0016462861606256333, + "loss": 0.0918, + "step": 28777 + }, + { + "epoch": 0.24980685931545732, + "grad_norm": 0.2275390625, + "learning_rate": 0.0016462625076985296, + "loss": 0.1172, + "step": 28778 + }, + { + "epoch": 0.24981553979566148, + "grad_norm": 0.119140625, + "learning_rate": 0.0016462388541740413, + "loss": 0.0801, + "step": 28779 + }, + { + "epoch": 0.24982422027586565, + "grad_norm": 0.5625, + "learning_rate": 0.0016462152000521948, + "loss": 0.1064, + "step": 28780 + }, + { + "epoch": 0.24983290075606981, + "grad_norm": 0.41796875, + "learning_rate": 0.0016461915453330155, + "loss": 0.125, + "step": 28781 + }, + { + "epoch": 0.24984158123627398, + "grad_norm": 0.1396484375, + "learning_rate": 0.001646167890016529, + "loss": 0.0933, + "step": 28782 + }, + { + "epoch": 0.24985026171647814, + "grad_norm": 0.490234375, + "learning_rate": 0.0016461442341027617, + "loss": 0.1426, + "step": 28783 + }, + { + "epoch": 0.2498589421966823, + "grad_norm": 0.15625, + "learning_rate": 0.0016461205775917395, + "loss": 0.1348, + "step": 28784 + }, + { + "epoch": 0.24986762267688647, + "grad_norm": 0.310546875, + "learning_rate": 0.0016460969204834875, + "loss": 0.0879, + "step": 28785 + }, + { + "epoch": 0.24987630315709064, + "grad_norm": 0.287109375, + "learning_rate": 0.0016460732627780325, + "loss": 0.1133, + "step": 28786 + }, + { + "epoch": 0.2498849836372948, + "grad_norm": 0.64453125, + "learning_rate": 0.0016460496044753998, + "loss": 0.1426, + "step": 28787 + }, + { + "epoch": 0.24989366411749897, + "grad_norm": 0.1455078125, + "learning_rate": 0.0016460259455756158, + "loss": 0.1201, + "step": 28788 + }, + { + "epoch": 0.24990234459770314, + "grad_norm": 0.34375, + "learning_rate": 0.0016460022860787058, + "loss": 0.0996, + "step": 28789 + }, + { + "epoch": 0.2499110250779073, + "grad_norm": 0.126953125, + "learning_rate": 0.001645978625984696, + "loss": 0.1011, + "step": 28790 + }, + { + "epoch": 0.24991970555811147, + "grad_norm": 0.22265625, + "learning_rate": 0.0016459549652936118, + "loss": 0.1416, + "step": 28791 + }, + { + "epoch": 0.24992838603831563, + "grad_norm": 0.150390625, + "learning_rate": 0.0016459313040054796, + "loss": 0.1094, + "step": 28792 + }, + { + "epoch": 0.2499370665185198, + "grad_norm": 0.306640625, + "learning_rate": 0.0016459076421203255, + "loss": 0.1348, + "step": 28793 + }, + { + "epoch": 0.24994574699872396, + "grad_norm": 0.396484375, + "learning_rate": 0.0016458839796381745, + "loss": 0.1211, + "step": 28794 + }, + { + "epoch": 0.24995442747892813, + "grad_norm": 0.1455078125, + "learning_rate": 0.0016458603165590536, + "loss": 0.0757, + "step": 28795 + }, + { + "epoch": 0.2499631079591323, + "grad_norm": 0.134765625, + "learning_rate": 0.0016458366528829874, + "loss": 0.1182, + "step": 28796 + }, + { + "epoch": 0.24997178843933646, + "grad_norm": 0.5625, + "learning_rate": 0.0016458129886100029, + "loss": 0.1084, + "step": 28797 + }, + { + "epoch": 0.24998046891954062, + "grad_norm": 0.1474609375, + "learning_rate": 0.0016457893237401251, + "loss": 0.0811, + "step": 28798 + }, + { + "epoch": 0.2499891493997448, + "grad_norm": 0.109375, + "learning_rate": 0.0016457656582733807, + "loss": 0.0991, + "step": 28799 + }, + { + "epoch": 0.24999782987994895, + "grad_norm": 0.69921875, + "learning_rate": 0.0016457419922097948, + "loss": 0.1504, + "step": 28800 + }, + { + "epoch": 0.25000651036015314, + "grad_norm": 0.2236328125, + "learning_rate": 0.001645718325549394, + "loss": 0.0884, + "step": 28801 
+ }, + { + "epoch": 0.2500151908403573, + "grad_norm": 0.185546875, + "learning_rate": 0.0016456946582922038, + "loss": 0.1035, + "step": 28802 + }, + { + "epoch": 0.2500238713205615, + "grad_norm": 0.310546875, + "learning_rate": 0.00164567099043825, + "loss": 0.1182, + "step": 28803 + }, + { + "epoch": 0.2500325518007656, + "grad_norm": 0.06591796875, + "learning_rate": 0.0016456473219875587, + "loss": 0.0781, + "step": 28804 + }, + { + "epoch": 0.2500412322809698, + "grad_norm": 0.310546875, + "learning_rate": 0.0016456236529401559, + "loss": 0.0977, + "step": 28805 + }, + { + "epoch": 0.25004991276117394, + "grad_norm": 0.14453125, + "learning_rate": 0.001645599983296067, + "loss": 0.1426, + "step": 28806 + }, + { + "epoch": 0.25005859324137814, + "grad_norm": 0.279296875, + "learning_rate": 0.001645576313055318, + "loss": 0.1035, + "step": 28807 + }, + { + "epoch": 0.2500672737215823, + "grad_norm": 0.1279296875, + "learning_rate": 0.001645552642217935, + "loss": 0.0977, + "step": 28808 + }, + { + "epoch": 0.25007595420178647, + "grad_norm": 0.12158203125, + "learning_rate": 0.001645528970783944, + "loss": 0.1387, + "step": 28809 + }, + { + "epoch": 0.2500846346819906, + "grad_norm": 0.216796875, + "learning_rate": 0.0016455052987533708, + "loss": 0.1084, + "step": 28810 + }, + { + "epoch": 0.2500933151621948, + "grad_norm": 0.20703125, + "learning_rate": 0.0016454816261262408, + "loss": 0.1001, + "step": 28811 + }, + { + "epoch": 0.25010199564239893, + "grad_norm": 0.138671875, + "learning_rate": 0.0016454579529025805, + "loss": 0.1191, + "step": 28812 + }, + { + "epoch": 0.2501106761226031, + "grad_norm": 0.353515625, + "learning_rate": 0.0016454342790824157, + "loss": 0.1299, + "step": 28813 + }, + { + "epoch": 0.25011935660280726, + "grad_norm": 0.20703125, + "learning_rate": 0.001645410604665772, + "loss": 0.123, + "step": 28814 + }, + { + "epoch": 0.25012803708301146, + "grad_norm": 0.2080078125, + "learning_rate": 0.0016453869296526756, + "loss": 0.0693, + "step": 28815 + }, + { + "epoch": 0.2501367175632156, + "grad_norm": 0.11376953125, + "learning_rate": 0.0016453632540431521, + "loss": 0.1201, + "step": 28816 + }, + { + "epoch": 0.2501453980434198, + "grad_norm": 0.26953125, + "learning_rate": 0.0016453395778372275, + "loss": 0.1143, + "step": 28817 + }, + { + "epoch": 0.2501540785236239, + "grad_norm": 0.2021484375, + "learning_rate": 0.0016453159010349282, + "loss": 0.0859, + "step": 28818 + }, + { + "epoch": 0.2501627590038281, + "grad_norm": 0.244140625, + "learning_rate": 0.001645292223636279, + "loss": 0.1016, + "step": 28819 + }, + { + "epoch": 0.25017143948403225, + "grad_norm": 0.173828125, + "learning_rate": 0.0016452685456413067, + "loss": 0.124, + "step": 28820 + }, + { + "epoch": 0.25018011996423645, + "grad_norm": 0.375, + "learning_rate": 0.0016452448670500372, + "loss": 0.106, + "step": 28821 + }, + { + "epoch": 0.2501888004444406, + "grad_norm": 0.2392578125, + "learning_rate": 0.0016452211878624956, + "loss": 0.0898, + "step": 28822 + }, + { + "epoch": 0.2501974809246448, + "grad_norm": 0.1669921875, + "learning_rate": 0.0016451975080787088, + "loss": 0.1113, + "step": 28823 + }, + { + "epoch": 0.2502061614048489, + "grad_norm": 0.58984375, + "learning_rate": 0.0016451738276987017, + "loss": 0.1094, + "step": 28824 + }, + { + "epoch": 0.2502148418850531, + "grad_norm": 0.173828125, + "learning_rate": 0.0016451501467225006, + "loss": 0.0947, + "step": 28825 + }, + { + "epoch": 0.25022352236525724, + "grad_norm": 0.2158203125, + "learning_rate": 
0.0016451264651501322, + "loss": 0.0806, + "step": 28826 + }, + { + "epoch": 0.25023220284546144, + "grad_norm": 0.205078125, + "learning_rate": 0.0016451027829816212, + "loss": 0.1338, + "step": 28827 + }, + { + "epoch": 0.2502408833256656, + "grad_norm": 0.77734375, + "learning_rate": 0.001645079100216994, + "loss": 0.1055, + "step": 28828 + }, + { + "epoch": 0.25024956380586977, + "grad_norm": 0.1025390625, + "learning_rate": 0.0016450554168562766, + "loss": 0.1348, + "step": 28829 + }, + { + "epoch": 0.2502582442860739, + "grad_norm": 0.24609375, + "learning_rate": 0.0016450317328994945, + "loss": 0.0977, + "step": 28830 + }, + { + "epoch": 0.2502669247662781, + "grad_norm": 0.123046875, + "learning_rate": 0.0016450080483466744, + "loss": 0.127, + "step": 28831 + }, + { + "epoch": 0.25027560524648224, + "grad_norm": 0.2373046875, + "learning_rate": 0.0016449843631978413, + "loss": 0.0996, + "step": 28832 + }, + { + "epoch": 0.2502842857266864, + "grad_norm": 0.08447265625, + "learning_rate": 0.0016449606774530214, + "loss": 0.0967, + "step": 28833 + }, + { + "epoch": 0.25029296620689057, + "grad_norm": 0.220703125, + "learning_rate": 0.0016449369911122412, + "loss": 0.0854, + "step": 28834 + }, + { + "epoch": 0.2503016466870947, + "grad_norm": 0.2373046875, + "learning_rate": 0.0016449133041755257, + "loss": 0.0938, + "step": 28835 + }, + { + "epoch": 0.2503103271672989, + "grad_norm": 0.134765625, + "learning_rate": 0.0016448896166429013, + "loss": 0.0684, + "step": 28836 + }, + { + "epoch": 0.25031900764750303, + "grad_norm": 0.287109375, + "learning_rate": 0.001644865928514394, + "loss": 0.1191, + "step": 28837 + }, + { + "epoch": 0.2503276881277072, + "grad_norm": 0.220703125, + "learning_rate": 0.001644842239790029, + "loss": 0.1074, + "step": 28838 + }, + { + "epoch": 0.25033636860791136, + "grad_norm": 0.091796875, + "learning_rate": 0.001644818550469833, + "loss": 0.1279, + "step": 28839 + }, + { + "epoch": 0.25034504908811556, + "grad_norm": 0.322265625, + "learning_rate": 0.0016447948605538317, + "loss": 0.1001, + "step": 28840 + }, + { + "epoch": 0.2503537295683197, + "grad_norm": 0.1953125, + "learning_rate": 0.0016447711700420511, + "loss": 0.1001, + "step": 28841 + }, + { + "epoch": 0.2503624100485239, + "grad_norm": 0.6171875, + "learning_rate": 0.0016447474789345164, + "loss": 0.1162, + "step": 28842 + }, + { + "epoch": 0.250371090528728, + "grad_norm": 0.30859375, + "learning_rate": 0.0016447237872312544, + "loss": 0.1465, + "step": 28843 + }, + { + "epoch": 0.2503797710089322, + "grad_norm": 0.275390625, + "learning_rate": 0.0016447000949322906, + "loss": 0.1172, + "step": 28844 + }, + { + "epoch": 0.25038845148913635, + "grad_norm": 0.283203125, + "learning_rate": 0.001644676402037651, + "loss": 0.0723, + "step": 28845 + }, + { + "epoch": 0.25039713196934055, + "grad_norm": 0.203125, + "learning_rate": 0.0016446527085473615, + "loss": 0.1016, + "step": 28846 + }, + { + "epoch": 0.2504058124495447, + "grad_norm": 0.39453125, + "learning_rate": 0.0016446290144614478, + "loss": 0.085, + "step": 28847 + }, + { + "epoch": 0.2504144929297489, + "grad_norm": 0.49609375, + "learning_rate": 0.001644605319779936, + "loss": 0.2109, + "step": 28848 + }, + { + "epoch": 0.250423173409953, + "grad_norm": 0.0888671875, + "learning_rate": 0.001644581624502852, + "loss": 0.1069, + "step": 28849 + }, + { + "epoch": 0.2504318538901572, + "grad_norm": 0.73046875, + "learning_rate": 0.0016445579286302218, + "loss": 0.1484, + "step": 28850 + }, + { + "epoch": 0.25044053437036135, + 
"grad_norm": 0.265625, + "learning_rate": 0.0016445342321620713, + "loss": 0.1367, + "step": 28851 + }, + { + "epoch": 0.25044921485056554, + "grad_norm": 0.29296875, + "learning_rate": 0.0016445105350984264, + "loss": 0.1299, + "step": 28852 + }, + { + "epoch": 0.2504578953307697, + "grad_norm": 0.140625, + "learning_rate": 0.0016444868374393127, + "loss": 0.1426, + "step": 28853 + }, + { + "epoch": 0.25046657581097387, + "grad_norm": 0.416015625, + "learning_rate": 0.0016444631391847567, + "loss": 0.0908, + "step": 28854 + }, + { + "epoch": 0.250475256291178, + "grad_norm": 0.4140625, + "learning_rate": 0.0016444394403347835, + "loss": 0.0977, + "step": 28855 + }, + { + "epoch": 0.2504839367713822, + "grad_norm": 0.3671875, + "learning_rate": 0.0016444157408894201, + "loss": 0.125, + "step": 28856 + }, + { + "epoch": 0.25049261725158634, + "grad_norm": 0.51171875, + "learning_rate": 0.0016443920408486913, + "loss": 0.0933, + "step": 28857 + }, + { + "epoch": 0.25050129773179053, + "grad_norm": 0.5859375, + "learning_rate": 0.001644368340212624, + "loss": 0.0869, + "step": 28858 + }, + { + "epoch": 0.25050997821199467, + "grad_norm": 0.166015625, + "learning_rate": 0.0016443446389812434, + "loss": 0.1465, + "step": 28859 + }, + { + "epoch": 0.25051865869219886, + "grad_norm": 0.57421875, + "learning_rate": 0.0016443209371545756, + "loss": 0.1113, + "step": 28860 + }, + { + "epoch": 0.250527339172403, + "grad_norm": 0.61328125, + "learning_rate": 0.0016442972347326468, + "loss": 0.1055, + "step": 28861 + }, + { + "epoch": 0.2505360196526072, + "grad_norm": 0.578125, + "learning_rate": 0.0016442735317154828, + "loss": 0.0884, + "step": 28862 + }, + { + "epoch": 0.2505447001328113, + "grad_norm": 0.33203125, + "learning_rate": 0.0016442498281031093, + "loss": 0.1084, + "step": 28863 + }, + { + "epoch": 0.2505533806130155, + "grad_norm": 0.41015625, + "learning_rate": 0.001644226123895552, + "loss": 0.1021, + "step": 28864 + }, + { + "epoch": 0.25056206109321966, + "grad_norm": 1.109375, + "learning_rate": 0.0016442024190928377, + "loss": 0.1406, + "step": 28865 + }, + { + "epoch": 0.25057074157342385, + "grad_norm": 0.130859375, + "learning_rate": 0.0016441787136949918, + "loss": 0.0942, + "step": 28866 + }, + { + "epoch": 0.250579422053628, + "grad_norm": 0.40234375, + "learning_rate": 0.00164415500770204, + "loss": 0.1309, + "step": 28867 + }, + { + "epoch": 0.2505881025338322, + "grad_norm": 0.423828125, + "learning_rate": 0.0016441313011140084, + "loss": 0.0947, + "step": 28868 + }, + { + "epoch": 0.2505967830140363, + "grad_norm": 0.1396484375, + "learning_rate": 0.0016441075939309234, + "loss": 0.0894, + "step": 28869 + }, + { + "epoch": 0.2506054634942405, + "grad_norm": 0.373046875, + "learning_rate": 0.0016440838861528102, + "loss": 0.1201, + "step": 28870 + }, + { + "epoch": 0.25061414397444465, + "grad_norm": 1.0, + "learning_rate": 0.0016440601777796952, + "loss": 0.1045, + "step": 28871 + }, + { + "epoch": 0.25062282445464884, + "grad_norm": 0.5390625, + "learning_rate": 0.0016440364688116039, + "loss": 0.1084, + "step": 28872 + }, + { + "epoch": 0.250631504934853, + "grad_norm": 0.79296875, + "learning_rate": 0.0016440127592485625, + "loss": 0.0894, + "step": 28873 + }, + { + "epoch": 0.25064018541505717, + "grad_norm": 0.5625, + "learning_rate": 0.0016439890490905973, + "loss": 0.0864, + "step": 28874 + }, + { + "epoch": 0.2506488658952613, + "grad_norm": 0.1787109375, + "learning_rate": 0.0016439653383377335, + "loss": 0.1211, + "step": 28875 + }, + { + "epoch": 
0.2506575463754655, + "grad_norm": 0.34375, + "learning_rate": 0.0016439416269899974, + "loss": 0.127, + "step": 28876 + }, + { + "epoch": 0.25066622685566964, + "grad_norm": 0.0791015625, + "learning_rate": 0.001643917915047415, + "loss": 0.105, + "step": 28877 + }, + { + "epoch": 0.25067490733587383, + "grad_norm": 0.72265625, + "learning_rate": 0.0016438942025100121, + "loss": 0.1289, + "step": 28878 + }, + { + "epoch": 0.25068358781607797, + "grad_norm": 0.10791015625, + "learning_rate": 0.001643870489377815, + "loss": 0.1025, + "step": 28879 + }, + { + "epoch": 0.25069226829628216, + "grad_norm": 0.271484375, + "learning_rate": 0.001643846775650849, + "loss": 0.0996, + "step": 28880 + }, + { + "epoch": 0.2507009487764863, + "grad_norm": 0.103515625, + "learning_rate": 0.00164382306132914, + "loss": 0.0869, + "step": 28881 + }, + { + "epoch": 0.2507096292566905, + "grad_norm": 0.111328125, + "learning_rate": 0.0016437993464127147, + "loss": 0.1035, + "step": 28882 + }, + { + "epoch": 0.25071830973689463, + "grad_norm": 0.1376953125, + "learning_rate": 0.0016437756309015986, + "loss": 0.1108, + "step": 28883 + }, + { + "epoch": 0.2507269902170988, + "grad_norm": 0.140625, + "learning_rate": 0.0016437519147958178, + "loss": 0.1113, + "step": 28884 + }, + { + "epoch": 0.25073567069730296, + "grad_norm": 0.1435546875, + "learning_rate": 0.0016437281980953976, + "loss": 0.1016, + "step": 28885 + }, + { + "epoch": 0.25074435117750715, + "grad_norm": 0.1474609375, + "learning_rate": 0.0016437044808003648, + "loss": 0.168, + "step": 28886 + }, + { + "epoch": 0.2507530316577113, + "grad_norm": 0.310546875, + "learning_rate": 0.0016436807629107448, + "loss": 0.1367, + "step": 28887 + }, + { + "epoch": 0.2507617121379155, + "grad_norm": 1.0859375, + "learning_rate": 0.0016436570444265637, + "loss": 0.0996, + "step": 28888 + }, + { + "epoch": 0.2507703926181196, + "grad_norm": 0.1640625, + "learning_rate": 0.0016436333253478474, + "loss": 0.0679, + "step": 28889 + }, + { + "epoch": 0.2507790730983238, + "grad_norm": 0.419921875, + "learning_rate": 0.0016436096056746219, + "loss": 0.0957, + "step": 28890 + }, + { + "epoch": 0.25078775357852795, + "grad_norm": 0.11572265625, + "learning_rate": 0.001643585885406913, + "loss": 0.1289, + "step": 28891 + }, + { + "epoch": 0.25079643405873214, + "grad_norm": 0.46484375, + "learning_rate": 0.0016435621645447469, + "loss": 0.1172, + "step": 28892 + }, + { + "epoch": 0.2508051145389363, + "grad_norm": 0.314453125, + "learning_rate": 0.001643538443088149, + "loss": 0.0947, + "step": 28893 + }, + { + "epoch": 0.2508137950191405, + "grad_norm": 0.25390625, + "learning_rate": 0.0016435147210371464, + "loss": 0.1123, + "step": 28894 + }, + { + "epoch": 0.2508224754993446, + "grad_norm": 0.1875, + "learning_rate": 0.0016434909983917636, + "loss": 0.1533, + "step": 28895 + }, + { + "epoch": 0.2508311559795488, + "grad_norm": 0.1376953125, + "learning_rate": 0.0016434672751520277, + "loss": 0.0884, + "step": 28896 + }, + { + "epoch": 0.25083983645975294, + "grad_norm": 0.56640625, + "learning_rate": 0.0016434435513179636, + "loss": 0.0854, + "step": 28897 + }, + { + "epoch": 0.25084851693995713, + "grad_norm": 0.267578125, + "learning_rate": 0.0016434198268895982, + "loss": 0.085, + "step": 28898 + }, + { + "epoch": 0.25085719742016127, + "grad_norm": 0.1513671875, + "learning_rate": 0.0016433961018669568, + "loss": 0.0991, + "step": 28899 + }, + { + "epoch": 0.25086587790036546, + "grad_norm": 0.2578125, + "learning_rate": 0.0016433723762500655, + "loss": 0.1055, 
+ "step": 28900 + }, + { + "epoch": 0.2508745583805696, + "grad_norm": 0.7109375, + "learning_rate": 0.0016433486500389508, + "loss": 0.1191, + "step": 28901 + }, + { + "epoch": 0.2508832388607738, + "grad_norm": 0.63671875, + "learning_rate": 0.0016433249232336378, + "loss": 0.1099, + "step": 28902 + }, + { + "epoch": 0.25089191934097793, + "grad_norm": 0.57421875, + "learning_rate": 0.0016433011958341532, + "loss": 0.1377, + "step": 28903 + }, + { + "epoch": 0.2509005998211821, + "grad_norm": 0.271484375, + "learning_rate": 0.001643277467840522, + "loss": 0.1006, + "step": 28904 + }, + { + "epoch": 0.25090928030138626, + "grad_norm": 0.1171875, + "learning_rate": 0.0016432537392527713, + "loss": 0.1094, + "step": 28905 + }, + { + "epoch": 0.25091796078159045, + "grad_norm": 0.4296875, + "learning_rate": 0.001643230010070926, + "loss": 0.1094, + "step": 28906 + }, + { + "epoch": 0.2509266412617946, + "grad_norm": 0.36328125, + "learning_rate": 0.0016432062802950128, + "loss": 0.1328, + "step": 28907 + }, + { + "epoch": 0.2509353217419988, + "grad_norm": 0.58203125, + "learning_rate": 0.0016431825499250573, + "loss": 0.1279, + "step": 28908 + }, + { + "epoch": 0.2509440022222029, + "grad_norm": 0.19140625, + "learning_rate": 0.0016431588189610856, + "loss": 0.0996, + "step": 28909 + }, + { + "epoch": 0.2509526827024071, + "grad_norm": 0.220703125, + "learning_rate": 0.0016431350874031234, + "loss": 0.124, + "step": 28910 + }, + { + "epoch": 0.25096136318261125, + "grad_norm": 0.09716796875, + "learning_rate": 0.001643111355251197, + "loss": 0.0869, + "step": 28911 + }, + { + "epoch": 0.25097004366281545, + "grad_norm": 0.1748046875, + "learning_rate": 0.001643087622505332, + "loss": 0.1138, + "step": 28912 + }, + { + "epoch": 0.2509787241430196, + "grad_norm": 0.291015625, + "learning_rate": 0.0016430638891655547, + "loss": 0.1084, + "step": 28913 + }, + { + "epoch": 0.2509874046232238, + "grad_norm": 0.263671875, + "learning_rate": 0.001643040155231891, + "loss": 0.1318, + "step": 28914 + }, + { + "epoch": 0.2509960851034279, + "grad_norm": 0.3359375, + "learning_rate": 0.0016430164207043665, + "loss": 0.123, + "step": 28915 + }, + { + "epoch": 0.2510047655836321, + "grad_norm": 0.62890625, + "learning_rate": 0.0016429926855830073, + "loss": 0.1172, + "step": 28916 + }, + { + "epoch": 0.25101344606383624, + "grad_norm": 0.9296875, + "learning_rate": 0.0016429689498678398, + "loss": 0.063, + "step": 28917 + }, + { + "epoch": 0.25102212654404044, + "grad_norm": 0.09033203125, + "learning_rate": 0.0016429452135588896, + "loss": 0.0718, + "step": 28918 + }, + { + "epoch": 0.2510308070242446, + "grad_norm": 0.302734375, + "learning_rate": 0.0016429214766561823, + "loss": 0.1084, + "step": 28919 + }, + { + "epoch": 0.25103948750444877, + "grad_norm": 0.091796875, + "learning_rate": 0.0016428977391597446, + "loss": 0.0996, + "step": 28920 + }, + { + "epoch": 0.2510481679846529, + "grad_norm": 0.0947265625, + "learning_rate": 0.0016428740010696018, + "loss": 0.0762, + "step": 28921 + }, + { + "epoch": 0.2510568484648571, + "grad_norm": 0.115234375, + "learning_rate": 0.0016428502623857804, + "loss": 0.1226, + "step": 28922 + }, + { + "epoch": 0.25106552894506123, + "grad_norm": 0.111328125, + "learning_rate": 0.0016428265231083062, + "loss": 0.1074, + "step": 28923 + }, + { + "epoch": 0.2510742094252654, + "grad_norm": 0.380859375, + "learning_rate": 0.0016428027832372048, + "loss": 0.1035, + "step": 28924 + }, + { + "epoch": 0.25108288990546956, + "grad_norm": 0.2451171875, + "learning_rate": 
0.0016427790427725021, + "loss": 0.0776, + "step": 28925 + }, + { + "epoch": 0.25109157038567376, + "grad_norm": 0.205078125, + "learning_rate": 0.0016427553017142249, + "loss": 0.123, + "step": 28926 + }, + { + "epoch": 0.2511002508658779, + "grad_norm": 0.28515625, + "learning_rate": 0.001642731560062399, + "loss": 0.0972, + "step": 28927 + }, + { + "epoch": 0.2511089313460821, + "grad_norm": 0.369140625, + "learning_rate": 0.0016427078178170493, + "loss": 0.1133, + "step": 28928 + }, + { + "epoch": 0.2511176118262862, + "grad_norm": 0.396484375, + "learning_rate": 0.001642684074978203, + "loss": 0.0933, + "step": 28929 + }, + { + "epoch": 0.2511262923064904, + "grad_norm": 0.94140625, + "learning_rate": 0.001642660331545885, + "loss": 0.1016, + "step": 28930 + }, + { + "epoch": 0.25113497278669455, + "grad_norm": 0.3359375, + "learning_rate": 0.0016426365875201224, + "loss": 0.1084, + "step": 28931 + }, + { + "epoch": 0.25114365326689875, + "grad_norm": 0.70703125, + "learning_rate": 0.0016426128429009403, + "loss": 0.104, + "step": 28932 + }, + { + "epoch": 0.2511523337471029, + "grad_norm": 0.361328125, + "learning_rate": 0.0016425890976883648, + "loss": 0.0894, + "step": 28933 + }, + { + "epoch": 0.2511610142273071, + "grad_norm": 0.80859375, + "learning_rate": 0.0016425653518824224, + "loss": 0.1094, + "step": 28934 + }, + { + "epoch": 0.2511696947075112, + "grad_norm": 0.1865234375, + "learning_rate": 0.0016425416054831384, + "loss": 0.0776, + "step": 28935 + }, + { + "epoch": 0.2511783751877154, + "grad_norm": 0.546875, + "learning_rate": 0.001642517858490539, + "loss": 0.0977, + "step": 28936 + }, + { + "epoch": 0.25118705566791955, + "grad_norm": 0.09814453125, + "learning_rate": 0.0016424941109046502, + "loss": 0.0972, + "step": 28937 + }, + { + "epoch": 0.25119573614812374, + "grad_norm": 0.5, + "learning_rate": 0.0016424703627254983, + "loss": 0.1025, + "step": 28938 + }, + { + "epoch": 0.2512044166283279, + "grad_norm": 0.1826171875, + "learning_rate": 0.001642446613953109, + "loss": 0.1152, + "step": 28939 + }, + { + "epoch": 0.25121309710853207, + "grad_norm": 0.1669921875, + "learning_rate": 0.0016424228645875079, + "loss": 0.1162, + "step": 28940 + }, + { + "epoch": 0.2512217775887362, + "grad_norm": 0.10107421875, + "learning_rate": 0.0016423991146287211, + "loss": 0.1191, + "step": 28941 + }, + { + "epoch": 0.2512304580689404, + "grad_norm": 0.345703125, + "learning_rate": 0.0016423753640767752, + "loss": 0.0981, + "step": 28942 + }, + { + "epoch": 0.25123913854914454, + "grad_norm": 0.298828125, + "learning_rate": 0.0016423516129316959, + "loss": 0.1475, + "step": 28943 + }, + { + "epoch": 0.25124781902934873, + "grad_norm": 0.2421875, + "learning_rate": 0.0016423278611935087, + "loss": 0.1094, + "step": 28944 + }, + { + "epoch": 0.25125649950955287, + "grad_norm": 2.453125, + "learning_rate": 0.00164230410886224, + "loss": 0.2188, + "step": 28945 + }, + { + "epoch": 0.25126517998975706, + "grad_norm": 0.1787109375, + "learning_rate": 0.0016422803559379155, + "loss": 0.1367, + "step": 28946 + }, + { + "epoch": 0.2512738604699612, + "grad_norm": 0.1328125, + "learning_rate": 0.0016422566024205617, + "loss": 0.1543, + "step": 28947 + }, + { + "epoch": 0.2512825409501654, + "grad_norm": 0.259765625, + "learning_rate": 0.001642232848310204, + "loss": 0.1133, + "step": 28948 + }, + { + "epoch": 0.2512912214303695, + "grad_norm": 0.2236328125, + "learning_rate": 0.0016422090936068685, + "loss": 0.1016, + "step": 28949 + }, + { + "epoch": 0.2512999019105737, + "grad_norm": 
0.6640625, + "learning_rate": 0.0016421853383105814, + "loss": 0.1123, + "step": 28950 + }, + { + "epoch": 0.25130858239077786, + "grad_norm": 0.0859375, + "learning_rate": 0.0016421615824213688, + "loss": 0.1108, + "step": 28951 + }, + { + "epoch": 0.25131726287098205, + "grad_norm": 0.34765625, + "learning_rate": 0.0016421378259392561, + "loss": 0.0928, + "step": 28952 + }, + { + "epoch": 0.2513259433511862, + "grad_norm": 0.43359375, + "learning_rate": 0.00164211406886427, + "loss": 0.1309, + "step": 28953 + }, + { + "epoch": 0.2513346238313904, + "grad_norm": 0.4921875, + "learning_rate": 0.0016420903111964355, + "loss": 0.0898, + "step": 28954 + }, + { + "epoch": 0.2513433043115945, + "grad_norm": 0.244140625, + "learning_rate": 0.0016420665529357796, + "loss": 0.0908, + "step": 28955 + }, + { + "epoch": 0.25135198479179865, + "grad_norm": 0.20703125, + "learning_rate": 0.0016420427940823277, + "loss": 0.0981, + "step": 28956 + }, + { + "epoch": 0.25136066527200285, + "grad_norm": 0.37109375, + "learning_rate": 0.001642019034636106, + "loss": 0.1162, + "step": 28957 + }, + { + "epoch": 0.251369345752207, + "grad_norm": 0.5546875, + "learning_rate": 0.0016419952745971406, + "loss": 0.127, + "step": 28958 + }, + { + "epoch": 0.2513780262324112, + "grad_norm": 0.177734375, + "learning_rate": 0.0016419715139654569, + "loss": 0.105, + "step": 28959 + }, + { + "epoch": 0.2513867067126153, + "grad_norm": 0.9375, + "learning_rate": 0.0016419477527410819, + "loss": 0.1836, + "step": 28960 + }, + { + "epoch": 0.2513953871928195, + "grad_norm": 0.115234375, + "learning_rate": 0.0016419239909240405, + "loss": 0.1523, + "step": 28961 + }, + { + "epoch": 0.25140406767302365, + "grad_norm": 0.404296875, + "learning_rate": 0.001641900228514359, + "loss": 0.1309, + "step": 28962 + }, + { + "epoch": 0.25141274815322784, + "grad_norm": 0.65625, + "learning_rate": 0.0016418764655120642, + "loss": 0.0898, + "step": 28963 + }, + { + "epoch": 0.251421428633432, + "grad_norm": 0.1513671875, + "learning_rate": 0.001641852701917181, + "loss": 0.1338, + "step": 28964 + }, + { + "epoch": 0.25143010911363617, + "grad_norm": 0.099609375, + "learning_rate": 0.001641828937729736, + "loss": 0.0728, + "step": 28965 + }, + { + "epoch": 0.2514387895938403, + "grad_norm": 0.1962890625, + "learning_rate": 0.0016418051729497549, + "loss": 0.1523, + "step": 28966 + }, + { + "epoch": 0.2514474700740445, + "grad_norm": 0.0908203125, + "learning_rate": 0.0016417814075772637, + "loss": 0.0874, + "step": 28967 + }, + { + "epoch": 0.25145615055424864, + "grad_norm": 0.169921875, + "learning_rate": 0.0016417576416122889, + "loss": 0.125, + "step": 28968 + }, + { + "epoch": 0.25146483103445283, + "grad_norm": 0.3984375, + "learning_rate": 0.0016417338750548557, + "loss": 0.1147, + "step": 28969 + }, + { + "epoch": 0.25147351151465697, + "grad_norm": 0.15625, + "learning_rate": 0.0016417101079049907, + "loss": 0.1035, + "step": 28970 + }, + { + "epoch": 0.25148219199486116, + "grad_norm": 0.162109375, + "learning_rate": 0.0016416863401627194, + "loss": 0.1113, + "step": 28971 + }, + { + "epoch": 0.2514908724750653, + "grad_norm": 0.349609375, + "learning_rate": 0.0016416625718280683, + "loss": 0.1162, + "step": 28972 + }, + { + "epoch": 0.2514995529552695, + "grad_norm": 0.35546875, + "learning_rate": 0.0016416388029010629, + "loss": 0.0996, + "step": 28973 + }, + { + "epoch": 0.2515082334354736, + "grad_norm": 0.2734375, + "learning_rate": 0.0016416150333817296, + "loss": 0.1348, + "step": 28974 + }, + { + "epoch": 
0.2515169139156778, + "grad_norm": 0.072265625, + "learning_rate": 0.0016415912632700943, + "loss": 0.0781, + "step": 28975 + }, + { + "epoch": 0.25152559439588196, + "grad_norm": 0.1435546875, + "learning_rate": 0.0016415674925661827, + "loss": 0.1143, + "step": 28976 + }, + { + "epoch": 0.25153427487608615, + "grad_norm": 0.08251953125, + "learning_rate": 0.0016415437212700214, + "loss": 0.1128, + "step": 28977 + }, + { + "epoch": 0.2515429553562903, + "grad_norm": 0.216796875, + "learning_rate": 0.0016415199493816357, + "loss": 0.0845, + "step": 28978 + }, + { + "epoch": 0.2515516358364945, + "grad_norm": 0.6171875, + "learning_rate": 0.001641496176901052, + "loss": 0.1162, + "step": 28979 + }, + { + "epoch": 0.2515603163166986, + "grad_norm": 0.21875, + "learning_rate": 0.0016414724038282965, + "loss": 0.1152, + "step": 28980 + }, + { + "epoch": 0.2515689967969028, + "grad_norm": 0.30859375, + "learning_rate": 0.0016414486301633946, + "loss": 0.1045, + "step": 28981 + }, + { + "epoch": 0.25157767727710695, + "grad_norm": 0.12353515625, + "learning_rate": 0.0016414248559063727, + "loss": 0.0835, + "step": 28982 + }, + { + "epoch": 0.25158635775731114, + "grad_norm": 0.26953125, + "learning_rate": 0.0016414010810572566, + "loss": 0.104, + "step": 28983 + }, + { + "epoch": 0.2515950382375153, + "grad_norm": 0.59765625, + "learning_rate": 0.0016413773056160724, + "loss": 0.1338, + "step": 28984 + }, + { + "epoch": 0.25160371871771947, + "grad_norm": 0.0751953125, + "learning_rate": 0.0016413535295828462, + "loss": 0.082, + "step": 28985 + }, + { + "epoch": 0.2516123991979236, + "grad_norm": 0.8125, + "learning_rate": 0.0016413297529576044, + "loss": 0.1289, + "step": 28986 + }, + { + "epoch": 0.2516210796781278, + "grad_norm": 0.11669921875, + "learning_rate": 0.0016413059757403718, + "loss": 0.1152, + "step": 28987 + }, + { + "epoch": 0.25162976015833194, + "grad_norm": 0.76953125, + "learning_rate": 0.0016412821979311756, + "loss": 0.1001, + "step": 28988 + }, + { + "epoch": 0.25163844063853613, + "grad_norm": 0.14453125, + "learning_rate": 0.001641258419530041, + "loss": 0.1011, + "step": 28989 + }, + { + "epoch": 0.25164712111874027, + "grad_norm": 0.138671875, + "learning_rate": 0.0016412346405369944, + "loss": 0.0879, + "step": 28990 + }, + { + "epoch": 0.25165580159894446, + "grad_norm": 0.220703125, + "learning_rate": 0.0016412108609520617, + "loss": 0.0952, + "step": 28991 + }, + { + "epoch": 0.2516644820791486, + "grad_norm": 0.205078125, + "learning_rate": 0.001641187080775269, + "loss": 0.0806, + "step": 28992 + }, + { + "epoch": 0.2516731625593528, + "grad_norm": 0.390625, + "learning_rate": 0.001641163300006642, + "loss": 0.1011, + "step": 28993 + }, + { + "epoch": 0.25168184303955693, + "grad_norm": 0.8203125, + "learning_rate": 0.0016411395186462072, + "loss": 0.3164, + "step": 28994 + }, + { + "epoch": 0.2516905235197611, + "grad_norm": 0.41015625, + "learning_rate": 0.0016411157366939904, + "loss": 0.1641, + "step": 28995 + }, + { + "epoch": 0.25169920399996526, + "grad_norm": 0.10693359375, + "learning_rate": 0.0016410919541500176, + "loss": 0.0796, + "step": 28996 + }, + { + "epoch": 0.25170788448016945, + "grad_norm": 0.10546875, + "learning_rate": 0.0016410681710143148, + "loss": 0.0835, + "step": 28997 + }, + { + "epoch": 0.2517165649603736, + "grad_norm": 0.10498046875, + "learning_rate": 0.0016410443872869073, + "loss": 0.0718, + "step": 28998 + }, + { + "epoch": 0.2517252454405778, + "grad_norm": 0.27734375, + "learning_rate": 0.0016410206029678226, + "loss": 
0.084, + "step": 28999 + }, + { + "epoch": 0.2517339259207819, + "grad_norm": 0.0859375, + "learning_rate": 0.0016409968180570859, + "loss": 0.0811, + "step": 29000 + }, + { + "epoch": 0.2517426064009861, + "grad_norm": 0.375, + "learning_rate": 0.0016409730325547226, + "loss": 0.0933, + "step": 29001 + }, + { + "epoch": 0.25175128688119025, + "grad_norm": 0.373046875, + "learning_rate": 0.0016409492464607598, + "loss": 0.083, + "step": 29002 + }, + { + "epoch": 0.25175996736139444, + "grad_norm": 0.45703125, + "learning_rate": 0.0016409254597752228, + "loss": 0.0918, + "step": 29003 + }, + { + "epoch": 0.2517686478415986, + "grad_norm": 0.7265625, + "learning_rate": 0.001640901672498138, + "loss": 0.1338, + "step": 29004 + }, + { + "epoch": 0.2517773283218028, + "grad_norm": 0.25, + "learning_rate": 0.001640877884629531, + "loss": 0.1338, + "step": 29005 + }, + { + "epoch": 0.2517860088020069, + "grad_norm": 0.34765625, + "learning_rate": 0.001640854096169428, + "loss": 0.1099, + "step": 29006 + }, + { + "epoch": 0.2517946892822111, + "grad_norm": 0.421875, + "learning_rate": 0.0016408303071178555, + "loss": 0.1348, + "step": 29007 + }, + { + "epoch": 0.25180336976241524, + "grad_norm": 0.1025390625, + "learning_rate": 0.0016408065174748393, + "loss": 0.1128, + "step": 29008 + }, + { + "epoch": 0.25181205024261943, + "grad_norm": 0.380859375, + "learning_rate": 0.0016407827272404047, + "loss": 0.1543, + "step": 29009 + }, + { + "epoch": 0.25182073072282357, + "grad_norm": 0.111328125, + "learning_rate": 0.001640758936414578, + "loss": 0.1133, + "step": 29010 + }, + { + "epoch": 0.25182941120302776, + "grad_norm": 0.1630859375, + "learning_rate": 0.0016407351449973862, + "loss": 0.1279, + "step": 29011 + }, + { + "epoch": 0.2518380916832319, + "grad_norm": 0.3359375, + "learning_rate": 0.0016407113529888543, + "loss": 0.1016, + "step": 29012 + }, + { + "epoch": 0.2518467721634361, + "grad_norm": 0.6640625, + "learning_rate": 0.0016406875603890082, + "loss": 0.1196, + "step": 29013 + }, + { + "epoch": 0.25185545264364023, + "grad_norm": 0.123046875, + "learning_rate": 0.0016406637671978746, + "loss": 0.1055, + "step": 29014 + }, + { + "epoch": 0.2518641331238444, + "grad_norm": 0.2265625, + "learning_rate": 0.0016406399734154792, + "loss": 0.127, + "step": 29015 + }, + { + "epoch": 0.25187281360404856, + "grad_norm": 0.318359375, + "learning_rate": 0.0016406161790418483, + "loss": 0.0928, + "step": 29016 + }, + { + "epoch": 0.25188149408425275, + "grad_norm": 0.61328125, + "learning_rate": 0.0016405923840770074, + "loss": 0.1367, + "step": 29017 + }, + { + "epoch": 0.2518901745644569, + "grad_norm": 0.59765625, + "learning_rate": 0.0016405685885209828, + "loss": 0.1191, + "step": 29018 + }, + { + "epoch": 0.2518988550446611, + "grad_norm": 0.2197265625, + "learning_rate": 0.0016405447923738005, + "loss": 0.0996, + "step": 29019 + }, + { + "epoch": 0.2519075355248652, + "grad_norm": 0.11572265625, + "learning_rate": 0.0016405209956354867, + "loss": 0.1201, + "step": 29020 + }, + { + "epoch": 0.2519162160050694, + "grad_norm": 0.1923828125, + "learning_rate": 0.001640497198306067, + "loss": 0.1855, + "step": 29021 + }, + { + "epoch": 0.25192489648527355, + "grad_norm": 0.439453125, + "learning_rate": 0.001640473400385568, + "loss": 0.0859, + "step": 29022 + }, + { + "epoch": 0.25193357696547775, + "grad_norm": 0.291015625, + "learning_rate": 0.0016404496018740155, + "loss": 0.1523, + "step": 29023 + }, + { + "epoch": 0.2519422574456819, + "grad_norm": 0.11865234375, + "learning_rate": 
0.0016404258027714349, + "loss": 0.0977, + "step": 29024 + }, + { + "epoch": 0.2519509379258861, + "grad_norm": 0.1298828125, + "learning_rate": 0.001640402003077853, + "loss": 0.0806, + "step": 29025 + }, + { + "epoch": 0.2519596184060902, + "grad_norm": 0.416015625, + "learning_rate": 0.0016403782027932957, + "loss": 0.0918, + "step": 29026 + }, + { + "epoch": 0.2519682988862944, + "grad_norm": 0.126953125, + "learning_rate": 0.0016403544019177887, + "loss": 0.125, + "step": 29027 + }, + { + "epoch": 0.25197697936649854, + "grad_norm": 0.11865234375, + "learning_rate": 0.0016403306004513585, + "loss": 0.0781, + "step": 29028 + }, + { + "epoch": 0.25198565984670274, + "grad_norm": 0.291015625, + "learning_rate": 0.0016403067983940308, + "loss": 0.1006, + "step": 29029 + }, + { + "epoch": 0.2519943403269069, + "grad_norm": 0.205078125, + "learning_rate": 0.0016402829957458318, + "loss": 0.1182, + "step": 29030 + }, + { + "epoch": 0.25200302080711107, + "grad_norm": 0.177734375, + "learning_rate": 0.001640259192506787, + "loss": 0.0898, + "step": 29031 + }, + { + "epoch": 0.2520117012873152, + "grad_norm": 0.373046875, + "learning_rate": 0.0016402353886769232, + "loss": 0.0806, + "step": 29032 + }, + { + "epoch": 0.2520203817675194, + "grad_norm": 0.32421875, + "learning_rate": 0.0016402115842562661, + "loss": 0.1055, + "step": 29033 + }, + { + "epoch": 0.25202906224772353, + "grad_norm": 0.150390625, + "learning_rate": 0.001640187779244842, + "loss": 0.1611, + "step": 29034 + }, + { + "epoch": 0.2520377427279277, + "grad_norm": 0.09814453125, + "learning_rate": 0.001640163973642676, + "loss": 0.104, + "step": 29035 + }, + { + "epoch": 0.25204642320813186, + "grad_norm": 0.310546875, + "learning_rate": 0.0016401401674497953, + "loss": 0.1099, + "step": 29036 + }, + { + "epoch": 0.25205510368833606, + "grad_norm": 0.361328125, + "learning_rate": 0.0016401163606662252, + "loss": 0.1279, + "step": 29037 + }, + { + "epoch": 0.2520637841685402, + "grad_norm": 0.1357421875, + "learning_rate": 0.0016400925532919922, + "loss": 0.1016, + "step": 29038 + }, + { + "epoch": 0.2520724646487444, + "grad_norm": 0.765625, + "learning_rate": 0.001640068745327122, + "loss": 0.127, + "step": 29039 + }, + { + "epoch": 0.2520811451289485, + "grad_norm": 0.306640625, + "learning_rate": 0.0016400449367716405, + "loss": 0.1289, + "step": 29040 + }, + { + "epoch": 0.2520898256091527, + "grad_norm": 0.341796875, + "learning_rate": 0.0016400211276255741, + "loss": 0.1631, + "step": 29041 + }, + { + "epoch": 0.25209850608935686, + "grad_norm": 0.9296875, + "learning_rate": 0.001639997317888949, + "loss": 0.1318, + "step": 29042 + }, + { + "epoch": 0.25210718656956105, + "grad_norm": 0.23046875, + "learning_rate": 0.0016399735075617906, + "loss": 0.0811, + "step": 29043 + }, + { + "epoch": 0.2521158670497652, + "grad_norm": 0.25, + "learning_rate": 0.0016399496966441253, + "loss": 0.0923, + "step": 29044 + }, + { + "epoch": 0.2521245475299694, + "grad_norm": 0.2451171875, + "learning_rate": 0.0016399258851359792, + "loss": 0.1074, + "step": 29045 + }, + { + "epoch": 0.2521332280101735, + "grad_norm": 0.228515625, + "learning_rate": 0.0016399020730373784, + "loss": 0.1309, + "step": 29046 + }, + { + "epoch": 0.2521419084903777, + "grad_norm": 0.65625, + "learning_rate": 0.0016398782603483486, + "loss": 0.106, + "step": 29047 + }, + { + "epoch": 0.25215058897058185, + "grad_norm": 0.216796875, + "learning_rate": 0.0016398544470689162, + "loss": 0.1055, + "step": 29048 + }, + { + "epoch": 0.25215926945078604, + 
"grad_norm": 0.3203125, + "learning_rate": 0.0016398306331991068, + "loss": 0.1011, + "step": 29049 + }, + { + "epoch": 0.2521679499309902, + "grad_norm": 0.125, + "learning_rate": 0.001639806818738947, + "loss": 0.1172, + "step": 29050 + }, + { + "epoch": 0.25217663041119437, + "grad_norm": 0.10205078125, + "learning_rate": 0.0016397830036884626, + "loss": 0.1055, + "step": 29051 + }, + { + "epoch": 0.2521853108913985, + "grad_norm": 0.326171875, + "learning_rate": 0.0016397591880476796, + "loss": 0.0957, + "step": 29052 + }, + { + "epoch": 0.2521939913716027, + "grad_norm": 0.27734375, + "learning_rate": 0.001639735371816624, + "loss": 0.1104, + "step": 29053 + }, + { + "epoch": 0.25220267185180684, + "grad_norm": 0.578125, + "learning_rate": 0.001639711554995322, + "loss": 0.0815, + "step": 29054 + }, + { + "epoch": 0.25221135233201103, + "grad_norm": 0.2890625, + "learning_rate": 0.0016396877375837993, + "loss": 0.0996, + "step": 29055 + }, + { + "epoch": 0.25222003281221517, + "grad_norm": 0.283203125, + "learning_rate": 0.0016396639195820823, + "loss": 0.0713, + "step": 29056 + }, + { + "epoch": 0.25222871329241936, + "grad_norm": 0.416015625, + "learning_rate": 0.001639640100990197, + "loss": 0.0854, + "step": 29057 + }, + { + "epoch": 0.2522373937726235, + "grad_norm": 0.369140625, + "learning_rate": 0.0016396162818081695, + "loss": 0.1221, + "step": 29058 + }, + { + "epoch": 0.2522460742528277, + "grad_norm": 0.828125, + "learning_rate": 0.0016395924620360256, + "loss": 0.125, + "step": 29059 + }, + { + "epoch": 0.2522547547330318, + "grad_norm": 0.2080078125, + "learning_rate": 0.0016395686416737915, + "loss": 0.0996, + "step": 29060 + }, + { + "epoch": 0.252263435213236, + "grad_norm": 0.1005859375, + "learning_rate": 0.0016395448207214935, + "loss": 0.0854, + "step": 29061 + }, + { + "epoch": 0.25227211569344016, + "grad_norm": 0.380859375, + "learning_rate": 0.001639520999179157, + "loss": 0.0957, + "step": 29062 + }, + { + "epoch": 0.25228079617364435, + "grad_norm": 0.19140625, + "learning_rate": 0.0016394971770468084, + "loss": 0.1611, + "step": 29063 + }, + { + "epoch": 0.2522894766538485, + "grad_norm": 0.201171875, + "learning_rate": 0.001639473354324474, + "loss": 0.127, + "step": 29064 + }, + { + "epoch": 0.2522981571340527, + "grad_norm": 0.1826171875, + "learning_rate": 0.00163944953101218, + "loss": 0.1143, + "step": 29065 + }, + { + "epoch": 0.2523068376142568, + "grad_norm": 0.130859375, + "learning_rate": 0.0016394257071099514, + "loss": 0.1494, + "step": 29066 + }, + { + "epoch": 0.252315518094461, + "grad_norm": 0.171875, + "learning_rate": 0.0016394018826178154, + "loss": 0.0938, + "step": 29067 + }, + { + "epoch": 0.25232419857466515, + "grad_norm": 0.216796875, + "learning_rate": 0.0016393780575357975, + "loss": 0.1201, + "step": 29068 + }, + { + "epoch": 0.25233287905486934, + "grad_norm": 0.08740234375, + "learning_rate": 0.0016393542318639239, + "loss": 0.0874, + "step": 29069 + }, + { + "epoch": 0.2523415595350735, + "grad_norm": 0.875, + "learning_rate": 0.0016393304056022207, + "loss": 0.1465, + "step": 29070 + }, + { + "epoch": 0.25235024001527767, + "grad_norm": 0.2041015625, + "learning_rate": 0.0016393065787507135, + "loss": 0.1162, + "step": 29071 + }, + { + "epoch": 0.2523589204954818, + "grad_norm": 0.12890625, + "learning_rate": 0.001639282751309429, + "loss": 0.124, + "step": 29072 + }, + { + "epoch": 0.252367600975686, + "grad_norm": 4.9375, + "learning_rate": 0.001639258923278393, + "loss": 0.2773, + "step": 29073 + }, + { + "epoch": 
0.25237628145589014, + "grad_norm": 0.408203125, + "learning_rate": 0.0016392350946576315, + "loss": 0.0918, + "step": 29074 + }, + { + "epoch": 0.25238496193609433, + "grad_norm": 0.1298828125, + "learning_rate": 0.0016392112654471705, + "loss": 0.1196, + "step": 29075 + }, + { + "epoch": 0.25239364241629847, + "grad_norm": 0.5390625, + "learning_rate": 0.0016391874356470365, + "loss": 0.0752, + "step": 29076 + }, + { + "epoch": 0.25240232289650266, + "grad_norm": 0.2578125, + "learning_rate": 0.001639163605257255, + "loss": 0.1025, + "step": 29077 + }, + { + "epoch": 0.2524110033767068, + "grad_norm": 0.154296875, + "learning_rate": 0.001639139774277852, + "loss": 0.165, + "step": 29078 + }, + { + "epoch": 0.25241968385691094, + "grad_norm": 0.1611328125, + "learning_rate": 0.001639115942708854, + "loss": 0.1045, + "step": 29079 + }, + { + "epoch": 0.25242836433711513, + "grad_norm": 0.43359375, + "learning_rate": 0.0016390921105502872, + "loss": 0.1074, + "step": 29080 + }, + { + "epoch": 0.25243704481731927, + "grad_norm": 0.11181640625, + "learning_rate": 0.0016390682778021773, + "loss": 0.1055, + "step": 29081 + }, + { + "epoch": 0.25244572529752346, + "grad_norm": 0.51171875, + "learning_rate": 0.0016390444444645503, + "loss": 0.0791, + "step": 29082 + }, + { + "epoch": 0.2524544057777276, + "grad_norm": 0.353515625, + "learning_rate": 0.0016390206105374324, + "loss": 0.1172, + "step": 29083 + }, + { + "epoch": 0.2524630862579318, + "grad_norm": 0.232421875, + "learning_rate": 0.0016389967760208496, + "loss": 0.0752, + "step": 29084 + }, + { + "epoch": 0.2524717667381359, + "grad_norm": 0.46484375, + "learning_rate": 0.001638972940914828, + "loss": 0.1084, + "step": 29085 + }, + { + "epoch": 0.2524804472183401, + "grad_norm": 0.53515625, + "learning_rate": 0.0016389491052193942, + "loss": 0.1475, + "step": 29086 + }, + { + "epoch": 0.25248912769854426, + "grad_norm": 0.2080078125, + "learning_rate": 0.001638925268934573, + "loss": 0.1445, + "step": 29087 + }, + { + "epoch": 0.25249780817874845, + "grad_norm": 0.62890625, + "learning_rate": 0.0016389014320603917, + "loss": 0.1084, + "step": 29088 + }, + { + "epoch": 0.2525064886589526, + "grad_norm": 0.72265625, + "learning_rate": 0.0016388775945968756, + "loss": 0.0762, + "step": 29089 + }, + { + "epoch": 0.2525151691391568, + "grad_norm": 0.193359375, + "learning_rate": 0.0016388537565440512, + "loss": 0.1426, + "step": 29090 + }, + { + "epoch": 0.2525238496193609, + "grad_norm": 0.828125, + "learning_rate": 0.0016388299179019446, + "loss": 0.125, + "step": 29091 + }, + { + "epoch": 0.2525325300995651, + "grad_norm": 0.08740234375, + "learning_rate": 0.0016388060786705816, + "loss": 0.1123, + "step": 29092 + }, + { + "epoch": 0.25254121057976925, + "grad_norm": 0.1904296875, + "learning_rate": 0.001638782238849988, + "loss": 0.0845, + "step": 29093 + }, + { + "epoch": 0.25254989105997344, + "grad_norm": 0.3828125, + "learning_rate": 0.0016387583984401907, + "loss": 0.1177, + "step": 29094 + }, + { + "epoch": 0.2525585715401776, + "grad_norm": 0.74609375, + "learning_rate": 0.0016387345574412151, + "loss": 0.1016, + "step": 29095 + }, + { + "epoch": 0.25256725202038177, + "grad_norm": 0.80859375, + "learning_rate": 0.0016387107158530876, + "loss": 0.0952, + "step": 29096 + }, + { + "epoch": 0.2525759325005859, + "grad_norm": 0.64453125, + "learning_rate": 0.001638686873675834, + "loss": 0.2285, + "step": 29097 + }, + { + "epoch": 0.2525846129807901, + "grad_norm": 0.10693359375, + "learning_rate": 0.0016386630309094808, + "loss": 
0.1191, + "step": 29098 + }, + { + "epoch": 0.25259329346099424, + "grad_norm": 0.81640625, + "learning_rate": 0.0016386391875540533, + "loss": 0.2207, + "step": 29099 + }, + { + "epoch": 0.25260197394119843, + "grad_norm": 0.2001953125, + "learning_rate": 0.0016386153436095785, + "loss": 0.0947, + "step": 29100 + }, + { + "epoch": 0.25261065442140257, + "grad_norm": 0.05712890625, + "learning_rate": 0.0016385914990760821, + "loss": 0.0601, + "step": 29101 + }, + { + "epoch": 0.25261933490160676, + "grad_norm": 0.33203125, + "learning_rate": 0.0016385676539535901, + "loss": 0.0898, + "step": 29102 + }, + { + "epoch": 0.2526280153818109, + "grad_norm": 0.35546875, + "learning_rate": 0.0016385438082421285, + "loss": 0.1123, + "step": 29103 + }, + { + "epoch": 0.2526366958620151, + "grad_norm": 0.07421875, + "learning_rate": 0.0016385199619417235, + "loss": 0.0874, + "step": 29104 + }, + { + "epoch": 0.25264537634221923, + "grad_norm": 0.36328125, + "learning_rate": 0.001638496115052401, + "loss": 0.1211, + "step": 29105 + }, + { + "epoch": 0.2526540568224234, + "grad_norm": 0.2890625, + "learning_rate": 0.0016384722675741872, + "loss": 0.0752, + "step": 29106 + }, + { + "epoch": 0.25266273730262756, + "grad_norm": 0.353515625, + "learning_rate": 0.0016384484195071086, + "loss": 0.0938, + "step": 29107 + }, + { + "epoch": 0.25267141778283175, + "grad_norm": 0.1806640625, + "learning_rate": 0.0016384245708511904, + "loss": 0.0942, + "step": 29108 + }, + { + "epoch": 0.2526800982630359, + "grad_norm": 0.10498046875, + "learning_rate": 0.0016384007216064598, + "loss": 0.1309, + "step": 29109 + }, + { + "epoch": 0.2526887787432401, + "grad_norm": 0.162109375, + "learning_rate": 0.0016383768717729418, + "loss": 0.1436, + "step": 29110 + }, + { + "epoch": 0.2526974592234442, + "grad_norm": 0.80859375, + "learning_rate": 0.001638353021350663, + "loss": 0.1152, + "step": 29111 + }, + { + "epoch": 0.2527061397036484, + "grad_norm": 0.314453125, + "learning_rate": 0.0016383291703396495, + "loss": 0.1279, + "step": 29112 + }, + { + "epoch": 0.25271482018385255, + "grad_norm": 0.515625, + "learning_rate": 0.0016383053187399275, + "loss": 0.0723, + "step": 29113 + }, + { + "epoch": 0.25272350066405674, + "grad_norm": 1.015625, + "learning_rate": 0.0016382814665515224, + "loss": 0.1328, + "step": 29114 + }, + { + "epoch": 0.2527321811442609, + "grad_norm": 0.1259765625, + "learning_rate": 0.0016382576137744611, + "loss": 0.0977, + "step": 29115 + }, + { + "epoch": 0.2527408616244651, + "grad_norm": 0.51953125, + "learning_rate": 0.0016382337604087694, + "loss": 0.0908, + "step": 29116 + }, + { + "epoch": 0.2527495421046692, + "grad_norm": 0.384765625, + "learning_rate": 0.0016382099064544736, + "loss": 0.0918, + "step": 29117 + }, + { + "epoch": 0.2527582225848734, + "grad_norm": 0.2890625, + "learning_rate": 0.0016381860519115987, + "loss": 0.0967, + "step": 29118 + }, + { + "epoch": 0.25276690306507754, + "grad_norm": 1.3359375, + "learning_rate": 0.0016381621967801723, + "loss": 0.1582, + "step": 29119 + }, + { + "epoch": 0.25277558354528173, + "grad_norm": 1.6796875, + "learning_rate": 0.0016381383410602197, + "loss": 0.3418, + "step": 29120 + }, + { + "epoch": 0.25278426402548587, + "grad_norm": 0.318359375, + "learning_rate": 0.0016381144847517671, + "loss": 0.0977, + "step": 29121 + }, + { + "epoch": 0.25279294450569006, + "grad_norm": 0.0986328125, + "learning_rate": 0.0016380906278548404, + "loss": 0.1123, + "step": 29122 + }, + { + "epoch": 0.2528016249858942, + "grad_norm": 0.76171875, + 
"learning_rate": 0.001638066770369466, + "loss": 0.0654, + "step": 29123 + }, + { + "epoch": 0.2528103054660984, + "grad_norm": 0.6640625, + "learning_rate": 0.00163804291229567, + "loss": 0.125, + "step": 29124 + }, + { + "epoch": 0.25281898594630253, + "grad_norm": 0.333984375, + "learning_rate": 0.0016380190536334782, + "loss": 0.0605, + "step": 29125 + }, + { + "epoch": 0.2528276664265067, + "grad_norm": 0.201171875, + "learning_rate": 0.0016379951943829168, + "loss": 0.1182, + "step": 29126 + }, + { + "epoch": 0.25283634690671086, + "grad_norm": 0.62109375, + "learning_rate": 0.0016379713345440119, + "loss": 0.0806, + "step": 29127 + }, + { + "epoch": 0.25284502738691506, + "grad_norm": 0.2373046875, + "learning_rate": 0.0016379474741167897, + "loss": 0.0874, + "step": 29128 + }, + { + "epoch": 0.2528537078671192, + "grad_norm": 0.39453125, + "learning_rate": 0.0016379236131012762, + "loss": 0.0977, + "step": 29129 + }, + { + "epoch": 0.2528623883473234, + "grad_norm": 0.392578125, + "learning_rate": 0.0016378997514974979, + "loss": 0.0889, + "step": 29130 + }, + { + "epoch": 0.2528710688275275, + "grad_norm": 0.236328125, + "learning_rate": 0.00163787588930548, + "loss": 0.1211, + "step": 29131 + }, + { + "epoch": 0.2528797493077317, + "grad_norm": 0.1123046875, + "learning_rate": 0.0016378520265252492, + "loss": 0.1074, + "step": 29132 + }, + { + "epoch": 0.25288842978793585, + "grad_norm": 0.1611328125, + "learning_rate": 0.0016378281631568316, + "loss": 0.0889, + "step": 29133 + }, + { + "epoch": 0.25289711026814005, + "grad_norm": 0.1826171875, + "learning_rate": 0.0016378042992002536, + "loss": 0.1055, + "step": 29134 + }, + { + "epoch": 0.2529057907483442, + "grad_norm": 0.44921875, + "learning_rate": 0.00163778043465554, + "loss": 0.0815, + "step": 29135 + }, + { + "epoch": 0.2529144712285484, + "grad_norm": 0.11181640625, + "learning_rate": 0.0016377565695227182, + "loss": 0.1035, + "step": 29136 + }, + { + "epoch": 0.2529231517087525, + "grad_norm": 0.166015625, + "learning_rate": 0.0016377327038018142, + "loss": 0.1143, + "step": 29137 + }, + { + "epoch": 0.2529318321889567, + "grad_norm": 0.361328125, + "learning_rate": 0.0016377088374928538, + "loss": 0.0986, + "step": 29138 + }, + { + "epoch": 0.25294051266916084, + "grad_norm": 0.177734375, + "learning_rate": 0.0016376849705958627, + "loss": 0.0981, + "step": 29139 + }, + { + "epoch": 0.25294919314936504, + "grad_norm": 0.25, + "learning_rate": 0.0016376611031108676, + "loss": 0.0742, + "step": 29140 + }, + { + "epoch": 0.2529578736295692, + "grad_norm": 0.1953125, + "learning_rate": 0.0016376372350378943, + "loss": 0.0923, + "step": 29141 + }, + { + "epoch": 0.25296655410977337, + "grad_norm": 0.396484375, + "learning_rate": 0.0016376133663769691, + "loss": 0.1021, + "step": 29142 + }, + { + "epoch": 0.2529752345899775, + "grad_norm": 0.0947265625, + "learning_rate": 0.0016375894971281182, + "loss": 0.083, + "step": 29143 + }, + { + "epoch": 0.2529839150701817, + "grad_norm": 0.0830078125, + "learning_rate": 0.001637565627291367, + "loss": 0.0859, + "step": 29144 + }, + { + "epoch": 0.25299259555038583, + "grad_norm": 0.4609375, + "learning_rate": 0.0016375417568667426, + "loss": 0.1113, + "step": 29145 + }, + { + "epoch": 0.25300127603059, + "grad_norm": 0.10791015625, + "learning_rate": 0.0016375178858542702, + "loss": 0.123, + "step": 29146 + }, + { + "epoch": 0.25300995651079417, + "grad_norm": 0.81640625, + "learning_rate": 0.0016374940142539767, + "loss": 0.1719, + "step": 29147 + }, + { + "epoch": 
0.25301863699099836, + "grad_norm": 0.322265625, + "learning_rate": 0.0016374701420658873, + "loss": 0.1289, + "step": 29148 + }, + { + "epoch": 0.2530273174712025, + "grad_norm": 0.37890625, + "learning_rate": 0.0016374462692900292, + "loss": 0.1104, + "step": 29149 + }, + { + "epoch": 0.2530359979514067, + "grad_norm": 0.271484375, + "learning_rate": 0.0016374223959264276, + "loss": 0.1045, + "step": 29150 + }, + { + "epoch": 0.2530446784316108, + "grad_norm": 0.138671875, + "learning_rate": 0.0016373985219751092, + "loss": 0.1138, + "step": 29151 + }, + { + "epoch": 0.253053358911815, + "grad_norm": 0.1259765625, + "learning_rate": 0.0016373746474360996, + "loss": 0.1699, + "step": 29152 + }, + { + "epoch": 0.25306203939201916, + "grad_norm": 0.2353515625, + "learning_rate": 0.0016373507723094253, + "loss": 0.1162, + "step": 29153 + }, + { + "epoch": 0.25307071987222335, + "grad_norm": 0.2041015625, + "learning_rate": 0.001637326896595112, + "loss": 0.0981, + "step": 29154 + }, + { + "epoch": 0.2530794003524275, + "grad_norm": 0.6640625, + "learning_rate": 0.0016373030202931865, + "loss": 0.1074, + "step": 29155 + }, + { + "epoch": 0.2530880808326317, + "grad_norm": 0.251953125, + "learning_rate": 0.0016372791434036745, + "loss": 0.1099, + "step": 29156 + }, + { + "epoch": 0.2530967613128358, + "grad_norm": 0.6171875, + "learning_rate": 0.0016372552659266015, + "loss": 0.1279, + "step": 29157 + }, + { + "epoch": 0.25310544179304, + "grad_norm": 0.10302734375, + "learning_rate": 0.0016372313878619948, + "loss": 0.0786, + "step": 29158 + }, + { + "epoch": 0.25311412227324415, + "grad_norm": 0.0947265625, + "learning_rate": 0.0016372075092098797, + "loss": 0.0674, + "step": 29159 + }, + { + "epoch": 0.25312280275344834, + "grad_norm": 0.39453125, + "learning_rate": 0.0016371836299702825, + "loss": 0.0977, + "step": 29160 + }, + { + "epoch": 0.2531314832336525, + "grad_norm": 0.48828125, + "learning_rate": 0.0016371597501432294, + "loss": 0.124, + "step": 29161 + }, + { + "epoch": 0.25314016371385667, + "grad_norm": 0.2001953125, + "learning_rate": 0.0016371358697287463, + "loss": 0.1045, + "step": 29162 + }, + { + "epoch": 0.2531488441940608, + "grad_norm": 0.412109375, + "learning_rate": 0.00163711198872686, + "loss": 0.1206, + "step": 29163 + }, + { + "epoch": 0.253157524674265, + "grad_norm": 0.2890625, + "learning_rate": 0.0016370881071375958, + "loss": 0.085, + "step": 29164 + }, + { + "epoch": 0.25316620515446914, + "grad_norm": 0.1455078125, + "learning_rate": 0.0016370642249609799, + "loss": 0.0996, + "step": 29165 + }, + { + "epoch": 0.25317488563467333, + "grad_norm": 0.259765625, + "learning_rate": 0.0016370403421970387, + "loss": 0.125, + "step": 29166 + }, + { + "epoch": 0.25318356611487747, + "grad_norm": 0.09716796875, + "learning_rate": 0.0016370164588457986, + "loss": 0.0728, + "step": 29167 + }, + { + "epoch": 0.25319224659508166, + "grad_norm": 0.08154296875, + "learning_rate": 0.0016369925749072848, + "loss": 0.0723, + "step": 29168 + }, + { + "epoch": 0.2532009270752858, + "grad_norm": 0.361328125, + "learning_rate": 0.0016369686903815247, + "loss": 0.1504, + "step": 29169 + }, + { + "epoch": 0.25320960755549, + "grad_norm": 0.84765625, + "learning_rate": 0.0016369448052685432, + "loss": 0.1143, + "step": 29170 + }, + { + "epoch": 0.25321828803569413, + "grad_norm": 0.2158203125, + "learning_rate": 0.001636920919568367, + "loss": 0.1182, + "step": 29171 + }, + { + "epoch": 0.2532269685158983, + "grad_norm": 0.177734375, + "learning_rate": 0.0016368970332810223, + 
"loss": 0.0859, + "step": 29172 + }, + { + "epoch": 0.25323564899610246, + "grad_norm": 0.1884765625, + "learning_rate": 0.001636873146406535, + "loss": 0.0884, + "step": 29173 + }, + { + "epoch": 0.25324432947630665, + "grad_norm": 0.2890625, + "learning_rate": 0.0016368492589449312, + "loss": 0.1006, + "step": 29174 + }, + { + "epoch": 0.2532530099565108, + "grad_norm": 0.455078125, + "learning_rate": 0.001636825370896237, + "loss": 0.1104, + "step": 29175 + }, + { + "epoch": 0.253261690436715, + "grad_norm": 0.306640625, + "learning_rate": 0.0016368014822604792, + "loss": 0.1104, + "step": 29176 + }, + { + "epoch": 0.2532703709169191, + "grad_norm": 0.77734375, + "learning_rate": 0.0016367775930376828, + "loss": 0.1035, + "step": 29177 + }, + { + "epoch": 0.2532790513971233, + "grad_norm": 0.0712890625, + "learning_rate": 0.0016367537032278747, + "loss": 0.0767, + "step": 29178 + }, + { + "epoch": 0.25328773187732745, + "grad_norm": 0.51953125, + "learning_rate": 0.0016367298128310808, + "loss": 0.1055, + "step": 29179 + }, + { + "epoch": 0.25329641235753164, + "grad_norm": 0.1806640625, + "learning_rate": 0.0016367059218473274, + "loss": 0.1055, + "step": 29180 + }, + { + "epoch": 0.2533050928377358, + "grad_norm": 0.1943359375, + "learning_rate": 0.0016366820302766403, + "loss": 0.1167, + "step": 29181 + }, + { + "epoch": 0.25331377331793997, + "grad_norm": 0.109375, + "learning_rate": 0.0016366581381190458, + "loss": 0.1035, + "step": 29182 + }, + { + "epoch": 0.2533224537981441, + "grad_norm": 0.302734375, + "learning_rate": 0.00163663424537457, + "loss": 0.1094, + "step": 29183 + }, + { + "epoch": 0.2533311342783483, + "grad_norm": 0.244140625, + "learning_rate": 0.001636610352043239, + "loss": 0.1299, + "step": 29184 + }, + { + "epoch": 0.25333981475855244, + "grad_norm": 0.197265625, + "learning_rate": 0.001636586458125079, + "loss": 0.0767, + "step": 29185 + }, + { + "epoch": 0.25334849523875663, + "grad_norm": 0.439453125, + "learning_rate": 0.0016365625636201163, + "loss": 0.1157, + "step": 29186 + }, + { + "epoch": 0.25335717571896077, + "grad_norm": 0.146484375, + "learning_rate": 0.0016365386685283764, + "loss": 0.1523, + "step": 29187 + }, + { + "epoch": 0.25336585619916496, + "grad_norm": 0.1083984375, + "learning_rate": 0.0016365147728498867, + "loss": 0.0903, + "step": 29188 + }, + { + "epoch": 0.2533745366793691, + "grad_norm": 0.185546875, + "learning_rate": 0.0016364908765846716, + "loss": 0.1387, + "step": 29189 + }, + { + "epoch": 0.2533832171595733, + "grad_norm": 0.490234375, + "learning_rate": 0.0016364669797327587, + "loss": 0.0781, + "step": 29190 + }, + { + "epoch": 0.25339189763977743, + "grad_norm": 0.07763671875, + "learning_rate": 0.0016364430822941734, + "loss": 0.0811, + "step": 29191 + }, + { + "epoch": 0.2534005781199816, + "grad_norm": 0.283203125, + "learning_rate": 0.001636419184268942, + "loss": 0.0938, + "step": 29192 + }, + { + "epoch": 0.25340925860018576, + "grad_norm": 0.1630859375, + "learning_rate": 0.0016363952856570908, + "loss": 0.1191, + "step": 29193 + }, + { + "epoch": 0.25341793908038995, + "grad_norm": 0.2734375, + "learning_rate": 0.0016363713864586454, + "loss": 0.1094, + "step": 29194 + }, + { + "epoch": 0.2534266195605941, + "grad_norm": 0.640625, + "learning_rate": 0.0016363474866736328, + "loss": 0.0928, + "step": 29195 + }, + { + "epoch": 0.2534353000407983, + "grad_norm": 0.302734375, + "learning_rate": 0.001636323586302078, + "loss": 0.0815, + "step": 29196 + }, + { + "epoch": 0.2534439805210024, + "grad_norm": 0.435546875, 
+ "learning_rate": 0.0016362996853440082, + "loss": 0.1289, + "step": 29197 + }, + { + "epoch": 0.2534526610012066, + "grad_norm": 0.1533203125, + "learning_rate": 0.0016362757837994494, + "loss": 0.0996, + "step": 29198 + }, + { + "epoch": 0.25346134148141075, + "grad_norm": 0.1630859375, + "learning_rate": 0.001636251881668427, + "loss": 0.0869, + "step": 29199 + }, + { + "epoch": 0.25347002196161494, + "grad_norm": 0.326171875, + "learning_rate": 0.0016362279789509677, + "loss": 0.0864, + "step": 29200 + }, + { + "epoch": 0.2534787024418191, + "grad_norm": 0.28515625, + "learning_rate": 0.0016362040756470976, + "loss": 0.0938, + "step": 29201 + }, + { + "epoch": 0.2534873829220232, + "grad_norm": 0.267578125, + "learning_rate": 0.0016361801717568428, + "loss": 0.1357, + "step": 29202 + }, + { + "epoch": 0.2534960634022274, + "grad_norm": 0.224609375, + "learning_rate": 0.0016361562672802294, + "loss": 0.0947, + "step": 29203 + }, + { + "epoch": 0.25350474388243155, + "grad_norm": 0.306640625, + "learning_rate": 0.0016361323622172835, + "loss": 0.1338, + "step": 29204 + }, + { + "epoch": 0.25351342436263574, + "grad_norm": 0.1826171875, + "learning_rate": 0.001636108456568031, + "loss": 0.0957, + "step": 29205 + }, + { + "epoch": 0.2535221048428399, + "grad_norm": 0.28515625, + "learning_rate": 0.001636084550332499, + "loss": 0.1108, + "step": 29206 + }, + { + "epoch": 0.25353078532304407, + "grad_norm": 0.74609375, + "learning_rate": 0.0016360606435107128, + "loss": 0.1069, + "step": 29207 + }, + { + "epoch": 0.2535394658032482, + "grad_norm": 0.36328125, + "learning_rate": 0.0016360367361026983, + "loss": 0.0776, + "step": 29208 + }, + { + "epoch": 0.2535481462834524, + "grad_norm": 0.474609375, + "learning_rate": 0.0016360128281084825, + "loss": 0.1455, + "step": 29209 + }, + { + "epoch": 0.25355682676365654, + "grad_norm": 0.80859375, + "learning_rate": 0.001635988919528091, + "loss": 0.082, + "step": 29210 + }, + { + "epoch": 0.25356550724386073, + "grad_norm": 0.2578125, + "learning_rate": 0.0016359650103615502, + "loss": 0.1035, + "step": 29211 + }, + { + "epoch": 0.25357418772406487, + "grad_norm": 0.11181640625, + "learning_rate": 0.001635941100608886, + "loss": 0.0928, + "step": 29212 + }, + { + "epoch": 0.25358286820426906, + "grad_norm": 0.4375, + "learning_rate": 0.001635917190270125, + "loss": 0.0776, + "step": 29213 + }, + { + "epoch": 0.2535915486844732, + "grad_norm": 0.27734375, + "learning_rate": 0.0016358932793452926, + "loss": 0.0776, + "step": 29214 + }, + { + "epoch": 0.2536002291646774, + "grad_norm": 0.357421875, + "learning_rate": 0.0016358693678344154, + "loss": 0.0654, + "step": 29215 + }, + { + "epoch": 0.25360890964488153, + "grad_norm": 0.16015625, + "learning_rate": 0.0016358454557375195, + "loss": 0.0728, + "step": 29216 + }, + { + "epoch": 0.2536175901250857, + "grad_norm": 0.224609375, + "learning_rate": 0.0016358215430546312, + "loss": 0.1079, + "step": 29217 + }, + { + "epoch": 0.25362627060528986, + "grad_norm": 0.2265625, + "learning_rate": 0.001635797629785776, + "loss": 0.1328, + "step": 29218 + }, + { + "epoch": 0.25363495108549405, + "grad_norm": 0.5625, + "learning_rate": 0.0016357737159309813, + "loss": 0.0703, + "step": 29219 + }, + { + "epoch": 0.2536436315656982, + "grad_norm": 0.171875, + "learning_rate": 0.0016357498014902723, + "loss": 0.1055, + "step": 29220 + }, + { + "epoch": 0.2536523120459024, + "grad_norm": 0.291015625, + "learning_rate": 0.0016357258864636752, + "loss": 0.1167, + "step": 29221 + }, + { + "epoch": 
0.2536609925261065, + "grad_norm": 0.255859375, + "learning_rate": 0.0016357019708512167, + "loss": 0.1177, + "step": 29222 + }, + { + "epoch": 0.2536696730063107, + "grad_norm": 0.1259765625, + "learning_rate": 0.001635678054652922, + "loss": 0.1133, + "step": 29223 + }, + { + "epoch": 0.25367835348651485, + "grad_norm": 0.58984375, + "learning_rate": 0.0016356541378688182, + "loss": 0.1484, + "step": 29224 + }, + { + "epoch": 0.25368703396671904, + "grad_norm": 0.458984375, + "learning_rate": 0.001635630220498931, + "loss": 0.0869, + "step": 29225 + }, + { + "epoch": 0.2536957144469232, + "grad_norm": 0.3125, + "learning_rate": 0.0016356063025432866, + "loss": 0.1128, + "step": 29226 + }, + { + "epoch": 0.2537043949271274, + "grad_norm": 0.6328125, + "learning_rate": 0.0016355823840019112, + "loss": 0.1211, + "step": 29227 + }, + { + "epoch": 0.2537130754073315, + "grad_norm": 0.5703125, + "learning_rate": 0.0016355584648748308, + "loss": 0.1348, + "step": 29228 + }, + { + "epoch": 0.2537217558875357, + "grad_norm": 0.2236328125, + "learning_rate": 0.001635534545162072, + "loss": 0.1006, + "step": 29229 + }, + { + "epoch": 0.25373043636773984, + "grad_norm": 0.38671875, + "learning_rate": 0.0016355106248636606, + "loss": 0.1016, + "step": 29230 + }, + { + "epoch": 0.25373911684794404, + "grad_norm": 0.34375, + "learning_rate": 0.0016354867039796227, + "loss": 0.0771, + "step": 29231 + }, + { + "epoch": 0.2537477973281482, + "grad_norm": 1.8671875, + "learning_rate": 0.0016354627825099848, + "loss": 0.1992, + "step": 29232 + }, + { + "epoch": 0.25375647780835237, + "grad_norm": 0.109375, + "learning_rate": 0.0016354388604547724, + "loss": 0.084, + "step": 29233 + }, + { + "epoch": 0.2537651582885565, + "grad_norm": 0.87890625, + "learning_rate": 0.0016354149378140126, + "loss": 0.1045, + "step": 29234 + }, + { + "epoch": 0.2537738387687607, + "grad_norm": 0.126953125, + "learning_rate": 0.001635391014587731, + "loss": 0.0864, + "step": 29235 + }, + { + "epoch": 0.25378251924896483, + "grad_norm": 0.1875, + "learning_rate": 0.0016353670907759538, + "loss": 0.1299, + "step": 29236 + }, + { + "epoch": 0.253791199729169, + "grad_norm": 0.1357421875, + "learning_rate": 0.0016353431663787069, + "loss": 0.1025, + "step": 29237 + }, + { + "epoch": 0.25379988020937316, + "grad_norm": 0.171875, + "learning_rate": 0.001635319241396017, + "loss": 0.1016, + "step": 29238 + }, + { + "epoch": 0.25380856068957736, + "grad_norm": 0.07568359375, + "learning_rate": 0.0016352953158279098, + "loss": 0.0879, + "step": 29239 + }, + { + "epoch": 0.2538172411697815, + "grad_norm": 0.2197265625, + "learning_rate": 0.001635271389674412, + "loss": 0.1035, + "step": 29240 + }, + { + "epoch": 0.2538259216499857, + "grad_norm": 0.51953125, + "learning_rate": 0.001635247462935549, + "loss": 0.0908, + "step": 29241 + }, + { + "epoch": 0.2538346021301898, + "grad_norm": 0.2236328125, + "learning_rate": 0.0016352235356113481, + "loss": 0.0767, + "step": 29242 + }, + { + "epoch": 0.253843282610394, + "grad_norm": 0.2255859375, + "learning_rate": 0.0016351996077018345, + "loss": 0.0859, + "step": 29243 + }, + { + "epoch": 0.25385196309059815, + "grad_norm": 0.357421875, + "learning_rate": 0.0016351756792070341, + "loss": 0.1064, + "step": 29244 + }, + { + "epoch": 0.25386064357080235, + "grad_norm": 0.2734375, + "learning_rate": 0.0016351517501269745, + "loss": 0.1206, + "step": 29245 + }, + { + "epoch": 0.2538693240510065, + "grad_norm": 0.3046875, + "learning_rate": 0.0016351278204616807, + "loss": 0.1309, + "step": 29246 
+ }, + { + "epoch": 0.2538780045312107, + "grad_norm": 0.119140625, + "learning_rate": 0.0016351038902111788, + "loss": 0.1211, + "step": 29247 + }, + { + "epoch": 0.2538866850114148, + "grad_norm": 0.193359375, + "learning_rate": 0.0016350799593754957, + "loss": 0.1055, + "step": 29248 + }, + { + "epoch": 0.253895365491619, + "grad_norm": 0.216796875, + "learning_rate": 0.0016350560279546572, + "loss": 0.1035, + "step": 29249 + }, + { + "epoch": 0.25390404597182314, + "grad_norm": 0.138671875, + "learning_rate": 0.0016350320959486891, + "loss": 0.0879, + "step": 29250 + }, + { + "epoch": 0.25391272645202734, + "grad_norm": 0.78515625, + "learning_rate": 0.001635008163357618, + "loss": 0.0742, + "step": 29251 + }, + { + "epoch": 0.2539214069322315, + "grad_norm": 0.107421875, + "learning_rate": 0.0016349842301814704, + "loss": 0.1016, + "step": 29252 + }, + { + "epoch": 0.25393008741243567, + "grad_norm": 0.259765625, + "learning_rate": 0.001634960296420272, + "loss": 0.0728, + "step": 29253 + }, + { + "epoch": 0.2539387678926398, + "grad_norm": 0.373046875, + "learning_rate": 0.001634936362074049, + "loss": 0.0747, + "step": 29254 + }, + { + "epoch": 0.253947448372844, + "grad_norm": 0.310546875, + "learning_rate": 0.0016349124271428273, + "loss": 0.125, + "step": 29255 + }, + { + "epoch": 0.25395612885304814, + "grad_norm": 0.15234375, + "learning_rate": 0.0016348884916266336, + "loss": 0.0879, + "step": 29256 + }, + { + "epoch": 0.25396480933325233, + "grad_norm": 0.12158203125, + "learning_rate": 0.0016348645555254936, + "loss": 0.0938, + "step": 29257 + }, + { + "epoch": 0.25397348981345647, + "grad_norm": 0.396484375, + "learning_rate": 0.0016348406188394344, + "loss": 0.1021, + "step": 29258 + }, + { + "epoch": 0.25398217029366066, + "grad_norm": 0.3828125, + "learning_rate": 0.0016348166815684812, + "loss": 0.1006, + "step": 29259 + }, + { + "epoch": 0.2539908507738648, + "grad_norm": 0.189453125, + "learning_rate": 0.0016347927437126606, + "loss": 0.0962, + "step": 29260 + }, + { + "epoch": 0.253999531254069, + "grad_norm": 0.2060546875, + "learning_rate": 0.0016347688052719984, + "loss": 0.1553, + "step": 29261 + }, + { + "epoch": 0.2540082117342731, + "grad_norm": 0.1298828125, + "learning_rate": 0.0016347448662465214, + "loss": 0.1011, + "step": 29262 + }, + { + "epoch": 0.2540168922144773, + "grad_norm": 0.64453125, + "learning_rate": 0.0016347209266362555, + "loss": 0.0967, + "step": 29263 + }, + { + "epoch": 0.25402557269468146, + "grad_norm": 0.404296875, + "learning_rate": 0.0016346969864412266, + "loss": 0.0623, + "step": 29264 + }, + { + "epoch": 0.25403425317488565, + "grad_norm": 0.109375, + "learning_rate": 0.001634673045661461, + "loss": 0.084, + "step": 29265 + }, + { + "epoch": 0.2540429336550898, + "grad_norm": 0.1826171875, + "learning_rate": 0.0016346491042969853, + "loss": 0.0874, + "step": 29266 + }, + { + "epoch": 0.254051614135294, + "grad_norm": 0.08642578125, + "learning_rate": 0.0016346251623478253, + "loss": 0.1113, + "step": 29267 + }, + { + "epoch": 0.2540602946154981, + "grad_norm": 0.09521484375, + "learning_rate": 0.0016346012198140071, + "loss": 0.0918, + "step": 29268 + }, + { + "epoch": 0.2540689750957023, + "grad_norm": 0.310546875, + "learning_rate": 0.0016345772766955572, + "loss": 0.124, + "step": 29269 + }, + { + "epoch": 0.25407765557590645, + "grad_norm": 0.80859375, + "learning_rate": 0.0016345533329925016, + "loss": 0.0674, + "step": 29270 + }, + { + "epoch": 0.25408633605611064, + "grad_norm": 0.287109375, + "learning_rate": 
0.0016345293887048665, + "loss": 0.0898, + "step": 29271 + }, + { + "epoch": 0.2540950165363148, + "grad_norm": 0.1630859375, + "learning_rate": 0.0016345054438326781, + "loss": 0.0776, + "step": 29272 + }, + { + "epoch": 0.25410369701651897, + "grad_norm": 0.154296875, + "learning_rate": 0.0016344814983759624, + "loss": 0.1001, + "step": 29273 + }, + { + "epoch": 0.2541123774967231, + "grad_norm": 0.09619140625, + "learning_rate": 0.001634457552334746, + "loss": 0.0781, + "step": 29274 + }, + { + "epoch": 0.2541210579769273, + "grad_norm": 0.1923828125, + "learning_rate": 0.001634433605709055, + "loss": 0.0874, + "step": 29275 + }, + { + "epoch": 0.25412973845713144, + "grad_norm": 0.2353515625, + "learning_rate": 0.0016344096584989148, + "loss": 0.1406, + "step": 29276 + }, + { + "epoch": 0.25413841893733563, + "grad_norm": 0.7265625, + "learning_rate": 0.001634385710704353, + "loss": 0.1172, + "step": 29277 + }, + { + "epoch": 0.25414709941753977, + "grad_norm": 0.1650390625, + "learning_rate": 0.0016343617623253944, + "loss": 0.0801, + "step": 29278 + }, + { + "epoch": 0.25415577989774396, + "grad_norm": 0.09423828125, + "learning_rate": 0.001634337813362066, + "loss": 0.0996, + "step": 29279 + }, + { + "epoch": 0.2541644603779481, + "grad_norm": 0.376953125, + "learning_rate": 0.0016343138638143938, + "loss": 0.0991, + "step": 29280 + }, + { + "epoch": 0.2541731408581523, + "grad_norm": 0.291015625, + "learning_rate": 0.0016342899136824042, + "loss": 0.1064, + "step": 29281 + }, + { + "epoch": 0.25418182133835643, + "grad_norm": 0.07177734375, + "learning_rate": 0.001634265962966123, + "loss": 0.0698, + "step": 29282 + }, + { + "epoch": 0.2541905018185606, + "grad_norm": 0.1962890625, + "learning_rate": 0.0016342420116655764, + "loss": 0.0811, + "step": 29283 + }, + { + "epoch": 0.25419918229876476, + "grad_norm": 0.474609375, + "learning_rate": 0.001634218059780791, + "loss": 0.085, + "step": 29284 + }, + { + "epoch": 0.25420786277896895, + "grad_norm": 0.412109375, + "learning_rate": 0.0016341941073117928, + "loss": 0.1113, + "step": 29285 + }, + { + "epoch": 0.2542165432591731, + "grad_norm": 0.1689453125, + "learning_rate": 0.0016341701542586077, + "loss": 0.0986, + "step": 29286 + }, + { + "epoch": 0.2542252237393773, + "grad_norm": 0.259765625, + "learning_rate": 0.0016341462006212622, + "loss": 0.0933, + "step": 29287 + }, + { + "epoch": 0.2542339042195814, + "grad_norm": 0.1318359375, + "learning_rate": 0.0016341222463997825, + "loss": 0.0986, + "step": 29288 + }, + { + "epoch": 0.2542425846997856, + "grad_norm": 0.14453125, + "learning_rate": 0.001634098291594195, + "loss": 0.1191, + "step": 29289 + }, + { + "epoch": 0.25425126517998975, + "grad_norm": 0.3828125, + "learning_rate": 0.0016340743362045255, + "loss": 0.1133, + "step": 29290 + }, + { + "epoch": 0.25425994566019394, + "grad_norm": 0.11279296875, + "learning_rate": 0.0016340503802308, + "loss": 0.0962, + "step": 29291 + }, + { + "epoch": 0.2542686261403981, + "grad_norm": 0.2421875, + "learning_rate": 0.001634026423673045, + "loss": 0.1094, + "step": 29292 + }, + { + "epoch": 0.2542773066206023, + "grad_norm": 0.494140625, + "learning_rate": 0.0016340024665312872, + "loss": 0.1113, + "step": 29293 + }, + { + "epoch": 0.2542859871008064, + "grad_norm": 0.314453125, + "learning_rate": 0.0016339785088055523, + "loss": 0.0938, + "step": 29294 + }, + { + "epoch": 0.2542946675810106, + "grad_norm": 0.2138671875, + "learning_rate": 0.001633954550495866, + "loss": 0.0801, + "step": 29295 + }, + { + "epoch": 
0.25430334806121474, + "grad_norm": 1.1328125, + "learning_rate": 0.0016339305916022555, + "loss": 0.1196, + "step": 29296 + }, + { + "epoch": 0.25431202854141893, + "grad_norm": 0.40234375, + "learning_rate": 0.0016339066321247465, + "loss": 0.1309, + "step": 29297 + }, + { + "epoch": 0.25432070902162307, + "grad_norm": 0.1533203125, + "learning_rate": 0.0016338826720633652, + "loss": 0.0942, + "step": 29298 + }, + { + "epoch": 0.25432938950182726, + "grad_norm": 0.3984375, + "learning_rate": 0.0016338587114181375, + "loss": 0.0928, + "step": 29299 + }, + { + "epoch": 0.2543380699820314, + "grad_norm": 0.380859375, + "learning_rate": 0.0016338347501890901, + "loss": 0.106, + "step": 29300 + }, + { + "epoch": 0.2543467504622356, + "grad_norm": 0.3828125, + "learning_rate": 0.0016338107883762493, + "loss": 0.0889, + "step": 29301 + }, + { + "epoch": 0.25435543094243973, + "grad_norm": 0.1416015625, + "learning_rate": 0.001633786825979641, + "loss": 0.124, + "step": 29302 + }, + { + "epoch": 0.2543641114226439, + "grad_norm": 0.1943359375, + "learning_rate": 0.0016337628629992914, + "loss": 0.0791, + "step": 29303 + }, + { + "epoch": 0.25437279190284806, + "grad_norm": 0.30859375, + "learning_rate": 0.0016337388994352264, + "loss": 0.1025, + "step": 29304 + }, + { + "epoch": 0.25438147238305225, + "grad_norm": 0.09228515625, + "learning_rate": 0.0016337149352874728, + "loss": 0.0967, + "step": 29305 + }, + { + "epoch": 0.2543901528632564, + "grad_norm": 0.2255859375, + "learning_rate": 0.0016336909705560567, + "loss": 0.0977, + "step": 29306 + }, + { + "epoch": 0.2543988333434606, + "grad_norm": 0.330078125, + "learning_rate": 0.0016336670052410044, + "loss": 0.1016, + "step": 29307 + }, + { + "epoch": 0.2544075138236647, + "grad_norm": 0.1103515625, + "learning_rate": 0.001633643039342341, + "loss": 0.1143, + "step": 29308 + }, + { + "epoch": 0.2544161943038689, + "grad_norm": 0.14453125, + "learning_rate": 0.001633619072860094, + "loss": 0.0889, + "step": 29309 + }, + { + "epoch": 0.25442487478407305, + "grad_norm": 0.1376953125, + "learning_rate": 0.0016335951057942896, + "loss": 0.1348, + "step": 29310 + }, + { + "epoch": 0.25443355526427724, + "grad_norm": 0.10693359375, + "learning_rate": 0.0016335711381449535, + "loss": 0.1123, + "step": 29311 + }, + { + "epoch": 0.2544422357444814, + "grad_norm": 0.37890625, + "learning_rate": 0.0016335471699121119, + "loss": 0.0898, + "step": 29312 + }, + { + "epoch": 0.2544509162246856, + "grad_norm": 0.1982421875, + "learning_rate": 0.0016335232010957909, + "loss": 0.1055, + "step": 29313 + }, + { + "epoch": 0.2544595967048897, + "grad_norm": 0.6484375, + "learning_rate": 0.001633499231696017, + "loss": 0.0903, + "step": 29314 + }, + { + "epoch": 0.2544682771850939, + "grad_norm": 0.125, + "learning_rate": 0.001633475261712817, + "loss": 0.1021, + "step": 29315 + }, + { + "epoch": 0.25447695766529804, + "grad_norm": 0.3671875, + "learning_rate": 0.0016334512911462156, + "loss": 0.1011, + "step": 29316 + }, + { + "epoch": 0.25448563814550224, + "grad_norm": 0.76171875, + "learning_rate": 0.0016334273199962402, + "loss": 0.2715, + "step": 29317 + }, + { + "epoch": 0.2544943186257064, + "grad_norm": 0.25390625, + "learning_rate": 0.0016334033482629168, + "loss": 0.125, + "step": 29318 + }, + { + "epoch": 0.25450299910591057, + "grad_norm": 0.26171875, + "learning_rate": 0.0016333793759462714, + "loss": 0.1768, + "step": 29319 + }, + { + "epoch": 0.2545116795861147, + "grad_norm": 0.2578125, + "learning_rate": 0.0016333554030463304, + "loss": 0.0957, 
+ "step": 29320 + }, + { + "epoch": 0.2545203600663189, + "grad_norm": 0.1474609375, + "learning_rate": 0.00163333142956312, + "loss": 0.1128, + "step": 29321 + }, + { + "epoch": 0.25452904054652303, + "grad_norm": 0.515625, + "learning_rate": 0.0016333074554966663, + "loss": 0.1035, + "step": 29322 + }, + { + "epoch": 0.2545377210267272, + "grad_norm": 0.10888671875, + "learning_rate": 0.0016332834808469954, + "loss": 0.0947, + "step": 29323 + }, + { + "epoch": 0.25454640150693136, + "grad_norm": 0.23828125, + "learning_rate": 0.001633259505614134, + "loss": 0.1338, + "step": 29324 + }, + { + "epoch": 0.2545550819871355, + "grad_norm": 0.21875, + "learning_rate": 0.0016332355297981078, + "loss": 0.0786, + "step": 29325 + }, + { + "epoch": 0.2545637624673397, + "grad_norm": 0.189453125, + "learning_rate": 0.0016332115533989435, + "loss": 0.1406, + "step": 29326 + }, + { + "epoch": 0.25457244294754383, + "grad_norm": 0.361328125, + "learning_rate": 0.001633187576416667, + "loss": 0.1582, + "step": 29327 + }, + { + "epoch": 0.254581123427748, + "grad_norm": 0.48828125, + "learning_rate": 0.0016331635988513042, + "loss": 0.1426, + "step": 29328 + }, + { + "epoch": 0.25458980390795216, + "grad_norm": 0.0986328125, + "learning_rate": 0.001633139620702882, + "loss": 0.1289, + "step": 29329 + }, + { + "epoch": 0.25459848438815635, + "grad_norm": 0.45703125, + "learning_rate": 0.0016331156419714262, + "loss": 0.1006, + "step": 29330 + }, + { + "epoch": 0.2546071648683605, + "grad_norm": 0.88671875, + "learning_rate": 0.0016330916626569633, + "loss": 0.1143, + "step": 29331 + }, + { + "epoch": 0.2546158453485647, + "grad_norm": 0.76953125, + "learning_rate": 0.0016330676827595193, + "loss": 0.0898, + "step": 29332 + }, + { + "epoch": 0.2546245258287688, + "grad_norm": 0.18359375, + "learning_rate": 0.0016330437022791205, + "loss": 0.0957, + "step": 29333 + }, + { + "epoch": 0.254633206308973, + "grad_norm": 0.21875, + "learning_rate": 0.0016330197212157931, + "loss": 0.1108, + "step": 29334 + }, + { + "epoch": 0.25464188678917715, + "grad_norm": 0.380859375, + "learning_rate": 0.0016329957395695633, + "loss": 0.1113, + "step": 29335 + }, + { + "epoch": 0.25465056726938134, + "grad_norm": 0.1220703125, + "learning_rate": 0.0016329717573404575, + "loss": 0.1074, + "step": 29336 + }, + { + "epoch": 0.2546592477495855, + "grad_norm": 0.31640625, + "learning_rate": 0.0016329477745285015, + "loss": 0.1133, + "step": 29337 + }, + { + "epoch": 0.2546679282297897, + "grad_norm": 0.59375, + "learning_rate": 0.0016329237911337222, + "loss": 0.1152, + "step": 29338 + }, + { + "epoch": 0.2546766087099938, + "grad_norm": 0.470703125, + "learning_rate": 0.0016328998071561452, + "loss": 0.0898, + "step": 29339 + }, + { + "epoch": 0.254685289190198, + "grad_norm": 0.388671875, + "learning_rate": 0.001632875822595797, + "loss": 0.0874, + "step": 29340 + }, + { + "epoch": 0.25469396967040214, + "grad_norm": 0.140625, + "learning_rate": 0.001632851837452704, + "loss": 0.1328, + "step": 29341 + }, + { + "epoch": 0.25470265015060634, + "grad_norm": 0.177734375, + "learning_rate": 0.0016328278517268922, + "loss": 0.0967, + "step": 29342 + }, + { + "epoch": 0.2547113306308105, + "grad_norm": 0.490234375, + "learning_rate": 0.0016328038654183875, + "loss": 0.0752, + "step": 29343 + }, + { + "epoch": 0.25472001111101467, + "grad_norm": 0.2890625, + "learning_rate": 0.0016327798785272167, + "loss": 0.084, + "step": 29344 + }, + { + "epoch": 0.2547286915912188, + "grad_norm": 0.1796875, + "learning_rate": 
0.0016327558910534062, + "loss": 0.1055, + "step": 29345 + }, + { + "epoch": 0.254737372071423, + "grad_norm": 0.2197265625, + "learning_rate": 0.0016327319029969811, + "loss": 0.1235, + "step": 29346 + }, + { + "epoch": 0.25474605255162713, + "grad_norm": 0.30078125, + "learning_rate": 0.0016327079143579688, + "loss": 0.0635, + "step": 29347 + }, + { + "epoch": 0.2547547330318313, + "grad_norm": 0.72265625, + "learning_rate": 0.0016326839251363952, + "loss": 0.1426, + "step": 29348 + }, + { + "epoch": 0.25476341351203546, + "grad_norm": 0.1220703125, + "learning_rate": 0.0016326599353322867, + "loss": 0.1079, + "step": 29349 + }, + { + "epoch": 0.25477209399223966, + "grad_norm": 0.36328125, + "learning_rate": 0.0016326359449456691, + "loss": 0.1455, + "step": 29350 + }, + { + "epoch": 0.2547807744724438, + "grad_norm": 0.12109375, + "learning_rate": 0.0016326119539765687, + "loss": 0.0825, + "step": 29351 + }, + { + "epoch": 0.254789454952648, + "grad_norm": 0.310546875, + "learning_rate": 0.0016325879624250117, + "loss": 0.0913, + "step": 29352 + }, + { + "epoch": 0.2547981354328521, + "grad_norm": 0.44140625, + "learning_rate": 0.001632563970291025, + "loss": 0.0781, + "step": 29353 + }, + { + "epoch": 0.2548068159130563, + "grad_norm": 0.7890625, + "learning_rate": 0.0016325399775746342, + "loss": 0.1064, + "step": 29354 + }, + { + "epoch": 0.25481549639326045, + "grad_norm": 0.1708984375, + "learning_rate": 0.0016325159842758654, + "loss": 0.1196, + "step": 29355 + }, + { + "epoch": 0.25482417687346465, + "grad_norm": 0.53125, + "learning_rate": 0.001632491990394745, + "loss": 0.0889, + "step": 29356 + }, + { + "epoch": 0.2548328573536688, + "grad_norm": 0.90625, + "learning_rate": 0.0016324679959312997, + "loss": 0.1113, + "step": 29357 + }, + { + "epoch": 0.254841537833873, + "grad_norm": 0.0947265625, + "learning_rate": 0.0016324440008855553, + "loss": 0.1055, + "step": 29358 + }, + { + "epoch": 0.2548502183140771, + "grad_norm": 0.1103515625, + "learning_rate": 0.0016324200052575384, + "loss": 0.0967, + "step": 29359 + }, + { + "epoch": 0.2548588987942813, + "grad_norm": 0.625, + "learning_rate": 0.0016323960090472748, + "loss": 0.1328, + "step": 29360 + }, + { + "epoch": 0.25486757927448545, + "grad_norm": 0.26171875, + "learning_rate": 0.001632372012254791, + "loss": 0.0918, + "step": 29361 + }, + { + "epoch": 0.25487625975468964, + "grad_norm": 0.171875, + "learning_rate": 0.0016323480148801128, + "loss": 0.0933, + "step": 29362 + }, + { + "epoch": 0.2548849402348938, + "grad_norm": 0.55078125, + "learning_rate": 0.0016323240169232671, + "loss": 0.1338, + "step": 29363 + }, + { + "epoch": 0.25489362071509797, + "grad_norm": 0.08203125, + "learning_rate": 0.00163230001838428, + "loss": 0.0938, + "step": 29364 + }, + { + "epoch": 0.2549023011953021, + "grad_norm": 0.08984375, + "learning_rate": 0.0016322760192631774, + "loss": 0.0972, + "step": 29365 + }, + { + "epoch": 0.2549109816755063, + "grad_norm": 0.52734375, + "learning_rate": 0.0016322520195599856, + "loss": 0.1484, + "step": 29366 + }, + { + "epoch": 0.25491966215571044, + "grad_norm": 0.1015625, + "learning_rate": 0.0016322280192747314, + "loss": 0.1143, + "step": 29367 + }, + { + "epoch": 0.25492834263591463, + "grad_norm": 0.0791015625, + "learning_rate": 0.0016322040184074402, + "loss": 0.0903, + "step": 29368 + }, + { + "epoch": 0.25493702311611877, + "grad_norm": 0.26953125, + "learning_rate": 0.0016321800169581389, + "loss": 0.1016, + "step": 29369 + }, + { + "epoch": 0.25494570359632296, + "grad_norm": 
0.66796875, + "learning_rate": 0.0016321560149268534, + "loss": 0.1108, + "step": 29370 + }, + { + "epoch": 0.2549543840765271, + "grad_norm": 0.34375, + "learning_rate": 0.0016321320123136103, + "loss": 0.082, + "step": 29371 + }, + { + "epoch": 0.2549630645567313, + "grad_norm": 0.345703125, + "learning_rate": 0.0016321080091184356, + "loss": 0.0752, + "step": 29372 + }, + { + "epoch": 0.2549717450369354, + "grad_norm": 0.3828125, + "learning_rate": 0.0016320840053413552, + "loss": 0.1221, + "step": 29373 + }, + { + "epoch": 0.2549804255171396, + "grad_norm": 0.1240234375, + "learning_rate": 0.0016320600009823963, + "loss": 0.1543, + "step": 29374 + }, + { + "epoch": 0.25498910599734376, + "grad_norm": 0.09130859375, + "learning_rate": 0.001632035996041584, + "loss": 0.123, + "step": 29375 + }, + { + "epoch": 0.25499778647754795, + "grad_norm": 0.279296875, + "learning_rate": 0.0016320119905189453, + "loss": 0.1621, + "step": 29376 + }, + { + "epoch": 0.2550064669577521, + "grad_norm": 0.2041015625, + "learning_rate": 0.0016319879844145064, + "loss": 0.1074, + "step": 29377 + }, + { + "epoch": 0.2550151474379563, + "grad_norm": 0.2265625, + "learning_rate": 0.0016319639777282935, + "loss": 0.1006, + "step": 29378 + }, + { + "epoch": 0.2550238279181604, + "grad_norm": 0.330078125, + "learning_rate": 0.001631939970460333, + "loss": 0.0996, + "step": 29379 + }, + { + "epoch": 0.2550325083983646, + "grad_norm": 0.423828125, + "learning_rate": 0.0016319159626106505, + "loss": 0.1006, + "step": 29380 + }, + { + "epoch": 0.25504118887856875, + "grad_norm": 0.134765625, + "learning_rate": 0.001631891954179273, + "loss": 0.0908, + "step": 29381 + }, + { + "epoch": 0.25504986935877294, + "grad_norm": 0.67578125, + "learning_rate": 0.0016318679451662263, + "loss": 0.1172, + "step": 29382 + }, + { + "epoch": 0.2550585498389771, + "grad_norm": 0.66796875, + "learning_rate": 0.0016318439355715365, + "loss": 0.1543, + "step": 29383 + }, + { + "epoch": 0.25506723031918127, + "grad_norm": 0.29296875, + "learning_rate": 0.0016318199253952306, + "loss": 0.1035, + "step": 29384 + }, + { + "epoch": 0.2550759107993854, + "grad_norm": 0.11865234375, + "learning_rate": 0.0016317959146373343, + "loss": 0.1719, + "step": 29385 + }, + { + "epoch": 0.2550845912795896, + "grad_norm": 0.318359375, + "learning_rate": 0.001631771903297874, + "loss": 0.1182, + "step": 29386 + }, + { + "epoch": 0.25509327175979374, + "grad_norm": 0.58203125, + "learning_rate": 0.0016317478913768758, + "loss": 0.1001, + "step": 29387 + }, + { + "epoch": 0.25510195223999793, + "grad_norm": 0.365234375, + "learning_rate": 0.0016317238788743661, + "loss": 0.1133, + "step": 29388 + }, + { + "epoch": 0.25511063272020207, + "grad_norm": 0.2314453125, + "learning_rate": 0.0016316998657903714, + "loss": 0.1172, + "step": 29389 + }, + { + "epoch": 0.25511931320040626, + "grad_norm": 0.396484375, + "learning_rate": 0.0016316758521249173, + "loss": 0.1777, + "step": 29390 + }, + { + "epoch": 0.2551279936806104, + "grad_norm": 0.09130859375, + "learning_rate": 0.0016316518378780309, + "loss": 0.1011, + "step": 29391 + }, + { + "epoch": 0.2551366741608146, + "grad_norm": 0.58203125, + "learning_rate": 0.0016316278230497378, + "loss": 0.1348, + "step": 29392 + }, + { + "epoch": 0.25514535464101873, + "grad_norm": 0.0947265625, + "learning_rate": 0.0016316038076400648, + "loss": 0.0928, + "step": 29393 + }, + { + "epoch": 0.2551540351212229, + "grad_norm": 0.396484375, + "learning_rate": 0.0016315797916490375, + "loss": 0.1216, + "step": 29394 + }, + { 
+ "epoch": 0.25516271560142706, + "grad_norm": 0.73828125, + "learning_rate": 0.0016315557750766828, + "loss": 0.1133, + "step": 29395 + }, + { + "epoch": 0.25517139608163125, + "grad_norm": 0.3203125, + "learning_rate": 0.0016315317579230265, + "loss": 0.1484, + "step": 29396 + }, + { + "epoch": 0.2551800765618354, + "grad_norm": 0.111328125, + "learning_rate": 0.0016315077401880954, + "loss": 0.1152, + "step": 29397 + }, + { + "epoch": 0.2551887570420396, + "grad_norm": 0.470703125, + "learning_rate": 0.0016314837218719149, + "loss": 0.1055, + "step": 29398 + }, + { + "epoch": 0.2551974375222437, + "grad_norm": 0.1328125, + "learning_rate": 0.001631459702974512, + "loss": 0.0723, + "step": 29399 + }, + { + "epoch": 0.2552061180024479, + "grad_norm": 0.984375, + "learning_rate": 0.0016314356834959125, + "loss": 0.1016, + "step": 29400 + }, + { + "epoch": 0.25521479848265205, + "grad_norm": 0.251953125, + "learning_rate": 0.0016314116634361434, + "loss": 0.125, + "step": 29401 + }, + { + "epoch": 0.25522347896285624, + "grad_norm": 0.390625, + "learning_rate": 0.0016313876427952304, + "loss": 0.1074, + "step": 29402 + }, + { + "epoch": 0.2552321594430604, + "grad_norm": 0.173828125, + "learning_rate": 0.0016313636215731994, + "loss": 0.1128, + "step": 29403 + }, + { + "epoch": 0.2552408399232646, + "grad_norm": 0.412109375, + "learning_rate": 0.0016313395997700777, + "loss": 0.124, + "step": 29404 + }, + { + "epoch": 0.2552495204034687, + "grad_norm": 0.310546875, + "learning_rate": 0.0016313155773858908, + "loss": 0.1367, + "step": 29405 + }, + { + "epoch": 0.2552582008836729, + "grad_norm": 0.09716796875, + "learning_rate": 0.0016312915544206652, + "loss": 0.085, + "step": 29406 + }, + { + "epoch": 0.25526688136387704, + "grad_norm": 0.1513671875, + "learning_rate": 0.0016312675308744271, + "loss": 0.1514, + "step": 29407 + }, + { + "epoch": 0.25527556184408123, + "grad_norm": 0.578125, + "learning_rate": 0.0016312435067472027, + "loss": 0.1309, + "step": 29408 + }, + { + "epoch": 0.25528424232428537, + "grad_norm": 0.97265625, + "learning_rate": 0.0016312194820390185, + "loss": 0.1494, + "step": 29409 + }, + { + "epoch": 0.25529292280448956, + "grad_norm": 0.279296875, + "learning_rate": 0.0016311954567499007, + "loss": 0.1094, + "step": 29410 + }, + { + "epoch": 0.2553016032846937, + "grad_norm": 0.31640625, + "learning_rate": 0.0016311714308798752, + "loss": 0.1069, + "step": 29411 + }, + { + "epoch": 0.2553102837648979, + "grad_norm": 0.1435546875, + "learning_rate": 0.001631147404428969, + "loss": 0.0977, + "step": 29412 + }, + { + "epoch": 0.25531896424510203, + "grad_norm": 0.7578125, + "learning_rate": 0.0016311233773972078, + "loss": 0.1328, + "step": 29413 + }, + { + "epoch": 0.2553276447253062, + "grad_norm": 0.8359375, + "learning_rate": 0.0016310993497846184, + "loss": 0.127, + "step": 29414 + }, + { + "epoch": 0.25533632520551036, + "grad_norm": 0.126953125, + "learning_rate": 0.0016310753215912261, + "loss": 0.1494, + "step": 29415 + }, + { + "epoch": 0.25534500568571455, + "grad_norm": 0.12890625, + "learning_rate": 0.001631051292817058, + "loss": 0.0889, + "step": 29416 + }, + { + "epoch": 0.2553536861659187, + "grad_norm": 0.296875, + "learning_rate": 0.0016310272634621405, + "loss": 0.0884, + "step": 29417 + }, + { + "epoch": 0.2553623666461229, + "grad_norm": 0.25390625, + "learning_rate": 0.0016310032335264996, + "loss": 0.1055, + "step": 29418 + }, + { + "epoch": 0.255371047126327, + "grad_norm": 0.1689453125, + "learning_rate": 0.0016309792030101613, + "loss": 
0.1299, + "step": 29419 + }, + { + "epoch": 0.2553797276065312, + "grad_norm": 0.279296875, + "learning_rate": 0.001630955171913152, + "loss": 0.0879, + "step": 29420 + }, + { + "epoch": 0.25538840808673535, + "grad_norm": 0.1572265625, + "learning_rate": 0.0016309311402354983, + "loss": 0.0928, + "step": 29421 + }, + { + "epoch": 0.25539708856693955, + "grad_norm": 0.58203125, + "learning_rate": 0.001630907107977226, + "loss": 0.1108, + "step": 29422 + }, + { + "epoch": 0.2554057690471437, + "grad_norm": 0.1201171875, + "learning_rate": 0.0016308830751383622, + "loss": 0.1187, + "step": 29423 + }, + { + "epoch": 0.2554144495273479, + "grad_norm": 0.44140625, + "learning_rate": 0.0016308590417189323, + "loss": 0.0874, + "step": 29424 + }, + { + "epoch": 0.255423130007552, + "grad_norm": 0.1611328125, + "learning_rate": 0.0016308350077189626, + "loss": 0.1035, + "step": 29425 + }, + { + "epoch": 0.2554318104877562, + "grad_norm": 0.1474609375, + "learning_rate": 0.00163081097313848, + "loss": 0.1172, + "step": 29426 + }, + { + "epoch": 0.25544049096796034, + "grad_norm": 0.25390625, + "learning_rate": 0.001630786937977511, + "loss": 0.1445, + "step": 29427 + }, + { + "epoch": 0.25544917144816454, + "grad_norm": 0.171875, + "learning_rate": 0.001630762902236081, + "loss": 0.0806, + "step": 29428 + }, + { + "epoch": 0.2554578519283687, + "grad_norm": 0.07763671875, + "learning_rate": 0.0016307388659142167, + "loss": 0.0713, + "step": 29429 + }, + { + "epoch": 0.25546653240857287, + "grad_norm": 0.68359375, + "learning_rate": 0.001630714829011944, + "loss": 0.1123, + "step": 29430 + }, + { + "epoch": 0.255475212888777, + "grad_norm": 0.205078125, + "learning_rate": 0.00163069079152929, + "loss": 0.1348, + "step": 29431 + }, + { + "epoch": 0.2554838933689812, + "grad_norm": 0.1533203125, + "learning_rate": 0.0016306667534662803, + "loss": 0.0801, + "step": 29432 + }, + { + "epoch": 0.25549257384918533, + "grad_norm": 0.3203125, + "learning_rate": 0.0016306427148229411, + "loss": 0.0957, + "step": 29433 + }, + { + "epoch": 0.2555012543293895, + "grad_norm": 0.25, + "learning_rate": 0.0016306186755992995, + "loss": 0.1172, + "step": 29434 + }, + { + "epoch": 0.25550993480959366, + "grad_norm": 0.1591796875, + "learning_rate": 0.0016305946357953812, + "loss": 0.1123, + "step": 29435 + }, + { + "epoch": 0.25551861528979786, + "grad_norm": 0.1875, + "learning_rate": 0.0016305705954112128, + "loss": 0.1006, + "step": 29436 + }, + { + "epoch": 0.255527295770002, + "grad_norm": 0.1796875, + "learning_rate": 0.00163054655444682, + "loss": 0.1309, + "step": 29437 + }, + { + "epoch": 0.2555359762502062, + "grad_norm": 0.5859375, + "learning_rate": 0.0016305225129022293, + "loss": 0.1309, + "step": 29438 + }, + { + "epoch": 0.2555446567304103, + "grad_norm": 0.58984375, + "learning_rate": 0.0016304984707774673, + "loss": 0.0918, + "step": 29439 + }, + { + "epoch": 0.2555533372106145, + "grad_norm": 0.83203125, + "learning_rate": 0.0016304744280725603, + "loss": 0.1035, + "step": 29440 + }, + { + "epoch": 0.25556201769081865, + "grad_norm": 0.1806640625, + "learning_rate": 0.0016304503847875344, + "loss": 0.0938, + "step": 29441 + }, + { + "epoch": 0.25557069817102285, + "grad_norm": 0.208984375, + "learning_rate": 0.0016304263409224159, + "loss": 0.1328, + "step": 29442 + }, + { + "epoch": 0.255579378651227, + "grad_norm": 0.43359375, + "learning_rate": 0.0016304022964772312, + "loss": 0.1094, + "step": 29443 + }, + { + "epoch": 0.2555880591314312, + "grad_norm": 0.474609375, + "learning_rate": 
0.0016303782514520062, + "loss": 0.1289, + "step": 29444 + }, + { + "epoch": 0.2555967396116353, + "grad_norm": 0.83984375, + "learning_rate": 0.0016303542058467679, + "loss": 0.1206, + "step": 29445 + }, + { + "epoch": 0.2556054200918395, + "grad_norm": 0.11572265625, + "learning_rate": 0.0016303301596615422, + "loss": 0.1006, + "step": 29446 + }, + { + "epoch": 0.25561410057204365, + "grad_norm": 0.173828125, + "learning_rate": 0.001630306112896355, + "loss": 0.167, + "step": 29447 + }, + { + "epoch": 0.2556227810522478, + "grad_norm": 0.2099609375, + "learning_rate": 0.001630282065551233, + "loss": 0.0986, + "step": 29448 + }, + { + "epoch": 0.255631461532452, + "grad_norm": 0.2080078125, + "learning_rate": 0.0016302580176262031, + "loss": 0.0815, + "step": 29449 + }, + { + "epoch": 0.2556401420126561, + "grad_norm": 0.2255859375, + "learning_rate": 0.0016302339691212906, + "loss": 0.1211, + "step": 29450 + }, + { + "epoch": 0.2556488224928603, + "grad_norm": 0.103515625, + "learning_rate": 0.0016302099200365223, + "loss": 0.1279, + "step": 29451 + }, + { + "epoch": 0.25565750297306444, + "grad_norm": 0.12060546875, + "learning_rate": 0.0016301858703719244, + "loss": 0.0825, + "step": 29452 + }, + { + "epoch": 0.25566618345326864, + "grad_norm": 0.1220703125, + "learning_rate": 0.001630161820127523, + "loss": 0.0967, + "step": 29453 + }, + { + "epoch": 0.2556748639334728, + "grad_norm": 0.259765625, + "learning_rate": 0.0016301377693033448, + "loss": 0.0952, + "step": 29454 + }, + { + "epoch": 0.25568354441367697, + "grad_norm": 0.1513671875, + "learning_rate": 0.001630113717899416, + "loss": 0.1074, + "step": 29455 + }, + { + "epoch": 0.2556922248938811, + "grad_norm": 0.58203125, + "learning_rate": 0.0016300896659157628, + "loss": 0.125, + "step": 29456 + }, + { + "epoch": 0.2557009053740853, + "grad_norm": 3.171875, + "learning_rate": 0.0016300656133524112, + "loss": 0.4102, + "step": 29457 + }, + { + "epoch": 0.25570958585428943, + "grad_norm": 0.431640625, + "learning_rate": 0.001630041560209388, + "loss": 0.0952, + "step": 29458 + }, + { + "epoch": 0.2557182663344936, + "grad_norm": 1.234375, + "learning_rate": 0.0016300175064867194, + "loss": 0.0854, + "step": 29459 + }, + { + "epoch": 0.25572694681469776, + "grad_norm": 0.162109375, + "learning_rate": 0.0016299934521844313, + "loss": 0.0879, + "step": 29460 + }, + { + "epoch": 0.25573562729490196, + "grad_norm": 0.07177734375, + "learning_rate": 0.0016299693973025507, + "loss": 0.0742, + "step": 29461 + }, + { + "epoch": 0.2557443077751061, + "grad_norm": 0.3359375, + "learning_rate": 0.0016299453418411032, + "loss": 0.1182, + "step": 29462 + }, + { + "epoch": 0.2557529882553103, + "grad_norm": 0.427734375, + "learning_rate": 0.0016299212858001157, + "loss": 0.1338, + "step": 29463 + }, + { + "epoch": 0.2557616687355144, + "grad_norm": 0.435546875, + "learning_rate": 0.0016298972291796138, + "loss": 0.0947, + "step": 29464 + }, + { + "epoch": 0.2557703492157186, + "grad_norm": 0.1455078125, + "learning_rate": 0.001629873171979625, + "loss": 0.0991, + "step": 29465 + }, + { + "epoch": 0.25577902969592275, + "grad_norm": 0.34765625, + "learning_rate": 0.0016298491142001745, + "loss": 0.1177, + "step": 29466 + }, + { + "epoch": 0.25578771017612695, + "grad_norm": 0.416015625, + "learning_rate": 0.0016298250558412887, + "loss": 0.1069, + "step": 29467 + }, + { + "epoch": 0.2557963906563311, + "grad_norm": 0.6796875, + "learning_rate": 0.0016298009969029942, + "loss": 0.1235, + "step": 29468 + }, + { + "epoch": 0.2558050711365353, + 
"grad_norm": 0.1943359375, + "learning_rate": 0.0016297769373853177, + "loss": 0.1523, + "step": 29469 + }, + { + "epoch": 0.2558137516167394, + "grad_norm": 0.236328125, + "learning_rate": 0.001629752877288285, + "loss": 0.1221, + "step": 29470 + }, + { + "epoch": 0.2558224320969436, + "grad_norm": 0.1630859375, + "learning_rate": 0.0016297288166119224, + "loss": 0.0747, + "step": 29471 + }, + { + "epoch": 0.25583111257714775, + "grad_norm": 0.10107421875, + "learning_rate": 0.0016297047553562563, + "loss": 0.1104, + "step": 29472 + }, + { + "epoch": 0.25583979305735194, + "grad_norm": 0.154296875, + "learning_rate": 0.0016296806935213132, + "loss": 0.0967, + "step": 29473 + }, + { + "epoch": 0.2558484735375561, + "grad_norm": 0.56640625, + "learning_rate": 0.001629656631107119, + "loss": 0.1426, + "step": 29474 + }, + { + "epoch": 0.25585715401776027, + "grad_norm": 0.09326171875, + "learning_rate": 0.0016296325681137004, + "loss": 0.1235, + "step": 29475 + }, + { + "epoch": 0.2558658344979644, + "grad_norm": 0.43359375, + "learning_rate": 0.0016296085045410839, + "loss": 0.0933, + "step": 29476 + }, + { + "epoch": 0.2558745149781686, + "grad_norm": 0.193359375, + "learning_rate": 0.0016295844403892948, + "loss": 0.1338, + "step": 29477 + }, + { + "epoch": 0.25588319545837274, + "grad_norm": 0.369140625, + "learning_rate": 0.0016295603756583605, + "loss": 0.1377, + "step": 29478 + }, + { + "epoch": 0.25589187593857693, + "grad_norm": 0.43359375, + "learning_rate": 0.001629536310348307, + "loss": 0.1387, + "step": 29479 + }, + { + "epoch": 0.25590055641878107, + "grad_norm": 0.173828125, + "learning_rate": 0.0016295122444591604, + "loss": 0.1221, + "step": 29480 + }, + { + "epoch": 0.25590923689898526, + "grad_norm": 0.3125, + "learning_rate": 0.0016294881779909472, + "loss": 0.0898, + "step": 29481 + }, + { + "epoch": 0.2559179173791894, + "grad_norm": 0.4140625, + "learning_rate": 0.0016294641109436936, + "loss": 0.0845, + "step": 29482 + }, + { + "epoch": 0.2559265978593936, + "grad_norm": 0.474609375, + "learning_rate": 0.001629440043317426, + "loss": 0.0908, + "step": 29483 + }, + { + "epoch": 0.2559352783395977, + "grad_norm": 0.291015625, + "learning_rate": 0.001629415975112171, + "loss": 0.1143, + "step": 29484 + }, + { + "epoch": 0.2559439588198019, + "grad_norm": 0.4921875, + "learning_rate": 0.0016293919063279545, + "loss": 0.1172, + "step": 29485 + }, + { + "epoch": 0.25595263930000606, + "grad_norm": 0.42578125, + "learning_rate": 0.001629367836964803, + "loss": 0.1152, + "step": 29486 + }, + { + "epoch": 0.25596131978021025, + "grad_norm": 0.21484375, + "learning_rate": 0.0016293437670227427, + "loss": 0.085, + "step": 29487 + }, + { + "epoch": 0.2559700002604144, + "grad_norm": 0.07080078125, + "learning_rate": 0.0016293196965018, + "loss": 0.1064, + "step": 29488 + }, + { + "epoch": 0.2559786807406186, + "grad_norm": 0.359375, + "learning_rate": 0.0016292956254020012, + "loss": 0.1416, + "step": 29489 + }, + { + "epoch": 0.2559873612208227, + "grad_norm": 0.4765625, + "learning_rate": 0.001629271553723373, + "loss": 0.1084, + "step": 29490 + }, + { + "epoch": 0.2559960417010269, + "grad_norm": 0.11279296875, + "learning_rate": 0.001629247481465941, + "loss": 0.1279, + "step": 29491 + }, + { + "epoch": 0.25600472218123105, + "grad_norm": 0.439453125, + "learning_rate": 0.001629223408629732, + "loss": 0.127, + "step": 29492 + }, + { + "epoch": 0.25601340266143524, + "grad_norm": 0.5234375, + "learning_rate": 0.0016291993352147724, + "loss": 0.0781, + "step": 29493 + }, + { + 
"epoch": 0.2560220831416394, + "grad_norm": 0.3984375, + "learning_rate": 0.001629175261221088, + "loss": 0.1533, + "step": 29494 + }, + { + "epoch": 0.25603076362184357, + "grad_norm": 0.1162109375, + "learning_rate": 0.001629151186648706, + "loss": 0.1099, + "step": 29495 + }, + { + "epoch": 0.2560394441020477, + "grad_norm": 0.189453125, + "learning_rate": 0.0016291271114976516, + "loss": 0.1406, + "step": 29496 + }, + { + "epoch": 0.2560481245822519, + "grad_norm": 0.25, + "learning_rate": 0.0016291030357679522, + "loss": 0.0879, + "step": 29497 + }, + { + "epoch": 0.25605680506245604, + "grad_norm": 0.11474609375, + "learning_rate": 0.0016290789594596334, + "loss": 0.1055, + "step": 29498 + }, + { + "epoch": 0.25606548554266023, + "grad_norm": 0.162109375, + "learning_rate": 0.0016290548825727217, + "loss": 0.1016, + "step": 29499 + }, + { + "epoch": 0.25607416602286437, + "grad_norm": 0.255859375, + "learning_rate": 0.0016290308051072437, + "loss": 0.1055, + "step": 29500 + }, + { + "epoch": 0.25608284650306856, + "grad_norm": 0.255859375, + "learning_rate": 0.001629006727063226, + "loss": 0.1084, + "step": 29501 + }, + { + "epoch": 0.2560915269832727, + "grad_norm": 0.10107421875, + "learning_rate": 0.0016289826484406939, + "loss": 0.1172, + "step": 29502 + }, + { + "epoch": 0.2561002074634769, + "grad_norm": 0.486328125, + "learning_rate": 0.0016289585692396745, + "loss": 0.1035, + "step": 29503 + }, + { + "epoch": 0.25610888794368103, + "grad_norm": 0.158203125, + "learning_rate": 0.0016289344894601938, + "loss": 0.1367, + "step": 29504 + }, + { + "epoch": 0.2561175684238852, + "grad_norm": 0.255859375, + "learning_rate": 0.0016289104091022787, + "loss": 0.1455, + "step": 29505 + }, + { + "epoch": 0.25612624890408936, + "grad_norm": 0.416015625, + "learning_rate": 0.0016288863281659549, + "loss": 0.0923, + "step": 29506 + }, + { + "epoch": 0.25613492938429355, + "grad_norm": 0.1435546875, + "learning_rate": 0.001628862246651249, + "loss": 0.0942, + "step": 29507 + }, + { + "epoch": 0.2561436098644977, + "grad_norm": 0.458984375, + "learning_rate": 0.001628838164558187, + "loss": 0.0762, + "step": 29508 + }, + { + "epoch": 0.2561522903447019, + "grad_norm": 0.1015625, + "learning_rate": 0.001628814081886796, + "loss": 0.0938, + "step": 29509 + }, + { + "epoch": 0.256160970824906, + "grad_norm": 0.10595703125, + "learning_rate": 0.0016287899986371015, + "loss": 0.1113, + "step": 29510 + }, + { + "epoch": 0.2561696513051102, + "grad_norm": 0.08642578125, + "learning_rate": 0.0016287659148091305, + "loss": 0.0996, + "step": 29511 + }, + { + "epoch": 0.25617833178531435, + "grad_norm": 0.6015625, + "learning_rate": 0.0016287418304029087, + "loss": 0.0889, + "step": 29512 + }, + { + "epoch": 0.25618701226551854, + "grad_norm": 0.087890625, + "learning_rate": 0.001628717745418463, + "loss": 0.1191, + "step": 29513 + }, + { + "epoch": 0.2561956927457227, + "grad_norm": 0.326171875, + "learning_rate": 0.0016286936598558196, + "loss": 0.1011, + "step": 29514 + }, + { + "epoch": 0.2562043732259269, + "grad_norm": 0.400390625, + "learning_rate": 0.0016286695737150047, + "loss": 0.0928, + "step": 29515 + }, + { + "epoch": 0.256213053706131, + "grad_norm": 0.65234375, + "learning_rate": 0.0016286454869960444, + "loss": 0.1055, + "step": 29516 + }, + { + "epoch": 0.2562217341863352, + "grad_norm": 0.6328125, + "learning_rate": 0.0016286213996989656, + "loss": 0.1719, + "step": 29517 + }, + { + "epoch": 0.25623041466653934, + "grad_norm": 0.1875, + "learning_rate": 0.0016285973118237945, + 
"loss": 0.0972, + "step": 29518 + }, + { + "epoch": 0.25623909514674353, + "grad_norm": 0.1962890625, + "learning_rate": 0.0016285732233705572, + "loss": 0.1387, + "step": 29519 + }, + { + "epoch": 0.25624777562694767, + "grad_norm": 0.185546875, + "learning_rate": 0.00162854913433928, + "loss": 0.1328, + "step": 29520 + }, + { + "epoch": 0.25625645610715186, + "grad_norm": 0.52734375, + "learning_rate": 0.0016285250447299893, + "loss": 0.1143, + "step": 29521 + }, + { + "epoch": 0.256265136587356, + "grad_norm": 0.27734375, + "learning_rate": 0.001628500954542712, + "loss": 0.1309, + "step": 29522 + }, + { + "epoch": 0.2562738170675602, + "grad_norm": 0.125, + "learning_rate": 0.0016284768637774738, + "loss": 0.1064, + "step": 29523 + }, + { + "epoch": 0.25628249754776433, + "grad_norm": 0.080078125, + "learning_rate": 0.0016284527724343013, + "loss": 0.0825, + "step": 29524 + }, + { + "epoch": 0.2562911780279685, + "grad_norm": 0.0986328125, + "learning_rate": 0.001628428680513221, + "loss": 0.0986, + "step": 29525 + }, + { + "epoch": 0.25629985850817266, + "grad_norm": 0.81640625, + "learning_rate": 0.0016284045880142586, + "loss": 0.0977, + "step": 29526 + }, + { + "epoch": 0.25630853898837686, + "grad_norm": 0.1123046875, + "learning_rate": 0.001628380494937441, + "loss": 0.125, + "step": 29527 + }, + { + "epoch": 0.256317219468581, + "grad_norm": 0.1552734375, + "learning_rate": 0.0016283564012827942, + "loss": 0.1113, + "step": 29528 + }, + { + "epoch": 0.2563258999487852, + "grad_norm": 0.63671875, + "learning_rate": 0.0016283323070503451, + "loss": 0.1172, + "step": 29529 + }, + { + "epoch": 0.2563345804289893, + "grad_norm": 0.5703125, + "learning_rate": 0.0016283082122401198, + "loss": 0.1328, + "step": 29530 + }, + { + "epoch": 0.2563432609091935, + "grad_norm": 0.2333984375, + "learning_rate": 0.0016282841168521445, + "loss": 0.1182, + "step": 29531 + }, + { + "epoch": 0.25635194138939765, + "grad_norm": 0.25390625, + "learning_rate": 0.0016282600208864459, + "loss": 0.1514, + "step": 29532 + }, + { + "epoch": 0.25636062186960185, + "grad_norm": 0.0732421875, + "learning_rate": 0.0016282359243430494, + "loss": 0.082, + "step": 29533 + }, + { + "epoch": 0.256369302349806, + "grad_norm": 0.1689453125, + "learning_rate": 0.0016282118272219824, + "loss": 0.103, + "step": 29534 + }, + { + "epoch": 0.2563779828300102, + "grad_norm": 0.080078125, + "learning_rate": 0.0016281877295232707, + "loss": 0.0952, + "step": 29535 + }, + { + "epoch": 0.2563866633102143, + "grad_norm": 0.09033203125, + "learning_rate": 0.0016281636312469412, + "loss": 0.1201, + "step": 29536 + }, + { + "epoch": 0.2563953437904185, + "grad_norm": 0.1865234375, + "learning_rate": 0.0016281395323930197, + "loss": 0.1118, + "step": 29537 + }, + { + "epoch": 0.25640402427062264, + "grad_norm": 0.240234375, + "learning_rate": 0.0016281154329615326, + "loss": 0.1216, + "step": 29538 + }, + { + "epoch": 0.25641270475082684, + "grad_norm": 0.140625, + "learning_rate": 0.0016280913329525064, + "loss": 0.0938, + "step": 29539 + }, + { + "epoch": 0.256421385231031, + "grad_norm": 1.1796875, + "learning_rate": 0.0016280672323659678, + "loss": 0.1289, + "step": 29540 + }, + { + "epoch": 0.25643006571123517, + "grad_norm": 0.7890625, + "learning_rate": 0.0016280431312019426, + "loss": 0.0923, + "step": 29541 + }, + { + "epoch": 0.2564387461914393, + "grad_norm": 0.11865234375, + "learning_rate": 0.0016280190294604574, + "loss": 0.0967, + "step": 29542 + }, + { + "epoch": 0.2564474266716435, + "grad_norm": 0.068359375, + 
"learning_rate": 0.0016279949271415385, + "loss": 0.0923, + "step": 29543 + }, + { + "epoch": 0.25645610715184763, + "grad_norm": 0.40625, + "learning_rate": 0.001627970824245212, + "loss": 0.0781, + "step": 29544 + }, + { + "epoch": 0.2564647876320518, + "grad_norm": 0.48828125, + "learning_rate": 0.0016279467207715049, + "loss": 0.0942, + "step": 29545 + }, + { + "epoch": 0.25647346811225596, + "grad_norm": 0.380859375, + "learning_rate": 0.0016279226167204431, + "loss": 0.1191, + "step": 29546 + }, + { + "epoch": 0.25648214859246016, + "grad_norm": 0.265625, + "learning_rate": 0.001627898512092053, + "loss": 0.1465, + "step": 29547 + }, + { + "epoch": 0.2564908290726643, + "grad_norm": 0.220703125, + "learning_rate": 0.001627874406886361, + "loss": 0.124, + "step": 29548 + }, + { + "epoch": 0.2564995095528685, + "grad_norm": 0.703125, + "learning_rate": 0.0016278503011033937, + "loss": 0.1553, + "step": 29549 + }, + { + "epoch": 0.2565081900330726, + "grad_norm": 0.462890625, + "learning_rate": 0.001627826194743177, + "loss": 0.1123, + "step": 29550 + }, + { + "epoch": 0.2565168705132768, + "grad_norm": 0.09716796875, + "learning_rate": 0.0016278020878057375, + "loss": 0.1162, + "step": 29551 + }, + { + "epoch": 0.25652555099348096, + "grad_norm": 0.349609375, + "learning_rate": 0.001627777980291102, + "loss": 0.0752, + "step": 29552 + }, + { + "epoch": 0.25653423147368515, + "grad_norm": 0.2197265625, + "learning_rate": 0.0016277538721992957, + "loss": 0.0845, + "step": 29553 + }, + { + "epoch": 0.2565429119538893, + "grad_norm": 0.2265625, + "learning_rate": 0.0016277297635303462, + "loss": 0.0771, + "step": 29554 + }, + { + "epoch": 0.2565515924340935, + "grad_norm": 0.0966796875, + "learning_rate": 0.0016277056542842794, + "loss": 0.0996, + "step": 29555 + }, + { + "epoch": 0.2565602729142976, + "grad_norm": 0.09521484375, + "learning_rate": 0.0016276815444611215, + "loss": 0.0747, + "step": 29556 + }, + { + "epoch": 0.2565689533945018, + "grad_norm": 0.0771484375, + "learning_rate": 0.001627657434060899, + "loss": 0.1118, + "step": 29557 + }, + { + "epoch": 0.25657763387470595, + "grad_norm": 0.134765625, + "learning_rate": 0.001627633323083638, + "loss": 0.0791, + "step": 29558 + }, + { + "epoch": 0.25658631435491014, + "grad_norm": 0.423828125, + "learning_rate": 0.0016276092115293653, + "loss": 0.0811, + "step": 29559 + }, + { + "epoch": 0.2565949948351143, + "grad_norm": 0.7265625, + "learning_rate": 0.0016275850993981071, + "loss": 0.1177, + "step": 29560 + }, + { + "epoch": 0.25660367531531847, + "grad_norm": 0.35546875, + "learning_rate": 0.00162756098668989, + "loss": 0.084, + "step": 29561 + }, + { + "epoch": 0.2566123557955226, + "grad_norm": 0.298828125, + "learning_rate": 0.0016275368734047402, + "loss": 0.0703, + "step": 29562 + }, + { + "epoch": 0.2566210362757268, + "grad_norm": 0.57421875, + "learning_rate": 0.0016275127595426833, + "loss": 0.1113, + "step": 29563 + }, + { + "epoch": 0.25662971675593094, + "grad_norm": 0.318359375, + "learning_rate": 0.001627488645103747, + "loss": 0.1167, + "step": 29564 + }, + { + "epoch": 0.25663839723613513, + "grad_norm": 0.419921875, + "learning_rate": 0.0016274645300879572, + "loss": 0.1074, + "step": 29565 + }, + { + "epoch": 0.25664707771633927, + "grad_norm": 0.193359375, + "learning_rate": 0.0016274404144953395, + "loss": 0.0869, + "step": 29566 + }, + { + "epoch": 0.25665575819654346, + "grad_norm": 0.150390625, + "learning_rate": 0.001627416298325921, + "loss": 0.1104, + "step": 29567 + }, + { + "epoch": 
0.2566644386767476, + "grad_norm": 0.080078125, + "learning_rate": 0.0016273921815797283, + "loss": 0.0742, + "step": 29568 + }, + { + "epoch": 0.2566731191569518, + "grad_norm": 0.10400390625, + "learning_rate": 0.001627368064256787, + "loss": 0.0981, + "step": 29569 + }, + { + "epoch": 0.2566817996371559, + "grad_norm": 0.146484375, + "learning_rate": 0.0016273439463571245, + "loss": 0.1143, + "step": 29570 + }, + { + "epoch": 0.25669048011736006, + "grad_norm": 0.703125, + "learning_rate": 0.0016273198278807662, + "loss": 0.0874, + "step": 29571 + }, + { + "epoch": 0.25669916059756426, + "grad_norm": 0.353515625, + "learning_rate": 0.0016272957088277389, + "loss": 0.1064, + "step": 29572 + }, + { + "epoch": 0.2567078410777684, + "grad_norm": 0.333984375, + "learning_rate": 0.0016272715891980688, + "loss": 0.0938, + "step": 29573 + }, + { + "epoch": 0.2567165215579726, + "grad_norm": 0.9453125, + "learning_rate": 0.0016272474689917828, + "loss": 0.1104, + "step": 29574 + }, + { + "epoch": 0.2567252020381767, + "grad_norm": 0.177734375, + "learning_rate": 0.0016272233482089065, + "loss": 0.1377, + "step": 29575 + }, + { + "epoch": 0.2567338825183809, + "grad_norm": 0.26171875, + "learning_rate": 0.0016271992268494667, + "loss": 0.127, + "step": 29576 + }, + { + "epoch": 0.25674256299858506, + "grad_norm": 0.12060546875, + "learning_rate": 0.00162717510491349, + "loss": 0.127, + "step": 29577 + }, + { + "epoch": 0.25675124347878925, + "grad_norm": 0.38671875, + "learning_rate": 0.0016271509824010025, + "loss": 0.124, + "step": 29578 + }, + { + "epoch": 0.2567599239589934, + "grad_norm": 0.07470703125, + "learning_rate": 0.0016271268593120306, + "loss": 0.084, + "step": 29579 + }, + { + "epoch": 0.2567686044391976, + "grad_norm": 0.265625, + "learning_rate": 0.0016271027356466003, + "loss": 0.0918, + "step": 29580 + }, + { + "epoch": 0.2567772849194017, + "grad_norm": 0.89453125, + "learning_rate": 0.0016270786114047386, + "loss": 0.0957, + "step": 29581 + }, + { + "epoch": 0.2567859653996059, + "grad_norm": 0.2001953125, + "learning_rate": 0.0016270544865864718, + "loss": 0.127, + "step": 29582 + }, + { + "epoch": 0.25679464587981005, + "grad_norm": 0.2275390625, + "learning_rate": 0.001627030361191826, + "loss": 0.1475, + "step": 29583 + }, + { + "epoch": 0.25680332636001424, + "grad_norm": 0.10498046875, + "learning_rate": 0.0016270062352208277, + "loss": 0.1406, + "step": 29584 + }, + { + "epoch": 0.2568120068402184, + "grad_norm": 0.251953125, + "learning_rate": 0.0016269821086735033, + "loss": 0.0635, + "step": 29585 + }, + { + "epoch": 0.25682068732042257, + "grad_norm": 0.2373046875, + "learning_rate": 0.0016269579815498792, + "loss": 0.103, + "step": 29586 + }, + { + "epoch": 0.2568293678006267, + "grad_norm": 0.7265625, + "learning_rate": 0.0016269338538499821, + "loss": 0.1123, + "step": 29587 + }, + { + "epoch": 0.2568380482808309, + "grad_norm": 0.515625, + "learning_rate": 0.0016269097255738375, + "loss": 0.1143, + "step": 29588 + }, + { + "epoch": 0.25684672876103504, + "grad_norm": 0.251953125, + "learning_rate": 0.0016268855967214726, + "loss": 0.0747, + "step": 29589 + }, + { + "epoch": 0.25685540924123923, + "grad_norm": 0.2158203125, + "learning_rate": 0.0016268614672929136, + "loss": 0.0981, + "step": 29590 + }, + { + "epoch": 0.25686408972144337, + "grad_norm": 0.13671875, + "learning_rate": 0.0016268373372881869, + "loss": 0.127, + "step": 29591 + }, + { + "epoch": 0.25687277020164756, + "grad_norm": 0.6875, + "learning_rate": 0.0016268132067073185, + "loss": 0.104, + 
"step": 29592 + }, + { + "epoch": 0.2568814506818517, + "grad_norm": 0.1279296875, + "learning_rate": 0.001626789075550335, + "loss": 0.124, + "step": 29593 + }, + { + "epoch": 0.2568901311620559, + "grad_norm": 0.220703125, + "learning_rate": 0.0016267649438172634, + "loss": 0.1133, + "step": 29594 + }, + { + "epoch": 0.25689881164226, + "grad_norm": 0.44921875, + "learning_rate": 0.001626740811508129, + "loss": 0.0796, + "step": 29595 + }, + { + "epoch": 0.2569074921224642, + "grad_norm": 0.99609375, + "learning_rate": 0.0016267166786229593, + "loss": 0.0825, + "step": 29596 + }, + { + "epoch": 0.25691617260266836, + "grad_norm": 0.6875, + "learning_rate": 0.0016266925451617798, + "loss": 0.1309, + "step": 29597 + }, + { + "epoch": 0.25692485308287255, + "grad_norm": 0.1376953125, + "learning_rate": 0.0016266684111246174, + "loss": 0.1543, + "step": 29598 + }, + { + "epoch": 0.2569335335630767, + "grad_norm": 0.1591796875, + "learning_rate": 0.0016266442765114982, + "loss": 0.1338, + "step": 29599 + }, + { + "epoch": 0.2569422140432809, + "grad_norm": 0.6875, + "learning_rate": 0.0016266201413224488, + "loss": 0.0942, + "step": 29600 + }, + { + "epoch": 0.256950894523485, + "grad_norm": 0.388671875, + "learning_rate": 0.0016265960055574956, + "loss": 0.0977, + "step": 29601 + }, + { + "epoch": 0.2569595750036892, + "grad_norm": 0.64453125, + "learning_rate": 0.0016265718692166646, + "loss": 0.1885, + "step": 29602 + }, + { + "epoch": 0.25696825548389335, + "grad_norm": 0.140625, + "learning_rate": 0.001626547732299983, + "loss": 0.1504, + "step": 29603 + }, + { + "epoch": 0.25697693596409754, + "grad_norm": 0.3359375, + "learning_rate": 0.0016265235948074764, + "loss": 0.1641, + "step": 29604 + }, + { + "epoch": 0.2569856164443017, + "grad_norm": 0.2138671875, + "learning_rate": 0.0016264994567391719, + "loss": 0.083, + "step": 29605 + }, + { + "epoch": 0.25699429692450587, + "grad_norm": 0.78125, + "learning_rate": 0.0016264753180950948, + "loss": 0.1514, + "step": 29606 + }, + { + "epoch": 0.25700297740471, + "grad_norm": 0.09716796875, + "learning_rate": 0.0016264511788752728, + "loss": 0.1084, + "step": 29607 + }, + { + "epoch": 0.2570116578849142, + "grad_norm": 0.1318359375, + "learning_rate": 0.0016264270390797314, + "loss": 0.0942, + "step": 29608 + }, + { + "epoch": 0.25702033836511834, + "grad_norm": 0.427734375, + "learning_rate": 0.0016264028987084973, + "loss": 0.1123, + "step": 29609 + }, + { + "epoch": 0.25702901884532253, + "grad_norm": 0.53515625, + "learning_rate": 0.0016263787577615971, + "loss": 0.0859, + "step": 29610 + }, + { + "epoch": 0.25703769932552667, + "grad_norm": 0.263671875, + "learning_rate": 0.0016263546162390567, + "loss": 0.0918, + "step": 29611 + }, + { + "epoch": 0.25704637980573086, + "grad_norm": 0.51171875, + "learning_rate": 0.0016263304741409032, + "loss": 0.1147, + "step": 29612 + }, + { + "epoch": 0.257055060285935, + "grad_norm": 0.1328125, + "learning_rate": 0.0016263063314671624, + "loss": 0.0918, + "step": 29613 + }, + { + "epoch": 0.2570637407661392, + "grad_norm": 1.1015625, + "learning_rate": 0.0016262821882178612, + "loss": 0.127, + "step": 29614 + }, + { + "epoch": 0.25707242124634333, + "grad_norm": 0.0869140625, + "learning_rate": 0.0016262580443930253, + "loss": 0.0908, + "step": 29615 + }, + { + "epoch": 0.2570811017265475, + "grad_norm": 0.2119140625, + "learning_rate": 0.0016262338999926814, + "loss": 0.1426, + "step": 29616 + }, + { + "epoch": 0.25708978220675166, + "grad_norm": 0.15234375, + "learning_rate": 
0.0016262097550168563, + "loss": 0.0981, + "step": 29617 + }, + { + "epoch": 0.25709846268695585, + "grad_norm": 0.21484375, + "learning_rate": 0.0016261856094655761, + "loss": 0.1035, + "step": 29618 + }, + { + "epoch": 0.25710714316716, + "grad_norm": 0.33984375, + "learning_rate": 0.0016261614633388672, + "loss": 0.0938, + "step": 29619 + }, + { + "epoch": 0.2571158236473642, + "grad_norm": 0.369140625, + "learning_rate": 0.001626137316636756, + "loss": 0.1001, + "step": 29620 + }, + { + "epoch": 0.2571245041275683, + "grad_norm": 0.11767578125, + "learning_rate": 0.0016261131693592688, + "loss": 0.0967, + "step": 29621 + }, + { + "epoch": 0.2571331846077725, + "grad_norm": 0.318359375, + "learning_rate": 0.0016260890215064323, + "loss": 0.1387, + "step": 29622 + }, + { + "epoch": 0.25714186508797665, + "grad_norm": 0.59375, + "learning_rate": 0.0016260648730782729, + "loss": 0.104, + "step": 29623 + }, + { + "epoch": 0.25715054556818084, + "grad_norm": 0.337890625, + "learning_rate": 0.0016260407240748166, + "loss": 0.0957, + "step": 29624 + }, + { + "epoch": 0.257159226048385, + "grad_norm": 0.51171875, + "learning_rate": 0.0016260165744960903, + "loss": 0.0986, + "step": 29625 + }, + { + "epoch": 0.2571679065285892, + "grad_norm": 0.154296875, + "learning_rate": 0.0016259924243421198, + "loss": 0.082, + "step": 29626 + }, + { + "epoch": 0.2571765870087933, + "grad_norm": 0.2470703125, + "learning_rate": 0.0016259682736129324, + "loss": 0.1045, + "step": 29627 + }, + { + "epoch": 0.2571852674889975, + "grad_norm": 0.443359375, + "learning_rate": 0.0016259441223085535, + "loss": 0.1367, + "step": 29628 + }, + { + "epoch": 0.25719394796920164, + "grad_norm": 0.2060546875, + "learning_rate": 0.0016259199704290103, + "loss": 0.1182, + "step": 29629 + }, + { + "epoch": 0.25720262844940583, + "grad_norm": 0.1787109375, + "learning_rate": 0.0016258958179743291, + "loss": 0.082, + "step": 29630 + }, + { + "epoch": 0.25721130892960997, + "grad_norm": 0.1591796875, + "learning_rate": 0.0016258716649445357, + "loss": 0.0654, + "step": 29631 + }, + { + "epoch": 0.25721998940981416, + "grad_norm": 0.1962890625, + "learning_rate": 0.0016258475113396572, + "loss": 0.103, + "step": 29632 + }, + { + "epoch": 0.2572286698900183, + "grad_norm": 0.578125, + "learning_rate": 0.0016258233571597197, + "loss": 0.1055, + "step": 29633 + }, + { + "epoch": 0.2572373503702225, + "grad_norm": 0.54296875, + "learning_rate": 0.0016257992024047497, + "loss": 0.1201, + "step": 29634 + }, + { + "epoch": 0.25724603085042663, + "grad_norm": 0.2041015625, + "learning_rate": 0.0016257750470747734, + "loss": 0.1025, + "step": 29635 + }, + { + "epoch": 0.2572547113306308, + "grad_norm": 0.373046875, + "learning_rate": 0.0016257508911698179, + "loss": 0.0625, + "step": 29636 + }, + { + "epoch": 0.25726339181083496, + "grad_norm": 0.349609375, + "learning_rate": 0.0016257267346899088, + "loss": 0.123, + "step": 29637 + }, + { + "epoch": 0.25727207229103916, + "grad_norm": 0.46875, + "learning_rate": 0.0016257025776350725, + "loss": 0.0898, + "step": 29638 + }, + { + "epoch": 0.2572807527712433, + "grad_norm": 0.275390625, + "learning_rate": 0.0016256784200053363, + "loss": 0.083, + "step": 29639 + }, + { + "epoch": 0.2572894332514475, + "grad_norm": 0.44921875, + "learning_rate": 0.0016256542618007259, + "loss": 0.1172, + "step": 29640 + }, + { + "epoch": 0.2572981137316516, + "grad_norm": 0.1474609375, + "learning_rate": 0.0016256301030212678, + "loss": 0.1133, + "step": 29641 + }, + { + "epoch": 0.2573067942118558, + 
"grad_norm": 0.6015625, + "learning_rate": 0.0016256059436669888, + "loss": 0.1406, + "step": 29642 + }, + { + "epoch": 0.25731547469205995, + "grad_norm": 0.0859375, + "learning_rate": 0.0016255817837379147, + "loss": 0.0859, + "step": 29643 + }, + { + "epoch": 0.25732415517226415, + "grad_norm": 0.119140625, + "learning_rate": 0.0016255576232340723, + "loss": 0.1221, + "step": 29644 + }, + { + "epoch": 0.2573328356524683, + "grad_norm": 0.4296875, + "learning_rate": 0.0016255334621554881, + "loss": 0.0859, + "step": 29645 + }, + { + "epoch": 0.2573415161326725, + "grad_norm": 0.341796875, + "learning_rate": 0.001625509300502188, + "loss": 0.1201, + "step": 29646 + }, + { + "epoch": 0.2573501966128766, + "grad_norm": 0.8671875, + "learning_rate": 0.0016254851382741995, + "loss": 0.1172, + "step": 29647 + }, + { + "epoch": 0.2573588770930808, + "grad_norm": 0.1376953125, + "learning_rate": 0.0016254609754715484, + "loss": 0.1055, + "step": 29648 + }, + { + "epoch": 0.25736755757328494, + "grad_norm": 0.56640625, + "learning_rate": 0.0016254368120942604, + "loss": 0.0977, + "step": 29649 + }, + { + "epoch": 0.25737623805348914, + "grad_norm": 0.259765625, + "learning_rate": 0.001625412648142363, + "loss": 0.0635, + "step": 29650 + }, + { + "epoch": 0.2573849185336933, + "grad_norm": 0.73046875, + "learning_rate": 0.0016253884836158821, + "loss": 0.0796, + "step": 29651 + }, + { + "epoch": 0.25739359901389747, + "grad_norm": 0.5546875, + "learning_rate": 0.0016253643185148443, + "loss": 0.0977, + "step": 29652 + }, + { + "epoch": 0.2574022794941016, + "grad_norm": 0.11572265625, + "learning_rate": 0.001625340152839276, + "loss": 0.1025, + "step": 29653 + }, + { + "epoch": 0.2574109599743058, + "grad_norm": 1.6640625, + "learning_rate": 0.0016253159865892036, + "loss": 0.1182, + "step": 29654 + }, + { + "epoch": 0.25741964045450993, + "grad_norm": 0.70703125, + "learning_rate": 0.0016252918197646534, + "loss": 0.1514, + "step": 29655 + }, + { + "epoch": 0.2574283209347141, + "grad_norm": 1.046875, + "learning_rate": 0.0016252676523656523, + "loss": 0.0903, + "step": 29656 + }, + { + "epoch": 0.25743700141491827, + "grad_norm": 0.1728515625, + "learning_rate": 0.0016252434843922262, + "loss": 0.1143, + "step": 29657 + }, + { + "epoch": 0.25744568189512246, + "grad_norm": 0.1513671875, + "learning_rate": 0.0016252193158444014, + "loss": 0.0918, + "step": 29658 + }, + { + "epoch": 0.2574543623753266, + "grad_norm": 0.1650390625, + "learning_rate": 0.001625195146722205, + "loss": 0.125, + "step": 29659 + }, + { + "epoch": 0.2574630428555308, + "grad_norm": 0.2138671875, + "learning_rate": 0.0016251709770256628, + "loss": 0.1348, + "step": 29660 + }, + { + "epoch": 0.2574717233357349, + "grad_norm": 0.1025390625, + "learning_rate": 0.001625146806754802, + "loss": 0.1172, + "step": 29661 + }, + { + "epoch": 0.2574804038159391, + "grad_norm": 0.2099609375, + "learning_rate": 0.0016251226359096482, + "loss": 0.1123, + "step": 29662 + }, + { + "epoch": 0.25748908429614326, + "grad_norm": 1.4296875, + "learning_rate": 0.0016250984644902281, + "loss": 0.124, + "step": 29663 + }, + { + "epoch": 0.25749776477634745, + "grad_norm": 0.58984375, + "learning_rate": 0.0016250742924965684, + "loss": 0.1021, + "step": 29664 + }, + { + "epoch": 0.2575064452565516, + "grad_norm": 0.1015625, + "learning_rate": 0.001625050119928695, + "loss": 0.0674, + "step": 29665 + }, + { + "epoch": 0.2575151257367558, + "grad_norm": 0.482421875, + "learning_rate": 0.0016250259467866352, + "loss": 0.1089, + "step": 29666 + }, + { 
+ "epoch": 0.2575238062169599, + "grad_norm": 0.83984375, + "learning_rate": 0.0016250017730704148, + "loss": 0.0859, + "step": 29667 + }, + { + "epoch": 0.2575324866971641, + "grad_norm": 0.265625, + "learning_rate": 0.00162497759878006, + "loss": 0.0825, + "step": 29668 + }, + { + "epoch": 0.25754116717736825, + "grad_norm": 0.1748046875, + "learning_rate": 0.0016249534239155977, + "loss": 0.0747, + "step": 29669 + }, + { + "epoch": 0.25754984765757244, + "grad_norm": 0.5390625, + "learning_rate": 0.0016249292484770545, + "loss": 0.0913, + "step": 29670 + }, + { + "epoch": 0.2575585281377766, + "grad_norm": 0.392578125, + "learning_rate": 0.0016249050724644561, + "loss": 0.0913, + "step": 29671 + }, + { + "epoch": 0.25756720861798077, + "grad_norm": 0.43359375, + "learning_rate": 0.0016248808958778296, + "loss": 0.1064, + "step": 29672 + }, + { + "epoch": 0.2575758890981849, + "grad_norm": 0.375, + "learning_rate": 0.0016248567187172017, + "loss": 0.1094, + "step": 29673 + }, + { + "epoch": 0.2575845695783891, + "grad_norm": 0.1904296875, + "learning_rate": 0.0016248325409825978, + "loss": 0.1104, + "step": 29674 + }, + { + "epoch": 0.25759325005859324, + "grad_norm": 0.1748046875, + "learning_rate": 0.0016248083626740452, + "loss": 0.1143, + "step": 29675 + }, + { + "epoch": 0.25760193053879743, + "grad_norm": 0.07763671875, + "learning_rate": 0.00162478418379157, + "loss": 0.1143, + "step": 29676 + }, + { + "epoch": 0.25761061101900157, + "grad_norm": 0.353515625, + "learning_rate": 0.0016247600043351987, + "loss": 0.0923, + "step": 29677 + }, + { + "epoch": 0.25761929149920576, + "grad_norm": 0.76171875, + "learning_rate": 0.0016247358243049574, + "loss": 0.1465, + "step": 29678 + }, + { + "epoch": 0.2576279719794099, + "grad_norm": 0.62109375, + "learning_rate": 0.0016247116437008735, + "loss": 0.1475, + "step": 29679 + }, + { + "epoch": 0.2576366524596141, + "grad_norm": 0.291015625, + "learning_rate": 0.0016246874625229725, + "loss": 0.0991, + "step": 29680 + }, + { + "epoch": 0.25764533293981823, + "grad_norm": 0.06884765625, + "learning_rate": 0.0016246632807712812, + "loss": 0.0996, + "step": 29681 + }, + { + "epoch": 0.2576540134200224, + "grad_norm": 0.19921875, + "learning_rate": 0.0016246390984458261, + "loss": 0.1279, + "step": 29682 + }, + { + "epoch": 0.25766269390022656, + "grad_norm": 0.859375, + "learning_rate": 0.0016246149155466334, + "loss": 0.1064, + "step": 29683 + }, + { + "epoch": 0.25767137438043075, + "grad_norm": 0.40625, + "learning_rate": 0.0016245907320737297, + "loss": 0.1011, + "step": 29684 + }, + { + "epoch": 0.2576800548606349, + "grad_norm": 0.3515625, + "learning_rate": 0.0016245665480271417, + "loss": 0.1191, + "step": 29685 + }, + { + "epoch": 0.2576887353408391, + "grad_norm": 0.78515625, + "learning_rate": 0.0016245423634068954, + "loss": 0.0723, + "step": 29686 + }, + { + "epoch": 0.2576974158210432, + "grad_norm": 0.27734375, + "learning_rate": 0.0016245181782130178, + "loss": 0.1123, + "step": 29687 + }, + { + "epoch": 0.2577060963012474, + "grad_norm": 0.208984375, + "learning_rate": 0.0016244939924455347, + "loss": 0.0859, + "step": 29688 + }, + { + "epoch": 0.25771477678145155, + "grad_norm": 0.546875, + "learning_rate": 0.0016244698061044728, + "loss": 0.1104, + "step": 29689 + }, + { + "epoch": 0.25772345726165574, + "grad_norm": 0.390625, + "learning_rate": 0.0016244456191898584, + "loss": 0.085, + "step": 29690 + }, + { + "epoch": 0.2577321377418599, + "grad_norm": 0.359375, + "learning_rate": 0.0016244214317017185, + "loss": 0.126, + 
"step": 29691 + }, + { + "epoch": 0.25774081822206407, + "grad_norm": 0.361328125, + "learning_rate": 0.0016243972436400791, + "loss": 0.1113, + "step": 29692 + }, + { + "epoch": 0.2577494987022682, + "grad_norm": 0.255859375, + "learning_rate": 0.0016243730550049669, + "loss": 0.0986, + "step": 29693 + }, + { + "epoch": 0.25775817918247235, + "grad_norm": 0.390625, + "learning_rate": 0.0016243488657964082, + "loss": 0.1133, + "step": 29694 + }, + { + "epoch": 0.25776685966267654, + "grad_norm": 0.1982421875, + "learning_rate": 0.0016243246760144292, + "loss": 0.0923, + "step": 29695 + }, + { + "epoch": 0.2577755401428807, + "grad_norm": 0.1220703125, + "learning_rate": 0.0016243004856590566, + "loss": 0.082, + "step": 29696 + }, + { + "epoch": 0.25778422062308487, + "grad_norm": 0.90234375, + "learning_rate": 0.0016242762947303172, + "loss": 0.1211, + "step": 29697 + }, + { + "epoch": 0.257792901103289, + "grad_norm": 0.166015625, + "learning_rate": 0.001624252103228237, + "loss": 0.0815, + "step": 29698 + }, + { + "epoch": 0.2578015815834932, + "grad_norm": 0.65625, + "learning_rate": 0.0016242279111528422, + "loss": 0.0884, + "step": 29699 + }, + { + "epoch": 0.25781026206369734, + "grad_norm": 0.61328125, + "learning_rate": 0.00162420371850416, + "loss": 0.1143, + "step": 29700 + }, + { + "epoch": 0.25781894254390153, + "grad_norm": 0.09619140625, + "learning_rate": 0.0016241795252822165, + "loss": 0.1128, + "step": 29701 + }, + { + "epoch": 0.25782762302410567, + "grad_norm": 0.1123046875, + "learning_rate": 0.001624155331487038, + "loss": 0.0747, + "step": 29702 + }, + { + "epoch": 0.25783630350430986, + "grad_norm": 0.19140625, + "learning_rate": 0.0016241311371186513, + "loss": 0.1001, + "step": 29703 + }, + { + "epoch": 0.257844983984514, + "grad_norm": 0.341796875, + "learning_rate": 0.0016241069421770826, + "loss": 0.1133, + "step": 29704 + }, + { + "epoch": 0.2578536644647182, + "grad_norm": 0.72265625, + "learning_rate": 0.0016240827466623585, + "loss": 0.1055, + "step": 29705 + }, + { + "epoch": 0.25786234494492233, + "grad_norm": 0.259765625, + "learning_rate": 0.001624058550574505, + "loss": 0.1221, + "step": 29706 + }, + { + "epoch": 0.2578710254251265, + "grad_norm": 0.1494140625, + "learning_rate": 0.0016240343539135494, + "loss": 0.1035, + "step": 29707 + }, + { + "epoch": 0.25787970590533066, + "grad_norm": 0.11279296875, + "learning_rate": 0.0016240101566795174, + "loss": 0.1396, + "step": 29708 + }, + { + "epoch": 0.25788838638553485, + "grad_norm": 0.2158203125, + "learning_rate": 0.001623985958872436, + "loss": 0.1436, + "step": 29709 + }, + { + "epoch": 0.257897066865739, + "grad_norm": 0.28515625, + "learning_rate": 0.0016239617604923312, + "loss": 0.165, + "step": 29710 + }, + { + "epoch": 0.2579057473459432, + "grad_norm": 0.279296875, + "learning_rate": 0.0016239375615392298, + "loss": 0.1318, + "step": 29711 + }, + { + "epoch": 0.2579144278261473, + "grad_norm": 0.33203125, + "learning_rate": 0.001623913362013158, + "loss": 0.0942, + "step": 29712 + }, + { + "epoch": 0.2579231083063515, + "grad_norm": 0.3984375, + "learning_rate": 0.0016238891619141426, + "loss": 0.0762, + "step": 29713 + }, + { + "epoch": 0.25793178878655565, + "grad_norm": 0.318359375, + "learning_rate": 0.0016238649612422098, + "loss": 0.0938, + "step": 29714 + }, + { + "epoch": 0.25794046926675984, + "grad_norm": 0.091796875, + "learning_rate": 0.0016238407599973863, + "loss": 0.0986, + "step": 29715 + }, + { + "epoch": 0.257949149746964, + "grad_norm": 0.50390625, + "learning_rate": 
0.0016238165581796984, + "loss": 0.1201, + "step": 29716 + }, + { + "epoch": 0.25795783022716817, + "grad_norm": 0.328125, + "learning_rate": 0.0016237923557891722, + "loss": 0.0903, + "step": 29717 + }, + { + "epoch": 0.2579665107073723, + "grad_norm": 0.24609375, + "learning_rate": 0.001623768152825835, + "loss": 0.126, + "step": 29718 + }, + { + "epoch": 0.2579751911875765, + "grad_norm": 0.27734375, + "learning_rate": 0.0016237439492897128, + "loss": 0.1104, + "step": 29719 + }, + { + "epoch": 0.25798387166778064, + "grad_norm": 0.29296875, + "learning_rate": 0.001623719745180832, + "loss": 0.0742, + "step": 29720 + }, + { + "epoch": 0.25799255214798483, + "grad_norm": 0.10107421875, + "learning_rate": 0.001623695540499219, + "loss": 0.1099, + "step": 29721 + }, + { + "epoch": 0.25800123262818897, + "grad_norm": 0.169921875, + "learning_rate": 0.0016236713352449003, + "loss": 0.0947, + "step": 29722 + }, + { + "epoch": 0.25800991310839316, + "grad_norm": 0.115234375, + "learning_rate": 0.001623647129417903, + "loss": 0.1709, + "step": 29723 + }, + { + "epoch": 0.2580185935885973, + "grad_norm": 0.1162109375, + "learning_rate": 0.0016236229230182527, + "loss": 0.1162, + "step": 29724 + }, + { + "epoch": 0.2580272740688015, + "grad_norm": 0.119140625, + "learning_rate": 0.0016235987160459762, + "loss": 0.1143, + "step": 29725 + }, + { + "epoch": 0.25803595454900563, + "grad_norm": 0.52734375, + "learning_rate": 0.0016235745085011004, + "loss": 0.0908, + "step": 29726 + }, + { + "epoch": 0.2580446350292098, + "grad_norm": 0.29296875, + "learning_rate": 0.001623550300383651, + "loss": 0.1104, + "step": 29727 + }, + { + "epoch": 0.25805331550941396, + "grad_norm": 0.53125, + "learning_rate": 0.001623526091693655, + "loss": 0.1079, + "step": 29728 + }, + { + "epoch": 0.25806199598961815, + "grad_norm": 0.494140625, + "learning_rate": 0.0016235018824311386, + "loss": 0.0649, + "step": 29729 + }, + { + "epoch": 0.2580706764698223, + "grad_norm": 0.12109375, + "learning_rate": 0.0016234776725961287, + "loss": 0.0898, + "step": 29730 + }, + { + "epoch": 0.2580793569500265, + "grad_norm": 0.12451171875, + "learning_rate": 0.0016234534621886515, + "loss": 0.0859, + "step": 29731 + }, + { + "epoch": 0.2580880374302306, + "grad_norm": 0.50390625, + "learning_rate": 0.0016234292512087329, + "loss": 0.0938, + "step": 29732 + }, + { + "epoch": 0.2580967179104348, + "grad_norm": 0.2373046875, + "learning_rate": 0.0016234050396564002, + "loss": 0.0918, + "step": 29733 + }, + { + "epoch": 0.25810539839063895, + "grad_norm": 0.1318359375, + "learning_rate": 0.00162338082753168, + "loss": 0.083, + "step": 29734 + }, + { + "epoch": 0.25811407887084314, + "grad_norm": 0.10595703125, + "learning_rate": 0.001623356614834598, + "loss": 0.0679, + "step": 29735 + }, + { + "epoch": 0.2581227593510473, + "grad_norm": 0.53515625, + "learning_rate": 0.0016233324015651812, + "loss": 0.0742, + "step": 29736 + }, + { + "epoch": 0.2581314398312515, + "grad_norm": 1.6484375, + "learning_rate": 0.0016233081877234559, + "loss": 0.165, + "step": 29737 + }, + { + "epoch": 0.2581401203114556, + "grad_norm": 0.298828125, + "learning_rate": 0.0016232839733094485, + "loss": 0.0977, + "step": 29738 + }, + { + "epoch": 0.2581488007916598, + "grad_norm": 0.16015625, + "learning_rate": 0.001623259758323186, + "loss": 0.1104, + "step": 29739 + }, + { + "epoch": 0.25815748127186394, + "grad_norm": 0.287109375, + "learning_rate": 0.001623235542764694, + "loss": 0.1084, + "step": 29740 + }, + { + "epoch": 0.25816616175206814, + 
"grad_norm": 0.0830078125, + "learning_rate": 0.0016232113266339999, + "loss": 0.0654, + "step": 29741 + }, + { + "epoch": 0.2581748422322723, + "grad_norm": 0.63671875, + "learning_rate": 0.0016231871099311296, + "loss": 0.0908, + "step": 29742 + }, + { + "epoch": 0.25818352271247647, + "grad_norm": 0.640625, + "learning_rate": 0.0016231628926561097, + "loss": 0.2598, + "step": 29743 + }, + { + "epoch": 0.2581922031926806, + "grad_norm": 0.51953125, + "learning_rate": 0.0016231386748089668, + "loss": 0.1064, + "step": 29744 + }, + { + "epoch": 0.2582008836728848, + "grad_norm": 0.10107421875, + "learning_rate": 0.0016231144563897272, + "loss": 0.1367, + "step": 29745 + }, + { + "epoch": 0.25820956415308893, + "grad_norm": 0.48828125, + "learning_rate": 0.0016230902373984173, + "loss": 0.1279, + "step": 29746 + }, + { + "epoch": 0.2582182446332931, + "grad_norm": 0.083984375, + "learning_rate": 0.001623066017835064, + "loss": 0.0942, + "step": 29747 + }, + { + "epoch": 0.25822692511349726, + "grad_norm": 0.146484375, + "learning_rate": 0.0016230417976996938, + "loss": 0.1094, + "step": 29748 + }, + { + "epoch": 0.25823560559370146, + "grad_norm": 0.0810546875, + "learning_rate": 0.0016230175769923329, + "loss": 0.0928, + "step": 29749 + }, + { + "epoch": 0.2582442860739056, + "grad_norm": 0.380859375, + "learning_rate": 0.0016229933557130075, + "loss": 0.0942, + "step": 29750 + }, + { + "epoch": 0.2582529665541098, + "grad_norm": 0.375, + "learning_rate": 0.0016229691338617445, + "loss": 0.1001, + "step": 29751 + }, + { + "epoch": 0.2582616470343139, + "grad_norm": 0.3515625, + "learning_rate": 0.0016229449114385705, + "loss": 0.1089, + "step": 29752 + }, + { + "epoch": 0.2582703275145181, + "grad_norm": 0.267578125, + "learning_rate": 0.0016229206884435115, + "loss": 0.1104, + "step": 29753 + }, + { + "epoch": 0.25827900799472225, + "grad_norm": 0.318359375, + "learning_rate": 0.0016228964648765944, + "loss": 0.1211, + "step": 29754 + }, + { + "epoch": 0.25828768847492645, + "grad_norm": 0.314453125, + "learning_rate": 0.0016228722407378457, + "loss": 0.0942, + "step": 29755 + }, + { + "epoch": 0.2582963689551306, + "grad_norm": 0.1298828125, + "learning_rate": 0.001622848016027292, + "loss": 0.0928, + "step": 29756 + }, + { + "epoch": 0.2583050494353348, + "grad_norm": 0.6640625, + "learning_rate": 0.0016228237907449593, + "loss": 0.1357, + "step": 29757 + }, + { + "epoch": 0.2583137299155389, + "grad_norm": 0.63671875, + "learning_rate": 0.0016227995648908742, + "loss": 0.0762, + "step": 29758 + }, + { + "epoch": 0.2583224103957431, + "grad_norm": 0.12353515625, + "learning_rate": 0.0016227753384650636, + "loss": 0.0879, + "step": 29759 + }, + { + "epoch": 0.25833109087594724, + "grad_norm": 0.388671875, + "learning_rate": 0.0016227511114675535, + "loss": 0.1328, + "step": 29760 + }, + { + "epoch": 0.25833977135615144, + "grad_norm": 0.7890625, + "learning_rate": 0.0016227268838983708, + "loss": 0.1582, + "step": 29761 + }, + { + "epoch": 0.2583484518363556, + "grad_norm": 1.0078125, + "learning_rate": 0.0016227026557575418, + "loss": 0.1328, + "step": 29762 + }, + { + "epoch": 0.25835713231655977, + "grad_norm": 0.1435546875, + "learning_rate": 0.001622678427045093, + "loss": 0.0854, + "step": 29763 + }, + { + "epoch": 0.2583658127967639, + "grad_norm": 0.0947265625, + "learning_rate": 0.001622654197761051, + "loss": 0.0859, + "step": 29764 + }, + { + "epoch": 0.2583744932769681, + "grad_norm": 0.337890625, + "learning_rate": 0.0016226299679054424, + "loss": 0.1045, + "step": 29765 + 
}, + { + "epoch": 0.25838317375717224, + "grad_norm": 0.3671875, + "learning_rate": 0.0016226057374782931, + "loss": 0.1133, + "step": 29766 + }, + { + "epoch": 0.25839185423737643, + "grad_norm": 0.25390625, + "learning_rate": 0.0016225815064796305, + "loss": 0.0859, + "step": 29767 + }, + { + "epoch": 0.25840053471758057, + "grad_norm": 0.208984375, + "learning_rate": 0.0016225572749094803, + "loss": 0.1094, + "step": 29768 + }, + { + "epoch": 0.25840921519778476, + "grad_norm": 0.39453125, + "learning_rate": 0.0016225330427678695, + "loss": 0.0879, + "step": 29769 + }, + { + "epoch": 0.2584178956779889, + "grad_norm": 0.1923828125, + "learning_rate": 0.0016225088100548242, + "loss": 0.1162, + "step": 29770 + }, + { + "epoch": 0.2584265761581931, + "grad_norm": 0.39453125, + "learning_rate": 0.001622484576770371, + "loss": 0.1089, + "step": 29771 + }, + { + "epoch": 0.2584352566383972, + "grad_norm": 0.10693359375, + "learning_rate": 0.0016224603429145369, + "loss": 0.1064, + "step": 29772 + }, + { + "epoch": 0.2584439371186014, + "grad_norm": 0.6484375, + "learning_rate": 0.0016224361084873481, + "loss": 0.1562, + "step": 29773 + }, + { + "epoch": 0.25845261759880556, + "grad_norm": 0.18359375, + "learning_rate": 0.0016224118734888306, + "loss": 0.1387, + "step": 29774 + }, + { + "epoch": 0.25846129807900975, + "grad_norm": 0.140625, + "learning_rate": 0.0016223876379190115, + "loss": 0.1025, + "step": 29775 + }, + { + "epoch": 0.2584699785592139, + "grad_norm": 0.271484375, + "learning_rate": 0.001622363401777917, + "loss": 0.1094, + "step": 29776 + }, + { + "epoch": 0.2584786590394181, + "grad_norm": 0.08056640625, + "learning_rate": 0.0016223391650655738, + "loss": 0.0918, + "step": 29777 + }, + { + "epoch": 0.2584873395196222, + "grad_norm": 0.201171875, + "learning_rate": 0.0016223149277820087, + "loss": 0.0942, + "step": 29778 + }, + { + "epoch": 0.2584960199998264, + "grad_norm": 0.0859375, + "learning_rate": 0.0016222906899272475, + "loss": 0.1182, + "step": 29779 + }, + { + "epoch": 0.25850470048003055, + "grad_norm": 0.29296875, + "learning_rate": 0.0016222664515013168, + "loss": 0.1143, + "step": 29780 + }, + { + "epoch": 0.25851338096023474, + "grad_norm": 0.2578125, + "learning_rate": 0.0016222422125042437, + "loss": 0.0737, + "step": 29781 + }, + { + "epoch": 0.2585220614404389, + "grad_norm": 0.35546875, + "learning_rate": 0.0016222179729360544, + "loss": 0.0615, + "step": 29782 + }, + { + "epoch": 0.25853074192064307, + "grad_norm": 0.7109375, + "learning_rate": 0.0016221937327967754, + "loss": 0.1064, + "step": 29783 + }, + { + "epoch": 0.2585394224008472, + "grad_norm": 0.34765625, + "learning_rate": 0.0016221694920864327, + "loss": 0.0864, + "step": 29784 + }, + { + "epoch": 0.2585481028810514, + "grad_norm": 0.58203125, + "learning_rate": 0.0016221452508050538, + "loss": 0.1045, + "step": 29785 + }, + { + "epoch": 0.25855678336125554, + "grad_norm": 0.359375, + "learning_rate": 0.0016221210089526646, + "loss": 0.0879, + "step": 29786 + }, + { + "epoch": 0.25856546384145973, + "grad_norm": 0.35546875, + "learning_rate": 0.0016220967665292916, + "loss": 0.1104, + "step": 29787 + }, + { + "epoch": 0.25857414432166387, + "grad_norm": 0.421875, + "learning_rate": 0.0016220725235349613, + "loss": 0.0845, + "step": 29788 + }, + { + "epoch": 0.25858282480186806, + "grad_norm": 0.267578125, + "learning_rate": 0.0016220482799697007, + "loss": 0.103, + "step": 29789 + }, + { + "epoch": 0.2585915052820722, + "grad_norm": 0.65234375, + "learning_rate": 0.0016220240358335357, + 
"loss": 0.1064, + "step": 29790 + }, + { + "epoch": 0.2586001857622764, + "grad_norm": 0.44921875, + "learning_rate": 0.0016219997911264931, + "loss": 0.1113, + "step": 29791 + }, + { + "epoch": 0.25860886624248053, + "grad_norm": 0.4765625, + "learning_rate": 0.0016219755458485993, + "loss": 0.0874, + "step": 29792 + }, + { + "epoch": 0.2586175467226847, + "grad_norm": 0.427734375, + "learning_rate": 0.001621951299999881, + "loss": 0.1016, + "step": 29793 + }, + { + "epoch": 0.25862622720288886, + "grad_norm": 0.53515625, + "learning_rate": 0.0016219270535803645, + "loss": 0.1035, + "step": 29794 + }, + { + "epoch": 0.25863490768309305, + "grad_norm": 0.62109375, + "learning_rate": 0.0016219028065900766, + "loss": 0.123, + "step": 29795 + }, + { + "epoch": 0.2586435881632972, + "grad_norm": 0.69921875, + "learning_rate": 0.0016218785590290435, + "loss": 0.1338, + "step": 29796 + }, + { + "epoch": 0.2586522686435014, + "grad_norm": 0.322265625, + "learning_rate": 0.001621854310897292, + "loss": 0.0986, + "step": 29797 + }, + { + "epoch": 0.2586609491237055, + "grad_norm": 0.35546875, + "learning_rate": 0.0016218300621948483, + "loss": 0.125, + "step": 29798 + }, + { + "epoch": 0.2586696296039097, + "grad_norm": 0.26171875, + "learning_rate": 0.001621805812921739, + "loss": 0.0825, + "step": 29799 + }, + { + "epoch": 0.25867831008411385, + "grad_norm": 0.60546875, + "learning_rate": 0.0016217815630779908, + "loss": 0.1348, + "step": 29800 + }, + { + "epoch": 0.25868699056431804, + "grad_norm": 0.8125, + "learning_rate": 0.0016217573126636297, + "loss": 0.1187, + "step": 29801 + }, + { + "epoch": 0.2586956710445222, + "grad_norm": 0.4296875, + "learning_rate": 0.0016217330616786834, + "loss": 0.1001, + "step": 29802 + }, + { + "epoch": 0.2587043515247264, + "grad_norm": 0.451171875, + "learning_rate": 0.0016217088101231774, + "loss": 0.0938, + "step": 29803 + }, + { + "epoch": 0.2587130320049305, + "grad_norm": 0.310546875, + "learning_rate": 0.0016216845579971385, + "loss": 0.0962, + "step": 29804 + }, + { + "epoch": 0.2587217124851347, + "grad_norm": 0.12353515625, + "learning_rate": 0.001621660305300593, + "loss": 0.0679, + "step": 29805 + }, + { + "epoch": 0.25873039296533884, + "grad_norm": 0.185546875, + "learning_rate": 0.0016216360520335679, + "loss": 0.1084, + "step": 29806 + }, + { + "epoch": 0.25873907344554303, + "grad_norm": 1.171875, + "learning_rate": 0.001621611798196089, + "loss": 0.1484, + "step": 29807 + }, + { + "epoch": 0.25874775392574717, + "grad_norm": 0.1259765625, + "learning_rate": 0.001621587543788184, + "loss": 0.105, + "step": 29808 + }, + { + "epoch": 0.25875643440595136, + "grad_norm": 0.166015625, + "learning_rate": 0.001621563288809878, + "loss": 0.1055, + "step": 29809 + }, + { + "epoch": 0.2587651148861555, + "grad_norm": 0.51171875, + "learning_rate": 0.0016215390332611987, + "loss": 0.1094, + "step": 29810 + }, + { + "epoch": 0.2587737953663597, + "grad_norm": 0.232421875, + "learning_rate": 0.0016215147771421717, + "loss": 0.0938, + "step": 29811 + }, + { + "epoch": 0.25878247584656383, + "grad_norm": 0.173828125, + "learning_rate": 0.0016214905204528242, + "loss": 0.1162, + "step": 29812 + }, + { + "epoch": 0.258791156326768, + "grad_norm": 0.55078125, + "learning_rate": 0.0016214662631931828, + "loss": 0.1318, + "step": 29813 + }, + { + "epoch": 0.25879983680697216, + "grad_norm": 0.53515625, + "learning_rate": 0.001621442005363273, + "loss": 0.124, + "step": 29814 + }, + { + "epoch": 0.25880851728717635, + "grad_norm": 0.37109375, + "learning_rate": 
0.0016214177469631229, + "loss": 0.0918, + "step": 29815 + }, + { + "epoch": 0.2588171977673805, + "grad_norm": 0.25, + "learning_rate": 0.0016213934879927576, + "loss": 0.1089, + "step": 29816 + }, + { + "epoch": 0.25882587824758463, + "grad_norm": 0.1083984375, + "learning_rate": 0.0016213692284522045, + "loss": 0.1201, + "step": 29817 + }, + { + "epoch": 0.2588345587277888, + "grad_norm": 0.30078125, + "learning_rate": 0.0016213449683414897, + "loss": 0.085, + "step": 29818 + }, + { + "epoch": 0.25884323920799296, + "grad_norm": 0.2138671875, + "learning_rate": 0.0016213207076606403, + "loss": 0.1465, + "step": 29819 + }, + { + "epoch": 0.25885191968819715, + "grad_norm": 0.3828125, + "learning_rate": 0.001621296446409682, + "loss": 0.127, + "step": 29820 + }, + { + "epoch": 0.2588606001684013, + "grad_norm": 0.30859375, + "learning_rate": 0.0016212721845886419, + "loss": 0.1504, + "step": 29821 + }, + { + "epoch": 0.2588692806486055, + "grad_norm": 0.98828125, + "learning_rate": 0.001621247922197546, + "loss": 0.1406, + "step": 29822 + }, + { + "epoch": 0.2588779611288096, + "grad_norm": 0.2890625, + "learning_rate": 0.0016212236592364213, + "loss": 0.0996, + "step": 29823 + }, + { + "epoch": 0.2588866416090138, + "grad_norm": 0.16015625, + "learning_rate": 0.0016211993957052945, + "loss": 0.1084, + "step": 29824 + }, + { + "epoch": 0.25889532208921795, + "grad_norm": 0.06884765625, + "learning_rate": 0.0016211751316041918, + "loss": 0.0776, + "step": 29825 + }, + { + "epoch": 0.25890400256942214, + "grad_norm": 0.2236328125, + "learning_rate": 0.0016211508669331399, + "loss": 0.1055, + "step": 29826 + }, + { + "epoch": 0.2589126830496263, + "grad_norm": 0.322265625, + "learning_rate": 0.001621126601692165, + "loss": 0.1387, + "step": 29827 + }, + { + "epoch": 0.2589213635298305, + "grad_norm": 0.1513671875, + "learning_rate": 0.0016211023358812938, + "loss": 0.0889, + "step": 29828 + }, + { + "epoch": 0.2589300440100346, + "grad_norm": 0.369140625, + "learning_rate": 0.0016210780695005533, + "loss": 0.123, + "step": 29829 + }, + { + "epoch": 0.2589387244902388, + "grad_norm": 0.69921875, + "learning_rate": 0.0016210538025499695, + "loss": 0.1025, + "step": 29830 + }, + { + "epoch": 0.25894740497044294, + "grad_norm": 0.443359375, + "learning_rate": 0.0016210295350295688, + "loss": 0.1279, + "step": 29831 + }, + { + "epoch": 0.25895608545064713, + "grad_norm": 0.16796875, + "learning_rate": 0.0016210052669393786, + "loss": 0.0869, + "step": 29832 + }, + { + "epoch": 0.25896476593085127, + "grad_norm": 0.1806640625, + "learning_rate": 0.0016209809982794243, + "loss": 0.0859, + "step": 29833 + }, + { + "epoch": 0.25897344641105546, + "grad_norm": 0.1337890625, + "learning_rate": 0.0016209567290497333, + "loss": 0.1055, + "step": 29834 + }, + { + "epoch": 0.2589821268912596, + "grad_norm": 0.08056640625, + "learning_rate": 0.001620932459250332, + "loss": 0.0698, + "step": 29835 + }, + { + "epoch": 0.2589908073714638, + "grad_norm": 0.189453125, + "learning_rate": 0.0016209081888812465, + "loss": 0.0806, + "step": 29836 + }, + { + "epoch": 0.25899948785166793, + "grad_norm": 0.55078125, + "learning_rate": 0.0016208839179425037, + "loss": 0.126, + "step": 29837 + }, + { + "epoch": 0.2590081683318721, + "grad_norm": 0.197265625, + "learning_rate": 0.00162085964643413, + "loss": 0.0859, + "step": 29838 + }, + { + "epoch": 0.25901684881207626, + "grad_norm": 0.5390625, + "learning_rate": 0.0016208353743561521, + "loss": 0.0898, + "step": 29839 + }, + { + "epoch": 0.25902552929228045, + 
"grad_norm": 0.44921875, + "learning_rate": 0.0016208111017085966, + "loss": 0.1006, + "step": 29840 + }, + { + "epoch": 0.2590342097724846, + "grad_norm": 0.138671875, + "learning_rate": 0.0016207868284914898, + "loss": 0.1133, + "step": 29841 + }, + { + "epoch": 0.2590428902526888, + "grad_norm": 0.177734375, + "learning_rate": 0.0016207625547048584, + "loss": 0.1465, + "step": 29842 + }, + { + "epoch": 0.2590515707328929, + "grad_norm": 0.3359375, + "learning_rate": 0.0016207382803487289, + "loss": 0.0879, + "step": 29843 + }, + { + "epoch": 0.2590602512130971, + "grad_norm": 0.1328125, + "learning_rate": 0.0016207140054231275, + "loss": 0.1348, + "step": 29844 + }, + { + "epoch": 0.25906893169330125, + "grad_norm": 0.2255859375, + "learning_rate": 0.0016206897299280813, + "loss": 0.085, + "step": 29845 + }, + { + "epoch": 0.25907761217350544, + "grad_norm": 0.212890625, + "learning_rate": 0.0016206654538636167, + "loss": 0.1094, + "step": 29846 + }, + { + "epoch": 0.2590862926537096, + "grad_norm": 0.640625, + "learning_rate": 0.0016206411772297605, + "loss": 0.105, + "step": 29847 + }, + { + "epoch": 0.2590949731339138, + "grad_norm": 0.353515625, + "learning_rate": 0.0016206169000265384, + "loss": 0.1025, + "step": 29848 + }, + { + "epoch": 0.2591036536141179, + "grad_norm": 0.130859375, + "learning_rate": 0.0016205926222539774, + "loss": 0.1123, + "step": 29849 + }, + { + "epoch": 0.2591123340943221, + "grad_norm": 0.330078125, + "learning_rate": 0.0016205683439121047, + "loss": 0.1104, + "step": 29850 + }, + { + "epoch": 0.25912101457452624, + "grad_norm": 0.267578125, + "learning_rate": 0.001620544065000946, + "loss": 0.0962, + "step": 29851 + }, + { + "epoch": 0.25912969505473044, + "grad_norm": 0.10986328125, + "learning_rate": 0.0016205197855205284, + "loss": 0.1055, + "step": 29852 + }, + { + "epoch": 0.2591383755349346, + "grad_norm": 0.1484375, + "learning_rate": 0.0016204955054708776, + "loss": 0.0576, + "step": 29853 + }, + { + "epoch": 0.25914705601513877, + "grad_norm": 0.27734375, + "learning_rate": 0.001620471224852021, + "loss": 0.1133, + "step": 29854 + }, + { + "epoch": 0.2591557364953429, + "grad_norm": 0.19140625, + "learning_rate": 0.001620446943663985, + "loss": 0.0869, + "step": 29855 + }, + { + "epoch": 0.2591644169755471, + "grad_norm": 0.205078125, + "learning_rate": 0.001620422661906796, + "loss": 0.1133, + "step": 29856 + }, + { + "epoch": 0.25917309745575123, + "grad_norm": 0.09765625, + "learning_rate": 0.0016203983795804804, + "loss": 0.1299, + "step": 29857 + }, + { + "epoch": 0.2591817779359554, + "grad_norm": 0.10693359375, + "learning_rate": 0.0016203740966850653, + "loss": 0.1045, + "step": 29858 + }, + { + "epoch": 0.25919045841615956, + "grad_norm": 0.486328125, + "learning_rate": 0.0016203498132205767, + "loss": 0.0977, + "step": 29859 + }, + { + "epoch": 0.25919913889636376, + "grad_norm": 0.353515625, + "learning_rate": 0.0016203255291870413, + "loss": 0.1436, + "step": 29860 + }, + { + "epoch": 0.2592078193765679, + "grad_norm": 1.0703125, + "learning_rate": 0.0016203012445844861, + "loss": 0.123, + "step": 29861 + }, + { + "epoch": 0.2592164998567721, + "grad_norm": 0.255859375, + "learning_rate": 0.0016202769594129368, + "loss": 0.0908, + "step": 29862 + }, + { + "epoch": 0.2592251803369762, + "grad_norm": 0.158203125, + "learning_rate": 0.0016202526736724206, + "loss": 0.1279, + "step": 29863 + }, + { + "epoch": 0.2592338608171804, + "grad_norm": 0.1748046875, + "learning_rate": 0.001620228387362964, + "loss": 0.0908, + "step": 29864 + }, 
+ { + "epoch": 0.25924254129738455, + "grad_norm": 0.1689453125, + "learning_rate": 0.0016202041004845934, + "loss": 0.1406, + "step": 29865 + }, + { + "epoch": 0.25925122177758875, + "grad_norm": 0.7578125, + "learning_rate": 0.0016201798130373354, + "loss": 0.207, + "step": 29866 + }, + { + "epoch": 0.2592599022577929, + "grad_norm": 0.34375, + "learning_rate": 0.0016201555250212165, + "loss": 0.1016, + "step": 29867 + }, + { + "epoch": 0.2592685827379971, + "grad_norm": 0.267578125, + "learning_rate": 0.0016201312364362633, + "loss": 0.1123, + "step": 29868 + }, + { + "epoch": 0.2592772632182012, + "grad_norm": 0.333984375, + "learning_rate": 0.0016201069472825025, + "loss": 0.0947, + "step": 29869 + }, + { + "epoch": 0.2592859436984054, + "grad_norm": 0.09619140625, + "learning_rate": 0.0016200826575599604, + "loss": 0.1064, + "step": 29870 + }, + { + "epoch": 0.25929462417860955, + "grad_norm": 0.5546875, + "learning_rate": 0.001620058367268664, + "loss": 0.0981, + "step": 29871 + }, + { + "epoch": 0.25930330465881374, + "grad_norm": 0.1494140625, + "learning_rate": 0.0016200340764086394, + "loss": 0.1084, + "step": 29872 + }, + { + "epoch": 0.2593119851390179, + "grad_norm": 0.146484375, + "learning_rate": 0.0016200097849799133, + "loss": 0.1113, + "step": 29873 + }, + { + "epoch": 0.25932066561922207, + "grad_norm": 0.22265625, + "learning_rate": 0.0016199854929825125, + "loss": 0.083, + "step": 29874 + }, + { + "epoch": 0.2593293460994262, + "grad_norm": 0.265625, + "learning_rate": 0.0016199612004164634, + "loss": 0.1504, + "step": 29875 + }, + { + "epoch": 0.2593380265796304, + "grad_norm": 0.185546875, + "learning_rate": 0.0016199369072817922, + "loss": 0.1006, + "step": 29876 + }, + { + "epoch": 0.25934670705983454, + "grad_norm": 0.115234375, + "learning_rate": 0.0016199126135785264, + "loss": 0.0933, + "step": 29877 + }, + { + "epoch": 0.25935538754003873, + "grad_norm": 0.181640625, + "learning_rate": 0.0016198883193066914, + "loss": 0.1045, + "step": 29878 + }, + { + "epoch": 0.25936406802024287, + "grad_norm": 0.25390625, + "learning_rate": 0.0016198640244663145, + "loss": 0.1162, + "step": 29879 + }, + { + "epoch": 0.25937274850044706, + "grad_norm": 0.345703125, + "learning_rate": 0.001619839729057422, + "loss": 0.1309, + "step": 29880 + }, + { + "epoch": 0.2593814289806512, + "grad_norm": 0.1669921875, + "learning_rate": 0.0016198154330800407, + "loss": 0.1035, + "step": 29881 + }, + { + "epoch": 0.2593901094608554, + "grad_norm": 0.39453125, + "learning_rate": 0.0016197911365341973, + "loss": 0.127, + "step": 29882 + }, + { + "epoch": 0.2593987899410595, + "grad_norm": 0.69140625, + "learning_rate": 0.0016197668394199178, + "loss": 0.1069, + "step": 29883 + }, + { + "epoch": 0.2594074704212637, + "grad_norm": 0.09814453125, + "learning_rate": 0.0016197425417372293, + "loss": 0.0781, + "step": 29884 + }, + { + "epoch": 0.25941615090146786, + "grad_norm": 0.30859375, + "learning_rate": 0.001619718243486158, + "loss": 0.1699, + "step": 29885 + }, + { + "epoch": 0.25942483138167205, + "grad_norm": 0.1220703125, + "learning_rate": 0.0016196939446667309, + "loss": 0.1118, + "step": 29886 + }, + { + "epoch": 0.2594335118618762, + "grad_norm": 0.22265625, + "learning_rate": 0.001619669645278974, + "loss": 0.1396, + "step": 29887 + }, + { + "epoch": 0.2594421923420804, + "grad_norm": 0.330078125, + "learning_rate": 0.0016196453453229142, + "loss": 0.0835, + "step": 29888 + }, + { + "epoch": 0.2594508728222845, + "grad_norm": 0.283203125, + "learning_rate": 
0.0016196210447985784, + "loss": 0.0967, + "step": 29889 + }, + { + "epoch": 0.2594595533024887, + "grad_norm": 0.353515625, + "learning_rate": 0.0016195967437059925, + "loss": 0.1089, + "step": 29890 + }, + { + "epoch": 0.25946823378269285, + "grad_norm": 1.3359375, + "learning_rate": 0.0016195724420451836, + "loss": 0.1162, + "step": 29891 + }, + { + "epoch": 0.25947691426289704, + "grad_norm": 0.12890625, + "learning_rate": 0.0016195481398161777, + "loss": 0.0996, + "step": 29892 + }, + { + "epoch": 0.2594855947431012, + "grad_norm": 0.11328125, + "learning_rate": 0.0016195238370190024, + "loss": 0.0923, + "step": 29893 + }, + { + "epoch": 0.25949427522330537, + "grad_norm": 0.2099609375, + "learning_rate": 0.0016194995336536832, + "loss": 0.1152, + "step": 29894 + }, + { + "epoch": 0.2595029557035095, + "grad_norm": 0.474609375, + "learning_rate": 0.0016194752297202472, + "loss": 0.1055, + "step": 29895 + }, + { + "epoch": 0.2595116361837137, + "grad_norm": 0.51171875, + "learning_rate": 0.001619450925218721, + "loss": 0.1172, + "step": 29896 + }, + { + "epoch": 0.25952031666391784, + "grad_norm": 0.57421875, + "learning_rate": 0.0016194266201491308, + "loss": 0.0952, + "step": 29897 + }, + { + "epoch": 0.25952899714412203, + "grad_norm": 0.34375, + "learning_rate": 0.001619402314511504, + "loss": 0.1133, + "step": 29898 + }, + { + "epoch": 0.25953767762432617, + "grad_norm": 0.70703125, + "learning_rate": 0.001619378008305866, + "loss": 0.0854, + "step": 29899 + }, + { + "epoch": 0.25954635810453036, + "grad_norm": 0.45703125, + "learning_rate": 0.0016193537015322442, + "loss": 0.1123, + "step": 29900 + }, + { + "epoch": 0.2595550385847345, + "grad_norm": 0.578125, + "learning_rate": 0.001619329394190665, + "loss": 0.1201, + "step": 29901 + }, + { + "epoch": 0.2595637190649387, + "grad_norm": 0.78515625, + "learning_rate": 0.001619305086281155, + "loss": 0.0918, + "step": 29902 + }, + { + "epoch": 0.25957239954514283, + "grad_norm": 0.5078125, + "learning_rate": 0.001619280777803741, + "loss": 0.127, + "step": 29903 + }, + { + "epoch": 0.259581080025347, + "grad_norm": 0.392578125, + "learning_rate": 0.0016192564687584495, + "loss": 0.1172, + "step": 29904 + }, + { + "epoch": 0.25958976050555116, + "grad_norm": 0.5546875, + "learning_rate": 0.001619232159145306, + "loss": 0.0923, + "step": 29905 + }, + { + "epoch": 0.25959844098575535, + "grad_norm": 0.109375, + "learning_rate": 0.0016192078489643386, + "loss": 0.1201, + "step": 29906 + }, + { + "epoch": 0.2596071214659595, + "grad_norm": 0.4296875, + "learning_rate": 0.0016191835382155738, + "loss": 0.0703, + "step": 29907 + }, + { + "epoch": 0.2596158019461637, + "grad_norm": 0.42578125, + "learning_rate": 0.001619159226899037, + "loss": 0.1875, + "step": 29908 + }, + { + "epoch": 0.2596244824263678, + "grad_norm": 0.158203125, + "learning_rate": 0.0016191349150147555, + "loss": 0.0967, + "step": 29909 + }, + { + "epoch": 0.259633162906572, + "grad_norm": 0.12890625, + "learning_rate": 0.001619110602562756, + "loss": 0.0889, + "step": 29910 + }, + { + "epoch": 0.25964184338677615, + "grad_norm": 0.80859375, + "learning_rate": 0.001619086289543065, + "loss": 0.0981, + "step": 29911 + }, + { + "epoch": 0.25965052386698034, + "grad_norm": 0.1318359375, + "learning_rate": 0.001619061975955709, + "loss": 0.1279, + "step": 29912 + }, + { + "epoch": 0.2596592043471845, + "grad_norm": 0.6484375, + "learning_rate": 0.0016190376618007145, + "loss": 0.1011, + "step": 29913 + }, + { + "epoch": 0.2596678848273887, + "grad_norm": 0.08447265625, + 
"learning_rate": 0.0016190133470781083, + "loss": 0.0781, + "step": 29914 + }, + { + "epoch": 0.2596765653075928, + "grad_norm": 0.1494140625, + "learning_rate": 0.001618989031787917, + "loss": 0.0708, + "step": 29915 + }, + { + "epoch": 0.259685245787797, + "grad_norm": 0.12158203125, + "learning_rate": 0.001618964715930167, + "loss": 0.0649, + "step": 29916 + }, + { + "epoch": 0.25969392626800114, + "grad_norm": 0.1630859375, + "learning_rate": 0.0016189403995048853, + "loss": 0.1533, + "step": 29917 + }, + { + "epoch": 0.25970260674820533, + "grad_norm": 0.46484375, + "learning_rate": 0.0016189160825120974, + "loss": 0.1465, + "step": 29918 + }, + { + "epoch": 0.25971128722840947, + "grad_norm": 0.162109375, + "learning_rate": 0.0016188917649518312, + "loss": 0.1182, + "step": 29919 + }, + { + "epoch": 0.25971996770861366, + "grad_norm": 0.6875, + "learning_rate": 0.0016188674468241128, + "loss": 0.0757, + "step": 29920 + }, + { + "epoch": 0.2597286481888178, + "grad_norm": 0.1865234375, + "learning_rate": 0.0016188431281289688, + "loss": 0.0991, + "step": 29921 + }, + { + "epoch": 0.259737328669022, + "grad_norm": 0.0849609375, + "learning_rate": 0.0016188188088664256, + "loss": 0.0947, + "step": 29922 + }, + { + "epoch": 0.25974600914922613, + "grad_norm": 0.095703125, + "learning_rate": 0.00161879448903651, + "loss": 0.0747, + "step": 29923 + }, + { + "epoch": 0.2597546896294303, + "grad_norm": 0.107421875, + "learning_rate": 0.0016187701686392482, + "loss": 0.0898, + "step": 29924 + }, + { + "epoch": 0.25976337010963446, + "grad_norm": 0.11279296875, + "learning_rate": 0.0016187458476746676, + "loss": 0.1064, + "step": 29925 + }, + { + "epoch": 0.25977205058983865, + "grad_norm": 0.69140625, + "learning_rate": 0.0016187215261427939, + "loss": 0.1006, + "step": 29926 + }, + { + "epoch": 0.2597807310700428, + "grad_norm": 0.291015625, + "learning_rate": 0.0016186972040436545, + "loss": 0.127, + "step": 29927 + }, + { + "epoch": 0.259789411550247, + "grad_norm": 0.396484375, + "learning_rate": 0.0016186728813772755, + "loss": 0.0537, + "step": 29928 + }, + { + "epoch": 0.2597980920304511, + "grad_norm": 0.298828125, + "learning_rate": 0.001618648558143684, + "loss": 0.0962, + "step": 29929 + }, + { + "epoch": 0.2598067725106553, + "grad_norm": 0.10986328125, + "learning_rate": 0.001618624234342906, + "loss": 0.0918, + "step": 29930 + }, + { + "epoch": 0.25981545299085945, + "grad_norm": 1.1484375, + "learning_rate": 0.001618599909974968, + "loss": 0.085, + "step": 29931 + }, + { + "epoch": 0.25982413347106365, + "grad_norm": 0.1328125, + "learning_rate": 0.0016185755850398969, + "loss": 0.0947, + "step": 29932 + }, + { + "epoch": 0.2598328139512678, + "grad_norm": 0.12353515625, + "learning_rate": 0.0016185512595377196, + "loss": 0.125, + "step": 29933 + }, + { + "epoch": 0.259841494431472, + "grad_norm": 0.53515625, + "learning_rate": 0.0016185269334684626, + "loss": 0.0815, + "step": 29934 + }, + { + "epoch": 0.2598501749116761, + "grad_norm": 0.328125, + "learning_rate": 0.001618502606832152, + "loss": 0.1328, + "step": 29935 + }, + { + "epoch": 0.2598588553918803, + "grad_norm": 0.1728515625, + "learning_rate": 0.0016184782796288151, + "loss": 0.1143, + "step": 29936 + }, + { + "epoch": 0.25986753587208444, + "grad_norm": 0.25390625, + "learning_rate": 0.0016184539518584778, + "loss": 0.1602, + "step": 29937 + }, + { + "epoch": 0.25987621635228864, + "grad_norm": 0.326171875, + "learning_rate": 0.001618429623521167, + "loss": 0.1055, + "step": 29938 + }, + { + "epoch": 
0.2598848968324928, + "grad_norm": 0.29296875, + "learning_rate": 0.0016184052946169097, + "loss": 0.1445, + "step": 29939 + }, + { + "epoch": 0.2598935773126969, + "grad_norm": 0.189453125, + "learning_rate": 0.0016183809651457322, + "loss": 0.1152, + "step": 29940 + }, + { + "epoch": 0.2599022577929011, + "grad_norm": 0.5546875, + "learning_rate": 0.0016183566351076605, + "loss": 0.0889, + "step": 29941 + }, + { + "epoch": 0.25991093827310524, + "grad_norm": 0.181640625, + "learning_rate": 0.0016183323045027222, + "loss": 0.0835, + "step": 29942 + }, + { + "epoch": 0.25991961875330943, + "grad_norm": 0.384765625, + "learning_rate": 0.0016183079733309437, + "loss": 0.1245, + "step": 29943 + }, + { + "epoch": 0.25992829923351357, + "grad_norm": 0.10595703125, + "learning_rate": 0.001618283641592351, + "loss": 0.0894, + "step": 29944 + }, + { + "epoch": 0.25993697971371776, + "grad_norm": 0.6171875, + "learning_rate": 0.0016182593092869714, + "loss": 0.1064, + "step": 29945 + }, + { + "epoch": 0.2599456601939219, + "grad_norm": 0.7734375, + "learning_rate": 0.0016182349764148308, + "loss": 0.1152, + "step": 29946 + }, + { + "epoch": 0.2599543406741261, + "grad_norm": 0.201171875, + "learning_rate": 0.0016182106429759562, + "loss": 0.0918, + "step": 29947 + }, + { + "epoch": 0.25996302115433023, + "grad_norm": 0.2890625, + "learning_rate": 0.0016181863089703744, + "loss": 0.1494, + "step": 29948 + }, + { + "epoch": 0.2599717016345344, + "grad_norm": 0.361328125, + "learning_rate": 0.001618161974398112, + "loss": 0.0928, + "step": 29949 + }, + { + "epoch": 0.25998038211473856, + "grad_norm": 0.1689453125, + "learning_rate": 0.0016181376392591955, + "loss": 0.0962, + "step": 29950 + }, + { + "epoch": 0.25998906259494275, + "grad_norm": 0.06884765625, + "learning_rate": 0.0016181133035536512, + "loss": 0.0869, + "step": 29951 + }, + { + "epoch": 0.2599977430751469, + "grad_norm": 1.7734375, + "learning_rate": 0.0016180889672815061, + "loss": 0.2021, + "step": 29952 + }, + { + "epoch": 0.2600064235553511, + "grad_norm": 0.392578125, + "learning_rate": 0.0016180646304427865, + "loss": 0.1206, + "step": 29953 + }, + { + "epoch": 0.2600151040355552, + "grad_norm": 0.08642578125, + "learning_rate": 0.0016180402930375194, + "loss": 0.0996, + "step": 29954 + }, + { + "epoch": 0.2600237845157594, + "grad_norm": 0.087890625, + "learning_rate": 0.0016180159550657311, + "loss": 0.103, + "step": 29955 + }, + { + "epoch": 0.26003246499596355, + "grad_norm": 0.111328125, + "learning_rate": 0.0016179916165274485, + "loss": 0.1045, + "step": 29956 + }, + { + "epoch": 0.26004114547616775, + "grad_norm": 0.578125, + "learning_rate": 0.0016179672774226982, + "loss": 0.1094, + "step": 29957 + }, + { + "epoch": 0.2600498259563719, + "grad_norm": 0.64453125, + "learning_rate": 0.0016179429377515063, + "loss": 0.1055, + "step": 29958 + }, + { + "epoch": 0.2600585064365761, + "grad_norm": 0.07568359375, + "learning_rate": 0.0016179185975139, + "loss": 0.0986, + "step": 29959 + }, + { + "epoch": 0.2600671869167802, + "grad_norm": 0.0869140625, + "learning_rate": 0.0016178942567099056, + "loss": 0.0781, + "step": 29960 + }, + { + "epoch": 0.2600758673969844, + "grad_norm": 0.138671875, + "learning_rate": 0.0016178699153395497, + "loss": 0.0996, + "step": 29961 + }, + { + "epoch": 0.26008454787718854, + "grad_norm": 0.23828125, + "learning_rate": 0.0016178455734028592, + "loss": 0.085, + "step": 29962 + }, + { + "epoch": 0.26009322835739274, + "grad_norm": 0.47265625, + "learning_rate": 0.0016178212308998608, + "loss": 
0.0898, + "step": 29963 + }, + { + "epoch": 0.2601019088375969, + "grad_norm": 0.318359375, + "learning_rate": 0.0016177968878305804, + "loss": 0.0654, + "step": 29964 + }, + { + "epoch": 0.26011058931780107, + "grad_norm": 0.298828125, + "learning_rate": 0.0016177725441950455, + "loss": 0.1201, + "step": 29965 + }, + { + "epoch": 0.2601192697980052, + "grad_norm": 0.1806640625, + "learning_rate": 0.0016177481999932822, + "loss": 0.0913, + "step": 29966 + }, + { + "epoch": 0.2601279502782094, + "grad_norm": 0.271484375, + "learning_rate": 0.0016177238552253172, + "loss": 0.1143, + "step": 29967 + }, + { + "epoch": 0.26013663075841353, + "grad_norm": 0.40625, + "learning_rate": 0.001617699509891177, + "loss": 0.103, + "step": 29968 + }, + { + "epoch": 0.2601453112386177, + "grad_norm": 1.1015625, + "learning_rate": 0.0016176751639908886, + "loss": 0.0894, + "step": 29969 + }, + { + "epoch": 0.26015399171882186, + "grad_norm": 0.275390625, + "learning_rate": 0.0016176508175244784, + "loss": 0.0703, + "step": 29970 + }, + { + "epoch": 0.26016267219902606, + "grad_norm": 1.0546875, + "learning_rate": 0.001617626470491973, + "loss": 0.1279, + "step": 29971 + }, + { + "epoch": 0.2601713526792302, + "grad_norm": 0.2021484375, + "learning_rate": 0.0016176021228933991, + "loss": 0.1182, + "step": 29972 + }, + { + "epoch": 0.2601800331594344, + "grad_norm": 0.2109375, + "learning_rate": 0.0016175777747287833, + "loss": 0.0811, + "step": 29973 + }, + { + "epoch": 0.2601887136396385, + "grad_norm": 0.3515625, + "learning_rate": 0.0016175534259981522, + "loss": 0.1016, + "step": 29974 + }, + { + "epoch": 0.2601973941198427, + "grad_norm": 0.11181640625, + "learning_rate": 0.0016175290767015324, + "loss": 0.1016, + "step": 29975 + }, + { + "epoch": 0.26020607460004685, + "grad_norm": 0.5546875, + "learning_rate": 0.0016175047268389509, + "loss": 0.0908, + "step": 29976 + }, + { + "epoch": 0.26021475508025105, + "grad_norm": 0.328125, + "learning_rate": 0.0016174803764104338, + "loss": 0.0796, + "step": 29977 + }, + { + "epoch": 0.2602234355604552, + "grad_norm": 0.365234375, + "learning_rate": 0.0016174560254160076, + "loss": 0.0859, + "step": 29978 + }, + { + "epoch": 0.2602321160406594, + "grad_norm": 1.21875, + "learning_rate": 0.0016174316738556999, + "loss": 0.1211, + "step": 29979 + }, + { + "epoch": 0.2602407965208635, + "grad_norm": 0.232421875, + "learning_rate": 0.0016174073217295362, + "loss": 0.1416, + "step": 29980 + }, + { + "epoch": 0.2602494770010677, + "grad_norm": 0.369140625, + "learning_rate": 0.0016173829690375437, + "loss": 0.0977, + "step": 29981 + }, + { + "epoch": 0.26025815748127185, + "grad_norm": 0.1728515625, + "learning_rate": 0.0016173586157797488, + "loss": 0.1143, + "step": 29982 + }, + { + "epoch": 0.26026683796147604, + "grad_norm": 1.125, + "learning_rate": 0.0016173342619561785, + "loss": 0.2295, + "step": 29983 + }, + { + "epoch": 0.2602755184416802, + "grad_norm": 0.18359375, + "learning_rate": 0.0016173099075668593, + "loss": 0.0869, + "step": 29984 + }, + { + "epoch": 0.26028419892188437, + "grad_norm": 0.25390625, + "learning_rate": 0.001617285552611818, + "loss": 0.1533, + "step": 29985 + }, + { + "epoch": 0.2602928794020885, + "grad_norm": 0.140625, + "learning_rate": 0.0016172611970910807, + "loss": 0.1006, + "step": 29986 + }, + { + "epoch": 0.2603015598822927, + "grad_norm": 1.546875, + "learning_rate": 0.001617236841004674, + "loss": 0.1055, + "step": 29987 + }, + { + "epoch": 0.26031024036249684, + "grad_norm": 0.46484375, + "learning_rate": 
0.0016172124843526255, + "loss": 0.0684, + "step": 29988 + }, + { + "epoch": 0.26031892084270103, + "grad_norm": 0.392578125, + "learning_rate": 0.0016171881271349609, + "loss": 0.0996, + "step": 29989 + }, + { + "epoch": 0.26032760132290517, + "grad_norm": 0.54296875, + "learning_rate": 0.0016171637693517071, + "loss": 0.1016, + "step": 29990 + }, + { + "epoch": 0.26033628180310936, + "grad_norm": 0.1513671875, + "learning_rate": 0.0016171394110028907, + "loss": 0.0898, + "step": 29991 + }, + { + "epoch": 0.2603449622833135, + "grad_norm": 0.328125, + "learning_rate": 0.0016171150520885385, + "loss": 0.0908, + "step": 29992 + }, + { + "epoch": 0.2603536427635177, + "grad_norm": 0.2158203125, + "learning_rate": 0.001617090692608677, + "loss": 0.0918, + "step": 29993 + }, + { + "epoch": 0.2603623232437218, + "grad_norm": 0.1064453125, + "learning_rate": 0.0016170663325633331, + "loss": 0.1123, + "step": 29994 + }, + { + "epoch": 0.260371003723926, + "grad_norm": 0.234375, + "learning_rate": 0.001617041971952533, + "loss": 0.0791, + "step": 29995 + }, + { + "epoch": 0.26037968420413016, + "grad_norm": 0.30078125, + "learning_rate": 0.0016170176107763038, + "loss": 0.1465, + "step": 29996 + }, + { + "epoch": 0.26038836468433435, + "grad_norm": 0.228515625, + "learning_rate": 0.0016169932490346717, + "loss": 0.1162, + "step": 29997 + }, + { + "epoch": 0.2603970451645385, + "grad_norm": 0.12353515625, + "learning_rate": 0.0016169688867276636, + "loss": 0.0938, + "step": 29998 + }, + { + "epoch": 0.2604057256447427, + "grad_norm": 0.2158203125, + "learning_rate": 0.001616944523855306, + "loss": 0.1035, + "step": 29999 + }, + { + "epoch": 0.2604144061249468, + "grad_norm": 0.61328125, + "learning_rate": 0.0016169201604176258, + "loss": 0.083, + "step": 30000 + }, + { + "epoch": 0.260423086605151, + "grad_norm": 0.1337890625, + "learning_rate": 0.0016168957964146494, + "loss": 0.1172, + "step": 30001 + }, + { + "epoch": 0.26043176708535515, + "grad_norm": 0.1962890625, + "learning_rate": 0.0016168714318464034, + "loss": 0.1094, + "step": 30002 + }, + { + "epoch": 0.26044044756555934, + "grad_norm": 0.291015625, + "learning_rate": 0.001616847066712915, + "loss": 0.1035, + "step": 30003 + }, + { + "epoch": 0.2604491280457635, + "grad_norm": 0.1318359375, + "learning_rate": 0.0016168227010142098, + "loss": 0.125, + "step": 30004 + }, + { + "epoch": 0.26045780852596767, + "grad_norm": 0.123046875, + "learning_rate": 0.0016167983347503154, + "loss": 0.0718, + "step": 30005 + }, + { + "epoch": 0.2604664890061718, + "grad_norm": 0.306640625, + "learning_rate": 0.001616773967921258, + "loss": 0.1045, + "step": 30006 + }, + { + "epoch": 0.260475169486376, + "grad_norm": 0.240234375, + "learning_rate": 0.0016167496005270645, + "loss": 0.1226, + "step": 30007 + }, + { + "epoch": 0.26048384996658014, + "grad_norm": 0.380859375, + "learning_rate": 0.0016167252325677613, + "loss": 0.0811, + "step": 30008 + }, + { + "epoch": 0.26049253044678433, + "grad_norm": 0.091796875, + "learning_rate": 0.0016167008640433753, + "loss": 0.0771, + "step": 30009 + }, + { + "epoch": 0.26050121092698847, + "grad_norm": 0.279296875, + "learning_rate": 0.0016166764949539327, + "loss": 0.106, + "step": 30010 + }, + { + "epoch": 0.26050989140719266, + "grad_norm": 0.8359375, + "learning_rate": 0.0016166521252994608, + "loss": 0.1045, + "step": 30011 + }, + { + "epoch": 0.2605185718873968, + "grad_norm": 0.205078125, + "learning_rate": 0.0016166277550799858, + "loss": 0.104, + "step": 30012 + }, + { + "epoch": 0.260527252367601, + 
"grad_norm": 0.21484375, + "learning_rate": 0.001616603384295534, + "loss": 0.0623, + "step": 30013 + }, + { + "epoch": 0.26053593284780513, + "grad_norm": 0.232421875, + "learning_rate": 0.001616579012946133, + "loss": 0.1396, + "step": 30014 + }, + { + "epoch": 0.2605446133280093, + "grad_norm": 0.435546875, + "learning_rate": 0.001616554641031809, + "loss": 0.1162, + "step": 30015 + }, + { + "epoch": 0.26055329380821346, + "grad_norm": 0.6796875, + "learning_rate": 0.0016165302685525885, + "loss": 0.1133, + "step": 30016 + }, + { + "epoch": 0.26056197428841765, + "grad_norm": 2.359375, + "learning_rate": 0.001616505895508498, + "loss": 0.3574, + "step": 30017 + }, + { + "epoch": 0.2605706547686218, + "grad_norm": 0.76171875, + "learning_rate": 0.0016164815218995647, + "loss": 0.1143, + "step": 30018 + }, + { + "epoch": 0.260579335248826, + "grad_norm": 0.5234375, + "learning_rate": 0.001616457147725815, + "loss": 0.1128, + "step": 30019 + }, + { + "epoch": 0.2605880157290301, + "grad_norm": 0.251953125, + "learning_rate": 0.0016164327729872754, + "loss": 0.0879, + "step": 30020 + }, + { + "epoch": 0.2605966962092343, + "grad_norm": 0.09130859375, + "learning_rate": 0.0016164083976839723, + "loss": 0.0845, + "step": 30021 + }, + { + "epoch": 0.26060537668943845, + "grad_norm": 0.4921875, + "learning_rate": 0.0016163840218159334, + "loss": 0.1025, + "step": 30022 + }, + { + "epoch": 0.26061405716964264, + "grad_norm": 0.75, + "learning_rate": 0.0016163596453831843, + "loss": 0.1289, + "step": 30023 + }, + { + "epoch": 0.2606227376498468, + "grad_norm": 0.30078125, + "learning_rate": 0.0016163352683857522, + "loss": 0.1328, + "step": 30024 + }, + { + "epoch": 0.260631418130051, + "grad_norm": 0.31640625, + "learning_rate": 0.0016163108908236637, + "loss": 0.0786, + "step": 30025 + }, + { + "epoch": 0.2606400986102551, + "grad_norm": 0.671875, + "learning_rate": 0.001616286512696945, + "loss": 0.0869, + "step": 30026 + }, + { + "epoch": 0.2606487790904593, + "grad_norm": 0.267578125, + "learning_rate": 0.0016162621340056235, + "loss": 0.1494, + "step": 30027 + }, + { + "epoch": 0.26065745957066344, + "grad_norm": 0.2265625, + "learning_rate": 0.0016162377547497254, + "loss": 0.1338, + "step": 30028 + }, + { + "epoch": 0.26066614005086763, + "grad_norm": 0.55859375, + "learning_rate": 0.0016162133749292773, + "loss": 0.1099, + "step": 30029 + }, + { + "epoch": 0.26067482053107177, + "grad_norm": 0.72265625, + "learning_rate": 0.001616188994544306, + "loss": 0.1133, + "step": 30030 + }, + { + "epoch": 0.26068350101127596, + "grad_norm": 0.220703125, + "learning_rate": 0.0016161646135948384, + "loss": 0.1025, + "step": 30031 + }, + { + "epoch": 0.2606921814914801, + "grad_norm": 0.16015625, + "learning_rate": 0.0016161402320809008, + "loss": 0.1108, + "step": 30032 + }, + { + "epoch": 0.2607008619716843, + "grad_norm": 0.2001953125, + "learning_rate": 0.00161611585000252, + "loss": 0.1064, + "step": 30033 + }, + { + "epoch": 0.26070954245188843, + "grad_norm": 0.162109375, + "learning_rate": 0.0016160914673597226, + "loss": 0.1016, + "step": 30034 + }, + { + "epoch": 0.2607182229320926, + "grad_norm": 0.310546875, + "learning_rate": 0.0016160670841525353, + "loss": 0.0781, + "step": 30035 + }, + { + "epoch": 0.26072690341229676, + "grad_norm": 0.14453125, + "learning_rate": 0.001616042700380985, + "loss": 0.0728, + "step": 30036 + }, + { + "epoch": 0.26073558389250096, + "grad_norm": 0.5234375, + "learning_rate": 0.0016160183160450982, + "loss": 0.1484, + "step": 30037 + }, + { + "epoch": 
0.2607442643727051, + "grad_norm": 0.09033203125, + "learning_rate": 0.0016159939311449012, + "loss": 0.103, + "step": 30038 + }, + { + "epoch": 0.2607529448529093, + "grad_norm": 0.35546875, + "learning_rate": 0.001615969545680421, + "loss": 0.0791, + "step": 30039 + }, + { + "epoch": 0.2607616253331134, + "grad_norm": 0.41796875, + "learning_rate": 0.0016159451596516848, + "loss": 0.0947, + "step": 30040 + }, + { + "epoch": 0.2607703058133176, + "grad_norm": 0.2216796875, + "learning_rate": 0.001615920773058718, + "loss": 0.1152, + "step": 30041 + }, + { + "epoch": 0.26077898629352175, + "grad_norm": 0.28515625, + "learning_rate": 0.0016158963859015483, + "loss": 0.1113, + "step": 30042 + }, + { + "epoch": 0.26078766677372595, + "grad_norm": 0.275390625, + "learning_rate": 0.0016158719981802022, + "loss": 0.0977, + "step": 30043 + }, + { + "epoch": 0.2607963472539301, + "grad_norm": 0.125, + "learning_rate": 0.001615847609894706, + "loss": 0.0776, + "step": 30044 + }, + { + "epoch": 0.2608050277341343, + "grad_norm": 0.18359375, + "learning_rate": 0.001615823221045087, + "loss": 0.0991, + "step": 30045 + }, + { + "epoch": 0.2608137082143384, + "grad_norm": 0.150390625, + "learning_rate": 0.0016157988316313713, + "loss": 0.0942, + "step": 30046 + }, + { + "epoch": 0.2608223886945426, + "grad_norm": 0.353515625, + "learning_rate": 0.0016157744416535854, + "loss": 0.1006, + "step": 30047 + }, + { + "epoch": 0.26083106917474674, + "grad_norm": 0.09521484375, + "learning_rate": 0.0016157500511117568, + "loss": 0.0889, + "step": 30048 + }, + { + "epoch": 0.26083974965495094, + "grad_norm": 0.60546875, + "learning_rate": 0.0016157256600059114, + "loss": 0.085, + "step": 30049 + }, + { + "epoch": 0.2608484301351551, + "grad_norm": 0.154296875, + "learning_rate": 0.0016157012683360762, + "loss": 0.1289, + "step": 30050 + }, + { + "epoch": 0.26085711061535927, + "grad_norm": 0.40625, + "learning_rate": 0.0016156768761022777, + "loss": 0.1172, + "step": 30051 + }, + { + "epoch": 0.2608657910955634, + "grad_norm": 0.31640625, + "learning_rate": 0.0016156524833045428, + "loss": 0.1221, + "step": 30052 + }, + { + "epoch": 0.2608744715757676, + "grad_norm": 0.0966796875, + "learning_rate": 0.0016156280899428984, + "loss": 0.1191, + "step": 30053 + }, + { + "epoch": 0.26088315205597173, + "grad_norm": 0.125, + "learning_rate": 0.0016156036960173709, + "loss": 0.084, + "step": 30054 + }, + { + "epoch": 0.2608918325361759, + "grad_norm": 0.28125, + "learning_rate": 0.0016155793015279864, + "loss": 0.0615, + "step": 30055 + }, + { + "epoch": 0.26090051301638006, + "grad_norm": 0.384765625, + "learning_rate": 0.0016155549064747724, + "loss": 0.1396, + "step": 30056 + }, + { + "epoch": 0.26090919349658426, + "grad_norm": 0.49609375, + "learning_rate": 0.0016155305108577554, + "loss": 0.0776, + "step": 30057 + }, + { + "epoch": 0.2609178739767884, + "grad_norm": 0.212890625, + "learning_rate": 0.0016155061146769618, + "loss": 0.1436, + "step": 30058 + }, + { + "epoch": 0.2609265544569926, + "grad_norm": 0.1318359375, + "learning_rate": 0.0016154817179324184, + "loss": 0.1064, + "step": 30059 + }, + { + "epoch": 0.2609352349371967, + "grad_norm": 0.193359375, + "learning_rate": 0.0016154573206241523, + "loss": 0.1123, + "step": 30060 + }, + { + "epoch": 0.2609439154174009, + "grad_norm": 0.7890625, + "learning_rate": 0.0016154329227521895, + "loss": 0.1426, + "step": 30061 + }, + { + "epoch": 0.26095259589760506, + "grad_norm": 0.5703125, + "learning_rate": 0.0016154085243165572, + "loss": 0.0972, + "step": 
30062 + }, + { + "epoch": 0.26096127637780925, + "grad_norm": 0.478515625, + "learning_rate": 0.0016153841253172816, + "loss": 0.1133, + "step": 30063 + }, + { + "epoch": 0.2609699568580134, + "grad_norm": 0.099609375, + "learning_rate": 0.00161535972575439, + "loss": 0.0884, + "step": 30064 + }, + { + "epoch": 0.2609786373382175, + "grad_norm": 0.11572265625, + "learning_rate": 0.0016153353256279086, + "loss": 0.0791, + "step": 30065 + }, + { + "epoch": 0.2609873178184217, + "grad_norm": 0.443359375, + "learning_rate": 0.0016153109249378642, + "loss": 0.1123, + "step": 30066 + }, + { + "epoch": 0.26099599829862585, + "grad_norm": 0.099609375, + "learning_rate": 0.0016152865236842835, + "loss": 0.0986, + "step": 30067 + }, + { + "epoch": 0.26100467877883005, + "grad_norm": 0.1533203125, + "learning_rate": 0.0016152621218671935, + "loss": 0.1309, + "step": 30068 + }, + { + "epoch": 0.2610133592590342, + "grad_norm": 0.13671875, + "learning_rate": 0.00161523771948662, + "loss": 0.1074, + "step": 30069 + }, + { + "epoch": 0.2610220397392384, + "grad_norm": 1.625, + "learning_rate": 0.0016152133165425906, + "loss": 0.2217, + "step": 30070 + }, + { + "epoch": 0.2610307202194425, + "grad_norm": 0.26171875, + "learning_rate": 0.001615188913035132, + "loss": 0.1055, + "step": 30071 + }, + { + "epoch": 0.2610394006996467, + "grad_norm": 0.1611328125, + "learning_rate": 0.00161516450896427, + "loss": 0.1309, + "step": 30072 + }, + { + "epoch": 0.26104808117985084, + "grad_norm": 0.11181640625, + "learning_rate": 0.0016151401043300322, + "loss": 0.1006, + "step": 30073 + }, + { + "epoch": 0.26105676166005504, + "grad_norm": 0.6640625, + "learning_rate": 0.0016151156991324445, + "loss": 0.1104, + "step": 30074 + }, + { + "epoch": 0.2610654421402592, + "grad_norm": 0.76171875, + "learning_rate": 0.0016150912933715345, + "loss": 0.1387, + "step": 30075 + }, + { + "epoch": 0.26107412262046337, + "grad_norm": 0.251953125, + "learning_rate": 0.0016150668870473282, + "loss": 0.0908, + "step": 30076 + }, + { + "epoch": 0.2610828031006675, + "grad_norm": 0.953125, + "learning_rate": 0.0016150424801598525, + "loss": 0.2051, + "step": 30077 + }, + { + "epoch": 0.2610914835808717, + "grad_norm": 0.1181640625, + "learning_rate": 0.0016150180727091338, + "loss": 0.0903, + "step": 30078 + }, + { + "epoch": 0.26110016406107583, + "grad_norm": 0.380859375, + "learning_rate": 0.0016149936646951994, + "loss": 0.1123, + "step": 30079 + }, + { + "epoch": 0.26110884454128, + "grad_norm": 0.193359375, + "learning_rate": 0.0016149692561180757, + "loss": 0.1235, + "step": 30080 + }, + { + "epoch": 0.26111752502148416, + "grad_norm": 0.279296875, + "learning_rate": 0.0016149448469777894, + "loss": 0.1123, + "step": 30081 + }, + { + "epoch": 0.26112620550168836, + "grad_norm": 0.111328125, + "learning_rate": 0.0016149204372743668, + "loss": 0.0918, + "step": 30082 + }, + { + "epoch": 0.2611348859818925, + "grad_norm": 0.189453125, + "learning_rate": 0.001614896027007835, + "loss": 0.1289, + "step": 30083 + }, + { + "epoch": 0.2611435664620967, + "grad_norm": 0.4921875, + "learning_rate": 0.001614871616178221, + "loss": 0.1211, + "step": 30084 + }, + { + "epoch": 0.2611522469423008, + "grad_norm": 0.361328125, + "learning_rate": 0.001614847204785551, + "loss": 0.127, + "step": 30085 + }, + { + "epoch": 0.261160927422505, + "grad_norm": 0.205078125, + "learning_rate": 0.0016148227928298516, + "loss": 0.0923, + "step": 30086 + }, + { + "epoch": 0.26116960790270916, + "grad_norm": 0.314453125, + "learning_rate": 
0.00161479838031115, + "loss": 0.1289, + "step": 30087 + }, + { + "epoch": 0.26117828838291335, + "grad_norm": 0.609375, + "learning_rate": 0.0016147739672294724, + "loss": 0.1475, + "step": 30088 + }, + { + "epoch": 0.2611869688631175, + "grad_norm": 0.259765625, + "learning_rate": 0.0016147495535848459, + "loss": 0.1055, + "step": 30089 + }, + { + "epoch": 0.2611956493433217, + "grad_norm": 0.671875, + "learning_rate": 0.0016147251393772968, + "loss": 0.0718, + "step": 30090 + }, + { + "epoch": 0.2612043298235258, + "grad_norm": 0.75390625, + "learning_rate": 0.0016147007246068523, + "loss": 0.1553, + "step": 30091 + }, + { + "epoch": 0.26121301030373, + "grad_norm": 0.125, + "learning_rate": 0.0016146763092735386, + "loss": 0.0718, + "step": 30092 + }, + { + "epoch": 0.26122169078393415, + "grad_norm": 0.486328125, + "learning_rate": 0.0016146518933773829, + "loss": 0.0801, + "step": 30093 + }, + { + "epoch": 0.26123037126413834, + "grad_norm": 0.080078125, + "learning_rate": 0.0016146274769184112, + "loss": 0.0811, + "step": 30094 + }, + { + "epoch": 0.2612390517443425, + "grad_norm": 0.123046875, + "learning_rate": 0.001614603059896651, + "loss": 0.0835, + "step": 30095 + }, + { + "epoch": 0.26124773222454667, + "grad_norm": 0.5078125, + "learning_rate": 0.0016145786423121284, + "loss": 0.0996, + "step": 30096 + }, + { + "epoch": 0.2612564127047508, + "grad_norm": 0.1376953125, + "learning_rate": 0.0016145542241648705, + "loss": 0.1309, + "step": 30097 + }, + { + "epoch": 0.261265093184955, + "grad_norm": 0.244140625, + "learning_rate": 0.0016145298054549037, + "loss": 0.0737, + "step": 30098 + }, + { + "epoch": 0.26127377366515914, + "grad_norm": 0.8515625, + "learning_rate": 0.001614505386182255, + "loss": 0.0967, + "step": 30099 + }, + { + "epoch": 0.26128245414536333, + "grad_norm": 0.09033203125, + "learning_rate": 0.0016144809663469508, + "loss": 0.1074, + "step": 30100 + }, + { + "epoch": 0.26129113462556747, + "grad_norm": 0.2216796875, + "learning_rate": 0.001614456545949018, + "loss": 0.0928, + "step": 30101 + }, + { + "epoch": 0.26129981510577166, + "grad_norm": 0.470703125, + "learning_rate": 0.0016144321249884832, + "loss": 0.1084, + "step": 30102 + }, + { + "epoch": 0.2613084955859758, + "grad_norm": 0.302734375, + "learning_rate": 0.001614407703465373, + "loss": 0.0859, + "step": 30103 + }, + { + "epoch": 0.26131717606618, + "grad_norm": 0.45703125, + "learning_rate": 0.0016143832813797145, + "loss": 0.124, + "step": 30104 + }, + { + "epoch": 0.2613258565463841, + "grad_norm": 0.42578125, + "learning_rate": 0.0016143588587315342, + "loss": 0.0659, + "step": 30105 + }, + { + "epoch": 0.2613345370265883, + "grad_norm": 0.431640625, + "learning_rate": 0.001614334435520859, + "loss": 0.126, + "step": 30106 + }, + { + "epoch": 0.26134321750679246, + "grad_norm": 0.4140625, + "learning_rate": 0.0016143100117477149, + "loss": 0.1138, + "step": 30107 + }, + { + "epoch": 0.26135189798699665, + "grad_norm": 0.609375, + "learning_rate": 0.0016142855874121292, + "loss": 0.1602, + "step": 30108 + }, + { + "epoch": 0.2613605784672008, + "grad_norm": 0.92578125, + "learning_rate": 0.0016142611625141288, + "loss": 0.1777, + "step": 30109 + }, + { + "epoch": 0.261369258947405, + "grad_norm": 0.6015625, + "learning_rate": 0.0016142367370537397, + "loss": 0.1484, + "step": 30110 + }, + { + "epoch": 0.2613779394276091, + "grad_norm": 0.1865234375, + "learning_rate": 0.0016142123110309892, + "loss": 0.1396, + "step": 30111 + }, + { + "epoch": 0.2613866199078133, + "grad_norm": 0.333984375, 
+ "learning_rate": 0.0016141878844459037, + "loss": 0.1172, + "step": 30112 + }, + { + "epoch": 0.26139530038801745, + "grad_norm": 0.07958984375, + "learning_rate": 0.0016141634572985107, + "loss": 0.0908, + "step": 30113 + }, + { + "epoch": 0.26140398086822164, + "grad_norm": 0.15234375, + "learning_rate": 0.0016141390295888356, + "loss": 0.1377, + "step": 30114 + }, + { + "epoch": 0.2614126613484258, + "grad_norm": 0.240234375, + "learning_rate": 0.001614114601316906, + "loss": 0.082, + "step": 30115 + }, + { + "epoch": 0.26142134182862997, + "grad_norm": 0.212890625, + "learning_rate": 0.0016140901724827485, + "loss": 0.1162, + "step": 30116 + }, + { + "epoch": 0.2614300223088341, + "grad_norm": 0.0791015625, + "learning_rate": 0.0016140657430863893, + "loss": 0.0996, + "step": 30117 + }, + { + "epoch": 0.2614387027890383, + "grad_norm": 0.39453125, + "learning_rate": 0.001614041313127856, + "loss": 0.084, + "step": 30118 + }, + { + "epoch": 0.26144738326924244, + "grad_norm": 0.10205078125, + "learning_rate": 0.0016140168826071746, + "loss": 0.1006, + "step": 30119 + }, + { + "epoch": 0.26145606374944663, + "grad_norm": 0.4453125, + "learning_rate": 0.0016139924515243722, + "loss": 0.1387, + "step": 30120 + }, + { + "epoch": 0.26146474422965077, + "grad_norm": 0.10986328125, + "learning_rate": 0.0016139680198794754, + "loss": 0.0825, + "step": 30121 + }, + { + "epoch": 0.26147342470985496, + "grad_norm": 0.48828125, + "learning_rate": 0.0016139435876725108, + "loss": 0.082, + "step": 30122 + }, + { + "epoch": 0.2614821051900591, + "grad_norm": 0.326171875, + "learning_rate": 0.001613919154903505, + "loss": 0.1074, + "step": 30123 + }, + { + "epoch": 0.2614907856702633, + "grad_norm": 0.271484375, + "learning_rate": 0.0016138947215724852, + "loss": 0.1455, + "step": 30124 + }, + { + "epoch": 0.26149946615046743, + "grad_norm": 0.103515625, + "learning_rate": 0.0016138702876794778, + "loss": 0.1035, + "step": 30125 + }, + { + "epoch": 0.2615081466306716, + "grad_norm": 0.29296875, + "learning_rate": 0.0016138458532245097, + "loss": 0.1367, + "step": 30126 + }, + { + "epoch": 0.26151682711087576, + "grad_norm": 0.166015625, + "learning_rate": 0.0016138214182076073, + "loss": 0.1084, + "step": 30127 + }, + { + "epoch": 0.26152550759107995, + "grad_norm": 0.40234375, + "learning_rate": 0.001613796982628798, + "loss": 0.127, + "step": 30128 + }, + { + "epoch": 0.2615341880712841, + "grad_norm": 0.11181640625, + "learning_rate": 0.0016137725464881072, + "loss": 0.1045, + "step": 30129 + }, + { + "epoch": 0.2615428685514883, + "grad_norm": 0.40234375, + "learning_rate": 0.0016137481097855632, + "loss": 0.0898, + "step": 30130 + }, + { + "epoch": 0.2615515490316924, + "grad_norm": 0.279296875, + "learning_rate": 0.0016137236725211914, + "loss": 0.0938, + "step": 30131 + }, + { + "epoch": 0.2615602295118966, + "grad_norm": 0.57421875, + "learning_rate": 0.0016136992346950194, + "loss": 0.0977, + "step": 30132 + }, + { + "epoch": 0.26156890999210075, + "grad_norm": 0.51953125, + "learning_rate": 0.0016136747963070737, + "loss": 0.0928, + "step": 30133 + }, + { + "epoch": 0.26157759047230494, + "grad_norm": 0.2099609375, + "learning_rate": 0.001613650357357381, + "loss": 0.1094, + "step": 30134 + }, + { + "epoch": 0.2615862709525091, + "grad_norm": 0.30078125, + "learning_rate": 0.0016136259178459677, + "loss": 0.1465, + "step": 30135 + }, + { + "epoch": 0.2615949514327133, + "grad_norm": 1.5859375, + "learning_rate": 0.001613601477772861, + "loss": 0.4355, + "step": 30136 + }, + { + "epoch": 
0.2616036319129174, + "grad_norm": 0.484375, + "learning_rate": 0.0016135770371380875, + "loss": 0.1504, + "step": 30137 + }, + { + "epoch": 0.2616123123931216, + "grad_norm": 0.373046875, + "learning_rate": 0.0016135525959416737, + "loss": 0.1299, + "step": 30138 + }, + { + "epoch": 0.26162099287332574, + "grad_norm": 0.07275390625, + "learning_rate": 0.001613528154183647, + "loss": 0.1045, + "step": 30139 + }, + { + "epoch": 0.26162967335352993, + "grad_norm": 0.1005859375, + "learning_rate": 0.001613503711864033, + "loss": 0.0781, + "step": 30140 + }, + { + "epoch": 0.26163835383373407, + "grad_norm": 0.2080078125, + "learning_rate": 0.0016134792689828593, + "loss": 0.105, + "step": 30141 + }, + { + "epoch": 0.26164703431393826, + "grad_norm": 0.37109375, + "learning_rate": 0.0016134548255401521, + "loss": 0.1436, + "step": 30142 + }, + { + "epoch": 0.2616557147941424, + "grad_norm": 0.279296875, + "learning_rate": 0.0016134303815359388, + "loss": 0.0928, + "step": 30143 + }, + { + "epoch": 0.2616643952743466, + "grad_norm": 0.08544921875, + "learning_rate": 0.0016134059369702456, + "loss": 0.1143, + "step": 30144 + }, + { + "epoch": 0.26167307575455073, + "grad_norm": 0.57421875, + "learning_rate": 0.0016133814918430993, + "loss": 0.1621, + "step": 30145 + }, + { + "epoch": 0.2616817562347549, + "grad_norm": 0.18359375, + "learning_rate": 0.0016133570461545272, + "loss": 0.1113, + "step": 30146 + }, + { + "epoch": 0.26169043671495906, + "grad_norm": 0.1689453125, + "learning_rate": 0.001613332599904555, + "loss": 0.1118, + "step": 30147 + }, + { + "epoch": 0.26169911719516326, + "grad_norm": 0.09375, + "learning_rate": 0.0016133081530932101, + "loss": 0.1001, + "step": 30148 + }, + { + "epoch": 0.2617077976753674, + "grad_norm": 0.28125, + "learning_rate": 0.0016132837057205192, + "loss": 0.0957, + "step": 30149 + }, + { + "epoch": 0.2617164781555716, + "grad_norm": 0.19140625, + "learning_rate": 0.0016132592577865089, + "loss": 0.0884, + "step": 30150 + }, + { + "epoch": 0.2617251586357757, + "grad_norm": 0.21484375, + "learning_rate": 0.001613234809291206, + "loss": 0.0977, + "step": 30151 + }, + { + "epoch": 0.2617338391159799, + "grad_norm": 0.23828125, + "learning_rate": 0.0016132103602346374, + "loss": 0.1631, + "step": 30152 + }, + { + "epoch": 0.26174251959618405, + "grad_norm": 0.2890625, + "learning_rate": 0.0016131859106168294, + "loss": 0.124, + "step": 30153 + }, + { + "epoch": 0.26175120007638825, + "grad_norm": 0.390625, + "learning_rate": 0.0016131614604378093, + "loss": 0.0796, + "step": 30154 + }, + { + "epoch": 0.2617598805565924, + "grad_norm": 0.24609375, + "learning_rate": 0.001613137009697603, + "loss": 0.0981, + "step": 30155 + }, + { + "epoch": 0.2617685610367966, + "grad_norm": 0.609375, + "learning_rate": 0.0016131125583962382, + "loss": 0.1079, + "step": 30156 + }, + { + "epoch": 0.2617772415170007, + "grad_norm": 0.29296875, + "learning_rate": 0.001613088106533741, + "loss": 0.103, + "step": 30157 + }, + { + "epoch": 0.2617859219972049, + "grad_norm": 0.498046875, + "learning_rate": 0.0016130636541101385, + "loss": 0.1562, + "step": 30158 + }, + { + "epoch": 0.26179460247740904, + "grad_norm": 0.306640625, + "learning_rate": 0.0016130392011254574, + "loss": 0.0957, + "step": 30159 + }, + { + "epoch": 0.26180328295761324, + "grad_norm": 0.150390625, + "learning_rate": 0.001613014747579724, + "loss": 0.105, + "step": 30160 + }, + { + "epoch": 0.2618119634378174, + "grad_norm": 0.451171875, + "learning_rate": 0.0016129902934729657, + "loss": 0.0859, + "step": 
30161 + }, + { + "epoch": 0.26182064391802157, + "grad_norm": 0.5234375, + "learning_rate": 0.001612965838805209, + "loss": 0.1641, + "step": 30162 + }, + { + "epoch": 0.2618293243982257, + "grad_norm": 0.26171875, + "learning_rate": 0.0016129413835764804, + "loss": 0.1338, + "step": 30163 + }, + { + "epoch": 0.2618380048784299, + "grad_norm": 0.10595703125, + "learning_rate": 0.0016129169277868068, + "loss": 0.1094, + "step": 30164 + }, + { + "epoch": 0.26184668535863403, + "grad_norm": 0.2197265625, + "learning_rate": 0.0016128924714362148, + "loss": 0.1016, + "step": 30165 + }, + { + "epoch": 0.2618553658388382, + "grad_norm": 0.10888671875, + "learning_rate": 0.0016128680145247314, + "loss": 0.1221, + "step": 30166 + }, + { + "epoch": 0.26186404631904237, + "grad_norm": 0.333984375, + "learning_rate": 0.0016128435570523835, + "loss": 0.1045, + "step": 30167 + }, + { + "epoch": 0.26187272679924656, + "grad_norm": 0.298828125, + "learning_rate": 0.001612819099019197, + "loss": 0.0791, + "step": 30168 + }, + { + "epoch": 0.2618814072794507, + "grad_norm": 0.35546875, + "learning_rate": 0.0016127946404252, + "loss": 0.1367, + "step": 30169 + }, + { + "epoch": 0.2618900877596549, + "grad_norm": 0.1435546875, + "learning_rate": 0.0016127701812704179, + "loss": 0.083, + "step": 30170 + }, + { + "epoch": 0.261898768239859, + "grad_norm": 0.439453125, + "learning_rate": 0.0016127457215548784, + "loss": 0.1079, + "step": 30171 + }, + { + "epoch": 0.2619074487200632, + "grad_norm": 0.333984375, + "learning_rate": 0.0016127212612786076, + "loss": 0.1187, + "step": 30172 + }, + { + "epoch": 0.26191612920026736, + "grad_norm": 0.5625, + "learning_rate": 0.0016126968004416327, + "loss": 0.1758, + "step": 30173 + }, + { + "epoch": 0.26192480968047155, + "grad_norm": 0.384765625, + "learning_rate": 0.00161267233904398, + "loss": 0.0938, + "step": 30174 + }, + { + "epoch": 0.2619334901606757, + "grad_norm": 0.06787109375, + "learning_rate": 0.001612647877085677, + "loss": 0.0747, + "step": 30175 + }, + { + "epoch": 0.2619421706408799, + "grad_norm": 0.10107421875, + "learning_rate": 0.00161262341456675, + "loss": 0.1309, + "step": 30176 + }, + { + "epoch": 0.261950851121084, + "grad_norm": 0.291015625, + "learning_rate": 0.001612598951487225, + "loss": 0.0986, + "step": 30177 + }, + { + "epoch": 0.2619595316012882, + "grad_norm": 0.07958984375, + "learning_rate": 0.0016125744878471302, + "loss": 0.1201, + "step": 30178 + }, + { + "epoch": 0.26196821208149235, + "grad_norm": 0.1845703125, + "learning_rate": 0.0016125500236464914, + "loss": 0.1104, + "step": 30179 + }, + { + "epoch": 0.26197689256169654, + "grad_norm": 0.38671875, + "learning_rate": 0.0016125255588853356, + "loss": 0.1113, + "step": 30180 + }, + { + "epoch": 0.2619855730419007, + "grad_norm": 0.1826171875, + "learning_rate": 0.0016125010935636895, + "loss": 0.0845, + "step": 30181 + }, + { + "epoch": 0.26199425352210487, + "grad_norm": 0.142578125, + "learning_rate": 0.00161247662768158, + "loss": 0.0962, + "step": 30182 + }, + { + "epoch": 0.262002934002309, + "grad_norm": 0.15234375, + "learning_rate": 0.0016124521612390337, + "loss": 0.0928, + "step": 30183 + }, + { + "epoch": 0.2620116144825132, + "grad_norm": 0.1435546875, + "learning_rate": 0.0016124276942360775, + "loss": 0.1079, + "step": 30184 + }, + { + "epoch": 0.26202029496271734, + "grad_norm": 0.103515625, + "learning_rate": 0.0016124032266727382, + "loss": 0.1133, + "step": 30185 + }, + { + "epoch": 0.26202897544292153, + "grad_norm": 0.353515625, + "learning_rate": 
0.0016123787585490422, + "loss": 0.0898, + "step": 30186 + }, + { + "epoch": 0.26203765592312567, + "grad_norm": 1.046875, + "learning_rate": 0.0016123542898650166, + "loss": 0.1387, + "step": 30187 + }, + { + "epoch": 0.2620463364033298, + "grad_norm": 0.1689453125, + "learning_rate": 0.0016123298206206877, + "loss": 0.0918, + "step": 30188 + }, + { + "epoch": 0.262055016883534, + "grad_norm": 0.609375, + "learning_rate": 0.0016123053508160833, + "loss": 0.1602, + "step": 30189 + }, + { + "epoch": 0.26206369736373814, + "grad_norm": 0.1455078125, + "learning_rate": 0.0016122808804512289, + "loss": 0.1201, + "step": 30190 + }, + { + "epoch": 0.26207237784394233, + "grad_norm": 0.27734375, + "learning_rate": 0.001612256409526152, + "loss": 0.1221, + "step": 30191 + }, + { + "epoch": 0.26208105832414647, + "grad_norm": 0.1474609375, + "learning_rate": 0.001612231938040879, + "loss": 0.0996, + "step": 30192 + }, + { + "epoch": 0.26208973880435066, + "grad_norm": 0.236328125, + "learning_rate": 0.0016122074659954372, + "loss": 0.1172, + "step": 30193 + }, + { + "epoch": 0.2620984192845548, + "grad_norm": 0.2021484375, + "learning_rate": 0.0016121829933898523, + "loss": 0.1021, + "step": 30194 + }, + { + "epoch": 0.262107099764759, + "grad_norm": 0.1201171875, + "learning_rate": 0.0016121585202241524, + "loss": 0.1104, + "step": 30195 + }, + { + "epoch": 0.2621157802449631, + "grad_norm": 0.333984375, + "learning_rate": 0.0016121340464983638, + "loss": 0.0962, + "step": 30196 + }, + { + "epoch": 0.2621244607251673, + "grad_norm": 0.248046875, + "learning_rate": 0.0016121095722125129, + "loss": 0.1152, + "step": 30197 + }, + { + "epoch": 0.26213314120537146, + "grad_norm": 0.279296875, + "learning_rate": 0.0016120850973666266, + "loss": 0.083, + "step": 30198 + }, + { + "epoch": 0.26214182168557565, + "grad_norm": 0.376953125, + "learning_rate": 0.0016120606219607316, + "loss": 0.0859, + "step": 30199 + }, + { + "epoch": 0.2621505021657798, + "grad_norm": 0.578125, + "learning_rate": 0.001612036145994855, + "loss": 0.1079, + "step": 30200 + }, + { + "epoch": 0.262159182645984, + "grad_norm": 0.28515625, + "learning_rate": 0.0016120116694690232, + "loss": 0.0845, + "step": 30201 + }, + { + "epoch": 0.2621678631261881, + "grad_norm": 0.44140625, + "learning_rate": 0.0016119871923832635, + "loss": 0.0776, + "step": 30202 + }, + { + "epoch": 0.2621765436063923, + "grad_norm": 0.13671875, + "learning_rate": 0.0016119627147376018, + "loss": 0.1113, + "step": 30203 + }, + { + "epoch": 0.26218522408659645, + "grad_norm": 0.279296875, + "learning_rate": 0.0016119382365320657, + "loss": 0.1348, + "step": 30204 + }, + { + "epoch": 0.26219390456680064, + "grad_norm": 0.40234375, + "learning_rate": 0.0016119137577666816, + "loss": 0.1079, + "step": 30205 + }, + { + "epoch": 0.2622025850470048, + "grad_norm": 0.125, + "learning_rate": 0.001611889278441476, + "loss": 0.085, + "step": 30206 + }, + { + "epoch": 0.26221126552720897, + "grad_norm": 0.453125, + "learning_rate": 0.0016118647985564763, + "loss": 0.1133, + "step": 30207 + }, + { + "epoch": 0.2622199460074131, + "grad_norm": 0.1650390625, + "learning_rate": 0.0016118403181117089, + "loss": 0.105, + "step": 30208 + }, + { + "epoch": 0.2622286264876173, + "grad_norm": 0.10546875, + "learning_rate": 0.0016118158371072008, + "loss": 0.0713, + "step": 30209 + }, + { + "epoch": 0.26223730696782144, + "grad_norm": 1.0859375, + "learning_rate": 0.001611791355542978, + "loss": 0.1582, + "step": 30210 + }, + { + "epoch": 0.26224598744802563, + "grad_norm": 
0.2236328125, + "learning_rate": 0.0016117668734190686, + "loss": 0.0898, + "step": 30211 + }, + { + "epoch": 0.26225466792822977, + "grad_norm": 0.302734375, + "learning_rate": 0.001611742390735498, + "loss": 0.1006, + "step": 30212 + }, + { + "epoch": 0.26226334840843396, + "grad_norm": 0.16796875, + "learning_rate": 0.0016117179074922941, + "loss": 0.1084, + "step": 30213 + }, + { + "epoch": 0.2622720288886381, + "grad_norm": 0.298828125, + "learning_rate": 0.001611693423689483, + "loss": 0.0908, + "step": 30214 + }, + { + "epoch": 0.2622807093688423, + "grad_norm": 0.19140625, + "learning_rate": 0.0016116689393270913, + "loss": 0.1006, + "step": 30215 + }, + { + "epoch": 0.26228938984904643, + "grad_norm": 0.11767578125, + "learning_rate": 0.0016116444544051465, + "loss": 0.0903, + "step": 30216 + }, + { + "epoch": 0.2622980703292506, + "grad_norm": 0.1962890625, + "learning_rate": 0.001611619968923675, + "loss": 0.1191, + "step": 30217 + }, + { + "epoch": 0.26230675080945476, + "grad_norm": 0.41796875, + "learning_rate": 0.0016115954828827038, + "loss": 0.1226, + "step": 30218 + }, + { + "epoch": 0.26231543128965895, + "grad_norm": 0.58984375, + "learning_rate": 0.001611570996282259, + "loss": 0.0996, + "step": 30219 + }, + { + "epoch": 0.2623241117698631, + "grad_norm": 0.4453125, + "learning_rate": 0.0016115465091223681, + "loss": 0.084, + "step": 30220 + }, + { + "epoch": 0.2623327922500673, + "grad_norm": 0.30859375, + "learning_rate": 0.0016115220214030575, + "loss": 0.125, + "step": 30221 + }, + { + "epoch": 0.2623414727302714, + "grad_norm": 0.099609375, + "learning_rate": 0.0016114975331243543, + "loss": 0.0908, + "step": 30222 + }, + { + "epoch": 0.2623501532104756, + "grad_norm": 0.62109375, + "learning_rate": 0.0016114730442862851, + "loss": 0.0674, + "step": 30223 + }, + { + "epoch": 0.26235883369067975, + "grad_norm": 0.30078125, + "learning_rate": 0.0016114485548888767, + "loss": 0.1011, + "step": 30224 + }, + { + "epoch": 0.26236751417088394, + "grad_norm": 0.1865234375, + "learning_rate": 0.0016114240649321555, + "loss": 0.1084, + "step": 30225 + }, + { + "epoch": 0.2623761946510881, + "grad_norm": 0.71875, + "learning_rate": 0.0016113995744161489, + "loss": 0.1406, + "step": 30226 + }, + { + "epoch": 0.26238487513129227, + "grad_norm": 0.423828125, + "learning_rate": 0.0016113750833408832, + "loss": 0.124, + "step": 30227 + }, + { + "epoch": 0.2623935556114964, + "grad_norm": 0.0966796875, + "learning_rate": 0.0016113505917063858, + "loss": 0.127, + "step": 30228 + }, + { + "epoch": 0.2624022360917006, + "grad_norm": 0.154296875, + "learning_rate": 0.0016113260995126825, + "loss": 0.1016, + "step": 30229 + }, + { + "epoch": 0.26241091657190474, + "grad_norm": 0.56640625, + "learning_rate": 0.0016113016067598011, + "loss": 0.083, + "step": 30230 + }, + { + "epoch": 0.26241959705210893, + "grad_norm": 0.08740234375, + "learning_rate": 0.0016112771134477678, + "loss": 0.1162, + "step": 30231 + }, + { + "epoch": 0.26242827753231307, + "grad_norm": 0.275390625, + "learning_rate": 0.0016112526195766097, + "loss": 0.0786, + "step": 30232 + }, + { + "epoch": 0.26243695801251726, + "grad_norm": 0.2080078125, + "learning_rate": 0.001611228125146353, + "loss": 0.1104, + "step": 30233 + }, + { + "epoch": 0.2624456384927214, + "grad_norm": 0.25390625, + "learning_rate": 0.001611203630157025, + "loss": 0.1553, + "step": 30234 + }, + { + "epoch": 0.2624543189729256, + "grad_norm": 0.271484375, + "learning_rate": 0.0016111791346086527, + "loss": 0.1079, + "step": 30235 + }, + { + 
"epoch": 0.26246299945312973, + "grad_norm": 0.1611328125, + "learning_rate": 0.0016111546385012626, + "loss": 0.0806, + "step": 30236 + }, + { + "epoch": 0.2624716799333339, + "grad_norm": 0.408203125, + "learning_rate": 0.0016111301418348812, + "loss": 0.1143, + "step": 30237 + }, + { + "epoch": 0.26248036041353806, + "grad_norm": 0.146484375, + "learning_rate": 0.0016111056446095355, + "loss": 0.0791, + "step": 30238 + }, + { + "epoch": 0.26248904089374225, + "grad_norm": 0.33984375, + "learning_rate": 0.0016110811468252524, + "loss": 0.0938, + "step": 30239 + }, + { + "epoch": 0.2624977213739464, + "grad_norm": 0.5625, + "learning_rate": 0.0016110566484820589, + "loss": 0.1045, + "step": 30240 + }, + { + "epoch": 0.2625064018541506, + "grad_norm": 0.4765625, + "learning_rate": 0.0016110321495799813, + "loss": 0.1777, + "step": 30241 + }, + { + "epoch": 0.2625150823343547, + "grad_norm": 0.1875, + "learning_rate": 0.0016110076501190467, + "loss": 0.1387, + "step": 30242 + }, + { + "epoch": 0.2625237628145589, + "grad_norm": 0.11376953125, + "learning_rate": 0.0016109831500992818, + "loss": 0.0884, + "step": 30243 + }, + { + "epoch": 0.26253244329476305, + "grad_norm": 1.1171875, + "learning_rate": 0.0016109586495207133, + "loss": 0.1201, + "step": 30244 + }, + { + "epoch": 0.26254112377496724, + "grad_norm": 0.296875, + "learning_rate": 0.0016109341483833684, + "loss": 0.0977, + "step": 30245 + }, + { + "epoch": 0.2625498042551714, + "grad_norm": 0.37109375, + "learning_rate": 0.001610909646687273, + "loss": 0.1187, + "step": 30246 + }, + { + "epoch": 0.2625584847353756, + "grad_norm": 0.5703125, + "learning_rate": 0.0016108851444324552, + "loss": 0.1426, + "step": 30247 + }, + { + "epoch": 0.2625671652155797, + "grad_norm": 0.2470703125, + "learning_rate": 0.0016108606416189407, + "loss": 0.1108, + "step": 30248 + }, + { + "epoch": 0.2625758456957839, + "grad_norm": 0.5, + "learning_rate": 0.0016108361382467567, + "loss": 0.1104, + "step": 30249 + }, + { + "epoch": 0.26258452617598804, + "grad_norm": 0.15625, + "learning_rate": 0.00161081163431593, + "loss": 0.0869, + "step": 30250 + }, + { + "epoch": 0.26259320665619224, + "grad_norm": 0.466796875, + "learning_rate": 0.0016107871298264873, + "loss": 0.1035, + "step": 30251 + }, + { + "epoch": 0.2626018871363964, + "grad_norm": 0.1220703125, + "learning_rate": 0.0016107626247784557, + "loss": 0.1191, + "step": 30252 + }, + { + "epoch": 0.26261056761660057, + "grad_norm": 0.455078125, + "learning_rate": 0.0016107381191718616, + "loss": 0.1025, + "step": 30253 + }, + { + "epoch": 0.2626192480968047, + "grad_norm": 0.212890625, + "learning_rate": 0.001610713613006732, + "loss": 0.0737, + "step": 30254 + }, + { + "epoch": 0.2626279285770089, + "grad_norm": 0.1435546875, + "learning_rate": 0.0016106891062830934, + "loss": 0.1396, + "step": 30255 + }, + { + "epoch": 0.26263660905721303, + "grad_norm": 0.177734375, + "learning_rate": 0.0016106645990009733, + "loss": 0.1006, + "step": 30256 + }, + { + "epoch": 0.2626452895374172, + "grad_norm": 0.2294921875, + "learning_rate": 0.001610640091160398, + "loss": 0.0986, + "step": 30257 + }, + { + "epoch": 0.26265397001762136, + "grad_norm": 0.482421875, + "learning_rate": 0.0016106155827613942, + "loss": 0.1309, + "step": 30258 + }, + { + "epoch": 0.26266265049782556, + "grad_norm": 0.09326171875, + "learning_rate": 0.001610591073803989, + "loss": 0.1289, + "step": 30259 + }, + { + "epoch": 0.2626713309780297, + "grad_norm": 0.54296875, + "learning_rate": 0.0016105665642882087, + "loss": 0.0869, + 
"step": 30260 + }, + { + "epoch": 0.2626800114582339, + "grad_norm": 0.10400390625, + "learning_rate": 0.0016105420542140809, + "loss": 0.083, + "step": 30261 + }, + { + "epoch": 0.262688691938438, + "grad_norm": 0.154296875, + "learning_rate": 0.0016105175435816319, + "loss": 0.0903, + "step": 30262 + }, + { + "epoch": 0.2626973724186422, + "grad_norm": 0.423828125, + "learning_rate": 0.0016104930323908883, + "loss": 0.1602, + "step": 30263 + }, + { + "epoch": 0.26270605289884635, + "grad_norm": 0.208984375, + "learning_rate": 0.0016104685206418776, + "loss": 0.0703, + "step": 30264 + }, + { + "epoch": 0.26271473337905055, + "grad_norm": 0.166015625, + "learning_rate": 0.0016104440083346261, + "loss": 0.1309, + "step": 30265 + }, + { + "epoch": 0.2627234138592547, + "grad_norm": 0.470703125, + "learning_rate": 0.0016104194954691606, + "loss": 0.1055, + "step": 30266 + }, + { + "epoch": 0.2627320943394589, + "grad_norm": 0.50390625, + "learning_rate": 0.001610394982045508, + "loss": 0.1152, + "step": 30267 + }, + { + "epoch": 0.262740774819663, + "grad_norm": 0.3671875, + "learning_rate": 0.0016103704680636953, + "loss": 0.1318, + "step": 30268 + }, + { + "epoch": 0.2627494552998672, + "grad_norm": 0.1298828125, + "learning_rate": 0.001610345953523749, + "loss": 0.0986, + "step": 30269 + }, + { + "epoch": 0.26275813578007134, + "grad_norm": 0.1669921875, + "learning_rate": 0.0016103214384256958, + "loss": 0.1191, + "step": 30270 + }, + { + "epoch": 0.26276681626027554, + "grad_norm": 0.19921875, + "learning_rate": 0.001610296922769563, + "loss": 0.1113, + "step": 30271 + }, + { + "epoch": 0.2627754967404797, + "grad_norm": 0.56640625, + "learning_rate": 0.0016102724065553769, + "loss": 0.1025, + "step": 30272 + }, + { + "epoch": 0.26278417722068387, + "grad_norm": 0.33203125, + "learning_rate": 0.0016102478897831649, + "loss": 0.1221, + "step": 30273 + }, + { + "epoch": 0.262792857700888, + "grad_norm": 0.259765625, + "learning_rate": 0.0016102233724529533, + "loss": 0.0762, + "step": 30274 + }, + { + "epoch": 0.2628015381810922, + "grad_norm": 0.08447265625, + "learning_rate": 0.0016101988545647692, + "loss": 0.0957, + "step": 30275 + }, + { + "epoch": 0.26281021866129634, + "grad_norm": 0.10009765625, + "learning_rate": 0.0016101743361186392, + "loss": 0.0879, + "step": 30276 + }, + { + "epoch": 0.26281889914150053, + "grad_norm": 0.291015625, + "learning_rate": 0.0016101498171145901, + "loss": 0.0986, + "step": 30277 + }, + { + "epoch": 0.26282757962170467, + "grad_norm": 0.138671875, + "learning_rate": 0.0016101252975526492, + "loss": 0.127, + "step": 30278 + }, + { + "epoch": 0.26283626010190886, + "grad_norm": 0.2255859375, + "learning_rate": 0.0016101007774328427, + "loss": 0.0977, + "step": 30279 + }, + { + "epoch": 0.262844940582113, + "grad_norm": 0.66796875, + "learning_rate": 0.0016100762567551976, + "loss": 0.105, + "step": 30280 + }, + { + "epoch": 0.2628536210623172, + "grad_norm": 0.166015625, + "learning_rate": 0.0016100517355197405, + "loss": 0.103, + "step": 30281 + }, + { + "epoch": 0.2628623015425213, + "grad_norm": 0.25, + "learning_rate": 0.0016100272137264992, + "loss": 0.124, + "step": 30282 + }, + { + "epoch": 0.2628709820227255, + "grad_norm": 0.333984375, + "learning_rate": 0.0016100026913754993, + "loss": 0.127, + "step": 30283 + }, + { + "epoch": 0.26287966250292966, + "grad_norm": 0.1640625, + "learning_rate": 0.0016099781684667683, + "loss": 0.1079, + "step": 30284 + }, + { + "epoch": 0.26288834298313385, + "grad_norm": 0.2060546875, + "learning_rate": 
0.0016099536450003328, + "loss": 0.1084, + "step": 30285 + }, + { + "epoch": 0.262897023463338, + "grad_norm": 0.1806640625, + "learning_rate": 0.0016099291209762195, + "loss": 0.0996, + "step": 30286 + }, + { + "epoch": 0.2629057039435422, + "grad_norm": 1.1953125, + "learning_rate": 0.0016099045963944554, + "loss": 0.1787, + "step": 30287 + }, + { + "epoch": 0.2629143844237463, + "grad_norm": 0.0859375, + "learning_rate": 0.0016098800712550673, + "loss": 0.0654, + "step": 30288 + }, + { + "epoch": 0.2629230649039505, + "grad_norm": 0.1181640625, + "learning_rate": 0.001609855545558082, + "loss": 0.1289, + "step": 30289 + }, + { + "epoch": 0.26293174538415465, + "grad_norm": 0.265625, + "learning_rate": 0.0016098310193035266, + "loss": 0.1318, + "step": 30290 + }, + { + "epoch": 0.26294042586435884, + "grad_norm": 0.1728515625, + "learning_rate": 0.0016098064924914275, + "loss": 0.0977, + "step": 30291 + }, + { + "epoch": 0.262949106344563, + "grad_norm": 0.431640625, + "learning_rate": 0.0016097819651218117, + "loss": 0.1211, + "step": 30292 + }, + { + "epoch": 0.26295778682476717, + "grad_norm": 0.177734375, + "learning_rate": 0.001609757437194706, + "loss": 0.0791, + "step": 30293 + }, + { + "epoch": 0.2629664673049713, + "grad_norm": 0.37109375, + "learning_rate": 0.001609732908710137, + "loss": 0.0977, + "step": 30294 + }, + { + "epoch": 0.2629751477851755, + "grad_norm": 0.4140625, + "learning_rate": 0.001609708379668132, + "loss": 0.126, + "step": 30295 + }, + { + "epoch": 0.26298382826537964, + "grad_norm": 0.20703125, + "learning_rate": 0.0016096838500687177, + "loss": 0.1006, + "step": 30296 + }, + { + "epoch": 0.26299250874558383, + "grad_norm": 0.1943359375, + "learning_rate": 0.0016096593199119206, + "loss": 0.1016, + "step": 30297 + }, + { + "epoch": 0.26300118922578797, + "grad_norm": 0.0908203125, + "learning_rate": 0.0016096347891977679, + "loss": 0.0781, + "step": 30298 + }, + { + "epoch": 0.26300986970599216, + "grad_norm": 0.322265625, + "learning_rate": 0.0016096102579262858, + "loss": 0.1143, + "step": 30299 + }, + { + "epoch": 0.2630185501861963, + "grad_norm": 0.2314453125, + "learning_rate": 0.001609585726097502, + "loss": 0.0703, + "step": 30300 + }, + { + "epoch": 0.2630272306664005, + "grad_norm": 0.32421875, + "learning_rate": 0.001609561193711443, + "loss": 0.1123, + "step": 30301 + }, + { + "epoch": 0.26303591114660463, + "grad_norm": 0.1201171875, + "learning_rate": 0.0016095366607681353, + "loss": 0.1211, + "step": 30302 + }, + { + "epoch": 0.2630445916268088, + "grad_norm": 0.46484375, + "learning_rate": 0.001609512127267606, + "loss": 0.0977, + "step": 30303 + }, + { + "epoch": 0.26305327210701296, + "grad_norm": 0.216796875, + "learning_rate": 0.0016094875932098815, + "loss": 0.0986, + "step": 30304 + }, + { + "epoch": 0.26306195258721715, + "grad_norm": 0.4609375, + "learning_rate": 0.0016094630585949898, + "loss": 0.0942, + "step": 30305 + }, + { + "epoch": 0.2630706330674213, + "grad_norm": 0.36328125, + "learning_rate": 0.0016094385234229566, + "loss": 0.1328, + "step": 30306 + }, + { + "epoch": 0.2630793135476255, + "grad_norm": 0.39453125, + "learning_rate": 0.001609413987693809, + "loss": 0.1035, + "step": 30307 + }, + { + "epoch": 0.2630879940278296, + "grad_norm": 0.2177734375, + "learning_rate": 0.001609389451407574, + "loss": 0.0986, + "step": 30308 + }, + { + "epoch": 0.2630966745080338, + "grad_norm": 0.330078125, + "learning_rate": 0.001609364914564278, + "loss": 0.0879, + "step": 30309 + }, + { + "epoch": 0.26310535498823795, + 
"grad_norm": 0.17578125, + "learning_rate": 0.0016093403771639488, + "loss": 0.085, + "step": 30310 + }, + { + "epoch": 0.2631140354684421, + "grad_norm": 0.298828125, + "learning_rate": 0.001609315839206612, + "loss": 0.0908, + "step": 30311 + }, + { + "epoch": 0.2631227159486463, + "grad_norm": 0.365234375, + "learning_rate": 0.0016092913006922954, + "loss": 0.0698, + "step": 30312 + }, + { + "epoch": 0.2631313964288504, + "grad_norm": 0.546875, + "learning_rate": 0.0016092667616210256, + "loss": 0.165, + "step": 30313 + }, + { + "epoch": 0.2631400769090546, + "grad_norm": 0.19921875, + "learning_rate": 0.0016092422219928292, + "loss": 0.0649, + "step": 30314 + }, + { + "epoch": 0.26314875738925875, + "grad_norm": 0.203125, + "learning_rate": 0.001609217681807733, + "loss": 0.0918, + "step": 30315 + }, + { + "epoch": 0.26315743786946294, + "grad_norm": 0.208984375, + "learning_rate": 0.0016091931410657642, + "loss": 0.123, + "step": 30316 + }, + { + "epoch": 0.2631661183496671, + "grad_norm": 0.1611328125, + "learning_rate": 0.0016091685997669492, + "loss": 0.0962, + "step": 30317 + }, + { + "epoch": 0.26317479882987127, + "grad_norm": 0.07666015625, + "learning_rate": 0.0016091440579113151, + "loss": 0.0918, + "step": 30318 + }, + { + "epoch": 0.2631834793100754, + "grad_norm": 0.365234375, + "learning_rate": 0.0016091195154988892, + "loss": 0.0986, + "step": 30319 + }, + { + "epoch": 0.2631921597902796, + "grad_norm": 0.4140625, + "learning_rate": 0.0016090949725296975, + "loss": 0.1084, + "step": 30320 + }, + { + "epoch": 0.26320084027048374, + "grad_norm": 0.455078125, + "learning_rate": 0.001609070429003767, + "loss": 0.1377, + "step": 30321 + }, + { + "epoch": 0.26320952075068793, + "grad_norm": 0.76171875, + "learning_rate": 0.001609045884921125, + "loss": 0.0781, + "step": 30322 + }, + { + "epoch": 0.26321820123089207, + "grad_norm": 0.62890625, + "learning_rate": 0.001609021340281798, + "loss": 0.1738, + "step": 30323 + }, + { + "epoch": 0.26322688171109626, + "grad_norm": 0.259765625, + "learning_rate": 0.0016089967950858126, + "loss": 0.1309, + "step": 30324 + }, + { + "epoch": 0.2632355621913004, + "grad_norm": 0.15234375, + "learning_rate": 0.0016089722493331964, + "loss": 0.1045, + "step": 30325 + }, + { + "epoch": 0.2632442426715046, + "grad_norm": 0.166015625, + "learning_rate": 0.0016089477030239757, + "loss": 0.1055, + "step": 30326 + }, + { + "epoch": 0.26325292315170873, + "grad_norm": 0.2275390625, + "learning_rate": 0.0016089231561581773, + "loss": 0.1152, + "step": 30327 + }, + { + "epoch": 0.2632616036319129, + "grad_norm": 0.69140625, + "learning_rate": 0.001608898608735828, + "loss": 0.085, + "step": 30328 + }, + { + "epoch": 0.26327028411211706, + "grad_norm": 0.0927734375, + "learning_rate": 0.0016088740607569552, + "loss": 0.1582, + "step": 30329 + }, + { + "epoch": 0.26327896459232125, + "grad_norm": 0.134765625, + "learning_rate": 0.0016088495122215853, + "loss": 0.0938, + "step": 30330 + }, + { + "epoch": 0.2632876450725254, + "grad_norm": 0.17578125, + "learning_rate": 0.0016088249631297455, + "loss": 0.0938, + "step": 30331 + }, + { + "epoch": 0.2632963255527296, + "grad_norm": 0.08056640625, + "learning_rate": 0.0016088004134814614, + "loss": 0.0967, + "step": 30332 + }, + { + "epoch": 0.2633050060329337, + "grad_norm": 0.447265625, + "learning_rate": 0.0016087758632767617, + "loss": 0.1484, + "step": 30333 + }, + { + "epoch": 0.2633136865131379, + "grad_norm": 0.3515625, + "learning_rate": 0.001608751312515672, + "loss": 0.1475, + "step": 30334 + }, + 
{ + "epoch": 0.26332236699334205, + "grad_norm": 0.240234375, + "learning_rate": 0.0016087267611982197, + "loss": 0.1123, + "step": 30335 + }, + { + "epoch": 0.26333104747354624, + "grad_norm": 0.2158203125, + "learning_rate": 0.0016087022093244315, + "loss": 0.0776, + "step": 30336 + }, + { + "epoch": 0.2633397279537504, + "grad_norm": 0.375, + "learning_rate": 0.0016086776568943338, + "loss": 0.0752, + "step": 30337 + }, + { + "epoch": 0.2633484084339546, + "grad_norm": 0.337890625, + "learning_rate": 0.0016086531039079541, + "loss": 0.1025, + "step": 30338 + }, + { + "epoch": 0.2633570889141587, + "grad_norm": 0.33203125, + "learning_rate": 0.0016086285503653193, + "loss": 0.1338, + "step": 30339 + }, + { + "epoch": 0.2633657693943629, + "grad_norm": 0.24609375, + "learning_rate": 0.0016086039962664555, + "loss": 0.1187, + "step": 30340 + }, + { + "epoch": 0.26337444987456704, + "grad_norm": 0.12353515625, + "learning_rate": 0.0016085794416113904, + "loss": 0.1055, + "step": 30341 + }, + { + "epoch": 0.26338313035477123, + "grad_norm": 0.0927734375, + "learning_rate": 0.00160855488640015, + "loss": 0.0879, + "step": 30342 + }, + { + "epoch": 0.26339181083497537, + "grad_norm": 0.2890625, + "learning_rate": 0.0016085303306327622, + "loss": 0.106, + "step": 30343 + }, + { + "epoch": 0.26340049131517956, + "grad_norm": 0.82421875, + "learning_rate": 0.0016085057743092529, + "loss": 0.1152, + "step": 30344 + }, + { + "epoch": 0.2634091717953837, + "grad_norm": 0.1796875, + "learning_rate": 0.0016084812174296493, + "loss": 0.1152, + "step": 30345 + }, + { + "epoch": 0.2634178522755879, + "grad_norm": 0.3046875, + "learning_rate": 0.0016084566599939784, + "loss": 0.1167, + "step": 30346 + }, + { + "epoch": 0.26342653275579203, + "grad_norm": 0.828125, + "learning_rate": 0.0016084321020022668, + "loss": 0.1318, + "step": 30347 + }, + { + "epoch": 0.2634352132359962, + "grad_norm": 0.5390625, + "learning_rate": 0.0016084075434545416, + "loss": 0.1543, + "step": 30348 + }, + { + "epoch": 0.26344389371620036, + "grad_norm": 0.1611328125, + "learning_rate": 0.0016083829843508298, + "loss": 0.1562, + "step": 30349 + }, + { + "epoch": 0.26345257419640455, + "grad_norm": 0.17578125, + "learning_rate": 0.0016083584246911575, + "loss": 0.0835, + "step": 30350 + }, + { + "epoch": 0.2634612546766087, + "grad_norm": 0.07958984375, + "learning_rate": 0.0016083338644755523, + "loss": 0.0889, + "step": 30351 + }, + { + "epoch": 0.2634699351568129, + "grad_norm": 0.421875, + "learning_rate": 0.001608309303704041, + "loss": 0.0933, + "step": 30352 + }, + { + "epoch": 0.263478615637017, + "grad_norm": 0.12158203125, + "learning_rate": 0.00160828474237665, + "loss": 0.0947, + "step": 30353 + }, + { + "epoch": 0.2634872961172212, + "grad_norm": 0.78125, + "learning_rate": 0.0016082601804934067, + "loss": 0.1118, + "step": 30354 + }, + { + "epoch": 0.26349597659742535, + "grad_norm": 0.40625, + "learning_rate": 0.0016082356180543375, + "loss": 0.1191, + "step": 30355 + }, + { + "epoch": 0.26350465707762954, + "grad_norm": 0.1845703125, + "learning_rate": 0.0016082110550594696, + "loss": 0.1089, + "step": 30356 + }, + { + "epoch": 0.2635133375578337, + "grad_norm": 0.482421875, + "learning_rate": 0.0016081864915088295, + "loss": 0.0869, + "step": 30357 + }, + { + "epoch": 0.2635220180380379, + "grad_norm": 0.2138671875, + "learning_rate": 0.0016081619274024442, + "loss": 0.123, + "step": 30358 + }, + { + "epoch": 0.263530698518242, + "grad_norm": 0.11376953125, + "learning_rate": 0.001608137362740341, + "loss": 
0.1348, + "step": 30359 + }, + { + "epoch": 0.2635393789984462, + "grad_norm": 0.138671875, + "learning_rate": 0.0016081127975225464, + "loss": 0.1133, + "step": 30360 + }, + { + "epoch": 0.26354805947865034, + "grad_norm": 0.11474609375, + "learning_rate": 0.0016080882317490873, + "loss": 0.1133, + "step": 30361 + }, + { + "epoch": 0.26355673995885454, + "grad_norm": 0.11279296875, + "learning_rate": 0.0016080636654199904, + "loss": 0.0613, + "step": 30362 + }, + { + "epoch": 0.2635654204390587, + "grad_norm": 0.126953125, + "learning_rate": 0.0016080390985352825, + "loss": 0.2109, + "step": 30363 + }, + { + "epoch": 0.26357410091926287, + "grad_norm": 0.0966796875, + "learning_rate": 0.0016080145310949912, + "loss": 0.082, + "step": 30364 + }, + { + "epoch": 0.263582781399467, + "grad_norm": 0.294921875, + "learning_rate": 0.0016079899630991423, + "loss": 0.0674, + "step": 30365 + }, + { + "epoch": 0.2635914618796712, + "grad_norm": 0.09423828125, + "learning_rate": 0.0016079653945477636, + "loss": 0.0635, + "step": 30366 + }, + { + "epoch": 0.26360014235987533, + "grad_norm": 0.16796875, + "learning_rate": 0.0016079408254408814, + "loss": 0.1357, + "step": 30367 + }, + { + "epoch": 0.2636088228400795, + "grad_norm": 0.1162109375, + "learning_rate": 0.001607916255778523, + "loss": 0.1191, + "step": 30368 + }, + { + "epoch": 0.26361750332028366, + "grad_norm": 0.1640625, + "learning_rate": 0.0016078916855607147, + "loss": 0.0938, + "step": 30369 + }, + { + "epoch": 0.26362618380048786, + "grad_norm": 0.09521484375, + "learning_rate": 0.0016078671147874838, + "loss": 0.0879, + "step": 30370 + }, + { + "epoch": 0.263634864280692, + "grad_norm": 1.1328125, + "learning_rate": 0.0016078425434588569, + "loss": 0.1367, + "step": 30371 + }, + { + "epoch": 0.2636435447608962, + "grad_norm": 0.341796875, + "learning_rate": 0.0016078179715748612, + "loss": 0.0732, + "step": 30372 + }, + { + "epoch": 0.2636522252411003, + "grad_norm": 0.1943359375, + "learning_rate": 0.0016077933991355234, + "loss": 0.0674, + "step": 30373 + }, + { + "epoch": 0.2636609057213045, + "grad_norm": 0.77734375, + "learning_rate": 0.0016077688261408702, + "loss": 0.1309, + "step": 30374 + }, + { + "epoch": 0.26366958620150865, + "grad_norm": 0.419921875, + "learning_rate": 0.001607744252590929, + "loss": 0.104, + "step": 30375 + }, + { + "epoch": 0.26367826668171285, + "grad_norm": 0.494140625, + "learning_rate": 0.001607719678485726, + "loss": 0.1084, + "step": 30376 + }, + { + "epoch": 0.263686947161917, + "grad_norm": 0.55859375, + "learning_rate": 0.0016076951038252886, + "loss": 0.1289, + "step": 30377 + }, + { + "epoch": 0.2636956276421212, + "grad_norm": 0.453125, + "learning_rate": 0.001607670528609643, + "loss": 0.1416, + "step": 30378 + }, + { + "epoch": 0.2637043081223253, + "grad_norm": 0.578125, + "learning_rate": 0.001607645952838817, + "loss": 0.0986, + "step": 30379 + }, + { + "epoch": 0.2637129886025295, + "grad_norm": 0.33203125, + "learning_rate": 0.0016076213765128366, + "loss": 0.1123, + "step": 30380 + }, + { + "epoch": 0.26372166908273365, + "grad_norm": 0.31640625, + "learning_rate": 0.0016075967996317297, + "loss": 0.0781, + "step": 30381 + }, + { + "epoch": 0.26373034956293784, + "grad_norm": 0.09228515625, + "learning_rate": 0.0016075722221955222, + "loss": 0.0845, + "step": 30382 + }, + { + "epoch": 0.263739030043142, + "grad_norm": 0.5234375, + "learning_rate": 0.0016075476442042416, + "loss": 0.0835, + "step": 30383 + }, + { + "epoch": 0.26374771052334617, + "grad_norm": 0.08447265625, + 
"learning_rate": 0.0016075230656579142, + "loss": 0.0859, + "step": 30384 + }, + { + "epoch": 0.2637563910035503, + "grad_norm": 0.5390625, + "learning_rate": 0.0016074984865565671, + "loss": 0.0933, + "step": 30385 + }, + { + "epoch": 0.2637650714837545, + "grad_norm": 0.0673828125, + "learning_rate": 0.0016074739069002278, + "loss": 0.0938, + "step": 30386 + }, + { + "epoch": 0.26377375196395864, + "grad_norm": 0.1015625, + "learning_rate": 0.0016074493266889224, + "loss": 0.1279, + "step": 30387 + }, + { + "epoch": 0.26378243244416283, + "grad_norm": 0.5234375, + "learning_rate": 0.0016074247459226779, + "loss": 0.0977, + "step": 30388 + }, + { + "epoch": 0.26379111292436697, + "grad_norm": 0.10107421875, + "learning_rate": 0.0016074001646015214, + "loss": 0.1025, + "step": 30389 + }, + { + "epoch": 0.26379979340457116, + "grad_norm": 0.2041015625, + "learning_rate": 0.0016073755827254797, + "loss": 0.1094, + "step": 30390 + }, + { + "epoch": 0.2638084738847753, + "grad_norm": 0.1123046875, + "learning_rate": 0.0016073510002945796, + "loss": 0.1152, + "step": 30391 + }, + { + "epoch": 0.2638171543649795, + "grad_norm": 0.212890625, + "learning_rate": 0.0016073264173088481, + "loss": 0.1504, + "step": 30392 + }, + { + "epoch": 0.2638258348451836, + "grad_norm": 5.78125, + "learning_rate": 0.001607301833768312, + "loss": 0.4023, + "step": 30393 + }, + { + "epoch": 0.2638345153253878, + "grad_norm": 0.1064453125, + "learning_rate": 0.0016072772496729987, + "loss": 0.1143, + "step": 30394 + }, + { + "epoch": 0.26384319580559196, + "grad_norm": 0.51953125, + "learning_rate": 0.0016072526650229343, + "loss": 0.1113, + "step": 30395 + }, + { + "epoch": 0.26385187628579615, + "grad_norm": 0.431640625, + "learning_rate": 0.001607228079818146, + "loss": 0.1172, + "step": 30396 + }, + { + "epoch": 0.2638605567660003, + "grad_norm": 0.361328125, + "learning_rate": 0.0016072034940586605, + "loss": 0.1318, + "step": 30397 + }, + { + "epoch": 0.2638692372462045, + "grad_norm": 0.1337890625, + "learning_rate": 0.0016071789077445051, + "loss": 0.105, + "step": 30398 + }, + { + "epoch": 0.2638779177264086, + "grad_norm": 0.48046875, + "learning_rate": 0.0016071543208757064, + "loss": 0.1016, + "step": 30399 + }, + { + "epoch": 0.2638865982066128, + "grad_norm": 0.44921875, + "learning_rate": 0.0016071297334522914, + "loss": 0.1011, + "step": 30400 + }, + { + "epoch": 0.26389527868681695, + "grad_norm": 0.30078125, + "learning_rate": 0.001607105145474287, + "loss": 0.0962, + "step": 30401 + }, + { + "epoch": 0.26390395916702114, + "grad_norm": 0.546875, + "learning_rate": 0.0016070805569417198, + "loss": 0.1177, + "step": 30402 + }, + { + "epoch": 0.2639126396472253, + "grad_norm": 0.443359375, + "learning_rate": 0.001607055967854617, + "loss": 0.1299, + "step": 30403 + }, + { + "epoch": 0.26392132012742947, + "grad_norm": 0.3359375, + "learning_rate": 0.0016070313782130057, + "loss": 0.1045, + "step": 30404 + }, + { + "epoch": 0.2639300006076336, + "grad_norm": 0.1298828125, + "learning_rate": 0.0016070067880169124, + "loss": 0.1152, + "step": 30405 + }, + { + "epoch": 0.2639386810878378, + "grad_norm": 0.18359375, + "learning_rate": 0.0016069821972663637, + "loss": 0.1201, + "step": 30406 + }, + { + "epoch": 0.26394736156804194, + "grad_norm": 0.404296875, + "learning_rate": 0.0016069576059613872, + "loss": 0.1143, + "step": 30407 + }, + { + "epoch": 0.26395604204824613, + "grad_norm": 0.1787109375, + "learning_rate": 0.0016069330141020098, + "loss": 0.0801, + "step": 30408 + }, + { + "epoch": 
0.26396472252845027, + "grad_norm": 0.162109375, + "learning_rate": 0.0016069084216882574, + "loss": 0.127, + "step": 30409 + }, + { + "epoch": 0.26397340300865446, + "grad_norm": 0.2392578125, + "learning_rate": 0.001606883828720158, + "loss": 0.1035, + "step": 30410 + }, + { + "epoch": 0.2639820834888586, + "grad_norm": 0.51171875, + "learning_rate": 0.0016068592351977378, + "loss": 0.1064, + "step": 30411 + }, + { + "epoch": 0.2639907639690628, + "grad_norm": 0.1279296875, + "learning_rate": 0.0016068346411210245, + "loss": 0.1001, + "step": 30412 + }, + { + "epoch": 0.26399944444926693, + "grad_norm": 0.09912109375, + "learning_rate": 0.001606810046490044, + "loss": 0.0957, + "step": 30413 + }, + { + "epoch": 0.2640081249294711, + "grad_norm": 0.251953125, + "learning_rate": 0.0016067854513048235, + "loss": 0.1064, + "step": 30414 + }, + { + "epoch": 0.26401680540967526, + "grad_norm": 0.1337890625, + "learning_rate": 0.0016067608555653903, + "loss": 0.1191, + "step": 30415 + }, + { + "epoch": 0.26402548588987945, + "grad_norm": 0.0947265625, + "learning_rate": 0.0016067362592717708, + "loss": 0.1099, + "step": 30416 + }, + { + "epoch": 0.2640341663700836, + "grad_norm": 0.2421875, + "learning_rate": 0.0016067116624239926, + "loss": 0.1543, + "step": 30417 + }, + { + "epoch": 0.2640428468502878, + "grad_norm": 0.73046875, + "learning_rate": 0.0016066870650220818, + "loss": 0.1553, + "step": 30418 + }, + { + "epoch": 0.2640515273304919, + "grad_norm": 0.58203125, + "learning_rate": 0.0016066624670660657, + "loss": 0.1279, + "step": 30419 + }, + { + "epoch": 0.2640602078106961, + "grad_norm": 0.46484375, + "learning_rate": 0.001606637868555971, + "loss": 0.0869, + "step": 30420 + }, + { + "epoch": 0.26406888829090025, + "grad_norm": 0.400390625, + "learning_rate": 0.001606613269491825, + "loss": 0.1396, + "step": 30421 + }, + { + "epoch": 0.26407756877110444, + "grad_norm": 0.349609375, + "learning_rate": 0.0016065886698736545, + "loss": 0.1157, + "step": 30422 + }, + { + "epoch": 0.2640862492513086, + "grad_norm": 0.162109375, + "learning_rate": 0.001606564069701486, + "loss": 0.0859, + "step": 30423 + }, + { + "epoch": 0.2640949297315128, + "grad_norm": 0.11572265625, + "learning_rate": 0.0016065394689753465, + "loss": 0.1025, + "step": 30424 + }, + { + "epoch": 0.2641036102117169, + "grad_norm": 0.5703125, + "learning_rate": 0.001606514867695263, + "loss": 0.1201, + "step": 30425 + }, + { + "epoch": 0.2641122906919211, + "grad_norm": 0.3359375, + "learning_rate": 0.0016064902658612626, + "loss": 0.1162, + "step": 30426 + }, + { + "epoch": 0.26412097117212524, + "grad_norm": 0.361328125, + "learning_rate": 0.001606465663473372, + "loss": 0.1035, + "step": 30427 + }, + { + "epoch": 0.26412965165232943, + "grad_norm": 0.12451171875, + "learning_rate": 0.0016064410605316182, + "loss": 0.125, + "step": 30428 + }, + { + "epoch": 0.26413833213253357, + "grad_norm": 0.140625, + "learning_rate": 0.0016064164570360282, + "loss": 0.1055, + "step": 30429 + }, + { + "epoch": 0.26414701261273776, + "grad_norm": 0.2255859375, + "learning_rate": 0.0016063918529866286, + "loss": 0.1689, + "step": 30430 + }, + { + "epoch": 0.2641556930929419, + "grad_norm": 0.15625, + "learning_rate": 0.0016063672483834467, + "loss": 0.1504, + "step": 30431 + }, + { + "epoch": 0.2641643735731461, + "grad_norm": 0.1875, + "learning_rate": 0.0016063426432265091, + "loss": 0.1055, + "step": 30432 + }, + { + "epoch": 0.26417305405335023, + "grad_norm": 0.224609375, + "learning_rate": 0.0016063180375158427, + "loss": 
0.1143, + "step": 30433 + }, + { + "epoch": 0.26418173453355437, + "grad_norm": 0.138671875, + "learning_rate": 0.0016062934312514746, + "loss": 0.1201, + "step": 30434 + }, + { + "epoch": 0.26419041501375856, + "grad_norm": 0.189453125, + "learning_rate": 0.0016062688244334314, + "loss": 0.1406, + "step": 30435 + }, + { + "epoch": 0.2641990954939627, + "grad_norm": 0.298828125, + "learning_rate": 0.0016062442170617405, + "loss": 0.1177, + "step": 30436 + }, + { + "epoch": 0.2642077759741669, + "grad_norm": 0.53515625, + "learning_rate": 0.0016062196091364282, + "loss": 0.084, + "step": 30437 + }, + { + "epoch": 0.26421645645437103, + "grad_norm": 0.3203125, + "learning_rate": 0.0016061950006575221, + "loss": 0.168, + "step": 30438 + }, + { + "epoch": 0.2642251369345752, + "grad_norm": 0.251953125, + "learning_rate": 0.0016061703916250484, + "loss": 0.083, + "step": 30439 + }, + { + "epoch": 0.26423381741477936, + "grad_norm": 0.59765625, + "learning_rate": 0.0016061457820390349, + "loss": 0.0874, + "step": 30440 + }, + { + "epoch": 0.26424249789498355, + "grad_norm": 0.0986328125, + "learning_rate": 0.0016061211718995075, + "loss": 0.0928, + "step": 30441 + }, + { + "epoch": 0.2642511783751877, + "grad_norm": 0.4609375, + "learning_rate": 0.0016060965612064936, + "loss": 0.126, + "step": 30442 + }, + { + "epoch": 0.2642598588553919, + "grad_norm": 0.34765625, + "learning_rate": 0.0016060719499600203, + "loss": 0.1523, + "step": 30443 + }, + { + "epoch": 0.264268539335596, + "grad_norm": 0.224609375, + "learning_rate": 0.0016060473381601143, + "loss": 0.1074, + "step": 30444 + }, + { + "epoch": 0.2642772198158002, + "grad_norm": 0.48828125, + "learning_rate": 0.0016060227258068024, + "loss": 0.0928, + "step": 30445 + }, + { + "epoch": 0.26428590029600435, + "grad_norm": 0.59765625, + "learning_rate": 0.0016059981129001115, + "loss": 0.1104, + "step": 30446 + }, + { + "epoch": 0.26429458077620854, + "grad_norm": 0.1875, + "learning_rate": 0.0016059734994400693, + "loss": 0.0645, + "step": 30447 + }, + { + "epoch": 0.2643032612564127, + "grad_norm": 0.62109375, + "learning_rate": 0.0016059488854267014, + "loss": 0.0898, + "step": 30448 + }, + { + "epoch": 0.2643119417366169, + "grad_norm": 0.232421875, + "learning_rate": 0.0016059242708600355, + "loss": 0.104, + "step": 30449 + }, + { + "epoch": 0.264320622216821, + "grad_norm": 0.1630859375, + "learning_rate": 0.0016058996557400988, + "loss": 0.0962, + "step": 30450 + }, + { + "epoch": 0.2643293026970252, + "grad_norm": 0.6171875, + "learning_rate": 0.0016058750400669175, + "loss": 0.2891, + "step": 30451 + }, + { + "epoch": 0.26433798317722934, + "grad_norm": 0.1787109375, + "learning_rate": 0.0016058504238405191, + "loss": 0.0854, + "step": 30452 + }, + { + "epoch": 0.26434666365743353, + "grad_norm": 0.107421875, + "learning_rate": 0.0016058258070609302, + "loss": 0.0918, + "step": 30453 + }, + { + "epoch": 0.26435534413763767, + "grad_norm": 0.09130859375, + "learning_rate": 0.0016058011897281776, + "loss": 0.1006, + "step": 30454 + }, + { + "epoch": 0.26436402461784186, + "grad_norm": 0.11083984375, + "learning_rate": 0.0016057765718422887, + "loss": 0.1592, + "step": 30455 + }, + { + "epoch": 0.264372705098046, + "grad_norm": 0.283203125, + "learning_rate": 0.0016057519534032898, + "loss": 0.1094, + "step": 30456 + }, + { + "epoch": 0.2643813855782502, + "grad_norm": 0.1376953125, + "learning_rate": 0.0016057273344112085, + "loss": 0.1123, + "step": 30457 + }, + { + "epoch": 0.26439006605845433, + "grad_norm": 0.330078125, + 
"learning_rate": 0.0016057027148660712, + "loss": 0.1699, + "step": 30458 + }, + { + "epoch": 0.2643987465386585, + "grad_norm": 0.06787109375, + "learning_rate": 0.0016056780947679053, + "loss": 0.1016, + "step": 30459 + }, + { + "epoch": 0.26440742701886266, + "grad_norm": 0.15234375, + "learning_rate": 0.0016056534741167373, + "loss": 0.084, + "step": 30460 + }, + { + "epoch": 0.26441610749906685, + "grad_norm": 0.1494140625, + "learning_rate": 0.0016056288529125943, + "loss": 0.1216, + "step": 30461 + }, + { + "epoch": 0.264424787979271, + "grad_norm": 0.09228515625, + "learning_rate": 0.0016056042311555028, + "loss": 0.1113, + "step": 30462 + }, + { + "epoch": 0.2644334684594752, + "grad_norm": 0.443359375, + "learning_rate": 0.0016055796088454903, + "loss": 0.1216, + "step": 30463 + }, + { + "epoch": 0.2644421489396793, + "grad_norm": 0.271484375, + "learning_rate": 0.0016055549859825835, + "loss": 0.0688, + "step": 30464 + }, + { + "epoch": 0.2644508294198835, + "grad_norm": 0.080078125, + "learning_rate": 0.0016055303625668095, + "loss": 0.1211, + "step": 30465 + }, + { + "epoch": 0.26445950990008765, + "grad_norm": 0.244140625, + "learning_rate": 0.001605505738598195, + "loss": 0.0996, + "step": 30466 + }, + { + "epoch": 0.26446819038029185, + "grad_norm": 0.263671875, + "learning_rate": 0.001605481114076767, + "loss": 0.1279, + "step": 30467 + }, + { + "epoch": 0.264476870860496, + "grad_norm": 0.1767578125, + "learning_rate": 0.001605456489002553, + "loss": 0.1123, + "step": 30468 + }, + { + "epoch": 0.2644855513407002, + "grad_norm": 0.2578125, + "learning_rate": 0.0016054318633755788, + "loss": 0.1592, + "step": 30469 + }, + { + "epoch": 0.2644942318209043, + "grad_norm": 0.1474609375, + "learning_rate": 0.001605407237195872, + "loss": 0.1074, + "step": 30470 + }, + { + "epoch": 0.2645029123011085, + "grad_norm": 0.1162109375, + "learning_rate": 0.0016053826104634593, + "loss": 0.1123, + "step": 30471 + }, + { + "epoch": 0.26451159278131264, + "grad_norm": 0.5703125, + "learning_rate": 0.001605357983178368, + "loss": 0.1221, + "step": 30472 + }, + { + "epoch": 0.26452027326151684, + "grad_norm": 0.109375, + "learning_rate": 0.0016053333553406246, + "loss": 0.0928, + "step": 30473 + }, + { + "epoch": 0.264528953741721, + "grad_norm": 0.408203125, + "learning_rate": 0.0016053087269502567, + "loss": 0.0854, + "step": 30474 + }, + { + "epoch": 0.26453763422192517, + "grad_norm": 0.345703125, + "learning_rate": 0.00160528409800729, + "loss": 0.0986, + "step": 30475 + }, + { + "epoch": 0.2645463147021293, + "grad_norm": 0.431640625, + "learning_rate": 0.0016052594685117528, + "loss": 0.0898, + "step": 30476 + }, + { + "epoch": 0.2645549951823335, + "grad_norm": 0.4921875, + "learning_rate": 0.0016052348384636712, + "loss": 0.1084, + "step": 30477 + }, + { + "epoch": 0.26456367566253763, + "grad_norm": 0.18359375, + "learning_rate": 0.0016052102078630726, + "loss": 0.1367, + "step": 30478 + }, + { + "epoch": 0.2645723561427418, + "grad_norm": 0.248046875, + "learning_rate": 0.0016051855767099832, + "loss": 0.1104, + "step": 30479 + }, + { + "epoch": 0.26458103662294596, + "grad_norm": 0.11767578125, + "learning_rate": 0.0016051609450044308, + "loss": 0.0703, + "step": 30480 + }, + { + "epoch": 0.26458971710315016, + "grad_norm": 0.44140625, + "learning_rate": 0.0016051363127464423, + "loss": 0.1289, + "step": 30481 + }, + { + "epoch": 0.2645983975833543, + "grad_norm": 0.2041015625, + "learning_rate": 0.0016051116799360439, + "loss": 0.1099, + "step": 30482 + }, + { + "epoch": 
0.2646070780635585, + "grad_norm": 0.125, + "learning_rate": 0.001605087046573263, + "loss": 0.1152, + "step": 30483 + }, + { + "epoch": 0.2646157585437626, + "grad_norm": 0.435546875, + "learning_rate": 0.0016050624126581266, + "loss": 0.1309, + "step": 30484 + }, + { + "epoch": 0.2646244390239668, + "grad_norm": 0.1298828125, + "learning_rate": 0.0016050377781906611, + "loss": 0.1123, + "step": 30485 + }, + { + "epoch": 0.26463311950417095, + "grad_norm": 0.5234375, + "learning_rate": 0.0016050131431708948, + "loss": 0.1709, + "step": 30486 + }, + { + "epoch": 0.26464179998437515, + "grad_norm": 1.296875, + "learning_rate": 0.0016049885075988528, + "loss": 0.0967, + "step": 30487 + }, + { + "epoch": 0.2646504804645793, + "grad_norm": 0.34375, + "learning_rate": 0.0016049638714745634, + "loss": 0.1055, + "step": 30488 + }, + { + "epoch": 0.2646591609447835, + "grad_norm": 0.58203125, + "learning_rate": 0.0016049392347980529, + "loss": 0.0889, + "step": 30489 + }, + { + "epoch": 0.2646678414249876, + "grad_norm": 0.1796875, + "learning_rate": 0.0016049145975693486, + "loss": 0.1162, + "step": 30490 + }, + { + "epoch": 0.2646765219051918, + "grad_norm": 0.146484375, + "learning_rate": 0.0016048899597884774, + "loss": 0.0908, + "step": 30491 + }, + { + "epoch": 0.26468520238539595, + "grad_norm": 0.482421875, + "learning_rate": 0.0016048653214554658, + "loss": 0.0752, + "step": 30492 + }, + { + "epoch": 0.26469388286560014, + "grad_norm": 0.095703125, + "learning_rate": 0.0016048406825703411, + "loss": 0.0757, + "step": 30493 + }, + { + "epoch": 0.2647025633458043, + "grad_norm": 0.625, + "learning_rate": 0.0016048160431331305, + "loss": 0.123, + "step": 30494 + }, + { + "epoch": 0.26471124382600847, + "grad_norm": 0.26953125, + "learning_rate": 0.0016047914031438607, + "loss": 0.1289, + "step": 30495 + }, + { + "epoch": 0.2647199243062126, + "grad_norm": 0.59375, + "learning_rate": 0.0016047667626025582, + "loss": 0.1094, + "step": 30496 + }, + { + "epoch": 0.2647286047864168, + "grad_norm": 0.06396484375, + "learning_rate": 0.0016047421215092503, + "loss": 0.0908, + "step": 30497 + }, + { + "epoch": 0.26473728526662094, + "grad_norm": 0.1943359375, + "learning_rate": 0.0016047174798639646, + "loss": 0.1045, + "step": 30498 + }, + { + "epoch": 0.26474596574682513, + "grad_norm": 0.828125, + "learning_rate": 0.001604692837666727, + "loss": 0.0625, + "step": 30499 + }, + { + "epoch": 0.26475464622702927, + "grad_norm": 0.482421875, + "learning_rate": 0.0016046681949175652, + "loss": 0.1094, + "step": 30500 + }, + { + "epoch": 0.26476332670723346, + "grad_norm": 0.34765625, + "learning_rate": 0.0016046435516165057, + "loss": 0.0781, + "step": 30501 + }, + { + "epoch": 0.2647720071874376, + "grad_norm": 0.236328125, + "learning_rate": 0.0016046189077635754, + "loss": 0.127, + "step": 30502 + }, + { + "epoch": 0.2647806876676418, + "grad_norm": 0.11083984375, + "learning_rate": 0.0016045942633588015, + "loss": 0.127, + "step": 30503 + }, + { + "epoch": 0.2647893681478459, + "grad_norm": 0.1806640625, + "learning_rate": 0.0016045696184022109, + "loss": 0.1523, + "step": 30504 + }, + { + "epoch": 0.2647980486280501, + "grad_norm": 0.59375, + "learning_rate": 0.0016045449728938309, + "loss": 0.0986, + "step": 30505 + }, + { + "epoch": 0.26480672910825426, + "grad_norm": 0.376953125, + "learning_rate": 0.0016045203268336876, + "loss": 0.1143, + "step": 30506 + }, + { + "epoch": 0.26481540958845845, + "grad_norm": 0.8046875, + "learning_rate": 0.0016044956802218086, + "loss": 0.1025, + "step": 30507 
+ }, + { + "epoch": 0.2648240900686626, + "grad_norm": 0.359375, + "learning_rate": 0.0016044710330582208, + "loss": 0.0928, + "step": 30508 + }, + { + "epoch": 0.2648327705488668, + "grad_norm": 0.48828125, + "learning_rate": 0.0016044463853429514, + "loss": 0.1211, + "step": 30509 + }, + { + "epoch": 0.2648414510290709, + "grad_norm": 0.283203125, + "learning_rate": 0.0016044217370760263, + "loss": 0.1348, + "step": 30510 + }, + { + "epoch": 0.2648501315092751, + "grad_norm": 0.41796875, + "learning_rate": 0.0016043970882574736, + "loss": 0.0854, + "step": 30511 + }, + { + "epoch": 0.26485881198947925, + "grad_norm": 0.1572265625, + "learning_rate": 0.0016043724388873197, + "loss": 0.0845, + "step": 30512 + }, + { + "epoch": 0.26486749246968344, + "grad_norm": 0.1884765625, + "learning_rate": 0.0016043477889655918, + "loss": 0.0952, + "step": 30513 + }, + { + "epoch": 0.2648761729498876, + "grad_norm": 0.486328125, + "learning_rate": 0.0016043231384923167, + "loss": 0.104, + "step": 30514 + }, + { + "epoch": 0.26488485343009177, + "grad_norm": 0.2041015625, + "learning_rate": 0.0016042984874675213, + "loss": 0.106, + "step": 30515 + }, + { + "epoch": 0.2648935339102959, + "grad_norm": 0.1298828125, + "learning_rate": 0.0016042738358912328, + "loss": 0.1348, + "step": 30516 + }, + { + "epoch": 0.2649022143905001, + "grad_norm": 0.2333984375, + "learning_rate": 0.0016042491837634778, + "loss": 0.0811, + "step": 30517 + }, + { + "epoch": 0.26491089487070424, + "grad_norm": 0.1767578125, + "learning_rate": 0.0016042245310842838, + "loss": 0.1147, + "step": 30518 + }, + { + "epoch": 0.26491957535090843, + "grad_norm": 0.6015625, + "learning_rate": 0.0016041998778536768, + "loss": 0.0645, + "step": 30519 + }, + { + "epoch": 0.26492825583111257, + "grad_norm": 0.11962890625, + "learning_rate": 0.0016041752240716848, + "loss": 0.124, + "step": 30520 + }, + { + "epoch": 0.26493693631131676, + "grad_norm": 0.12060546875, + "learning_rate": 0.0016041505697383346, + "loss": 0.1016, + "step": 30521 + }, + { + "epoch": 0.2649456167915209, + "grad_norm": 0.193359375, + "learning_rate": 0.0016041259148536526, + "loss": 0.1426, + "step": 30522 + }, + { + "epoch": 0.2649542972717251, + "grad_norm": 0.11328125, + "learning_rate": 0.001604101259417666, + "loss": 0.0732, + "step": 30523 + }, + { + "epoch": 0.26496297775192923, + "grad_norm": 0.240234375, + "learning_rate": 0.0016040766034304023, + "loss": 0.1797, + "step": 30524 + }, + { + "epoch": 0.2649716582321334, + "grad_norm": 0.37890625, + "learning_rate": 0.0016040519468918876, + "loss": 0.0791, + "step": 30525 + }, + { + "epoch": 0.26498033871233756, + "grad_norm": 0.703125, + "learning_rate": 0.0016040272898021493, + "loss": 0.1011, + "step": 30526 + }, + { + "epoch": 0.26498901919254175, + "grad_norm": 0.337890625, + "learning_rate": 0.0016040026321612142, + "loss": 0.1582, + "step": 30527 + }, + { + "epoch": 0.2649976996727459, + "grad_norm": 0.267578125, + "learning_rate": 0.0016039779739691097, + "loss": 0.0835, + "step": 30528 + }, + { + "epoch": 0.2650063801529501, + "grad_norm": 0.2275390625, + "learning_rate": 0.0016039533152258623, + "loss": 0.1074, + "step": 30529 + }, + { + "epoch": 0.2650150606331542, + "grad_norm": 0.310546875, + "learning_rate": 0.0016039286559314991, + "loss": 0.1074, + "step": 30530 + }, + { + "epoch": 0.2650237411133584, + "grad_norm": 0.8359375, + "learning_rate": 0.001603903996086047, + "loss": 0.1084, + "step": 30531 + }, + { + "epoch": 0.26503242159356255, + "grad_norm": 0.69921875, + "learning_rate": 
0.0016038793356895334, + "loss": 0.1387, + "step": 30532 + }, + { + "epoch": 0.26504110207376674, + "grad_norm": 0.1435546875, + "learning_rate": 0.0016038546747419846, + "loss": 0.1367, + "step": 30533 + }, + { + "epoch": 0.2650497825539709, + "grad_norm": 0.337890625, + "learning_rate": 0.001603830013243428, + "loss": 0.1025, + "step": 30534 + }, + { + "epoch": 0.2650584630341751, + "grad_norm": 0.2734375, + "learning_rate": 0.0016038053511938908, + "loss": 0.0654, + "step": 30535 + }, + { + "epoch": 0.2650671435143792, + "grad_norm": 0.212890625, + "learning_rate": 0.0016037806885933993, + "loss": 0.0977, + "step": 30536 + }, + { + "epoch": 0.2650758239945834, + "grad_norm": 0.2333984375, + "learning_rate": 0.0016037560254419806, + "loss": 0.1182, + "step": 30537 + }, + { + "epoch": 0.26508450447478754, + "grad_norm": 0.609375, + "learning_rate": 0.001603731361739662, + "loss": 0.0938, + "step": 30538 + }, + { + "epoch": 0.26509318495499173, + "grad_norm": 0.12890625, + "learning_rate": 0.0016037066974864706, + "loss": 0.1143, + "step": 30539 + }, + { + "epoch": 0.26510186543519587, + "grad_norm": 0.46484375, + "learning_rate": 0.0016036820326824329, + "loss": 0.0947, + "step": 30540 + }, + { + "epoch": 0.26511054591540006, + "grad_norm": 0.48828125, + "learning_rate": 0.0016036573673275762, + "loss": 0.1104, + "step": 30541 + }, + { + "epoch": 0.2651192263956042, + "grad_norm": 0.357421875, + "learning_rate": 0.0016036327014219275, + "loss": 0.0889, + "step": 30542 + }, + { + "epoch": 0.2651279068758084, + "grad_norm": 0.26171875, + "learning_rate": 0.0016036080349655136, + "loss": 0.1118, + "step": 30543 + }, + { + "epoch": 0.26513658735601253, + "grad_norm": 0.474609375, + "learning_rate": 0.001603583367958361, + "loss": 0.0957, + "step": 30544 + }, + { + "epoch": 0.2651452678362167, + "grad_norm": 0.0859375, + "learning_rate": 0.001603558700400498, + "loss": 0.0972, + "step": 30545 + }, + { + "epoch": 0.26515394831642086, + "grad_norm": 0.1611328125, + "learning_rate": 0.0016035340322919504, + "loss": 0.082, + "step": 30546 + }, + { + "epoch": 0.26516262879662506, + "grad_norm": 0.59765625, + "learning_rate": 0.0016035093636327456, + "loss": 0.1113, + "step": 30547 + }, + { + "epoch": 0.2651713092768292, + "grad_norm": 0.50390625, + "learning_rate": 0.0016034846944229103, + "loss": 0.1396, + "step": 30548 + }, + { + "epoch": 0.2651799897570334, + "grad_norm": 0.4765625, + "learning_rate": 0.0016034600246624718, + "loss": 0.1006, + "step": 30549 + }, + { + "epoch": 0.2651886702372375, + "grad_norm": 0.181640625, + "learning_rate": 0.001603435354351457, + "loss": 0.1079, + "step": 30550 + }, + { + "epoch": 0.2651973507174417, + "grad_norm": 0.3046875, + "learning_rate": 0.0016034106834898931, + "loss": 0.1025, + "step": 30551 + }, + { + "epoch": 0.26520603119764585, + "grad_norm": 0.287109375, + "learning_rate": 0.0016033860120778065, + "loss": 0.0967, + "step": 30552 + }, + { + "epoch": 0.26521471167785005, + "grad_norm": 0.1376953125, + "learning_rate": 0.0016033613401152248, + "loss": 0.0825, + "step": 30553 + }, + { + "epoch": 0.2652233921580542, + "grad_norm": 0.5859375, + "learning_rate": 0.0016033366676021746, + "loss": 0.085, + "step": 30554 + }, + { + "epoch": 0.2652320726382584, + "grad_norm": 0.37109375, + "learning_rate": 0.001603311994538683, + "loss": 0.1074, + "step": 30555 + }, + { + "epoch": 0.2652407531184625, + "grad_norm": 0.1787109375, + "learning_rate": 0.0016032873209247769, + "loss": 0.1143, + "step": 30556 + }, + { + "epoch": 0.26524943359866665, + 
"grad_norm": 0.234375, + "learning_rate": 0.001603262646760483, + "loss": 0.0767, + "step": 30557 + }, + { + "epoch": 0.26525811407887084, + "grad_norm": 0.11083984375, + "learning_rate": 0.0016032379720458292, + "loss": 0.1118, + "step": 30558 + }, + { + "epoch": 0.265266794559075, + "grad_norm": 0.1728515625, + "learning_rate": 0.0016032132967808417, + "loss": 0.1201, + "step": 30559 + }, + { + "epoch": 0.2652754750392792, + "grad_norm": 0.142578125, + "learning_rate": 0.0016031886209655478, + "loss": 0.1387, + "step": 30560 + }, + { + "epoch": 0.2652841555194833, + "grad_norm": 0.4375, + "learning_rate": 0.0016031639445999743, + "loss": 0.1289, + "step": 30561 + }, + { + "epoch": 0.2652928359996875, + "grad_norm": 0.65234375, + "learning_rate": 0.001603139267684148, + "loss": 0.0913, + "step": 30562 + }, + { + "epoch": 0.26530151647989164, + "grad_norm": 0.357421875, + "learning_rate": 0.0016031145902180965, + "loss": 0.2295, + "step": 30563 + }, + { + "epoch": 0.26531019696009583, + "grad_norm": 0.1552734375, + "learning_rate": 0.0016030899122018464, + "loss": 0.1143, + "step": 30564 + }, + { + "epoch": 0.26531887744029997, + "grad_norm": 0.162109375, + "learning_rate": 0.001603065233635425, + "loss": 0.0879, + "step": 30565 + }, + { + "epoch": 0.26532755792050416, + "grad_norm": 0.4921875, + "learning_rate": 0.0016030405545188586, + "loss": 0.1162, + "step": 30566 + }, + { + "epoch": 0.2653362384007083, + "grad_norm": 0.47265625, + "learning_rate": 0.0016030158748521748, + "loss": 0.1602, + "step": 30567 + }, + { + "epoch": 0.2653449188809125, + "grad_norm": 0.416015625, + "learning_rate": 0.0016029911946354003, + "loss": 0.1187, + "step": 30568 + }, + { + "epoch": 0.26535359936111663, + "grad_norm": 0.5625, + "learning_rate": 0.0016029665138685622, + "loss": 0.1816, + "step": 30569 + }, + { + "epoch": 0.2653622798413208, + "grad_norm": 0.875, + "learning_rate": 0.0016029418325516876, + "loss": 0.1318, + "step": 30570 + }, + { + "epoch": 0.26537096032152496, + "grad_norm": 0.138671875, + "learning_rate": 0.0016029171506848033, + "loss": 0.0947, + "step": 30571 + }, + { + "epoch": 0.26537964080172916, + "grad_norm": 0.10888671875, + "learning_rate": 0.001602892468267936, + "loss": 0.0811, + "step": 30572 + }, + { + "epoch": 0.2653883212819333, + "grad_norm": 0.431640625, + "learning_rate": 0.0016028677853011136, + "loss": 0.103, + "step": 30573 + }, + { + "epoch": 0.2653970017621375, + "grad_norm": 0.375, + "learning_rate": 0.0016028431017843623, + "loss": 0.1406, + "step": 30574 + }, + { + "epoch": 0.2654056822423416, + "grad_norm": 0.431640625, + "learning_rate": 0.0016028184177177092, + "loss": 0.1162, + "step": 30575 + }, + { + "epoch": 0.2654143627225458, + "grad_norm": 1.875, + "learning_rate": 0.0016027937331011815, + "loss": 0.3105, + "step": 30576 + }, + { + "epoch": 0.26542304320274995, + "grad_norm": 0.15625, + "learning_rate": 0.0016027690479348063, + "loss": 0.0737, + "step": 30577 + }, + { + "epoch": 0.26543172368295415, + "grad_norm": 0.51953125, + "learning_rate": 0.0016027443622186107, + "loss": 0.1187, + "step": 30578 + }, + { + "epoch": 0.2654404041631583, + "grad_norm": 0.255859375, + "learning_rate": 0.0016027196759526208, + "loss": 0.0986, + "step": 30579 + }, + { + "epoch": 0.2654490846433625, + "grad_norm": 0.443359375, + "learning_rate": 0.0016026949891368642, + "loss": 0.127, + "step": 30580 + }, + { + "epoch": 0.2654577651235666, + "grad_norm": 0.51953125, + "learning_rate": 0.0016026703017713684, + "loss": 0.0913, + "step": 30581 + }, + { + "epoch": 
0.2654664456037708, + "grad_norm": 0.423828125, + "learning_rate": 0.0016026456138561597, + "loss": 0.1172, + "step": 30582 + }, + { + "epoch": 0.26547512608397494, + "grad_norm": 0.2158203125, + "learning_rate": 0.001602620925391265, + "loss": 0.0908, + "step": 30583 + }, + { + "epoch": 0.26548380656417914, + "grad_norm": 0.421875, + "learning_rate": 0.0016025962363767124, + "loss": 0.0752, + "step": 30584 + }, + { + "epoch": 0.2654924870443833, + "grad_norm": 0.1337890625, + "learning_rate": 0.0016025715468125274, + "loss": 0.1514, + "step": 30585 + }, + { + "epoch": 0.26550116752458747, + "grad_norm": 0.298828125, + "learning_rate": 0.0016025468566987378, + "loss": 0.1787, + "step": 30586 + }, + { + "epoch": 0.2655098480047916, + "grad_norm": 0.18359375, + "learning_rate": 0.0016025221660353707, + "loss": 0.1025, + "step": 30587 + }, + { + "epoch": 0.2655185284849958, + "grad_norm": 0.2890625, + "learning_rate": 0.0016024974748224529, + "loss": 0.1016, + "step": 30588 + }, + { + "epoch": 0.26552720896519993, + "grad_norm": 0.0712890625, + "learning_rate": 0.0016024727830600111, + "loss": 0.0752, + "step": 30589 + }, + { + "epoch": 0.2655358894454041, + "grad_norm": 0.10791015625, + "learning_rate": 0.0016024480907480731, + "loss": 0.0864, + "step": 30590 + }, + { + "epoch": 0.26554456992560826, + "grad_norm": 0.11181640625, + "learning_rate": 0.0016024233978866654, + "loss": 0.1025, + "step": 30591 + }, + { + "epoch": 0.26555325040581246, + "grad_norm": 0.3515625, + "learning_rate": 0.0016023987044758144, + "loss": 0.1621, + "step": 30592 + }, + { + "epoch": 0.2655619308860166, + "grad_norm": 0.11767578125, + "learning_rate": 0.0016023740105155482, + "loss": 0.1133, + "step": 30593 + }, + { + "epoch": 0.2655706113662208, + "grad_norm": 0.6640625, + "learning_rate": 0.0016023493160058934, + "loss": 0.085, + "step": 30594 + }, + { + "epoch": 0.2655792918464249, + "grad_norm": 2.375, + "learning_rate": 0.0016023246209468766, + "loss": 0.4629, + "step": 30595 + }, + { + "epoch": 0.2655879723266291, + "grad_norm": 0.3984375, + "learning_rate": 0.0016022999253385256, + "loss": 0.0928, + "step": 30596 + }, + { + "epoch": 0.26559665280683326, + "grad_norm": 0.2890625, + "learning_rate": 0.0016022752291808666, + "loss": 0.1387, + "step": 30597 + }, + { + "epoch": 0.26560533328703745, + "grad_norm": 0.26171875, + "learning_rate": 0.001602250532473927, + "loss": 0.1309, + "step": 30598 + }, + { + "epoch": 0.2656140137672416, + "grad_norm": 0.08642578125, + "learning_rate": 0.0016022258352177338, + "loss": 0.0938, + "step": 30599 + }, + { + "epoch": 0.2656226942474458, + "grad_norm": 0.328125, + "learning_rate": 0.001602201137412314, + "loss": 0.1162, + "step": 30600 + }, + { + "epoch": 0.2656313747276499, + "grad_norm": 0.07763671875, + "learning_rate": 0.0016021764390576948, + "loss": 0.1104, + "step": 30601 + }, + { + "epoch": 0.2656400552078541, + "grad_norm": 0.09326171875, + "learning_rate": 0.0016021517401539026, + "loss": 0.0908, + "step": 30602 + }, + { + "epoch": 0.26564873568805825, + "grad_norm": 0.2734375, + "learning_rate": 0.001602127040700965, + "loss": 0.1064, + "step": 30603 + }, + { + "epoch": 0.26565741616826244, + "grad_norm": 0.1376953125, + "learning_rate": 0.0016021023406989088, + "loss": 0.0903, + "step": 30604 + }, + { + "epoch": 0.2656660966484666, + "grad_norm": 0.353515625, + "learning_rate": 0.001602077640147761, + "loss": 0.0977, + "step": 30605 + }, + { + "epoch": 0.26567477712867077, + "grad_norm": 0.62109375, + "learning_rate": 0.0016020529390475488, + "loss": 
0.0967, + "step": 30606 + }, + { + "epoch": 0.2656834576088749, + "grad_norm": 0.1416015625, + "learning_rate": 0.0016020282373982986, + "loss": 0.1143, + "step": 30607 + }, + { + "epoch": 0.2656921380890791, + "grad_norm": 0.384765625, + "learning_rate": 0.0016020035352000382, + "loss": 0.0977, + "step": 30608 + }, + { + "epoch": 0.26570081856928324, + "grad_norm": 0.8359375, + "learning_rate": 0.0016019788324527942, + "loss": 0.1167, + "step": 30609 + }, + { + "epoch": 0.26570949904948743, + "grad_norm": 0.23046875, + "learning_rate": 0.0016019541291565937, + "loss": 0.084, + "step": 30610 + }, + { + "epoch": 0.26571817952969157, + "grad_norm": 0.072265625, + "learning_rate": 0.0016019294253114638, + "loss": 0.0884, + "step": 30611 + }, + { + "epoch": 0.26572686000989576, + "grad_norm": 0.2001953125, + "learning_rate": 0.0016019047209174314, + "loss": 0.0884, + "step": 30612 + }, + { + "epoch": 0.2657355404900999, + "grad_norm": 0.2353515625, + "learning_rate": 0.0016018800159745233, + "loss": 0.0811, + "step": 30613 + }, + { + "epoch": 0.2657442209703041, + "grad_norm": 0.1240234375, + "learning_rate": 0.0016018553104827668, + "loss": 0.083, + "step": 30614 + }, + { + "epoch": 0.2657529014505082, + "grad_norm": 0.08837890625, + "learning_rate": 0.001601830604442189, + "loss": 0.0918, + "step": 30615 + }, + { + "epoch": 0.2657615819307124, + "grad_norm": 0.58203125, + "learning_rate": 0.001601805897852817, + "loss": 0.1738, + "step": 30616 + }, + { + "epoch": 0.26577026241091656, + "grad_norm": 0.09765625, + "learning_rate": 0.0016017811907146776, + "loss": 0.0913, + "step": 30617 + }, + { + "epoch": 0.26577894289112075, + "grad_norm": 0.134765625, + "learning_rate": 0.0016017564830277972, + "loss": 0.1201, + "step": 30618 + }, + { + "epoch": 0.2657876233713249, + "grad_norm": 0.1728515625, + "learning_rate": 0.001601731774792204, + "loss": 0.0957, + "step": 30619 + }, + { + "epoch": 0.2657963038515291, + "grad_norm": 0.330078125, + "learning_rate": 0.0016017070660079246, + "loss": 0.104, + "step": 30620 + }, + { + "epoch": 0.2658049843317332, + "grad_norm": 0.19921875, + "learning_rate": 0.0016016823566749857, + "loss": 0.1621, + "step": 30621 + }, + { + "epoch": 0.2658136648119374, + "grad_norm": 0.07373046875, + "learning_rate": 0.001601657646793414, + "loss": 0.0879, + "step": 30622 + }, + { + "epoch": 0.26582234529214155, + "grad_norm": 0.5, + "learning_rate": 0.0016016329363632376, + "loss": 0.1123, + "step": 30623 + }, + { + "epoch": 0.26583102577234574, + "grad_norm": 0.08984375, + "learning_rate": 0.001601608225384483, + "loss": 0.1162, + "step": 30624 + }, + { + "epoch": 0.2658397062525499, + "grad_norm": 0.263671875, + "learning_rate": 0.001601583513857177, + "loss": 0.1465, + "step": 30625 + }, + { + "epoch": 0.26584838673275407, + "grad_norm": 0.55078125, + "learning_rate": 0.0016015588017813467, + "loss": 0.1123, + "step": 30626 + }, + { + "epoch": 0.2658570672129582, + "grad_norm": 0.236328125, + "learning_rate": 0.0016015340891570194, + "loss": 0.1387, + "step": 30627 + }, + { + "epoch": 0.2658657476931624, + "grad_norm": 0.2578125, + "learning_rate": 0.001601509375984222, + "loss": 0.0786, + "step": 30628 + }, + { + "epoch": 0.26587442817336654, + "grad_norm": 0.3046875, + "learning_rate": 0.0016014846622629815, + "loss": 0.0811, + "step": 30629 + }, + { + "epoch": 0.26588310865357073, + "grad_norm": 0.205078125, + "learning_rate": 0.0016014599479933249, + "loss": 0.0957, + "step": 30630 + }, + { + "epoch": 0.26589178913377487, + "grad_norm": 0.333984375, + 
"learning_rate": 0.0016014352331752788, + "loss": 0.0879, + "step": 30631 + }, + { + "epoch": 0.26590046961397906, + "grad_norm": 0.18359375, + "learning_rate": 0.0016014105178088715, + "loss": 0.1235, + "step": 30632 + }, + { + "epoch": 0.2659091500941832, + "grad_norm": 0.0859375, + "learning_rate": 0.0016013858018941284, + "loss": 0.124, + "step": 30633 + }, + { + "epoch": 0.2659178305743874, + "grad_norm": 0.109375, + "learning_rate": 0.0016013610854310775, + "loss": 0.1367, + "step": 30634 + }, + { + "epoch": 0.26592651105459153, + "grad_norm": 0.365234375, + "learning_rate": 0.0016013363684197457, + "loss": 0.1069, + "step": 30635 + }, + { + "epoch": 0.2659351915347957, + "grad_norm": 0.27734375, + "learning_rate": 0.0016013116508601603, + "loss": 0.0933, + "step": 30636 + }, + { + "epoch": 0.26594387201499986, + "grad_norm": 0.609375, + "learning_rate": 0.0016012869327523475, + "loss": 0.0913, + "step": 30637 + }, + { + "epoch": 0.26595255249520405, + "grad_norm": 0.1572265625, + "learning_rate": 0.001601262214096335, + "loss": 0.1147, + "step": 30638 + }, + { + "epoch": 0.2659612329754082, + "grad_norm": 0.111328125, + "learning_rate": 0.00160123749489215, + "loss": 0.0977, + "step": 30639 + }, + { + "epoch": 0.2659699134556124, + "grad_norm": 0.50390625, + "learning_rate": 0.001601212775139819, + "loss": 0.0806, + "step": 30640 + }, + { + "epoch": 0.2659785939358165, + "grad_norm": 0.3984375, + "learning_rate": 0.0016011880548393693, + "loss": 0.123, + "step": 30641 + }, + { + "epoch": 0.2659872744160207, + "grad_norm": 0.111328125, + "learning_rate": 0.001601163333990828, + "loss": 0.0791, + "step": 30642 + }, + { + "epoch": 0.26599595489622485, + "grad_norm": 0.083984375, + "learning_rate": 0.0016011386125942217, + "loss": 0.0903, + "step": 30643 + }, + { + "epoch": 0.26600463537642904, + "grad_norm": 0.1337890625, + "learning_rate": 0.0016011138906495777, + "loss": 0.0923, + "step": 30644 + }, + { + "epoch": 0.2660133158566332, + "grad_norm": 0.1630859375, + "learning_rate": 0.001601089168156923, + "loss": 0.1328, + "step": 30645 + }, + { + "epoch": 0.2660219963368374, + "grad_norm": 0.267578125, + "learning_rate": 0.0016010644451162853, + "loss": 0.0972, + "step": 30646 + }, + { + "epoch": 0.2660306768170415, + "grad_norm": 0.359375, + "learning_rate": 0.0016010397215276905, + "loss": 0.0693, + "step": 30647 + }, + { + "epoch": 0.2660393572972457, + "grad_norm": 0.57421875, + "learning_rate": 0.0016010149973911666, + "loss": 0.082, + "step": 30648 + }, + { + "epoch": 0.26604803777744984, + "grad_norm": 0.279296875, + "learning_rate": 0.00160099027270674, + "loss": 0.1089, + "step": 30649 + }, + { + "epoch": 0.26605671825765403, + "grad_norm": 0.15625, + "learning_rate": 0.001600965547474438, + "loss": 0.1226, + "step": 30650 + }, + { + "epoch": 0.26606539873785817, + "grad_norm": 0.1025390625, + "learning_rate": 0.0016009408216942874, + "loss": 0.0811, + "step": 30651 + }, + { + "epoch": 0.26607407921806236, + "grad_norm": 0.89453125, + "learning_rate": 0.0016009160953663155, + "loss": 0.0908, + "step": 30652 + }, + { + "epoch": 0.2660827596982665, + "grad_norm": 0.55078125, + "learning_rate": 0.0016008913684905492, + "loss": 0.0908, + "step": 30653 + }, + { + "epoch": 0.2660914401784707, + "grad_norm": 0.53125, + "learning_rate": 0.0016008666410670164, + "loss": 0.1094, + "step": 30654 + }, + { + "epoch": 0.26610012065867483, + "grad_norm": 0.1435546875, + "learning_rate": 0.0016008419130957427, + "loss": 0.0938, + "step": 30655 + }, + { + "epoch": 0.266108801138879, + 
"grad_norm": 0.158203125, + "learning_rate": 0.0016008171845767558, + "loss": 0.0879, + "step": 30656 + }, + { + "epoch": 0.26611748161908316, + "grad_norm": 0.431640625, + "learning_rate": 0.0016007924555100827, + "loss": 0.1406, + "step": 30657 + }, + { + "epoch": 0.26612616209928736, + "grad_norm": 0.2099609375, + "learning_rate": 0.0016007677258957507, + "loss": 0.1079, + "step": 30658 + }, + { + "epoch": 0.2661348425794915, + "grad_norm": 0.27734375, + "learning_rate": 0.0016007429957337865, + "loss": 0.1035, + "step": 30659 + }, + { + "epoch": 0.2661435230596957, + "grad_norm": 0.458984375, + "learning_rate": 0.0016007182650242175, + "loss": 0.1172, + "step": 30660 + }, + { + "epoch": 0.2661522035398998, + "grad_norm": 0.267578125, + "learning_rate": 0.0016006935337670705, + "loss": 0.0771, + "step": 30661 + }, + { + "epoch": 0.266160884020104, + "grad_norm": 0.546875, + "learning_rate": 0.0016006688019623723, + "loss": 0.0977, + "step": 30662 + }, + { + "epoch": 0.26616956450030815, + "grad_norm": 0.31640625, + "learning_rate": 0.0016006440696101506, + "loss": 0.1055, + "step": 30663 + }, + { + "epoch": 0.26617824498051235, + "grad_norm": 0.8125, + "learning_rate": 0.0016006193367104318, + "loss": 0.1328, + "step": 30664 + }, + { + "epoch": 0.2661869254607165, + "grad_norm": 0.09619140625, + "learning_rate": 0.001600594603263243, + "loss": 0.1069, + "step": 30665 + }, + { + "epoch": 0.2661956059409207, + "grad_norm": 0.1513671875, + "learning_rate": 0.001600569869268612, + "loss": 0.1216, + "step": 30666 + }, + { + "epoch": 0.2662042864211248, + "grad_norm": 0.322265625, + "learning_rate": 0.0016005451347265648, + "loss": 0.084, + "step": 30667 + }, + { + "epoch": 0.266212966901329, + "grad_norm": 0.53515625, + "learning_rate": 0.0016005203996371294, + "loss": 0.1025, + "step": 30668 + }, + { + "epoch": 0.26622164738153314, + "grad_norm": 0.1376953125, + "learning_rate": 0.0016004956640003323, + "loss": 0.0708, + "step": 30669 + }, + { + "epoch": 0.26623032786173734, + "grad_norm": 0.173828125, + "learning_rate": 0.0016004709278162007, + "loss": 0.1162, + "step": 30670 + }, + { + "epoch": 0.2662390083419415, + "grad_norm": 0.08984375, + "learning_rate": 0.0016004461910847615, + "loss": 0.0962, + "step": 30671 + }, + { + "epoch": 0.26624768882214567, + "grad_norm": 1.1015625, + "learning_rate": 0.0016004214538060417, + "loss": 0.0972, + "step": 30672 + }, + { + "epoch": 0.2662563693023498, + "grad_norm": 0.69140625, + "learning_rate": 0.0016003967159800689, + "loss": 0.1133, + "step": 30673 + }, + { + "epoch": 0.266265049782554, + "grad_norm": 0.515625, + "learning_rate": 0.0016003719776068693, + "loss": 0.1094, + "step": 30674 + }, + { + "epoch": 0.26627373026275813, + "grad_norm": 0.07421875, + "learning_rate": 0.0016003472386864708, + "loss": 0.0864, + "step": 30675 + }, + { + "epoch": 0.2662824107429623, + "grad_norm": 0.123046875, + "learning_rate": 0.0016003224992189, + "loss": 0.1426, + "step": 30676 + }, + { + "epoch": 0.26629109122316647, + "grad_norm": 0.251953125, + "learning_rate": 0.0016002977592041838, + "loss": 0.0854, + "step": 30677 + }, + { + "epoch": 0.26629977170337066, + "grad_norm": 0.197265625, + "learning_rate": 0.0016002730186423497, + "loss": 0.0879, + "step": 30678 + }, + { + "epoch": 0.2663084521835748, + "grad_norm": 0.10107421875, + "learning_rate": 0.0016002482775334247, + "loss": 0.1182, + "step": 30679 + }, + { + "epoch": 0.26631713266377893, + "grad_norm": 0.10205078125, + "learning_rate": 0.0016002235358774353, + "loss": 0.1221, + "step": 30680 + }, 
+ { + "epoch": 0.2663258131439831, + "grad_norm": 0.498046875, + "learning_rate": 0.0016001987936744093, + "loss": 0.0996, + "step": 30681 + }, + { + "epoch": 0.26633449362418726, + "grad_norm": 0.8125, + "learning_rate": 0.0016001740509243734, + "loss": 0.0952, + "step": 30682 + }, + { + "epoch": 0.26634317410439146, + "grad_norm": 0.095703125, + "learning_rate": 0.0016001493076273543, + "loss": 0.0967, + "step": 30683 + }, + { + "epoch": 0.2663518545845956, + "grad_norm": 0.421875, + "learning_rate": 0.0016001245637833799, + "loss": 0.0762, + "step": 30684 + }, + { + "epoch": 0.2663605350647998, + "grad_norm": 0.103515625, + "learning_rate": 0.0016000998193924764, + "loss": 0.1011, + "step": 30685 + }, + { + "epoch": 0.2663692155450039, + "grad_norm": 0.291015625, + "learning_rate": 0.0016000750744546714, + "loss": 0.103, + "step": 30686 + }, + { + "epoch": 0.2663778960252081, + "grad_norm": 0.2021484375, + "learning_rate": 0.0016000503289699919, + "loss": 0.1152, + "step": 30687 + }, + { + "epoch": 0.26638657650541225, + "grad_norm": 1.4453125, + "learning_rate": 0.0016000255829384648, + "loss": 0.2969, + "step": 30688 + }, + { + "epoch": 0.26639525698561645, + "grad_norm": 0.53515625, + "learning_rate": 0.0016000008363601175, + "loss": 0.1055, + "step": 30689 + }, + { + "epoch": 0.2664039374658206, + "grad_norm": 0.474609375, + "learning_rate": 0.0015999760892349765, + "loss": 0.0806, + "step": 30690 + }, + { + "epoch": 0.2664126179460248, + "grad_norm": 0.111328125, + "learning_rate": 0.0015999513415630691, + "loss": 0.1523, + "step": 30691 + }, + { + "epoch": 0.2664212984262289, + "grad_norm": 0.44921875, + "learning_rate": 0.0015999265933444224, + "loss": 0.1768, + "step": 30692 + }, + { + "epoch": 0.2664299789064331, + "grad_norm": 0.1806640625, + "learning_rate": 0.001599901844579064, + "loss": 0.1641, + "step": 30693 + }, + { + "epoch": 0.26643865938663724, + "grad_norm": 0.796875, + "learning_rate": 0.0015998770952670202, + "loss": 0.1011, + "step": 30694 + }, + { + "epoch": 0.26644733986684144, + "grad_norm": 0.1845703125, + "learning_rate": 0.001599852345408318, + "loss": 0.1167, + "step": 30695 + }, + { + "epoch": 0.2664560203470456, + "grad_norm": 0.1044921875, + "learning_rate": 0.001599827595002985, + "loss": 0.0952, + "step": 30696 + }, + { + "epoch": 0.26646470082724977, + "grad_norm": 0.0947265625, + "learning_rate": 0.001599802844051048, + "loss": 0.1299, + "step": 30697 + }, + { + "epoch": 0.2664733813074539, + "grad_norm": 0.1630859375, + "learning_rate": 0.001599778092552534, + "loss": 0.1045, + "step": 30698 + }, + { + "epoch": 0.2664820617876581, + "grad_norm": 0.375, + "learning_rate": 0.0015997533405074705, + "loss": 0.085, + "step": 30699 + }, + { + "epoch": 0.26649074226786224, + "grad_norm": 0.419921875, + "learning_rate": 0.001599728587915884, + "loss": 0.1143, + "step": 30700 + }, + { + "epoch": 0.26649942274806643, + "grad_norm": 0.189453125, + "learning_rate": 0.0015997038347778019, + "loss": 0.0996, + "step": 30701 + }, + { + "epoch": 0.26650810322827057, + "grad_norm": 0.24609375, + "learning_rate": 0.0015996790810932516, + "loss": 0.1211, + "step": 30702 + }, + { + "epoch": 0.26651678370847476, + "grad_norm": 0.6015625, + "learning_rate": 0.001599654326862259, + "loss": 0.1348, + "step": 30703 + }, + { + "epoch": 0.2665254641886789, + "grad_norm": 0.2470703125, + "learning_rate": 0.0015996295720848526, + "loss": 0.124, + "step": 30704 + }, + { + "epoch": 0.2665341446688831, + "grad_norm": 0.353515625, + "learning_rate": 0.0015996048167610584, + "loss": 
0.1201, + "step": 30705 + }, + { + "epoch": 0.2665428251490872, + "grad_norm": 0.2265625, + "learning_rate": 0.0015995800608909038, + "loss": 0.1182, + "step": 30706 + }, + { + "epoch": 0.2665515056292914, + "grad_norm": 0.0830078125, + "learning_rate": 0.0015995553044744165, + "loss": 0.1064, + "step": 30707 + }, + { + "epoch": 0.26656018610949556, + "grad_norm": 0.4296875, + "learning_rate": 0.0015995305475116225, + "loss": 0.1416, + "step": 30708 + }, + { + "epoch": 0.26656886658969975, + "grad_norm": 0.119140625, + "learning_rate": 0.0015995057900025497, + "loss": 0.0996, + "step": 30709 + }, + { + "epoch": 0.2665775470699039, + "grad_norm": 0.330078125, + "learning_rate": 0.0015994810319472246, + "loss": 0.1484, + "step": 30710 + }, + { + "epoch": 0.2665862275501081, + "grad_norm": 0.22265625, + "learning_rate": 0.0015994562733456748, + "loss": 0.1094, + "step": 30711 + }, + { + "epoch": 0.2665949080303122, + "grad_norm": 0.47265625, + "learning_rate": 0.0015994315141979272, + "loss": 0.0977, + "step": 30712 + }, + { + "epoch": 0.2666035885105164, + "grad_norm": 0.2021484375, + "learning_rate": 0.0015994067545040086, + "loss": 0.1172, + "step": 30713 + }, + { + "epoch": 0.26661226899072055, + "grad_norm": 0.1103515625, + "learning_rate": 0.0015993819942639463, + "loss": 0.1045, + "step": 30714 + }, + { + "epoch": 0.26662094947092474, + "grad_norm": 0.2294921875, + "learning_rate": 0.001599357233477767, + "loss": 0.1138, + "step": 30715 + }, + { + "epoch": 0.2666296299511289, + "grad_norm": 0.1064453125, + "learning_rate": 0.0015993324721454987, + "loss": 0.0972, + "step": 30716 + }, + { + "epoch": 0.26663831043133307, + "grad_norm": 0.138671875, + "learning_rate": 0.0015993077102671677, + "loss": 0.125, + "step": 30717 + }, + { + "epoch": 0.2666469909115372, + "grad_norm": 0.099609375, + "learning_rate": 0.0015992829478428015, + "loss": 0.0962, + "step": 30718 + }, + { + "epoch": 0.2666556713917414, + "grad_norm": 0.2392578125, + "learning_rate": 0.0015992581848724265, + "loss": 0.0903, + "step": 30719 + }, + { + "epoch": 0.26666435187194554, + "grad_norm": 0.169921875, + "learning_rate": 0.0015992334213560705, + "loss": 0.1084, + "step": 30720 + }, + { + "epoch": 0.26667303235214973, + "grad_norm": 0.1884765625, + "learning_rate": 0.0015992086572937604, + "loss": 0.0806, + "step": 30721 + }, + { + "epoch": 0.26668171283235387, + "grad_norm": 0.314453125, + "learning_rate": 0.001599183892685523, + "loss": 0.0854, + "step": 30722 + }, + { + "epoch": 0.26669039331255806, + "grad_norm": 0.07958984375, + "learning_rate": 0.0015991591275313858, + "loss": 0.1152, + "step": 30723 + }, + { + "epoch": 0.2666990737927622, + "grad_norm": 0.0673828125, + "learning_rate": 0.001599134361831376, + "loss": 0.0693, + "step": 30724 + }, + { + "epoch": 0.2667077542729664, + "grad_norm": 0.083984375, + "learning_rate": 0.0015991095955855197, + "loss": 0.0776, + "step": 30725 + }, + { + "epoch": 0.26671643475317053, + "grad_norm": 0.58203125, + "learning_rate": 0.0015990848287938452, + "loss": 0.2539, + "step": 30726 + }, + { + "epoch": 0.2667251152333747, + "grad_norm": 0.515625, + "learning_rate": 0.0015990600614563784, + "loss": 0.1172, + "step": 30727 + }, + { + "epoch": 0.26673379571357886, + "grad_norm": 0.79296875, + "learning_rate": 0.0015990352935731475, + "loss": 0.1138, + "step": 30728 + }, + { + "epoch": 0.26674247619378305, + "grad_norm": 0.45703125, + "learning_rate": 0.0015990105251441791, + "loss": 0.1416, + "step": 30729 + }, + { + "epoch": 0.2667511566739872, + "grad_norm": 0.26953125, + 
"learning_rate": 0.0015989857561695, + "loss": 0.1089, + "step": 30730 + }, + { + "epoch": 0.2667598371541914, + "grad_norm": 0.353515625, + "learning_rate": 0.001598960986649138, + "loss": 0.1064, + "step": 30731 + }, + { + "epoch": 0.2667685176343955, + "grad_norm": 0.1787109375, + "learning_rate": 0.0015989362165831193, + "loss": 0.1201, + "step": 30732 + }, + { + "epoch": 0.2667771981145997, + "grad_norm": 0.18359375, + "learning_rate": 0.0015989114459714717, + "loss": 0.1289, + "step": 30733 + }, + { + "epoch": 0.26678587859480385, + "grad_norm": 0.1904296875, + "learning_rate": 0.001598886674814222, + "loss": 0.1069, + "step": 30734 + }, + { + "epoch": 0.26679455907500804, + "grad_norm": 0.236328125, + "learning_rate": 0.0015988619031113973, + "loss": 0.1465, + "step": 30735 + }, + { + "epoch": 0.2668032395552122, + "grad_norm": 0.12353515625, + "learning_rate": 0.0015988371308630247, + "loss": 0.1318, + "step": 30736 + }, + { + "epoch": 0.2668119200354164, + "grad_norm": 0.283203125, + "learning_rate": 0.0015988123580691314, + "loss": 0.1006, + "step": 30737 + }, + { + "epoch": 0.2668206005156205, + "grad_norm": 0.1044921875, + "learning_rate": 0.0015987875847297444, + "loss": 0.105, + "step": 30738 + }, + { + "epoch": 0.2668292809958247, + "grad_norm": 0.115234375, + "learning_rate": 0.0015987628108448907, + "loss": 0.1123, + "step": 30739 + }, + { + "epoch": 0.26683796147602884, + "grad_norm": 0.142578125, + "learning_rate": 0.0015987380364145974, + "loss": 0.106, + "step": 30740 + }, + { + "epoch": 0.26684664195623303, + "grad_norm": 0.1259765625, + "learning_rate": 0.0015987132614388919, + "loss": 0.1641, + "step": 30741 + }, + { + "epoch": 0.26685532243643717, + "grad_norm": 0.392578125, + "learning_rate": 0.001598688485917801, + "loss": 0.0791, + "step": 30742 + }, + { + "epoch": 0.26686400291664136, + "grad_norm": 0.298828125, + "learning_rate": 0.0015986637098513517, + "loss": 0.1162, + "step": 30743 + }, + { + "epoch": 0.2668726833968455, + "grad_norm": 0.21484375, + "learning_rate": 0.0015986389332395714, + "loss": 0.1338, + "step": 30744 + }, + { + "epoch": 0.2668813638770497, + "grad_norm": 0.625, + "learning_rate": 0.0015986141560824872, + "loss": 0.1338, + "step": 30745 + }, + { + "epoch": 0.26689004435725383, + "grad_norm": 0.1015625, + "learning_rate": 0.0015985893783801257, + "loss": 0.1128, + "step": 30746 + }, + { + "epoch": 0.266898724837458, + "grad_norm": 0.138671875, + "learning_rate": 0.0015985646001325144, + "loss": 0.127, + "step": 30747 + }, + { + "epoch": 0.26690740531766216, + "grad_norm": 0.244140625, + "learning_rate": 0.0015985398213396807, + "loss": 0.0913, + "step": 30748 + }, + { + "epoch": 0.26691608579786635, + "grad_norm": 4.03125, + "learning_rate": 0.001598515042001651, + "loss": 0.1484, + "step": 30749 + }, + { + "epoch": 0.2669247662780705, + "grad_norm": 0.09033203125, + "learning_rate": 0.0015984902621184532, + "loss": 0.0981, + "step": 30750 + }, + { + "epoch": 0.2669334467582747, + "grad_norm": 0.61328125, + "learning_rate": 0.0015984654816901137, + "loss": 0.127, + "step": 30751 + }, + { + "epoch": 0.2669421272384788, + "grad_norm": 0.11279296875, + "learning_rate": 0.0015984407007166594, + "loss": 0.1289, + "step": 30752 + }, + { + "epoch": 0.266950807718683, + "grad_norm": 0.271484375, + "learning_rate": 0.0015984159191981182, + "loss": 0.0894, + "step": 30753 + }, + { + "epoch": 0.26695948819888715, + "grad_norm": 0.10546875, + "learning_rate": 0.001598391137134517, + "loss": 0.0947, + "step": 30754 + }, + { + "epoch": 
0.26696816867909134, + "grad_norm": 0.25390625, + "learning_rate": 0.0015983663545258825, + "loss": 0.1367, + "step": 30755 + }, + { + "epoch": 0.2669768491592955, + "grad_norm": 0.54296875, + "learning_rate": 0.001598341571372242, + "loss": 0.1611, + "step": 30756 + }, + { + "epoch": 0.2669855296394997, + "grad_norm": 0.7578125, + "learning_rate": 0.001598316787673623, + "loss": 0.0854, + "step": 30757 + }, + { + "epoch": 0.2669942101197038, + "grad_norm": 0.12890625, + "learning_rate": 0.0015982920034300519, + "loss": 0.0854, + "step": 30758 + }, + { + "epoch": 0.267002890599908, + "grad_norm": 0.3515625, + "learning_rate": 0.0015982672186415563, + "loss": 0.1299, + "step": 30759 + }, + { + "epoch": 0.26701157108011214, + "grad_norm": 1.078125, + "learning_rate": 0.001598242433308163, + "loss": 0.1201, + "step": 30760 + }, + { + "epoch": 0.26702025156031634, + "grad_norm": 0.6953125, + "learning_rate": 0.0015982176474298995, + "loss": 0.1001, + "step": 30761 + }, + { + "epoch": 0.2670289320405205, + "grad_norm": 0.11376953125, + "learning_rate": 0.0015981928610067926, + "loss": 0.0693, + "step": 30762 + }, + { + "epoch": 0.26703761252072467, + "grad_norm": 0.08935546875, + "learning_rate": 0.0015981680740388693, + "loss": 0.0811, + "step": 30763 + }, + { + "epoch": 0.2670462930009288, + "grad_norm": 0.4765625, + "learning_rate": 0.0015981432865261572, + "loss": 0.0962, + "step": 30764 + }, + { + "epoch": 0.267054973481133, + "grad_norm": 0.14453125, + "learning_rate": 0.0015981184984686827, + "loss": 0.1001, + "step": 30765 + }, + { + "epoch": 0.26706365396133713, + "grad_norm": 0.34765625, + "learning_rate": 0.0015980937098664734, + "loss": 0.0942, + "step": 30766 + }, + { + "epoch": 0.2670723344415413, + "grad_norm": 0.115234375, + "learning_rate": 0.0015980689207195566, + "loss": 0.1113, + "step": 30767 + }, + { + "epoch": 0.26708101492174546, + "grad_norm": 0.12451171875, + "learning_rate": 0.0015980441310279587, + "loss": 0.0986, + "step": 30768 + }, + { + "epoch": 0.26708969540194966, + "grad_norm": 0.451171875, + "learning_rate": 0.0015980193407917074, + "loss": 0.0884, + "step": 30769 + }, + { + "epoch": 0.2670983758821538, + "grad_norm": 0.453125, + "learning_rate": 0.0015979945500108296, + "loss": 0.0854, + "step": 30770 + }, + { + "epoch": 0.267107056362358, + "grad_norm": 0.369140625, + "learning_rate": 0.0015979697586853527, + "loss": 0.105, + "step": 30771 + }, + { + "epoch": 0.2671157368425621, + "grad_norm": 0.216796875, + "learning_rate": 0.0015979449668153032, + "loss": 0.1445, + "step": 30772 + }, + { + "epoch": 0.2671244173227663, + "grad_norm": 0.0634765625, + "learning_rate": 0.0015979201744007086, + "loss": 0.0598, + "step": 30773 + }, + { + "epoch": 0.26713309780297045, + "grad_norm": 0.310546875, + "learning_rate": 0.0015978953814415962, + "loss": 0.1094, + "step": 30774 + }, + { + "epoch": 0.26714177828317465, + "grad_norm": 0.53125, + "learning_rate": 0.0015978705879379927, + "loss": 0.1006, + "step": 30775 + }, + { + "epoch": 0.2671504587633788, + "grad_norm": 0.92578125, + "learning_rate": 0.0015978457938899257, + "loss": 0.1172, + "step": 30776 + }, + { + "epoch": 0.267159139243583, + "grad_norm": 0.1318359375, + "learning_rate": 0.0015978209992974215, + "loss": 0.085, + "step": 30777 + }, + { + "epoch": 0.2671678197237871, + "grad_norm": 0.6484375, + "learning_rate": 0.001597796204160508, + "loss": 0.1221, + "step": 30778 + }, + { + "epoch": 0.2671765002039913, + "grad_norm": 0.1416015625, + "learning_rate": 0.0015977714084792123, + "loss": 0.1162, + 
"step": 30779 + }, + { + "epoch": 0.26718518068419544, + "grad_norm": 0.173828125, + "learning_rate": 0.0015977466122535611, + "loss": 0.0938, + "step": 30780 + }, + { + "epoch": 0.26719386116439964, + "grad_norm": 0.412109375, + "learning_rate": 0.0015977218154835816, + "loss": 0.1094, + "step": 30781 + }, + { + "epoch": 0.2672025416446038, + "grad_norm": 0.51171875, + "learning_rate": 0.0015976970181693008, + "loss": 0.1094, + "step": 30782 + }, + { + "epoch": 0.26721122212480797, + "grad_norm": 0.208984375, + "learning_rate": 0.0015976722203107463, + "loss": 0.0947, + "step": 30783 + }, + { + "epoch": 0.2672199026050121, + "grad_norm": 0.76953125, + "learning_rate": 0.001597647421907945, + "loss": 0.1025, + "step": 30784 + }, + { + "epoch": 0.2672285830852163, + "grad_norm": 0.1494140625, + "learning_rate": 0.0015976226229609239, + "loss": 0.1406, + "step": 30785 + }, + { + "epoch": 0.26723726356542044, + "grad_norm": 0.15625, + "learning_rate": 0.0015975978234697103, + "loss": 0.1367, + "step": 30786 + }, + { + "epoch": 0.26724594404562463, + "grad_norm": 0.08935546875, + "learning_rate": 0.0015975730234343308, + "loss": 0.0708, + "step": 30787 + }, + { + "epoch": 0.26725462452582877, + "grad_norm": 0.578125, + "learning_rate": 0.0015975482228548132, + "loss": 0.0889, + "step": 30788 + }, + { + "epoch": 0.26726330500603296, + "grad_norm": 0.1123046875, + "learning_rate": 0.0015975234217311845, + "loss": 0.1055, + "step": 30789 + }, + { + "epoch": 0.2672719854862371, + "grad_norm": 0.2294921875, + "learning_rate": 0.0015974986200634712, + "loss": 0.0928, + "step": 30790 + }, + { + "epoch": 0.2672806659664413, + "grad_norm": 0.51953125, + "learning_rate": 0.0015974738178517012, + "loss": 0.1094, + "step": 30791 + }, + { + "epoch": 0.2672893464466454, + "grad_norm": 0.11474609375, + "learning_rate": 0.0015974490150959014, + "loss": 0.0591, + "step": 30792 + }, + { + "epoch": 0.2672980269268496, + "grad_norm": 0.33984375, + "learning_rate": 0.0015974242117960985, + "loss": 0.0986, + "step": 30793 + }, + { + "epoch": 0.26730670740705376, + "grad_norm": 0.26953125, + "learning_rate": 0.0015973994079523201, + "loss": 0.2119, + "step": 30794 + }, + { + "epoch": 0.26731538788725795, + "grad_norm": 0.1943359375, + "learning_rate": 0.001597374603564593, + "loss": 0.1455, + "step": 30795 + }, + { + "epoch": 0.2673240683674621, + "grad_norm": 0.1728515625, + "learning_rate": 0.0015973497986329448, + "loss": 0.1016, + "step": 30796 + }, + { + "epoch": 0.2673327488476663, + "grad_norm": 0.09228515625, + "learning_rate": 0.0015973249931574022, + "loss": 0.0918, + "step": 30797 + }, + { + "epoch": 0.2673414293278704, + "grad_norm": 0.09033203125, + "learning_rate": 0.0015973001871379927, + "loss": 0.0967, + "step": 30798 + }, + { + "epoch": 0.2673501098080746, + "grad_norm": 0.482421875, + "learning_rate": 0.001597275380574743, + "loss": 0.2188, + "step": 30799 + }, + { + "epoch": 0.26735879028827875, + "grad_norm": 0.2255859375, + "learning_rate": 0.0015972505734676804, + "loss": 0.0889, + "step": 30800 + }, + { + "epoch": 0.26736747076848294, + "grad_norm": 0.10205078125, + "learning_rate": 0.0015972257658168321, + "loss": 0.1035, + "step": 30801 + }, + { + "epoch": 0.2673761512486871, + "grad_norm": 0.333984375, + "learning_rate": 0.001597200957622225, + "loss": 0.082, + "step": 30802 + }, + { + "epoch": 0.2673848317288912, + "grad_norm": 0.150390625, + "learning_rate": 0.0015971761488838866, + "loss": 0.0718, + "step": 30803 + }, + { + "epoch": 0.2673935122090954, + "grad_norm": 0.33984375, + 
"learning_rate": 0.0015971513396018434, + "loss": 0.0913, + "step": 30804 + }, + { + "epoch": 0.26740219268929954, + "grad_norm": 0.2353515625, + "learning_rate": 0.0015971265297761235, + "loss": 0.0908, + "step": 30805 + }, + { + "epoch": 0.26741087316950374, + "grad_norm": 0.35546875, + "learning_rate": 0.0015971017194067534, + "loss": 0.127, + "step": 30806 + }, + { + "epoch": 0.2674195536497079, + "grad_norm": 0.1494140625, + "learning_rate": 0.0015970769084937603, + "loss": 0.1445, + "step": 30807 + }, + { + "epoch": 0.26742823412991207, + "grad_norm": 3.03125, + "learning_rate": 0.0015970520970371711, + "loss": 0.124, + "step": 30808 + }, + { + "epoch": 0.2674369146101162, + "grad_norm": 0.12109375, + "learning_rate": 0.0015970272850370135, + "loss": 0.1045, + "step": 30809 + }, + { + "epoch": 0.2674455950903204, + "grad_norm": 0.2119140625, + "learning_rate": 0.0015970024724933144, + "loss": 0.0771, + "step": 30810 + }, + { + "epoch": 0.26745427557052454, + "grad_norm": 0.318359375, + "learning_rate": 0.0015969776594061005, + "loss": 0.1191, + "step": 30811 + }, + { + "epoch": 0.26746295605072873, + "grad_norm": 0.09033203125, + "learning_rate": 0.0015969528457753994, + "loss": 0.1074, + "step": 30812 + }, + { + "epoch": 0.26747163653093287, + "grad_norm": 0.115234375, + "learning_rate": 0.0015969280316012382, + "loss": 0.1387, + "step": 30813 + }, + { + "epoch": 0.26748031701113706, + "grad_norm": 0.66015625, + "learning_rate": 0.001596903216883644, + "loss": 0.1064, + "step": 30814 + }, + { + "epoch": 0.2674889974913412, + "grad_norm": 0.11962890625, + "learning_rate": 0.0015968784016226441, + "loss": 0.0908, + "step": 30815 + }, + { + "epoch": 0.2674976779715454, + "grad_norm": 0.447265625, + "learning_rate": 0.001596853585818265, + "loss": 0.082, + "step": 30816 + }, + { + "epoch": 0.2675063584517495, + "grad_norm": 0.2265625, + "learning_rate": 0.0015968287694705348, + "loss": 0.0938, + "step": 30817 + }, + { + "epoch": 0.2675150389319537, + "grad_norm": 0.10888671875, + "learning_rate": 0.0015968039525794796, + "loss": 0.0786, + "step": 30818 + }, + { + "epoch": 0.26752371941215786, + "grad_norm": 0.341796875, + "learning_rate": 0.0015967791351451275, + "loss": 0.0908, + "step": 30819 + }, + { + "epoch": 0.26753239989236205, + "grad_norm": 0.07861328125, + "learning_rate": 0.0015967543171675049, + "loss": 0.0981, + "step": 30820 + }, + { + "epoch": 0.2675410803725662, + "grad_norm": 0.1474609375, + "learning_rate": 0.0015967294986466397, + "loss": 0.0957, + "step": 30821 + }, + { + "epoch": 0.2675497608527704, + "grad_norm": 0.34765625, + "learning_rate": 0.0015967046795825582, + "loss": 0.1152, + "step": 30822 + }, + { + "epoch": 0.2675584413329745, + "grad_norm": 0.1064453125, + "learning_rate": 0.001596679859975288, + "loss": 0.1069, + "step": 30823 + }, + { + "epoch": 0.2675671218131787, + "grad_norm": 0.109375, + "learning_rate": 0.001596655039824856, + "loss": 0.0806, + "step": 30824 + }, + { + "epoch": 0.26757580229338285, + "grad_norm": 0.08837890625, + "learning_rate": 0.0015966302191312896, + "loss": 0.0903, + "step": 30825 + }, + { + "epoch": 0.26758448277358704, + "grad_norm": 0.53125, + "learning_rate": 0.0015966053978946158, + "loss": 0.1084, + "step": 30826 + }, + { + "epoch": 0.2675931632537912, + "grad_norm": 0.154296875, + "learning_rate": 0.001596580576114862, + "loss": 0.1191, + "step": 30827 + }, + { + "epoch": 0.26760184373399537, + "grad_norm": 0.5078125, + "learning_rate": 0.001596555753792055, + "loss": 0.0938, + "step": 30828 + }, + { + "epoch": 
0.2676105242141995, + "grad_norm": 0.15625, + "learning_rate": 0.0015965309309262218, + "loss": 0.1162, + "step": 30829 + }, + { + "epoch": 0.2676192046944037, + "grad_norm": 0.431640625, + "learning_rate": 0.0015965061075173905, + "loss": 0.1104, + "step": 30830 + }, + { + "epoch": 0.26762788517460784, + "grad_norm": 0.349609375, + "learning_rate": 0.0015964812835655873, + "loss": 0.1289, + "step": 30831 + }, + { + "epoch": 0.26763656565481203, + "grad_norm": 0.0673828125, + "learning_rate": 0.0015964564590708395, + "loss": 0.0679, + "step": 30832 + }, + { + "epoch": 0.26764524613501617, + "grad_norm": 0.18359375, + "learning_rate": 0.0015964316340331746, + "loss": 0.0815, + "step": 30833 + }, + { + "epoch": 0.26765392661522036, + "grad_norm": 0.1630859375, + "learning_rate": 0.0015964068084526192, + "loss": 0.1128, + "step": 30834 + }, + { + "epoch": 0.2676626070954245, + "grad_norm": 0.396484375, + "learning_rate": 0.0015963819823292009, + "loss": 0.126, + "step": 30835 + }, + { + "epoch": 0.2676712875756287, + "grad_norm": 0.72265625, + "learning_rate": 0.0015963571556629468, + "loss": 0.1553, + "step": 30836 + }, + { + "epoch": 0.26767996805583283, + "grad_norm": 0.6328125, + "learning_rate": 0.0015963323284538839, + "loss": 0.0859, + "step": 30837 + }, + { + "epoch": 0.267688648536037, + "grad_norm": 0.06640625, + "learning_rate": 0.0015963075007020397, + "loss": 0.0884, + "step": 30838 + }, + { + "epoch": 0.26769732901624116, + "grad_norm": 0.2119140625, + "learning_rate": 0.0015962826724074408, + "loss": 0.1289, + "step": 30839 + }, + { + "epoch": 0.26770600949644535, + "grad_norm": 0.330078125, + "learning_rate": 0.0015962578435701148, + "loss": 0.1318, + "step": 30840 + }, + { + "epoch": 0.2677146899766495, + "grad_norm": 1.1484375, + "learning_rate": 0.0015962330141900882, + "loss": 0.1328, + "step": 30841 + }, + { + "epoch": 0.2677233704568537, + "grad_norm": 0.55859375, + "learning_rate": 0.0015962081842673893, + "loss": 0.1143, + "step": 30842 + }, + { + "epoch": 0.2677320509370578, + "grad_norm": 0.154296875, + "learning_rate": 0.001596183353802044, + "loss": 0.167, + "step": 30843 + }, + { + "epoch": 0.267740731417262, + "grad_norm": 0.67578125, + "learning_rate": 0.0015961585227940802, + "loss": 0.1279, + "step": 30844 + }, + { + "epoch": 0.26774941189746615, + "grad_norm": 0.1376953125, + "learning_rate": 0.0015961336912435253, + "loss": 0.1299, + "step": 30845 + }, + { + "epoch": 0.26775809237767034, + "grad_norm": 0.6796875, + "learning_rate": 0.0015961088591504055, + "loss": 0.1396, + "step": 30846 + }, + { + "epoch": 0.2677667728578745, + "grad_norm": 0.26171875, + "learning_rate": 0.001596084026514749, + "loss": 0.1035, + "step": 30847 + }, + { + "epoch": 0.2677754533380787, + "grad_norm": 0.306640625, + "learning_rate": 0.001596059193336582, + "loss": 0.127, + "step": 30848 + }, + { + "epoch": 0.2677841338182828, + "grad_norm": 0.37890625, + "learning_rate": 0.0015960343596159326, + "loss": 0.0908, + "step": 30849 + }, + { + "epoch": 0.267792814298487, + "grad_norm": 0.79296875, + "learning_rate": 0.0015960095253528268, + "loss": 0.1113, + "step": 30850 + }, + { + "epoch": 0.26780149477869114, + "grad_norm": 1.0234375, + "learning_rate": 0.0015959846905472927, + "loss": 0.1494, + "step": 30851 + }, + { + "epoch": 0.26781017525889533, + "grad_norm": 0.21484375, + "learning_rate": 0.0015959598551993578, + "loss": 0.1387, + "step": 30852 + }, + { + "epoch": 0.26781885573909947, + "grad_norm": 0.330078125, + "learning_rate": 0.001595935019309048, + "loss": 0.0708, + 
"step": 30853 + }, + { + "epoch": 0.26782753621930366, + "grad_norm": 0.59375, + "learning_rate": 0.0015959101828763916, + "loss": 0.1592, + "step": 30854 + }, + { + "epoch": 0.2678362166995078, + "grad_norm": 0.46484375, + "learning_rate": 0.001595885345901415, + "loss": 0.1338, + "step": 30855 + }, + { + "epoch": 0.267844897179712, + "grad_norm": 0.73046875, + "learning_rate": 0.0015958605083841455, + "loss": 0.1172, + "step": 30856 + }, + { + "epoch": 0.26785357765991613, + "grad_norm": 0.2236328125, + "learning_rate": 0.0015958356703246104, + "loss": 0.1006, + "step": 30857 + }, + { + "epoch": 0.2678622581401203, + "grad_norm": 0.62109375, + "learning_rate": 0.0015958108317228371, + "loss": 0.0845, + "step": 30858 + }, + { + "epoch": 0.26787093862032446, + "grad_norm": 0.62109375, + "learning_rate": 0.0015957859925788522, + "loss": 0.1348, + "step": 30859 + }, + { + "epoch": 0.26787961910052865, + "grad_norm": 0.443359375, + "learning_rate": 0.0015957611528926833, + "loss": 0.1089, + "step": 30860 + }, + { + "epoch": 0.2678882995807328, + "grad_norm": 1.015625, + "learning_rate": 0.0015957363126643577, + "loss": 0.1416, + "step": 30861 + }, + { + "epoch": 0.267896980060937, + "grad_norm": 0.81640625, + "learning_rate": 0.0015957114718939023, + "loss": 0.1475, + "step": 30862 + }, + { + "epoch": 0.2679056605411411, + "grad_norm": 0.26953125, + "learning_rate": 0.001595686630581344, + "loss": 0.1318, + "step": 30863 + }, + { + "epoch": 0.2679143410213453, + "grad_norm": 0.3515625, + "learning_rate": 0.0015956617887267106, + "loss": 0.1279, + "step": 30864 + }, + { + "epoch": 0.26792302150154945, + "grad_norm": 0.251953125, + "learning_rate": 0.0015956369463300283, + "loss": 0.0854, + "step": 30865 + }, + { + "epoch": 0.26793170198175364, + "grad_norm": 0.2451171875, + "learning_rate": 0.0015956121033913256, + "loss": 0.0903, + "step": 30866 + }, + { + "epoch": 0.2679403824619578, + "grad_norm": 0.1884765625, + "learning_rate": 0.0015955872599106287, + "loss": 0.1133, + "step": 30867 + }, + { + "epoch": 0.267949062942162, + "grad_norm": 0.5078125, + "learning_rate": 0.001595562415887965, + "loss": 0.1016, + "step": 30868 + }, + { + "epoch": 0.2679577434223661, + "grad_norm": 0.5859375, + "learning_rate": 0.0015955375713233614, + "loss": 0.0825, + "step": 30869 + }, + { + "epoch": 0.2679664239025703, + "grad_norm": 0.109375, + "learning_rate": 0.0015955127262168457, + "loss": 0.1436, + "step": 30870 + }, + { + "epoch": 0.26797510438277444, + "grad_norm": 0.73046875, + "learning_rate": 0.0015954878805684446, + "loss": 0.1279, + "step": 30871 + }, + { + "epoch": 0.26798378486297864, + "grad_norm": 0.337890625, + "learning_rate": 0.0015954630343781855, + "loss": 0.0894, + "step": 30872 + }, + { + "epoch": 0.2679924653431828, + "grad_norm": 0.77734375, + "learning_rate": 0.0015954381876460955, + "loss": 0.0986, + "step": 30873 + }, + { + "epoch": 0.26800114582338697, + "grad_norm": 0.474609375, + "learning_rate": 0.0015954133403722013, + "loss": 0.1045, + "step": 30874 + }, + { + "epoch": 0.2680098263035911, + "grad_norm": 0.1416015625, + "learning_rate": 0.0015953884925565313, + "loss": 0.1201, + "step": 30875 + }, + { + "epoch": 0.2680185067837953, + "grad_norm": 0.1875, + "learning_rate": 0.001595363644199111, + "loss": 0.1338, + "step": 30876 + }, + { + "epoch": 0.26802718726399943, + "grad_norm": 0.48828125, + "learning_rate": 0.001595338795299969, + "loss": 0.0967, + "step": 30877 + }, + { + "epoch": 0.2680358677442036, + "grad_norm": 0.53125, + "learning_rate": 0.001595313945859132, + 
"loss": 0.1436, + "step": 30878 + }, + { + "epoch": 0.26804454822440776, + "grad_norm": 0.353515625, + "learning_rate": 0.001595289095876627, + "loss": 0.0977, + "step": 30879 + }, + { + "epoch": 0.26805322870461196, + "grad_norm": 0.86328125, + "learning_rate": 0.001595264245352481, + "loss": 0.1079, + "step": 30880 + }, + { + "epoch": 0.2680619091848161, + "grad_norm": 0.2294921875, + "learning_rate": 0.0015952393942867216, + "loss": 0.0908, + "step": 30881 + }, + { + "epoch": 0.2680705896650203, + "grad_norm": 0.0810546875, + "learning_rate": 0.001595214542679376, + "loss": 0.0869, + "step": 30882 + }, + { + "epoch": 0.2680792701452244, + "grad_norm": 0.3125, + "learning_rate": 0.001595189690530471, + "loss": 0.0918, + "step": 30883 + }, + { + "epoch": 0.2680879506254286, + "grad_norm": 0.2578125, + "learning_rate": 0.0015951648378400346, + "loss": 0.0923, + "step": 30884 + }, + { + "epoch": 0.26809663110563275, + "grad_norm": 0.3046875, + "learning_rate": 0.0015951399846080927, + "loss": 0.0986, + "step": 30885 + }, + { + "epoch": 0.26810531158583695, + "grad_norm": 0.09033203125, + "learning_rate": 0.0015951151308346734, + "loss": 0.1074, + "step": 30886 + }, + { + "epoch": 0.2681139920660411, + "grad_norm": 0.0751953125, + "learning_rate": 0.0015950902765198033, + "loss": 0.0859, + "step": 30887 + }, + { + "epoch": 0.2681226725462453, + "grad_norm": 0.83984375, + "learning_rate": 0.0015950654216635104, + "loss": 0.1436, + "step": 30888 + }, + { + "epoch": 0.2681313530264494, + "grad_norm": 0.515625, + "learning_rate": 0.0015950405662658214, + "loss": 0.124, + "step": 30889 + }, + { + "epoch": 0.2681400335066536, + "grad_norm": 0.10302734375, + "learning_rate": 0.001595015710326763, + "loss": 0.0879, + "step": 30890 + }, + { + "epoch": 0.26814871398685775, + "grad_norm": 0.10693359375, + "learning_rate": 0.0015949908538463633, + "loss": 0.0669, + "step": 30891 + }, + { + "epoch": 0.26815739446706194, + "grad_norm": 0.328125, + "learning_rate": 0.0015949659968246491, + "loss": 0.1074, + "step": 30892 + }, + { + "epoch": 0.2681660749472661, + "grad_norm": 0.32421875, + "learning_rate": 0.0015949411392616472, + "loss": 0.0938, + "step": 30893 + }, + { + "epoch": 0.26817475542747027, + "grad_norm": 0.314453125, + "learning_rate": 0.0015949162811573852, + "loss": 0.0796, + "step": 30894 + }, + { + "epoch": 0.2681834359076744, + "grad_norm": 0.392578125, + "learning_rate": 0.0015948914225118902, + "loss": 0.1021, + "step": 30895 + }, + { + "epoch": 0.2681921163878786, + "grad_norm": 0.345703125, + "learning_rate": 0.0015948665633251896, + "loss": 0.1172, + "step": 30896 + }, + { + "epoch": 0.26820079686808274, + "grad_norm": 0.3359375, + "learning_rate": 0.00159484170359731, + "loss": 0.1221, + "step": 30897 + }, + { + "epoch": 0.26820947734828693, + "grad_norm": 0.330078125, + "learning_rate": 0.001594816843328279, + "loss": 0.0991, + "step": 30898 + }, + { + "epoch": 0.26821815782849107, + "grad_norm": 0.0830078125, + "learning_rate": 0.0015947919825181239, + "loss": 0.1143, + "step": 30899 + }, + { + "epoch": 0.26822683830869526, + "grad_norm": 0.287109375, + "learning_rate": 0.0015947671211668716, + "loss": 0.0962, + "step": 30900 + }, + { + "epoch": 0.2682355187888994, + "grad_norm": 0.953125, + "learning_rate": 0.0015947422592745494, + "loss": 0.1396, + "step": 30901 + }, + { + "epoch": 0.2682441992691036, + "grad_norm": 0.09423828125, + "learning_rate": 0.0015947173968411847, + "loss": 0.0723, + "step": 30902 + }, + { + "epoch": 0.2682528797493077, + "grad_norm": 0.6328125, + 
"learning_rate": 0.001594692533866804, + "loss": 0.1035, + "step": 30903 + }, + { + "epoch": 0.2682615602295119, + "grad_norm": 0.13671875, + "learning_rate": 0.0015946676703514355, + "loss": 0.1162, + "step": 30904 + }, + { + "epoch": 0.26827024070971606, + "grad_norm": 0.306640625, + "learning_rate": 0.0015946428062951056, + "loss": 0.1064, + "step": 30905 + }, + { + "epoch": 0.26827892118992025, + "grad_norm": 0.47265625, + "learning_rate": 0.001594617941697842, + "loss": 0.1445, + "step": 30906 + }, + { + "epoch": 0.2682876016701244, + "grad_norm": 0.1806640625, + "learning_rate": 0.0015945930765596714, + "loss": 0.1455, + "step": 30907 + }, + { + "epoch": 0.2682962821503286, + "grad_norm": 0.193359375, + "learning_rate": 0.0015945682108806213, + "loss": 0.0869, + "step": 30908 + }, + { + "epoch": 0.2683049626305327, + "grad_norm": 0.23828125, + "learning_rate": 0.001594543344660719, + "loss": 0.0898, + "step": 30909 + }, + { + "epoch": 0.2683136431107369, + "grad_norm": 0.1298828125, + "learning_rate": 0.0015945184778999916, + "loss": 0.0967, + "step": 30910 + }, + { + "epoch": 0.26832232359094105, + "grad_norm": 0.12158203125, + "learning_rate": 0.0015944936105984659, + "loss": 0.0977, + "step": 30911 + }, + { + "epoch": 0.26833100407114524, + "grad_norm": 0.75390625, + "learning_rate": 0.0015944687427561692, + "loss": 0.1426, + "step": 30912 + }, + { + "epoch": 0.2683396845513494, + "grad_norm": 0.328125, + "learning_rate": 0.0015944438743731295, + "loss": 0.0723, + "step": 30913 + }, + { + "epoch": 0.26834836503155357, + "grad_norm": 0.341796875, + "learning_rate": 0.0015944190054493733, + "loss": 0.1406, + "step": 30914 + }, + { + "epoch": 0.2683570455117577, + "grad_norm": 0.1806640625, + "learning_rate": 0.0015943941359849276, + "loss": 0.1416, + "step": 30915 + }, + { + "epoch": 0.2683657259919619, + "grad_norm": 0.287109375, + "learning_rate": 0.0015943692659798202, + "loss": 0.0918, + "step": 30916 + }, + { + "epoch": 0.26837440647216604, + "grad_norm": 0.65625, + "learning_rate": 0.001594344395434078, + "loss": 0.1396, + "step": 30917 + }, + { + "epoch": 0.26838308695237023, + "grad_norm": 0.19921875, + "learning_rate": 0.0015943195243477279, + "loss": 0.1089, + "step": 30918 + }, + { + "epoch": 0.26839176743257437, + "grad_norm": 0.326171875, + "learning_rate": 0.0015942946527207976, + "loss": 0.125, + "step": 30919 + }, + { + "epoch": 0.26840044791277856, + "grad_norm": 0.5078125, + "learning_rate": 0.001594269780553314, + "loss": 0.1138, + "step": 30920 + }, + { + "epoch": 0.2684091283929827, + "grad_norm": 0.326171875, + "learning_rate": 0.0015942449078453044, + "loss": 0.1367, + "step": 30921 + }, + { + "epoch": 0.2684178088731869, + "grad_norm": 0.234375, + "learning_rate": 0.001594220034596796, + "loss": 0.1099, + "step": 30922 + }, + { + "epoch": 0.26842648935339103, + "grad_norm": 0.08251953125, + "learning_rate": 0.001594195160807816, + "loss": 0.1113, + "step": 30923 + }, + { + "epoch": 0.2684351698335952, + "grad_norm": 0.09228515625, + "learning_rate": 0.0015941702864783918, + "loss": 0.103, + "step": 30924 + }, + { + "epoch": 0.26844385031379936, + "grad_norm": 0.353515625, + "learning_rate": 0.0015941454116085503, + "loss": 0.125, + "step": 30925 + }, + { + "epoch": 0.2684525307940035, + "grad_norm": 0.5, + "learning_rate": 0.0015941205361983187, + "loss": 0.1348, + "step": 30926 + }, + { + "epoch": 0.2684612112742077, + "grad_norm": 0.177734375, + "learning_rate": 0.0015940956602477241, + "loss": 0.0879, + "step": 30927 + }, + { + "epoch": 0.2684698917544118, 
+ "grad_norm": 0.2353515625, + "learning_rate": 0.0015940707837567944, + "loss": 0.1021, + "step": 30928 + }, + { + "epoch": 0.268478572234616, + "grad_norm": 0.83984375, + "learning_rate": 0.0015940459067255558, + "loss": 0.0903, + "step": 30929 + }, + { + "epoch": 0.26848725271482016, + "grad_norm": 0.1572265625, + "learning_rate": 0.0015940210291540368, + "loss": 0.1406, + "step": 30930 + }, + { + "epoch": 0.26849593319502435, + "grad_norm": 0.47265625, + "learning_rate": 0.001593996151042263, + "loss": 0.0762, + "step": 30931 + }, + { + "epoch": 0.2685046136752285, + "grad_norm": 0.82421875, + "learning_rate": 0.0015939712723902627, + "loss": 0.083, + "step": 30932 + }, + { + "epoch": 0.2685132941554327, + "grad_norm": 0.2294921875, + "learning_rate": 0.0015939463931980628, + "loss": 0.124, + "step": 30933 + }, + { + "epoch": 0.2685219746356368, + "grad_norm": 0.27734375, + "learning_rate": 0.0015939215134656909, + "loss": 0.1211, + "step": 30934 + }, + { + "epoch": 0.268530655115841, + "grad_norm": 0.640625, + "learning_rate": 0.0015938966331931733, + "loss": 0.0854, + "step": 30935 + }, + { + "epoch": 0.26853933559604515, + "grad_norm": 0.462890625, + "learning_rate": 0.0015938717523805385, + "loss": 0.1074, + "step": 30936 + }, + { + "epoch": 0.26854801607624934, + "grad_norm": 0.1982421875, + "learning_rate": 0.0015938468710278123, + "loss": 0.127, + "step": 30937 + }, + { + "epoch": 0.2685566965564535, + "grad_norm": 0.298828125, + "learning_rate": 0.0015938219891350226, + "loss": 0.0957, + "step": 30938 + }, + { + "epoch": 0.26856537703665767, + "grad_norm": 0.15625, + "learning_rate": 0.0015937971067021969, + "loss": 0.1504, + "step": 30939 + }, + { + "epoch": 0.2685740575168618, + "grad_norm": 0.373046875, + "learning_rate": 0.001593772223729362, + "loss": 0.103, + "step": 30940 + }, + { + "epoch": 0.268582737997066, + "grad_norm": 0.453125, + "learning_rate": 0.0015937473402165452, + "loss": 0.0928, + "step": 30941 + }, + { + "epoch": 0.26859141847727014, + "grad_norm": 0.16796875, + "learning_rate": 0.0015937224561637735, + "loss": 0.0996, + "step": 30942 + }, + { + "epoch": 0.26860009895747433, + "grad_norm": 0.1787109375, + "learning_rate": 0.0015936975715710748, + "loss": 0.0898, + "step": 30943 + }, + { + "epoch": 0.26860877943767847, + "grad_norm": 0.279296875, + "learning_rate": 0.0015936726864384757, + "loss": 0.1328, + "step": 30944 + }, + { + "epoch": 0.26861745991788266, + "grad_norm": 0.296875, + "learning_rate": 0.0015936478007660033, + "loss": 0.1299, + "step": 30945 + }, + { + "epoch": 0.2686261403980868, + "grad_norm": 0.427734375, + "learning_rate": 0.0015936229145536854, + "loss": 0.125, + "step": 30946 + }, + { + "epoch": 0.268634820878291, + "grad_norm": 1.0, + "learning_rate": 0.0015935980278015487, + "loss": 0.0967, + "step": 30947 + }, + { + "epoch": 0.26864350135849513, + "grad_norm": 0.296875, + "learning_rate": 0.0015935731405096207, + "loss": 0.1211, + "step": 30948 + }, + { + "epoch": 0.2686521818386993, + "grad_norm": 0.10986328125, + "learning_rate": 0.001593548252677929, + "loss": 0.1377, + "step": 30949 + }, + { + "epoch": 0.26866086231890346, + "grad_norm": 0.365234375, + "learning_rate": 0.0015935233643064994, + "loss": 0.082, + "step": 30950 + }, + { + "epoch": 0.26866954279910765, + "grad_norm": 0.138671875, + "learning_rate": 0.0015934984753953605, + "loss": 0.1377, + "step": 30951 + }, + { + "epoch": 0.2686782232793118, + "grad_norm": 0.2578125, + "learning_rate": 0.001593473585944539, + "loss": 0.1445, + "step": 30952 + }, + { + "epoch": 
0.268686903759516, + "grad_norm": 0.49609375, + "learning_rate": 0.0015934486959540623, + "loss": 0.0947, + "step": 30953 + }, + { + "epoch": 0.2686955842397201, + "grad_norm": 0.1591796875, + "learning_rate": 0.0015934238054239578, + "loss": 0.125, + "step": 30954 + }, + { + "epoch": 0.2687042647199243, + "grad_norm": 0.28125, + "learning_rate": 0.001593398914354252, + "loss": 0.0879, + "step": 30955 + }, + { + "epoch": 0.26871294520012845, + "grad_norm": 0.158203125, + "learning_rate": 0.0015933740227449726, + "loss": 0.0728, + "step": 30956 + }, + { + "epoch": 0.26872162568033264, + "grad_norm": 0.1435546875, + "learning_rate": 0.0015933491305961474, + "loss": 0.0649, + "step": 30957 + }, + { + "epoch": 0.2687303061605368, + "grad_norm": 0.6484375, + "learning_rate": 0.0015933242379078023, + "loss": 0.0898, + "step": 30958 + }, + { + "epoch": 0.268738986640741, + "grad_norm": 0.16796875, + "learning_rate": 0.0015932993446799652, + "loss": 0.1113, + "step": 30959 + }, + { + "epoch": 0.2687476671209451, + "grad_norm": 0.10595703125, + "learning_rate": 0.0015932744509126638, + "loss": 0.1367, + "step": 30960 + }, + { + "epoch": 0.2687563476011493, + "grad_norm": 0.1533203125, + "learning_rate": 0.0015932495566059247, + "loss": 0.1211, + "step": 30961 + }, + { + "epoch": 0.26876502808135344, + "grad_norm": 0.412109375, + "learning_rate": 0.0015932246617597753, + "loss": 0.1064, + "step": 30962 + }, + { + "epoch": 0.26877370856155763, + "grad_norm": 0.2490234375, + "learning_rate": 0.0015931997663742427, + "loss": 0.084, + "step": 30963 + }, + { + "epoch": 0.26878238904176177, + "grad_norm": 1.6015625, + "learning_rate": 0.0015931748704493545, + "loss": 0.1279, + "step": 30964 + }, + { + "epoch": 0.26879106952196596, + "grad_norm": 0.11767578125, + "learning_rate": 0.0015931499739851374, + "loss": 0.0762, + "step": 30965 + }, + { + "epoch": 0.2687997500021701, + "grad_norm": 0.62109375, + "learning_rate": 0.001593125076981619, + "loss": 0.1504, + "step": 30966 + }, + { + "epoch": 0.2688084304823743, + "grad_norm": 0.126953125, + "learning_rate": 0.0015931001794388265, + "loss": 0.1055, + "step": 30967 + }, + { + "epoch": 0.26881711096257843, + "grad_norm": 0.2158203125, + "learning_rate": 0.0015930752813567868, + "loss": 0.0923, + "step": 30968 + }, + { + "epoch": 0.2688257914427826, + "grad_norm": 0.45703125, + "learning_rate": 0.0015930503827355277, + "loss": 0.1074, + "step": 30969 + }, + { + "epoch": 0.26883447192298676, + "grad_norm": 0.08251953125, + "learning_rate": 0.001593025483575076, + "loss": 0.0918, + "step": 30970 + }, + { + "epoch": 0.26884315240319095, + "grad_norm": 0.1611328125, + "learning_rate": 0.0015930005838754588, + "loss": 0.0752, + "step": 30971 + }, + { + "epoch": 0.2688518328833951, + "grad_norm": 0.25, + "learning_rate": 0.0015929756836367038, + "loss": 0.1152, + "step": 30972 + }, + { + "epoch": 0.2688605133635993, + "grad_norm": 0.62109375, + "learning_rate": 0.0015929507828588382, + "loss": 0.1104, + "step": 30973 + }, + { + "epoch": 0.2688691938438034, + "grad_norm": 0.32421875, + "learning_rate": 0.0015929258815418886, + "loss": 0.1289, + "step": 30974 + }, + { + "epoch": 0.2688778743240076, + "grad_norm": 0.09228515625, + "learning_rate": 0.001592900979685883, + "loss": 0.0991, + "step": 30975 + }, + { + "epoch": 0.26888655480421175, + "grad_norm": 0.1630859375, + "learning_rate": 0.0015928760772908482, + "loss": 0.166, + "step": 30976 + }, + { + "epoch": 0.26889523528441595, + "grad_norm": 0.36328125, + "learning_rate": 0.0015928511743568113, + "loss": 
0.1641, + "step": 30977 + }, + { + "epoch": 0.2689039157646201, + "grad_norm": 0.62109375, + "learning_rate": 0.0015928262708837999, + "loss": 0.1396, + "step": 30978 + }, + { + "epoch": 0.2689125962448243, + "grad_norm": 0.11376953125, + "learning_rate": 0.001592801366871841, + "loss": 0.0952, + "step": 30979 + }, + { + "epoch": 0.2689212767250284, + "grad_norm": 0.2158203125, + "learning_rate": 0.0015927764623209625, + "loss": 0.1543, + "step": 30980 + }, + { + "epoch": 0.2689299572052326, + "grad_norm": 0.30078125, + "learning_rate": 0.0015927515572311903, + "loss": 0.1191, + "step": 30981 + }, + { + "epoch": 0.26893863768543674, + "grad_norm": 0.154296875, + "learning_rate": 0.001592726651602553, + "loss": 0.1182, + "step": 30982 + }, + { + "epoch": 0.26894731816564094, + "grad_norm": 0.0869140625, + "learning_rate": 0.001592701745435077, + "loss": 0.0879, + "step": 30983 + }, + { + "epoch": 0.2689559986458451, + "grad_norm": 0.193359375, + "learning_rate": 0.0015926768387287894, + "loss": 0.0898, + "step": 30984 + }, + { + "epoch": 0.26896467912604927, + "grad_norm": 0.1865234375, + "learning_rate": 0.0015926519314837183, + "loss": 0.1006, + "step": 30985 + }, + { + "epoch": 0.2689733596062534, + "grad_norm": 0.1533203125, + "learning_rate": 0.0015926270236998902, + "loss": 0.0977, + "step": 30986 + }, + { + "epoch": 0.2689820400864576, + "grad_norm": 0.5078125, + "learning_rate": 0.0015926021153773332, + "loss": 0.083, + "step": 30987 + }, + { + "epoch": 0.26899072056666173, + "grad_norm": 0.20703125, + "learning_rate": 0.001592577206516073, + "loss": 0.1094, + "step": 30988 + }, + { + "epoch": 0.2689994010468659, + "grad_norm": 0.3671875, + "learning_rate": 0.0015925522971161384, + "loss": 0.1094, + "step": 30989 + }, + { + "epoch": 0.26900808152707006, + "grad_norm": 0.29296875, + "learning_rate": 0.0015925273871775557, + "loss": 0.1084, + "step": 30990 + }, + { + "epoch": 0.26901676200727426, + "grad_norm": 0.326171875, + "learning_rate": 0.0015925024767003526, + "loss": 0.1172, + "step": 30991 + }, + { + "epoch": 0.2690254424874784, + "grad_norm": 0.11962890625, + "learning_rate": 0.001592477565684556, + "loss": 0.1504, + "step": 30992 + }, + { + "epoch": 0.2690341229676826, + "grad_norm": 0.9453125, + "learning_rate": 0.0015924526541301936, + "loss": 0.123, + "step": 30993 + }, + { + "epoch": 0.2690428034478867, + "grad_norm": 0.88671875, + "learning_rate": 0.001592427742037292, + "loss": 0.1357, + "step": 30994 + }, + { + "epoch": 0.2690514839280909, + "grad_norm": 0.373046875, + "learning_rate": 0.0015924028294058792, + "loss": 0.1348, + "step": 30995 + }, + { + "epoch": 0.26906016440829505, + "grad_norm": 0.12060546875, + "learning_rate": 0.0015923779162359821, + "loss": 0.1172, + "step": 30996 + }, + { + "epoch": 0.26906884488849925, + "grad_norm": 0.10107421875, + "learning_rate": 0.0015923530025276278, + "loss": 0.1182, + "step": 30997 + }, + { + "epoch": 0.2690775253687034, + "grad_norm": 0.251953125, + "learning_rate": 0.0015923280882808432, + "loss": 0.1758, + "step": 30998 + }, + { + "epoch": 0.2690862058489076, + "grad_norm": 0.1376953125, + "learning_rate": 0.0015923031734956566, + "loss": 0.0952, + "step": 30999 + }, + { + "epoch": 0.2690948863291117, + "grad_norm": 0.197265625, + "learning_rate": 0.0015922782581720945, + "loss": 0.125, + "step": 31000 + }, + { + "epoch": 0.2691035668093159, + "grad_norm": 0.14453125, + "learning_rate": 0.0015922533423101844, + "loss": 0.0928, + "step": 31001 + }, + { + "epoch": 0.26911224728952005, + "grad_norm": 0.197265625, + 
"learning_rate": 0.0015922284259099531, + "loss": 0.1162, + "step": 31002 + }, + { + "epoch": 0.26912092776972424, + "grad_norm": 0.294921875, + "learning_rate": 0.0015922035089714285, + "loss": 0.1973, + "step": 31003 + }, + { + "epoch": 0.2691296082499284, + "grad_norm": 0.126953125, + "learning_rate": 0.0015921785914946372, + "loss": 0.104, + "step": 31004 + }, + { + "epoch": 0.26913828873013257, + "grad_norm": 0.578125, + "learning_rate": 0.0015921536734796072, + "loss": 0.1182, + "step": 31005 + }, + { + "epoch": 0.2691469692103367, + "grad_norm": 0.3515625, + "learning_rate": 0.0015921287549263653, + "loss": 0.0835, + "step": 31006 + }, + { + "epoch": 0.2691556496905409, + "grad_norm": 0.220703125, + "learning_rate": 0.0015921038358349382, + "loss": 0.127, + "step": 31007 + }, + { + "epoch": 0.26916433017074504, + "grad_norm": 0.177734375, + "learning_rate": 0.0015920789162053544, + "loss": 0.1211, + "step": 31008 + }, + { + "epoch": 0.26917301065094923, + "grad_norm": 0.53515625, + "learning_rate": 0.0015920539960376401, + "loss": 0.0845, + "step": 31009 + }, + { + "epoch": 0.26918169113115337, + "grad_norm": 0.392578125, + "learning_rate": 0.0015920290753318233, + "loss": 0.1094, + "step": 31010 + }, + { + "epoch": 0.26919037161135756, + "grad_norm": 0.296875, + "learning_rate": 0.0015920041540879305, + "loss": 0.1279, + "step": 31011 + }, + { + "epoch": 0.2691990520915617, + "grad_norm": 0.177734375, + "learning_rate": 0.0015919792323059896, + "loss": 0.0981, + "step": 31012 + }, + { + "epoch": 0.2692077325717659, + "grad_norm": 0.13671875, + "learning_rate": 0.0015919543099860275, + "loss": 0.085, + "step": 31013 + }, + { + "epoch": 0.26921641305197, + "grad_norm": 0.205078125, + "learning_rate": 0.0015919293871280717, + "loss": 0.1064, + "step": 31014 + }, + { + "epoch": 0.2692250935321742, + "grad_norm": 0.345703125, + "learning_rate": 0.0015919044637321491, + "loss": 0.124, + "step": 31015 + }, + { + "epoch": 0.26923377401237836, + "grad_norm": 0.197265625, + "learning_rate": 0.0015918795397982873, + "loss": 0.1123, + "step": 31016 + }, + { + "epoch": 0.26924245449258255, + "grad_norm": 0.2041015625, + "learning_rate": 0.0015918546153265135, + "loss": 0.0986, + "step": 31017 + }, + { + "epoch": 0.2692511349727867, + "grad_norm": 0.34765625, + "learning_rate": 0.0015918296903168548, + "loss": 0.1172, + "step": 31018 + }, + { + "epoch": 0.2692598154529909, + "grad_norm": 0.62109375, + "learning_rate": 0.0015918047647693384, + "loss": 0.1221, + "step": 31019 + }, + { + "epoch": 0.269268495933195, + "grad_norm": 0.380859375, + "learning_rate": 0.0015917798386839918, + "loss": 0.1201, + "step": 31020 + }, + { + "epoch": 0.2692771764133992, + "grad_norm": 0.1044921875, + "learning_rate": 0.0015917549120608423, + "loss": 0.1069, + "step": 31021 + }, + { + "epoch": 0.26928585689360335, + "grad_norm": 0.38671875, + "learning_rate": 0.0015917299848999169, + "loss": 0.0898, + "step": 31022 + }, + { + "epoch": 0.26929453737380754, + "grad_norm": 0.1923828125, + "learning_rate": 0.001591705057201243, + "loss": 0.1572, + "step": 31023 + }, + { + "epoch": 0.2693032178540117, + "grad_norm": 0.16015625, + "learning_rate": 0.0015916801289648477, + "loss": 0.083, + "step": 31024 + }, + { + "epoch": 0.26931189833421587, + "grad_norm": 0.349609375, + "learning_rate": 0.0015916552001907585, + "loss": 0.0791, + "step": 31025 + }, + { + "epoch": 0.26932057881442, + "grad_norm": 0.1767578125, + "learning_rate": 0.0015916302708790027, + "loss": 0.1143, + "step": 31026 + }, + { + "epoch": 
0.2693292592946242, + "grad_norm": 0.455078125, + "learning_rate": 0.0015916053410296074, + "loss": 0.1387, + "step": 31027 + }, + { + "epoch": 0.26933793977482834, + "grad_norm": 0.08984375, + "learning_rate": 0.0015915804106425993, + "loss": 0.0898, + "step": 31028 + }, + { + "epoch": 0.26934662025503253, + "grad_norm": 0.275390625, + "learning_rate": 0.001591555479718007, + "loss": 0.0762, + "step": 31029 + }, + { + "epoch": 0.26935530073523667, + "grad_norm": 0.458984375, + "learning_rate": 0.0015915305482558568, + "loss": 0.125, + "step": 31030 + }, + { + "epoch": 0.26936398121544086, + "grad_norm": 0.412109375, + "learning_rate": 0.0015915056162561763, + "loss": 0.0801, + "step": 31031 + }, + { + "epoch": 0.269372661695645, + "grad_norm": 0.12353515625, + "learning_rate": 0.0015914806837189922, + "loss": 0.125, + "step": 31032 + }, + { + "epoch": 0.2693813421758492, + "grad_norm": 0.318359375, + "learning_rate": 0.0015914557506443326, + "loss": 0.1729, + "step": 31033 + }, + { + "epoch": 0.26939002265605333, + "grad_norm": 0.263671875, + "learning_rate": 0.001591430817032224, + "loss": 0.1221, + "step": 31034 + }, + { + "epoch": 0.2693987031362575, + "grad_norm": 0.41015625, + "learning_rate": 0.0015914058828826944, + "loss": 0.1094, + "step": 31035 + }, + { + "epoch": 0.26940738361646166, + "grad_norm": 0.109375, + "learning_rate": 0.001591380948195771, + "loss": 0.1035, + "step": 31036 + }, + { + "epoch": 0.26941606409666585, + "grad_norm": 0.1962890625, + "learning_rate": 0.0015913560129714802, + "loss": 0.1309, + "step": 31037 + }, + { + "epoch": 0.26942474457687, + "grad_norm": 0.33203125, + "learning_rate": 0.00159133107720985, + "loss": 0.0791, + "step": 31038 + }, + { + "epoch": 0.2694334250570742, + "grad_norm": 0.609375, + "learning_rate": 0.0015913061409109073, + "loss": 0.126, + "step": 31039 + }, + { + "epoch": 0.2694421055372783, + "grad_norm": 0.13671875, + "learning_rate": 0.00159128120407468, + "loss": 0.1016, + "step": 31040 + }, + { + "epoch": 0.2694507860174825, + "grad_norm": 0.396484375, + "learning_rate": 0.0015912562667011949, + "loss": 0.0938, + "step": 31041 + }, + { + "epoch": 0.26945946649768665, + "grad_norm": 0.1767578125, + "learning_rate": 0.001591231328790479, + "loss": 0.1211, + "step": 31042 + }, + { + "epoch": 0.26946814697789084, + "grad_norm": 0.1923828125, + "learning_rate": 0.0015912063903425599, + "loss": 0.1348, + "step": 31043 + }, + { + "epoch": 0.269476827458095, + "grad_norm": 0.146484375, + "learning_rate": 0.0015911814513574654, + "loss": 0.0918, + "step": 31044 + }, + { + "epoch": 0.2694855079382992, + "grad_norm": 0.189453125, + "learning_rate": 0.001591156511835222, + "loss": 0.1484, + "step": 31045 + }, + { + "epoch": 0.2694941884185033, + "grad_norm": 0.2109375, + "learning_rate": 0.0015911315717758573, + "loss": 0.1094, + "step": 31046 + }, + { + "epoch": 0.2695028688987075, + "grad_norm": 0.58984375, + "learning_rate": 0.001591106631179398, + "loss": 0.0928, + "step": 31047 + }, + { + "epoch": 0.26951154937891164, + "grad_norm": 0.25390625, + "learning_rate": 0.0015910816900458723, + "loss": 0.0781, + "step": 31048 + }, + { + "epoch": 0.2695202298591158, + "grad_norm": 0.10791015625, + "learning_rate": 0.0015910567483753068, + "loss": 0.0991, + "step": 31049 + }, + { + "epoch": 0.26952891033931997, + "grad_norm": 0.12353515625, + "learning_rate": 0.001591031806167729, + "loss": 0.0977, + "step": 31050 + }, + { + "epoch": 0.2695375908195241, + "grad_norm": 0.099609375, + "learning_rate": 0.0015910068634231668, + "loss": 0.0542, + 
"step": 31051 + }, + { + "epoch": 0.2695462712997283, + "grad_norm": 0.58203125, + "learning_rate": 0.0015909819201416465, + "loss": 0.1406, + "step": 31052 + }, + { + "epoch": 0.26955495177993244, + "grad_norm": 0.431640625, + "learning_rate": 0.0015909569763231956, + "loss": 0.0781, + "step": 31053 + }, + { + "epoch": 0.26956363226013663, + "grad_norm": 0.275390625, + "learning_rate": 0.0015909320319678413, + "loss": 0.1113, + "step": 31054 + }, + { + "epoch": 0.26957231274034077, + "grad_norm": 0.4609375, + "learning_rate": 0.0015909070870756114, + "loss": 0.1172, + "step": 31055 + }, + { + "epoch": 0.26958099322054496, + "grad_norm": 0.205078125, + "learning_rate": 0.0015908821416465329, + "loss": 0.0864, + "step": 31056 + }, + { + "epoch": 0.2695896737007491, + "grad_norm": 0.2353515625, + "learning_rate": 0.0015908571956806331, + "loss": 0.103, + "step": 31057 + }, + { + "epoch": 0.2695983541809533, + "grad_norm": 0.30078125, + "learning_rate": 0.0015908322491779393, + "loss": 0.0825, + "step": 31058 + }, + { + "epoch": 0.26960703466115743, + "grad_norm": 0.26171875, + "learning_rate": 0.0015908073021384786, + "loss": 0.1211, + "step": 31059 + }, + { + "epoch": 0.2696157151413616, + "grad_norm": 0.267578125, + "learning_rate": 0.0015907823545622786, + "loss": 0.0972, + "step": 31060 + }, + { + "epoch": 0.26962439562156576, + "grad_norm": 0.55078125, + "learning_rate": 0.001590757406449366, + "loss": 0.124, + "step": 31061 + }, + { + "epoch": 0.26963307610176995, + "grad_norm": 0.5, + "learning_rate": 0.001590732457799769, + "loss": 0.1011, + "step": 31062 + }, + { + "epoch": 0.2696417565819741, + "grad_norm": 0.09814453125, + "learning_rate": 0.001590707508613514, + "loss": 0.0903, + "step": 31063 + }, + { + "epoch": 0.2696504370621783, + "grad_norm": 0.46484375, + "learning_rate": 0.0015906825588906286, + "loss": 0.0986, + "step": 31064 + }, + { + "epoch": 0.2696591175423824, + "grad_norm": 0.12109375, + "learning_rate": 0.0015906576086311404, + "loss": 0.0684, + "step": 31065 + }, + { + "epoch": 0.2696677980225866, + "grad_norm": 0.0986328125, + "learning_rate": 0.0015906326578350764, + "loss": 0.0957, + "step": 31066 + }, + { + "epoch": 0.26967647850279075, + "grad_norm": 0.2734375, + "learning_rate": 0.0015906077065024633, + "loss": 0.1113, + "step": 31067 + }, + { + "epoch": 0.26968515898299494, + "grad_norm": 0.478515625, + "learning_rate": 0.0015905827546333297, + "loss": 0.126, + "step": 31068 + }, + { + "epoch": 0.2696938394631991, + "grad_norm": 0.431640625, + "learning_rate": 0.0015905578022277018, + "loss": 0.1123, + "step": 31069 + }, + { + "epoch": 0.2697025199434033, + "grad_norm": 0.494140625, + "learning_rate": 0.0015905328492856075, + "loss": 0.1289, + "step": 31070 + }, + { + "epoch": 0.2697112004236074, + "grad_norm": 0.546875, + "learning_rate": 0.0015905078958070738, + "loss": 0.0859, + "step": 31071 + }, + { + "epoch": 0.2697198809038116, + "grad_norm": 0.283203125, + "learning_rate": 0.0015904829417921278, + "loss": 0.1152, + "step": 31072 + }, + { + "epoch": 0.26972856138401574, + "grad_norm": 0.1162109375, + "learning_rate": 0.0015904579872407971, + "loss": 0.1152, + "step": 31073 + }, + { + "epoch": 0.26973724186421993, + "grad_norm": 0.388671875, + "learning_rate": 0.001590433032153109, + "loss": 0.1328, + "step": 31074 + }, + { + "epoch": 0.26974592234442407, + "grad_norm": 0.1201171875, + "learning_rate": 0.001590408076529091, + "loss": 0.1328, + "step": 31075 + }, + { + "epoch": 0.26975460282462826, + "grad_norm": 0.43359375, + "learning_rate": 
0.0015903831203687693, + "loss": 0.106, + "step": 31076 + }, + { + "epoch": 0.2697632833048324, + "grad_norm": 0.318359375, + "learning_rate": 0.0015903581636721724, + "loss": 0.1582, + "step": 31077 + }, + { + "epoch": 0.2697719637850366, + "grad_norm": 0.44921875, + "learning_rate": 0.0015903332064393274, + "loss": 0.1387, + "step": 31078 + }, + { + "epoch": 0.26978064426524073, + "grad_norm": 0.419921875, + "learning_rate": 0.0015903082486702614, + "loss": 0.127, + "step": 31079 + }, + { + "epoch": 0.2697893247454449, + "grad_norm": 1.515625, + "learning_rate": 0.0015902832903650012, + "loss": 0.166, + "step": 31080 + }, + { + "epoch": 0.26979800522564906, + "grad_norm": 0.265625, + "learning_rate": 0.001590258331523575, + "loss": 0.1523, + "step": 31081 + }, + { + "epoch": 0.26980668570585326, + "grad_norm": 0.427734375, + "learning_rate": 0.0015902333721460095, + "loss": 0.1445, + "step": 31082 + }, + { + "epoch": 0.2698153661860574, + "grad_norm": 0.126953125, + "learning_rate": 0.0015902084122323323, + "loss": 0.0815, + "step": 31083 + }, + { + "epoch": 0.2698240466662616, + "grad_norm": 0.59375, + "learning_rate": 0.0015901834517825701, + "loss": 0.105, + "step": 31084 + }, + { + "epoch": 0.2698327271464657, + "grad_norm": 0.228515625, + "learning_rate": 0.001590158490796751, + "loss": 0.0874, + "step": 31085 + }, + { + "epoch": 0.2698414076266699, + "grad_norm": 0.0859375, + "learning_rate": 0.001590133529274902, + "loss": 0.0981, + "step": 31086 + }, + { + "epoch": 0.26985008810687405, + "grad_norm": 0.50390625, + "learning_rate": 0.0015901085672170503, + "loss": 0.1514, + "step": 31087 + }, + { + "epoch": 0.26985876858707825, + "grad_norm": 0.5703125, + "learning_rate": 0.0015900836046232232, + "loss": 0.1816, + "step": 31088 + }, + { + "epoch": 0.2698674490672824, + "grad_norm": 0.267578125, + "learning_rate": 0.0015900586414934479, + "loss": 0.0918, + "step": 31089 + }, + { + "epoch": 0.2698761295474866, + "grad_norm": 0.57421875, + "learning_rate": 0.0015900336778277516, + "loss": 0.1025, + "step": 31090 + }, + { + "epoch": 0.2698848100276907, + "grad_norm": 0.275390625, + "learning_rate": 0.0015900087136261625, + "loss": 0.1079, + "step": 31091 + }, + { + "epoch": 0.2698934905078949, + "grad_norm": 0.78125, + "learning_rate": 0.0015899837488887068, + "loss": 0.1147, + "step": 31092 + }, + { + "epoch": 0.26990217098809904, + "grad_norm": 0.76171875, + "learning_rate": 0.0015899587836154122, + "loss": 0.0996, + "step": 31093 + }, + { + "epoch": 0.26991085146830324, + "grad_norm": 0.65234375, + "learning_rate": 0.0015899338178063062, + "loss": 0.0874, + "step": 31094 + }, + { + "epoch": 0.2699195319485074, + "grad_norm": 0.79296875, + "learning_rate": 0.001589908851461416, + "loss": 0.103, + "step": 31095 + }, + { + "epoch": 0.26992821242871157, + "grad_norm": 0.2001953125, + "learning_rate": 0.0015898838845807685, + "loss": 0.0947, + "step": 31096 + }, + { + "epoch": 0.2699368929089157, + "grad_norm": 0.1650390625, + "learning_rate": 0.0015898589171643915, + "loss": 0.0815, + "step": 31097 + }, + { + "epoch": 0.2699455733891199, + "grad_norm": 0.1982421875, + "learning_rate": 0.0015898339492123124, + "loss": 0.0977, + "step": 31098 + }, + { + "epoch": 0.26995425386932403, + "grad_norm": 0.546875, + "learning_rate": 0.001589808980724558, + "loss": 0.1094, + "step": 31099 + }, + { + "epoch": 0.2699629343495282, + "grad_norm": 0.1767578125, + "learning_rate": 0.0015897840117011562, + "loss": 0.1245, + "step": 31100 + }, + { + "epoch": 0.26997161482973236, + "grad_norm": 0.53125, 
+ "learning_rate": 0.0015897590421421338, + "loss": 0.0996, + "step": 31101 + }, + { + "epoch": 0.26998029530993656, + "grad_norm": 0.51171875, + "learning_rate": 0.0015897340720475177, + "loss": 0.1133, + "step": 31102 + }, + { + "epoch": 0.2699889757901407, + "grad_norm": 0.271484375, + "learning_rate": 0.0015897091014173367, + "loss": 0.1016, + "step": 31103 + }, + { + "epoch": 0.2699976562703449, + "grad_norm": 0.5078125, + "learning_rate": 0.0015896841302516164, + "loss": 0.0801, + "step": 31104 + }, + { + "epoch": 0.270006336750549, + "grad_norm": 0.7734375, + "learning_rate": 0.0015896591585503853, + "loss": 0.127, + "step": 31105 + }, + { + "epoch": 0.2700150172307532, + "grad_norm": 0.10498046875, + "learning_rate": 0.0015896341863136701, + "loss": 0.0859, + "step": 31106 + }, + { + "epoch": 0.27002369771095736, + "grad_norm": 0.357421875, + "learning_rate": 0.0015896092135414986, + "loss": 0.126, + "step": 31107 + }, + { + "epoch": 0.27003237819116155, + "grad_norm": 1.4453125, + "learning_rate": 0.0015895842402338977, + "loss": 0.2021, + "step": 31108 + }, + { + "epoch": 0.2700410586713657, + "grad_norm": 0.326171875, + "learning_rate": 0.0015895592663908947, + "loss": 0.1162, + "step": 31109 + }, + { + "epoch": 0.2700497391515699, + "grad_norm": 0.0732421875, + "learning_rate": 0.0015895342920125171, + "loss": 0.0737, + "step": 31110 + }, + { + "epoch": 0.270058419631774, + "grad_norm": 0.416015625, + "learning_rate": 0.001589509317098792, + "loss": 0.0762, + "step": 31111 + }, + { + "epoch": 0.2700671001119782, + "grad_norm": 0.6015625, + "learning_rate": 0.001589484341649747, + "loss": 0.1299, + "step": 31112 + }, + { + "epoch": 0.27007578059218235, + "grad_norm": 0.193359375, + "learning_rate": 0.0015894593656654095, + "loss": 0.0781, + "step": 31113 + }, + { + "epoch": 0.27008446107238654, + "grad_norm": 0.1015625, + "learning_rate": 0.0015894343891458065, + "loss": 0.1309, + "step": 31114 + }, + { + "epoch": 0.2700931415525907, + "grad_norm": 0.6171875, + "learning_rate": 0.0015894094120909653, + "loss": 0.1602, + "step": 31115 + }, + { + "epoch": 0.27010182203279487, + "grad_norm": 0.201171875, + "learning_rate": 0.0015893844345009132, + "loss": 0.0869, + "step": 31116 + }, + { + "epoch": 0.270110502512999, + "grad_norm": 0.115234375, + "learning_rate": 0.001589359456375678, + "loss": 0.1104, + "step": 31117 + }, + { + "epoch": 0.2701191829932032, + "grad_norm": 0.10009765625, + "learning_rate": 0.0015893344777152866, + "loss": 0.1069, + "step": 31118 + }, + { + "epoch": 0.27012786347340734, + "grad_norm": 0.166015625, + "learning_rate": 0.0015893094985197657, + "loss": 0.0864, + "step": 31119 + }, + { + "epoch": 0.27013654395361153, + "grad_norm": 0.08837890625, + "learning_rate": 0.001589284518789144, + "loss": 0.1182, + "step": 31120 + }, + { + "epoch": 0.27014522443381567, + "grad_norm": 0.384765625, + "learning_rate": 0.0015892595385234478, + "loss": 0.2119, + "step": 31121 + }, + { + "epoch": 0.27015390491401986, + "grad_norm": 0.080078125, + "learning_rate": 0.001589234557722705, + "loss": 0.0811, + "step": 31122 + }, + { + "epoch": 0.270162585394224, + "grad_norm": 0.11376953125, + "learning_rate": 0.0015892095763869424, + "loss": 0.1182, + "step": 31123 + }, + { + "epoch": 0.2701712658744282, + "grad_norm": 0.2158203125, + "learning_rate": 0.0015891845945161874, + "loss": 0.0957, + "step": 31124 + }, + { + "epoch": 0.2701799463546323, + "grad_norm": 0.302734375, + "learning_rate": 0.001589159612110468, + "loss": 0.1016, + "step": 31125 + }, + { + "epoch": 
0.2701886268348365, + "grad_norm": 0.73828125, + "learning_rate": 0.0015891346291698105, + "loss": 0.1367, + "step": 31126 + }, + { + "epoch": 0.27019730731504066, + "grad_norm": 0.43359375, + "learning_rate": 0.0015891096456942429, + "loss": 0.1074, + "step": 31127 + }, + { + "epoch": 0.27020598779524485, + "grad_norm": 0.2373046875, + "learning_rate": 0.0015890846616837921, + "loss": 0.1289, + "step": 31128 + }, + { + "epoch": 0.270214668275449, + "grad_norm": 0.2451171875, + "learning_rate": 0.001589059677138486, + "loss": 0.0986, + "step": 31129 + }, + { + "epoch": 0.2702233487556532, + "grad_norm": 0.2138671875, + "learning_rate": 0.0015890346920583516, + "loss": 0.0928, + "step": 31130 + }, + { + "epoch": 0.2702320292358573, + "grad_norm": 0.72265625, + "learning_rate": 0.0015890097064434159, + "loss": 0.1406, + "step": 31131 + }, + { + "epoch": 0.2702407097160615, + "grad_norm": 1.359375, + "learning_rate": 0.0015889847202937067, + "loss": 0.1914, + "step": 31132 + }, + { + "epoch": 0.27024939019626565, + "grad_norm": 0.400390625, + "learning_rate": 0.001588959733609251, + "loss": 0.0928, + "step": 31133 + }, + { + "epoch": 0.27025807067646984, + "grad_norm": 0.23828125, + "learning_rate": 0.0015889347463900767, + "loss": 0.1084, + "step": 31134 + }, + { + "epoch": 0.270266751156674, + "grad_norm": 0.515625, + "learning_rate": 0.0015889097586362106, + "loss": 0.0767, + "step": 31135 + }, + { + "epoch": 0.27027543163687817, + "grad_norm": 0.1259765625, + "learning_rate": 0.0015888847703476798, + "loss": 0.1182, + "step": 31136 + }, + { + "epoch": 0.2702841121170823, + "grad_norm": 0.263671875, + "learning_rate": 0.0015888597815245122, + "loss": 0.1309, + "step": 31137 + }, + { + "epoch": 0.2702927925972865, + "grad_norm": 0.53515625, + "learning_rate": 0.0015888347921667347, + "loss": 0.1172, + "step": 31138 + }, + { + "epoch": 0.27030147307749064, + "grad_norm": 0.1611328125, + "learning_rate": 0.001588809802274375, + "loss": 0.1221, + "step": 31139 + }, + { + "epoch": 0.27031015355769483, + "grad_norm": 0.337890625, + "learning_rate": 0.0015887848118474603, + "loss": 0.1357, + "step": 31140 + }, + { + "epoch": 0.27031883403789897, + "grad_norm": 0.353515625, + "learning_rate": 0.0015887598208860177, + "loss": 0.1152, + "step": 31141 + }, + { + "epoch": 0.27032751451810316, + "grad_norm": 0.734375, + "learning_rate": 0.001588734829390075, + "loss": 0.1318, + "step": 31142 + }, + { + "epoch": 0.2703361949983073, + "grad_norm": 0.296875, + "learning_rate": 0.001588709837359659, + "loss": 0.1377, + "step": 31143 + }, + { + "epoch": 0.2703448754785115, + "grad_norm": 0.1923828125, + "learning_rate": 0.0015886848447947974, + "loss": 0.1211, + "step": 31144 + }, + { + "epoch": 0.27035355595871563, + "grad_norm": 0.392578125, + "learning_rate": 0.0015886598516955174, + "loss": 0.0986, + "step": 31145 + }, + { + "epoch": 0.2703622364389198, + "grad_norm": 0.455078125, + "learning_rate": 0.001588634858061846, + "loss": 0.0781, + "step": 31146 + }, + { + "epoch": 0.27037091691912396, + "grad_norm": 0.5234375, + "learning_rate": 0.0015886098638938108, + "loss": 0.1631, + "step": 31147 + }, + { + "epoch": 0.27037959739932815, + "grad_norm": 0.296875, + "learning_rate": 0.0015885848691914398, + "loss": 0.104, + "step": 31148 + }, + { + "epoch": 0.2703882778795323, + "grad_norm": 0.59765625, + "learning_rate": 0.001588559873954759, + "loss": 0.1245, + "step": 31149 + }, + { + "epoch": 0.2703969583597365, + "grad_norm": 0.177734375, + "learning_rate": 0.0015885348781837971, + "loss": 0.1357, + 
"step": 31150 + }, + { + "epoch": 0.2704056388399406, + "grad_norm": 0.263671875, + "learning_rate": 0.0015885098818785807, + "loss": 0.124, + "step": 31151 + }, + { + "epoch": 0.2704143193201448, + "grad_norm": 0.357421875, + "learning_rate": 0.0015884848850391368, + "loss": 0.0957, + "step": 31152 + }, + { + "epoch": 0.27042299980034895, + "grad_norm": 0.2890625, + "learning_rate": 0.0015884598876654936, + "loss": 0.0801, + "step": 31153 + }, + { + "epoch": 0.27043168028055314, + "grad_norm": 0.298828125, + "learning_rate": 0.0015884348897576776, + "loss": 0.1064, + "step": 31154 + }, + { + "epoch": 0.2704403607607573, + "grad_norm": 0.6796875, + "learning_rate": 0.001588409891315717, + "loss": 0.0942, + "step": 31155 + }, + { + "epoch": 0.2704490412409615, + "grad_norm": 0.466796875, + "learning_rate": 0.0015883848923396386, + "loss": 0.1113, + "step": 31156 + }, + { + "epoch": 0.2704577217211656, + "grad_norm": 0.10400390625, + "learning_rate": 0.0015883598928294696, + "loss": 0.1436, + "step": 31157 + }, + { + "epoch": 0.2704664022013698, + "grad_norm": 0.546875, + "learning_rate": 0.0015883348927852375, + "loss": 0.085, + "step": 31158 + }, + { + "epoch": 0.27047508268157394, + "grad_norm": 0.298828125, + "learning_rate": 0.00158830989220697, + "loss": 0.1445, + "step": 31159 + }, + { + "epoch": 0.27048376316177813, + "grad_norm": 0.224609375, + "learning_rate": 0.001588284891094694, + "loss": 0.1108, + "step": 31160 + }, + { + "epoch": 0.27049244364198227, + "grad_norm": 0.2138671875, + "learning_rate": 0.001588259889448437, + "loss": 0.083, + "step": 31161 + }, + { + "epoch": 0.27050112412218646, + "grad_norm": 0.41015625, + "learning_rate": 0.0015882348872682263, + "loss": 0.1211, + "step": 31162 + }, + { + "epoch": 0.2705098046023906, + "grad_norm": 0.2109375, + "learning_rate": 0.0015882098845540894, + "loss": 0.084, + "step": 31163 + }, + { + "epoch": 0.2705184850825948, + "grad_norm": 0.2314453125, + "learning_rate": 0.0015881848813060534, + "loss": 0.0786, + "step": 31164 + }, + { + "epoch": 0.27052716556279893, + "grad_norm": 0.0830078125, + "learning_rate": 0.0015881598775241454, + "loss": 0.0791, + "step": 31165 + }, + { + "epoch": 0.2705358460430031, + "grad_norm": 0.1435546875, + "learning_rate": 0.0015881348732083936, + "loss": 0.0947, + "step": 31166 + }, + { + "epoch": 0.27054452652320726, + "grad_norm": 0.2294921875, + "learning_rate": 0.0015881098683588244, + "loss": 0.1025, + "step": 31167 + }, + { + "epoch": 0.27055320700341146, + "grad_norm": 0.287109375, + "learning_rate": 0.001588084862975466, + "loss": 0.1152, + "step": 31168 + }, + { + "epoch": 0.2705618874836156, + "grad_norm": 0.376953125, + "learning_rate": 0.0015880598570583453, + "loss": 0.0679, + "step": 31169 + }, + { + "epoch": 0.2705705679638198, + "grad_norm": 0.54296875, + "learning_rate": 0.0015880348506074895, + "loss": 0.1113, + "step": 31170 + }, + { + "epoch": 0.2705792484440239, + "grad_norm": 0.326171875, + "learning_rate": 0.001588009843622926, + "loss": 0.1143, + "step": 31171 + }, + { + "epoch": 0.2705879289242281, + "grad_norm": 0.154296875, + "learning_rate": 0.0015879848361046824, + "loss": 0.1016, + "step": 31172 + }, + { + "epoch": 0.27059660940443225, + "grad_norm": 0.068359375, + "learning_rate": 0.001587959828052786, + "loss": 0.0737, + "step": 31173 + }, + { + "epoch": 0.2706052898846364, + "grad_norm": 0.162109375, + "learning_rate": 0.001587934819467264, + "loss": 0.0913, + "step": 31174 + }, + { + "epoch": 0.2706139703648406, + "grad_norm": 0.12109375, + "learning_rate": 
0.0015879098103481438, + "loss": 0.0923, + "step": 31175 + }, + { + "epoch": 0.2706226508450447, + "grad_norm": 0.1650390625, + "learning_rate": 0.0015878848006954528, + "loss": 0.1309, + "step": 31176 + }, + { + "epoch": 0.2706313313252489, + "grad_norm": 0.1025390625, + "learning_rate": 0.0015878597905092183, + "loss": 0.0806, + "step": 31177 + }, + { + "epoch": 0.27064001180545305, + "grad_norm": 0.126953125, + "learning_rate": 0.0015878347797894676, + "loss": 0.1143, + "step": 31178 + }, + { + "epoch": 0.27064869228565724, + "grad_norm": 0.306640625, + "learning_rate": 0.0015878097685362283, + "loss": 0.125, + "step": 31179 + }, + { + "epoch": 0.2706573727658614, + "grad_norm": 0.12060546875, + "learning_rate": 0.0015877847567495273, + "loss": 0.0967, + "step": 31180 + }, + { + "epoch": 0.2706660532460656, + "grad_norm": 0.16796875, + "learning_rate": 0.0015877597444293923, + "loss": 0.1138, + "step": 31181 + }, + { + "epoch": 0.2706747337262697, + "grad_norm": 0.08349609375, + "learning_rate": 0.0015877347315758508, + "loss": 0.0967, + "step": 31182 + }, + { + "epoch": 0.2706834142064739, + "grad_norm": 0.1591796875, + "learning_rate": 0.0015877097181889294, + "loss": 0.0923, + "step": 31183 + }, + { + "epoch": 0.27069209468667804, + "grad_norm": 0.9140625, + "learning_rate": 0.0015876847042686564, + "loss": 0.1338, + "step": 31184 + }, + { + "epoch": 0.27070077516688223, + "grad_norm": 0.38671875, + "learning_rate": 0.0015876596898150588, + "loss": 0.0884, + "step": 31185 + }, + { + "epoch": 0.27070945564708637, + "grad_norm": 0.484375, + "learning_rate": 0.0015876346748281637, + "loss": 0.1396, + "step": 31186 + }, + { + "epoch": 0.27071813612729057, + "grad_norm": 0.10400390625, + "learning_rate": 0.0015876096593079986, + "loss": 0.1416, + "step": 31187 + }, + { + "epoch": 0.2707268166074947, + "grad_norm": 0.6484375, + "learning_rate": 0.001587584643254591, + "loss": 0.0938, + "step": 31188 + }, + { + "epoch": 0.2707354970876989, + "grad_norm": 0.404296875, + "learning_rate": 0.001587559626667968, + "loss": 0.0918, + "step": 31189 + }, + { + "epoch": 0.27074417756790303, + "grad_norm": 0.515625, + "learning_rate": 0.0015875346095481579, + "loss": 0.1011, + "step": 31190 + }, + { + "epoch": 0.2707528580481072, + "grad_norm": 0.31640625, + "learning_rate": 0.0015875095918951865, + "loss": 0.0913, + "step": 31191 + }, + { + "epoch": 0.27076153852831136, + "grad_norm": 0.54296875, + "learning_rate": 0.0015874845737090817, + "loss": 0.0811, + "step": 31192 + }, + { + "epoch": 0.27077021900851556, + "grad_norm": 0.33984375, + "learning_rate": 0.0015874595549898716, + "loss": 0.0884, + "step": 31193 + }, + { + "epoch": 0.2707788994887197, + "grad_norm": 0.51171875, + "learning_rate": 0.001587434535737583, + "loss": 0.1309, + "step": 31194 + }, + { + "epoch": 0.2707875799689239, + "grad_norm": 0.439453125, + "learning_rate": 0.0015874095159522433, + "loss": 0.0781, + "step": 31195 + }, + { + "epoch": 0.270796260449128, + "grad_norm": 0.4375, + "learning_rate": 0.00158738449563388, + "loss": 0.1436, + "step": 31196 + }, + { + "epoch": 0.2708049409293322, + "grad_norm": 0.400390625, + "learning_rate": 0.00158735947478252, + "loss": 0.1143, + "step": 31197 + }, + { + "epoch": 0.27081362140953635, + "grad_norm": 0.470703125, + "learning_rate": 0.001587334453398191, + "loss": 0.1289, + "step": 31198 + }, + { + "epoch": 0.27082230188974055, + "grad_norm": 0.21484375, + "learning_rate": 0.001587309431480921, + "loss": 0.1055, + "step": 31199 + }, + { + "epoch": 0.2708309823699447, + "grad_norm": 
0.79296875, + "learning_rate": 0.001587284409030736, + "loss": 0.1211, + "step": 31200 + }, + { + "epoch": 0.2708396628501489, + "grad_norm": 0.345703125, + "learning_rate": 0.001587259386047664, + "loss": 0.1035, + "step": 31201 + }, + { + "epoch": 0.270848343330353, + "grad_norm": 0.1328125, + "learning_rate": 0.001587234362531733, + "loss": 0.123, + "step": 31202 + }, + { + "epoch": 0.2708570238105572, + "grad_norm": 0.298828125, + "learning_rate": 0.0015872093384829695, + "loss": 0.1025, + "step": 31203 + }, + { + "epoch": 0.27086570429076134, + "grad_norm": 0.12890625, + "learning_rate": 0.0015871843139014016, + "loss": 0.0894, + "step": 31204 + }, + { + "epoch": 0.27087438477096554, + "grad_norm": 0.9140625, + "learning_rate": 0.0015871592887870558, + "loss": 0.1709, + "step": 31205 + }, + { + "epoch": 0.2708830652511697, + "grad_norm": 0.333984375, + "learning_rate": 0.0015871342631399602, + "loss": 0.084, + "step": 31206 + }, + { + "epoch": 0.27089174573137387, + "grad_norm": 0.115234375, + "learning_rate": 0.0015871092369601416, + "loss": 0.083, + "step": 31207 + }, + { + "epoch": 0.270900426211578, + "grad_norm": 0.1591796875, + "learning_rate": 0.0015870842102476282, + "loss": 0.0977, + "step": 31208 + }, + { + "epoch": 0.2709091066917822, + "grad_norm": 0.1005859375, + "learning_rate": 0.0015870591830024461, + "loss": 0.0996, + "step": 31209 + }, + { + "epoch": 0.27091778717198634, + "grad_norm": 0.5703125, + "learning_rate": 0.0015870341552246237, + "loss": 0.0957, + "step": 31210 + }, + { + "epoch": 0.27092646765219053, + "grad_norm": 0.228515625, + "learning_rate": 0.0015870091269141883, + "loss": 0.1445, + "step": 31211 + }, + { + "epoch": 0.27093514813239467, + "grad_norm": 0.126953125, + "learning_rate": 0.001586984098071167, + "loss": 0.1201, + "step": 31212 + }, + { + "epoch": 0.27094382861259886, + "grad_norm": 0.57421875, + "learning_rate": 0.0015869590686955872, + "loss": 0.1221, + "step": 31213 + }, + { + "epoch": 0.270952509092803, + "grad_norm": 0.1171875, + "learning_rate": 0.0015869340387874757, + "loss": 0.1064, + "step": 31214 + }, + { + "epoch": 0.2709611895730072, + "grad_norm": 0.19140625, + "learning_rate": 0.0015869090083468608, + "loss": 0.1074, + "step": 31215 + }, + { + "epoch": 0.2709698700532113, + "grad_norm": 0.232421875, + "learning_rate": 0.0015868839773737697, + "loss": 0.1104, + "step": 31216 + }, + { + "epoch": 0.2709785505334155, + "grad_norm": 0.177734375, + "learning_rate": 0.00158685894586823, + "loss": 0.1123, + "step": 31217 + }, + { + "epoch": 0.27098723101361966, + "grad_norm": 0.388671875, + "learning_rate": 0.001586833913830268, + "loss": 0.1099, + "step": 31218 + }, + { + "epoch": 0.27099591149382385, + "grad_norm": 0.09716796875, + "learning_rate": 0.001586808881259912, + "loss": 0.1074, + "step": 31219 + }, + { + "epoch": 0.271004591974028, + "grad_norm": 0.1787109375, + "learning_rate": 0.0015867838481571888, + "loss": 0.1025, + "step": 31220 + }, + { + "epoch": 0.2710132724542322, + "grad_norm": 0.404296875, + "learning_rate": 0.0015867588145221264, + "loss": 0.1426, + "step": 31221 + }, + { + "epoch": 0.2710219529344363, + "grad_norm": 0.57421875, + "learning_rate": 0.001586733780354752, + "loss": 0.1182, + "step": 31222 + }, + { + "epoch": 0.2710306334146405, + "grad_norm": 0.1513671875, + "learning_rate": 0.0015867087456550926, + "loss": 0.0845, + "step": 31223 + }, + { + "epoch": 0.27103931389484465, + "grad_norm": 0.69921875, + "learning_rate": 0.0015866837104231758, + "loss": 0.0977, + "step": 31224 + }, + { + "epoch": 
0.27104799437504884, + "grad_norm": 0.08984375, + "learning_rate": 0.0015866586746590293, + "loss": 0.1094, + "step": 31225 + }, + { + "epoch": 0.271056674855253, + "grad_norm": 0.1943359375, + "learning_rate": 0.0015866336383626798, + "loss": 0.0869, + "step": 31226 + }, + { + "epoch": 0.27106535533545717, + "grad_norm": 0.12353515625, + "learning_rate": 0.0015866086015341554, + "loss": 0.1035, + "step": 31227 + }, + { + "epoch": 0.2710740358156613, + "grad_norm": 0.62890625, + "learning_rate": 0.001586583564173483, + "loss": 0.1089, + "step": 31228 + }, + { + "epoch": 0.2710827162958655, + "grad_norm": 0.1201171875, + "learning_rate": 0.00158655852628069, + "loss": 0.0913, + "step": 31229 + }, + { + "epoch": 0.27109139677606964, + "grad_norm": 0.36328125, + "learning_rate": 0.0015865334878558041, + "loss": 0.1152, + "step": 31230 + }, + { + "epoch": 0.27110007725627383, + "grad_norm": 0.08447265625, + "learning_rate": 0.0015865084488988525, + "loss": 0.0903, + "step": 31231 + }, + { + "epoch": 0.27110875773647797, + "grad_norm": 0.2333984375, + "learning_rate": 0.0015864834094098624, + "loss": 0.1201, + "step": 31232 + }, + { + "epoch": 0.27111743821668216, + "grad_norm": 0.1328125, + "learning_rate": 0.0015864583693888615, + "loss": 0.0771, + "step": 31233 + }, + { + "epoch": 0.2711261186968863, + "grad_norm": 0.43359375, + "learning_rate": 0.001586433328835877, + "loss": 0.1543, + "step": 31234 + }, + { + "epoch": 0.2711347991770905, + "grad_norm": 0.80859375, + "learning_rate": 0.0015864082877509364, + "loss": 0.1133, + "step": 31235 + }, + { + "epoch": 0.27114347965729463, + "grad_norm": 0.58984375, + "learning_rate": 0.0015863832461340667, + "loss": 0.1094, + "step": 31236 + }, + { + "epoch": 0.2711521601374988, + "grad_norm": 0.08447265625, + "learning_rate": 0.001586358203985296, + "loss": 0.0835, + "step": 31237 + }, + { + "epoch": 0.27116084061770296, + "grad_norm": 0.10546875, + "learning_rate": 0.001586333161304651, + "loss": 0.1187, + "step": 31238 + }, + { + "epoch": 0.27116952109790715, + "grad_norm": 0.703125, + "learning_rate": 0.0015863081180921596, + "loss": 0.1562, + "step": 31239 + }, + { + "epoch": 0.2711782015781113, + "grad_norm": 0.30078125, + "learning_rate": 0.0015862830743478486, + "loss": 0.1025, + "step": 31240 + }, + { + "epoch": 0.2711868820583155, + "grad_norm": 0.421875, + "learning_rate": 0.001586258030071746, + "loss": 0.1143, + "step": 31241 + }, + { + "epoch": 0.2711955625385196, + "grad_norm": 0.296875, + "learning_rate": 0.0015862329852638789, + "loss": 0.0869, + "step": 31242 + }, + { + "epoch": 0.2712042430187238, + "grad_norm": 1.0859375, + "learning_rate": 0.0015862079399242745, + "loss": 0.125, + "step": 31243 + }, + { + "epoch": 0.27121292349892795, + "grad_norm": 0.1865234375, + "learning_rate": 0.0015861828940529608, + "loss": 0.1719, + "step": 31244 + }, + { + "epoch": 0.27122160397913214, + "grad_norm": 0.125, + "learning_rate": 0.0015861578476499645, + "loss": 0.0957, + "step": 31245 + }, + { + "epoch": 0.2712302844593363, + "grad_norm": 0.2197265625, + "learning_rate": 0.0015861328007153131, + "loss": 0.1162, + "step": 31246 + }, + { + "epoch": 0.2712389649395405, + "grad_norm": 0.126953125, + "learning_rate": 0.0015861077532490348, + "loss": 0.1216, + "step": 31247 + }, + { + "epoch": 0.2712476454197446, + "grad_norm": 0.173828125, + "learning_rate": 0.001586082705251156, + "loss": 0.1328, + "step": 31248 + }, + { + "epoch": 0.2712563258999488, + "grad_norm": 0.140625, + "learning_rate": 0.0015860576567217042, + "loss": 0.1348, + 
"step": 31249 + }, + { + "epoch": 0.27126500638015294, + "grad_norm": 0.23046875, + "learning_rate": 0.0015860326076607071, + "loss": 0.1289, + "step": 31250 + }, + { + "epoch": 0.27127368686035713, + "grad_norm": 0.265625, + "learning_rate": 0.0015860075580681925, + "loss": 0.0884, + "step": 31251 + }, + { + "epoch": 0.27128236734056127, + "grad_norm": 0.55859375, + "learning_rate": 0.001585982507944187, + "loss": 0.0898, + "step": 31252 + }, + { + "epoch": 0.27129104782076546, + "grad_norm": 0.1845703125, + "learning_rate": 0.0015859574572887182, + "loss": 0.1514, + "step": 31253 + }, + { + "epoch": 0.2712997283009696, + "grad_norm": 0.1640625, + "learning_rate": 0.001585932406101814, + "loss": 0.1011, + "step": 31254 + }, + { + "epoch": 0.2713084087811738, + "grad_norm": 0.384765625, + "learning_rate": 0.0015859073543835012, + "loss": 0.1201, + "step": 31255 + }, + { + "epoch": 0.27131708926137793, + "grad_norm": 0.1005859375, + "learning_rate": 0.0015858823021338076, + "loss": 0.0908, + "step": 31256 + }, + { + "epoch": 0.2713257697415821, + "grad_norm": 0.314453125, + "learning_rate": 0.0015858572493527598, + "loss": 0.1328, + "step": 31257 + }, + { + "epoch": 0.27133445022178626, + "grad_norm": 0.3203125, + "learning_rate": 0.0015858321960403865, + "loss": 0.0874, + "step": 31258 + }, + { + "epoch": 0.27134313070199045, + "grad_norm": 0.123046875, + "learning_rate": 0.0015858071421967143, + "loss": 0.0898, + "step": 31259 + }, + { + "epoch": 0.2713518111821946, + "grad_norm": 0.5546875, + "learning_rate": 0.001585782087821771, + "loss": 0.1387, + "step": 31260 + }, + { + "epoch": 0.2713604916623988, + "grad_norm": 0.388671875, + "learning_rate": 0.001585757032915583, + "loss": 0.125, + "step": 31261 + }, + { + "epoch": 0.2713691721426029, + "grad_norm": 0.138671875, + "learning_rate": 0.0015857319774781786, + "loss": 0.0918, + "step": 31262 + }, + { + "epoch": 0.2713778526228071, + "grad_norm": 0.47265625, + "learning_rate": 0.001585706921509585, + "loss": 0.1089, + "step": 31263 + }, + { + "epoch": 0.27138653310301125, + "grad_norm": 0.37109375, + "learning_rate": 0.00158568186500983, + "loss": 0.0928, + "step": 31264 + }, + { + "epoch": 0.27139521358321544, + "grad_norm": 0.38671875, + "learning_rate": 0.0015856568079789404, + "loss": 0.1338, + "step": 31265 + }, + { + "epoch": 0.2714038940634196, + "grad_norm": 0.1435546875, + "learning_rate": 0.0015856317504169435, + "loss": 0.0918, + "step": 31266 + }, + { + "epoch": 0.2714125745436238, + "grad_norm": 0.49609375, + "learning_rate": 0.0015856066923238673, + "loss": 0.1074, + "step": 31267 + }, + { + "epoch": 0.2714212550238279, + "grad_norm": 0.177734375, + "learning_rate": 0.0015855816336997394, + "loss": 0.1328, + "step": 31268 + }, + { + "epoch": 0.2714299355040321, + "grad_norm": 0.2041015625, + "learning_rate": 0.0015855565745445859, + "loss": 0.1074, + "step": 31269 + }, + { + "epoch": 0.27143861598423624, + "grad_norm": 0.9453125, + "learning_rate": 0.0015855315148584352, + "loss": 0.1016, + "step": 31270 + }, + { + "epoch": 0.27144729646444044, + "grad_norm": 0.109375, + "learning_rate": 0.0015855064546413148, + "loss": 0.0928, + "step": 31271 + }, + { + "epoch": 0.2714559769446446, + "grad_norm": 0.34765625, + "learning_rate": 0.001585481393893252, + "loss": 0.0771, + "step": 31272 + }, + { + "epoch": 0.27146465742484877, + "grad_norm": 0.10009765625, + "learning_rate": 0.0015854563326142734, + "loss": 0.084, + "step": 31273 + }, + { + "epoch": 0.2714733379050529, + "grad_norm": 0.3359375, + "learning_rate": 
0.0015854312708044073, + "loss": 0.1162, + "step": 31274 + }, + { + "epoch": 0.2714820183852571, + "grad_norm": 3.546875, + "learning_rate": 0.001585406208463681, + "loss": 0.2812, + "step": 31275 + }, + { + "epoch": 0.27149069886546123, + "grad_norm": 0.267578125, + "learning_rate": 0.0015853811455921215, + "loss": 0.1309, + "step": 31276 + }, + { + "epoch": 0.2714993793456654, + "grad_norm": 0.310546875, + "learning_rate": 0.0015853560821897567, + "loss": 0.0908, + "step": 31277 + }, + { + "epoch": 0.27150805982586956, + "grad_norm": 0.119140625, + "learning_rate": 0.001585331018256614, + "loss": 0.0898, + "step": 31278 + }, + { + "epoch": 0.27151674030607376, + "grad_norm": 0.546875, + "learning_rate": 0.0015853059537927203, + "loss": 0.0938, + "step": 31279 + }, + { + "epoch": 0.2715254207862779, + "grad_norm": 0.734375, + "learning_rate": 0.0015852808887981034, + "loss": 0.1094, + "step": 31280 + }, + { + "epoch": 0.2715341012664821, + "grad_norm": 0.8046875, + "learning_rate": 0.0015852558232727907, + "loss": 0.0986, + "step": 31281 + }, + { + "epoch": 0.2715427817466862, + "grad_norm": 0.09912109375, + "learning_rate": 0.0015852307572168094, + "loss": 0.1152, + "step": 31282 + }, + { + "epoch": 0.2715514622268904, + "grad_norm": 0.162109375, + "learning_rate": 0.001585205690630187, + "loss": 0.1084, + "step": 31283 + }, + { + "epoch": 0.27156014270709455, + "grad_norm": 0.33203125, + "learning_rate": 0.001585180623512951, + "loss": 0.1211, + "step": 31284 + }, + { + "epoch": 0.27156882318729875, + "grad_norm": 0.09033203125, + "learning_rate": 0.0015851555558651287, + "loss": 0.0845, + "step": 31285 + }, + { + "epoch": 0.2715775036675029, + "grad_norm": 0.5390625, + "learning_rate": 0.0015851304876867477, + "loss": 0.0996, + "step": 31286 + }, + { + "epoch": 0.2715861841477071, + "grad_norm": 0.22265625, + "learning_rate": 0.0015851054189778351, + "loss": 0.0811, + "step": 31287 + }, + { + "epoch": 0.2715948646279112, + "grad_norm": 0.451171875, + "learning_rate": 0.0015850803497384188, + "loss": 0.1123, + "step": 31288 + }, + { + "epoch": 0.2716035451081154, + "grad_norm": 0.71875, + "learning_rate": 0.0015850552799685253, + "loss": 0.1006, + "step": 31289 + }, + { + "epoch": 0.27161222558831954, + "grad_norm": 0.71484375, + "learning_rate": 0.0015850302096681833, + "loss": 0.1147, + "step": 31290 + }, + { + "epoch": 0.27162090606852374, + "grad_norm": 0.134765625, + "learning_rate": 0.0015850051388374194, + "loss": 0.0947, + "step": 31291 + }, + { + "epoch": 0.2716295865487279, + "grad_norm": 0.1259765625, + "learning_rate": 0.0015849800674762606, + "loss": 0.0938, + "step": 31292 + }, + { + "epoch": 0.27163826702893207, + "grad_norm": 0.291015625, + "learning_rate": 0.0015849549955847355, + "loss": 0.1172, + "step": 31293 + }, + { + "epoch": 0.2716469475091362, + "grad_norm": 0.140625, + "learning_rate": 0.001584929923162871, + "loss": 0.0972, + "step": 31294 + }, + { + "epoch": 0.2716556279893404, + "grad_norm": 0.60546875, + "learning_rate": 0.0015849048502106942, + "loss": 0.1309, + "step": 31295 + }, + { + "epoch": 0.27166430846954454, + "grad_norm": 0.287109375, + "learning_rate": 0.0015848797767282323, + "loss": 0.124, + "step": 31296 + }, + { + "epoch": 0.2716729889497487, + "grad_norm": 0.1142578125, + "learning_rate": 0.001584854702715514, + "loss": 0.1104, + "step": 31297 + }, + { + "epoch": 0.27168166942995287, + "grad_norm": 0.1083984375, + "learning_rate": 0.0015848296281725655, + "loss": 0.0796, + "step": 31298 + }, + { + "epoch": 0.271690349910157, + "grad_norm": 
0.2578125, + "learning_rate": 0.0015848045530994143, + "loss": 0.1104, + "step": 31299 + }, + { + "epoch": 0.2716990303903612, + "grad_norm": 0.1103515625, + "learning_rate": 0.0015847794774960884, + "loss": 0.126, + "step": 31300 + }, + { + "epoch": 0.27170771087056533, + "grad_norm": 0.279296875, + "learning_rate": 0.0015847544013626149, + "loss": 0.0957, + "step": 31301 + }, + { + "epoch": 0.2717163913507695, + "grad_norm": 0.66796875, + "learning_rate": 0.0015847293246990214, + "loss": 0.1084, + "step": 31302 + }, + { + "epoch": 0.27172507183097366, + "grad_norm": 0.30078125, + "learning_rate": 0.001584704247505335, + "loss": 0.1182, + "step": 31303 + }, + { + "epoch": 0.27173375231117786, + "grad_norm": 0.365234375, + "learning_rate": 0.0015846791697815836, + "loss": 0.0781, + "step": 31304 + }, + { + "epoch": 0.271742432791382, + "grad_norm": 0.1416015625, + "learning_rate": 0.001584654091527794, + "loss": 0.0874, + "step": 31305 + }, + { + "epoch": 0.2717511132715862, + "grad_norm": 0.9375, + "learning_rate": 0.0015846290127439942, + "loss": 0.1367, + "step": 31306 + }, + { + "epoch": 0.2717597937517903, + "grad_norm": 0.1728515625, + "learning_rate": 0.0015846039334302111, + "loss": 0.0781, + "step": 31307 + }, + { + "epoch": 0.2717684742319945, + "grad_norm": 0.263671875, + "learning_rate": 0.001584578853586473, + "loss": 0.0981, + "step": 31308 + }, + { + "epoch": 0.27177715471219865, + "grad_norm": 0.68359375, + "learning_rate": 0.0015845537732128063, + "loss": 0.0986, + "step": 31309 + }, + { + "epoch": 0.27178583519240285, + "grad_norm": 0.125, + "learning_rate": 0.001584528692309239, + "loss": 0.0908, + "step": 31310 + }, + { + "epoch": 0.271794515672607, + "grad_norm": 0.1279296875, + "learning_rate": 0.0015845036108757984, + "loss": 0.1445, + "step": 31311 + }, + { + "epoch": 0.2718031961528112, + "grad_norm": 0.435546875, + "learning_rate": 0.001584478528912512, + "loss": 0.1064, + "step": 31312 + }, + { + "epoch": 0.2718118766330153, + "grad_norm": 0.384765625, + "learning_rate": 0.0015844534464194069, + "loss": 0.0752, + "step": 31313 + }, + { + "epoch": 0.2718205571132195, + "grad_norm": 0.189453125, + "learning_rate": 0.001584428363396511, + "loss": 0.126, + "step": 31314 + }, + { + "epoch": 0.27182923759342364, + "grad_norm": 0.396484375, + "learning_rate": 0.0015844032798438513, + "loss": 0.1099, + "step": 31315 + }, + { + "epoch": 0.27183791807362784, + "grad_norm": 0.146484375, + "learning_rate": 0.0015843781957614557, + "loss": 0.1289, + "step": 31316 + }, + { + "epoch": 0.271846598553832, + "grad_norm": 0.59375, + "learning_rate": 0.0015843531111493514, + "loss": 0.1074, + "step": 31317 + }, + { + "epoch": 0.27185527903403617, + "grad_norm": 0.09716796875, + "learning_rate": 0.0015843280260075655, + "loss": 0.084, + "step": 31318 + }, + { + "epoch": 0.2718639595142403, + "grad_norm": 0.392578125, + "learning_rate": 0.001584302940336126, + "loss": 0.1011, + "step": 31319 + }, + { + "epoch": 0.2718726399944445, + "grad_norm": 0.48828125, + "learning_rate": 0.0015842778541350602, + "loss": 0.1006, + "step": 31320 + }, + { + "epoch": 0.27188132047464864, + "grad_norm": 0.1591796875, + "learning_rate": 0.0015842527674043952, + "loss": 0.0981, + "step": 31321 + }, + { + "epoch": 0.27189000095485283, + "grad_norm": 0.349609375, + "learning_rate": 0.0015842276801441586, + "loss": 0.1279, + "step": 31322 + }, + { + "epoch": 0.27189868143505697, + "grad_norm": 0.205078125, + "learning_rate": 0.0015842025923543778, + "loss": 0.1143, + "step": 31323 + }, + { + "epoch": 
0.27190736191526116, + "grad_norm": 0.0908203125, + "learning_rate": 0.0015841775040350806, + "loss": 0.1289, + "step": 31324 + }, + { + "epoch": 0.2719160423954653, + "grad_norm": 0.10009765625, + "learning_rate": 0.0015841524151862941, + "loss": 0.0933, + "step": 31325 + }, + { + "epoch": 0.2719247228756695, + "grad_norm": 0.45703125, + "learning_rate": 0.001584127325808046, + "loss": 0.0918, + "step": 31326 + }, + { + "epoch": 0.2719334033558736, + "grad_norm": 0.083984375, + "learning_rate": 0.0015841022359003633, + "loss": 0.0967, + "step": 31327 + }, + { + "epoch": 0.2719420838360778, + "grad_norm": 0.19140625, + "learning_rate": 0.0015840771454632737, + "loss": 0.1152, + "step": 31328 + }, + { + "epoch": 0.27195076431628196, + "grad_norm": 0.7265625, + "learning_rate": 0.0015840520544968045, + "loss": 0.1406, + "step": 31329 + }, + { + "epoch": 0.27195944479648615, + "grad_norm": 0.224609375, + "learning_rate": 0.0015840269630009833, + "loss": 0.0908, + "step": 31330 + }, + { + "epoch": 0.2719681252766903, + "grad_norm": 0.1787109375, + "learning_rate": 0.0015840018709758375, + "loss": 0.0747, + "step": 31331 + }, + { + "epoch": 0.2719768057568945, + "grad_norm": 0.1611328125, + "learning_rate": 0.0015839767784213944, + "loss": 0.1338, + "step": 31332 + }, + { + "epoch": 0.2719854862370986, + "grad_norm": 0.2236328125, + "learning_rate": 0.0015839516853376817, + "loss": 0.0967, + "step": 31333 + }, + { + "epoch": 0.2719941667173028, + "grad_norm": 0.47265625, + "learning_rate": 0.0015839265917247268, + "loss": 0.1914, + "step": 31334 + }, + { + "epoch": 0.27200284719750695, + "grad_norm": 0.318359375, + "learning_rate": 0.0015839014975825567, + "loss": 0.1445, + "step": 31335 + }, + { + "epoch": 0.27201152767771114, + "grad_norm": 0.11376953125, + "learning_rate": 0.0015838764029111995, + "loss": 0.0801, + "step": 31336 + }, + { + "epoch": 0.2720202081579153, + "grad_norm": 0.083984375, + "learning_rate": 0.0015838513077106825, + "loss": 0.1016, + "step": 31337 + }, + { + "epoch": 0.27202888863811947, + "grad_norm": 0.302734375, + "learning_rate": 0.0015838262119810328, + "loss": 0.1279, + "step": 31338 + }, + { + "epoch": 0.2720375691183236, + "grad_norm": 3.75, + "learning_rate": 0.0015838011157222778, + "loss": 0.2578, + "step": 31339 + }, + { + "epoch": 0.2720462495985278, + "grad_norm": 0.1240234375, + "learning_rate": 0.0015837760189344454, + "loss": 0.0693, + "step": 31340 + }, + { + "epoch": 0.27205493007873194, + "grad_norm": 0.08056640625, + "learning_rate": 0.0015837509216175627, + "loss": 0.1123, + "step": 31341 + }, + { + "epoch": 0.27206361055893613, + "grad_norm": 0.08203125, + "learning_rate": 0.0015837258237716572, + "loss": 0.1211, + "step": 31342 + }, + { + "epoch": 0.27207229103914027, + "grad_norm": 0.353515625, + "learning_rate": 0.0015837007253967567, + "loss": 0.0967, + "step": 31343 + }, + { + "epoch": 0.27208097151934446, + "grad_norm": 0.177734375, + "learning_rate": 0.001583675626492888, + "loss": 0.1074, + "step": 31344 + }, + { + "epoch": 0.2720896519995486, + "grad_norm": 0.39453125, + "learning_rate": 0.0015836505270600792, + "loss": 0.1016, + "step": 31345 + }, + { + "epoch": 0.2720983324797528, + "grad_norm": 0.2138671875, + "learning_rate": 0.0015836254270983577, + "loss": 0.0918, + "step": 31346 + }, + { + "epoch": 0.27210701295995693, + "grad_norm": 0.25390625, + "learning_rate": 0.0015836003266077499, + "loss": 0.1221, + "step": 31347 + }, + { + "epoch": 0.2721156934401611, + "grad_norm": 0.19140625, + "learning_rate": 0.0015835752255882846, + 
"loss": 0.1123, + "step": 31348 + }, + { + "epoch": 0.27212437392036526, + "grad_norm": 0.2275390625, + "learning_rate": 0.0015835501240399888, + "loss": 0.0996, + "step": 31349 + }, + { + "epoch": 0.27213305440056945, + "grad_norm": 0.11865234375, + "learning_rate": 0.0015835250219628895, + "loss": 0.1045, + "step": 31350 + }, + { + "epoch": 0.2721417348807736, + "grad_norm": 0.275390625, + "learning_rate": 0.0015834999193570145, + "loss": 0.1709, + "step": 31351 + }, + { + "epoch": 0.2721504153609778, + "grad_norm": 0.9453125, + "learning_rate": 0.0015834748162223914, + "loss": 0.1147, + "step": 31352 + }, + { + "epoch": 0.2721590958411819, + "grad_norm": 0.158203125, + "learning_rate": 0.0015834497125590474, + "loss": 0.1099, + "step": 31353 + }, + { + "epoch": 0.2721677763213861, + "grad_norm": 3.140625, + "learning_rate": 0.0015834246083670104, + "loss": 0.3105, + "step": 31354 + }, + { + "epoch": 0.27217645680159025, + "grad_norm": 0.216796875, + "learning_rate": 0.0015833995036463073, + "loss": 0.0859, + "step": 31355 + }, + { + "epoch": 0.27218513728179444, + "grad_norm": 0.26171875, + "learning_rate": 0.0015833743983969658, + "loss": 0.1143, + "step": 31356 + }, + { + "epoch": 0.2721938177619986, + "grad_norm": 0.59375, + "learning_rate": 0.001583349292619013, + "loss": 0.0884, + "step": 31357 + }, + { + "epoch": 0.2722024982422028, + "grad_norm": 0.06982421875, + "learning_rate": 0.0015833241863124773, + "loss": 0.0845, + "step": 31358 + }, + { + "epoch": 0.2722111787224069, + "grad_norm": 0.353515625, + "learning_rate": 0.001583299079477385, + "loss": 0.083, + "step": 31359 + }, + { + "epoch": 0.2722198592026111, + "grad_norm": 0.2021484375, + "learning_rate": 0.0015832739721137641, + "loss": 0.0825, + "step": 31360 + }, + { + "epoch": 0.27222853968281524, + "grad_norm": 0.30078125, + "learning_rate": 0.0015832488642216424, + "loss": 0.0918, + "step": 31361 + }, + { + "epoch": 0.27223722016301943, + "grad_norm": 0.2060546875, + "learning_rate": 0.0015832237558010465, + "loss": 0.1406, + "step": 31362 + }, + { + "epoch": 0.27224590064322357, + "grad_norm": 0.357421875, + "learning_rate": 0.0015831986468520048, + "loss": 0.1221, + "step": 31363 + }, + { + "epoch": 0.27225458112342776, + "grad_norm": 0.2734375, + "learning_rate": 0.0015831735373745442, + "loss": 0.0869, + "step": 31364 + }, + { + "epoch": 0.2722632616036319, + "grad_norm": 0.546875, + "learning_rate": 0.0015831484273686922, + "loss": 0.0942, + "step": 31365 + }, + { + "epoch": 0.2722719420838361, + "grad_norm": 0.1337890625, + "learning_rate": 0.0015831233168344763, + "loss": 0.0962, + "step": 31366 + }, + { + "epoch": 0.27228062256404023, + "grad_norm": 0.2158203125, + "learning_rate": 0.001583098205771924, + "loss": 0.1367, + "step": 31367 + }, + { + "epoch": 0.2722893030442444, + "grad_norm": 0.546875, + "learning_rate": 0.0015830730941810629, + "loss": 0.0864, + "step": 31368 + }, + { + "epoch": 0.27229798352444856, + "grad_norm": 0.09326171875, + "learning_rate": 0.0015830479820619205, + "loss": 0.0781, + "step": 31369 + }, + { + "epoch": 0.27230666400465275, + "grad_norm": 0.359375, + "learning_rate": 0.0015830228694145236, + "loss": 0.1348, + "step": 31370 + }, + { + "epoch": 0.2723153444848569, + "grad_norm": 0.51171875, + "learning_rate": 0.0015829977562389003, + "loss": 0.1201, + "step": 31371 + }, + { + "epoch": 0.2723240249650611, + "grad_norm": 0.59765625, + "learning_rate": 0.001582972642535078, + "loss": 0.0947, + "step": 31372 + }, + { + "epoch": 0.2723327054452652, + "grad_norm": 0.259765625, + 
"learning_rate": 0.0015829475283030846, + "loss": 0.0737, + "step": 31373 + }, + { + "epoch": 0.2723413859254694, + "grad_norm": 0.419921875, + "learning_rate": 0.001582922413542946, + "loss": 0.1191, + "step": 31374 + }, + { + "epoch": 0.27235006640567355, + "grad_norm": 0.1357421875, + "learning_rate": 0.0015828972982546913, + "loss": 0.1182, + "step": 31375 + }, + { + "epoch": 0.27235874688587774, + "grad_norm": 0.0595703125, + "learning_rate": 0.0015828721824383476, + "loss": 0.0615, + "step": 31376 + }, + { + "epoch": 0.2723674273660819, + "grad_norm": 0.1064453125, + "learning_rate": 0.0015828470660939418, + "loss": 0.1338, + "step": 31377 + }, + { + "epoch": 0.2723761078462861, + "grad_norm": 0.1669921875, + "learning_rate": 0.0015828219492215018, + "loss": 0.0908, + "step": 31378 + }, + { + "epoch": 0.2723847883264902, + "grad_norm": 0.5078125, + "learning_rate": 0.0015827968318210548, + "loss": 0.1396, + "step": 31379 + }, + { + "epoch": 0.2723934688066944, + "grad_norm": 0.333984375, + "learning_rate": 0.0015827717138926286, + "loss": 0.082, + "step": 31380 + }, + { + "epoch": 0.27240214928689854, + "grad_norm": 0.1611328125, + "learning_rate": 0.0015827465954362503, + "loss": 0.1113, + "step": 31381 + }, + { + "epoch": 0.27241082976710274, + "grad_norm": 0.2421875, + "learning_rate": 0.001582721476451948, + "loss": 0.0708, + "step": 31382 + }, + { + "epoch": 0.2724195102473069, + "grad_norm": 0.19140625, + "learning_rate": 0.0015826963569397483, + "loss": 0.1104, + "step": 31383 + }, + { + "epoch": 0.27242819072751107, + "grad_norm": 0.396484375, + "learning_rate": 0.0015826712368996795, + "loss": 0.0894, + "step": 31384 + }, + { + "epoch": 0.2724368712077152, + "grad_norm": 0.72265625, + "learning_rate": 0.0015826461163317683, + "loss": 0.0962, + "step": 31385 + }, + { + "epoch": 0.2724455516879194, + "grad_norm": 0.07958984375, + "learning_rate": 0.0015826209952360432, + "loss": 0.0967, + "step": 31386 + }, + { + "epoch": 0.27245423216812353, + "grad_norm": 0.376953125, + "learning_rate": 0.0015825958736125302, + "loss": 0.1079, + "step": 31387 + }, + { + "epoch": 0.2724629126483277, + "grad_norm": 0.447265625, + "learning_rate": 0.001582570751461258, + "loss": 0.0864, + "step": 31388 + }, + { + "epoch": 0.27247159312853186, + "grad_norm": 0.40625, + "learning_rate": 0.0015825456287822538, + "loss": 0.1113, + "step": 31389 + }, + { + "epoch": 0.27248027360873606, + "grad_norm": 0.11181640625, + "learning_rate": 0.0015825205055755448, + "loss": 0.1318, + "step": 31390 + }, + { + "epoch": 0.2724889540889402, + "grad_norm": 0.45703125, + "learning_rate": 0.0015824953818411583, + "loss": 0.1211, + "step": 31391 + }, + { + "epoch": 0.2724976345691444, + "grad_norm": 0.2138671875, + "learning_rate": 0.0015824702575791226, + "loss": 0.1143, + "step": 31392 + }, + { + "epoch": 0.2725063150493485, + "grad_norm": 0.072265625, + "learning_rate": 0.0015824451327894646, + "loss": 0.085, + "step": 31393 + }, + { + "epoch": 0.2725149955295527, + "grad_norm": 0.1396484375, + "learning_rate": 0.0015824200074722118, + "loss": 0.1182, + "step": 31394 + }, + { + "epoch": 0.27252367600975685, + "grad_norm": 0.1552734375, + "learning_rate": 0.0015823948816273917, + "loss": 0.1289, + "step": 31395 + }, + { + "epoch": 0.27253235648996105, + "grad_norm": 0.1162109375, + "learning_rate": 0.0015823697552550316, + "loss": 0.1064, + "step": 31396 + }, + { + "epoch": 0.2725410369701652, + "grad_norm": 0.12158203125, + "learning_rate": 0.0015823446283551596, + "loss": 0.1128, + "step": 31397 + }, + { + 
"epoch": 0.2725497174503694, + "grad_norm": 0.23828125, + "learning_rate": 0.0015823195009278027, + "loss": 0.1011, + "step": 31398 + }, + { + "epoch": 0.2725583979305735, + "grad_norm": 0.275390625, + "learning_rate": 0.001582294372972988, + "loss": 0.1143, + "step": 31399 + }, + { + "epoch": 0.2725670784107777, + "grad_norm": 0.353515625, + "learning_rate": 0.0015822692444907437, + "loss": 0.0928, + "step": 31400 + }, + { + "epoch": 0.27257575889098185, + "grad_norm": 0.330078125, + "learning_rate": 0.001582244115481097, + "loss": 0.1064, + "step": 31401 + }, + { + "epoch": 0.27258443937118604, + "grad_norm": 0.35546875, + "learning_rate": 0.0015822189859440751, + "loss": 0.127, + "step": 31402 + }, + { + "epoch": 0.2725931198513902, + "grad_norm": 0.1689453125, + "learning_rate": 0.0015821938558797065, + "loss": 0.1064, + "step": 31403 + }, + { + "epoch": 0.27260180033159437, + "grad_norm": 0.376953125, + "learning_rate": 0.0015821687252880175, + "loss": 0.0879, + "step": 31404 + }, + { + "epoch": 0.2726104808117985, + "grad_norm": 0.431640625, + "learning_rate": 0.0015821435941690359, + "loss": 0.1318, + "step": 31405 + }, + { + "epoch": 0.2726191612920027, + "grad_norm": 0.2412109375, + "learning_rate": 0.0015821184625227896, + "loss": 0.1113, + "step": 31406 + }, + { + "epoch": 0.27262784177220684, + "grad_norm": 0.09130859375, + "learning_rate": 0.0015820933303493055, + "loss": 0.1006, + "step": 31407 + }, + { + "epoch": 0.27263652225241103, + "grad_norm": 0.0810546875, + "learning_rate": 0.0015820681976486113, + "loss": 0.0771, + "step": 31408 + }, + { + "epoch": 0.27264520273261517, + "grad_norm": 0.07861328125, + "learning_rate": 0.0015820430644207348, + "loss": 0.1025, + "step": 31409 + }, + { + "epoch": 0.27265388321281936, + "grad_norm": 0.93359375, + "learning_rate": 0.0015820179306657032, + "loss": 0.0908, + "step": 31410 + }, + { + "epoch": 0.2726625636930235, + "grad_norm": 0.1767578125, + "learning_rate": 0.0015819927963835441, + "loss": 0.0811, + "step": 31411 + }, + { + "epoch": 0.2726712441732277, + "grad_norm": 0.65625, + "learning_rate": 0.001581967661574285, + "loss": 0.1738, + "step": 31412 + }, + { + "epoch": 0.2726799246534318, + "grad_norm": 0.3671875, + "learning_rate": 0.001581942526237953, + "loss": 0.0908, + "step": 31413 + }, + { + "epoch": 0.272688605133636, + "grad_norm": 0.29296875, + "learning_rate": 0.0015819173903745762, + "loss": 0.125, + "step": 31414 + }, + { + "epoch": 0.27269728561384016, + "grad_norm": 0.1728515625, + "learning_rate": 0.0015818922539841817, + "loss": 0.1074, + "step": 31415 + }, + { + "epoch": 0.27270596609404435, + "grad_norm": 0.1181640625, + "learning_rate": 0.0015818671170667968, + "loss": 0.1133, + "step": 31416 + }, + { + "epoch": 0.2727146465742485, + "grad_norm": 0.330078125, + "learning_rate": 0.0015818419796224494, + "loss": 0.0957, + "step": 31417 + }, + { + "epoch": 0.2727233270544527, + "grad_norm": 0.5546875, + "learning_rate": 0.001581816841651167, + "loss": 0.0928, + "step": 31418 + }, + { + "epoch": 0.2727320075346568, + "grad_norm": 0.419921875, + "learning_rate": 0.0015817917031529767, + "loss": 0.1602, + "step": 31419 + }, + { + "epoch": 0.27274068801486095, + "grad_norm": 0.76953125, + "learning_rate": 0.0015817665641279066, + "loss": 0.084, + "step": 31420 + }, + { + "epoch": 0.27274936849506515, + "grad_norm": 0.470703125, + "learning_rate": 0.0015817414245759835, + "loss": 0.0957, + "step": 31421 + }, + { + "epoch": 0.2727580489752693, + "grad_norm": 0.66015625, + "learning_rate": 0.0015817162844972352, + 
"loss": 0.0942, + "step": 31422 + }, + { + "epoch": 0.2727667294554735, + "grad_norm": 0.498046875, + "learning_rate": 0.0015816911438916897, + "loss": 0.123, + "step": 31423 + }, + { + "epoch": 0.2727754099356776, + "grad_norm": 0.08203125, + "learning_rate": 0.0015816660027593737, + "loss": 0.0889, + "step": 31424 + }, + { + "epoch": 0.2727840904158818, + "grad_norm": 0.57421875, + "learning_rate": 0.001581640861100315, + "loss": 0.0938, + "step": 31425 + }, + { + "epoch": 0.27279277089608595, + "grad_norm": 0.73046875, + "learning_rate": 0.0015816157189145407, + "loss": 0.1172, + "step": 31426 + }, + { + "epoch": 0.27280145137629014, + "grad_norm": 0.421875, + "learning_rate": 0.0015815905762020792, + "loss": 0.1074, + "step": 31427 + }, + { + "epoch": 0.2728101318564943, + "grad_norm": 0.1875, + "learning_rate": 0.0015815654329629572, + "loss": 0.1006, + "step": 31428 + }, + { + "epoch": 0.27281881233669847, + "grad_norm": 0.115234375, + "learning_rate": 0.0015815402891972026, + "loss": 0.1191, + "step": 31429 + }, + { + "epoch": 0.2728274928169026, + "grad_norm": 0.62890625, + "learning_rate": 0.0015815151449048428, + "loss": 0.1523, + "step": 31430 + }, + { + "epoch": 0.2728361732971068, + "grad_norm": 0.2734375, + "learning_rate": 0.001581490000085905, + "loss": 0.1455, + "step": 31431 + }, + { + "epoch": 0.27284485377731094, + "grad_norm": 0.115234375, + "learning_rate": 0.0015814648547404176, + "loss": 0.1182, + "step": 31432 + }, + { + "epoch": 0.27285353425751513, + "grad_norm": 0.4921875, + "learning_rate": 0.001581439708868407, + "loss": 0.1064, + "step": 31433 + }, + { + "epoch": 0.27286221473771927, + "grad_norm": 0.5390625, + "learning_rate": 0.001581414562469901, + "loss": 0.1211, + "step": 31434 + }, + { + "epoch": 0.27287089521792346, + "grad_norm": 1.109375, + "learning_rate": 0.0015813894155449278, + "loss": 0.0977, + "step": 31435 + }, + { + "epoch": 0.2728795756981276, + "grad_norm": 0.33203125, + "learning_rate": 0.0015813642680935138, + "loss": 0.1338, + "step": 31436 + }, + { + "epoch": 0.2728882561783318, + "grad_norm": 0.09228515625, + "learning_rate": 0.0015813391201156873, + "loss": 0.1211, + "step": 31437 + }, + { + "epoch": 0.2728969366585359, + "grad_norm": 0.56640625, + "learning_rate": 0.0015813139716114757, + "loss": 0.1094, + "step": 31438 + }, + { + "epoch": 0.2729056171387401, + "grad_norm": 0.275390625, + "learning_rate": 0.001581288822580906, + "loss": 0.1162, + "step": 31439 + }, + { + "epoch": 0.27291429761894426, + "grad_norm": 0.12255859375, + "learning_rate": 0.0015812636730240066, + "loss": 0.103, + "step": 31440 + }, + { + "epoch": 0.27292297809914845, + "grad_norm": 0.375, + "learning_rate": 0.0015812385229408044, + "loss": 0.0854, + "step": 31441 + }, + { + "epoch": 0.2729316585793526, + "grad_norm": 0.326171875, + "learning_rate": 0.001581213372331327, + "loss": 0.1348, + "step": 31442 + }, + { + "epoch": 0.2729403390595568, + "grad_norm": 0.33203125, + "learning_rate": 0.0015811882211956014, + "loss": 0.0806, + "step": 31443 + }, + { + "epoch": 0.2729490195397609, + "grad_norm": 0.2890625, + "learning_rate": 0.001581163069533656, + "loss": 0.1172, + "step": 31444 + }, + { + "epoch": 0.2729577000199651, + "grad_norm": 0.119140625, + "learning_rate": 0.0015811379173455177, + "loss": 0.1055, + "step": 31445 + }, + { + "epoch": 0.27296638050016925, + "grad_norm": 0.171875, + "learning_rate": 0.0015811127646312144, + "loss": 0.0996, + "step": 31446 + }, + { + "epoch": 0.27297506098037344, + "grad_norm": 0.2138671875, + "learning_rate": 
0.0015810876113907732, + "loss": 0.0869, + "step": 31447 + }, + { + "epoch": 0.2729837414605776, + "grad_norm": 0.498046875, + "learning_rate": 0.0015810624576242218, + "loss": 0.1084, + "step": 31448 + }, + { + "epoch": 0.27299242194078177, + "grad_norm": 0.119140625, + "learning_rate": 0.0015810373033315881, + "loss": 0.0791, + "step": 31449 + }, + { + "epoch": 0.2730011024209859, + "grad_norm": 0.3359375, + "learning_rate": 0.001581012148512899, + "loss": 0.123, + "step": 31450 + }, + { + "epoch": 0.2730097829011901, + "grad_norm": 0.1708984375, + "learning_rate": 0.0015809869931681822, + "loss": 0.062, + "step": 31451 + }, + { + "epoch": 0.27301846338139424, + "grad_norm": 0.453125, + "learning_rate": 0.0015809618372974652, + "loss": 0.1182, + "step": 31452 + }, + { + "epoch": 0.27302714386159843, + "grad_norm": 0.90625, + "learning_rate": 0.0015809366809007759, + "loss": 0.1123, + "step": 31453 + }, + { + "epoch": 0.27303582434180257, + "grad_norm": 0.50390625, + "learning_rate": 0.001580911523978141, + "loss": 0.123, + "step": 31454 + }, + { + "epoch": 0.27304450482200676, + "grad_norm": 0.5703125, + "learning_rate": 0.0015808863665295887, + "loss": 0.1245, + "step": 31455 + }, + { + "epoch": 0.2730531853022109, + "grad_norm": 0.2099609375, + "learning_rate": 0.0015808612085551463, + "loss": 0.0947, + "step": 31456 + }, + { + "epoch": 0.2730618657824151, + "grad_norm": 0.58984375, + "learning_rate": 0.0015808360500548413, + "loss": 0.0894, + "step": 31457 + }, + { + "epoch": 0.27307054626261923, + "grad_norm": 0.1884765625, + "learning_rate": 0.0015808108910287011, + "loss": 0.1133, + "step": 31458 + }, + { + "epoch": 0.2730792267428234, + "grad_norm": 0.396484375, + "learning_rate": 0.0015807857314767534, + "loss": 0.0732, + "step": 31459 + }, + { + "epoch": 0.27308790722302756, + "grad_norm": 0.283203125, + "learning_rate": 0.0015807605713990258, + "loss": 0.1089, + "step": 31460 + }, + { + "epoch": 0.27309658770323175, + "grad_norm": 0.87109375, + "learning_rate": 0.0015807354107955456, + "loss": 0.0942, + "step": 31461 + }, + { + "epoch": 0.2731052681834359, + "grad_norm": 0.48046875, + "learning_rate": 0.00158071024966634, + "loss": 0.0977, + "step": 31462 + }, + { + "epoch": 0.2731139486636401, + "grad_norm": 0.12353515625, + "learning_rate": 0.0015806850880114376, + "loss": 0.1074, + "step": 31463 + }, + { + "epoch": 0.2731226291438442, + "grad_norm": 0.3359375, + "learning_rate": 0.0015806599258308648, + "loss": 0.1387, + "step": 31464 + }, + { + "epoch": 0.2731313096240484, + "grad_norm": 0.48046875, + "learning_rate": 0.0015806347631246497, + "loss": 0.0737, + "step": 31465 + }, + { + "epoch": 0.27313999010425255, + "grad_norm": 0.310546875, + "learning_rate": 0.0015806095998928194, + "loss": 0.084, + "step": 31466 + }, + { + "epoch": 0.27314867058445674, + "grad_norm": 0.1005859375, + "learning_rate": 0.0015805844361354016, + "loss": 0.1055, + "step": 31467 + }, + { + "epoch": 0.2731573510646609, + "grad_norm": 0.310546875, + "learning_rate": 0.0015805592718524243, + "loss": 0.1562, + "step": 31468 + }, + { + "epoch": 0.2731660315448651, + "grad_norm": 0.451171875, + "learning_rate": 0.0015805341070439139, + "loss": 0.1182, + "step": 31469 + }, + { + "epoch": 0.2731747120250692, + "grad_norm": 0.11669921875, + "learning_rate": 0.0015805089417098993, + "loss": 0.1211, + "step": 31470 + }, + { + "epoch": 0.2731833925052734, + "grad_norm": 0.19140625, + "learning_rate": 0.001580483775850407, + "loss": 0.0869, + "step": 31471 + }, + { + "epoch": 0.27319207298547754, + 
"grad_norm": 0.408203125, + "learning_rate": 0.0015804586094654652, + "loss": 0.1123, + "step": 31472 + }, + { + "epoch": 0.27320075346568173, + "grad_norm": 0.56640625, + "learning_rate": 0.0015804334425551009, + "loss": 0.0869, + "step": 31473 + }, + { + "epoch": 0.27320943394588587, + "grad_norm": 0.34765625, + "learning_rate": 0.0015804082751193415, + "loss": 0.0918, + "step": 31474 + }, + { + "epoch": 0.27321811442609006, + "grad_norm": 0.09033203125, + "learning_rate": 0.0015803831071582153, + "loss": 0.0825, + "step": 31475 + }, + { + "epoch": 0.2732267949062942, + "grad_norm": 0.1025390625, + "learning_rate": 0.001580357938671749, + "loss": 0.1138, + "step": 31476 + }, + { + "epoch": 0.2732354753864984, + "grad_norm": 0.578125, + "learning_rate": 0.0015803327696599708, + "loss": 0.0898, + "step": 31477 + }, + { + "epoch": 0.27324415586670253, + "grad_norm": 0.1318359375, + "learning_rate": 0.0015803076001229075, + "loss": 0.0986, + "step": 31478 + }, + { + "epoch": 0.2732528363469067, + "grad_norm": 0.1826171875, + "learning_rate": 0.0015802824300605873, + "loss": 0.1064, + "step": 31479 + }, + { + "epoch": 0.27326151682711086, + "grad_norm": 0.2041015625, + "learning_rate": 0.0015802572594730375, + "loss": 0.0957, + "step": 31480 + }, + { + "epoch": 0.27327019730731505, + "grad_norm": 0.51171875, + "learning_rate": 0.0015802320883602854, + "loss": 0.1279, + "step": 31481 + }, + { + "epoch": 0.2732788777875192, + "grad_norm": 0.10107421875, + "learning_rate": 0.0015802069167223588, + "loss": 0.1035, + "step": 31482 + }, + { + "epoch": 0.2732875582677234, + "grad_norm": 0.326171875, + "learning_rate": 0.0015801817445592853, + "loss": 0.1113, + "step": 31483 + }, + { + "epoch": 0.2732962387479275, + "grad_norm": 0.314453125, + "learning_rate": 0.001580156571871092, + "loss": 0.0967, + "step": 31484 + }, + { + "epoch": 0.2733049192281317, + "grad_norm": 0.349609375, + "learning_rate": 0.0015801313986578067, + "loss": 0.085, + "step": 31485 + }, + { + "epoch": 0.27331359970833585, + "grad_norm": 0.1630859375, + "learning_rate": 0.001580106224919457, + "loss": 0.1079, + "step": 31486 + }, + { + "epoch": 0.27332228018854005, + "grad_norm": 0.279296875, + "learning_rate": 0.0015800810506560702, + "loss": 0.1245, + "step": 31487 + }, + { + "epoch": 0.2733309606687442, + "grad_norm": 0.091796875, + "learning_rate": 0.0015800558758676739, + "loss": 0.1216, + "step": 31488 + }, + { + "epoch": 0.2733396411489484, + "grad_norm": 0.271484375, + "learning_rate": 0.0015800307005542963, + "loss": 0.1094, + "step": 31489 + }, + { + "epoch": 0.2733483216291525, + "grad_norm": 0.119140625, + "learning_rate": 0.001580005524715964, + "loss": 0.0957, + "step": 31490 + }, + { + "epoch": 0.2733570021093567, + "grad_norm": 0.142578125, + "learning_rate": 0.0015799803483527045, + "loss": 0.084, + "step": 31491 + }, + { + "epoch": 0.27336568258956084, + "grad_norm": 0.314453125, + "learning_rate": 0.0015799551714645462, + "loss": 0.1084, + "step": 31492 + }, + { + "epoch": 0.27337436306976504, + "grad_norm": 0.1796875, + "learning_rate": 0.001579929994051516, + "loss": 0.0771, + "step": 31493 + }, + { + "epoch": 0.2733830435499692, + "grad_norm": 0.091796875, + "learning_rate": 0.0015799048161136415, + "loss": 0.1025, + "step": 31494 + }, + { + "epoch": 0.27339172403017337, + "grad_norm": 0.16015625, + "learning_rate": 0.00157987963765095, + "loss": 0.1162, + "step": 31495 + }, + { + "epoch": 0.2734004045103775, + "grad_norm": 0.1484375, + "learning_rate": 0.0015798544586634697, + "loss": 0.1436, + "step": 
31496 + }, + { + "epoch": 0.2734090849905817, + "grad_norm": 0.60546875, + "learning_rate": 0.0015798292791512276, + "loss": 0.1104, + "step": 31497 + }, + { + "epoch": 0.27341776547078583, + "grad_norm": 0.310546875, + "learning_rate": 0.001579804099114252, + "loss": 0.1006, + "step": 31498 + }, + { + "epoch": 0.27342644595099, + "grad_norm": 0.11181640625, + "learning_rate": 0.001579778918552569, + "loss": 0.1143, + "step": 31499 + }, + { + "epoch": 0.27343512643119416, + "grad_norm": 0.365234375, + "learning_rate": 0.0015797537374662076, + "loss": 0.0928, + "step": 31500 + }, + { + "epoch": 0.27344380691139836, + "grad_norm": 0.388671875, + "learning_rate": 0.0015797285558551944, + "loss": 0.1094, + "step": 31501 + }, + { + "epoch": 0.2734524873916025, + "grad_norm": 0.09423828125, + "learning_rate": 0.0015797033737195575, + "loss": 0.106, + "step": 31502 + }, + { + "epoch": 0.2734611678718067, + "grad_norm": 0.1240234375, + "learning_rate": 0.0015796781910593241, + "loss": 0.165, + "step": 31503 + }, + { + "epoch": 0.2734698483520108, + "grad_norm": 0.52734375, + "learning_rate": 0.0015796530078745217, + "loss": 0.125, + "step": 31504 + }, + { + "epoch": 0.273478528832215, + "grad_norm": 0.376953125, + "learning_rate": 0.001579627824165178, + "loss": 0.1377, + "step": 31505 + }, + { + "epoch": 0.27348720931241915, + "grad_norm": 0.5703125, + "learning_rate": 0.0015796026399313204, + "loss": 0.1152, + "step": 31506 + }, + { + "epoch": 0.27349588979262335, + "grad_norm": 0.73828125, + "learning_rate": 0.001579577455172977, + "loss": 0.1084, + "step": 31507 + }, + { + "epoch": 0.2735045702728275, + "grad_norm": 0.08935546875, + "learning_rate": 0.0015795522698901744, + "loss": 0.0903, + "step": 31508 + }, + { + "epoch": 0.2735132507530317, + "grad_norm": 0.53515625, + "learning_rate": 0.001579527084082941, + "loss": 0.1172, + "step": 31509 + }, + { + "epoch": 0.2735219312332358, + "grad_norm": 0.26953125, + "learning_rate": 0.001579501897751304, + "loss": 0.1221, + "step": 31510 + }, + { + "epoch": 0.27353061171344, + "grad_norm": 0.466796875, + "learning_rate": 0.0015794767108952908, + "loss": 0.1118, + "step": 31511 + }, + { + "epoch": 0.27353929219364415, + "grad_norm": 0.796875, + "learning_rate": 0.0015794515235149288, + "loss": 0.1484, + "step": 31512 + }, + { + "epoch": 0.27354797267384834, + "grad_norm": 0.353515625, + "learning_rate": 0.001579426335610246, + "loss": 0.166, + "step": 31513 + }, + { + "epoch": 0.2735566531540525, + "grad_norm": 0.5390625, + "learning_rate": 0.0015794011471812702, + "loss": 0.0933, + "step": 31514 + }, + { + "epoch": 0.27356533363425667, + "grad_norm": 0.1240234375, + "learning_rate": 0.001579375958228028, + "loss": 0.0938, + "step": 31515 + }, + { + "epoch": 0.2735740141144608, + "grad_norm": 0.1767578125, + "learning_rate": 0.0015793507687505478, + "loss": 0.0947, + "step": 31516 + }, + { + "epoch": 0.273582694594665, + "grad_norm": 0.82421875, + "learning_rate": 0.0015793255787488563, + "loss": 0.0771, + "step": 31517 + }, + { + "epoch": 0.27359137507486914, + "grad_norm": 0.8671875, + "learning_rate": 0.0015793003882229823, + "loss": 0.1123, + "step": 31518 + }, + { + "epoch": 0.27360005555507333, + "grad_norm": 0.333984375, + "learning_rate": 0.001579275197172952, + "loss": 0.0933, + "step": 31519 + }, + { + "epoch": 0.27360873603527747, + "grad_norm": 0.46484375, + "learning_rate": 0.0015792500055987937, + "loss": 0.0928, + "step": 31520 + }, + { + "epoch": 0.27361741651548166, + "grad_norm": 0.2314453125, + "learning_rate": 
0.0015792248135005347, + "loss": 0.1328, + "step": 31521 + }, + { + "epoch": 0.2736260969956858, + "grad_norm": 0.30859375, + "learning_rate": 0.0015791996208782027, + "loss": 0.1777, + "step": 31522 + }, + { + "epoch": 0.27363477747589, + "grad_norm": 0.146484375, + "learning_rate": 0.0015791744277318255, + "loss": 0.1133, + "step": 31523 + }, + { + "epoch": 0.2736434579560941, + "grad_norm": 0.9140625, + "learning_rate": 0.0015791492340614297, + "loss": 0.166, + "step": 31524 + }, + { + "epoch": 0.2736521384362983, + "grad_norm": 0.29296875, + "learning_rate": 0.0015791240398670438, + "loss": 0.103, + "step": 31525 + }, + { + "epoch": 0.27366081891650246, + "grad_norm": 0.259765625, + "learning_rate": 0.0015790988451486953, + "loss": 0.0815, + "step": 31526 + }, + { + "epoch": 0.27366949939670665, + "grad_norm": 0.1884765625, + "learning_rate": 0.0015790736499064113, + "loss": 0.1133, + "step": 31527 + }, + { + "epoch": 0.2736781798769108, + "grad_norm": 0.1142578125, + "learning_rate": 0.00157904845414022, + "loss": 0.0786, + "step": 31528 + }, + { + "epoch": 0.273686860357115, + "grad_norm": 0.1328125, + "learning_rate": 0.0015790232578501478, + "loss": 0.1025, + "step": 31529 + }, + { + "epoch": 0.2736955408373191, + "grad_norm": 0.1357421875, + "learning_rate": 0.0015789980610362232, + "loss": 0.124, + "step": 31530 + }, + { + "epoch": 0.2737042213175233, + "grad_norm": 0.35546875, + "learning_rate": 0.0015789728636984737, + "loss": 0.0732, + "step": 31531 + }, + { + "epoch": 0.27371290179772745, + "grad_norm": 0.38671875, + "learning_rate": 0.0015789476658369266, + "loss": 0.103, + "step": 31532 + }, + { + "epoch": 0.27372158227793164, + "grad_norm": 0.6328125, + "learning_rate": 0.0015789224674516093, + "loss": 0.1074, + "step": 31533 + }, + { + "epoch": 0.2737302627581358, + "grad_norm": 0.40234375, + "learning_rate": 0.0015788972685425499, + "loss": 0.103, + "step": 31534 + }, + { + "epoch": 0.27373894323833997, + "grad_norm": 0.7109375, + "learning_rate": 0.0015788720691097752, + "loss": 0.1055, + "step": 31535 + }, + { + "epoch": 0.2737476237185441, + "grad_norm": 0.515625, + "learning_rate": 0.0015788468691533138, + "loss": 0.0933, + "step": 31536 + }, + { + "epoch": 0.2737563041987483, + "grad_norm": 0.376953125, + "learning_rate": 0.0015788216686731923, + "loss": 0.1226, + "step": 31537 + }, + { + "epoch": 0.27376498467895244, + "grad_norm": 0.67578125, + "learning_rate": 0.001578796467669439, + "loss": 0.1211, + "step": 31538 + }, + { + "epoch": 0.27377366515915663, + "grad_norm": 0.412109375, + "learning_rate": 0.0015787712661420802, + "loss": 0.1445, + "step": 31539 + }, + { + "epoch": 0.27378234563936077, + "grad_norm": 0.9921875, + "learning_rate": 0.0015787460640911454, + "loss": 0.1416, + "step": 31540 + }, + { + "epoch": 0.27379102611956496, + "grad_norm": 0.35546875, + "learning_rate": 0.0015787208615166603, + "loss": 0.0742, + "step": 31541 + }, + { + "epoch": 0.2737997065997691, + "grad_norm": 0.18359375, + "learning_rate": 0.0015786956584186538, + "loss": 0.1436, + "step": 31542 + }, + { + "epoch": 0.27380838707997324, + "grad_norm": 0.1298828125, + "learning_rate": 0.0015786704547971525, + "loss": 0.1025, + "step": 31543 + }, + { + "epoch": 0.27381706756017743, + "grad_norm": 0.98828125, + "learning_rate": 0.0015786452506521844, + "loss": 0.1328, + "step": 31544 + }, + { + "epoch": 0.27382574804038157, + "grad_norm": 0.8203125, + "learning_rate": 0.001578620045983777, + "loss": 0.085, + "step": 31545 + }, + { + "epoch": 0.27383442852058576, + "grad_norm": 
0.82421875, + "learning_rate": 0.0015785948407919584, + "loss": 0.0806, + "step": 31546 + }, + { + "epoch": 0.2738431090007899, + "grad_norm": 0.2109375, + "learning_rate": 0.0015785696350767552, + "loss": 0.1011, + "step": 31547 + }, + { + "epoch": 0.2738517894809941, + "grad_norm": 0.63671875, + "learning_rate": 0.0015785444288381957, + "loss": 0.0791, + "step": 31548 + }, + { + "epoch": 0.2738604699611982, + "grad_norm": 0.302734375, + "learning_rate": 0.0015785192220763074, + "loss": 0.105, + "step": 31549 + }, + { + "epoch": 0.2738691504414024, + "grad_norm": 0.1142578125, + "learning_rate": 0.0015784940147911173, + "loss": 0.0957, + "step": 31550 + }, + { + "epoch": 0.27387783092160656, + "grad_norm": 0.263671875, + "learning_rate": 0.0015784688069826534, + "loss": 0.1465, + "step": 31551 + }, + { + "epoch": 0.27388651140181075, + "grad_norm": 0.1484375, + "learning_rate": 0.0015784435986509434, + "loss": 0.1143, + "step": 31552 + }, + { + "epoch": 0.2738951918820149, + "grad_norm": 0.65625, + "learning_rate": 0.0015784183897960143, + "loss": 0.0928, + "step": 31553 + }, + { + "epoch": 0.2739038723622191, + "grad_norm": 0.0947265625, + "learning_rate": 0.0015783931804178941, + "loss": 0.0859, + "step": 31554 + }, + { + "epoch": 0.2739125528424232, + "grad_norm": 0.2255859375, + "learning_rate": 0.0015783679705166104, + "loss": 0.0796, + "step": 31555 + }, + { + "epoch": 0.2739212333226274, + "grad_norm": 0.57421875, + "learning_rate": 0.001578342760092191, + "loss": 0.1133, + "step": 31556 + }, + { + "epoch": 0.27392991380283155, + "grad_norm": 0.08935546875, + "learning_rate": 0.0015783175491446626, + "loss": 0.1104, + "step": 31557 + }, + { + "epoch": 0.27393859428303574, + "grad_norm": 0.275390625, + "learning_rate": 0.0015782923376740539, + "loss": 0.1221, + "step": 31558 + }, + { + "epoch": 0.2739472747632399, + "grad_norm": 0.29296875, + "learning_rate": 0.0015782671256803916, + "loss": 0.1157, + "step": 31559 + }, + { + "epoch": 0.27395595524344407, + "grad_norm": 0.42578125, + "learning_rate": 0.0015782419131637033, + "loss": 0.0957, + "step": 31560 + }, + { + "epoch": 0.2739646357236482, + "grad_norm": 0.1318359375, + "learning_rate": 0.001578216700124017, + "loss": 0.1191, + "step": 31561 + }, + { + "epoch": 0.2739733162038524, + "grad_norm": 0.0966796875, + "learning_rate": 0.00157819148656136, + "loss": 0.1108, + "step": 31562 + }, + { + "epoch": 0.27398199668405654, + "grad_norm": 0.10107421875, + "learning_rate": 0.0015781662724757603, + "loss": 0.1182, + "step": 31563 + }, + { + "epoch": 0.27399067716426073, + "grad_norm": 0.10498046875, + "learning_rate": 0.0015781410578672448, + "loss": 0.1123, + "step": 31564 + }, + { + "epoch": 0.27399935764446487, + "grad_norm": 0.58984375, + "learning_rate": 0.001578115842735842, + "loss": 0.124, + "step": 31565 + }, + { + "epoch": 0.27400803812466906, + "grad_norm": 0.09228515625, + "learning_rate": 0.0015780906270815782, + "loss": 0.1133, + "step": 31566 + }, + { + "epoch": 0.2740167186048732, + "grad_norm": 0.298828125, + "learning_rate": 0.0015780654109044821, + "loss": 0.0806, + "step": 31567 + }, + { + "epoch": 0.2740253990850774, + "grad_norm": 0.267578125, + "learning_rate": 0.0015780401942045808, + "loss": 0.1279, + "step": 31568 + }, + { + "epoch": 0.27403407956528153, + "grad_norm": 0.53125, + "learning_rate": 0.0015780149769819016, + "loss": 0.1348, + "step": 31569 + }, + { + "epoch": 0.2740427600454857, + "grad_norm": 0.28125, + "learning_rate": 0.0015779897592364729, + "loss": 0.1177, + "step": 31570 + }, + { + 
"epoch": 0.27405144052568986, + "grad_norm": 1.6875, + "learning_rate": 0.0015779645409683214, + "loss": 0.4141, + "step": 31571 + }, + { + "epoch": 0.27406012100589405, + "grad_norm": 0.330078125, + "learning_rate": 0.0015779393221774752, + "loss": 0.0996, + "step": 31572 + }, + { + "epoch": 0.2740688014860982, + "grad_norm": 0.65625, + "learning_rate": 0.0015779141028639616, + "loss": 0.1523, + "step": 31573 + }, + { + "epoch": 0.2740774819663024, + "grad_norm": 0.40625, + "learning_rate": 0.0015778888830278086, + "loss": 0.1006, + "step": 31574 + }, + { + "epoch": 0.2740861624465065, + "grad_norm": 0.5078125, + "learning_rate": 0.0015778636626690432, + "loss": 0.1064, + "step": 31575 + }, + { + "epoch": 0.2740948429267107, + "grad_norm": 0.294921875, + "learning_rate": 0.0015778384417876937, + "loss": 0.1162, + "step": 31576 + }, + { + "epoch": 0.27410352340691485, + "grad_norm": 0.0693359375, + "learning_rate": 0.0015778132203837867, + "loss": 0.0781, + "step": 31577 + }, + { + "epoch": 0.27411220388711904, + "grad_norm": 0.470703125, + "learning_rate": 0.0015777879984573506, + "loss": 0.0918, + "step": 31578 + }, + { + "epoch": 0.2741208843673232, + "grad_norm": 0.69921875, + "learning_rate": 0.0015777627760084128, + "loss": 0.105, + "step": 31579 + }, + { + "epoch": 0.2741295648475274, + "grad_norm": 0.265625, + "learning_rate": 0.0015777375530370008, + "loss": 0.0952, + "step": 31580 + }, + { + "epoch": 0.2741382453277315, + "grad_norm": 0.40234375, + "learning_rate": 0.0015777123295431421, + "loss": 0.1094, + "step": 31581 + }, + { + "epoch": 0.2741469258079357, + "grad_norm": 0.189453125, + "learning_rate": 0.0015776871055268645, + "loss": 0.0957, + "step": 31582 + }, + { + "epoch": 0.27415560628813984, + "grad_norm": 0.1748046875, + "learning_rate": 0.0015776618809881953, + "loss": 0.1602, + "step": 31583 + }, + { + "epoch": 0.27416428676834403, + "grad_norm": 0.55078125, + "learning_rate": 0.0015776366559271623, + "loss": 0.0938, + "step": 31584 + }, + { + "epoch": 0.27417296724854817, + "grad_norm": 0.2041015625, + "learning_rate": 0.001577611430343793, + "loss": 0.1074, + "step": 31585 + }, + { + "epoch": 0.27418164772875236, + "grad_norm": 0.380859375, + "learning_rate": 0.0015775862042381149, + "loss": 0.127, + "step": 31586 + }, + { + "epoch": 0.2741903282089565, + "grad_norm": 0.328125, + "learning_rate": 0.001577560977610156, + "loss": 0.0942, + "step": 31587 + }, + { + "epoch": 0.2741990086891607, + "grad_norm": 0.1728515625, + "learning_rate": 0.0015775357504599435, + "loss": 0.106, + "step": 31588 + }, + { + "epoch": 0.27420768916936483, + "grad_norm": 0.07763671875, + "learning_rate": 0.0015775105227875052, + "loss": 0.0879, + "step": 31589 + }, + { + "epoch": 0.274216369649569, + "grad_norm": 0.1982421875, + "learning_rate": 0.0015774852945928683, + "loss": 0.0898, + "step": 31590 + }, + { + "epoch": 0.27422505012977316, + "grad_norm": 0.31640625, + "learning_rate": 0.0015774600658760608, + "loss": 0.126, + "step": 31591 + }, + { + "epoch": 0.27423373060997736, + "grad_norm": 0.353515625, + "learning_rate": 0.0015774348366371098, + "loss": 0.0771, + "step": 31592 + }, + { + "epoch": 0.2742424110901815, + "grad_norm": 0.396484375, + "learning_rate": 0.0015774096068760437, + "loss": 0.0889, + "step": 31593 + }, + { + "epoch": 0.2742510915703857, + "grad_norm": 0.365234375, + "learning_rate": 0.0015773843765928895, + "loss": 0.1289, + "step": 31594 + }, + { + "epoch": 0.2742597720505898, + "grad_norm": 0.373046875, + "learning_rate": 0.001577359145787675, + "loss": 
0.0898, + "step": 31595 + }, + { + "epoch": 0.274268452530794, + "grad_norm": 0.1484375, + "learning_rate": 0.0015773339144604275, + "loss": 0.0742, + "step": 31596 + }, + { + "epoch": 0.27427713301099815, + "grad_norm": 0.64453125, + "learning_rate": 0.0015773086826111752, + "loss": 0.1504, + "step": 31597 + }, + { + "epoch": 0.27428581349120235, + "grad_norm": 0.15234375, + "learning_rate": 0.0015772834502399448, + "loss": 0.1162, + "step": 31598 + }, + { + "epoch": 0.2742944939714065, + "grad_norm": 0.734375, + "learning_rate": 0.0015772582173467645, + "loss": 0.1128, + "step": 31599 + }, + { + "epoch": 0.2743031744516107, + "grad_norm": 0.31640625, + "learning_rate": 0.0015772329839316619, + "loss": 0.0762, + "step": 31600 + }, + { + "epoch": 0.2743118549318148, + "grad_norm": 0.09228515625, + "learning_rate": 0.0015772077499946644, + "loss": 0.0718, + "step": 31601 + }, + { + "epoch": 0.274320535412019, + "grad_norm": 0.08837890625, + "learning_rate": 0.0015771825155357997, + "loss": 0.0986, + "step": 31602 + }, + { + "epoch": 0.27432921589222314, + "grad_norm": 0.1162109375, + "learning_rate": 0.0015771572805550953, + "loss": 0.0913, + "step": 31603 + }, + { + "epoch": 0.27433789637242734, + "grad_norm": 0.162109375, + "learning_rate": 0.001577132045052579, + "loss": 0.1094, + "step": 31604 + }, + { + "epoch": 0.2743465768526315, + "grad_norm": 0.236328125, + "learning_rate": 0.0015771068090282782, + "loss": 0.1074, + "step": 31605 + }, + { + "epoch": 0.27435525733283567, + "grad_norm": 0.78125, + "learning_rate": 0.0015770815724822206, + "loss": 0.0986, + "step": 31606 + }, + { + "epoch": 0.2743639378130398, + "grad_norm": 0.1162109375, + "learning_rate": 0.0015770563354144337, + "loss": 0.1348, + "step": 31607 + }, + { + "epoch": 0.274372618293244, + "grad_norm": 0.33984375, + "learning_rate": 0.0015770310978249454, + "loss": 0.1201, + "step": 31608 + }, + { + "epoch": 0.27438129877344813, + "grad_norm": 0.2216796875, + "learning_rate": 0.0015770058597137828, + "loss": 0.0908, + "step": 31609 + }, + { + "epoch": 0.2743899792536523, + "grad_norm": 0.3203125, + "learning_rate": 0.0015769806210809737, + "loss": 0.0869, + "step": 31610 + }, + { + "epoch": 0.27439865973385646, + "grad_norm": 0.330078125, + "learning_rate": 0.0015769553819265458, + "loss": 0.082, + "step": 31611 + }, + { + "epoch": 0.27440734021406066, + "grad_norm": 0.0947265625, + "learning_rate": 0.0015769301422505267, + "loss": 0.1084, + "step": 31612 + }, + { + "epoch": 0.2744160206942648, + "grad_norm": 0.6328125, + "learning_rate": 0.0015769049020529443, + "loss": 0.2871, + "step": 31613 + }, + { + "epoch": 0.274424701174469, + "grad_norm": 0.56640625, + "learning_rate": 0.0015768796613338256, + "loss": 0.1104, + "step": 31614 + }, + { + "epoch": 0.2744333816546731, + "grad_norm": 0.45703125, + "learning_rate": 0.0015768544200931982, + "loss": 0.105, + "step": 31615 + }, + { + "epoch": 0.2744420621348773, + "grad_norm": 0.57421875, + "learning_rate": 0.0015768291783310902, + "loss": 0.0962, + "step": 31616 + }, + { + "epoch": 0.27445074261508146, + "grad_norm": 0.2890625, + "learning_rate": 0.001576803936047529, + "loss": 0.1011, + "step": 31617 + }, + { + "epoch": 0.27445942309528565, + "grad_norm": 0.259765625, + "learning_rate": 0.0015767786932425422, + "loss": 0.1201, + "step": 31618 + }, + { + "epoch": 0.2744681035754898, + "grad_norm": 0.08447265625, + "learning_rate": 0.0015767534499161572, + "loss": 0.0928, + "step": 31619 + }, + { + "epoch": 0.274476784055694, + "grad_norm": 0.4609375, + "learning_rate": 
0.001576728206068402, + "loss": 0.1113, + "step": 31620 + }, + { + "epoch": 0.2744854645358981, + "grad_norm": 0.45703125, + "learning_rate": 0.0015767029616993039, + "loss": 0.1221, + "step": 31621 + }, + { + "epoch": 0.2744941450161023, + "grad_norm": 0.38671875, + "learning_rate": 0.0015766777168088907, + "loss": 0.1572, + "step": 31622 + }, + { + "epoch": 0.27450282549630645, + "grad_norm": 0.44140625, + "learning_rate": 0.0015766524713971902, + "loss": 0.0762, + "step": 31623 + }, + { + "epoch": 0.27451150597651064, + "grad_norm": 0.091796875, + "learning_rate": 0.001576627225464229, + "loss": 0.0615, + "step": 31624 + }, + { + "epoch": 0.2745201864567148, + "grad_norm": 0.28125, + "learning_rate": 0.001576601979010036, + "loss": 0.0957, + "step": 31625 + }, + { + "epoch": 0.27452886693691897, + "grad_norm": 0.60546875, + "learning_rate": 0.001576576732034638, + "loss": 0.1162, + "step": 31626 + }, + { + "epoch": 0.2745375474171231, + "grad_norm": 0.1767578125, + "learning_rate": 0.0015765514845380631, + "loss": 0.1133, + "step": 31627 + }, + { + "epoch": 0.2745462278973273, + "grad_norm": 0.28515625, + "learning_rate": 0.0015765262365203382, + "loss": 0.0967, + "step": 31628 + }, + { + "epoch": 0.27455490837753144, + "grad_norm": 0.296875, + "learning_rate": 0.0015765009879814915, + "loss": 0.1553, + "step": 31629 + }, + { + "epoch": 0.27456358885773563, + "grad_norm": 0.451171875, + "learning_rate": 0.0015764757389215502, + "loss": 0.1162, + "step": 31630 + }, + { + "epoch": 0.27457226933793977, + "grad_norm": 0.416015625, + "learning_rate": 0.001576450489340543, + "loss": 0.1162, + "step": 31631 + }, + { + "epoch": 0.27458094981814396, + "grad_norm": 0.46484375, + "learning_rate": 0.0015764252392384964, + "loss": 0.105, + "step": 31632 + }, + { + "epoch": 0.2745896302983481, + "grad_norm": 0.11962890625, + "learning_rate": 0.0015763999886154378, + "loss": 0.1162, + "step": 31633 + }, + { + "epoch": 0.2745983107785523, + "grad_norm": 0.62890625, + "learning_rate": 0.001576374737471396, + "loss": 0.125, + "step": 31634 + }, + { + "epoch": 0.2746069912587564, + "grad_norm": 0.26953125, + "learning_rate": 0.0015763494858063976, + "loss": 0.1245, + "step": 31635 + }, + { + "epoch": 0.2746156717389606, + "grad_norm": 0.208984375, + "learning_rate": 0.0015763242336204709, + "loss": 0.0776, + "step": 31636 + }, + { + "epoch": 0.27462435221916476, + "grad_norm": 0.400390625, + "learning_rate": 0.0015762989809136427, + "loss": 0.1289, + "step": 31637 + }, + { + "epoch": 0.27463303269936895, + "grad_norm": 0.2265625, + "learning_rate": 0.0015762737276859412, + "loss": 0.085, + "step": 31638 + }, + { + "epoch": 0.2746417131795731, + "grad_norm": 0.171875, + "learning_rate": 0.0015762484739373938, + "loss": 0.0791, + "step": 31639 + }, + { + "epoch": 0.2746503936597773, + "grad_norm": 0.33984375, + "learning_rate": 0.0015762232196680286, + "loss": 0.1006, + "step": 31640 + }, + { + "epoch": 0.2746590741399814, + "grad_norm": 0.67578125, + "learning_rate": 0.0015761979648778726, + "loss": 0.4883, + "step": 31641 + }, + { + "epoch": 0.2746677546201856, + "grad_norm": 0.4921875, + "learning_rate": 0.0015761727095669536, + "loss": 0.1025, + "step": 31642 + }, + { + "epoch": 0.27467643510038975, + "grad_norm": 0.5078125, + "learning_rate": 0.0015761474537352995, + "loss": 0.125, + "step": 31643 + }, + { + "epoch": 0.27468511558059394, + "grad_norm": 0.3203125, + "learning_rate": 0.0015761221973829376, + "loss": 0.1089, + "step": 31644 + }, + { + "epoch": 0.2746937960607981, + "grad_norm": 
0.419921875, + "learning_rate": 0.001576096940509896, + "loss": 0.0991, + "step": 31645 + }, + { + "epoch": 0.27470247654100227, + "grad_norm": 0.10791015625, + "learning_rate": 0.0015760716831162015, + "loss": 0.0957, + "step": 31646 + }, + { + "epoch": 0.2747111570212064, + "grad_norm": 0.236328125, + "learning_rate": 0.001576046425201882, + "loss": 0.0957, + "step": 31647 + }, + { + "epoch": 0.2747198375014106, + "grad_norm": 0.328125, + "learning_rate": 0.0015760211667669656, + "loss": 0.1406, + "step": 31648 + }, + { + "epoch": 0.27472851798161474, + "grad_norm": 0.451171875, + "learning_rate": 0.0015759959078114794, + "loss": 0.0786, + "step": 31649 + }, + { + "epoch": 0.27473719846181893, + "grad_norm": 0.337890625, + "learning_rate": 0.0015759706483354512, + "loss": 0.126, + "step": 31650 + }, + { + "epoch": 0.27474587894202307, + "grad_norm": 0.10791015625, + "learning_rate": 0.0015759453883389088, + "loss": 0.124, + "step": 31651 + }, + { + "epoch": 0.27475455942222726, + "grad_norm": 0.09619140625, + "learning_rate": 0.0015759201278218798, + "loss": 0.0869, + "step": 31652 + }, + { + "epoch": 0.2747632399024314, + "grad_norm": 0.703125, + "learning_rate": 0.0015758948667843917, + "loss": 0.1221, + "step": 31653 + }, + { + "epoch": 0.2747719203826356, + "grad_norm": 0.11279296875, + "learning_rate": 0.001575869605226472, + "loss": 0.1006, + "step": 31654 + }, + { + "epoch": 0.27478060086283973, + "grad_norm": 0.263671875, + "learning_rate": 0.0015758443431481482, + "loss": 0.0776, + "step": 31655 + }, + { + "epoch": 0.2747892813430439, + "grad_norm": 0.154296875, + "learning_rate": 0.0015758190805494484, + "loss": 0.0918, + "step": 31656 + }, + { + "epoch": 0.27479796182324806, + "grad_norm": 0.51171875, + "learning_rate": 0.0015757938174304, + "loss": 0.166, + "step": 31657 + }, + { + "epoch": 0.27480664230345225, + "grad_norm": 0.458984375, + "learning_rate": 0.001575768553791031, + "loss": 0.1104, + "step": 31658 + }, + { + "epoch": 0.2748153227836564, + "grad_norm": 0.06689453125, + "learning_rate": 0.0015757432896313678, + "loss": 0.1079, + "step": 31659 + }, + { + "epoch": 0.2748240032638606, + "grad_norm": 0.2392578125, + "learning_rate": 0.0015757180249514396, + "loss": 0.1436, + "step": 31660 + }, + { + "epoch": 0.2748326837440647, + "grad_norm": 0.890625, + "learning_rate": 0.001575692759751273, + "loss": 0.123, + "step": 31661 + }, + { + "epoch": 0.2748413642242689, + "grad_norm": 1.25, + "learning_rate": 0.001575667494030896, + "loss": 0.123, + "step": 31662 + }, + { + "epoch": 0.27485004470447305, + "grad_norm": 0.26171875, + "learning_rate": 0.0015756422277903364, + "loss": 0.1094, + "step": 31663 + }, + { + "epoch": 0.27485872518467724, + "grad_norm": 0.0908203125, + "learning_rate": 0.0015756169610296213, + "loss": 0.104, + "step": 31664 + }, + { + "epoch": 0.2748674056648814, + "grad_norm": 0.10498046875, + "learning_rate": 0.001575591693748779, + "loss": 0.127, + "step": 31665 + }, + { + "epoch": 0.2748760861450855, + "grad_norm": 0.6328125, + "learning_rate": 0.0015755664259478365, + "loss": 0.105, + "step": 31666 + }, + { + "epoch": 0.2748847666252897, + "grad_norm": 0.625, + "learning_rate": 0.0015755411576268215, + "loss": 0.0884, + "step": 31667 + }, + { + "epoch": 0.27489344710549385, + "grad_norm": 0.271484375, + "learning_rate": 0.0015755158887857624, + "loss": 0.0952, + "step": 31668 + }, + { + "epoch": 0.27490212758569804, + "grad_norm": 0.48046875, + "learning_rate": 0.0015754906194246857, + "loss": 0.1309, + "step": 31669 + }, + { + "epoch": 
0.2749108080659022, + "grad_norm": 0.091796875, + "learning_rate": 0.0015754653495436198, + "loss": 0.0825, + "step": 31670 + }, + { + "epoch": 0.27491948854610637, + "grad_norm": 0.1083984375, + "learning_rate": 0.0015754400791425922, + "loss": 0.1152, + "step": 31671 + }, + { + "epoch": 0.2749281690263105, + "grad_norm": 0.240234375, + "learning_rate": 0.0015754148082216305, + "loss": 0.0796, + "step": 31672 + }, + { + "epoch": 0.2749368495065147, + "grad_norm": 0.0830078125, + "learning_rate": 0.0015753895367807622, + "loss": 0.127, + "step": 31673 + }, + { + "epoch": 0.27494552998671884, + "grad_norm": 0.1435546875, + "learning_rate": 0.0015753642648200152, + "loss": 0.0996, + "step": 31674 + }, + { + "epoch": 0.27495421046692303, + "grad_norm": 0.2060546875, + "learning_rate": 0.001575338992339417, + "loss": 0.1045, + "step": 31675 + }, + { + "epoch": 0.27496289094712717, + "grad_norm": 0.431640625, + "learning_rate": 0.0015753137193389951, + "loss": 0.1128, + "step": 31676 + }, + { + "epoch": 0.27497157142733136, + "grad_norm": 0.30078125, + "learning_rate": 0.001575288445818777, + "loss": 0.1475, + "step": 31677 + }, + { + "epoch": 0.2749802519075355, + "grad_norm": 0.2451171875, + "learning_rate": 0.0015752631717787908, + "loss": 0.1025, + "step": 31678 + }, + { + "epoch": 0.2749889323877397, + "grad_norm": 0.498046875, + "learning_rate": 0.0015752378972190642, + "loss": 0.0781, + "step": 31679 + }, + { + "epoch": 0.27499761286794383, + "grad_norm": 0.384765625, + "learning_rate": 0.0015752126221396244, + "loss": 0.1064, + "step": 31680 + }, + { + "epoch": 0.275006293348148, + "grad_norm": 0.1005859375, + "learning_rate": 0.001575187346540499, + "loss": 0.125, + "step": 31681 + }, + { + "epoch": 0.27501497382835216, + "grad_norm": 0.1298828125, + "learning_rate": 0.0015751620704217162, + "loss": 0.1211, + "step": 31682 + }, + { + "epoch": 0.27502365430855635, + "grad_norm": 0.228515625, + "learning_rate": 0.0015751367937833032, + "loss": 0.0801, + "step": 31683 + }, + { + "epoch": 0.2750323347887605, + "grad_norm": 0.123046875, + "learning_rate": 0.0015751115166252876, + "loss": 0.0845, + "step": 31684 + }, + { + "epoch": 0.2750410152689647, + "grad_norm": 0.203125, + "learning_rate": 0.0015750862389476974, + "loss": 0.0688, + "step": 31685 + }, + { + "epoch": 0.2750496957491688, + "grad_norm": 0.375, + "learning_rate": 0.00157506096075056, + "loss": 0.1177, + "step": 31686 + }, + { + "epoch": 0.275058376229373, + "grad_norm": 0.703125, + "learning_rate": 0.0015750356820339027, + "loss": 0.1494, + "step": 31687 + }, + { + "epoch": 0.27506705670957715, + "grad_norm": 0.39453125, + "learning_rate": 0.0015750104027977539, + "loss": 0.1113, + "step": 31688 + }, + { + "epoch": 0.27507573718978134, + "grad_norm": 0.1318359375, + "learning_rate": 0.0015749851230421408, + "loss": 0.0981, + "step": 31689 + }, + { + "epoch": 0.2750844176699855, + "grad_norm": 0.08203125, + "learning_rate": 0.001574959842767091, + "loss": 0.0835, + "step": 31690 + }, + { + "epoch": 0.2750930981501897, + "grad_norm": 0.44140625, + "learning_rate": 0.0015749345619726323, + "loss": 0.1182, + "step": 31691 + }, + { + "epoch": 0.2751017786303938, + "grad_norm": 0.10986328125, + "learning_rate": 0.0015749092806587924, + "loss": 0.0684, + "step": 31692 + }, + { + "epoch": 0.275110459110598, + "grad_norm": 0.228515625, + "learning_rate": 0.0015748839988255984, + "loss": 0.084, + "step": 31693 + }, + { + "epoch": 0.27511913959080214, + "grad_norm": 0.248046875, + "learning_rate": 0.001574858716473079, + "loss": 
0.1016, + "step": 31694 + }, + { + "epoch": 0.27512782007100633, + "grad_norm": 0.11083984375, + "learning_rate": 0.0015748334336012609, + "loss": 0.1064, + "step": 31695 + }, + { + "epoch": 0.27513650055121047, + "grad_norm": 0.2021484375, + "learning_rate": 0.0015748081502101722, + "loss": 0.1152, + "step": 31696 + }, + { + "epoch": 0.27514518103141467, + "grad_norm": 0.22265625, + "learning_rate": 0.0015747828662998405, + "loss": 0.1113, + "step": 31697 + }, + { + "epoch": 0.2751538615116188, + "grad_norm": 0.416015625, + "learning_rate": 0.001574757581870293, + "loss": 0.1084, + "step": 31698 + }, + { + "epoch": 0.275162541991823, + "grad_norm": 0.162109375, + "learning_rate": 0.0015747322969215582, + "loss": 0.0796, + "step": 31699 + }, + { + "epoch": 0.27517122247202713, + "grad_norm": 0.07958984375, + "learning_rate": 0.0015747070114536632, + "loss": 0.0996, + "step": 31700 + }, + { + "epoch": 0.2751799029522313, + "grad_norm": 0.1962890625, + "learning_rate": 0.0015746817254666359, + "loss": 0.1318, + "step": 31701 + }, + { + "epoch": 0.27518858343243546, + "grad_norm": 0.2373046875, + "learning_rate": 0.0015746564389605032, + "loss": 0.124, + "step": 31702 + }, + { + "epoch": 0.27519726391263966, + "grad_norm": 0.09228515625, + "learning_rate": 0.0015746311519352937, + "loss": 0.0972, + "step": 31703 + }, + { + "epoch": 0.2752059443928438, + "grad_norm": 0.578125, + "learning_rate": 0.001574605864391035, + "loss": 0.0962, + "step": 31704 + }, + { + "epoch": 0.275214624873048, + "grad_norm": 0.2138671875, + "learning_rate": 0.0015745805763277543, + "loss": 0.1011, + "step": 31705 + }, + { + "epoch": 0.2752233053532521, + "grad_norm": 0.46875, + "learning_rate": 0.0015745552877454792, + "loss": 0.0918, + "step": 31706 + }, + { + "epoch": 0.2752319858334563, + "grad_norm": 0.53515625, + "learning_rate": 0.0015745299986442377, + "loss": 0.1196, + "step": 31707 + }, + { + "epoch": 0.27524066631366045, + "grad_norm": 0.1796875, + "learning_rate": 0.0015745047090240574, + "loss": 0.1045, + "step": 31708 + }, + { + "epoch": 0.27524934679386465, + "grad_norm": 0.328125, + "learning_rate": 0.0015744794188849655, + "loss": 0.0967, + "step": 31709 + }, + { + "epoch": 0.2752580272740688, + "grad_norm": 0.15625, + "learning_rate": 0.0015744541282269905, + "loss": 0.0664, + "step": 31710 + }, + { + "epoch": 0.275266707754273, + "grad_norm": 0.447265625, + "learning_rate": 0.0015744288370501593, + "loss": 0.0981, + "step": 31711 + }, + { + "epoch": 0.2752753882344771, + "grad_norm": 0.431640625, + "learning_rate": 0.0015744035453545, + "loss": 0.0933, + "step": 31712 + }, + { + "epoch": 0.2752840687146813, + "grad_norm": 0.1611328125, + "learning_rate": 0.0015743782531400402, + "loss": 0.0859, + "step": 31713 + }, + { + "epoch": 0.27529274919488544, + "grad_norm": 0.08740234375, + "learning_rate": 0.0015743529604068075, + "loss": 0.1055, + "step": 31714 + }, + { + "epoch": 0.27530142967508964, + "grad_norm": 0.6328125, + "learning_rate": 0.0015743276671548291, + "loss": 0.0967, + "step": 31715 + }, + { + "epoch": 0.2753101101552938, + "grad_norm": 0.21875, + "learning_rate": 0.0015743023733841334, + "loss": 0.0967, + "step": 31716 + }, + { + "epoch": 0.27531879063549797, + "grad_norm": 0.1376953125, + "learning_rate": 0.0015742770790947477, + "loss": 0.1543, + "step": 31717 + }, + { + "epoch": 0.2753274711157021, + "grad_norm": 0.421875, + "learning_rate": 0.0015742517842866999, + "loss": 0.1045, + "step": 31718 + }, + { + "epoch": 0.2753361515959063, + "grad_norm": 0.185546875, + 
"learning_rate": 0.0015742264889600174, + "loss": 0.0859, + "step": 31719 + }, + { + "epoch": 0.27534483207611044, + "grad_norm": 0.1357421875, + "learning_rate": 0.0015742011931147275, + "loss": 0.1309, + "step": 31720 + }, + { + "epoch": 0.27535351255631463, + "grad_norm": 0.314453125, + "learning_rate": 0.0015741758967508586, + "loss": 0.0566, + "step": 31721 + }, + { + "epoch": 0.27536219303651877, + "grad_norm": 0.50390625, + "learning_rate": 0.0015741505998684382, + "loss": 0.1001, + "step": 31722 + }, + { + "epoch": 0.27537087351672296, + "grad_norm": 0.9375, + "learning_rate": 0.0015741253024674938, + "loss": 0.1045, + "step": 31723 + }, + { + "epoch": 0.2753795539969271, + "grad_norm": 0.275390625, + "learning_rate": 0.0015741000045480528, + "loss": 0.1631, + "step": 31724 + }, + { + "epoch": 0.2753882344771313, + "grad_norm": 0.322265625, + "learning_rate": 0.0015740747061101436, + "loss": 0.0869, + "step": 31725 + }, + { + "epoch": 0.2753969149573354, + "grad_norm": 0.6640625, + "learning_rate": 0.0015740494071537932, + "loss": 0.3359, + "step": 31726 + }, + { + "epoch": 0.2754055954375396, + "grad_norm": 0.07958984375, + "learning_rate": 0.0015740241076790294, + "loss": 0.0869, + "step": 31727 + }, + { + "epoch": 0.27541427591774376, + "grad_norm": 0.6875, + "learning_rate": 0.00157399880768588, + "loss": 0.1562, + "step": 31728 + }, + { + "epoch": 0.27542295639794795, + "grad_norm": 0.1318359375, + "learning_rate": 0.0015739735071743726, + "loss": 0.1006, + "step": 31729 + }, + { + "epoch": 0.2754316368781521, + "grad_norm": 0.48828125, + "learning_rate": 0.0015739482061445354, + "loss": 0.1172, + "step": 31730 + }, + { + "epoch": 0.2754403173583563, + "grad_norm": 0.494140625, + "learning_rate": 0.001573922904596395, + "loss": 0.0845, + "step": 31731 + }, + { + "epoch": 0.2754489978385604, + "grad_norm": 0.3359375, + "learning_rate": 0.0015738976025299798, + "loss": 0.124, + "step": 31732 + }, + { + "epoch": 0.2754576783187646, + "grad_norm": 0.48828125, + "learning_rate": 0.0015738722999453173, + "loss": 0.0977, + "step": 31733 + }, + { + "epoch": 0.27546635879896875, + "grad_norm": 0.640625, + "learning_rate": 0.0015738469968424348, + "loss": 0.0928, + "step": 31734 + }, + { + "epoch": 0.27547503927917294, + "grad_norm": 0.60546875, + "learning_rate": 0.001573821693221361, + "loss": 0.1289, + "step": 31735 + }, + { + "epoch": 0.2754837197593771, + "grad_norm": 0.10400390625, + "learning_rate": 0.0015737963890821226, + "loss": 0.1006, + "step": 31736 + }, + { + "epoch": 0.27549240023958127, + "grad_norm": 0.2197265625, + "learning_rate": 0.0015737710844247474, + "loss": 0.1172, + "step": 31737 + }, + { + "epoch": 0.2755010807197854, + "grad_norm": 0.2021484375, + "learning_rate": 0.0015737457792492633, + "loss": 0.1523, + "step": 31738 + }, + { + "epoch": 0.2755097611999896, + "grad_norm": 0.11767578125, + "learning_rate": 0.0015737204735556982, + "loss": 0.1138, + "step": 31739 + }, + { + "epoch": 0.27551844168019374, + "grad_norm": 0.171875, + "learning_rate": 0.0015736951673440795, + "loss": 0.0869, + "step": 31740 + }, + { + "epoch": 0.27552712216039793, + "grad_norm": 0.2734375, + "learning_rate": 0.0015736698606144347, + "loss": 0.0938, + "step": 31741 + }, + { + "epoch": 0.27553580264060207, + "grad_norm": 0.16015625, + "learning_rate": 0.0015736445533667918, + "loss": 0.127, + "step": 31742 + }, + { + "epoch": 0.27554448312080626, + "grad_norm": 1.265625, + "learning_rate": 0.0015736192456011781, + "loss": 0.1367, + "step": 31743 + }, + { + "epoch": 
0.2755531636010104, + "grad_norm": 0.16015625, + "learning_rate": 0.0015735939373176218, + "loss": 0.0898, + "step": 31744 + }, + { + "epoch": 0.2755618440812146, + "grad_norm": 0.255859375, + "learning_rate": 0.0015735686285161502, + "loss": 0.1221, + "step": 31745 + }, + { + "epoch": 0.27557052456141873, + "grad_norm": 0.38671875, + "learning_rate": 0.001573543319196791, + "loss": 0.1196, + "step": 31746 + }, + { + "epoch": 0.2755792050416229, + "grad_norm": 0.65625, + "learning_rate": 0.001573518009359572, + "loss": 0.1084, + "step": 31747 + }, + { + "epoch": 0.27558788552182706, + "grad_norm": 0.494140625, + "learning_rate": 0.0015734926990045206, + "loss": 0.1025, + "step": 31748 + }, + { + "epoch": 0.27559656600203125, + "grad_norm": 0.5234375, + "learning_rate": 0.001573467388131665, + "loss": 0.1191, + "step": 31749 + }, + { + "epoch": 0.2756052464822354, + "grad_norm": 0.1826171875, + "learning_rate": 0.0015734420767410326, + "loss": 0.1055, + "step": 31750 + }, + { + "epoch": 0.2756139269624396, + "grad_norm": 0.2451171875, + "learning_rate": 0.0015734167648326508, + "loss": 0.1064, + "step": 31751 + }, + { + "epoch": 0.2756226074426437, + "grad_norm": 0.51953125, + "learning_rate": 0.0015733914524065475, + "loss": 0.1035, + "step": 31752 + }, + { + "epoch": 0.2756312879228479, + "grad_norm": 0.1884765625, + "learning_rate": 0.001573366139462751, + "loss": 0.0898, + "step": 31753 + }, + { + "epoch": 0.27563996840305205, + "grad_norm": 0.38671875, + "learning_rate": 0.0015733408260012877, + "loss": 0.1289, + "step": 31754 + }, + { + "epoch": 0.27564864888325624, + "grad_norm": 0.3828125, + "learning_rate": 0.001573315512022186, + "loss": 0.0698, + "step": 31755 + }, + { + "epoch": 0.2756573293634604, + "grad_norm": 0.2373046875, + "learning_rate": 0.001573290197525474, + "loss": 0.1123, + "step": 31756 + }, + { + "epoch": 0.2756660098436646, + "grad_norm": 0.18359375, + "learning_rate": 0.001573264882511179, + "loss": 0.0864, + "step": 31757 + }, + { + "epoch": 0.2756746903238687, + "grad_norm": 0.5546875, + "learning_rate": 0.0015732395669793282, + "loss": 0.1465, + "step": 31758 + }, + { + "epoch": 0.2756833708040729, + "grad_norm": 0.2021484375, + "learning_rate": 0.00157321425092995, + "loss": 0.0981, + "step": 31759 + }, + { + "epoch": 0.27569205128427704, + "grad_norm": 0.208984375, + "learning_rate": 0.0015731889343630718, + "loss": 0.1108, + "step": 31760 + }, + { + "epoch": 0.27570073176448123, + "grad_norm": 0.4296875, + "learning_rate": 0.0015731636172787212, + "loss": 0.0713, + "step": 31761 + }, + { + "epoch": 0.27570941224468537, + "grad_norm": 0.380859375, + "learning_rate": 0.001573138299676926, + "loss": 0.0938, + "step": 31762 + }, + { + "epoch": 0.27571809272488956, + "grad_norm": 0.2734375, + "learning_rate": 0.0015731129815577138, + "loss": 0.0898, + "step": 31763 + }, + { + "epoch": 0.2757267732050937, + "grad_norm": 0.59765625, + "learning_rate": 0.0015730876629211127, + "loss": 0.0864, + "step": 31764 + }, + { + "epoch": 0.2757354536852979, + "grad_norm": 0.318359375, + "learning_rate": 0.0015730623437671496, + "loss": 0.105, + "step": 31765 + }, + { + "epoch": 0.27574413416550203, + "grad_norm": 0.06787109375, + "learning_rate": 0.0015730370240958527, + "loss": 0.0645, + "step": 31766 + }, + { + "epoch": 0.2757528146457062, + "grad_norm": 0.08544921875, + "learning_rate": 0.0015730117039072498, + "loss": 0.0869, + "step": 31767 + }, + { + "epoch": 0.27576149512591036, + "grad_norm": 0.1376953125, + "learning_rate": 0.0015729863832013682, + "loss": 0.0796, 
+ "step": 31768 + }, + { + "epoch": 0.27577017560611455, + "grad_norm": 0.1669921875, + "learning_rate": 0.0015729610619782362, + "loss": 0.0815, + "step": 31769 + }, + { + "epoch": 0.2757788560863187, + "grad_norm": 0.0986328125, + "learning_rate": 0.0015729357402378807, + "loss": 0.0986, + "step": 31770 + }, + { + "epoch": 0.2757875365665229, + "grad_norm": 0.32421875, + "learning_rate": 0.0015729104179803297, + "loss": 0.1221, + "step": 31771 + }, + { + "epoch": 0.275796217046727, + "grad_norm": 0.1435546875, + "learning_rate": 0.0015728850952056116, + "loss": 0.1504, + "step": 31772 + }, + { + "epoch": 0.2758048975269312, + "grad_norm": 0.376953125, + "learning_rate": 0.0015728597719137528, + "loss": 0.1113, + "step": 31773 + }, + { + "epoch": 0.27581357800713535, + "grad_norm": 0.12451171875, + "learning_rate": 0.0015728344481047817, + "loss": 0.1006, + "step": 31774 + }, + { + "epoch": 0.27582225848733954, + "grad_norm": 0.359375, + "learning_rate": 0.0015728091237787263, + "loss": 0.0645, + "step": 31775 + }, + { + "epoch": 0.2758309389675437, + "grad_norm": 0.0869140625, + "learning_rate": 0.0015727837989356138, + "loss": 0.1094, + "step": 31776 + }, + { + "epoch": 0.2758396194477479, + "grad_norm": 0.322265625, + "learning_rate": 0.001572758473575472, + "loss": 0.1182, + "step": 31777 + }, + { + "epoch": 0.275848299927952, + "grad_norm": 0.1650390625, + "learning_rate": 0.0015727331476983286, + "loss": 0.0898, + "step": 31778 + }, + { + "epoch": 0.2758569804081562, + "grad_norm": 0.08837890625, + "learning_rate": 0.0015727078213042117, + "loss": 0.1196, + "step": 31779 + }, + { + "epoch": 0.27586566088836034, + "grad_norm": 0.07373046875, + "learning_rate": 0.0015726824943931479, + "loss": 0.0903, + "step": 31780 + }, + { + "epoch": 0.27587434136856454, + "grad_norm": 0.0966796875, + "learning_rate": 0.0015726571669651664, + "loss": 0.1152, + "step": 31781 + }, + { + "epoch": 0.2758830218487687, + "grad_norm": 0.6484375, + "learning_rate": 0.0015726318390202937, + "loss": 0.0603, + "step": 31782 + }, + { + "epoch": 0.27589170232897287, + "grad_norm": 0.28125, + "learning_rate": 0.0015726065105585577, + "loss": 0.1523, + "step": 31783 + }, + { + "epoch": 0.275900382809177, + "grad_norm": 0.337890625, + "learning_rate": 0.0015725811815799867, + "loss": 0.1211, + "step": 31784 + }, + { + "epoch": 0.2759090632893812, + "grad_norm": 1.0234375, + "learning_rate": 0.001572555852084608, + "loss": 0.0933, + "step": 31785 + }, + { + "epoch": 0.27591774376958533, + "grad_norm": 0.1611328125, + "learning_rate": 0.0015725305220724492, + "loss": 0.0879, + "step": 31786 + }, + { + "epoch": 0.2759264242497895, + "grad_norm": 0.6796875, + "learning_rate": 0.0015725051915435383, + "loss": 0.1279, + "step": 31787 + }, + { + "epoch": 0.27593510472999366, + "grad_norm": 0.322265625, + "learning_rate": 0.0015724798604979026, + "loss": 0.1309, + "step": 31788 + }, + { + "epoch": 0.2759437852101978, + "grad_norm": 0.29296875, + "learning_rate": 0.0015724545289355703, + "loss": 0.0918, + "step": 31789 + }, + { + "epoch": 0.275952465690402, + "grad_norm": 0.26171875, + "learning_rate": 0.0015724291968565686, + "loss": 0.1064, + "step": 31790 + }, + { + "epoch": 0.27596114617060613, + "grad_norm": 0.3671875, + "learning_rate": 0.0015724038642609252, + "loss": 0.2021, + "step": 31791 + }, + { + "epoch": 0.2759698266508103, + "grad_norm": 0.2353515625, + "learning_rate": 0.0015723785311486684, + "loss": 0.0742, + "step": 31792 + }, + { + "epoch": 0.27597850713101446, + "grad_norm": 0.251953125, + 
"learning_rate": 0.0015723531975198254, + "loss": 0.0884, + "step": 31793 + }, + { + "epoch": 0.27598718761121865, + "grad_norm": 0.66015625, + "learning_rate": 0.001572327863374424, + "loss": 0.1748, + "step": 31794 + }, + { + "epoch": 0.2759958680914228, + "grad_norm": 0.0947265625, + "learning_rate": 0.0015723025287124922, + "loss": 0.1182, + "step": 31795 + }, + { + "epoch": 0.276004548571627, + "grad_norm": 0.66796875, + "learning_rate": 0.0015722771935340572, + "loss": 0.1211, + "step": 31796 + }, + { + "epoch": 0.2760132290518311, + "grad_norm": 0.11181640625, + "learning_rate": 0.001572251857839147, + "loss": 0.1118, + "step": 31797 + }, + { + "epoch": 0.2760219095320353, + "grad_norm": 0.345703125, + "learning_rate": 0.0015722265216277895, + "loss": 0.1094, + "step": 31798 + }, + { + "epoch": 0.27603059001223945, + "grad_norm": 0.30078125, + "learning_rate": 0.001572201184900012, + "loss": 0.0698, + "step": 31799 + }, + { + "epoch": 0.27603927049244364, + "grad_norm": 0.2275390625, + "learning_rate": 0.0015721758476558423, + "loss": 0.125, + "step": 31800 + }, + { + "epoch": 0.2760479509726478, + "grad_norm": 0.27734375, + "learning_rate": 0.0015721505098953083, + "loss": 0.1582, + "step": 31801 + }, + { + "epoch": 0.276056631452852, + "grad_norm": 0.2275390625, + "learning_rate": 0.0015721251716184373, + "loss": 0.0957, + "step": 31802 + }, + { + "epoch": 0.2760653119330561, + "grad_norm": 0.19921875, + "learning_rate": 0.0015720998328252575, + "loss": 0.1016, + "step": 31803 + }, + { + "epoch": 0.2760739924132603, + "grad_norm": 0.404296875, + "learning_rate": 0.0015720744935157967, + "loss": 0.1118, + "step": 31804 + }, + { + "epoch": 0.27608267289346444, + "grad_norm": 0.091796875, + "learning_rate": 0.001572049153690082, + "loss": 0.1201, + "step": 31805 + }, + { + "epoch": 0.27609135337366864, + "grad_norm": 0.48046875, + "learning_rate": 0.0015720238133481415, + "loss": 0.2031, + "step": 31806 + }, + { + "epoch": 0.2761000338538728, + "grad_norm": 0.29296875, + "learning_rate": 0.0015719984724900029, + "loss": 0.124, + "step": 31807 + }, + { + "epoch": 0.27610871433407697, + "grad_norm": 0.09130859375, + "learning_rate": 0.001571973131115694, + "loss": 0.1064, + "step": 31808 + }, + { + "epoch": 0.2761173948142811, + "grad_norm": 0.1484375, + "learning_rate": 0.001571947789225242, + "loss": 0.1104, + "step": 31809 + }, + { + "epoch": 0.2761260752944853, + "grad_norm": 0.314453125, + "learning_rate": 0.0015719224468186753, + "loss": 0.1172, + "step": 31810 + }, + { + "epoch": 0.27613475577468943, + "grad_norm": 0.296875, + "learning_rate": 0.0015718971038960212, + "loss": 0.085, + "step": 31811 + }, + { + "epoch": 0.2761434362548936, + "grad_norm": 0.09716796875, + "learning_rate": 0.0015718717604573074, + "loss": 0.0864, + "step": 31812 + }, + { + "epoch": 0.27615211673509776, + "grad_norm": 0.314453125, + "learning_rate": 0.0015718464165025618, + "loss": 0.0908, + "step": 31813 + }, + { + "epoch": 0.27616079721530196, + "grad_norm": 0.271484375, + "learning_rate": 0.0015718210720318123, + "loss": 0.124, + "step": 31814 + }, + { + "epoch": 0.2761694776955061, + "grad_norm": 0.130859375, + "learning_rate": 0.0015717957270450858, + "loss": 0.0991, + "step": 31815 + }, + { + "epoch": 0.2761781581757103, + "grad_norm": 0.2021484375, + "learning_rate": 0.001571770381542411, + "loss": 0.1069, + "step": 31816 + }, + { + "epoch": 0.2761868386559144, + "grad_norm": 0.8359375, + "learning_rate": 0.0015717450355238153, + "loss": 0.1074, + "step": 31817 + }, + { + "epoch": 
0.2761955191361186, + "grad_norm": 0.1953125, + "learning_rate": 0.0015717196889893264, + "loss": 0.0908, + "step": 31818 + }, + { + "epoch": 0.27620419961632275, + "grad_norm": 0.25390625, + "learning_rate": 0.0015716943419389713, + "loss": 0.1377, + "step": 31819 + }, + { + "epoch": 0.27621288009652695, + "grad_norm": 0.322265625, + "learning_rate": 0.001571668994372779, + "loss": 0.1338, + "step": 31820 + }, + { + "epoch": 0.2762215605767311, + "grad_norm": 0.41015625, + "learning_rate": 0.0015716436462907765, + "loss": 0.1182, + "step": 31821 + }, + { + "epoch": 0.2762302410569353, + "grad_norm": 0.51171875, + "learning_rate": 0.0015716182976929912, + "loss": 0.0869, + "step": 31822 + }, + { + "epoch": 0.2762389215371394, + "grad_norm": 0.267578125, + "learning_rate": 0.0015715929485794514, + "loss": 0.1465, + "step": 31823 + }, + { + "epoch": 0.2762476020173436, + "grad_norm": 0.5703125, + "learning_rate": 0.0015715675989501844, + "loss": 0.0869, + "step": 31824 + }, + { + "epoch": 0.27625628249754774, + "grad_norm": 0.416015625, + "learning_rate": 0.0015715422488052186, + "loss": 0.1167, + "step": 31825 + }, + { + "epoch": 0.27626496297775194, + "grad_norm": 0.10009765625, + "learning_rate": 0.0015715168981445813, + "loss": 0.1123, + "step": 31826 + }, + { + "epoch": 0.2762736434579561, + "grad_norm": 0.35546875, + "learning_rate": 0.0015714915469683, + "loss": 0.0806, + "step": 31827 + }, + { + "epoch": 0.27628232393816027, + "grad_norm": 0.1279296875, + "learning_rate": 0.0015714661952764026, + "loss": 0.0884, + "step": 31828 + }, + { + "epoch": 0.2762910044183644, + "grad_norm": 0.1826171875, + "learning_rate": 0.0015714408430689169, + "loss": 0.0933, + "step": 31829 + }, + { + "epoch": 0.2762996848985686, + "grad_norm": 0.228515625, + "learning_rate": 0.0015714154903458707, + "loss": 0.1328, + "step": 31830 + }, + { + "epoch": 0.27630836537877274, + "grad_norm": 0.1171875, + "learning_rate": 0.0015713901371072914, + "loss": 0.0825, + "step": 31831 + }, + { + "epoch": 0.27631704585897693, + "grad_norm": 0.86328125, + "learning_rate": 0.0015713647833532067, + "loss": 0.127, + "step": 31832 + }, + { + "epoch": 0.27632572633918107, + "grad_norm": 0.1748046875, + "learning_rate": 0.0015713394290836449, + "loss": 0.0869, + "step": 31833 + }, + { + "epoch": 0.27633440681938526, + "grad_norm": 0.169921875, + "learning_rate": 0.0015713140742986333, + "loss": 0.1099, + "step": 31834 + }, + { + "epoch": 0.2763430872995894, + "grad_norm": 0.08740234375, + "learning_rate": 0.0015712887189982, + "loss": 0.0859, + "step": 31835 + }, + { + "epoch": 0.2763517677797936, + "grad_norm": 0.32421875, + "learning_rate": 0.001571263363182372, + "loss": 0.1191, + "step": 31836 + }, + { + "epoch": 0.2763604482599977, + "grad_norm": 0.322265625, + "learning_rate": 0.0015712380068511778, + "loss": 0.0791, + "step": 31837 + }, + { + "epoch": 0.2763691287402019, + "grad_norm": 0.234375, + "learning_rate": 0.001571212650004645, + "loss": 0.1045, + "step": 31838 + }, + { + "epoch": 0.27637780922040606, + "grad_norm": 0.42578125, + "learning_rate": 0.0015711872926428005, + "loss": 0.1182, + "step": 31839 + }, + { + "epoch": 0.27638648970061025, + "grad_norm": 0.1279296875, + "learning_rate": 0.001571161934765673, + "loss": 0.0889, + "step": 31840 + }, + { + "epoch": 0.2763951701808144, + "grad_norm": 0.2080078125, + "learning_rate": 0.0015711365763732897, + "loss": 0.1445, + "step": 31841 + }, + { + "epoch": 0.2764038506610186, + "grad_norm": 0.62890625, + "learning_rate": 0.0015711112174656787, + "loss": 
0.1035, + "step": 31842 + }, + { + "epoch": 0.2764125311412227, + "grad_norm": 0.2734375, + "learning_rate": 0.0015710858580428675, + "loss": 0.0977, + "step": 31843 + }, + { + "epoch": 0.2764212116214269, + "grad_norm": 0.5078125, + "learning_rate": 0.001571060498104884, + "loss": 0.1152, + "step": 31844 + }, + { + "epoch": 0.27642989210163105, + "grad_norm": 0.4296875, + "learning_rate": 0.0015710351376517556, + "loss": 0.1758, + "step": 31845 + }, + { + "epoch": 0.27643857258183524, + "grad_norm": 0.349609375, + "learning_rate": 0.0015710097766835102, + "loss": 0.1377, + "step": 31846 + }, + { + "epoch": 0.2764472530620394, + "grad_norm": 0.0908203125, + "learning_rate": 0.0015709844152001758, + "loss": 0.0889, + "step": 31847 + }, + { + "epoch": 0.27645593354224357, + "grad_norm": 0.326171875, + "learning_rate": 0.00157095905320178, + "loss": 0.062, + "step": 31848 + }, + { + "epoch": 0.2764646140224477, + "grad_norm": 0.1611328125, + "learning_rate": 0.0015709336906883503, + "loss": 0.0698, + "step": 31849 + }, + { + "epoch": 0.2764732945026519, + "grad_norm": 0.1298828125, + "learning_rate": 0.0015709083276599148, + "loss": 0.0835, + "step": 31850 + }, + { + "epoch": 0.27648197498285604, + "grad_norm": 0.2490234375, + "learning_rate": 0.0015708829641165004, + "loss": 0.062, + "step": 31851 + }, + { + "epoch": 0.27649065546306023, + "grad_norm": 0.376953125, + "learning_rate": 0.0015708576000581361, + "loss": 0.1582, + "step": 31852 + }, + { + "epoch": 0.27649933594326437, + "grad_norm": 0.1845703125, + "learning_rate": 0.0015708322354848485, + "loss": 0.1445, + "step": 31853 + }, + { + "epoch": 0.27650801642346856, + "grad_norm": 0.09912109375, + "learning_rate": 0.0015708068703966665, + "loss": 0.0781, + "step": 31854 + }, + { + "epoch": 0.2765166969036727, + "grad_norm": 0.1259765625, + "learning_rate": 0.0015707815047936167, + "loss": 0.0835, + "step": 31855 + }, + { + "epoch": 0.2765253773838769, + "grad_norm": 0.294921875, + "learning_rate": 0.0015707561386757273, + "loss": 0.1709, + "step": 31856 + }, + { + "epoch": 0.27653405786408103, + "grad_norm": 0.08203125, + "learning_rate": 0.0015707307720430263, + "loss": 0.0889, + "step": 31857 + }, + { + "epoch": 0.2765427383442852, + "grad_norm": 0.2080078125, + "learning_rate": 0.001570705404895541, + "loss": 0.1245, + "step": 31858 + }, + { + "epoch": 0.27655141882448936, + "grad_norm": 0.173828125, + "learning_rate": 0.0015706800372332996, + "loss": 0.1094, + "step": 31859 + }, + { + "epoch": 0.27656009930469355, + "grad_norm": 0.859375, + "learning_rate": 0.0015706546690563295, + "loss": 0.1279, + "step": 31860 + }, + { + "epoch": 0.2765687797848977, + "grad_norm": 0.33984375, + "learning_rate": 0.0015706293003646583, + "loss": 0.1191, + "step": 31861 + }, + { + "epoch": 0.2765774602651019, + "grad_norm": 0.11865234375, + "learning_rate": 0.001570603931158314, + "loss": 0.1094, + "step": 31862 + }, + { + "epoch": 0.276586140745306, + "grad_norm": 0.625, + "learning_rate": 0.0015705785614373246, + "loss": 0.1025, + "step": 31863 + }, + { + "epoch": 0.2765948212255102, + "grad_norm": 0.1650390625, + "learning_rate": 0.0015705531912017171, + "loss": 0.0869, + "step": 31864 + }, + { + "epoch": 0.27660350170571435, + "grad_norm": 0.1572265625, + "learning_rate": 0.00157052782045152, + "loss": 0.0698, + "step": 31865 + }, + { + "epoch": 0.27661218218591854, + "grad_norm": 0.36328125, + "learning_rate": 0.001570502449186761, + "loss": 0.1045, + "step": 31866 + }, + { + "epoch": 0.2766208626661227, + "grad_norm": 0.5390625, + 
"learning_rate": 0.0015704770774074672, + "loss": 0.0967, + "step": 31867 + }, + { + "epoch": 0.2766295431463269, + "grad_norm": 0.27734375, + "learning_rate": 0.001570451705113667, + "loss": 0.0972, + "step": 31868 + }, + { + "epoch": 0.276638223626531, + "grad_norm": 0.1728515625, + "learning_rate": 0.0015704263323053876, + "loss": 0.1011, + "step": 31869 + }, + { + "epoch": 0.2766469041067352, + "grad_norm": 0.1298828125, + "learning_rate": 0.0015704009589826574, + "loss": 0.1235, + "step": 31870 + }, + { + "epoch": 0.27665558458693934, + "grad_norm": 0.60546875, + "learning_rate": 0.0015703755851455032, + "loss": 0.1279, + "step": 31871 + }, + { + "epoch": 0.27666426506714353, + "grad_norm": 0.361328125, + "learning_rate": 0.0015703502107939538, + "loss": 0.1133, + "step": 31872 + }, + { + "epoch": 0.27667294554734767, + "grad_norm": 0.068359375, + "learning_rate": 0.0015703248359280365, + "loss": 0.0972, + "step": 31873 + }, + { + "epoch": 0.27668162602755186, + "grad_norm": 0.146484375, + "learning_rate": 0.0015702994605477788, + "loss": 0.084, + "step": 31874 + }, + { + "epoch": 0.276690306507756, + "grad_norm": 0.328125, + "learning_rate": 0.0015702740846532087, + "loss": 0.1172, + "step": 31875 + }, + { + "epoch": 0.2766989869879602, + "grad_norm": 0.234375, + "learning_rate": 0.001570248708244354, + "loss": 0.0947, + "step": 31876 + }, + { + "epoch": 0.27670766746816433, + "grad_norm": 0.1962890625, + "learning_rate": 0.0015702233313212423, + "loss": 0.1089, + "step": 31877 + }, + { + "epoch": 0.2767163479483685, + "grad_norm": 0.3671875, + "learning_rate": 0.0015701979538839017, + "loss": 0.1201, + "step": 31878 + }, + { + "epoch": 0.27672502842857266, + "grad_norm": 0.7890625, + "learning_rate": 0.0015701725759323592, + "loss": 0.1064, + "step": 31879 + }, + { + "epoch": 0.27673370890877685, + "grad_norm": 0.474609375, + "learning_rate": 0.0015701471974666433, + "loss": 0.1187, + "step": 31880 + }, + { + "epoch": 0.276742389388981, + "grad_norm": 0.2490234375, + "learning_rate": 0.0015701218184867817, + "loss": 0.0762, + "step": 31881 + }, + { + "epoch": 0.2767510698691852, + "grad_norm": 0.453125, + "learning_rate": 0.0015700964389928017, + "loss": 0.0991, + "step": 31882 + }, + { + "epoch": 0.2767597503493893, + "grad_norm": 0.30859375, + "learning_rate": 0.0015700710589847315, + "loss": 0.1475, + "step": 31883 + }, + { + "epoch": 0.2767684308295935, + "grad_norm": 0.1123046875, + "learning_rate": 0.0015700456784625983, + "loss": 0.1118, + "step": 31884 + }, + { + "epoch": 0.27677711130979765, + "grad_norm": 0.33984375, + "learning_rate": 0.0015700202974264304, + "loss": 0.1289, + "step": 31885 + }, + { + "epoch": 0.27678579179000184, + "grad_norm": 0.2490234375, + "learning_rate": 0.0015699949158762556, + "loss": 0.0854, + "step": 31886 + }, + { + "epoch": 0.276794472270206, + "grad_norm": 0.375, + "learning_rate": 0.0015699695338121011, + "loss": 0.1025, + "step": 31887 + }, + { + "epoch": 0.2768031527504102, + "grad_norm": 0.236328125, + "learning_rate": 0.001569944151233995, + "loss": 0.1094, + "step": 31888 + }, + { + "epoch": 0.2768118332306143, + "grad_norm": 0.4453125, + "learning_rate": 0.0015699187681419652, + "loss": 0.0996, + "step": 31889 + }, + { + "epoch": 0.2768205137108185, + "grad_norm": 0.21484375, + "learning_rate": 0.001569893384536039, + "loss": 0.1045, + "step": 31890 + }, + { + "epoch": 0.27682919419102264, + "grad_norm": 0.43359375, + "learning_rate": 0.0015698680004162446, + "loss": 0.1592, + "step": 31891 + }, + { + "epoch": 0.27683787467122684, + 
"grad_norm": 0.14453125, + "learning_rate": 0.0015698426157826095, + "loss": 0.0894, + "step": 31892 + }, + { + "epoch": 0.276846555151431, + "grad_norm": 0.2021484375, + "learning_rate": 0.0015698172306351618, + "loss": 0.1289, + "step": 31893 + }, + { + "epoch": 0.27685523563163517, + "grad_norm": 0.32421875, + "learning_rate": 0.0015697918449739292, + "loss": 0.106, + "step": 31894 + }, + { + "epoch": 0.2768639161118393, + "grad_norm": 0.220703125, + "learning_rate": 0.0015697664587989392, + "loss": 0.0801, + "step": 31895 + }, + { + "epoch": 0.2768725965920435, + "grad_norm": 0.1552734375, + "learning_rate": 0.0015697410721102194, + "loss": 0.1328, + "step": 31896 + }, + { + "epoch": 0.27688127707224763, + "grad_norm": 0.12353515625, + "learning_rate": 0.001569715684907798, + "loss": 0.126, + "step": 31897 + }, + { + "epoch": 0.2768899575524518, + "grad_norm": 0.1884765625, + "learning_rate": 0.0015696902971917026, + "loss": 0.1152, + "step": 31898 + }, + { + "epoch": 0.27689863803265596, + "grad_norm": 0.51171875, + "learning_rate": 0.0015696649089619605, + "loss": 0.0737, + "step": 31899 + }, + { + "epoch": 0.27690731851286016, + "grad_norm": 0.10009765625, + "learning_rate": 0.0015696395202186007, + "loss": 0.0884, + "step": 31900 + }, + { + "epoch": 0.2769159989930643, + "grad_norm": 0.60546875, + "learning_rate": 0.0015696141309616497, + "loss": 0.1045, + "step": 31901 + }, + { + "epoch": 0.2769246794732685, + "grad_norm": 0.734375, + "learning_rate": 0.0015695887411911358, + "loss": 0.1309, + "step": 31902 + }, + { + "epoch": 0.2769333599534726, + "grad_norm": 0.263671875, + "learning_rate": 0.0015695633509070865, + "loss": 0.1465, + "step": 31903 + }, + { + "epoch": 0.2769420404336768, + "grad_norm": 0.115234375, + "learning_rate": 0.00156953796010953, + "loss": 0.1201, + "step": 31904 + }, + { + "epoch": 0.27695072091388095, + "grad_norm": 0.251953125, + "learning_rate": 0.0015695125687984939, + "loss": 0.0698, + "step": 31905 + }, + { + "epoch": 0.27695940139408515, + "grad_norm": 0.1953125, + "learning_rate": 0.001569487176974006, + "loss": 0.064, + "step": 31906 + }, + { + "epoch": 0.2769680818742893, + "grad_norm": 0.27734375, + "learning_rate": 0.0015694617846360936, + "loss": 0.1211, + "step": 31907 + }, + { + "epoch": 0.2769767623544935, + "grad_norm": 0.53125, + "learning_rate": 0.0015694363917847854, + "loss": 0.1289, + "step": 31908 + }, + { + "epoch": 0.2769854428346976, + "grad_norm": 0.47265625, + "learning_rate": 0.0015694109984201083, + "loss": 0.1011, + "step": 31909 + }, + { + "epoch": 0.2769941233149018, + "grad_norm": 0.453125, + "learning_rate": 0.00156938560454209, + "loss": 0.1289, + "step": 31910 + }, + { + "epoch": 0.27700280379510595, + "grad_norm": 0.29296875, + "learning_rate": 0.001569360210150759, + "loss": 0.0889, + "step": 31911 + }, + { + "epoch": 0.2770114842753101, + "grad_norm": 0.255859375, + "learning_rate": 0.001569334815246143, + "loss": 0.1191, + "step": 31912 + }, + { + "epoch": 0.2770201647555143, + "grad_norm": 0.8203125, + "learning_rate": 0.0015693094198282694, + "loss": 0.2432, + "step": 31913 + }, + { + "epoch": 0.2770288452357184, + "grad_norm": 0.248046875, + "learning_rate": 0.0015692840238971657, + "loss": 0.0938, + "step": 31914 + }, + { + "epoch": 0.2770375257159226, + "grad_norm": 0.66796875, + "learning_rate": 0.0015692586274528603, + "loss": 0.1133, + "step": 31915 + }, + { + "epoch": 0.27704620619612674, + "grad_norm": 0.291015625, + "learning_rate": 0.001569233230495381, + "loss": 0.0747, + "step": 31916 + }, + { + 
"epoch": 0.27705488667633094, + "grad_norm": 0.10205078125, + "learning_rate": 0.0015692078330247548, + "loss": 0.1006, + "step": 31917 + }, + { + "epoch": 0.2770635671565351, + "grad_norm": 0.1298828125, + "learning_rate": 0.00156918243504101, + "loss": 0.1328, + "step": 31918 + }, + { + "epoch": 0.27707224763673927, + "grad_norm": 0.130859375, + "learning_rate": 0.0015691570365441744, + "loss": 0.0962, + "step": 31919 + }, + { + "epoch": 0.2770809281169434, + "grad_norm": 0.361328125, + "learning_rate": 0.0015691316375342759, + "loss": 0.0669, + "step": 31920 + }, + { + "epoch": 0.2770896085971476, + "grad_norm": 0.369140625, + "learning_rate": 0.001569106238011342, + "loss": 0.1074, + "step": 31921 + }, + { + "epoch": 0.27709828907735173, + "grad_norm": 0.1416015625, + "learning_rate": 0.0015690808379754007, + "loss": 0.1162, + "step": 31922 + }, + { + "epoch": 0.2771069695575559, + "grad_norm": 0.1025390625, + "learning_rate": 0.0015690554374264791, + "loss": 0.1191, + "step": 31923 + }, + { + "epoch": 0.27711565003776006, + "grad_norm": 0.08203125, + "learning_rate": 0.0015690300363646061, + "loss": 0.1011, + "step": 31924 + }, + { + "epoch": 0.27712433051796426, + "grad_norm": 0.2373046875, + "learning_rate": 0.0015690046347898088, + "loss": 0.0913, + "step": 31925 + }, + { + "epoch": 0.2771330109981684, + "grad_norm": 0.330078125, + "learning_rate": 0.001568979232702115, + "loss": 0.1309, + "step": 31926 + }, + { + "epoch": 0.2771416914783726, + "grad_norm": 0.12890625, + "learning_rate": 0.0015689538301015527, + "loss": 0.084, + "step": 31927 + }, + { + "epoch": 0.2771503719585767, + "grad_norm": 0.240234375, + "learning_rate": 0.0015689284269881489, + "loss": 0.1133, + "step": 31928 + }, + { + "epoch": 0.2771590524387809, + "grad_norm": 0.1494140625, + "learning_rate": 0.0015689030233619328, + "loss": 0.0928, + "step": 31929 + }, + { + "epoch": 0.27716773291898505, + "grad_norm": 0.140625, + "learning_rate": 0.001568877619222931, + "loss": 0.1396, + "step": 31930 + }, + { + "epoch": 0.27717641339918925, + "grad_norm": 0.279296875, + "learning_rate": 0.0015688522145711717, + "loss": 0.0928, + "step": 31931 + }, + { + "epoch": 0.2771850938793934, + "grad_norm": 0.27734375, + "learning_rate": 0.0015688268094066827, + "loss": 0.0869, + "step": 31932 + }, + { + "epoch": 0.2771937743595976, + "grad_norm": 0.0947265625, + "learning_rate": 0.001568801403729492, + "loss": 0.1279, + "step": 31933 + }, + { + "epoch": 0.2772024548398017, + "grad_norm": 0.322265625, + "learning_rate": 0.0015687759975396268, + "loss": 0.0713, + "step": 31934 + }, + { + "epoch": 0.2772111353200059, + "grad_norm": 0.6484375, + "learning_rate": 0.0015687505908371154, + "loss": 0.1582, + "step": 31935 + }, + { + "epoch": 0.27721981580021005, + "grad_norm": 0.353515625, + "learning_rate": 0.0015687251836219852, + "loss": 0.1465, + "step": 31936 + }, + { + "epoch": 0.27722849628041424, + "grad_norm": 0.1220703125, + "learning_rate": 0.0015686997758942642, + "loss": 0.1079, + "step": 31937 + }, + { + "epoch": 0.2772371767606184, + "grad_norm": 0.185546875, + "learning_rate": 0.0015686743676539805, + "loss": 0.1016, + "step": 31938 + }, + { + "epoch": 0.27724585724082257, + "grad_norm": 0.1826171875, + "learning_rate": 0.0015686489589011611, + "loss": 0.1211, + "step": 31939 + }, + { + "epoch": 0.2772545377210267, + "grad_norm": 0.1142578125, + "learning_rate": 0.0015686235496358346, + "loss": 0.1006, + "step": 31940 + }, + { + "epoch": 0.2772632182012309, + "grad_norm": 0.25, + "learning_rate": 0.0015685981398580282, 
+ "loss": 0.0996, + "step": 31941 + }, + { + "epoch": 0.27727189868143504, + "grad_norm": 0.310546875, + "learning_rate": 0.00156857272956777, + "loss": 0.1138, + "step": 31942 + }, + { + "epoch": 0.27728057916163923, + "grad_norm": 0.310546875, + "learning_rate": 0.001568547318765088, + "loss": 0.1357, + "step": 31943 + }, + { + "epoch": 0.27728925964184337, + "grad_norm": 0.2373046875, + "learning_rate": 0.0015685219074500093, + "loss": 0.0898, + "step": 31944 + }, + { + "epoch": 0.27729794012204756, + "grad_norm": 0.1220703125, + "learning_rate": 0.0015684964956225623, + "loss": 0.1543, + "step": 31945 + }, + { + "epoch": 0.2773066206022517, + "grad_norm": 0.189453125, + "learning_rate": 0.0015684710832827744, + "loss": 0.0791, + "step": 31946 + }, + { + "epoch": 0.2773153010824559, + "grad_norm": 0.1240234375, + "learning_rate": 0.0015684456704306736, + "loss": 0.1099, + "step": 31947 + }, + { + "epoch": 0.27732398156266, + "grad_norm": 0.1591796875, + "learning_rate": 0.0015684202570662876, + "loss": 0.1152, + "step": 31948 + }, + { + "epoch": 0.2773326620428642, + "grad_norm": 0.1416015625, + "learning_rate": 0.0015683948431896445, + "loss": 0.0947, + "step": 31949 + }, + { + "epoch": 0.27734134252306836, + "grad_norm": 0.1533203125, + "learning_rate": 0.0015683694288007712, + "loss": 0.0962, + "step": 31950 + }, + { + "epoch": 0.27735002300327255, + "grad_norm": 0.8828125, + "learning_rate": 0.0015683440138996969, + "loss": 0.1226, + "step": 31951 + }, + { + "epoch": 0.2773587034834767, + "grad_norm": 0.412109375, + "learning_rate": 0.0015683185984864483, + "loss": 0.1582, + "step": 31952 + }, + { + "epoch": 0.2773673839636809, + "grad_norm": 0.140625, + "learning_rate": 0.0015682931825610534, + "loss": 0.1436, + "step": 31953 + }, + { + "epoch": 0.277376064443885, + "grad_norm": 0.37890625, + "learning_rate": 0.0015682677661235404, + "loss": 0.1084, + "step": 31954 + }, + { + "epoch": 0.2773847449240892, + "grad_norm": 0.072265625, + "learning_rate": 0.0015682423491739366, + "loss": 0.0986, + "step": 31955 + }, + { + "epoch": 0.27739342540429335, + "grad_norm": 0.27734375, + "learning_rate": 0.0015682169317122702, + "loss": 0.1094, + "step": 31956 + }, + { + "epoch": 0.27740210588449754, + "grad_norm": 0.53515625, + "learning_rate": 0.0015681915137385682, + "loss": 0.1177, + "step": 31957 + }, + { + "epoch": 0.2774107863647017, + "grad_norm": 0.1259765625, + "learning_rate": 0.0015681660952528596, + "loss": 0.0859, + "step": 31958 + }, + { + "epoch": 0.27741946684490587, + "grad_norm": 0.296875, + "learning_rate": 0.0015681406762551714, + "loss": 0.1191, + "step": 31959 + }, + { + "epoch": 0.27742814732511, + "grad_norm": 0.212890625, + "learning_rate": 0.0015681152567455317, + "loss": 0.0913, + "step": 31960 + }, + { + "epoch": 0.2774368278053142, + "grad_norm": 0.31640625, + "learning_rate": 0.0015680898367239683, + "loss": 0.0918, + "step": 31961 + }, + { + "epoch": 0.27744550828551834, + "grad_norm": 0.166015625, + "learning_rate": 0.0015680644161905084, + "loss": 0.0737, + "step": 31962 + }, + { + "epoch": 0.27745418876572253, + "grad_norm": 0.2578125, + "learning_rate": 0.0015680389951451808, + "loss": 0.1143, + "step": 31963 + }, + { + "epoch": 0.27746286924592667, + "grad_norm": 0.42578125, + "learning_rate": 0.0015680135735880124, + "loss": 0.1201, + "step": 31964 + }, + { + "epoch": 0.27747154972613086, + "grad_norm": 0.490234375, + "learning_rate": 0.0015679881515190317, + "loss": 0.1006, + "step": 31965 + }, + { + "epoch": 0.277480230206335, + "grad_norm": 0.58203125, + 
"learning_rate": 0.0015679627289382659, + "loss": 0.1074, + "step": 31966 + }, + { + "epoch": 0.2774889106865392, + "grad_norm": 0.1611328125, + "learning_rate": 0.0015679373058457432, + "loss": 0.124, + "step": 31967 + }, + { + "epoch": 0.27749759116674333, + "grad_norm": 0.25390625, + "learning_rate": 0.0015679118822414912, + "loss": 0.1045, + "step": 31968 + }, + { + "epoch": 0.2775062716469475, + "grad_norm": 0.57421875, + "learning_rate": 0.001567886458125538, + "loss": 0.1104, + "step": 31969 + }, + { + "epoch": 0.27751495212715166, + "grad_norm": 0.296875, + "learning_rate": 0.001567861033497911, + "loss": 0.0928, + "step": 31970 + }, + { + "epoch": 0.27752363260735585, + "grad_norm": 0.318359375, + "learning_rate": 0.0015678356083586385, + "loss": 0.0898, + "step": 31971 + }, + { + "epoch": 0.27753231308756, + "grad_norm": 0.69140625, + "learning_rate": 0.001567810182707748, + "loss": 0.1152, + "step": 31972 + }, + { + "epoch": 0.2775409935677642, + "grad_norm": 0.140625, + "learning_rate": 0.001567784756545267, + "loss": 0.0991, + "step": 31973 + }, + { + "epoch": 0.2775496740479683, + "grad_norm": 0.1357421875, + "learning_rate": 0.0015677593298712238, + "loss": 0.1465, + "step": 31974 + }, + { + "epoch": 0.2775583545281725, + "grad_norm": 0.1962890625, + "learning_rate": 0.001567733902685646, + "loss": 0.1045, + "step": 31975 + }, + { + "epoch": 0.27756703500837665, + "grad_norm": 0.326171875, + "learning_rate": 0.0015677084749885614, + "loss": 0.1035, + "step": 31976 + }, + { + "epoch": 0.27757571548858084, + "grad_norm": 0.11767578125, + "learning_rate": 0.001567683046779998, + "loss": 0.1045, + "step": 31977 + }, + { + "epoch": 0.277584395968785, + "grad_norm": 0.32421875, + "learning_rate": 0.0015676576180599833, + "loss": 0.0918, + "step": 31978 + }, + { + "epoch": 0.2775930764489892, + "grad_norm": 0.244140625, + "learning_rate": 0.0015676321888285453, + "loss": 0.168, + "step": 31979 + }, + { + "epoch": 0.2776017569291933, + "grad_norm": 0.3203125, + "learning_rate": 0.0015676067590857116, + "loss": 0.0742, + "step": 31980 + }, + { + "epoch": 0.2776104374093975, + "grad_norm": 0.1083984375, + "learning_rate": 0.0015675813288315106, + "loss": 0.1074, + "step": 31981 + }, + { + "epoch": 0.27761911788960164, + "grad_norm": 0.3984375, + "learning_rate": 0.0015675558980659692, + "loss": 0.0986, + "step": 31982 + }, + { + "epoch": 0.27762779836980583, + "grad_norm": 0.21484375, + "learning_rate": 0.001567530466789116, + "loss": 0.0938, + "step": 31983 + }, + { + "epoch": 0.27763647885000997, + "grad_norm": 0.34375, + "learning_rate": 0.001567505035000978, + "loss": 0.0938, + "step": 31984 + }, + { + "epoch": 0.27764515933021416, + "grad_norm": 0.71875, + "learning_rate": 0.001567479602701584, + "loss": 0.2734, + "step": 31985 + }, + { + "epoch": 0.2776538398104183, + "grad_norm": 0.1611328125, + "learning_rate": 0.0015674541698909614, + "loss": 0.0791, + "step": 31986 + }, + { + "epoch": 0.2776625202906225, + "grad_norm": 0.166015625, + "learning_rate": 0.0015674287365691378, + "loss": 0.104, + "step": 31987 + }, + { + "epoch": 0.27767120077082663, + "grad_norm": 0.212890625, + "learning_rate": 0.0015674033027361413, + "loss": 0.0859, + "step": 31988 + }, + { + "epoch": 0.2776798812510308, + "grad_norm": 0.1875, + "learning_rate": 0.001567377868391999, + "loss": 0.0947, + "step": 31989 + }, + { + "epoch": 0.27768856173123496, + "grad_norm": 1.2578125, + "learning_rate": 0.0015673524335367399, + "loss": 0.1211, + "step": 31990 + }, + { + "epoch": 0.27769724221143915, + 
"grad_norm": 0.0966796875, + "learning_rate": 0.0015673269981703914, + "loss": 0.1523, + "step": 31991 + }, + { + "epoch": 0.2777059226916433, + "grad_norm": 0.421875, + "learning_rate": 0.0015673015622929803, + "loss": 0.1113, + "step": 31992 + }, + { + "epoch": 0.2777146031718475, + "grad_norm": 0.67578125, + "learning_rate": 0.0015672761259045355, + "loss": 0.1797, + "step": 31993 + }, + { + "epoch": 0.2777232836520516, + "grad_norm": 0.51953125, + "learning_rate": 0.0015672506890050846, + "loss": 0.0908, + "step": 31994 + }, + { + "epoch": 0.2777319641322558, + "grad_norm": 0.60546875, + "learning_rate": 0.0015672252515946557, + "loss": 0.1582, + "step": 31995 + }, + { + "epoch": 0.27774064461245995, + "grad_norm": 0.2734375, + "learning_rate": 0.001567199813673276, + "loss": 0.0942, + "step": 31996 + }, + { + "epoch": 0.27774932509266415, + "grad_norm": 1.203125, + "learning_rate": 0.0015671743752409737, + "loss": 0.3672, + "step": 31997 + }, + { + "epoch": 0.2777580055728683, + "grad_norm": 0.201171875, + "learning_rate": 0.0015671489362977767, + "loss": 0.0854, + "step": 31998 + }, + { + "epoch": 0.2777666860530725, + "grad_norm": 0.1953125, + "learning_rate": 0.0015671234968437121, + "loss": 0.0967, + "step": 31999 + }, + { + "epoch": 0.2777753665332766, + "grad_norm": 0.087890625, + "learning_rate": 0.0015670980568788086, + "loss": 0.1011, + "step": 32000 + }, + { + "epoch": 0.2777840470134808, + "grad_norm": 0.27734375, + "learning_rate": 0.0015670726164030938, + "loss": 0.0918, + "step": 32001 + }, + { + "epoch": 0.27779272749368494, + "grad_norm": 0.63671875, + "learning_rate": 0.0015670471754165951, + "loss": 0.103, + "step": 32002 + }, + { + "epoch": 0.27780140797388914, + "grad_norm": 0.14453125, + "learning_rate": 0.001567021733919341, + "loss": 0.1162, + "step": 32003 + }, + { + "epoch": 0.2778100884540933, + "grad_norm": 0.22265625, + "learning_rate": 0.0015669962919113586, + "loss": 0.103, + "step": 32004 + }, + { + "epoch": 0.27781876893429747, + "grad_norm": 0.08837890625, + "learning_rate": 0.001566970849392676, + "loss": 0.0898, + "step": 32005 + }, + { + "epoch": 0.2778274494145016, + "grad_norm": 0.1123046875, + "learning_rate": 0.0015669454063633212, + "loss": 0.1211, + "step": 32006 + }, + { + "epoch": 0.2778361298947058, + "grad_norm": 0.099609375, + "learning_rate": 0.0015669199628233223, + "loss": 0.0757, + "step": 32007 + }, + { + "epoch": 0.27784481037490993, + "grad_norm": 0.515625, + "learning_rate": 0.0015668945187727063, + "loss": 0.1279, + "step": 32008 + }, + { + "epoch": 0.2778534908551141, + "grad_norm": 0.7265625, + "learning_rate": 0.0015668690742115018, + "loss": 0.1299, + "step": 32009 + }, + { + "epoch": 0.27786217133531826, + "grad_norm": 0.279296875, + "learning_rate": 0.001566843629139736, + "loss": 0.1064, + "step": 32010 + }, + { + "epoch": 0.27787085181552246, + "grad_norm": 0.2890625, + "learning_rate": 0.0015668181835574372, + "loss": 0.0864, + "step": 32011 + }, + { + "epoch": 0.2778795322957266, + "grad_norm": 0.1279296875, + "learning_rate": 0.0015667927374646332, + "loss": 0.0952, + "step": 32012 + }, + { + "epoch": 0.2778882127759308, + "grad_norm": 0.37109375, + "learning_rate": 0.0015667672908613512, + "loss": 0.0928, + "step": 32013 + }, + { + "epoch": 0.2778968932561349, + "grad_norm": 0.2294921875, + "learning_rate": 0.00156674184374762, + "loss": 0.1377, + "step": 32014 + }, + { + "epoch": 0.2779055737363391, + "grad_norm": 0.470703125, + "learning_rate": 0.0015667163961234668, + "loss": 0.0957, + "step": 32015 + }, + { + 
"epoch": 0.27791425421654326, + "grad_norm": 1.078125, + "learning_rate": 0.0015666909479889193, + "loss": 0.1465, + "step": 32016 + }, + { + "epoch": 0.27792293469674745, + "grad_norm": 0.8359375, + "learning_rate": 0.0015666654993440059, + "loss": 0.4688, + "step": 32017 + }, + { + "epoch": 0.2779316151769516, + "grad_norm": 0.123046875, + "learning_rate": 0.001566640050188754, + "loss": 0.0928, + "step": 32018 + }, + { + "epoch": 0.2779402956571558, + "grad_norm": 0.11279296875, + "learning_rate": 0.0015666146005231913, + "loss": 0.082, + "step": 32019 + }, + { + "epoch": 0.2779489761373599, + "grad_norm": 0.283203125, + "learning_rate": 0.0015665891503473463, + "loss": 0.1118, + "step": 32020 + }, + { + "epoch": 0.2779576566175641, + "grad_norm": 0.392578125, + "learning_rate": 0.0015665636996612465, + "loss": 0.1113, + "step": 32021 + }, + { + "epoch": 0.27796633709776825, + "grad_norm": 0.240234375, + "learning_rate": 0.0015665382484649191, + "loss": 0.1621, + "step": 32022 + }, + { + "epoch": 0.27797501757797244, + "grad_norm": 0.287109375, + "learning_rate": 0.0015665127967583927, + "loss": 0.0928, + "step": 32023 + }, + { + "epoch": 0.2779836980581766, + "grad_norm": 0.470703125, + "learning_rate": 0.001566487344541695, + "loss": 0.0898, + "step": 32024 + }, + { + "epoch": 0.27799237853838077, + "grad_norm": 1.28125, + "learning_rate": 0.0015664618918148538, + "loss": 0.1816, + "step": 32025 + }, + { + "epoch": 0.2780010590185849, + "grad_norm": 0.1787109375, + "learning_rate": 0.0015664364385778967, + "loss": 0.0723, + "step": 32026 + }, + { + "epoch": 0.2780097394987891, + "grad_norm": 0.5703125, + "learning_rate": 0.001566410984830852, + "loss": 0.209, + "step": 32027 + }, + { + "epoch": 0.27801841997899324, + "grad_norm": 0.0703125, + "learning_rate": 0.001566385530573747, + "loss": 0.0835, + "step": 32028 + }, + { + "epoch": 0.27802710045919743, + "grad_norm": 0.1748046875, + "learning_rate": 0.0015663600758066098, + "loss": 0.1055, + "step": 32029 + }, + { + "epoch": 0.27803578093940157, + "grad_norm": 0.11279296875, + "learning_rate": 0.0015663346205294687, + "loss": 0.0996, + "step": 32030 + }, + { + "epoch": 0.27804446141960576, + "grad_norm": 0.06884765625, + "learning_rate": 0.0015663091647423505, + "loss": 0.0747, + "step": 32031 + }, + { + "epoch": 0.2780531418998099, + "grad_norm": 0.1279296875, + "learning_rate": 0.0015662837084452837, + "loss": 0.0869, + "step": 32032 + }, + { + "epoch": 0.2780618223800141, + "grad_norm": 0.625, + "learning_rate": 0.001566258251638296, + "loss": 0.123, + "step": 32033 + }, + { + "epoch": 0.2780705028602182, + "grad_norm": 0.390625, + "learning_rate": 0.0015662327943214154, + "loss": 0.1221, + "step": 32034 + }, + { + "epoch": 0.27807918334042236, + "grad_norm": 0.251953125, + "learning_rate": 0.0015662073364946696, + "loss": 0.1426, + "step": 32035 + }, + { + "epoch": 0.27808786382062656, + "grad_norm": 0.5234375, + "learning_rate": 0.0015661818781580867, + "loss": 0.0977, + "step": 32036 + }, + { + "epoch": 0.2780965443008307, + "grad_norm": 0.1416015625, + "learning_rate": 0.001566156419311694, + "loss": 0.1113, + "step": 32037 + }, + { + "epoch": 0.2781052247810349, + "grad_norm": 0.62890625, + "learning_rate": 0.0015661309599555198, + "loss": 0.1133, + "step": 32038 + }, + { + "epoch": 0.278113905261239, + "grad_norm": 0.373046875, + "learning_rate": 0.0015661055000895918, + "loss": 0.1201, + "step": 32039 + }, + { + "epoch": 0.2781225857414432, + "grad_norm": 0.162109375, + "learning_rate": 0.0015660800397139377, + "loss": 
0.0786, + "step": 32040 + }, + { + "epoch": 0.27813126622164736, + "grad_norm": 0.125, + "learning_rate": 0.0015660545788285856, + "loss": 0.1147, + "step": 32041 + }, + { + "epoch": 0.27813994670185155, + "grad_norm": 0.84375, + "learning_rate": 0.001566029117433563, + "loss": 0.1128, + "step": 32042 + }, + { + "epoch": 0.2781486271820557, + "grad_norm": 0.11962890625, + "learning_rate": 0.0015660036555288981, + "loss": 0.168, + "step": 32043 + }, + { + "epoch": 0.2781573076622599, + "grad_norm": 0.5234375, + "learning_rate": 0.0015659781931146185, + "loss": 0.0981, + "step": 32044 + }, + { + "epoch": 0.278165988142464, + "grad_norm": 0.1435546875, + "learning_rate": 0.0015659527301907523, + "loss": 0.1387, + "step": 32045 + }, + { + "epoch": 0.2781746686226682, + "grad_norm": 0.404296875, + "learning_rate": 0.001565927266757327, + "loss": 0.082, + "step": 32046 + }, + { + "epoch": 0.27818334910287235, + "grad_norm": 0.375, + "learning_rate": 0.0015659018028143707, + "loss": 0.1006, + "step": 32047 + }, + { + "epoch": 0.27819202958307654, + "grad_norm": 0.6640625, + "learning_rate": 0.0015658763383619115, + "loss": 0.1025, + "step": 32048 + }, + { + "epoch": 0.2782007100632807, + "grad_norm": 0.294921875, + "learning_rate": 0.0015658508733999768, + "loss": 0.0854, + "step": 32049 + }, + { + "epoch": 0.27820939054348487, + "grad_norm": 0.091796875, + "learning_rate": 0.0015658254079285946, + "loss": 0.0991, + "step": 32050 + }, + { + "epoch": 0.278218071023689, + "grad_norm": 0.4375, + "learning_rate": 0.0015657999419477926, + "loss": 0.1133, + "step": 32051 + }, + { + "epoch": 0.2782267515038932, + "grad_norm": 0.375, + "learning_rate": 0.0015657744754575987, + "loss": 0.1348, + "step": 32052 + }, + { + "epoch": 0.27823543198409734, + "grad_norm": 0.322265625, + "learning_rate": 0.0015657490084580409, + "loss": 0.1279, + "step": 32053 + }, + { + "epoch": 0.27824411246430153, + "grad_norm": 0.2431640625, + "learning_rate": 0.001565723540949147, + "loss": 0.1309, + "step": 32054 + }, + { + "epoch": 0.27825279294450567, + "grad_norm": 0.10546875, + "learning_rate": 0.0015656980729309448, + "loss": 0.1084, + "step": 32055 + }, + { + "epoch": 0.27826147342470986, + "grad_norm": 0.32421875, + "learning_rate": 0.0015656726044034623, + "loss": 0.0889, + "step": 32056 + }, + { + "epoch": 0.278270153904914, + "grad_norm": 0.310546875, + "learning_rate": 0.0015656471353667272, + "loss": 0.0669, + "step": 32057 + }, + { + "epoch": 0.2782788343851182, + "grad_norm": 0.15234375, + "learning_rate": 0.0015656216658207673, + "loss": 0.0879, + "step": 32058 + }, + { + "epoch": 0.2782875148653223, + "grad_norm": 0.392578125, + "learning_rate": 0.0015655961957656108, + "loss": 0.1133, + "step": 32059 + }, + { + "epoch": 0.2782961953455265, + "grad_norm": 0.119140625, + "learning_rate": 0.0015655707252012852, + "loss": 0.1147, + "step": 32060 + }, + { + "epoch": 0.27830487582573066, + "grad_norm": 0.271484375, + "learning_rate": 0.0015655452541278182, + "loss": 0.1172, + "step": 32061 + }, + { + "epoch": 0.27831355630593485, + "grad_norm": 0.5234375, + "learning_rate": 0.001565519782545238, + "loss": 0.1123, + "step": 32062 + }, + { + "epoch": 0.278322236786139, + "grad_norm": 0.10400390625, + "learning_rate": 0.001565494310453572, + "loss": 0.0981, + "step": 32063 + }, + { + "epoch": 0.2783309172663432, + "grad_norm": 0.1982421875, + "learning_rate": 0.0015654688378528494, + "loss": 0.0869, + "step": 32064 + }, + { + "epoch": 0.2783395977465473, + "grad_norm": 0.302734375, + "learning_rate": 
0.001565443364743096, + "loss": 0.1016, + "step": 32065 + }, + { + "epoch": 0.2783482782267515, + "grad_norm": 0.1103515625, + "learning_rate": 0.0015654178911243416, + "loss": 0.1465, + "step": 32066 + }, + { + "epoch": 0.27835695870695565, + "grad_norm": 0.055419921875, + "learning_rate": 0.0015653924169966127, + "loss": 0.0625, + "step": 32067 + }, + { + "epoch": 0.27836563918715984, + "grad_norm": 0.244140625, + "learning_rate": 0.001565366942359938, + "loss": 0.1084, + "step": 32068 + }, + { + "epoch": 0.278374319667364, + "grad_norm": 0.375, + "learning_rate": 0.0015653414672143447, + "loss": 0.1221, + "step": 32069 + }, + { + "epoch": 0.27838300014756817, + "grad_norm": 0.609375, + "learning_rate": 0.0015653159915598612, + "loss": 0.0928, + "step": 32070 + }, + { + "epoch": 0.2783916806277723, + "grad_norm": 0.1396484375, + "learning_rate": 0.0015652905153965147, + "loss": 0.1133, + "step": 32071 + }, + { + "epoch": 0.2784003611079765, + "grad_norm": 0.52734375, + "learning_rate": 0.0015652650387243338, + "loss": 0.1152, + "step": 32072 + }, + { + "epoch": 0.27840904158818064, + "grad_norm": 0.474609375, + "learning_rate": 0.0015652395615433458, + "loss": 0.1187, + "step": 32073 + }, + { + "epoch": 0.27841772206838483, + "grad_norm": 0.142578125, + "learning_rate": 0.0015652140838535791, + "loss": 0.1914, + "step": 32074 + }, + { + "epoch": 0.27842640254858897, + "grad_norm": 0.306640625, + "learning_rate": 0.001565188605655061, + "loss": 0.103, + "step": 32075 + }, + { + "epoch": 0.27843508302879316, + "grad_norm": 0.1484375, + "learning_rate": 0.0015651631269478202, + "loss": 0.1094, + "step": 32076 + }, + { + "epoch": 0.2784437635089973, + "grad_norm": 0.271484375, + "learning_rate": 0.0015651376477318834, + "loss": 0.1104, + "step": 32077 + }, + { + "epoch": 0.2784524439892015, + "grad_norm": 0.24609375, + "learning_rate": 0.0015651121680072793, + "loss": 0.0967, + "step": 32078 + }, + { + "epoch": 0.27846112446940563, + "grad_norm": 0.44921875, + "learning_rate": 0.0015650866877740355, + "loss": 0.0811, + "step": 32079 + }, + { + "epoch": 0.2784698049496098, + "grad_norm": 0.421875, + "learning_rate": 0.0015650612070321798, + "loss": 0.1348, + "step": 32080 + }, + { + "epoch": 0.27847848542981396, + "grad_norm": 0.4921875, + "learning_rate": 0.00156503572578174, + "loss": 0.1147, + "step": 32081 + }, + { + "epoch": 0.27848716591001815, + "grad_norm": 0.287109375, + "learning_rate": 0.0015650102440227444, + "loss": 0.1787, + "step": 32082 + }, + { + "epoch": 0.2784958463902223, + "grad_norm": 0.130859375, + "learning_rate": 0.0015649847617552205, + "loss": 0.1152, + "step": 32083 + }, + { + "epoch": 0.2785045268704265, + "grad_norm": 0.09423828125, + "learning_rate": 0.0015649592789791962, + "loss": 0.0928, + "step": 32084 + }, + { + "epoch": 0.2785132073506306, + "grad_norm": 0.392578125, + "learning_rate": 0.0015649337956946997, + "loss": 0.1045, + "step": 32085 + }, + { + "epoch": 0.2785218878308348, + "grad_norm": 0.490234375, + "learning_rate": 0.0015649083119017582, + "loss": 0.085, + "step": 32086 + }, + { + "epoch": 0.27853056831103895, + "grad_norm": 0.3046875, + "learning_rate": 0.0015648828276004, + "loss": 0.0864, + "step": 32087 + }, + { + "epoch": 0.27853924879124314, + "grad_norm": 0.36328125, + "learning_rate": 0.001564857342790653, + "loss": 0.1064, + "step": 32088 + }, + { + "epoch": 0.2785479292714473, + "grad_norm": 0.2041015625, + "learning_rate": 0.0015648318574725453, + "loss": 0.0806, + "step": 32089 + }, + { + "epoch": 0.2785566097516515, + "grad_norm": 
0.291015625, + "learning_rate": 0.001564806371646104, + "loss": 0.0825, + "step": 32090 + }, + { + "epoch": 0.2785652902318556, + "grad_norm": 0.09033203125, + "learning_rate": 0.0015647808853113576, + "loss": 0.0869, + "step": 32091 + }, + { + "epoch": 0.2785739707120598, + "grad_norm": 0.193359375, + "learning_rate": 0.0015647553984683338, + "loss": 0.0908, + "step": 32092 + }, + { + "epoch": 0.27858265119226394, + "grad_norm": 0.3125, + "learning_rate": 0.0015647299111170604, + "loss": 0.1504, + "step": 32093 + }, + { + "epoch": 0.27859133167246813, + "grad_norm": 0.318359375, + "learning_rate": 0.0015647044232575658, + "loss": 0.0767, + "step": 32094 + }, + { + "epoch": 0.27860001215267227, + "grad_norm": 0.412109375, + "learning_rate": 0.0015646789348898768, + "loss": 0.1514, + "step": 32095 + }, + { + "epoch": 0.27860869263287646, + "grad_norm": 0.2099609375, + "learning_rate": 0.0015646534460140222, + "loss": 0.1484, + "step": 32096 + }, + { + "epoch": 0.2786173731130806, + "grad_norm": 0.462890625, + "learning_rate": 0.0015646279566300296, + "loss": 0.1177, + "step": 32097 + }, + { + "epoch": 0.2786260535932848, + "grad_norm": 0.37890625, + "learning_rate": 0.0015646024667379267, + "loss": 0.1055, + "step": 32098 + }, + { + "epoch": 0.27863473407348893, + "grad_norm": 0.55859375, + "learning_rate": 0.0015645769763377418, + "loss": 0.1157, + "step": 32099 + }, + { + "epoch": 0.2786434145536931, + "grad_norm": 0.67578125, + "learning_rate": 0.0015645514854295023, + "loss": 0.1157, + "step": 32100 + }, + { + "epoch": 0.27865209503389726, + "grad_norm": 0.48046875, + "learning_rate": 0.001564525994013236, + "loss": 0.0957, + "step": 32101 + }, + { + "epoch": 0.27866077551410146, + "grad_norm": 0.60546875, + "learning_rate": 0.001564500502088972, + "loss": 0.1582, + "step": 32102 + }, + { + "epoch": 0.2786694559943056, + "grad_norm": 0.3515625, + "learning_rate": 0.0015644750096567364, + "loss": 0.1025, + "step": 32103 + }, + { + "epoch": 0.2786781364745098, + "grad_norm": 0.208984375, + "learning_rate": 0.001564449516716558, + "loss": 0.0801, + "step": 32104 + }, + { + "epoch": 0.2786868169547139, + "grad_norm": 0.62109375, + "learning_rate": 0.0015644240232684649, + "loss": 0.1719, + "step": 32105 + }, + { + "epoch": 0.2786954974349181, + "grad_norm": 0.1650390625, + "learning_rate": 0.0015643985293124843, + "loss": 0.0762, + "step": 32106 + }, + { + "epoch": 0.27870417791512225, + "grad_norm": 0.33203125, + "learning_rate": 0.0015643730348486448, + "loss": 0.0718, + "step": 32107 + }, + { + "epoch": 0.27871285839532645, + "grad_norm": 0.4140625, + "learning_rate": 0.0015643475398769736, + "loss": 0.1055, + "step": 32108 + }, + { + "epoch": 0.2787215388755306, + "grad_norm": 0.189453125, + "learning_rate": 0.001564322044397499, + "loss": 0.1172, + "step": 32109 + }, + { + "epoch": 0.2787302193557348, + "grad_norm": 0.1376953125, + "learning_rate": 0.0015642965484102486, + "loss": 0.1611, + "step": 32110 + }, + { + "epoch": 0.2787388998359389, + "grad_norm": 0.1796875, + "learning_rate": 0.001564271051915251, + "loss": 0.0957, + "step": 32111 + }, + { + "epoch": 0.2787475803161431, + "grad_norm": 0.0869140625, + "learning_rate": 0.0015642455549125332, + "loss": 0.123, + "step": 32112 + }, + { + "epoch": 0.27875626079634724, + "grad_norm": 0.466796875, + "learning_rate": 0.0015642200574021233, + "loss": 0.1182, + "step": 32113 + }, + { + "epoch": 0.27876494127655144, + "grad_norm": 0.546875, + "learning_rate": 0.0015641945593840496, + "loss": 0.1113, + "step": 32114 + }, + { + "epoch": 
0.2787736217567556, + "grad_norm": 0.48828125, + "learning_rate": 0.0015641690608583397, + "loss": 0.1367, + "step": 32115 + }, + { + "epoch": 0.27878230223695977, + "grad_norm": 0.2470703125, + "learning_rate": 0.0015641435618250212, + "loss": 0.1211, + "step": 32116 + }, + { + "epoch": 0.2787909827171639, + "grad_norm": 0.08447265625, + "learning_rate": 0.0015641180622841225, + "loss": 0.0898, + "step": 32117 + }, + { + "epoch": 0.2787996631973681, + "grad_norm": 0.11181640625, + "learning_rate": 0.0015640925622356712, + "loss": 0.1182, + "step": 32118 + }, + { + "epoch": 0.27880834367757223, + "grad_norm": 0.435546875, + "learning_rate": 0.0015640670616796954, + "loss": 0.083, + "step": 32119 + }, + { + "epoch": 0.2788170241577764, + "grad_norm": 0.07958984375, + "learning_rate": 0.0015640415606162229, + "loss": 0.0903, + "step": 32120 + }, + { + "epoch": 0.27882570463798056, + "grad_norm": 0.275390625, + "learning_rate": 0.0015640160590452811, + "loss": 0.0889, + "step": 32121 + }, + { + "epoch": 0.27883438511818476, + "grad_norm": 0.68359375, + "learning_rate": 0.0015639905569668985, + "loss": 0.1348, + "step": 32122 + }, + { + "epoch": 0.2788430655983889, + "grad_norm": 0.26953125, + "learning_rate": 0.0015639650543811028, + "loss": 0.1895, + "step": 32123 + }, + { + "epoch": 0.2788517460785931, + "grad_norm": 0.1875, + "learning_rate": 0.001563939551287922, + "loss": 0.1152, + "step": 32124 + }, + { + "epoch": 0.2788604265587972, + "grad_norm": 0.80078125, + "learning_rate": 0.0015639140476873838, + "loss": 0.1357, + "step": 32125 + }, + { + "epoch": 0.2788691070390014, + "grad_norm": 0.326171875, + "learning_rate": 0.001563888543579516, + "loss": 0.1289, + "step": 32126 + }, + { + "epoch": 0.27887778751920556, + "grad_norm": 0.1435546875, + "learning_rate": 0.0015638630389643471, + "loss": 0.0967, + "step": 32127 + }, + { + "epoch": 0.27888646799940975, + "grad_norm": 0.09033203125, + "learning_rate": 0.0015638375338419043, + "loss": 0.1133, + "step": 32128 + }, + { + "epoch": 0.2788951484796139, + "grad_norm": 0.8828125, + "learning_rate": 0.0015638120282122155, + "loss": 0.1328, + "step": 32129 + }, + { + "epoch": 0.2789038289598181, + "grad_norm": 1.09375, + "learning_rate": 0.001563786522075309, + "loss": 0.1484, + "step": 32130 + }, + { + "epoch": 0.2789125094400222, + "grad_norm": 0.1845703125, + "learning_rate": 0.0015637610154312126, + "loss": 0.1377, + "step": 32131 + }, + { + "epoch": 0.2789211899202264, + "grad_norm": 0.0771484375, + "learning_rate": 0.0015637355082799543, + "loss": 0.0708, + "step": 32132 + }, + { + "epoch": 0.27892987040043055, + "grad_norm": 0.474609375, + "learning_rate": 0.0015637100006215615, + "loss": 0.207, + "step": 32133 + }, + { + "epoch": 0.27893855088063474, + "grad_norm": 0.4296875, + "learning_rate": 0.0015636844924560626, + "loss": 0.0854, + "step": 32134 + }, + { + "epoch": 0.2789472313608389, + "grad_norm": 0.55859375, + "learning_rate": 0.0015636589837834851, + "loss": 0.1016, + "step": 32135 + }, + { + "epoch": 0.27895591184104307, + "grad_norm": 0.6796875, + "learning_rate": 0.0015636334746038573, + "loss": 0.1221, + "step": 32136 + }, + { + "epoch": 0.2789645923212472, + "grad_norm": 0.16796875, + "learning_rate": 0.001563607964917207, + "loss": 0.1162, + "step": 32137 + }, + { + "epoch": 0.2789732728014514, + "grad_norm": 0.380859375, + "learning_rate": 0.001563582454723562, + "loss": 0.0938, + "step": 32138 + }, + { + "epoch": 0.27898195328165554, + "grad_norm": 0.1611328125, + "learning_rate": 0.0015635569440229499, + "loss": 
0.1396, + "step": 32139 + }, + { + "epoch": 0.27899063376185973, + "grad_norm": 0.515625, + "learning_rate": 0.001563531432815399, + "loss": 0.0977, + "step": 32140 + }, + { + "epoch": 0.27899931424206387, + "grad_norm": 0.330078125, + "learning_rate": 0.0015635059211009368, + "loss": 0.1099, + "step": 32141 + }, + { + "epoch": 0.27900799472226806, + "grad_norm": 0.453125, + "learning_rate": 0.0015634804088795922, + "loss": 0.1367, + "step": 32142 + }, + { + "epoch": 0.2790166752024722, + "grad_norm": 0.53515625, + "learning_rate": 0.0015634548961513917, + "loss": 0.0957, + "step": 32143 + }, + { + "epoch": 0.2790253556826764, + "grad_norm": 0.1640625, + "learning_rate": 0.0015634293829163645, + "loss": 0.1328, + "step": 32144 + }, + { + "epoch": 0.2790340361628805, + "grad_norm": 0.2294921875, + "learning_rate": 0.0015634038691745373, + "loss": 0.1216, + "step": 32145 + }, + { + "epoch": 0.2790427166430847, + "grad_norm": 0.41015625, + "learning_rate": 0.0015633783549259391, + "loss": 0.0991, + "step": 32146 + }, + { + "epoch": 0.27905139712328886, + "grad_norm": 0.11181640625, + "learning_rate": 0.0015633528401705968, + "loss": 0.1055, + "step": 32147 + }, + { + "epoch": 0.27906007760349305, + "grad_norm": 0.1103515625, + "learning_rate": 0.001563327324908539, + "loss": 0.0947, + "step": 32148 + }, + { + "epoch": 0.2790687580836972, + "grad_norm": 0.0927734375, + "learning_rate": 0.0015633018091397936, + "loss": 0.126, + "step": 32149 + }, + { + "epoch": 0.2790774385639014, + "grad_norm": 0.953125, + "learning_rate": 0.0015632762928643883, + "loss": 0.1416, + "step": 32150 + }, + { + "epoch": 0.2790861190441055, + "grad_norm": 0.24609375, + "learning_rate": 0.0015632507760823507, + "loss": 0.0791, + "step": 32151 + }, + { + "epoch": 0.2790947995243097, + "grad_norm": 0.1494140625, + "learning_rate": 0.001563225258793709, + "loss": 0.1113, + "step": 32152 + }, + { + "epoch": 0.27910348000451385, + "grad_norm": 0.1513671875, + "learning_rate": 0.0015631997409984912, + "loss": 0.1152, + "step": 32153 + }, + { + "epoch": 0.27911216048471804, + "grad_norm": 0.427734375, + "learning_rate": 0.0015631742226967253, + "loss": 0.1104, + "step": 32154 + }, + { + "epoch": 0.2791208409649222, + "grad_norm": 0.1875, + "learning_rate": 0.0015631487038884392, + "loss": 0.1162, + "step": 32155 + }, + { + "epoch": 0.27912952144512637, + "grad_norm": 0.2021484375, + "learning_rate": 0.0015631231845736602, + "loss": 0.1084, + "step": 32156 + }, + { + "epoch": 0.2791382019253305, + "grad_norm": 0.08544921875, + "learning_rate": 0.0015630976647524168, + "loss": 0.0625, + "step": 32157 + }, + { + "epoch": 0.27914688240553465, + "grad_norm": 0.7890625, + "learning_rate": 0.0015630721444247367, + "loss": 0.1396, + "step": 32158 + }, + { + "epoch": 0.27915556288573884, + "grad_norm": 0.1044921875, + "learning_rate": 0.0015630466235906479, + "loss": 0.1104, + "step": 32159 + }, + { + "epoch": 0.279164243365943, + "grad_norm": 0.56640625, + "learning_rate": 0.001563021102250178, + "loss": 0.0947, + "step": 32160 + }, + { + "epoch": 0.27917292384614717, + "grad_norm": 0.10986328125, + "learning_rate": 0.0015629955804033558, + "loss": 0.0845, + "step": 32161 + }, + { + "epoch": 0.2791816043263513, + "grad_norm": 0.171875, + "learning_rate": 0.0015629700580502082, + "loss": 0.0684, + "step": 32162 + }, + { + "epoch": 0.2791902848065555, + "grad_norm": 0.09130859375, + "learning_rate": 0.0015629445351907638, + "loss": 0.1289, + "step": 32163 + }, + { + "epoch": 0.27919896528675964, + "grad_norm": 1.375, + 
"learning_rate": 0.0015629190118250501, + "loss": 0.1309, + "step": 32164 + }, + { + "epoch": 0.27920764576696383, + "grad_norm": 0.1611328125, + "learning_rate": 0.001562893487953095, + "loss": 0.126, + "step": 32165 + }, + { + "epoch": 0.27921632624716797, + "grad_norm": 0.103515625, + "learning_rate": 0.0015628679635749263, + "loss": 0.127, + "step": 32166 + }, + { + "epoch": 0.27922500672737216, + "grad_norm": 0.390625, + "learning_rate": 0.0015628424386905722, + "loss": 0.0908, + "step": 32167 + }, + { + "epoch": 0.2792336872075763, + "grad_norm": 0.1904296875, + "learning_rate": 0.0015628169133000609, + "loss": 0.1084, + "step": 32168 + }, + { + "epoch": 0.2792423676877805, + "grad_norm": 0.1669921875, + "learning_rate": 0.0015627913874034197, + "loss": 0.1123, + "step": 32169 + }, + { + "epoch": 0.27925104816798463, + "grad_norm": 0.46875, + "learning_rate": 0.001562765861000677, + "loss": 0.0962, + "step": 32170 + }, + { + "epoch": 0.2792597286481888, + "grad_norm": 0.171875, + "learning_rate": 0.0015627403340918605, + "loss": 0.1045, + "step": 32171 + }, + { + "epoch": 0.27926840912839296, + "grad_norm": 0.63671875, + "learning_rate": 0.0015627148066769982, + "loss": 0.082, + "step": 32172 + }, + { + "epoch": 0.27927708960859715, + "grad_norm": 0.400390625, + "learning_rate": 0.0015626892787561175, + "loss": 0.1079, + "step": 32173 + }, + { + "epoch": 0.2792857700888013, + "grad_norm": 0.703125, + "learning_rate": 0.001562663750329247, + "loss": 0.0806, + "step": 32174 + }, + { + "epoch": 0.2792944505690055, + "grad_norm": 0.1162109375, + "learning_rate": 0.0015626382213964147, + "loss": 0.0928, + "step": 32175 + }, + { + "epoch": 0.2793031310492096, + "grad_norm": 0.1015625, + "learning_rate": 0.001562612691957648, + "loss": 0.082, + "step": 32176 + }, + { + "epoch": 0.2793118115294138, + "grad_norm": 0.4453125, + "learning_rate": 0.0015625871620129747, + "loss": 0.1221, + "step": 32177 + }, + { + "epoch": 0.27932049200961795, + "grad_norm": 0.36328125, + "learning_rate": 0.001562561631562423, + "loss": 0.1167, + "step": 32178 + }, + { + "epoch": 0.27932917248982214, + "grad_norm": 0.232421875, + "learning_rate": 0.0015625361006060213, + "loss": 0.1279, + "step": 32179 + }, + { + "epoch": 0.2793378529700263, + "grad_norm": 0.12255859375, + "learning_rate": 0.0015625105691437968, + "loss": 0.1133, + "step": 32180 + }, + { + "epoch": 0.27934653345023047, + "grad_norm": 0.6328125, + "learning_rate": 0.001562485037175778, + "loss": 0.1289, + "step": 32181 + }, + { + "epoch": 0.2793552139304346, + "grad_norm": 0.1748046875, + "learning_rate": 0.0015624595047019922, + "loss": 0.1406, + "step": 32182 + }, + { + "epoch": 0.2793638944106388, + "grad_norm": 0.90625, + "learning_rate": 0.0015624339717224677, + "loss": 0.0894, + "step": 32183 + }, + { + "epoch": 0.27937257489084294, + "grad_norm": 0.201171875, + "learning_rate": 0.0015624084382372325, + "loss": 0.1187, + "step": 32184 + }, + { + "epoch": 0.27938125537104713, + "grad_norm": 0.58203125, + "learning_rate": 0.0015623829042463143, + "loss": 0.1826, + "step": 32185 + }, + { + "epoch": 0.27938993585125127, + "grad_norm": 0.17578125, + "learning_rate": 0.001562357369749741, + "loss": 0.1094, + "step": 32186 + }, + { + "epoch": 0.27939861633145546, + "grad_norm": 0.408203125, + "learning_rate": 0.0015623318347475403, + "loss": 0.1484, + "step": 32187 + }, + { + "epoch": 0.2794072968116596, + "grad_norm": 0.51953125, + "learning_rate": 0.001562306299239741, + "loss": 0.0991, + "step": 32188 + }, + { + "epoch": 0.2794159772918638, + 
"grad_norm": 0.205078125, + "learning_rate": 0.0015622807632263704, + "loss": 0.0762, + "step": 32189 + }, + { + "epoch": 0.27942465777206793, + "grad_norm": 0.1259765625, + "learning_rate": 0.0015622552267074564, + "loss": 0.1016, + "step": 32190 + }, + { + "epoch": 0.2794333382522721, + "grad_norm": 0.39453125, + "learning_rate": 0.0015622296896830269, + "loss": 0.1074, + "step": 32191 + }, + { + "epoch": 0.27944201873247626, + "grad_norm": 0.291015625, + "learning_rate": 0.00156220415215311, + "loss": 0.0986, + "step": 32192 + }, + { + "epoch": 0.27945069921268045, + "grad_norm": 0.3046875, + "learning_rate": 0.001562178614117734, + "loss": 0.1328, + "step": 32193 + }, + { + "epoch": 0.2794593796928846, + "grad_norm": 0.1572265625, + "learning_rate": 0.001562153075576926, + "loss": 0.0845, + "step": 32194 + }, + { + "epoch": 0.2794680601730888, + "grad_norm": 0.609375, + "learning_rate": 0.0015621275365307146, + "loss": 0.1748, + "step": 32195 + }, + { + "epoch": 0.2794767406532929, + "grad_norm": 0.138671875, + "learning_rate": 0.001562101996979127, + "loss": 0.0889, + "step": 32196 + }, + { + "epoch": 0.2794854211334971, + "grad_norm": 0.5703125, + "learning_rate": 0.001562076456922192, + "loss": 0.1611, + "step": 32197 + }, + { + "epoch": 0.27949410161370125, + "grad_norm": 0.2490234375, + "learning_rate": 0.001562050916359937, + "loss": 0.1069, + "step": 32198 + }, + { + "epoch": 0.27950278209390544, + "grad_norm": 0.60546875, + "learning_rate": 0.0015620253752923901, + "loss": 0.1006, + "step": 32199 + }, + { + "epoch": 0.2795114625741096, + "grad_norm": 0.75390625, + "learning_rate": 0.0015619998337195794, + "loss": 0.0908, + "step": 32200 + }, + { + "epoch": 0.2795201430543138, + "grad_norm": 0.4296875, + "learning_rate": 0.0015619742916415324, + "loss": 0.1172, + "step": 32201 + }, + { + "epoch": 0.2795288235345179, + "grad_norm": 0.57421875, + "learning_rate": 0.001561948749058277, + "loss": 0.0889, + "step": 32202 + }, + { + "epoch": 0.2795375040147221, + "grad_norm": 0.8125, + "learning_rate": 0.0015619232059698419, + "loss": 0.1113, + "step": 32203 + }, + { + "epoch": 0.27954618449492624, + "grad_norm": 0.55078125, + "learning_rate": 0.0015618976623762541, + "loss": 0.0962, + "step": 32204 + }, + { + "epoch": 0.27955486497513043, + "grad_norm": 0.25, + "learning_rate": 0.0015618721182775422, + "loss": 0.0991, + "step": 32205 + }, + { + "epoch": 0.27956354545533457, + "grad_norm": 0.1904296875, + "learning_rate": 0.0015618465736737337, + "loss": 0.1504, + "step": 32206 + }, + { + "epoch": 0.27957222593553877, + "grad_norm": 0.134765625, + "learning_rate": 0.0015618210285648568, + "loss": 0.0703, + "step": 32207 + }, + { + "epoch": 0.2795809064157429, + "grad_norm": 0.291015625, + "learning_rate": 0.0015617954829509394, + "loss": 0.0806, + "step": 32208 + }, + { + "epoch": 0.2795895868959471, + "grad_norm": 0.080078125, + "learning_rate": 0.0015617699368320095, + "loss": 0.0947, + "step": 32209 + }, + { + "epoch": 0.27959826737615123, + "grad_norm": 0.3515625, + "learning_rate": 0.0015617443902080947, + "loss": 0.1465, + "step": 32210 + }, + { + "epoch": 0.2796069478563554, + "grad_norm": 0.203125, + "learning_rate": 0.0015617188430792238, + "loss": 0.126, + "step": 32211 + }, + { + "epoch": 0.27961562833655956, + "grad_norm": 0.68359375, + "learning_rate": 0.0015616932954454232, + "loss": 0.1279, + "step": 32212 + }, + { + "epoch": 0.27962430881676376, + "grad_norm": 0.71484375, + "learning_rate": 0.0015616677473067222, + "loss": 0.1113, + "step": 32213 + }, + { + "epoch": 
0.2796329892969679, + "grad_norm": 0.16796875, + "learning_rate": 0.0015616421986631485, + "loss": 0.1016, + "step": 32214 + }, + { + "epoch": 0.2796416697771721, + "grad_norm": 0.205078125, + "learning_rate": 0.0015616166495147296, + "loss": 0.1143, + "step": 32215 + }, + { + "epoch": 0.2796503502573762, + "grad_norm": 0.162109375, + "learning_rate": 0.0015615910998614934, + "loss": 0.085, + "step": 32216 + }, + { + "epoch": 0.2796590307375804, + "grad_norm": 0.11279296875, + "learning_rate": 0.0015615655497034683, + "loss": 0.1182, + "step": 32217 + }, + { + "epoch": 0.27966771121778455, + "grad_norm": 0.380859375, + "learning_rate": 0.0015615399990406822, + "loss": 0.1035, + "step": 32218 + }, + { + "epoch": 0.27967639169798875, + "grad_norm": 0.3125, + "learning_rate": 0.0015615144478731628, + "loss": 0.1201, + "step": 32219 + }, + { + "epoch": 0.2796850721781929, + "grad_norm": 0.294921875, + "learning_rate": 0.0015614888962009383, + "loss": 0.0938, + "step": 32220 + }, + { + "epoch": 0.2796937526583971, + "grad_norm": 0.306640625, + "learning_rate": 0.001561463344024036, + "loss": 0.0967, + "step": 32221 + }, + { + "epoch": 0.2797024331386012, + "grad_norm": 0.119140625, + "learning_rate": 0.001561437791342485, + "loss": 0.0811, + "step": 32222 + }, + { + "epoch": 0.2797111136188054, + "grad_norm": 0.287109375, + "learning_rate": 0.001561412238156312, + "loss": 0.1084, + "step": 32223 + }, + { + "epoch": 0.27971979409900954, + "grad_norm": 0.359375, + "learning_rate": 0.0015613866844655458, + "loss": 0.1855, + "step": 32224 + }, + { + "epoch": 0.27972847457921374, + "grad_norm": 0.1279296875, + "learning_rate": 0.001561361130270214, + "loss": 0.1416, + "step": 32225 + }, + { + "epoch": 0.2797371550594179, + "grad_norm": 0.326171875, + "learning_rate": 0.0015613355755703445, + "loss": 0.123, + "step": 32226 + }, + { + "epoch": 0.27974583553962207, + "grad_norm": 0.3671875, + "learning_rate": 0.0015613100203659656, + "loss": 0.0923, + "step": 32227 + }, + { + "epoch": 0.2797545160198262, + "grad_norm": 0.1103515625, + "learning_rate": 0.0015612844646571045, + "loss": 0.1021, + "step": 32228 + }, + { + "epoch": 0.2797631965000304, + "grad_norm": 1.15625, + "learning_rate": 0.00156125890844379, + "loss": 0.1621, + "step": 32229 + }, + { + "epoch": 0.27977187698023454, + "grad_norm": 0.171875, + "learning_rate": 0.0015612333517260495, + "loss": 0.1182, + "step": 32230 + }, + { + "epoch": 0.27978055746043873, + "grad_norm": 0.291015625, + "learning_rate": 0.0015612077945039114, + "loss": 0.082, + "step": 32231 + }, + { + "epoch": 0.27978923794064287, + "grad_norm": 0.162109375, + "learning_rate": 0.0015611822367774034, + "loss": 0.1133, + "step": 32232 + }, + { + "epoch": 0.27979791842084706, + "grad_norm": 0.0771484375, + "learning_rate": 0.0015611566785465533, + "loss": 0.0869, + "step": 32233 + }, + { + "epoch": 0.2798065989010512, + "grad_norm": 0.3046875, + "learning_rate": 0.0015611311198113892, + "loss": 0.1348, + "step": 32234 + }, + { + "epoch": 0.2798152793812554, + "grad_norm": 0.1181640625, + "learning_rate": 0.0015611055605719388, + "loss": 0.1099, + "step": 32235 + }, + { + "epoch": 0.2798239598614595, + "grad_norm": 0.150390625, + "learning_rate": 0.0015610800008282306, + "loss": 0.0972, + "step": 32236 + }, + { + "epoch": 0.2798326403416637, + "grad_norm": 0.169921875, + "learning_rate": 0.0015610544405802922, + "loss": 0.1299, + "step": 32237 + }, + { + "epoch": 0.27984132082186786, + "grad_norm": 0.10595703125, + "learning_rate": 0.0015610288798281516, + "loss": 0.0918, + 
"step": 32238 + }, + { + "epoch": 0.27985000130207205, + "grad_norm": 0.1923828125, + "learning_rate": 0.0015610033185718365, + "loss": 0.0991, + "step": 32239 + }, + { + "epoch": 0.2798586817822762, + "grad_norm": 0.14453125, + "learning_rate": 0.0015609777568113751, + "loss": 0.1025, + "step": 32240 + }, + { + "epoch": 0.2798673622624804, + "grad_norm": 0.31640625, + "learning_rate": 0.0015609521945467958, + "loss": 0.0996, + "step": 32241 + }, + { + "epoch": 0.2798760427426845, + "grad_norm": 0.490234375, + "learning_rate": 0.001560926631778126, + "loss": 0.0962, + "step": 32242 + }, + { + "epoch": 0.2798847232228887, + "grad_norm": 0.65625, + "learning_rate": 0.0015609010685053934, + "loss": 0.1016, + "step": 32243 + }, + { + "epoch": 0.27989340370309285, + "grad_norm": 0.173828125, + "learning_rate": 0.0015608755047286263, + "loss": 0.0938, + "step": 32244 + }, + { + "epoch": 0.27990208418329704, + "grad_norm": 0.091796875, + "learning_rate": 0.0015608499404478528, + "loss": 0.1172, + "step": 32245 + }, + { + "epoch": 0.2799107646635012, + "grad_norm": 0.333984375, + "learning_rate": 0.001560824375663101, + "loss": 0.1309, + "step": 32246 + }, + { + "epoch": 0.27991944514370537, + "grad_norm": 0.09423828125, + "learning_rate": 0.0015607988103743984, + "loss": 0.1011, + "step": 32247 + }, + { + "epoch": 0.2799281256239095, + "grad_norm": 0.1494140625, + "learning_rate": 0.0015607732445817731, + "loss": 0.124, + "step": 32248 + }, + { + "epoch": 0.2799368061041137, + "grad_norm": 0.162109375, + "learning_rate": 0.0015607476782852532, + "loss": 0.1123, + "step": 32249 + }, + { + "epoch": 0.27994548658431784, + "grad_norm": 0.11865234375, + "learning_rate": 0.0015607221114848663, + "loss": 0.1211, + "step": 32250 + }, + { + "epoch": 0.27995416706452203, + "grad_norm": 0.12255859375, + "learning_rate": 0.001560696544180641, + "loss": 0.0957, + "step": 32251 + }, + { + "epoch": 0.27996284754472617, + "grad_norm": 0.287109375, + "learning_rate": 0.0015606709763726048, + "loss": 0.1133, + "step": 32252 + }, + { + "epoch": 0.27997152802493036, + "grad_norm": 0.306640625, + "learning_rate": 0.0015606454080607855, + "loss": 0.0889, + "step": 32253 + }, + { + "epoch": 0.2799802085051345, + "grad_norm": 0.130859375, + "learning_rate": 0.0015606198392452114, + "loss": 0.0972, + "step": 32254 + }, + { + "epoch": 0.2799888889853387, + "grad_norm": 0.1357421875, + "learning_rate": 0.0015605942699259104, + "loss": 0.1338, + "step": 32255 + }, + { + "epoch": 0.27999756946554283, + "grad_norm": 0.193359375, + "learning_rate": 0.0015605687001029106, + "loss": 0.0713, + "step": 32256 + }, + { + "epoch": 0.280006249945747, + "grad_norm": 0.1640625, + "learning_rate": 0.0015605431297762396, + "loss": 0.1348, + "step": 32257 + }, + { + "epoch": 0.28001493042595116, + "grad_norm": 0.1474609375, + "learning_rate": 0.0015605175589459255, + "loss": 0.0806, + "step": 32258 + }, + { + "epoch": 0.28002361090615535, + "grad_norm": 0.23046875, + "learning_rate": 0.0015604919876119965, + "loss": 0.0986, + "step": 32259 + }, + { + "epoch": 0.2800322913863595, + "grad_norm": 0.41796875, + "learning_rate": 0.0015604664157744803, + "loss": 0.105, + "step": 32260 + }, + { + "epoch": 0.2800409718665637, + "grad_norm": 0.0771484375, + "learning_rate": 0.0015604408434334047, + "loss": 0.0957, + "step": 32261 + }, + { + "epoch": 0.2800496523467678, + "grad_norm": 0.2353515625, + "learning_rate": 0.0015604152705887984, + "loss": 0.1143, + "step": 32262 + }, + { + "epoch": 0.280058332826972, + "grad_norm": 0.546875, + 
"learning_rate": 0.0015603896972406885, + "loss": 0.1475, + "step": 32263 + }, + { + "epoch": 0.28006701330717615, + "grad_norm": 0.1650390625, + "learning_rate": 0.0015603641233891033, + "loss": 0.1328, + "step": 32264 + }, + { + "epoch": 0.28007569378738034, + "grad_norm": 0.26171875, + "learning_rate": 0.001560338549034071, + "loss": 0.1094, + "step": 32265 + }, + { + "epoch": 0.2800843742675845, + "grad_norm": 0.2275390625, + "learning_rate": 0.0015603129741756194, + "loss": 0.0762, + "step": 32266 + }, + { + "epoch": 0.2800930547477887, + "grad_norm": 0.2236328125, + "learning_rate": 0.0015602873988137765, + "loss": 0.0923, + "step": 32267 + }, + { + "epoch": 0.2801017352279928, + "grad_norm": 0.30078125, + "learning_rate": 0.0015602618229485698, + "loss": 0.0752, + "step": 32268 + }, + { + "epoch": 0.280110415708197, + "grad_norm": 0.42578125, + "learning_rate": 0.0015602362465800283, + "loss": 0.0747, + "step": 32269 + }, + { + "epoch": 0.28011909618840114, + "grad_norm": 0.9140625, + "learning_rate": 0.0015602106697081791, + "loss": 0.1172, + "step": 32270 + }, + { + "epoch": 0.28012777666860533, + "grad_norm": 0.224609375, + "learning_rate": 0.0015601850923330505, + "loss": 0.083, + "step": 32271 + }, + { + "epoch": 0.28013645714880947, + "grad_norm": 0.2294921875, + "learning_rate": 0.0015601595144546698, + "loss": 0.0923, + "step": 32272 + }, + { + "epoch": 0.28014513762901366, + "grad_norm": 0.2177734375, + "learning_rate": 0.0015601339360730665, + "loss": 0.0811, + "step": 32273 + }, + { + "epoch": 0.2801538181092178, + "grad_norm": 0.25390625, + "learning_rate": 0.0015601083571882667, + "loss": 0.0762, + "step": 32274 + }, + { + "epoch": 0.280162498589422, + "grad_norm": 0.4921875, + "learning_rate": 0.0015600827778003003, + "loss": 0.1064, + "step": 32275 + }, + { + "epoch": 0.28017117906962613, + "grad_norm": 0.64453125, + "learning_rate": 0.0015600571979091935, + "loss": 0.1152, + "step": 32276 + }, + { + "epoch": 0.2801798595498303, + "grad_norm": 0.080078125, + "learning_rate": 0.0015600316175149753, + "loss": 0.0869, + "step": 32277 + }, + { + "epoch": 0.28018854003003446, + "grad_norm": 0.32421875, + "learning_rate": 0.0015600060366176738, + "loss": 0.0825, + "step": 32278 + }, + { + "epoch": 0.28019722051023865, + "grad_norm": 0.09814453125, + "learning_rate": 0.001559980455217316, + "loss": 0.127, + "step": 32279 + }, + { + "epoch": 0.2802059009904428, + "grad_norm": 0.12890625, + "learning_rate": 0.0015599548733139308, + "loss": 0.1797, + "step": 32280 + }, + { + "epoch": 0.28021458147064693, + "grad_norm": 0.09765625, + "learning_rate": 0.001559929290907546, + "loss": 0.1055, + "step": 32281 + }, + { + "epoch": 0.2802232619508511, + "grad_norm": 0.9453125, + "learning_rate": 0.0015599037079981892, + "loss": 0.123, + "step": 32282 + }, + { + "epoch": 0.28023194243105526, + "grad_norm": 0.109375, + "learning_rate": 0.0015598781245858887, + "loss": 0.1074, + "step": 32283 + }, + { + "epoch": 0.28024062291125945, + "grad_norm": 0.423828125, + "learning_rate": 0.0015598525406706725, + "loss": 0.1221, + "step": 32284 + }, + { + "epoch": 0.2802493033914636, + "grad_norm": 1.875, + "learning_rate": 0.0015598269562525685, + "loss": 0.0991, + "step": 32285 + }, + { + "epoch": 0.2802579838716678, + "grad_norm": 0.169921875, + "learning_rate": 0.0015598013713316041, + "loss": 0.1055, + "step": 32286 + }, + { + "epoch": 0.2802666643518719, + "grad_norm": 0.1103515625, + "learning_rate": 0.0015597757859078082, + "loss": 0.1064, + "step": 32287 + }, + { + "epoch": 
0.2802753448320761, + "grad_norm": 0.1064453125, + "learning_rate": 0.0015597501999812086, + "loss": 0.0977, + "step": 32288 + }, + { + "epoch": 0.28028402531228025, + "grad_norm": 0.404296875, + "learning_rate": 0.001559724613551833, + "loss": 0.1201, + "step": 32289 + }, + { + "epoch": 0.28029270579248444, + "grad_norm": 0.45703125, + "learning_rate": 0.0015596990266197094, + "loss": 0.1055, + "step": 32290 + }, + { + "epoch": 0.2803013862726886, + "grad_norm": 0.1787109375, + "learning_rate": 0.001559673439184866, + "loss": 0.1045, + "step": 32291 + }, + { + "epoch": 0.2803100667528928, + "grad_norm": 0.1318359375, + "learning_rate": 0.0015596478512473302, + "loss": 0.1367, + "step": 32292 + }, + { + "epoch": 0.2803187472330969, + "grad_norm": 0.23828125, + "learning_rate": 0.0015596222628071307, + "loss": 0.125, + "step": 32293 + }, + { + "epoch": 0.2803274277133011, + "grad_norm": 0.2294921875, + "learning_rate": 0.0015595966738642952, + "loss": 0.0635, + "step": 32294 + }, + { + "epoch": 0.28033610819350524, + "grad_norm": 0.79296875, + "learning_rate": 0.001559571084418852, + "loss": 0.104, + "step": 32295 + }, + { + "epoch": 0.28034478867370943, + "grad_norm": 0.37109375, + "learning_rate": 0.0015595454944708283, + "loss": 0.0986, + "step": 32296 + }, + { + "epoch": 0.28035346915391357, + "grad_norm": 0.1376953125, + "learning_rate": 0.0015595199040202528, + "loss": 0.0786, + "step": 32297 + }, + { + "epoch": 0.28036214963411776, + "grad_norm": 0.29296875, + "learning_rate": 0.001559494313067153, + "loss": 0.1387, + "step": 32298 + }, + { + "epoch": 0.2803708301143219, + "grad_norm": 0.333984375, + "learning_rate": 0.0015594687216115575, + "loss": 0.0688, + "step": 32299 + }, + { + "epoch": 0.2803795105945261, + "grad_norm": 0.1591796875, + "learning_rate": 0.0015594431296534934, + "loss": 0.1016, + "step": 32300 + }, + { + "epoch": 0.28038819107473023, + "grad_norm": 0.294921875, + "learning_rate": 0.0015594175371929895, + "loss": 0.0879, + "step": 32301 + }, + { + "epoch": 0.2803968715549344, + "grad_norm": 0.11669921875, + "learning_rate": 0.0015593919442300733, + "loss": 0.1011, + "step": 32302 + }, + { + "epoch": 0.28040555203513856, + "grad_norm": 0.10986328125, + "learning_rate": 0.001559366350764773, + "loss": 0.0874, + "step": 32303 + }, + { + "epoch": 0.28041423251534275, + "grad_norm": 0.24609375, + "learning_rate": 0.001559340756797117, + "loss": 0.0918, + "step": 32304 + }, + { + "epoch": 0.2804229129955469, + "grad_norm": 0.162109375, + "learning_rate": 0.0015593151623271323, + "loss": 0.1206, + "step": 32305 + }, + { + "epoch": 0.2804315934757511, + "grad_norm": 0.6015625, + "learning_rate": 0.001559289567354848, + "loss": 0.1079, + "step": 32306 + }, + { + "epoch": 0.2804402739559552, + "grad_norm": 0.1845703125, + "learning_rate": 0.001559263971880291, + "loss": 0.0967, + "step": 32307 + }, + { + "epoch": 0.2804489544361594, + "grad_norm": 0.18359375, + "learning_rate": 0.00155923837590349, + "loss": 0.0947, + "step": 32308 + }, + { + "epoch": 0.28045763491636355, + "grad_norm": 0.3984375, + "learning_rate": 0.001559212779424473, + "loss": 0.1406, + "step": 32309 + }, + { + "epoch": 0.28046631539656774, + "grad_norm": 0.076171875, + "learning_rate": 0.0015591871824432678, + "loss": 0.1069, + "step": 32310 + }, + { + "epoch": 0.2804749958767719, + "grad_norm": 0.162109375, + "learning_rate": 0.0015591615849599022, + "loss": 0.0928, + "step": 32311 + }, + { + "epoch": 0.2804836763569761, + "grad_norm": 0.1611328125, + "learning_rate": 0.0015591359869744042, + "loss": 
0.084, + "step": 32312 + }, + { + "epoch": 0.2804923568371802, + "grad_norm": 0.09423828125, + "learning_rate": 0.0015591103884868022, + "loss": 0.1201, + "step": 32313 + }, + { + "epoch": 0.2805010373173844, + "grad_norm": 0.220703125, + "learning_rate": 0.001559084789497124, + "loss": 0.1104, + "step": 32314 + }, + { + "epoch": 0.28050971779758854, + "grad_norm": 0.298828125, + "learning_rate": 0.0015590591900053977, + "loss": 0.0557, + "step": 32315 + }, + { + "epoch": 0.28051839827779274, + "grad_norm": 0.34765625, + "learning_rate": 0.0015590335900116511, + "loss": 0.1426, + "step": 32316 + }, + { + "epoch": 0.2805270787579969, + "grad_norm": 0.498046875, + "learning_rate": 0.0015590079895159122, + "loss": 0.123, + "step": 32317 + }, + { + "epoch": 0.28053575923820107, + "grad_norm": 0.205078125, + "learning_rate": 0.001558982388518209, + "loss": 0.082, + "step": 32318 + }, + { + "epoch": 0.2805444397184052, + "grad_norm": 1.2421875, + "learning_rate": 0.0015589567870185697, + "loss": 0.1855, + "step": 32319 + }, + { + "epoch": 0.2805531201986094, + "grad_norm": 0.625, + "learning_rate": 0.0015589311850170218, + "loss": 0.1162, + "step": 32320 + }, + { + "epoch": 0.28056180067881353, + "grad_norm": 0.1328125, + "learning_rate": 0.001558905582513594, + "loss": 0.1367, + "step": 32321 + }, + { + "epoch": 0.2805704811590177, + "grad_norm": 0.13671875, + "learning_rate": 0.0015588799795083139, + "loss": 0.1309, + "step": 32322 + }, + { + "epoch": 0.28057916163922186, + "grad_norm": 0.369140625, + "learning_rate": 0.0015588543760012095, + "loss": 0.0928, + "step": 32323 + }, + { + "epoch": 0.28058784211942606, + "grad_norm": 0.3828125, + "learning_rate": 0.0015588287719923088, + "loss": 0.0923, + "step": 32324 + }, + { + "epoch": 0.2805965225996302, + "grad_norm": 0.1845703125, + "learning_rate": 0.0015588031674816398, + "loss": 0.1504, + "step": 32325 + }, + { + "epoch": 0.2806052030798344, + "grad_norm": 0.30859375, + "learning_rate": 0.0015587775624692308, + "loss": 0.1523, + "step": 32326 + }, + { + "epoch": 0.2806138835600385, + "grad_norm": 0.64453125, + "learning_rate": 0.0015587519569551097, + "loss": 0.0938, + "step": 32327 + }, + { + "epoch": 0.2806225640402427, + "grad_norm": 0.248046875, + "learning_rate": 0.0015587263509393042, + "loss": 0.1084, + "step": 32328 + }, + { + "epoch": 0.28063124452044685, + "grad_norm": 0.31640625, + "learning_rate": 0.001558700744421842, + "loss": 0.1099, + "step": 32329 + }, + { + "epoch": 0.28063992500065105, + "grad_norm": 0.162109375, + "learning_rate": 0.001558675137402752, + "loss": 0.1426, + "step": 32330 + }, + { + "epoch": 0.2806486054808552, + "grad_norm": 0.443359375, + "learning_rate": 0.0015586495298820612, + "loss": 0.0967, + "step": 32331 + }, + { + "epoch": 0.2806572859610594, + "grad_norm": 0.251953125, + "learning_rate": 0.0015586239218597989, + "loss": 0.0747, + "step": 32332 + }, + { + "epoch": 0.2806659664412635, + "grad_norm": 0.388671875, + "learning_rate": 0.001558598313335992, + "loss": 0.1348, + "step": 32333 + }, + { + "epoch": 0.2806746469214677, + "grad_norm": 0.09521484375, + "learning_rate": 0.0015585727043106689, + "loss": 0.0874, + "step": 32334 + }, + { + "epoch": 0.28068332740167184, + "grad_norm": 0.125, + "learning_rate": 0.0015585470947838576, + "loss": 0.1196, + "step": 32335 + }, + { + "epoch": 0.28069200788187604, + "grad_norm": 0.400390625, + "learning_rate": 0.0015585214847555864, + "loss": 0.0908, + "step": 32336 + }, + { + "epoch": 0.2807006883620802, + "grad_norm": 0.8515625, + "learning_rate": 
0.0015584958742258827, + "loss": 0.1445, + "step": 32337 + }, + { + "epoch": 0.28070936884228437, + "grad_norm": 0.1806640625, + "learning_rate": 0.0015584702631947747, + "loss": 0.124, + "step": 32338 + }, + { + "epoch": 0.2807180493224885, + "grad_norm": 0.78515625, + "learning_rate": 0.0015584446516622907, + "loss": 0.103, + "step": 32339 + }, + { + "epoch": 0.2807267298026927, + "grad_norm": 0.71875, + "learning_rate": 0.0015584190396284582, + "loss": 0.1123, + "step": 32340 + }, + { + "epoch": 0.28073541028289684, + "grad_norm": 0.2314453125, + "learning_rate": 0.0015583934270933057, + "loss": 0.1074, + "step": 32341 + }, + { + "epoch": 0.28074409076310103, + "grad_norm": 0.09814453125, + "learning_rate": 0.0015583678140568612, + "loss": 0.0928, + "step": 32342 + }, + { + "epoch": 0.28075277124330517, + "grad_norm": 0.90234375, + "learning_rate": 0.0015583422005191524, + "loss": 0.1465, + "step": 32343 + }, + { + "epoch": 0.28076145172350936, + "grad_norm": 0.296875, + "learning_rate": 0.0015583165864802076, + "loss": 0.1084, + "step": 32344 + }, + { + "epoch": 0.2807701322037135, + "grad_norm": 0.734375, + "learning_rate": 0.0015582909719400546, + "loss": 0.0996, + "step": 32345 + }, + { + "epoch": 0.2807788126839177, + "grad_norm": 0.455078125, + "learning_rate": 0.0015582653568987214, + "loss": 0.0864, + "step": 32346 + }, + { + "epoch": 0.2807874931641218, + "grad_norm": 0.349609375, + "learning_rate": 0.001558239741356236, + "loss": 0.0483, + "step": 32347 + }, + { + "epoch": 0.280796173644326, + "grad_norm": 0.25, + "learning_rate": 0.001558214125312627, + "loss": 0.127, + "step": 32348 + }, + { + "epoch": 0.28080485412453016, + "grad_norm": 0.3671875, + "learning_rate": 0.0015581885087679213, + "loss": 0.1836, + "step": 32349 + }, + { + "epoch": 0.28081353460473435, + "grad_norm": 0.41796875, + "learning_rate": 0.0015581628917221478, + "loss": 0.085, + "step": 32350 + }, + { + "epoch": 0.2808222150849385, + "grad_norm": 0.10205078125, + "learning_rate": 0.0015581372741753341, + "loss": 0.0613, + "step": 32351 + }, + { + "epoch": 0.2808308955651427, + "grad_norm": 0.2353515625, + "learning_rate": 0.0015581116561275087, + "loss": 0.0898, + "step": 32352 + }, + { + "epoch": 0.2808395760453468, + "grad_norm": 0.166015625, + "learning_rate": 0.001558086037578699, + "loss": 0.084, + "step": 32353 + }, + { + "epoch": 0.280848256525551, + "grad_norm": 0.205078125, + "learning_rate": 0.0015580604185289333, + "loss": 0.1216, + "step": 32354 + }, + { + "epoch": 0.28085693700575515, + "grad_norm": 0.193359375, + "learning_rate": 0.0015580347989782397, + "loss": 0.1099, + "step": 32355 + }, + { + "epoch": 0.28086561748595934, + "grad_norm": 0.72265625, + "learning_rate": 0.001558009178926646, + "loss": 0.1074, + "step": 32356 + }, + { + "epoch": 0.2808742979661635, + "grad_norm": 0.408203125, + "learning_rate": 0.0015579835583741806, + "loss": 0.106, + "step": 32357 + }, + { + "epoch": 0.28088297844636767, + "grad_norm": 0.279296875, + "learning_rate": 0.0015579579373208713, + "loss": 0.1064, + "step": 32358 + }, + { + "epoch": 0.2808916589265718, + "grad_norm": 0.78515625, + "learning_rate": 0.0015579323157667456, + "loss": 0.1494, + "step": 32359 + }, + { + "epoch": 0.280900339406776, + "grad_norm": 0.27734375, + "learning_rate": 0.0015579066937118323, + "loss": 0.0918, + "step": 32360 + }, + { + "epoch": 0.28090901988698014, + "grad_norm": 0.255859375, + "learning_rate": 0.001557881071156159, + "loss": 0.0947, + "step": 32361 + }, + { + "epoch": 0.28091770036718433, + "grad_norm": 
0.68359375, + "learning_rate": 0.001557855448099754, + "loss": 0.1074, + "step": 32362 + }, + { + "epoch": 0.28092638084738847, + "grad_norm": 0.8984375, + "learning_rate": 0.0015578298245426453, + "loss": 0.1108, + "step": 32363 + }, + { + "epoch": 0.28093506132759266, + "grad_norm": 0.1435546875, + "learning_rate": 0.0015578042004848604, + "loss": 0.1094, + "step": 32364 + }, + { + "epoch": 0.2809437418077968, + "grad_norm": 0.57421875, + "learning_rate": 0.001557778575926428, + "loss": 0.1338, + "step": 32365 + }, + { + "epoch": 0.280952422288001, + "grad_norm": 2.390625, + "learning_rate": 0.0015577529508673759, + "loss": 0.2715, + "step": 32366 + }, + { + "epoch": 0.28096110276820513, + "grad_norm": 0.30078125, + "learning_rate": 0.0015577273253077318, + "loss": 0.0928, + "step": 32367 + }, + { + "epoch": 0.2809697832484093, + "grad_norm": 1.296875, + "learning_rate": 0.001557701699247524, + "loss": 0.0908, + "step": 32368 + }, + { + "epoch": 0.28097846372861346, + "grad_norm": 0.5390625, + "learning_rate": 0.0015576760726867802, + "loss": 0.1074, + "step": 32369 + }, + { + "epoch": 0.28098714420881765, + "grad_norm": 0.205078125, + "learning_rate": 0.0015576504456255291, + "loss": 0.0908, + "step": 32370 + }, + { + "epoch": 0.2809958246890218, + "grad_norm": 0.2392578125, + "learning_rate": 0.0015576248180637984, + "loss": 0.1016, + "step": 32371 + }, + { + "epoch": 0.281004505169226, + "grad_norm": 0.408203125, + "learning_rate": 0.001557599190001616, + "loss": 0.1416, + "step": 32372 + }, + { + "epoch": 0.2810131856494301, + "grad_norm": 0.1962890625, + "learning_rate": 0.00155757356143901, + "loss": 0.0942, + "step": 32373 + }, + { + "epoch": 0.2810218661296343, + "grad_norm": 0.88671875, + "learning_rate": 0.0015575479323760085, + "loss": 0.1328, + "step": 32374 + }, + { + "epoch": 0.28103054660983845, + "grad_norm": 0.7265625, + "learning_rate": 0.0015575223028126395, + "loss": 0.106, + "step": 32375 + }, + { + "epoch": 0.28103922709004264, + "grad_norm": 0.1806640625, + "learning_rate": 0.0015574966727489306, + "loss": 0.1011, + "step": 32376 + }, + { + "epoch": 0.2810479075702468, + "grad_norm": 0.34375, + "learning_rate": 0.0015574710421849105, + "loss": 0.0977, + "step": 32377 + }, + { + "epoch": 0.281056588050451, + "grad_norm": 0.419921875, + "learning_rate": 0.001557445411120607, + "loss": 0.1143, + "step": 32378 + }, + { + "epoch": 0.2810652685306551, + "grad_norm": 0.1962890625, + "learning_rate": 0.0015574197795560482, + "loss": 0.1035, + "step": 32379 + }, + { + "epoch": 0.2810739490108593, + "grad_norm": 0.228515625, + "learning_rate": 0.0015573941474912613, + "loss": 0.085, + "step": 32380 + }, + { + "epoch": 0.28108262949106344, + "grad_norm": 0.1484375, + "learning_rate": 0.0015573685149262756, + "loss": 0.103, + "step": 32381 + }, + { + "epoch": 0.28109130997126763, + "grad_norm": 0.72265625, + "learning_rate": 0.0015573428818611186, + "loss": 0.1699, + "step": 32382 + }, + { + "epoch": 0.28109999045147177, + "grad_norm": 0.396484375, + "learning_rate": 0.0015573172482958183, + "loss": 0.0952, + "step": 32383 + }, + { + "epoch": 0.28110867093167596, + "grad_norm": 0.14453125, + "learning_rate": 0.0015572916142304026, + "loss": 0.125, + "step": 32384 + }, + { + "epoch": 0.2811173514118801, + "grad_norm": 0.68359375, + "learning_rate": 0.0015572659796648995, + "loss": 0.0732, + "step": 32385 + }, + { + "epoch": 0.2811260318920843, + "grad_norm": 0.30078125, + "learning_rate": 0.0015572403445993372, + "loss": 0.1011, + "step": 32386 + }, + { + "epoch": 
0.28113471237228843, + "grad_norm": 0.4453125, + "learning_rate": 0.0015572147090337444, + "loss": 0.0947, + "step": 32387 + }, + { + "epoch": 0.2811433928524926, + "grad_norm": 0.345703125, + "learning_rate": 0.0015571890729681476, + "loss": 0.1152, + "step": 32388 + }, + { + "epoch": 0.28115207333269676, + "grad_norm": 0.189453125, + "learning_rate": 0.001557163436402576, + "loss": 0.0889, + "step": 32389 + }, + { + "epoch": 0.28116075381290095, + "grad_norm": 0.1650390625, + "learning_rate": 0.0015571377993370574, + "loss": 0.0947, + "step": 32390 + }, + { + "epoch": 0.2811694342931051, + "grad_norm": 0.205078125, + "learning_rate": 0.00155711216177162, + "loss": 0.1348, + "step": 32391 + }, + { + "epoch": 0.2811781147733093, + "grad_norm": 0.1162109375, + "learning_rate": 0.0015570865237062913, + "loss": 0.1201, + "step": 32392 + }, + { + "epoch": 0.2811867952535134, + "grad_norm": 0.287109375, + "learning_rate": 0.0015570608851410996, + "loss": 0.0752, + "step": 32393 + }, + { + "epoch": 0.2811954757337176, + "grad_norm": 0.263671875, + "learning_rate": 0.0015570352460760734, + "loss": 0.0938, + "step": 32394 + }, + { + "epoch": 0.28120415621392175, + "grad_norm": 0.404296875, + "learning_rate": 0.0015570096065112397, + "loss": 0.0991, + "step": 32395 + }, + { + "epoch": 0.28121283669412594, + "grad_norm": 0.07666015625, + "learning_rate": 0.0015569839664466277, + "loss": 0.0859, + "step": 32396 + }, + { + "epoch": 0.2812215171743301, + "grad_norm": 0.158203125, + "learning_rate": 0.0015569583258822643, + "loss": 0.084, + "step": 32397 + }, + { + "epoch": 0.2812301976545343, + "grad_norm": 0.1181640625, + "learning_rate": 0.0015569326848181787, + "loss": 0.127, + "step": 32398 + }, + { + "epoch": 0.2812388781347384, + "grad_norm": 0.376953125, + "learning_rate": 0.001556907043254398, + "loss": 0.127, + "step": 32399 + }, + { + "epoch": 0.2812475586149426, + "grad_norm": 0.427734375, + "learning_rate": 0.0015568814011909509, + "loss": 0.0796, + "step": 32400 + }, + { + "epoch": 0.28125623909514674, + "grad_norm": 0.2197265625, + "learning_rate": 0.0015568557586278654, + "loss": 0.0889, + "step": 32401 + }, + { + "epoch": 0.28126491957535094, + "grad_norm": 0.0888671875, + "learning_rate": 0.0015568301155651687, + "loss": 0.104, + "step": 32402 + }, + { + "epoch": 0.2812736000555551, + "grad_norm": 0.859375, + "learning_rate": 0.0015568044720028899, + "loss": 0.1699, + "step": 32403 + }, + { + "epoch": 0.28128228053575927, + "grad_norm": 0.1396484375, + "learning_rate": 0.0015567788279410566, + "loss": 0.0884, + "step": 32404 + }, + { + "epoch": 0.2812909610159634, + "grad_norm": 0.578125, + "learning_rate": 0.0015567531833796965, + "loss": 0.1133, + "step": 32405 + }, + { + "epoch": 0.28129964149616754, + "grad_norm": 0.27734375, + "learning_rate": 0.001556727538318838, + "loss": 0.1006, + "step": 32406 + }, + { + "epoch": 0.28130832197637173, + "grad_norm": 0.32421875, + "learning_rate": 0.0015567018927585094, + "loss": 0.0903, + "step": 32407 + }, + { + "epoch": 0.28131700245657587, + "grad_norm": 0.10986328125, + "learning_rate": 0.0015566762466987386, + "loss": 0.0947, + "step": 32408 + }, + { + "epoch": 0.28132568293678006, + "grad_norm": 0.365234375, + "learning_rate": 0.001556650600139553, + "loss": 0.1211, + "step": 32409 + }, + { + "epoch": 0.2813343634169842, + "grad_norm": 0.333984375, + "learning_rate": 0.0015566249530809815, + "loss": 0.0835, + "step": 32410 + }, + { + "epoch": 0.2813430438971884, + "grad_norm": 0.220703125, + "learning_rate": 0.0015565993055230518, + 
"loss": 0.0898, + "step": 32411 + }, + { + "epoch": 0.28135172437739253, + "grad_norm": 0.130859375, + "learning_rate": 0.0015565736574657918, + "loss": 0.1064, + "step": 32412 + }, + { + "epoch": 0.2813604048575967, + "grad_norm": 0.73828125, + "learning_rate": 0.00155654800890923, + "loss": 0.0884, + "step": 32413 + }, + { + "epoch": 0.28136908533780086, + "grad_norm": 0.1474609375, + "learning_rate": 0.001556522359853394, + "loss": 0.1846, + "step": 32414 + }, + { + "epoch": 0.28137776581800505, + "grad_norm": 0.359375, + "learning_rate": 0.0015564967102983118, + "loss": 0.1309, + "step": 32415 + }, + { + "epoch": 0.2813864462982092, + "grad_norm": 0.546875, + "learning_rate": 0.0015564710602440118, + "loss": 0.1094, + "step": 32416 + }, + { + "epoch": 0.2813951267784134, + "grad_norm": 0.306640625, + "learning_rate": 0.0015564454096905218, + "loss": 0.1006, + "step": 32417 + }, + { + "epoch": 0.2814038072586175, + "grad_norm": 0.45703125, + "learning_rate": 0.0015564197586378703, + "loss": 0.1113, + "step": 32418 + }, + { + "epoch": 0.2814124877388217, + "grad_norm": 0.91015625, + "learning_rate": 0.0015563941070860848, + "loss": 0.1777, + "step": 32419 + }, + { + "epoch": 0.28142116821902585, + "grad_norm": 0.216796875, + "learning_rate": 0.0015563684550351936, + "loss": 0.0811, + "step": 32420 + }, + { + "epoch": 0.28142984869923005, + "grad_norm": 0.09765625, + "learning_rate": 0.0015563428024852246, + "loss": 0.125, + "step": 32421 + }, + { + "epoch": 0.2814385291794342, + "grad_norm": 0.1875, + "learning_rate": 0.001556317149436206, + "loss": 0.1084, + "step": 32422 + }, + { + "epoch": 0.2814472096596384, + "grad_norm": 0.1162109375, + "learning_rate": 0.0015562914958881663, + "loss": 0.1387, + "step": 32423 + }, + { + "epoch": 0.2814558901398425, + "grad_norm": 0.091796875, + "learning_rate": 0.0015562658418411324, + "loss": 0.0894, + "step": 32424 + }, + { + "epoch": 0.2814645706200467, + "grad_norm": 0.25390625, + "learning_rate": 0.0015562401872951331, + "loss": 0.1162, + "step": 32425 + }, + { + "epoch": 0.28147325110025084, + "grad_norm": 0.5546875, + "learning_rate": 0.0015562145322501966, + "loss": 0.0923, + "step": 32426 + }, + { + "epoch": 0.28148193158045504, + "grad_norm": 0.72265625, + "learning_rate": 0.001556188876706351, + "loss": 0.1084, + "step": 32427 + }, + { + "epoch": 0.2814906120606592, + "grad_norm": 0.95703125, + "learning_rate": 0.0015561632206636234, + "loss": 0.0879, + "step": 32428 + }, + { + "epoch": 0.28149929254086337, + "grad_norm": 0.1591796875, + "learning_rate": 0.0015561375641220433, + "loss": 0.0742, + "step": 32429 + }, + { + "epoch": 0.2815079730210675, + "grad_norm": 0.8203125, + "learning_rate": 0.0015561119070816374, + "loss": 0.0996, + "step": 32430 + }, + { + "epoch": 0.2815166535012717, + "grad_norm": 0.236328125, + "learning_rate": 0.0015560862495424348, + "loss": 0.1387, + "step": 32431 + }, + { + "epoch": 0.28152533398147583, + "grad_norm": 0.33984375, + "learning_rate": 0.0015560605915044632, + "loss": 0.0947, + "step": 32432 + }, + { + "epoch": 0.28153401446168, + "grad_norm": 1.1015625, + "learning_rate": 0.0015560349329677503, + "loss": 0.1582, + "step": 32433 + }, + { + "epoch": 0.28154269494188416, + "grad_norm": 0.11669921875, + "learning_rate": 0.0015560092739323244, + "loss": 0.1406, + "step": 32434 + }, + { + "epoch": 0.28155137542208836, + "grad_norm": 0.2333984375, + "learning_rate": 0.001555983614398214, + "loss": 0.0957, + "step": 32435 + }, + { + "epoch": 0.2815600559022925, + "grad_norm": 0.263671875, + 
"learning_rate": 0.001555957954365446, + "loss": 0.0986, + "step": 32436 + }, + { + "epoch": 0.2815687363824967, + "grad_norm": 0.1357421875, + "learning_rate": 0.0015559322938340499, + "loss": 0.0791, + "step": 32437 + }, + { + "epoch": 0.2815774168627008, + "grad_norm": 0.1181640625, + "learning_rate": 0.001555906632804053, + "loss": 0.1211, + "step": 32438 + }, + { + "epoch": 0.281586097342905, + "grad_norm": 0.42578125, + "learning_rate": 0.0015558809712754834, + "loss": 0.1069, + "step": 32439 + }, + { + "epoch": 0.28159477782310915, + "grad_norm": 0.328125, + "learning_rate": 0.0015558553092483694, + "loss": 0.0874, + "step": 32440 + }, + { + "epoch": 0.28160345830331335, + "grad_norm": 0.1484375, + "learning_rate": 0.0015558296467227384, + "loss": 0.085, + "step": 32441 + }, + { + "epoch": 0.2816121387835175, + "grad_norm": 1.078125, + "learning_rate": 0.0015558039836986192, + "loss": 0.126, + "step": 32442 + }, + { + "epoch": 0.2816208192637217, + "grad_norm": 0.296875, + "learning_rate": 0.0015557783201760398, + "loss": 0.0796, + "step": 32443 + }, + { + "epoch": 0.2816294997439258, + "grad_norm": 0.345703125, + "learning_rate": 0.0015557526561550282, + "loss": 0.1035, + "step": 32444 + }, + { + "epoch": 0.28163818022413, + "grad_norm": 0.18359375, + "learning_rate": 0.0015557269916356116, + "loss": 0.0938, + "step": 32445 + }, + { + "epoch": 0.28164686070433415, + "grad_norm": 0.41015625, + "learning_rate": 0.0015557013266178192, + "loss": 0.1006, + "step": 32446 + }, + { + "epoch": 0.28165554118453834, + "grad_norm": 0.30859375, + "learning_rate": 0.001555675661101679, + "loss": 0.1406, + "step": 32447 + }, + { + "epoch": 0.2816642216647425, + "grad_norm": 0.0791015625, + "learning_rate": 0.0015556499950872184, + "loss": 0.0854, + "step": 32448 + }, + { + "epoch": 0.28167290214494667, + "grad_norm": 0.69140625, + "learning_rate": 0.001555624328574466, + "loss": 0.1201, + "step": 32449 + }, + { + "epoch": 0.2816815826251508, + "grad_norm": 0.345703125, + "learning_rate": 0.0015555986615634495, + "loss": 0.0703, + "step": 32450 + }, + { + "epoch": 0.281690263105355, + "grad_norm": 0.36328125, + "learning_rate": 0.001555572994054197, + "loss": 0.1064, + "step": 32451 + }, + { + "epoch": 0.28169894358555914, + "grad_norm": 0.0986328125, + "learning_rate": 0.0015555473260467371, + "loss": 0.1143, + "step": 32452 + }, + { + "epoch": 0.28170762406576333, + "grad_norm": 1.125, + "learning_rate": 0.0015555216575410973, + "loss": 0.0967, + "step": 32453 + }, + { + "epoch": 0.28171630454596747, + "grad_norm": 0.2275390625, + "learning_rate": 0.001555495988537306, + "loss": 0.1143, + "step": 32454 + }, + { + "epoch": 0.28172498502617166, + "grad_norm": 0.1376953125, + "learning_rate": 0.0015554703190353912, + "loss": 0.1182, + "step": 32455 + }, + { + "epoch": 0.2817336655063758, + "grad_norm": 0.130859375, + "learning_rate": 0.0015554446490353804, + "loss": 0.0737, + "step": 32456 + }, + { + "epoch": 0.28174234598658, + "grad_norm": 0.10693359375, + "learning_rate": 0.0015554189785373027, + "loss": 0.1416, + "step": 32457 + }, + { + "epoch": 0.2817510264667841, + "grad_norm": 0.119140625, + "learning_rate": 0.0015553933075411854, + "loss": 0.123, + "step": 32458 + }, + { + "epoch": 0.2817597069469883, + "grad_norm": 0.47265625, + "learning_rate": 0.0015553676360470568, + "loss": 0.1074, + "step": 32459 + }, + { + "epoch": 0.28176838742719246, + "grad_norm": 0.208984375, + "learning_rate": 0.001555341964054945, + "loss": 0.1211, + "step": 32460 + }, + { + "epoch": 0.28177706790739665, + 
"grad_norm": 0.37109375, + "learning_rate": 0.0015553162915648785, + "loss": 0.0869, + "step": 32461 + }, + { + "epoch": 0.2817857483876008, + "grad_norm": 0.1787109375, + "learning_rate": 0.0015552906185768846, + "loss": 0.1631, + "step": 32462 + }, + { + "epoch": 0.281794428867805, + "grad_norm": 0.1513671875, + "learning_rate": 0.0015552649450909918, + "loss": 0.1211, + "step": 32463 + }, + { + "epoch": 0.2818031093480091, + "grad_norm": 0.71875, + "learning_rate": 0.001555239271107228, + "loss": 0.1221, + "step": 32464 + }, + { + "epoch": 0.2818117898282133, + "grad_norm": 0.5390625, + "learning_rate": 0.001555213596625621, + "loss": 0.0723, + "step": 32465 + }, + { + "epoch": 0.28182047030841745, + "grad_norm": 0.5078125, + "learning_rate": 0.0015551879216461997, + "loss": 0.0952, + "step": 32466 + }, + { + "epoch": 0.28182915078862164, + "grad_norm": 0.123046875, + "learning_rate": 0.0015551622461689913, + "loss": 0.0967, + "step": 32467 + }, + { + "epoch": 0.2818378312688258, + "grad_norm": 0.369140625, + "learning_rate": 0.001555136570194025, + "loss": 0.1299, + "step": 32468 + }, + { + "epoch": 0.28184651174902997, + "grad_norm": 0.177734375, + "learning_rate": 0.0015551108937213276, + "loss": 0.1396, + "step": 32469 + }, + { + "epoch": 0.2818551922292341, + "grad_norm": 0.341796875, + "learning_rate": 0.001555085216750928, + "loss": 0.1084, + "step": 32470 + }, + { + "epoch": 0.2818638727094383, + "grad_norm": 0.09033203125, + "learning_rate": 0.0015550595392828544, + "loss": 0.1162, + "step": 32471 + }, + { + "epoch": 0.28187255318964244, + "grad_norm": 0.625, + "learning_rate": 0.001555033861317134, + "loss": 0.1123, + "step": 32472 + }, + { + "epoch": 0.28188123366984663, + "grad_norm": 0.330078125, + "learning_rate": 0.0015550081828537958, + "loss": 0.127, + "step": 32473 + }, + { + "epoch": 0.28188991415005077, + "grad_norm": 0.494140625, + "learning_rate": 0.0015549825038928668, + "loss": 0.0894, + "step": 32474 + }, + { + "epoch": 0.28189859463025496, + "grad_norm": 0.435546875, + "learning_rate": 0.0015549568244343766, + "loss": 0.1465, + "step": 32475 + }, + { + "epoch": 0.2819072751104591, + "grad_norm": 0.1357421875, + "learning_rate": 0.001554931144478352, + "loss": 0.1035, + "step": 32476 + }, + { + "epoch": 0.2819159555906633, + "grad_norm": 0.2470703125, + "learning_rate": 0.0015549054640248215, + "loss": 0.1436, + "step": 32477 + }, + { + "epoch": 0.28192463607086743, + "grad_norm": 0.171875, + "learning_rate": 0.0015548797830738136, + "loss": 0.1123, + "step": 32478 + }, + { + "epoch": 0.2819333165510716, + "grad_norm": 0.294921875, + "learning_rate": 0.0015548541016253557, + "loss": 0.1328, + "step": 32479 + }, + { + "epoch": 0.28194199703127576, + "grad_norm": 0.384765625, + "learning_rate": 0.001554828419679476, + "loss": 0.1118, + "step": 32480 + }, + { + "epoch": 0.28195067751147995, + "grad_norm": 0.50390625, + "learning_rate": 0.0015548027372362032, + "loss": 0.1055, + "step": 32481 + }, + { + "epoch": 0.2819593579916841, + "grad_norm": 0.447265625, + "learning_rate": 0.001554777054295565, + "loss": 0.0947, + "step": 32482 + }, + { + "epoch": 0.2819680384718883, + "grad_norm": 0.283203125, + "learning_rate": 0.001554751370857589, + "loss": 0.1089, + "step": 32483 + }, + { + "epoch": 0.2819767189520924, + "grad_norm": 0.341796875, + "learning_rate": 0.001554725686922304, + "loss": 0.1328, + "step": 32484 + }, + { + "epoch": 0.2819853994322966, + "grad_norm": 0.294921875, + "learning_rate": 0.0015547000024897378, + "loss": 0.1035, + "step": 32485 + }, + { + 
"epoch": 0.28199407991250075, + "grad_norm": 0.068359375, + "learning_rate": 0.0015546743175599185, + "loss": 0.0957, + "step": 32486 + }, + { + "epoch": 0.28200276039270494, + "grad_norm": 0.08740234375, + "learning_rate": 0.001554648632132874, + "loss": 0.126, + "step": 32487 + }, + { + "epoch": 0.2820114408729091, + "grad_norm": 0.2578125, + "learning_rate": 0.001554622946208633, + "loss": 0.1631, + "step": 32488 + }, + { + "epoch": 0.2820201213531133, + "grad_norm": 0.09423828125, + "learning_rate": 0.0015545972597872226, + "loss": 0.1406, + "step": 32489 + }, + { + "epoch": 0.2820288018333174, + "grad_norm": 0.1259765625, + "learning_rate": 0.001554571572868672, + "loss": 0.1475, + "step": 32490 + }, + { + "epoch": 0.2820374823135216, + "grad_norm": 0.087890625, + "learning_rate": 0.0015545458854530087, + "loss": 0.0991, + "step": 32491 + }, + { + "epoch": 0.28204616279372574, + "grad_norm": 0.1904296875, + "learning_rate": 0.0015545201975402608, + "loss": 0.1211, + "step": 32492 + }, + { + "epoch": 0.28205484327392993, + "grad_norm": 0.201171875, + "learning_rate": 0.0015544945091304561, + "loss": 0.1055, + "step": 32493 + }, + { + "epoch": 0.28206352375413407, + "grad_norm": 0.404296875, + "learning_rate": 0.0015544688202236232, + "loss": 0.1406, + "step": 32494 + }, + { + "epoch": 0.28207220423433826, + "grad_norm": 0.875, + "learning_rate": 0.00155444313081979, + "loss": 0.1162, + "step": 32495 + }, + { + "epoch": 0.2820808847145424, + "grad_norm": 0.322265625, + "learning_rate": 0.0015544174409189851, + "loss": 0.1104, + "step": 32496 + }, + { + "epoch": 0.2820895651947466, + "grad_norm": 0.3984375, + "learning_rate": 0.0015543917505212355, + "loss": 0.123, + "step": 32497 + }, + { + "epoch": 0.28209824567495073, + "grad_norm": 0.53125, + "learning_rate": 0.0015543660596265703, + "loss": 0.1299, + "step": 32498 + }, + { + "epoch": 0.2821069261551549, + "grad_norm": 0.1103515625, + "learning_rate": 0.0015543403682350173, + "loss": 0.1211, + "step": 32499 + }, + { + "epoch": 0.28211560663535906, + "grad_norm": 0.5390625, + "learning_rate": 0.0015543146763466044, + "loss": 0.1104, + "step": 32500 + }, + { + "epoch": 0.28212428711556325, + "grad_norm": 0.427734375, + "learning_rate": 0.0015542889839613595, + "loss": 0.1094, + "step": 32501 + }, + { + "epoch": 0.2821329675957674, + "grad_norm": 0.09912109375, + "learning_rate": 0.001554263291079311, + "loss": 0.1045, + "step": 32502 + }, + { + "epoch": 0.2821416480759716, + "grad_norm": 0.0888671875, + "learning_rate": 0.0015542375977004872, + "loss": 0.0967, + "step": 32503 + }, + { + "epoch": 0.2821503285561757, + "grad_norm": 0.181640625, + "learning_rate": 0.0015542119038249157, + "loss": 0.1074, + "step": 32504 + }, + { + "epoch": 0.2821590090363799, + "grad_norm": 0.1162109375, + "learning_rate": 0.0015541862094526255, + "loss": 0.0752, + "step": 32505 + }, + { + "epoch": 0.28216768951658405, + "grad_norm": 0.337890625, + "learning_rate": 0.0015541605145836435, + "loss": 0.1191, + "step": 32506 + }, + { + "epoch": 0.28217636999678825, + "grad_norm": 0.1806640625, + "learning_rate": 0.0015541348192179985, + "loss": 0.105, + "step": 32507 + }, + { + "epoch": 0.2821850504769924, + "grad_norm": 0.3203125, + "learning_rate": 0.0015541091233557186, + "loss": 0.1001, + "step": 32508 + }, + { + "epoch": 0.2821937309571966, + "grad_norm": 0.32421875, + "learning_rate": 0.0015540834269968317, + "loss": 0.0879, + "step": 32509 + }, + { + "epoch": 0.2822024114374007, + "grad_norm": 0.12109375, + "learning_rate": 0.0015540577301413662, + 
"loss": 0.1836, + "step": 32510 + }, + { + "epoch": 0.2822110919176049, + "grad_norm": 0.0908203125, + "learning_rate": 0.0015540320327893497, + "loss": 0.0918, + "step": 32511 + }, + { + "epoch": 0.28221977239780904, + "grad_norm": 0.3671875, + "learning_rate": 0.001554006334940811, + "loss": 0.1641, + "step": 32512 + }, + { + "epoch": 0.28222845287801324, + "grad_norm": 0.203125, + "learning_rate": 0.0015539806365957773, + "loss": 0.1074, + "step": 32513 + }, + { + "epoch": 0.2822371333582174, + "grad_norm": 0.357421875, + "learning_rate": 0.0015539549377542776, + "loss": 0.126, + "step": 32514 + }, + { + "epoch": 0.28224581383842157, + "grad_norm": 0.09765625, + "learning_rate": 0.0015539292384163393, + "loss": 0.0767, + "step": 32515 + }, + { + "epoch": 0.2822544943186257, + "grad_norm": 0.0908203125, + "learning_rate": 0.001553903538581991, + "loss": 0.125, + "step": 32516 + }, + { + "epoch": 0.2822631747988299, + "grad_norm": 0.375, + "learning_rate": 0.0015538778382512607, + "loss": 0.1045, + "step": 32517 + }, + { + "epoch": 0.28227185527903403, + "grad_norm": 0.52734375, + "learning_rate": 0.001553852137424176, + "loss": 0.1035, + "step": 32518 + }, + { + "epoch": 0.2822805357592382, + "grad_norm": 0.25390625, + "learning_rate": 0.0015538264361007659, + "loss": 0.1289, + "step": 32519 + }, + { + "epoch": 0.28228921623944236, + "grad_norm": 0.28515625, + "learning_rate": 0.0015538007342810575, + "loss": 0.1074, + "step": 32520 + }, + { + "epoch": 0.28229789671964656, + "grad_norm": 0.5546875, + "learning_rate": 0.0015537750319650797, + "loss": 0.0776, + "step": 32521 + }, + { + "epoch": 0.2823065771998507, + "grad_norm": 0.11572265625, + "learning_rate": 0.0015537493291528607, + "loss": 0.124, + "step": 32522 + }, + { + "epoch": 0.2823152576800549, + "grad_norm": 0.10986328125, + "learning_rate": 0.0015537236258444278, + "loss": 0.1523, + "step": 32523 + }, + { + "epoch": 0.282323938160259, + "grad_norm": 0.130859375, + "learning_rate": 0.0015536979220398095, + "loss": 0.0908, + "step": 32524 + }, + { + "epoch": 0.2823326186404632, + "grad_norm": 0.3671875, + "learning_rate": 0.0015536722177390344, + "loss": 0.0918, + "step": 32525 + }, + { + "epoch": 0.28234129912066736, + "grad_norm": 0.15625, + "learning_rate": 0.00155364651294213, + "loss": 0.0732, + "step": 32526 + }, + { + "epoch": 0.28234997960087155, + "grad_norm": 0.1162109375, + "learning_rate": 0.0015536208076491246, + "loss": 0.0767, + "step": 32527 + }, + { + "epoch": 0.2823586600810757, + "grad_norm": 0.3671875, + "learning_rate": 0.001553595101860046, + "loss": 0.1035, + "step": 32528 + }, + { + "epoch": 0.2823673405612798, + "grad_norm": 0.546875, + "learning_rate": 0.001553569395574923, + "loss": 0.1001, + "step": 32529 + }, + { + "epoch": 0.282376021041484, + "grad_norm": 0.1826171875, + "learning_rate": 0.0015535436887937832, + "loss": 0.0957, + "step": 32530 + }, + { + "epoch": 0.28238470152168815, + "grad_norm": 0.08984375, + "learning_rate": 0.0015535179815166548, + "loss": 0.1055, + "step": 32531 + }, + { + "epoch": 0.28239338200189235, + "grad_norm": 0.228515625, + "learning_rate": 0.0015534922737435659, + "loss": 0.1123, + "step": 32532 + }, + { + "epoch": 0.2824020624820965, + "grad_norm": 0.1884765625, + "learning_rate": 0.0015534665654745447, + "loss": 0.1279, + "step": 32533 + }, + { + "epoch": 0.2824107429623007, + "grad_norm": 0.1337890625, + "learning_rate": 0.0015534408567096192, + "loss": 0.105, + "step": 32534 + }, + { + "epoch": 0.2824194234425048, + "grad_norm": 0.609375, + "learning_rate": 
0.0015534151474488175, + "loss": 0.1035, + "step": 32535 + }, + { + "epoch": 0.282428103922709, + "grad_norm": 0.1162109375, + "learning_rate": 0.001553389437692168, + "loss": 0.1021, + "step": 32536 + }, + { + "epoch": 0.28243678440291314, + "grad_norm": 0.1328125, + "learning_rate": 0.0015533637274396989, + "loss": 0.1074, + "step": 32537 + }, + { + "epoch": 0.28244546488311734, + "grad_norm": 0.70703125, + "learning_rate": 0.0015533380166914378, + "loss": 0.0898, + "step": 32538 + }, + { + "epoch": 0.2824541453633215, + "grad_norm": 0.06298828125, + "learning_rate": 0.001553312305447413, + "loss": 0.064, + "step": 32539 + }, + { + "epoch": 0.28246282584352567, + "grad_norm": 0.55078125, + "learning_rate": 0.0015532865937076526, + "loss": 0.0923, + "step": 32540 + }, + { + "epoch": 0.2824715063237298, + "grad_norm": 0.5703125, + "learning_rate": 0.0015532608814721848, + "loss": 0.0864, + "step": 32541 + }, + { + "epoch": 0.282480186803934, + "grad_norm": 0.1962890625, + "learning_rate": 0.0015532351687410375, + "loss": 0.1699, + "step": 32542 + }, + { + "epoch": 0.28248886728413813, + "grad_norm": 0.0908203125, + "learning_rate": 0.0015532094555142394, + "loss": 0.0957, + "step": 32543 + }, + { + "epoch": 0.2824975477643423, + "grad_norm": 0.474609375, + "learning_rate": 0.0015531837417918184, + "loss": 0.0894, + "step": 32544 + }, + { + "epoch": 0.28250622824454646, + "grad_norm": 0.1650390625, + "learning_rate": 0.001553158027573802, + "loss": 0.1289, + "step": 32545 + }, + { + "epoch": 0.28251490872475066, + "grad_norm": 0.42578125, + "learning_rate": 0.001553132312860219, + "loss": 0.0986, + "step": 32546 + }, + { + "epoch": 0.2825235892049548, + "grad_norm": 0.15625, + "learning_rate": 0.0015531065976510977, + "loss": 0.1279, + "step": 32547 + }, + { + "epoch": 0.282532269685159, + "grad_norm": 0.1328125, + "learning_rate": 0.0015530808819464652, + "loss": 0.1328, + "step": 32548 + }, + { + "epoch": 0.2825409501653631, + "grad_norm": 0.51171875, + "learning_rate": 0.0015530551657463507, + "loss": 0.1279, + "step": 32549 + }, + { + "epoch": 0.2825496306455673, + "grad_norm": 0.11767578125, + "learning_rate": 0.0015530294490507816, + "loss": 0.0962, + "step": 32550 + }, + { + "epoch": 0.28255831112577146, + "grad_norm": 0.236328125, + "learning_rate": 0.0015530037318597866, + "loss": 0.1025, + "step": 32551 + }, + { + "epoch": 0.28256699160597565, + "grad_norm": 0.302734375, + "learning_rate": 0.0015529780141733931, + "loss": 0.1074, + "step": 32552 + }, + { + "epoch": 0.2825756720861798, + "grad_norm": 0.267578125, + "learning_rate": 0.00155295229599163, + "loss": 0.0791, + "step": 32553 + }, + { + "epoch": 0.282584352566384, + "grad_norm": 0.1787109375, + "learning_rate": 0.0015529265773145248, + "loss": 0.0835, + "step": 32554 + }, + { + "epoch": 0.2825930330465881, + "grad_norm": 0.3046875, + "learning_rate": 0.001552900858142106, + "loss": 0.0957, + "step": 32555 + }, + { + "epoch": 0.2826017135267923, + "grad_norm": 0.244140625, + "learning_rate": 0.0015528751384744019, + "loss": 0.1396, + "step": 32556 + }, + { + "epoch": 0.28261039400699645, + "grad_norm": 0.1669921875, + "learning_rate": 0.0015528494183114402, + "loss": 0.1118, + "step": 32557 + }, + { + "epoch": 0.28261907448720064, + "grad_norm": 0.224609375, + "learning_rate": 0.0015528236976532488, + "loss": 0.0928, + "step": 32558 + }, + { + "epoch": 0.2826277549674048, + "grad_norm": 0.181640625, + "learning_rate": 0.0015527979764998565, + "loss": 0.0918, + "step": 32559 + }, + { + "epoch": 0.28263643544760897, + 
"grad_norm": 0.10791015625, + "learning_rate": 0.0015527722548512916, + "loss": 0.1025, + "step": 32560 + }, + { + "epoch": 0.2826451159278131, + "grad_norm": 0.1416015625, + "learning_rate": 0.001552746532707581, + "loss": 0.1211, + "step": 32561 + }, + { + "epoch": 0.2826537964080173, + "grad_norm": 0.74609375, + "learning_rate": 0.0015527208100687539, + "loss": 0.0786, + "step": 32562 + }, + { + "epoch": 0.28266247688822144, + "grad_norm": 0.51171875, + "learning_rate": 0.0015526950869348382, + "loss": 0.1099, + "step": 32563 + }, + { + "epoch": 0.28267115736842563, + "grad_norm": 0.357421875, + "learning_rate": 0.0015526693633058618, + "loss": 0.1094, + "step": 32564 + }, + { + "epoch": 0.28267983784862977, + "grad_norm": 0.6796875, + "learning_rate": 0.0015526436391818533, + "loss": 0.0879, + "step": 32565 + }, + { + "epoch": 0.28268851832883396, + "grad_norm": 0.447265625, + "learning_rate": 0.0015526179145628402, + "loss": 0.1045, + "step": 32566 + }, + { + "epoch": 0.2826971988090381, + "grad_norm": 0.640625, + "learning_rate": 0.001552592189448851, + "loss": 0.0928, + "step": 32567 + }, + { + "epoch": 0.2827058792892423, + "grad_norm": 0.205078125, + "learning_rate": 0.0015525664638399138, + "loss": 0.0952, + "step": 32568 + }, + { + "epoch": 0.2827145597694464, + "grad_norm": 0.61328125, + "learning_rate": 0.001552540737736057, + "loss": 0.1191, + "step": 32569 + }, + { + "epoch": 0.2827232402496506, + "grad_norm": 0.10009765625, + "learning_rate": 0.001552515011137308, + "loss": 0.0918, + "step": 32570 + }, + { + "epoch": 0.28273192072985476, + "grad_norm": 0.84765625, + "learning_rate": 0.0015524892840436952, + "loss": 0.1465, + "step": 32571 + }, + { + "epoch": 0.28274060121005895, + "grad_norm": 0.474609375, + "learning_rate": 0.0015524635564552474, + "loss": 0.0957, + "step": 32572 + }, + { + "epoch": 0.2827492816902631, + "grad_norm": 0.498046875, + "learning_rate": 0.0015524378283719921, + "loss": 0.0864, + "step": 32573 + }, + { + "epoch": 0.2827579621704673, + "grad_norm": 0.11279296875, + "learning_rate": 0.0015524120997939578, + "loss": 0.127, + "step": 32574 + }, + { + "epoch": 0.2827666426506714, + "grad_norm": 0.42578125, + "learning_rate": 0.0015523863707211723, + "loss": 0.1235, + "step": 32575 + }, + { + "epoch": 0.2827753231308756, + "grad_norm": 0.71875, + "learning_rate": 0.0015523606411536636, + "loss": 0.1221, + "step": 32576 + }, + { + "epoch": 0.28278400361107975, + "grad_norm": 0.310546875, + "learning_rate": 0.0015523349110914606, + "loss": 0.0889, + "step": 32577 + }, + { + "epoch": 0.28279268409128394, + "grad_norm": 0.146484375, + "learning_rate": 0.0015523091805345905, + "loss": 0.1172, + "step": 32578 + }, + { + "epoch": 0.2828013645714881, + "grad_norm": 0.41796875, + "learning_rate": 0.0015522834494830823, + "loss": 0.083, + "step": 32579 + }, + { + "epoch": 0.28281004505169227, + "grad_norm": 0.1962890625, + "learning_rate": 0.0015522577179369632, + "loss": 0.0781, + "step": 32580 + }, + { + "epoch": 0.2828187255318964, + "grad_norm": 0.3125, + "learning_rate": 0.001552231985896262, + "loss": 0.085, + "step": 32581 + }, + { + "epoch": 0.2828274060121006, + "grad_norm": 0.08837890625, + "learning_rate": 0.001552206253361007, + "loss": 0.0938, + "step": 32582 + }, + { + "epoch": 0.28283608649230474, + "grad_norm": 0.08740234375, + "learning_rate": 0.0015521805203312258, + "loss": 0.1104, + "step": 32583 + }, + { + "epoch": 0.28284476697250893, + "grad_norm": 0.3359375, + "learning_rate": 0.0015521547868069467, + "loss": 0.1035, + "step": 32584 + }, 
+ { + "epoch": 0.28285344745271307, + "grad_norm": 0.63671875, + "learning_rate": 0.0015521290527881983, + "loss": 0.0913, + "step": 32585 + }, + { + "epoch": 0.28286212793291726, + "grad_norm": 0.4375, + "learning_rate": 0.001552103318275008, + "loss": 0.0996, + "step": 32586 + }, + { + "epoch": 0.2828708084131214, + "grad_norm": 0.09375, + "learning_rate": 0.0015520775832674044, + "loss": 0.0879, + "step": 32587 + }, + { + "epoch": 0.2828794888933256, + "grad_norm": 0.23046875, + "learning_rate": 0.0015520518477654152, + "loss": 0.0781, + "step": 32588 + }, + { + "epoch": 0.28288816937352973, + "grad_norm": 0.357421875, + "learning_rate": 0.0015520261117690694, + "loss": 0.0879, + "step": 32589 + }, + { + "epoch": 0.2828968498537339, + "grad_norm": 0.765625, + "learning_rate": 0.0015520003752783945, + "loss": 0.1465, + "step": 32590 + }, + { + "epoch": 0.28290553033393806, + "grad_norm": 0.515625, + "learning_rate": 0.0015519746382934188, + "loss": 0.1357, + "step": 32591 + }, + { + "epoch": 0.28291421081414225, + "grad_norm": 0.1845703125, + "learning_rate": 0.0015519489008141705, + "loss": 0.1191, + "step": 32592 + }, + { + "epoch": 0.2829228912943464, + "grad_norm": 0.42578125, + "learning_rate": 0.0015519231628406771, + "loss": 0.1836, + "step": 32593 + }, + { + "epoch": 0.2829315717745506, + "grad_norm": 0.60546875, + "learning_rate": 0.001551897424372968, + "loss": 0.1152, + "step": 32594 + }, + { + "epoch": 0.2829402522547547, + "grad_norm": 0.50390625, + "learning_rate": 0.0015518716854110706, + "loss": 0.1128, + "step": 32595 + }, + { + "epoch": 0.2829489327349589, + "grad_norm": 0.330078125, + "learning_rate": 0.001551845945955013, + "loss": 0.1348, + "step": 32596 + }, + { + "epoch": 0.28295761321516305, + "grad_norm": 0.078125, + "learning_rate": 0.0015518202060048232, + "loss": 0.0898, + "step": 32597 + }, + { + "epoch": 0.28296629369536724, + "grad_norm": 0.1650390625, + "learning_rate": 0.0015517944655605298, + "loss": 0.1021, + "step": 32598 + }, + { + "epoch": 0.2829749741755714, + "grad_norm": 0.109375, + "learning_rate": 0.0015517687246221607, + "loss": 0.0908, + "step": 32599 + }, + { + "epoch": 0.2829836546557756, + "grad_norm": 0.53125, + "learning_rate": 0.0015517429831897443, + "loss": 0.166, + "step": 32600 + }, + { + "epoch": 0.2829923351359797, + "grad_norm": 0.142578125, + "learning_rate": 0.0015517172412633084, + "loss": 0.1455, + "step": 32601 + }, + { + "epoch": 0.2830010156161839, + "grad_norm": 0.279296875, + "learning_rate": 0.0015516914988428815, + "loss": 0.0762, + "step": 32602 + }, + { + "epoch": 0.28300969609638804, + "grad_norm": 0.1875, + "learning_rate": 0.0015516657559284913, + "loss": 0.1484, + "step": 32603 + }, + { + "epoch": 0.28301837657659223, + "grad_norm": 0.373046875, + "learning_rate": 0.0015516400125201665, + "loss": 0.0947, + "step": 32604 + }, + { + "epoch": 0.28302705705679637, + "grad_norm": 0.1904296875, + "learning_rate": 0.001551614268617935, + "loss": 0.0811, + "step": 32605 + }, + { + "epoch": 0.28303573753700056, + "grad_norm": 0.28125, + "learning_rate": 0.0015515885242218244, + "loss": 0.1504, + "step": 32606 + }, + { + "epoch": 0.2830444180172047, + "grad_norm": 0.3515625, + "learning_rate": 0.0015515627793318636, + "loss": 0.0923, + "step": 32607 + }, + { + "epoch": 0.2830530984974089, + "grad_norm": 0.15234375, + "learning_rate": 0.0015515370339480809, + "loss": 0.123, + "step": 32608 + }, + { + "epoch": 0.28306177897761303, + "grad_norm": 0.251953125, + "learning_rate": 0.0015515112880705039, + "loss": 0.0869, + 
"step": 32609 + }, + { + "epoch": 0.2830704594578172, + "grad_norm": 0.1728515625, + "learning_rate": 0.0015514855416991604, + "loss": 0.0791, + "step": 32610 + }, + { + "epoch": 0.28307913993802136, + "grad_norm": 0.10205078125, + "learning_rate": 0.0015514597948340799, + "loss": 0.1064, + "step": 32611 + }, + { + "epoch": 0.28308782041822556, + "grad_norm": 0.2392578125, + "learning_rate": 0.0015514340474752893, + "loss": 0.124, + "step": 32612 + }, + { + "epoch": 0.2830965008984297, + "grad_norm": 0.158203125, + "learning_rate": 0.0015514082996228174, + "loss": 0.0854, + "step": 32613 + }, + { + "epoch": 0.2831051813786339, + "grad_norm": 0.244140625, + "learning_rate": 0.0015513825512766919, + "loss": 0.1182, + "step": 32614 + }, + { + "epoch": 0.283113861858838, + "grad_norm": 0.6484375, + "learning_rate": 0.0015513568024369414, + "loss": 0.1963, + "step": 32615 + }, + { + "epoch": 0.2831225423390422, + "grad_norm": 0.1806640625, + "learning_rate": 0.001551331053103594, + "loss": 0.0703, + "step": 32616 + }, + { + "epoch": 0.28313122281924635, + "grad_norm": 0.1875, + "learning_rate": 0.0015513053032766776, + "loss": 0.0957, + "step": 32617 + }, + { + "epoch": 0.28313990329945055, + "grad_norm": 0.1669921875, + "learning_rate": 0.0015512795529562206, + "loss": 0.0977, + "step": 32618 + }, + { + "epoch": 0.2831485837796547, + "grad_norm": 0.64453125, + "learning_rate": 0.001551253802142251, + "loss": 0.0981, + "step": 32619 + }, + { + "epoch": 0.2831572642598589, + "grad_norm": 0.283203125, + "learning_rate": 0.0015512280508347969, + "loss": 0.1006, + "step": 32620 + }, + { + "epoch": 0.283165944740063, + "grad_norm": 0.875, + "learning_rate": 0.0015512022990338869, + "loss": 0.1416, + "step": 32621 + }, + { + "epoch": 0.2831746252202672, + "grad_norm": 0.75390625, + "learning_rate": 0.0015511765467395486, + "loss": 0.1079, + "step": 32622 + }, + { + "epoch": 0.28318330570047134, + "grad_norm": 0.271484375, + "learning_rate": 0.0015511507939518104, + "loss": 0.1074, + "step": 32623 + }, + { + "epoch": 0.28319198618067554, + "grad_norm": 0.279296875, + "learning_rate": 0.0015511250406707005, + "loss": 0.1025, + "step": 32624 + }, + { + "epoch": 0.2832006666608797, + "grad_norm": 0.11376953125, + "learning_rate": 0.0015510992868962472, + "loss": 0.1035, + "step": 32625 + }, + { + "epoch": 0.28320934714108387, + "grad_norm": 0.11767578125, + "learning_rate": 0.0015510735326284785, + "loss": 0.1045, + "step": 32626 + }, + { + "epoch": 0.283218027621288, + "grad_norm": 0.232421875, + "learning_rate": 0.0015510477778674222, + "loss": 0.0894, + "step": 32627 + }, + { + "epoch": 0.2832267081014922, + "grad_norm": 0.1201171875, + "learning_rate": 0.001551022022613107, + "loss": 0.1211, + "step": 32628 + }, + { + "epoch": 0.28323538858169633, + "grad_norm": 0.1142578125, + "learning_rate": 0.001550996266865561, + "loss": 0.0874, + "step": 32629 + }, + { + "epoch": 0.2832440690619005, + "grad_norm": 0.470703125, + "learning_rate": 0.001550970510624812, + "loss": 0.126, + "step": 32630 + }, + { + "epoch": 0.28325274954210466, + "grad_norm": 0.1328125, + "learning_rate": 0.0015509447538908886, + "loss": 0.1279, + "step": 32631 + }, + { + "epoch": 0.28326143002230886, + "grad_norm": 0.1669921875, + "learning_rate": 0.0015509189966638186, + "loss": 0.1396, + "step": 32632 + }, + { + "epoch": 0.283270110502513, + "grad_norm": 0.1796875, + "learning_rate": 0.0015508932389436307, + "loss": 0.1055, + "step": 32633 + }, + { + "epoch": 0.2832787909827172, + "grad_norm": 0.60546875, + "learning_rate": 
0.001550867480730353, + "loss": 0.1162, + "step": 32634 + }, + { + "epoch": 0.2832874714629213, + "grad_norm": 0.66015625, + "learning_rate": 0.001550841722024013, + "loss": 0.1162, + "step": 32635 + }, + { + "epoch": 0.2832961519431255, + "grad_norm": 0.10986328125, + "learning_rate": 0.001550815962824639, + "loss": 0.0786, + "step": 32636 + }, + { + "epoch": 0.28330483242332966, + "grad_norm": 0.1337890625, + "learning_rate": 0.0015507902031322597, + "loss": 0.1055, + "step": 32637 + }, + { + "epoch": 0.28331351290353385, + "grad_norm": 0.33984375, + "learning_rate": 0.001550764442946903, + "loss": 0.1084, + "step": 32638 + }, + { + "epoch": 0.283322193383738, + "grad_norm": 0.65234375, + "learning_rate": 0.0015507386822685972, + "loss": 0.1099, + "step": 32639 + }, + { + "epoch": 0.2833308738639422, + "grad_norm": 0.73828125, + "learning_rate": 0.00155071292109737, + "loss": 0.0977, + "step": 32640 + }, + { + "epoch": 0.2833395543441463, + "grad_norm": 0.07470703125, + "learning_rate": 0.0015506871594332503, + "loss": 0.082, + "step": 32641 + }, + { + "epoch": 0.2833482348243505, + "grad_norm": 0.380859375, + "learning_rate": 0.0015506613972762655, + "loss": 0.1021, + "step": 32642 + }, + { + "epoch": 0.28335691530455465, + "grad_norm": 0.375, + "learning_rate": 0.0015506356346264448, + "loss": 0.0918, + "step": 32643 + }, + { + "epoch": 0.28336559578475884, + "grad_norm": 0.388671875, + "learning_rate": 0.0015506098714838151, + "loss": 0.127, + "step": 32644 + }, + { + "epoch": 0.283374276264963, + "grad_norm": 0.1669921875, + "learning_rate": 0.0015505841078484058, + "loss": 0.1133, + "step": 32645 + }, + { + "epoch": 0.28338295674516717, + "grad_norm": 0.10107421875, + "learning_rate": 0.001550558343720244, + "loss": 0.126, + "step": 32646 + }, + { + "epoch": 0.2833916372253713, + "grad_norm": 0.76171875, + "learning_rate": 0.0015505325790993584, + "loss": 0.0747, + "step": 32647 + }, + { + "epoch": 0.2834003177055755, + "grad_norm": 0.287109375, + "learning_rate": 0.0015505068139857773, + "loss": 0.106, + "step": 32648 + }, + { + "epoch": 0.28340899818577964, + "grad_norm": 0.357421875, + "learning_rate": 0.0015504810483795287, + "loss": 0.0884, + "step": 32649 + }, + { + "epoch": 0.28341767866598383, + "grad_norm": 0.26171875, + "learning_rate": 0.0015504552822806405, + "loss": 0.1201, + "step": 32650 + }, + { + "epoch": 0.28342635914618797, + "grad_norm": 0.32421875, + "learning_rate": 0.0015504295156891419, + "loss": 0.0967, + "step": 32651 + }, + { + "epoch": 0.2834350396263921, + "grad_norm": 0.2734375, + "learning_rate": 0.0015504037486050598, + "loss": 0.0957, + "step": 32652 + }, + { + "epoch": 0.2834437201065963, + "grad_norm": 0.078125, + "learning_rate": 0.001550377981028423, + "loss": 0.0874, + "step": 32653 + }, + { + "epoch": 0.28345240058680043, + "grad_norm": 0.08935546875, + "learning_rate": 0.0015503522129592595, + "loss": 0.1064, + "step": 32654 + }, + { + "epoch": 0.2834610810670046, + "grad_norm": 0.25390625, + "learning_rate": 0.0015503264443975978, + "loss": 0.1079, + "step": 32655 + }, + { + "epoch": 0.28346976154720877, + "grad_norm": 0.11962890625, + "learning_rate": 0.0015503006753434657, + "loss": 0.0889, + "step": 32656 + }, + { + "epoch": 0.28347844202741296, + "grad_norm": 0.1611328125, + "learning_rate": 0.0015502749057968916, + "loss": 0.1084, + "step": 32657 + }, + { + "epoch": 0.2834871225076171, + "grad_norm": 0.0810546875, + "learning_rate": 0.0015502491357579037, + "loss": 0.1089, + "step": 32658 + }, + { + "epoch": 0.2834958029878213, + 
"grad_norm": 0.10498046875, + "learning_rate": 0.00155022336522653, + "loss": 0.0874, + "step": 32659 + }, + { + "epoch": 0.2835044834680254, + "grad_norm": 0.578125, + "learning_rate": 0.001550197594202799, + "loss": 0.1162, + "step": 32660 + }, + { + "epoch": 0.2835131639482296, + "grad_norm": 0.130859375, + "learning_rate": 0.0015501718226867388, + "loss": 0.1484, + "step": 32661 + }, + { + "epoch": 0.28352184442843376, + "grad_norm": 0.2138671875, + "learning_rate": 0.001550146050678377, + "loss": 0.0957, + "step": 32662 + }, + { + "epoch": 0.28353052490863795, + "grad_norm": 0.1923828125, + "learning_rate": 0.0015501202781777424, + "loss": 0.0781, + "step": 32663 + }, + { + "epoch": 0.2835392053888421, + "grad_norm": 0.271484375, + "learning_rate": 0.001550094505184863, + "loss": 0.1123, + "step": 32664 + }, + { + "epoch": 0.2835478858690463, + "grad_norm": 0.25390625, + "learning_rate": 0.0015500687316997673, + "loss": 0.0938, + "step": 32665 + }, + { + "epoch": 0.2835565663492504, + "grad_norm": 0.392578125, + "learning_rate": 0.0015500429577224829, + "loss": 0.1416, + "step": 32666 + }, + { + "epoch": 0.2835652468294546, + "grad_norm": 0.07470703125, + "learning_rate": 0.001550017183253038, + "loss": 0.0859, + "step": 32667 + }, + { + "epoch": 0.28357392730965875, + "grad_norm": 0.8515625, + "learning_rate": 0.0015499914082914617, + "loss": 0.1152, + "step": 32668 + }, + { + "epoch": 0.28358260778986294, + "grad_norm": 1.3046875, + "learning_rate": 0.001549965632837781, + "loss": 0.1953, + "step": 32669 + }, + { + "epoch": 0.2835912882700671, + "grad_norm": 0.1513671875, + "learning_rate": 0.001549939856892025, + "loss": 0.1152, + "step": 32670 + }, + { + "epoch": 0.28359996875027127, + "grad_norm": 0.08203125, + "learning_rate": 0.0015499140804542215, + "loss": 0.1025, + "step": 32671 + }, + { + "epoch": 0.2836086492304754, + "grad_norm": 0.30078125, + "learning_rate": 0.0015498883035243987, + "loss": 0.1064, + "step": 32672 + }, + { + "epoch": 0.2836173297106796, + "grad_norm": 0.1689453125, + "learning_rate": 0.0015498625261025848, + "loss": 0.0854, + "step": 32673 + }, + { + "epoch": 0.28362601019088374, + "grad_norm": 0.1328125, + "learning_rate": 0.0015498367481888082, + "loss": 0.0981, + "step": 32674 + }, + { + "epoch": 0.28363469067108793, + "grad_norm": 0.12353515625, + "learning_rate": 0.0015498109697830967, + "loss": 0.1143, + "step": 32675 + }, + { + "epoch": 0.28364337115129207, + "grad_norm": 0.71875, + "learning_rate": 0.0015497851908854786, + "loss": 0.123, + "step": 32676 + }, + { + "epoch": 0.28365205163149626, + "grad_norm": 0.185546875, + "learning_rate": 0.0015497594114959822, + "loss": 0.0928, + "step": 32677 + }, + { + "epoch": 0.2836607321117004, + "grad_norm": 0.6953125, + "learning_rate": 0.0015497336316146358, + "loss": 0.1123, + "step": 32678 + }, + { + "epoch": 0.2836694125919046, + "grad_norm": 0.193359375, + "learning_rate": 0.001549707851241467, + "loss": 0.126, + "step": 32679 + }, + { + "epoch": 0.28367809307210873, + "grad_norm": 0.271484375, + "learning_rate": 0.001549682070376505, + "loss": 0.0913, + "step": 32680 + }, + { + "epoch": 0.2836867735523129, + "grad_norm": 0.326171875, + "learning_rate": 0.0015496562890197772, + "loss": 0.084, + "step": 32681 + }, + { + "epoch": 0.28369545403251706, + "grad_norm": 0.306640625, + "learning_rate": 0.001549630507171312, + "loss": 0.0942, + "step": 32682 + }, + { + "epoch": 0.28370413451272125, + "grad_norm": 0.0947265625, + "learning_rate": 0.001549604724831138, + "loss": 0.1113, + "step": 32683 + }, + 
{ + "epoch": 0.2837128149929254, + "grad_norm": 0.28125, + "learning_rate": 0.0015495789419992826, + "loss": 0.0947, + "step": 32684 + }, + { + "epoch": 0.2837214954731296, + "grad_norm": 0.2265625, + "learning_rate": 0.0015495531586757747, + "loss": 0.1152, + "step": 32685 + }, + { + "epoch": 0.2837301759533337, + "grad_norm": 0.40234375, + "learning_rate": 0.001549527374860642, + "loss": 0.1309, + "step": 32686 + }, + { + "epoch": 0.2837388564335379, + "grad_norm": 0.123046875, + "learning_rate": 0.001549501590553913, + "loss": 0.105, + "step": 32687 + }, + { + "epoch": 0.28374753691374205, + "grad_norm": 0.1943359375, + "learning_rate": 0.0015494758057556157, + "loss": 0.0928, + "step": 32688 + }, + { + "epoch": 0.28375621739394624, + "grad_norm": 0.41015625, + "learning_rate": 0.0015494500204657786, + "loss": 0.1094, + "step": 32689 + }, + { + "epoch": 0.2837648978741504, + "grad_norm": 0.2255859375, + "learning_rate": 0.0015494242346844299, + "loss": 0.124, + "step": 32690 + }, + { + "epoch": 0.28377357835435457, + "grad_norm": 0.640625, + "learning_rate": 0.0015493984484115974, + "loss": 0.0986, + "step": 32691 + }, + { + "epoch": 0.2837822588345587, + "grad_norm": 0.4609375, + "learning_rate": 0.0015493726616473092, + "loss": 0.0625, + "step": 32692 + }, + { + "epoch": 0.2837909393147629, + "grad_norm": 0.74609375, + "learning_rate": 0.0015493468743915941, + "loss": 0.1133, + "step": 32693 + }, + { + "epoch": 0.28379961979496704, + "grad_norm": 0.73046875, + "learning_rate": 0.0015493210866444802, + "loss": 0.0967, + "step": 32694 + }, + { + "epoch": 0.28380830027517123, + "grad_norm": 0.265625, + "learning_rate": 0.0015492952984059952, + "loss": 0.1152, + "step": 32695 + }, + { + "epoch": 0.28381698075537537, + "grad_norm": 0.1513671875, + "learning_rate": 0.0015492695096761678, + "loss": 0.1133, + "step": 32696 + }, + { + "epoch": 0.28382566123557956, + "grad_norm": 0.92578125, + "learning_rate": 0.0015492437204550256, + "loss": 0.1318, + "step": 32697 + }, + { + "epoch": 0.2838343417157837, + "grad_norm": 0.181640625, + "learning_rate": 0.0015492179307425979, + "loss": 0.1318, + "step": 32698 + }, + { + "epoch": 0.2838430221959879, + "grad_norm": 0.294921875, + "learning_rate": 0.0015491921405389117, + "loss": 0.1113, + "step": 32699 + }, + { + "epoch": 0.28385170267619203, + "grad_norm": 0.5546875, + "learning_rate": 0.0015491663498439958, + "loss": 0.1128, + "step": 32700 + }, + { + "epoch": 0.2838603831563962, + "grad_norm": 0.44140625, + "learning_rate": 0.0015491405586578787, + "loss": 0.0791, + "step": 32701 + }, + { + "epoch": 0.28386906363660036, + "grad_norm": 0.3125, + "learning_rate": 0.0015491147669805879, + "loss": 0.0801, + "step": 32702 + }, + { + "epoch": 0.28387774411680455, + "grad_norm": 0.22265625, + "learning_rate": 0.001549088974812152, + "loss": 0.1338, + "step": 32703 + }, + { + "epoch": 0.2838864245970087, + "grad_norm": 0.5234375, + "learning_rate": 0.001549063182152599, + "loss": 0.1172, + "step": 32704 + }, + { + "epoch": 0.2838951050772129, + "grad_norm": 0.498046875, + "learning_rate": 0.0015490373890019577, + "loss": 0.1367, + "step": 32705 + }, + { + "epoch": 0.283903785557417, + "grad_norm": 0.74609375, + "learning_rate": 0.0015490115953602553, + "loss": 0.126, + "step": 32706 + }, + { + "epoch": 0.2839124660376212, + "grad_norm": 0.65625, + "learning_rate": 0.0015489858012275209, + "loss": 0.0898, + "step": 32707 + }, + { + "epoch": 0.28392114651782535, + "grad_norm": 0.31640625, + "learning_rate": 0.0015489600066037822, + "loss": 0.127, + 
"step": 32708 + }, + { + "epoch": 0.28392982699802954, + "grad_norm": 0.1806640625, + "learning_rate": 0.0015489342114890677, + "loss": 0.1162, + "step": 32709 + }, + { + "epoch": 0.2839385074782337, + "grad_norm": 0.2373046875, + "learning_rate": 0.0015489084158834055, + "loss": 0.1172, + "step": 32710 + }, + { + "epoch": 0.2839471879584379, + "grad_norm": 0.181640625, + "learning_rate": 0.0015488826197868238, + "loss": 0.1504, + "step": 32711 + }, + { + "epoch": 0.283955868438642, + "grad_norm": 0.265625, + "learning_rate": 0.0015488568231993506, + "loss": 0.0942, + "step": 32712 + }, + { + "epoch": 0.2839645489188462, + "grad_norm": 0.1748046875, + "learning_rate": 0.0015488310261210147, + "loss": 0.1152, + "step": 32713 + }, + { + "epoch": 0.28397322939905034, + "grad_norm": 0.318359375, + "learning_rate": 0.0015488052285518438, + "loss": 0.123, + "step": 32714 + }, + { + "epoch": 0.28398190987925453, + "grad_norm": 0.357421875, + "learning_rate": 0.0015487794304918658, + "loss": 0.0898, + "step": 32715 + }, + { + "epoch": 0.28399059035945867, + "grad_norm": 0.1669921875, + "learning_rate": 0.00154875363194111, + "loss": 0.1426, + "step": 32716 + }, + { + "epoch": 0.28399927083966287, + "grad_norm": 0.21875, + "learning_rate": 0.0015487278328996034, + "loss": 0.1094, + "step": 32717 + }, + { + "epoch": 0.284007951319867, + "grad_norm": 0.1474609375, + "learning_rate": 0.0015487020333673752, + "loss": 0.1045, + "step": 32718 + }, + { + "epoch": 0.2840166318000712, + "grad_norm": 0.16796875, + "learning_rate": 0.001548676233344453, + "loss": 0.0835, + "step": 32719 + }, + { + "epoch": 0.28402531228027533, + "grad_norm": 0.1572265625, + "learning_rate": 0.0015486504328308655, + "loss": 0.0708, + "step": 32720 + }, + { + "epoch": 0.2840339927604795, + "grad_norm": 0.22265625, + "learning_rate": 0.0015486246318266405, + "loss": 0.1094, + "step": 32721 + }, + { + "epoch": 0.28404267324068366, + "grad_norm": 0.30078125, + "learning_rate": 0.0015485988303318061, + "loss": 0.0957, + "step": 32722 + }, + { + "epoch": 0.28405135372088786, + "grad_norm": 0.33203125, + "learning_rate": 0.0015485730283463911, + "loss": 0.0752, + "step": 32723 + }, + { + "epoch": 0.284060034201092, + "grad_norm": 0.1259765625, + "learning_rate": 0.001548547225870423, + "loss": 0.1211, + "step": 32724 + }, + { + "epoch": 0.2840687146812962, + "grad_norm": 0.1865234375, + "learning_rate": 0.0015485214229039306, + "loss": 0.1216, + "step": 32725 + }, + { + "epoch": 0.2840773951615003, + "grad_norm": 0.6171875, + "learning_rate": 0.0015484956194469419, + "loss": 0.1094, + "step": 32726 + }, + { + "epoch": 0.2840860756417045, + "grad_norm": 0.166015625, + "learning_rate": 0.0015484698154994852, + "loss": 0.1084, + "step": 32727 + }, + { + "epoch": 0.28409475612190865, + "grad_norm": 0.3515625, + "learning_rate": 0.0015484440110615887, + "loss": 0.1533, + "step": 32728 + }, + { + "epoch": 0.28410343660211285, + "grad_norm": 0.099609375, + "learning_rate": 0.0015484182061332803, + "loss": 0.1396, + "step": 32729 + }, + { + "epoch": 0.284112117082317, + "grad_norm": 0.2353515625, + "learning_rate": 0.001548392400714589, + "loss": 0.1416, + "step": 32730 + }, + { + "epoch": 0.2841207975625212, + "grad_norm": 0.51171875, + "learning_rate": 0.0015483665948055416, + "loss": 0.123, + "step": 32731 + }, + { + "epoch": 0.2841294780427253, + "grad_norm": 0.37890625, + "learning_rate": 0.001548340788406168, + "loss": 0.104, + "step": 32732 + }, + { + "epoch": 0.2841381585229295, + "grad_norm": 0.31640625, + "learning_rate": 
0.0015483149815164953, + "loss": 0.1367, + "step": 32733 + }, + { + "epoch": 0.28414683900313364, + "grad_norm": 0.287109375, + "learning_rate": 0.0015482891741365524, + "loss": 0.1064, + "step": 32734 + }, + { + "epoch": 0.28415551948333784, + "grad_norm": 0.39453125, + "learning_rate": 0.0015482633662663669, + "loss": 0.123, + "step": 32735 + }, + { + "epoch": 0.284164199963542, + "grad_norm": 0.44921875, + "learning_rate": 0.0015482375579059674, + "loss": 0.1572, + "step": 32736 + }, + { + "epoch": 0.28417288044374617, + "grad_norm": 0.203125, + "learning_rate": 0.001548211749055382, + "loss": 0.0908, + "step": 32737 + }, + { + "epoch": 0.2841815609239503, + "grad_norm": 5.46875, + "learning_rate": 0.0015481859397146389, + "loss": 0.3379, + "step": 32738 + }, + { + "epoch": 0.2841902414041545, + "grad_norm": 0.220703125, + "learning_rate": 0.0015481601298837663, + "loss": 0.0684, + "step": 32739 + }, + { + "epoch": 0.28419892188435864, + "grad_norm": 0.2236328125, + "learning_rate": 0.001548134319562793, + "loss": 0.1162, + "step": 32740 + }, + { + "epoch": 0.28420760236456283, + "grad_norm": 0.205078125, + "learning_rate": 0.0015481085087517465, + "loss": 0.0928, + "step": 32741 + }, + { + "epoch": 0.28421628284476697, + "grad_norm": 0.1435546875, + "learning_rate": 0.0015480826974506555, + "loss": 0.0942, + "step": 32742 + }, + { + "epoch": 0.28422496332497116, + "grad_norm": 0.7578125, + "learning_rate": 0.0015480568856595473, + "loss": 0.1001, + "step": 32743 + }, + { + "epoch": 0.2842336438051753, + "grad_norm": 0.302734375, + "learning_rate": 0.0015480310733784514, + "loss": 0.0762, + "step": 32744 + }, + { + "epoch": 0.2842423242853795, + "grad_norm": 0.44140625, + "learning_rate": 0.0015480052606073953, + "loss": 0.1025, + "step": 32745 + }, + { + "epoch": 0.2842510047655836, + "grad_norm": 0.1328125, + "learning_rate": 0.0015479794473464074, + "loss": 0.1406, + "step": 32746 + }, + { + "epoch": 0.2842596852457878, + "grad_norm": 0.353515625, + "learning_rate": 0.0015479536335955158, + "loss": 0.1543, + "step": 32747 + }, + { + "epoch": 0.28426836572599196, + "grad_norm": 0.220703125, + "learning_rate": 0.0015479278193547487, + "loss": 0.0918, + "step": 32748 + }, + { + "epoch": 0.28427704620619615, + "grad_norm": 0.2392578125, + "learning_rate": 0.001547902004624135, + "loss": 0.1289, + "step": 32749 + }, + { + "epoch": 0.2842857266864003, + "grad_norm": 0.1650390625, + "learning_rate": 0.001547876189403702, + "loss": 0.1494, + "step": 32750 + }, + { + "epoch": 0.2842944071666045, + "grad_norm": 0.1943359375, + "learning_rate": 0.0015478503736934786, + "loss": 0.1328, + "step": 32751 + }, + { + "epoch": 0.2843030876468086, + "grad_norm": 0.20703125, + "learning_rate": 0.0015478245574934925, + "loss": 0.1709, + "step": 32752 + }, + { + "epoch": 0.2843117681270128, + "grad_norm": 0.435546875, + "learning_rate": 0.0015477987408037722, + "loss": 0.0854, + "step": 32753 + }, + { + "epoch": 0.28432044860721695, + "grad_norm": 0.30859375, + "learning_rate": 0.001547772923624346, + "loss": 0.082, + "step": 32754 + }, + { + "epoch": 0.28432912908742114, + "grad_norm": 0.81640625, + "learning_rate": 0.001547747105955242, + "loss": 0.1162, + "step": 32755 + }, + { + "epoch": 0.2843378095676253, + "grad_norm": 0.26953125, + "learning_rate": 0.0015477212877964888, + "loss": 0.0879, + "step": 32756 + }, + { + "epoch": 0.28434649004782947, + "grad_norm": 0.134765625, + "learning_rate": 0.0015476954691481138, + "loss": 0.0879, + "step": 32757 + }, + { + "epoch": 0.2843551705280336, + 
"grad_norm": 0.2216796875, + "learning_rate": 0.0015476696500101462, + "loss": 0.0791, + "step": 32758 + }, + { + "epoch": 0.2843638510082378, + "grad_norm": 0.1640625, + "learning_rate": 0.0015476438303826136, + "loss": 0.0918, + "step": 32759 + }, + { + "epoch": 0.28437253148844194, + "grad_norm": 0.828125, + "learning_rate": 0.0015476180102655444, + "loss": 0.1099, + "step": 32760 + }, + { + "epoch": 0.28438121196864613, + "grad_norm": 0.08642578125, + "learning_rate": 0.001547592189658967, + "loss": 0.1094, + "step": 32761 + }, + { + "epoch": 0.28438989244885027, + "grad_norm": 0.40234375, + "learning_rate": 0.0015475663685629096, + "loss": 0.0996, + "step": 32762 + }, + { + "epoch": 0.28439857292905446, + "grad_norm": 0.177734375, + "learning_rate": 0.0015475405469773998, + "loss": 0.1055, + "step": 32763 + }, + { + "epoch": 0.2844072534092586, + "grad_norm": 0.09619140625, + "learning_rate": 0.001547514724902467, + "loss": 0.1211, + "step": 32764 + }, + { + "epoch": 0.2844159338894628, + "grad_norm": 0.1279296875, + "learning_rate": 0.0015474889023381385, + "loss": 0.0801, + "step": 32765 + }, + { + "epoch": 0.28442461436966693, + "grad_norm": 0.1328125, + "learning_rate": 0.001547463079284443, + "loss": 0.1172, + "step": 32766 + }, + { + "epoch": 0.2844332948498711, + "grad_norm": 0.0927734375, + "learning_rate": 0.0015474372557414086, + "loss": 0.1035, + "step": 32767 + }, + { + "epoch": 0.28444197533007526, + "grad_norm": 0.255859375, + "learning_rate": 0.0015474114317090633, + "loss": 0.1035, + "step": 32768 + }, + { + "epoch": 0.28445065581027945, + "grad_norm": 0.64453125, + "learning_rate": 0.0015473856071874358, + "loss": 0.1123, + "step": 32769 + }, + { + "epoch": 0.2844593362904836, + "grad_norm": 0.09619140625, + "learning_rate": 0.0015473597821765539, + "loss": 0.1406, + "step": 32770 + }, + { + "epoch": 0.2844680167706878, + "grad_norm": 0.251953125, + "learning_rate": 0.0015473339566764465, + "loss": 0.1045, + "step": 32771 + }, + { + "epoch": 0.2844766972508919, + "grad_norm": 0.171875, + "learning_rate": 0.001547308130687141, + "loss": 0.0771, + "step": 32772 + }, + { + "epoch": 0.2844853777310961, + "grad_norm": 0.0693359375, + "learning_rate": 0.0015472823042086664, + "loss": 0.0811, + "step": 32773 + }, + { + "epoch": 0.28449405821130025, + "grad_norm": 0.6484375, + "learning_rate": 0.00154725647724105, + "loss": 0.1055, + "step": 32774 + }, + { + "epoch": 0.2845027386915044, + "grad_norm": 0.7265625, + "learning_rate": 0.001547230649784321, + "loss": 0.0967, + "step": 32775 + }, + { + "epoch": 0.2845114191717086, + "grad_norm": 0.166015625, + "learning_rate": 0.0015472048218385078, + "loss": 0.1328, + "step": 32776 + }, + { + "epoch": 0.2845200996519127, + "grad_norm": 0.134765625, + "learning_rate": 0.0015471789934036375, + "loss": 0.1104, + "step": 32777 + }, + { + "epoch": 0.2845287801321169, + "grad_norm": 0.271484375, + "learning_rate": 0.001547153164479739, + "loss": 0.103, + "step": 32778 + }, + { + "epoch": 0.28453746061232105, + "grad_norm": 1.4453125, + "learning_rate": 0.0015471273350668407, + "loss": 0.1523, + "step": 32779 + }, + { + "epoch": 0.28454614109252524, + "grad_norm": 0.19921875, + "learning_rate": 0.0015471015051649706, + "loss": 0.1064, + "step": 32780 + }, + { + "epoch": 0.2845548215727294, + "grad_norm": 0.1728515625, + "learning_rate": 0.001547075674774157, + "loss": 0.1025, + "step": 32781 + }, + { + "epoch": 0.28456350205293357, + "grad_norm": 0.13671875, + "learning_rate": 0.0015470498438944283, + "loss": 0.1328, + "step": 32782 + }, 
+ { + "epoch": 0.2845721825331377, + "grad_norm": 0.134765625, + "learning_rate": 0.0015470240125258121, + "loss": 0.0913, + "step": 32783 + }, + { + "epoch": 0.2845808630133419, + "grad_norm": 0.09033203125, + "learning_rate": 0.0015469981806683374, + "loss": 0.0898, + "step": 32784 + }, + { + "epoch": 0.28458954349354604, + "grad_norm": 0.06396484375, + "learning_rate": 0.0015469723483220326, + "loss": 0.0811, + "step": 32785 + }, + { + "epoch": 0.28459822397375023, + "grad_norm": 0.1484375, + "learning_rate": 0.0015469465154869255, + "loss": 0.1406, + "step": 32786 + }, + { + "epoch": 0.28460690445395437, + "grad_norm": 0.0693359375, + "learning_rate": 0.001546920682163044, + "loss": 0.1279, + "step": 32787 + }, + { + "epoch": 0.28461558493415856, + "grad_norm": 0.265625, + "learning_rate": 0.001546894848350417, + "loss": 0.0957, + "step": 32788 + }, + { + "epoch": 0.2846242654143627, + "grad_norm": 0.08203125, + "learning_rate": 0.0015468690140490726, + "loss": 0.0908, + "step": 32789 + }, + { + "epoch": 0.2846329458945669, + "grad_norm": 0.55859375, + "learning_rate": 0.0015468431792590388, + "loss": 0.103, + "step": 32790 + }, + { + "epoch": 0.28464162637477103, + "grad_norm": 0.255859375, + "learning_rate": 0.0015468173439803438, + "loss": 0.1045, + "step": 32791 + }, + { + "epoch": 0.2846503068549752, + "grad_norm": 0.416015625, + "learning_rate": 0.0015467915082130162, + "loss": 0.1035, + "step": 32792 + }, + { + "epoch": 0.28465898733517936, + "grad_norm": 0.197265625, + "learning_rate": 0.0015467656719570845, + "loss": 0.1162, + "step": 32793 + }, + { + "epoch": 0.28466766781538355, + "grad_norm": 0.70703125, + "learning_rate": 0.001546739835212576, + "loss": 0.2402, + "step": 32794 + }, + { + "epoch": 0.2846763482955877, + "grad_norm": 0.17578125, + "learning_rate": 0.0015467139979795201, + "loss": 0.0938, + "step": 32795 + }, + { + "epoch": 0.2846850287757919, + "grad_norm": 0.158203125, + "learning_rate": 0.0015466881602579441, + "loss": 0.1187, + "step": 32796 + }, + { + "epoch": 0.284693709255996, + "grad_norm": 0.244140625, + "learning_rate": 0.0015466623220478767, + "loss": 0.0918, + "step": 32797 + }, + { + "epoch": 0.2847023897362002, + "grad_norm": 0.671875, + "learning_rate": 0.0015466364833493463, + "loss": 0.125, + "step": 32798 + }, + { + "epoch": 0.28471107021640435, + "grad_norm": 0.2578125, + "learning_rate": 0.001546610644162381, + "loss": 0.127, + "step": 32799 + }, + { + "epoch": 0.28471975069660854, + "grad_norm": 0.3515625, + "learning_rate": 0.0015465848044870086, + "loss": 0.1465, + "step": 32800 + }, + { + "epoch": 0.2847284311768127, + "grad_norm": 0.84765625, + "learning_rate": 0.001546558964323258, + "loss": 0.1543, + "step": 32801 + }, + { + "epoch": 0.2847371116570169, + "grad_norm": 0.19921875, + "learning_rate": 0.001546533123671157, + "loss": 0.0835, + "step": 32802 + }, + { + "epoch": 0.284745792137221, + "grad_norm": 0.1708984375, + "learning_rate": 0.0015465072825307342, + "loss": 0.1147, + "step": 32803 + }, + { + "epoch": 0.2847544726174252, + "grad_norm": 0.75, + "learning_rate": 0.0015464814409020177, + "loss": 0.1152, + "step": 32804 + }, + { + "epoch": 0.28476315309762934, + "grad_norm": 0.12890625, + "learning_rate": 0.001546455598785036, + "loss": 0.1143, + "step": 32805 + }, + { + "epoch": 0.28477183357783353, + "grad_norm": 0.1376953125, + "learning_rate": 0.0015464297561798173, + "loss": 0.0947, + "step": 32806 + }, + { + "epoch": 0.28478051405803767, + "grad_norm": 0.1630859375, + "learning_rate": 0.0015464039130863897, + "loss": 
0.0918, + "step": 32807 + }, + { + "epoch": 0.28478919453824186, + "grad_norm": 0.73828125, + "learning_rate": 0.001546378069504781, + "loss": 0.1582, + "step": 32808 + }, + { + "epoch": 0.284797875018446, + "grad_norm": 0.35546875, + "learning_rate": 0.0015463522254350204, + "loss": 0.1191, + "step": 32809 + }, + { + "epoch": 0.2848065554986502, + "grad_norm": 0.404296875, + "learning_rate": 0.0015463263808771356, + "loss": 0.125, + "step": 32810 + }, + { + "epoch": 0.28481523597885433, + "grad_norm": 0.1015625, + "learning_rate": 0.0015463005358311549, + "loss": 0.0879, + "step": 32811 + }, + { + "epoch": 0.2848239164590585, + "grad_norm": 0.1328125, + "learning_rate": 0.0015462746902971068, + "loss": 0.082, + "step": 32812 + }, + { + "epoch": 0.28483259693926266, + "grad_norm": 0.2578125, + "learning_rate": 0.001546248844275019, + "loss": 0.0684, + "step": 32813 + }, + { + "epoch": 0.28484127741946685, + "grad_norm": 0.1474609375, + "learning_rate": 0.0015462229977649205, + "loss": 0.1094, + "step": 32814 + }, + { + "epoch": 0.284849957899671, + "grad_norm": 0.66796875, + "learning_rate": 0.0015461971507668394, + "loss": 0.1514, + "step": 32815 + }, + { + "epoch": 0.2848586383798752, + "grad_norm": 0.263671875, + "learning_rate": 0.0015461713032808035, + "loss": 0.1123, + "step": 32816 + }, + { + "epoch": 0.2848673188600793, + "grad_norm": 0.1884765625, + "learning_rate": 0.0015461454553068416, + "loss": 0.0957, + "step": 32817 + }, + { + "epoch": 0.2848759993402835, + "grad_norm": 0.18359375, + "learning_rate": 0.0015461196068449815, + "loss": 0.127, + "step": 32818 + }, + { + "epoch": 0.28488467982048765, + "grad_norm": 0.169921875, + "learning_rate": 0.0015460937578952519, + "loss": 0.1211, + "step": 32819 + }, + { + "epoch": 0.28489336030069184, + "grad_norm": 0.2109375, + "learning_rate": 0.0015460679084576805, + "loss": 0.1006, + "step": 32820 + }, + { + "epoch": 0.284902040780896, + "grad_norm": 0.09912109375, + "learning_rate": 0.0015460420585322963, + "loss": 0.083, + "step": 32821 + }, + { + "epoch": 0.2849107212611002, + "grad_norm": 0.251953125, + "learning_rate": 0.001546016208119127, + "loss": 0.1016, + "step": 32822 + }, + { + "epoch": 0.2849194017413043, + "grad_norm": 0.1728515625, + "learning_rate": 0.0015459903572182013, + "loss": 0.082, + "step": 32823 + }, + { + "epoch": 0.2849280822215085, + "grad_norm": 0.353515625, + "learning_rate": 0.0015459645058295472, + "loss": 0.1196, + "step": 32824 + }, + { + "epoch": 0.28493676270171264, + "grad_norm": 0.58203125, + "learning_rate": 0.001545938653953193, + "loss": 0.1289, + "step": 32825 + }, + { + "epoch": 0.28494544318191684, + "grad_norm": 0.46484375, + "learning_rate": 0.0015459128015891668, + "loss": 0.0854, + "step": 32826 + }, + { + "epoch": 0.284954123662121, + "grad_norm": 0.427734375, + "learning_rate": 0.0015458869487374972, + "loss": 0.0801, + "step": 32827 + }, + { + "epoch": 0.28496280414232517, + "grad_norm": 0.166015625, + "learning_rate": 0.0015458610953982125, + "loss": 0.1504, + "step": 32828 + }, + { + "epoch": 0.2849714846225293, + "grad_norm": 0.5703125, + "learning_rate": 0.0015458352415713404, + "loss": 0.1094, + "step": 32829 + }, + { + "epoch": 0.2849801651027335, + "grad_norm": 0.1455078125, + "learning_rate": 0.00154580938725691, + "loss": 0.0981, + "step": 32830 + }, + { + "epoch": 0.28498884558293763, + "grad_norm": 0.609375, + "learning_rate": 0.0015457835324549486, + "loss": 0.1055, + "step": 32831 + }, + { + "epoch": 0.2849975260631418, + "grad_norm": 0.37890625, + "learning_rate": 
0.0015457576771654858, + "loss": 0.1455, + "step": 32832 + }, + { + "epoch": 0.28500620654334596, + "grad_norm": 0.09765625, + "learning_rate": 0.0015457318213885486, + "loss": 0.0933, + "step": 32833 + }, + { + "epoch": 0.28501488702355016, + "grad_norm": 0.345703125, + "learning_rate": 0.0015457059651241658, + "loss": 0.1064, + "step": 32834 + }, + { + "epoch": 0.2850235675037543, + "grad_norm": 0.515625, + "learning_rate": 0.0015456801083723656, + "loss": 0.0762, + "step": 32835 + }, + { + "epoch": 0.2850322479839585, + "grad_norm": 0.51171875, + "learning_rate": 0.0015456542511331766, + "loss": 0.0908, + "step": 32836 + }, + { + "epoch": 0.2850409284641626, + "grad_norm": 0.73828125, + "learning_rate": 0.0015456283934066266, + "loss": 0.0952, + "step": 32837 + }, + { + "epoch": 0.2850496089443668, + "grad_norm": 0.396484375, + "learning_rate": 0.0015456025351927444, + "loss": 0.1289, + "step": 32838 + }, + { + "epoch": 0.28505828942457095, + "grad_norm": 0.404296875, + "learning_rate": 0.0015455766764915575, + "loss": 0.3066, + "step": 32839 + }, + { + "epoch": 0.28506696990477515, + "grad_norm": 0.09326171875, + "learning_rate": 0.0015455508173030947, + "loss": 0.0762, + "step": 32840 + }, + { + "epoch": 0.2850756503849793, + "grad_norm": 0.56640625, + "learning_rate": 0.0015455249576273845, + "loss": 0.1045, + "step": 32841 + }, + { + "epoch": 0.2850843308651835, + "grad_norm": 0.09716796875, + "learning_rate": 0.0015454990974644548, + "loss": 0.085, + "step": 32842 + }, + { + "epoch": 0.2850930113453876, + "grad_norm": 0.09765625, + "learning_rate": 0.0015454732368143339, + "loss": 0.1201, + "step": 32843 + }, + { + "epoch": 0.2851016918255918, + "grad_norm": 0.494140625, + "learning_rate": 0.0015454473756770502, + "loss": 0.123, + "step": 32844 + }, + { + "epoch": 0.28511037230579594, + "grad_norm": 0.095703125, + "learning_rate": 0.001545421514052632, + "loss": 0.104, + "step": 32845 + }, + { + "epoch": 0.28511905278600014, + "grad_norm": 1.875, + "learning_rate": 0.0015453956519411074, + "loss": 0.1973, + "step": 32846 + }, + { + "epoch": 0.2851277332662043, + "grad_norm": 0.33203125, + "learning_rate": 0.001545369789342505, + "loss": 0.0928, + "step": 32847 + }, + { + "epoch": 0.28513641374640847, + "grad_norm": 0.1953125, + "learning_rate": 0.0015453439262568527, + "loss": 0.0986, + "step": 32848 + }, + { + "epoch": 0.2851450942266126, + "grad_norm": 0.33984375, + "learning_rate": 0.001545318062684179, + "loss": 0.1406, + "step": 32849 + }, + { + "epoch": 0.2851537747068168, + "grad_norm": 0.66796875, + "learning_rate": 0.001545292198624512, + "loss": 0.1445, + "step": 32850 + }, + { + "epoch": 0.28516245518702094, + "grad_norm": 0.09033203125, + "learning_rate": 0.0015452663340778808, + "loss": 0.0845, + "step": 32851 + }, + { + "epoch": 0.28517113566722513, + "grad_norm": 0.119140625, + "learning_rate": 0.0015452404690443123, + "loss": 0.1162, + "step": 32852 + }, + { + "epoch": 0.28517981614742927, + "grad_norm": 0.4765625, + "learning_rate": 0.001545214603523836, + "loss": 0.0928, + "step": 32853 + }, + { + "epoch": 0.28518849662763346, + "grad_norm": 0.4765625, + "learning_rate": 0.0015451887375164792, + "loss": 0.1055, + "step": 32854 + }, + { + "epoch": 0.2851971771078376, + "grad_norm": 0.314453125, + "learning_rate": 0.0015451628710222711, + "loss": 0.1084, + "step": 32855 + }, + { + "epoch": 0.2852058575880418, + "grad_norm": 0.423828125, + "learning_rate": 0.0015451370040412396, + "loss": 0.1387, + "step": 32856 + }, + { + "epoch": 0.2852145380682459, + "grad_norm": 
0.259765625, + "learning_rate": 0.0015451111365734128, + "loss": 0.1133, + "step": 32857 + }, + { + "epoch": 0.2852232185484501, + "grad_norm": 0.380859375, + "learning_rate": 0.001545085268618819, + "loss": 0.0967, + "step": 32858 + }, + { + "epoch": 0.28523189902865426, + "grad_norm": 0.23828125, + "learning_rate": 0.0015450594001774867, + "loss": 0.0947, + "step": 32859 + }, + { + "epoch": 0.28524057950885845, + "grad_norm": 0.328125, + "learning_rate": 0.0015450335312494443, + "loss": 0.0864, + "step": 32860 + }, + { + "epoch": 0.2852492599890626, + "grad_norm": 0.298828125, + "learning_rate": 0.00154500766183472, + "loss": 0.0698, + "step": 32861 + }, + { + "epoch": 0.2852579404692668, + "grad_norm": 0.306640625, + "learning_rate": 0.0015449817919333416, + "loss": 0.1387, + "step": 32862 + }, + { + "epoch": 0.2852666209494709, + "grad_norm": 0.44921875, + "learning_rate": 0.001544955921545338, + "loss": 0.0991, + "step": 32863 + }, + { + "epoch": 0.2852753014296751, + "grad_norm": 0.11279296875, + "learning_rate": 0.0015449300506707375, + "loss": 0.0962, + "step": 32864 + }, + { + "epoch": 0.28528398190987925, + "grad_norm": 0.12890625, + "learning_rate": 0.001544904179309568, + "loss": 0.0898, + "step": 32865 + }, + { + "epoch": 0.28529266239008344, + "grad_norm": 0.63671875, + "learning_rate": 0.001544878307461858, + "loss": 0.1201, + "step": 32866 + }, + { + "epoch": 0.2853013428702876, + "grad_norm": 0.220703125, + "learning_rate": 0.001544852435127636, + "loss": 0.1221, + "step": 32867 + }, + { + "epoch": 0.28531002335049177, + "grad_norm": 0.326171875, + "learning_rate": 0.0015448265623069295, + "loss": 0.0923, + "step": 32868 + }, + { + "epoch": 0.2853187038306959, + "grad_norm": 0.546875, + "learning_rate": 0.0015448006889997676, + "loss": 0.1035, + "step": 32869 + }, + { + "epoch": 0.2853273843109001, + "grad_norm": 0.73828125, + "learning_rate": 0.0015447748152061784, + "loss": 0.1143, + "step": 32870 + }, + { + "epoch": 0.28533606479110424, + "grad_norm": 1.2109375, + "learning_rate": 0.0015447489409261903, + "loss": 0.0801, + "step": 32871 + }, + { + "epoch": 0.28534474527130843, + "grad_norm": 0.2421875, + "learning_rate": 0.0015447230661598315, + "loss": 0.1055, + "step": 32872 + }, + { + "epoch": 0.28535342575151257, + "grad_norm": 0.43359375, + "learning_rate": 0.0015446971909071298, + "loss": 0.1104, + "step": 32873 + }, + { + "epoch": 0.28536210623171676, + "grad_norm": 0.2412109375, + "learning_rate": 0.0015446713151681141, + "loss": 0.085, + "step": 32874 + }, + { + "epoch": 0.2853707867119209, + "grad_norm": 0.07177734375, + "learning_rate": 0.0015446454389428127, + "loss": 0.083, + "step": 32875 + }, + { + "epoch": 0.2853794671921251, + "grad_norm": 0.10888671875, + "learning_rate": 0.0015446195622312539, + "loss": 0.0889, + "step": 32876 + }, + { + "epoch": 0.28538814767232923, + "grad_norm": 0.3125, + "learning_rate": 0.0015445936850334654, + "loss": 0.1104, + "step": 32877 + }, + { + "epoch": 0.2853968281525334, + "grad_norm": 0.310546875, + "learning_rate": 0.0015445678073494758, + "loss": 0.085, + "step": 32878 + }, + { + "epoch": 0.28540550863273756, + "grad_norm": 0.55078125, + "learning_rate": 0.001544541929179314, + "loss": 0.1118, + "step": 32879 + }, + { + "epoch": 0.28541418911294175, + "grad_norm": 0.1650390625, + "learning_rate": 0.0015445160505230076, + "loss": 0.0825, + "step": 32880 + }, + { + "epoch": 0.2854228695931459, + "grad_norm": 0.8125, + "learning_rate": 0.0015444901713805852, + "loss": 0.1084, + "step": 32881 + }, + { + "epoch": 
0.2854315500733501, + "grad_norm": 0.5, + "learning_rate": 0.0015444642917520751, + "loss": 0.0986, + "step": 32882 + }, + { + "epoch": 0.2854402305535542, + "grad_norm": 0.546875, + "learning_rate": 0.0015444384116375053, + "loss": 0.1143, + "step": 32883 + }, + { + "epoch": 0.2854489110337584, + "grad_norm": 0.5625, + "learning_rate": 0.0015444125310369046, + "loss": 0.0845, + "step": 32884 + }, + { + "epoch": 0.28545759151396255, + "grad_norm": 0.15234375, + "learning_rate": 0.0015443866499503009, + "loss": 0.0908, + "step": 32885 + }, + { + "epoch": 0.28546627199416674, + "grad_norm": 0.2021484375, + "learning_rate": 0.0015443607683777226, + "loss": 0.1016, + "step": 32886 + }, + { + "epoch": 0.2854749524743709, + "grad_norm": 0.1318359375, + "learning_rate": 0.0015443348863191982, + "loss": 0.166, + "step": 32887 + }, + { + "epoch": 0.2854836329545751, + "grad_norm": 0.2216796875, + "learning_rate": 0.0015443090037747555, + "loss": 0.0806, + "step": 32888 + }, + { + "epoch": 0.2854923134347792, + "grad_norm": 0.30078125, + "learning_rate": 0.0015442831207444234, + "loss": 0.0898, + "step": 32889 + }, + { + "epoch": 0.2855009939149834, + "grad_norm": 0.1357421875, + "learning_rate": 0.0015442572372282297, + "loss": 0.127, + "step": 32890 + }, + { + "epoch": 0.28550967439518754, + "grad_norm": 0.142578125, + "learning_rate": 0.0015442313532262034, + "loss": 0.0928, + "step": 32891 + }, + { + "epoch": 0.28551835487539173, + "grad_norm": 0.1318359375, + "learning_rate": 0.001544205468738372, + "loss": 0.0869, + "step": 32892 + }, + { + "epoch": 0.28552703535559587, + "grad_norm": 0.37890625, + "learning_rate": 0.0015441795837647643, + "loss": 0.207, + "step": 32893 + }, + { + "epoch": 0.28553571583580006, + "grad_norm": 0.328125, + "learning_rate": 0.0015441536983054084, + "loss": 0.123, + "step": 32894 + }, + { + "epoch": 0.2855443963160042, + "grad_norm": 0.1396484375, + "learning_rate": 0.0015441278123603328, + "loss": 0.0771, + "step": 32895 + }, + { + "epoch": 0.2855530767962084, + "grad_norm": 0.34765625, + "learning_rate": 0.0015441019259295656, + "loss": 0.1113, + "step": 32896 + }, + { + "epoch": 0.28556175727641253, + "grad_norm": 0.1875, + "learning_rate": 0.0015440760390131353, + "loss": 0.1006, + "step": 32897 + }, + { + "epoch": 0.28557043775661667, + "grad_norm": 0.94140625, + "learning_rate": 0.0015440501516110704, + "loss": 0.3672, + "step": 32898 + }, + { + "epoch": 0.28557911823682086, + "grad_norm": 0.15625, + "learning_rate": 0.0015440242637233983, + "loss": 0.084, + "step": 32899 + }, + { + "epoch": 0.285587798717025, + "grad_norm": 0.30859375, + "learning_rate": 0.0015439983753501485, + "loss": 0.0947, + "step": 32900 + }, + { + "epoch": 0.2855964791972292, + "grad_norm": 0.8046875, + "learning_rate": 0.0015439724864913485, + "loss": 0.1377, + "step": 32901 + }, + { + "epoch": 0.28560515967743333, + "grad_norm": 0.3203125, + "learning_rate": 0.001543946597147027, + "loss": 0.1152, + "step": 32902 + }, + { + "epoch": 0.2856138401576375, + "grad_norm": 0.462890625, + "learning_rate": 0.0015439207073172118, + "loss": 0.0815, + "step": 32903 + }, + { + "epoch": 0.28562252063784166, + "grad_norm": 0.48828125, + "learning_rate": 0.001543894817001932, + "loss": 0.1094, + "step": 32904 + }, + { + "epoch": 0.28563120111804585, + "grad_norm": 0.3359375, + "learning_rate": 0.001543868926201215, + "loss": 0.0747, + "step": 32905 + }, + { + "epoch": 0.28563988159825, + "grad_norm": 0.1669921875, + "learning_rate": 0.00154384303491509, + "loss": 0.1045, + "step": 32906 + }, + { + 
"epoch": 0.2856485620784542, + "grad_norm": 0.345703125, + "learning_rate": 0.0015438171431435847, + "loss": 0.0918, + "step": 32907 + }, + { + "epoch": 0.2856572425586583, + "grad_norm": 0.45703125, + "learning_rate": 0.001543791250886728, + "loss": 0.0757, + "step": 32908 + }, + { + "epoch": 0.2856659230388625, + "grad_norm": 0.1689453125, + "learning_rate": 0.0015437653581445478, + "loss": 0.1318, + "step": 32909 + }, + { + "epoch": 0.28567460351906665, + "grad_norm": 0.357421875, + "learning_rate": 0.0015437394649170722, + "loss": 0.082, + "step": 32910 + }, + { + "epoch": 0.28568328399927084, + "grad_norm": 0.1640625, + "learning_rate": 0.00154371357120433, + "loss": 0.1064, + "step": 32911 + }, + { + "epoch": 0.285691964479475, + "grad_norm": 0.0908203125, + "learning_rate": 0.0015436876770063492, + "loss": 0.1143, + "step": 32912 + }, + { + "epoch": 0.2857006449596792, + "grad_norm": 0.63671875, + "learning_rate": 0.0015436617823231583, + "loss": 0.1035, + "step": 32913 + }, + { + "epoch": 0.2857093254398833, + "grad_norm": 0.41015625, + "learning_rate": 0.0015436358871547856, + "loss": 0.1226, + "step": 32914 + }, + { + "epoch": 0.2857180059200875, + "grad_norm": 0.1259765625, + "learning_rate": 0.0015436099915012592, + "loss": 0.1416, + "step": 32915 + }, + { + "epoch": 0.28572668640029164, + "grad_norm": 0.2451171875, + "learning_rate": 0.0015435840953626075, + "loss": 0.1064, + "step": 32916 + }, + { + "epoch": 0.28573536688049583, + "grad_norm": 0.1943359375, + "learning_rate": 0.0015435581987388587, + "loss": 0.1562, + "step": 32917 + }, + { + "epoch": 0.28574404736069997, + "grad_norm": 0.6796875, + "learning_rate": 0.0015435323016300417, + "loss": 0.1299, + "step": 32918 + }, + { + "epoch": 0.28575272784090416, + "grad_norm": 0.11962890625, + "learning_rate": 0.0015435064040361845, + "loss": 0.0698, + "step": 32919 + }, + { + "epoch": 0.2857614083211083, + "grad_norm": 0.54296875, + "learning_rate": 0.0015434805059573154, + "loss": 0.1016, + "step": 32920 + }, + { + "epoch": 0.2857700888013125, + "grad_norm": 0.2275390625, + "learning_rate": 0.0015434546073934622, + "loss": 0.1123, + "step": 32921 + }, + { + "epoch": 0.28577876928151663, + "grad_norm": 0.2060546875, + "learning_rate": 0.0015434287083446543, + "loss": 0.0962, + "step": 32922 + }, + { + "epoch": 0.2857874497617208, + "grad_norm": 0.40625, + "learning_rate": 0.0015434028088109192, + "loss": 0.1211, + "step": 32923 + }, + { + "epoch": 0.28579613024192496, + "grad_norm": 0.08544921875, + "learning_rate": 0.0015433769087922853, + "loss": 0.0781, + "step": 32924 + }, + { + "epoch": 0.28580481072212915, + "grad_norm": 0.224609375, + "learning_rate": 0.0015433510082887814, + "loss": 0.1309, + "step": 32925 + }, + { + "epoch": 0.2858134912023333, + "grad_norm": 2.421875, + "learning_rate": 0.0015433251073004352, + "loss": 0.2139, + "step": 32926 + }, + { + "epoch": 0.2858221716825375, + "grad_norm": 0.328125, + "learning_rate": 0.0015432992058272753, + "loss": 0.1064, + "step": 32927 + }, + { + "epoch": 0.2858308521627416, + "grad_norm": 0.0771484375, + "learning_rate": 0.00154327330386933, + "loss": 0.0801, + "step": 32928 + }, + { + "epoch": 0.2858395326429458, + "grad_norm": 0.3046875, + "learning_rate": 0.001543247401426628, + "loss": 0.1396, + "step": 32929 + }, + { + "epoch": 0.28584821312314995, + "grad_norm": 0.357421875, + "learning_rate": 0.001543221498499197, + "loss": 0.1182, + "step": 32930 + }, + { + "epoch": 0.28585689360335415, + "grad_norm": 0.376953125, + "learning_rate": 0.0015431955950870658, + 
"loss": 0.124, + "step": 32931 + }, + { + "epoch": 0.2858655740835583, + "grad_norm": 0.224609375, + "learning_rate": 0.0015431696911902626, + "loss": 0.0884, + "step": 32932 + }, + { + "epoch": 0.2858742545637625, + "grad_norm": 0.125, + "learning_rate": 0.0015431437868088157, + "loss": 0.1152, + "step": 32933 + }, + { + "epoch": 0.2858829350439666, + "grad_norm": 0.1943359375, + "learning_rate": 0.001543117881942753, + "loss": 0.0874, + "step": 32934 + }, + { + "epoch": 0.2858916155241708, + "grad_norm": 0.1767578125, + "learning_rate": 0.0015430919765921038, + "loss": 0.1094, + "step": 32935 + }, + { + "epoch": 0.28590029600437494, + "grad_norm": 0.208984375, + "learning_rate": 0.0015430660707568956, + "loss": 0.1108, + "step": 32936 + }, + { + "epoch": 0.28590897648457914, + "grad_norm": 0.1455078125, + "learning_rate": 0.0015430401644371567, + "loss": 0.1436, + "step": 32937 + }, + { + "epoch": 0.2859176569647833, + "grad_norm": 0.74609375, + "learning_rate": 0.0015430142576329162, + "loss": 0.0918, + "step": 32938 + }, + { + "epoch": 0.28592633744498747, + "grad_norm": 0.09033203125, + "learning_rate": 0.001542988350344202, + "loss": 0.0874, + "step": 32939 + }, + { + "epoch": 0.2859350179251916, + "grad_norm": 0.37890625, + "learning_rate": 0.001542962442571042, + "loss": 0.1279, + "step": 32940 + }, + { + "epoch": 0.2859436984053958, + "grad_norm": 0.1201171875, + "learning_rate": 0.0015429365343134654, + "loss": 0.1025, + "step": 32941 + }, + { + "epoch": 0.28595237888559993, + "grad_norm": 0.08740234375, + "learning_rate": 0.0015429106255714998, + "loss": 0.0918, + "step": 32942 + }, + { + "epoch": 0.2859610593658041, + "grad_norm": 0.10498046875, + "learning_rate": 0.0015428847163451737, + "loss": 0.084, + "step": 32943 + }, + { + "epoch": 0.28596973984600826, + "grad_norm": 0.1552734375, + "learning_rate": 0.0015428588066345155, + "loss": 0.1465, + "step": 32944 + }, + { + "epoch": 0.28597842032621246, + "grad_norm": 1.109375, + "learning_rate": 0.0015428328964395537, + "loss": 0.1084, + "step": 32945 + }, + { + "epoch": 0.2859871008064166, + "grad_norm": 0.421875, + "learning_rate": 0.0015428069857603166, + "loss": 0.1045, + "step": 32946 + }, + { + "epoch": 0.2859957812866208, + "grad_norm": 0.166015625, + "learning_rate": 0.0015427810745968322, + "loss": 0.1406, + "step": 32947 + }, + { + "epoch": 0.2860044617668249, + "grad_norm": 0.0712890625, + "learning_rate": 0.0015427551629491293, + "loss": 0.1143, + "step": 32948 + }, + { + "epoch": 0.2860131422470291, + "grad_norm": 0.23828125, + "learning_rate": 0.001542729250817236, + "loss": 0.1084, + "step": 32949 + }, + { + "epoch": 0.28602182272723325, + "grad_norm": 0.1396484375, + "learning_rate": 0.0015427033382011803, + "loss": 0.1484, + "step": 32950 + }, + { + "epoch": 0.28603050320743745, + "grad_norm": 0.8046875, + "learning_rate": 0.0015426774251009912, + "loss": 0.1084, + "step": 32951 + }, + { + "epoch": 0.2860391836876416, + "grad_norm": 0.2314453125, + "learning_rate": 0.0015426515115166965, + "loss": 0.0884, + "step": 32952 + }, + { + "epoch": 0.2860478641678458, + "grad_norm": 0.6484375, + "learning_rate": 0.001542625597448325, + "loss": 0.1147, + "step": 32953 + }, + { + "epoch": 0.2860565446480499, + "grad_norm": 0.51171875, + "learning_rate": 0.0015425996828959047, + "loss": 0.0894, + "step": 32954 + }, + { + "epoch": 0.2860652251282541, + "grad_norm": 0.703125, + "learning_rate": 0.0015425737678594642, + "loss": 0.1328, + "step": 32955 + }, + { + "epoch": 0.28607390560845825, + "grad_norm": 0.337890625, + 
"learning_rate": 0.0015425478523390312, + "loss": 0.1562, + "step": 32956 + }, + { + "epoch": 0.28608258608866244, + "grad_norm": 0.34375, + "learning_rate": 0.001542521936334635, + "loss": 0.0938, + "step": 32957 + }, + { + "epoch": 0.2860912665688666, + "grad_norm": 0.115234375, + "learning_rate": 0.0015424960198463034, + "loss": 0.1396, + "step": 32958 + }, + { + "epoch": 0.28609994704907077, + "grad_norm": 0.189453125, + "learning_rate": 0.0015424701028740647, + "loss": 0.0967, + "step": 32959 + }, + { + "epoch": 0.2861086275292749, + "grad_norm": 0.1484375, + "learning_rate": 0.001542444185417947, + "loss": 0.1182, + "step": 32960 + }, + { + "epoch": 0.2861173080094791, + "grad_norm": 0.061279296875, + "learning_rate": 0.0015424182674779797, + "loss": 0.0815, + "step": 32961 + }, + { + "epoch": 0.28612598848968324, + "grad_norm": 0.47265625, + "learning_rate": 0.0015423923490541897, + "loss": 0.1006, + "step": 32962 + }, + { + "epoch": 0.28613466896988743, + "grad_norm": 0.2490234375, + "learning_rate": 0.0015423664301466066, + "loss": 0.104, + "step": 32963 + }, + { + "epoch": 0.28614334945009157, + "grad_norm": 0.16796875, + "learning_rate": 0.001542340510755258, + "loss": 0.0884, + "step": 32964 + }, + { + "epoch": 0.28615202993029576, + "grad_norm": 0.95703125, + "learning_rate": 0.0015423145908801724, + "loss": 0.1211, + "step": 32965 + }, + { + "epoch": 0.2861607104104999, + "grad_norm": 0.12060546875, + "learning_rate": 0.0015422886705213786, + "loss": 0.0889, + "step": 32966 + }, + { + "epoch": 0.2861693908907041, + "grad_norm": 0.1591796875, + "learning_rate": 0.0015422627496789044, + "loss": 0.1104, + "step": 32967 + }, + { + "epoch": 0.2861780713709082, + "grad_norm": 0.19921875, + "learning_rate": 0.0015422368283527779, + "loss": 0.0869, + "step": 32968 + }, + { + "epoch": 0.2861867518511124, + "grad_norm": 0.25390625, + "learning_rate": 0.0015422109065430282, + "loss": 0.1299, + "step": 32969 + }, + { + "epoch": 0.28619543233131656, + "grad_norm": 0.16015625, + "learning_rate": 0.001542184984249683, + "loss": 0.0903, + "step": 32970 + }, + { + "epoch": 0.28620411281152075, + "grad_norm": 0.1103515625, + "learning_rate": 0.0015421590614727715, + "loss": 0.1074, + "step": 32971 + }, + { + "epoch": 0.2862127932917249, + "grad_norm": 0.439453125, + "learning_rate": 0.001542133138212321, + "loss": 0.1445, + "step": 32972 + }, + { + "epoch": 0.2862214737719291, + "grad_norm": 0.265625, + "learning_rate": 0.0015421072144683606, + "loss": 0.1328, + "step": 32973 + }, + { + "epoch": 0.2862301542521332, + "grad_norm": 0.4609375, + "learning_rate": 0.001542081290240918, + "loss": 0.1235, + "step": 32974 + }, + { + "epoch": 0.2862388347323374, + "grad_norm": 0.474609375, + "learning_rate": 0.0015420553655300226, + "loss": 0.124, + "step": 32975 + }, + { + "epoch": 0.28624751521254155, + "grad_norm": 0.09619140625, + "learning_rate": 0.0015420294403357014, + "loss": 0.0962, + "step": 32976 + }, + { + "epoch": 0.28625619569274574, + "grad_norm": 0.287109375, + "learning_rate": 0.0015420035146579839, + "loss": 0.0869, + "step": 32977 + }, + { + "epoch": 0.2862648761729499, + "grad_norm": 0.1962890625, + "learning_rate": 0.0015419775884968978, + "loss": 0.1094, + "step": 32978 + }, + { + "epoch": 0.28627355665315407, + "grad_norm": 0.1884765625, + "learning_rate": 0.0015419516618524717, + "loss": 0.083, + "step": 32979 + }, + { + "epoch": 0.2862822371333582, + "grad_norm": 0.08056640625, + "learning_rate": 0.0015419257347247342, + "loss": 0.0977, + "step": 32980 + }, + { + "epoch": 
0.2862909176135624, + "grad_norm": 0.484375, + "learning_rate": 0.0015418998071137127, + "loss": 0.0938, + "step": 32981 + }, + { + "epoch": 0.28629959809376654, + "grad_norm": 0.25, + "learning_rate": 0.001541873879019437, + "loss": 0.1001, + "step": 32982 + }, + { + "epoch": 0.28630827857397073, + "grad_norm": 0.0966796875, + "learning_rate": 0.0015418479504419338, + "loss": 0.0869, + "step": 32983 + }, + { + "epoch": 0.28631695905417487, + "grad_norm": 0.19140625, + "learning_rate": 0.0015418220213812331, + "loss": 0.0771, + "step": 32984 + }, + { + "epoch": 0.28632563953437906, + "grad_norm": 0.34375, + "learning_rate": 0.001541796091837362, + "loss": 0.0933, + "step": 32985 + }, + { + "epoch": 0.2863343200145832, + "grad_norm": 0.310546875, + "learning_rate": 0.0015417701618103494, + "loss": 0.126, + "step": 32986 + }, + { + "epoch": 0.2863430004947874, + "grad_norm": 0.150390625, + "learning_rate": 0.0015417442313002236, + "loss": 0.1172, + "step": 32987 + }, + { + "epoch": 0.28635168097499153, + "grad_norm": 0.10546875, + "learning_rate": 0.0015417183003070133, + "loss": 0.0957, + "step": 32988 + }, + { + "epoch": 0.2863603614551957, + "grad_norm": 0.5546875, + "learning_rate": 0.0015416923688307462, + "loss": 0.0942, + "step": 32989 + }, + { + "epoch": 0.28636904193539986, + "grad_norm": 0.08349609375, + "learning_rate": 0.0015416664368714508, + "loss": 0.1309, + "step": 32990 + }, + { + "epoch": 0.28637772241560405, + "grad_norm": 0.37109375, + "learning_rate": 0.001541640504429156, + "loss": 0.1157, + "step": 32991 + }, + { + "epoch": 0.2863864028958082, + "grad_norm": 0.671875, + "learning_rate": 0.0015416145715038895, + "loss": 0.1079, + "step": 32992 + }, + { + "epoch": 0.2863950833760124, + "grad_norm": 0.10302734375, + "learning_rate": 0.00154158863809568, + "loss": 0.1099, + "step": 32993 + }, + { + "epoch": 0.2864037638562165, + "grad_norm": 0.54296875, + "learning_rate": 0.0015415627042045558, + "loss": 0.1084, + "step": 32994 + }, + { + "epoch": 0.2864124443364207, + "grad_norm": 0.306640625, + "learning_rate": 0.001541536769830545, + "loss": 0.1094, + "step": 32995 + }, + { + "epoch": 0.28642112481662485, + "grad_norm": 0.1640625, + "learning_rate": 0.0015415108349736768, + "loss": 0.083, + "step": 32996 + }, + { + "epoch": 0.28642980529682904, + "grad_norm": 0.67578125, + "learning_rate": 0.0015414848996339787, + "loss": 0.0991, + "step": 32997 + }, + { + "epoch": 0.2864384857770332, + "grad_norm": 0.134765625, + "learning_rate": 0.0015414589638114795, + "loss": 0.0947, + "step": 32998 + }, + { + "epoch": 0.2864471662572374, + "grad_norm": 0.1806640625, + "learning_rate": 0.0015414330275062072, + "loss": 0.0967, + "step": 32999 + }, + { + "epoch": 0.2864558467374415, + "grad_norm": 0.150390625, + "learning_rate": 0.0015414070907181904, + "loss": 0.1279, + "step": 33000 + }, + { + "epoch": 0.2864645272176457, + "grad_norm": 0.423828125, + "learning_rate": 0.0015413811534474575, + "loss": 0.1094, + "step": 33001 + }, + { + "epoch": 0.28647320769784984, + "grad_norm": 0.77734375, + "learning_rate": 0.0015413552156940365, + "loss": 0.0747, + "step": 33002 + }, + { + "epoch": 0.28648188817805403, + "grad_norm": 0.396484375, + "learning_rate": 0.0015413292774579563, + "loss": 0.125, + "step": 33003 + }, + { + "epoch": 0.28649056865825817, + "grad_norm": 0.11376953125, + "learning_rate": 0.0015413033387392451, + "loss": 0.083, + "step": 33004 + }, + { + "epoch": 0.28649924913846236, + "grad_norm": 0.1474609375, + "learning_rate": 0.0015412773995379311, + "loss": 0.083, + 
"step": 33005 + }, + { + "epoch": 0.2865079296186665, + "grad_norm": 0.205078125, + "learning_rate": 0.001541251459854043, + "loss": 0.1099, + "step": 33006 + }, + { + "epoch": 0.2865166100988707, + "grad_norm": 0.8125, + "learning_rate": 0.0015412255196876085, + "loss": 0.1553, + "step": 33007 + }, + { + "epoch": 0.28652529057907483, + "grad_norm": 0.265625, + "learning_rate": 0.0015411995790386571, + "loss": 0.0781, + "step": 33008 + }, + { + "epoch": 0.286533971059279, + "grad_norm": 0.111328125, + "learning_rate": 0.001541173637907216, + "loss": 0.1108, + "step": 33009 + }, + { + "epoch": 0.28654265153948316, + "grad_norm": 0.216796875, + "learning_rate": 0.001541147696293314, + "loss": 0.104, + "step": 33010 + }, + { + "epoch": 0.28655133201968735, + "grad_norm": 0.57421875, + "learning_rate": 0.0015411217541969795, + "loss": 0.1465, + "step": 33011 + }, + { + "epoch": 0.2865600124998915, + "grad_norm": 0.1650390625, + "learning_rate": 0.001541095811618241, + "loss": 0.0933, + "step": 33012 + }, + { + "epoch": 0.2865686929800957, + "grad_norm": 0.333984375, + "learning_rate": 0.0015410698685571266, + "loss": 0.0967, + "step": 33013 + }, + { + "epoch": 0.2865773734602998, + "grad_norm": 0.115234375, + "learning_rate": 0.001541043925013665, + "loss": 0.124, + "step": 33014 + }, + { + "epoch": 0.286586053940504, + "grad_norm": 0.1904296875, + "learning_rate": 0.0015410179809878845, + "loss": 0.1133, + "step": 33015 + }, + { + "epoch": 0.28659473442070815, + "grad_norm": 0.11279296875, + "learning_rate": 0.0015409920364798128, + "loss": 0.1094, + "step": 33016 + }, + { + "epoch": 0.28660341490091235, + "grad_norm": 0.9140625, + "learning_rate": 0.0015409660914894794, + "loss": 0.0869, + "step": 33017 + }, + { + "epoch": 0.2866120953811165, + "grad_norm": 0.14453125, + "learning_rate": 0.0015409401460169119, + "loss": 0.1484, + "step": 33018 + }, + { + "epoch": 0.2866207758613207, + "grad_norm": 0.275390625, + "learning_rate": 0.001540914200062139, + "loss": 0.0869, + "step": 33019 + }, + { + "epoch": 0.2866294563415248, + "grad_norm": 0.11962890625, + "learning_rate": 0.0015408882536251888, + "loss": 0.0879, + "step": 33020 + }, + { + "epoch": 0.28663813682172895, + "grad_norm": 0.1630859375, + "learning_rate": 0.0015408623067060898, + "loss": 0.0967, + "step": 33021 + }, + { + "epoch": 0.28664681730193314, + "grad_norm": 0.1357421875, + "learning_rate": 0.0015408363593048704, + "loss": 0.0601, + "step": 33022 + }, + { + "epoch": 0.2866554977821373, + "grad_norm": 0.0869140625, + "learning_rate": 0.001540810411421559, + "loss": 0.0854, + "step": 33023 + }, + { + "epoch": 0.2866641782623415, + "grad_norm": 0.55859375, + "learning_rate": 0.0015407844630561842, + "loss": 0.084, + "step": 33024 + }, + { + "epoch": 0.2866728587425456, + "grad_norm": 0.55859375, + "learning_rate": 0.0015407585142087738, + "loss": 0.0811, + "step": 33025 + }, + { + "epoch": 0.2866815392227498, + "grad_norm": 0.130859375, + "learning_rate": 0.0015407325648793568, + "loss": 0.0918, + "step": 33026 + }, + { + "epoch": 0.28669021970295394, + "grad_norm": 0.30859375, + "learning_rate": 0.0015407066150679612, + "loss": 0.1055, + "step": 33027 + }, + { + "epoch": 0.28669890018315813, + "grad_norm": 0.12451171875, + "learning_rate": 0.0015406806647746153, + "loss": 0.0659, + "step": 33028 + }, + { + "epoch": 0.28670758066336227, + "grad_norm": 0.578125, + "learning_rate": 0.0015406547139993482, + "loss": 0.1147, + "step": 33029 + }, + { + "epoch": 0.28671626114356646, + "grad_norm": 2.140625, + "learning_rate": 
0.0015406287627421872, + "loss": 0.0918, + "step": 33030 + }, + { + "epoch": 0.2867249416237706, + "grad_norm": 0.28515625, + "learning_rate": 0.0015406028110031612, + "loss": 0.085, + "step": 33031 + }, + { + "epoch": 0.2867336221039748, + "grad_norm": 0.1318359375, + "learning_rate": 0.0015405768587822988, + "loss": 0.1113, + "step": 33032 + }, + { + "epoch": 0.28674230258417893, + "grad_norm": 0.25390625, + "learning_rate": 0.001540550906079628, + "loss": 0.0933, + "step": 33033 + }, + { + "epoch": 0.2867509830643831, + "grad_norm": 1.2890625, + "learning_rate": 0.0015405249528951772, + "loss": 0.1128, + "step": 33034 + }, + { + "epoch": 0.28675966354458726, + "grad_norm": 0.498046875, + "learning_rate": 0.0015404989992289753, + "loss": 0.126, + "step": 33035 + }, + { + "epoch": 0.28676834402479146, + "grad_norm": 0.43359375, + "learning_rate": 0.00154047304508105, + "loss": 0.1152, + "step": 33036 + }, + { + "epoch": 0.2867770245049956, + "grad_norm": 0.474609375, + "learning_rate": 0.0015404470904514303, + "loss": 0.0933, + "step": 33037 + }, + { + "epoch": 0.2867857049851998, + "grad_norm": 0.169921875, + "learning_rate": 0.001540421135340144, + "loss": 0.0991, + "step": 33038 + }, + { + "epoch": 0.2867943854654039, + "grad_norm": 0.369140625, + "learning_rate": 0.0015403951797472199, + "loss": 0.1055, + "step": 33039 + }, + { + "epoch": 0.2868030659456081, + "grad_norm": 0.392578125, + "learning_rate": 0.0015403692236726865, + "loss": 0.0898, + "step": 33040 + }, + { + "epoch": 0.28681174642581225, + "grad_norm": 0.330078125, + "learning_rate": 0.0015403432671165713, + "loss": 0.1299, + "step": 33041 + }, + { + "epoch": 0.28682042690601645, + "grad_norm": 0.11474609375, + "learning_rate": 0.0015403173100789036, + "loss": 0.125, + "step": 33042 + }, + { + "epoch": 0.2868291073862206, + "grad_norm": 0.53125, + "learning_rate": 0.0015402913525597117, + "loss": 0.1387, + "step": 33043 + }, + { + "epoch": 0.2868377878664248, + "grad_norm": 0.10205078125, + "learning_rate": 0.001540265394559024, + "loss": 0.0845, + "step": 33044 + }, + { + "epoch": 0.2868464683466289, + "grad_norm": 0.15625, + "learning_rate": 0.0015402394360768683, + "loss": 0.1084, + "step": 33045 + }, + { + "epoch": 0.2868551488268331, + "grad_norm": 0.8828125, + "learning_rate": 0.0015402134771132733, + "loss": 0.083, + "step": 33046 + }, + { + "epoch": 0.28686382930703724, + "grad_norm": 0.10546875, + "learning_rate": 0.0015401875176682676, + "loss": 0.0879, + "step": 33047 + }, + { + "epoch": 0.28687250978724144, + "grad_norm": 0.2490234375, + "learning_rate": 0.0015401615577418796, + "loss": 0.1025, + "step": 33048 + }, + { + "epoch": 0.2868811902674456, + "grad_norm": 0.1298828125, + "learning_rate": 0.0015401355973341374, + "loss": 0.1055, + "step": 33049 + }, + { + "epoch": 0.28688987074764977, + "grad_norm": 0.1279296875, + "learning_rate": 0.0015401096364450696, + "loss": 0.1084, + "step": 33050 + }, + { + "epoch": 0.2868985512278539, + "grad_norm": 0.1796875, + "learning_rate": 0.001540083675074704, + "loss": 0.064, + "step": 33051 + }, + { + "epoch": 0.2869072317080581, + "grad_norm": 0.154296875, + "learning_rate": 0.0015400577132230699, + "loss": 0.1025, + "step": 33052 + }, + { + "epoch": 0.28691591218826223, + "grad_norm": 0.86328125, + "learning_rate": 0.0015400317508901956, + "loss": 0.1079, + "step": 33053 + }, + { + "epoch": 0.2869245926684664, + "grad_norm": 0.310546875, + "learning_rate": 0.0015400057880761088, + "loss": 0.104, + "step": 33054 + }, + { + "epoch": 0.28693327314867056, + "grad_norm": 
0.2041015625, + "learning_rate": 0.0015399798247808384, + "loss": 0.085, + "step": 33055 + }, + { + "epoch": 0.28694195362887476, + "grad_norm": 0.66796875, + "learning_rate": 0.0015399538610044127, + "loss": 0.1318, + "step": 33056 + }, + { + "epoch": 0.2869506341090789, + "grad_norm": 0.1484375, + "learning_rate": 0.0015399278967468599, + "loss": 0.1016, + "step": 33057 + }, + { + "epoch": 0.2869593145892831, + "grad_norm": 0.1552734375, + "learning_rate": 0.0015399019320082089, + "loss": 0.0903, + "step": 33058 + }, + { + "epoch": 0.2869679950694872, + "grad_norm": 0.36328125, + "learning_rate": 0.0015398759667884874, + "loss": 0.1523, + "step": 33059 + }, + { + "epoch": 0.2869766755496914, + "grad_norm": 0.333984375, + "learning_rate": 0.001539850001087724, + "loss": 0.103, + "step": 33060 + }, + { + "epoch": 0.28698535602989556, + "grad_norm": 0.32421875, + "learning_rate": 0.0015398240349059477, + "loss": 0.1055, + "step": 33061 + }, + { + "epoch": 0.28699403651009975, + "grad_norm": 0.1533203125, + "learning_rate": 0.0015397980682431863, + "loss": 0.1011, + "step": 33062 + }, + { + "epoch": 0.2870027169903039, + "grad_norm": 0.083984375, + "learning_rate": 0.0015397721010994682, + "loss": 0.0923, + "step": 33063 + }, + { + "epoch": 0.2870113974705081, + "grad_norm": 0.2373046875, + "learning_rate": 0.0015397461334748222, + "loss": 0.1035, + "step": 33064 + }, + { + "epoch": 0.2870200779507122, + "grad_norm": 0.69921875, + "learning_rate": 0.0015397201653692763, + "loss": 0.1631, + "step": 33065 + }, + { + "epoch": 0.2870287584309164, + "grad_norm": 0.6171875, + "learning_rate": 0.001539694196782859, + "loss": 0.1152, + "step": 33066 + }, + { + "epoch": 0.28703743891112055, + "grad_norm": 0.11767578125, + "learning_rate": 0.0015396682277155989, + "loss": 0.0933, + "step": 33067 + }, + { + "epoch": 0.28704611939132474, + "grad_norm": 0.123046875, + "learning_rate": 0.0015396422581675237, + "loss": 0.1025, + "step": 33068 + }, + { + "epoch": 0.2870547998715289, + "grad_norm": 0.28515625, + "learning_rate": 0.001539616288138663, + "loss": 0.0928, + "step": 33069 + }, + { + "epoch": 0.28706348035173307, + "grad_norm": 0.2275390625, + "learning_rate": 0.0015395903176290441, + "loss": 0.1045, + "step": 33070 + }, + { + "epoch": 0.2870721608319372, + "grad_norm": 0.5234375, + "learning_rate": 0.0015395643466386963, + "loss": 0.1035, + "step": 33071 + }, + { + "epoch": 0.2870808413121414, + "grad_norm": 0.466796875, + "learning_rate": 0.001539538375167647, + "loss": 0.1055, + "step": 33072 + }, + { + "epoch": 0.28708952179234554, + "grad_norm": 0.427734375, + "learning_rate": 0.0015395124032159255, + "loss": 0.0781, + "step": 33073 + }, + { + "epoch": 0.28709820227254973, + "grad_norm": 0.16015625, + "learning_rate": 0.0015394864307835597, + "loss": 0.1064, + "step": 33074 + }, + { + "epoch": 0.28710688275275387, + "grad_norm": 0.294921875, + "learning_rate": 0.0015394604578705781, + "loss": 0.0884, + "step": 33075 + }, + { + "epoch": 0.28711556323295806, + "grad_norm": 0.388671875, + "learning_rate": 0.0015394344844770092, + "loss": 0.0908, + "step": 33076 + }, + { + "epoch": 0.2871242437131622, + "grad_norm": 0.154296875, + "learning_rate": 0.0015394085106028815, + "loss": 0.1006, + "step": 33077 + }, + { + "epoch": 0.2871329241933664, + "grad_norm": 0.220703125, + "learning_rate": 0.001539382536248223, + "loss": 0.1143, + "step": 33078 + }, + { + "epoch": 0.2871416046735705, + "grad_norm": 0.09375, + "learning_rate": 0.0015393565614130623, + "loss": 0.0776, + "step": 33079 + }, + { + 
"epoch": 0.2871502851537747, + "grad_norm": 0.32421875, + "learning_rate": 0.001539330586097428, + "loss": 0.1011, + "step": 33080 + }, + { + "epoch": 0.28715896563397886, + "grad_norm": 0.1298828125, + "learning_rate": 0.0015393046103013483, + "loss": 0.0889, + "step": 33081 + }, + { + "epoch": 0.28716764611418305, + "grad_norm": 0.140625, + "learning_rate": 0.001539278634024852, + "loss": 0.0986, + "step": 33082 + }, + { + "epoch": 0.2871763265943872, + "grad_norm": 0.1142578125, + "learning_rate": 0.0015392526572679671, + "loss": 0.0791, + "step": 33083 + }, + { + "epoch": 0.2871850070745914, + "grad_norm": 0.37109375, + "learning_rate": 0.001539226680030722, + "loss": 0.0737, + "step": 33084 + }, + { + "epoch": 0.2871936875547955, + "grad_norm": 0.1728515625, + "learning_rate": 0.0015392007023131448, + "loss": 0.0928, + "step": 33085 + }, + { + "epoch": 0.2872023680349997, + "grad_norm": 0.69921875, + "learning_rate": 0.0015391747241152646, + "loss": 0.1152, + "step": 33086 + }, + { + "epoch": 0.28721104851520385, + "grad_norm": 0.27734375, + "learning_rate": 0.0015391487454371098, + "loss": 0.1064, + "step": 33087 + }, + { + "epoch": 0.28721972899540804, + "grad_norm": 0.10302734375, + "learning_rate": 0.0015391227662787084, + "loss": 0.1226, + "step": 33088 + }, + { + "epoch": 0.2872284094756122, + "grad_norm": 0.19140625, + "learning_rate": 0.001539096786640089, + "loss": 0.123, + "step": 33089 + }, + { + "epoch": 0.28723708995581637, + "grad_norm": 0.248046875, + "learning_rate": 0.0015390708065212795, + "loss": 0.123, + "step": 33090 + }, + { + "epoch": 0.2872457704360205, + "grad_norm": 0.52734375, + "learning_rate": 0.001539044825922309, + "loss": 0.1035, + "step": 33091 + }, + { + "epoch": 0.2872544509162247, + "grad_norm": 0.404296875, + "learning_rate": 0.0015390188448432059, + "loss": 0.1143, + "step": 33092 + }, + { + "epoch": 0.28726313139642884, + "grad_norm": 0.255859375, + "learning_rate": 0.0015389928632839982, + "loss": 0.1055, + "step": 33093 + }, + { + "epoch": 0.28727181187663303, + "grad_norm": 0.177734375, + "learning_rate": 0.0015389668812447145, + "loss": 0.1021, + "step": 33094 + }, + { + "epoch": 0.28728049235683717, + "grad_norm": 0.5390625, + "learning_rate": 0.0015389408987253833, + "loss": 0.0947, + "step": 33095 + }, + { + "epoch": 0.28728917283704136, + "grad_norm": 0.09228515625, + "learning_rate": 0.001538914915726033, + "loss": 0.1279, + "step": 33096 + }, + { + "epoch": 0.2872978533172455, + "grad_norm": 0.5859375, + "learning_rate": 0.001538888932246692, + "loss": 0.0708, + "step": 33097 + }, + { + "epoch": 0.2873065337974497, + "grad_norm": 0.9453125, + "learning_rate": 0.001538862948287388, + "loss": 0.1055, + "step": 33098 + }, + { + "epoch": 0.28731521427765383, + "grad_norm": 0.494140625, + "learning_rate": 0.0015388369638481508, + "loss": 0.1387, + "step": 33099 + }, + { + "epoch": 0.287323894757858, + "grad_norm": 0.25390625, + "learning_rate": 0.0015388109789290078, + "loss": 0.1006, + "step": 33100 + }, + { + "epoch": 0.28733257523806216, + "grad_norm": 0.310546875, + "learning_rate": 0.0015387849935299877, + "loss": 0.0747, + "step": 33101 + }, + { + "epoch": 0.28734125571826635, + "grad_norm": 0.267578125, + "learning_rate": 0.0015387590076511189, + "loss": 0.1074, + "step": 33102 + }, + { + "epoch": 0.2873499361984705, + "grad_norm": 0.29296875, + "learning_rate": 0.00153873302129243, + "loss": 0.1035, + "step": 33103 + }, + { + "epoch": 0.2873586166786747, + "grad_norm": 0.291015625, + "learning_rate": 0.0015387070344539493, + "loss": 
0.1094, + "step": 33104 + }, + { + "epoch": 0.2873672971588788, + "grad_norm": 0.11376953125, + "learning_rate": 0.0015386810471357051, + "loss": 0.0811, + "step": 33105 + }, + { + "epoch": 0.287375977639083, + "grad_norm": 0.2294921875, + "learning_rate": 0.0015386550593377258, + "loss": 0.1104, + "step": 33106 + }, + { + "epoch": 0.28738465811928715, + "grad_norm": 0.10107421875, + "learning_rate": 0.00153862907106004, + "loss": 0.0718, + "step": 33107 + }, + { + "epoch": 0.28739333859949134, + "grad_norm": 0.6015625, + "learning_rate": 0.0015386030823026757, + "loss": 0.1416, + "step": 33108 + }, + { + "epoch": 0.2874020190796955, + "grad_norm": 0.37109375, + "learning_rate": 0.0015385770930656621, + "loss": 0.0957, + "step": 33109 + }, + { + "epoch": 0.2874106995598997, + "grad_norm": 1.015625, + "learning_rate": 0.0015385511033490272, + "loss": 0.1689, + "step": 33110 + }, + { + "epoch": 0.2874193800401038, + "grad_norm": 0.455078125, + "learning_rate": 0.001538525113152799, + "loss": 0.1543, + "step": 33111 + }, + { + "epoch": 0.287428060520308, + "grad_norm": 0.490234375, + "learning_rate": 0.0015384991224770069, + "loss": 0.1055, + "step": 33112 + }, + { + "epoch": 0.28743674100051214, + "grad_norm": 0.07421875, + "learning_rate": 0.0015384731313216784, + "loss": 0.0859, + "step": 33113 + }, + { + "epoch": 0.28744542148071633, + "grad_norm": 0.2490234375, + "learning_rate": 0.0015384471396868422, + "loss": 0.085, + "step": 33114 + }, + { + "epoch": 0.28745410196092047, + "grad_norm": 0.2109375, + "learning_rate": 0.0015384211475725268, + "loss": 0.0957, + "step": 33115 + }, + { + "epoch": 0.28746278244112466, + "grad_norm": 0.271484375, + "learning_rate": 0.0015383951549787606, + "loss": 0.0864, + "step": 33116 + }, + { + "epoch": 0.2874714629213288, + "grad_norm": 0.2451171875, + "learning_rate": 0.0015383691619055724, + "loss": 0.1191, + "step": 33117 + }, + { + "epoch": 0.287480143401533, + "grad_norm": 0.267578125, + "learning_rate": 0.0015383431683529901, + "loss": 0.0972, + "step": 33118 + }, + { + "epoch": 0.28748882388173713, + "grad_norm": 0.1376953125, + "learning_rate": 0.001538317174321042, + "loss": 0.123, + "step": 33119 + }, + { + "epoch": 0.2874975043619413, + "grad_norm": 0.90234375, + "learning_rate": 0.001538291179809757, + "loss": 0.1064, + "step": 33120 + }, + { + "epoch": 0.28750618484214546, + "grad_norm": 0.32421875, + "learning_rate": 0.0015382651848191635, + "loss": 0.1211, + "step": 33121 + }, + { + "epoch": 0.28751486532234966, + "grad_norm": 0.53125, + "learning_rate": 0.0015382391893492896, + "loss": 0.1055, + "step": 33122 + }, + { + "epoch": 0.2875235458025538, + "grad_norm": 0.3125, + "learning_rate": 0.0015382131934001642, + "loss": 0.2324, + "step": 33123 + }, + { + "epoch": 0.287532226282758, + "grad_norm": 0.2294921875, + "learning_rate": 0.001538187196971815, + "loss": 0.0923, + "step": 33124 + }, + { + "epoch": 0.2875409067629621, + "grad_norm": 0.26171875, + "learning_rate": 0.001538161200064271, + "loss": 0.126, + "step": 33125 + }, + { + "epoch": 0.2875495872431663, + "grad_norm": 0.271484375, + "learning_rate": 0.0015381352026775609, + "loss": 0.0908, + "step": 33126 + }, + { + "epoch": 0.28755826772337045, + "grad_norm": 0.337890625, + "learning_rate": 0.0015381092048117123, + "loss": 0.126, + "step": 33127 + }, + { + "epoch": 0.28756694820357465, + "grad_norm": 0.22265625, + "learning_rate": 0.001538083206466754, + "loss": 0.1177, + "step": 33128 + }, + { + "epoch": 0.2875756286837788, + "grad_norm": 0.12353515625, + "learning_rate": 
0.0015380572076427149, + "loss": 0.0708, + "step": 33129 + }, + { + "epoch": 0.287584309163983, + "grad_norm": 0.40234375, + "learning_rate": 0.0015380312083396225, + "loss": 0.1094, + "step": 33130 + }, + { + "epoch": 0.2875929896441871, + "grad_norm": 0.1181640625, + "learning_rate": 0.0015380052085575062, + "loss": 0.0781, + "step": 33131 + }, + { + "epoch": 0.2876016701243913, + "grad_norm": 0.1181640625, + "learning_rate": 0.0015379792082963937, + "loss": 0.1196, + "step": 33132 + }, + { + "epoch": 0.28761035060459544, + "grad_norm": 0.1064453125, + "learning_rate": 0.0015379532075563138, + "loss": 0.1172, + "step": 33133 + }, + { + "epoch": 0.28761903108479964, + "grad_norm": 0.1923828125, + "learning_rate": 0.0015379272063372948, + "loss": 0.1191, + "step": 33134 + }, + { + "epoch": 0.2876277115650038, + "grad_norm": 0.6015625, + "learning_rate": 0.0015379012046393655, + "loss": 0.1152, + "step": 33135 + }, + { + "epoch": 0.28763639204520797, + "grad_norm": 0.48828125, + "learning_rate": 0.0015378752024625538, + "loss": 0.1162, + "step": 33136 + }, + { + "epoch": 0.2876450725254121, + "grad_norm": 0.119140625, + "learning_rate": 0.0015378491998068883, + "loss": 0.1064, + "step": 33137 + }, + { + "epoch": 0.2876537530056163, + "grad_norm": 0.3828125, + "learning_rate": 0.0015378231966723973, + "loss": 0.0679, + "step": 33138 + }, + { + "epoch": 0.28766243348582043, + "grad_norm": 0.11669921875, + "learning_rate": 0.0015377971930591097, + "loss": 0.1182, + "step": 33139 + }, + { + "epoch": 0.2876711139660246, + "grad_norm": 0.1005859375, + "learning_rate": 0.0015377711889670539, + "loss": 0.1133, + "step": 33140 + }, + { + "epoch": 0.28767979444622876, + "grad_norm": 0.65234375, + "learning_rate": 0.0015377451843962575, + "loss": 0.1221, + "step": 33141 + }, + { + "epoch": 0.28768847492643296, + "grad_norm": 0.375, + "learning_rate": 0.0015377191793467502, + "loss": 0.0879, + "step": 33142 + }, + { + "epoch": 0.2876971554066371, + "grad_norm": 1.0390625, + "learning_rate": 0.0015376931738185593, + "loss": 0.0957, + "step": 33143 + }, + { + "epoch": 0.28770583588684123, + "grad_norm": 0.20703125, + "learning_rate": 0.001537667167811714, + "loss": 0.1123, + "step": 33144 + }, + { + "epoch": 0.2877145163670454, + "grad_norm": 0.6015625, + "learning_rate": 0.0015376411613262425, + "loss": 0.1162, + "step": 33145 + }, + { + "epoch": 0.28772319684724956, + "grad_norm": 0.1123046875, + "learning_rate": 0.0015376151543621728, + "loss": 0.0908, + "step": 33146 + }, + { + "epoch": 0.28773187732745376, + "grad_norm": 0.72265625, + "learning_rate": 0.001537589146919534, + "loss": 0.0938, + "step": 33147 + }, + { + "epoch": 0.2877405578076579, + "grad_norm": 0.1474609375, + "learning_rate": 0.001537563138998354, + "loss": 0.124, + "step": 33148 + }, + { + "epoch": 0.2877492382878621, + "grad_norm": 0.66796875, + "learning_rate": 0.0015375371305986624, + "loss": 0.1182, + "step": 33149 + }, + { + "epoch": 0.2877579187680662, + "grad_norm": 0.427734375, + "learning_rate": 0.0015375111217204858, + "loss": 0.0986, + "step": 33150 + }, + { + "epoch": 0.2877665992482704, + "grad_norm": 0.36328125, + "learning_rate": 0.001537485112363854, + "loss": 0.1069, + "step": 33151 + }, + { + "epoch": 0.28777527972847455, + "grad_norm": 0.5703125, + "learning_rate": 0.001537459102528795, + "loss": 0.1328, + "step": 33152 + }, + { + "epoch": 0.28778396020867875, + "grad_norm": 0.107421875, + "learning_rate": 0.0015374330922153375, + "loss": 0.0884, + "step": 33153 + }, + { + "epoch": 0.2877926406888829, + 
"grad_norm": 0.345703125, + "learning_rate": 0.0015374070814235094, + "loss": 0.0845, + "step": 33154 + }, + { + "epoch": 0.2878013211690871, + "grad_norm": 0.77734375, + "learning_rate": 0.0015373810701533397, + "loss": 0.1016, + "step": 33155 + }, + { + "epoch": 0.2878100016492912, + "grad_norm": 0.203125, + "learning_rate": 0.0015373550584048565, + "loss": 0.0791, + "step": 33156 + }, + { + "epoch": 0.2878186821294954, + "grad_norm": 0.2578125, + "learning_rate": 0.0015373290461780887, + "loss": 0.1318, + "step": 33157 + }, + { + "epoch": 0.28782736260969954, + "grad_norm": 0.44921875, + "learning_rate": 0.001537303033473064, + "loss": 0.105, + "step": 33158 + }, + { + "epoch": 0.28783604308990374, + "grad_norm": 0.10498046875, + "learning_rate": 0.0015372770202898114, + "loss": 0.1021, + "step": 33159 + }, + { + "epoch": 0.2878447235701079, + "grad_norm": 1.3515625, + "learning_rate": 0.0015372510066283596, + "loss": 0.1025, + "step": 33160 + }, + { + "epoch": 0.28785340405031207, + "grad_norm": 0.326171875, + "learning_rate": 0.0015372249924887362, + "loss": 0.1089, + "step": 33161 + }, + { + "epoch": 0.2878620845305162, + "grad_norm": 0.13671875, + "learning_rate": 0.0015371989778709702, + "loss": 0.1001, + "step": 33162 + }, + { + "epoch": 0.2878707650107204, + "grad_norm": 0.388671875, + "learning_rate": 0.0015371729627750902, + "loss": 0.1064, + "step": 33163 + }, + { + "epoch": 0.28787944549092453, + "grad_norm": 0.09912109375, + "learning_rate": 0.0015371469472011243, + "loss": 0.0703, + "step": 33164 + }, + { + "epoch": 0.2878881259711287, + "grad_norm": 0.296875, + "learning_rate": 0.001537120931149101, + "loss": 0.1484, + "step": 33165 + }, + { + "epoch": 0.28789680645133287, + "grad_norm": 0.6953125, + "learning_rate": 0.0015370949146190488, + "loss": 0.1367, + "step": 33166 + }, + { + "epoch": 0.28790548693153706, + "grad_norm": 0.357421875, + "learning_rate": 0.0015370688976109959, + "loss": 0.1196, + "step": 33167 + }, + { + "epoch": 0.2879141674117412, + "grad_norm": 0.890625, + "learning_rate": 0.0015370428801249716, + "loss": 0.0938, + "step": 33168 + }, + { + "epoch": 0.2879228478919454, + "grad_norm": 0.13671875, + "learning_rate": 0.0015370168621610034, + "loss": 0.1445, + "step": 33169 + }, + { + "epoch": 0.2879315283721495, + "grad_norm": 0.2109375, + "learning_rate": 0.0015369908437191204, + "loss": 0.1123, + "step": 33170 + }, + { + "epoch": 0.2879402088523537, + "grad_norm": 0.0947265625, + "learning_rate": 0.0015369648247993508, + "loss": 0.084, + "step": 33171 + }, + { + "epoch": 0.28794888933255786, + "grad_norm": 0.451171875, + "learning_rate": 0.0015369388054017226, + "loss": 0.0947, + "step": 33172 + }, + { + "epoch": 0.28795756981276205, + "grad_norm": 0.177734375, + "learning_rate": 0.001536912785526265, + "loss": 0.125, + "step": 33173 + }, + { + "epoch": 0.2879662502929662, + "grad_norm": 0.1201171875, + "learning_rate": 0.0015368867651730062, + "loss": 0.0996, + "step": 33174 + }, + { + "epoch": 0.2879749307731704, + "grad_norm": 0.2578125, + "learning_rate": 0.0015368607443419745, + "loss": 0.1123, + "step": 33175 + }, + { + "epoch": 0.2879836112533745, + "grad_norm": 0.26171875, + "learning_rate": 0.0015368347230331984, + "loss": 0.1523, + "step": 33176 + }, + { + "epoch": 0.2879922917335787, + "grad_norm": 0.162109375, + "learning_rate": 0.0015368087012467063, + "loss": 0.0986, + "step": 33177 + }, + { + "epoch": 0.28800097221378285, + "grad_norm": 0.78515625, + "learning_rate": 0.001536782678982527, + "loss": 0.1406, + "step": 33178 + }, + { + 
"epoch": 0.28800965269398704, + "grad_norm": 0.11572265625, + "learning_rate": 0.0015367566562406885, + "loss": 0.0952, + "step": 33179 + }, + { + "epoch": 0.2880183331741912, + "grad_norm": 0.66796875, + "learning_rate": 0.0015367306330212197, + "loss": 0.1162, + "step": 33180 + }, + { + "epoch": 0.28802701365439537, + "grad_norm": 0.0859375, + "learning_rate": 0.0015367046093241487, + "loss": 0.103, + "step": 33181 + }, + { + "epoch": 0.2880356941345995, + "grad_norm": 0.185546875, + "learning_rate": 0.0015366785851495045, + "loss": 0.1055, + "step": 33182 + }, + { + "epoch": 0.2880443746148037, + "grad_norm": 0.54296875, + "learning_rate": 0.0015366525604973147, + "loss": 0.1016, + "step": 33183 + }, + { + "epoch": 0.28805305509500784, + "grad_norm": 0.3359375, + "learning_rate": 0.001536626535367608, + "loss": 0.1035, + "step": 33184 + }, + { + "epoch": 0.28806173557521203, + "grad_norm": 0.37890625, + "learning_rate": 0.0015366005097604137, + "loss": 0.0952, + "step": 33185 + }, + { + "epoch": 0.28807041605541617, + "grad_norm": 0.251953125, + "learning_rate": 0.0015365744836757591, + "loss": 0.1045, + "step": 33186 + }, + { + "epoch": 0.28807909653562036, + "grad_norm": 0.10595703125, + "learning_rate": 0.0015365484571136737, + "loss": 0.1113, + "step": 33187 + }, + { + "epoch": 0.2880877770158245, + "grad_norm": 0.64453125, + "learning_rate": 0.001536522430074185, + "loss": 0.1475, + "step": 33188 + }, + { + "epoch": 0.2880964574960287, + "grad_norm": 0.2451171875, + "learning_rate": 0.0015364964025573222, + "loss": 0.0977, + "step": 33189 + }, + { + "epoch": 0.28810513797623283, + "grad_norm": 0.67578125, + "learning_rate": 0.0015364703745631134, + "loss": 0.1182, + "step": 33190 + }, + { + "epoch": 0.288113818456437, + "grad_norm": 0.435546875, + "learning_rate": 0.0015364443460915873, + "loss": 0.0986, + "step": 33191 + }, + { + "epoch": 0.28812249893664116, + "grad_norm": 0.1455078125, + "learning_rate": 0.0015364183171427722, + "loss": 0.1436, + "step": 33192 + }, + { + "epoch": 0.28813117941684535, + "grad_norm": 0.361328125, + "learning_rate": 0.0015363922877166965, + "loss": 0.1016, + "step": 33193 + }, + { + "epoch": 0.2881398598970495, + "grad_norm": 0.640625, + "learning_rate": 0.0015363662578133889, + "loss": 0.0815, + "step": 33194 + }, + { + "epoch": 0.2881485403772537, + "grad_norm": 0.78515625, + "learning_rate": 0.0015363402274328774, + "loss": 0.1191, + "step": 33195 + }, + { + "epoch": 0.2881572208574578, + "grad_norm": 0.82421875, + "learning_rate": 0.001536314196575191, + "loss": 0.1084, + "step": 33196 + }, + { + "epoch": 0.288165901337662, + "grad_norm": 0.314453125, + "learning_rate": 0.0015362881652403579, + "loss": 0.0898, + "step": 33197 + }, + { + "epoch": 0.28817458181786615, + "grad_norm": 0.1796875, + "learning_rate": 0.0015362621334284066, + "loss": 0.1006, + "step": 33198 + }, + { + "epoch": 0.28818326229807034, + "grad_norm": 0.2109375, + "learning_rate": 0.0015362361011393658, + "loss": 0.0776, + "step": 33199 + }, + { + "epoch": 0.2881919427782745, + "grad_norm": 0.1728515625, + "learning_rate": 0.0015362100683732633, + "loss": 0.1001, + "step": 33200 + }, + { + "epoch": 0.28820062325847867, + "grad_norm": 0.5234375, + "learning_rate": 0.0015361840351301285, + "loss": 0.1074, + "step": 33201 + }, + { + "epoch": 0.2882093037386828, + "grad_norm": 0.333984375, + "learning_rate": 0.0015361580014099892, + "loss": 0.1094, + "step": 33202 + }, + { + "epoch": 0.288217984218887, + "grad_norm": 0.08837890625, + "learning_rate": 0.0015361319672128743, + 
"loss": 0.0986, + "step": 33203 + }, + { + "epoch": 0.28822666469909114, + "grad_norm": 0.197265625, + "learning_rate": 0.0015361059325388116, + "loss": 0.124, + "step": 33204 + }, + { + "epoch": 0.28823534517929533, + "grad_norm": 0.48828125, + "learning_rate": 0.0015360798973878302, + "loss": 0.0972, + "step": 33205 + }, + { + "epoch": 0.28824402565949947, + "grad_norm": 0.1904296875, + "learning_rate": 0.0015360538617599582, + "loss": 0.1211, + "step": 33206 + }, + { + "epoch": 0.28825270613970366, + "grad_norm": 0.408203125, + "learning_rate": 0.0015360278256552245, + "loss": 0.1162, + "step": 33207 + }, + { + "epoch": 0.2882613866199078, + "grad_norm": 0.259765625, + "learning_rate": 0.001536001789073657, + "loss": 0.0835, + "step": 33208 + }, + { + "epoch": 0.288270067100112, + "grad_norm": 0.1826171875, + "learning_rate": 0.0015359757520152849, + "loss": 0.1035, + "step": 33209 + }, + { + "epoch": 0.28827874758031613, + "grad_norm": 0.5234375, + "learning_rate": 0.0015359497144801363, + "loss": 0.0938, + "step": 33210 + }, + { + "epoch": 0.2882874280605203, + "grad_norm": 0.1103515625, + "learning_rate": 0.001535923676468239, + "loss": 0.084, + "step": 33211 + }, + { + "epoch": 0.28829610854072446, + "grad_norm": 0.22265625, + "learning_rate": 0.0015358976379796227, + "loss": 0.1123, + "step": 33212 + }, + { + "epoch": 0.28830478902092865, + "grad_norm": 0.275390625, + "learning_rate": 0.001535871599014315, + "loss": 0.1182, + "step": 33213 + }, + { + "epoch": 0.2883134695011328, + "grad_norm": 0.2099609375, + "learning_rate": 0.0015358455595723451, + "loss": 0.0889, + "step": 33214 + }, + { + "epoch": 0.288322149981337, + "grad_norm": 0.3125, + "learning_rate": 0.0015358195196537407, + "loss": 0.0825, + "step": 33215 + }, + { + "epoch": 0.2883308304615411, + "grad_norm": 0.134765625, + "learning_rate": 0.0015357934792585306, + "loss": 0.083, + "step": 33216 + }, + { + "epoch": 0.2883395109417453, + "grad_norm": 0.150390625, + "learning_rate": 0.0015357674383867437, + "loss": 0.1299, + "step": 33217 + }, + { + "epoch": 0.28834819142194945, + "grad_norm": 0.43359375, + "learning_rate": 0.0015357413970384078, + "loss": 0.126, + "step": 33218 + }, + { + "epoch": 0.28835687190215364, + "grad_norm": 0.26953125, + "learning_rate": 0.0015357153552135515, + "loss": 0.1338, + "step": 33219 + }, + { + "epoch": 0.2883655523823578, + "grad_norm": 0.244140625, + "learning_rate": 0.0015356893129122035, + "loss": 0.1123, + "step": 33220 + }, + { + "epoch": 0.288374232862562, + "grad_norm": 0.10302734375, + "learning_rate": 0.0015356632701343921, + "loss": 0.1211, + "step": 33221 + }, + { + "epoch": 0.2883829133427661, + "grad_norm": 0.09521484375, + "learning_rate": 0.0015356372268801465, + "loss": 0.1104, + "step": 33222 + }, + { + "epoch": 0.2883915938229703, + "grad_norm": 0.150390625, + "learning_rate": 0.001535611183149494, + "loss": 0.1074, + "step": 33223 + }, + { + "epoch": 0.28840027430317444, + "grad_norm": 0.515625, + "learning_rate": 0.0015355851389424636, + "loss": 0.1147, + "step": 33224 + }, + { + "epoch": 0.28840895478337863, + "grad_norm": 0.64453125, + "learning_rate": 0.0015355590942590838, + "loss": 0.0908, + "step": 33225 + }, + { + "epoch": 0.2884176352635828, + "grad_norm": 0.23828125, + "learning_rate": 0.0015355330490993836, + "loss": 0.0996, + "step": 33226 + }, + { + "epoch": 0.28842631574378697, + "grad_norm": 0.1474609375, + "learning_rate": 0.0015355070034633906, + "loss": 0.1162, + "step": 33227 + }, + { + "epoch": 0.2884349962239911, + "grad_norm": 0.6484375, + 
"learning_rate": 0.001535480957351134, + "loss": 0.1025, + "step": 33228 + }, + { + "epoch": 0.2884436767041953, + "grad_norm": 0.2333984375, + "learning_rate": 0.0015354549107626416, + "loss": 0.1396, + "step": 33229 + }, + { + "epoch": 0.28845235718439943, + "grad_norm": 0.875, + "learning_rate": 0.0015354288636979425, + "loss": 0.0898, + "step": 33230 + }, + { + "epoch": 0.2884610376646036, + "grad_norm": 1.6015625, + "learning_rate": 0.001535402816157065, + "loss": 0.3789, + "step": 33231 + }, + { + "epoch": 0.28846971814480776, + "grad_norm": 0.298828125, + "learning_rate": 0.0015353767681400372, + "loss": 0.1035, + "step": 33232 + }, + { + "epoch": 0.28847839862501196, + "grad_norm": 0.55078125, + "learning_rate": 0.001535350719646888, + "loss": 0.1514, + "step": 33233 + }, + { + "epoch": 0.2884870791052161, + "grad_norm": 0.1875, + "learning_rate": 0.001535324670677646, + "loss": 0.0889, + "step": 33234 + }, + { + "epoch": 0.2884957595854203, + "grad_norm": 0.13671875, + "learning_rate": 0.0015352986212323397, + "loss": 0.1367, + "step": 33235 + }, + { + "epoch": 0.2885044400656244, + "grad_norm": 0.1162109375, + "learning_rate": 0.0015352725713109966, + "loss": 0.1562, + "step": 33236 + }, + { + "epoch": 0.2885131205458286, + "grad_norm": 0.73828125, + "learning_rate": 0.0015352465209136467, + "loss": 0.1182, + "step": 33237 + }, + { + "epoch": 0.28852180102603275, + "grad_norm": 0.2294921875, + "learning_rate": 0.0015352204700403175, + "loss": 0.1172, + "step": 33238 + }, + { + "epoch": 0.28853048150623695, + "grad_norm": 0.8671875, + "learning_rate": 0.0015351944186910377, + "loss": 0.0894, + "step": 33239 + }, + { + "epoch": 0.2885391619864411, + "grad_norm": 0.88671875, + "learning_rate": 0.0015351683668658358, + "loss": 0.123, + "step": 33240 + }, + { + "epoch": 0.2885478424666453, + "grad_norm": 0.1103515625, + "learning_rate": 0.0015351423145647405, + "loss": 0.0977, + "step": 33241 + }, + { + "epoch": 0.2885565229468494, + "grad_norm": 1.0078125, + "learning_rate": 0.0015351162617877798, + "loss": 0.0967, + "step": 33242 + }, + { + "epoch": 0.2885652034270536, + "grad_norm": 0.2177734375, + "learning_rate": 0.0015350902085349825, + "loss": 0.127, + "step": 33243 + }, + { + "epoch": 0.28857388390725774, + "grad_norm": 0.2109375, + "learning_rate": 0.0015350641548063776, + "loss": 0.0928, + "step": 33244 + }, + { + "epoch": 0.28858256438746194, + "grad_norm": 0.263671875, + "learning_rate": 0.0015350381006019923, + "loss": 0.0752, + "step": 33245 + }, + { + "epoch": 0.2885912448676661, + "grad_norm": 0.10107421875, + "learning_rate": 0.0015350120459218563, + "loss": 0.1143, + "step": 33246 + }, + { + "epoch": 0.28859992534787027, + "grad_norm": 0.1005859375, + "learning_rate": 0.0015349859907659975, + "loss": 0.0664, + "step": 33247 + }, + { + "epoch": 0.2886086058280744, + "grad_norm": 0.5, + "learning_rate": 0.001534959935134445, + "loss": 0.1426, + "step": 33248 + }, + { + "epoch": 0.2886172863082786, + "grad_norm": 0.08935546875, + "learning_rate": 0.0015349338790272263, + "loss": 0.1045, + "step": 33249 + }, + { + "epoch": 0.28862596678848274, + "grad_norm": 0.1357421875, + "learning_rate": 0.0015349078224443707, + "loss": 0.1182, + "step": 33250 + }, + { + "epoch": 0.28863464726868693, + "grad_norm": 0.10986328125, + "learning_rate": 0.0015348817653859067, + "loss": 0.1162, + "step": 33251 + }, + { + "epoch": 0.28864332774889107, + "grad_norm": 0.2265625, + "learning_rate": 0.0015348557078518619, + "loss": 0.0913, + "step": 33252 + }, + { + "epoch": 0.28865200822909526, 
+ "grad_norm": 0.7265625, + "learning_rate": 0.0015348296498422659, + "loss": 0.1562, + "step": 33253 + }, + { + "epoch": 0.2886606887092994, + "grad_norm": 0.35546875, + "learning_rate": 0.0015348035913571466, + "loss": 0.1348, + "step": 33254 + }, + { + "epoch": 0.2886693691895036, + "grad_norm": 0.302734375, + "learning_rate": 0.0015347775323965326, + "loss": 0.0942, + "step": 33255 + }, + { + "epoch": 0.2886780496697077, + "grad_norm": 0.50390625, + "learning_rate": 0.0015347514729604523, + "loss": 0.0957, + "step": 33256 + }, + { + "epoch": 0.2886867301499119, + "grad_norm": 0.1083984375, + "learning_rate": 0.0015347254130489347, + "loss": 0.1016, + "step": 33257 + }, + { + "epoch": 0.28869541063011606, + "grad_norm": 0.3984375, + "learning_rate": 0.0015346993526620077, + "loss": 0.1191, + "step": 33258 + }, + { + "epoch": 0.28870409111032025, + "grad_norm": 0.39453125, + "learning_rate": 0.0015346732917997, + "loss": 0.105, + "step": 33259 + }, + { + "epoch": 0.2887127715905244, + "grad_norm": 0.27734375, + "learning_rate": 0.00153464723046204, + "loss": 0.0752, + "step": 33260 + }, + { + "epoch": 0.2887214520707286, + "grad_norm": 0.4765625, + "learning_rate": 0.0015346211686490566, + "loss": 0.0898, + "step": 33261 + }, + { + "epoch": 0.2887301325509327, + "grad_norm": 0.482421875, + "learning_rate": 0.0015345951063607778, + "loss": 0.1846, + "step": 33262 + }, + { + "epoch": 0.2887388130311369, + "grad_norm": 0.1728515625, + "learning_rate": 0.0015345690435972323, + "loss": 0.1387, + "step": 33263 + }, + { + "epoch": 0.28874749351134105, + "grad_norm": 0.10791015625, + "learning_rate": 0.0015345429803584488, + "loss": 0.1035, + "step": 33264 + }, + { + "epoch": 0.28875617399154524, + "grad_norm": 0.283203125, + "learning_rate": 0.0015345169166444557, + "loss": 0.123, + "step": 33265 + }, + { + "epoch": 0.2887648544717494, + "grad_norm": 0.263671875, + "learning_rate": 0.0015344908524552812, + "loss": 0.1279, + "step": 33266 + }, + { + "epoch": 0.2887735349519535, + "grad_norm": 0.259765625, + "learning_rate": 0.0015344647877909541, + "loss": 0.0957, + "step": 33267 + }, + { + "epoch": 0.2887822154321577, + "grad_norm": 0.1865234375, + "learning_rate": 0.0015344387226515028, + "loss": 0.0811, + "step": 33268 + }, + { + "epoch": 0.28879089591236184, + "grad_norm": 0.265625, + "learning_rate": 0.0015344126570369558, + "loss": 0.1123, + "step": 33269 + }, + { + "epoch": 0.28879957639256604, + "grad_norm": 0.2197265625, + "learning_rate": 0.001534386590947342, + "loss": 0.0981, + "step": 33270 + }, + { + "epoch": 0.2888082568727702, + "grad_norm": 0.2138671875, + "learning_rate": 0.0015343605243826892, + "loss": 0.0825, + "step": 33271 + }, + { + "epoch": 0.28881693735297437, + "grad_norm": 0.703125, + "learning_rate": 0.001534334457343026, + "loss": 0.1445, + "step": 33272 + }, + { + "epoch": 0.2888256178331785, + "grad_norm": 0.203125, + "learning_rate": 0.0015343083898283817, + "loss": 0.1006, + "step": 33273 + }, + { + "epoch": 0.2888342983133827, + "grad_norm": 0.1953125, + "learning_rate": 0.001534282321838784, + "loss": 0.0957, + "step": 33274 + }, + { + "epoch": 0.28884297879358684, + "grad_norm": 0.158203125, + "learning_rate": 0.0015342562533742616, + "loss": 0.1006, + "step": 33275 + }, + { + "epoch": 0.28885165927379103, + "grad_norm": 0.08837890625, + "learning_rate": 0.0015342301844348432, + "loss": 0.0996, + "step": 33276 + }, + { + "epoch": 0.28886033975399517, + "grad_norm": 0.439453125, + "learning_rate": 0.0015342041150205575, + "loss": 0.1011, + "step": 33277 + }, + 
{ + "epoch": 0.28886902023419936, + "grad_norm": 0.263671875, + "learning_rate": 0.0015341780451314324, + "loss": 0.083, + "step": 33278 + }, + { + "epoch": 0.2888777007144035, + "grad_norm": 0.11328125, + "learning_rate": 0.0015341519747674968, + "loss": 0.1084, + "step": 33279 + }, + { + "epoch": 0.2888863811946077, + "grad_norm": 0.86328125, + "learning_rate": 0.001534125903928779, + "loss": 0.1055, + "step": 33280 + }, + { + "epoch": 0.2888950616748118, + "grad_norm": 0.1533203125, + "learning_rate": 0.0015340998326153076, + "loss": 0.0903, + "step": 33281 + }, + { + "epoch": 0.288903742155016, + "grad_norm": 0.4921875, + "learning_rate": 0.0015340737608271113, + "loss": 0.1191, + "step": 33282 + }, + { + "epoch": 0.28891242263522016, + "grad_norm": 0.3359375, + "learning_rate": 0.0015340476885642184, + "loss": 0.124, + "step": 33283 + }, + { + "epoch": 0.28892110311542435, + "grad_norm": 0.162109375, + "learning_rate": 0.0015340216158266572, + "loss": 0.1118, + "step": 33284 + }, + { + "epoch": 0.2889297835956285, + "grad_norm": 0.392578125, + "learning_rate": 0.0015339955426144569, + "loss": 0.104, + "step": 33285 + }, + { + "epoch": 0.2889384640758327, + "grad_norm": 0.107421875, + "learning_rate": 0.0015339694689276455, + "loss": 0.0928, + "step": 33286 + }, + { + "epoch": 0.2889471445560368, + "grad_norm": 0.515625, + "learning_rate": 0.0015339433947662517, + "loss": 0.0986, + "step": 33287 + }, + { + "epoch": 0.288955825036241, + "grad_norm": 0.0927734375, + "learning_rate": 0.0015339173201303037, + "loss": 0.0811, + "step": 33288 + }, + { + "epoch": 0.28896450551644515, + "grad_norm": 0.1357421875, + "learning_rate": 0.0015338912450198303, + "loss": 0.1816, + "step": 33289 + }, + { + "epoch": 0.28897318599664934, + "grad_norm": 0.1708984375, + "learning_rate": 0.00153386516943486, + "loss": 0.1128, + "step": 33290 + }, + { + "epoch": 0.2889818664768535, + "grad_norm": 0.1982421875, + "learning_rate": 0.001533839093375421, + "loss": 0.0859, + "step": 33291 + }, + { + "epoch": 0.28899054695705767, + "grad_norm": 0.2138671875, + "learning_rate": 0.0015338130168415425, + "loss": 0.0713, + "step": 33292 + }, + { + "epoch": 0.2889992274372618, + "grad_norm": 0.181640625, + "learning_rate": 0.0015337869398332523, + "loss": 0.1016, + "step": 33293 + }, + { + "epoch": 0.289007907917466, + "grad_norm": 0.2216796875, + "learning_rate": 0.0015337608623505795, + "loss": 0.1182, + "step": 33294 + }, + { + "epoch": 0.28901658839767014, + "grad_norm": 0.60546875, + "learning_rate": 0.0015337347843935522, + "loss": 0.1104, + "step": 33295 + }, + { + "epoch": 0.28902526887787433, + "grad_norm": 0.1884765625, + "learning_rate": 0.001533708705962199, + "loss": 0.1396, + "step": 33296 + }, + { + "epoch": 0.28903394935807847, + "grad_norm": 0.283203125, + "learning_rate": 0.0015336826270565486, + "loss": 0.1177, + "step": 33297 + }, + { + "epoch": 0.28904262983828266, + "grad_norm": 0.462890625, + "learning_rate": 0.0015336565476766293, + "loss": 0.0928, + "step": 33298 + }, + { + "epoch": 0.2890513103184868, + "grad_norm": 0.2158203125, + "learning_rate": 0.00153363046782247, + "loss": 0.1582, + "step": 33299 + }, + { + "epoch": 0.289059990798691, + "grad_norm": 0.6015625, + "learning_rate": 0.0015336043874940983, + "loss": 0.084, + "step": 33300 + }, + { + "epoch": 0.28906867127889513, + "grad_norm": 0.67578125, + "learning_rate": 0.0015335783066915435, + "loss": 0.1572, + "step": 33301 + }, + { + "epoch": 0.2890773517590993, + "grad_norm": 0.3125, + "learning_rate": 0.0015335522254148345, + 
"loss": 0.0693, + "step": 33302 + }, + { + "epoch": 0.28908603223930346, + "grad_norm": 0.126953125, + "learning_rate": 0.001533526143663999, + "loss": 0.083, + "step": 33303 + }, + { + "epoch": 0.28909471271950765, + "grad_norm": 0.1328125, + "learning_rate": 0.001533500061439066, + "loss": 0.1011, + "step": 33304 + }, + { + "epoch": 0.2891033931997118, + "grad_norm": 0.3671875, + "learning_rate": 0.0015334739787400636, + "loss": 0.0986, + "step": 33305 + }, + { + "epoch": 0.289112073679916, + "grad_norm": 0.171875, + "learning_rate": 0.0015334478955670203, + "loss": 0.1025, + "step": 33306 + }, + { + "epoch": 0.2891207541601201, + "grad_norm": 0.18359375, + "learning_rate": 0.0015334218119199656, + "loss": 0.0986, + "step": 33307 + }, + { + "epoch": 0.2891294346403243, + "grad_norm": 0.10595703125, + "learning_rate": 0.0015333957277989267, + "loss": 0.1162, + "step": 33308 + }, + { + "epoch": 0.28913811512052845, + "grad_norm": 0.50390625, + "learning_rate": 0.001533369643203933, + "loss": 0.0967, + "step": 33309 + }, + { + "epoch": 0.28914679560073264, + "grad_norm": 0.28125, + "learning_rate": 0.0015333435581350128, + "loss": 0.0781, + "step": 33310 + }, + { + "epoch": 0.2891554760809368, + "grad_norm": 0.40234375, + "learning_rate": 0.0015333174725921945, + "loss": 0.0874, + "step": 33311 + }, + { + "epoch": 0.289164156561141, + "grad_norm": 0.232421875, + "learning_rate": 0.0015332913865755068, + "loss": 0.0898, + "step": 33312 + }, + { + "epoch": 0.2891728370413451, + "grad_norm": 0.1357421875, + "learning_rate": 0.0015332653000849784, + "loss": 0.0918, + "step": 33313 + }, + { + "epoch": 0.2891815175215493, + "grad_norm": 0.1630859375, + "learning_rate": 0.001533239213120637, + "loss": 0.1084, + "step": 33314 + }, + { + "epoch": 0.28919019800175344, + "grad_norm": 0.4296875, + "learning_rate": 0.0015332131256825122, + "loss": 0.1533, + "step": 33315 + }, + { + "epoch": 0.28919887848195763, + "grad_norm": 0.115234375, + "learning_rate": 0.001533187037770632, + "loss": 0.1289, + "step": 33316 + }, + { + "epoch": 0.28920755896216177, + "grad_norm": 0.296875, + "learning_rate": 0.0015331609493850248, + "loss": 0.1104, + "step": 33317 + }, + { + "epoch": 0.28921623944236596, + "grad_norm": 0.796875, + "learning_rate": 0.001533134860525719, + "loss": 0.1016, + "step": 33318 + }, + { + "epoch": 0.2892249199225701, + "grad_norm": 0.1552734375, + "learning_rate": 0.0015331087711927438, + "loss": 0.0928, + "step": 33319 + }, + { + "epoch": 0.2892336004027743, + "grad_norm": 0.298828125, + "learning_rate": 0.0015330826813861275, + "loss": 0.1328, + "step": 33320 + }, + { + "epoch": 0.28924228088297843, + "grad_norm": 0.201171875, + "learning_rate": 0.0015330565911058983, + "loss": 0.0942, + "step": 33321 + }, + { + "epoch": 0.2892509613631826, + "grad_norm": 0.111328125, + "learning_rate": 0.001533030500352085, + "loss": 0.1221, + "step": 33322 + }, + { + "epoch": 0.28925964184338676, + "grad_norm": 0.1728515625, + "learning_rate": 0.0015330044091247158, + "loss": 0.1123, + "step": 33323 + }, + { + "epoch": 0.28926832232359095, + "grad_norm": 0.1630859375, + "learning_rate": 0.0015329783174238196, + "loss": 0.1289, + "step": 33324 + }, + { + "epoch": 0.2892770028037951, + "grad_norm": 0.1796875, + "learning_rate": 0.001532952225249425, + "loss": 0.0996, + "step": 33325 + }, + { + "epoch": 0.2892856832839993, + "grad_norm": 0.470703125, + "learning_rate": 0.0015329261326015602, + "loss": 0.1064, + "step": 33326 + }, + { + "epoch": 0.2892943637642034, + "grad_norm": 0.44921875, + 
"learning_rate": 0.0015329000394802537, + "loss": 0.0918, + "step": 33327 + }, + { + "epoch": 0.2893030442444076, + "grad_norm": 0.7109375, + "learning_rate": 0.0015328739458855345, + "loss": 0.0811, + "step": 33328 + }, + { + "epoch": 0.28931172472461175, + "grad_norm": 0.478515625, + "learning_rate": 0.0015328478518174306, + "loss": 0.1211, + "step": 33329 + }, + { + "epoch": 0.28932040520481594, + "grad_norm": 0.3046875, + "learning_rate": 0.0015328217572759708, + "loss": 0.1016, + "step": 33330 + }, + { + "epoch": 0.2893290856850201, + "grad_norm": 0.4609375, + "learning_rate": 0.0015327956622611838, + "loss": 0.085, + "step": 33331 + }, + { + "epoch": 0.2893377661652243, + "grad_norm": 0.76171875, + "learning_rate": 0.001532769566773098, + "loss": 0.1006, + "step": 33332 + }, + { + "epoch": 0.2893464466454284, + "grad_norm": 0.578125, + "learning_rate": 0.001532743470811742, + "loss": 0.209, + "step": 33333 + }, + { + "epoch": 0.2893551271256326, + "grad_norm": 0.30859375, + "learning_rate": 0.0015327173743771441, + "loss": 0.1055, + "step": 33334 + }, + { + "epoch": 0.28936380760583674, + "grad_norm": 0.10205078125, + "learning_rate": 0.0015326912774693332, + "loss": 0.1553, + "step": 33335 + }, + { + "epoch": 0.28937248808604094, + "grad_norm": 0.205078125, + "learning_rate": 0.0015326651800883372, + "loss": 0.0957, + "step": 33336 + }, + { + "epoch": 0.2893811685662451, + "grad_norm": 0.3984375, + "learning_rate": 0.0015326390822341853, + "loss": 0.1123, + "step": 33337 + }, + { + "epoch": 0.28938984904644927, + "grad_norm": 0.58203125, + "learning_rate": 0.0015326129839069056, + "loss": 0.1309, + "step": 33338 + }, + { + "epoch": 0.2893985295266534, + "grad_norm": 0.1513671875, + "learning_rate": 0.0015325868851065272, + "loss": 0.1094, + "step": 33339 + }, + { + "epoch": 0.2894072100068576, + "grad_norm": 0.10205078125, + "learning_rate": 0.001532560785833078, + "loss": 0.1011, + "step": 33340 + }, + { + "epoch": 0.28941589048706173, + "grad_norm": 1.09375, + "learning_rate": 0.0015325346860865867, + "loss": 0.2305, + "step": 33341 + }, + { + "epoch": 0.2894245709672659, + "grad_norm": 0.16796875, + "learning_rate": 0.0015325085858670824, + "loss": 0.0898, + "step": 33342 + }, + { + "epoch": 0.28943325144747006, + "grad_norm": 0.1787109375, + "learning_rate": 0.0015324824851745932, + "loss": 0.0879, + "step": 33343 + }, + { + "epoch": 0.28944193192767426, + "grad_norm": 0.0830078125, + "learning_rate": 0.001532456384009147, + "loss": 0.1064, + "step": 33344 + }, + { + "epoch": 0.2894506124078784, + "grad_norm": 0.43359375, + "learning_rate": 0.0015324302823707737, + "loss": 0.1084, + "step": 33345 + }, + { + "epoch": 0.2894592928880826, + "grad_norm": 0.1875, + "learning_rate": 0.0015324041802595007, + "loss": 0.0603, + "step": 33346 + }, + { + "epoch": 0.2894679733682867, + "grad_norm": 0.25, + "learning_rate": 0.0015323780776753575, + "loss": 0.1099, + "step": 33347 + }, + { + "epoch": 0.2894766538484909, + "grad_norm": 1.5234375, + "learning_rate": 0.0015323519746183716, + "loss": 0.2695, + "step": 33348 + }, + { + "epoch": 0.28948533432869505, + "grad_norm": 0.099609375, + "learning_rate": 0.001532325871088572, + "loss": 0.0986, + "step": 33349 + }, + { + "epoch": 0.28949401480889925, + "grad_norm": 0.349609375, + "learning_rate": 0.0015322997670859878, + "loss": 0.1562, + "step": 33350 + }, + { + "epoch": 0.2895026952891034, + "grad_norm": 0.341796875, + "learning_rate": 0.001532273662610647, + "loss": 0.0737, + "step": 33351 + }, + { + "epoch": 0.2895113757693076, + 
"grad_norm": 0.1201171875, + "learning_rate": 0.0015322475576625778, + "loss": 0.0728, + "step": 33352 + }, + { + "epoch": 0.2895200562495117, + "grad_norm": 0.240234375, + "learning_rate": 0.0015322214522418094, + "loss": 0.0879, + "step": 33353 + }, + { + "epoch": 0.2895287367297159, + "grad_norm": 0.1962890625, + "learning_rate": 0.0015321953463483702, + "loss": 0.1045, + "step": 33354 + }, + { + "epoch": 0.28953741720992004, + "grad_norm": 1.0625, + "learning_rate": 0.001532169239982289, + "loss": 0.1387, + "step": 33355 + }, + { + "epoch": 0.28954609769012424, + "grad_norm": 0.5078125, + "learning_rate": 0.0015321431331435935, + "loss": 0.0938, + "step": 33356 + }, + { + "epoch": 0.2895547781703284, + "grad_norm": 0.1943359375, + "learning_rate": 0.0015321170258323128, + "loss": 0.1226, + "step": 33357 + }, + { + "epoch": 0.28956345865053257, + "grad_norm": 0.296875, + "learning_rate": 0.0015320909180484752, + "loss": 0.1025, + "step": 33358 + }, + { + "epoch": 0.2895721391307367, + "grad_norm": 0.1396484375, + "learning_rate": 0.00153206480979211, + "loss": 0.1416, + "step": 33359 + }, + { + "epoch": 0.2895808196109409, + "grad_norm": 0.1884765625, + "learning_rate": 0.0015320387010632449, + "loss": 0.1328, + "step": 33360 + }, + { + "epoch": 0.28958950009114504, + "grad_norm": 0.3203125, + "learning_rate": 0.001532012591861909, + "loss": 0.0791, + "step": 33361 + }, + { + "epoch": 0.28959818057134923, + "grad_norm": 0.4921875, + "learning_rate": 0.0015319864821881305, + "loss": 0.1152, + "step": 33362 + }, + { + "epoch": 0.28960686105155337, + "grad_norm": 0.2294921875, + "learning_rate": 0.001531960372041938, + "loss": 0.0845, + "step": 33363 + }, + { + "epoch": 0.28961554153175756, + "grad_norm": 0.19140625, + "learning_rate": 0.0015319342614233602, + "loss": 0.1387, + "step": 33364 + }, + { + "epoch": 0.2896242220119617, + "grad_norm": 0.37109375, + "learning_rate": 0.0015319081503324256, + "loss": 0.0859, + "step": 33365 + }, + { + "epoch": 0.2896329024921659, + "grad_norm": 0.326171875, + "learning_rate": 0.0015318820387691626, + "loss": 0.1182, + "step": 33366 + }, + { + "epoch": 0.28964158297237, + "grad_norm": 0.3359375, + "learning_rate": 0.0015318559267336, + "loss": 0.0889, + "step": 33367 + }, + { + "epoch": 0.2896502634525742, + "grad_norm": 0.53125, + "learning_rate": 0.001531829814225766, + "loss": 0.1172, + "step": 33368 + }, + { + "epoch": 0.28965894393277836, + "grad_norm": 0.314453125, + "learning_rate": 0.0015318037012456898, + "loss": 0.1104, + "step": 33369 + }, + { + "epoch": 0.28966762441298255, + "grad_norm": 0.33984375, + "learning_rate": 0.0015317775877933992, + "loss": 0.123, + "step": 33370 + }, + { + "epoch": 0.2896763048931867, + "grad_norm": 0.1572265625, + "learning_rate": 0.0015317514738689235, + "loss": 0.0991, + "step": 33371 + }, + { + "epoch": 0.2896849853733909, + "grad_norm": 0.2255859375, + "learning_rate": 0.0015317253594722905, + "loss": 0.0776, + "step": 33372 + }, + { + "epoch": 0.289693665853595, + "grad_norm": 0.365234375, + "learning_rate": 0.0015316992446035294, + "loss": 0.1104, + "step": 33373 + }, + { + "epoch": 0.2897023463337992, + "grad_norm": 0.39453125, + "learning_rate": 0.0015316731292626686, + "loss": 0.0928, + "step": 33374 + }, + { + "epoch": 0.28971102681400335, + "grad_norm": 0.1181640625, + "learning_rate": 0.0015316470134497362, + "loss": 0.1055, + "step": 33375 + }, + { + "epoch": 0.28971970729420754, + "grad_norm": 0.51953125, + "learning_rate": 0.0015316208971647613, + "loss": 0.0908, + "step": 33376 + }, + { + 
"epoch": 0.2897283877744117, + "grad_norm": 0.369140625, + "learning_rate": 0.001531594780407772, + "loss": 0.124, + "step": 33377 + }, + { + "epoch": 0.28973706825461587, + "grad_norm": 0.12109375, + "learning_rate": 0.0015315686631787978, + "loss": 0.1348, + "step": 33378 + }, + { + "epoch": 0.28974574873482, + "grad_norm": 0.1640625, + "learning_rate": 0.0015315425454778659, + "loss": 0.125, + "step": 33379 + }, + { + "epoch": 0.2897544292150242, + "grad_norm": 0.11279296875, + "learning_rate": 0.0015315164273050057, + "loss": 0.1367, + "step": 33380 + }, + { + "epoch": 0.28976310969522834, + "grad_norm": 0.5390625, + "learning_rate": 0.0015314903086602462, + "loss": 0.0947, + "step": 33381 + }, + { + "epoch": 0.28977179017543253, + "grad_norm": 0.171875, + "learning_rate": 0.001531464189543615, + "loss": 0.1055, + "step": 33382 + }, + { + "epoch": 0.28978047065563667, + "grad_norm": 0.4609375, + "learning_rate": 0.0015314380699551406, + "loss": 0.1211, + "step": 33383 + }, + { + "epoch": 0.28978915113584086, + "grad_norm": 0.1669921875, + "learning_rate": 0.0015314119498948525, + "loss": 0.1289, + "step": 33384 + }, + { + "epoch": 0.289797831616045, + "grad_norm": 0.79296875, + "learning_rate": 0.0015313858293627789, + "loss": 0.1494, + "step": 33385 + }, + { + "epoch": 0.2898065120962492, + "grad_norm": 0.1171875, + "learning_rate": 0.0015313597083589476, + "loss": 0.1289, + "step": 33386 + }, + { + "epoch": 0.28981519257645333, + "grad_norm": 0.466796875, + "learning_rate": 0.0015313335868833884, + "loss": 0.0967, + "step": 33387 + }, + { + "epoch": 0.2898238730566575, + "grad_norm": 0.330078125, + "learning_rate": 0.0015313074649361289, + "loss": 0.0845, + "step": 33388 + }, + { + "epoch": 0.28983255353686166, + "grad_norm": 0.275390625, + "learning_rate": 0.0015312813425171982, + "loss": 0.127, + "step": 33389 + }, + { + "epoch": 0.2898412340170658, + "grad_norm": 0.3125, + "learning_rate": 0.0015312552196266246, + "loss": 0.1338, + "step": 33390 + }, + { + "epoch": 0.28984991449727, + "grad_norm": 0.15234375, + "learning_rate": 0.001531229096264437, + "loss": 0.1143, + "step": 33391 + }, + { + "epoch": 0.2898585949774741, + "grad_norm": 0.4140625, + "learning_rate": 0.0015312029724306635, + "loss": 0.0811, + "step": 33392 + }, + { + "epoch": 0.2898672754576783, + "grad_norm": 0.73828125, + "learning_rate": 0.0015311768481253334, + "loss": 0.106, + "step": 33393 + }, + { + "epoch": 0.28987595593788246, + "grad_norm": 0.095703125, + "learning_rate": 0.0015311507233484742, + "loss": 0.1064, + "step": 33394 + }, + { + "epoch": 0.28988463641808665, + "grad_norm": 0.4609375, + "learning_rate": 0.0015311245981001155, + "loss": 0.1089, + "step": 33395 + }, + { + "epoch": 0.2898933168982908, + "grad_norm": 0.58984375, + "learning_rate": 0.001531098472380285, + "loss": 0.167, + "step": 33396 + }, + { + "epoch": 0.289901997378495, + "grad_norm": 0.10205078125, + "learning_rate": 0.0015310723461890118, + "loss": 0.0859, + "step": 33397 + }, + { + "epoch": 0.2899106778586991, + "grad_norm": 0.369140625, + "learning_rate": 0.0015310462195263243, + "loss": 0.1045, + "step": 33398 + }, + { + "epoch": 0.2899193583389033, + "grad_norm": 0.0859375, + "learning_rate": 0.0015310200923922515, + "loss": 0.103, + "step": 33399 + }, + { + "epoch": 0.28992803881910745, + "grad_norm": 0.76953125, + "learning_rate": 0.0015309939647868214, + "loss": 0.125, + "step": 33400 + }, + { + "epoch": 0.28993671929931164, + "grad_norm": 0.1650390625, + "learning_rate": 0.0015309678367100626, + "loss": 0.0505, + "step": 
33401 + }, + { + "epoch": 0.2899453997795158, + "grad_norm": 0.17578125, + "learning_rate": 0.001530941708162004, + "loss": 0.0801, + "step": 33402 + }, + { + "epoch": 0.28995408025971997, + "grad_norm": 0.75390625, + "learning_rate": 0.0015309155791426738, + "loss": 0.1089, + "step": 33403 + }, + { + "epoch": 0.2899627607399241, + "grad_norm": 0.1513671875, + "learning_rate": 0.0015308894496521013, + "loss": 0.1016, + "step": 33404 + }, + { + "epoch": 0.2899714412201283, + "grad_norm": 0.2412109375, + "learning_rate": 0.001530863319690314, + "loss": 0.1055, + "step": 33405 + }, + { + "epoch": 0.28998012170033244, + "grad_norm": 0.91796875, + "learning_rate": 0.0015308371892573413, + "loss": 0.1484, + "step": 33406 + }, + { + "epoch": 0.28998880218053663, + "grad_norm": 0.111328125, + "learning_rate": 0.0015308110583532118, + "loss": 0.0947, + "step": 33407 + }, + { + "epoch": 0.28999748266074077, + "grad_norm": 0.34375, + "learning_rate": 0.0015307849269779535, + "loss": 0.085, + "step": 33408 + }, + { + "epoch": 0.29000616314094496, + "grad_norm": 0.10009765625, + "learning_rate": 0.0015307587951315952, + "loss": 0.1074, + "step": 33409 + }, + { + "epoch": 0.2900148436211491, + "grad_norm": 0.302734375, + "learning_rate": 0.0015307326628141655, + "loss": 0.1235, + "step": 33410 + }, + { + "epoch": 0.2900235241013533, + "grad_norm": 0.390625, + "learning_rate": 0.0015307065300256934, + "loss": 0.1011, + "step": 33411 + }, + { + "epoch": 0.29003220458155743, + "grad_norm": 0.58203125, + "learning_rate": 0.001530680396766207, + "loss": 0.1079, + "step": 33412 + }, + { + "epoch": 0.2900408850617616, + "grad_norm": 0.09912109375, + "learning_rate": 0.001530654263035735, + "loss": 0.1021, + "step": 33413 + }, + { + "epoch": 0.29004956554196576, + "grad_norm": 0.2734375, + "learning_rate": 0.0015306281288343057, + "loss": 0.1416, + "step": 33414 + }, + { + "epoch": 0.29005824602216995, + "grad_norm": 0.130859375, + "learning_rate": 0.001530601994161948, + "loss": 0.1094, + "step": 33415 + }, + { + "epoch": 0.2900669265023741, + "grad_norm": 0.146484375, + "learning_rate": 0.0015305758590186908, + "loss": 0.1289, + "step": 33416 + }, + { + "epoch": 0.2900756069825783, + "grad_norm": 0.46875, + "learning_rate": 0.001530549723404562, + "loss": 0.0996, + "step": 33417 + }, + { + "epoch": 0.2900842874627824, + "grad_norm": 0.359375, + "learning_rate": 0.00153052358731959, + "loss": 0.1055, + "step": 33418 + }, + { + "epoch": 0.2900929679429866, + "grad_norm": 0.53125, + "learning_rate": 0.0015304974507638048, + "loss": 0.1035, + "step": 33419 + }, + { + "epoch": 0.29010164842319075, + "grad_norm": 0.10595703125, + "learning_rate": 0.0015304713137372337, + "loss": 0.1016, + "step": 33420 + }, + { + "epoch": 0.29011032890339494, + "grad_norm": 0.1298828125, + "learning_rate": 0.0015304451762399056, + "loss": 0.1279, + "step": 33421 + }, + { + "epoch": 0.2901190093835991, + "grad_norm": 0.357421875, + "learning_rate": 0.001530419038271849, + "loss": 0.1504, + "step": 33422 + }, + { + "epoch": 0.2901276898638033, + "grad_norm": 0.302734375, + "learning_rate": 0.001530392899833093, + "loss": 0.0859, + "step": 33423 + }, + { + "epoch": 0.2901363703440074, + "grad_norm": 0.138671875, + "learning_rate": 0.0015303667609236655, + "loss": 0.123, + "step": 33424 + }, + { + "epoch": 0.2901450508242116, + "grad_norm": 0.361328125, + "learning_rate": 0.0015303406215435953, + "loss": 0.085, + "step": 33425 + }, + { + "epoch": 0.29015373130441574, + "grad_norm": 0.263671875, + "learning_rate": 0.0015303144816929113, 
+ "loss": 0.1089, + "step": 33426 + }, + { + "epoch": 0.29016241178461993, + "grad_norm": 0.384765625, + "learning_rate": 0.0015302883413716415, + "loss": 0.0952, + "step": 33427 + }, + { + "epoch": 0.29017109226482407, + "grad_norm": 0.0986328125, + "learning_rate": 0.0015302622005798152, + "loss": 0.0957, + "step": 33428 + }, + { + "epoch": 0.29017977274502826, + "grad_norm": 0.55078125, + "learning_rate": 0.0015302360593174606, + "loss": 0.1191, + "step": 33429 + }, + { + "epoch": 0.2901884532252324, + "grad_norm": 0.328125, + "learning_rate": 0.0015302099175846062, + "loss": 0.0859, + "step": 33430 + }, + { + "epoch": 0.2901971337054366, + "grad_norm": 0.126953125, + "learning_rate": 0.0015301837753812803, + "loss": 0.0815, + "step": 33431 + }, + { + "epoch": 0.29020581418564073, + "grad_norm": 0.1318359375, + "learning_rate": 0.0015301576327075123, + "loss": 0.1074, + "step": 33432 + }, + { + "epoch": 0.2902144946658449, + "grad_norm": 0.6796875, + "learning_rate": 0.0015301314895633307, + "loss": 0.0767, + "step": 33433 + }, + { + "epoch": 0.29022317514604906, + "grad_norm": 0.2001953125, + "learning_rate": 0.0015301053459487633, + "loss": 0.0894, + "step": 33434 + }, + { + "epoch": 0.29023185562625325, + "grad_norm": 0.330078125, + "learning_rate": 0.001530079201863839, + "loss": 0.082, + "step": 33435 + }, + { + "epoch": 0.2902405361064574, + "grad_norm": 0.3671875, + "learning_rate": 0.0015300530573085869, + "loss": 0.1035, + "step": 33436 + }, + { + "epoch": 0.2902492165866616, + "grad_norm": 0.2392578125, + "learning_rate": 0.0015300269122830354, + "loss": 0.0986, + "step": 33437 + }, + { + "epoch": 0.2902578970668657, + "grad_norm": 0.953125, + "learning_rate": 0.0015300007667872126, + "loss": 0.1152, + "step": 33438 + }, + { + "epoch": 0.2902665775470699, + "grad_norm": 0.18359375, + "learning_rate": 0.0015299746208211477, + "loss": 0.1006, + "step": 33439 + }, + { + "epoch": 0.29027525802727405, + "grad_norm": 0.546875, + "learning_rate": 0.0015299484743848688, + "loss": 0.1016, + "step": 33440 + }, + { + "epoch": 0.29028393850747825, + "grad_norm": 0.1484375, + "learning_rate": 0.0015299223274784046, + "loss": 0.1045, + "step": 33441 + }, + { + "epoch": 0.2902926189876824, + "grad_norm": 0.54296875, + "learning_rate": 0.0015298961801017842, + "loss": 0.1064, + "step": 33442 + }, + { + "epoch": 0.2903012994678866, + "grad_norm": 0.2333984375, + "learning_rate": 0.0015298700322550352, + "loss": 0.1855, + "step": 33443 + }, + { + "epoch": 0.2903099799480907, + "grad_norm": 0.27734375, + "learning_rate": 0.0015298438839381873, + "loss": 0.0708, + "step": 33444 + }, + { + "epoch": 0.2903186604282949, + "grad_norm": 0.2734375, + "learning_rate": 0.0015298177351512681, + "loss": 0.1602, + "step": 33445 + }, + { + "epoch": 0.29032734090849904, + "grad_norm": 0.546875, + "learning_rate": 0.0015297915858943072, + "loss": 0.0913, + "step": 33446 + }, + { + "epoch": 0.29033602138870324, + "grad_norm": 0.306640625, + "learning_rate": 0.0015297654361673323, + "loss": 0.1118, + "step": 33447 + }, + { + "epoch": 0.2903447018689074, + "grad_norm": 1.4296875, + "learning_rate": 0.0015297392859703724, + "loss": 0.0947, + "step": 33448 + }, + { + "epoch": 0.29035338234911157, + "grad_norm": 0.1953125, + "learning_rate": 0.0015297131353034564, + "loss": 0.1562, + "step": 33449 + }, + { + "epoch": 0.2903620628293157, + "grad_norm": 0.58203125, + "learning_rate": 0.0015296869841666125, + "loss": 0.1328, + "step": 33450 + }, + { + "epoch": 0.2903707433095199, + "grad_norm": 0.1875, + 
"learning_rate": 0.0015296608325598693, + "loss": 0.1152, + "step": 33451 + }, + { + "epoch": 0.29037942378972403, + "grad_norm": 0.73046875, + "learning_rate": 0.0015296346804832554, + "loss": 0.0938, + "step": 33452 + }, + { + "epoch": 0.2903881042699282, + "grad_norm": 0.55078125, + "learning_rate": 0.0015296085279367995, + "loss": 0.0903, + "step": 33453 + }, + { + "epoch": 0.29039678475013236, + "grad_norm": 0.291015625, + "learning_rate": 0.00152958237492053, + "loss": 0.1543, + "step": 33454 + }, + { + "epoch": 0.29040546523033656, + "grad_norm": 1.0078125, + "learning_rate": 0.001529556221434476, + "loss": 0.332, + "step": 33455 + }, + { + "epoch": 0.2904141457105407, + "grad_norm": 0.2158203125, + "learning_rate": 0.0015295300674786655, + "loss": 0.1182, + "step": 33456 + }, + { + "epoch": 0.2904228261907449, + "grad_norm": 0.57421875, + "learning_rate": 0.0015295039130531277, + "loss": 0.127, + "step": 33457 + }, + { + "epoch": 0.290431506670949, + "grad_norm": 0.32421875, + "learning_rate": 0.0015294777581578904, + "loss": 0.1113, + "step": 33458 + }, + { + "epoch": 0.2904401871511532, + "grad_norm": 0.1806640625, + "learning_rate": 0.001529451602792983, + "loss": 0.1191, + "step": 33459 + }, + { + "epoch": 0.29044886763135735, + "grad_norm": 0.2158203125, + "learning_rate": 0.0015294254469584339, + "loss": 0.1113, + "step": 33460 + }, + { + "epoch": 0.29045754811156155, + "grad_norm": 0.44140625, + "learning_rate": 0.0015293992906542713, + "loss": 0.1318, + "step": 33461 + }, + { + "epoch": 0.2904662285917657, + "grad_norm": 0.146484375, + "learning_rate": 0.0015293731338805244, + "loss": 0.1416, + "step": 33462 + }, + { + "epoch": 0.2904749090719699, + "grad_norm": 0.21484375, + "learning_rate": 0.001529346976637221, + "loss": 0.1104, + "step": 33463 + }, + { + "epoch": 0.290483589552174, + "grad_norm": 0.51171875, + "learning_rate": 0.0015293208189243904, + "loss": 0.0635, + "step": 33464 + }, + { + "epoch": 0.2904922700323782, + "grad_norm": 0.83203125, + "learning_rate": 0.001529294660742061, + "loss": 0.1045, + "step": 33465 + }, + { + "epoch": 0.29050095051258235, + "grad_norm": 0.337890625, + "learning_rate": 0.0015292685020902614, + "loss": 0.0977, + "step": 33466 + }, + { + "epoch": 0.29050963099278654, + "grad_norm": 0.53515625, + "learning_rate": 0.0015292423429690205, + "loss": 0.1484, + "step": 33467 + }, + { + "epoch": 0.2905183114729907, + "grad_norm": 0.1435546875, + "learning_rate": 0.0015292161833783664, + "loss": 0.126, + "step": 33468 + }, + { + "epoch": 0.29052699195319487, + "grad_norm": 0.29296875, + "learning_rate": 0.001529190023318328, + "loss": 0.1201, + "step": 33469 + }, + { + "epoch": 0.290535672433399, + "grad_norm": 0.1376953125, + "learning_rate": 0.0015291638627889337, + "loss": 0.1045, + "step": 33470 + }, + { + "epoch": 0.2905443529136032, + "grad_norm": 0.20703125, + "learning_rate": 0.0015291377017902124, + "loss": 0.1875, + "step": 33471 + }, + { + "epoch": 0.29055303339380734, + "grad_norm": 0.138671875, + "learning_rate": 0.0015291115403221927, + "loss": 0.0859, + "step": 33472 + }, + { + "epoch": 0.29056171387401153, + "grad_norm": 0.1865234375, + "learning_rate": 0.0015290853783849027, + "loss": 0.1279, + "step": 33473 + }, + { + "epoch": 0.29057039435421567, + "grad_norm": 0.11474609375, + "learning_rate": 0.0015290592159783716, + "loss": 0.1221, + "step": 33474 + }, + { + "epoch": 0.29057907483441986, + "grad_norm": 0.16796875, + "learning_rate": 0.0015290330531026277, + "loss": 0.1006, + "step": 33475 + }, + { + "epoch": 
0.290587755314624, + "grad_norm": 0.41015625, + "learning_rate": 0.0015290068897576997, + "loss": 0.1182, + "step": 33476 + }, + { + "epoch": 0.2905964357948282, + "grad_norm": 0.228515625, + "learning_rate": 0.001528980725943616, + "loss": 0.105, + "step": 33477 + }, + { + "epoch": 0.2906051162750323, + "grad_norm": 0.0908203125, + "learning_rate": 0.001528954561660406, + "loss": 0.0942, + "step": 33478 + }, + { + "epoch": 0.2906137967552365, + "grad_norm": 0.11572265625, + "learning_rate": 0.0015289283969080972, + "loss": 0.1099, + "step": 33479 + }, + { + "epoch": 0.29062247723544066, + "grad_norm": 0.31640625, + "learning_rate": 0.0015289022316867192, + "loss": 0.0986, + "step": 33480 + }, + { + "epoch": 0.29063115771564485, + "grad_norm": 0.0673828125, + "learning_rate": 0.0015288760659963, + "loss": 0.0718, + "step": 33481 + }, + { + "epoch": 0.290639838195849, + "grad_norm": 0.37109375, + "learning_rate": 0.001528849899836868, + "loss": 0.1367, + "step": 33482 + }, + { + "epoch": 0.2906485186760532, + "grad_norm": 0.10302734375, + "learning_rate": 0.0015288237332084522, + "loss": 0.0801, + "step": 33483 + }, + { + "epoch": 0.2906571991562573, + "grad_norm": 0.1201171875, + "learning_rate": 0.0015287975661110818, + "loss": 0.1279, + "step": 33484 + }, + { + "epoch": 0.2906658796364615, + "grad_norm": 0.45703125, + "learning_rate": 0.0015287713985447844, + "loss": 0.1211, + "step": 33485 + }, + { + "epoch": 0.29067456011666565, + "grad_norm": 0.09521484375, + "learning_rate": 0.0015287452305095893, + "loss": 0.0938, + "step": 33486 + }, + { + "epoch": 0.29068324059686984, + "grad_norm": 0.75, + "learning_rate": 0.0015287190620055246, + "loss": 0.1387, + "step": 33487 + }, + { + "epoch": 0.290691921077074, + "grad_norm": 0.10400390625, + "learning_rate": 0.0015286928930326193, + "loss": 0.1162, + "step": 33488 + }, + { + "epoch": 0.29070060155727817, + "grad_norm": 0.1455078125, + "learning_rate": 0.001528666723590902, + "loss": 0.0981, + "step": 33489 + }, + { + "epoch": 0.2907092820374823, + "grad_norm": 0.48828125, + "learning_rate": 0.0015286405536804012, + "loss": 0.1025, + "step": 33490 + }, + { + "epoch": 0.2907179625176865, + "grad_norm": 0.1533203125, + "learning_rate": 0.0015286143833011451, + "loss": 0.1084, + "step": 33491 + }, + { + "epoch": 0.29072664299789064, + "grad_norm": 0.2890625, + "learning_rate": 0.0015285882124531633, + "loss": 0.1211, + "step": 33492 + }, + { + "epoch": 0.29073532347809483, + "grad_norm": 0.609375, + "learning_rate": 0.0015285620411364836, + "loss": 0.1348, + "step": 33493 + }, + { + "epoch": 0.29074400395829897, + "grad_norm": 0.2080078125, + "learning_rate": 0.0015285358693511348, + "loss": 0.1045, + "step": 33494 + }, + { + "epoch": 0.29075268443850316, + "grad_norm": 0.10205078125, + "learning_rate": 0.001528509697097146, + "loss": 0.0835, + "step": 33495 + }, + { + "epoch": 0.2907613649187073, + "grad_norm": 0.0791015625, + "learning_rate": 0.001528483524374545, + "loss": 0.1011, + "step": 33496 + }, + { + "epoch": 0.2907700453989115, + "grad_norm": 0.2041015625, + "learning_rate": 0.0015284573511833612, + "loss": 0.103, + "step": 33497 + }, + { + "epoch": 0.29077872587911563, + "grad_norm": 0.40234375, + "learning_rate": 0.0015284311775236226, + "loss": 0.1133, + "step": 33498 + }, + { + "epoch": 0.2907874063593198, + "grad_norm": 0.259765625, + "learning_rate": 0.0015284050033953585, + "loss": 0.0869, + "step": 33499 + }, + { + "epoch": 0.29079608683952396, + "grad_norm": 0.7265625, + "learning_rate": 0.0015283788287985968, + "loss": 
0.0869, + "step": 33500 + }, + { + "epoch": 0.29080476731972815, + "grad_norm": 0.271484375, + "learning_rate": 0.0015283526537333662, + "loss": 0.0791, + "step": 33501 + }, + { + "epoch": 0.2908134477999323, + "grad_norm": 0.453125, + "learning_rate": 0.0015283264781996958, + "loss": 0.124, + "step": 33502 + }, + { + "epoch": 0.2908221282801365, + "grad_norm": 0.255859375, + "learning_rate": 0.0015283003021976144, + "loss": 0.1094, + "step": 33503 + }, + { + "epoch": 0.2908308087603406, + "grad_norm": 0.11376953125, + "learning_rate": 0.0015282741257271494, + "loss": 0.0903, + "step": 33504 + }, + { + "epoch": 0.2908394892405448, + "grad_norm": 0.10791015625, + "learning_rate": 0.001528247948788331, + "loss": 0.0703, + "step": 33505 + }, + { + "epoch": 0.29084816972074895, + "grad_norm": 0.486328125, + "learning_rate": 0.0015282217713811868, + "loss": 0.1445, + "step": 33506 + }, + { + "epoch": 0.29085685020095314, + "grad_norm": 0.271484375, + "learning_rate": 0.0015281955935057458, + "loss": 0.1143, + "step": 33507 + }, + { + "epoch": 0.2908655306811573, + "grad_norm": 0.330078125, + "learning_rate": 0.0015281694151620365, + "loss": 0.1416, + "step": 33508 + }, + { + "epoch": 0.2908742111613615, + "grad_norm": 0.412109375, + "learning_rate": 0.0015281432363500873, + "loss": 0.0801, + "step": 33509 + }, + { + "epoch": 0.2908828916415656, + "grad_norm": 0.171875, + "learning_rate": 0.0015281170570699274, + "loss": 0.1113, + "step": 33510 + }, + { + "epoch": 0.2908915721217698, + "grad_norm": 0.2578125, + "learning_rate": 0.001528090877321585, + "loss": 0.0986, + "step": 33511 + }, + { + "epoch": 0.29090025260197394, + "grad_norm": 0.36328125, + "learning_rate": 0.001528064697105089, + "loss": 0.1211, + "step": 33512 + }, + { + "epoch": 0.29090893308217813, + "grad_norm": 0.1376953125, + "learning_rate": 0.0015280385164204676, + "loss": 0.1465, + "step": 33513 + }, + { + "epoch": 0.29091761356238227, + "grad_norm": 0.5234375, + "learning_rate": 0.0015280123352677498, + "loss": 0.0811, + "step": 33514 + }, + { + "epoch": 0.2909262940425864, + "grad_norm": 0.515625, + "learning_rate": 0.0015279861536469643, + "loss": 0.1177, + "step": 33515 + }, + { + "epoch": 0.2909349745227906, + "grad_norm": 0.12451171875, + "learning_rate": 0.0015279599715581395, + "loss": 0.1104, + "step": 33516 + }, + { + "epoch": 0.29094365500299474, + "grad_norm": 0.51953125, + "learning_rate": 0.0015279337890013042, + "loss": 0.1582, + "step": 33517 + }, + { + "epoch": 0.29095233548319893, + "grad_norm": 0.1044921875, + "learning_rate": 0.0015279076059764867, + "loss": 0.1172, + "step": 33518 + }, + { + "epoch": 0.29096101596340307, + "grad_norm": 0.28515625, + "learning_rate": 0.001527881422483716, + "loss": 0.1123, + "step": 33519 + }, + { + "epoch": 0.29096969644360726, + "grad_norm": 0.55078125, + "learning_rate": 0.0015278552385230206, + "loss": 0.1406, + "step": 33520 + }, + { + "epoch": 0.2909783769238114, + "grad_norm": 0.2265625, + "learning_rate": 0.0015278290540944293, + "loss": 0.0913, + "step": 33521 + }, + { + "epoch": 0.2909870574040156, + "grad_norm": 0.318359375, + "learning_rate": 0.00152780286919797, + "loss": 0.0791, + "step": 33522 + }, + { + "epoch": 0.29099573788421973, + "grad_norm": 0.55078125, + "learning_rate": 0.0015277766838336724, + "loss": 0.1777, + "step": 33523 + }, + { + "epoch": 0.2910044183644239, + "grad_norm": 0.310546875, + "learning_rate": 0.0015277504980015647, + "loss": 0.0996, + "step": 33524 + }, + { + "epoch": 0.29101309884462806, + "grad_norm": 0.353515625, + 
"learning_rate": 0.0015277243117016754, + "loss": 0.1025, + "step": 33525 + }, + { + "epoch": 0.29102177932483225, + "grad_norm": 0.84765625, + "learning_rate": 0.0015276981249340332, + "loss": 0.1543, + "step": 33526 + }, + { + "epoch": 0.2910304598050364, + "grad_norm": 0.1357421875, + "learning_rate": 0.0015276719376986667, + "loss": 0.1094, + "step": 33527 + }, + { + "epoch": 0.2910391402852406, + "grad_norm": 0.283203125, + "learning_rate": 0.0015276457499956047, + "loss": 0.1426, + "step": 33528 + }, + { + "epoch": 0.2910478207654447, + "grad_norm": 0.2158203125, + "learning_rate": 0.0015276195618248757, + "loss": 0.1201, + "step": 33529 + }, + { + "epoch": 0.2910565012456489, + "grad_norm": 0.306640625, + "learning_rate": 0.0015275933731865082, + "loss": 0.0996, + "step": 33530 + }, + { + "epoch": 0.29106518172585305, + "grad_norm": 0.1865234375, + "learning_rate": 0.001527567184080531, + "loss": 0.0801, + "step": 33531 + }, + { + "epoch": 0.29107386220605724, + "grad_norm": 0.1875, + "learning_rate": 0.0015275409945069728, + "loss": 0.1084, + "step": 33532 + }, + { + "epoch": 0.2910825426862614, + "grad_norm": 0.1494140625, + "learning_rate": 0.0015275148044658624, + "loss": 0.1113, + "step": 33533 + }, + { + "epoch": 0.2910912231664656, + "grad_norm": 0.19140625, + "learning_rate": 0.0015274886139572284, + "loss": 0.1357, + "step": 33534 + }, + { + "epoch": 0.2910999036466697, + "grad_norm": 0.52734375, + "learning_rate": 0.0015274624229810985, + "loss": 0.1221, + "step": 33535 + }, + { + "epoch": 0.2911085841268739, + "grad_norm": 0.1396484375, + "learning_rate": 0.0015274362315375028, + "loss": 0.0981, + "step": 33536 + }, + { + "epoch": 0.29111726460707804, + "grad_norm": 0.498046875, + "learning_rate": 0.001527410039626469, + "loss": 0.083, + "step": 33537 + }, + { + "epoch": 0.29112594508728223, + "grad_norm": 0.1865234375, + "learning_rate": 0.0015273838472480263, + "loss": 0.1064, + "step": 33538 + }, + { + "epoch": 0.29113462556748637, + "grad_norm": 0.0859375, + "learning_rate": 0.0015273576544022028, + "loss": 0.0918, + "step": 33539 + }, + { + "epoch": 0.29114330604769056, + "grad_norm": 0.138671875, + "learning_rate": 0.0015273314610890272, + "loss": 0.1064, + "step": 33540 + }, + { + "epoch": 0.2911519865278947, + "grad_norm": 0.18359375, + "learning_rate": 0.0015273052673085287, + "loss": 0.0947, + "step": 33541 + }, + { + "epoch": 0.2911606670080989, + "grad_norm": 0.2431640625, + "learning_rate": 0.0015272790730607353, + "loss": 0.0879, + "step": 33542 + }, + { + "epoch": 0.29116934748830303, + "grad_norm": 0.439453125, + "learning_rate": 0.001527252878345676, + "loss": 0.1406, + "step": 33543 + }, + { + "epoch": 0.2911780279685072, + "grad_norm": 0.275390625, + "learning_rate": 0.0015272266831633794, + "loss": 0.1016, + "step": 33544 + }, + { + "epoch": 0.29118670844871136, + "grad_norm": 0.126953125, + "learning_rate": 0.0015272004875138744, + "loss": 0.0752, + "step": 33545 + }, + { + "epoch": 0.29119538892891556, + "grad_norm": 0.26171875, + "learning_rate": 0.0015271742913971888, + "loss": 0.1035, + "step": 33546 + }, + { + "epoch": 0.2912040694091197, + "grad_norm": 0.236328125, + "learning_rate": 0.0015271480948133524, + "loss": 0.0986, + "step": 33547 + }, + { + "epoch": 0.2912127498893239, + "grad_norm": 0.177734375, + "learning_rate": 0.0015271218977623928, + "loss": 0.1357, + "step": 33548 + }, + { + "epoch": 0.291221430369528, + "grad_norm": 0.5078125, + "learning_rate": 0.0015270957002443393, + "loss": 0.1289, + "step": 33549 + }, + { + "epoch": 
0.2912301108497322, + "grad_norm": 0.2099609375, + "learning_rate": 0.0015270695022592201, + "loss": 0.1123, + "step": 33550 + }, + { + "epoch": 0.29123879132993635, + "grad_norm": 0.1806640625, + "learning_rate": 0.0015270433038070646, + "loss": 0.1123, + "step": 33551 + }, + { + "epoch": 0.29124747181014055, + "grad_norm": 0.2041015625, + "learning_rate": 0.0015270171048879005, + "loss": 0.085, + "step": 33552 + }, + { + "epoch": 0.2912561522903447, + "grad_norm": 0.1943359375, + "learning_rate": 0.0015269909055017573, + "loss": 0.0938, + "step": 33553 + }, + { + "epoch": 0.2912648327705489, + "grad_norm": 0.09912109375, + "learning_rate": 0.001526964705648663, + "loss": 0.1113, + "step": 33554 + }, + { + "epoch": 0.291273513250753, + "grad_norm": 0.17578125, + "learning_rate": 0.0015269385053286466, + "loss": 0.1074, + "step": 33555 + }, + { + "epoch": 0.2912821937309572, + "grad_norm": 0.1533203125, + "learning_rate": 0.0015269123045417365, + "loss": 0.0991, + "step": 33556 + }, + { + "epoch": 0.29129087421116134, + "grad_norm": 0.10791015625, + "learning_rate": 0.001526886103287962, + "loss": 0.085, + "step": 33557 + }, + { + "epoch": 0.29129955469136554, + "grad_norm": 0.2041015625, + "learning_rate": 0.0015268599015673507, + "loss": 0.1055, + "step": 33558 + }, + { + "epoch": 0.2913082351715697, + "grad_norm": 0.28515625, + "learning_rate": 0.001526833699379932, + "loss": 0.1016, + "step": 33559 + }, + { + "epoch": 0.29131691565177387, + "grad_norm": 0.34375, + "learning_rate": 0.0015268074967257346, + "loss": 0.124, + "step": 33560 + }, + { + "epoch": 0.291325596131978, + "grad_norm": 0.1455078125, + "learning_rate": 0.0015267812936047866, + "loss": 0.1104, + "step": 33561 + }, + { + "epoch": 0.2913342766121822, + "grad_norm": 0.142578125, + "learning_rate": 0.0015267550900171173, + "loss": 0.0947, + "step": 33562 + }, + { + "epoch": 0.29134295709238633, + "grad_norm": 0.296875, + "learning_rate": 0.001526728885962755, + "loss": 0.1904, + "step": 33563 + }, + { + "epoch": 0.2913516375725905, + "grad_norm": 0.1015625, + "learning_rate": 0.0015267026814417284, + "loss": 0.0972, + "step": 33564 + }, + { + "epoch": 0.29136031805279466, + "grad_norm": 0.291015625, + "learning_rate": 0.0015266764764540661, + "loss": 0.123, + "step": 33565 + }, + { + "epoch": 0.29136899853299886, + "grad_norm": 0.244140625, + "learning_rate": 0.0015266502709997967, + "loss": 0.0903, + "step": 33566 + }, + { + "epoch": 0.291377679013203, + "grad_norm": 0.10546875, + "learning_rate": 0.001526624065078949, + "loss": 0.1221, + "step": 33567 + }, + { + "epoch": 0.2913863594934072, + "grad_norm": 0.109375, + "learning_rate": 0.0015265978586915519, + "loss": 0.0918, + "step": 33568 + }, + { + "epoch": 0.2913950399736113, + "grad_norm": 0.21875, + "learning_rate": 0.0015265716518376336, + "loss": 0.103, + "step": 33569 + }, + { + "epoch": 0.2914037204538155, + "grad_norm": 0.80078125, + "learning_rate": 0.001526545444517223, + "loss": 0.1182, + "step": 33570 + }, + { + "epoch": 0.29141240093401966, + "grad_norm": 0.263671875, + "learning_rate": 0.0015265192367303487, + "loss": 0.127, + "step": 33571 + }, + { + "epoch": 0.29142108141422385, + "grad_norm": 0.1328125, + "learning_rate": 0.0015264930284770394, + "loss": 0.1348, + "step": 33572 + }, + { + "epoch": 0.291429761894428, + "grad_norm": 0.32421875, + "learning_rate": 0.0015264668197573232, + "loss": 0.0908, + "step": 33573 + }, + { + "epoch": 0.2914384423746322, + "grad_norm": 0.267578125, + "learning_rate": 0.0015264406105712299, + "loss": 0.1055, + 
"step": 33574 + }, + { + "epoch": 0.2914471228548363, + "grad_norm": 0.1337890625, + "learning_rate": 0.0015264144009187876, + "loss": 0.0981, + "step": 33575 + }, + { + "epoch": 0.2914558033350405, + "grad_norm": 0.435546875, + "learning_rate": 0.0015263881908000245, + "loss": 0.1201, + "step": 33576 + }, + { + "epoch": 0.29146448381524465, + "grad_norm": 0.1767578125, + "learning_rate": 0.0015263619802149701, + "loss": 0.0952, + "step": 33577 + }, + { + "epoch": 0.29147316429544884, + "grad_norm": 0.166015625, + "learning_rate": 0.0015263357691636524, + "loss": 0.063, + "step": 33578 + }, + { + "epoch": 0.291481844775653, + "grad_norm": 0.40625, + "learning_rate": 0.0015263095576461, + "loss": 0.0986, + "step": 33579 + }, + { + "epoch": 0.29149052525585717, + "grad_norm": 0.10498046875, + "learning_rate": 0.0015262833456623423, + "loss": 0.1025, + "step": 33580 + }, + { + "epoch": 0.2914992057360613, + "grad_norm": 1.078125, + "learning_rate": 0.0015262571332124077, + "loss": 0.2461, + "step": 33581 + }, + { + "epoch": 0.2915078862162655, + "grad_norm": 0.490234375, + "learning_rate": 0.0015262309202963246, + "loss": 0.0913, + "step": 33582 + }, + { + "epoch": 0.29151656669646964, + "grad_norm": 0.2138671875, + "learning_rate": 0.0015262047069141216, + "loss": 0.1436, + "step": 33583 + }, + { + "epoch": 0.29152524717667383, + "grad_norm": 0.2001953125, + "learning_rate": 0.0015261784930658275, + "loss": 0.1562, + "step": 33584 + }, + { + "epoch": 0.29153392765687797, + "grad_norm": 0.53515625, + "learning_rate": 0.0015261522787514711, + "loss": 0.0825, + "step": 33585 + }, + { + "epoch": 0.29154260813708216, + "grad_norm": 0.7109375, + "learning_rate": 0.001526126063971081, + "loss": 0.0723, + "step": 33586 + }, + { + "epoch": 0.2915512886172863, + "grad_norm": 0.24609375, + "learning_rate": 0.0015260998487246857, + "loss": 0.0933, + "step": 33587 + }, + { + "epoch": 0.2915599690974905, + "grad_norm": 0.10009765625, + "learning_rate": 0.0015260736330123142, + "loss": 0.1055, + "step": 33588 + }, + { + "epoch": 0.2915686495776946, + "grad_norm": 0.40625, + "learning_rate": 0.0015260474168339948, + "loss": 0.1094, + "step": 33589 + }, + { + "epoch": 0.2915773300578988, + "grad_norm": 0.2001953125, + "learning_rate": 0.0015260212001897567, + "loss": 0.0771, + "step": 33590 + }, + { + "epoch": 0.29158601053810296, + "grad_norm": 0.44140625, + "learning_rate": 0.0015259949830796276, + "loss": 0.1084, + "step": 33591 + }, + { + "epoch": 0.29159469101830715, + "grad_norm": 0.890625, + "learning_rate": 0.0015259687655036374, + "loss": 0.125, + "step": 33592 + }, + { + "epoch": 0.2916033714985113, + "grad_norm": 0.296875, + "learning_rate": 0.001525942547461814, + "loss": 0.0947, + "step": 33593 + }, + { + "epoch": 0.2916120519787155, + "grad_norm": 0.11767578125, + "learning_rate": 0.001525916328954186, + "loss": 0.1074, + "step": 33594 + }, + { + "epoch": 0.2916207324589196, + "grad_norm": 0.19140625, + "learning_rate": 0.0015258901099807823, + "loss": 0.1128, + "step": 33595 + }, + { + "epoch": 0.2916294129391238, + "grad_norm": 0.220703125, + "learning_rate": 0.0015258638905416317, + "loss": 0.0972, + "step": 33596 + }, + { + "epoch": 0.29163809341932795, + "grad_norm": 0.10546875, + "learning_rate": 0.0015258376706367629, + "loss": 0.0698, + "step": 33597 + }, + { + "epoch": 0.29164677389953214, + "grad_norm": 0.361328125, + "learning_rate": 0.001525811450266204, + "loss": 0.1338, + "step": 33598 + }, + { + "epoch": 0.2916554543797363, + "grad_norm": 0.1376953125, + "learning_rate": 
0.0015257852294299843, + "loss": 0.106, + "step": 33599 + }, + { + "epoch": 0.29166413485994047, + "grad_norm": 1.046875, + "learning_rate": 0.0015257590081281325, + "loss": 0.1064, + "step": 33600 + }, + { + "epoch": 0.2916728153401446, + "grad_norm": 0.11572265625, + "learning_rate": 0.001525732786360677, + "loss": 0.0913, + "step": 33601 + }, + { + "epoch": 0.2916814958203488, + "grad_norm": 0.19140625, + "learning_rate": 0.0015257065641276464, + "loss": 0.1123, + "step": 33602 + }, + { + "epoch": 0.29169017630055294, + "grad_norm": 0.27734375, + "learning_rate": 0.0015256803414290694, + "loss": 0.1406, + "step": 33603 + }, + { + "epoch": 0.29169885678075713, + "grad_norm": 1.1015625, + "learning_rate": 0.0015256541182649748, + "loss": 0.1152, + "step": 33604 + }, + { + "epoch": 0.29170753726096127, + "grad_norm": 0.095703125, + "learning_rate": 0.0015256278946353915, + "loss": 0.127, + "step": 33605 + }, + { + "epoch": 0.29171621774116546, + "grad_norm": 0.72265625, + "learning_rate": 0.0015256016705403478, + "loss": 0.0986, + "step": 33606 + }, + { + "epoch": 0.2917248982213696, + "grad_norm": 0.1171875, + "learning_rate": 0.001525575445979872, + "loss": 0.1235, + "step": 33607 + }, + { + "epoch": 0.2917335787015738, + "grad_norm": 0.0791015625, + "learning_rate": 0.0015255492209539938, + "loss": 0.0938, + "step": 33608 + }, + { + "epoch": 0.29174225918177793, + "grad_norm": 0.431640625, + "learning_rate": 0.0015255229954627414, + "loss": 0.1357, + "step": 33609 + }, + { + "epoch": 0.2917509396619821, + "grad_norm": 0.73828125, + "learning_rate": 0.0015254967695061434, + "loss": 0.1123, + "step": 33610 + }, + { + "epoch": 0.29175962014218626, + "grad_norm": 0.228515625, + "learning_rate": 0.0015254705430842285, + "loss": 0.1377, + "step": 33611 + }, + { + "epoch": 0.29176830062239045, + "grad_norm": 0.62109375, + "learning_rate": 0.0015254443161970253, + "loss": 0.125, + "step": 33612 + }, + { + "epoch": 0.2917769811025946, + "grad_norm": 0.1591796875, + "learning_rate": 0.0015254180888445627, + "loss": 0.0825, + "step": 33613 + }, + { + "epoch": 0.2917856615827988, + "grad_norm": 0.1396484375, + "learning_rate": 0.0015253918610268692, + "loss": 0.0933, + "step": 33614 + }, + { + "epoch": 0.2917943420630029, + "grad_norm": 0.337890625, + "learning_rate": 0.0015253656327439739, + "loss": 0.0801, + "step": 33615 + }, + { + "epoch": 0.2918030225432071, + "grad_norm": 0.20703125, + "learning_rate": 0.0015253394039959048, + "loss": 0.0981, + "step": 33616 + }, + { + "epoch": 0.29181170302341125, + "grad_norm": 0.3671875, + "learning_rate": 0.0015253131747826909, + "loss": 0.1318, + "step": 33617 + }, + { + "epoch": 0.29182038350361544, + "grad_norm": 0.49609375, + "learning_rate": 0.0015252869451043607, + "loss": 0.1162, + "step": 33618 + }, + { + "epoch": 0.2918290639838196, + "grad_norm": 0.216796875, + "learning_rate": 0.0015252607149609434, + "loss": 0.1396, + "step": 33619 + }, + { + "epoch": 0.2918377444640238, + "grad_norm": 0.1005859375, + "learning_rate": 0.0015252344843524675, + "loss": 0.1021, + "step": 33620 + }, + { + "epoch": 0.2918464249442279, + "grad_norm": 0.6484375, + "learning_rate": 0.0015252082532789613, + "loss": 0.0923, + "step": 33621 + }, + { + "epoch": 0.2918551054244321, + "grad_norm": 0.083984375, + "learning_rate": 0.0015251820217404538, + "loss": 0.0879, + "step": 33622 + }, + { + "epoch": 0.29186378590463624, + "grad_norm": 0.228515625, + "learning_rate": 0.0015251557897369736, + "loss": 0.1504, + "step": 33623 + }, + { + "epoch": 0.29187246638484043, + 
"grad_norm": 0.2734375, + "learning_rate": 0.0015251295572685497, + "loss": 0.0928, + "step": 33624 + }, + { + "epoch": 0.29188114686504457, + "grad_norm": 0.35546875, + "learning_rate": 0.0015251033243352103, + "loss": 0.127, + "step": 33625 + }, + { + "epoch": 0.29188982734524876, + "grad_norm": 0.201171875, + "learning_rate": 0.0015250770909369842, + "loss": 0.1143, + "step": 33626 + }, + { + "epoch": 0.2918985078254529, + "grad_norm": 0.140625, + "learning_rate": 0.0015250508570739, + "loss": 0.1445, + "step": 33627 + }, + { + "epoch": 0.2919071883056571, + "grad_norm": 0.416015625, + "learning_rate": 0.0015250246227459868, + "loss": 0.1221, + "step": 33628 + }, + { + "epoch": 0.29191586878586123, + "grad_norm": 0.287109375, + "learning_rate": 0.001524998387953273, + "loss": 0.123, + "step": 33629 + }, + { + "epoch": 0.2919245492660654, + "grad_norm": 0.671875, + "learning_rate": 0.0015249721526957875, + "loss": 0.1426, + "step": 33630 + }, + { + "epoch": 0.29193322974626956, + "grad_norm": 0.37109375, + "learning_rate": 0.001524945916973559, + "loss": 0.1084, + "step": 33631 + }, + { + "epoch": 0.29194191022647376, + "grad_norm": 0.11279296875, + "learning_rate": 0.0015249196807866157, + "loss": 0.0913, + "step": 33632 + }, + { + "epoch": 0.2919505907066779, + "grad_norm": 1.4765625, + "learning_rate": 0.0015248934441349866, + "loss": 0.1289, + "step": 33633 + }, + { + "epoch": 0.2919592711868821, + "grad_norm": 0.1630859375, + "learning_rate": 0.0015248672070187004, + "loss": 0.0898, + "step": 33634 + }, + { + "epoch": 0.2919679516670862, + "grad_norm": 0.38671875, + "learning_rate": 0.001524840969437786, + "loss": 0.0879, + "step": 33635 + }, + { + "epoch": 0.2919766321472904, + "grad_norm": 0.421875, + "learning_rate": 0.0015248147313922716, + "loss": 0.1084, + "step": 33636 + }, + { + "epoch": 0.29198531262749455, + "grad_norm": 0.3125, + "learning_rate": 0.0015247884928821867, + "loss": 0.0981, + "step": 33637 + }, + { + "epoch": 0.2919939931076987, + "grad_norm": 0.412109375, + "learning_rate": 0.001524762253907559, + "loss": 0.1104, + "step": 33638 + }, + { + "epoch": 0.2920026735879029, + "grad_norm": 0.224609375, + "learning_rate": 0.001524736014468418, + "loss": 0.1025, + "step": 33639 + }, + { + "epoch": 0.292011354068107, + "grad_norm": 0.287109375, + "learning_rate": 0.001524709774564792, + "loss": 0.1787, + "step": 33640 + }, + { + "epoch": 0.2920200345483112, + "grad_norm": 0.291015625, + "learning_rate": 0.0015246835341967099, + "loss": 0.1328, + "step": 33641 + }, + { + "epoch": 0.29202871502851535, + "grad_norm": 0.09033203125, + "learning_rate": 0.0015246572933642, + "loss": 0.0918, + "step": 33642 + }, + { + "epoch": 0.29203739550871954, + "grad_norm": 0.08984375, + "learning_rate": 0.0015246310520672913, + "loss": 0.082, + "step": 33643 + }, + { + "epoch": 0.2920460759889237, + "grad_norm": 0.1337890625, + "learning_rate": 0.0015246048103060125, + "loss": 0.084, + "step": 33644 + }, + { + "epoch": 0.2920547564691279, + "grad_norm": 0.1015625, + "learning_rate": 0.0015245785680803921, + "loss": 0.084, + "step": 33645 + }, + { + "epoch": 0.292063436949332, + "grad_norm": 0.4453125, + "learning_rate": 0.0015245523253904594, + "loss": 0.0854, + "step": 33646 + }, + { + "epoch": 0.2920721174295362, + "grad_norm": 0.482421875, + "learning_rate": 0.0015245260822362421, + "loss": 0.1475, + "step": 33647 + }, + { + "epoch": 0.29208079790974034, + "grad_norm": 0.36328125, + "learning_rate": 0.0015244998386177698, + "loss": 0.0957, + "step": 33648 + }, + { + "epoch": 
0.29208947838994453, + "grad_norm": 0.62890625, + "learning_rate": 0.0015244735945350708, + "loss": 0.1104, + "step": 33649 + }, + { + "epoch": 0.29209815887014867, + "grad_norm": 0.265625, + "learning_rate": 0.001524447349988174, + "loss": 0.0825, + "step": 33650 + }, + { + "epoch": 0.29210683935035286, + "grad_norm": 1.359375, + "learning_rate": 0.0015244211049771077, + "loss": 0.1245, + "step": 33651 + }, + { + "epoch": 0.292115519830557, + "grad_norm": 0.1748046875, + "learning_rate": 0.001524394859501901, + "loss": 0.0737, + "step": 33652 + }, + { + "epoch": 0.2921242003107612, + "grad_norm": 0.80078125, + "learning_rate": 0.0015243686135625826, + "loss": 0.166, + "step": 33653 + }, + { + "epoch": 0.29213288079096533, + "grad_norm": 0.08984375, + "learning_rate": 0.001524342367159181, + "loss": 0.1133, + "step": 33654 + }, + { + "epoch": 0.2921415612711695, + "grad_norm": 0.2578125, + "learning_rate": 0.0015243161202917247, + "loss": 0.1318, + "step": 33655 + }, + { + "epoch": 0.29215024175137366, + "grad_norm": 0.203125, + "learning_rate": 0.0015242898729602428, + "loss": 0.123, + "step": 33656 + }, + { + "epoch": 0.29215892223157786, + "grad_norm": 0.1484375, + "learning_rate": 0.001524263625164764, + "loss": 0.0889, + "step": 33657 + }, + { + "epoch": 0.292167602711782, + "grad_norm": 0.2041015625, + "learning_rate": 0.0015242373769053165, + "loss": 0.0986, + "step": 33658 + }, + { + "epoch": 0.2921762831919862, + "grad_norm": 0.10595703125, + "learning_rate": 0.0015242111281819298, + "loss": 0.0933, + "step": 33659 + }, + { + "epoch": 0.2921849636721903, + "grad_norm": 0.154296875, + "learning_rate": 0.0015241848789946317, + "loss": 0.0918, + "step": 33660 + }, + { + "epoch": 0.2921936441523945, + "grad_norm": 0.255859375, + "learning_rate": 0.001524158629343452, + "loss": 0.1133, + "step": 33661 + }, + { + "epoch": 0.29220232463259865, + "grad_norm": 0.38671875, + "learning_rate": 0.0015241323792284185, + "loss": 0.1035, + "step": 33662 + }, + { + "epoch": 0.29221100511280285, + "grad_norm": 0.1513671875, + "learning_rate": 0.0015241061286495603, + "loss": 0.0928, + "step": 33663 + }, + { + "epoch": 0.292219685593007, + "grad_norm": 0.66015625, + "learning_rate": 0.0015240798776069058, + "loss": 0.1348, + "step": 33664 + }, + { + "epoch": 0.2922283660732112, + "grad_norm": 0.2451171875, + "learning_rate": 0.001524053626100484, + "loss": 0.1211, + "step": 33665 + }, + { + "epoch": 0.2922370465534153, + "grad_norm": 0.29296875, + "learning_rate": 0.0015240273741303236, + "loss": 0.1074, + "step": 33666 + }, + { + "epoch": 0.2922457270336195, + "grad_norm": 0.5625, + "learning_rate": 0.0015240011216964534, + "loss": 0.1533, + "step": 33667 + }, + { + "epoch": 0.29225440751382364, + "grad_norm": 0.1552734375, + "learning_rate": 0.0015239748687989017, + "loss": 0.1094, + "step": 33668 + }, + { + "epoch": 0.29226308799402784, + "grad_norm": 0.12353515625, + "learning_rate": 0.0015239486154376971, + "loss": 0.0991, + "step": 33669 + }, + { + "epoch": 0.292271768474232, + "grad_norm": 0.134765625, + "learning_rate": 0.0015239223616128692, + "loss": 0.1045, + "step": 33670 + }, + { + "epoch": 0.29228044895443617, + "grad_norm": 0.083984375, + "learning_rate": 0.0015238961073244463, + "loss": 0.0859, + "step": 33671 + }, + { + "epoch": 0.2922891294346403, + "grad_norm": 0.08837890625, + "learning_rate": 0.0015238698525724568, + "loss": 0.1006, + "step": 33672 + }, + { + "epoch": 0.2922978099148445, + "grad_norm": 0.87890625, + "learning_rate": 0.0015238435973569293, + "loss": 0.1367, + 
"step": 33673 + }, + { + "epoch": 0.29230649039504863, + "grad_norm": 0.171875, + "learning_rate": 0.001523817341677893, + "loss": 0.1152, + "step": 33674 + }, + { + "epoch": 0.2923151708752528, + "grad_norm": 0.265625, + "learning_rate": 0.0015237910855353763, + "loss": 0.1069, + "step": 33675 + }, + { + "epoch": 0.29232385135545697, + "grad_norm": 0.53125, + "learning_rate": 0.0015237648289294083, + "loss": 0.1025, + "step": 33676 + }, + { + "epoch": 0.29233253183566116, + "grad_norm": 0.4296875, + "learning_rate": 0.0015237385718600172, + "loss": 0.0933, + "step": 33677 + }, + { + "epoch": 0.2923412123158653, + "grad_norm": 0.359375, + "learning_rate": 0.0015237123143272322, + "loss": 0.1104, + "step": 33678 + }, + { + "epoch": 0.2923498927960695, + "grad_norm": 0.06884765625, + "learning_rate": 0.0015236860563310817, + "loss": 0.1064, + "step": 33679 + }, + { + "epoch": 0.2923585732762736, + "grad_norm": 0.1591796875, + "learning_rate": 0.0015236597978715947, + "loss": 0.1099, + "step": 33680 + }, + { + "epoch": 0.2923672537564778, + "grad_norm": 0.2109375, + "learning_rate": 0.0015236335389487996, + "loss": 0.1143, + "step": 33681 + }, + { + "epoch": 0.29237593423668196, + "grad_norm": 0.271484375, + "learning_rate": 0.001523607279562725, + "loss": 0.084, + "step": 33682 + }, + { + "epoch": 0.29238461471688615, + "grad_norm": 0.1435546875, + "learning_rate": 0.0015235810197134, + "loss": 0.1094, + "step": 33683 + }, + { + "epoch": 0.2923932951970903, + "grad_norm": 0.11669921875, + "learning_rate": 0.0015235547594008528, + "loss": 0.0938, + "step": 33684 + }, + { + "epoch": 0.2924019756772945, + "grad_norm": 0.2392578125, + "learning_rate": 0.0015235284986251127, + "loss": 0.0801, + "step": 33685 + }, + { + "epoch": 0.2924106561574986, + "grad_norm": 0.11376953125, + "learning_rate": 0.0015235022373862085, + "loss": 0.1221, + "step": 33686 + }, + { + "epoch": 0.2924193366377028, + "grad_norm": 0.34375, + "learning_rate": 0.0015234759756841683, + "loss": 0.1377, + "step": 33687 + }, + { + "epoch": 0.29242801711790695, + "grad_norm": 0.283203125, + "learning_rate": 0.0015234497135190215, + "loss": 0.1064, + "step": 33688 + }, + { + "epoch": 0.29243669759811114, + "grad_norm": 0.1796875, + "learning_rate": 0.001523423450890796, + "loss": 0.1318, + "step": 33689 + }, + { + "epoch": 0.2924453780783153, + "grad_norm": 0.11083984375, + "learning_rate": 0.0015233971877995213, + "loss": 0.1484, + "step": 33690 + }, + { + "epoch": 0.29245405855851947, + "grad_norm": 0.17578125, + "learning_rate": 0.0015233709242452257, + "loss": 0.1279, + "step": 33691 + }, + { + "epoch": 0.2924627390387236, + "grad_norm": 0.318359375, + "learning_rate": 0.001523344660227938, + "loss": 0.1094, + "step": 33692 + }, + { + "epoch": 0.2924714195189278, + "grad_norm": 0.119140625, + "learning_rate": 0.0015233183957476869, + "loss": 0.1221, + "step": 33693 + }, + { + "epoch": 0.29248009999913194, + "grad_norm": 0.1279296875, + "learning_rate": 0.0015232921308045012, + "loss": 0.1016, + "step": 33694 + }, + { + "epoch": 0.29248878047933613, + "grad_norm": 0.2109375, + "learning_rate": 0.0015232658653984094, + "loss": 0.0928, + "step": 33695 + }, + { + "epoch": 0.29249746095954027, + "grad_norm": 0.1767578125, + "learning_rate": 0.0015232395995294406, + "loss": 0.1064, + "step": 33696 + }, + { + "epoch": 0.29250614143974446, + "grad_norm": 0.16796875, + "learning_rate": 0.0015232133331976234, + "loss": 0.0889, + "step": 33697 + }, + { + "epoch": 0.2925148219199486, + "grad_norm": 0.55078125, + "learning_rate": 
0.0015231870664029865, + "loss": 0.1191, + "step": 33698 + }, + { + "epoch": 0.2925235024001528, + "grad_norm": 0.6640625, + "learning_rate": 0.0015231607991455583, + "loss": 0.0903, + "step": 33699 + }, + { + "epoch": 0.29253218288035693, + "grad_norm": 0.1630859375, + "learning_rate": 0.0015231345314253681, + "loss": 0.0762, + "step": 33700 + }, + { + "epoch": 0.2925408633605611, + "grad_norm": 0.154296875, + "learning_rate": 0.0015231082632424443, + "loss": 0.1719, + "step": 33701 + }, + { + "epoch": 0.29254954384076526, + "grad_norm": 0.53125, + "learning_rate": 0.0015230819945968156, + "loss": 0.1035, + "step": 33702 + }, + { + "epoch": 0.29255822432096945, + "grad_norm": 0.85546875, + "learning_rate": 0.0015230557254885108, + "loss": 0.4023, + "step": 33703 + }, + { + "epoch": 0.2925669048011736, + "grad_norm": 0.11376953125, + "learning_rate": 0.0015230294559175584, + "loss": 0.082, + "step": 33704 + }, + { + "epoch": 0.2925755852813778, + "grad_norm": 0.37890625, + "learning_rate": 0.0015230031858839877, + "loss": 0.1494, + "step": 33705 + }, + { + "epoch": 0.2925842657615819, + "grad_norm": 0.5234375, + "learning_rate": 0.0015229769153878268, + "loss": 0.0869, + "step": 33706 + }, + { + "epoch": 0.2925929462417861, + "grad_norm": 0.11279296875, + "learning_rate": 0.0015229506444291046, + "loss": 0.1045, + "step": 33707 + }, + { + "epoch": 0.29260162672199025, + "grad_norm": 0.5078125, + "learning_rate": 0.0015229243730078505, + "loss": 0.0679, + "step": 33708 + }, + { + "epoch": 0.29261030720219444, + "grad_norm": 0.1865234375, + "learning_rate": 0.0015228981011240921, + "loss": 0.1001, + "step": 33709 + }, + { + "epoch": 0.2926189876823986, + "grad_norm": 0.56640625, + "learning_rate": 0.001522871828777859, + "loss": 0.0806, + "step": 33710 + }, + { + "epoch": 0.29262766816260277, + "grad_norm": 1.0234375, + "learning_rate": 0.0015228455559691794, + "loss": 0.1128, + "step": 33711 + }, + { + "epoch": 0.2926363486428069, + "grad_norm": 0.08984375, + "learning_rate": 0.0015228192826980823, + "loss": 0.103, + "step": 33712 + }, + { + "epoch": 0.2926450291230111, + "grad_norm": 0.2294921875, + "learning_rate": 0.0015227930089645964, + "loss": 0.0918, + "step": 33713 + }, + { + "epoch": 0.29265370960321524, + "grad_norm": 0.77734375, + "learning_rate": 0.0015227667347687505, + "loss": 0.1328, + "step": 33714 + }, + { + "epoch": 0.29266239008341943, + "grad_norm": 0.408203125, + "learning_rate": 0.0015227404601105732, + "loss": 0.123, + "step": 33715 + }, + { + "epoch": 0.29267107056362357, + "grad_norm": 0.1357421875, + "learning_rate": 0.0015227141849900932, + "loss": 0.1162, + "step": 33716 + }, + { + "epoch": 0.29267975104382776, + "grad_norm": 0.09228515625, + "learning_rate": 0.0015226879094073394, + "loss": 0.0762, + "step": 33717 + }, + { + "epoch": 0.2926884315240319, + "grad_norm": 0.10595703125, + "learning_rate": 0.0015226616333623405, + "loss": 0.0894, + "step": 33718 + }, + { + "epoch": 0.2926971120042361, + "grad_norm": 0.181640625, + "learning_rate": 0.001522635356855125, + "loss": 0.0791, + "step": 33719 + }, + { + "epoch": 0.29270579248444023, + "grad_norm": 0.36328125, + "learning_rate": 0.0015226090798857222, + "loss": 0.085, + "step": 33720 + }, + { + "epoch": 0.2927144729646444, + "grad_norm": 0.08203125, + "learning_rate": 0.00152258280245416, + "loss": 0.1299, + "step": 33721 + }, + { + "epoch": 0.29272315344484856, + "grad_norm": 0.171875, + "learning_rate": 0.001522556524560468, + "loss": 0.0918, + "step": 33722 + }, + { + "epoch": 0.29273183392505275, + 
"grad_norm": 0.1025390625, + "learning_rate": 0.001522530246204674, + "loss": 0.0947, + "step": 33723 + }, + { + "epoch": 0.2927405144052569, + "grad_norm": 0.2275390625, + "learning_rate": 0.0015225039673868077, + "loss": 0.1484, + "step": 33724 + }, + { + "epoch": 0.2927491948854611, + "grad_norm": 0.2080078125, + "learning_rate": 0.0015224776881068974, + "loss": 0.1123, + "step": 33725 + }, + { + "epoch": 0.2927578753656652, + "grad_norm": 0.31640625, + "learning_rate": 0.0015224514083649718, + "loss": 0.1172, + "step": 33726 + }, + { + "epoch": 0.2927665558458694, + "grad_norm": 0.1474609375, + "learning_rate": 0.0015224251281610597, + "loss": 0.0806, + "step": 33727 + }, + { + "epoch": 0.29277523632607355, + "grad_norm": 0.1533203125, + "learning_rate": 0.00152239884749519, + "loss": 0.0938, + "step": 33728 + }, + { + "epoch": 0.29278391680627774, + "grad_norm": 0.2041015625, + "learning_rate": 0.0015223725663673908, + "loss": 0.0923, + "step": 33729 + }, + { + "epoch": 0.2927925972864819, + "grad_norm": 0.1376953125, + "learning_rate": 0.0015223462847776916, + "loss": 0.1211, + "step": 33730 + }, + { + "epoch": 0.2928012777666861, + "grad_norm": 0.138671875, + "learning_rate": 0.0015223200027261207, + "loss": 0.0967, + "step": 33731 + }, + { + "epoch": 0.2928099582468902, + "grad_norm": 0.57421875, + "learning_rate": 0.0015222937202127073, + "loss": 0.127, + "step": 33732 + }, + { + "epoch": 0.2928186387270944, + "grad_norm": 0.30078125, + "learning_rate": 0.0015222674372374794, + "loss": 0.1387, + "step": 33733 + }, + { + "epoch": 0.29282731920729854, + "grad_norm": 0.15234375, + "learning_rate": 0.0015222411538004666, + "loss": 0.0898, + "step": 33734 + }, + { + "epoch": 0.29283599968750273, + "grad_norm": 0.1240234375, + "learning_rate": 0.001522214869901697, + "loss": 0.1084, + "step": 33735 + }, + { + "epoch": 0.2928446801677069, + "grad_norm": 0.2314453125, + "learning_rate": 0.0015221885855411996, + "loss": 0.125, + "step": 33736 + }, + { + "epoch": 0.29285336064791107, + "grad_norm": 0.263671875, + "learning_rate": 0.0015221623007190034, + "loss": 0.1338, + "step": 33737 + }, + { + "epoch": 0.2928620411281152, + "grad_norm": 0.1962890625, + "learning_rate": 0.0015221360154351363, + "loss": 0.0918, + "step": 33738 + }, + { + "epoch": 0.2928707216083194, + "grad_norm": 0.380859375, + "learning_rate": 0.0015221097296896277, + "loss": 0.0791, + "step": 33739 + }, + { + "epoch": 0.29287940208852353, + "grad_norm": 0.1103515625, + "learning_rate": 0.0015220834434825066, + "loss": 0.1328, + "step": 33740 + }, + { + "epoch": 0.2928880825687277, + "grad_norm": 0.7578125, + "learning_rate": 0.0015220571568138011, + "loss": 0.1445, + "step": 33741 + }, + { + "epoch": 0.29289676304893186, + "grad_norm": 0.1123046875, + "learning_rate": 0.0015220308696835404, + "loss": 0.1133, + "step": 33742 + }, + { + "epoch": 0.29290544352913606, + "grad_norm": 0.7890625, + "learning_rate": 0.001522004582091753, + "loss": 0.1055, + "step": 33743 + }, + { + "epoch": 0.2929141240093402, + "grad_norm": 0.224609375, + "learning_rate": 0.0015219782940384676, + "loss": 0.1367, + "step": 33744 + }, + { + "epoch": 0.2929228044895444, + "grad_norm": 0.251953125, + "learning_rate": 0.0015219520055237132, + "loss": 0.1074, + "step": 33745 + }, + { + "epoch": 0.2929314849697485, + "grad_norm": 0.10498046875, + "learning_rate": 0.0015219257165475185, + "loss": 0.0791, + "step": 33746 + }, + { + "epoch": 0.2929401654499527, + "grad_norm": 0.625, + "learning_rate": 0.001521899427109912, + "loss": 0.1328, + "step": 
33747 + }, + { + "epoch": 0.29294884593015685, + "grad_norm": 0.421875, + "learning_rate": 0.001521873137210923, + "loss": 0.0996, + "step": 33748 + }, + { + "epoch": 0.29295752641036105, + "grad_norm": 0.2177734375, + "learning_rate": 0.0015218468468505796, + "loss": 0.126, + "step": 33749 + }, + { + "epoch": 0.2929662068905652, + "grad_norm": 0.1953125, + "learning_rate": 0.0015218205560289107, + "loss": 0.0859, + "step": 33750 + }, + { + "epoch": 0.2929748873707694, + "grad_norm": 0.32421875, + "learning_rate": 0.0015217942647459456, + "loss": 0.0933, + "step": 33751 + }, + { + "epoch": 0.2929835678509735, + "grad_norm": 0.703125, + "learning_rate": 0.001521767973001712, + "loss": 0.123, + "step": 33752 + }, + { + "epoch": 0.2929922483311777, + "grad_norm": 0.58203125, + "learning_rate": 0.0015217416807962395, + "loss": 0.0986, + "step": 33753 + }, + { + "epoch": 0.29300092881138184, + "grad_norm": 0.1845703125, + "learning_rate": 0.001521715388129557, + "loss": 0.0781, + "step": 33754 + }, + { + "epoch": 0.29300960929158604, + "grad_norm": 0.357421875, + "learning_rate": 0.0015216890950016926, + "loss": 0.1025, + "step": 33755 + }, + { + "epoch": 0.2930182897717902, + "grad_norm": 0.11083984375, + "learning_rate": 0.0015216628014126752, + "loss": 0.0928, + "step": 33756 + }, + { + "epoch": 0.29302697025199437, + "grad_norm": 0.65234375, + "learning_rate": 0.001521636507362534, + "loss": 0.1089, + "step": 33757 + }, + { + "epoch": 0.2930356507321985, + "grad_norm": 0.359375, + "learning_rate": 0.0015216102128512974, + "loss": 0.0884, + "step": 33758 + }, + { + "epoch": 0.2930443312124027, + "grad_norm": 0.357421875, + "learning_rate": 0.0015215839178789941, + "loss": 0.1055, + "step": 33759 + }, + { + "epoch": 0.29305301169260684, + "grad_norm": 0.1328125, + "learning_rate": 0.001521557622445653, + "loss": 0.0918, + "step": 33760 + }, + { + "epoch": 0.293061692172811, + "grad_norm": 0.37890625, + "learning_rate": 0.0015215313265513027, + "loss": 0.1152, + "step": 33761 + }, + { + "epoch": 0.29307037265301517, + "grad_norm": 0.419921875, + "learning_rate": 0.0015215050301959723, + "loss": 0.0806, + "step": 33762 + }, + { + "epoch": 0.2930790531332193, + "grad_norm": 0.58203125, + "learning_rate": 0.0015214787333796905, + "loss": 0.0698, + "step": 33763 + }, + { + "epoch": 0.2930877336134235, + "grad_norm": 0.1904296875, + "learning_rate": 0.0015214524361024852, + "loss": 0.0962, + "step": 33764 + }, + { + "epoch": 0.29309641409362763, + "grad_norm": 0.11572265625, + "learning_rate": 0.0015214261383643863, + "loss": 0.1084, + "step": 33765 + }, + { + "epoch": 0.2931050945738318, + "grad_norm": 0.515625, + "learning_rate": 0.0015213998401654221, + "loss": 0.0879, + "step": 33766 + }, + { + "epoch": 0.29311377505403596, + "grad_norm": 0.142578125, + "learning_rate": 0.0015213735415056215, + "loss": 0.1016, + "step": 33767 + }, + { + "epoch": 0.29312245553424016, + "grad_norm": 0.48828125, + "learning_rate": 0.0015213472423850128, + "loss": 0.126, + "step": 33768 + }, + { + "epoch": 0.2931311360144443, + "grad_norm": 1.3515625, + "learning_rate": 0.0015213209428036254, + "loss": 0.1216, + "step": 33769 + }, + { + "epoch": 0.2931398164946485, + "grad_norm": 0.73828125, + "learning_rate": 0.0015212946427614876, + "loss": 0.1543, + "step": 33770 + }, + { + "epoch": 0.2931484969748526, + "grad_norm": 0.3515625, + "learning_rate": 0.0015212683422586282, + "loss": 0.0889, + "step": 33771 + }, + { + "epoch": 0.2931571774550568, + "grad_norm": 0.9296875, + "learning_rate": 0.001521242041295076, + 
"loss": 0.1387, + "step": 33772 + }, + { + "epoch": 0.29316585793526095, + "grad_norm": 0.130859375, + "learning_rate": 0.0015212157398708603, + "loss": 0.1069, + "step": 33773 + }, + { + "epoch": 0.29317453841546515, + "grad_norm": 0.21484375, + "learning_rate": 0.001521189437986009, + "loss": 0.0913, + "step": 33774 + }, + { + "epoch": 0.2931832188956693, + "grad_norm": 0.1904296875, + "learning_rate": 0.0015211631356405514, + "loss": 0.1328, + "step": 33775 + }, + { + "epoch": 0.2931918993758735, + "grad_norm": 0.2265625, + "learning_rate": 0.0015211368328345162, + "loss": 0.1045, + "step": 33776 + }, + { + "epoch": 0.2932005798560776, + "grad_norm": 0.2412109375, + "learning_rate": 0.001521110529567932, + "loss": 0.167, + "step": 33777 + }, + { + "epoch": 0.2932092603362818, + "grad_norm": 0.423828125, + "learning_rate": 0.0015210842258408276, + "loss": 0.1069, + "step": 33778 + }, + { + "epoch": 0.29321794081648594, + "grad_norm": 0.220703125, + "learning_rate": 0.001521057921653232, + "loss": 0.1543, + "step": 33779 + }, + { + "epoch": 0.29322662129669014, + "grad_norm": 0.337890625, + "learning_rate": 0.0015210316170051735, + "loss": 0.1406, + "step": 33780 + }, + { + "epoch": 0.2932353017768943, + "grad_norm": 0.51171875, + "learning_rate": 0.001521005311896681, + "loss": 0.1211, + "step": 33781 + }, + { + "epoch": 0.29324398225709847, + "grad_norm": 0.291015625, + "learning_rate": 0.0015209790063277838, + "loss": 0.1152, + "step": 33782 + }, + { + "epoch": 0.2932526627373026, + "grad_norm": 0.177734375, + "learning_rate": 0.0015209527002985103, + "loss": 0.1016, + "step": 33783 + }, + { + "epoch": 0.2932613432175068, + "grad_norm": 0.5078125, + "learning_rate": 0.001520926393808889, + "loss": 0.1592, + "step": 33784 + }, + { + "epoch": 0.29327002369771094, + "grad_norm": 0.1904296875, + "learning_rate": 0.001520900086858949, + "loss": 0.104, + "step": 33785 + }, + { + "epoch": 0.29327870417791513, + "grad_norm": 0.1728515625, + "learning_rate": 0.0015208737794487192, + "loss": 0.1357, + "step": 33786 + }, + { + "epoch": 0.29328738465811927, + "grad_norm": 0.30859375, + "learning_rate": 0.001520847471578228, + "loss": 0.1191, + "step": 33787 + }, + { + "epoch": 0.29329606513832346, + "grad_norm": 0.1494140625, + "learning_rate": 0.0015208211632475042, + "loss": 0.1143, + "step": 33788 + }, + { + "epoch": 0.2933047456185276, + "grad_norm": 0.11962890625, + "learning_rate": 0.001520794854456577, + "loss": 0.1357, + "step": 33789 + }, + { + "epoch": 0.2933134260987318, + "grad_norm": 0.17578125, + "learning_rate": 0.0015207685452054743, + "loss": 0.1182, + "step": 33790 + }, + { + "epoch": 0.2933221065789359, + "grad_norm": 0.201171875, + "learning_rate": 0.001520742235494226, + "loss": 0.1182, + "step": 33791 + }, + { + "epoch": 0.2933307870591401, + "grad_norm": 0.3046875, + "learning_rate": 0.0015207159253228602, + "loss": 0.1074, + "step": 33792 + }, + { + "epoch": 0.29333946753934426, + "grad_norm": 0.18359375, + "learning_rate": 0.0015206896146914058, + "loss": 0.1162, + "step": 33793 + }, + { + "epoch": 0.29334814801954845, + "grad_norm": 0.296875, + "learning_rate": 0.0015206633035998913, + "loss": 0.1108, + "step": 33794 + }, + { + "epoch": 0.2933568284997526, + "grad_norm": 0.1875, + "learning_rate": 0.0015206369920483464, + "loss": 0.1201, + "step": 33795 + }, + { + "epoch": 0.2933655089799568, + "grad_norm": 0.59375, + "learning_rate": 0.0015206106800367985, + "loss": 0.1094, + "step": 33796 + }, + { + "epoch": 0.2933741894601609, + "grad_norm": 0.7421875, + 
"learning_rate": 0.0015205843675652772, + "loss": 0.0928, + "step": 33797 + }, + { + "epoch": 0.2933828699403651, + "grad_norm": 0.154296875, + "learning_rate": 0.0015205580546338113, + "loss": 0.1426, + "step": 33798 + }, + { + "epoch": 0.29339155042056925, + "grad_norm": 0.16015625, + "learning_rate": 0.0015205317412424295, + "loss": 0.0977, + "step": 33799 + }, + { + "epoch": 0.29340023090077344, + "grad_norm": 0.50390625, + "learning_rate": 0.0015205054273911603, + "loss": 0.1035, + "step": 33800 + }, + { + "epoch": 0.2934089113809776, + "grad_norm": 0.12060546875, + "learning_rate": 0.0015204791130800327, + "loss": 0.0698, + "step": 33801 + }, + { + "epoch": 0.29341759186118177, + "grad_norm": 0.26953125, + "learning_rate": 0.0015204527983090754, + "loss": 0.0854, + "step": 33802 + }, + { + "epoch": 0.2934262723413859, + "grad_norm": 0.283203125, + "learning_rate": 0.0015204264830783174, + "loss": 0.0947, + "step": 33803 + }, + { + "epoch": 0.2934349528215901, + "grad_norm": 0.134765625, + "learning_rate": 0.0015204001673877873, + "loss": 0.1074, + "step": 33804 + }, + { + "epoch": 0.29344363330179424, + "grad_norm": 0.1533203125, + "learning_rate": 0.001520373851237514, + "loss": 0.1084, + "step": 33805 + }, + { + "epoch": 0.29345231378199843, + "grad_norm": 0.4453125, + "learning_rate": 0.001520347534627526, + "loss": 0.0859, + "step": 33806 + }, + { + "epoch": 0.29346099426220257, + "grad_norm": 0.265625, + "learning_rate": 0.0015203212175578524, + "loss": 0.0879, + "step": 33807 + }, + { + "epoch": 0.29346967474240676, + "grad_norm": 0.2333984375, + "learning_rate": 0.0015202949000285217, + "loss": 0.1445, + "step": 33808 + }, + { + "epoch": 0.2934783552226109, + "grad_norm": 0.2431640625, + "learning_rate": 0.0015202685820395628, + "loss": 0.0986, + "step": 33809 + }, + { + "epoch": 0.2934870357028151, + "grad_norm": 0.4296875, + "learning_rate": 0.0015202422635910047, + "loss": 0.1196, + "step": 33810 + }, + { + "epoch": 0.29349571618301923, + "grad_norm": 0.232421875, + "learning_rate": 0.0015202159446828757, + "loss": 0.0728, + "step": 33811 + }, + { + "epoch": 0.2935043966632234, + "grad_norm": 0.421875, + "learning_rate": 0.001520189625315205, + "loss": 0.082, + "step": 33812 + }, + { + "epoch": 0.29351307714342756, + "grad_norm": 0.169921875, + "learning_rate": 0.0015201633054880213, + "loss": 0.1221, + "step": 33813 + }, + { + "epoch": 0.29352175762363175, + "grad_norm": 0.1943359375, + "learning_rate": 0.0015201369852013533, + "loss": 0.1055, + "step": 33814 + }, + { + "epoch": 0.2935304381038359, + "grad_norm": 0.126953125, + "learning_rate": 0.0015201106644552296, + "loss": 0.1011, + "step": 33815 + }, + { + "epoch": 0.2935391185840401, + "grad_norm": 1.3125, + "learning_rate": 0.0015200843432496794, + "loss": 0.106, + "step": 33816 + }, + { + "epoch": 0.2935477990642442, + "grad_norm": 0.16796875, + "learning_rate": 0.0015200580215847313, + "loss": 0.0957, + "step": 33817 + }, + { + "epoch": 0.2935564795444484, + "grad_norm": 0.0888671875, + "learning_rate": 0.0015200316994604137, + "loss": 0.0669, + "step": 33818 + }, + { + "epoch": 0.29356516002465255, + "grad_norm": 1.078125, + "learning_rate": 0.0015200053768767562, + "loss": 0.0781, + "step": 33819 + }, + { + "epoch": 0.29357384050485674, + "grad_norm": 0.072265625, + "learning_rate": 0.0015199790538337867, + "loss": 0.0771, + "step": 33820 + }, + { + "epoch": 0.2935825209850609, + "grad_norm": 0.1591796875, + "learning_rate": 0.0015199527303315348, + "loss": 0.1064, + "step": 33821 + }, + { + "epoch": 
0.2935912014652651, + "grad_norm": 0.76953125, + "learning_rate": 0.0015199264063700288, + "loss": 0.1299, + "step": 33822 + }, + { + "epoch": 0.2935998819454692, + "grad_norm": 0.8515625, + "learning_rate": 0.0015199000819492976, + "loss": 0.1064, + "step": 33823 + }, + { + "epoch": 0.2936085624256734, + "grad_norm": 0.07763671875, + "learning_rate": 0.00151987375706937, + "loss": 0.0908, + "step": 33824 + }, + { + "epoch": 0.29361724290587754, + "grad_norm": 0.154296875, + "learning_rate": 0.0015198474317302745, + "loss": 0.1182, + "step": 33825 + }, + { + "epoch": 0.29362592338608173, + "grad_norm": 0.138671875, + "learning_rate": 0.0015198211059320407, + "loss": 0.0986, + "step": 33826 + }, + { + "epoch": 0.29363460386628587, + "grad_norm": 0.11474609375, + "learning_rate": 0.0015197947796746964, + "loss": 0.126, + "step": 33827 + }, + { + "epoch": 0.29364328434649006, + "grad_norm": 0.60546875, + "learning_rate": 0.0015197684529582706, + "loss": 0.0713, + "step": 33828 + }, + { + "epoch": 0.2936519648266942, + "grad_norm": 0.40625, + "learning_rate": 0.0015197421257827927, + "loss": 0.0913, + "step": 33829 + }, + { + "epoch": 0.2936606453068984, + "grad_norm": 0.216796875, + "learning_rate": 0.0015197157981482914, + "loss": 0.1162, + "step": 33830 + }, + { + "epoch": 0.29366932578710253, + "grad_norm": 0.12890625, + "learning_rate": 0.0015196894700547948, + "loss": 0.1426, + "step": 33831 + }, + { + "epoch": 0.2936780062673067, + "grad_norm": 0.498046875, + "learning_rate": 0.001519663141502332, + "loss": 0.1069, + "step": 33832 + }, + { + "epoch": 0.29368668674751086, + "grad_norm": 1.0625, + "learning_rate": 0.0015196368124909324, + "loss": 0.123, + "step": 33833 + }, + { + "epoch": 0.29369536722771505, + "grad_norm": 0.31640625, + "learning_rate": 0.001519610483020624, + "loss": 0.083, + "step": 33834 + }, + { + "epoch": 0.2937040477079192, + "grad_norm": 0.38671875, + "learning_rate": 0.0015195841530914358, + "loss": 0.123, + "step": 33835 + }, + { + "epoch": 0.2937127281881234, + "grad_norm": 0.294921875, + "learning_rate": 0.0015195578227033967, + "loss": 0.0698, + "step": 33836 + }, + { + "epoch": 0.2937214086683275, + "grad_norm": 0.236328125, + "learning_rate": 0.0015195314918565353, + "loss": 0.1011, + "step": 33837 + }, + { + "epoch": 0.2937300891485317, + "grad_norm": 0.515625, + "learning_rate": 0.0015195051605508807, + "loss": 0.0898, + "step": 33838 + }, + { + "epoch": 0.29373876962873585, + "grad_norm": 0.55078125, + "learning_rate": 0.0015194788287864619, + "loss": 0.1816, + "step": 33839 + }, + { + "epoch": 0.29374745010894004, + "grad_norm": 0.271484375, + "learning_rate": 0.0015194524965633069, + "loss": 0.0928, + "step": 33840 + }, + { + "epoch": 0.2937561305891442, + "grad_norm": 0.578125, + "learning_rate": 0.0015194261638814452, + "loss": 0.1006, + "step": 33841 + }, + { + "epoch": 0.2937648110693484, + "grad_norm": 0.236328125, + "learning_rate": 0.0015193998307409052, + "loss": 0.0864, + "step": 33842 + }, + { + "epoch": 0.2937734915495525, + "grad_norm": 0.21484375, + "learning_rate": 0.001519373497141716, + "loss": 0.1465, + "step": 33843 + }, + { + "epoch": 0.2937821720297567, + "grad_norm": 0.59375, + "learning_rate": 0.0015193471630839063, + "loss": 0.1084, + "step": 33844 + }, + { + "epoch": 0.29379085250996084, + "grad_norm": 0.54296875, + "learning_rate": 0.0015193208285675045, + "loss": 0.1543, + "step": 33845 + }, + { + "epoch": 0.29379953299016504, + "grad_norm": 0.5625, + "learning_rate": 0.00151929449359254, + "loss": 0.125, + "step": 33846 + }, 
+ { + "epoch": 0.2938082134703692, + "grad_norm": 0.578125, + "learning_rate": 0.0015192681581590413, + "loss": 0.1348, + "step": 33847 + }, + { + "epoch": 0.29381689395057337, + "grad_norm": 0.33984375, + "learning_rate": 0.0015192418222670372, + "loss": 0.0854, + "step": 33848 + }, + { + "epoch": 0.2938255744307775, + "grad_norm": 0.66015625, + "learning_rate": 0.0015192154859165567, + "loss": 0.1162, + "step": 33849 + }, + { + "epoch": 0.2938342549109817, + "grad_norm": 0.369140625, + "learning_rate": 0.0015191891491076282, + "loss": 0.106, + "step": 33850 + }, + { + "epoch": 0.29384293539118583, + "grad_norm": 0.671875, + "learning_rate": 0.0015191628118402809, + "loss": 0.1309, + "step": 33851 + }, + { + "epoch": 0.29385161587139, + "grad_norm": 0.25390625, + "learning_rate": 0.0015191364741145436, + "loss": 0.0986, + "step": 33852 + }, + { + "epoch": 0.29386029635159416, + "grad_norm": 0.29296875, + "learning_rate": 0.0015191101359304448, + "loss": 0.126, + "step": 33853 + }, + { + "epoch": 0.29386897683179836, + "grad_norm": 0.1201171875, + "learning_rate": 0.0015190837972880135, + "loss": 0.1035, + "step": 33854 + }, + { + "epoch": 0.2938776573120025, + "grad_norm": 0.359375, + "learning_rate": 0.001519057458187278, + "loss": 0.1216, + "step": 33855 + }, + { + "epoch": 0.2938863377922067, + "grad_norm": 0.365234375, + "learning_rate": 0.0015190311186282681, + "loss": 0.0815, + "step": 33856 + }, + { + "epoch": 0.2938950182724108, + "grad_norm": 0.296875, + "learning_rate": 0.0015190047786110118, + "loss": 0.1211, + "step": 33857 + }, + { + "epoch": 0.293903698752615, + "grad_norm": 0.3671875, + "learning_rate": 0.0015189784381355383, + "loss": 0.1191, + "step": 33858 + }, + { + "epoch": 0.29391237923281915, + "grad_norm": 0.51953125, + "learning_rate": 0.0015189520972018762, + "loss": 0.0845, + "step": 33859 + }, + { + "epoch": 0.29392105971302335, + "grad_norm": 0.259765625, + "learning_rate": 0.0015189257558100545, + "loss": 0.1016, + "step": 33860 + }, + { + "epoch": 0.2939297401932275, + "grad_norm": 1.4375, + "learning_rate": 0.001518899413960102, + "loss": 0.1533, + "step": 33861 + }, + { + "epoch": 0.2939384206734317, + "grad_norm": 0.1923828125, + "learning_rate": 0.0015188730716520472, + "loss": 0.1201, + "step": 33862 + }, + { + "epoch": 0.2939471011536358, + "grad_norm": 0.298828125, + "learning_rate": 0.001518846728885919, + "loss": 0.1162, + "step": 33863 + }, + { + "epoch": 0.29395578163384, + "grad_norm": 0.10693359375, + "learning_rate": 0.0015188203856617464, + "loss": 0.1064, + "step": 33864 + }, + { + "epoch": 0.29396446211404414, + "grad_norm": 0.1953125, + "learning_rate": 0.001518794041979558, + "loss": 0.1689, + "step": 33865 + }, + { + "epoch": 0.29397314259424834, + "grad_norm": 0.1689453125, + "learning_rate": 0.001518767697839383, + "loss": 0.0835, + "step": 33866 + }, + { + "epoch": 0.2939818230744525, + "grad_norm": 0.349609375, + "learning_rate": 0.0015187413532412497, + "loss": 0.0854, + "step": 33867 + }, + { + "epoch": 0.29399050355465667, + "grad_norm": 0.1474609375, + "learning_rate": 0.0015187150081851872, + "loss": 0.1221, + "step": 33868 + }, + { + "epoch": 0.2939991840348608, + "grad_norm": 0.201171875, + "learning_rate": 0.0015186886626712243, + "loss": 0.1035, + "step": 33869 + }, + { + "epoch": 0.294007864515065, + "grad_norm": 0.3046875, + "learning_rate": 0.0015186623166993899, + "loss": 0.1123, + "step": 33870 + }, + { + "epoch": 0.29401654499526914, + "grad_norm": 0.078125, + "learning_rate": 0.0015186359702697126, + "loss": 0.0811, + 
"step": 33871 + }, + { + "epoch": 0.29402522547547333, + "grad_norm": 0.65625, + "learning_rate": 0.0015186096233822213, + "loss": 0.1221, + "step": 33872 + }, + { + "epoch": 0.29403390595567747, + "grad_norm": 0.330078125, + "learning_rate": 0.0015185832760369446, + "loss": 0.0684, + "step": 33873 + }, + { + "epoch": 0.29404258643588166, + "grad_norm": 0.10498046875, + "learning_rate": 0.0015185569282339117, + "loss": 0.0996, + "step": 33874 + }, + { + "epoch": 0.2940512669160858, + "grad_norm": 0.263671875, + "learning_rate": 0.0015185305799731512, + "loss": 0.1167, + "step": 33875 + }, + { + "epoch": 0.29405994739629, + "grad_norm": 0.0869140625, + "learning_rate": 0.0015185042312546922, + "loss": 0.0845, + "step": 33876 + }, + { + "epoch": 0.2940686278764941, + "grad_norm": 0.39453125, + "learning_rate": 0.0015184778820785628, + "loss": 0.0903, + "step": 33877 + }, + { + "epoch": 0.2940773083566983, + "grad_norm": 0.6171875, + "learning_rate": 0.0015184515324447928, + "loss": 0.0947, + "step": 33878 + }, + { + "epoch": 0.29408598883690246, + "grad_norm": 0.234375, + "learning_rate": 0.0015184251823534098, + "loss": 0.1309, + "step": 33879 + }, + { + "epoch": 0.29409466931710665, + "grad_norm": 0.58203125, + "learning_rate": 0.0015183988318044437, + "loss": 0.1113, + "step": 33880 + }, + { + "epoch": 0.2941033497973108, + "grad_norm": 0.5234375, + "learning_rate": 0.001518372480797923, + "loss": 0.1143, + "step": 33881 + }, + { + "epoch": 0.294112030277515, + "grad_norm": 0.263671875, + "learning_rate": 0.0015183461293338766, + "loss": 0.1377, + "step": 33882 + }, + { + "epoch": 0.2941207107577191, + "grad_norm": 0.46875, + "learning_rate": 0.0015183197774123328, + "loss": 0.123, + "step": 33883 + }, + { + "epoch": 0.29412939123792325, + "grad_norm": 0.12451171875, + "learning_rate": 0.0015182934250333208, + "loss": 0.1118, + "step": 33884 + }, + { + "epoch": 0.29413807171812745, + "grad_norm": 0.142578125, + "learning_rate": 0.0015182670721968694, + "loss": 0.1021, + "step": 33885 + }, + { + "epoch": 0.2941467521983316, + "grad_norm": 0.37109375, + "learning_rate": 0.0015182407189030073, + "loss": 0.1162, + "step": 33886 + }, + { + "epoch": 0.2941554326785358, + "grad_norm": 0.2197265625, + "learning_rate": 0.001518214365151764, + "loss": 0.0894, + "step": 33887 + }, + { + "epoch": 0.2941641131587399, + "grad_norm": 0.09765625, + "learning_rate": 0.0015181880109431673, + "loss": 0.0811, + "step": 33888 + }, + { + "epoch": 0.2941727936389441, + "grad_norm": 0.404296875, + "learning_rate": 0.0015181616562772464, + "loss": 0.0991, + "step": 33889 + }, + { + "epoch": 0.29418147411914825, + "grad_norm": 0.33203125, + "learning_rate": 0.0015181353011540304, + "loss": 0.1084, + "step": 33890 + }, + { + "epoch": 0.29419015459935244, + "grad_norm": 0.2158203125, + "learning_rate": 0.0015181089455735478, + "loss": 0.084, + "step": 33891 + }, + { + "epoch": 0.2941988350795566, + "grad_norm": 0.56640625, + "learning_rate": 0.001518082589535828, + "loss": 0.127, + "step": 33892 + }, + { + "epoch": 0.29420751555976077, + "grad_norm": 0.1318359375, + "learning_rate": 0.0015180562330408987, + "loss": 0.1172, + "step": 33893 + }, + { + "epoch": 0.2942161960399649, + "grad_norm": 0.486328125, + "learning_rate": 0.0015180298760887896, + "loss": 0.0918, + "step": 33894 + }, + { + "epoch": 0.2942248765201691, + "grad_norm": 0.1455078125, + "learning_rate": 0.0015180035186795296, + "loss": 0.1113, + "step": 33895 + }, + { + "epoch": 0.29423355700037324, + "grad_norm": 0.11767578125, + "learning_rate": 
0.0015179771608131469, + "loss": 0.0869, + "step": 33896 + }, + { + "epoch": 0.29424223748057743, + "grad_norm": 0.34765625, + "learning_rate": 0.001517950802489671, + "loss": 0.1143, + "step": 33897 + }, + { + "epoch": 0.29425091796078157, + "grad_norm": 0.1484375, + "learning_rate": 0.00151792444370913, + "loss": 0.1035, + "step": 33898 + }, + { + "epoch": 0.29425959844098576, + "grad_norm": 0.1181640625, + "learning_rate": 0.0015178980844715532, + "loss": 0.0903, + "step": 33899 + }, + { + "epoch": 0.2942682789211899, + "grad_norm": 0.09423828125, + "learning_rate": 0.0015178717247769694, + "loss": 0.1035, + "step": 33900 + }, + { + "epoch": 0.2942769594013941, + "grad_norm": 0.177734375, + "learning_rate": 0.0015178453646254076, + "loss": 0.168, + "step": 33901 + }, + { + "epoch": 0.2942856398815982, + "grad_norm": 0.1484375, + "learning_rate": 0.0015178190040168959, + "loss": 0.0635, + "step": 33902 + }, + { + "epoch": 0.2942943203618024, + "grad_norm": 0.1279296875, + "learning_rate": 0.0015177926429514638, + "loss": 0.0947, + "step": 33903 + }, + { + "epoch": 0.29430300084200656, + "grad_norm": 0.4453125, + "learning_rate": 0.00151776628142914, + "loss": 0.1094, + "step": 33904 + }, + { + "epoch": 0.29431168132221075, + "grad_norm": 0.30078125, + "learning_rate": 0.001517739919449953, + "loss": 0.1035, + "step": 33905 + }, + { + "epoch": 0.2943203618024149, + "grad_norm": 0.78515625, + "learning_rate": 0.0015177135570139321, + "loss": 0.0923, + "step": 33906 + }, + { + "epoch": 0.2943290422826191, + "grad_norm": 0.1435546875, + "learning_rate": 0.0015176871941211061, + "loss": 0.0981, + "step": 33907 + }, + { + "epoch": 0.2943377227628232, + "grad_norm": 0.126953125, + "learning_rate": 0.0015176608307715038, + "loss": 0.0801, + "step": 33908 + }, + { + "epoch": 0.2943464032430274, + "grad_norm": 0.134765625, + "learning_rate": 0.0015176344669651535, + "loss": 0.085, + "step": 33909 + }, + { + "epoch": 0.29435508372323155, + "grad_norm": 0.4296875, + "learning_rate": 0.0015176081027020846, + "loss": 0.0869, + "step": 33910 + }, + { + "epoch": 0.29436376420343574, + "grad_norm": 0.365234375, + "learning_rate": 0.0015175817379823257, + "loss": 0.104, + "step": 33911 + }, + { + "epoch": 0.2943724446836399, + "grad_norm": 0.63671875, + "learning_rate": 0.001517555372805906, + "loss": 0.1494, + "step": 33912 + }, + { + "epoch": 0.29438112516384407, + "grad_norm": 0.2353515625, + "learning_rate": 0.0015175290071728536, + "loss": 0.0752, + "step": 33913 + }, + { + "epoch": 0.2943898056440482, + "grad_norm": 0.19921875, + "learning_rate": 0.0015175026410831976, + "loss": 0.1289, + "step": 33914 + }, + { + "epoch": 0.2943984861242524, + "grad_norm": 0.1416015625, + "learning_rate": 0.0015174762745369672, + "loss": 0.0981, + "step": 33915 + }, + { + "epoch": 0.29440716660445654, + "grad_norm": 0.236328125, + "learning_rate": 0.0015174499075341912, + "loss": 0.1094, + "step": 33916 + }, + { + "epoch": 0.29441584708466073, + "grad_norm": 0.158203125, + "learning_rate": 0.0015174235400748982, + "loss": 0.0957, + "step": 33917 + }, + { + "epoch": 0.29442452756486487, + "grad_norm": 0.2265625, + "learning_rate": 0.001517397172159117, + "loss": 0.0898, + "step": 33918 + }, + { + "epoch": 0.29443320804506906, + "grad_norm": 0.2216796875, + "learning_rate": 0.0015173708037868766, + "loss": 0.1123, + "step": 33919 + }, + { + "epoch": 0.2944418885252732, + "grad_norm": 0.2001953125, + "learning_rate": 0.0015173444349582054, + "loss": 0.1016, + "step": 33920 + }, + { + "epoch": 0.2944505690054774, + 
"grad_norm": 0.85546875, + "learning_rate": 0.001517318065673133, + "loss": 0.1299, + "step": 33921 + }, + { + "epoch": 0.29445924948568153, + "grad_norm": 0.412109375, + "learning_rate": 0.001517291695931688, + "loss": 0.1221, + "step": 33922 + }, + { + "epoch": 0.2944679299658857, + "grad_norm": 0.91015625, + "learning_rate": 0.0015172653257338987, + "loss": 0.0898, + "step": 33923 + }, + { + "epoch": 0.29447661044608986, + "grad_norm": 0.1708984375, + "learning_rate": 0.0015172389550797945, + "loss": 0.123, + "step": 33924 + }, + { + "epoch": 0.29448529092629405, + "grad_norm": 0.7265625, + "learning_rate": 0.001517212583969404, + "loss": 0.1113, + "step": 33925 + }, + { + "epoch": 0.2944939714064982, + "grad_norm": 0.4921875, + "learning_rate": 0.0015171862124027558, + "loss": 0.1074, + "step": 33926 + }, + { + "epoch": 0.2945026518867024, + "grad_norm": 0.12890625, + "learning_rate": 0.0015171598403798793, + "loss": 0.1318, + "step": 33927 + }, + { + "epoch": 0.2945113323669065, + "grad_norm": 0.578125, + "learning_rate": 0.0015171334679008031, + "loss": 0.1084, + "step": 33928 + }, + { + "epoch": 0.2945200128471107, + "grad_norm": 0.1328125, + "learning_rate": 0.0015171070949655558, + "loss": 0.0938, + "step": 33929 + }, + { + "epoch": 0.29452869332731485, + "grad_norm": 0.56640625, + "learning_rate": 0.0015170807215741668, + "loss": 0.1133, + "step": 33930 + }, + { + "epoch": 0.29453737380751904, + "grad_norm": 0.490234375, + "learning_rate": 0.0015170543477266644, + "loss": 0.1167, + "step": 33931 + }, + { + "epoch": 0.2945460542877232, + "grad_norm": 0.431640625, + "learning_rate": 0.0015170279734230776, + "loss": 0.1055, + "step": 33932 + }, + { + "epoch": 0.2945547347679274, + "grad_norm": 0.47265625, + "learning_rate": 0.0015170015986634353, + "loss": 0.127, + "step": 33933 + }, + { + "epoch": 0.2945634152481315, + "grad_norm": 0.609375, + "learning_rate": 0.0015169752234477662, + "loss": 0.0928, + "step": 33934 + }, + { + "epoch": 0.2945720957283357, + "grad_norm": 0.150390625, + "learning_rate": 0.0015169488477760995, + "loss": 0.1226, + "step": 33935 + }, + { + "epoch": 0.29458077620853984, + "grad_norm": 0.640625, + "learning_rate": 0.0015169224716484635, + "loss": 0.0801, + "step": 33936 + }, + { + "epoch": 0.29458945668874403, + "grad_norm": 0.6953125, + "learning_rate": 0.0015168960950648876, + "loss": 0.0854, + "step": 33937 + }, + { + "epoch": 0.29459813716894817, + "grad_norm": 0.2490234375, + "learning_rate": 0.0015168697180254003, + "loss": 0.0977, + "step": 33938 + }, + { + "epoch": 0.29460681764915236, + "grad_norm": 0.181640625, + "learning_rate": 0.0015168433405300305, + "loss": 0.1152, + "step": 33939 + }, + { + "epoch": 0.2946154981293565, + "grad_norm": 0.5859375, + "learning_rate": 0.0015168169625788073, + "loss": 0.1738, + "step": 33940 + }, + { + "epoch": 0.2946241786095607, + "grad_norm": 0.1669921875, + "learning_rate": 0.001516790584171759, + "loss": 0.105, + "step": 33941 + }, + { + "epoch": 0.29463285908976483, + "grad_norm": 0.078125, + "learning_rate": 0.0015167642053089151, + "loss": 0.0801, + "step": 33942 + }, + { + "epoch": 0.294641539569969, + "grad_norm": 0.609375, + "learning_rate": 0.001516737825990304, + "loss": 0.126, + "step": 33943 + }, + { + "epoch": 0.29465022005017316, + "grad_norm": 0.09619140625, + "learning_rate": 0.0015167114462159546, + "loss": 0.0874, + "step": 33944 + }, + { + "epoch": 0.29465890053037735, + "grad_norm": 0.11083984375, + "learning_rate": 0.0015166850659858958, + "loss": 0.0928, + "step": 33945 + }, + { + 
"epoch": 0.2946675810105815, + "grad_norm": 0.408203125, + "learning_rate": 0.0015166586853001564, + "loss": 0.0811, + "step": 33946 + }, + { + "epoch": 0.2946762614907857, + "grad_norm": 0.287109375, + "learning_rate": 0.0015166323041587654, + "loss": 0.1206, + "step": 33947 + }, + { + "epoch": 0.2946849419709898, + "grad_norm": 0.248046875, + "learning_rate": 0.001516605922561752, + "loss": 0.0986, + "step": 33948 + }, + { + "epoch": 0.294693622451194, + "grad_norm": 0.51171875, + "learning_rate": 0.001516579540509144, + "loss": 0.0908, + "step": 33949 + }, + { + "epoch": 0.29470230293139815, + "grad_norm": 0.07421875, + "learning_rate": 0.0015165531580009712, + "loss": 0.082, + "step": 33950 + }, + { + "epoch": 0.29471098341160235, + "grad_norm": 0.1376953125, + "learning_rate": 0.0015165267750372623, + "loss": 0.1118, + "step": 33951 + }, + { + "epoch": 0.2947196638918065, + "grad_norm": 0.20703125, + "learning_rate": 0.0015165003916180458, + "loss": 0.1162, + "step": 33952 + }, + { + "epoch": 0.2947283443720107, + "grad_norm": 0.392578125, + "learning_rate": 0.0015164740077433505, + "loss": 0.1152, + "step": 33953 + }, + { + "epoch": 0.2947370248522148, + "grad_norm": 0.26953125, + "learning_rate": 0.0015164476234132054, + "loss": 0.0835, + "step": 33954 + }, + { + "epoch": 0.294745705332419, + "grad_norm": 0.67578125, + "learning_rate": 0.00151642123862764, + "loss": 0.1201, + "step": 33955 + }, + { + "epoch": 0.29475438581262314, + "grad_norm": 0.1865234375, + "learning_rate": 0.0015163948533866822, + "loss": 0.1113, + "step": 33956 + }, + { + "epoch": 0.29476306629282734, + "grad_norm": 0.1728515625, + "learning_rate": 0.0015163684676903612, + "loss": 0.1138, + "step": 33957 + }, + { + "epoch": 0.2947717467730315, + "grad_norm": 0.34765625, + "learning_rate": 0.001516342081538706, + "loss": 0.0981, + "step": 33958 + }, + { + "epoch": 0.29478042725323567, + "grad_norm": 0.08056640625, + "learning_rate": 0.0015163156949317457, + "loss": 0.0854, + "step": 33959 + }, + { + "epoch": 0.2947891077334398, + "grad_norm": 0.34375, + "learning_rate": 0.0015162893078695084, + "loss": 0.0967, + "step": 33960 + }, + { + "epoch": 0.294797788213644, + "grad_norm": 0.50390625, + "learning_rate": 0.0015162629203520235, + "loss": 0.124, + "step": 33961 + }, + { + "epoch": 0.29480646869384813, + "grad_norm": 0.130859375, + "learning_rate": 0.0015162365323793195, + "loss": 0.1123, + "step": 33962 + }, + { + "epoch": 0.2948151491740523, + "grad_norm": 0.087890625, + "learning_rate": 0.0015162101439514257, + "loss": 0.1001, + "step": 33963 + }, + { + "epoch": 0.29482382965425646, + "grad_norm": 0.138671875, + "learning_rate": 0.0015161837550683706, + "loss": 0.0923, + "step": 33964 + }, + { + "epoch": 0.29483251013446066, + "grad_norm": 0.67578125, + "learning_rate": 0.0015161573657301835, + "loss": 0.0908, + "step": 33965 + }, + { + "epoch": 0.2948411906146648, + "grad_norm": 0.31640625, + "learning_rate": 0.0015161309759368926, + "loss": 0.0947, + "step": 33966 + }, + { + "epoch": 0.294849871094869, + "grad_norm": 0.494140625, + "learning_rate": 0.001516104585688527, + "loss": 0.1094, + "step": 33967 + }, + { + "epoch": 0.2948585515750731, + "grad_norm": 0.275390625, + "learning_rate": 0.0015160781949851163, + "loss": 0.0771, + "step": 33968 + }, + { + "epoch": 0.2948672320552773, + "grad_norm": 0.111328125, + "learning_rate": 0.0015160518038266884, + "loss": 0.0845, + "step": 33969 + }, + { + "epoch": 0.29487591253548145, + "grad_norm": 0.2255859375, + "learning_rate": 0.0015160254122132727, + "loss": 
0.1064, + "step": 33970 + }, + { + "epoch": 0.29488459301568565, + "grad_norm": 0.4921875, + "learning_rate": 0.0015159990201448973, + "loss": 0.0923, + "step": 33971 + }, + { + "epoch": 0.2948932734958898, + "grad_norm": 0.123046875, + "learning_rate": 0.0015159726276215918, + "loss": 0.0864, + "step": 33972 + }, + { + "epoch": 0.294901953976094, + "grad_norm": 0.18359375, + "learning_rate": 0.0015159462346433852, + "loss": 0.1187, + "step": 33973 + }, + { + "epoch": 0.2949106344562981, + "grad_norm": 0.57421875, + "learning_rate": 0.001515919841210306, + "loss": 0.0898, + "step": 33974 + }, + { + "epoch": 0.2949193149365023, + "grad_norm": 0.427734375, + "learning_rate": 0.0015158934473223828, + "loss": 0.0928, + "step": 33975 + }, + { + "epoch": 0.29492799541670645, + "grad_norm": 0.1865234375, + "learning_rate": 0.0015158670529796451, + "loss": 0.1133, + "step": 33976 + }, + { + "epoch": 0.29493667589691064, + "grad_norm": 0.1806640625, + "learning_rate": 0.0015158406581821213, + "loss": 0.1094, + "step": 33977 + }, + { + "epoch": 0.2949453563771148, + "grad_norm": 0.0869140625, + "learning_rate": 0.0015158142629298405, + "loss": 0.1182, + "step": 33978 + }, + { + "epoch": 0.29495403685731897, + "grad_norm": 0.1640625, + "learning_rate": 0.0015157878672228315, + "loss": 0.1104, + "step": 33979 + }, + { + "epoch": 0.2949627173375231, + "grad_norm": 0.2021484375, + "learning_rate": 0.0015157614710611229, + "loss": 0.167, + "step": 33980 + }, + { + "epoch": 0.2949713978177273, + "grad_norm": 0.515625, + "learning_rate": 0.0015157350744447442, + "loss": 0.0957, + "step": 33981 + }, + { + "epoch": 0.29498007829793144, + "grad_norm": 0.18359375, + "learning_rate": 0.0015157086773737235, + "loss": 0.127, + "step": 33982 + }, + { + "epoch": 0.29498875877813563, + "grad_norm": 0.1298828125, + "learning_rate": 0.00151568227984809, + "loss": 0.1035, + "step": 33983 + }, + { + "epoch": 0.29499743925833977, + "grad_norm": 0.1640625, + "learning_rate": 0.0015156558818678726, + "loss": 0.106, + "step": 33984 + }, + { + "epoch": 0.29500611973854396, + "grad_norm": 0.126953125, + "learning_rate": 0.0015156294834331005, + "loss": 0.0781, + "step": 33985 + }, + { + "epoch": 0.2950148002187481, + "grad_norm": 0.263671875, + "learning_rate": 0.0015156030845438022, + "loss": 0.1025, + "step": 33986 + }, + { + "epoch": 0.2950234806989523, + "grad_norm": 0.380859375, + "learning_rate": 0.0015155766852000066, + "loss": 0.0938, + "step": 33987 + }, + { + "epoch": 0.2950321611791564, + "grad_norm": 0.1787109375, + "learning_rate": 0.0015155502854017424, + "loss": 0.1514, + "step": 33988 + }, + { + "epoch": 0.2950408416593606, + "grad_norm": 0.259765625, + "learning_rate": 0.0015155238851490387, + "loss": 0.0664, + "step": 33989 + }, + { + "epoch": 0.29504952213956476, + "grad_norm": 0.185546875, + "learning_rate": 0.0015154974844419245, + "loss": 0.1172, + "step": 33990 + }, + { + "epoch": 0.29505820261976895, + "grad_norm": 0.41015625, + "learning_rate": 0.001515471083280428, + "loss": 0.1621, + "step": 33991 + }, + { + "epoch": 0.2950668830999731, + "grad_norm": 0.32421875, + "learning_rate": 0.001515444681664579, + "loss": 0.1011, + "step": 33992 + }, + { + "epoch": 0.2950755635801773, + "grad_norm": 0.482421875, + "learning_rate": 0.0015154182795944057, + "loss": 0.1406, + "step": 33993 + }, + { + "epoch": 0.2950842440603814, + "grad_norm": 0.08544921875, + "learning_rate": 0.0015153918770699373, + "loss": 0.0947, + "step": 33994 + }, + { + "epoch": 0.2950929245405856, + "grad_norm": 0.55078125, + 
"learning_rate": 0.0015153654740912028, + "loss": 0.1484, + "step": 33995 + }, + { + "epoch": 0.29510160502078975, + "grad_norm": 0.154296875, + "learning_rate": 0.0015153390706582309, + "loss": 0.0732, + "step": 33996 + }, + { + "epoch": 0.29511028550099394, + "grad_norm": 0.216796875, + "learning_rate": 0.00151531266677105, + "loss": 0.1167, + "step": 33997 + }, + { + "epoch": 0.2951189659811981, + "grad_norm": 0.12109375, + "learning_rate": 0.0015152862624296898, + "loss": 0.1289, + "step": 33998 + }, + { + "epoch": 0.29512764646140227, + "grad_norm": 0.318359375, + "learning_rate": 0.0015152598576341784, + "loss": 0.0947, + "step": 33999 + }, + { + "epoch": 0.2951363269416064, + "grad_norm": 0.185546875, + "learning_rate": 0.0015152334523845451, + "loss": 0.1523, + "step": 34000 + }, + { + "epoch": 0.2951450074218106, + "grad_norm": 0.51171875, + "learning_rate": 0.0015152070466808191, + "loss": 0.1777, + "step": 34001 + }, + { + "epoch": 0.29515368790201474, + "grad_norm": 0.515625, + "learning_rate": 0.0015151806405230286, + "loss": 0.123, + "step": 34002 + }, + { + "epoch": 0.29516236838221893, + "grad_norm": 0.24609375, + "learning_rate": 0.001515154233911203, + "loss": 0.1494, + "step": 34003 + }, + { + "epoch": 0.29517104886242307, + "grad_norm": 0.1484375, + "learning_rate": 0.0015151278268453708, + "loss": 0.1035, + "step": 34004 + }, + { + "epoch": 0.29517972934262726, + "grad_norm": 0.51171875, + "learning_rate": 0.0015151014193255613, + "loss": 0.085, + "step": 34005 + }, + { + "epoch": 0.2951884098228314, + "grad_norm": 0.19140625, + "learning_rate": 0.0015150750113518028, + "loss": 0.0684, + "step": 34006 + }, + { + "epoch": 0.29519709030303554, + "grad_norm": 0.494140625, + "learning_rate": 0.0015150486029241246, + "loss": 0.1465, + "step": 34007 + }, + { + "epoch": 0.29520577078323973, + "grad_norm": 0.1767578125, + "learning_rate": 0.0015150221940425557, + "loss": 0.125, + "step": 34008 + }, + { + "epoch": 0.29521445126344387, + "grad_norm": 0.8671875, + "learning_rate": 0.0015149957847071245, + "loss": 0.1108, + "step": 34009 + }, + { + "epoch": 0.29522313174364806, + "grad_norm": 0.0849609375, + "learning_rate": 0.0015149693749178603, + "loss": 0.1133, + "step": 34010 + }, + { + "epoch": 0.2952318122238522, + "grad_norm": 0.69921875, + "learning_rate": 0.0015149429646747917, + "loss": 0.1123, + "step": 34011 + }, + { + "epoch": 0.2952404927040564, + "grad_norm": 0.1689453125, + "learning_rate": 0.0015149165539779478, + "loss": 0.125, + "step": 34012 + }, + { + "epoch": 0.2952491731842605, + "grad_norm": 0.484375, + "learning_rate": 0.0015148901428273574, + "loss": 0.0894, + "step": 34013 + }, + { + "epoch": 0.2952578536644647, + "grad_norm": 1.3671875, + "learning_rate": 0.001514863731223049, + "loss": 0.1357, + "step": 34014 + }, + { + "epoch": 0.29526653414466886, + "grad_norm": 0.3671875, + "learning_rate": 0.0015148373191650525, + "loss": 0.1465, + "step": 34015 + }, + { + "epoch": 0.29527521462487305, + "grad_norm": 0.302734375, + "learning_rate": 0.001514810906653396, + "loss": 0.1025, + "step": 34016 + }, + { + "epoch": 0.2952838951050772, + "grad_norm": 0.671875, + "learning_rate": 0.0015147844936881083, + "loss": 0.1484, + "step": 34017 + }, + { + "epoch": 0.2952925755852814, + "grad_norm": 0.08203125, + "learning_rate": 0.0015147580802692187, + "loss": 0.0977, + "step": 34018 + }, + { + "epoch": 0.2953012560654855, + "grad_norm": 0.478515625, + "learning_rate": 0.0015147316663967553, + "loss": 0.1533, + "step": 34019 + }, + { + "epoch": 0.2953099365456897, + 
"grad_norm": 0.1796875, + "learning_rate": 0.001514705252070748, + "loss": 0.084, + "step": 34020 + }, + { + "epoch": 0.29531861702589385, + "grad_norm": 0.796875, + "learning_rate": 0.0015146788372912257, + "loss": 0.0996, + "step": 34021 + }, + { + "epoch": 0.29532729750609804, + "grad_norm": 0.314453125, + "learning_rate": 0.0015146524220582163, + "loss": 0.105, + "step": 34022 + }, + { + "epoch": 0.2953359779863022, + "grad_norm": 0.373046875, + "learning_rate": 0.001514626006371749, + "loss": 0.1235, + "step": 34023 + }, + { + "epoch": 0.29534465846650637, + "grad_norm": 1.078125, + "learning_rate": 0.0015145995902318538, + "loss": 0.4238, + "step": 34024 + }, + { + "epoch": 0.2953533389467105, + "grad_norm": 0.37890625, + "learning_rate": 0.0015145731736385581, + "loss": 0.0918, + "step": 34025 + }, + { + "epoch": 0.2953620194269147, + "grad_norm": 0.140625, + "learning_rate": 0.0015145467565918919, + "loss": 0.1201, + "step": 34026 + }, + { + "epoch": 0.29537069990711884, + "grad_norm": 0.240234375, + "learning_rate": 0.001514520339091883, + "loss": 0.1069, + "step": 34027 + }, + { + "epoch": 0.29537938038732303, + "grad_norm": 0.326171875, + "learning_rate": 0.001514493921138561, + "loss": 0.1338, + "step": 34028 + }, + { + "epoch": 0.29538806086752717, + "grad_norm": 0.337890625, + "learning_rate": 0.0015144675027319553, + "loss": 0.0908, + "step": 34029 + }, + { + "epoch": 0.29539674134773136, + "grad_norm": 0.34375, + "learning_rate": 0.0015144410838720933, + "loss": 0.1074, + "step": 34030 + }, + { + "epoch": 0.2954054218279355, + "grad_norm": 0.357421875, + "learning_rate": 0.0015144146645590052, + "loss": 0.1289, + "step": 34031 + }, + { + "epoch": 0.2954141023081397, + "grad_norm": 0.58984375, + "learning_rate": 0.0015143882447927193, + "loss": 0.1172, + "step": 34032 + }, + { + "epoch": 0.29542278278834383, + "grad_norm": 0.39453125, + "learning_rate": 0.0015143618245732648, + "loss": 0.0859, + "step": 34033 + }, + { + "epoch": 0.295431463268548, + "grad_norm": 0.2080078125, + "learning_rate": 0.0015143354039006702, + "loss": 0.1006, + "step": 34034 + }, + { + "epoch": 0.29544014374875216, + "grad_norm": 0.302734375, + "learning_rate": 0.001514308982774965, + "loss": 0.1064, + "step": 34035 + }, + { + "epoch": 0.29544882422895635, + "grad_norm": 0.259765625, + "learning_rate": 0.0015142825611961775, + "loss": 0.1416, + "step": 34036 + }, + { + "epoch": 0.2954575047091605, + "grad_norm": 0.34765625, + "learning_rate": 0.001514256139164337, + "loss": 0.1475, + "step": 34037 + }, + { + "epoch": 0.2954661851893647, + "grad_norm": 0.07763671875, + "learning_rate": 0.0015142297166794718, + "loss": 0.1118, + "step": 34038 + }, + { + "epoch": 0.2954748656695688, + "grad_norm": 0.53515625, + "learning_rate": 0.0015142032937416115, + "loss": 0.0742, + "step": 34039 + }, + { + "epoch": 0.295483546149773, + "grad_norm": 0.1923828125, + "learning_rate": 0.0015141768703507846, + "loss": 0.0742, + "step": 34040 + }, + { + "epoch": 0.29549222662997715, + "grad_norm": 0.3203125, + "learning_rate": 0.00151415044650702, + "loss": 0.0869, + "step": 34041 + }, + { + "epoch": 0.29550090711018134, + "grad_norm": 0.2197265625, + "learning_rate": 0.001514124022210347, + "loss": 0.0918, + "step": 34042 + }, + { + "epoch": 0.2955095875903855, + "grad_norm": 0.134765625, + "learning_rate": 0.001514097597460794, + "loss": 0.1104, + "step": 34043 + }, + { + "epoch": 0.2955182680705897, + "grad_norm": 0.6484375, + "learning_rate": 0.0015140711722583901, + "loss": 0.0947, + "step": 34044 + }, + { + 
"epoch": 0.2955269485507938, + "grad_norm": 0.703125, + "learning_rate": 0.0015140447466031643, + "loss": 0.1201, + "step": 34045 + }, + { + "epoch": 0.295535629030998, + "grad_norm": 0.5546875, + "learning_rate": 0.0015140183204951454, + "loss": 0.0928, + "step": 34046 + }, + { + "epoch": 0.29554430951120214, + "grad_norm": 0.0693359375, + "learning_rate": 0.001513991893934362, + "loss": 0.0815, + "step": 34047 + }, + { + "epoch": 0.29555298999140633, + "grad_norm": 0.69140625, + "learning_rate": 0.0015139654669208435, + "loss": 0.1025, + "step": 34048 + }, + { + "epoch": 0.29556167047161047, + "grad_norm": 0.28125, + "learning_rate": 0.0015139390394546186, + "loss": 0.1348, + "step": 34049 + }, + { + "epoch": 0.29557035095181466, + "grad_norm": 0.27734375, + "learning_rate": 0.001513912611535716, + "loss": 0.1094, + "step": 34050 + }, + { + "epoch": 0.2955790314320188, + "grad_norm": 0.12109375, + "learning_rate": 0.0015138861831641651, + "loss": 0.1011, + "step": 34051 + }, + { + "epoch": 0.295587711912223, + "grad_norm": 0.1337890625, + "learning_rate": 0.0015138597543399943, + "loss": 0.127, + "step": 34052 + }, + { + "epoch": 0.29559639239242713, + "grad_norm": 0.25390625, + "learning_rate": 0.0015138333250632324, + "loss": 0.0825, + "step": 34053 + }, + { + "epoch": 0.2956050728726313, + "grad_norm": 0.1923828125, + "learning_rate": 0.001513806895333909, + "loss": 0.1191, + "step": 34054 + }, + { + "epoch": 0.29561375335283546, + "grad_norm": 0.1884765625, + "learning_rate": 0.0015137804651520527, + "loss": 0.1201, + "step": 34055 + }, + { + "epoch": 0.29562243383303966, + "grad_norm": 0.392578125, + "learning_rate": 0.001513754034517692, + "loss": 0.1074, + "step": 34056 + }, + { + "epoch": 0.2956311143132438, + "grad_norm": 0.318359375, + "learning_rate": 0.0015137276034308561, + "loss": 0.0923, + "step": 34057 + }, + { + "epoch": 0.295639794793448, + "grad_norm": 0.9609375, + "learning_rate": 0.0015137011718915742, + "loss": 0.1348, + "step": 34058 + }, + { + "epoch": 0.2956484752736521, + "grad_norm": 0.2138671875, + "learning_rate": 0.0015136747398998745, + "loss": 0.1084, + "step": 34059 + }, + { + "epoch": 0.2956571557538563, + "grad_norm": 0.1611328125, + "learning_rate": 0.0015136483074557868, + "loss": 0.1211, + "step": 34060 + }, + { + "epoch": 0.29566583623406045, + "grad_norm": 0.1416015625, + "learning_rate": 0.0015136218745593394, + "loss": 0.1406, + "step": 34061 + }, + { + "epoch": 0.29567451671426465, + "grad_norm": 0.12255859375, + "learning_rate": 0.0015135954412105609, + "loss": 0.1016, + "step": 34062 + }, + { + "epoch": 0.2956831971944688, + "grad_norm": 0.265625, + "learning_rate": 0.001513569007409481, + "loss": 0.1177, + "step": 34063 + }, + { + "epoch": 0.295691877674673, + "grad_norm": 0.2275390625, + "learning_rate": 0.0015135425731561281, + "loss": 0.0923, + "step": 34064 + }, + { + "epoch": 0.2957005581548771, + "grad_norm": 0.2294921875, + "learning_rate": 0.0015135161384505313, + "loss": 0.1562, + "step": 34065 + }, + { + "epoch": 0.2957092386350813, + "grad_norm": 0.171875, + "learning_rate": 0.0015134897032927194, + "loss": 0.0762, + "step": 34066 + }, + { + "epoch": 0.29571791911528544, + "grad_norm": 0.6328125, + "learning_rate": 0.0015134632676827215, + "loss": 0.1069, + "step": 34067 + }, + { + "epoch": 0.29572659959548964, + "grad_norm": 0.08935546875, + "learning_rate": 0.0015134368316205664, + "loss": 0.0889, + "step": 34068 + }, + { + "epoch": 0.2957352800756938, + "grad_norm": 0.0888671875, + "learning_rate": 0.001513410395106283, + 
"loss": 0.1089, + "step": 34069 + }, + { + "epoch": 0.29574396055589797, + "grad_norm": 0.7734375, + "learning_rate": 0.0015133839581399002, + "loss": 0.0962, + "step": 34070 + }, + { + "epoch": 0.2957526410361021, + "grad_norm": 0.10888671875, + "learning_rate": 0.001513357520721447, + "loss": 0.1348, + "step": 34071 + }, + { + "epoch": 0.2957613215163063, + "grad_norm": 0.263671875, + "learning_rate": 0.0015133310828509521, + "loss": 0.1123, + "step": 34072 + }, + { + "epoch": 0.29577000199651043, + "grad_norm": 0.271484375, + "learning_rate": 0.0015133046445284443, + "loss": 0.123, + "step": 34073 + }, + { + "epoch": 0.2957786824767146, + "grad_norm": 0.2080078125, + "learning_rate": 0.0015132782057539532, + "loss": 0.0918, + "step": 34074 + }, + { + "epoch": 0.29578736295691876, + "grad_norm": 0.828125, + "learning_rate": 0.001513251766527507, + "loss": 0.1152, + "step": 34075 + }, + { + "epoch": 0.29579604343712296, + "grad_norm": 0.16796875, + "learning_rate": 0.0015132253268491352, + "loss": 0.0986, + "step": 34076 + }, + { + "epoch": 0.2958047239173271, + "grad_norm": 0.1416015625, + "learning_rate": 0.001513198886718866, + "loss": 0.0781, + "step": 34077 + }, + { + "epoch": 0.2958134043975313, + "grad_norm": 0.126953125, + "learning_rate": 0.0015131724461367284, + "loss": 0.1035, + "step": 34078 + }, + { + "epoch": 0.2958220848777354, + "grad_norm": 0.462890625, + "learning_rate": 0.0015131460051027521, + "loss": 0.0879, + "step": 34079 + }, + { + "epoch": 0.2958307653579396, + "grad_norm": 0.1796875, + "learning_rate": 0.0015131195636169657, + "loss": 0.1387, + "step": 34080 + }, + { + "epoch": 0.29583944583814376, + "grad_norm": 0.3828125, + "learning_rate": 0.0015130931216793978, + "loss": 0.0845, + "step": 34081 + }, + { + "epoch": 0.29584812631834795, + "grad_norm": 0.07568359375, + "learning_rate": 0.0015130666792900773, + "loss": 0.0918, + "step": 34082 + }, + { + "epoch": 0.2958568067985521, + "grad_norm": 0.1884765625, + "learning_rate": 0.0015130402364490333, + "loss": 0.0659, + "step": 34083 + }, + { + "epoch": 0.2958654872787563, + "grad_norm": 0.1748046875, + "learning_rate": 0.001513013793156295, + "loss": 0.1006, + "step": 34084 + }, + { + "epoch": 0.2958741677589604, + "grad_norm": 0.11376953125, + "learning_rate": 0.0015129873494118908, + "loss": 0.0977, + "step": 34085 + }, + { + "epoch": 0.2958828482391646, + "grad_norm": 0.099609375, + "learning_rate": 0.00151296090521585, + "loss": 0.0933, + "step": 34086 + }, + { + "epoch": 0.29589152871936875, + "grad_norm": 0.390625, + "learning_rate": 0.001512934460568201, + "loss": 0.1689, + "step": 34087 + }, + { + "epoch": 0.29590020919957294, + "grad_norm": 0.83203125, + "learning_rate": 0.0015129080154689732, + "loss": 0.1299, + "step": 34088 + }, + { + "epoch": 0.2959088896797771, + "grad_norm": 0.41796875, + "learning_rate": 0.0015128815699181955, + "loss": 0.1079, + "step": 34089 + }, + { + "epoch": 0.29591757015998127, + "grad_norm": 0.1591796875, + "learning_rate": 0.0015128551239158967, + "loss": 0.0713, + "step": 34090 + }, + { + "epoch": 0.2959262506401854, + "grad_norm": 0.68359375, + "learning_rate": 0.0015128286774621059, + "loss": 0.0947, + "step": 34091 + }, + { + "epoch": 0.2959349311203896, + "grad_norm": 0.51953125, + "learning_rate": 0.0015128022305568515, + "loss": 0.082, + "step": 34092 + }, + { + "epoch": 0.29594361160059374, + "grad_norm": 0.34765625, + "learning_rate": 0.0015127757832001632, + "loss": 0.1484, + "step": 34093 + }, + { + "epoch": 0.29595229208079793, + "grad_norm": 0.3203125, + 
"learning_rate": 0.0015127493353920695, + "loss": 0.0962, + "step": 34094 + }, + { + "epoch": 0.29596097256100207, + "grad_norm": 0.095703125, + "learning_rate": 0.0015127228871325991, + "loss": 0.0952, + "step": 34095 + }, + { + "epoch": 0.29596965304120626, + "grad_norm": 0.263671875, + "learning_rate": 0.0015126964384217809, + "loss": 0.0713, + "step": 34096 + }, + { + "epoch": 0.2959783335214104, + "grad_norm": 0.09033203125, + "learning_rate": 0.0015126699892596444, + "loss": 0.0889, + "step": 34097 + }, + { + "epoch": 0.2959870140016146, + "grad_norm": 0.35546875, + "learning_rate": 0.0015126435396462178, + "loss": 0.1074, + "step": 34098 + }, + { + "epoch": 0.2959956944818187, + "grad_norm": 0.1142578125, + "learning_rate": 0.0015126170895815312, + "loss": 0.1074, + "step": 34099 + }, + { + "epoch": 0.2960043749620229, + "grad_norm": 0.107421875, + "learning_rate": 0.0015125906390656121, + "loss": 0.0938, + "step": 34100 + }, + { + "epoch": 0.29601305544222706, + "grad_norm": 0.1455078125, + "learning_rate": 0.0015125641880984904, + "loss": 0.1016, + "step": 34101 + }, + { + "epoch": 0.29602173592243125, + "grad_norm": 0.07763671875, + "learning_rate": 0.0015125377366801947, + "loss": 0.0718, + "step": 34102 + }, + { + "epoch": 0.2960304164026354, + "grad_norm": 0.240234375, + "learning_rate": 0.0015125112848107538, + "loss": 0.0996, + "step": 34103 + }, + { + "epoch": 0.2960390968828396, + "grad_norm": 0.12353515625, + "learning_rate": 0.0015124848324901967, + "loss": 0.0957, + "step": 34104 + }, + { + "epoch": 0.2960477773630437, + "grad_norm": 0.345703125, + "learning_rate": 0.0015124583797185526, + "loss": 0.1201, + "step": 34105 + }, + { + "epoch": 0.2960564578432479, + "grad_norm": 0.376953125, + "learning_rate": 0.0015124319264958502, + "loss": 0.0938, + "step": 34106 + }, + { + "epoch": 0.29606513832345205, + "grad_norm": 0.11767578125, + "learning_rate": 0.001512405472822118, + "loss": 0.1152, + "step": 34107 + }, + { + "epoch": 0.29607381880365624, + "grad_norm": 0.197265625, + "learning_rate": 0.0015123790186973858, + "loss": 0.1504, + "step": 34108 + }, + { + "epoch": 0.2960824992838604, + "grad_norm": 0.208984375, + "learning_rate": 0.0015123525641216819, + "loss": 0.1182, + "step": 34109 + }, + { + "epoch": 0.29609117976406457, + "grad_norm": 0.380859375, + "learning_rate": 0.0015123261090950356, + "loss": 0.084, + "step": 34110 + }, + { + "epoch": 0.2960998602442687, + "grad_norm": 0.08642578125, + "learning_rate": 0.0015122996536174756, + "loss": 0.0894, + "step": 34111 + }, + { + "epoch": 0.2961085407244729, + "grad_norm": 0.578125, + "learning_rate": 0.001512273197689031, + "loss": 0.0933, + "step": 34112 + }, + { + "epoch": 0.29611722120467704, + "grad_norm": 0.11083984375, + "learning_rate": 0.0015122467413097305, + "loss": 0.1211, + "step": 34113 + }, + { + "epoch": 0.29612590168488123, + "grad_norm": 0.1044921875, + "learning_rate": 0.0015122202844796032, + "loss": 0.1182, + "step": 34114 + }, + { + "epoch": 0.29613458216508537, + "grad_norm": 0.10595703125, + "learning_rate": 0.0015121938271986778, + "loss": 0.1348, + "step": 34115 + }, + { + "epoch": 0.29614326264528956, + "grad_norm": 0.4921875, + "learning_rate": 0.0015121673694669835, + "loss": 0.1309, + "step": 34116 + }, + { + "epoch": 0.2961519431254937, + "grad_norm": 0.625, + "learning_rate": 0.0015121409112845495, + "loss": 0.1006, + "step": 34117 + }, + { + "epoch": 0.2961606236056979, + "grad_norm": 0.1953125, + "learning_rate": 0.001512114452651404, + "loss": 0.0913, + "step": 34118 + }, + { + 
"epoch": 0.29616930408590203, + "grad_norm": 0.1123046875, + "learning_rate": 0.0015120879935675764, + "loss": 0.0752, + "step": 34119 + }, + { + "epoch": 0.2961779845661062, + "grad_norm": 0.2099609375, + "learning_rate": 0.0015120615340330958, + "loss": 0.1113, + "step": 34120 + }, + { + "epoch": 0.29618666504631036, + "grad_norm": 0.1416015625, + "learning_rate": 0.001512035074047991, + "loss": 0.1196, + "step": 34121 + }, + { + "epoch": 0.29619534552651455, + "grad_norm": 0.1162109375, + "learning_rate": 0.0015120086136122903, + "loss": 0.1055, + "step": 34122 + }, + { + "epoch": 0.2962040260067187, + "grad_norm": 0.25390625, + "learning_rate": 0.0015119821527260233, + "loss": 0.1016, + "step": 34123 + }, + { + "epoch": 0.2962127064869229, + "grad_norm": 0.119140625, + "learning_rate": 0.001511955691389219, + "loss": 0.0928, + "step": 34124 + }, + { + "epoch": 0.296221386967127, + "grad_norm": 0.255859375, + "learning_rate": 0.001511929229601906, + "loss": 0.0913, + "step": 34125 + }, + { + "epoch": 0.2962300674473312, + "grad_norm": 0.310546875, + "learning_rate": 0.0015119027673641135, + "loss": 0.1182, + "step": 34126 + }, + { + "epoch": 0.29623874792753535, + "grad_norm": 0.75390625, + "learning_rate": 0.0015118763046758701, + "loss": 0.1758, + "step": 34127 + }, + { + "epoch": 0.29624742840773954, + "grad_norm": 0.302734375, + "learning_rate": 0.0015118498415372056, + "loss": 0.085, + "step": 34128 + }, + { + "epoch": 0.2962561088879437, + "grad_norm": 0.4765625, + "learning_rate": 0.0015118233779481476, + "loss": 0.125, + "step": 34129 + }, + { + "epoch": 0.2962647893681478, + "grad_norm": 0.1884765625, + "learning_rate": 0.001511796913908726, + "loss": 0.0952, + "step": 34130 + }, + { + "epoch": 0.296273469848352, + "grad_norm": 0.138671875, + "learning_rate": 0.0015117704494189694, + "loss": 0.1172, + "step": 34131 + }, + { + "epoch": 0.29628215032855615, + "grad_norm": 0.27734375, + "learning_rate": 0.0015117439844789069, + "loss": 0.1064, + "step": 34132 + }, + { + "epoch": 0.29629083080876034, + "grad_norm": 0.2109375, + "learning_rate": 0.0015117175190885674, + "loss": 0.0835, + "step": 34133 + }, + { + "epoch": 0.2962995112889645, + "grad_norm": 0.53125, + "learning_rate": 0.0015116910532479801, + "loss": 0.1387, + "step": 34134 + }, + { + "epoch": 0.29630819176916867, + "grad_norm": 0.115234375, + "learning_rate": 0.0015116645869571732, + "loss": 0.1133, + "step": 34135 + }, + { + "epoch": 0.2963168722493728, + "grad_norm": 0.2080078125, + "learning_rate": 0.001511638120216176, + "loss": 0.0835, + "step": 34136 + }, + { + "epoch": 0.296325552729577, + "grad_norm": 0.44140625, + "learning_rate": 0.0015116116530250177, + "loss": 0.085, + "step": 34137 + }, + { + "epoch": 0.29633423320978114, + "grad_norm": 0.119140625, + "learning_rate": 0.0015115851853837272, + "loss": 0.1025, + "step": 34138 + }, + { + "epoch": 0.29634291368998533, + "grad_norm": 0.10009765625, + "learning_rate": 0.0015115587172923333, + "loss": 0.1113, + "step": 34139 + }, + { + "epoch": 0.29635159417018947, + "grad_norm": 0.201171875, + "learning_rate": 0.0015115322487508649, + "loss": 0.1025, + "step": 34140 + }, + { + "epoch": 0.29636027465039366, + "grad_norm": 0.279296875, + "learning_rate": 0.0015115057797593513, + "loss": 0.0898, + "step": 34141 + }, + { + "epoch": 0.2963689551305978, + "grad_norm": 0.1513671875, + "learning_rate": 0.0015114793103178205, + "loss": 0.0669, + "step": 34142 + }, + { + "epoch": 0.296377635610802, + "grad_norm": 0.150390625, + "learning_rate": 0.001511452840426303, + 
"loss": 0.1011, + "step": 34143 + }, + { + "epoch": 0.29638631609100613, + "grad_norm": 0.73828125, + "learning_rate": 0.001511426370084826, + "loss": 0.1455, + "step": 34144 + }, + { + "epoch": 0.2963949965712103, + "grad_norm": 0.140625, + "learning_rate": 0.0015113998992934198, + "loss": 0.1191, + "step": 34145 + }, + { + "epoch": 0.29640367705141446, + "grad_norm": 0.318359375, + "learning_rate": 0.0015113734280521125, + "loss": 0.1367, + "step": 34146 + }, + { + "epoch": 0.29641235753161865, + "grad_norm": 0.22265625, + "learning_rate": 0.0015113469563609338, + "loss": 0.0879, + "step": 34147 + }, + { + "epoch": 0.2964210380118228, + "grad_norm": 0.1181640625, + "learning_rate": 0.001511320484219912, + "loss": 0.1167, + "step": 34148 + }, + { + "epoch": 0.296429718492027, + "grad_norm": 0.08154296875, + "learning_rate": 0.0015112940116290762, + "loss": 0.1172, + "step": 34149 + }, + { + "epoch": 0.2964383989722311, + "grad_norm": 0.13671875, + "learning_rate": 0.001511267538588456, + "loss": 0.085, + "step": 34150 + }, + { + "epoch": 0.2964470794524353, + "grad_norm": 0.2080078125, + "learning_rate": 0.0015112410650980792, + "loss": 0.1152, + "step": 34151 + }, + { + "epoch": 0.29645575993263945, + "grad_norm": 0.1044921875, + "learning_rate": 0.0015112145911579757, + "loss": 0.1064, + "step": 34152 + }, + { + "epoch": 0.29646444041284364, + "grad_norm": 0.474609375, + "learning_rate": 0.0015111881167681738, + "loss": 0.082, + "step": 34153 + }, + { + "epoch": 0.2964731208930478, + "grad_norm": 0.2333984375, + "learning_rate": 0.0015111616419287031, + "loss": 0.1104, + "step": 34154 + }, + { + "epoch": 0.296481801373252, + "grad_norm": 0.46875, + "learning_rate": 0.001511135166639592, + "loss": 0.1465, + "step": 34155 + }, + { + "epoch": 0.2964904818534561, + "grad_norm": 0.267578125, + "learning_rate": 0.0015111086909008697, + "loss": 0.1191, + "step": 34156 + }, + { + "epoch": 0.2964991623336603, + "grad_norm": 0.11474609375, + "learning_rate": 0.001511082214712565, + "loss": 0.0869, + "step": 34157 + }, + { + "epoch": 0.29650784281386444, + "grad_norm": 0.09619140625, + "learning_rate": 0.001511055738074707, + "loss": 0.0659, + "step": 34158 + }, + { + "epoch": 0.29651652329406863, + "grad_norm": 0.380859375, + "learning_rate": 0.0015110292609873245, + "loss": 0.0918, + "step": 34159 + }, + { + "epoch": 0.29652520377427277, + "grad_norm": 0.2138671875, + "learning_rate": 0.0015110027834504469, + "loss": 0.125, + "step": 34160 + }, + { + "epoch": 0.29653388425447696, + "grad_norm": 0.19140625, + "learning_rate": 0.0015109763054641028, + "loss": 0.0879, + "step": 34161 + }, + { + "epoch": 0.2965425647346811, + "grad_norm": 0.1689453125, + "learning_rate": 0.001510949827028321, + "loss": 0.0996, + "step": 34162 + }, + { + "epoch": 0.2965512452148853, + "grad_norm": 0.55859375, + "learning_rate": 0.001510923348143131, + "loss": 0.106, + "step": 34163 + }, + { + "epoch": 0.29655992569508943, + "grad_norm": 0.322265625, + "learning_rate": 0.001510896868808561, + "loss": 0.0967, + "step": 34164 + }, + { + "epoch": 0.2965686061752936, + "grad_norm": 0.1318359375, + "learning_rate": 0.0015108703890246405, + "loss": 0.0928, + "step": 34165 + }, + { + "epoch": 0.29657728665549776, + "grad_norm": 0.5390625, + "learning_rate": 0.0015108439087913983, + "loss": 0.1279, + "step": 34166 + }, + { + "epoch": 0.29658596713570196, + "grad_norm": 0.14453125, + "learning_rate": 0.0015108174281088633, + "loss": 0.0747, + "step": 34167 + }, + { + "epoch": 0.2965946476159061, + "grad_norm": 0.318359375, + 
"learning_rate": 0.0015107909469770644, + "loss": 0.1016, + "step": 34168 + }, + { + "epoch": 0.2966033280961103, + "grad_norm": 0.2734375, + "learning_rate": 0.0015107644653960313, + "loss": 0.1011, + "step": 34169 + }, + { + "epoch": 0.2966120085763144, + "grad_norm": 0.25, + "learning_rate": 0.0015107379833657916, + "loss": 0.0859, + "step": 34170 + }, + { + "epoch": 0.2966206890565186, + "grad_norm": 0.86328125, + "learning_rate": 0.0015107115008863756, + "loss": 0.0928, + "step": 34171 + }, + { + "epoch": 0.29662936953672275, + "grad_norm": 0.79296875, + "learning_rate": 0.0015106850179578116, + "loss": 0.1289, + "step": 34172 + }, + { + "epoch": 0.29663805001692695, + "grad_norm": 0.376953125, + "learning_rate": 0.0015106585345801282, + "loss": 0.1484, + "step": 34173 + }, + { + "epoch": 0.2966467304971311, + "grad_norm": 0.71484375, + "learning_rate": 0.0015106320507533551, + "loss": 0.1553, + "step": 34174 + }, + { + "epoch": 0.2966554109773353, + "grad_norm": 0.263671875, + "learning_rate": 0.0015106055664775209, + "loss": 0.0908, + "step": 34175 + }, + { + "epoch": 0.2966640914575394, + "grad_norm": 0.58203125, + "learning_rate": 0.0015105790817526545, + "loss": 0.0742, + "step": 34176 + }, + { + "epoch": 0.2966727719377436, + "grad_norm": 0.388671875, + "learning_rate": 0.0015105525965787854, + "loss": 0.0908, + "step": 34177 + }, + { + "epoch": 0.29668145241794774, + "grad_norm": 0.35546875, + "learning_rate": 0.0015105261109559416, + "loss": 0.0684, + "step": 34178 + }, + { + "epoch": 0.29669013289815194, + "grad_norm": 0.1845703125, + "learning_rate": 0.001510499624884153, + "loss": 0.0957, + "step": 34179 + }, + { + "epoch": 0.2966988133783561, + "grad_norm": 0.134765625, + "learning_rate": 0.0015104731383634483, + "loss": 0.0825, + "step": 34180 + }, + { + "epoch": 0.29670749385856027, + "grad_norm": 0.1123046875, + "learning_rate": 0.001510446651393856, + "loss": 0.0977, + "step": 34181 + }, + { + "epoch": 0.2967161743387644, + "grad_norm": 0.8046875, + "learning_rate": 0.0015104201639754055, + "loss": 0.0996, + "step": 34182 + }, + { + "epoch": 0.2967248548189686, + "grad_norm": 0.466796875, + "learning_rate": 0.001510393676108126, + "loss": 0.1089, + "step": 34183 + }, + { + "epoch": 0.29673353529917273, + "grad_norm": 0.30078125, + "learning_rate": 0.0015103671877920455, + "loss": 0.1211, + "step": 34184 + }, + { + "epoch": 0.2967422157793769, + "grad_norm": 0.423828125, + "learning_rate": 0.001510340699027194, + "loss": 0.1309, + "step": 34185 + }, + { + "epoch": 0.29675089625958107, + "grad_norm": 0.216796875, + "learning_rate": 0.0015103142098136, + "loss": 0.0698, + "step": 34186 + }, + { + "epoch": 0.29675957673978526, + "grad_norm": 0.67578125, + "learning_rate": 0.0015102877201512925, + "loss": 0.1143, + "step": 34187 + }, + { + "epoch": 0.2967682572199894, + "grad_norm": 0.0869140625, + "learning_rate": 0.0015102612300403008, + "loss": 0.104, + "step": 34188 + }, + { + "epoch": 0.2967769377001936, + "grad_norm": 0.2412109375, + "learning_rate": 0.0015102347394806532, + "loss": 0.0835, + "step": 34189 + }, + { + "epoch": 0.2967856181803977, + "grad_norm": 0.1806640625, + "learning_rate": 0.0015102082484723795, + "loss": 0.1104, + "step": 34190 + }, + { + "epoch": 0.2967942986606019, + "grad_norm": 0.29296875, + "learning_rate": 0.0015101817570155076, + "loss": 0.1201, + "step": 34191 + }, + { + "epoch": 0.29680297914080606, + "grad_norm": 0.21484375, + "learning_rate": 0.0015101552651100676, + "loss": 0.0928, + "step": 34192 + }, + { + "epoch": 
0.29681165962101025, + "grad_norm": 0.318359375, + "learning_rate": 0.0015101287727560878, + "loss": 0.1055, + "step": 34193 + }, + { + "epoch": 0.2968203401012144, + "grad_norm": 0.390625, + "learning_rate": 0.0015101022799535973, + "loss": 0.1504, + "step": 34194 + }, + { + "epoch": 0.2968290205814186, + "grad_norm": 0.15625, + "learning_rate": 0.0015100757867026254, + "loss": 0.1001, + "step": 34195 + }, + { + "epoch": 0.2968377010616227, + "grad_norm": 0.40625, + "learning_rate": 0.0015100492930032001, + "loss": 0.1719, + "step": 34196 + }, + { + "epoch": 0.2968463815418269, + "grad_norm": 0.28125, + "learning_rate": 0.0015100227988553518, + "loss": 0.0815, + "step": 34197 + }, + { + "epoch": 0.29685506202203105, + "grad_norm": 0.6015625, + "learning_rate": 0.0015099963042591086, + "loss": 0.127, + "step": 34198 + }, + { + "epoch": 0.29686374250223524, + "grad_norm": 0.09326171875, + "learning_rate": 0.0015099698092144993, + "loss": 0.1245, + "step": 34199 + }, + { + "epoch": 0.2968724229824394, + "grad_norm": 0.62890625, + "learning_rate": 0.001509943313721553, + "loss": 0.0938, + "step": 34200 + }, + { + "epoch": 0.29688110346264357, + "grad_norm": 0.380859375, + "learning_rate": 0.0015099168177802992, + "loss": 0.0771, + "step": 34201 + }, + { + "epoch": 0.2968897839428477, + "grad_norm": 0.15234375, + "learning_rate": 0.0015098903213907665, + "loss": 0.1143, + "step": 34202 + }, + { + "epoch": 0.2968984644230519, + "grad_norm": 0.2041015625, + "learning_rate": 0.0015098638245529838, + "loss": 0.0967, + "step": 34203 + }, + { + "epoch": 0.29690714490325604, + "grad_norm": 0.38671875, + "learning_rate": 0.0015098373272669802, + "loss": 0.1309, + "step": 34204 + }, + { + "epoch": 0.29691582538346023, + "grad_norm": 0.58203125, + "learning_rate": 0.0015098108295327848, + "loss": 0.0986, + "step": 34205 + }, + { + "epoch": 0.29692450586366437, + "grad_norm": 0.66796875, + "learning_rate": 0.0015097843313504264, + "loss": 0.1084, + "step": 34206 + }, + { + "epoch": 0.29693318634386856, + "grad_norm": 0.1123046875, + "learning_rate": 0.001509757832719934, + "loss": 0.1108, + "step": 34207 + }, + { + "epoch": 0.2969418668240727, + "grad_norm": 0.33203125, + "learning_rate": 0.0015097313336413366, + "loss": 0.1182, + "step": 34208 + }, + { + "epoch": 0.2969505473042769, + "grad_norm": 0.33984375, + "learning_rate": 0.0015097048341146631, + "loss": 0.1016, + "step": 34209 + }, + { + "epoch": 0.29695922778448103, + "grad_norm": 0.51171875, + "learning_rate": 0.0015096783341399425, + "loss": 0.1426, + "step": 34210 + }, + { + "epoch": 0.2969679082646852, + "grad_norm": 0.2451171875, + "learning_rate": 0.001509651833717204, + "loss": 0.0962, + "step": 34211 + }, + { + "epoch": 0.29697658874488936, + "grad_norm": 0.072265625, + "learning_rate": 0.0015096253328464762, + "loss": 0.0752, + "step": 34212 + }, + { + "epoch": 0.29698526922509355, + "grad_norm": 0.16796875, + "learning_rate": 0.0015095988315277885, + "loss": 0.0889, + "step": 34213 + }, + { + "epoch": 0.2969939497052977, + "grad_norm": 0.0869140625, + "learning_rate": 0.0015095723297611694, + "loss": 0.0957, + "step": 34214 + }, + { + "epoch": 0.2970026301855019, + "grad_norm": 0.2255859375, + "learning_rate": 0.0015095458275466486, + "loss": 0.0723, + "step": 34215 + }, + { + "epoch": 0.297011310665706, + "grad_norm": 0.84765625, + "learning_rate": 0.001509519324884254, + "loss": 0.0986, + "step": 34216 + }, + { + "epoch": 0.2970199911459102, + "grad_norm": 0.126953125, + "learning_rate": 0.0015094928217740157, + "loss": 0.1074, + 
"step": 34217 + }, + { + "epoch": 0.29702867162611435, + "grad_norm": 0.56640625, + "learning_rate": 0.001509466318215962, + "loss": 0.1074, + "step": 34218 + }, + { + "epoch": 0.29703735210631854, + "grad_norm": 0.1025390625, + "learning_rate": 0.0015094398142101223, + "loss": 0.0947, + "step": 34219 + }, + { + "epoch": 0.2970460325865227, + "grad_norm": 0.1787109375, + "learning_rate": 0.0015094133097565251, + "loss": 0.1328, + "step": 34220 + }, + { + "epoch": 0.29705471306672687, + "grad_norm": 0.2578125, + "learning_rate": 0.0015093868048551998, + "loss": 0.1006, + "step": 34221 + }, + { + "epoch": 0.297063393546931, + "grad_norm": 0.6015625, + "learning_rate": 0.0015093602995061752, + "loss": 0.1074, + "step": 34222 + }, + { + "epoch": 0.2970720740271352, + "grad_norm": 0.1494140625, + "learning_rate": 0.0015093337937094804, + "loss": 0.1055, + "step": 34223 + }, + { + "epoch": 0.29708075450733934, + "grad_norm": 0.2099609375, + "learning_rate": 0.0015093072874651442, + "loss": 0.1123, + "step": 34224 + }, + { + "epoch": 0.29708943498754353, + "grad_norm": 0.1689453125, + "learning_rate": 0.0015092807807731959, + "loss": 0.1021, + "step": 34225 + }, + { + "epoch": 0.29709811546774767, + "grad_norm": 0.1005859375, + "learning_rate": 0.0015092542736336644, + "loss": 0.0552, + "step": 34226 + }, + { + "epoch": 0.29710679594795186, + "grad_norm": 0.41015625, + "learning_rate": 0.001509227766046578, + "loss": 0.0908, + "step": 34227 + }, + { + "epoch": 0.297115476428156, + "grad_norm": 0.546875, + "learning_rate": 0.0015092012580119668, + "loss": 0.1113, + "step": 34228 + }, + { + "epoch": 0.2971241569083602, + "grad_norm": 0.5078125, + "learning_rate": 0.001509174749529859, + "loss": 0.1045, + "step": 34229 + }, + { + "epoch": 0.29713283738856433, + "grad_norm": 0.09912109375, + "learning_rate": 0.001509148240600284, + "loss": 0.1416, + "step": 34230 + }, + { + "epoch": 0.2971415178687685, + "grad_norm": 0.41796875, + "learning_rate": 0.0015091217312232705, + "loss": 0.0889, + "step": 34231 + }, + { + "epoch": 0.29715019834897266, + "grad_norm": 0.185546875, + "learning_rate": 0.0015090952213988476, + "loss": 0.0986, + "step": 34232 + }, + { + "epoch": 0.29715887882917685, + "grad_norm": 0.18359375, + "learning_rate": 0.0015090687111270443, + "loss": 0.1484, + "step": 34233 + }, + { + "epoch": 0.297167559309381, + "grad_norm": 0.453125, + "learning_rate": 0.0015090422004078899, + "loss": 0.1416, + "step": 34234 + }, + { + "epoch": 0.2971762397895852, + "grad_norm": 0.3828125, + "learning_rate": 0.0015090156892414127, + "loss": 0.1484, + "step": 34235 + }, + { + "epoch": 0.2971849202697893, + "grad_norm": 0.5, + "learning_rate": 0.001508989177627642, + "loss": 0.1128, + "step": 34236 + }, + { + "epoch": 0.2971936007499935, + "grad_norm": 0.17578125, + "learning_rate": 0.0015089626655666072, + "loss": 0.104, + "step": 34237 + }, + { + "epoch": 0.29720228123019765, + "grad_norm": 0.48046875, + "learning_rate": 0.0015089361530583373, + "loss": 0.0898, + "step": 34238 + }, + { + "epoch": 0.29721096171040184, + "grad_norm": 0.41796875, + "learning_rate": 0.0015089096401028604, + "loss": 0.0898, + "step": 34239 + }, + { + "epoch": 0.297219642190606, + "grad_norm": 0.20703125, + "learning_rate": 0.001508883126700206, + "loss": 0.0879, + "step": 34240 + }, + { + "epoch": 0.2972283226708102, + "grad_norm": 0.14453125, + "learning_rate": 0.0015088566128504035, + "loss": 0.0879, + "step": 34241 + }, + { + "epoch": 0.2972370031510143, + "grad_norm": 0.10302734375, + "learning_rate": 
0.0015088300985534814, + "loss": 0.1191, + "step": 34242 + }, + { + "epoch": 0.2972456836312185, + "grad_norm": 0.30859375, + "learning_rate": 0.0015088035838094688, + "loss": 0.1089, + "step": 34243 + }, + { + "epoch": 0.29725436411142264, + "grad_norm": 0.453125, + "learning_rate": 0.0015087770686183948, + "loss": 0.1128, + "step": 34244 + }, + { + "epoch": 0.29726304459162683, + "grad_norm": 0.57421875, + "learning_rate": 0.0015087505529802886, + "loss": 0.0815, + "step": 34245 + }, + { + "epoch": 0.297271725071831, + "grad_norm": 0.283203125, + "learning_rate": 0.0015087240368951787, + "loss": 0.1221, + "step": 34246 + }, + { + "epoch": 0.29728040555203517, + "grad_norm": 0.13671875, + "learning_rate": 0.0015086975203630944, + "loss": 0.0674, + "step": 34247 + }, + { + "epoch": 0.2972890860322393, + "grad_norm": 0.171875, + "learning_rate": 0.0015086710033840646, + "loss": 0.1133, + "step": 34248 + }, + { + "epoch": 0.2972977665124435, + "grad_norm": 0.1962890625, + "learning_rate": 0.0015086444859581183, + "loss": 0.1602, + "step": 34249 + }, + { + "epoch": 0.29730644699264763, + "grad_norm": 0.7890625, + "learning_rate": 0.0015086179680852843, + "loss": 0.0996, + "step": 34250 + }, + { + "epoch": 0.2973151274728518, + "grad_norm": 0.404296875, + "learning_rate": 0.001508591449765592, + "loss": 0.0811, + "step": 34251 + }, + { + "epoch": 0.29732380795305596, + "grad_norm": 0.267578125, + "learning_rate": 0.0015085649309990704, + "loss": 0.0879, + "step": 34252 + }, + { + "epoch": 0.2973324884332601, + "grad_norm": 1.234375, + "learning_rate": 0.0015085384117857483, + "loss": 0.124, + "step": 34253 + }, + { + "epoch": 0.2973411689134643, + "grad_norm": 0.197265625, + "learning_rate": 0.0015085118921256547, + "loss": 0.1211, + "step": 34254 + }, + { + "epoch": 0.29734984939366843, + "grad_norm": 0.45703125, + "learning_rate": 0.0015084853720188185, + "loss": 0.0757, + "step": 34255 + }, + { + "epoch": 0.2973585298738726, + "grad_norm": 0.1865234375, + "learning_rate": 0.0015084588514652688, + "loss": 0.1055, + "step": 34256 + }, + { + "epoch": 0.29736721035407676, + "grad_norm": 0.0966796875, + "learning_rate": 0.001508432330465035, + "loss": 0.1025, + "step": 34257 + }, + { + "epoch": 0.29737589083428095, + "grad_norm": 0.1220703125, + "learning_rate": 0.0015084058090181454, + "loss": 0.1455, + "step": 34258 + }, + { + "epoch": 0.2973845713144851, + "grad_norm": 0.310546875, + "learning_rate": 0.0015083792871246296, + "loss": 0.0825, + "step": 34259 + }, + { + "epoch": 0.2973932517946893, + "grad_norm": 0.2890625, + "learning_rate": 0.001508352764784516, + "loss": 0.1406, + "step": 34260 + }, + { + "epoch": 0.2974019322748934, + "grad_norm": 0.1279296875, + "learning_rate": 0.001508326241997834, + "loss": 0.0957, + "step": 34261 + }, + { + "epoch": 0.2974106127550976, + "grad_norm": 0.0732421875, + "learning_rate": 0.0015082997187646128, + "loss": 0.0981, + "step": 34262 + }, + { + "epoch": 0.29741929323530175, + "grad_norm": 0.2470703125, + "learning_rate": 0.001508273195084881, + "loss": 0.0991, + "step": 34263 + }, + { + "epoch": 0.29742797371550594, + "grad_norm": 0.103515625, + "learning_rate": 0.0015082466709586682, + "loss": 0.1504, + "step": 34264 + }, + { + "epoch": 0.2974366541957101, + "grad_norm": 0.5390625, + "learning_rate": 0.001508220146386002, + "loss": 0.083, + "step": 34265 + }, + { + "epoch": 0.2974453346759143, + "grad_norm": 0.38671875, + "learning_rate": 0.0015081936213669133, + "loss": 0.1123, + "step": 34266 + }, + { + "epoch": 0.2974540151561184, + 
"grad_norm": 0.15234375, + "learning_rate": 0.00150816709590143, + "loss": 0.0991, + "step": 34267 + }, + { + "epoch": 0.2974626956363226, + "grad_norm": 0.376953125, + "learning_rate": 0.0015081405699895812, + "loss": 0.0879, + "step": 34268 + }, + { + "epoch": 0.29747137611652674, + "grad_norm": 0.08642578125, + "learning_rate": 0.0015081140436313959, + "loss": 0.1211, + "step": 34269 + }, + { + "epoch": 0.29748005659673094, + "grad_norm": 0.1767578125, + "learning_rate": 0.001508087516826903, + "loss": 0.1299, + "step": 34270 + }, + { + "epoch": 0.2974887370769351, + "grad_norm": 0.4296875, + "learning_rate": 0.001508060989576132, + "loss": 0.0977, + "step": 34271 + }, + { + "epoch": 0.29749741755713927, + "grad_norm": 0.67578125, + "learning_rate": 0.0015080344618791117, + "loss": 0.0859, + "step": 34272 + }, + { + "epoch": 0.2975060980373434, + "grad_norm": 0.146484375, + "learning_rate": 0.0015080079337358714, + "loss": 0.123, + "step": 34273 + }, + { + "epoch": 0.2975147785175476, + "grad_norm": 0.462890625, + "learning_rate": 0.0015079814051464392, + "loss": 0.1113, + "step": 34274 + }, + { + "epoch": 0.29752345899775173, + "grad_norm": 0.330078125, + "learning_rate": 0.0015079548761108446, + "loss": 0.0928, + "step": 34275 + }, + { + "epoch": 0.2975321394779559, + "grad_norm": 0.48828125, + "learning_rate": 0.0015079283466291169, + "loss": 0.1069, + "step": 34276 + }, + { + "epoch": 0.29754081995816006, + "grad_norm": 0.486328125, + "learning_rate": 0.001507901816701285, + "loss": 0.1738, + "step": 34277 + }, + { + "epoch": 0.29754950043836426, + "grad_norm": 0.3203125, + "learning_rate": 0.0015078752863273777, + "loss": 0.127, + "step": 34278 + }, + { + "epoch": 0.2975581809185684, + "grad_norm": 1.0625, + "learning_rate": 0.001507848755507424, + "loss": 0.1367, + "step": 34279 + }, + { + "epoch": 0.2975668613987726, + "grad_norm": 0.1162109375, + "learning_rate": 0.001507822224241453, + "loss": 0.1396, + "step": 34280 + }, + { + "epoch": 0.2975755418789767, + "grad_norm": 0.53125, + "learning_rate": 0.001507795692529494, + "loss": 0.1011, + "step": 34281 + }, + { + "epoch": 0.2975842223591809, + "grad_norm": 0.130859375, + "learning_rate": 0.0015077691603715756, + "loss": 0.084, + "step": 34282 + }, + { + "epoch": 0.29759290283938505, + "grad_norm": 0.2216796875, + "learning_rate": 0.0015077426277677268, + "loss": 0.0703, + "step": 34283 + }, + { + "epoch": 0.29760158331958925, + "grad_norm": 0.23046875, + "learning_rate": 0.0015077160947179773, + "loss": 0.0859, + "step": 34284 + }, + { + "epoch": 0.2976102637997934, + "grad_norm": 0.1708984375, + "learning_rate": 0.001507689561222355, + "loss": 0.1055, + "step": 34285 + }, + { + "epoch": 0.2976189442799976, + "grad_norm": 0.1474609375, + "learning_rate": 0.00150766302728089, + "loss": 0.1182, + "step": 34286 + }, + { + "epoch": 0.2976276247602017, + "grad_norm": 0.1650390625, + "learning_rate": 0.0015076364928936106, + "loss": 0.0801, + "step": 34287 + }, + { + "epoch": 0.2976363052404059, + "grad_norm": 0.162109375, + "learning_rate": 0.0015076099580605463, + "loss": 0.1035, + "step": 34288 + }, + { + "epoch": 0.29764498572061004, + "grad_norm": 0.10400390625, + "learning_rate": 0.0015075834227817255, + "loss": 0.0913, + "step": 34289 + }, + { + "epoch": 0.29765366620081424, + "grad_norm": 0.146484375, + "learning_rate": 0.0015075568870571779, + "loss": 0.124, + "step": 34290 + }, + { + "epoch": 0.2976623466810184, + "grad_norm": 0.396484375, + "learning_rate": 0.001507530350886932, + "loss": 0.083, + "step": 34291 + }, + { + 
"epoch": 0.29767102716122257, + "grad_norm": 0.23046875, + "learning_rate": 0.0015075038142710168, + "loss": 0.1289, + "step": 34292 + }, + { + "epoch": 0.2976797076414267, + "grad_norm": 0.12255859375, + "learning_rate": 0.0015074772772094623, + "loss": 0.1318, + "step": 34293 + }, + { + "epoch": 0.2976883881216309, + "grad_norm": 0.251953125, + "learning_rate": 0.0015074507397022964, + "loss": 0.1172, + "step": 34294 + }, + { + "epoch": 0.29769706860183504, + "grad_norm": 0.353515625, + "learning_rate": 0.001507424201749548, + "loss": 0.0864, + "step": 34295 + }, + { + "epoch": 0.29770574908203923, + "grad_norm": 0.07080078125, + "learning_rate": 0.0015073976633512471, + "loss": 0.0923, + "step": 34296 + }, + { + "epoch": 0.29771442956224337, + "grad_norm": 0.11962890625, + "learning_rate": 0.0015073711245074221, + "loss": 0.1328, + "step": 34297 + }, + { + "epoch": 0.29772311004244756, + "grad_norm": 0.203125, + "learning_rate": 0.0015073445852181021, + "loss": 0.1182, + "step": 34298 + }, + { + "epoch": 0.2977317905226517, + "grad_norm": 0.224609375, + "learning_rate": 0.0015073180454833167, + "loss": 0.1113, + "step": 34299 + }, + { + "epoch": 0.2977404710028559, + "grad_norm": 0.11962890625, + "learning_rate": 0.0015072915053030939, + "loss": 0.082, + "step": 34300 + }, + { + "epoch": 0.29774915148306, + "grad_norm": 0.38671875, + "learning_rate": 0.0015072649646774635, + "loss": 0.1289, + "step": 34301 + }, + { + "epoch": 0.2977578319632642, + "grad_norm": 0.32421875, + "learning_rate": 0.001507238423606454, + "loss": 0.0815, + "step": 34302 + }, + { + "epoch": 0.29776651244346836, + "grad_norm": 0.1884765625, + "learning_rate": 0.0015072118820900948, + "loss": 0.1001, + "step": 34303 + }, + { + "epoch": 0.29777519292367255, + "grad_norm": 0.5078125, + "learning_rate": 0.0015071853401284149, + "loss": 0.0913, + "step": 34304 + }, + { + "epoch": 0.2977838734038767, + "grad_norm": 0.294921875, + "learning_rate": 0.001507158797721443, + "loss": 0.0928, + "step": 34305 + }, + { + "epoch": 0.2977925538840809, + "grad_norm": 0.255859375, + "learning_rate": 0.0015071322548692089, + "loss": 0.1084, + "step": 34306 + }, + { + "epoch": 0.297801234364285, + "grad_norm": 0.12158203125, + "learning_rate": 0.0015071057115717405, + "loss": 0.0583, + "step": 34307 + }, + { + "epoch": 0.2978099148444892, + "grad_norm": 0.55078125, + "learning_rate": 0.0015070791678290672, + "loss": 0.1016, + "step": 34308 + }, + { + "epoch": 0.29781859532469335, + "grad_norm": 0.546875, + "learning_rate": 0.0015070526236412188, + "loss": 0.082, + "step": 34309 + }, + { + "epoch": 0.29782727580489754, + "grad_norm": 0.08837890625, + "learning_rate": 0.0015070260790082234, + "loss": 0.1084, + "step": 34310 + }, + { + "epoch": 0.2978359562851017, + "grad_norm": 0.91015625, + "learning_rate": 0.0015069995339301106, + "loss": 0.1172, + "step": 34311 + }, + { + "epoch": 0.29784463676530587, + "grad_norm": 0.1279296875, + "learning_rate": 0.0015069729884069091, + "loss": 0.127, + "step": 34312 + }, + { + "epoch": 0.29785331724551, + "grad_norm": 0.318359375, + "learning_rate": 0.0015069464424386483, + "loss": 0.123, + "step": 34313 + }, + { + "epoch": 0.2978619977257142, + "grad_norm": 0.1318359375, + "learning_rate": 0.001506919896025357, + "loss": 0.1504, + "step": 34314 + }, + { + "epoch": 0.29787067820591834, + "grad_norm": 0.6875, + "learning_rate": 0.001506893349167064, + "loss": 0.1396, + "step": 34315 + }, + { + "epoch": 0.29787935868612253, + "grad_norm": 0.197265625, + "learning_rate": 0.0015068668018637986, + 
"loss": 0.0928, + "step": 34316 + }, + { + "epoch": 0.29788803916632667, + "grad_norm": 0.435546875, + "learning_rate": 0.0015068402541155896, + "loss": 0.1074, + "step": 34317 + }, + { + "epoch": 0.29789671964653086, + "grad_norm": 0.1123046875, + "learning_rate": 0.0015068137059224662, + "loss": 0.0879, + "step": 34318 + }, + { + "epoch": 0.297905400126735, + "grad_norm": 0.171875, + "learning_rate": 0.001506787157284458, + "loss": 0.0908, + "step": 34319 + }, + { + "epoch": 0.2979140806069392, + "grad_norm": 0.1435546875, + "learning_rate": 0.001506760608201593, + "loss": 0.0698, + "step": 34320 + }, + { + "epoch": 0.29792276108714333, + "grad_norm": 0.421875, + "learning_rate": 0.0015067340586739006, + "loss": 0.1001, + "step": 34321 + }, + { + "epoch": 0.2979314415673475, + "grad_norm": 0.52734375, + "learning_rate": 0.0015067075087014103, + "loss": 0.2285, + "step": 34322 + }, + { + "epoch": 0.29794012204755166, + "grad_norm": 0.17578125, + "learning_rate": 0.0015066809582841506, + "loss": 0.0986, + "step": 34323 + }, + { + "epoch": 0.29794880252775585, + "grad_norm": 0.181640625, + "learning_rate": 0.0015066544074221512, + "loss": 0.126, + "step": 34324 + }, + { + "epoch": 0.29795748300796, + "grad_norm": 0.416015625, + "learning_rate": 0.0015066278561154398, + "loss": 0.106, + "step": 34325 + }, + { + "epoch": 0.2979661634881642, + "grad_norm": 0.2265625, + "learning_rate": 0.0015066013043640468, + "loss": 0.103, + "step": 34326 + }, + { + "epoch": 0.2979748439683683, + "grad_norm": 0.59375, + "learning_rate": 0.0015065747521680005, + "loss": 0.1001, + "step": 34327 + }, + { + "epoch": 0.2979835244485725, + "grad_norm": 0.328125, + "learning_rate": 0.0015065481995273306, + "loss": 0.1416, + "step": 34328 + }, + { + "epoch": 0.29799220492877665, + "grad_norm": 0.109375, + "learning_rate": 0.0015065216464420653, + "loss": 0.0869, + "step": 34329 + }, + { + "epoch": 0.29800088540898084, + "grad_norm": 0.447265625, + "learning_rate": 0.001506495092912234, + "loss": 0.1196, + "step": 34330 + }, + { + "epoch": 0.298009565889185, + "grad_norm": 0.1328125, + "learning_rate": 0.0015064685389378659, + "loss": 0.0938, + "step": 34331 + }, + { + "epoch": 0.2980182463693892, + "grad_norm": 0.091796875, + "learning_rate": 0.00150644198451899, + "loss": 0.1201, + "step": 34332 + }, + { + "epoch": 0.2980269268495933, + "grad_norm": 0.1083984375, + "learning_rate": 0.001506415429655635, + "loss": 0.1182, + "step": 34333 + }, + { + "epoch": 0.2980356073297975, + "grad_norm": 0.390625, + "learning_rate": 0.0015063888743478302, + "loss": 0.1182, + "step": 34334 + }, + { + "epoch": 0.29804428781000164, + "grad_norm": 0.2177734375, + "learning_rate": 0.001506362318595605, + "loss": 0.0811, + "step": 34335 + }, + { + "epoch": 0.29805296829020583, + "grad_norm": 0.55078125, + "learning_rate": 0.001506335762398988, + "loss": 0.126, + "step": 34336 + }, + { + "epoch": 0.29806164877040997, + "grad_norm": 0.55078125, + "learning_rate": 0.0015063092057580078, + "loss": 0.1143, + "step": 34337 + }, + { + "epoch": 0.29807032925061416, + "grad_norm": 0.1689453125, + "learning_rate": 0.0015062826486726942, + "loss": 0.0957, + "step": 34338 + }, + { + "epoch": 0.2980790097308183, + "grad_norm": 0.6953125, + "learning_rate": 0.0015062560911430762, + "loss": 0.0967, + "step": 34339 + }, + { + "epoch": 0.2980876902110225, + "grad_norm": 0.267578125, + "learning_rate": 0.0015062295331691826, + "loss": 0.1006, + "step": 34340 + }, + { + "epoch": 0.29809637069122663, + "grad_norm": 0.11474609375, + "learning_rate": 
0.0015062029747510426, + "loss": 0.1064, + "step": 34341 + }, + { + "epoch": 0.2981050511714308, + "grad_norm": 0.259765625, + "learning_rate": 0.001506176415888685, + "loss": 0.0928, + "step": 34342 + }, + { + "epoch": 0.29811373165163496, + "grad_norm": 0.12255859375, + "learning_rate": 0.001506149856582139, + "loss": 0.0991, + "step": 34343 + }, + { + "epoch": 0.29812241213183915, + "grad_norm": 0.361328125, + "learning_rate": 0.0015061232968314334, + "loss": 0.0977, + "step": 34344 + }, + { + "epoch": 0.2981310926120433, + "grad_norm": 0.142578125, + "learning_rate": 0.0015060967366365976, + "loss": 0.1128, + "step": 34345 + }, + { + "epoch": 0.2981397730922475, + "grad_norm": 0.26953125, + "learning_rate": 0.0015060701759976607, + "loss": 0.0894, + "step": 34346 + }, + { + "epoch": 0.2981484535724516, + "grad_norm": 0.134765625, + "learning_rate": 0.0015060436149146512, + "loss": 0.0845, + "step": 34347 + }, + { + "epoch": 0.2981571340526558, + "grad_norm": 0.15625, + "learning_rate": 0.0015060170533875987, + "loss": 0.0806, + "step": 34348 + }, + { + "epoch": 0.29816581453285995, + "grad_norm": 0.326171875, + "learning_rate": 0.001505990491416532, + "loss": 0.1553, + "step": 34349 + }, + { + "epoch": 0.29817449501306414, + "grad_norm": 0.33984375, + "learning_rate": 0.0015059639290014805, + "loss": 0.0874, + "step": 34350 + }, + { + "epoch": 0.2981831754932683, + "grad_norm": 0.060546875, + "learning_rate": 0.0015059373661424724, + "loss": 0.0684, + "step": 34351 + }, + { + "epoch": 0.2981918559734725, + "grad_norm": 0.3828125, + "learning_rate": 0.0015059108028395377, + "loss": 0.1602, + "step": 34352 + }, + { + "epoch": 0.2982005364536766, + "grad_norm": 0.302734375, + "learning_rate": 0.0015058842390927051, + "loss": 0.0781, + "step": 34353 + }, + { + "epoch": 0.2982092169338808, + "grad_norm": 0.173828125, + "learning_rate": 0.0015058576749020034, + "loss": 0.1416, + "step": 34354 + }, + { + "epoch": 0.29821789741408494, + "grad_norm": 0.12890625, + "learning_rate": 0.0015058311102674618, + "loss": 0.0884, + "step": 34355 + }, + { + "epoch": 0.29822657789428914, + "grad_norm": 0.236328125, + "learning_rate": 0.00150580454518911, + "loss": 0.0864, + "step": 34356 + }, + { + "epoch": 0.2982352583744933, + "grad_norm": 0.1943359375, + "learning_rate": 0.0015057779796669754, + "loss": 0.1709, + "step": 34357 + }, + { + "epoch": 0.29824393885469747, + "grad_norm": 0.66796875, + "learning_rate": 0.0015057514137010886, + "loss": 0.1064, + "step": 34358 + }, + { + "epoch": 0.2982526193349016, + "grad_norm": 0.1279296875, + "learning_rate": 0.0015057248472914785, + "loss": 0.1025, + "step": 34359 + }, + { + "epoch": 0.2982612998151058, + "grad_norm": 0.203125, + "learning_rate": 0.001505698280438173, + "loss": 0.1152, + "step": 34360 + }, + { + "epoch": 0.29826998029530993, + "grad_norm": 0.30078125, + "learning_rate": 0.0015056717131412029, + "loss": 0.124, + "step": 34361 + }, + { + "epoch": 0.2982786607755141, + "grad_norm": 0.359375, + "learning_rate": 0.0015056451454005955, + "loss": 0.1191, + "step": 34362 + }, + { + "epoch": 0.29828734125571826, + "grad_norm": 0.1103515625, + "learning_rate": 0.0015056185772163813, + "loss": 0.0884, + "step": 34363 + }, + { + "epoch": 0.29829602173592246, + "grad_norm": 0.375, + "learning_rate": 0.0015055920085885884, + "loss": 0.0947, + "step": 34364 + }, + { + "epoch": 0.2983047022161266, + "grad_norm": 0.259765625, + "learning_rate": 0.001505565439517246, + "loss": 0.0542, + "step": 34365 + }, + { + "epoch": 0.2983133826963308, + "grad_norm": 
0.1826171875, + "learning_rate": 0.0015055388700023836, + "loss": 0.1201, + "step": 34366 + }, + { + "epoch": 0.2983220631765349, + "grad_norm": 0.333984375, + "learning_rate": 0.00150551230004403, + "loss": 0.1177, + "step": 34367 + }, + { + "epoch": 0.2983307436567391, + "grad_norm": 0.341796875, + "learning_rate": 0.001505485729642214, + "loss": 0.1084, + "step": 34368 + }, + { + "epoch": 0.29833942413694325, + "grad_norm": 0.578125, + "learning_rate": 0.001505459158796965, + "loss": 0.1064, + "step": 34369 + }, + { + "epoch": 0.29834810461714745, + "grad_norm": 0.384765625, + "learning_rate": 0.0015054325875083119, + "loss": 0.1104, + "step": 34370 + }, + { + "epoch": 0.2983567850973516, + "grad_norm": 0.2216796875, + "learning_rate": 0.001505406015776284, + "loss": 0.0918, + "step": 34371 + }, + { + "epoch": 0.2983654655775558, + "grad_norm": 0.08349609375, + "learning_rate": 0.0015053794436009102, + "loss": 0.1069, + "step": 34372 + }, + { + "epoch": 0.2983741460577599, + "grad_norm": 0.28125, + "learning_rate": 0.0015053528709822195, + "loss": 0.0928, + "step": 34373 + }, + { + "epoch": 0.2983828265379641, + "grad_norm": 0.53125, + "learning_rate": 0.001505326297920241, + "loss": 0.1001, + "step": 34374 + }, + { + "epoch": 0.29839150701816824, + "grad_norm": 0.1279296875, + "learning_rate": 0.0015052997244150033, + "loss": 0.1045, + "step": 34375 + }, + { + "epoch": 0.2984001874983724, + "grad_norm": 0.08642578125, + "learning_rate": 0.0015052731504665365, + "loss": 0.0947, + "step": 34376 + }, + { + "epoch": 0.2984088679785766, + "grad_norm": 0.09716796875, + "learning_rate": 0.0015052465760748688, + "loss": 0.0742, + "step": 34377 + }, + { + "epoch": 0.2984175484587807, + "grad_norm": 0.8203125, + "learning_rate": 0.0015052200012400294, + "loss": 0.1162, + "step": 34378 + }, + { + "epoch": 0.2984262289389849, + "grad_norm": 0.169921875, + "learning_rate": 0.001505193425962048, + "loss": 0.1021, + "step": 34379 + }, + { + "epoch": 0.29843490941918904, + "grad_norm": 0.11181640625, + "learning_rate": 0.0015051668502409528, + "loss": 0.0977, + "step": 34380 + }, + { + "epoch": 0.29844358989939324, + "grad_norm": 0.30859375, + "learning_rate": 0.0015051402740767733, + "loss": 0.1494, + "step": 34381 + }, + { + "epoch": 0.2984522703795974, + "grad_norm": 0.27734375, + "learning_rate": 0.0015051136974695383, + "loss": 0.0874, + "step": 34382 + }, + { + "epoch": 0.29846095085980157, + "grad_norm": 0.494140625, + "learning_rate": 0.0015050871204192774, + "loss": 0.0728, + "step": 34383 + }, + { + "epoch": 0.2984696313400057, + "grad_norm": 0.59765625, + "learning_rate": 0.001505060542926019, + "loss": 0.1328, + "step": 34384 + }, + { + "epoch": 0.2984783118202099, + "grad_norm": 0.427734375, + "learning_rate": 0.0015050339649897923, + "loss": 0.1113, + "step": 34385 + }, + { + "epoch": 0.29848699230041403, + "grad_norm": 0.2890625, + "learning_rate": 0.0015050073866106268, + "loss": 0.1191, + "step": 34386 + }, + { + "epoch": 0.2984956727806182, + "grad_norm": 0.08740234375, + "learning_rate": 0.0015049808077885515, + "loss": 0.0723, + "step": 34387 + }, + { + "epoch": 0.29850435326082236, + "grad_norm": 0.1904296875, + "learning_rate": 0.0015049542285235952, + "loss": 0.1074, + "step": 34388 + }, + { + "epoch": 0.29851303374102656, + "grad_norm": 0.2041015625, + "learning_rate": 0.0015049276488157868, + "loss": 0.0869, + "step": 34389 + }, + { + "epoch": 0.2985217142212307, + "grad_norm": 0.419921875, + "learning_rate": 0.0015049010686651557, + "loss": 0.1006, + "step": 34390 + }, + { + 
"epoch": 0.2985303947014349, + "grad_norm": 0.296875, + "learning_rate": 0.0015048744880717308, + "loss": 0.1064, + "step": 34391 + }, + { + "epoch": 0.298539075181639, + "grad_norm": 0.240234375, + "learning_rate": 0.0015048479070355414, + "loss": 0.1079, + "step": 34392 + }, + { + "epoch": 0.2985477556618432, + "grad_norm": 0.306640625, + "learning_rate": 0.0015048213255566168, + "loss": 0.2598, + "step": 34393 + }, + { + "epoch": 0.29855643614204735, + "grad_norm": 0.1396484375, + "learning_rate": 0.001504794743634985, + "loss": 0.0957, + "step": 34394 + }, + { + "epoch": 0.29856511662225155, + "grad_norm": 0.40234375, + "learning_rate": 0.001504768161270676, + "loss": 0.1045, + "step": 34395 + }, + { + "epoch": 0.2985737971024557, + "grad_norm": 0.11279296875, + "learning_rate": 0.0015047415784637185, + "loss": 0.0942, + "step": 34396 + }, + { + "epoch": 0.2985824775826599, + "grad_norm": 0.6484375, + "learning_rate": 0.0015047149952141419, + "loss": 0.0786, + "step": 34397 + }, + { + "epoch": 0.298591158062864, + "grad_norm": 0.1708984375, + "learning_rate": 0.001504688411521975, + "loss": 0.0923, + "step": 34398 + }, + { + "epoch": 0.2985998385430682, + "grad_norm": 0.390625, + "learning_rate": 0.0015046618273872468, + "loss": 0.0918, + "step": 34399 + }, + { + "epoch": 0.29860851902327235, + "grad_norm": 0.2021484375, + "learning_rate": 0.0015046352428099868, + "loss": 0.1465, + "step": 34400 + }, + { + "epoch": 0.29861719950347654, + "grad_norm": 0.126953125, + "learning_rate": 0.0015046086577902235, + "loss": 0.1299, + "step": 34401 + }, + { + "epoch": 0.2986258799836807, + "grad_norm": 0.1220703125, + "learning_rate": 0.0015045820723279865, + "loss": 0.1182, + "step": 34402 + }, + { + "epoch": 0.29863456046388487, + "grad_norm": 0.515625, + "learning_rate": 0.0015045554864233043, + "loss": 0.1045, + "step": 34403 + }, + { + "epoch": 0.298643240944089, + "grad_norm": 0.1630859375, + "learning_rate": 0.0015045289000762063, + "loss": 0.1157, + "step": 34404 + }, + { + "epoch": 0.2986519214242932, + "grad_norm": 0.494140625, + "learning_rate": 0.0015045023132867219, + "loss": 0.2246, + "step": 34405 + }, + { + "epoch": 0.29866060190449734, + "grad_norm": 0.1875, + "learning_rate": 0.0015044757260548798, + "loss": 0.125, + "step": 34406 + }, + { + "epoch": 0.29866928238470153, + "grad_norm": 0.259765625, + "learning_rate": 0.0015044491383807085, + "loss": 0.1074, + "step": 34407 + }, + { + "epoch": 0.29867796286490567, + "grad_norm": 0.361328125, + "learning_rate": 0.0015044225502642385, + "loss": 0.0913, + "step": 34408 + }, + { + "epoch": 0.29868664334510986, + "grad_norm": 0.2412109375, + "learning_rate": 0.001504395961705498, + "loss": 0.0791, + "step": 34409 + }, + { + "epoch": 0.298695323825314, + "grad_norm": 0.90234375, + "learning_rate": 0.0015043693727045159, + "loss": 0.0938, + "step": 34410 + }, + { + "epoch": 0.2987040043055182, + "grad_norm": 0.640625, + "learning_rate": 0.0015043427832613218, + "loss": 0.1113, + "step": 34411 + }, + { + "epoch": 0.2987126847857223, + "grad_norm": 0.21484375, + "learning_rate": 0.0015043161933759438, + "loss": 0.1455, + "step": 34412 + }, + { + "epoch": 0.2987213652659265, + "grad_norm": 0.2138671875, + "learning_rate": 0.0015042896030484124, + "loss": 0.1143, + "step": 34413 + }, + { + "epoch": 0.29873004574613066, + "grad_norm": 0.22265625, + "learning_rate": 0.0015042630122787555, + "loss": 0.0967, + "step": 34414 + }, + { + "epoch": 0.29873872622633485, + "grad_norm": 0.1435546875, + "learning_rate": 0.001504236421067003, + "loss": 
0.1377, + "step": 34415 + }, + { + "epoch": 0.298747406706539, + "grad_norm": 0.48828125, + "learning_rate": 0.0015042098294131834, + "loss": 0.0986, + "step": 34416 + }, + { + "epoch": 0.2987560871867432, + "grad_norm": 0.486328125, + "learning_rate": 0.0015041832373173262, + "loss": 0.1187, + "step": 34417 + }, + { + "epoch": 0.2987647676669473, + "grad_norm": 0.1513671875, + "learning_rate": 0.00150415664477946, + "loss": 0.0908, + "step": 34418 + }, + { + "epoch": 0.2987734481471515, + "grad_norm": 0.1669921875, + "learning_rate": 0.0015041300517996145, + "loss": 0.0664, + "step": 34419 + }, + { + "epoch": 0.29878212862735565, + "grad_norm": 0.79296875, + "learning_rate": 0.0015041034583778182, + "loss": 0.2451, + "step": 34420 + }, + { + "epoch": 0.29879080910755984, + "grad_norm": 0.2119140625, + "learning_rate": 0.0015040768645141005, + "loss": 0.0986, + "step": 34421 + }, + { + "epoch": 0.298799489587764, + "grad_norm": 0.224609375, + "learning_rate": 0.0015040502702084908, + "loss": 0.0938, + "step": 34422 + }, + { + "epoch": 0.29880817006796817, + "grad_norm": 0.287109375, + "learning_rate": 0.0015040236754610171, + "loss": 0.125, + "step": 34423 + }, + { + "epoch": 0.2988168505481723, + "grad_norm": 0.8203125, + "learning_rate": 0.0015039970802717098, + "loss": 0.1396, + "step": 34424 + }, + { + "epoch": 0.2988255310283765, + "grad_norm": 0.1650390625, + "learning_rate": 0.0015039704846405966, + "loss": 0.0962, + "step": 34425 + }, + { + "epoch": 0.29883421150858064, + "grad_norm": 0.58984375, + "learning_rate": 0.0015039438885677081, + "loss": 0.1006, + "step": 34426 + }, + { + "epoch": 0.29884289198878483, + "grad_norm": 0.40234375, + "learning_rate": 0.0015039172920530723, + "loss": 0.0996, + "step": 34427 + }, + { + "epoch": 0.29885157246898897, + "grad_norm": 0.11962890625, + "learning_rate": 0.0015038906950967186, + "loss": 0.105, + "step": 34428 + }, + { + "epoch": 0.29886025294919316, + "grad_norm": 0.58984375, + "learning_rate": 0.0015038640976986763, + "loss": 0.1143, + "step": 34429 + }, + { + "epoch": 0.2988689334293973, + "grad_norm": 0.296875, + "learning_rate": 0.0015038374998589741, + "loss": 0.123, + "step": 34430 + }, + { + "epoch": 0.2988776139096015, + "grad_norm": 0.373046875, + "learning_rate": 0.0015038109015776415, + "loss": 0.1123, + "step": 34431 + }, + { + "epoch": 0.29888629438980563, + "grad_norm": 0.099609375, + "learning_rate": 0.0015037843028547072, + "loss": 0.1157, + "step": 34432 + }, + { + "epoch": 0.2988949748700098, + "grad_norm": 0.0908203125, + "learning_rate": 0.0015037577036902004, + "loss": 0.0957, + "step": 34433 + }, + { + "epoch": 0.29890365535021396, + "grad_norm": 0.11767578125, + "learning_rate": 0.00150373110408415, + "loss": 0.1299, + "step": 34434 + }, + { + "epoch": 0.29891233583041815, + "grad_norm": 0.251953125, + "learning_rate": 0.0015037045040365857, + "loss": 0.124, + "step": 34435 + }, + { + "epoch": 0.2989210163106223, + "grad_norm": 0.64453125, + "learning_rate": 0.0015036779035475364, + "loss": 0.1138, + "step": 34436 + }, + { + "epoch": 0.2989296967908265, + "grad_norm": 0.1494140625, + "learning_rate": 0.0015036513026170308, + "loss": 0.0938, + "step": 34437 + }, + { + "epoch": 0.2989383772710306, + "grad_norm": 0.10302734375, + "learning_rate": 0.0015036247012450979, + "loss": 0.0957, + "step": 34438 + }, + { + "epoch": 0.2989470577512348, + "grad_norm": 0.283203125, + "learning_rate": 0.0015035980994317676, + "loss": 0.1396, + "step": 34439 + }, + { + "epoch": 0.29895573823143895, + "grad_norm": 0.4453125, + 
"learning_rate": 0.0015035714971770684, + "loss": 0.064, + "step": 34440 + }, + { + "epoch": 0.29896441871164314, + "grad_norm": 0.10693359375, + "learning_rate": 0.0015035448944810293, + "loss": 0.0938, + "step": 34441 + }, + { + "epoch": 0.2989730991918473, + "grad_norm": 0.45703125, + "learning_rate": 0.0015035182913436793, + "loss": 0.0747, + "step": 34442 + }, + { + "epoch": 0.2989817796720515, + "grad_norm": 0.1171875, + "learning_rate": 0.001503491687765048, + "loss": 0.1162, + "step": 34443 + }, + { + "epoch": 0.2989904601522556, + "grad_norm": 0.1474609375, + "learning_rate": 0.0015034650837451643, + "loss": 0.0864, + "step": 34444 + }, + { + "epoch": 0.2989991406324598, + "grad_norm": 0.447265625, + "learning_rate": 0.0015034384792840575, + "loss": 0.0952, + "step": 34445 + }, + { + "epoch": 0.29900782111266394, + "grad_norm": 1.2890625, + "learning_rate": 0.0015034118743817559, + "loss": 0.1045, + "step": 34446 + }, + { + "epoch": 0.29901650159286813, + "grad_norm": 0.384765625, + "learning_rate": 0.0015033852690382894, + "loss": 0.1318, + "step": 34447 + }, + { + "epoch": 0.29902518207307227, + "grad_norm": 0.271484375, + "learning_rate": 0.001503358663253687, + "loss": 0.0762, + "step": 34448 + }, + { + "epoch": 0.29903386255327646, + "grad_norm": 0.55859375, + "learning_rate": 0.0015033320570279776, + "loss": 0.1147, + "step": 34449 + }, + { + "epoch": 0.2990425430334806, + "grad_norm": 0.12353515625, + "learning_rate": 0.0015033054503611903, + "loss": 0.0825, + "step": 34450 + }, + { + "epoch": 0.2990512235136848, + "grad_norm": 0.16796875, + "learning_rate": 0.0015032788432533537, + "loss": 0.1123, + "step": 34451 + }, + { + "epoch": 0.29905990399388893, + "grad_norm": 0.56640625, + "learning_rate": 0.0015032522357044979, + "loss": 0.1396, + "step": 34452 + }, + { + "epoch": 0.2990685844740931, + "grad_norm": 0.255859375, + "learning_rate": 0.0015032256277146515, + "loss": 0.0991, + "step": 34453 + }, + { + "epoch": 0.29907726495429726, + "grad_norm": 0.470703125, + "learning_rate": 0.0015031990192838436, + "loss": 0.0654, + "step": 34454 + }, + { + "epoch": 0.29908594543450145, + "grad_norm": 0.1630859375, + "learning_rate": 0.0015031724104121033, + "loss": 0.125, + "step": 34455 + }, + { + "epoch": 0.2990946259147056, + "grad_norm": 0.55078125, + "learning_rate": 0.0015031458010994596, + "loss": 0.0981, + "step": 34456 + }, + { + "epoch": 0.2991033063949098, + "grad_norm": 0.37109375, + "learning_rate": 0.0015031191913459417, + "loss": 0.0825, + "step": 34457 + }, + { + "epoch": 0.2991119868751139, + "grad_norm": 0.2236328125, + "learning_rate": 0.001503092581151579, + "loss": 0.124, + "step": 34458 + }, + { + "epoch": 0.2991206673553181, + "grad_norm": 0.384765625, + "learning_rate": 0.0015030659705164001, + "loss": 0.1084, + "step": 34459 + }, + { + "epoch": 0.29912934783552225, + "grad_norm": 0.1591796875, + "learning_rate": 0.0015030393594404343, + "loss": 0.103, + "step": 34460 + }, + { + "epoch": 0.29913802831572645, + "grad_norm": 1.265625, + "learning_rate": 0.0015030127479237108, + "loss": 0.3125, + "step": 34461 + }, + { + "epoch": 0.2991467087959306, + "grad_norm": 0.9609375, + "learning_rate": 0.0015029861359662584, + "loss": 0.1348, + "step": 34462 + }, + { + "epoch": 0.2991553892761348, + "grad_norm": 0.73828125, + "learning_rate": 0.0015029595235681063, + "loss": 0.0889, + "step": 34463 + }, + { + "epoch": 0.2991640697563389, + "grad_norm": 0.08544921875, + "learning_rate": 0.001502932910729284, + "loss": 0.0962, + "step": 34464 + }, + { + "epoch": 
0.2991727502365431, + "grad_norm": 0.2490234375, + "learning_rate": 0.0015029062974498205, + "loss": 0.124, + "step": 34465 + }, + { + "epoch": 0.29918143071674724, + "grad_norm": 0.10205078125, + "learning_rate": 0.0015028796837297444, + "loss": 0.1094, + "step": 34466 + }, + { + "epoch": 0.29919011119695144, + "grad_norm": 0.095703125, + "learning_rate": 0.0015028530695690855, + "loss": 0.0767, + "step": 34467 + }, + { + "epoch": 0.2991987916771556, + "grad_norm": 0.08984375, + "learning_rate": 0.001502826454967872, + "loss": 0.1074, + "step": 34468 + }, + { + "epoch": 0.29920747215735977, + "grad_norm": 0.18359375, + "learning_rate": 0.001502799839926134, + "loss": 0.0815, + "step": 34469 + }, + { + "epoch": 0.2992161526375639, + "grad_norm": 0.232421875, + "learning_rate": 0.0015027732244439, + "loss": 0.1211, + "step": 34470 + }, + { + "epoch": 0.2992248331177681, + "grad_norm": 0.1298828125, + "learning_rate": 0.0015027466085211993, + "loss": 0.0879, + "step": 34471 + }, + { + "epoch": 0.29923351359797223, + "grad_norm": 0.10205078125, + "learning_rate": 0.0015027199921580606, + "loss": 0.1113, + "step": 34472 + }, + { + "epoch": 0.2992421940781764, + "grad_norm": 0.2890625, + "learning_rate": 0.0015026933753545138, + "loss": 0.1016, + "step": 34473 + }, + { + "epoch": 0.29925087455838056, + "grad_norm": 0.12353515625, + "learning_rate": 0.0015026667581105875, + "loss": 0.1094, + "step": 34474 + }, + { + "epoch": 0.29925955503858476, + "grad_norm": 0.53515625, + "learning_rate": 0.0015026401404263107, + "loss": 0.1143, + "step": 34475 + }, + { + "epoch": 0.2992682355187889, + "grad_norm": 0.345703125, + "learning_rate": 0.001502613522301713, + "loss": 0.1152, + "step": 34476 + }, + { + "epoch": 0.2992769159989931, + "grad_norm": 0.73046875, + "learning_rate": 0.001502586903736823, + "loss": 0.0889, + "step": 34477 + }, + { + "epoch": 0.2992855964791972, + "grad_norm": 0.1376953125, + "learning_rate": 0.0015025602847316699, + "loss": 0.0869, + "step": 34478 + }, + { + "epoch": 0.2992942769594014, + "grad_norm": 0.26953125, + "learning_rate": 0.0015025336652862832, + "loss": 0.1455, + "step": 34479 + }, + { + "epoch": 0.29930295743960555, + "grad_norm": 0.11279296875, + "learning_rate": 0.0015025070454006916, + "loss": 0.1162, + "step": 34480 + }, + { + "epoch": 0.29931163791980975, + "grad_norm": 0.078125, + "learning_rate": 0.0015024804250749243, + "loss": 0.0708, + "step": 34481 + }, + { + "epoch": 0.2993203184000139, + "grad_norm": 0.07666015625, + "learning_rate": 0.0015024538043090104, + "loss": 0.0996, + "step": 34482 + }, + { + "epoch": 0.2993289988802181, + "grad_norm": 0.38671875, + "learning_rate": 0.001502427183102979, + "loss": 0.1328, + "step": 34483 + }, + { + "epoch": 0.2993376793604222, + "grad_norm": 0.26171875, + "learning_rate": 0.0015024005614568594, + "loss": 0.1221, + "step": 34484 + }, + { + "epoch": 0.2993463598406264, + "grad_norm": 0.87109375, + "learning_rate": 0.0015023739393706806, + "loss": 0.1113, + "step": 34485 + }, + { + "epoch": 0.29935504032083055, + "grad_norm": 0.1953125, + "learning_rate": 0.001502347316844472, + "loss": 0.0869, + "step": 34486 + }, + { + "epoch": 0.29936372080103474, + "grad_norm": 0.0908203125, + "learning_rate": 0.001502320693878262, + "loss": 0.0967, + "step": 34487 + }, + { + "epoch": 0.2993724012812389, + "grad_norm": 0.3203125, + "learning_rate": 0.0015022940704720804, + "loss": 0.0767, + "step": 34488 + }, + { + "epoch": 0.29938108176144307, + "grad_norm": 0.091796875, + "learning_rate": 0.001502267446625956, + "loss": 
0.0898, + "step": 34489 + }, + { + "epoch": 0.2993897622416472, + "grad_norm": 0.171875, + "learning_rate": 0.0015022408223399175, + "loss": 0.085, + "step": 34490 + }, + { + "epoch": 0.2993984427218514, + "grad_norm": 0.08349609375, + "learning_rate": 0.0015022141976139948, + "loss": 0.0996, + "step": 34491 + }, + { + "epoch": 0.29940712320205554, + "grad_norm": 0.39453125, + "learning_rate": 0.001502187572448217, + "loss": 0.1416, + "step": 34492 + }, + { + "epoch": 0.29941580368225973, + "grad_norm": 0.1005859375, + "learning_rate": 0.0015021609468426128, + "loss": 0.0986, + "step": 34493 + }, + { + "epoch": 0.29942448416246387, + "grad_norm": 0.6875, + "learning_rate": 0.0015021343207972109, + "loss": 0.1172, + "step": 34494 + }, + { + "epoch": 0.29943316464266806, + "grad_norm": 0.494140625, + "learning_rate": 0.0015021076943120418, + "loss": 0.1309, + "step": 34495 + }, + { + "epoch": 0.2994418451228722, + "grad_norm": 0.294921875, + "learning_rate": 0.001502081067387133, + "loss": 0.1309, + "step": 34496 + }, + { + "epoch": 0.2994505256030764, + "grad_norm": 0.26171875, + "learning_rate": 0.0015020544400225148, + "loss": 0.1299, + "step": 34497 + }, + { + "epoch": 0.2994592060832805, + "grad_norm": 0.279296875, + "learning_rate": 0.0015020278122182158, + "loss": 0.1309, + "step": 34498 + }, + { + "epoch": 0.29946788656348466, + "grad_norm": 0.08056640625, + "learning_rate": 0.0015020011839742653, + "loss": 0.0991, + "step": 34499 + }, + { + "epoch": 0.29947656704368886, + "grad_norm": 0.12158203125, + "learning_rate": 0.0015019745552906922, + "loss": 0.0801, + "step": 34500 + }, + { + "epoch": 0.299485247523893, + "grad_norm": 0.37109375, + "learning_rate": 0.001501947926167526, + "loss": 0.1504, + "step": 34501 + }, + { + "epoch": 0.2994939280040972, + "grad_norm": 0.5703125, + "learning_rate": 0.0015019212966047954, + "loss": 0.0913, + "step": 34502 + }, + { + "epoch": 0.2995026084843013, + "grad_norm": 0.138671875, + "learning_rate": 0.0015018946666025294, + "loss": 0.1104, + "step": 34503 + }, + { + "epoch": 0.2995112889645055, + "grad_norm": 0.0849609375, + "learning_rate": 0.001501868036160758, + "loss": 0.1055, + "step": 34504 + }, + { + "epoch": 0.29951996944470966, + "grad_norm": 0.7578125, + "learning_rate": 0.0015018414052795095, + "loss": 0.1216, + "step": 34505 + }, + { + "epoch": 0.29952864992491385, + "grad_norm": 0.6328125, + "learning_rate": 0.0015018147739588135, + "loss": 0.0713, + "step": 34506 + }, + { + "epoch": 0.299537330405118, + "grad_norm": 0.083984375, + "learning_rate": 0.0015017881421986988, + "loss": 0.0742, + "step": 34507 + }, + { + "epoch": 0.2995460108853222, + "grad_norm": 0.51953125, + "learning_rate": 0.0015017615099991945, + "loss": 0.1299, + "step": 34508 + }, + { + "epoch": 0.2995546913655263, + "grad_norm": 0.6171875, + "learning_rate": 0.0015017348773603298, + "loss": 0.1162, + "step": 34509 + }, + { + "epoch": 0.2995633718457305, + "grad_norm": 0.1669921875, + "learning_rate": 0.0015017082442821338, + "loss": 0.1055, + "step": 34510 + }, + { + "epoch": 0.29957205232593465, + "grad_norm": 0.13671875, + "learning_rate": 0.0015016816107646359, + "loss": 0.1084, + "step": 34511 + }, + { + "epoch": 0.29958073280613884, + "grad_norm": 0.515625, + "learning_rate": 0.001501654976807865, + "loss": 0.1562, + "step": 34512 + }, + { + "epoch": 0.299589413286343, + "grad_norm": 0.412109375, + "learning_rate": 0.0015016283424118503, + "loss": 0.1113, + "step": 34513 + }, + { + "epoch": 0.29959809376654717, + "grad_norm": 0.357421875, + "learning_rate": 
0.0015016017075766208, + "loss": 0.1357, + "step": 34514 + }, + { + "epoch": 0.2996067742467513, + "grad_norm": 0.10400390625, + "learning_rate": 0.0015015750723022057, + "loss": 0.0996, + "step": 34515 + }, + { + "epoch": 0.2996154547269555, + "grad_norm": 0.08740234375, + "learning_rate": 0.0015015484365886343, + "loss": 0.0967, + "step": 34516 + }, + { + "epoch": 0.29962413520715964, + "grad_norm": 0.2216796875, + "learning_rate": 0.0015015218004359352, + "loss": 0.0879, + "step": 34517 + }, + { + "epoch": 0.29963281568736383, + "grad_norm": 0.169921875, + "learning_rate": 0.0015014951638441384, + "loss": 0.1514, + "step": 34518 + }, + { + "epoch": 0.29964149616756797, + "grad_norm": 0.15234375, + "learning_rate": 0.001501468526813272, + "loss": 0.1016, + "step": 34519 + }, + { + "epoch": 0.29965017664777216, + "grad_norm": 0.62109375, + "learning_rate": 0.001501441889343366, + "loss": 0.1196, + "step": 34520 + }, + { + "epoch": 0.2996588571279763, + "grad_norm": 0.39453125, + "learning_rate": 0.001501415251434449, + "loss": 0.1201, + "step": 34521 + }, + { + "epoch": 0.2996675376081805, + "grad_norm": 0.64453125, + "learning_rate": 0.0015013886130865503, + "loss": 0.1045, + "step": 34522 + }, + { + "epoch": 0.2996762180883846, + "grad_norm": 0.11962890625, + "learning_rate": 0.0015013619742996994, + "loss": 0.1299, + "step": 34523 + }, + { + "epoch": 0.2996848985685888, + "grad_norm": 0.1103515625, + "learning_rate": 0.0015013353350739242, + "loss": 0.0923, + "step": 34524 + }, + { + "epoch": 0.29969357904879296, + "grad_norm": 0.390625, + "learning_rate": 0.0015013086954092556, + "loss": 0.1357, + "step": 34525 + }, + { + "epoch": 0.29970225952899715, + "grad_norm": 0.2490234375, + "learning_rate": 0.0015012820553057216, + "loss": 0.1006, + "step": 34526 + }, + { + "epoch": 0.2997109400092013, + "grad_norm": 0.396484375, + "learning_rate": 0.0015012554147633517, + "loss": 0.1025, + "step": 34527 + }, + { + "epoch": 0.2997196204894055, + "grad_norm": 0.3671875, + "learning_rate": 0.0015012287737821747, + "loss": 0.0835, + "step": 34528 + }, + { + "epoch": 0.2997283009696096, + "grad_norm": 0.84375, + "learning_rate": 0.0015012021323622196, + "loss": 0.123, + "step": 34529 + }, + { + "epoch": 0.2997369814498138, + "grad_norm": 0.1416015625, + "learning_rate": 0.0015011754905035163, + "loss": 0.1025, + "step": 34530 + }, + { + "epoch": 0.29974566193001795, + "grad_norm": 0.326171875, + "learning_rate": 0.0015011488482060936, + "loss": 0.0938, + "step": 34531 + }, + { + "epoch": 0.29975434241022214, + "grad_norm": 0.2265625, + "learning_rate": 0.0015011222054699804, + "loss": 0.0986, + "step": 34532 + }, + { + "epoch": 0.2997630228904263, + "grad_norm": 0.1259765625, + "learning_rate": 0.0015010955622952059, + "loss": 0.0723, + "step": 34533 + }, + { + "epoch": 0.29977170337063047, + "grad_norm": 0.67578125, + "learning_rate": 0.0015010689186817994, + "loss": 0.1216, + "step": 34534 + }, + { + "epoch": 0.2997803838508346, + "grad_norm": 0.185546875, + "learning_rate": 0.00150104227462979, + "loss": 0.1055, + "step": 34535 + }, + { + "epoch": 0.2997890643310388, + "grad_norm": 0.26953125, + "learning_rate": 0.0015010156301392067, + "loss": 0.1367, + "step": 34536 + }, + { + "epoch": 0.29979774481124294, + "grad_norm": 0.2314453125, + "learning_rate": 0.0015009889852100789, + "loss": 0.103, + "step": 34537 + }, + { + "epoch": 0.29980642529144713, + "grad_norm": 0.08837890625, + "learning_rate": 0.0015009623398424354, + "loss": 0.0938, + "step": 34538 + }, + { + "epoch": 0.29981510577165127, 
+ "grad_norm": 0.55859375, + "learning_rate": 0.0015009356940363055, + "loss": 0.0918, + "step": 34539 + }, + { + "epoch": 0.29982378625185546, + "grad_norm": 0.138671875, + "learning_rate": 0.0015009090477917184, + "loss": 0.1084, + "step": 34540 + }, + { + "epoch": 0.2998324667320596, + "grad_norm": 0.57421875, + "learning_rate": 0.001500882401108703, + "loss": 0.0918, + "step": 34541 + }, + { + "epoch": 0.2998411472122638, + "grad_norm": 0.58203125, + "learning_rate": 0.001500855753987289, + "loss": 0.1533, + "step": 34542 + }, + { + "epoch": 0.29984982769246793, + "grad_norm": 0.38671875, + "learning_rate": 0.001500829106427505, + "loss": 0.1016, + "step": 34543 + }, + { + "epoch": 0.2998585081726721, + "grad_norm": 1.53125, + "learning_rate": 0.0015008024584293803, + "loss": 0.1357, + "step": 34544 + }, + { + "epoch": 0.29986718865287626, + "grad_norm": 0.26171875, + "learning_rate": 0.0015007758099929442, + "loss": 0.0962, + "step": 34545 + }, + { + "epoch": 0.29987586913308045, + "grad_norm": 0.765625, + "learning_rate": 0.0015007491611182255, + "loss": 0.1123, + "step": 34546 + }, + { + "epoch": 0.2998845496132846, + "grad_norm": 0.267578125, + "learning_rate": 0.0015007225118052537, + "loss": 0.0923, + "step": 34547 + }, + { + "epoch": 0.2998932300934888, + "grad_norm": 0.1455078125, + "learning_rate": 0.0015006958620540573, + "loss": 0.126, + "step": 34548 + }, + { + "epoch": 0.2999019105736929, + "grad_norm": 0.197265625, + "learning_rate": 0.0015006692118646665, + "loss": 0.1084, + "step": 34549 + }, + { + "epoch": 0.2999105910538971, + "grad_norm": 0.095703125, + "learning_rate": 0.0015006425612371095, + "loss": 0.0972, + "step": 34550 + }, + { + "epoch": 0.29991927153410125, + "grad_norm": 0.361328125, + "learning_rate": 0.0015006159101714162, + "loss": 0.1172, + "step": 34551 + }, + { + "epoch": 0.29992795201430544, + "grad_norm": 0.4765625, + "learning_rate": 0.0015005892586676152, + "loss": 0.1069, + "step": 34552 + }, + { + "epoch": 0.2999366324945096, + "grad_norm": 0.27734375, + "learning_rate": 0.0015005626067257356, + "loss": 0.1035, + "step": 34553 + }, + { + "epoch": 0.2999453129747138, + "grad_norm": 0.2021484375, + "learning_rate": 0.001500535954345807, + "loss": 0.1094, + "step": 34554 + }, + { + "epoch": 0.2999539934549179, + "grad_norm": 0.169921875, + "learning_rate": 0.0015005093015278582, + "loss": 0.1094, + "step": 34555 + }, + { + "epoch": 0.2999626739351221, + "grad_norm": 0.376953125, + "learning_rate": 0.0015004826482719186, + "loss": 0.1152, + "step": 34556 + }, + { + "epoch": 0.29997135441532624, + "grad_norm": 0.134765625, + "learning_rate": 0.0015004559945780169, + "loss": 0.0752, + "step": 34557 + }, + { + "epoch": 0.29998003489553043, + "grad_norm": 0.09716796875, + "learning_rate": 0.0015004293404461827, + "loss": 0.1514, + "step": 34558 + }, + { + "epoch": 0.29998871537573457, + "grad_norm": 0.5625, + "learning_rate": 0.0015004026858764447, + "loss": 0.0859, + "step": 34559 + }, + { + "epoch": 0.29999739585593876, + "grad_norm": 0.54296875, + "learning_rate": 0.001500376030868833, + "loss": 0.0767, + "step": 34560 + }, + { + "epoch": 0.3000060763361429, + "grad_norm": 0.26953125, + "learning_rate": 0.0015003493754233758, + "loss": 0.0903, + "step": 34561 + }, + { + "epoch": 0.3000147568163471, + "grad_norm": 0.2734375, + "learning_rate": 0.0015003227195401023, + "loss": 0.123, + "step": 34562 + }, + { + "epoch": 0.30002343729655123, + "grad_norm": 0.39453125, + "learning_rate": 0.001500296063219042, + "loss": 0.1416, + "step": 34563 + }, + { + 
"epoch": 0.3000321177767554, + "grad_norm": 0.404296875, + "learning_rate": 0.001500269406460224, + "loss": 0.0874, + "step": 34564 + }, + { + "epoch": 0.30004079825695956, + "grad_norm": 0.251953125, + "learning_rate": 0.0015002427492636773, + "loss": 0.1152, + "step": 34565 + }, + { + "epoch": 0.30004947873716376, + "grad_norm": 0.0830078125, + "learning_rate": 0.0015002160916294312, + "loss": 0.0889, + "step": 34566 + }, + { + "epoch": 0.3000581592173679, + "grad_norm": 0.25, + "learning_rate": 0.001500189433557515, + "loss": 0.0967, + "step": 34567 + }, + { + "epoch": 0.3000668396975721, + "grad_norm": 0.333984375, + "learning_rate": 0.0015001627750479572, + "loss": 0.2324, + "step": 34568 + }, + { + "epoch": 0.3000755201777762, + "grad_norm": 0.20703125, + "learning_rate": 0.0015001361161007876, + "loss": 0.1021, + "step": 34569 + }, + { + "epoch": 0.3000842006579804, + "grad_norm": 0.421875, + "learning_rate": 0.0015001094567160353, + "loss": 0.1582, + "step": 34570 + }, + { + "epoch": 0.30009288113818455, + "grad_norm": 0.369140625, + "learning_rate": 0.0015000827968937294, + "loss": 0.083, + "step": 34571 + }, + { + "epoch": 0.30010156161838875, + "grad_norm": 0.62109375, + "learning_rate": 0.0015000561366338985, + "loss": 0.0869, + "step": 34572 + }, + { + "epoch": 0.3001102420985929, + "grad_norm": 0.138671875, + "learning_rate": 0.0015000294759365724, + "loss": 0.1797, + "step": 34573 + }, + { + "epoch": 0.3001189225787971, + "grad_norm": 0.126953125, + "learning_rate": 0.0015000028148017804, + "loss": 0.0869, + "step": 34574 + }, + { + "epoch": 0.3001276030590012, + "grad_norm": 0.306640625, + "learning_rate": 0.0014999761532295513, + "loss": 0.1084, + "step": 34575 + }, + { + "epoch": 0.3001362835392054, + "grad_norm": 0.36328125, + "learning_rate": 0.001499949491219914, + "loss": 0.123, + "step": 34576 + }, + { + "epoch": 0.30014496401940954, + "grad_norm": 0.53125, + "learning_rate": 0.0014999228287728979, + "loss": 0.1167, + "step": 34577 + }, + { + "epoch": 0.30015364449961374, + "grad_norm": 0.28515625, + "learning_rate": 0.0014998961658885324, + "loss": 0.1182, + "step": 34578 + }, + { + "epoch": 0.3001623249798179, + "grad_norm": 0.12890625, + "learning_rate": 0.0014998695025668462, + "loss": 0.0972, + "step": 34579 + }, + { + "epoch": 0.30017100546002207, + "grad_norm": 0.189453125, + "learning_rate": 0.0014998428388078688, + "loss": 0.0698, + "step": 34580 + }, + { + "epoch": 0.3001796859402262, + "grad_norm": 0.138671875, + "learning_rate": 0.0014998161746116296, + "loss": 0.1211, + "step": 34581 + }, + { + "epoch": 0.3001883664204304, + "grad_norm": 0.12451171875, + "learning_rate": 0.0014997895099781574, + "loss": 0.106, + "step": 34582 + }, + { + "epoch": 0.30019704690063453, + "grad_norm": 0.42578125, + "learning_rate": 0.0014997628449074812, + "loss": 0.0981, + "step": 34583 + }, + { + "epoch": 0.3002057273808387, + "grad_norm": 0.1328125, + "learning_rate": 0.0014997361793996306, + "loss": 0.1108, + "step": 34584 + }, + { + "epoch": 0.30021440786104286, + "grad_norm": 0.41015625, + "learning_rate": 0.0014997095134546342, + "loss": 0.1602, + "step": 34585 + }, + { + "epoch": 0.30022308834124706, + "grad_norm": 0.1484375, + "learning_rate": 0.0014996828470725217, + "loss": 0.0991, + "step": 34586 + }, + { + "epoch": 0.3002317688214512, + "grad_norm": 0.1279296875, + "learning_rate": 0.0014996561802533216, + "loss": 0.0986, + "step": 34587 + }, + { + "epoch": 0.3002404493016554, + "grad_norm": 0.451171875, + "learning_rate": 0.001499629512997064, + "loss": 0.1206, 
+ "step": 34588 + }, + { + "epoch": 0.3002491297818595, + "grad_norm": 0.126953125, + "learning_rate": 0.0014996028453037773, + "loss": 0.1279, + "step": 34589 + }, + { + "epoch": 0.3002578102620637, + "grad_norm": 0.2890625, + "learning_rate": 0.001499576177173491, + "loss": 0.0898, + "step": 34590 + }, + { + "epoch": 0.30026649074226786, + "grad_norm": 0.220703125, + "learning_rate": 0.0014995495086062344, + "loss": 0.0771, + "step": 34591 + }, + { + "epoch": 0.30027517122247205, + "grad_norm": 0.1591796875, + "learning_rate": 0.0014995228396020364, + "loss": 0.1123, + "step": 34592 + }, + { + "epoch": 0.3002838517026762, + "grad_norm": 0.08056640625, + "learning_rate": 0.001499496170160926, + "loss": 0.0908, + "step": 34593 + }, + { + "epoch": 0.3002925321828804, + "grad_norm": 0.337890625, + "learning_rate": 0.001499469500282933, + "loss": 0.1123, + "step": 34594 + }, + { + "epoch": 0.3003012126630845, + "grad_norm": 0.279296875, + "learning_rate": 0.0014994428299680857, + "loss": 0.1367, + "step": 34595 + }, + { + "epoch": 0.3003098931432887, + "grad_norm": 0.09130859375, + "learning_rate": 0.0014994161592164137, + "loss": 0.0854, + "step": 34596 + }, + { + "epoch": 0.30031857362349285, + "grad_norm": 0.419921875, + "learning_rate": 0.0014993894880279467, + "loss": 0.1064, + "step": 34597 + }, + { + "epoch": 0.30032725410369704, + "grad_norm": 0.248046875, + "learning_rate": 0.001499362816402713, + "loss": 0.0698, + "step": 34598 + }, + { + "epoch": 0.3003359345839012, + "grad_norm": 0.29296875, + "learning_rate": 0.001499336144340742, + "loss": 0.0825, + "step": 34599 + }, + { + "epoch": 0.30034461506410537, + "grad_norm": 0.6875, + "learning_rate": 0.0014993094718420631, + "loss": 0.127, + "step": 34600 + }, + { + "epoch": 0.3003532955443095, + "grad_norm": 0.625, + "learning_rate": 0.0014992827989067058, + "loss": 0.0923, + "step": 34601 + }, + { + "epoch": 0.3003619760245137, + "grad_norm": 0.3359375, + "learning_rate": 0.0014992561255346981, + "loss": 0.0869, + "step": 34602 + }, + { + "epoch": 0.30037065650471784, + "grad_norm": 0.51171875, + "learning_rate": 0.0014992294517260705, + "loss": 0.126, + "step": 34603 + }, + { + "epoch": 0.30037933698492203, + "grad_norm": 0.609375, + "learning_rate": 0.0014992027774808512, + "loss": 0.1045, + "step": 34604 + }, + { + "epoch": 0.30038801746512617, + "grad_norm": 0.578125, + "learning_rate": 0.0014991761027990699, + "loss": 0.1357, + "step": 34605 + }, + { + "epoch": 0.30039669794533036, + "grad_norm": 0.2177734375, + "learning_rate": 0.0014991494276807556, + "loss": 0.0781, + "step": 34606 + }, + { + "epoch": 0.3004053784255345, + "grad_norm": 0.17578125, + "learning_rate": 0.0014991227521259374, + "loss": 0.084, + "step": 34607 + }, + { + "epoch": 0.3004140589057387, + "grad_norm": 0.177734375, + "learning_rate": 0.0014990960761346447, + "loss": 0.0825, + "step": 34608 + }, + { + "epoch": 0.3004227393859428, + "grad_norm": 0.71484375, + "learning_rate": 0.0014990693997069065, + "loss": 0.0894, + "step": 34609 + }, + { + "epoch": 0.300431419866147, + "grad_norm": 0.1337890625, + "learning_rate": 0.001499042722842752, + "loss": 0.1084, + "step": 34610 + }, + { + "epoch": 0.30044010034635116, + "grad_norm": 0.21484375, + "learning_rate": 0.0014990160455422101, + "loss": 0.1221, + "step": 34611 + }, + { + "epoch": 0.30044878082655535, + "grad_norm": 0.130859375, + "learning_rate": 0.0014989893678053106, + "loss": 0.1074, + "step": 34612 + }, + { + "epoch": 0.3004574613067595, + "grad_norm": 0.52734375, + "learning_rate": 
0.0014989626896320823, + "loss": 0.127, + "step": 34613 + }, + { + "epoch": 0.3004661417869637, + "grad_norm": 0.18359375, + "learning_rate": 0.0014989360110225544, + "loss": 0.1387, + "step": 34614 + }, + { + "epoch": 0.3004748222671678, + "grad_norm": 0.39453125, + "learning_rate": 0.0014989093319767556, + "loss": 0.0669, + "step": 34615 + }, + { + "epoch": 0.300483502747372, + "grad_norm": 1.6171875, + "learning_rate": 0.0014988826524947158, + "loss": 0.0977, + "step": 34616 + }, + { + "epoch": 0.30049218322757615, + "grad_norm": 0.3125, + "learning_rate": 0.0014988559725764644, + "loss": 0.0791, + "step": 34617 + }, + { + "epoch": 0.30050086370778034, + "grad_norm": 0.97265625, + "learning_rate": 0.0014988292922220295, + "loss": 0.1572, + "step": 34618 + }, + { + "epoch": 0.3005095441879845, + "grad_norm": 0.2294921875, + "learning_rate": 0.001498802611431441, + "loss": 0.0977, + "step": 34619 + }, + { + "epoch": 0.30051822466818867, + "grad_norm": 1.421875, + "learning_rate": 0.001498775930204728, + "loss": 0.1309, + "step": 34620 + }, + { + "epoch": 0.3005269051483928, + "grad_norm": 0.1884765625, + "learning_rate": 0.00149874924854192, + "loss": 0.1221, + "step": 34621 + }, + { + "epoch": 0.30053558562859695, + "grad_norm": 0.345703125, + "learning_rate": 0.0014987225664430455, + "loss": 0.1221, + "step": 34622 + }, + { + "epoch": 0.30054426610880114, + "grad_norm": 0.70703125, + "learning_rate": 0.0014986958839081342, + "loss": 0.1074, + "step": 34623 + }, + { + "epoch": 0.3005529465890053, + "grad_norm": 0.455078125, + "learning_rate": 0.0014986692009372148, + "loss": 0.1025, + "step": 34624 + }, + { + "epoch": 0.30056162706920947, + "grad_norm": 0.3125, + "learning_rate": 0.0014986425175303167, + "loss": 0.1162, + "step": 34625 + }, + { + "epoch": 0.3005703075494136, + "grad_norm": 0.09375, + "learning_rate": 0.0014986158336874691, + "loss": 0.1104, + "step": 34626 + }, + { + "epoch": 0.3005789880296178, + "grad_norm": 0.26171875, + "learning_rate": 0.0014985891494087016, + "loss": 0.1079, + "step": 34627 + }, + { + "epoch": 0.30058766850982194, + "grad_norm": 0.6171875, + "learning_rate": 0.0014985624646940425, + "loss": 0.0942, + "step": 34628 + }, + { + "epoch": 0.30059634899002613, + "grad_norm": 0.7265625, + "learning_rate": 0.001498535779543522, + "loss": 0.1572, + "step": 34629 + }, + { + "epoch": 0.30060502947023027, + "grad_norm": 0.81640625, + "learning_rate": 0.0014985090939571683, + "loss": 0.1079, + "step": 34630 + }, + { + "epoch": 0.30061370995043446, + "grad_norm": 0.2197265625, + "learning_rate": 0.0014984824079350117, + "loss": 0.0972, + "step": 34631 + }, + { + "epoch": 0.3006223904306386, + "grad_norm": 0.142578125, + "learning_rate": 0.00149845572147708, + "loss": 0.0835, + "step": 34632 + }, + { + "epoch": 0.3006310709108428, + "grad_norm": 0.2275390625, + "learning_rate": 0.0014984290345834035, + "loss": 0.0991, + "step": 34633 + }, + { + "epoch": 0.3006397513910469, + "grad_norm": 0.091796875, + "learning_rate": 0.0014984023472540107, + "loss": 0.1055, + "step": 34634 + }, + { + "epoch": 0.3006484318712511, + "grad_norm": 0.55078125, + "learning_rate": 0.0014983756594889315, + "loss": 0.0933, + "step": 34635 + }, + { + "epoch": 0.30065711235145526, + "grad_norm": 0.7109375, + "learning_rate": 0.0014983489712881942, + "loss": 0.0781, + "step": 34636 + }, + { + "epoch": 0.30066579283165945, + "grad_norm": 0.40234375, + "learning_rate": 0.0014983222826518289, + "loss": 0.0967, + "step": 34637 + }, + { + "epoch": 0.3006744733118636, + "grad_norm": 0.109375, + 
"learning_rate": 0.0014982955935798638, + "loss": 0.064, + "step": 34638 + }, + { + "epoch": 0.3006831537920678, + "grad_norm": 0.69921875, + "learning_rate": 0.001498268904072329, + "loss": 0.1455, + "step": 34639 + }, + { + "epoch": 0.3006918342722719, + "grad_norm": 0.1533203125, + "learning_rate": 0.0014982422141292533, + "loss": 0.1045, + "step": 34640 + }, + { + "epoch": 0.3007005147524761, + "grad_norm": 0.5546875, + "learning_rate": 0.0014982155237506659, + "loss": 0.1113, + "step": 34641 + }, + { + "epoch": 0.30070919523268025, + "grad_norm": 0.337890625, + "learning_rate": 0.0014981888329365961, + "loss": 0.0947, + "step": 34642 + }, + { + "epoch": 0.30071787571288444, + "grad_norm": 0.140625, + "learning_rate": 0.0014981621416870726, + "loss": 0.084, + "step": 34643 + }, + { + "epoch": 0.3007265561930886, + "grad_norm": 0.1884765625, + "learning_rate": 0.0014981354500021252, + "loss": 0.126, + "step": 34644 + }, + { + "epoch": 0.30073523667329277, + "grad_norm": 0.08984375, + "learning_rate": 0.0014981087578817827, + "loss": 0.0762, + "step": 34645 + }, + { + "epoch": 0.3007439171534969, + "grad_norm": 0.087890625, + "learning_rate": 0.0014980820653260746, + "loss": 0.1172, + "step": 34646 + }, + { + "epoch": 0.3007525976337011, + "grad_norm": 0.09912109375, + "learning_rate": 0.0014980553723350299, + "loss": 0.0776, + "step": 34647 + }, + { + "epoch": 0.30076127811390524, + "grad_norm": 0.447265625, + "learning_rate": 0.001498028678908678, + "loss": 0.082, + "step": 34648 + }, + { + "epoch": 0.30076995859410943, + "grad_norm": 0.302734375, + "learning_rate": 0.0014980019850470477, + "loss": 0.1357, + "step": 34649 + }, + { + "epoch": 0.30077863907431357, + "grad_norm": 0.291015625, + "learning_rate": 0.0014979752907501684, + "loss": 0.1494, + "step": 34650 + }, + { + "epoch": 0.30078731955451776, + "grad_norm": 0.302734375, + "learning_rate": 0.0014979485960180693, + "loss": 0.1079, + "step": 34651 + }, + { + "epoch": 0.3007960000347219, + "grad_norm": 0.12109375, + "learning_rate": 0.0014979219008507798, + "loss": 0.1504, + "step": 34652 + }, + { + "epoch": 0.3008046805149261, + "grad_norm": 0.91796875, + "learning_rate": 0.0014978952052483285, + "loss": 0.1191, + "step": 34653 + }, + { + "epoch": 0.30081336099513023, + "grad_norm": 0.203125, + "learning_rate": 0.001497868509210745, + "loss": 0.0898, + "step": 34654 + }, + { + "epoch": 0.3008220414753344, + "grad_norm": 0.37109375, + "learning_rate": 0.001497841812738059, + "loss": 0.1328, + "step": 34655 + }, + { + "epoch": 0.30083072195553856, + "grad_norm": 0.349609375, + "learning_rate": 0.001497815115830299, + "loss": 0.0728, + "step": 34656 + }, + { + "epoch": 0.30083940243574275, + "grad_norm": 0.294921875, + "learning_rate": 0.0014977884184874941, + "loss": 0.0898, + "step": 34657 + }, + { + "epoch": 0.3008480829159469, + "grad_norm": 0.6953125, + "learning_rate": 0.0014977617207096738, + "loss": 0.0845, + "step": 34658 + }, + { + "epoch": 0.3008567633961511, + "grad_norm": 0.11328125, + "learning_rate": 0.0014977350224968675, + "loss": 0.0879, + "step": 34659 + }, + { + "epoch": 0.3008654438763552, + "grad_norm": 0.07421875, + "learning_rate": 0.0014977083238491042, + "loss": 0.0781, + "step": 34660 + }, + { + "epoch": 0.3008741243565594, + "grad_norm": 0.0908203125, + "learning_rate": 0.001497681624766413, + "loss": 0.1279, + "step": 34661 + }, + { + "epoch": 0.30088280483676355, + "grad_norm": 0.40234375, + "learning_rate": 0.0014976549252488226, + "loss": 0.1777, + "step": 34662 + }, + { + "epoch": 
0.30089148531696774, + "grad_norm": 0.326171875, + "learning_rate": 0.0014976282252963631, + "loss": 0.1406, + "step": 34663 + }, + { + "epoch": 0.3009001657971719, + "grad_norm": 0.224609375, + "learning_rate": 0.0014976015249090636, + "loss": 0.085, + "step": 34664 + }, + { + "epoch": 0.3009088462773761, + "grad_norm": 0.302734375, + "learning_rate": 0.0014975748240869528, + "loss": 0.0967, + "step": 34665 + }, + { + "epoch": 0.3009175267575802, + "grad_norm": 0.115234375, + "learning_rate": 0.0014975481228300602, + "loss": 0.1123, + "step": 34666 + }, + { + "epoch": 0.3009262072377844, + "grad_norm": 0.78515625, + "learning_rate": 0.0014975214211384149, + "loss": 0.1201, + "step": 34667 + }, + { + "epoch": 0.30093488771798854, + "grad_norm": 0.318359375, + "learning_rate": 0.0014974947190120463, + "loss": 0.1309, + "step": 34668 + }, + { + "epoch": 0.30094356819819273, + "grad_norm": 0.390625, + "learning_rate": 0.0014974680164509832, + "loss": 0.0986, + "step": 34669 + }, + { + "epoch": 0.30095224867839687, + "grad_norm": 0.443359375, + "learning_rate": 0.0014974413134552554, + "loss": 0.1152, + "step": 34670 + }, + { + "epoch": 0.30096092915860106, + "grad_norm": 0.06298828125, + "learning_rate": 0.0014974146100248916, + "loss": 0.0757, + "step": 34671 + }, + { + "epoch": 0.3009696096388052, + "grad_norm": 0.412109375, + "learning_rate": 0.001497387906159921, + "loss": 0.0972, + "step": 34672 + }, + { + "epoch": 0.3009782901190094, + "grad_norm": 0.23046875, + "learning_rate": 0.0014973612018603728, + "loss": 0.0781, + "step": 34673 + }, + { + "epoch": 0.30098697059921353, + "grad_norm": 0.1884765625, + "learning_rate": 0.0014973344971262765, + "loss": 0.0962, + "step": 34674 + }, + { + "epoch": 0.3009956510794177, + "grad_norm": 0.462890625, + "learning_rate": 0.0014973077919576614, + "loss": 0.1152, + "step": 34675 + }, + { + "epoch": 0.30100433155962186, + "grad_norm": 0.166015625, + "learning_rate": 0.0014972810863545564, + "loss": 0.0796, + "step": 34676 + }, + { + "epoch": 0.30101301203982606, + "grad_norm": 0.1982421875, + "learning_rate": 0.0014972543803169907, + "loss": 0.0962, + "step": 34677 + }, + { + "epoch": 0.3010216925200302, + "grad_norm": 0.390625, + "learning_rate": 0.001497227673844994, + "loss": 0.0771, + "step": 34678 + }, + { + "epoch": 0.3010303730002344, + "grad_norm": 0.267578125, + "learning_rate": 0.0014972009669385945, + "loss": 0.0908, + "step": 34679 + }, + { + "epoch": 0.3010390534804385, + "grad_norm": 0.83203125, + "learning_rate": 0.0014971742595978222, + "loss": 0.1426, + "step": 34680 + }, + { + "epoch": 0.3010477339606427, + "grad_norm": 0.061767578125, + "learning_rate": 0.0014971475518227057, + "loss": 0.0723, + "step": 34681 + }, + { + "epoch": 0.30105641444084685, + "grad_norm": 0.0927734375, + "learning_rate": 0.001497120843613275, + "loss": 0.1045, + "step": 34682 + }, + { + "epoch": 0.30106509492105105, + "grad_norm": 0.40625, + "learning_rate": 0.001497094134969559, + "loss": 0.1113, + "step": 34683 + }, + { + "epoch": 0.3010737754012552, + "grad_norm": 0.51171875, + "learning_rate": 0.0014970674258915867, + "loss": 0.1357, + "step": 34684 + }, + { + "epoch": 0.3010824558814594, + "grad_norm": 0.59765625, + "learning_rate": 0.0014970407163793873, + "loss": 0.1206, + "step": 34685 + }, + { + "epoch": 0.3010911363616635, + "grad_norm": 0.1669921875, + "learning_rate": 0.0014970140064329905, + "loss": 0.0889, + "step": 34686 + }, + { + "epoch": 0.3010998168418677, + "grad_norm": 0.119140625, + "learning_rate": 0.0014969872960524252, + "loss": 
0.0747, + "step": 34687 + }, + { + "epoch": 0.30110849732207184, + "grad_norm": 0.185546875, + "learning_rate": 0.0014969605852377197, + "loss": 0.1152, + "step": 34688 + }, + { + "epoch": 0.30111717780227604, + "grad_norm": 0.3359375, + "learning_rate": 0.0014969338739889049, + "loss": 0.0776, + "step": 34689 + }, + { + "epoch": 0.3011258582824802, + "grad_norm": 0.162109375, + "learning_rate": 0.001496907162306009, + "loss": 0.1099, + "step": 34690 + }, + { + "epoch": 0.30113453876268437, + "grad_norm": 0.25, + "learning_rate": 0.0014968804501890613, + "loss": 0.1133, + "step": 34691 + }, + { + "epoch": 0.3011432192428885, + "grad_norm": 0.451171875, + "learning_rate": 0.001496853737638091, + "loss": 0.0894, + "step": 34692 + }, + { + "epoch": 0.3011518997230927, + "grad_norm": 0.87890625, + "learning_rate": 0.0014968270246531272, + "loss": 0.1045, + "step": 34693 + }, + { + "epoch": 0.30116058020329683, + "grad_norm": 0.109375, + "learning_rate": 0.0014968003112342, + "loss": 0.1025, + "step": 34694 + }, + { + "epoch": 0.301169260683501, + "grad_norm": 0.208984375, + "learning_rate": 0.0014967735973813376, + "loss": 0.0884, + "step": 34695 + }, + { + "epoch": 0.30117794116370517, + "grad_norm": 3.65625, + "learning_rate": 0.0014967468830945697, + "loss": 0.3359, + "step": 34696 + }, + { + "epoch": 0.30118662164390936, + "grad_norm": 0.3203125, + "learning_rate": 0.001496720168373925, + "loss": 0.127, + "step": 34697 + }, + { + "epoch": 0.3011953021241135, + "grad_norm": 0.11962890625, + "learning_rate": 0.0014966934532194334, + "loss": 0.1172, + "step": 34698 + }, + { + "epoch": 0.3012039826043177, + "grad_norm": 0.1591796875, + "learning_rate": 0.0014966667376311237, + "loss": 0.0854, + "step": 34699 + }, + { + "epoch": 0.3012126630845218, + "grad_norm": 0.28515625, + "learning_rate": 0.0014966400216090252, + "loss": 0.1045, + "step": 34700 + }, + { + "epoch": 0.301221343564726, + "grad_norm": 0.2451171875, + "learning_rate": 0.0014966133051531672, + "loss": 0.127, + "step": 34701 + }, + { + "epoch": 0.30123002404493016, + "grad_norm": 0.224609375, + "learning_rate": 0.0014965865882635787, + "loss": 0.0752, + "step": 34702 + }, + { + "epoch": 0.30123870452513435, + "grad_norm": 0.1748046875, + "learning_rate": 0.0014965598709402892, + "loss": 0.125, + "step": 34703 + }, + { + "epoch": 0.3012473850053385, + "grad_norm": 0.423828125, + "learning_rate": 0.001496533153183328, + "loss": 0.0957, + "step": 34704 + }, + { + "epoch": 0.3012560654855427, + "grad_norm": 0.421875, + "learning_rate": 0.0014965064349927238, + "loss": 0.0791, + "step": 34705 + }, + { + "epoch": 0.3012647459657468, + "grad_norm": 0.1865234375, + "learning_rate": 0.001496479716368506, + "loss": 0.127, + "step": 34706 + }, + { + "epoch": 0.301273426445951, + "grad_norm": 0.41796875, + "learning_rate": 0.0014964529973107042, + "loss": 0.1011, + "step": 34707 + }, + { + "epoch": 0.30128210692615515, + "grad_norm": 0.431640625, + "learning_rate": 0.0014964262778193473, + "loss": 0.1543, + "step": 34708 + }, + { + "epoch": 0.30129078740635934, + "grad_norm": 1.0234375, + "learning_rate": 0.0014963995578944646, + "loss": 0.1235, + "step": 34709 + }, + { + "epoch": 0.3012994678865635, + "grad_norm": 0.2353515625, + "learning_rate": 0.001496372837536085, + "loss": 0.1387, + "step": 34710 + }, + { + "epoch": 0.30130814836676767, + "grad_norm": 0.21875, + "learning_rate": 0.0014963461167442386, + "loss": 0.1191, + "step": 34711 + }, + { + "epoch": 0.3013168288469718, + "grad_norm": 0.421875, + "learning_rate": 
0.0014963193955189534, + "loss": 0.0767, + "step": 34712 + }, + { + "epoch": 0.301325509327176, + "grad_norm": 0.546875, + "learning_rate": 0.0014962926738602597, + "loss": 0.1562, + "step": 34713 + }, + { + "epoch": 0.30133418980738014, + "grad_norm": 0.671875, + "learning_rate": 0.001496265951768186, + "loss": 0.1006, + "step": 34714 + }, + { + "epoch": 0.30134287028758433, + "grad_norm": 0.0947265625, + "learning_rate": 0.001496239229242762, + "loss": 0.0977, + "step": 34715 + }, + { + "epoch": 0.30135155076778847, + "grad_norm": 0.248046875, + "learning_rate": 0.001496212506284017, + "loss": 0.083, + "step": 34716 + }, + { + "epoch": 0.30136023124799266, + "grad_norm": 0.1884765625, + "learning_rate": 0.0014961857828919797, + "loss": 0.1055, + "step": 34717 + }, + { + "epoch": 0.3013689117281968, + "grad_norm": 0.103515625, + "learning_rate": 0.0014961590590666797, + "loss": 0.1367, + "step": 34718 + }, + { + "epoch": 0.301377592208401, + "grad_norm": 0.255859375, + "learning_rate": 0.0014961323348081457, + "loss": 0.1211, + "step": 34719 + }, + { + "epoch": 0.30138627268860513, + "grad_norm": 0.3828125, + "learning_rate": 0.0014961056101164077, + "loss": 0.0684, + "step": 34720 + }, + { + "epoch": 0.3013949531688093, + "grad_norm": 0.75, + "learning_rate": 0.0014960788849914942, + "loss": 0.1357, + "step": 34721 + }, + { + "epoch": 0.30140363364901346, + "grad_norm": 0.3984375, + "learning_rate": 0.001496052159433435, + "loss": 0.1001, + "step": 34722 + }, + { + "epoch": 0.30141231412921765, + "grad_norm": 0.3046875, + "learning_rate": 0.0014960254334422589, + "loss": 0.0601, + "step": 34723 + }, + { + "epoch": 0.3014209946094218, + "grad_norm": 0.29296875, + "learning_rate": 0.0014959987070179957, + "loss": 0.104, + "step": 34724 + }, + { + "epoch": 0.301429675089626, + "grad_norm": 0.314453125, + "learning_rate": 0.0014959719801606742, + "loss": 0.0732, + "step": 34725 + }, + { + "epoch": 0.3014383555698301, + "grad_norm": 0.267578125, + "learning_rate": 0.0014959452528703237, + "loss": 0.1465, + "step": 34726 + }, + { + "epoch": 0.3014470360500343, + "grad_norm": 0.58984375, + "learning_rate": 0.0014959185251469734, + "loss": 0.127, + "step": 34727 + }, + { + "epoch": 0.30145571653023845, + "grad_norm": 0.248046875, + "learning_rate": 0.0014958917969906524, + "loss": 0.1436, + "step": 34728 + }, + { + "epoch": 0.30146439701044264, + "grad_norm": 0.138671875, + "learning_rate": 0.00149586506840139, + "loss": 0.0771, + "step": 34729 + }, + { + "epoch": 0.3014730774906468, + "grad_norm": 0.076171875, + "learning_rate": 0.001495838339379216, + "loss": 0.0713, + "step": 34730 + }, + { + "epoch": 0.30148175797085097, + "grad_norm": 0.45703125, + "learning_rate": 0.0014958116099241588, + "loss": 0.1309, + "step": 34731 + }, + { + "epoch": 0.3014904384510551, + "grad_norm": 0.08349609375, + "learning_rate": 0.0014957848800362477, + "loss": 0.084, + "step": 34732 + }, + { + "epoch": 0.3014991189312593, + "grad_norm": 0.11279296875, + "learning_rate": 0.0014957581497155126, + "loss": 0.0879, + "step": 34733 + }, + { + "epoch": 0.30150779941146344, + "grad_norm": 0.0947265625, + "learning_rate": 0.0014957314189619824, + "loss": 0.084, + "step": 34734 + }, + { + "epoch": 0.30151647989166763, + "grad_norm": 0.271484375, + "learning_rate": 0.001495704687775686, + "loss": 0.1182, + "step": 34735 + }, + { + "epoch": 0.30152516037187177, + "grad_norm": 0.482421875, + "learning_rate": 0.001495677956156653, + "loss": 0.0918, + "step": 34736 + }, + { + "epoch": 0.30153384085207596, + "grad_norm": 
0.83203125, + "learning_rate": 0.0014956512241049125, + "loss": 0.1074, + "step": 34737 + }, + { + "epoch": 0.3015425213322801, + "grad_norm": 0.83203125, + "learning_rate": 0.0014956244916204937, + "loss": 0.0645, + "step": 34738 + }, + { + "epoch": 0.3015512018124843, + "grad_norm": 0.435546875, + "learning_rate": 0.001495597758703426, + "loss": 0.0884, + "step": 34739 + }, + { + "epoch": 0.30155988229268843, + "grad_norm": 0.3984375, + "learning_rate": 0.0014955710253537386, + "loss": 0.1235, + "step": 34740 + }, + { + "epoch": 0.3015685627728926, + "grad_norm": 0.458984375, + "learning_rate": 0.0014955442915714602, + "loss": 0.0898, + "step": 34741 + }, + { + "epoch": 0.30157724325309676, + "grad_norm": 0.087890625, + "learning_rate": 0.0014955175573566214, + "loss": 0.0869, + "step": 34742 + }, + { + "epoch": 0.30158592373330095, + "grad_norm": 0.267578125, + "learning_rate": 0.0014954908227092498, + "loss": 0.1455, + "step": 34743 + }, + { + "epoch": 0.3015946042135051, + "grad_norm": 0.4609375, + "learning_rate": 0.001495464087629376, + "loss": 0.0986, + "step": 34744 + }, + { + "epoch": 0.3016032846937093, + "grad_norm": 0.7890625, + "learning_rate": 0.0014954373521170279, + "loss": 0.123, + "step": 34745 + }, + { + "epoch": 0.3016119651739134, + "grad_norm": 0.1748046875, + "learning_rate": 0.0014954106161722358, + "loss": 0.1123, + "step": 34746 + }, + { + "epoch": 0.30162064565411756, + "grad_norm": 1.609375, + "learning_rate": 0.0014953838797950288, + "loss": 0.0962, + "step": 34747 + }, + { + "epoch": 0.30162932613432175, + "grad_norm": 0.7890625, + "learning_rate": 0.0014953571429854357, + "loss": 0.1279, + "step": 34748 + }, + { + "epoch": 0.3016380066145259, + "grad_norm": 0.279296875, + "learning_rate": 0.0014953304057434857, + "loss": 0.0713, + "step": 34749 + }, + { + "epoch": 0.3016466870947301, + "grad_norm": 0.423828125, + "learning_rate": 0.0014953036680692084, + "loss": 0.0776, + "step": 34750 + }, + { + "epoch": 0.3016553675749342, + "grad_norm": 0.11474609375, + "learning_rate": 0.0014952769299626334, + "loss": 0.0811, + "step": 34751 + }, + { + "epoch": 0.3016640480551384, + "grad_norm": 0.6875, + "learning_rate": 0.0014952501914237892, + "loss": 0.1055, + "step": 34752 + }, + { + "epoch": 0.30167272853534255, + "grad_norm": 0.498046875, + "learning_rate": 0.0014952234524527052, + "loss": 0.0806, + "step": 34753 + }, + { + "epoch": 0.30168140901554674, + "grad_norm": 0.158203125, + "learning_rate": 0.001495196713049411, + "loss": 0.0801, + "step": 34754 + }, + { + "epoch": 0.3016900894957509, + "grad_norm": 0.173828125, + "learning_rate": 0.0014951699732139356, + "loss": 0.1069, + "step": 34755 + }, + { + "epoch": 0.3016987699759551, + "grad_norm": 0.09423828125, + "learning_rate": 0.001495143232946308, + "loss": 0.123, + "step": 34756 + }, + { + "epoch": 0.3017074504561592, + "grad_norm": 0.1982421875, + "learning_rate": 0.0014951164922465583, + "loss": 0.0767, + "step": 34757 + }, + { + "epoch": 0.3017161309363634, + "grad_norm": 0.4765625, + "learning_rate": 0.0014950897511147143, + "loss": 0.125, + "step": 34758 + }, + { + "epoch": 0.30172481141656754, + "grad_norm": 0.4296875, + "learning_rate": 0.0014950630095508066, + "loss": 0.0903, + "step": 34759 + }, + { + "epoch": 0.30173349189677173, + "grad_norm": 0.1513671875, + "learning_rate": 0.0014950362675548638, + "loss": 0.1387, + "step": 34760 + }, + { + "epoch": 0.30174217237697587, + "grad_norm": 0.396484375, + "learning_rate": 0.0014950095251269153, + "loss": 0.1719, + "step": 34761 + }, + { + "epoch": 
0.30175085285718006, + "grad_norm": 0.2138671875, + "learning_rate": 0.0014949827822669905, + "loss": 0.1055, + "step": 34762 + }, + { + "epoch": 0.3017595333373842, + "grad_norm": 0.1748046875, + "learning_rate": 0.0014949560389751182, + "loss": 0.0908, + "step": 34763 + }, + { + "epoch": 0.3017682138175884, + "grad_norm": 0.16015625, + "learning_rate": 0.001494929295251328, + "loss": 0.0703, + "step": 34764 + }, + { + "epoch": 0.30177689429779253, + "grad_norm": 0.1591796875, + "learning_rate": 0.0014949025510956494, + "loss": 0.1055, + "step": 34765 + }, + { + "epoch": 0.3017855747779967, + "grad_norm": 0.23828125, + "learning_rate": 0.0014948758065081106, + "loss": 0.0986, + "step": 34766 + }, + { + "epoch": 0.30179425525820086, + "grad_norm": 0.12060546875, + "learning_rate": 0.0014948490614887422, + "loss": 0.0986, + "step": 34767 + }, + { + "epoch": 0.30180293573840505, + "grad_norm": 0.5859375, + "learning_rate": 0.0014948223160375724, + "loss": 0.1011, + "step": 34768 + }, + { + "epoch": 0.3018116162186092, + "grad_norm": 0.232421875, + "learning_rate": 0.0014947955701546307, + "loss": 0.1172, + "step": 34769 + }, + { + "epoch": 0.3018202966988134, + "grad_norm": 0.357421875, + "learning_rate": 0.0014947688238399468, + "loss": 0.1641, + "step": 34770 + }, + { + "epoch": 0.3018289771790175, + "grad_norm": 0.1083984375, + "learning_rate": 0.0014947420770935495, + "loss": 0.0884, + "step": 34771 + }, + { + "epoch": 0.3018376576592217, + "grad_norm": 0.306640625, + "learning_rate": 0.0014947153299154686, + "loss": 0.1152, + "step": 34772 + }, + { + "epoch": 0.30184633813942585, + "grad_norm": 0.302734375, + "learning_rate": 0.0014946885823057326, + "loss": 0.0825, + "step": 34773 + }, + { + "epoch": 0.30185501861963004, + "grad_norm": 0.263671875, + "learning_rate": 0.0014946618342643711, + "loss": 0.0781, + "step": 34774 + }, + { + "epoch": 0.3018636990998342, + "grad_norm": 0.09619140625, + "learning_rate": 0.0014946350857914135, + "loss": 0.1133, + "step": 34775 + }, + { + "epoch": 0.3018723795800384, + "grad_norm": 0.123046875, + "learning_rate": 0.0014946083368868887, + "loss": 0.1084, + "step": 34776 + }, + { + "epoch": 0.3018810600602425, + "grad_norm": 0.1904296875, + "learning_rate": 0.0014945815875508262, + "loss": 0.1201, + "step": 34777 + }, + { + "epoch": 0.3018897405404467, + "grad_norm": 0.259765625, + "learning_rate": 0.0014945548377832552, + "loss": 0.1113, + "step": 34778 + }, + { + "epoch": 0.30189842102065084, + "grad_norm": 1.4453125, + "learning_rate": 0.001494528087584205, + "loss": 0.1406, + "step": 34779 + }, + { + "epoch": 0.30190710150085504, + "grad_norm": 0.11962890625, + "learning_rate": 0.0014945013369537046, + "loss": 0.1069, + "step": 34780 + }, + { + "epoch": 0.3019157819810592, + "grad_norm": 0.267578125, + "learning_rate": 0.0014944745858917838, + "loss": 0.0986, + "step": 34781 + }, + { + "epoch": 0.30192446246126337, + "grad_norm": 0.19140625, + "learning_rate": 0.0014944478343984719, + "loss": 0.0801, + "step": 34782 + }, + { + "epoch": 0.3019331429414675, + "grad_norm": 0.357421875, + "learning_rate": 0.0014944210824737968, + "loss": 0.1025, + "step": 34783 + }, + { + "epoch": 0.3019418234216717, + "grad_norm": 0.2109375, + "learning_rate": 0.0014943943301177895, + "loss": 0.1602, + "step": 34784 + }, + { + "epoch": 0.30195050390187583, + "grad_norm": 0.34765625, + "learning_rate": 0.001494367577330478, + "loss": 0.1006, + "step": 34785 + }, + { + "epoch": 0.30195918438208, + "grad_norm": 0.875, + "learning_rate": 0.0014943408241118923, + 
"loss": 0.3633, + "step": 34786 + }, + { + "epoch": 0.30196786486228416, + "grad_norm": 0.263671875, + "learning_rate": 0.0014943140704620612, + "loss": 0.1387, + "step": 34787 + }, + { + "epoch": 0.30197654534248836, + "grad_norm": 0.1396484375, + "learning_rate": 0.0014942873163810144, + "loss": 0.0825, + "step": 34788 + }, + { + "epoch": 0.3019852258226925, + "grad_norm": 0.466796875, + "learning_rate": 0.0014942605618687805, + "loss": 0.0835, + "step": 34789 + }, + { + "epoch": 0.3019939063028967, + "grad_norm": 0.31640625, + "learning_rate": 0.0014942338069253898, + "loss": 0.085, + "step": 34790 + }, + { + "epoch": 0.3020025867831008, + "grad_norm": 0.34375, + "learning_rate": 0.0014942070515508706, + "loss": 0.0972, + "step": 34791 + }, + { + "epoch": 0.302011267263305, + "grad_norm": 0.44921875, + "learning_rate": 0.0014941802957452524, + "loss": 0.0947, + "step": 34792 + }, + { + "epoch": 0.30201994774350915, + "grad_norm": 0.28515625, + "learning_rate": 0.0014941535395085646, + "loss": 0.1045, + "step": 34793 + }, + { + "epoch": 0.30202862822371335, + "grad_norm": 0.78125, + "learning_rate": 0.0014941267828408368, + "loss": 0.1562, + "step": 34794 + }, + { + "epoch": 0.3020373087039175, + "grad_norm": 0.1953125, + "learning_rate": 0.0014941000257420974, + "loss": 0.1011, + "step": 34795 + }, + { + "epoch": 0.3020459891841217, + "grad_norm": 0.12353515625, + "learning_rate": 0.0014940732682123762, + "loss": 0.0791, + "step": 34796 + }, + { + "epoch": 0.3020546696643258, + "grad_norm": 0.2392578125, + "learning_rate": 0.0014940465102517024, + "loss": 0.0967, + "step": 34797 + }, + { + "epoch": 0.30206335014453, + "grad_norm": 0.1494140625, + "learning_rate": 0.001494019751860105, + "loss": 0.0835, + "step": 34798 + }, + { + "epoch": 0.30207203062473414, + "grad_norm": 0.28125, + "learning_rate": 0.0014939929930376141, + "loss": 0.082, + "step": 34799 + }, + { + "epoch": 0.30208071110493834, + "grad_norm": 0.10107421875, + "learning_rate": 0.001493966233784258, + "loss": 0.1187, + "step": 34800 + }, + { + "epoch": 0.3020893915851425, + "grad_norm": 0.27734375, + "learning_rate": 0.0014939394741000666, + "loss": 0.1133, + "step": 34801 + }, + { + "epoch": 0.30209807206534667, + "grad_norm": 0.1630859375, + "learning_rate": 0.001493912713985069, + "loss": 0.0854, + "step": 34802 + }, + { + "epoch": 0.3021067525455508, + "grad_norm": 0.408203125, + "learning_rate": 0.001493885953439294, + "loss": 0.1016, + "step": 34803 + }, + { + "epoch": 0.302115433025755, + "grad_norm": 0.11376953125, + "learning_rate": 0.0014938591924627714, + "loss": 0.0947, + "step": 34804 + }, + { + "epoch": 0.30212411350595914, + "grad_norm": 0.0576171875, + "learning_rate": 0.0014938324310555303, + "loss": 0.0771, + "step": 34805 + }, + { + "epoch": 0.30213279398616333, + "grad_norm": 0.244140625, + "learning_rate": 0.0014938056692176001, + "loss": 0.1191, + "step": 34806 + }, + { + "epoch": 0.30214147446636747, + "grad_norm": 0.1533203125, + "learning_rate": 0.0014937789069490097, + "loss": 0.0752, + "step": 34807 + }, + { + "epoch": 0.30215015494657166, + "grad_norm": 0.44921875, + "learning_rate": 0.0014937521442497886, + "loss": 0.127, + "step": 34808 + }, + { + "epoch": 0.3021588354267758, + "grad_norm": 0.11962890625, + "learning_rate": 0.001493725381119966, + "loss": 0.1357, + "step": 34809 + }, + { + "epoch": 0.30216751590698, + "grad_norm": 0.1787109375, + "learning_rate": 0.0014936986175595715, + "loss": 0.0972, + "step": 34810 + }, + { + "epoch": 0.3021761963871841, + "grad_norm": 0.404296875, + 
"learning_rate": 0.001493671853568634, + "loss": 0.1289, + "step": 34811 + }, + { + "epoch": 0.3021848768673883, + "grad_norm": 0.1826171875, + "learning_rate": 0.0014936450891471832, + "loss": 0.0977, + "step": 34812 + }, + { + "epoch": 0.30219355734759246, + "grad_norm": 0.09228515625, + "learning_rate": 0.001493618324295248, + "loss": 0.0659, + "step": 34813 + }, + { + "epoch": 0.30220223782779665, + "grad_norm": 0.578125, + "learning_rate": 0.0014935915590128571, + "loss": 0.0894, + "step": 34814 + }, + { + "epoch": 0.3022109183080008, + "grad_norm": 0.11474609375, + "learning_rate": 0.0014935647933000409, + "loss": 0.0977, + "step": 34815 + }, + { + "epoch": 0.302219598788205, + "grad_norm": 0.1640625, + "learning_rate": 0.001493538027156828, + "loss": 0.1113, + "step": 34816 + }, + { + "epoch": 0.3022282792684091, + "grad_norm": 0.466796875, + "learning_rate": 0.0014935112605832478, + "loss": 0.0825, + "step": 34817 + }, + { + "epoch": 0.3022369597486133, + "grad_norm": 0.12109375, + "learning_rate": 0.0014934844935793293, + "loss": 0.1084, + "step": 34818 + }, + { + "epoch": 0.30224564022881745, + "grad_norm": 0.515625, + "learning_rate": 0.0014934577261451025, + "loss": 0.1021, + "step": 34819 + }, + { + "epoch": 0.30225432070902164, + "grad_norm": 0.4765625, + "learning_rate": 0.0014934309582805962, + "loss": 0.1074, + "step": 34820 + }, + { + "epoch": 0.3022630011892258, + "grad_norm": 0.326171875, + "learning_rate": 0.0014934041899858398, + "loss": 0.0859, + "step": 34821 + }, + { + "epoch": 0.30227168166942997, + "grad_norm": 0.205078125, + "learning_rate": 0.001493377421260862, + "loss": 0.0908, + "step": 34822 + }, + { + "epoch": 0.3022803621496341, + "grad_norm": 0.302734375, + "learning_rate": 0.001493350652105693, + "loss": 0.1064, + "step": 34823 + }, + { + "epoch": 0.3022890426298383, + "grad_norm": 0.451171875, + "learning_rate": 0.0014933238825203617, + "loss": 0.1035, + "step": 34824 + }, + { + "epoch": 0.30229772311004244, + "grad_norm": 0.271484375, + "learning_rate": 0.0014932971125048973, + "loss": 0.1074, + "step": 34825 + }, + { + "epoch": 0.30230640359024663, + "grad_norm": 0.6796875, + "learning_rate": 0.0014932703420593287, + "loss": 0.084, + "step": 34826 + }, + { + "epoch": 0.30231508407045077, + "grad_norm": 0.2578125, + "learning_rate": 0.0014932435711836856, + "loss": 0.1182, + "step": 34827 + }, + { + "epoch": 0.30232376455065496, + "grad_norm": 0.0810546875, + "learning_rate": 0.0014932167998779975, + "loss": 0.0942, + "step": 34828 + }, + { + "epoch": 0.3023324450308591, + "grad_norm": 0.369140625, + "learning_rate": 0.0014931900281422938, + "loss": 0.1777, + "step": 34829 + }, + { + "epoch": 0.3023411255110633, + "grad_norm": 0.0908203125, + "learning_rate": 0.0014931632559766026, + "loss": 0.082, + "step": 34830 + }, + { + "epoch": 0.30234980599126743, + "grad_norm": 0.341796875, + "learning_rate": 0.0014931364833809543, + "loss": 0.1064, + "step": 34831 + }, + { + "epoch": 0.3023584864714716, + "grad_norm": 0.056884765625, + "learning_rate": 0.0014931097103553779, + "loss": 0.0718, + "step": 34832 + }, + { + "epoch": 0.30236716695167576, + "grad_norm": 0.326171875, + "learning_rate": 0.0014930829368999027, + "loss": 0.1025, + "step": 34833 + }, + { + "epoch": 0.30237584743187995, + "grad_norm": 0.36328125, + "learning_rate": 0.0014930561630145578, + "loss": 0.1064, + "step": 34834 + }, + { + "epoch": 0.3023845279120841, + "grad_norm": 0.412109375, + "learning_rate": 0.0014930293886993727, + "loss": 0.1006, + "step": 34835 + }, + { + "epoch": 
0.3023932083922883, + "grad_norm": 0.412109375, + "learning_rate": 0.0014930026139543765, + "loss": 0.1001, + "step": 34836 + }, + { + "epoch": 0.3024018888724924, + "grad_norm": 0.1123046875, + "learning_rate": 0.0014929758387795984, + "loss": 0.1611, + "step": 34837 + }, + { + "epoch": 0.3024105693526966, + "grad_norm": 0.349609375, + "learning_rate": 0.001492949063175068, + "loss": 0.0986, + "step": 34838 + }, + { + "epoch": 0.30241924983290075, + "grad_norm": 0.5703125, + "learning_rate": 0.0014929222871408142, + "loss": 0.0859, + "step": 34839 + }, + { + "epoch": 0.30242793031310494, + "grad_norm": 0.33203125, + "learning_rate": 0.0014928955106768666, + "loss": 0.1016, + "step": 34840 + }, + { + "epoch": 0.3024366107933091, + "grad_norm": 0.287109375, + "learning_rate": 0.0014928687337832545, + "loss": 0.0957, + "step": 34841 + }, + { + "epoch": 0.3024452912735133, + "grad_norm": 0.73828125, + "learning_rate": 0.0014928419564600069, + "loss": 0.1133, + "step": 34842 + }, + { + "epoch": 0.3024539717537174, + "grad_norm": 0.357421875, + "learning_rate": 0.0014928151787071532, + "loss": 0.0967, + "step": 34843 + }, + { + "epoch": 0.3024626522339216, + "grad_norm": 0.380859375, + "learning_rate": 0.001492788400524723, + "loss": 0.0996, + "step": 34844 + }, + { + "epoch": 0.30247133271412574, + "grad_norm": 0.09619140625, + "learning_rate": 0.001492761621912745, + "loss": 0.0918, + "step": 34845 + }, + { + "epoch": 0.30248001319432993, + "grad_norm": 0.11474609375, + "learning_rate": 0.001492734842871249, + "loss": 0.1123, + "step": 34846 + }, + { + "epoch": 0.30248869367453407, + "grad_norm": 0.462890625, + "learning_rate": 0.001492708063400264, + "loss": 0.0786, + "step": 34847 + }, + { + "epoch": 0.30249737415473826, + "grad_norm": 0.126953125, + "learning_rate": 0.0014926812834998191, + "loss": 0.1484, + "step": 34848 + }, + { + "epoch": 0.3025060546349424, + "grad_norm": 0.359375, + "learning_rate": 0.001492654503169944, + "loss": 0.0986, + "step": 34849 + }, + { + "epoch": 0.3025147351151466, + "grad_norm": 0.193359375, + "learning_rate": 0.0014926277224106682, + "loss": 0.1011, + "step": 34850 + }, + { + "epoch": 0.30252341559535073, + "grad_norm": 0.5, + "learning_rate": 0.0014926009412220204, + "loss": 0.1328, + "step": 34851 + }, + { + "epoch": 0.3025320960755549, + "grad_norm": 0.23828125, + "learning_rate": 0.00149257415960403, + "loss": 0.1279, + "step": 34852 + }, + { + "epoch": 0.30254077655575906, + "grad_norm": 0.125, + "learning_rate": 0.0014925473775567264, + "loss": 0.0962, + "step": 34853 + }, + { + "epoch": 0.30254945703596325, + "grad_norm": 0.33203125, + "learning_rate": 0.001492520595080139, + "loss": 0.0869, + "step": 34854 + }, + { + "epoch": 0.3025581375161674, + "grad_norm": 0.41796875, + "learning_rate": 0.001492493812174297, + "loss": 0.0811, + "step": 34855 + }, + { + "epoch": 0.3025668179963716, + "grad_norm": 0.22265625, + "learning_rate": 0.0014924670288392295, + "loss": 0.1006, + "step": 34856 + }, + { + "epoch": 0.3025754984765757, + "grad_norm": 0.2421875, + "learning_rate": 0.0014924402450749658, + "loss": 0.1299, + "step": 34857 + }, + { + "epoch": 0.3025841789567799, + "grad_norm": 0.337890625, + "learning_rate": 0.0014924134608815358, + "loss": 0.1279, + "step": 34858 + }, + { + "epoch": 0.30259285943698405, + "grad_norm": 0.5859375, + "learning_rate": 0.0014923866762589684, + "loss": 0.1133, + "step": 34859 + }, + { + "epoch": 0.30260153991718824, + "grad_norm": 0.240234375, + "learning_rate": 0.0014923598912072924, + "loss": 0.1167, + "step": 
34860 + }, + { + "epoch": 0.3026102203973924, + "grad_norm": 0.478515625, + "learning_rate": 0.0014923331057265375, + "loss": 0.064, + "step": 34861 + }, + { + "epoch": 0.3026189008775966, + "grad_norm": 0.1640625, + "learning_rate": 0.0014923063198167333, + "loss": 0.1387, + "step": 34862 + }, + { + "epoch": 0.3026275813578007, + "grad_norm": 0.62109375, + "learning_rate": 0.0014922795334779088, + "loss": 0.1055, + "step": 34863 + }, + { + "epoch": 0.3026362618380049, + "grad_norm": 0.2021484375, + "learning_rate": 0.001492252746710093, + "loss": 0.0908, + "step": 34864 + }, + { + "epoch": 0.30264494231820904, + "grad_norm": 0.265625, + "learning_rate": 0.0014922259595133156, + "loss": 0.082, + "step": 34865 + }, + { + "epoch": 0.30265362279841324, + "grad_norm": 0.2421875, + "learning_rate": 0.0014921991718876058, + "loss": 0.0869, + "step": 34866 + }, + { + "epoch": 0.3026623032786174, + "grad_norm": 0.095703125, + "learning_rate": 0.0014921723838329931, + "loss": 0.0913, + "step": 34867 + }, + { + "epoch": 0.30267098375882157, + "grad_norm": 0.10107421875, + "learning_rate": 0.0014921455953495065, + "loss": 0.083, + "step": 34868 + }, + { + "epoch": 0.3026796642390257, + "grad_norm": 0.12890625, + "learning_rate": 0.001492118806437175, + "loss": 0.1152, + "step": 34869 + }, + { + "epoch": 0.30268834471922984, + "grad_norm": 1.09375, + "learning_rate": 0.0014920920170960286, + "loss": 0.1719, + "step": 34870 + }, + { + "epoch": 0.30269702519943403, + "grad_norm": 0.1669921875, + "learning_rate": 0.0014920652273260964, + "loss": 0.0713, + "step": 34871 + }, + { + "epoch": 0.30270570567963817, + "grad_norm": 0.4609375, + "learning_rate": 0.0014920384371274072, + "loss": 0.1289, + "step": 34872 + }, + { + "epoch": 0.30271438615984236, + "grad_norm": 0.1650390625, + "learning_rate": 0.0014920116464999908, + "loss": 0.1025, + "step": 34873 + }, + { + "epoch": 0.3027230666400465, + "grad_norm": 0.15234375, + "learning_rate": 0.0014919848554438763, + "loss": 0.0918, + "step": 34874 + }, + { + "epoch": 0.3027317471202507, + "grad_norm": 0.302734375, + "learning_rate": 0.001491958063959093, + "loss": 0.1357, + "step": 34875 + }, + { + "epoch": 0.30274042760045483, + "grad_norm": 0.7578125, + "learning_rate": 0.0014919312720456707, + "loss": 0.1221, + "step": 34876 + }, + { + "epoch": 0.302749108080659, + "grad_norm": 0.72265625, + "learning_rate": 0.0014919044797036378, + "loss": 0.0898, + "step": 34877 + }, + { + "epoch": 0.30275778856086316, + "grad_norm": 0.6640625, + "learning_rate": 0.001491877686933024, + "loss": 0.0986, + "step": 34878 + }, + { + "epoch": 0.30276646904106735, + "grad_norm": 0.43359375, + "learning_rate": 0.001491850893733859, + "loss": 0.1221, + "step": 34879 + }, + { + "epoch": 0.3027751495212715, + "grad_norm": 0.25390625, + "learning_rate": 0.0014918241001061717, + "loss": 0.0742, + "step": 34880 + }, + { + "epoch": 0.3027838300014757, + "grad_norm": 1.296875, + "learning_rate": 0.0014917973060499911, + "loss": 0.1318, + "step": 34881 + }, + { + "epoch": 0.3027925104816798, + "grad_norm": 0.1494140625, + "learning_rate": 0.0014917705115653473, + "loss": 0.1113, + "step": 34882 + }, + { + "epoch": 0.302801190961884, + "grad_norm": 0.3671875, + "learning_rate": 0.0014917437166522688, + "loss": 0.0688, + "step": 34883 + }, + { + "epoch": 0.30280987144208815, + "grad_norm": 0.41015625, + "learning_rate": 0.0014917169213107852, + "loss": 0.082, + "step": 34884 + }, + { + "epoch": 0.30281855192229234, + "grad_norm": 0.287109375, + "learning_rate": 0.0014916901255409261, + 
"loss": 0.0938, + "step": 34885 + }, + { + "epoch": 0.3028272324024965, + "grad_norm": 0.146484375, + "learning_rate": 0.0014916633293427207, + "loss": 0.1064, + "step": 34886 + }, + { + "epoch": 0.3028359128827007, + "grad_norm": 0.3515625, + "learning_rate": 0.0014916365327161977, + "loss": 0.1123, + "step": 34887 + }, + { + "epoch": 0.3028445933629048, + "grad_norm": 0.2421875, + "learning_rate": 0.0014916097356613871, + "loss": 0.1094, + "step": 34888 + }, + { + "epoch": 0.302853273843109, + "grad_norm": 0.296875, + "learning_rate": 0.0014915829381783182, + "loss": 0.0781, + "step": 34889 + }, + { + "epoch": 0.30286195432331314, + "grad_norm": 0.5234375, + "learning_rate": 0.0014915561402670202, + "loss": 0.1113, + "step": 34890 + }, + { + "epoch": 0.30287063480351734, + "grad_norm": 0.267578125, + "learning_rate": 0.0014915293419275218, + "loss": 0.1064, + "step": 34891 + }, + { + "epoch": 0.3028793152837215, + "grad_norm": 0.515625, + "learning_rate": 0.001491502543159853, + "loss": 0.1465, + "step": 34892 + }, + { + "epoch": 0.30288799576392567, + "grad_norm": 0.79296875, + "learning_rate": 0.0014914757439640429, + "loss": 0.0801, + "step": 34893 + }, + { + "epoch": 0.3028966762441298, + "grad_norm": 1.015625, + "learning_rate": 0.0014914489443401207, + "loss": 0.0742, + "step": 34894 + }, + { + "epoch": 0.302905356724334, + "grad_norm": 0.1796875, + "learning_rate": 0.0014914221442881158, + "loss": 0.1318, + "step": 34895 + }, + { + "epoch": 0.30291403720453813, + "grad_norm": 0.37890625, + "learning_rate": 0.0014913953438080577, + "loss": 0.1387, + "step": 34896 + }, + { + "epoch": 0.3029227176847423, + "grad_norm": 0.69921875, + "learning_rate": 0.0014913685428999755, + "loss": 0.0947, + "step": 34897 + }, + { + "epoch": 0.30293139816494646, + "grad_norm": 0.380859375, + "learning_rate": 0.0014913417415638986, + "loss": 0.1279, + "step": 34898 + }, + { + "epoch": 0.30294007864515066, + "grad_norm": 0.474609375, + "learning_rate": 0.0014913149397998563, + "loss": 0.1289, + "step": 34899 + }, + { + "epoch": 0.3029487591253548, + "grad_norm": 0.306640625, + "learning_rate": 0.0014912881376078773, + "loss": 0.0854, + "step": 34900 + }, + { + "epoch": 0.302957439605559, + "grad_norm": 0.134765625, + "learning_rate": 0.0014912613349879922, + "loss": 0.123, + "step": 34901 + }, + { + "epoch": 0.3029661200857631, + "grad_norm": 0.1572265625, + "learning_rate": 0.0014912345319402292, + "loss": 0.0737, + "step": 34902 + }, + { + "epoch": 0.3029748005659673, + "grad_norm": 0.5078125, + "learning_rate": 0.0014912077284646179, + "loss": 0.1025, + "step": 34903 + }, + { + "epoch": 0.30298348104617145, + "grad_norm": 0.357421875, + "learning_rate": 0.001491180924561188, + "loss": 0.0796, + "step": 34904 + }, + { + "epoch": 0.30299216152637565, + "grad_norm": 0.32421875, + "learning_rate": 0.0014911541202299683, + "loss": 0.0869, + "step": 34905 + }, + { + "epoch": 0.3030008420065798, + "grad_norm": 0.208984375, + "learning_rate": 0.0014911273154709886, + "loss": 0.1094, + "step": 34906 + }, + { + "epoch": 0.303009522486784, + "grad_norm": 0.1904296875, + "learning_rate": 0.0014911005102842777, + "loss": 0.0713, + "step": 34907 + }, + { + "epoch": 0.3030182029669881, + "grad_norm": 0.228515625, + "learning_rate": 0.0014910737046698653, + "loss": 0.1436, + "step": 34908 + }, + { + "epoch": 0.3030268834471923, + "grad_norm": 0.240234375, + "learning_rate": 0.0014910468986277805, + "loss": 0.1123, + "step": 34909 + }, + { + "epoch": 0.30303556392739645, + "grad_norm": 0.1337890625, + 
"learning_rate": 0.0014910200921580527, + "loss": 0.0933, + "step": 34910 + }, + { + "epoch": 0.30304424440760064, + "grad_norm": 0.1318359375, + "learning_rate": 0.001490993285260711, + "loss": 0.1182, + "step": 34911 + }, + { + "epoch": 0.3030529248878048, + "grad_norm": 0.171875, + "learning_rate": 0.0014909664779357852, + "loss": 0.1055, + "step": 34912 + }, + { + "epoch": 0.30306160536800897, + "grad_norm": 0.3125, + "learning_rate": 0.0014909396701833044, + "loss": 0.1191, + "step": 34913 + }, + { + "epoch": 0.3030702858482131, + "grad_norm": 0.51171875, + "learning_rate": 0.0014909128620032974, + "loss": 0.1152, + "step": 34914 + }, + { + "epoch": 0.3030789663284173, + "grad_norm": 0.64453125, + "learning_rate": 0.0014908860533957944, + "loss": 0.1328, + "step": 34915 + }, + { + "epoch": 0.30308764680862144, + "grad_norm": 0.2333984375, + "learning_rate": 0.0014908592443608243, + "loss": 0.0957, + "step": 34916 + }, + { + "epoch": 0.30309632728882563, + "grad_norm": 0.06689453125, + "learning_rate": 0.0014908324348984162, + "loss": 0.0723, + "step": 34917 + }, + { + "epoch": 0.30310500776902977, + "grad_norm": 0.0791015625, + "learning_rate": 0.0014908056250085996, + "loss": 0.1426, + "step": 34918 + }, + { + "epoch": 0.30311368824923396, + "grad_norm": 0.2021484375, + "learning_rate": 0.001490778814691404, + "loss": 0.0967, + "step": 34919 + }, + { + "epoch": 0.3031223687294381, + "grad_norm": 0.357421875, + "learning_rate": 0.0014907520039468586, + "loss": 0.0918, + "step": 34920 + }, + { + "epoch": 0.3031310492096423, + "grad_norm": 0.1923828125, + "learning_rate": 0.0014907251927749926, + "loss": 0.1143, + "step": 34921 + }, + { + "epoch": 0.3031397296898464, + "grad_norm": 0.71484375, + "learning_rate": 0.001490698381175835, + "loss": 0.123, + "step": 34922 + }, + { + "epoch": 0.3031484101700506, + "grad_norm": 0.203125, + "learning_rate": 0.001490671569149416, + "loss": 0.0942, + "step": 34923 + }, + { + "epoch": 0.30315709065025476, + "grad_norm": 0.494140625, + "learning_rate": 0.0014906447566957644, + "loss": 0.0962, + "step": 34924 + }, + { + "epoch": 0.30316577113045895, + "grad_norm": 0.07666015625, + "learning_rate": 0.0014906179438149093, + "loss": 0.1533, + "step": 34925 + }, + { + "epoch": 0.3031744516106631, + "grad_norm": 0.236328125, + "learning_rate": 0.0014905911305068804, + "loss": 0.0625, + "step": 34926 + }, + { + "epoch": 0.3031831320908673, + "grad_norm": 0.2119140625, + "learning_rate": 0.001490564316771707, + "loss": 0.0884, + "step": 34927 + }, + { + "epoch": 0.3031918125710714, + "grad_norm": 0.2119140625, + "learning_rate": 0.0014905375026094184, + "loss": 0.0938, + "step": 34928 + }, + { + "epoch": 0.3032004930512756, + "grad_norm": 0.1640625, + "learning_rate": 0.001490510688020044, + "loss": 0.0864, + "step": 34929 + }, + { + "epoch": 0.30320917353147975, + "grad_norm": 0.103515625, + "learning_rate": 0.0014904838730036122, + "loss": 0.1011, + "step": 34930 + }, + { + "epoch": 0.30321785401168394, + "grad_norm": 0.248046875, + "learning_rate": 0.0014904570575601538, + "loss": 0.1187, + "step": 34931 + }, + { + "epoch": 0.3032265344918881, + "grad_norm": 0.2119140625, + "learning_rate": 0.0014904302416896974, + "loss": 0.1104, + "step": 34932 + }, + { + "epoch": 0.30323521497209227, + "grad_norm": 0.2392578125, + "learning_rate": 0.0014904034253922721, + "loss": 0.0859, + "step": 34933 + }, + { + "epoch": 0.3032438954522964, + "grad_norm": 0.224609375, + "learning_rate": 0.0014903766086679078, + "loss": 0.1143, + "step": 34934 + }, + { + "epoch": 
0.3032525759325006, + "grad_norm": 0.30078125, + "learning_rate": 0.001490349791516633, + "loss": 0.106, + "step": 34935 + }, + { + "epoch": 0.30326125641270474, + "grad_norm": 0.54296875, + "learning_rate": 0.0014903229739384781, + "loss": 0.1074, + "step": 34936 + }, + { + "epoch": 0.30326993689290893, + "grad_norm": 0.15234375, + "learning_rate": 0.0014902961559334715, + "loss": 0.0908, + "step": 34937 + }, + { + "epoch": 0.30327861737311307, + "grad_norm": 0.46875, + "learning_rate": 0.001490269337501643, + "loss": 0.0918, + "step": 34938 + }, + { + "epoch": 0.30328729785331726, + "grad_norm": 0.4921875, + "learning_rate": 0.0014902425186430219, + "loss": 0.1338, + "step": 34939 + }, + { + "epoch": 0.3032959783335214, + "grad_norm": 1.125, + "learning_rate": 0.0014902156993576374, + "loss": 0.1631, + "step": 34940 + }, + { + "epoch": 0.3033046588137256, + "grad_norm": 0.396484375, + "learning_rate": 0.001490188879645519, + "loss": 0.0825, + "step": 34941 + }, + { + "epoch": 0.30331333929392973, + "grad_norm": 0.25, + "learning_rate": 0.0014901620595066953, + "loss": 0.0698, + "step": 34942 + }, + { + "epoch": 0.3033220197741339, + "grad_norm": 0.271484375, + "learning_rate": 0.0014901352389411968, + "loss": 0.1367, + "step": 34943 + }, + { + "epoch": 0.30333070025433806, + "grad_norm": 0.4609375, + "learning_rate": 0.001490108417949052, + "loss": 0.0869, + "step": 34944 + }, + { + "epoch": 0.30333938073454225, + "grad_norm": 0.322265625, + "learning_rate": 0.0014900815965302907, + "loss": 0.1025, + "step": 34945 + }, + { + "epoch": 0.3033480612147464, + "grad_norm": 0.087890625, + "learning_rate": 0.001490054774684942, + "loss": 0.0859, + "step": 34946 + }, + { + "epoch": 0.3033567416949506, + "grad_norm": 0.09912109375, + "learning_rate": 0.0014900279524130353, + "loss": 0.1035, + "step": 34947 + }, + { + "epoch": 0.3033654221751547, + "grad_norm": 0.29296875, + "learning_rate": 0.0014900011297145997, + "loss": 0.1025, + "step": 34948 + }, + { + "epoch": 0.3033741026553589, + "grad_norm": 0.6875, + "learning_rate": 0.001489974306589665, + "loss": 0.0815, + "step": 34949 + }, + { + "epoch": 0.30338278313556305, + "grad_norm": 0.1279296875, + "learning_rate": 0.00148994748303826, + "loss": 0.1084, + "step": 34950 + }, + { + "epoch": 0.30339146361576724, + "grad_norm": 0.28515625, + "learning_rate": 0.0014899206590604144, + "loss": 0.0742, + "step": 34951 + }, + { + "epoch": 0.3034001440959714, + "grad_norm": 0.353515625, + "learning_rate": 0.001489893834656157, + "loss": 0.0938, + "step": 34952 + }, + { + "epoch": 0.3034088245761756, + "grad_norm": 0.11669921875, + "learning_rate": 0.0014898670098255182, + "loss": 0.0908, + "step": 34953 + }, + { + "epoch": 0.3034175050563797, + "grad_norm": 0.0927734375, + "learning_rate": 0.0014898401845685266, + "loss": 0.127, + "step": 34954 + }, + { + "epoch": 0.3034261855365839, + "grad_norm": 0.3203125, + "learning_rate": 0.0014898133588852115, + "loss": 0.1143, + "step": 34955 + }, + { + "epoch": 0.30343486601678804, + "grad_norm": 0.59765625, + "learning_rate": 0.0014897865327756022, + "loss": 0.1641, + "step": 34956 + }, + { + "epoch": 0.30344354649699223, + "grad_norm": 0.08251953125, + "learning_rate": 0.0014897597062397284, + "loss": 0.0825, + "step": 34957 + }, + { + "epoch": 0.30345222697719637, + "grad_norm": 0.19140625, + "learning_rate": 0.0014897328792776192, + "loss": 0.0952, + "step": 34958 + }, + { + "epoch": 0.30346090745740056, + "grad_norm": 0.3515625, + "learning_rate": 0.001489706051889304, + "loss": 0.25, + "step": 34959 + }, 
+ { + "epoch": 0.3034695879376047, + "grad_norm": 0.27734375, + "learning_rate": 0.0014896792240748121, + "loss": 0.1387, + "step": 34960 + }, + { + "epoch": 0.3034782684178089, + "grad_norm": 0.2080078125, + "learning_rate": 0.001489652395834173, + "loss": 0.0972, + "step": 34961 + }, + { + "epoch": 0.30348694889801303, + "grad_norm": 0.46875, + "learning_rate": 0.0014896255671674157, + "loss": 0.1113, + "step": 34962 + }, + { + "epoch": 0.3034956293782172, + "grad_norm": 0.474609375, + "learning_rate": 0.0014895987380745697, + "loss": 0.0967, + "step": 34963 + }, + { + "epoch": 0.30350430985842136, + "grad_norm": 0.1279296875, + "learning_rate": 0.0014895719085556646, + "loss": 0.0845, + "step": 34964 + }, + { + "epoch": 0.30351299033862555, + "grad_norm": 0.26171875, + "learning_rate": 0.0014895450786107291, + "loss": 0.1016, + "step": 34965 + }, + { + "epoch": 0.3035216708188297, + "grad_norm": 0.1298828125, + "learning_rate": 0.0014895182482397933, + "loss": 0.1289, + "step": 34966 + }, + { + "epoch": 0.3035303512990339, + "grad_norm": 0.0966796875, + "learning_rate": 0.0014894914174428862, + "loss": 0.0728, + "step": 34967 + }, + { + "epoch": 0.303539031779238, + "grad_norm": 0.375, + "learning_rate": 0.0014894645862200372, + "loss": 0.0947, + "step": 34968 + }, + { + "epoch": 0.3035477122594422, + "grad_norm": 0.34375, + "learning_rate": 0.0014894377545712752, + "loss": 0.0806, + "step": 34969 + }, + { + "epoch": 0.30355639273964635, + "grad_norm": 0.138671875, + "learning_rate": 0.0014894109224966303, + "loss": 0.1074, + "step": 34970 + }, + { + "epoch": 0.30356507321985055, + "grad_norm": 0.48828125, + "learning_rate": 0.001489384089996131, + "loss": 0.1045, + "step": 34971 + }, + { + "epoch": 0.3035737537000547, + "grad_norm": 0.228515625, + "learning_rate": 0.0014893572570698077, + "loss": 0.0913, + "step": 34972 + }, + { + "epoch": 0.3035824341802589, + "grad_norm": 1.28125, + "learning_rate": 0.0014893304237176888, + "loss": 0.207, + "step": 34973 + }, + { + "epoch": 0.303591114660463, + "grad_norm": 0.07373046875, + "learning_rate": 0.001489303589939804, + "loss": 0.0835, + "step": 34974 + }, + { + "epoch": 0.3035997951406672, + "grad_norm": 0.11962890625, + "learning_rate": 0.0014892767557361827, + "loss": 0.0967, + "step": 34975 + }, + { + "epoch": 0.30360847562087134, + "grad_norm": 0.78515625, + "learning_rate": 0.0014892499211068544, + "loss": 0.1523, + "step": 34976 + }, + { + "epoch": 0.30361715610107554, + "grad_norm": 0.0771484375, + "learning_rate": 0.0014892230860518482, + "loss": 0.0947, + "step": 34977 + }, + { + "epoch": 0.3036258365812797, + "grad_norm": 0.1689453125, + "learning_rate": 0.001489196250571193, + "loss": 0.0977, + "step": 34978 + }, + { + "epoch": 0.30363451706148387, + "grad_norm": 0.2021484375, + "learning_rate": 0.0014891694146649192, + "loss": 0.0879, + "step": 34979 + }, + { + "epoch": 0.303643197541688, + "grad_norm": 0.30859375, + "learning_rate": 0.0014891425783330552, + "loss": 0.0957, + "step": 34980 + }, + { + "epoch": 0.3036518780218922, + "grad_norm": 0.609375, + "learning_rate": 0.0014891157415756308, + "loss": 0.0732, + "step": 34981 + }, + { + "epoch": 0.30366055850209633, + "grad_norm": 0.1669921875, + "learning_rate": 0.0014890889043926755, + "loss": 0.0781, + "step": 34982 + }, + { + "epoch": 0.3036692389823005, + "grad_norm": 0.2041015625, + "learning_rate": 0.0014890620667842181, + "loss": 0.1035, + "step": 34983 + }, + { + "epoch": 0.30367791946250466, + "grad_norm": 0.10546875, + "learning_rate": 0.0014890352287502883, + 
"loss": 0.1387, + "step": 34984 + }, + { + "epoch": 0.30368659994270886, + "grad_norm": 0.3125, + "learning_rate": 0.0014890083902909157, + "loss": 0.1309, + "step": 34985 + }, + { + "epoch": 0.303695280422913, + "grad_norm": 0.23828125, + "learning_rate": 0.0014889815514061294, + "loss": 0.1221, + "step": 34986 + }, + { + "epoch": 0.3037039609031172, + "grad_norm": 0.671875, + "learning_rate": 0.0014889547120959584, + "loss": 0.0693, + "step": 34987 + }, + { + "epoch": 0.3037126413833213, + "grad_norm": 0.22265625, + "learning_rate": 0.0014889278723604325, + "loss": 0.0972, + "step": 34988 + }, + { + "epoch": 0.3037213218635255, + "grad_norm": 0.25390625, + "learning_rate": 0.0014889010321995808, + "loss": 0.1396, + "step": 34989 + }, + { + "epoch": 0.30373000234372965, + "grad_norm": 0.419921875, + "learning_rate": 0.0014888741916134329, + "loss": 0.1406, + "step": 34990 + }, + { + "epoch": 0.30373868282393385, + "grad_norm": 0.1650390625, + "learning_rate": 0.001488847350602018, + "loss": 0.1416, + "step": 34991 + }, + { + "epoch": 0.303747363304138, + "grad_norm": 0.3515625, + "learning_rate": 0.0014888205091653655, + "loss": 0.0747, + "step": 34992 + }, + { + "epoch": 0.3037560437843421, + "grad_norm": 0.1474609375, + "learning_rate": 0.0014887936673035047, + "loss": 0.105, + "step": 34993 + }, + { + "epoch": 0.3037647242645463, + "grad_norm": 0.578125, + "learning_rate": 0.0014887668250164652, + "loss": 0.2871, + "step": 34994 + }, + { + "epoch": 0.30377340474475045, + "grad_norm": 0.1416015625, + "learning_rate": 0.0014887399823042759, + "loss": 0.0991, + "step": 34995 + }, + { + "epoch": 0.30378208522495465, + "grad_norm": 0.73046875, + "learning_rate": 0.0014887131391669666, + "loss": 0.084, + "step": 34996 + }, + { + "epoch": 0.3037907657051588, + "grad_norm": 0.2060546875, + "learning_rate": 0.0014886862956045661, + "loss": 0.0884, + "step": 34997 + }, + { + "epoch": 0.303799446185363, + "grad_norm": 0.22265625, + "learning_rate": 0.0014886594516171046, + "loss": 0.1143, + "step": 34998 + }, + { + "epoch": 0.3038081266655671, + "grad_norm": 0.412109375, + "learning_rate": 0.0014886326072046103, + "loss": 0.1113, + "step": 34999 + }, + { + "epoch": 0.3038168071457713, + "grad_norm": 0.1845703125, + "learning_rate": 0.0014886057623671137, + "loss": 0.0703, + "step": 35000 + }, + { + "epoch": 0.30382548762597544, + "grad_norm": 0.455078125, + "learning_rate": 0.0014885789171046435, + "loss": 0.1719, + "step": 35001 + }, + { + "epoch": 0.30383416810617964, + "grad_norm": 0.28125, + "learning_rate": 0.0014885520714172293, + "loss": 0.1562, + "step": 35002 + }, + { + "epoch": 0.3038428485863838, + "grad_norm": 0.1611328125, + "learning_rate": 0.0014885252253049007, + "loss": 0.126, + "step": 35003 + }, + { + "epoch": 0.30385152906658797, + "grad_norm": 0.19921875, + "learning_rate": 0.0014884983787676865, + "loss": 0.1299, + "step": 35004 + }, + { + "epoch": 0.3038602095467921, + "grad_norm": 0.10009765625, + "learning_rate": 0.0014884715318056161, + "loss": 0.124, + "step": 35005 + }, + { + "epoch": 0.3038688900269963, + "grad_norm": 0.427734375, + "learning_rate": 0.0014884446844187194, + "loss": 0.0806, + "step": 35006 + }, + { + "epoch": 0.30387757050720043, + "grad_norm": 0.20703125, + "learning_rate": 0.0014884178366070253, + "loss": 0.1035, + "step": 35007 + }, + { + "epoch": 0.3038862509874046, + "grad_norm": 0.1328125, + "learning_rate": 0.0014883909883705632, + "loss": 0.1079, + "step": 35008 + }, + { + "epoch": 0.30389493146760876, + "grad_norm": 0.388671875, + 
"learning_rate": 0.0014883641397093624, + "loss": 0.0972, + "step": 35009 + }, + { + "epoch": 0.30390361194781296, + "grad_norm": 0.21875, + "learning_rate": 0.0014883372906234528, + "loss": 0.0703, + "step": 35010 + }, + { + "epoch": 0.3039122924280171, + "grad_norm": 0.1630859375, + "learning_rate": 0.001488310441112863, + "loss": 0.1191, + "step": 35011 + }, + { + "epoch": 0.3039209729082213, + "grad_norm": 0.1923828125, + "learning_rate": 0.001488283591177623, + "loss": 0.1113, + "step": 35012 + }, + { + "epoch": 0.3039296533884254, + "grad_norm": 0.169921875, + "learning_rate": 0.0014882567408177618, + "loss": 0.0645, + "step": 35013 + }, + { + "epoch": 0.3039383338686296, + "grad_norm": 0.6015625, + "learning_rate": 0.001488229890033309, + "loss": 0.0918, + "step": 35014 + }, + { + "epoch": 0.30394701434883376, + "grad_norm": 0.3203125, + "learning_rate": 0.0014882030388242938, + "loss": 0.0869, + "step": 35015 + }, + { + "epoch": 0.30395569482903795, + "grad_norm": 0.2578125, + "learning_rate": 0.0014881761871907455, + "loss": 0.0703, + "step": 35016 + }, + { + "epoch": 0.3039643753092421, + "grad_norm": 0.0703125, + "learning_rate": 0.0014881493351326935, + "loss": 0.0737, + "step": 35017 + }, + { + "epoch": 0.3039730557894463, + "grad_norm": 0.18359375, + "learning_rate": 0.0014881224826501674, + "loss": 0.1426, + "step": 35018 + }, + { + "epoch": 0.3039817362696504, + "grad_norm": 0.20703125, + "learning_rate": 0.0014880956297431962, + "loss": 0.1221, + "step": 35019 + }, + { + "epoch": 0.3039904167498546, + "grad_norm": 0.6328125, + "learning_rate": 0.0014880687764118096, + "loss": 0.0898, + "step": 35020 + }, + { + "epoch": 0.30399909723005875, + "grad_norm": 0.515625, + "learning_rate": 0.0014880419226560367, + "loss": 0.1289, + "step": 35021 + }, + { + "epoch": 0.30400777771026294, + "grad_norm": 0.6875, + "learning_rate": 0.0014880150684759069, + "loss": 0.1455, + "step": 35022 + }, + { + "epoch": 0.3040164581904671, + "grad_norm": 0.310546875, + "learning_rate": 0.0014879882138714498, + "loss": 0.106, + "step": 35023 + }, + { + "epoch": 0.30402513867067127, + "grad_norm": 0.546875, + "learning_rate": 0.0014879613588426946, + "loss": 0.0796, + "step": 35024 + }, + { + "epoch": 0.3040338191508754, + "grad_norm": 0.3671875, + "learning_rate": 0.0014879345033896707, + "loss": 0.1118, + "step": 35025 + }, + { + "epoch": 0.3040424996310796, + "grad_norm": 0.51953125, + "learning_rate": 0.0014879076475124073, + "loss": 0.1206, + "step": 35026 + }, + { + "epoch": 0.30405118011128374, + "grad_norm": 0.0966796875, + "learning_rate": 0.001487880791210934, + "loss": 0.0933, + "step": 35027 + }, + { + "epoch": 0.30405986059148793, + "grad_norm": 0.2431640625, + "learning_rate": 0.00148785393448528, + "loss": 0.0957, + "step": 35028 + }, + { + "epoch": 0.30406854107169207, + "grad_norm": 0.9375, + "learning_rate": 0.0014878270773354752, + "loss": 0.0806, + "step": 35029 + }, + { + "epoch": 0.30407722155189626, + "grad_norm": 0.14453125, + "learning_rate": 0.001487800219761548, + "loss": 0.0654, + "step": 35030 + }, + { + "epoch": 0.3040859020321004, + "grad_norm": 0.12158203125, + "learning_rate": 0.0014877733617635288, + "loss": 0.1084, + "step": 35031 + }, + { + "epoch": 0.3040945825123046, + "grad_norm": 0.419921875, + "learning_rate": 0.001487746503341446, + "loss": 0.0713, + "step": 35032 + }, + { + "epoch": 0.3041032629925087, + "grad_norm": 0.08447265625, + "learning_rate": 0.0014877196444953301, + "loss": 0.0815, + "step": 35033 + }, + { + "epoch": 0.3041119434727129, + 
"grad_norm": 0.447265625, + "learning_rate": 0.0014876927852252092, + "loss": 0.1221, + "step": 35034 + }, + { + "epoch": 0.30412062395291706, + "grad_norm": 0.0830078125, + "learning_rate": 0.0014876659255311135, + "loss": 0.0781, + "step": 35035 + }, + { + "epoch": 0.30412930443312125, + "grad_norm": 0.1181640625, + "learning_rate": 0.0014876390654130721, + "loss": 0.1201, + "step": 35036 + }, + { + "epoch": 0.3041379849133254, + "grad_norm": 0.33203125, + "learning_rate": 0.0014876122048711146, + "loss": 0.1621, + "step": 35037 + }, + { + "epoch": 0.3041466653935296, + "grad_norm": 0.359375, + "learning_rate": 0.00148758534390527, + "loss": 0.1426, + "step": 35038 + }, + { + "epoch": 0.3041553458737337, + "grad_norm": 0.171875, + "learning_rate": 0.001487558482515568, + "loss": 0.1201, + "step": 35039 + }, + { + "epoch": 0.3041640263539379, + "grad_norm": 0.734375, + "learning_rate": 0.0014875316207020378, + "loss": 0.1016, + "step": 35040 + }, + { + "epoch": 0.30417270683414205, + "grad_norm": 0.357421875, + "learning_rate": 0.001487504758464709, + "loss": 0.1074, + "step": 35041 + }, + { + "epoch": 0.30418138731434624, + "grad_norm": 0.53125, + "learning_rate": 0.001487477895803611, + "loss": 0.105, + "step": 35042 + }, + { + "epoch": 0.3041900677945504, + "grad_norm": 0.0947265625, + "learning_rate": 0.0014874510327187724, + "loss": 0.1074, + "step": 35043 + }, + { + "epoch": 0.30419874827475457, + "grad_norm": 0.27734375, + "learning_rate": 0.0014874241692102235, + "loss": 0.1045, + "step": 35044 + }, + { + "epoch": 0.3042074287549587, + "grad_norm": 0.0869140625, + "learning_rate": 0.0014873973052779938, + "loss": 0.0762, + "step": 35045 + }, + { + "epoch": 0.3042161092351629, + "grad_norm": 0.08349609375, + "learning_rate": 0.0014873704409221116, + "loss": 0.083, + "step": 35046 + }, + { + "epoch": 0.30422478971536704, + "grad_norm": 0.1455078125, + "learning_rate": 0.001487343576142607, + "loss": 0.0776, + "step": 35047 + }, + { + "epoch": 0.30423347019557123, + "grad_norm": 0.18359375, + "learning_rate": 0.0014873167109395093, + "loss": 0.1196, + "step": 35048 + }, + { + "epoch": 0.30424215067577537, + "grad_norm": 0.150390625, + "learning_rate": 0.001487289845312848, + "loss": 0.0869, + "step": 35049 + }, + { + "epoch": 0.30425083115597956, + "grad_norm": 0.27734375, + "learning_rate": 0.0014872629792626521, + "loss": 0.0684, + "step": 35050 + }, + { + "epoch": 0.3042595116361837, + "grad_norm": 0.416015625, + "learning_rate": 0.0014872361127889516, + "loss": 0.0903, + "step": 35051 + }, + { + "epoch": 0.3042681921163879, + "grad_norm": 0.26171875, + "learning_rate": 0.0014872092458917752, + "loss": 0.1367, + "step": 35052 + }, + { + "epoch": 0.30427687259659203, + "grad_norm": 0.09228515625, + "learning_rate": 0.0014871823785711526, + "loss": 0.1143, + "step": 35053 + }, + { + "epoch": 0.3042855530767962, + "grad_norm": 0.5546875, + "learning_rate": 0.0014871555108271135, + "loss": 0.1201, + "step": 35054 + }, + { + "epoch": 0.30429423355700036, + "grad_norm": 0.9765625, + "learning_rate": 0.0014871286426596869, + "loss": 0.0957, + "step": 35055 + }, + { + "epoch": 0.30430291403720455, + "grad_norm": 0.28515625, + "learning_rate": 0.0014871017740689019, + "loss": 0.1377, + "step": 35056 + }, + { + "epoch": 0.3043115945174087, + "grad_norm": 0.294921875, + "learning_rate": 0.0014870749050547882, + "loss": 0.1367, + "step": 35057 + }, + { + "epoch": 0.3043202749976129, + "grad_norm": 0.92578125, + "learning_rate": 0.0014870480356173755, + "loss": 0.1094, + "step": 35058 + }, + { 
+ "epoch": 0.304328955477817, + "grad_norm": 0.3125, + "learning_rate": 0.0014870211657566927, + "loss": 0.0938, + "step": 35059 + }, + { + "epoch": 0.3043376359580212, + "grad_norm": 0.0966796875, + "learning_rate": 0.0014869942954727692, + "loss": 0.1045, + "step": 35060 + }, + { + "epoch": 0.30434631643822535, + "grad_norm": 0.427734375, + "learning_rate": 0.001486967424765635, + "loss": 0.1113, + "step": 35061 + }, + { + "epoch": 0.30435499691842954, + "grad_norm": 0.2216796875, + "learning_rate": 0.001486940553635319, + "loss": 0.1318, + "step": 35062 + }, + { + "epoch": 0.3043636773986337, + "grad_norm": 0.6484375, + "learning_rate": 0.0014869136820818503, + "loss": 0.1221, + "step": 35063 + }, + { + "epoch": 0.3043723578788379, + "grad_norm": 0.07763671875, + "learning_rate": 0.0014868868101052587, + "loss": 0.1104, + "step": 35064 + }, + { + "epoch": 0.304381038359042, + "grad_norm": 0.33203125, + "learning_rate": 0.0014868599377055737, + "loss": 0.0723, + "step": 35065 + }, + { + "epoch": 0.3043897188392462, + "grad_norm": 0.2255859375, + "learning_rate": 0.001486833064882824, + "loss": 0.166, + "step": 35066 + }, + { + "epoch": 0.30439839931945034, + "grad_norm": 0.177734375, + "learning_rate": 0.0014868061916370398, + "loss": 0.1523, + "step": 35067 + }, + { + "epoch": 0.30440707979965453, + "grad_norm": 0.0849609375, + "learning_rate": 0.0014867793179682504, + "loss": 0.0811, + "step": 35068 + }, + { + "epoch": 0.30441576027985867, + "grad_norm": 0.11181640625, + "learning_rate": 0.0014867524438764847, + "loss": 0.1182, + "step": 35069 + }, + { + "epoch": 0.30442444076006286, + "grad_norm": 0.259765625, + "learning_rate": 0.0014867255693617723, + "loss": 0.0732, + "step": 35070 + }, + { + "epoch": 0.304433121240267, + "grad_norm": 0.09326171875, + "learning_rate": 0.0014866986944241428, + "loss": 0.085, + "step": 35071 + }, + { + "epoch": 0.3044418017204712, + "grad_norm": 0.53125, + "learning_rate": 0.0014866718190636253, + "loss": 0.0908, + "step": 35072 + }, + { + "epoch": 0.30445048220067533, + "grad_norm": 0.435546875, + "learning_rate": 0.0014866449432802491, + "loss": 0.1113, + "step": 35073 + }, + { + "epoch": 0.3044591626808795, + "grad_norm": 0.263671875, + "learning_rate": 0.0014866180670740441, + "loss": 0.083, + "step": 35074 + }, + { + "epoch": 0.30446784316108366, + "grad_norm": 0.578125, + "learning_rate": 0.0014865911904450393, + "loss": 0.0942, + "step": 35075 + }, + { + "epoch": 0.30447652364128786, + "grad_norm": 0.1962890625, + "learning_rate": 0.0014865643133932644, + "loss": 0.127, + "step": 35076 + }, + { + "epoch": 0.304485204121492, + "grad_norm": 0.82421875, + "learning_rate": 0.001486537435918748, + "loss": 0.1309, + "step": 35077 + }, + { + "epoch": 0.3044938846016962, + "grad_norm": 0.38671875, + "learning_rate": 0.0014865105580215206, + "loss": 0.0757, + "step": 35078 + }, + { + "epoch": 0.3045025650819003, + "grad_norm": 0.384765625, + "learning_rate": 0.0014864836797016108, + "loss": 0.1172, + "step": 35079 + }, + { + "epoch": 0.3045112455621045, + "grad_norm": 0.10498046875, + "learning_rate": 0.0014864568009590484, + "loss": 0.0874, + "step": 35080 + }, + { + "epoch": 0.30451992604230865, + "grad_norm": 0.146484375, + "learning_rate": 0.0014864299217938626, + "loss": 0.1309, + "step": 35081 + }, + { + "epoch": 0.30452860652251285, + "grad_norm": 0.142578125, + "learning_rate": 0.0014864030422060829, + "loss": 0.1113, + "step": 35082 + }, + { + "epoch": 0.304537287002717, + "grad_norm": 0.29296875, + "learning_rate": 0.0014863761621957386, + 
"loss": 0.1299, + "step": 35083 + }, + { + "epoch": 0.3045459674829212, + "grad_norm": 0.34375, + "learning_rate": 0.0014863492817628592, + "loss": 0.1206, + "step": 35084 + }, + { + "epoch": 0.3045546479631253, + "grad_norm": 0.65625, + "learning_rate": 0.0014863224009074739, + "loss": 0.0986, + "step": 35085 + }, + { + "epoch": 0.3045633284433295, + "grad_norm": 0.9296875, + "learning_rate": 0.001486295519629612, + "loss": 0.1338, + "step": 35086 + }, + { + "epoch": 0.30457200892353364, + "grad_norm": 0.1962890625, + "learning_rate": 0.0014862686379293034, + "loss": 0.0801, + "step": 35087 + }, + { + "epoch": 0.30458068940373784, + "grad_norm": 0.28125, + "learning_rate": 0.0014862417558065772, + "loss": 0.1523, + "step": 35088 + }, + { + "epoch": 0.304589369883942, + "grad_norm": 0.1357421875, + "learning_rate": 0.001486214873261463, + "loss": 0.1147, + "step": 35089 + }, + { + "epoch": 0.30459805036414617, + "grad_norm": 0.0869140625, + "learning_rate": 0.0014861879902939897, + "loss": 0.1357, + "step": 35090 + }, + { + "epoch": 0.3046067308443503, + "grad_norm": 0.45703125, + "learning_rate": 0.001486161106904187, + "loss": 0.1113, + "step": 35091 + }, + { + "epoch": 0.3046154113245545, + "grad_norm": 0.54296875, + "learning_rate": 0.0014861342230920846, + "loss": 0.1758, + "step": 35092 + }, + { + "epoch": 0.30462409180475863, + "grad_norm": 0.181640625, + "learning_rate": 0.0014861073388577116, + "loss": 0.0967, + "step": 35093 + }, + { + "epoch": 0.3046327722849628, + "grad_norm": 0.5859375, + "learning_rate": 0.0014860804542010972, + "loss": 0.103, + "step": 35094 + }, + { + "epoch": 0.30464145276516696, + "grad_norm": 0.54296875, + "learning_rate": 0.0014860535691222712, + "loss": 0.1035, + "step": 35095 + }, + { + "epoch": 0.30465013324537116, + "grad_norm": 0.5546875, + "learning_rate": 0.0014860266836212626, + "loss": 0.1221, + "step": 35096 + }, + { + "epoch": 0.3046588137255753, + "grad_norm": 0.275390625, + "learning_rate": 0.0014859997976981012, + "loss": 0.1104, + "step": 35097 + }, + { + "epoch": 0.3046674942057795, + "grad_norm": 0.12255859375, + "learning_rate": 0.0014859729113528162, + "loss": 0.1338, + "step": 35098 + }, + { + "epoch": 0.3046761746859836, + "grad_norm": 0.400390625, + "learning_rate": 0.0014859460245854371, + "loss": 0.1025, + "step": 35099 + }, + { + "epoch": 0.3046848551661878, + "grad_norm": 0.953125, + "learning_rate": 0.001485919137395993, + "loss": 0.1113, + "step": 35100 + }, + { + "epoch": 0.30469353564639196, + "grad_norm": 0.283203125, + "learning_rate": 0.0014858922497845138, + "loss": 0.0947, + "step": 35101 + }, + { + "epoch": 0.30470221612659615, + "grad_norm": 0.609375, + "learning_rate": 0.0014858653617510285, + "loss": 0.1348, + "step": 35102 + }, + { + "epoch": 0.3047108966068003, + "grad_norm": 0.1455078125, + "learning_rate": 0.0014858384732955665, + "loss": 0.1055, + "step": 35103 + }, + { + "epoch": 0.3047195770870045, + "grad_norm": 0.212890625, + "learning_rate": 0.0014858115844181574, + "loss": 0.1367, + "step": 35104 + }, + { + "epoch": 0.3047282575672086, + "grad_norm": 0.53125, + "learning_rate": 0.0014857846951188307, + "loss": 0.0977, + "step": 35105 + }, + { + "epoch": 0.3047369380474128, + "grad_norm": 0.255859375, + "learning_rate": 0.0014857578053976154, + "loss": 0.1543, + "step": 35106 + }, + { + "epoch": 0.30474561852761695, + "grad_norm": 0.275390625, + "learning_rate": 0.0014857309152545414, + "loss": 0.1025, + "step": 35107 + }, + { + "epoch": 0.30475429900782114, + "grad_norm": 0.51171875, + "learning_rate": 
0.001485704024689638, + "loss": 0.1748, + "step": 35108 + }, + { + "epoch": 0.3047629794880253, + "grad_norm": 0.5859375, + "learning_rate": 0.0014856771337029343, + "loss": 0.2461, + "step": 35109 + }, + { + "epoch": 0.30477165996822947, + "grad_norm": 0.44921875, + "learning_rate": 0.0014856502422944597, + "loss": 0.0815, + "step": 35110 + }, + { + "epoch": 0.3047803404484336, + "grad_norm": 0.294921875, + "learning_rate": 0.001485623350464244, + "loss": 0.1133, + "step": 35111 + }, + { + "epoch": 0.3047890209286378, + "grad_norm": 1.3515625, + "learning_rate": 0.0014855964582123162, + "loss": 0.1143, + "step": 35112 + }, + { + "epoch": 0.30479770140884194, + "grad_norm": 0.23828125, + "learning_rate": 0.0014855695655387063, + "loss": 0.1357, + "step": 35113 + }, + { + "epoch": 0.30480638188904613, + "grad_norm": 0.11865234375, + "learning_rate": 0.0014855426724434426, + "loss": 0.1113, + "step": 35114 + }, + { + "epoch": 0.30481506236925027, + "grad_norm": 0.30859375, + "learning_rate": 0.001485515778926556, + "loss": 0.1279, + "step": 35115 + }, + { + "epoch": 0.3048237428494544, + "grad_norm": 0.1328125, + "learning_rate": 0.0014854888849880749, + "loss": 0.1055, + "step": 35116 + }, + { + "epoch": 0.3048324233296586, + "grad_norm": 0.166015625, + "learning_rate": 0.0014854619906280286, + "loss": 0.1309, + "step": 35117 + }, + { + "epoch": 0.30484110380986273, + "grad_norm": 0.42578125, + "learning_rate": 0.0014854350958464473, + "loss": 0.1191, + "step": 35118 + }, + { + "epoch": 0.3048497842900669, + "grad_norm": 0.28125, + "learning_rate": 0.0014854082006433597, + "loss": 0.1084, + "step": 35119 + }, + { + "epoch": 0.30485846477027106, + "grad_norm": 0.267578125, + "learning_rate": 0.0014853813050187958, + "loss": 0.0791, + "step": 35120 + }, + { + "epoch": 0.30486714525047526, + "grad_norm": 0.2421875, + "learning_rate": 0.0014853544089727844, + "loss": 0.1396, + "step": 35121 + }, + { + "epoch": 0.3048758257306794, + "grad_norm": 0.1064453125, + "learning_rate": 0.0014853275125053553, + "loss": 0.1074, + "step": 35122 + }, + { + "epoch": 0.3048845062108836, + "grad_norm": 0.6015625, + "learning_rate": 0.0014853006156165382, + "loss": 0.1279, + "step": 35123 + }, + { + "epoch": 0.3048931866910877, + "grad_norm": 0.2431640625, + "learning_rate": 0.001485273718306361, + "loss": 0.1196, + "step": 35124 + }, + { + "epoch": 0.3049018671712919, + "grad_norm": 0.1767578125, + "learning_rate": 0.001485246820574855, + "loss": 0.1133, + "step": 35125 + }, + { + "epoch": 0.30491054765149606, + "grad_norm": 0.2470703125, + "learning_rate": 0.0014852199224220492, + "loss": 0.127, + "step": 35126 + }, + { + "epoch": 0.30491922813170025, + "grad_norm": 0.5, + "learning_rate": 0.0014851930238479723, + "loss": 0.104, + "step": 35127 + }, + { + "epoch": 0.3049279086119044, + "grad_norm": 0.1787109375, + "learning_rate": 0.0014851661248526545, + "loss": 0.1123, + "step": 35128 + }, + { + "epoch": 0.3049365890921086, + "grad_norm": 0.6171875, + "learning_rate": 0.0014851392254361243, + "loss": 0.0947, + "step": 35129 + }, + { + "epoch": 0.3049452695723127, + "grad_norm": 0.0947265625, + "learning_rate": 0.0014851123255984115, + "loss": 0.1299, + "step": 35130 + }, + { + "epoch": 0.3049539500525169, + "grad_norm": 0.251953125, + "learning_rate": 0.001485085425339546, + "loss": 0.0801, + "step": 35131 + }, + { + "epoch": 0.30496263053272105, + "grad_norm": 0.64453125, + "learning_rate": 0.001485058524659557, + "loss": 0.0918, + "step": 35132 + }, + { + "epoch": 0.30497131101292524, + "grad_norm": 
0.36328125, + "learning_rate": 0.0014850316235584732, + "loss": 0.1084, + "step": 35133 + }, + { + "epoch": 0.3049799914931294, + "grad_norm": 0.30078125, + "learning_rate": 0.0014850047220363249, + "loss": 0.0684, + "step": 35134 + }, + { + "epoch": 0.30498867197333357, + "grad_norm": 0.322265625, + "learning_rate": 0.0014849778200931414, + "loss": 0.0693, + "step": 35135 + }, + { + "epoch": 0.3049973524535377, + "grad_norm": 0.111328125, + "learning_rate": 0.001484950917728952, + "loss": 0.1245, + "step": 35136 + }, + { + "epoch": 0.3050060329337419, + "grad_norm": 0.53125, + "learning_rate": 0.0014849240149437859, + "loss": 0.0791, + "step": 35137 + }, + { + "epoch": 0.30501471341394604, + "grad_norm": 0.1435546875, + "learning_rate": 0.0014848971117376727, + "loss": 0.1084, + "step": 35138 + }, + { + "epoch": 0.30502339389415023, + "grad_norm": 0.7421875, + "learning_rate": 0.0014848702081106417, + "loss": 0.125, + "step": 35139 + }, + { + "epoch": 0.30503207437435437, + "grad_norm": 0.1982421875, + "learning_rate": 0.0014848433040627227, + "loss": 0.0977, + "step": 35140 + }, + { + "epoch": 0.30504075485455856, + "grad_norm": 0.33203125, + "learning_rate": 0.0014848163995939447, + "loss": 0.0967, + "step": 35141 + }, + { + "epoch": 0.3050494353347627, + "grad_norm": 0.306640625, + "learning_rate": 0.0014847894947043372, + "loss": 0.1084, + "step": 35142 + }, + { + "epoch": 0.3050581158149669, + "grad_norm": 0.31640625, + "learning_rate": 0.0014847625893939296, + "loss": 0.0571, + "step": 35143 + }, + { + "epoch": 0.305066796295171, + "grad_norm": 0.32421875, + "learning_rate": 0.0014847356836627512, + "loss": 0.1094, + "step": 35144 + }, + { + "epoch": 0.3050754767753752, + "grad_norm": 0.08642578125, + "learning_rate": 0.0014847087775108324, + "loss": 0.0771, + "step": 35145 + }, + { + "epoch": 0.30508415725557936, + "grad_norm": 0.333984375, + "learning_rate": 0.001484681870938201, + "loss": 0.1152, + "step": 35146 + }, + { + "epoch": 0.30509283773578355, + "grad_norm": 0.51171875, + "learning_rate": 0.0014846549639448879, + "loss": 0.124, + "step": 35147 + }, + { + "epoch": 0.3051015182159877, + "grad_norm": 0.30078125, + "learning_rate": 0.0014846280565309219, + "loss": 0.0708, + "step": 35148 + }, + { + "epoch": 0.3051101986961919, + "grad_norm": 0.3203125, + "learning_rate": 0.001484601148696332, + "loss": 0.0806, + "step": 35149 + }, + { + "epoch": 0.305118879176396, + "grad_norm": 0.1962890625, + "learning_rate": 0.0014845742404411483, + "loss": 0.1206, + "step": 35150 + }, + { + "epoch": 0.3051275596566002, + "grad_norm": 0.5390625, + "learning_rate": 0.0014845473317654002, + "loss": 0.0801, + "step": 35151 + }, + { + "epoch": 0.30513624013680435, + "grad_norm": 0.10400390625, + "learning_rate": 0.001484520422669117, + "loss": 0.0767, + "step": 35152 + }, + { + "epoch": 0.30514492061700854, + "grad_norm": 0.1689453125, + "learning_rate": 0.0014844935131523274, + "loss": 0.1104, + "step": 35153 + }, + { + "epoch": 0.3051536010972127, + "grad_norm": 0.58984375, + "learning_rate": 0.0014844666032150617, + "loss": 0.1104, + "step": 35154 + }, + { + "epoch": 0.30516228157741687, + "grad_norm": 0.1416015625, + "learning_rate": 0.0014844396928573493, + "loss": 0.0986, + "step": 35155 + }, + { + "epoch": 0.305170962057621, + "grad_norm": 0.12353515625, + "learning_rate": 0.0014844127820792196, + "loss": 0.1001, + "step": 35156 + }, + { + "epoch": 0.3051796425378252, + "grad_norm": 0.51953125, + "learning_rate": 0.0014843858708807015, + "loss": 0.105, + "step": 35157 + }, + { + 
"epoch": 0.30518832301802934, + "grad_norm": 0.203125, + "learning_rate": 0.0014843589592618248, + "loss": 0.0913, + "step": 35158 + }, + { + "epoch": 0.30519700349823353, + "grad_norm": 0.58984375, + "learning_rate": 0.0014843320472226191, + "loss": 0.1602, + "step": 35159 + }, + { + "epoch": 0.30520568397843767, + "grad_norm": 0.275390625, + "learning_rate": 0.0014843051347631136, + "loss": 0.1133, + "step": 35160 + }, + { + "epoch": 0.30521436445864186, + "grad_norm": 0.431640625, + "learning_rate": 0.0014842782218833378, + "loss": 0.085, + "step": 35161 + }, + { + "epoch": 0.305223044938846, + "grad_norm": 0.2490234375, + "learning_rate": 0.001484251308583321, + "loss": 0.0981, + "step": 35162 + }, + { + "epoch": 0.3052317254190502, + "grad_norm": 0.115234375, + "learning_rate": 0.0014842243948630927, + "loss": 0.1074, + "step": 35163 + }, + { + "epoch": 0.30524040589925433, + "grad_norm": 0.2275390625, + "learning_rate": 0.0014841974807226824, + "loss": 0.1006, + "step": 35164 + }, + { + "epoch": 0.3052490863794585, + "grad_norm": 0.458984375, + "learning_rate": 0.0014841705661621197, + "loss": 0.0771, + "step": 35165 + }, + { + "epoch": 0.30525776685966266, + "grad_norm": 0.34375, + "learning_rate": 0.001484143651181434, + "loss": 0.1416, + "step": 35166 + }, + { + "epoch": 0.30526644733986685, + "grad_norm": 0.4140625, + "learning_rate": 0.001484116735780654, + "loss": 0.0635, + "step": 35167 + }, + { + "epoch": 0.305275127820071, + "grad_norm": 0.5234375, + "learning_rate": 0.0014840898199598098, + "loss": 0.1021, + "step": 35168 + }, + { + "epoch": 0.3052838083002752, + "grad_norm": 0.47265625, + "learning_rate": 0.0014840629037189313, + "loss": 0.1064, + "step": 35169 + }, + { + "epoch": 0.3052924887804793, + "grad_norm": 0.162109375, + "learning_rate": 0.0014840359870580471, + "loss": 0.1089, + "step": 35170 + }, + { + "epoch": 0.3053011692606835, + "grad_norm": 0.255859375, + "learning_rate": 0.001484009069977187, + "loss": 0.0938, + "step": 35171 + }, + { + "epoch": 0.30530984974088765, + "grad_norm": 0.115234375, + "learning_rate": 0.0014839821524763798, + "loss": 0.0957, + "step": 35172 + }, + { + "epoch": 0.30531853022109184, + "grad_norm": 0.09375, + "learning_rate": 0.001483955234555656, + "loss": 0.1016, + "step": 35173 + }, + { + "epoch": 0.305327210701296, + "grad_norm": 0.361328125, + "learning_rate": 0.0014839283162150442, + "loss": 0.0723, + "step": 35174 + }, + { + "epoch": 0.3053358911815002, + "grad_norm": 0.359375, + "learning_rate": 0.0014839013974545746, + "loss": 0.0908, + "step": 35175 + }, + { + "epoch": 0.3053445716617043, + "grad_norm": 0.1865234375, + "learning_rate": 0.001483874478274276, + "loss": 0.125, + "step": 35176 + }, + { + "epoch": 0.3053532521419085, + "grad_norm": 0.181640625, + "learning_rate": 0.0014838475586741778, + "loss": 0.0957, + "step": 35177 + }, + { + "epoch": 0.30536193262211264, + "grad_norm": 0.3125, + "learning_rate": 0.00148382063865431, + "loss": 0.1182, + "step": 35178 + }, + { + "epoch": 0.30537061310231683, + "grad_norm": 0.25390625, + "learning_rate": 0.0014837937182147016, + "loss": 0.1172, + "step": 35179 + }, + { + "epoch": 0.30537929358252097, + "grad_norm": 0.326171875, + "learning_rate": 0.0014837667973553823, + "loss": 0.127, + "step": 35180 + }, + { + "epoch": 0.30538797406272516, + "grad_norm": 0.16796875, + "learning_rate": 0.001483739876076381, + "loss": 0.1562, + "step": 35181 + }, + { + "epoch": 0.3053966545429293, + "grad_norm": 0.375, + "learning_rate": 0.001483712954377728, + "loss": 0.1406, + "step": 
35182 + }, + { + "epoch": 0.3054053350231335, + "grad_norm": 0.48046875, + "learning_rate": 0.001483686032259452, + "loss": 0.0752, + "step": 35183 + }, + { + "epoch": 0.30541401550333763, + "grad_norm": 0.1337890625, + "learning_rate": 0.0014836591097215825, + "loss": 0.0947, + "step": 35184 + }, + { + "epoch": 0.3054226959835418, + "grad_norm": 0.326171875, + "learning_rate": 0.0014836321867641496, + "loss": 0.168, + "step": 35185 + }, + { + "epoch": 0.30543137646374596, + "grad_norm": 0.1689453125, + "learning_rate": 0.0014836052633871821, + "loss": 0.1045, + "step": 35186 + }, + { + "epoch": 0.30544005694395016, + "grad_norm": 0.59765625, + "learning_rate": 0.0014835783395907099, + "loss": 0.0742, + "step": 35187 + }, + { + "epoch": 0.3054487374241543, + "grad_norm": 0.064453125, + "learning_rate": 0.001483551415374762, + "loss": 0.0742, + "step": 35188 + }, + { + "epoch": 0.3054574179043585, + "grad_norm": 0.248046875, + "learning_rate": 0.0014835244907393681, + "loss": 0.0791, + "step": 35189 + }, + { + "epoch": 0.3054660983845626, + "grad_norm": 0.5234375, + "learning_rate": 0.0014834975656845572, + "loss": 0.0815, + "step": 35190 + }, + { + "epoch": 0.3054747788647668, + "grad_norm": 0.14453125, + "learning_rate": 0.0014834706402103595, + "loss": 0.0903, + "step": 35191 + }, + { + "epoch": 0.30548345934497095, + "grad_norm": 0.1318359375, + "learning_rate": 0.0014834437143168041, + "loss": 0.0786, + "step": 35192 + }, + { + "epoch": 0.30549213982517515, + "grad_norm": 0.58203125, + "learning_rate": 0.0014834167880039204, + "loss": 0.0859, + "step": 35193 + }, + { + "epoch": 0.3055008203053793, + "grad_norm": 0.5234375, + "learning_rate": 0.0014833898612717376, + "loss": 0.0791, + "step": 35194 + }, + { + "epoch": 0.3055095007855835, + "grad_norm": 0.70703125, + "learning_rate": 0.0014833629341202857, + "loss": 0.1172, + "step": 35195 + }, + { + "epoch": 0.3055181812657876, + "grad_norm": 0.4765625, + "learning_rate": 0.0014833360065495938, + "loss": 0.0854, + "step": 35196 + }, + { + "epoch": 0.3055268617459918, + "grad_norm": 0.53125, + "learning_rate": 0.0014833090785596912, + "loss": 0.1182, + "step": 35197 + }, + { + "epoch": 0.30553554222619594, + "grad_norm": 0.6796875, + "learning_rate": 0.0014832821501506077, + "loss": 0.0957, + "step": 35198 + }, + { + "epoch": 0.30554422270640014, + "grad_norm": 0.25390625, + "learning_rate": 0.0014832552213223727, + "loss": 0.085, + "step": 35199 + }, + { + "epoch": 0.3055529031866043, + "grad_norm": 0.10693359375, + "learning_rate": 0.0014832282920750155, + "loss": 0.0918, + "step": 35200 + }, + { + "epoch": 0.30556158366680847, + "grad_norm": 0.1416015625, + "learning_rate": 0.0014832013624085654, + "loss": 0.0854, + "step": 35201 + }, + { + "epoch": 0.3055702641470126, + "grad_norm": 0.115234375, + "learning_rate": 0.0014831744323230522, + "loss": 0.0737, + "step": 35202 + }, + { + "epoch": 0.3055789446272168, + "grad_norm": 0.142578125, + "learning_rate": 0.001483147501818505, + "loss": 0.0801, + "step": 35203 + }, + { + "epoch": 0.30558762510742093, + "grad_norm": 0.1640625, + "learning_rate": 0.0014831205708949538, + "loss": 0.1641, + "step": 35204 + }, + { + "epoch": 0.3055963055876251, + "grad_norm": 0.5625, + "learning_rate": 0.0014830936395524275, + "loss": 0.1426, + "step": 35205 + }, + { + "epoch": 0.30560498606782927, + "grad_norm": 0.28515625, + "learning_rate": 0.0014830667077909562, + "loss": 0.0986, + "step": 35206 + }, + { + "epoch": 0.30561366654803346, + "grad_norm": 0.2578125, + "learning_rate": 
0.0014830397756105682, + "loss": 0.0991, + "step": 35207 + }, + { + "epoch": 0.3056223470282376, + "grad_norm": 0.42578125, + "learning_rate": 0.001483012843011294, + "loss": 0.0933, + "step": 35208 + }, + { + "epoch": 0.3056310275084418, + "grad_norm": 0.1357421875, + "learning_rate": 0.0014829859099931627, + "loss": 0.0923, + "step": 35209 + }, + { + "epoch": 0.3056397079886459, + "grad_norm": 0.71875, + "learning_rate": 0.0014829589765562035, + "loss": 0.1387, + "step": 35210 + }, + { + "epoch": 0.3056483884688501, + "grad_norm": 0.1552734375, + "learning_rate": 0.0014829320427004462, + "loss": 0.105, + "step": 35211 + }, + { + "epoch": 0.30565706894905426, + "grad_norm": 0.365234375, + "learning_rate": 0.0014829051084259204, + "loss": 0.1143, + "step": 35212 + }, + { + "epoch": 0.30566574942925845, + "grad_norm": 0.09912109375, + "learning_rate": 0.0014828781737326554, + "loss": 0.0938, + "step": 35213 + }, + { + "epoch": 0.3056744299094626, + "grad_norm": 0.29296875, + "learning_rate": 0.0014828512386206804, + "loss": 0.0918, + "step": 35214 + }, + { + "epoch": 0.3056831103896668, + "grad_norm": 0.1953125, + "learning_rate": 0.001482824303090025, + "loss": 0.1104, + "step": 35215 + }, + { + "epoch": 0.3056917908698709, + "grad_norm": 0.40234375, + "learning_rate": 0.001482797367140719, + "loss": 0.0752, + "step": 35216 + }, + { + "epoch": 0.3057004713500751, + "grad_norm": 0.953125, + "learning_rate": 0.0014827704307727911, + "loss": 0.0986, + "step": 35217 + }, + { + "epoch": 0.30570915183027925, + "grad_norm": 0.1123046875, + "learning_rate": 0.0014827434939862717, + "loss": 0.1035, + "step": 35218 + }, + { + "epoch": 0.30571783231048344, + "grad_norm": 0.1171875, + "learning_rate": 0.0014827165567811893, + "loss": 0.124, + "step": 35219 + }, + { + "epoch": 0.3057265127906876, + "grad_norm": 0.55859375, + "learning_rate": 0.0014826896191575739, + "loss": 0.0952, + "step": 35220 + }, + { + "epoch": 0.30573519327089177, + "grad_norm": 0.4453125, + "learning_rate": 0.0014826626811154549, + "loss": 0.1738, + "step": 35221 + }, + { + "epoch": 0.3057438737510959, + "grad_norm": 0.216796875, + "learning_rate": 0.001482635742654862, + "loss": 0.0986, + "step": 35222 + }, + { + "epoch": 0.3057525542313001, + "grad_norm": 0.1474609375, + "learning_rate": 0.0014826088037758242, + "loss": 0.1221, + "step": 35223 + }, + { + "epoch": 0.30576123471150424, + "grad_norm": 0.224609375, + "learning_rate": 0.0014825818644783708, + "loss": 0.0938, + "step": 35224 + }, + { + "epoch": 0.30576991519170843, + "grad_norm": 0.1484375, + "learning_rate": 0.0014825549247625323, + "loss": 0.1328, + "step": 35225 + }, + { + "epoch": 0.30577859567191257, + "grad_norm": 0.3828125, + "learning_rate": 0.0014825279846283372, + "loss": 0.085, + "step": 35226 + }, + { + "epoch": 0.30578727615211676, + "grad_norm": 0.5, + "learning_rate": 0.0014825010440758153, + "loss": 0.0957, + "step": 35227 + }, + { + "epoch": 0.3057959566323209, + "grad_norm": 0.435546875, + "learning_rate": 0.001482474103104996, + "loss": 0.0962, + "step": 35228 + }, + { + "epoch": 0.3058046371125251, + "grad_norm": 0.07666015625, + "learning_rate": 0.0014824471617159085, + "loss": 0.0713, + "step": 35229 + }, + { + "epoch": 0.30581331759272923, + "grad_norm": 0.09423828125, + "learning_rate": 0.0014824202199085828, + "loss": 0.1123, + "step": 35230 + }, + { + "epoch": 0.3058219980729334, + "grad_norm": 0.416015625, + "learning_rate": 0.001482393277683048, + "loss": 0.1001, + "step": 35231 + }, + { + "epoch": 0.30583067855313756, + "grad_norm": 
0.23828125, + "learning_rate": 0.0014823663350393337, + "loss": 0.1235, + "step": 35232 + }, + { + "epoch": 0.30583935903334175, + "grad_norm": 0.1474609375, + "learning_rate": 0.0014823393919774692, + "loss": 0.082, + "step": 35233 + }, + { + "epoch": 0.3058480395135459, + "grad_norm": 0.1318359375, + "learning_rate": 0.0014823124484974841, + "loss": 0.1211, + "step": 35234 + }, + { + "epoch": 0.3058567199937501, + "grad_norm": 0.283203125, + "learning_rate": 0.0014822855045994081, + "loss": 0.085, + "step": 35235 + }, + { + "epoch": 0.3058654004739542, + "grad_norm": 0.240234375, + "learning_rate": 0.0014822585602832705, + "loss": 0.0874, + "step": 35236 + }, + { + "epoch": 0.3058740809541584, + "grad_norm": 0.259765625, + "learning_rate": 0.0014822316155491003, + "loss": 0.1357, + "step": 35237 + }, + { + "epoch": 0.30588276143436255, + "grad_norm": 0.25390625, + "learning_rate": 0.0014822046703969274, + "loss": 0.0728, + "step": 35238 + }, + { + "epoch": 0.3058914419145667, + "grad_norm": 0.69140625, + "learning_rate": 0.0014821777248267812, + "loss": 0.0962, + "step": 35239 + }, + { + "epoch": 0.3059001223947709, + "grad_norm": 0.333984375, + "learning_rate": 0.0014821507788386912, + "loss": 0.1162, + "step": 35240 + }, + { + "epoch": 0.305908802874975, + "grad_norm": 0.06689453125, + "learning_rate": 0.0014821238324326871, + "loss": 0.0801, + "step": 35241 + }, + { + "epoch": 0.3059174833551792, + "grad_norm": 0.263671875, + "learning_rate": 0.001482096885608798, + "loss": 0.083, + "step": 35242 + }, + { + "epoch": 0.30592616383538335, + "grad_norm": 0.333984375, + "learning_rate": 0.0014820699383670535, + "loss": 0.0967, + "step": 35243 + }, + { + "epoch": 0.30593484431558754, + "grad_norm": 0.314453125, + "learning_rate": 0.0014820429907074829, + "loss": 0.1504, + "step": 35244 + }, + { + "epoch": 0.3059435247957917, + "grad_norm": 0.2373046875, + "learning_rate": 0.001482016042630116, + "loss": 0.0938, + "step": 35245 + }, + { + "epoch": 0.30595220527599587, + "grad_norm": 0.3359375, + "learning_rate": 0.0014819890941349819, + "loss": 0.0942, + "step": 35246 + }, + { + "epoch": 0.3059608857562, + "grad_norm": 0.333984375, + "learning_rate": 0.0014819621452221105, + "loss": 0.1064, + "step": 35247 + }, + { + "epoch": 0.3059695662364042, + "grad_norm": 0.5078125, + "learning_rate": 0.001481935195891531, + "loss": 0.1055, + "step": 35248 + }, + { + "epoch": 0.30597824671660834, + "grad_norm": 0.1611328125, + "learning_rate": 0.0014819082461432725, + "loss": 0.0742, + "step": 35249 + }, + { + "epoch": 0.30598692719681253, + "grad_norm": 0.16015625, + "learning_rate": 0.0014818812959773652, + "loss": 0.1055, + "step": 35250 + }, + { + "epoch": 0.30599560767701667, + "grad_norm": 0.9296875, + "learning_rate": 0.0014818543453938384, + "loss": 0.0869, + "step": 35251 + }, + { + "epoch": 0.30600428815722086, + "grad_norm": 0.1328125, + "learning_rate": 0.0014818273943927214, + "loss": 0.1133, + "step": 35252 + }, + { + "epoch": 0.306012968637425, + "grad_norm": 0.1015625, + "learning_rate": 0.0014818004429740434, + "loss": 0.0933, + "step": 35253 + }, + { + "epoch": 0.3060216491176292, + "grad_norm": 0.2431640625, + "learning_rate": 0.0014817734911378342, + "loss": 0.0742, + "step": 35254 + }, + { + "epoch": 0.30603032959783333, + "grad_norm": 0.1484375, + "learning_rate": 0.0014817465388841237, + "loss": 0.1162, + "step": 35255 + }, + { + "epoch": 0.3060390100780375, + "grad_norm": 0.1416015625, + "learning_rate": 0.0014817195862129404, + "loss": 0.1094, + "step": 35256 + }, + { + 
"epoch": 0.30604769055824166, + "grad_norm": 0.11376953125, + "learning_rate": 0.0014816926331243149, + "loss": 0.0835, + "step": 35257 + }, + { + "epoch": 0.30605637103844585, + "grad_norm": 0.15234375, + "learning_rate": 0.0014816656796182756, + "loss": 0.0913, + "step": 35258 + }, + { + "epoch": 0.30606505151865, + "grad_norm": 0.1728515625, + "learning_rate": 0.0014816387256948523, + "loss": 0.085, + "step": 35259 + }, + { + "epoch": 0.3060737319988542, + "grad_norm": 0.083984375, + "learning_rate": 0.0014816117713540749, + "loss": 0.0835, + "step": 35260 + }, + { + "epoch": 0.3060824124790583, + "grad_norm": 0.328125, + "learning_rate": 0.0014815848165959727, + "loss": 0.1055, + "step": 35261 + }, + { + "epoch": 0.3060910929592625, + "grad_norm": 0.1201171875, + "learning_rate": 0.001481557861420575, + "loss": 0.1001, + "step": 35262 + }, + { + "epoch": 0.30609977343946665, + "grad_norm": 0.1357421875, + "learning_rate": 0.001481530905827911, + "loss": 0.1348, + "step": 35263 + }, + { + "epoch": 0.30610845391967084, + "grad_norm": 0.36328125, + "learning_rate": 0.001481503949818011, + "loss": 0.125, + "step": 35264 + }, + { + "epoch": 0.306117134399875, + "grad_norm": 0.2275390625, + "learning_rate": 0.0014814769933909038, + "loss": 0.0674, + "step": 35265 + }, + { + "epoch": 0.3061258148800792, + "grad_norm": 0.08544921875, + "learning_rate": 0.0014814500365466193, + "loss": 0.0713, + "step": 35266 + }, + { + "epoch": 0.3061344953602833, + "grad_norm": 0.1845703125, + "learning_rate": 0.0014814230792851864, + "loss": 0.0879, + "step": 35267 + }, + { + "epoch": 0.3061431758404875, + "grad_norm": 0.51953125, + "learning_rate": 0.0014813961216066354, + "loss": 0.0781, + "step": 35268 + }, + { + "epoch": 0.30615185632069164, + "grad_norm": 0.125, + "learning_rate": 0.0014813691635109946, + "loss": 0.1118, + "step": 35269 + }, + { + "epoch": 0.30616053680089583, + "grad_norm": 0.10302734375, + "learning_rate": 0.0014813422049982949, + "loss": 0.0977, + "step": 35270 + }, + { + "epoch": 0.30616921728109997, + "grad_norm": 0.0869140625, + "learning_rate": 0.001481315246068565, + "loss": 0.0918, + "step": 35271 + }, + { + "epoch": 0.30617789776130416, + "grad_norm": 0.287109375, + "learning_rate": 0.0014812882867218345, + "loss": 0.0879, + "step": 35272 + }, + { + "epoch": 0.3061865782415083, + "grad_norm": 0.31640625, + "learning_rate": 0.0014812613269581327, + "loss": 0.1152, + "step": 35273 + }, + { + "epoch": 0.3061952587217125, + "grad_norm": 0.154296875, + "learning_rate": 0.0014812343667774891, + "loss": 0.0898, + "step": 35274 + }, + { + "epoch": 0.30620393920191663, + "grad_norm": 0.318359375, + "learning_rate": 0.0014812074061799337, + "loss": 0.0879, + "step": 35275 + }, + { + "epoch": 0.3062126196821208, + "grad_norm": 0.212890625, + "learning_rate": 0.0014811804451654953, + "loss": 0.1328, + "step": 35276 + }, + { + "epoch": 0.30622130016232496, + "grad_norm": 0.443359375, + "learning_rate": 0.0014811534837342038, + "loss": 0.1162, + "step": 35277 + }, + { + "epoch": 0.30622998064252915, + "grad_norm": 0.40234375, + "learning_rate": 0.0014811265218860883, + "loss": 0.0645, + "step": 35278 + }, + { + "epoch": 0.3062386611227333, + "grad_norm": 0.2373046875, + "learning_rate": 0.001481099559621179, + "loss": 0.0967, + "step": 35279 + }, + { + "epoch": 0.3062473416029375, + "grad_norm": 0.220703125, + "learning_rate": 0.0014810725969395047, + "loss": 0.0723, + "step": 35280 + }, + { + "epoch": 0.3062560220831416, + "grad_norm": 0.08935546875, + "learning_rate": 
0.0014810456338410953, + "loss": 0.0864, + "step": 35281 + }, + { + "epoch": 0.3062647025633458, + "grad_norm": 0.126953125, + "learning_rate": 0.00148101867032598, + "loss": 0.0981, + "step": 35282 + }, + { + "epoch": 0.30627338304354995, + "grad_norm": 0.37109375, + "learning_rate": 0.0014809917063941887, + "loss": 0.1118, + "step": 35283 + }, + { + "epoch": 0.30628206352375414, + "grad_norm": 0.1396484375, + "learning_rate": 0.0014809647420457501, + "loss": 0.1191, + "step": 35284 + }, + { + "epoch": 0.3062907440039583, + "grad_norm": 0.1005859375, + "learning_rate": 0.0014809377772806948, + "loss": 0.106, + "step": 35285 + }, + { + "epoch": 0.3062994244841625, + "grad_norm": 0.2216796875, + "learning_rate": 0.0014809108120990512, + "loss": 0.0757, + "step": 35286 + }, + { + "epoch": 0.3063081049643666, + "grad_norm": 0.2177734375, + "learning_rate": 0.0014808838465008491, + "loss": 0.0952, + "step": 35287 + }, + { + "epoch": 0.3063167854445708, + "grad_norm": 0.10595703125, + "learning_rate": 0.0014808568804861185, + "loss": 0.123, + "step": 35288 + }, + { + "epoch": 0.30632546592477494, + "grad_norm": 0.1279296875, + "learning_rate": 0.0014808299140548881, + "loss": 0.1182, + "step": 35289 + }, + { + "epoch": 0.30633414640497914, + "grad_norm": 0.0849609375, + "learning_rate": 0.0014808029472071884, + "loss": 0.124, + "step": 35290 + }, + { + "epoch": 0.3063428268851833, + "grad_norm": 0.388671875, + "learning_rate": 0.001480775979943048, + "loss": 0.0815, + "step": 35291 + }, + { + "epoch": 0.30635150736538747, + "grad_norm": 0.4453125, + "learning_rate": 0.001480749012262497, + "loss": 0.1025, + "step": 35292 + }, + { + "epoch": 0.3063601878455916, + "grad_norm": 0.4296875, + "learning_rate": 0.0014807220441655643, + "loss": 0.1045, + "step": 35293 + }, + { + "epoch": 0.3063688683257958, + "grad_norm": 0.765625, + "learning_rate": 0.0014806950756522798, + "loss": 0.1035, + "step": 35294 + }, + { + "epoch": 0.30637754880599993, + "grad_norm": 0.3046875, + "learning_rate": 0.001480668106722673, + "loss": 0.1445, + "step": 35295 + }, + { + "epoch": 0.3063862292862041, + "grad_norm": 0.28515625, + "learning_rate": 0.0014806411373767727, + "loss": 0.1387, + "step": 35296 + }, + { + "epoch": 0.30639490976640826, + "grad_norm": 0.12890625, + "learning_rate": 0.0014806141676146096, + "loss": 0.127, + "step": 35297 + }, + { + "epoch": 0.30640359024661246, + "grad_norm": 0.1376953125, + "learning_rate": 0.0014805871974362123, + "loss": 0.0918, + "step": 35298 + }, + { + "epoch": 0.3064122707268166, + "grad_norm": 0.263671875, + "learning_rate": 0.0014805602268416107, + "loss": 0.0762, + "step": 35299 + }, + { + "epoch": 0.3064209512070208, + "grad_norm": 0.17578125, + "learning_rate": 0.0014805332558308342, + "loss": 0.1191, + "step": 35300 + }, + { + "epoch": 0.3064296316872249, + "grad_norm": 0.38671875, + "learning_rate": 0.001480506284403912, + "loss": 0.0957, + "step": 35301 + }, + { + "epoch": 0.3064383121674291, + "grad_norm": 0.08251953125, + "learning_rate": 0.0014804793125608741, + "loss": 0.0923, + "step": 35302 + }, + { + "epoch": 0.30644699264763325, + "grad_norm": 0.224609375, + "learning_rate": 0.0014804523403017496, + "loss": 0.0996, + "step": 35303 + }, + { + "epoch": 0.30645567312783745, + "grad_norm": 0.2041015625, + "learning_rate": 0.0014804253676265681, + "loss": 0.082, + "step": 35304 + }, + { + "epoch": 0.3064643536080416, + "grad_norm": 0.2890625, + "learning_rate": 0.0014803983945353594, + "loss": 0.0586, + "step": 35305 + }, + { + "epoch": 0.3064730340882458, + 
"grad_norm": 0.13671875, + "learning_rate": 0.0014803714210281525, + "loss": 0.1602, + "step": 35306 + }, + { + "epoch": 0.3064817145684499, + "grad_norm": 0.09765625, + "learning_rate": 0.001480344447104977, + "loss": 0.127, + "step": 35307 + }, + { + "epoch": 0.3064903950486541, + "grad_norm": 0.0986328125, + "learning_rate": 0.0014803174727658628, + "loss": 0.0996, + "step": 35308 + }, + { + "epoch": 0.30649907552885824, + "grad_norm": 0.447265625, + "learning_rate": 0.0014802904980108392, + "loss": 0.0996, + "step": 35309 + }, + { + "epoch": 0.30650775600906244, + "grad_norm": 1.0390625, + "learning_rate": 0.0014802635228399353, + "loss": 0.1484, + "step": 35310 + }, + { + "epoch": 0.3065164364892666, + "grad_norm": 0.1455078125, + "learning_rate": 0.001480236547253181, + "loss": 0.1104, + "step": 35311 + }, + { + "epoch": 0.30652511696947077, + "grad_norm": 0.380859375, + "learning_rate": 0.0014802095712506058, + "loss": 0.0908, + "step": 35312 + }, + { + "epoch": 0.3065337974496749, + "grad_norm": 0.84765625, + "learning_rate": 0.0014801825948322395, + "loss": 0.0977, + "step": 35313 + }, + { + "epoch": 0.3065424779298791, + "grad_norm": 0.2041015625, + "learning_rate": 0.0014801556179981105, + "loss": 0.0752, + "step": 35314 + }, + { + "epoch": 0.30655115841008324, + "grad_norm": 0.5234375, + "learning_rate": 0.0014801286407482492, + "loss": 0.1074, + "step": 35315 + }, + { + "epoch": 0.30655983889028743, + "grad_norm": 0.310546875, + "learning_rate": 0.001480101663082685, + "loss": 0.1152, + "step": 35316 + }, + { + "epoch": 0.30656851937049157, + "grad_norm": 0.189453125, + "learning_rate": 0.0014800746850014478, + "loss": 0.062, + "step": 35317 + }, + { + "epoch": 0.30657719985069576, + "grad_norm": 0.375, + "learning_rate": 0.0014800477065045664, + "loss": 0.1011, + "step": 35318 + }, + { + "epoch": 0.3065858803308999, + "grad_norm": 0.4609375, + "learning_rate": 0.00148002072759207, + "loss": 0.0693, + "step": 35319 + }, + { + "epoch": 0.3065945608111041, + "grad_norm": 0.81640625, + "learning_rate": 0.0014799937482639892, + "loss": 0.3398, + "step": 35320 + }, + { + "epoch": 0.3066032412913082, + "grad_norm": 0.318359375, + "learning_rate": 0.001479966768520353, + "loss": 0.0835, + "step": 35321 + }, + { + "epoch": 0.3066119217715124, + "grad_norm": 0.216796875, + "learning_rate": 0.0014799397883611908, + "loss": 0.0894, + "step": 35322 + }, + { + "epoch": 0.30662060225171656, + "grad_norm": 0.515625, + "learning_rate": 0.0014799128077865318, + "loss": 0.1143, + "step": 35323 + }, + { + "epoch": 0.30662928273192075, + "grad_norm": 0.435546875, + "learning_rate": 0.0014798858267964059, + "loss": 0.083, + "step": 35324 + }, + { + "epoch": 0.3066379632121249, + "grad_norm": 0.1796875, + "learning_rate": 0.0014798588453908427, + "loss": 0.0947, + "step": 35325 + }, + { + "epoch": 0.3066466436923291, + "grad_norm": 0.482421875, + "learning_rate": 0.0014798318635698715, + "loss": 0.1523, + "step": 35326 + }, + { + "epoch": 0.3066553241725332, + "grad_norm": 0.42578125, + "learning_rate": 0.0014798048813335221, + "loss": 0.1367, + "step": 35327 + }, + { + "epoch": 0.3066640046527374, + "grad_norm": 0.19140625, + "learning_rate": 0.0014797778986818234, + "loss": 0.1055, + "step": 35328 + }, + { + "epoch": 0.30667268513294155, + "grad_norm": 0.27734375, + "learning_rate": 0.001479750915614806, + "loss": 0.0693, + "step": 35329 + }, + { + "epoch": 0.30668136561314574, + "grad_norm": 0.2451171875, + "learning_rate": 0.001479723932132498, + "loss": 0.0791, + "step": 35330 + }, + { + 
"epoch": 0.3066900460933499, + "grad_norm": 0.384765625, + "learning_rate": 0.00147969694823493, + "loss": 0.1289, + "step": 35331 + }, + { + "epoch": 0.30669872657355407, + "grad_norm": 0.150390625, + "learning_rate": 0.001479669963922131, + "loss": 0.104, + "step": 35332 + }, + { + "epoch": 0.3067074070537582, + "grad_norm": 0.2001953125, + "learning_rate": 0.0014796429791941307, + "loss": 0.1055, + "step": 35333 + }, + { + "epoch": 0.3067160875339624, + "grad_norm": 0.466796875, + "learning_rate": 0.0014796159940509586, + "loss": 0.1152, + "step": 35334 + }, + { + "epoch": 0.30672476801416654, + "grad_norm": 0.373046875, + "learning_rate": 0.0014795890084926438, + "loss": 0.0913, + "step": 35335 + }, + { + "epoch": 0.30673344849437073, + "grad_norm": 0.3515625, + "learning_rate": 0.0014795620225192165, + "loss": 0.0811, + "step": 35336 + }, + { + "epoch": 0.30674212897457487, + "grad_norm": 0.412109375, + "learning_rate": 0.0014795350361307056, + "loss": 0.1152, + "step": 35337 + }, + { + "epoch": 0.30675080945477906, + "grad_norm": 0.1123046875, + "learning_rate": 0.0014795080493271412, + "loss": 0.1138, + "step": 35338 + }, + { + "epoch": 0.3067594899349832, + "grad_norm": 0.08056640625, + "learning_rate": 0.0014794810621085524, + "loss": 0.1328, + "step": 35339 + }, + { + "epoch": 0.3067681704151874, + "grad_norm": 0.29296875, + "learning_rate": 0.001479454074474969, + "loss": 0.1235, + "step": 35340 + }, + { + "epoch": 0.30677685089539153, + "grad_norm": 0.494140625, + "learning_rate": 0.0014794270864264196, + "loss": 0.0854, + "step": 35341 + }, + { + "epoch": 0.3067855313755957, + "grad_norm": 0.0732421875, + "learning_rate": 0.0014794000979629354, + "loss": 0.0791, + "step": 35342 + }, + { + "epoch": 0.30679421185579986, + "grad_norm": 0.14453125, + "learning_rate": 0.0014793731090845443, + "loss": 0.1602, + "step": 35343 + }, + { + "epoch": 0.30680289233600405, + "grad_norm": 0.1728515625, + "learning_rate": 0.0014793461197912766, + "loss": 0.0781, + "step": 35344 + }, + { + "epoch": 0.3068115728162082, + "grad_norm": 0.57421875, + "learning_rate": 0.001479319130083162, + "loss": 0.1475, + "step": 35345 + }, + { + "epoch": 0.3068202532964124, + "grad_norm": 0.6875, + "learning_rate": 0.001479292139960229, + "loss": 0.082, + "step": 35346 + }, + { + "epoch": 0.3068289337766165, + "grad_norm": 0.376953125, + "learning_rate": 0.0014792651494225084, + "loss": 0.1069, + "step": 35347 + }, + { + "epoch": 0.3068376142568207, + "grad_norm": 0.296875, + "learning_rate": 0.001479238158470029, + "loss": 0.1318, + "step": 35348 + }, + { + "epoch": 0.30684629473702485, + "grad_norm": 0.09033203125, + "learning_rate": 0.0014792111671028208, + "loss": 0.0693, + "step": 35349 + }, + { + "epoch": 0.30685497521722904, + "grad_norm": 0.484375, + "learning_rate": 0.0014791841753209122, + "loss": 0.125, + "step": 35350 + }, + { + "epoch": 0.3068636556974332, + "grad_norm": 0.3828125, + "learning_rate": 0.001479157183124334, + "loss": 0.1357, + "step": 35351 + }, + { + "epoch": 0.3068723361776374, + "grad_norm": 0.8359375, + "learning_rate": 0.0014791301905131152, + "loss": 0.0894, + "step": 35352 + }, + { + "epoch": 0.3068810166578415, + "grad_norm": 0.8203125, + "learning_rate": 0.0014791031974872853, + "loss": 0.0894, + "step": 35353 + }, + { + "epoch": 0.3068896971380457, + "grad_norm": 0.5390625, + "learning_rate": 0.0014790762040468738, + "loss": 0.0635, + "step": 35354 + }, + { + "epoch": 0.30689837761824984, + "grad_norm": 0.197265625, + "learning_rate": 0.00147904921019191, + "loss": 0.166, 
+ "step": 35355 + }, + { + "epoch": 0.30690705809845403, + "grad_norm": 1.359375, + "learning_rate": 0.0014790222159224238, + "loss": 0.0771, + "step": 35356 + }, + { + "epoch": 0.30691573857865817, + "grad_norm": 0.1494140625, + "learning_rate": 0.0014789952212384445, + "loss": 0.103, + "step": 35357 + }, + { + "epoch": 0.30692441905886236, + "grad_norm": 0.28515625, + "learning_rate": 0.001478968226140002, + "loss": 0.1016, + "step": 35358 + }, + { + "epoch": 0.3069330995390665, + "grad_norm": 0.1259765625, + "learning_rate": 0.0014789412306271256, + "loss": 0.1011, + "step": 35359 + }, + { + "epoch": 0.3069417800192707, + "grad_norm": 0.2177734375, + "learning_rate": 0.0014789142346998446, + "loss": 0.105, + "step": 35360 + }, + { + "epoch": 0.30695046049947483, + "grad_norm": 0.267578125, + "learning_rate": 0.0014788872383581886, + "loss": 0.0898, + "step": 35361 + }, + { + "epoch": 0.30695914097967897, + "grad_norm": 0.1201171875, + "learning_rate": 0.0014788602416021875, + "loss": 0.0859, + "step": 35362 + }, + { + "epoch": 0.30696782145988316, + "grad_norm": 0.306640625, + "learning_rate": 0.00147883324443187, + "loss": 0.0713, + "step": 35363 + }, + { + "epoch": 0.3069765019400873, + "grad_norm": 0.74609375, + "learning_rate": 0.0014788062468472664, + "loss": 0.1123, + "step": 35364 + }, + { + "epoch": 0.3069851824202915, + "grad_norm": 0.287109375, + "learning_rate": 0.0014787792488484062, + "loss": 0.1055, + "step": 35365 + }, + { + "epoch": 0.30699386290049563, + "grad_norm": 0.25, + "learning_rate": 0.0014787522504353186, + "loss": 0.0962, + "step": 35366 + }, + { + "epoch": 0.3070025433806998, + "grad_norm": 0.5234375, + "learning_rate": 0.0014787252516080332, + "loss": 0.0908, + "step": 35367 + }, + { + "epoch": 0.30701122386090396, + "grad_norm": 0.109375, + "learning_rate": 0.0014786982523665795, + "loss": 0.1104, + "step": 35368 + }, + { + "epoch": 0.30701990434110815, + "grad_norm": 0.5546875, + "learning_rate": 0.0014786712527109873, + "loss": 0.1592, + "step": 35369 + }, + { + "epoch": 0.3070285848213123, + "grad_norm": 0.201171875, + "learning_rate": 0.0014786442526412858, + "loss": 0.1021, + "step": 35370 + }, + { + "epoch": 0.3070372653015165, + "grad_norm": 0.158203125, + "learning_rate": 0.0014786172521575045, + "loss": 0.1035, + "step": 35371 + }, + { + "epoch": 0.3070459457817206, + "grad_norm": 0.333984375, + "learning_rate": 0.0014785902512596731, + "loss": 0.0752, + "step": 35372 + }, + { + "epoch": 0.3070546262619248, + "grad_norm": 0.2216796875, + "learning_rate": 0.0014785632499478211, + "loss": 0.0894, + "step": 35373 + }, + { + "epoch": 0.30706330674212895, + "grad_norm": 0.0947265625, + "learning_rate": 0.001478536248221978, + "loss": 0.0986, + "step": 35374 + }, + { + "epoch": 0.30707198722233314, + "grad_norm": 0.48828125, + "learning_rate": 0.0014785092460821733, + "loss": 0.1113, + "step": 35375 + }, + { + "epoch": 0.3070806677025373, + "grad_norm": 0.306640625, + "learning_rate": 0.0014784822435284367, + "loss": 0.0918, + "step": 35376 + }, + { + "epoch": 0.3070893481827415, + "grad_norm": 0.4140625, + "learning_rate": 0.0014784552405607979, + "loss": 0.0884, + "step": 35377 + }, + { + "epoch": 0.3070980286629456, + "grad_norm": 0.11474609375, + "learning_rate": 0.001478428237179286, + "loss": 0.0928, + "step": 35378 + }, + { + "epoch": 0.3071067091431498, + "grad_norm": 0.11767578125, + "learning_rate": 0.00147840123338393, + "loss": 0.0889, + "step": 35379 + }, + { + "epoch": 0.30711538962335394, + "grad_norm": 0.09814453125, + "learning_rate": 
0.0014783742291747607, + "loss": 0.1108, + "step": 35380 + }, + { + "epoch": 0.30712407010355813, + "grad_norm": 0.2197265625, + "learning_rate": 0.0014783472245518071, + "loss": 0.1191, + "step": 35381 + }, + { + "epoch": 0.30713275058376227, + "grad_norm": 0.58203125, + "learning_rate": 0.0014783202195150984, + "loss": 0.1406, + "step": 35382 + }, + { + "epoch": 0.30714143106396646, + "grad_norm": 0.13671875, + "learning_rate": 0.0014782932140646643, + "loss": 0.1045, + "step": 35383 + }, + { + "epoch": 0.3071501115441706, + "grad_norm": 0.45703125, + "learning_rate": 0.0014782662082005345, + "loss": 0.0977, + "step": 35384 + }, + { + "epoch": 0.3071587920243748, + "grad_norm": 0.353515625, + "learning_rate": 0.0014782392019227384, + "loss": 0.0933, + "step": 35385 + }, + { + "epoch": 0.30716747250457893, + "grad_norm": 0.17578125, + "learning_rate": 0.001478212195231306, + "loss": 0.1562, + "step": 35386 + }, + { + "epoch": 0.3071761529847831, + "grad_norm": 0.498046875, + "learning_rate": 0.0014781851881262662, + "loss": 0.0869, + "step": 35387 + }, + { + "epoch": 0.30718483346498726, + "grad_norm": 0.09765625, + "learning_rate": 0.0014781581806076487, + "loss": 0.0737, + "step": 35388 + }, + { + "epoch": 0.30719351394519145, + "grad_norm": 0.2236328125, + "learning_rate": 0.0014781311726754832, + "loss": 0.1201, + "step": 35389 + }, + { + "epoch": 0.3072021944253956, + "grad_norm": 0.34375, + "learning_rate": 0.0014781041643297987, + "loss": 0.0879, + "step": 35390 + }, + { + "epoch": 0.3072108749055998, + "grad_norm": 0.1357421875, + "learning_rate": 0.0014780771555706258, + "loss": 0.0864, + "step": 35391 + }, + { + "epoch": 0.3072195553858039, + "grad_norm": 0.65625, + "learning_rate": 0.001478050146397993, + "loss": 0.0791, + "step": 35392 + }, + { + "epoch": 0.3072282358660081, + "grad_norm": 0.81640625, + "learning_rate": 0.0014780231368119302, + "loss": 0.1064, + "step": 35393 + }, + { + "epoch": 0.30723691634621225, + "grad_norm": 0.1923828125, + "learning_rate": 0.0014779961268124673, + "loss": 0.1152, + "step": 35394 + }, + { + "epoch": 0.30724559682641644, + "grad_norm": 0.263671875, + "learning_rate": 0.0014779691163996334, + "loss": 0.1475, + "step": 35395 + }, + { + "epoch": 0.3072542773066206, + "grad_norm": 0.443359375, + "learning_rate": 0.0014779421055734578, + "loss": 0.0737, + "step": 35396 + }, + { + "epoch": 0.3072629577868248, + "grad_norm": 0.58203125, + "learning_rate": 0.0014779150943339708, + "loss": 0.084, + "step": 35397 + }, + { + "epoch": 0.3072716382670289, + "grad_norm": 0.1220703125, + "learning_rate": 0.0014778880826812016, + "loss": 0.0869, + "step": 35398 + }, + { + "epoch": 0.3072803187472331, + "grad_norm": 0.98828125, + "learning_rate": 0.0014778610706151795, + "loss": 0.1172, + "step": 35399 + }, + { + "epoch": 0.30728899922743724, + "grad_norm": 0.1396484375, + "learning_rate": 0.0014778340581359341, + "loss": 0.0986, + "step": 35400 + }, + { + "epoch": 0.30729767970764144, + "grad_norm": 0.259765625, + "learning_rate": 0.0014778070452434954, + "loss": 0.1035, + "step": 35401 + }, + { + "epoch": 0.3073063601878456, + "grad_norm": 0.3828125, + "learning_rate": 0.001477780031937892, + "loss": 0.1006, + "step": 35402 + }, + { + "epoch": 0.30731504066804977, + "grad_norm": 0.458984375, + "learning_rate": 0.0014777530182191545, + "loss": 0.1055, + "step": 35403 + }, + { + "epoch": 0.3073237211482539, + "grad_norm": 0.169921875, + "learning_rate": 0.001477726004087312, + "loss": 0.0933, + "step": 35404 + }, + { + "epoch": 0.3073324016284581, + 
"grad_norm": 0.3828125, + "learning_rate": 0.0014776989895423937, + "loss": 0.0962, + "step": 35405 + }, + { + "epoch": 0.30734108210866223, + "grad_norm": 0.6015625, + "learning_rate": 0.0014776719745844295, + "loss": 0.1338, + "step": 35406 + }, + { + "epoch": 0.3073497625888664, + "grad_norm": 0.080078125, + "learning_rate": 0.001477644959213449, + "loss": 0.0835, + "step": 35407 + }, + { + "epoch": 0.30735844306907056, + "grad_norm": 0.1708984375, + "learning_rate": 0.0014776179434294822, + "loss": 0.1025, + "step": 35408 + }, + { + "epoch": 0.30736712354927476, + "grad_norm": 0.185546875, + "learning_rate": 0.0014775909272325574, + "loss": 0.0898, + "step": 35409 + }, + { + "epoch": 0.3073758040294789, + "grad_norm": 0.2294921875, + "learning_rate": 0.0014775639106227047, + "loss": 0.1377, + "step": 35410 + }, + { + "epoch": 0.3073844845096831, + "grad_norm": 0.69921875, + "learning_rate": 0.0014775368935999542, + "loss": 0.1338, + "step": 35411 + }, + { + "epoch": 0.3073931649898872, + "grad_norm": 0.1943359375, + "learning_rate": 0.0014775098761643348, + "loss": 0.0957, + "step": 35412 + }, + { + "epoch": 0.3074018454700914, + "grad_norm": 0.1806640625, + "learning_rate": 0.0014774828583158765, + "loss": 0.1143, + "step": 35413 + }, + { + "epoch": 0.30741052595029555, + "grad_norm": 0.4453125, + "learning_rate": 0.0014774558400546083, + "loss": 0.1025, + "step": 35414 + }, + { + "epoch": 0.30741920643049975, + "grad_norm": 0.224609375, + "learning_rate": 0.0014774288213805604, + "loss": 0.0752, + "step": 35415 + }, + { + "epoch": 0.3074278869107039, + "grad_norm": 0.08544921875, + "learning_rate": 0.001477401802293762, + "loss": 0.0977, + "step": 35416 + }, + { + "epoch": 0.3074365673909081, + "grad_norm": 1.171875, + "learning_rate": 0.0014773747827942425, + "loss": 0.1318, + "step": 35417 + }, + { + "epoch": 0.3074452478711122, + "grad_norm": 0.10986328125, + "learning_rate": 0.0014773477628820316, + "loss": 0.1001, + "step": 35418 + }, + { + "epoch": 0.3074539283513164, + "grad_norm": 0.42578125, + "learning_rate": 0.0014773207425571588, + "loss": 0.1113, + "step": 35419 + }, + { + "epoch": 0.30746260883152055, + "grad_norm": 0.158203125, + "learning_rate": 0.0014772937218196541, + "loss": 0.0806, + "step": 35420 + }, + { + "epoch": 0.30747128931172474, + "grad_norm": 0.380859375, + "learning_rate": 0.0014772667006695461, + "loss": 0.0771, + "step": 35421 + }, + { + "epoch": 0.3074799697919289, + "grad_norm": 0.2099609375, + "learning_rate": 0.0014772396791068653, + "loss": 0.1045, + "step": 35422 + }, + { + "epoch": 0.30748865027213307, + "grad_norm": 0.09228515625, + "learning_rate": 0.0014772126571316404, + "loss": 0.0957, + "step": 35423 + }, + { + "epoch": 0.3074973307523372, + "grad_norm": 0.1455078125, + "learning_rate": 0.0014771856347439022, + "loss": 0.1094, + "step": 35424 + }, + { + "epoch": 0.3075060112325414, + "grad_norm": 0.15234375, + "learning_rate": 0.0014771586119436788, + "loss": 0.0859, + "step": 35425 + }, + { + "epoch": 0.30751469171274554, + "grad_norm": 0.1640625, + "learning_rate": 0.0014771315887310006, + "loss": 0.2734, + "step": 35426 + }, + { + "epoch": 0.30752337219294973, + "grad_norm": 0.279296875, + "learning_rate": 0.0014771045651058969, + "loss": 0.0791, + "step": 35427 + }, + { + "epoch": 0.30753205267315387, + "grad_norm": 0.1884765625, + "learning_rate": 0.0014770775410683974, + "loss": 0.0933, + "step": 35428 + }, + { + "epoch": 0.30754073315335806, + "grad_norm": 0.1298828125, + "learning_rate": 0.0014770505166185317, + "loss": 0.0967, + 
"step": 35429 + }, + { + "epoch": 0.3075494136335622, + "grad_norm": 0.25, + "learning_rate": 0.001477023491756329, + "loss": 0.0796, + "step": 35430 + }, + { + "epoch": 0.3075580941137664, + "grad_norm": 1.8671875, + "learning_rate": 0.001476996466481819, + "loss": 0.1309, + "step": 35431 + }, + { + "epoch": 0.3075667745939705, + "grad_norm": 0.1513671875, + "learning_rate": 0.0014769694407950314, + "loss": 0.1357, + "step": 35432 + }, + { + "epoch": 0.3075754550741747, + "grad_norm": 0.28125, + "learning_rate": 0.0014769424146959958, + "loss": 0.1201, + "step": 35433 + }, + { + "epoch": 0.30758413555437886, + "grad_norm": 0.10205078125, + "learning_rate": 0.0014769153881847416, + "loss": 0.1025, + "step": 35434 + }, + { + "epoch": 0.30759281603458305, + "grad_norm": 0.291015625, + "learning_rate": 0.0014768883612612984, + "loss": 0.1016, + "step": 35435 + }, + { + "epoch": 0.3076014965147872, + "grad_norm": 0.11767578125, + "learning_rate": 0.0014768613339256956, + "loss": 0.0986, + "step": 35436 + }, + { + "epoch": 0.3076101769949914, + "grad_norm": 0.189453125, + "learning_rate": 0.001476834306177963, + "loss": 0.1152, + "step": 35437 + }, + { + "epoch": 0.3076188574751955, + "grad_norm": 0.10986328125, + "learning_rate": 0.0014768072780181303, + "loss": 0.0864, + "step": 35438 + }, + { + "epoch": 0.3076275379553997, + "grad_norm": 0.197265625, + "learning_rate": 0.0014767802494462265, + "loss": 0.0962, + "step": 35439 + }, + { + "epoch": 0.30763621843560385, + "grad_norm": 0.60546875, + "learning_rate": 0.0014767532204622816, + "loss": 0.1045, + "step": 35440 + }, + { + "epoch": 0.30764489891580804, + "grad_norm": 0.2314453125, + "learning_rate": 0.001476726191066325, + "loss": 0.0913, + "step": 35441 + }, + { + "epoch": 0.3076535793960122, + "grad_norm": 0.322265625, + "learning_rate": 0.0014766991612583861, + "loss": 0.1016, + "step": 35442 + }, + { + "epoch": 0.30766225987621637, + "grad_norm": 0.4921875, + "learning_rate": 0.0014766721310384952, + "loss": 0.1279, + "step": 35443 + }, + { + "epoch": 0.3076709403564205, + "grad_norm": 0.1689453125, + "learning_rate": 0.001476645100406681, + "loss": 0.1006, + "step": 35444 + }, + { + "epoch": 0.3076796208366247, + "grad_norm": 0.515625, + "learning_rate": 0.0014766180693629731, + "loss": 0.123, + "step": 35445 + }, + { + "epoch": 0.30768830131682884, + "grad_norm": 0.1923828125, + "learning_rate": 0.0014765910379074018, + "loss": 0.1406, + "step": 35446 + }, + { + "epoch": 0.30769698179703303, + "grad_norm": 0.6484375, + "learning_rate": 0.0014765640060399963, + "loss": 0.1338, + "step": 35447 + }, + { + "epoch": 0.30770566227723717, + "grad_norm": 0.12158203125, + "learning_rate": 0.0014765369737607856, + "loss": 0.1094, + "step": 35448 + }, + { + "epoch": 0.30771434275744136, + "grad_norm": 0.8515625, + "learning_rate": 0.0014765099410698, + "loss": 0.0859, + "step": 35449 + }, + { + "epoch": 0.3077230232376455, + "grad_norm": 0.24609375, + "learning_rate": 0.0014764829079670688, + "loss": 0.1152, + "step": 35450 + }, + { + "epoch": 0.3077317037178497, + "grad_norm": 0.07080078125, + "learning_rate": 0.001476455874452621, + "loss": 0.0664, + "step": 35451 + }, + { + "epoch": 0.30774038419805383, + "grad_norm": 0.765625, + "learning_rate": 0.0014764288405264873, + "loss": 0.0913, + "step": 35452 + }, + { + "epoch": 0.307749064678258, + "grad_norm": 0.43359375, + "learning_rate": 0.0014764018061886965, + "loss": 0.1201, + "step": 35453 + }, + { + "epoch": 0.30775774515846216, + "grad_norm": 0.91796875, + "learning_rate": 
0.0014763747714392783, + "loss": 0.1211, + "step": 35454 + }, + { + "epoch": 0.30776642563866635, + "grad_norm": 0.0859375, + "learning_rate": 0.0014763477362782626, + "loss": 0.0947, + "step": 35455 + }, + { + "epoch": 0.3077751061188705, + "grad_norm": 0.26953125, + "learning_rate": 0.0014763207007056785, + "loss": 0.0874, + "step": 35456 + }, + { + "epoch": 0.3077837865990747, + "grad_norm": 0.185546875, + "learning_rate": 0.0014762936647215556, + "loss": 0.1387, + "step": 35457 + }, + { + "epoch": 0.3077924670792788, + "grad_norm": 0.12890625, + "learning_rate": 0.0014762666283259236, + "loss": 0.0903, + "step": 35458 + }, + { + "epoch": 0.307801147559483, + "grad_norm": 0.27734375, + "learning_rate": 0.0014762395915188123, + "loss": 0.0977, + "step": 35459 + }, + { + "epoch": 0.30780982803968715, + "grad_norm": 0.1025390625, + "learning_rate": 0.0014762125543002506, + "loss": 0.0898, + "step": 35460 + }, + { + "epoch": 0.30781850851989134, + "grad_norm": 0.703125, + "learning_rate": 0.0014761855166702692, + "loss": 0.085, + "step": 35461 + }, + { + "epoch": 0.3078271890000955, + "grad_norm": 0.388671875, + "learning_rate": 0.0014761584786288963, + "loss": 0.0635, + "step": 35462 + }, + { + "epoch": 0.3078358694802997, + "grad_norm": 0.1171875, + "learning_rate": 0.0014761314401761625, + "loss": 0.0938, + "step": 35463 + }, + { + "epoch": 0.3078445499605038, + "grad_norm": 0.1298828125, + "learning_rate": 0.0014761044013120968, + "loss": 0.1289, + "step": 35464 + }, + { + "epoch": 0.307853230440708, + "grad_norm": 0.2353515625, + "learning_rate": 0.001476077362036729, + "loss": 0.0732, + "step": 35465 + }, + { + "epoch": 0.30786191092091214, + "grad_norm": 0.11767578125, + "learning_rate": 0.0014760503223500887, + "loss": 0.0898, + "step": 35466 + }, + { + "epoch": 0.30787059140111633, + "grad_norm": 0.208984375, + "learning_rate": 0.0014760232822522055, + "loss": 0.0781, + "step": 35467 + }, + { + "epoch": 0.30787927188132047, + "grad_norm": 0.80859375, + "learning_rate": 0.001475996241743109, + "loss": 0.1064, + "step": 35468 + }, + { + "epoch": 0.30788795236152466, + "grad_norm": 0.1494140625, + "learning_rate": 0.0014759692008228286, + "loss": 0.085, + "step": 35469 + }, + { + "epoch": 0.3078966328417288, + "grad_norm": 0.10107421875, + "learning_rate": 0.0014759421594913937, + "loss": 0.0977, + "step": 35470 + }, + { + "epoch": 0.307905313321933, + "grad_norm": 0.1611328125, + "learning_rate": 0.001475915117748834, + "loss": 0.1104, + "step": 35471 + }, + { + "epoch": 0.30791399380213713, + "grad_norm": 0.5234375, + "learning_rate": 0.0014758880755951793, + "loss": 0.105, + "step": 35472 + }, + { + "epoch": 0.3079226742823413, + "grad_norm": 0.2314453125, + "learning_rate": 0.0014758610330304596, + "loss": 0.082, + "step": 35473 + }, + { + "epoch": 0.30793135476254546, + "grad_norm": 0.38671875, + "learning_rate": 0.0014758339900547033, + "loss": 0.0952, + "step": 35474 + }, + { + "epoch": 0.30794003524274965, + "grad_norm": 0.47265625, + "learning_rate": 0.001475806946667941, + "loss": 0.1504, + "step": 35475 + }, + { + "epoch": 0.3079487157229538, + "grad_norm": 0.1884765625, + "learning_rate": 0.0014757799028702015, + "loss": 0.1221, + "step": 35476 + }, + { + "epoch": 0.307957396203158, + "grad_norm": 0.1025390625, + "learning_rate": 0.001475752858661515, + "loss": 0.1021, + "step": 35477 + }, + { + "epoch": 0.3079660766833621, + "grad_norm": 0.1337890625, + "learning_rate": 0.0014757258140419107, + "loss": 0.1162, + "step": 35478 + }, + { + "epoch": 0.3079747571635663, + 
"grad_norm": 0.333984375, + "learning_rate": 0.0014756987690114182, + "loss": 0.0967, + "step": 35479 + }, + { + "epoch": 0.30798343764377045, + "grad_norm": 0.5078125, + "learning_rate": 0.0014756717235700672, + "loss": 0.0981, + "step": 35480 + }, + { + "epoch": 0.30799211812397465, + "grad_norm": 0.1513671875, + "learning_rate": 0.0014756446777178874, + "loss": 0.1143, + "step": 35481 + }, + { + "epoch": 0.3080007986041788, + "grad_norm": 0.08642578125, + "learning_rate": 0.0014756176314549081, + "loss": 0.1211, + "step": 35482 + }, + { + "epoch": 0.308009479084383, + "grad_norm": 0.142578125, + "learning_rate": 0.001475590584781159, + "loss": 0.084, + "step": 35483 + }, + { + "epoch": 0.3080181595645871, + "grad_norm": 0.267578125, + "learning_rate": 0.0014755635376966696, + "loss": 0.0986, + "step": 35484 + }, + { + "epoch": 0.30802684004479125, + "grad_norm": 0.796875, + "learning_rate": 0.0014755364902014699, + "loss": 0.1138, + "step": 35485 + }, + { + "epoch": 0.30803552052499544, + "grad_norm": 0.58984375, + "learning_rate": 0.001475509442295589, + "loss": 0.124, + "step": 35486 + }, + { + "epoch": 0.3080442010051996, + "grad_norm": 0.140625, + "learning_rate": 0.0014754823939790564, + "loss": 0.0986, + "step": 35487 + }, + { + "epoch": 0.3080528814854038, + "grad_norm": 0.189453125, + "learning_rate": 0.001475455345251902, + "loss": 0.0723, + "step": 35488 + }, + { + "epoch": 0.3080615619656079, + "grad_norm": 0.1513671875, + "learning_rate": 0.001475428296114155, + "loss": 0.0938, + "step": 35489 + }, + { + "epoch": 0.3080702424458121, + "grad_norm": 0.31640625, + "learning_rate": 0.001475401246565846, + "loss": 0.1035, + "step": 35490 + }, + { + "epoch": 0.30807892292601624, + "grad_norm": 0.26171875, + "learning_rate": 0.001475374196607003, + "loss": 0.1133, + "step": 35491 + }, + { + "epoch": 0.30808760340622043, + "grad_norm": 0.2001953125, + "learning_rate": 0.0014753471462376569, + "loss": 0.0688, + "step": 35492 + }, + { + "epoch": 0.30809628388642457, + "grad_norm": 0.33984375, + "learning_rate": 0.0014753200954578367, + "loss": 0.1426, + "step": 35493 + }, + { + "epoch": 0.30810496436662876, + "grad_norm": 0.4609375, + "learning_rate": 0.0014752930442675722, + "loss": 0.0908, + "step": 35494 + }, + { + "epoch": 0.3081136448468329, + "grad_norm": 0.486328125, + "learning_rate": 0.001475265992666893, + "loss": 0.0923, + "step": 35495 + }, + { + "epoch": 0.3081223253270371, + "grad_norm": 0.37890625, + "learning_rate": 0.001475238940655828, + "loss": 0.1025, + "step": 35496 + }, + { + "epoch": 0.30813100580724123, + "grad_norm": 0.369140625, + "learning_rate": 0.0014752118882344078, + "loss": 0.1021, + "step": 35497 + }, + { + "epoch": 0.3081396862874454, + "grad_norm": 0.09375, + "learning_rate": 0.0014751848354026614, + "loss": 0.0776, + "step": 35498 + }, + { + "epoch": 0.30814836676764956, + "grad_norm": 0.1650390625, + "learning_rate": 0.0014751577821606183, + "loss": 0.1924, + "step": 35499 + }, + { + "epoch": 0.30815704724785375, + "grad_norm": 0.44140625, + "learning_rate": 0.0014751307285083086, + "loss": 0.0771, + "step": 35500 + }, + { + "epoch": 0.3081657277280579, + "grad_norm": 0.1494140625, + "learning_rate": 0.0014751036744457614, + "loss": 0.1357, + "step": 35501 + }, + { + "epoch": 0.3081744082082621, + "grad_norm": 0.314453125, + "learning_rate": 0.0014750766199730066, + "loss": 0.0684, + "step": 35502 + }, + { + "epoch": 0.3081830886884662, + "grad_norm": 0.388671875, + "learning_rate": 0.0014750495650900732, + "loss": 0.1133, + "step": 35503 + }, + { 
+ "epoch": 0.3081917691686704, + "grad_norm": 0.23828125, + "learning_rate": 0.0014750225097969918, + "loss": 0.1309, + "step": 35504 + }, + { + "epoch": 0.30820044964887455, + "grad_norm": 0.53515625, + "learning_rate": 0.0014749954540937914, + "loss": 0.0825, + "step": 35505 + }, + { + "epoch": 0.30820913012907875, + "grad_norm": 0.208984375, + "learning_rate": 0.0014749683979805013, + "loss": 0.0806, + "step": 35506 + }, + { + "epoch": 0.3082178106092829, + "grad_norm": 0.10400390625, + "learning_rate": 0.0014749413414571516, + "loss": 0.0879, + "step": 35507 + }, + { + "epoch": 0.3082264910894871, + "grad_norm": 0.08154296875, + "learning_rate": 0.0014749142845237712, + "loss": 0.0942, + "step": 35508 + }, + { + "epoch": 0.3082351715696912, + "grad_norm": 0.1142578125, + "learning_rate": 0.0014748872271803904, + "loss": 0.1309, + "step": 35509 + }, + { + "epoch": 0.3082438520498954, + "grad_norm": 0.6875, + "learning_rate": 0.0014748601694270387, + "loss": 0.1006, + "step": 35510 + }, + { + "epoch": 0.30825253253009954, + "grad_norm": 0.1865234375, + "learning_rate": 0.0014748331112637457, + "loss": 0.1094, + "step": 35511 + }, + { + "epoch": 0.30826121301030374, + "grad_norm": 0.09912109375, + "learning_rate": 0.0014748060526905407, + "loss": 0.1426, + "step": 35512 + }, + { + "epoch": 0.3082698934905079, + "grad_norm": 0.22265625, + "learning_rate": 0.0014747789937074535, + "loss": 0.0752, + "step": 35513 + }, + { + "epoch": 0.30827857397071207, + "grad_norm": 0.609375, + "learning_rate": 0.0014747519343145134, + "loss": 0.1113, + "step": 35514 + }, + { + "epoch": 0.3082872544509162, + "grad_norm": 0.185546875, + "learning_rate": 0.0014747248745117505, + "loss": 0.1055, + "step": 35515 + }, + { + "epoch": 0.3082959349311204, + "grad_norm": 0.56640625, + "learning_rate": 0.0014746978142991941, + "loss": 0.1172, + "step": 35516 + }, + { + "epoch": 0.30830461541132453, + "grad_norm": 1.078125, + "learning_rate": 0.0014746707536768735, + "loss": 0.1768, + "step": 35517 + }, + { + "epoch": 0.3083132958915287, + "grad_norm": 0.494140625, + "learning_rate": 0.0014746436926448185, + "loss": 0.0786, + "step": 35518 + }, + { + "epoch": 0.30832197637173286, + "grad_norm": 0.263671875, + "learning_rate": 0.0014746166312030592, + "loss": 0.1074, + "step": 35519 + }, + { + "epoch": 0.30833065685193706, + "grad_norm": 0.609375, + "learning_rate": 0.0014745895693516248, + "loss": 0.1104, + "step": 35520 + }, + { + "epoch": 0.3083393373321412, + "grad_norm": 0.76953125, + "learning_rate": 0.0014745625070905447, + "loss": 0.1152, + "step": 35521 + }, + { + "epoch": 0.3083480178123454, + "grad_norm": 0.146484375, + "learning_rate": 0.0014745354444198485, + "loss": 0.1455, + "step": 35522 + }, + { + "epoch": 0.3083566982925495, + "grad_norm": 0.1435546875, + "learning_rate": 0.001474508381339566, + "loss": 0.1128, + "step": 35523 + }, + { + "epoch": 0.3083653787727537, + "grad_norm": 0.439453125, + "learning_rate": 0.001474481317849727, + "loss": 0.0752, + "step": 35524 + }, + { + "epoch": 0.30837405925295786, + "grad_norm": 0.2041015625, + "learning_rate": 0.001474454253950361, + "loss": 0.1162, + "step": 35525 + }, + { + "epoch": 0.30838273973316205, + "grad_norm": 0.267578125, + "learning_rate": 0.0014744271896414971, + "loss": 0.0928, + "step": 35526 + }, + { + "epoch": 0.3083914202133662, + "grad_norm": 1.6328125, + "learning_rate": 0.0014744001249231652, + "loss": 0.1484, + "step": 35527 + }, + { + "epoch": 0.3084001006935704, + "grad_norm": 0.1337890625, + "learning_rate": 0.001474373059795395, + 
"loss": 0.0791, + "step": 35528 + }, + { + "epoch": 0.3084087811737745, + "grad_norm": 0.2041015625, + "learning_rate": 0.0014743459942582163, + "loss": 0.1182, + "step": 35529 + }, + { + "epoch": 0.3084174616539787, + "grad_norm": 0.47265625, + "learning_rate": 0.0014743189283116583, + "loss": 0.1299, + "step": 35530 + }, + { + "epoch": 0.30842614213418285, + "grad_norm": 0.240234375, + "learning_rate": 0.0014742918619557507, + "loss": 0.1001, + "step": 35531 + }, + { + "epoch": 0.30843482261438704, + "grad_norm": 0.298828125, + "learning_rate": 0.001474264795190523, + "loss": 0.0977, + "step": 35532 + }, + { + "epoch": 0.3084435030945912, + "grad_norm": 0.5859375, + "learning_rate": 0.0014742377280160053, + "loss": 0.1543, + "step": 35533 + }, + { + "epoch": 0.30845218357479537, + "grad_norm": 0.271484375, + "learning_rate": 0.0014742106604322268, + "loss": 0.125, + "step": 35534 + }, + { + "epoch": 0.3084608640549995, + "grad_norm": 0.51953125, + "learning_rate": 0.0014741835924392168, + "loss": 0.1133, + "step": 35535 + }, + { + "epoch": 0.3084695445352037, + "grad_norm": 0.142578125, + "learning_rate": 0.0014741565240370055, + "loss": 0.0767, + "step": 35536 + }, + { + "epoch": 0.30847822501540784, + "grad_norm": 0.361328125, + "learning_rate": 0.0014741294552256222, + "loss": 0.105, + "step": 35537 + }, + { + "epoch": 0.30848690549561203, + "grad_norm": 0.125, + "learning_rate": 0.0014741023860050965, + "loss": 0.0903, + "step": 35538 + }, + { + "epoch": 0.30849558597581617, + "grad_norm": 0.0673828125, + "learning_rate": 0.0014740753163754584, + "loss": 0.0967, + "step": 35539 + }, + { + "epoch": 0.30850426645602036, + "grad_norm": 0.48828125, + "learning_rate": 0.0014740482463367365, + "loss": 0.083, + "step": 35540 + }, + { + "epoch": 0.3085129469362245, + "grad_norm": 0.27734375, + "learning_rate": 0.0014740211758889617, + "loss": 0.124, + "step": 35541 + }, + { + "epoch": 0.3085216274164287, + "grad_norm": 0.099609375, + "learning_rate": 0.0014739941050321626, + "loss": 0.1182, + "step": 35542 + }, + { + "epoch": 0.3085303078966328, + "grad_norm": 0.451171875, + "learning_rate": 0.0014739670337663693, + "loss": 0.1011, + "step": 35543 + }, + { + "epoch": 0.308538988376837, + "grad_norm": 0.51953125, + "learning_rate": 0.001473939962091611, + "loss": 0.0938, + "step": 35544 + }, + { + "epoch": 0.30854766885704116, + "grad_norm": 0.177734375, + "learning_rate": 0.001473912890007918, + "loss": 0.1152, + "step": 35545 + }, + { + "epoch": 0.30855634933724535, + "grad_norm": 0.07080078125, + "learning_rate": 0.0014738858175153192, + "loss": 0.0786, + "step": 35546 + }, + { + "epoch": 0.3085650298174495, + "grad_norm": 0.77734375, + "learning_rate": 0.0014738587446138445, + "loss": 0.0889, + "step": 35547 + }, + { + "epoch": 0.3085737102976537, + "grad_norm": 0.703125, + "learning_rate": 0.0014738316713035233, + "loss": 0.1494, + "step": 35548 + }, + { + "epoch": 0.3085823907778578, + "grad_norm": 0.388671875, + "learning_rate": 0.0014738045975843858, + "loss": 0.0771, + "step": 35549 + }, + { + "epoch": 0.308591071258062, + "grad_norm": 0.255859375, + "learning_rate": 0.0014737775234564612, + "loss": 0.0859, + "step": 35550 + }, + { + "epoch": 0.30859975173826615, + "grad_norm": 0.4375, + "learning_rate": 0.001473750448919779, + "loss": 0.1113, + "step": 35551 + }, + { + "epoch": 0.30860843221847034, + "grad_norm": 0.087890625, + "learning_rate": 0.001473723373974369, + "loss": 0.1118, + "step": 35552 + }, + { + "epoch": 0.3086171126986745, + "grad_norm": 0.130859375, + 
"learning_rate": 0.0014736962986202605, + "loss": 0.0771, + "step": 35553 + }, + { + "epoch": 0.30862579317887867, + "grad_norm": 0.203125, + "learning_rate": 0.0014736692228574837, + "loss": 0.0815, + "step": 35554 + }, + { + "epoch": 0.3086344736590828, + "grad_norm": 0.341796875, + "learning_rate": 0.0014736421466860674, + "loss": 0.1738, + "step": 35555 + }, + { + "epoch": 0.308643154139287, + "grad_norm": 0.09619140625, + "learning_rate": 0.001473615070106042, + "loss": 0.1406, + "step": 35556 + }, + { + "epoch": 0.30865183461949114, + "grad_norm": 0.1552734375, + "learning_rate": 0.0014735879931174365, + "loss": 0.0942, + "step": 35557 + }, + { + "epoch": 0.30866051509969533, + "grad_norm": 0.390625, + "learning_rate": 0.001473560915720281, + "loss": 0.1045, + "step": 35558 + }, + { + "epoch": 0.30866919557989947, + "grad_norm": 0.357421875, + "learning_rate": 0.001473533837914605, + "loss": 0.0972, + "step": 35559 + }, + { + "epoch": 0.30867787606010366, + "grad_norm": 0.078125, + "learning_rate": 0.0014735067597004378, + "loss": 0.0654, + "step": 35560 + }, + { + "epoch": 0.3086865565403078, + "grad_norm": 0.171875, + "learning_rate": 0.0014734796810778094, + "loss": 0.1465, + "step": 35561 + }, + { + "epoch": 0.308695237020512, + "grad_norm": 0.06396484375, + "learning_rate": 0.001473452602046749, + "loss": 0.0752, + "step": 35562 + }, + { + "epoch": 0.30870391750071613, + "grad_norm": 0.119140625, + "learning_rate": 0.0014734255226072867, + "loss": 0.1367, + "step": 35563 + }, + { + "epoch": 0.3087125979809203, + "grad_norm": 0.1220703125, + "learning_rate": 0.0014733984427594518, + "loss": 0.1025, + "step": 35564 + }, + { + "epoch": 0.30872127846112446, + "grad_norm": 0.55078125, + "learning_rate": 0.0014733713625032737, + "loss": 0.1001, + "step": 35565 + }, + { + "epoch": 0.30872995894132865, + "grad_norm": 0.24609375, + "learning_rate": 0.001473344281838782, + "loss": 0.1177, + "step": 35566 + }, + { + "epoch": 0.3087386394215328, + "grad_norm": 0.1318359375, + "learning_rate": 0.0014733172007660074, + "loss": 0.1128, + "step": 35567 + }, + { + "epoch": 0.308747319901737, + "grad_norm": 0.55859375, + "learning_rate": 0.0014732901192849783, + "loss": 0.0854, + "step": 35568 + }, + { + "epoch": 0.3087560003819411, + "grad_norm": 0.09619140625, + "learning_rate": 0.001473263037395725, + "loss": 0.1064, + "step": 35569 + }, + { + "epoch": 0.3087646808621453, + "grad_norm": 0.416015625, + "learning_rate": 0.0014732359550982765, + "loss": 0.0967, + "step": 35570 + }, + { + "epoch": 0.30877336134234945, + "grad_norm": 0.51953125, + "learning_rate": 0.001473208872392663, + "loss": 0.0913, + "step": 35571 + }, + { + "epoch": 0.30878204182255364, + "grad_norm": 0.16796875, + "learning_rate": 0.001473181789278914, + "loss": 0.0728, + "step": 35572 + }, + { + "epoch": 0.3087907223027578, + "grad_norm": 0.130859375, + "learning_rate": 0.0014731547057570589, + "loss": 0.0933, + "step": 35573 + }, + { + "epoch": 0.308799402782962, + "grad_norm": 0.11083984375, + "learning_rate": 0.0014731276218271272, + "loss": 0.1152, + "step": 35574 + }, + { + "epoch": 0.3088080832631661, + "grad_norm": 1.3046875, + "learning_rate": 0.001473100537489149, + "loss": 0.1816, + "step": 35575 + }, + { + "epoch": 0.3088167637433703, + "grad_norm": 0.2890625, + "learning_rate": 0.0014730734527431536, + "loss": 0.0918, + "step": 35576 + }, + { + "epoch": 0.30882544422357444, + "grad_norm": 0.3359375, + "learning_rate": 0.0014730463675891705, + "loss": 0.0942, + "step": 35577 + }, + { + "epoch": 
0.30883412470377863, + "grad_norm": 0.5, + "learning_rate": 0.0014730192820272296, + "loss": 0.1001, + "step": 35578 + }, + { + "epoch": 0.30884280518398277, + "grad_norm": 0.3359375, + "learning_rate": 0.0014729921960573602, + "loss": 0.1045, + "step": 35579 + }, + { + "epoch": 0.30885148566418696, + "grad_norm": 0.326171875, + "learning_rate": 0.0014729651096795928, + "loss": 0.1602, + "step": 35580 + }, + { + "epoch": 0.3088601661443911, + "grad_norm": 0.1943359375, + "learning_rate": 0.0014729380228939559, + "loss": 0.1387, + "step": 35581 + }, + { + "epoch": 0.3088688466245953, + "grad_norm": 0.251953125, + "learning_rate": 0.0014729109357004798, + "loss": 0.1416, + "step": 35582 + }, + { + "epoch": 0.30887752710479943, + "grad_norm": 0.4375, + "learning_rate": 0.0014728838480991934, + "loss": 0.1172, + "step": 35583 + }, + { + "epoch": 0.3088862075850036, + "grad_norm": 0.1728515625, + "learning_rate": 0.0014728567600901272, + "loss": 0.0972, + "step": 35584 + }, + { + "epoch": 0.30889488806520776, + "grad_norm": 0.07763671875, + "learning_rate": 0.0014728296716733102, + "loss": 0.0869, + "step": 35585 + }, + { + "epoch": 0.30890356854541196, + "grad_norm": 0.1220703125, + "learning_rate": 0.0014728025828487728, + "loss": 0.0986, + "step": 35586 + }, + { + "epoch": 0.3089122490256161, + "grad_norm": 0.15625, + "learning_rate": 0.0014727754936165436, + "loss": 0.0967, + "step": 35587 + }, + { + "epoch": 0.3089209295058203, + "grad_norm": 0.1640625, + "learning_rate": 0.0014727484039766528, + "loss": 0.1309, + "step": 35588 + }, + { + "epoch": 0.3089296099860244, + "grad_norm": 0.298828125, + "learning_rate": 0.0014727213139291303, + "loss": 0.1123, + "step": 35589 + }, + { + "epoch": 0.3089382904662286, + "grad_norm": 0.50390625, + "learning_rate": 0.001472694223474005, + "loss": 0.1602, + "step": 35590 + }, + { + "epoch": 0.30894697094643275, + "grad_norm": 0.09912109375, + "learning_rate": 0.0014726671326113071, + "loss": 0.1211, + "step": 35591 + }, + { + "epoch": 0.30895565142663695, + "grad_norm": 0.2255859375, + "learning_rate": 0.0014726400413410659, + "loss": 0.0845, + "step": 35592 + }, + { + "epoch": 0.3089643319068411, + "grad_norm": 0.0771484375, + "learning_rate": 0.0014726129496633112, + "loss": 0.1147, + "step": 35593 + }, + { + "epoch": 0.3089730123870453, + "grad_norm": 0.423828125, + "learning_rate": 0.0014725858575780724, + "loss": 0.1426, + "step": 35594 + }, + { + "epoch": 0.3089816928672494, + "grad_norm": 0.50390625, + "learning_rate": 0.0014725587650853795, + "loss": 0.1582, + "step": 35595 + }, + { + "epoch": 0.3089903733474536, + "grad_norm": 0.2255859375, + "learning_rate": 0.0014725316721852617, + "loss": 0.0879, + "step": 35596 + }, + { + "epoch": 0.30899905382765774, + "grad_norm": 0.17578125, + "learning_rate": 0.0014725045788777493, + "loss": 0.1104, + "step": 35597 + }, + { + "epoch": 0.30900773430786194, + "grad_norm": 0.337890625, + "learning_rate": 0.0014724774851628713, + "loss": 0.0747, + "step": 35598 + }, + { + "epoch": 0.3090164147880661, + "grad_norm": 0.2333984375, + "learning_rate": 0.0014724503910406575, + "loss": 0.1016, + "step": 35599 + }, + { + "epoch": 0.30902509526827027, + "grad_norm": 0.328125, + "learning_rate": 0.0014724232965111373, + "loss": 0.0933, + "step": 35600 + }, + { + "epoch": 0.3090337757484744, + "grad_norm": 0.2275390625, + "learning_rate": 0.001472396201574341, + "loss": 0.1279, + "step": 35601 + }, + { + "epoch": 0.3090424562286786, + "grad_norm": 0.71875, + "learning_rate": 0.0014723691062302974, + "loss": 0.1182, 
+ "step": 35602 + }, + { + "epoch": 0.30905113670888273, + "grad_norm": 0.11474609375, + "learning_rate": 0.0014723420104790369, + "loss": 0.0854, + "step": 35603 + }, + { + "epoch": 0.3090598171890869, + "grad_norm": 0.462890625, + "learning_rate": 0.0014723149143205886, + "loss": 0.1123, + "step": 35604 + }, + { + "epoch": 0.30906849766929106, + "grad_norm": 0.1767578125, + "learning_rate": 0.001472287817754982, + "loss": 0.083, + "step": 35605 + }, + { + "epoch": 0.30907717814949526, + "grad_norm": 0.2021484375, + "learning_rate": 0.0014722607207822474, + "loss": 0.0864, + "step": 35606 + }, + { + "epoch": 0.3090858586296994, + "grad_norm": 0.251953125, + "learning_rate": 0.0014722336234024142, + "loss": 0.0913, + "step": 35607 + }, + { + "epoch": 0.30909453910990353, + "grad_norm": 0.2109375, + "learning_rate": 0.001472206525615512, + "loss": 0.1147, + "step": 35608 + }, + { + "epoch": 0.3091032195901077, + "grad_norm": 0.11181640625, + "learning_rate": 0.0014721794274215696, + "loss": 0.1602, + "step": 35609 + }, + { + "epoch": 0.30911190007031186, + "grad_norm": 0.427734375, + "learning_rate": 0.001472152328820618, + "loss": 0.1055, + "step": 35610 + }, + { + "epoch": 0.30912058055051606, + "grad_norm": 0.50390625, + "learning_rate": 0.0014721252298126862, + "loss": 0.0869, + "step": 35611 + }, + { + "epoch": 0.3091292610307202, + "grad_norm": 0.271484375, + "learning_rate": 0.0014720981303978037, + "loss": 0.1367, + "step": 35612 + }, + { + "epoch": 0.3091379415109244, + "grad_norm": 0.359375, + "learning_rate": 0.0014720710305760002, + "loss": 0.0928, + "step": 35613 + }, + { + "epoch": 0.3091466219911285, + "grad_norm": 0.3671875, + "learning_rate": 0.0014720439303473052, + "loss": 0.1309, + "step": 35614 + }, + { + "epoch": 0.3091553024713327, + "grad_norm": 0.68359375, + "learning_rate": 0.0014720168297117488, + "loss": 0.1035, + "step": 35615 + }, + { + "epoch": 0.30916398295153685, + "grad_norm": 0.25390625, + "learning_rate": 0.0014719897286693607, + "loss": 0.0767, + "step": 35616 + }, + { + "epoch": 0.30917266343174105, + "grad_norm": 0.4140625, + "learning_rate": 0.0014719626272201695, + "loss": 0.103, + "step": 35617 + }, + { + "epoch": 0.3091813439119452, + "grad_norm": 0.90234375, + "learning_rate": 0.001471935525364206, + "loss": 0.1182, + "step": 35618 + }, + { + "epoch": 0.3091900243921494, + "grad_norm": 0.1904296875, + "learning_rate": 0.0014719084231014994, + "loss": 0.0742, + "step": 35619 + }, + { + "epoch": 0.3091987048723535, + "grad_norm": 0.16015625, + "learning_rate": 0.0014718813204320795, + "loss": 0.0825, + "step": 35620 + }, + { + "epoch": 0.3092073853525577, + "grad_norm": 0.0947265625, + "learning_rate": 0.0014718542173559758, + "loss": 0.0854, + "step": 35621 + }, + { + "epoch": 0.30921606583276184, + "grad_norm": 0.14453125, + "learning_rate": 0.0014718271138732175, + "loss": 0.1064, + "step": 35622 + }, + { + "epoch": 0.30922474631296604, + "grad_norm": 0.68359375, + "learning_rate": 0.0014718000099838348, + "loss": 0.0928, + "step": 35623 + }, + { + "epoch": 0.3092334267931702, + "grad_norm": 0.2353515625, + "learning_rate": 0.0014717729056878573, + "loss": 0.1045, + "step": 35624 + }, + { + "epoch": 0.30924210727337437, + "grad_norm": 0.171875, + "learning_rate": 0.0014717458009853143, + "loss": 0.0942, + "step": 35625 + }, + { + "epoch": 0.3092507877535785, + "grad_norm": 0.1328125, + "learning_rate": 0.0014717186958762359, + "loss": 0.0967, + "step": 35626 + }, + { + "epoch": 0.3092594682337827, + "grad_norm": 0.35546875, + "learning_rate": 
0.0014716915903606517, + "loss": 0.1328, + "step": 35627 + }, + { + "epoch": 0.30926814871398683, + "grad_norm": 0.10986328125, + "learning_rate": 0.001471664484438591, + "loss": 0.1064, + "step": 35628 + }, + { + "epoch": 0.309276829194191, + "grad_norm": 0.1640625, + "learning_rate": 0.0014716373781100836, + "loss": 0.064, + "step": 35629 + }, + { + "epoch": 0.30928550967439516, + "grad_norm": 0.53125, + "learning_rate": 0.0014716102713751588, + "loss": 0.126, + "step": 35630 + }, + { + "epoch": 0.30929419015459936, + "grad_norm": 0.1884765625, + "learning_rate": 0.0014715831642338468, + "loss": 0.106, + "step": 35631 + }, + { + "epoch": 0.3093028706348035, + "grad_norm": 0.09521484375, + "learning_rate": 0.0014715560566861772, + "loss": 0.0708, + "step": 35632 + }, + { + "epoch": 0.3093115511150077, + "grad_norm": 0.6171875, + "learning_rate": 0.0014715289487321793, + "loss": 0.1289, + "step": 35633 + }, + { + "epoch": 0.3093202315952118, + "grad_norm": 0.27734375, + "learning_rate": 0.001471501840371883, + "loss": 0.1128, + "step": 35634 + }, + { + "epoch": 0.309328912075416, + "grad_norm": 0.2021484375, + "learning_rate": 0.001471474731605318, + "loss": 0.0618, + "step": 35635 + }, + { + "epoch": 0.30933759255562016, + "grad_norm": 0.25390625, + "learning_rate": 0.0014714476224325138, + "loss": 0.1182, + "step": 35636 + }, + { + "epoch": 0.30934627303582435, + "grad_norm": 0.173828125, + "learning_rate": 0.0014714205128534999, + "loss": 0.0942, + "step": 35637 + }, + { + "epoch": 0.3093549535160285, + "grad_norm": 0.53515625, + "learning_rate": 0.0014713934028683063, + "loss": 0.0942, + "step": 35638 + }, + { + "epoch": 0.3093636339962327, + "grad_norm": 0.125, + "learning_rate": 0.0014713662924769621, + "loss": 0.0879, + "step": 35639 + }, + { + "epoch": 0.3093723144764368, + "grad_norm": 0.1689453125, + "learning_rate": 0.0014713391816794977, + "loss": 0.1079, + "step": 35640 + }, + { + "epoch": 0.309380994956641, + "grad_norm": 0.16796875, + "learning_rate": 0.0014713120704759421, + "loss": 0.1426, + "step": 35641 + }, + { + "epoch": 0.30938967543684515, + "grad_norm": 0.08154296875, + "learning_rate": 0.0014712849588663253, + "loss": 0.1108, + "step": 35642 + }, + { + "epoch": 0.30939835591704934, + "grad_norm": 0.75390625, + "learning_rate": 0.0014712578468506765, + "loss": 0.1475, + "step": 35643 + }, + { + "epoch": 0.3094070363972535, + "grad_norm": 0.380859375, + "learning_rate": 0.001471230734429026, + "loss": 0.1021, + "step": 35644 + }, + { + "epoch": 0.30941571687745767, + "grad_norm": 0.51953125, + "learning_rate": 0.0014712036216014033, + "loss": 0.126, + "step": 35645 + }, + { + "epoch": 0.3094243973576618, + "grad_norm": 0.38671875, + "learning_rate": 0.001471176508367838, + "loss": 0.0732, + "step": 35646 + }, + { + "epoch": 0.309433077837866, + "grad_norm": 0.50390625, + "learning_rate": 0.0014711493947283593, + "loss": 0.085, + "step": 35647 + }, + { + "epoch": 0.30944175831807014, + "grad_norm": 0.10400390625, + "learning_rate": 0.0014711222806829973, + "loss": 0.0811, + "step": 35648 + }, + { + "epoch": 0.30945043879827433, + "grad_norm": 0.3203125, + "learning_rate": 0.0014710951662317815, + "loss": 0.0967, + "step": 35649 + }, + { + "epoch": 0.30945911927847847, + "grad_norm": 0.1103515625, + "learning_rate": 0.0014710680513747418, + "loss": 0.0728, + "step": 35650 + }, + { + "epoch": 0.30946779975868266, + "grad_norm": 0.27734375, + "learning_rate": 0.0014710409361119075, + "loss": 0.1221, + "step": 35651 + }, + { + "epoch": 0.3094764802388868, + "grad_norm": 
0.671875, + "learning_rate": 0.0014710138204433083, + "loss": 0.1484, + "step": 35652 + }, + { + "epoch": 0.309485160719091, + "grad_norm": 0.11376953125, + "learning_rate": 0.0014709867043689743, + "loss": 0.0806, + "step": 35653 + }, + { + "epoch": 0.3094938411992951, + "grad_norm": 0.1650390625, + "learning_rate": 0.0014709595878889346, + "loss": 0.127, + "step": 35654 + }, + { + "epoch": 0.3095025216794993, + "grad_norm": 0.06640625, + "learning_rate": 0.001470932471003219, + "loss": 0.0781, + "step": 35655 + }, + { + "epoch": 0.30951120215970346, + "grad_norm": 0.1953125, + "learning_rate": 0.0014709053537118573, + "loss": 0.0737, + "step": 35656 + }, + { + "epoch": 0.30951988263990765, + "grad_norm": 0.1865234375, + "learning_rate": 0.001470878236014879, + "loss": 0.0747, + "step": 35657 + }, + { + "epoch": 0.3095285631201118, + "grad_norm": 0.119140625, + "learning_rate": 0.001470851117912314, + "loss": 0.1152, + "step": 35658 + }, + { + "epoch": 0.309537243600316, + "grad_norm": 0.255859375, + "learning_rate": 0.0014708239994041916, + "loss": 0.1133, + "step": 35659 + }, + { + "epoch": 0.3095459240805201, + "grad_norm": 0.1015625, + "learning_rate": 0.0014707968804905421, + "loss": 0.1152, + "step": 35660 + }, + { + "epoch": 0.3095546045607243, + "grad_norm": 0.69140625, + "learning_rate": 0.0014707697611713942, + "loss": 0.0859, + "step": 35661 + }, + { + "epoch": 0.30956328504092845, + "grad_norm": 0.11328125, + "learning_rate": 0.001470742641446778, + "loss": 0.1206, + "step": 35662 + }, + { + "epoch": 0.30957196552113264, + "grad_norm": 0.244140625, + "learning_rate": 0.0014707155213167233, + "loss": 0.1094, + "step": 35663 + }, + { + "epoch": 0.3095806460013368, + "grad_norm": 0.07373046875, + "learning_rate": 0.00147068840078126, + "loss": 0.0879, + "step": 35664 + }, + { + "epoch": 0.30958932648154097, + "grad_norm": 0.1953125, + "learning_rate": 0.0014706612798404172, + "loss": 0.0767, + "step": 35665 + }, + { + "epoch": 0.3095980069617451, + "grad_norm": 0.1044921875, + "learning_rate": 0.0014706341584942249, + "loss": 0.1138, + "step": 35666 + }, + { + "epoch": 0.3096066874419493, + "grad_norm": 0.447265625, + "learning_rate": 0.0014706070367427125, + "loss": 0.0854, + "step": 35667 + }, + { + "epoch": 0.30961536792215344, + "grad_norm": 0.2431640625, + "learning_rate": 0.0014705799145859101, + "loss": 0.1318, + "step": 35668 + }, + { + "epoch": 0.30962404840235763, + "grad_norm": 0.169921875, + "learning_rate": 0.0014705527920238467, + "loss": 0.1436, + "step": 35669 + }, + { + "epoch": 0.30963272888256177, + "grad_norm": 0.466796875, + "learning_rate": 0.0014705256690565524, + "loss": 0.1221, + "step": 35670 + }, + { + "epoch": 0.30964140936276596, + "grad_norm": 0.1640625, + "learning_rate": 0.0014704985456840567, + "loss": 0.0962, + "step": 35671 + }, + { + "epoch": 0.3096500898429701, + "grad_norm": 0.1162109375, + "learning_rate": 0.0014704714219063897, + "loss": 0.1035, + "step": 35672 + }, + { + "epoch": 0.3096587703231743, + "grad_norm": 0.263671875, + "learning_rate": 0.0014704442977235806, + "loss": 0.0996, + "step": 35673 + }, + { + "epoch": 0.30966745080337843, + "grad_norm": 0.490234375, + "learning_rate": 0.0014704171731356591, + "loss": 0.1211, + "step": 35674 + }, + { + "epoch": 0.3096761312835826, + "grad_norm": 0.2333984375, + "learning_rate": 0.001470390048142655, + "loss": 0.1104, + "step": 35675 + }, + { + "epoch": 0.30968481176378676, + "grad_norm": 0.1962890625, + "learning_rate": 0.001470362922744598, + "loss": 0.0991, + "step": 35676 + }, + { + 
"epoch": 0.30969349224399095, + "grad_norm": 0.1416015625, + "learning_rate": 0.0014703357969415177, + "loss": 0.0747, + "step": 35677 + }, + { + "epoch": 0.3097021727241951, + "grad_norm": 0.109375, + "learning_rate": 0.0014703086707334434, + "loss": 0.1123, + "step": 35678 + }, + { + "epoch": 0.3097108532043993, + "grad_norm": 0.455078125, + "learning_rate": 0.0014702815441204053, + "loss": 0.0913, + "step": 35679 + }, + { + "epoch": 0.3097195336846034, + "grad_norm": 0.12255859375, + "learning_rate": 0.001470254417102433, + "loss": 0.0723, + "step": 35680 + }, + { + "epoch": 0.3097282141648076, + "grad_norm": 0.314453125, + "learning_rate": 0.0014702272896795557, + "loss": 0.1113, + "step": 35681 + }, + { + "epoch": 0.30973689464501175, + "grad_norm": 0.484375, + "learning_rate": 0.0014702001618518035, + "loss": 0.0986, + "step": 35682 + }, + { + "epoch": 0.30974557512521594, + "grad_norm": 0.1220703125, + "learning_rate": 0.0014701730336192063, + "loss": 0.1245, + "step": 35683 + }, + { + "epoch": 0.3097542556054201, + "grad_norm": 0.111328125, + "learning_rate": 0.001470145904981793, + "loss": 0.1001, + "step": 35684 + }, + { + "epoch": 0.3097629360856243, + "grad_norm": 0.2001953125, + "learning_rate": 0.0014701187759395941, + "loss": 0.0898, + "step": 35685 + }, + { + "epoch": 0.3097716165658284, + "grad_norm": 0.279296875, + "learning_rate": 0.001470091646492639, + "loss": 0.0732, + "step": 35686 + }, + { + "epoch": 0.3097802970460326, + "grad_norm": 1.0234375, + "learning_rate": 0.0014700645166409567, + "loss": 0.1201, + "step": 35687 + }, + { + "epoch": 0.30978897752623674, + "grad_norm": 0.10400390625, + "learning_rate": 0.0014700373863845778, + "loss": 0.0688, + "step": 35688 + }, + { + "epoch": 0.30979765800644093, + "grad_norm": 0.466796875, + "learning_rate": 0.001470010255723531, + "loss": 0.1211, + "step": 35689 + }, + { + "epoch": 0.30980633848664507, + "grad_norm": 0.236328125, + "learning_rate": 0.001469983124657847, + "loss": 0.1035, + "step": 35690 + }, + { + "epoch": 0.30981501896684926, + "grad_norm": 0.09814453125, + "learning_rate": 0.001469955993187555, + "loss": 0.1216, + "step": 35691 + }, + { + "epoch": 0.3098236994470534, + "grad_norm": 0.35546875, + "learning_rate": 0.0014699288613126847, + "loss": 0.0918, + "step": 35692 + }, + { + "epoch": 0.3098323799272576, + "grad_norm": 0.94921875, + "learning_rate": 0.0014699017290332656, + "loss": 0.1143, + "step": 35693 + }, + { + "epoch": 0.30984106040746173, + "grad_norm": 0.458984375, + "learning_rate": 0.001469874596349328, + "loss": 0.1211, + "step": 35694 + }, + { + "epoch": 0.3098497408876659, + "grad_norm": 0.1044921875, + "learning_rate": 0.0014698474632609002, + "loss": 0.0859, + "step": 35695 + }, + { + "epoch": 0.30985842136787006, + "grad_norm": 0.12353515625, + "learning_rate": 0.0014698203297680134, + "loss": 0.1191, + "step": 35696 + }, + { + "epoch": 0.30986710184807426, + "grad_norm": 0.5859375, + "learning_rate": 0.0014697931958706967, + "loss": 0.0947, + "step": 35697 + }, + { + "epoch": 0.3098757823282784, + "grad_norm": 0.2578125, + "learning_rate": 0.0014697660615689795, + "loss": 0.0879, + "step": 35698 + }, + { + "epoch": 0.3098844628084826, + "grad_norm": 0.671875, + "learning_rate": 0.0014697389268628915, + "loss": 0.1367, + "step": 35699 + }, + { + "epoch": 0.3098931432886867, + "grad_norm": 0.4375, + "learning_rate": 0.001469711791752463, + "loss": 0.0737, + "step": 35700 + }, + { + "epoch": 0.3099018237688909, + "grad_norm": 0.609375, + "learning_rate": 0.001469684656237723, + "loss": 
0.1016, + "step": 35701 + }, + { + "epoch": 0.30991050424909505, + "grad_norm": 0.28515625, + "learning_rate": 0.0014696575203187014, + "loss": 0.1143, + "step": 35702 + }, + { + "epoch": 0.30991918472929925, + "grad_norm": 0.287109375, + "learning_rate": 0.001469630383995428, + "loss": 0.0996, + "step": 35703 + }, + { + "epoch": 0.3099278652095034, + "grad_norm": 0.625, + "learning_rate": 0.0014696032472679321, + "loss": 0.0791, + "step": 35704 + }, + { + "epoch": 0.3099365456897076, + "grad_norm": 0.75, + "learning_rate": 0.001469576110136244, + "loss": 0.1113, + "step": 35705 + }, + { + "epoch": 0.3099452261699117, + "grad_norm": 0.26171875, + "learning_rate": 0.001469548972600393, + "loss": 0.0845, + "step": 35706 + }, + { + "epoch": 0.3099539066501159, + "grad_norm": 0.251953125, + "learning_rate": 0.0014695218346604086, + "loss": 0.1006, + "step": 35707 + }, + { + "epoch": 0.30996258713032004, + "grad_norm": 0.27734375, + "learning_rate": 0.0014694946963163208, + "loss": 0.127, + "step": 35708 + }, + { + "epoch": 0.30997126761052424, + "grad_norm": 0.2109375, + "learning_rate": 0.001469467557568159, + "loss": 0.1133, + "step": 35709 + }, + { + "epoch": 0.3099799480907284, + "grad_norm": 0.201171875, + "learning_rate": 0.0014694404184159532, + "loss": 0.2305, + "step": 35710 + }, + { + "epoch": 0.30998862857093257, + "grad_norm": 0.392578125, + "learning_rate": 0.0014694132788597328, + "loss": 0.0938, + "step": 35711 + }, + { + "epoch": 0.3099973090511367, + "grad_norm": 0.275390625, + "learning_rate": 0.0014693861388995278, + "loss": 0.1406, + "step": 35712 + }, + { + "epoch": 0.3100059895313409, + "grad_norm": 0.16015625, + "learning_rate": 0.0014693589985353673, + "loss": 0.0688, + "step": 35713 + }, + { + "epoch": 0.31001467001154503, + "grad_norm": 0.46875, + "learning_rate": 0.0014693318577672816, + "loss": 0.082, + "step": 35714 + }, + { + "epoch": 0.31002335049174923, + "grad_norm": 0.2392578125, + "learning_rate": 0.0014693047165952999, + "loss": 0.0884, + "step": 35715 + }, + { + "epoch": 0.31003203097195337, + "grad_norm": 0.201171875, + "learning_rate": 0.0014692775750194526, + "loss": 0.1465, + "step": 35716 + }, + { + "epoch": 0.31004071145215756, + "grad_norm": 0.2021484375, + "learning_rate": 0.0014692504330397684, + "loss": 0.1055, + "step": 35717 + }, + { + "epoch": 0.3100493919323617, + "grad_norm": 0.2314453125, + "learning_rate": 0.0014692232906562777, + "loss": 0.1387, + "step": 35718 + }, + { + "epoch": 0.3100580724125659, + "grad_norm": 0.1845703125, + "learning_rate": 0.0014691961478690097, + "loss": 0.1426, + "step": 35719 + }, + { + "epoch": 0.31006675289277, + "grad_norm": 0.75, + "learning_rate": 0.0014691690046779947, + "loss": 0.1094, + "step": 35720 + }, + { + "epoch": 0.3100754333729742, + "grad_norm": 0.1123046875, + "learning_rate": 0.001469141861083262, + "loss": 0.0801, + "step": 35721 + }, + { + "epoch": 0.31008411385317836, + "grad_norm": 0.10888671875, + "learning_rate": 0.0014691147170848408, + "loss": 0.1191, + "step": 35722 + }, + { + "epoch": 0.31009279433338255, + "grad_norm": 0.171875, + "learning_rate": 0.0014690875726827619, + "loss": 0.0771, + "step": 35723 + }, + { + "epoch": 0.3101014748135867, + "grad_norm": 0.25390625, + "learning_rate": 0.001469060427877054, + "loss": 0.0942, + "step": 35724 + }, + { + "epoch": 0.3101101552937909, + "grad_norm": 0.37890625, + "learning_rate": 0.0014690332826677475, + "loss": 0.1465, + "step": 35725 + }, + { + "epoch": 0.310118835773995, + "grad_norm": 0.50390625, + "learning_rate": 
0.0014690061370548715, + "loss": 0.0986, + "step": 35726 + }, + { + "epoch": 0.3101275162541992, + "grad_norm": 0.49609375, + "learning_rate": 0.001468978991038456, + "loss": 0.0972, + "step": 35727 + }, + { + "epoch": 0.31013619673440335, + "grad_norm": 0.32421875, + "learning_rate": 0.0014689518446185305, + "loss": 0.0903, + "step": 35728 + }, + { + "epoch": 0.31014487721460754, + "grad_norm": 0.05859375, + "learning_rate": 0.001468924697795125, + "loss": 0.0732, + "step": 35729 + }, + { + "epoch": 0.3101535576948117, + "grad_norm": 0.423828125, + "learning_rate": 0.0014688975505682688, + "loss": 0.0718, + "step": 35730 + }, + { + "epoch": 0.3101622381750158, + "grad_norm": 0.16015625, + "learning_rate": 0.0014688704029379917, + "loss": 0.0854, + "step": 35731 + }, + { + "epoch": 0.31017091865522, + "grad_norm": 0.1494140625, + "learning_rate": 0.001468843254904324, + "loss": 0.0908, + "step": 35732 + }, + { + "epoch": 0.31017959913542414, + "grad_norm": 0.427734375, + "learning_rate": 0.0014688161064672941, + "loss": 0.1172, + "step": 35733 + }, + { + "epoch": 0.31018827961562834, + "grad_norm": 0.431640625, + "learning_rate": 0.001468788957626933, + "loss": 0.1416, + "step": 35734 + }, + { + "epoch": 0.3101969600958325, + "grad_norm": 0.24609375, + "learning_rate": 0.0014687618083832697, + "loss": 0.0835, + "step": 35735 + }, + { + "epoch": 0.31020564057603667, + "grad_norm": 0.126953125, + "learning_rate": 0.0014687346587363343, + "loss": 0.1147, + "step": 35736 + }, + { + "epoch": 0.3102143210562408, + "grad_norm": 0.162109375, + "learning_rate": 0.0014687075086861558, + "loss": 0.0811, + "step": 35737 + }, + { + "epoch": 0.310223001536445, + "grad_norm": 0.296875, + "learning_rate": 0.0014686803582327644, + "loss": 0.0957, + "step": 35738 + }, + { + "epoch": 0.31023168201664914, + "grad_norm": 0.23046875, + "learning_rate": 0.0014686532073761899, + "loss": 0.1182, + "step": 35739 + }, + { + "epoch": 0.31024036249685333, + "grad_norm": 0.05859375, + "learning_rate": 0.0014686260561164613, + "loss": 0.0796, + "step": 35740 + }, + { + "epoch": 0.31024904297705747, + "grad_norm": 0.26953125, + "learning_rate": 0.0014685989044536092, + "loss": 0.1035, + "step": 35741 + }, + { + "epoch": 0.31025772345726166, + "grad_norm": 0.8203125, + "learning_rate": 0.0014685717523876631, + "loss": 0.1387, + "step": 35742 + }, + { + "epoch": 0.3102664039374658, + "grad_norm": 1.640625, + "learning_rate": 0.0014685445999186519, + "loss": 0.1494, + "step": 35743 + }, + { + "epoch": 0.31027508441767, + "grad_norm": 0.484375, + "learning_rate": 0.0014685174470466062, + "loss": 0.1099, + "step": 35744 + }, + { + "epoch": 0.3102837648978741, + "grad_norm": 0.322265625, + "learning_rate": 0.0014684902937715555, + "loss": 0.0835, + "step": 35745 + }, + { + "epoch": 0.3102924453780783, + "grad_norm": 0.11572265625, + "learning_rate": 0.0014684631400935292, + "loss": 0.103, + "step": 35746 + }, + { + "epoch": 0.31030112585828246, + "grad_norm": 0.498046875, + "learning_rate": 0.001468435986012557, + "loss": 0.1123, + "step": 35747 + }, + { + "epoch": 0.31030980633848665, + "grad_norm": 0.3203125, + "learning_rate": 0.0014684088315286687, + "loss": 0.1113, + "step": 35748 + }, + { + "epoch": 0.3103184868186908, + "grad_norm": 0.42578125, + "learning_rate": 0.001468381676641894, + "loss": 0.1875, + "step": 35749 + }, + { + "epoch": 0.310327167298895, + "grad_norm": 0.5078125, + "learning_rate": 0.0014683545213522629, + "loss": 0.1069, + "step": 35750 + }, + { + "epoch": 0.3103358477790991, + "grad_norm": 
0.1572265625, + "learning_rate": 0.0014683273656598047, + "loss": 0.0986, + "step": 35751 + }, + { + "epoch": 0.3103445282593033, + "grad_norm": 0.298828125, + "learning_rate": 0.0014683002095645492, + "loss": 0.1006, + "step": 35752 + }, + { + "epoch": 0.31035320873950745, + "grad_norm": 0.2119140625, + "learning_rate": 0.0014682730530665262, + "loss": 0.1025, + "step": 35753 + }, + { + "epoch": 0.31036188921971164, + "grad_norm": 0.43359375, + "learning_rate": 0.0014682458961657655, + "loss": 0.0972, + "step": 35754 + }, + { + "epoch": 0.3103705696999158, + "grad_norm": 1.15625, + "learning_rate": 0.0014682187388622963, + "loss": 0.1128, + "step": 35755 + }, + { + "epoch": 0.31037925018011997, + "grad_norm": 0.76953125, + "learning_rate": 0.0014681915811561486, + "loss": 0.0986, + "step": 35756 + }, + { + "epoch": 0.3103879306603241, + "grad_norm": 0.67578125, + "learning_rate": 0.0014681644230473522, + "loss": 0.1104, + "step": 35757 + }, + { + "epoch": 0.3103966111405283, + "grad_norm": 0.490234375, + "learning_rate": 0.0014681372645359368, + "loss": 0.1348, + "step": 35758 + }, + { + "epoch": 0.31040529162073244, + "grad_norm": 0.419921875, + "learning_rate": 0.0014681101056219321, + "loss": 0.1406, + "step": 35759 + }, + { + "epoch": 0.31041397210093663, + "grad_norm": 0.5234375, + "learning_rate": 0.0014680829463053675, + "loss": 0.0938, + "step": 35760 + }, + { + "epoch": 0.31042265258114077, + "grad_norm": 0.302734375, + "learning_rate": 0.0014680557865862731, + "loss": 0.0859, + "step": 35761 + }, + { + "epoch": 0.31043133306134496, + "grad_norm": 0.1923828125, + "learning_rate": 0.0014680286264646784, + "loss": 0.1182, + "step": 35762 + }, + { + "epoch": 0.3104400135415491, + "grad_norm": 1.015625, + "learning_rate": 0.0014680014659406134, + "loss": 0.1187, + "step": 35763 + }, + { + "epoch": 0.3104486940217533, + "grad_norm": 0.16796875, + "learning_rate": 0.001467974305014107, + "loss": 0.1055, + "step": 35764 + }, + { + "epoch": 0.31045737450195743, + "grad_norm": 0.091796875, + "learning_rate": 0.0014679471436851897, + "loss": 0.085, + "step": 35765 + }, + { + "epoch": 0.3104660549821616, + "grad_norm": 0.076171875, + "learning_rate": 0.0014679199819538909, + "loss": 0.085, + "step": 35766 + }, + { + "epoch": 0.31047473546236576, + "grad_norm": 0.49609375, + "learning_rate": 0.00146789281982024, + "loss": 0.1064, + "step": 35767 + }, + { + "epoch": 0.31048341594256995, + "grad_norm": 0.44140625, + "learning_rate": 0.0014678656572842674, + "loss": 0.1055, + "step": 35768 + }, + { + "epoch": 0.3104920964227741, + "grad_norm": 0.10302734375, + "learning_rate": 0.0014678384943460025, + "loss": 0.1216, + "step": 35769 + }, + { + "epoch": 0.3105007769029783, + "grad_norm": 0.3125, + "learning_rate": 0.0014678113310054748, + "loss": 0.1357, + "step": 35770 + }, + { + "epoch": 0.3105094573831824, + "grad_norm": 0.16015625, + "learning_rate": 0.0014677841672627143, + "loss": 0.0791, + "step": 35771 + }, + { + "epoch": 0.3105181378633866, + "grad_norm": 0.10986328125, + "learning_rate": 0.0014677570031177505, + "loss": 0.0962, + "step": 35772 + }, + { + "epoch": 0.31052681834359075, + "grad_norm": 0.35546875, + "learning_rate": 0.001467729838570613, + "loss": 0.0928, + "step": 35773 + }, + { + "epoch": 0.31053549882379494, + "grad_norm": 0.3671875, + "learning_rate": 0.0014677026736213317, + "loss": 0.1621, + "step": 35774 + }, + { + "epoch": 0.3105441793039991, + "grad_norm": 0.99609375, + "learning_rate": 0.0014676755082699364, + "loss": 0.1787, + "step": 35775 + }, + { + "epoch": 
0.3105528597842033, + "grad_norm": 0.365234375, + "learning_rate": 0.0014676483425164566, + "loss": 0.0737, + "step": 35776 + }, + { + "epoch": 0.3105615402644074, + "grad_norm": 0.12255859375, + "learning_rate": 0.001467621176360922, + "loss": 0.0693, + "step": 35777 + }, + { + "epoch": 0.3105702207446116, + "grad_norm": 0.58203125, + "learning_rate": 0.0014675940098033626, + "loss": 0.0889, + "step": 35778 + }, + { + "epoch": 0.31057890122481574, + "grad_norm": 1.046875, + "learning_rate": 0.001467566842843808, + "loss": 0.1514, + "step": 35779 + }, + { + "epoch": 0.31058758170501993, + "grad_norm": 2.890625, + "learning_rate": 0.0014675396754822879, + "loss": 0.1826, + "step": 35780 + }, + { + "epoch": 0.31059626218522407, + "grad_norm": 0.5625, + "learning_rate": 0.0014675125077188318, + "loss": 0.1064, + "step": 35781 + }, + { + "epoch": 0.31060494266542826, + "grad_norm": 0.423828125, + "learning_rate": 0.001467485339553469, + "loss": 0.0972, + "step": 35782 + }, + { + "epoch": 0.3106136231456324, + "grad_norm": 0.2890625, + "learning_rate": 0.0014674581709862306, + "loss": 0.1104, + "step": 35783 + }, + { + "epoch": 0.3106223036258366, + "grad_norm": 0.2412109375, + "learning_rate": 0.001467431002017145, + "loss": 0.0918, + "step": 35784 + }, + { + "epoch": 0.31063098410604073, + "grad_norm": 0.07568359375, + "learning_rate": 0.0014674038326462427, + "loss": 0.1011, + "step": 35785 + }, + { + "epoch": 0.3106396645862449, + "grad_norm": 0.10986328125, + "learning_rate": 0.0014673766628735528, + "loss": 0.1221, + "step": 35786 + }, + { + "epoch": 0.31064834506644906, + "grad_norm": 0.10498046875, + "learning_rate": 0.0014673494926991054, + "loss": 0.1069, + "step": 35787 + }, + { + "epoch": 0.31065702554665325, + "grad_norm": 0.08447265625, + "learning_rate": 0.00146732232212293, + "loss": 0.0854, + "step": 35788 + }, + { + "epoch": 0.3106657060268574, + "grad_norm": 0.3203125, + "learning_rate": 0.0014672951511450565, + "loss": 0.1089, + "step": 35789 + }, + { + "epoch": 0.3106743865070616, + "grad_norm": 0.3046875, + "learning_rate": 0.0014672679797655143, + "loss": 0.1309, + "step": 35790 + }, + { + "epoch": 0.3106830669872657, + "grad_norm": 0.1064453125, + "learning_rate": 0.0014672408079843336, + "loss": 0.0957, + "step": 35791 + }, + { + "epoch": 0.3106917474674699, + "grad_norm": 0.12890625, + "learning_rate": 0.001467213635801544, + "loss": 0.1279, + "step": 35792 + }, + { + "epoch": 0.31070042794767405, + "grad_norm": 0.1845703125, + "learning_rate": 0.001467186463217175, + "loss": 0.1035, + "step": 35793 + }, + { + "epoch": 0.31070910842787824, + "grad_norm": 0.330078125, + "learning_rate": 0.0014671592902312566, + "loss": 0.1104, + "step": 35794 + }, + { + "epoch": 0.3107177889080824, + "grad_norm": 0.48828125, + "learning_rate": 0.001467132116843818, + "loss": 0.0732, + "step": 35795 + }, + { + "epoch": 0.3107264693882866, + "grad_norm": 0.19140625, + "learning_rate": 0.001467104943054889, + "loss": 0.0718, + "step": 35796 + }, + { + "epoch": 0.3107351498684907, + "grad_norm": 0.1396484375, + "learning_rate": 0.0014670777688645, + "loss": 0.0967, + "step": 35797 + }, + { + "epoch": 0.3107438303486949, + "grad_norm": 0.5703125, + "learning_rate": 0.00146705059427268, + "loss": 0.1138, + "step": 35798 + }, + { + "epoch": 0.31075251082889904, + "grad_norm": 0.6328125, + "learning_rate": 0.001467023419279459, + "loss": 0.0928, + "step": 35799 + }, + { + "epoch": 0.31076119130910324, + "grad_norm": 0.1953125, + "learning_rate": 0.0014669962438848669, + "loss": 0.1016, + 
"step": 35800 + }, + { + "epoch": 0.3107698717893074, + "grad_norm": 0.236328125, + "learning_rate": 0.0014669690680889334, + "loss": 0.1797, + "step": 35801 + }, + { + "epoch": 0.31077855226951157, + "grad_norm": 0.1591796875, + "learning_rate": 0.0014669418918916877, + "loss": 0.0688, + "step": 35802 + }, + { + "epoch": 0.3107872327497157, + "grad_norm": 0.1044921875, + "learning_rate": 0.00146691471529316, + "loss": 0.1074, + "step": 35803 + }, + { + "epoch": 0.3107959132299199, + "grad_norm": 0.435546875, + "learning_rate": 0.0014668875382933798, + "loss": 0.1543, + "step": 35804 + }, + { + "epoch": 0.31080459371012403, + "grad_norm": 1.453125, + "learning_rate": 0.001466860360892377, + "loss": 0.1338, + "step": 35805 + }, + { + "epoch": 0.3108132741903282, + "grad_norm": 0.640625, + "learning_rate": 0.0014668331830901809, + "loss": 0.0674, + "step": 35806 + }, + { + "epoch": 0.31082195467053236, + "grad_norm": 0.10009765625, + "learning_rate": 0.0014668060048868218, + "loss": 0.1074, + "step": 35807 + }, + { + "epoch": 0.31083063515073656, + "grad_norm": 1.96875, + "learning_rate": 0.0014667788262823292, + "loss": 0.126, + "step": 35808 + }, + { + "epoch": 0.3108393156309407, + "grad_norm": 0.1513671875, + "learning_rate": 0.0014667516472767325, + "loss": 0.0894, + "step": 35809 + }, + { + "epoch": 0.3108479961111449, + "grad_norm": 0.1630859375, + "learning_rate": 0.0014667244678700621, + "loss": 0.0977, + "step": 35810 + }, + { + "epoch": 0.310856676591349, + "grad_norm": 0.11572265625, + "learning_rate": 0.0014666972880623475, + "loss": 0.0981, + "step": 35811 + }, + { + "epoch": 0.3108653570715532, + "grad_norm": 0.0830078125, + "learning_rate": 0.0014666701078536177, + "loss": 0.0879, + "step": 35812 + }, + { + "epoch": 0.31087403755175735, + "grad_norm": 0.10791015625, + "learning_rate": 0.0014666429272439032, + "loss": 0.1045, + "step": 35813 + }, + { + "epoch": 0.31088271803196155, + "grad_norm": 0.52734375, + "learning_rate": 0.0014666157462332335, + "loss": 0.0942, + "step": 35814 + }, + { + "epoch": 0.3108913985121657, + "grad_norm": 0.33203125, + "learning_rate": 0.0014665885648216383, + "loss": 0.083, + "step": 35815 + }, + { + "epoch": 0.3109000789923699, + "grad_norm": 0.345703125, + "learning_rate": 0.0014665613830091473, + "loss": 0.1245, + "step": 35816 + }, + { + "epoch": 0.310908759472574, + "grad_norm": 0.0859375, + "learning_rate": 0.0014665342007957904, + "loss": 0.0938, + "step": 35817 + }, + { + "epoch": 0.3109174399527782, + "grad_norm": 0.267578125, + "learning_rate": 0.0014665070181815972, + "loss": 0.1006, + "step": 35818 + }, + { + "epoch": 0.31092612043298234, + "grad_norm": 0.10986328125, + "learning_rate": 0.0014664798351665975, + "loss": 0.1094, + "step": 35819 + }, + { + "epoch": 0.31093480091318654, + "grad_norm": 0.1513671875, + "learning_rate": 0.0014664526517508209, + "loss": 0.0938, + "step": 35820 + }, + { + "epoch": 0.3109434813933907, + "grad_norm": 0.0830078125, + "learning_rate": 0.001466425467934297, + "loss": 0.0894, + "step": 35821 + }, + { + "epoch": 0.31095216187359487, + "grad_norm": 0.134765625, + "learning_rate": 0.001466398283717056, + "loss": 0.126, + "step": 35822 + }, + { + "epoch": 0.310960842353799, + "grad_norm": 0.451171875, + "learning_rate": 0.0014663710990991273, + "loss": 0.1084, + "step": 35823 + }, + { + "epoch": 0.3109695228340032, + "grad_norm": 0.134765625, + "learning_rate": 0.0014663439140805404, + "loss": 0.1089, + "step": 35824 + }, + { + "epoch": 0.31097820331420734, + "grad_norm": 0.12109375, + 
"learning_rate": 0.0014663167286613256, + "loss": 0.0791, + "step": 35825 + }, + { + "epoch": 0.31098688379441153, + "grad_norm": 0.1318359375, + "learning_rate": 0.0014662895428415122, + "loss": 0.0967, + "step": 35826 + }, + { + "epoch": 0.31099556427461567, + "grad_norm": 0.2421875, + "learning_rate": 0.0014662623566211298, + "loss": 0.084, + "step": 35827 + }, + { + "epoch": 0.31100424475481986, + "grad_norm": 0.41796875, + "learning_rate": 0.001466235170000209, + "loss": 0.0713, + "step": 35828 + }, + { + "epoch": 0.311012925235024, + "grad_norm": 0.162109375, + "learning_rate": 0.0014662079829787784, + "loss": 0.0894, + "step": 35829 + }, + { + "epoch": 0.3110216057152282, + "grad_norm": 0.59375, + "learning_rate": 0.0014661807955568685, + "loss": 0.1191, + "step": 35830 + }, + { + "epoch": 0.3110302861954323, + "grad_norm": 0.08935546875, + "learning_rate": 0.0014661536077345085, + "loss": 0.0986, + "step": 35831 + }, + { + "epoch": 0.3110389666756365, + "grad_norm": 0.1708984375, + "learning_rate": 0.0014661264195117288, + "loss": 0.1104, + "step": 35832 + }, + { + "epoch": 0.31104764715584066, + "grad_norm": 0.1005859375, + "learning_rate": 0.0014660992308885587, + "loss": 0.1016, + "step": 35833 + }, + { + "epoch": 0.31105632763604485, + "grad_norm": 0.287109375, + "learning_rate": 0.0014660720418650278, + "loss": 0.1143, + "step": 35834 + }, + { + "epoch": 0.311065008116249, + "grad_norm": 0.345703125, + "learning_rate": 0.001466044852441166, + "loss": 0.0654, + "step": 35835 + }, + { + "epoch": 0.3110736885964532, + "grad_norm": 0.1025390625, + "learning_rate": 0.001466017662617003, + "loss": 0.0918, + "step": 35836 + }, + { + "epoch": 0.3110823690766573, + "grad_norm": 0.326171875, + "learning_rate": 0.0014659904723925688, + "loss": 0.0864, + "step": 35837 + }, + { + "epoch": 0.3110910495568615, + "grad_norm": 0.2109375, + "learning_rate": 0.001465963281767893, + "loss": 0.0586, + "step": 35838 + }, + { + "epoch": 0.31109973003706565, + "grad_norm": 0.2275390625, + "learning_rate": 0.001465936090743005, + "loss": 0.0879, + "step": 35839 + }, + { + "epoch": 0.31110841051726984, + "grad_norm": 0.228515625, + "learning_rate": 0.0014659088993179347, + "loss": 0.0889, + "step": 35840 + }, + { + "epoch": 0.311117090997474, + "grad_norm": 0.25, + "learning_rate": 0.0014658817074927122, + "loss": 0.1006, + "step": 35841 + }, + { + "epoch": 0.31112577147767817, + "grad_norm": 1.7734375, + "learning_rate": 0.0014658545152673667, + "loss": 0.1338, + "step": 35842 + }, + { + "epoch": 0.3111344519578823, + "grad_norm": 0.1494140625, + "learning_rate": 0.0014658273226419284, + "loss": 0.0923, + "step": 35843 + }, + { + "epoch": 0.3111431324380865, + "grad_norm": 0.283203125, + "learning_rate": 0.0014658001296164267, + "loss": 0.1045, + "step": 35844 + }, + { + "epoch": 0.31115181291829064, + "grad_norm": 0.193359375, + "learning_rate": 0.001465772936190892, + "loss": 0.1055, + "step": 35845 + }, + { + "epoch": 0.31116049339849483, + "grad_norm": 0.55859375, + "learning_rate": 0.001465745742365353, + "loss": 0.1006, + "step": 35846 + }, + { + "epoch": 0.31116917387869897, + "grad_norm": 0.1826171875, + "learning_rate": 0.00146571854813984, + "loss": 0.1758, + "step": 35847 + }, + { + "epoch": 0.31117785435890316, + "grad_norm": 0.6953125, + "learning_rate": 0.0014656913535143826, + "loss": 0.1211, + "step": 35848 + }, + { + "epoch": 0.3111865348391073, + "grad_norm": 0.09423828125, + "learning_rate": 0.0014656641584890107, + "loss": 0.0767, + "step": 35849 + }, + { + "epoch": 
0.3111952153193115, + "grad_norm": 0.39453125, + "learning_rate": 0.0014656369630637544, + "loss": 0.1064, + "step": 35850 + }, + { + "epoch": 0.31120389579951563, + "grad_norm": 0.30078125, + "learning_rate": 0.0014656097672386423, + "loss": 0.1172, + "step": 35851 + }, + { + "epoch": 0.3112125762797198, + "grad_norm": 0.291015625, + "learning_rate": 0.0014655825710137053, + "loss": 0.0815, + "step": 35852 + }, + { + "epoch": 0.31122125675992396, + "grad_norm": 0.42578125, + "learning_rate": 0.0014655553743889724, + "loss": 0.0879, + "step": 35853 + }, + { + "epoch": 0.31122993724012815, + "grad_norm": 0.353515625, + "learning_rate": 0.001465528177364474, + "loss": 0.1299, + "step": 35854 + }, + { + "epoch": 0.3112386177203323, + "grad_norm": 0.1357421875, + "learning_rate": 0.0014655009799402392, + "loss": 0.0967, + "step": 35855 + }, + { + "epoch": 0.3112472982005364, + "grad_norm": 0.1376953125, + "learning_rate": 0.001465473782116298, + "loss": 0.083, + "step": 35856 + }, + { + "epoch": 0.3112559786807406, + "grad_norm": 0.1328125, + "learning_rate": 0.0014654465838926805, + "loss": 0.0913, + "step": 35857 + }, + { + "epoch": 0.31126465916094476, + "grad_norm": 0.0693359375, + "learning_rate": 0.0014654193852694157, + "loss": 0.0771, + "step": 35858 + }, + { + "epoch": 0.31127333964114895, + "grad_norm": 0.4140625, + "learning_rate": 0.0014653921862465339, + "loss": 0.0908, + "step": 35859 + }, + { + "epoch": 0.3112820201213531, + "grad_norm": 0.83203125, + "learning_rate": 0.0014653649868240648, + "loss": 0.1387, + "step": 35860 + }, + { + "epoch": 0.3112907006015573, + "grad_norm": 0.1259765625, + "learning_rate": 0.0014653377870020381, + "loss": 0.125, + "step": 35861 + }, + { + "epoch": 0.3112993810817614, + "grad_norm": 0.109375, + "learning_rate": 0.0014653105867804834, + "loss": 0.0938, + "step": 35862 + }, + { + "epoch": 0.3113080615619656, + "grad_norm": 0.1787109375, + "learning_rate": 0.0014652833861594301, + "loss": 0.1025, + "step": 35863 + }, + { + "epoch": 0.31131674204216975, + "grad_norm": 0.267578125, + "learning_rate": 0.0014652561851389088, + "loss": 0.1348, + "step": 35864 + }, + { + "epoch": 0.31132542252237394, + "grad_norm": 0.3203125, + "learning_rate": 0.001465228983718949, + "loss": 0.1025, + "step": 35865 + }, + { + "epoch": 0.3113341030025781, + "grad_norm": 0.220703125, + "learning_rate": 0.00146520178189958, + "loss": 0.1104, + "step": 35866 + }, + { + "epoch": 0.31134278348278227, + "grad_norm": 0.3984375, + "learning_rate": 0.001465174579680832, + "loss": 0.1025, + "step": 35867 + }, + { + "epoch": 0.3113514639629864, + "grad_norm": 0.0791015625, + "learning_rate": 0.0014651473770627343, + "loss": 0.0771, + "step": 35868 + }, + { + "epoch": 0.3113601444431906, + "grad_norm": 0.1083984375, + "learning_rate": 0.001465120174045317, + "loss": 0.1128, + "step": 35869 + }, + { + "epoch": 0.31136882492339474, + "grad_norm": 0.45703125, + "learning_rate": 0.0014650929706286101, + "loss": 0.0776, + "step": 35870 + }, + { + "epoch": 0.31137750540359893, + "grad_norm": 0.283203125, + "learning_rate": 0.0014650657668126429, + "loss": 0.0869, + "step": 35871 + }, + { + "epoch": 0.31138618588380307, + "grad_norm": 0.11474609375, + "learning_rate": 0.0014650385625974446, + "loss": 0.1182, + "step": 35872 + }, + { + "epoch": 0.31139486636400726, + "grad_norm": 0.287109375, + "learning_rate": 0.001465011357983046, + "loss": 0.0898, + "step": 35873 + }, + { + "epoch": 0.3114035468442114, + "grad_norm": 0.23046875, + "learning_rate": 0.0014649841529694768, + "loss": 
0.1094, + "step": 35874 + }, + { + "epoch": 0.3114122273244156, + "grad_norm": 0.82421875, + "learning_rate": 0.0014649569475567662, + "loss": 0.1123, + "step": 35875 + }, + { + "epoch": 0.31142090780461973, + "grad_norm": 0.4609375, + "learning_rate": 0.0014649297417449441, + "loss": 0.0962, + "step": 35876 + }, + { + "epoch": 0.3114295882848239, + "grad_norm": 0.69140625, + "learning_rate": 0.0014649025355340404, + "loss": 0.0967, + "step": 35877 + }, + { + "epoch": 0.31143826876502806, + "grad_norm": 0.44921875, + "learning_rate": 0.0014648753289240846, + "loss": 0.1279, + "step": 35878 + }, + { + "epoch": 0.31144694924523225, + "grad_norm": 0.306640625, + "learning_rate": 0.0014648481219151068, + "loss": 0.1011, + "step": 35879 + }, + { + "epoch": 0.3114556297254364, + "grad_norm": 0.4453125, + "learning_rate": 0.0014648209145071368, + "loss": 0.126, + "step": 35880 + }, + { + "epoch": 0.3114643102056406, + "grad_norm": 0.421875, + "learning_rate": 0.0014647937067002038, + "loss": 0.0957, + "step": 35881 + }, + { + "epoch": 0.3114729906858447, + "grad_norm": 0.220703125, + "learning_rate": 0.0014647664984943378, + "loss": 0.1123, + "step": 35882 + }, + { + "epoch": 0.3114816711660489, + "grad_norm": 0.232421875, + "learning_rate": 0.0014647392898895688, + "loss": 0.1113, + "step": 35883 + }, + { + "epoch": 0.31149035164625305, + "grad_norm": 0.375, + "learning_rate": 0.0014647120808859264, + "loss": 0.1133, + "step": 35884 + }, + { + "epoch": 0.31149903212645724, + "grad_norm": 0.185546875, + "learning_rate": 0.0014646848714834403, + "loss": 0.103, + "step": 35885 + }, + { + "epoch": 0.3115077126066614, + "grad_norm": 0.392578125, + "learning_rate": 0.0014646576616821404, + "loss": 0.1006, + "step": 35886 + }, + { + "epoch": 0.3115163930868656, + "grad_norm": 0.1455078125, + "learning_rate": 0.0014646304514820564, + "loss": 0.0918, + "step": 35887 + }, + { + "epoch": 0.3115250735670697, + "grad_norm": 2.328125, + "learning_rate": 0.001464603240883218, + "loss": 0.2246, + "step": 35888 + }, + { + "epoch": 0.3115337540472739, + "grad_norm": 0.42578125, + "learning_rate": 0.0014645760298856547, + "loss": 0.083, + "step": 35889 + }, + { + "epoch": 0.31154243452747804, + "grad_norm": 0.404296875, + "learning_rate": 0.0014645488184893967, + "loss": 0.1543, + "step": 35890 + }, + { + "epoch": 0.31155111500768223, + "grad_norm": 0.12060546875, + "learning_rate": 0.0014645216066944736, + "loss": 0.1089, + "step": 35891 + }, + { + "epoch": 0.31155979548788637, + "grad_norm": 1.5625, + "learning_rate": 0.001464494394500915, + "loss": 0.2324, + "step": 35892 + }, + { + "epoch": 0.31156847596809056, + "grad_norm": 0.11279296875, + "learning_rate": 0.0014644671819087512, + "loss": 0.0864, + "step": 35893 + }, + { + "epoch": 0.3115771564482947, + "grad_norm": 0.1708984375, + "learning_rate": 0.001464439968918011, + "loss": 0.1777, + "step": 35894 + }, + { + "epoch": 0.3115858369284989, + "grad_norm": 0.31640625, + "learning_rate": 0.0014644127555287253, + "loss": 0.1074, + "step": 35895 + }, + { + "epoch": 0.31159451740870303, + "grad_norm": 0.3125, + "learning_rate": 0.001464385541740923, + "loss": 0.0732, + "step": 35896 + }, + { + "epoch": 0.3116031978889072, + "grad_norm": 0.388671875, + "learning_rate": 0.0014643583275546345, + "loss": 0.1123, + "step": 35897 + }, + { + "epoch": 0.31161187836911136, + "grad_norm": 0.08740234375, + "learning_rate": 0.0014643311129698886, + "loss": 0.1152, + "step": 35898 + }, + { + "epoch": 0.31162055884931555, + "grad_norm": 0.2099609375, + "learning_rate": 
0.001464303897986716, + "loss": 0.104, + "step": 35899 + }, + { + "epoch": 0.3116292393295197, + "grad_norm": 0.12890625, + "learning_rate": 0.0014642766826051458, + "loss": 0.0918, + "step": 35900 + }, + { + "epoch": 0.3116379198097239, + "grad_norm": 0.345703125, + "learning_rate": 0.0014642494668252084, + "loss": 0.1162, + "step": 35901 + }, + { + "epoch": 0.311646600289928, + "grad_norm": 0.11181640625, + "learning_rate": 0.0014642222506469334, + "loss": 0.0918, + "step": 35902 + }, + { + "epoch": 0.3116552807701322, + "grad_norm": 0.205078125, + "learning_rate": 0.0014641950340703502, + "loss": 0.0938, + "step": 35903 + }, + { + "epoch": 0.31166396125033635, + "grad_norm": 0.283203125, + "learning_rate": 0.0014641678170954889, + "loss": 0.0581, + "step": 35904 + }, + { + "epoch": 0.31167264173054054, + "grad_norm": 0.58203125, + "learning_rate": 0.001464140599722379, + "loss": 0.0967, + "step": 35905 + }, + { + "epoch": 0.3116813222107447, + "grad_norm": 0.1220703125, + "learning_rate": 0.0014641133819510506, + "loss": 0.1221, + "step": 35906 + }, + { + "epoch": 0.3116900026909489, + "grad_norm": 0.57421875, + "learning_rate": 0.0014640861637815331, + "loss": 0.0781, + "step": 35907 + }, + { + "epoch": 0.311698683171153, + "grad_norm": 0.326171875, + "learning_rate": 0.0014640589452138567, + "loss": 0.125, + "step": 35908 + }, + { + "epoch": 0.3117073636513572, + "grad_norm": 0.23828125, + "learning_rate": 0.0014640317262480506, + "loss": 0.1162, + "step": 35909 + }, + { + "epoch": 0.31171604413156134, + "grad_norm": 0.09912109375, + "learning_rate": 0.001464004506884145, + "loss": 0.0757, + "step": 35910 + }, + { + "epoch": 0.31172472461176554, + "grad_norm": 1.484375, + "learning_rate": 0.0014639772871221696, + "loss": 0.3457, + "step": 35911 + }, + { + "epoch": 0.3117334050919697, + "grad_norm": 0.0947265625, + "learning_rate": 0.001463950066962154, + "loss": 0.1074, + "step": 35912 + }, + { + "epoch": 0.31174208557217387, + "grad_norm": 0.2080078125, + "learning_rate": 0.001463922846404128, + "loss": 0.0654, + "step": 35913 + }, + { + "epoch": 0.311750766052378, + "grad_norm": 0.06494140625, + "learning_rate": 0.0014638956254481215, + "loss": 0.0752, + "step": 35914 + }, + { + "epoch": 0.3117594465325822, + "grad_norm": 0.3984375, + "learning_rate": 0.0014638684040941643, + "loss": 0.1318, + "step": 35915 + }, + { + "epoch": 0.31176812701278633, + "grad_norm": 0.10107421875, + "learning_rate": 0.0014638411823422858, + "loss": 0.0859, + "step": 35916 + }, + { + "epoch": 0.3117768074929905, + "grad_norm": 0.09375, + "learning_rate": 0.0014638139601925166, + "loss": 0.1099, + "step": 35917 + }, + { + "epoch": 0.31178548797319466, + "grad_norm": 1.2109375, + "learning_rate": 0.0014637867376448856, + "loss": 0.1797, + "step": 35918 + }, + { + "epoch": 0.31179416845339886, + "grad_norm": 0.2421875, + "learning_rate": 0.001463759514699423, + "loss": 0.1025, + "step": 35919 + }, + { + "epoch": 0.311802848933603, + "grad_norm": 0.62890625, + "learning_rate": 0.0014637322913561583, + "loss": 0.0845, + "step": 35920 + }, + { + "epoch": 0.3118115294138072, + "grad_norm": 0.419921875, + "learning_rate": 0.0014637050676151214, + "loss": 0.103, + "step": 35921 + }, + { + "epoch": 0.3118202098940113, + "grad_norm": 0.306640625, + "learning_rate": 0.0014636778434763424, + "loss": 0.1348, + "step": 35922 + }, + { + "epoch": 0.3118288903742155, + "grad_norm": 0.6015625, + "learning_rate": 0.0014636506189398505, + "loss": 0.1582, + "step": 35923 + }, + { + "epoch": 0.31183757085441965, + "grad_norm": 
0.486328125, + "learning_rate": 0.0014636233940056758, + "loss": 0.0879, + "step": 35924 + }, + { + "epoch": 0.31184625133462385, + "grad_norm": 1.15625, + "learning_rate": 0.0014635961686738478, + "loss": 0.1206, + "step": 35925 + }, + { + "epoch": 0.311854931814828, + "grad_norm": 0.4375, + "learning_rate": 0.001463568942944397, + "loss": 0.1289, + "step": 35926 + }, + { + "epoch": 0.3118636122950322, + "grad_norm": 0.51953125, + "learning_rate": 0.0014635417168173525, + "loss": 0.1172, + "step": 35927 + }, + { + "epoch": 0.3118722927752363, + "grad_norm": 0.2099609375, + "learning_rate": 0.0014635144902927443, + "loss": 0.0962, + "step": 35928 + }, + { + "epoch": 0.3118809732554405, + "grad_norm": 0.0830078125, + "learning_rate": 0.0014634872633706019, + "loss": 0.0933, + "step": 35929 + }, + { + "epoch": 0.31188965373564465, + "grad_norm": 0.1015625, + "learning_rate": 0.0014634600360509554, + "loss": 0.1006, + "step": 35930 + }, + { + "epoch": 0.31189833421584884, + "grad_norm": 0.53125, + "learning_rate": 0.0014634328083338345, + "loss": 0.1279, + "step": 35931 + }, + { + "epoch": 0.311907014696053, + "grad_norm": 0.09423828125, + "learning_rate": 0.0014634055802192688, + "loss": 0.1167, + "step": 35932 + }, + { + "epoch": 0.31191569517625717, + "grad_norm": 0.296875, + "learning_rate": 0.001463378351707288, + "loss": 0.1211, + "step": 35933 + }, + { + "epoch": 0.3119243756564613, + "grad_norm": 0.1435546875, + "learning_rate": 0.0014633511227979223, + "loss": 0.0752, + "step": 35934 + }, + { + "epoch": 0.3119330561366655, + "grad_norm": 0.56640625, + "learning_rate": 0.0014633238934912017, + "loss": 0.1309, + "step": 35935 + }, + { + "epoch": 0.31194173661686964, + "grad_norm": 0.4765625, + "learning_rate": 0.0014632966637871552, + "loss": 0.0742, + "step": 35936 + }, + { + "epoch": 0.31195041709707383, + "grad_norm": 0.1044921875, + "learning_rate": 0.0014632694336858126, + "loss": 0.0791, + "step": 35937 + }, + { + "epoch": 0.31195909757727797, + "grad_norm": 0.271484375, + "learning_rate": 0.0014632422031872043, + "loss": 0.0835, + "step": 35938 + }, + { + "epoch": 0.31196777805748216, + "grad_norm": 0.103515625, + "learning_rate": 0.00146321497229136, + "loss": 0.1367, + "step": 35939 + }, + { + "epoch": 0.3119764585376863, + "grad_norm": 0.3671875, + "learning_rate": 0.001463187740998309, + "loss": 0.0972, + "step": 35940 + }, + { + "epoch": 0.3119851390178905, + "grad_norm": 0.19921875, + "learning_rate": 0.0014631605093080816, + "loss": 0.082, + "step": 35941 + }, + { + "epoch": 0.3119938194980946, + "grad_norm": 0.1513671875, + "learning_rate": 0.0014631332772207068, + "loss": 0.0942, + "step": 35942 + }, + { + "epoch": 0.3120024999782988, + "grad_norm": 0.1748046875, + "learning_rate": 0.0014631060447362154, + "loss": 0.1396, + "step": 35943 + }, + { + "epoch": 0.31201118045850296, + "grad_norm": 0.1494140625, + "learning_rate": 0.0014630788118546365, + "loss": 0.1211, + "step": 35944 + }, + { + "epoch": 0.31201986093870715, + "grad_norm": 0.515625, + "learning_rate": 0.0014630515785760004, + "loss": 0.0869, + "step": 35945 + }, + { + "epoch": 0.3120285414189113, + "grad_norm": 0.1328125, + "learning_rate": 0.0014630243449003358, + "loss": 0.0781, + "step": 35946 + }, + { + "epoch": 0.3120372218991155, + "grad_norm": 0.1484375, + "learning_rate": 0.0014629971108276738, + "loss": 0.0986, + "step": 35947 + }, + { + "epoch": 0.3120459023793196, + "grad_norm": 0.2177734375, + "learning_rate": 0.0014629698763580434, + "loss": 0.0913, + "step": 35948 + }, + { + "epoch": 
0.3120545828595238, + "grad_norm": 0.1787109375, + "learning_rate": 0.0014629426414914748, + "loss": 0.123, + "step": 35949 + }, + { + "epoch": 0.31206326333972795, + "grad_norm": 0.1328125, + "learning_rate": 0.0014629154062279974, + "loss": 0.0928, + "step": 35950 + }, + { + "epoch": 0.31207194381993214, + "grad_norm": 0.1201171875, + "learning_rate": 0.0014628881705676413, + "loss": 0.084, + "step": 35951 + }, + { + "epoch": 0.3120806243001363, + "grad_norm": 0.396484375, + "learning_rate": 0.0014628609345104361, + "loss": 0.0879, + "step": 35952 + }, + { + "epoch": 0.31208930478034047, + "grad_norm": 0.1943359375, + "learning_rate": 0.0014628336980564117, + "loss": 0.1064, + "step": 35953 + }, + { + "epoch": 0.3120979852605446, + "grad_norm": 0.3359375, + "learning_rate": 0.0014628064612055977, + "loss": 0.1123, + "step": 35954 + }, + { + "epoch": 0.3121066657407488, + "grad_norm": 0.5859375, + "learning_rate": 0.001462779223958024, + "loss": 0.0913, + "step": 35955 + }, + { + "epoch": 0.31211534622095294, + "grad_norm": 0.09423828125, + "learning_rate": 0.0014627519863137203, + "loss": 0.1025, + "step": 35956 + }, + { + "epoch": 0.31212402670115713, + "grad_norm": 0.078125, + "learning_rate": 0.0014627247482727169, + "loss": 0.0747, + "step": 35957 + }, + { + "epoch": 0.31213270718136127, + "grad_norm": 0.0869140625, + "learning_rate": 0.0014626975098350427, + "loss": 0.0874, + "step": 35958 + }, + { + "epoch": 0.31214138766156546, + "grad_norm": 0.33203125, + "learning_rate": 0.001462670271000728, + "loss": 0.1006, + "step": 35959 + }, + { + "epoch": 0.3121500681417696, + "grad_norm": 0.5390625, + "learning_rate": 0.0014626430317698027, + "loss": 0.1035, + "step": 35960 + }, + { + "epoch": 0.3121587486219738, + "grad_norm": 0.1220703125, + "learning_rate": 0.0014626157921422964, + "loss": 0.083, + "step": 35961 + }, + { + "epoch": 0.31216742910217793, + "grad_norm": 0.1728515625, + "learning_rate": 0.0014625885521182388, + "loss": 0.0938, + "step": 35962 + }, + { + "epoch": 0.3121761095823821, + "grad_norm": 0.09375, + "learning_rate": 0.00146256131169766, + "loss": 0.0908, + "step": 35963 + }, + { + "epoch": 0.31218479006258626, + "grad_norm": 0.65234375, + "learning_rate": 0.0014625340708805897, + "loss": 0.1406, + "step": 35964 + }, + { + "epoch": 0.31219347054279045, + "grad_norm": 0.26171875, + "learning_rate": 0.0014625068296670574, + "loss": 0.0996, + "step": 35965 + }, + { + "epoch": 0.3122021510229946, + "grad_norm": 0.1123046875, + "learning_rate": 0.0014624795880570929, + "loss": 0.0977, + "step": 35966 + }, + { + "epoch": 0.3122108315031988, + "grad_norm": 0.267578125, + "learning_rate": 0.0014624523460507264, + "loss": 0.0869, + "step": 35967 + }, + { + "epoch": 0.3122195119834029, + "grad_norm": 0.12353515625, + "learning_rate": 0.0014624251036479874, + "loss": 0.0791, + "step": 35968 + }, + { + "epoch": 0.3122281924636071, + "grad_norm": 0.349609375, + "learning_rate": 0.0014623978608489059, + "loss": 0.0874, + "step": 35969 + }, + { + "epoch": 0.31223687294381125, + "grad_norm": 0.1328125, + "learning_rate": 0.0014623706176535112, + "loss": 0.085, + "step": 35970 + }, + { + "epoch": 0.31224555342401544, + "grad_norm": 0.291015625, + "learning_rate": 0.0014623433740618337, + "loss": 0.085, + "step": 35971 + }, + { + "epoch": 0.3122542339042196, + "grad_norm": 0.58984375, + "learning_rate": 0.0014623161300739028, + "loss": 0.1104, + "step": 35972 + }, + { + "epoch": 0.3122629143844238, + "grad_norm": 0.451171875, + "learning_rate": 0.0014622888856897485, + "loss": 
0.1221, + "step": 35973 + }, + { + "epoch": 0.3122715948646279, + "grad_norm": 0.16015625, + "learning_rate": 0.0014622616409094007, + "loss": 0.0991, + "step": 35974 + }, + { + "epoch": 0.3122802753448321, + "grad_norm": 0.1513671875, + "learning_rate": 0.0014622343957328887, + "loss": 0.1494, + "step": 35975 + }, + { + "epoch": 0.31228895582503624, + "grad_norm": 0.220703125, + "learning_rate": 0.0014622071501602428, + "loss": 0.0972, + "step": 35976 + }, + { + "epoch": 0.31229763630524043, + "grad_norm": 0.1162109375, + "learning_rate": 0.0014621799041914926, + "loss": 0.1309, + "step": 35977 + }, + { + "epoch": 0.31230631678544457, + "grad_norm": 0.26953125, + "learning_rate": 0.0014621526578266675, + "loss": 0.1406, + "step": 35978 + }, + { + "epoch": 0.3123149972656487, + "grad_norm": 0.216796875, + "learning_rate": 0.0014621254110657983, + "loss": 0.0996, + "step": 35979 + }, + { + "epoch": 0.3123236777458529, + "grad_norm": 0.294921875, + "learning_rate": 0.0014620981639089138, + "loss": 0.1074, + "step": 35980 + }, + { + "epoch": 0.31233235822605704, + "grad_norm": 0.7578125, + "learning_rate": 0.0014620709163560444, + "loss": 0.0869, + "step": 35981 + }, + { + "epoch": 0.31234103870626123, + "grad_norm": 0.1572265625, + "learning_rate": 0.0014620436684072199, + "loss": 0.1025, + "step": 35982 + }, + { + "epoch": 0.31234971918646537, + "grad_norm": 0.1513671875, + "learning_rate": 0.0014620164200624692, + "loss": 0.0923, + "step": 35983 + }, + { + "epoch": 0.31235839966666956, + "grad_norm": 0.08935546875, + "learning_rate": 0.0014619891713218234, + "loss": 0.1182, + "step": 35984 + }, + { + "epoch": 0.3123670801468737, + "grad_norm": 0.1865234375, + "learning_rate": 0.0014619619221853113, + "loss": 0.1177, + "step": 35985 + }, + { + "epoch": 0.3123757606270779, + "grad_norm": 1.0390625, + "learning_rate": 0.0014619346726529635, + "loss": 0.1079, + "step": 35986 + }, + { + "epoch": 0.31238444110728203, + "grad_norm": 0.216796875, + "learning_rate": 0.001461907422724809, + "loss": 0.1079, + "step": 35987 + }, + { + "epoch": 0.3123931215874862, + "grad_norm": 0.328125, + "learning_rate": 0.001461880172400878, + "loss": 0.084, + "step": 35988 + }, + { + "epoch": 0.31240180206769036, + "grad_norm": 0.1103515625, + "learning_rate": 0.0014618529216812001, + "loss": 0.0972, + "step": 35989 + }, + { + "epoch": 0.31241048254789455, + "grad_norm": 0.361328125, + "learning_rate": 0.0014618256705658055, + "loss": 0.1338, + "step": 35990 + }, + { + "epoch": 0.3124191630280987, + "grad_norm": 0.1318359375, + "learning_rate": 0.001461798419054724, + "loss": 0.1079, + "step": 35991 + }, + { + "epoch": 0.3124278435083029, + "grad_norm": 0.12451171875, + "learning_rate": 0.0014617711671479853, + "loss": 0.0996, + "step": 35992 + }, + { + "epoch": 0.312436523988507, + "grad_norm": 0.1279296875, + "learning_rate": 0.0014617439148456187, + "loss": 0.1367, + "step": 35993 + }, + { + "epoch": 0.3124452044687112, + "grad_norm": 0.2109375, + "learning_rate": 0.0014617166621476543, + "loss": 0.0879, + "step": 35994 + }, + { + "epoch": 0.31245388494891535, + "grad_norm": 0.365234375, + "learning_rate": 0.0014616894090541223, + "loss": 0.1108, + "step": 35995 + }, + { + "epoch": 0.31246256542911954, + "grad_norm": 0.37109375, + "learning_rate": 0.001461662155565052, + "loss": 0.0688, + "step": 35996 + }, + { + "epoch": 0.3124712459093237, + "grad_norm": 0.2177734375, + "learning_rate": 0.0014616349016804736, + "loss": 0.1094, + "step": 35997 + }, + { + "epoch": 0.3124799263895279, + "grad_norm": 0.349609375, 
+ "learning_rate": 0.0014616076474004162, + "loss": 0.0986, + "step": 35998 + }, + { + "epoch": 0.312488606869732, + "grad_norm": 0.09033203125, + "learning_rate": 0.0014615803927249106, + "loss": 0.1016, + "step": 35999 + }, + { + "epoch": 0.3124972873499362, + "grad_norm": 0.1923828125, + "learning_rate": 0.0014615531376539863, + "loss": 0.0996, + "step": 36000 + }, + { + "epoch": 0.31250596783014034, + "grad_norm": 0.224609375, + "learning_rate": 0.0014615258821876727, + "loss": 0.123, + "step": 36001 + }, + { + "epoch": 0.31251464831034453, + "grad_norm": 0.267578125, + "learning_rate": 0.0014614986263259996, + "loss": 0.1162, + "step": 36002 + }, + { + "epoch": 0.31252332879054867, + "grad_norm": 0.09228515625, + "learning_rate": 0.0014614713700689971, + "loss": 0.126, + "step": 36003 + }, + { + "epoch": 0.31253200927075286, + "grad_norm": 0.1982421875, + "learning_rate": 0.0014614441134166952, + "loss": 0.1094, + "step": 36004 + }, + { + "epoch": 0.312540689750957, + "grad_norm": 0.283203125, + "learning_rate": 0.0014614168563691233, + "loss": 0.0845, + "step": 36005 + }, + { + "epoch": 0.3125493702311612, + "grad_norm": 0.251953125, + "learning_rate": 0.0014613895989263113, + "loss": 0.0762, + "step": 36006 + }, + { + "epoch": 0.31255805071136533, + "grad_norm": 0.46875, + "learning_rate": 0.001461362341088289, + "loss": 0.0898, + "step": 36007 + }, + { + "epoch": 0.3125667311915695, + "grad_norm": 0.1669921875, + "learning_rate": 0.0014613350828550863, + "loss": 0.0986, + "step": 36008 + }, + { + "epoch": 0.31257541167177366, + "grad_norm": 0.09326171875, + "learning_rate": 0.0014613078242267333, + "loss": 0.1191, + "step": 36009 + }, + { + "epoch": 0.31258409215197785, + "grad_norm": 0.416015625, + "learning_rate": 0.0014612805652032593, + "loss": 0.0732, + "step": 36010 + }, + { + "epoch": 0.312592772632182, + "grad_norm": 0.240234375, + "learning_rate": 0.0014612533057846943, + "loss": 0.1143, + "step": 36011 + }, + { + "epoch": 0.3126014531123862, + "grad_norm": 0.5, + "learning_rate": 0.001461226045971068, + "loss": 0.1309, + "step": 36012 + }, + { + "epoch": 0.3126101335925903, + "grad_norm": 0.05908203125, + "learning_rate": 0.0014611987857624104, + "loss": 0.0583, + "step": 36013 + }, + { + "epoch": 0.3126188140727945, + "grad_norm": 0.8125, + "learning_rate": 0.0014611715251587514, + "loss": 0.0786, + "step": 36014 + }, + { + "epoch": 0.31262749455299865, + "grad_norm": 0.294921875, + "learning_rate": 0.0014611442641601202, + "loss": 0.1406, + "step": 36015 + }, + { + "epoch": 0.31263617503320285, + "grad_norm": 0.62109375, + "learning_rate": 0.0014611170027665475, + "loss": 0.082, + "step": 36016 + }, + { + "epoch": 0.312644855513407, + "grad_norm": 0.28125, + "learning_rate": 0.0014610897409780622, + "loss": 0.0835, + "step": 36017 + }, + { + "epoch": 0.3126535359936112, + "grad_norm": 0.1962890625, + "learning_rate": 0.0014610624787946948, + "loss": 0.0986, + "step": 36018 + }, + { + "epoch": 0.3126622164738153, + "grad_norm": 0.55078125, + "learning_rate": 0.0014610352162164751, + "loss": 0.1187, + "step": 36019 + }, + { + "epoch": 0.3126708969540195, + "grad_norm": 0.1767578125, + "learning_rate": 0.0014610079532434328, + "loss": 0.1006, + "step": 36020 + }, + { + "epoch": 0.31267957743422364, + "grad_norm": 0.1142578125, + "learning_rate": 0.0014609806898755976, + "loss": 0.1328, + "step": 36021 + }, + { + "epoch": 0.31268825791442784, + "grad_norm": 0.28125, + "learning_rate": 0.001460953426112999, + "loss": 0.0898, + "step": 36022 + }, + { + "epoch": 
0.312696938394632, + "grad_norm": 0.1171875, + "learning_rate": 0.0014609261619556674, + "loss": 0.123, + "step": 36023 + }, + { + "epoch": 0.31270561887483617, + "grad_norm": 0.1494140625, + "learning_rate": 0.001460898897403632, + "loss": 0.103, + "step": 36024 + }, + { + "epoch": 0.3127142993550403, + "grad_norm": 0.294921875, + "learning_rate": 0.0014608716324569235, + "loss": 0.0977, + "step": 36025 + }, + { + "epoch": 0.3127229798352445, + "grad_norm": 0.201171875, + "learning_rate": 0.0014608443671155708, + "loss": 0.1055, + "step": 36026 + }, + { + "epoch": 0.31273166031544863, + "grad_norm": 0.3203125, + "learning_rate": 0.0014608171013796043, + "loss": 0.1768, + "step": 36027 + }, + { + "epoch": 0.3127403407956528, + "grad_norm": 0.2099609375, + "learning_rate": 0.0014607898352490536, + "loss": 0.082, + "step": 36028 + }, + { + "epoch": 0.31274902127585696, + "grad_norm": 0.40234375, + "learning_rate": 0.0014607625687239488, + "loss": 0.1123, + "step": 36029 + }, + { + "epoch": 0.31275770175606116, + "grad_norm": 0.26171875, + "learning_rate": 0.001460735301804319, + "loss": 0.0898, + "step": 36030 + }, + { + "epoch": 0.3127663822362653, + "grad_norm": 0.50390625, + "learning_rate": 0.001460708034490195, + "loss": 0.0977, + "step": 36031 + }, + { + "epoch": 0.3127750627164695, + "grad_norm": 0.0947265625, + "learning_rate": 0.0014606807667816059, + "loss": 0.1035, + "step": 36032 + }, + { + "epoch": 0.3127837431966736, + "grad_norm": 0.314453125, + "learning_rate": 0.0014606534986785816, + "loss": 0.1299, + "step": 36033 + }, + { + "epoch": 0.3127924236768778, + "grad_norm": 0.1796875, + "learning_rate": 0.0014606262301811522, + "loss": 0.1035, + "step": 36034 + }, + { + "epoch": 0.31280110415708196, + "grad_norm": 0.2275390625, + "learning_rate": 0.0014605989612893476, + "loss": 0.1162, + "step": 36035 + }, + { + "epoch": 0.31280978463728615, + "grad_norm": 0.6484375, + "learning_rate": 0.0014605716920031969, + "loss": 0.0859, + "step": 36036 + }, + { + "epoch": 0.3128184651174903, + "grad_norm": 0.1630859375, + "learning_rate": 0.0014605444223227304, + "loss": 0.0996, + "step": 36037 + }, + { + "epoch": 0.3128271455976945, + "grad_norm": 1.0703125, + "learning_rate": 0.0014605171522479783, + "loss": 0.1006, + "step": 36038 + }, + { + "epoch": 0.3128358260778986, + "grad_norm": 0.49609375, + "learning_rate": 0.0014604898817789703, + "loss": 0.1357, + "step": 36039 + }, + { + "epoch": 0.3128445065581028, + "grad_norm": 0.11083984375, + "learning_rate": 0.0014604626109157356, + "loss": 0.0996, + "step": 36040 + }, + { + "epoch": 0.31285318703830695, + "grad_norm": 0.062255859375, + "learning_rate": 0.0014604353396583042, + "loss": 0.0908, + "step": 36041 + }, + { + "epoch": 0.31286186751851114, + "grad_norm": 0.61328125, + "learning_rate": 0.0014604080680067064, + "loss": 0.1523, + "step": 36042 + }, + { + "epoch": 0.3128705479987153, + "grad_norm": 0.75390625, + "learning_rate": 0.0014603807959609719, + "loss": 0.1035, + "step": 36043 + }, + { + "epoch": 0.31287922847891947, + "grad_norm": 0.1484375, + "learning_rate": 0.0014603535235211303, + "loss": 0.0654, + "step": 36044 + }, + { + "epoch": 0.3128879089591236, + "grad_norm": 0.6328125, + "learning_rate": 0.001460326250687211, + "loss": 0.0918, + "step": 36045 + }, + { + "epoch": 0.3128965894393278, + "grad_norm": 0.1123046875, + "learning_rate": 0.0014602989774592445, + "loss": 0.1211, + "step": 36046 + }, + { + "epoch": 0.31290526991953194, + "grad_norm": 0.5703125, + "learning_rate": 0.001460271703837261, + "loss": 0.0996, 
+ "step": 36047 + }, + { + "epoch": 0.31291395039973613, + "grad_norm": 0.0810546875, + "learning_rate": 0.0014602444298212895, + "loss": 0.1016, + "step": 36048 + }, + { + "epoch": 0.31292263087994027, + "grad_norm": 1.25, + "learning_rate": 0.0014602171554113602, + "loss": 0.1582, + "step": 36049 + }, + { + "epoch": 0.31293131136014446, + "grad_norm": 0.2001953125, + "learning_rate": 0.0014601898806075025, + "loss": 0.064, + "step": 36050 + }, + { + "epoch": 0.3129399918403486, + "grad_norm": 0.46484375, + "learning_rate": 0.0014601626054097467, + "loss": 0.1016, + "step": 36051 + }, + { + "epoch": 0.3129486723205528, + "grad_norm": 0.1943359375, + "learning_rate": 0.0014601353298181227, + "loss": 0.0815, + "step": 36052 + }, + { + "epoch": 0.3129573528007569, + "grad_norm": 0.3359375, + "learning_rate": 0.0014601080538326598, + "loss": 0.1338, + "step": 36053 + }, + { + "epoch": 0.3129660332809611, + "grad_norm": 0.095703125, + "learning_rate": 0.0014600807774533884, + "loss": 0.1133, + "step": 36054 + }, + { + "epoch": 0.31297471376116526, + "grad_norm": 0.1806640625, + "learning_rate": 0.001460053500680338, + "loss": 0.1152, + "step": 36055 + }, + { + "epoch": 0.31298339424136945, + "grad_norm": 0.12109375, + "learning_rate": 0.001460026223513538, + "loss": 0.1123, + "step": 36056 + }, + { + "epoch": 0.3129920747215736, + "grad_norm": 0.203125, + "learning_rate": 0.0014599989459530194, + "loss": 0.1055, + "step": 36057 + }, + { + "epoch": 0.3130007552017778, + "grad_norm": 0.1533203125, + "learning_rate": 0.0014599716679988112, + "loss": 0.1621, + "step": 36058 + }, + { + "epoch": 0.3130094356819819, + "grad_norm": 0.1865234375, + "learning_rate": 0.0014599443896509434, + "loss": 0.0771, + "step": 36059 + }, + { + "epoch": 0.3130181161621861, + "grad_norm": 0.216796875, + "learning_rate": 0.0014599171109094457, + "loss": 0.1216, + "step": 36060 + }, + { + "epoch": 0.31302679664239025, + "grad_norm": 0.1572265625, + "learning_rate": 0.0014598898317743482, + "loss": 0.1406, + "step": 36061 + }, + { + "epoch": 0.31303547712259444, + "grad_norm": 0.2177734375, + "learning_rate": 0.0014598625522456805, + "loss": 0.1201, + "step": 36062 + }, + { + "epoch": 0.3130441576027986, + "grad_norm": 0.142578125, + "learning_rate": 0.0014598352723234726, + "loss": 0.0952, + "step": 36063 + }, + { + "epoch": 0.31305283808300277, + "grad_norm": 0.2138671875, + "learning_rate": 0.0014598079920077542, + "loss": 0.125, + "step": 36064 + }, + { + "epoch": 0.3130615185632069, + "grad_norm": 0.2001953125, + "learning_rate": 0.0014597807112985551, + "loss": 0.123, + "step": 36065 + }, + { + "epoch": 0.3130701990434111, + "grad_norm": 0.259765625, + "learning_rate": 0.0014597534301959052, + "loss": 0.1128, + "step": 36066 + }, + { + "epoch": 0.31307887952361524, + "grad_norm": 0.23046875, + "learning_rate": 0.0014597261486998343, + "loss": 0.1562, + "step": 36067 + }, + { + "epoch": 0.31308756000381943, + "grad_norm": 0.10205078125, + "learning_rate": 0.0014596988668103726, + "loss": 0.0918, + "step": 36068 + }, + { + "epoch": 0.31309624048402357, + "grad_norm": 0.0908203125, + "learning_rate": 0.0014596715845275495, + "loss": 0.0947, + "step": 36069 + }, + { + "epoch": 0.31310492096422776, + "grad_norm": 0.15234375, + "learning_rate": 0.0014596443018513952, + "loss": 0.1187, + "step": 36070 + }, + { + "epoch": 0.3131136014444319, + "grad_norm": 0.6953125, + "learning_rate": 0.0014596170187819385, + "loss": 0.0938, + "step": 36071 + }, + { + "epoch": 0.3131222819246361, + "grad_norm": 0.1884765625, + 
"learning_rate": 0.0014595897353192108, + "loss": 0.0996, + "step": 36072 + }, + { + "epoch": 0.31313096240484023, + "grad_norm": 0.54296875, + "learning_rate": 0.0014595624514632411, + "loss": 0.0928, + "step": 36073 + }, + { + "epoch": 0.3131396428850444, + "grad_norm": 0.6015625, + "learning_rate": 0.0014595351672140589, + "loss": 0.1094, + "step": 36074 + }, + { + "epoch": 0.31314832336524856, + "grad_norm": 0.3046875, + "learning_rate": 0.0014595078825716946, + "loss": 0.1108, + "step": 36075 + }, + { + "epoch": 0.31315700384545275, + "grad_norm": 0.396484375, + "learning_rate": 0.0014594805975361779, + "loss": 0.1035, + "step": 36076 + }, + { + "epoch": 0.3131656843256569, + "grad_norm": 0.1455078125, + "learning_rate": 0.001459453312107539, + "loss": 0.0908, + "step": 36077 + }, + { + "epoch": 0.3131743648058611, + "grad_norm": 0.244140625, + "learning_rate": 0.001459426026285807, + "loss": 0.1221, + "step": 36078 + }, + { + "epoch": 0.3131830452860652, + "grad_norm": 0.890625, + "learning_rate": 0.001459398740071012, + "loss": 0.1016, + "step": 36079 + }, + { + "epoch": 0.3131917257662694, + "grad_norm": 0.18359375, + "learning_rate": 0.0014593714534631842, + "loss": 0.0898, + "step": 36080 + }, + { + "epoch": 0.31320040624647355, + "grad_norm": 0.185546875, + "learning_rate": 0.0014593441664623528, + "loss": 0.105, + "step": 36081 + }, + { + "epoch": 0.31320908672667774, + "grad_norm": 0.232421875, + "learning_rate": 0.0014593168790685482, + "loss": 0.0898, + "step": 36082 + }, + { + "epoch": 0.3132177672068819, + "grad_norm": 0.396484375, + "learning_rate": 0.0014592895912818001, + "loss": 0.123, + "step": 36083 + }, + { + "epoch": 0.3132264476870861, + "grad_norm": 0.2041015625, + "learning_rate": 0.0014592623031021387, + "loss": 0.1055, + "step": 36084 + }, + { + "epoch": 0.3132351281672902, + "grad_norm": 0.9765625, + "learning_rate": 0.0014592350145295927, + "loss": 0.0942, + "step": 36085 + }, + { + "epoch": 0.3132438086474944, + "grad_norm": 0.1669921875, + "learning_rate": 0.001459207725564193, + "loss": 0.0923, + "step": 36086 + }, + { + "epoch": 0.31325248912769854, + "grad_norm": 0.220703125, + "learning_rate": 0.0014591804362059694, + "loss": 0.0674, + "step": 36087 + }, + { + "epoch": 0.31326116960790273, + "grad_norm": 0.255859375, + "learning_rate": 0.001459153146454951, + "loss": 0.0713, + "step": 36088 + }, + { + "epoch": 0.31326985008810687, + "grad_norm": 0.609375, + "learning_rate": 0.0014591258563111684, + "loss": 0.1074, + "step": 36089 + }, + { + "epoch": 0.31327853056831106, + "grad_norm": 0.34375, + "learning_rate": 0.0014590985657746511, + "loss": 0.127, + "step": 36090 + }, + { + "epoch": 0.3132872110485152, + "grad_norm": 0.408203125, + "learning_rate": 0.0014590712748454291, + "loss": 0.1133, + "step": 36091 + }, + { + "epoch": 0.3132958915287194, + "grad_norm": 0.3046875, + "learning_rate": 0.0014590439835235323, + "loss": 0.0801, + "step": 36092 + }, + { + "epoch": 0.31330457200892353, + "grad_norm": 0.1865234375, + "learning_rate": 0.00145901669180899, + "loss": 0.0962, + "step": 36093 + }, + { + "epoch": 0.3133132524891277, + "grad_norm": 0.263671875, + "learning_rate": 0.0014589893997018323, + "loss": 0.0825, + "step": 36094 + }, + { + "epoch": 0.31332193296933186, + "grad_norm": 0.1669921875, + "learning_rate": 0.0014589621072020895, + "loss": 0.103, + "step": 36095 + }, + { + "epoch": 0.31333061344953606, + "grad_norm": 0.341796875, + "learning_rate": 0.0014589348143097913, + "loss": 0.0757, + "step": 36096 + }, + { + "epoch": 
0.3133392939297402, + "grad_norm": 0.349609375, + "learning_rate": 0.001458907521024967, + "loss": 0.1006, + "step": 36097 + }, + { + "epoch": 0.3133479744099444, + "grad_norm": 0.42578125, + "learning_rate": 0.001458880227347647, + "loss": 0.1035, + "step": 36098 + }, + { + "epoch": 0.3133566548901485, + "grad_norm": 0.353515625, + "learning_rate": 0.0014588529332778613, + "loss": 0.0762, + "step": 36099 + }, + { + "epoch": 0.3133653353703527, + "grad_norm": 0.2578125, + "learning_rate": 0.001458825638815639, + "loss": 0.1445, + "step": 36100 + }, + { + "epoch": 0.31337401585055685, + "grad_norm": 0.23828125, + "learning_rate": 0.0014587983439610106, + "loss": 0.1426, + "step": 36101 + }, + { + "epoch": 0.313382696330761, + "grad_norm": 0.13671875, + "learning_rate": 0.0014587710487140055, + "loss": 0.082, + "step": 36102 + }, + { + "epoch": 0.3133913768109652, + "grad_norm": 0.58203125, + "learning_rate": 0.0014587437530746538, + "loss": 0.1465, + "step": 36103 + }, + { + "epoch": 0.3134000572911693, + "grad_norm": 0.498046875, + "learning_rate": 0.0014587164570429856, + "loss": 0.1299, + "step": 36104 + }, + { + "epoch": 0.3134087377713735, + "grad_norm": 0.466796875, + "learning_rate": 0.0014586891606190302, + "loss": 0.0801, + "step": 36105 + }, + { + "epoch": 0.31341741825157765, + "grad_norm": 0.3203125, + "learning_rate": 0.0014586618638028177, + "loss": 0.1035, + "step": 36106 + }, + { + "epoch": 0.31342609873178184, + "grad_norm": 0.1640625, + "learning_rate": 0.001458634566594378, + "loss": 0.0967, + "step": 36107 + }, + { + "epoch": 0.313434779211986, + "grad_norm": 0.1162109375, + "learning_rate": 0.0014586072689937413, + "loss": 0.1133, + "step": 36108 + }, + { + "epoch": 0.3134434596921902, + "grad_norm": 0.171875, + "learning_rate": 0.0014585799710009368, + "loss": 0.1162, + "step": 36109 + }, + { + "epoch": 0.3134521401723943, + "grad_norm": 1.0625, + "learning_rate": 0.0014585526726159945, + "loss": 0.2441, + "step": 36110 + }, + { + "epoch": 0.3134608206525985, + "grad_norm": 0.1875, + "learning_rate": 0.0014585253738389446, + "loss": 0.0771, + "step": 36111 + }, + { + "epoch": 0.31346950113280264, + "grad_norm": 0.69140625, + "learning_rate": 0.001458498074669817, + "loss": 0.1113, + "step": 36112 + }, + { + "epoch": 0.31347818161300683, + "grad_norm": 0.83203125, + "learning_rate": 0.0014584707751086406, + "loss": 0.1318, + "step": 36113 + }, + { + "epoch": 0.31348686209321097, + "grad_norm": 1.265625, + "learning_rate": 0.001458443475155446, + "loss": 0.124, + "step": 36114 + }, + { + "epoch": 0.31349554257341516, + "grad_norm": 0.2890625, + "learning_rate": 0.0014584161748102634, + "loss": 0.1562, + "step": 36115 + }, + { + "epoch": 0.3135042230536193, + "grad_norm": 0.197265625, + "learning_rate": 0.0014583888740731222, + "loss": 0.1055, + "step": 36116 + }, + { + "epoch": 0.3135129035338235, + "grad_norm": 0.416015625, + "learning_rate": 0.0014583615729440525, + "loss": 0.0938, + "step": 36117 + }, + { + "epoch": 0.31352158401402763, + "grad_norm": 0.244140625, + "learning_rate": 0.0014583342714230836, + "loss": 0.0913, + "step": 36118 + }, + { + "epoch": 0.3135302644942318, + "grad_norm": 0.6171875, + "learning_rate": 0.0014583069695102457, + "loss": 0.1533, + "step": 36119 + }, + { + "epoch": 0.31353894497443596, + "grad_norm": 0.08642578125, + "learning_rate": 0.001458279667205569, + "loss": 0.0962, + "step": 36120 + }, + { + "epoch": 0.31354762545464016, + "grad_norm": 1.046875, + "learning_rate": 0.001458252364509083, + "loss": 0.168, + "step": 36121 + }, + { 
+ "epoch": 0.3135563059348443, + "grad_norm": 0.345703125, + "learning_rate": 0.0014582250614208173, + "loss": 0.1475, + "step": 36122 + }, + { + "epoch": 0.3135649864150485, + "grad_norm": 0.09130859375, + "learning_rate": 0.0014581977579408022, + "loss": 0.0903, + "step": 36123 + }, + { + "epoch": 0.3135736668952526, + "grad_norm": 0.169921875, + "learning_rate": 0.0014581704540690672, + "loss": 0.1055, + "step": 36124 + }, + { + "epoch": 0.3135823473754568, + "grad_norm": 0.0654296875, + "learning_rate": 0.0014581431498056428, + "loss": 0.085, + "step": 36125 + }, + { + "epoch": 0.31359102785566095, + "grad_norm": 0.140625, + "learning_rate": 0.0014581158451505583, + "loss": 0.1123, + "step": 36126 + }, + { + "epoch": 0.31359970833586515, + "grad_norm": 0.1201171875, + "learning_rate": 0.0014580885401038436, + "loss": 0.0986, + "step": 36127 + }, + { + "epoch": 0.3136083888160693, + "grad_norm": 0.1787109375, + "learning_rate": 0.0014580612346655286, + "loss": 0.1738, + "step": 36128 + }, + { + "epoch": 0.3136170692962735, + "grad_norm": 0.9375, + "learning_rate": 0.0014580339288356435, + "loss": 0.1016, + "step": 36129 + }, + { + "epoch": 0.3136257497764776, + "grad_norm": 0.302734375, + "learning_rate": 0.0014580066226142177, + "loss": 0.105, + "step": 36130 + }, + { + "epoch": 0.3136344302566818, + "grad_norm": 0.498046875, + "learning_rate": 0.001457979316001281, + "loss": 0.1104, + "step": 36131 + }, + { + "epoch": 0.31364311073688594, + "grad_norm": 0.1064453125, + "learning_rate": 0.0014579520089968638, + "loss": 0.1167, + "step": 36132 + }, + { + "epoch": 0.31365179121709014, + "grad_norm": 0.2216796875, + "learning_rate": 0.0014579247016009954, + "loss": 0.0986, + "step": 36133 + }, + { + "epoch": 0.3136604716972943, + "grad_norm": 0.154296875, + "learning_rate": 0.0014578973938137061, + "loss": 0.0977, + "step": 36134 + }, + { + "epoch": 0.31366915217749847, + "grad_norm": 0.1591796875, + "learning_rate": 0.0014578700856350255, + "loss": 0.1123, + "step": 36135 + }, + { + "epoch": 0.3136778326577026, + "grad_norm": 0.341796875, + "learning_rate": 0.0014578427770649837, + "loss": 0.082, + "step": 36136 + }, + { + "epoch": 0.3136865131379068, + "grad_norm": 0.65234375, + "learning_rate": 0.0014578154681036105, + "loss": 0.1094, + "step": 36137 + }, + { + "epoch": 0.31369519361811093, + "grad_norm": 0.4609375, + "learning_rate": 0.0014577881587509354, + "loss": 0.1089, + "step": 36138 + }, + { + "epoch": 0.3137038740983151, + "grad_norm": 0.109375, + "learning_rate": 0.0014577608490069886, + "loss": 0.1104, + "step": 36139 + }, + { + "epoch": 0.31371255457851926, + "grad_norm": 0.1142578125, + "learning_rate": 0.0014577335388718, + "loss": 0.1084, + "step": 36140 + }, + { + "epoch": 0.31372123505872346, + "grad_norm": 0.314453125, + "learning_rate": 0.0014577062283453995, + "loss": 0.0889, + "step": 36141 + }, + { + "epoch": 0.3137299155389276, + "grad_norm": 0.423828125, + "learning_rate": 0.0014576789174278167, + "loss": 0.1055, + "step": 36142 + }, + { + "epoch": 0.3137385960191318, + "grad_norm": 0.365234375, + "learning_rate": 0.0014576516061190815, + "loss": 0.1074, + "step": 36143 + }, + { + "epoch": 0.3137472764993359, + "grad_norm": 0.2119140625, + "learning_rate": 0.0014576242944192243, + "loss": 0.0879, + "step": 36144 + }, + { + "epoch": 0.3137559569795401, + "grad_norm": 0.197265625, + "learning_rate": 0.001457596982328274, + "loss": 0.1177, + "step": 36145 + }, + { + "epoch": 0.31376463745974426, + "grad_norm": 0.1318359375, + "learning_rate": 0.0014575696698462613, 
+ "loss": 0.0889, + "step": 36146 + }, + { + "epoch": 0.31377331793994845, + "grad_norm": 0.419921875, + "learning_rate": 0.0014575423569732158, + "loss": 0.1328, + "step": 36147 + }, + { + "epoch": 0.3137819984201526, + "grad_norm": 0.25390625, + "learning_rate": 0.0014575150437091673, + "loss": 0.1113, + "step": 36148 + }, + { + "epoch": 0.3137906789003568, + "grad_norm": 0.3125, + "learning_rate": 0.0014574877300541455, + "loss": 0.1152, + "step": 36149 + }, + { + "epoch": 0.3137993593805609, + "grad_norm": 0.1142578125, + "learning_rate": 0.0014574604160081809, + "loss": 0.0771, + "step": 36150 + }, + { + "epoch": 0.3138080398607651, + "grad_norm": 0.33984375, + "learning_rate": 0.0014574331015713025, + "loss": 0.1074, + "step": 36151 + }, + { + "epoch": 0.31381672034096925, + "grad_norm": 0.322265625, + "learning_rate": 0.0014574057867435411, + "loss": 0.1357, + "step": 36152 + }, + { + "epoch": 0.31382540082117344, + "grad_norm": 0.40234375, + "learning_rate": 0.0014573784715249257, + "loss": 0.1001, + "step": 36153 + }, + { + "epoch": 0.3138340813013776, + "grad_norm": 0.359375, + "learning_rate": 0.0014573511559154866, + "loss": 0.1191, + "step": 36154 + }, + { + "epoch": 0.31384276178158177, + "grad_norm": 0.0888671875, + "learning_rate": 0.0014573238399152543, + "loss": 0.0898, + "step": 36155 + }, + { + "epoch": 0.3138514422617859, + "grad_norm": 0.298828125, + "learning_rate": 0.0014572965235242574, + "loss": 0.0801, + "step": 36156 + }, + { + "epoch": 0.3138601227419901, + "grad_norm": 0.353515625, + "learning_rate": 0.0014572692067425263, + "loss": 0.1084, + "step": 36157 + }, + { + "epoch": 0.31386880322219424, + "grad_norm": 0.39453125, + "learning_rate": 0.0014572418895700912, + "loss": 0.0884, + "step": 36158 + }, + { + "epoch": 0.31387748370239843, + "grad_norm": 0.443359375, + "learning_rate": 0.0014572145720069818, + "loss": 0.0913, + "step": 36159 + }, + { + "epoch": 0.31388616418260257, + "grad_norm": 0.66796875, + "learning_rate": 0.001457187254053228, + "loss": 0.1699, + "step": 36160 + }, + { + "epoch": 0.31389484466280676, + "grad_norm": 0.259765625, + "learning_rate": 0.0014571599357088591, + "loss": 0.0923, + "step": 36161 + }, + { + "epoch": 0.3139035251430109, + "grad_norm": 0.2490234375, + "learning_rate": 0.0014571326169739056, + "loss": 0.1094, + "step": 36162 + }, + { + "epoch": 0.3139122056232151, + "grad_norm": 0.5234375, + "learning_rate": 0.0014571052978483974, + "loss": 0.1094, + "step": 36163 + }, + { + "epoch": 0.3139208861034192, + "grad_norm": 0.10302734375, + "learning_rate": 0.0014570779783323643, + "loss": 0.0869, + "step": 36164 + }, + { + "epoch": 0.3139295665836234, + "grad_norm": 0.404296875, + "learning_rate": 0.001457050658425836, + "loss": 0.1143, + "step": 36165 + }, + { + "epoch": 0.31393824706382756, + "grad_norm": 0.10546875, + "learning_rate": 0.0014570233381288424, + "loss": 0.0923, + "step": 36166 + }, + { + "epoch": 0.31394692754403175, + "grad_norm": 0.38671875, + "learning_rate": 0.0014569960174414139, + "loss": 0.0776, + "step": 36167 + }, + { + "epoch": 0.3139556080242359, + "grad_norm": 0.2890625, + "learning_rate": 0.0014569686963635793, + "loss": 0.1182, + "step": 36168 + }, + { + "epoch": 0.3139642885044401, + "grad_norm": 0.5078125, + "learning_rate": 0.0014569413748953695, + "loss": 0.1387, + "step": 36169 + }, + { + "epoch": 0.3139729689846442, + "grad_norm": 0.1064453125, + "learning_rate": 0.0014569140530368138, + "loss": 0.1299, + "step": 36170 + }, + { + "epoch": 0.3139816494648484, + "grad_norm": 0.2177734375, + 
"learning_rate": 0.001456886730787942, + "loss": 0.0908, + "step": 36171 + }, + { + "epoch": 0.31399032994505255, + "grad_norm": 0.11328125, + "learning_rate": 0.0014568594081487845, + "loss": 0.1025, + "step": 36172 + }, + { + "epoch": 0.31399901042525674, + "grad_norm": 0.37890625, + "learning_rate": 0.0014568320851193709, + "loss": 0.1162, + "step": 36173 + }, + { + "epoch": 0.3140076909054609, + "grad_norm": 0.390625, + "learning_rate": 0.0014568047616997314, + "loss": 0.082, + "step": 36174 + }, + { + "epoch": 0.31401637138566507, + "grad_norm": 0.1162109375, + "learning_rate": 0.001456777437889895, + "loss": 0.0752, + "step": 36175 + }, + { + "epoch": 0.3140250518658692, + "grad_norm": 0.578125, + "learning_rate": 0.0014567501136898927, + "loss": 0.0908, + "step": 36176 + }, + { + "epoch": 0.3140337323460734, + "grad_norm": 0.1708984375, + "learning_rate": 0.0014567227890997533, + "loss": 0.1123, + "step": 36177 + }, + { + "epoch": 0.31404241282627754, + "grad_norm": 0.359375, + "learning_rate": 0.0014566954641195076, + "loss": 0.0947, + "step": 36178 + }, + { + "epoch": 0.31405109330648173, + "grad_norm": 0.1962890625, + "learning_rate": 0.0014566681387491848, + "loss": 0.0928, + "step": 36179 + }, + { + "epoch": 0.31405977378668587, + "grad_norm": 0.1240234375, + "learning_rate": 0.0014566408129888153, + "loss": 0.1016, + "step": 36180 + }, + { + "epoch": 0.31406845426689006, + "grad_norm": 0.11865234375, + "learning_rate": 0.0014566134868384286, + "loss": 0.0918, + "step": 36181 + }, + { + "epoch": 0.3140771347470942, + "grad_norm": 0.1552734375, + "learning_rate": 0.001456586160298055, + "loss": 0.0757, + "step": 36182 + }, + { + "epoch": 0.3140858152272984, + "grad_norm": 0.4140625, + "learning_rate": 0.001456558833367724, + "loss": 0.0869, + "step": 36183 + }, + { + "epoch": 0.31409449570750253, + "grad_norm": 0.89453125, + "learning_rate": 0.0014565315060474655, + "loss": 0.0898, + "step": 36184 + }, + { + "epoch": 0.3141031761877067, + "grad_norm": 0.11376953125, + "learning_rate": 0.0014565041783373097, + "loss": 0.0854, + "step": 36185 + }, + { + "epoch": 0.31411185666791086, + "grad_norm": 0.1171875, + "learning_rate": 0.0014564768502372863, + "loss": 0.1104, + "step": 36186 + }, + { + "epoch": 0.31412053714811505, + "grad_norm": 0.390625, + "learning_rate": 0.0014564495217474253, + "loss": 0.0713, + "step": 36187 + }, + { + "epoch": 0.3141292176283192, + "grad_norm": 0.1123046875, + "learning_rate": 0.0014564221928677558, + "loss": 0.0942, + "step": 36188 + }, + { + "epoch": 0.3141378981085234, + "grad_norm": 0.416015625, + "learning_rate": 0.001456394863598309, + "loss": 0.0913, + "step": 36189 + }, + { + "epoch": 0.3141465785887275, + "grad_norm": 0.3125, + "learning_rate": 0.0014563675339391137, + "loss": 0.1201, + "step": 36190 + }, + { + "epoch": 0.3141552590689317, + "grad_norm": 0.1806640625, + "learning_rate": 0.0014563402038902007, + "loss": 0.085, + "step": 36191 + }, + { + "epoch": 0.31416393954913585, + "grad_norm": 0.458984375, + "learning_rate": 0.001456312873451599, + "loss": 0.1182, + "step": 36192 + }, + { + "epoch": 0.31417262002934004, + "grad_norm": 0.08203125, + "learning_rate": 0.001456285542623339, + "loss": 0.0742, + "step": 36193 + }, + { + "epoch": 0.3141813005095442, + "grad_norm": 0.439453125, + "learning_rate": 0.0014562582114054506, + "loss": 0.0742, + "step": 36194 + }, + { + "epoch": 0.3141899809897484, + "grad_norm": 0.1533203125, + "learning_rate": 0.0014562308797979636, + "loss": 0.1182, + "step": 36195 + }, + { + "epoch": 
0.3141986614699525, + "grad_norm": 1.3046875, + "learning_rate": 0.0014562035478009076, + "loss": 0.1387, + "step": 36196 + }, + { + "epoch": 0.3142073419501567, + "grad_norm": 0.2890625, + "learning_rate": 0.0014561762154143127, + "loss": 0.1113, + "step": 36197 + }, + { + "epoch": 0.31421602243036084, + "grad_norm": 0.130859375, + "learning_rate": 0.0014561488826382095, + "loss": 0.0947, + "step": 36198 + }, + { + "epoch": 0.31422470291056503, + "grad_norm": 0.55859375, + "learning_rate": 0.0014561215494726266, + "loss": 0.1621, + "step": 36199 + }, + { + "epoch": 0.31423338339076917, + "grad_norm": 0.2392578125, + "learning_rate": 0.0014560942159175948, + "loss": 0.0972, + "step": 36200 + }, + { + "epoch": 0.31424206387097336, + "grad_norm": 0.208984375, + "learning_rate": 0.0014560668819731436, + "loss": 0.1289, + "step": 36201 + }, + { + "epoch": 0.3142507443511775, + "grad_norm": 0.8046875, + "learning_rate": 0.001456039547639303, + "loss": 0.103, + "step": 36202 + }, + { + "epoch": 0.3142594248313817, + "grad_norm": 0.1640625, + "learning_rate": 0.001456012212916103, + "loss": 0.1152, + "step": 36203 + }, + { + "epoch": 0.31426810531158583, + "grad_norm": 0.388671875, + "learning_rate": 0.0014559848778035731, + "loss": 0.1396, + "step": 36204 + }, + { + "epoch": 0.31427678579179, + "grad_norm": 0.37109375, + "learning_rate": 0.0014559575423017438, + "loss": 0.1162, + "step": 36205 + }, + { + "epoch": 0.31428546627199416, + "grad_norm": 0.09765625, + "learning_rate": 0.0014559302064106445, + "loss": 0.1123, + "step": 36206 + }, + { + "epoch": 0.31429414675219836, + "grad_norm": 0.1455078125, + "learning_rate": 0.0014559028701303057, + "loss": 0.0776, + "step": 36207 + }, + { + "epoch": 0.3143028272324025, + "grad_norm": 0.240234375, + "learning_rate": 0.0014558755334607564, + "loss": 0.1201, + "step": 36208 + }, + { + "epoch": 0.3143115077126067, + "grad_norm": 0.5078125, + "learning_rate": 0.0014558481964020268, + "loss": 0.1245, + "step": 36209 + }, + { + "epoch": 0.3143201881928108, + "grad_norm": 0.318359375, + "learning_rate": 0.0014558208589541472, + "loss": 0.1484, + "step": 36210 + }, + { + "epoch": 0.314328868673015, + "grad_norm": 0.345703125, + "learning_rate": 0.0014557935211171476, + "loss": 0.1406, + "step": 36211 + }, + { + "epoch": 0.31433754915321915, + "grad_norm": 0.431640625, + "learning_rate": 0.0014557661828910574, + "loss": 0.0869, + "step": 36212 + }, + { + "epoch": 0.31434622963342335, + "grad_norm": 0.6484375, + "learning_rate": 0.0014557388442759064, + "loss": 0.0767, + "step": 36213 + }, + { + "epoch": 0.3143549101136275, + "grad_norm": 0.23828125, + "learning_rate": 0.0014557115052717248, + "loss": 0.1221, + "step": 36214 + }, + { + "epoch": 0.3143635905938317, + "grad_norm": 0.1044921875, + "learning_rate": 0.0014556841658785425, + "loss": 0.1162, + "step": 36215 + }, + { + "epoch": 0.3143722710740358, + "grad_norm": 0.1982421875, + "learning_rate": 0.0014556568260963895, + "loss": 0.1162, + "step": 36216 + }, + { + "epoch": 0.31438095155424, + "grad_norm": 0.1904296875, + "learning_rate": 0.0014556294859252952, + "loss": 0.0786, + "step": 36217 + }, + { + "epoch": 0.31438963203444414, + "grad_norm": 0.8515625, + "learning_rate": 0.0014556021453652902, + "loss": 0.0889, + "step": 36218 + }, + { + "epoch": 0.31439831251464834, + "grad_norm": 0.16796875, + "learning_rate": 0.0014555748044164036, + "loss": 0.0996, + "step": 36219 + }, + { + "epoch": 0.3144069929948525, + "grad_norm": 0.83984375, + "learning_rate": 0.0014555474630786663, + "loss": 0.1084, + 
"step": 36220 + }, + { + "epoch": 0.31441567347505667, + "grad_norm": 0.265625, + "learning_rate": 0.0014555201213521073, + "loss": 0.1152, + "step": 36221 + }, + { + "epoch": 0.3144243539552608, + "grad_norm": 0.65234375, + "learning_rate": 0.0014554927792367566, + "loss": 0.1147, + "step": 36222 + }, + { + "epoch": 0.314433034435465, + "grad_norm": 1.0390625, + "learning_rate": 0.0014554654367326448, + "loss": 0.0884, + "step": 36223 + }, + { + "epoch": 0.31444171491566913, + "grad_norm": 0.41796875, + "learning_rate": 0.0014554380938398008, + "loss": 0.1084, + "step": 36224 + }, + { + "epoch": 0.3144503953958733, + "grad_norm": 0.4140625, + "learning_rate": 0.0014554107505582556, + "loss": 0.1309, + "step": 36225 + }, + { + "epoch": 0.31445907587607747, + "grad_norm": 0.29296875, + "learning_rate": 0.0014553834068880384, + "loss": 0.1045, + "step": 36226 + }, + { + "epoch": 0.3144677563562816, + "grad_norm": 0.1630859375, + "learning_rate": 0.001455356062829179, + "loss": 0.1426, + "step": 36227 + }, + { + "epoch": 0.3144764368364858, + "grad_norm": 0.482421875, + "learning_rate": 0.0014553287183817077, + "loss": 0.1162, + "step": 36228 + }, + { + "epoch": 0.31448511731668993, + "grad_norm": 0.0966796875, + "learning_rate": 0.0014553013735456545, + "loss": 0.1123, + "step": 36229 + }, + { + "epoch": 0.3144937977968941, + "grad_norm": 0.33984375, + "learning_rate": 0.0014552740283210485, + "loss": 0.1289, + "step": 36230 + }, + { + "epoch": 0.31450247827709826, + "grad_norm": 0.1865234375, + "learning_rate": 0.0014552466827079205, + "loss": 0.1099, + "step": 36231 + }, + { + "epoch": 0.31451115875730246, + "grad_norm": 0.37890625, + "learning_rate": 0.0014552193367063002, + "loss": 0.1055, + "step": 36232 + }, + { + "epoch": 0.3145198392375066, + "grad_norm": 0.39453125, + "learning_rate": 0.0014551919903162173, + "loss": 0.0811, + "step": 36233 + }, + { + "epoch": 0.3145285197177108, + "grad_norm": 0.193359375, + "learning_rate": 0.0014551646435377018, + "loss": 0.1216, + "step": 36234 + }, + { + "epoch": 0.3145372001979149, + "grad_norm": 0.328125, + "learning_rate": 0.0014551372963707832, + "loss": 0.1226, + "step": 36235 + }, + { + "epoch": 0.3145458806781191, + "grad_norm": 0.12451171875, + "learning_rate": 0.001455109948815492, + "loss": 0.1328, + "step": 36236 + }, + { + "epoch": 0.31455456115832325, + "grad_norm": 0.126953125, + "learning_rate": 0.001455082600871858, + "loss": 0.1001, + "step": 36237 + }, + { + "epoch": 0.31456324163852745, + "grad_norm": 0.349609375, + "learning_rate": 0.001455055252539911, + "loss": 0.1201, + "step": 36238 + }, + { + "epoch": 0.3145719221187316, + "grad_norm": 0.08837890625, + "learning_rate": 0.001455027903819681, + "loss": 0.0972, + "step": 36239 + }, + { + "epoch": 0.3145806025989358, + "grad_norm": 0.1708984375, + "learning_rate": 0.0014550005547111975, + "loss": 0.0996, + "step": 36240 + }, + { + "epoch": 0.3145892830791399, + "grad_norm": 0.205078125, + "learning_rate": 0.0014549732052144913, + "loss": 0.124, + "step": 36241 + }, + { + "epoch": 0.3145979635593441, + "grad_norm": 0.32421875, + "learning_rate": 0.001454945855329591, + "loss": 0.0967, + "step": 36242 + }, + { + "epoch": 0.31460664403954824, + "grad_norm": 0.09765625, + "learning_rate": 0.001454918505056528, + "loss": 0.0869, + "step": 36243 + }, + { + "epoch": 0.31461532451975244, + "grad_norm": 0.2451171875, + "learning_rate": 0.0014548911543953308, + "loss": 0.1807, + "step": 36244 + }, + { + "epoch": 0.3146240049999566, + "grad_norm": 0.19140625, + "learning_rate": 
0.0014548638033460304, + "loss": 0.0825, + "step": 36245 + }, + { + "epoch": 0.31463268548016077, + "grad_norm": 0.126953125, + "learning_rate": 0.0014548364519086558, + "loss": 0.0708, + "step": 36246 + }, + { + "epoch": 0.3146413659603649, + "grad_norm": 0.30078125, + "learning_rate": 0.0014548091000832378, + "loss": 0.1021, + "step": 36247 + }, + { + "epoch": 0.3146500464405691, + "grad_norm": 0.58203125, + "learning_rate": 0.0014547817478698056, + "loss": 0.0869, + "step": 36248 + }, + { + "epoch": 0.31465872692077324, + "grad_norm": 0.1220703125, + "learning_rate": 0.0014547543952683898, + "loss": 0.085, + "step": 36249 + }, + { + "epoch": 0.31466740740097743, + "grad_norm": 0.7109375, + "learning_rate": 0.0014547270422790198, + "loss": 0.1133, + "step": 36250 + }, + { + "epoch": 0.31467608788118157, + "grad_norm": 0.443359375, + "learning_rate": 0.0014546996889017256, + "loss": 0.0737, + "step": 36251 + }, + { + "epoch": 0.31468476836138576, + "grad_norm": 1.0546875, + "learning_rate": 0.0014546723351365372, + "loss": 0.2617, + "step": 36252 + }, + { + "epoch": 0.3146934488415899, + "grad_norm": 0.2734375, + "learning_rate": 0.001454644980983484, + "loss": 0.1216, + "step": 36253 + }, + { + "epoch": 0.3147021293217941, + "grad_norm": 0.515625, + "learning_rate": 0.001454617626442597, + "loss": 0.1348, + "step": 36254 + }, + { + "epoch": 0.3147108098019982, + "grad_norm": 0.240234375, + "learning_rate": 0.0014545902715139053, + "loss": 0.1182, + "step": 36255 + }, + { + "epoch": 0.3147194902822024, + "grad_norm": 0.228515625, + "learning_rate": 0.0014545629161974388, + "loss": 0.1816, + "step": 36256 + }, + { + "epoch": 0.31472817076240656, + "grad_norm": 0.259765625, + "learning_rate": 0.001454535560493228, + "loss": 0.1182, + "step": 36257 + }, + { + "epoch": 0.31473685124261075, + "grad_norm": 0.330078125, + "learning_rate": 0.0014545082044013018, + "loss": 0.1211, + "step": 36258 + }, + { + "epoch": 0.3147455317228149, + "grad_norm": 0.515625, + "learning_rate": 0.0014544808479216914, + "loss": 0.1455, + "step": 36259 + }, + { + "epoch": 0.3147542122030191, + "grad_norm": 0.181640625, + "learning_rate": 0.0014544534910544257, + "loss": 0.0967, + "step": 36260 + }, + { + "epoch": 0.3147628926832232, + "grad_norm": 0.24609375, + "learning_rate": 0.0014544261337995352, + "loss": 0.1021, + "step": 36261 + }, + { + "epoch": 0.3147715731634274, + "grad_norm": 0.1337890625, + "learning_rate": 0.0014543987761570494, + "loss": 0.1099, + "step": 36262 + }, + { + "epoch": 0.31478025364363155, + "grad_norm": 0.115234375, + "learning_rate": 0.0014543714181269988, + "loss": 0.1064, + "step": 36263 + }, + { + "epoch": 0.31478893412383574, + "grad_norm": 0.1640625, + "learning_rate": 0.0014543440597094125, + "loss": 0.0811, + "step": 36264 + }, + { + "epoch": 0.3147976146040399, + "grad_norm": 0.2890625, + "learning_rate": 0.0014543167009043211, + "loss": 0.0957, + "step": 36265 + }, + { + "epoch": 0.31480629508424407, + "grad_norm": 0.1572265625, + "learning_rate": 0.0014542893417117545, + "loss": 0.1055, + "step": 36266 + }, + { + "epoch": 0.3148149755644482, + "grad_norm": 0.09814453125, + "learning_rate": 0.0014542619821317417, + "loss": 0.1113, + "step": 36267 + }, + { + "epoch": 0.3148236560446524, + "grad_norm": 0.28515625, + "learning_rate": 0.001454234622164314, + "loss": 0.0918, + "step": 36268 + }, + { + "epoch": 0.31483233652485654, + "grad_norm": 0.2490234375, + "learning_rate": 0.0014542072618095004, + "loss": 0.0947, + "step": 36269 + }, + { + "epoch": 0.31484101700506073, + 
"grad_norm": 0.146484375, + "learning_rate": 0.0014541799010673309, + "loss": 0.0903, + "step": 36270 + }, + { + "epoch": 0.31484969748526487, + "grad_norm": 0.27734375, + "learning_rate": 0.001454152539937836, + "loss": 0.0864, + "step": 36271 + }, + { + "epoch": 0.31485837796546906, + "grad_norm": 0.236328125, + "learning_rate": 0.0014541251784210448, + "loss": 0.0918, + "step": 36272 + }, + { + "epoch": 0.3148670584456732, + "grad_norm": 0.267578125, + "learning_rate": 0.0014540978165169878, + "loss": 0.1719, + "step": 36273 + }, + { + "epoch": 0.3148757389258774, + "grad_norm": 0.6328125, + "learning_rate": 0.0014540704542256946, + "loss": 0.1191, + "step": 36274 + }, + { + "epoch": 0.31488441940608153, + "grad_norm": 0.41015625, + "learning_rate": 0.0014540430915471955, + "loss": 0.1523, + "step": 36275 + }, + { + "epoch": 0.3148930998862857, + "grad_norm": 0.158203125, + "learning_rate": 0.0014540157284815198, + "loss": 0.1328, + "step": 36276 + }, + { + "epoch": 0.31490178036648986, + "grad_norm": 0.396484375, + "learning_rate": 0.0014539883650286983, + "loss": 0.1235, + "step": 36277 + }, + { + "epoch": 0.31491046084669405, + "grad_norm": 0.310546875, + "learning_rate": 0.0014539610011887604, + "loss": 0.0806, + "step": 36278 + }, + { + "epoch": 0.3149191413268982, + "grad_norm": 0.1513671875, + "learning_rate": 0.0014539336369617354, + "loss": 0.1011, + "step": 36279 + }, + { + "epoch": 0.3149278218071024, + "grad_norm": 1.703125, + "learning_rate": 0.0014539062723476549, + "loss": 0.125, + "step": 36280 + }, + { + "epoch": 0.3149365022873065, + "grad_norm": 0.10205078125, + "learning_rate": 0.001453878907346547, + "loss": 0.0933, + "step": 36281 + }, + { + "epoch": 0.3149451827675107, + "grad_norm": 0.40234375, + "learning_rate": 0.0014538515419584431, + "loss": 0.1152, + "step": 36282 + }, + { + "epoch": 0.31495386324771485, + "grad_norm": 0.8359375, + "learning_rate": 0.001453824176183372, + "loss": 0.0698, + "step": 36283 + }, + { + "epoch": 0.31496254372791904, + "grad_norm": 0.09912109375, + "learning_rate": 0.0014537968100213643, + "loss": 0.0957, + "step": 36284 + }, + { + "epoch": 0.3149712242081232, + "grad_norm": 0.27734375, + "learning_rate": 0.0014537694434724495, + "loss": 0.1074, + "step": 36285 + }, + { + "epoch": 0.3149799046883274, + "grad_norm": 0.62890625, + "learning_rate": 0.001453742076536658, + "loss": 0.1172, + "step": 36286 + }, + { + "epoch": 0.3149885851685315, + "grad_norm": 0.0791015625, + "learning_rate": 0.001453714709214019, + "loss": 0.0913, + "step": 36287 + }, + { + "epoch": 0.3149972656487357, + "grad_norm": 0.337890625, + "learning_rate": 0.0014536873415045635, + "loss": 0.0801, + "step": 36288 + }, + { + "epoch": 0.31500594612893984, + "grad_norm": 0.35546875, + "learning_rate": 0.0014536599734083207, + "loss": 0.0947, + "step": 36289 + }, + { + "epoch": 0.31501462660914403, + "grad_norm": 0.205078125, + "learning_rate": 0.0014536326049253206, + "loss": 0.1123, + "step": 36290 + }, + { + "epoch": 0.31502330708934817, + "grad_norm": 0.0927734375, + "learning_rate": 0.0014536052360555934, + "loss": 0.1196, + "step": 36291 + }, + { + "epoch": 0.31503198756955236, + "grad_norm": 0.263671875, + "learning_rate": 0.0014535778667991684, + "loss": 0.1289, + "step": 36292 + }, + { + "epoch": 0.3150406680497565, + "grad_norm": 0.13671875, + "learning_rate": 0.0014535504971560761, + "loss": 0.1865, + "step": 36293 + }, + { + "epoch": 0.3150493485299607, + "grad_norm": 0.1171875, + "learning_rate": 0.0014535231271263465, + "loss": 0.0928, + "step": 36294 
+ }, + { + "epoch": 0.31505802901016483, + "grad_norm": 0.263671875, + "learning_rate": 0.001453495756710009, + "loss": 0.1104, + "step": 36295 + }, + { + "epoch": 0.315066709490369, + "grad_norm": 0.138671875, + "learning_rate": 0.0014534683859070941, + "loss": 0.0977, + "step": 36296 + }, + { + "epoch": 0.31507538997057316, + "grad_norm": 0.87109375, + "learning_rate": 0.0014534410147176314, + "loss": 0.1025, + "step": 36297 + }, + { + "epoch": 0.31508407045077735, + "grad_norm": 0.484375, + "learning_rate": 0.0014534136431416512, + "loss": 0.1318, + "step": 36298 + }, + { + "epoch": 0.3150927509309815, + "grad_norm": 0.11572265625, + "learning_rate": 0.0014533862711791827, + "loss": 0.1045, + "step": 36299 + }, + { + "epoch": 0.3151014314111857, + "grad_norm": 0.1494140625, + "learning_rate": 0.0014533588988302564, + "loss": 0.1533, + "step": 36300 + }, + { + "epoch": 0.3151101118913898, + "grad_norm": 0.1416015625, + "learning_rate": 0.0014533315260949022, + "loss": 0.1016, + "step": 36301 + }, + { + "epoch": 0.315118792371594, + "grad_norm": 0.146484375, + "learning_rate": 0.0014533041529731499, + "loss": 0.1016, + "step": 36302 + }, + { + "epoch": 0.31512747285179815, + "grad_norm": 0.0947265625, + "learning_rate": 0.0014532767794650297, + "loss": 0.0732, + "step": 36303 + }, + { + "epoch": 0.31513615333200234, + "grad_norm": 0.126953125, + "learning_rate": 0.001453249405570571, + "loss": 0.1045, + "step": 36304 + }, + { + "epoch": 0.3151448338122065, + "grad_norm": 0.369140625, + "learning_rate": 0.0014532220312898042, + "loss": 0.1011, + "step": 36305 + }, + { + "epoch": 0.3151535142924107, + "grad_norm": 0.63671875, + "learning_rate": 0.001453194656622759, + "loss": 0.1309, + "step": 36306 + }, + { + "epoch": 0.3151621947726148, + "grad_norm": 0.39453125, + "learning_rate": 0.0014531672815694657, + "loss": 0.1118, + "step": 36307 + }, + { + "epoch": 0.315170875252819, + "grad_norm": 0.35546875, + "learning_rate": 0.0014531399061299538, + "loss": 0.1094, + "step": 36308 + }, + { + "epoch": 0.31517955573302314, + "grad_norm": 0.0771484375, + "learning_rate": 0.0014531125303042533, + "loss": 0.125, + "step": 36309 + }, + { + "epoch": 0.31518823621322734, + "grad_norm": 0.109375, + "learning_rate": 0.0014530851540923945, + "loss": 0.0884, + "step": 36310 + }, + { + "epoch": 0.3151969166934315, + "grad_norm": 0.34765625, + "learning_rate": 0.001453057777494407, + "loss": 0.1318, + "step": 36311 + }, + { + "epoch": 0.31520559717363567, + "grad_norm": 0.10791015625, + "learning_rate": 0.0014530304005103208, + "loss": 0.1436, + "step": 36312 + }, + { + "epoch": 0.3152142776538398, + "grad_norm": 0.4296875, + "learning_rate": 0.0014530030231401654, + "loss": 0.085, + "step": 36313 + }, + { + "epoch": 0.315222958134044, + "grad_norm": 0.185546875, + "learning_rate": 0.0014529756453839718, + "loss": 0.1357, + "step": 36314 + }, + { + "epoch": 0.31523163861424813, + "grad_norm": 0.1796875, + "learning_rate": 0.001452948267241769, + "loss": 0.0884, + "step": 36315 + }, + { + "epoch": 0.3152403190944523, + "grad_norm": 0.1923828125, + "learning_rate": 0.0014529208887135874, + "loss": 0.1455, + "step": 36316 + }, + { + "epoch": 0.31524899957465646, + "grad_norm": 0.45703125, + "learning_rate": 0.0014528935097994568, + "loss": 0.1006, + "step": 36317 + }, + { + "epoch": 0.31525768005486066, + "grad_norm": 0.26953125, + "learning_rate": 0.0014528661304994072, + "loss": 0.0933, + "step": 36318 + }, + { + "epoch": 0.3152663605350648, + "grad_norm": 0.482421875, + "learning_rate": 
0.0014528387508134686, + "loss": 0.123, + "step": 36319 + }, + { + "epoch": 0.315275041015269, + "grad_norm": 0.0888671875, + "learning_rate": 0.001452811370741671, + "loss": 0.0664, + "step": 36320 + }, + { + "epoch": 0.3152837214954731, + "grad_norm": 0.1796875, + "learning_rate": 0.0014527839902840438, + "loss": 0.0947, + "step": 36321 + }, + { + "epoch": 0.3152924019756773, + "grad_norm": 0.56640625, + "learning_rate": 0.0014527566094406173, + "loss": 0.1387, + "step": 36322 + }, + { + "epoch": 0.31530108245588145, + "grad_norm": 0.1162109375, + "learning_rate": 0.0014527292282114218, + "loss": 0.0986, + "step": 36323 + }, + { + "epoch": 0.31530976293608565, + "grad_norm": 0.0810546875, + "learning_rate": 0.0014527018465964865, + "loss": 0.082, + "step": 36324 + }, + { + "epoch": 0.3153184434162898, + "grad_norm": 0.380859375, + "learning_rate": 0.001452674464595842, + "loss": 0.1406, + "step": 36325 + }, + { + "epoch": 0.315327123896494, + "grad_norm": 0.1826171875, + "learning_rate": 0.001452647082209518, + "loss": 0.1128, + "step": 36326 + }, + { + "epoch": 0.3153358043766981, + "grad_norm": 0.1240234375, + "learning_rate": 0.0014526196994375444, + "loss": 0.0967, + "step": 36327 + }, + { + "epoch": 0.3153444848569023, + "grad_norm": 0.63671875, + "learning_rate": 0.0014525923162799514, + "loss": 0.085, + "step": 36328 + }, + { + "epoch": 0.31535316533710644, + "grad_norm": 0.18359375, + "learning_rate": 0.0014525649327367686, + "loss": 0.1309, + "step": 36329 + }, + { + "epoch": 0.31536184581731064, + "grad_norm": 0.294921875, + "learning_rate": 0.0014525375488080262, + "loss": 0.0918, + "step": 36330 + }, + { + "epoch": 0.3153705262975148, + "grad_norm": 0.12158203125, + "learning_rate": 0.0014525101644937539, + "loss": 0.124, + "step": 36331 + }, + { + "epoch": 0.31537920677771897, + "grad_norm": 0.31640625, + "learning_rate": 0.0014524827797939817, + "loss": 0.1279, + "step": 36332 + }, + { + "epoch": 0.3153878872579231, + "grad_norm": 0.36328125, + "learning_rate": 0.0014524553947087395, + "loss": 0.1025, + "step": 36333 + }, + { + "epoch": 0.3153965677381273, + "grad_norm": 0.21875, + "learning_rate": 0.0014524280092380578, + "loss": 0.0967, + "step": 36334 + }, + { + "epoch": 0.31540524821833144, + "grad_norm": 0.275390625, + "learning_rate": 0.0014524006233819658, + "loss": 0.0977, + "step": 36335 + }, + { + "epoch": 0.31541392869853563, + "grad_norm": 0.1708984375, + "learning_rate": 0.0014523732371404942, + "loss": 0.1299, + "step": 36336 + }, + { + "epoch": 0.31542260917873977, + "grad_norm": 0.8203125, + "learning_rate": 0.001452345850513672, + "loss": 0.1875, + "step": 36337 + }, + { + "epoch": 0.31543128965894396, + "grad_norm": 0.267578125, + "learning_rate": 0.0014523184635015299, + "loss": 0.127, + "step": 36338 + }, + { + "epoch": 0.3154399701391481, + "grad_norm": 0.43359375, + "learning_rate": 0.0014522910761040977, + "loss": 0.0732, + "step": 36339 + }, + { + "epoch": 0.3154486506193523, + "grad_norm": 0.57421875, + "learning_rate": 0.0014522636883214052, + "loss": 0.1133, + "step": 36340 + }, + { + "epoch": 0.3154573310995564, + "grad_norm": 0.197265625, + "learning_rate": 0.0014522363001534823, + "loss": 0.127, + "step": 36341 + }, + { + "epoch": 0.3154660115797606, + "grad_norm": 0.130859375, + "learning_rate": 0.0014522089116003592, + "loss": 0.0796, + "step": 36342 + }, + { + "epoch": 0.31547469205996476, + "grad_norm": 0.51953125, + "learning_rate": 0.0014521815226620656, + "loss": 0.1104, + "step": 36343 + }, + { + "epoch": 0.31548337254016895, + 
"grad_norm": 0.50390625, + "learning_rate": 0.0014521541333386316, + "loss": 0.1299, + "step": 36344 + }, + { + "epoch": 0.3154920530203731, + "grad_norm": 0.1787109375, + "learning_rate": 0.0014521267436300872, + "loss": 0.0972, + "step": 36345 + }, + { + "epoch": 0.3155007335005773, + "grad_norm": 0.1416015625, + "learning_rate": 0.0014520993535364624, + "loss": 0.0947, + "step": 36346 + }, + { + "epoch": 0.3155094139807814, + "grad_norm": 0.306640625, + "learning_rate": 0.001452071963057787, + "loss": 0.0859, + "step": 36347 + }, + { + "epoch": 0.31551809446098555, + "grad_norm": 0.98828125, + "learning_rate": 0.0014520445721940907, + "loss": 0.1021, + "step": 36348 + }, + { + "epoch": 0.31552677494118975, + "grad_norm": 0.55078125, + "learning_rate": 0.0014520171809454042, + "loss": 0.0977, + "step": 36349 + }, + { + "epoch": 0.3155354554213939, + "grad_norm": 0.80078125, + "learning_rate": 0.001451989789311757, + "loss": 0.1157, + "step": 36350 + }, + { + "epoch": 0.3155441359015981, + "grad_norm": 0.451171875, + "learning_rate": 0.0014519623972931786, + "loss": 0.0908, + "step": 36351 + }, + { + "epoch": 0.3155528163818022, + "grad_norm": 0.1357421875, + "learning_rate": 0.0014519350048896994, + "loss": 0.0591, + "step": 36352 + }, + { + "epoch": 0.3155614968620064, + "grad_norm": 0.2470703125, + "learning_rate": 0.0014519076121013498, + "loss": 0.0889, + "step": 36353 + }, + { + "epoch": 0.31557017734221054, + "grad_norm": 0.10888671875, + "learning_rate": 0.001451880218928159, + "loss": 0.0693, + "step": 36354 + }, + { + "epoch": 0.31557885782241474, + "grad_norm": 0.365234375, + "learning_rate": 0.0014518528253701574, + "loss": 0.1299, + "step": 36355 + }, + { + "epoch": 0.3155875383026189, + "grad_norm": 0.10791015625, + "learning_rate": 0.0014518254314273746, + "loss": 0.1064, + "step": 36356 + }, + { + "epoch": 0.31559621878282307, + "grad_norm": 0.2216796875, + "learning_rate": 0.0014517980370998414, + "loss": 0.0894, + "step": 36357 + }, + { + "epoch": 0.3156048992630272, + "grad_norm": 0.267578125, + "learning_rate": 0.0014517706423875867, + "loss": 0.1152, + "step": 36358 + }, + { + "epoch": 0.3156135797432314, + "grad_norm": 0.3828125, + "learning_rate": 0.001451743247290641, + "loss": 0.0981, + "step": 36359 + }, + { + "epoch": 0.31562226022343554, + "grad_norm": 0.083984375, + "learning_rate": 0.0014517158518090342, + "loss": 0.0996, + "step": 36360 + }, + { + "epoch": 0.31563094070363973, + "grad_norm": 0.3671875, + "learning_rate": 0.0014516884559427961, + "loss": 0.1099, + "step": 36361 + }, + { + "epoch": 0.31563962118384387, + "grad_norm": 0.404296875, + "learning_rate": 0.001451661059691957, + "loss": 0.1631, + "step": 36362 + }, + { + "epoch": 0.31564830166404806, + "grad_norm": 0.1142578125, + "learning_rate": 0.0014516336630565462, + "loss": 0.1094, + "step": 36363 + }, + { + "epoch": 0.3156569821442522, + "grad_norm": 0.255859375, + "learning_rate": 0.001451606266036595, + "loss": 0.1133, + "step": 36364 + }, + { + "epoch": 0.3156656626244564, + "grad_norm": 0.1552734375, + "learning_rate": 0.0014515788686321315, + "loss": 0.1006, + "step": 36365 + }, + { + "epoch": 0.3156743431046605, + "grad_norm": 0.333984375, + "learning_rate": 0.001451551470843187, + "loss": 0.1074, + "step": 36366 + }, + { + "epoch": 0.3156830235848647, + "grad_norm": 0.2294921875, + "learning_rate": 0.0014515240726697915, + "loss": 0.084, + "step": 36367 + }, + { + "epoch": 0.31569170406506886, + "grad_norm": 0.11572265625, + "learning_rate": 0.001451496674111974, + "loss": 0.1138, + 
"step": 36368 + }, + { + "epoch": 0.31570038454527305, + "grad_norm": 0.11865234375, + "learning_rate": 0.0014514692751697652, + "loss": 0.1191, + "step": 36369 + }, + { + "epoch": 0.3157090650254772, + "grad_norm": 0.2275390625, + "learning_rate": 0.001451441875843195, + "loss": 0.1709, + "step": 36370 + }, + { + "epoch": 0.3157177455056814, + "grad_norm": 0.1494140625, + "learning_rate": 0.0014514144761322931, + "loss": 0.0771, + "step": 36371 + }, + { + "epoch": 0.3157264259858855, + "grad_norm": 0.255859375, + "learning_rate": 0.0014513870760370898, + "loss": 0.1328, + "step": 36372 + }, + { + "epoch": 0.3157351064660897, + "grad_norm": 0.291015625, + "learning_rate": 0.0014513596755576146, + "loss": 0.085, + "step": 36373 + }, + { + "epoch": 0.31574378694629385, + "grad_norm": 0.291015625, + "learning_rate": 0.0014513322746938977, + "loss": 0.1211, + "step": 36374 + }, + { + "epoch": 0.31575246742649804, + "grad_norm": 0.283203125, + "learning_rate": 0.0014513048734459692, + "loss": 0.1011, + "step": 36375 + }, + { + "epoch": 0.3157611479067022, + "grad_norm": 0.361328125, + "learning_rate": 0.001451277471813859, + "loss": 0.0981, + "step": 36376 + }, + { + "epoch": 0.31576982838690637, + "grad_norm": 1.046875, + "learning_rate": 0.0014512500697975972, + "loss": 0.2969, + "step": 36377 + }, + { + "epoch": 0.3157785088671105, + "grad_norm": 0.72265625, + "learning_rate": 0.0014512226673972135, + "loss": 0.1035, + "step": 36378 + }, + { + "epoch": 0.3157871893473147, + "grad_norm": 0.13671875, + "learning_rate": 0.001451195264612738, + "loss": 0.0889, + "step": 36379 + }, + { + "epoch": 0.31579586982751884, + "grad_norm": 0.73828125, + "learning_rate": 0.0014511678614442006, + "loss": 0.0908, + "step": 36380 + }, + { + "epoch": 0.31580455030772303, + "grad_norm": 0.353515625, + "learning_rate": 0.0014511404578916311, + "loss": 0.0947, + "step": 36381 + }, + { + "epoch": 0.31581323078792717, + "grad_norm": 0.310546875, + "learning_rate": 0.00145111305395506, + "loss": 0.0996, + "step": 36382 + }, + { + "epoch": 0.31582191126813136, + "grad_norm": 0.453125, + "learning_rate": 0.0014510856496345167, + "loss": 0.1318, + "step": 36383 + }, + { + "epoch": 0.3158305917483355, + "grad_norm": 0.189453125, + "learning_rate": 0.0014510582449300316, + "loss": 0.1055, + "step": 36384 + }, + { + "epoch": 0.3158392722285397, + "grad_norm": 0.337890625, + "learning_rate": 0.0014510308398416342, + "loss": 0.1357, + "step": 36385 + }, + { + "epoch": 0.31584795270874383, + "grad_norm": 0.2177734375, + "learning_rate": 0.001451003434369355, + "loss": 0.0786, + "step": 36386 + }, + { + "epoch": 0.315856633188948, + "grad_norm": 0.5703125, + "learning_rate": 0.0014509760285132236, + "loss": 0.1035, + "step": 36387 + }, + { + "epoch": 0.31586531366915216, + "grad_norm": 0.318359375, + "learning_rate": 0.0014509486222732703, + "loss": 0.1064, + "step": 36388 + }, + { + "epoch": 0.31587399414935635, + "grad_norm": 0.412109375, + "learning_rate": 0.0014509212156495247, + "loss": 0.0918, + "step": 36389 + }, + { + "epoch": 0.3158826746295605, + "grad_norm": 0.83984375, + "learning_rate": 0.0014508938086420168, + "loss": 0.1465, + "step": 36390 + }, + { + "epoch": 0.3158913551097647, + "grad_norm": 0.1796875, + "learning_rate": 0.0014508664012507768, + "loss": 0.1206, + "step": 36391 + }, + { + "epoch": 0.3159000355899688, + "grad_norm": 0.423828125, + "learning_rate": 0.0014508389934758345, + "loss": 0.106, + "step": 36392 + }, + { + "epoch": 0.315908716070173, + "grad_norm": 0.111328125, + "learning_rate": 
0.0014508115853172198, + "loss": 0.165, + "step": 36393 + }, + { + "epoch": 0.31591739655037715, + "grad_norm": 0.2470703125, + "learning_rate": 0.0014507841767749635, + "loss": 0.1064, + "step": 36394 + }, + { + "epoch": 0.31592607703058134, + "grad_norm": 0.30859375, + "learning_rate": 0.0014507567678490943, + "loss": 0.0928, + "step": 36395 + }, + { + "epoch": 0.3159347575107855, + "grad_norm": 0.46875, + "learning_rate": 0.0014507293585396428, + "loss": 0.1123, + "step": 36396 + }, + { + "epoch": 0.3159434379909897, + "grad_norm": 0.27734375, + "learning_rate": 0.0014507019488466393, + "loss": 0.1035, + "step": 36397 + }, + { + "epoch": 0.3159521184711938, + "grad_norm": 0.408203125, + "learning_rate": 0.0014506745387701133, + "loss": 0.0933, + "step": 36398 + }, + { + "epoch": 0.315960798951398, + "grad_norm": 0.60546875, + "learning_rate": 0.0014506471283100948, + "loss": 0.1006, + "step": 36399 + }, + { + "epoch": 0.31596947943160214, + "grad_norm": 0.37109375, + "learning_rate": 0.001450619717466614, + "loss": 0.1025, + "step": 36400 + }, + { + "epoch": 0.31597815991180633, + "grad_norm": 0.44921875, + "learning_rate": 0.0014505923062397002, + "loss": 0.1055, + "step": 36401 + }, + { + "epoch": 0.31598684039201047, + "grad_norm": 0.5546875, + "learning_rate": 0.0014505648946293844, + "loss": 0.0918, + "step": 36402 + }, + { + "epoch": 0.31599552087221466, + "grad_norm": 0.2490234375, + "learning_rate": 0.0014505374826356962, + "loss": 0.1191, + "step": 36403 + }, + { + "epoch": 0.3160042013524188, + "grad_norm": 0.111328125, + "learning_rate": 0.0014505100702586651, + "loss": 0.0913, + "step": 36404 + }, + { + "epoch": 0.316012881832623, + "grad_norm": 0.84765625, + "learning_rate": 0.0014504826574983215, + "loss": 0.1147, + "step": 36405 + }, + { + "epoch": 0.31602156231282713, + "grad_norm": 0.1728515625, + "learning_rate": 0.0014504552443546956, + "loss": 0.1123, + "step": 36406 + }, + { + "epoch": 0.3160302427930313, + "grad_norm": 0.478515625, + "learning_rate": 0.0014504278308278171, + "loss": 0.167, + "step": 36407 + }, + { + "epoch": 0.31603892327323546, + "grad_norm": 0.5078125, + "learning_rate": 0.0014504004169177158, + "loss": 0.0815, + "step": 36408 + }, + { + "epoch": 0.31604760375343965, + "grad_norm": 0.1474609375, + "learning_rate": 0.001450373002624422, + "loss": 0.1113, + "step": 36409 + }, + { + "epoch": 0.3160562842336438, + "grad_norm": 0.263671875, + "learning_rate": 0.0014503455879479655, + "loss": 0.123, + "step": 36410 + }, + { + "epoch": 0.316064964713848, + "grad_norm": 0.2275390625, + "learning_rate": 0.001450318172888376, + "loss": 0.1006, + "step": 36411 + }, + { + "epoch": 0.3160736451940521, + "grad_norm": 0.14453125, + "learning_rate": 0.0014502907574456844, + "loss": 0.0947, + "step": 36412 + }, + { + "epoch": 0.3160823256742563, + "grad_norm": 0.27734375, + "learning_rate": 0.0014502633416199197, + "loss": 0.0889, + "step": 36413 + }, + { + "epoch": 0.31609100615446045, + "grad_norm": 0.25390625, + "learning_rate": 0.0014502359254111123, + "loss": 0.1221, + "step": 36414 + }, + { + "epoch": 0.31609968663466465, + "grad_norm": 0.2890625, + "learning_rate": 0.0014502085088192924, + "loss": 0.0869, + "step": 36415 + }, + { + "epoch": 0.3161083671148688, + "grad_norm": 0.1923828125, + "learning_rate": 0.0014501810918444895, + "loss": 0.083, + "step": 36416 + }, + { + "epoch": 0.316117047595073, + "grad_norm": 0.189453125, + "learning_rate": 0.001450153674486734, + "loss": 0.1152, + "step": 36417 + }, + { + "epoch": 0.3161257280752771, + "grad_norm": 
0.26171875, + "learning_rate": 0.0014501262567460556, + "loss": 0.1484, + "step": 36418 + }, + { + "epoch": 0.3161344085554813, + "grad_norm": 0.7265625, + "learning_rate": 0.0014500988386224845, + "loss": 0.1279, + "step": 36419 + }, + { + "epoch": 0.31614308903568544, + "grad_norm": 0.125, + "learning_rate": 0.0014500714201160503, + "loss": 0.0869, + "step": 36420 + }, + { + "epoch": 0.31615176951588964, + "grad_norm": 0.12255859375, + "learning_rate": 0.0014500440012267832, + "loss": 0.1064, + "step": 36421 + }, + { + "epoch": 0.3161604499960938, + "grad_norm": 0.41796875, + "learning_rate": 0.0014500165819547137, + "loss": 0.1641, + "step": 36422 + }, + { + "epoch": 0.31616913047629797, + "grad_norm": 0.91796875, + "learning_rate": 0.001449989162299871, + "loss": 0.1816, + "step": 36423 + }, + { + "epoch": 0.3161778109565021, + "grad_norm": 0.466796875, + "learning_rate": 0.0014499617422622858, + "loss": 0.1406, + "step": 36424 + }, + { + "epoch": 0.3161864914367063, + "grad_norm": 0.09619140625, + "learning_rate": 0.0014499343218419875, + "loss": 0.1064, + "step": 36425 + }, + { + "epoch": 0.31619517191691043, + "grad_norm": 0.1689453125, + "learning_rate": 0.001449906901039006, + "loss": 0.1172, + "step": 36426 + }, + { + "epoch": 0.3162038523971146, + "grad_norm": 0.1484375, + "learning_rate": 0.0014498794798533717, + "loss": 0.1021, + "step": 36427 + }, + { + "epoch": 0.31621253287731876, + "grad_norm": 0.212890625, + "learning_rate": 0.0014498520582851145, + "loss": 0.0947, + "step": 36428 + }, + { + "epoch": 0.31622121335752296, + "grad_norm": 0.1962890625, + "learning_rate": 0.0014498246363342643, + "loss": 0.0977, + "step": 36429 + }, + { + "epoch": 0.3162298938377271, + "grad_norm": 0.13671875, + "learning_rate": 0.0014497972140008512, + "loss": 0.0977, + "step": 36430 + }, + { + "epoch": 0.3162385743179313, + "grad_norm": 0.42578125, + "learning_rate": 0.0014497697912849048, + "loss": 0.1064, + "step": 36431 + }, + { + "epoch": 0.3162472547981354, + "grad_norm": 0.08544921875, + "learning_rate": 0.001449742368186456, + "loss": 0.0771, + "step": 36432 + }, + { + "epoch": 0.3162559352783396, + "grad_norm": 0.1181640625, + "learning_rate": 0.001449714944705534, + "loss": 0.0928, + "step": 36433 + }, + { + "epoch": 0.31626461575854375, + "grad_norm": 0.10107421875, + "learning_rate": 0.0014496875208421687, + "loss": 0.085, + "step": 36434 + }, + { + "epoch": 0.31627329623874795, + "grad_norm": 0.2216796875, + "learning_rate": 0.0014496600965963906, + "loss": 0.1045, + "step": 36435 + }, + { + "epoch": 0.3162819767189521, + "grad_norm": 0.08984375, + "learning_rate": 0.0014496326719682295, + "loss": 0.0698, + "step": 36436 + }, + { + "epoch": 0.3162906571991563, + "grad_norm": 0.5234375, + "learning_rate": 0.0014496052469577154, + "loss": 0.0938, + "step": 36437 + }, + { + "epoch": 0.3162993376793604, + "grad_norm": 0.10693359375, + "learning_rate": 0.001449577821564878, + "loss": 0.0908, + "step": 36438 + }, + { + "epoch": 0.3163080181595646, + "grad_norm": 0.19921875, + "learning_rate": 0.001449550395789748, + "loss": 0.1045, + "step": 36439 + }, + { + "epoch": 0.31631669863976875, + "grad_norm": 0.0634765625, + "learning_rate": 0.0014495229696323544, + "loss": 0.0576, + "step": 36440 + }, + { + "epoch": 0.31632537911997294, + "grad_norm": 0.181640625, + "learning_rate": 0.0014494955430927282, + "loss": 0.1201, + "step": 36441 + }, + { + "epoch": 0.3163340596001771, + "grad_norm": 0.2158203125, + "learning_rate": 0.0014494681161708986, + "loss": 0.0835, + "step": 36442 + }, + { 
+ "epoch": 0.31634274008038127, + "grad_norm": 0.1171875, + "learning_rate": 0.001449440688866896, + "loss": 0.1377, + "step": 36443 + }, + { + "epoch": 0.3163514205605854, + "grad_norm": 0.60546875, + "learning_rate": 0.0014494132611807505, + "loss": 0.1738, + "step": 36444 + }, + { + "epoch": 0.3163601010407896, + "grad_norm": 0.1708984375, + "learning_rate": 0.0014493858331124918, + "loss": 0.0952, + "step": 36445 + }, + { + "epoch": 0.31636878152099374, + "grad_norm": 0.099609375, + "learning_rate": 0.00144935840466215, + "loss": 0.0825, + "step": 36446 + }, + { + "epoch": 0.31637746200119793, + "grad_norm": 0.4375, + "learning_rate": 0.001449330975829755, + "loss": 0.0928, + "step": 36447 + }, + { + "epoch": 0.31638614248140207, + "grad_norm": 0.4140625, + "learning_rate": 0.001449303546615337, + "loss": 0.0723, + "step": 36448 + }, + { + "epoch": 0.31639482296160626, + "grad_norm": 0.28125, + "learning_rate": 0.0014492761170189258, + "loss": 0.1016, + "step": 36449 + }, + { + "epoch": 0.3164035034418104, + "grad_norm": 0.419921875, + "learning_rate": 0.0014492486870405514, + "loss": 0.0947, + "step": 36450 + }, + { + "epoch": 0.3164121839220146, + "grad_norm": 0.0849609375, + "learning_rate": 0.0014492212566802438, + "loss": 0.1099, + "step": 36451 + }, + { + "epoch": 0.3164208644022187, + "grad_norm": 0.15625, + "learning_rate": 0.0014491938259380336, + "loss": 0.0898, + "step": 36452 + }, + { + "epoch": 0.3164295448824229, + "grad_norm": 0.2392578125, + "learning_rate": 0.00144916639481395, + "loss": 0.1113, + "step": 36453 + }, + { + "epoch": 0.31643822536262706, + "grad_norm": 0.1474609375, + "learning_rate": 0.0014491389633080232, + "loss": 0.1113, + "step": 36454 + }, + { + "epoch": 0.31644690584283125, + "grad_norm": 0.23828125, + "learning_rate": 0.0014491115314202835, + "loss": 0.1089, + "step": 36455 + }, + { + "epoch": 0.3164555863230354, + "grad_norm": 0.58203125, + "learning_rate": 0.0014490840991507603, + "loss": 0.1216, + "step": 36456 + }, + { + "epoch": 0.3164642668032396, + "grad_norm": 0.197265625, + "learning_rate": 0.0014490566664994843, + "loss": 0.0884, + "step": 36457 + }, + { + "epoch": 0.3164729472834437, + "grad_norm": 0.1650390625, + "learning_rate": 0.001449029233466485, + "loss": 0.0918, + "step": 36458 + }, + { + "epoch": 0.3164816277636479, + "grad_norm": 0.111328125, + "learning_rate": 0.0014490018000517925, + "loss": 0.1309, + "step": 36459 + }, + { + "epoch": 0.31649030824385205, + "grad_norm": 0.1337890625, + "learning_rate": 0.0014489743662554368, + "loss": 0.1064, + "step": 36460 + }, + { + "epoch": 0.31649898872405624, + "grad_norm": 0.171875, + "learning_rate": 0.0014489469320774482, + "loss": 0.0771, + "step": 36461 + }, + { + "epoch": 0.3165076692042604, + "grad_norm": 0.15234375, + "learning_rate": 0.0014489194975178565, + "loss": 0.1196, + "step": 36462 + }, + { + "epoch": 0.31651634968446457, + "grad_norm": 0.111328125, + "learning_rate": 0.0014488920625766917, + "loss": 0.1084, + "step": 36463 + }, + { + "epoch": 0.3165250301646687, + "grad_norm": 1.0, + "learning_rate": 0.0014488646272539836, + "loss": 0.167, + "step": 36464 + }, + { + "epoch": 0.3165337106448729, + "grad_norm": 0.2255859375, + "learning_rate": 0.001448837191549763, + "loss": 0.0903, + "step": 36465 + }, + { + "epoch": 0.31654239112507704, + "grad_norm": 0.275390625, + "learning_rate": 0.0014488097554640586, + "loss": 0.1104, + "step": 36466 + }, + { + "epoch": 0.31655107160528123, + "grad_norm": 0.359375, + "learning_rate": 0.0014487823189969014, + "loss": 0.1152, + 
"step": 36467 + }, + { + "epoch": 0.31655975208548537, + "grad_norm": 0.251953125, + "learning_rate": 0.0014487548821483208, + "loss": 0.1025, + "step": 36468 + }, + { + "epoch": 0.31656843256568956, + "grad_norm": 0.10791015625, + "learning_rate": 0.0014487274449183469, + "loss": 0.0723, + "step": 36469 + }, + { + "epoch": 0.3165771130458937, + "grad_norm": 0.1943359375, + "learning_rate": 0.0014487000073070105, + "loss": 0.1011, + "step": 36470 + }, + { + "epoch": 0.31658579352609784, + "grad_norm": 0.58984375, + "learning_rate": 0.001448672569314341, + "loss": 0.1172, + "step": 36471 + }, + { + "epoch": 0.31659447400630203, + "grad_norm": 0.275390625, + "learning_rate": 0.0014486451309403683, + "loss": 0.1387, + "step": 36472 + }, + { + "epoch": 0.31660315448650617, + "grad_norm": 0.439453125, + "learning_rate": 0.0014486176921851223, + "loss": 0.1357, + "step": 36473 + }, + { + "epoch": 0.31661183496671036, + "grad_norm": 0.1884765625, + "learning_rate": 0.0014485902530486335, + "loss": 0.0869, + "step": 36474 + }, + { + "epoch": 0.3166205154469145, + "grad_norm": 0.61328125, + "learning_rate": 0.0014485628135309314, + "loss": 0.1123, + "step": 36475 + }, + { + "epoch": 0.3166291959271187, + "grad_norm": 0.1591796875, + "learning_rate": 0.0014485353736320467, + "loss": 0.1289, + "step": 36476 + }, + { + "epoch": 0.3166378764073228, + "grad_norm": 0.28515625, + "learning_rate": 0.0014485079333520084, + "loss": 0.1172, + "step": 36477 + }, + { + "epoch": 0.316646556887527, + "grad_norm": 0.671875, + "learning_rate": 0.001448480492690847, + "loss": 0.123, + "step": 36478 + }, + { + "epoch": 0.31665523736773116, + "grad_norm": 0.1591796875, + "learning_rate": 0.0014484530516485927, + "loss": 0.1406, + "step": 36479 + }, + { + "epoch": 0.31666391784793535, + "grad_norm": 0.10986328125, + "learning_rate": 0.0014484256102252756, + "loss": 0.1182, + "step": 36480 + }, + { + "epoch": 0.3166725983281395, + "grad_norm": 0.2314453125, + "learning_rate": 0.0014483981684209255, + "loss": 0.0762, + "step": 36481 + }, + { + "epoch": 0.3166812788083437, + "grad_norm": 0.384765625, + "learning_rate": 0.0014483707262355722, + "loss": 0.1216, + "step": 36482 + }, + { + "epoch": 0.3166899592885478, + "grad_norm": 0.1884765625, + "learning_rate": 0.001448343283669246, + "loss": 0.1016, + "step": 36483 + }, + { + "epoch": 0.316698639768752, + "grad_norm": 0.408203125, + "learning_rate": 0.001448315840721977, + "loss": 0.0913, + "step": 36484 + }, + { + "epoch": 0.31670732024895615, + "grad_norm": 0.2255859375, + "learning_rate": 0.0014482883973937948, + "loss": 0.1367, + "step": 36485 + }, + { + "epoch": 0.31671600072916034, + "grad_norm": 0.85546875, + "learning_rate": 0.0014482609536847295, + "loss": 0.1445, + "step": 36486 + }, + { + "epoch": 0.3167246812093645, + "grad_norm": 0.625, + "learning_rate": 0.0014482335095948116, + "loss": 0.0986, + "step": 36487 + }, + { + "epoch": 0.31673336168956867, + "grad_norm": 0.392578125, + "learning_rate": 0.0014482060651240705, + "loss": 0.1074, + "step": 36488 + }, + { + "epoch": 0.3167420421697728, + "grad_norm": 0.1513671875, + "learning_rate": 0.0014481786202725367, + "loss": 0.1406, + "step": 36489 + }, + { + "epoch": 0.316750722649977, + "grad_norm": 0.55859375, + "learning_rate": 0.0014481511750402399, + "loss": 0.084, + "step": 36490 + }, + { + "epoch": 0.31675940313018114, + "grad_norm": 0.197265625, + "learning_rate": 0.00144812372942721, + "loss": 0.1162, + "step": 36491 + }, + { + "epoch": 0.31676808361038533, + "grad_norm": 0.36328125, + 
"learning_rate": 0.0014480962834334777, + "loss": 0.0967, + "step": 36492 + }, + { + "epoch": 0.31677676409058947, + "grad_norm": 0.12158203125, + "learning_rate": 0.0014480688370590723, + "loss": 0.1079, + "step": 36493 + }, + { + "epoch": 0.31678544457079366, + "grad_norm": 0.10302734375, + "learning_rate": 0.0014480413903040239, + "loss": 0.1099, + "step": 36494 + }, + { + "epoch": 0.3167941250509978, + "grad_norm": 0.36328125, + "learning_rate": 0.0014480139431683626, + "loss": 0.1318, + "step": 36495 + }, + { + "epoch": 0.316802805531202, + "grad_norm": 0.10205078125, + "learning_rate": 0.0014479864956521188, + "loss": 0.0732, + "step": 36496 + }, + { + "epoch": 0.31681148601140613, + "grad_norm": 0.12060546875, + "learning_rate": 0.001447959047755322, + "loss": 0.0801, + "step": 36497 + }, + { + "epoch": 0.3168201664916103, + "grad_norm": 0.1533203125, + "learning_rate": 0.0014479315994780021, + "loss": 0.1113, + "step": 36498 + }, + { + "epoch": 0.31682884697181446, + "grad_norm": 0.26171875, + "learning_rate": 0.0014479041508201897, + "loss": 0.0972, + "step": 36499 + }, + { + "epoch": 0.31683752745201865, + "grad_norm": 1.1171875, + "learning_rate": 0.0014478767017819148, + "loss": 0.1172, + "step": 36500 + }, + { + "epoch": 0.3168462079322228, + "grad_norm": 0.126953125, + "learning_rate": 0.001447849252363207, + "loss": 0.0771, + "step": 36501 + }, + { + "epoch": 0.316854888412427, + "grad_norm": 1.046875, + "learning_rate": 0.0014478218025640965, + "loss": 0.085, + "step": 36502 + }, + { + "epoch": 0.3168635688926311, + "grad_norm": 0.302734375, + "learning_rate": 0.0014477943523846133, + "loss": 0.1201, + "step": 36503 + }, + { + "epoch": 0.3168722493728353, + "grad_norm": 0.1298828125, + "learning_rate": 0.0014477669018247874, + "loss": 0.0957, + "step": 36504 + }, + { + "epoch": 0.31688092985303945, + "grad_norm": 0.171875, + "learning_rate": 0.0014477394508846488, + "loss": 0.1504, + "step": 36505 + }, + { + "epoch": 0.31688961033324364, + "grad_norm": 0.41015625, + "learning_rate": 0.0014477119995642276, + "loss": 0.1064, + "step": 36506 + }, + { + "epoch": 0.3168982908134478, + "grad_norm": 0.62109375, + "learning_rate": 0.0014476845478635539, + "loss": 0.0918, + "step": 36507 + }, + { + "epoch": 0.316906971293652, + "grad_norm": 0.07373046875, + "learning_rate": 0.0014476570957826574, + "loss": 0.0718, + "step": 36508 + }, + { + "epoch": 0.3169156517738561, + "grad_norm": 0.177734375, + "learning_rate": 0.0014476296433215685, + "loss": 0.123, + "step": 36509 + }, + { + "epoch": 0.3169243322540603, + "grad_norm": 0.37890625, + "learning_rate": 0.001447602190480317, + "loss": 0.1406, + "step": 36510 + }, + { + "epoch": 0.31693301273426444, + "grad_norm": 0.3515625, + "learning_rate": 0.001447574737258933, + "loss": 0.0645, + "step": 36511 + }, + { + "epoch": 0.31694169321446863, + "grad_norm": 0.2177734375, + "learning_rate": 0.001447547283657446, + "loss": 0.0938, + "step": 36512 + }, + { + "epoch": 0.31695037369467277, + "grad_norm": 0.201171875, + "learning_rate": 0.0014475198296758874, + "loss": 0.0874, + "step": 36513 + }, + { + "epoch": 0.31695905417487696, + "grad_norm": 0.06396484375, + "learning_rate": 0.0014474923753142856, + "loss": 0.0898, + "step": 36514 + }, + { + "epoch": 0.3169677346550811, + "grad_norm": 0.23046875, + "learning_rate": 0.0014474649205726716, + "loss": 0.1299, + "step": 36515 + }, + { + "epoch": 0.3169764151352853, + "grad_norm": 0.4296875, + "learning_rate": 0.0014474374654510755, + "loss": 0.1162, + "step": 36516 + }, + { + "epoch": 
0.31698509561548943, + "grad_norm": 0.10107421875, + "learning_rate": 0.0014474100099495263, + "loss": 0.1055, + "step": 36517 + }, + { + "epoch": 0.3169937760956936, + "grad_norm": 0.2314453125, + "learning_rate": 0.0014473825540680552, + "loss": 0.0967, + "step": 36518 + }, + { + "epoch": 0.31700245657589776, + "grad_norm": 0.89453125, + "learning_rate": 0.0014473550978066918, + "loss": 0.1035, + "step": 36519 + }, + { + "epoch": 0.31701113705610195, + "grad_norm": 0.51171875, + "learning_rate": 0.001447327641165466, + "loss": 0.1113, + "step": 36520 + }, + { + "epoch": 0.3170198175363061, + "grad_norm": 0.1767578125, + "learning_rate": 0.0014473001841444077, + "loss": 0.1377, + "step": 36521 + }, + { + "epoch": 0.3170284980165103, + "grad_norm": 0.455078125, + "learning_rate": 0.0014472727267435474, + "loss": 0.085, + "step": 36522 + }, + { + "epoch": 0.3170371784967144, + "grad_norm": 0.1298828125, + "learning_rate": 0.0014472452689629149, + "loss": 0.1299, + "step": 36523 + }, + { + "epoch": 0.3170458589769186, + "grad_norm": 0.216796875, + "learning_rate": 0.0014472178108025401, + "loss": 0.084, + "step": 36524 + }, + { + "epoch": 0.31705453945712275, + "grad_norm": 0.58984375, + "learning_rate": 0.0014471903522624533, + "loss": 0.1216, + "step": 36525 + }, + { + "epoch": 0.31706321993732695, + "grad_norm": 0.6171875, + "learning_rate": 0.0014471628933426838, + "loss": 0.1113, + "step": 36526 + }, + { + "epoch": 0.3170719004175311, + "grad_norm": 0.22265625, + "learning_rate": 0.0014471354340432625, + "loss": 0.1221, + "step": 36527 + }, + { + "epoch": 0.3170805808977353, + "grad_norm": 0.68359375, + "learning_rate": 0.0014471079743642193, + "loss": 0.1206, + "step": 36528 + }, + { + "epoch": 0.3170892613779394, + "grad_norm": 0.125, + "learning_rate": 0.001447080514305584, + "loss": 0.0938, + "step": 36529 + }, + { + "epoch": 0.3170979418581436, + "grad_norm": 0.13671875, + "learning_rate": 0.0014470530538673865, + "loss": 0.1377, + "step": 36530 + }, + { + "epoch": 0.31710662233834774, + "grad_norm": 0.11181640625, + "learning_rate": 0.001447025593049657, + "loss": 0.1143, + "step": 36531 + }, + { + "epoch": 0.31711530281855194, + "grad_norm": 0.1298828125, + "learning_rate": 0.0014469981318524256, + "loss": 0.1162, + "step": 36532 + }, + { + "epoch": 0.3171239832987561, + "grad_norm": 0.09765625, + "learning_rate": 0.0014469706702757222, + "loss": 0.1309, + "step": 36533 + }, + { + "epoch": 0.31713266377896027, + "grad_norm": 0.373046875, + "learning_rate": 0.0014469432083195769, + "loss": 0.1035, + "step": 36534 + }, + { + "epoch": 0.3171413442591644, + "grad_norm": 0.12890625, + "learning_rate": 0.0014469157459840194, + "loss": 0.1167, + "step": 36535 + }, + { + "epoch": 0.3171500247393686, + "grad_norm": 0.70703125, + "learning_rate": 0.0014468882832690806, + "loss": 0.0864, + "step": 36536 + }, + { + "epoch": 0.31715870521957273, + "grad_norm": 0.255859375, + "learning_rate": 0.00144686082017479, + "loss": 0.0967, + "step": 36537 + }, + { + "epoch": 0.3171673856997769, + "grad_norm": 0.51953125, + "learning_rate": 0.001446833356701177, + "loss": 0.1084, + "step": 36538 + }, + { + "epoch": 0.31717606617998106, + "grad_norm": 0.17578125, + "learning_rate": 0.0014468058928482728, + "loss": 0.1289, + "step": 36539 + }, + { + "epoch": 0.31718474666018526, + "grad_norm": 0.236328125, + "learning_rate": 0.0014467784286161067, + "loss": 0.0898, + "step": 36540 + }, + { + "epoch": 0.3171934271403894, + "grad_norm": 0.2216796875, + "learning_rate": 0.0014467509640047089, + "loss": 
0.1221, + "step": 36541 + }, + { + "epoch": 0.3172021076205936, + "grad_norm": 0.123046875, + "learning_rate": 0.0014467234990141096, + "loss": 0.0815, + "step": 36542 + }, + { + "epoch": 0.3172107881007977, + "grad_norm": 0.130859375, + "learning_rate": 0.0014466960336443385, + "loss": 0.127, + "step": 36543 + }, + { + "epoch": 0.3172194685810019, + "grad_norm": 0.1904296875, + "learning_rate": 0.0014466685678954261, + "loss": 0.1133, + "step": 36544 + }, + { + "epoch": 0.31722814906120606, + "grad_norm": 0.09912109375, + "learning_rate": 0.0014466411017674016, + "loss": 0.125, + "step": 36545 + }, + { + "epoch": 0.31723682954141025, + "grad_norm": 0.408203125, + "learning_rate": 0.001446613635260296, + "loss": 0.104, + "step": 36546 + }, + { + "epoch": 0.3172455100216144, + "grad_norm": 0.1083984375, + "learning_rate": 0.001446586168374139, + "loss": 0.1299, + "step": 36547 + }, + { + "epoch": 0.3172541905018186, + "grad_norm": 0.099609375, + "learning_rate": 0.0014465587011089605, + "loss": 0.0957, + "step": 36548 + }, + { + "epoch": 0.3172628709820227, + "grad_norm": 0.146484375, + "learning_rate": 0.0014465312334647907, + "loss": 0.1055, + "step": 36549 + }, + { + "epoch": 0.3172715514622269, + "grad_norm": 0.25390625, + "learning_rate": 0.0014465037654416595, + "loss": 0.1436, + "step": 36550 + }, + { + "epoch": 0.31728023194243105, + "grad_norm": 0.43359375, + "learning_rate": 0.0014464762970395968, + "loss": 0.1079, + "step": 36551 + }, + { + "epoch": 0.31728891242263524, + "grad_norm": 0.62109375, + "learning_rate": 0.001446448828258633, + "loss": 0.0845, + "step": 36552 + }, + { + "epoch": 0.3172975929028394, + "grad_norm": 0.349609375, + "learning_rate": 0.001446421359098798, + "loss": 0.0791, + "step": 36553 + }, + { + "epoch": 0.31730627338304357, + "grad_norm": 0.1953125, + "learning_rate": 0.0014463938895601213, + "loss": 0.126, + "step": 36554 + }, + { + "epoch": 0.3173149538632477, + "grad_norm": 0.232421875, + "learning_rate": 0.001446366419642634, + "loss": 0.1055, + "step": 36555 + }, + { + "epoch": 0.3173236343434519, + "grad_norm": 0.10888671875, + "learning_rate": 0.0014463389493463655, + "loss": 0.0923, + "step": 36556 + }, + { + "epoch": 0.31733231482365604, + "grad_norm": 0.1396484375, + "learning_rate": 0.001446311478671346, + "loss": 0.084, + "step": 36557 + }, + { + "epoch": 0.31734099530386023, + "grad_norm": 1.0625, + "learning_rate": 0.0014462840076176053, + "loss": 0.1162, + "step": 36558 + }, + { + "epoch": 0.31734967578406437, + "grad_norm": 0.228515625, + "learning_rate": 0.0014462565361851739, + "loss": 0.0903, + "step": 36559 + }, + { + "epoch": 0.31735835626426856, + "grad_norm": 0.58203125, + "learning_rate": 0.0014462290643740812, + "loss": 0.0869, + "step": 36560 + }, + { + "epoch": 0.3173670367444727, + "grad_norm": 0.1123046875, + "learning_rate": 0.0014462015921843578, + "loss": 0.0894, + "step": 36561 + }, + { + "epoch": 0.3173757172246769, + "grad_norm": 0.279296875, + "learning_rate": 0.0014461741196160336, + "loss": 0.082, + "step": 36562 + }, + { + "epoch": 0.317384397704881, + "grad_norm": 0.25390625, + "learning_rate": 0.001446146646669138, + "loss": 0.124, + "step": 36563 + }, + { + "epoch": 0.3173930781850852, + "grad_norm": 0.76953125, + "learning_rate": 0.0014461191733437023, + "loss": 0.1133, + "step": 36564 + }, + { + "epoch": 0.31740175866528936, + "grad_norm": 0.1298828125, + "learning_rate": 0.0014460916996397557, + "loss": 0.0986, + "step": 36565 + }, + { + "epoch": 0.31741043914549355, + "grad_norm": 0.28515625, + 
"learning_rate": 0.0014460642255573282, + "loss": 0.1152, + "step": 36566 + }, + { + "epoch": 0.3174191196256977, + "grad_norm": 0.349609375, + "learning_rate": 0.0014460367510964506, + "loss": 0.1494, + "step": 36567 + }, + { + "epoch": 0.3174278001059019, + "grad_norm": 0.07421875, + "learning_rate": 0.0014460092762571518, + "loss": 0.0786, + "step": 36568 + }, + { + "epoch": 0.317436480586106, + "grad_norm": 0.318359375, + "learning_rate": 0.0014459818010394627, + "loss": 0.1123, + "step": 36569 + }, + { + "epoch": 0.3174451610663102, + "grad_norm": 0.08837890625, + "learning_rate": 0.001445954325443413, + "loss": 0.0918, + "step": 36570 + }, + { + "epoch": 0.31745384154651435, + "grad_norm": 0.07421875, + "learning_rate": 0.0014459268494690332, + "loss": 0.0703, + "step": 36571 + }, + { + "epoch": 0.31746252202671854, + "grad_norm": 1.7890625, + "learning_rate": 0.0014458993731163525, + "loss": 0.2148, + "step": 36572 + }, + { + "epoch": 0.3174712025069227, + "grad_norm": 0.408203125, + "learning_rate": 0.001445871896385402, + "loss": 0.1006, + "step": 36573 + }, + { + "epoch": 0.31747988298712687, + "grad_norm": 0.55078125, + "learning_rate": 0.0014458444192762105, + "loss": 0.1084, + "step": 36574 + }, + { + "epoch": 0.317488563467331, + "grad_norm": 0.439453125, + "learning_rate": 0.0014458169417888092, + "loss": 0.0918, + "step": 36575 + }, + { + "epoch": 0.3174972439475352, + "grad_norm": 0.080078125, + "learning_rate": 0.0014457894639232276, + "loss": 0.0811, + "step": 36576 + }, + { + "epoch": 0.31750592442773934, + "grad_norm": 0.33203125, + "learning_rate": 0.001445761985679496, + "loss": 0.0718, + "step": 36577 + }, + { + "epoch": 0.31751460490794353, + "grad_norm": 0.2294921875, + "learning_rate": 0.0014457345070576439, + "loss": 0.0928, + "step": 36578 + }, + { + "epoch": 0.31752328538814767, + "grad_norm": 0.421875, + "learning_rate": 0.001445707028057702, + "loss": 0.0874, + "step": 36579 + }, + { + "epoch": 0.31753196586835186, + "grad_norm": 0.1708984375, + "learning_rate": 0.0014456795486797, + "loss": 0.0593, + "step": 36580 + }, + { + "epoch": 0.317540646348556, + "grad_norm": 0.0771484375, + "learning_rate": 0.0014456520689236684, + "loss": 0.0796, + "step": 36581 + }, + { + "epoch": 0.3175493268287602, + "grad_norm": 0.97265625, + "learning_rate": 0.0014456245887896364, + "loss": 0.0869, + "step": 36582 + }, + { + "epoch": 0.31755800730896433, + "grad_norm": 0.796875, + "learning_rate": 0.0014455971082776345, + "loss": 0.0752, + "step": 36583 + }, + { + "epoch": 0.3175666877891685, + "grad_norm": 0.56640625, + "learning_rate": 0.001445569627387693, + "loss": 0.0928, + "step": 36584 + }, + { + "epoch": 0.31757536826937266, + "grad_norm": 0.71875, + "learning_rate": 0.0014455421461198417, + "loss": 0.0903, + "step": 36585 + }, + { + "epoch": 0.31758404874957685, + "grad_norm": 0.8046875, + "learning_rate": 0.0014455146644741109, + "loss": 0.1133, + "step": 36586 + }, + { + "epoch": 0.317592729229781, + "grad_norm": 0.09130859375, + "learning_rate": 0.0014454871824505305, + "loss": 0.0728, + "step": 36587 + }, + { + "epoch": 0.3176014097099852, + "grad_norm": 0.09716796875, + "learning_rate": 0.0014454597000491303, + "loss": 0.1064, + "step": 36588 + }, + { + "epoch": 0.3176100901901893, + "grad_norm": 0.306640625, + "learning_rate": 0.0014454322172699405, + "loss": 0.126, + "step": 36589 + }, + { + "epoch": 0.3176187706703935, + "grad_norm": 0.09814453125, + "learning_rate": 0.0014454047341129915, + "loss": 0.0713, + "step": 36590 + }, + { + "epoch": 
0.31762745115059765, + "grad_norm": 0.54296875, + "learning_rate": 0.0014453772505783127, + "loss": 0.0928, + "step": 36591 + }, + { + "epoch": 0.31763613163080184, + "grad_norm": 0.1796875, + "learning_rate": 0.0014453497666659348, + "loss": 0.0771, + "step": 36592 + }, + { + "epoch": 0.317644812111006, + "grad_norm": 0.22265625, + "learning_rate": 0.0014453222823758874, + "loss": 0.1416, + "step": 36593 + }, + { + "epoch": 0.3176534925912101, + "grad_norm": 0.3515625, + "learning_rate": 0.0014452947977082007, + "loss": 0.0747, + "step": 36594 + }, + { + "epoch": 0.3176621730714143, + "grad_norm": 0.3515625, + "learning_rate": 0.001445267312662905, + "loss": 0.0908, + "step": 36595 + }, + { + "epoch": 0.31767085355161845, + "grad_norm": 0.26171875, + "learning_rate": 0.0014452398272400302, + "loss": 0.1523, + "step": 36596 + }, + { + "epoch": 0.31767953403182264, + "grad_norm": 0.10009765625, + "learning_rate": 0.0014452123414396064, + "loss": 0.0845, + "step": 36597 + }, + { + "epoch": 0.3176882145120268, + "grad_norm": 0.6015625, + "learning_rate": 0.001445184855261663, + "loss": 0.1118, + "step": 36598 + }, + { + "epoch": 0.31769689499223097, + "grad_norm": 0.11181640625, + "learning_rate": 0.0014451573687062312, + "loss": 0.1191, + "step": 36599 + }, + { + "epoch": 0.3177055754724351, + "grad_norm": 0.171875, + "learning_rate": 0.0014451298817733404, + "loss": 0.103, + "step": 36600 + }, + { + "epoch": 0.3177142559526393, + "grad_norm": 0.388671875, + "learning_rate": 0.0014451023944630206, + "loss": 0.0967, + "step": 36601 + }, + { + "epoch": 0.31772293643284344, + "grad_norm": 0.81640625, + "learning_rate": 0.001445074906775302, + "loss": 0.1104, + "step": 36602 + }, + { + "epoch": 0.31773161691304763, + "grad_norm": 0.361328125, + "learning_rate": 0.0014450474187102146, + "loss": 0.1128, + "step": 36603 + }, + { + "epoch": 0.31774029739325177, + "grad_norm": 0.1298828125, + "learning_rate": 0.0014450199302677884, + "loss": 0.1523, + "step": 36604 + }, + { + "epoch": 0.31774897787345596, + "grad_norm": 0.244140625, + "learning_rate": 0.0014449924414480539, + "loss": 0.0889, + "step": 36605 + }, + { + "epoch": 0.3177576583536601, + "grad_norm": 0.12353515625, + "learning_rate": 0.001444964952251041, + "loss": 0.0981, + "step": 36606 + }, + { + "epoch": 0.3177663388338643, + "grad_norm": 0.89453125, + "learning_rate": 0.0014449374626767794, + "loss": 0.1152, + "step": 36607 + }, + { + "epoch": 0.31777501931406843, + "grad_norm": 0.271484375, + "learning_rate": 0.0014449099727252993, + "loss": 0.0664, + "step": 36608 + }, + { + "epoch": 0.3177836997942726, + "grad_norm": 0.1376953125, + "learning_rate": 0.0014448824823966309, + "loss": 0.1055, + "step": 36609 + }, + { + "epoch": 0.31779238027447676, + "grad_norm": 0.060791015625, + "learning_rate": 0.0014448549916908043, + "loss": 0.0781, + "step": 36610 + }, + { + "epoch": 0.31780106075468095, + "grad_norm": 0.91015625, + "learning_rate": 0.0014448275006078493, + "loss": 0.1348, + "step": 36611 + }, + { + "epoch": 0.3178097412348851, + "grad_norm": 0.25390625, + "learning_rate": 0.0014448000091477962, + "loss": 0.0815, + "step": 36612 + }, + { + "epoch": 0.3178184217150893, + "grad_norm": 0.1669921875, + "learning_rate": 0.0014447725173106747, + "loss": 0.0674, + "step": 36613 + }, + { + "epoch": 0.3178271021952934, + "grad_norm": 0.2734375, + "learning_rate": 0.0014447450250965155, + "loss": 0.1001, + "step": 36614 + }, + { + "epoch": 0.3178357826754976, + "grad_norm": 0.1298828125, + "learning_rate": 0.0014447175325053483, + "loss": 
0.105, + "step": 36615 + }, + { + "epoch": 0.31784446315570175, + "grad_norm": 0.255859375, + "learning_rate": 0.0014446900395372031, + "loss": 0.0986, + "step": 36616 + }, + { + "epoch": 0.31785314363590594, + "grad_norm": 0.353515625, + "learning_rate": 0.0014446625461921102, + "loss": 0.0918, + "step": 36617 + }, + { + "epoch": 0.3178618241161101, + "grad_norm": 0.333984375, + "learning_rate": 0.0014446350524700995, + "loss": 0.0957, + "step": 36618 + }, + { + "epoch": 0.3178705045963143, + "grad_norm": 0.330078125, + "learning_rate": 0.001444607558371201, + "loss": 0.0693, + "step": 36619 + }, + { + "epoch": 0.3178791850765184, + "grad_norm": 0.11181640625, + "learning_rate": 0.0014445800638954445, + "loss": 0.0747, + "step": 36620 + }, + { + "epoch": 0.3178878655567226, + "grad_norm": 0.404296875, + "learning_rate": 0.001444552569042861, + "loss": 0.1128, + "step": 36621 + }, + { + "epoch": 0.31789654603692674, + "grad_norm": 0.412109375, + "learning_rate": 0.0014445250738134797, + "loss": 0.123, + "step": 36622 + }, + { + "epoch": 0.31790522651713093, + "grad_norm": 0.36328125, + "learning_rate": 0.0014444975782073309, + "loss": 0.0752, + "step": 36623 + }, + { + "epoch": 0.31791390699733507, + "grad_norm": 0.169921875, + "learning_rate": 0.0014444700822244447, + "loss": 0.0591, + "step": 36624 + }, + { + "epoch": 0.31792258747753926, + "grad_norm": 0.28515625, + "learning_rate": 0.0014444425858648514, + "loss": 0.1035, + "step": 36625 + }, + { + "epoch": 0.3179312679577434, + "grad_norm": 0.466796875, + "learning_rate": 0.0014444150891285807, + "loss": 0.0996, + "step": 36626 + }, + { + "epoch": 0.3179399484379476, + "grad_norm": 0.248046875, + "learning_rate": 0.0014443875920156632, + "loss": 0.1006, + "step": 36627 + }, + { + "epoch": 0.31794862891815173, + "grad_norm": 0.099609375, + "learning_rate": 0.0014443600945261285, + "loss": 0.105, + "step": 36628 + }, + { + "epoch": 0.3179573093983559, + "grad_norm": 0.0908203125, + "learning_rate": 0.0014443325966600063, + "loss": 0.0938, + "step": 36629 + }, + { + "epoch": 0.31796598987856006, + "grad_norm": 0.10595703125, + "learning_rate": 0.0014443050984173276, + "loss": 0.1226, + "step": 36630 + }, + { + "epoch": 0.31797467035876426, + "grad_norm": 0.412109375, + "learning_rate": 0.0014442775997981216, + "loss": 0.1221, + "step": 36631 + }, + { + "epoch": 0.3179833508389684, + "grad_norm": 0.2734375, + "learning_rate": 0.0014442501008024192, + "loss": 0.0991, + "step": 36632 + }, + { + "epoch": 0.3179920313191726, + "grad_norm": 0.1845703125, + "learning_rate": 0.0014442226014302498, + "loss": 0.0962, + "step": 36633 + }, + { + "epoch": 0.3180007117993767, + "grad_norm": 0.52734375, + "learning_rate": 0.001444195101681644, + "loss": 0.0977, + "step": 36634 + }, + { + "epoch": 0.3180093922795809, + "grad_norm": 0.091796875, + "learning_rate": 0.0014441676015566312, + "loss": 0.1206, + "step": 36635 + }, + { + "epoch": 0.31801807275978505, + "grad_norm": 0.66015625, + "learning_rate": 0.0014441401010552423, + "loss": 0.1177, + "step": 36636 + }, + { + "epoch": 0.31802675323998925, + "grad_norm": 0.1552734375, + "learning_rate": 0.0014441126001775067, + "loss": 0.126, + "step": 36637 + }, + { + "epoch": 0.3180354337201934, + "grad_norm": 1.9453125, + "learning_rate": 0.0014440850989234548, + "loss": 0.126, + "step": 36638 + }, + { + "epoch": 0.3180441142003976, + "grad_norm": 0.46484375, + "learning_rate": 0.0014440575972931168, + "loss": 0.123, + "step": 36639 + }, + { + "epoch": 0.3180527946806017, + "grad_norm": 0.138671875, + 
"learning_rate": 0.0014440300952865224, + "loss": 0.1143, + "step": 36640 + }, + { + "epoch": 0.3180614751608059, + "grad_norm": 0.625, + "learning_rate": 0.0014440025929037018, + "loss": 0.125, + "step": 36641 + }, + { + "epoch": 0.31807015564101004, + "grad_norm": 0.63671875, + "learning_rate": 0.001443975090144685, + "loss": 0.0957, + "step": 36642 + }, + { + "epoch": 0.31807883612121424, + "grad_norm": 1.1640625, + "learning_rate": 0.0014439475870095025, + "loss": 0.1162, + "step": 36643 + }, + { + "epoch": 0.3180875166014184, + "grad_norm": 0.33203125, + "learning_rate": 0.001443920083498184, + "loss": 0.0864, + "step": 36644 + }, + { + "epoch": 0.31809619708162257, + "grad_norm": 0.294921875, + "learning_rate": 0.0014438925796107598, + "loss": 0.1602, + "step": 36645 + }, + { + "epoch": 0.3181048775618267, + "grad_norm": 0.2470703125, + "learning_rate": 0.0014438650753472596, + "loss": 0.1006, + "step": 36646 + }, + { + "epoch": 0.3181135580420309, + "grad_norm": 0.5546875, + "learning_rate": 0.0014438375707077138, + "loss": 0.1045, + "step": 36647 + }, + { + "epoch": 0.31812223852223503, + "grad_norm": 0.2431640625, + "learning_rate": 0.0014438100656921525, + "loss": 0.1235, + "step": 36648 + }, + { + "epoch": 0.3181309190024392, + "grad_norm": 0.345703125, + "learning_rate": 0.0014437825603006056, + "loss": 0.1074, + "step": 36649 + }, + { + "epoch": 0.31813959948264336, + "grad_norm": 0.79296875, + "learning_rate": 0.001443755054533103, + "loss": 0.0718, + "step": 36650 + }, + { + "epoch": 0.31814827996284756, + "grad_norm": 0.10791015625, + "learning_rate": 0.0014437275483896752, + "loss": 0.1475, + "step": 36651 + }, + { + "epoch": 0.3181569604430517, + "grad_norm": 0.20703125, + "learning_rate": 0.0014437000418703525, + "loss": 0.1113, + "step": 36652 + }, + { + "epoch": 0.3181656409232559, + "grad_norm": 0.11962890625, + "learning_rate": 0.0014436725349751642, + "loss": 0.166, + "step": 36653 + }, + { + "epoch": 0.31817432140346, + "grad_norm": 0.3359375, + "learning_rate": 0.0014436450277041408, + "loss": 0.0967, + "step": 36654 + }, + { + "epoch": 0.3181830018836642, + "grad_norm": 0.90234375, + "learning_rate": 0.0014436175200573124, + "loss": 0.0977, + "step": 36655 + }, + { + "epoch": 0.31819168236386836, + "grad_norm": 0.1806640625, + "learning_rate": 0.0014435900120347092, + "loss": 0.0879, + "step": 36656 + }, + { + "epoch": 0.31820036284407255, + "grad_norm": 0.107421875, + "learning_rate": 0.0014435625036363611, + "loss": 0.0933, + "step": 36657 + }, + { + "epoch": 0.3182090433242767, + "grad_norm": 0.2060546875, + "learning_rate": 0.0014435349948622982, + "loss": 0.1064, + "step": 36658 + }, + { + "epoch": 0.3182177238044809, + "grad_norm": 0.236328125, + "learning_rate": 0.0014435074857125504, + "loss": 0.0806, + "step": 36659 + }, + { + "epoch": 0.318226404284685, + "grad_norm": 0.45703125, + "learning_rate": 0.001443479976187148, + "loss": 0.0776, + "step": 36660 + }, + { + "epoch": 0.3182350847648892, + "grad_norm": 0.478515625, + "learning_rate": 0.0014434524662861213, + "loss": 0.1123, + "step": 36661 + }, + { + "epoch": 0.31824376524509335, + "grad_norm": 0.158203125, + "learning_rate": 0.0014434249560095, + "loss": 0.1318, + "step": 36662 + }, + { + "epoch": 0.31825244572529754, + "grad_norm": 0.32421875, + "learning_rate": 0.0014433974453573143, + "loss": 0.0928, + "step": 36663 + }, + { + "epoch": 0.3182611262055017, + "grad_norm": 0.08544921875, + "learning_rate": 0.0014433699343295943, + "loss": 0.0786, + "step": 36664 + }, + { + "epoch": 
0.31826980668570587, + "grad_norm": 0.5234375, + "learning_rate": 0.0014433424229263704, + "loss": 0.1279, + "step": 36665 + }, + { + "epoch": 0.31827848716591, + "grad_norm": 0.234375, + "learning_rate": 0.0014433149111476719, + "loss": 0.0977, + "step": 36666 + }, + { + "epoch": 0.3182871676461142, + "grad_norm": 0.216796875, + "learning_rate": 0.0014432873989935295, + "loss": 0.1211, + "step": 36667 + }, + { + "epoch": 0.31829584812631834, + "grad_norm": 0.15625, + "learning_rate": 0.0014432598864639733, + "loss": 0.0767, + "step": 36668 + }, + { + "epoch": 0.31830452860652253, + "grad_norm": 0.15234375, + "learning_rate": 0.0014432323735590333, + "loss": 0.0928, + "step": 36669 + }, + { + "epoch": 0.31831320908672667, + "grad_norm": 0.0966796875, + "learning_rate": 0.0014432048602787393, + "loss": 0.0947, + "step": 36670 + }, + { + "epoch": 0.31832188956693086, + "grad_norm": 0.09619140625, + "learning_rate": 0.001443177346623122, + "loss": 0.0884, + "step": 36671 + }, + { + "epoch": 0.318330570047135, + "grad_norm": 0.326171875, + "learning_rate": 0.0014431498325922104, + "loss": 0.1064, + "step": 36672 + }, + { + "epoch": 0.3183392505273392, + "grad_norm": 0.306640625, + "learning_rate": 0.001443122318186036, + "loss": 0.0923, + "step": 36673 + }, + { + "epoch": 0.3183479310075433, + "grad_norm": 0.384765625, + "learning_rate": 0.0014430948034046278, + "loss": 0.0918, + "step": 36674 + }, + { + "epoch": 0.3183566114877475, + "grad_norm": 0.380859375, + "learning_rate": 0.0014430672882480165, + "loss": 0.0957, + "step": 36675 + }, + { + "epoch": 0.31836529196795166, + "grad_norm": 0.45703125, + "learning_rate": 0.0014430397727162316, + "loss": 0.1299, + "step": 36676 + }, + { + "epoch": 0.31837397244815585, + "grad_norm": 0.60546875, + "learning_rate": 0.0014430122568093043, + "loss": 0.0898, + "step": 36677 + }, + { + "epoch": 0.31838265292836, + "grad_norm": 0.255859375, + "learning_rate": 0.0014429847405272637, + "loss": 0.1162, + "step": 36678 + }, + { + "epoch": 0.3183913334085642, + "grad_norm": 0.232421875, + "learning_rate": 0.0014429572238701394, + "loss": 0.0986, + "step": 36679 + }, + { + "epoch": 0.3184000138887683, + "grad_norm": 0.546875, + "learning_rate": 0.001442929706837963, + "loss": 0.0854, + "step": 36680 + }, + { + "epoch": 0.3184086943689725, + "grad_norm": 0.3515625, + "learning_rate": 0.0014429021894307632, + "loss": 0.1641, + "step": 36681 + }, + { + "epoch": 0.31841737484917665, + "grad_norm": 0.1611328125, + "learning_rate": 0.0014428746716485714, + "loss": 0.0889, + "step": 36682 + }, + { + "epoch": 0.31842605532938084, + "grad_norm": 0.224609375, + "learning_rate": 0.0014428471534914168, + "loss": 0.0889, + "step": 36683 + }, + { + "epoch": 0.318434735809585, + "grad_norm": 0.2421875, + "learning_rate": 0.0014428196349593296, + "loss": 0.0942, + "step": 36684 + }, + { + "epoch": 0.31844341628978917, + "grad_norm": 0.375, + "learning_rate": 0.00144279211605234, + "loss": 0.0864, + "step": 36685 + }, + { + "epoch": 0.3184520967699933, + "grad_norm": 0.376953125, + "learning_rate": 0.0014427645967704781, + "loss": 0.0947, + "step": 36686 + }, + { + "epoch": 0.3184607772501975, + "grad_norm": 0.400390625, + "learning_rate": 0.0014427370771137743, + "loss": 0.0986, + "step": 36687 + }, + { + "epoch": 0.31846945773040164, + "grad_norm": 0.609375, + "learning_rate": 0.0014427095570822576, + "loss": 0.1035, + "step": 36688 + }, + { + "epoch": 0.31847813821060583, + "grad_norm": 0.28515625, + "learning_rate": 0.0014426820366759596, + "loss": 0.1084, + "step": 
36689 + }, + { + "epoch": 0.31848681869080997, + "grad_norm": 0.419921875, + "learning_rate": 0.0014426545158949093, + "loss": 0.1348, + "step": 36690 + }, + { + "epoch": 0.31849549917101416, + "grad_norm": 0.76171875, + "learning_rate": 0.0014426269947391374, + "loss": 0.1289, + "step": 36691 + }, + { + "epoch": 0.3185041796512183, + "grad_norm": 0.1376953125, + "learning_rate": 0.0014425994732086738, + "loss": 0.1328, + "step": 36692 + }, + { + "epoch": 0.3185128601314225, + "grad_norm": 0.44921875, + "learning_rate": 0.0014425719513035482, + "loss": 0.1123, + "step": 36693 + }, + { + "epoch": 0.31852154061162663, + "grad_norm": 0.1982421875, + "learning_rate": 0.0014425444290237916, + "loss": 0.083, + "step": 36694 + }, + { + "epoch": 0.3185302210918308, + "grad_norm": 0.265625, + "learning_rate": 0.0014425169063694333, + "loss": 0.1309, + "step": 36695 + }, + { + "epoch": 0.31853890157203496, + "grad_norm": 0.4296875, + "learning_rate": 0.0014424893833405036, + "loss": 0.1299, + "step": 36696 + }, + { + "epoch": 0.31854758205223915, + "grad_norm": 0.1923828125, + "learning_rate": 0.0014424618599370325, + "loss": 0.0908, + "step": 36697 + }, + { + "epoch": 0.3185562625324433, + "grad_norm": 0.1806640625, + "learning_rate": 0.0014424343361590505, + "loss": 0.0815, + "step": 36698 + }, + { + "epoch": 0.3185649430126475, + "grad_norm": 0.330078125, + "learning_rate": 0.0014424068120065872, + "loss": 0.0938, + "step": 36699 + }, + { + "epoch": 0.3185736234928516, + "grad_norm": 0.09521484375, + "learning_rate": 0.0014423792874796732, + "loss": 0.1094, + "step": 36700 + }, + { + "epoch": 0.3185823039730558, + "grad_norm": 0.11474609375, + "learning_rate": 0.0014423517625783383, + "loss": 0.105, + "step": 36701 + }, + { + "epoch": 0.31859098445325995, + "grad_norm": 0.1611328125, + "learning_rate": 0.0014423242373026127, + "loss": 0.0996, + "step": 36702 + }, + { + "epoch": 0.31859966493346414, + "grad_norm": 0.53515625, + "learning_rate": 0.0014422967116525262, + "loss": 0.0991, + "step": 36703 + }, + { + "epoch": 0.3186083454136683, + "grad_norm": 0.5625, + "learning_rate": 0.0014422691856281096, + "loss": 0.1299, + "step": 36704 + }, + { + "epoch": 0.3186170258938725, + "grad_norm": 0.376953125, + "learning_rate": 0.0014422416592293924, + "loss": 0.1123, + "step": 36705 + }, + { + "epoch": 0.3186257063740766, + "grad_norm": 0.4140625, + "learning_rate": 0.0014422141324564046, + "loss": 0.1387, + "step": 36706 + }, + { + "epoch": 0.3186343868542808, + "grad_norm": 0.1884765625, + "learning_rate": 0.0014421866053091767, + "loss": 0.1094, + "step": 36707 + }, + { + "epoch": 0.31864306733448494, + "grad_norm": 0.3359375, + "learning_rate": 0.0014421590777877389, + "loss": 0.1138, + "step": 36708 + }, + { + "epoch": 0.31865174781468913, + "grad_norm": 0.212890625, + "learning_rate": 0.001442131549892121, + "loss": 0.1045, + "step": 36709 + }, + { + "epoch": 0.31866042829489327, + "grad_norm": 0.54296875, + "learning_rate": 0.0014421040216223528, + "loss": 0.1504, + "step": 36710 + }, + { + "epoch": 0.31866910877509746, + "grad_norm": 0.12890625, + "learning_rate": 0.0014420764929784652, + "loss": 0.0859, + "step": 36711 + }, + { + "epoch": 0.3186777892553016, + "grad_norm": 0.40625, + "learning_rate": 0.0014420489639604876, + "loss": 0.0825, + "step": 36712 + }, + { + "epoch": 0.3186864697355058, + "grad_norm": 0.1904296875, + "learning_rate": 0.0014420214345684506, + "loss": 0.0894, + "step": 36713 + }, + { + "epoch": 0.31869515021570993, + "grad_norm": 0.197265625, + "learning_rate": 
0.001441993904802384, + "loss": 0.1045, + "step": 36714 + }, + { + "epoch": 0.3187038306959141, + "grad_norm": 0.08642578125, + "learning_rate": 0.0014419663746623181, + "loss": 0.0928, + "step": 36715 + }, + { + "epoch": 0.31871251117611826, + "grad_norm": 0.73046875, + "learning_rate": 0.0014419388441482828, + "loss": 0.1143, + "step": 36716 + }, + { + "epoch": 0.3187211916563224, + "grad_norm": 0.1513671875, + "learning_rate": 0.0014419113132603085, + "loss": 0.0986, + "step": 36717 + }, + { + "epoch": 0.3187298721365266, + "grad_norm": 0.271484375, + "learning_rate": 0.001441883781998425, + "loss": 0.1064, + "step": 36718 + }, + { + "epoch": 0.31873855261673073, + "grad_norm": 0.1650390625, + "learning_rate": 0.0014418562503626625, + "loss": 0.1084, + "step": 36719 + }, + { + "epoch": 0.3187472330969349, + "grad_norm": 0.388671875, + "learning_rate": 0.001441828718353051, + "loss": 0.1177, + "step": 36720 + }, + { + "epoch": 0.31875591357713906, + "grad_norm": 0.248046875, + "learning_rate": 0.0014418011859696211, + "loss": 0.0874, + "step": 36721 + }, + { + "epoch": 0.31876459405734325, + "grad_norm": 0.44140625, + "learning_rate": 0.0014417736532124025, + "loss": 0.123, + "step": 36722 + }, + { + "epoch": 0.3187732745375474, + "grad_norm": 0.66015625, + "learning_rate": 0.001441746120081425, + "loss": 0.1118, + "step": 36723 + }, + { + "epoch": 0.3187819550177516, + "grad_norm": 0.11328125, + "learning_rate": 0.0014417185865767193, + "loss": 0.1118, + "step": 36724 + }, + { + "epoch": 0.3187906354979557, + "grad_norm": 0.1416015625, + "learning_rate": 0.0014416910526983153, + "loss": 0.1211, + "step": 36725 + }, + { + "epoch": 0.3187993159781599, + "grad_norm": 0.310546875, + "learning_rate": 0.0014416635184462431, + "loss": 0.1035, + "step": 36726 + }, + { + "epoch": 0.31880799645836405, + "grad_norm": 0.5546875, + "learning_rate": 0.001441635983820533, + "loss": 0.0898, + "step": 36727 + }, + { + "epoch": 0.31881667693856824, + "grad_norm": 0.1865234375, + "learning_rate": 0.0014416084488212144, + "loss": 0.0752, + "step": 36728 + }, + { + "epoch": 0.3188253574187724, + "grad_norm": 0.30859375, + "learning_rate": 0.0014415809134483188, + "loss": 0.1348, + "step": 36729 + }, + { + "epoch": 0.3188340378989766, + "grad_norm": 0.1611328125, + "learning_rate": 0.0014415533777018745, + "loss": 0.1348, + "step": 36730 + }, + { + "epoch": 0.3188427183791807, + "grad_norm": 0.51953125, + "learning_rate": 0.0014415258415819132, + "loss": 0.1206, + "step": 36731 + }, + { + "epoch": 0.3188513988593849, + "grad_norm": 0.287109375, + "learning_rate": 0.001441498305088464, + "loss": 0.0986, + "step": 36732 + }, + { + "epoch": 0.31886007933958904, + "grad_norm": 0.1767578125, + "learning_rate": 0.0014414707682215576, + "loss": 0.1201, + "step": 36733 + }, + { + "epoch": 0.31886875981979323, + "grad_norm": 0.2734375, + "learning_rate": 0.0014414432309812237, + "loss": 0.1123, + "step": 36734 + }, + { + "epoch": 0.3188774402999974, + "grad_norm": 0.197265625, + "learning_rate": 0.0014414156933674931, + "loss": 0.061, + "step": 36735 + }, + { + "epoch": 0.31888612078020157, + "grad_norm": 0.1787109375, + "learning_rate": 0.001441388155380395, + "loss": 0.0723, + "step": 36736 + }, + { + "epoch": 0.3188948012604057, + "grad_norm": 0.1875, + "learning_rate": 0.00144136061701996, + "loss": 0.1182, + "step": 36737 + }, + { + "epoch": 0.3189034817406099, + "grad_norm": 0.09765625, + "learning_rate": 0.001441333078286218, + "loss": 0.1167, + "step": 36738 + }, + { + "epoch": 0.31891216222081403, + 
"grad_norm": 0.138671875, + "learning_rate": 0.0014413055391791995, + "loss": 0.0938, + "step": 36739 + }, + { + "epoch": 0.3189208427010182, + "grad_norm": 0.46484375, + "learning_rate": 0.0014412779996989345, + "loss": 0.1348, + "step": 36740 + }, + { + "epoch": 0.31892952318122236, + "grad_norm": 0.26953125, + "learning_rate": 0.0014412504598454526, + "loss": 0.0762, + "step": 36741 + }, + { + "epoch": 0.31893820366142656, + "grad_norm": 0.212890625, + "learning_rate": 0.0014412229196187846, + "loss": 0.1089, + "step": 36742 + }, + { + "epoch": 0.3189468841416307, + "grad_norm": 0.228515625, + "learning_rate": 0.0014411953790189605, + "loss": 0.1123, + "step": 36743 + }, + { + "epoch": 0.3189555646218349, + "grad_norm": 0.1923828125, + "learning_rate": 0.0014411678380460101, + "loss": 0.0967, + "step": 36744 + }, + { + "epoch": 0.318964245102039, + "grad_norm": 0.09130859375, + "learning_rate": 0.0014411402966999635, + "loss": 0.085, + "step": 36745 + }, + { + "epoch": 0.3189729255822432, + "grad_norm": 0.439453125, + "learning_rate": 0.001441112754980851, + "loss": 0.123, + "step": 36746 + }, + { + "epoch": 0.31898160606244735, + "grad_norm": 0.1669921875, + "learning_rate": 0.0014410852128887031, + "loss": 0.0879, + "step": 36747 + }, + { + "epoch": 0.31899028654265155, + "grad_norm": 0.150390625, + "learning_rate": 0.0014410576704235493, + "loss": 0.0806, + "step": 36748 + }, + { + "epoch": 0.3189989670228557, + "grad_norm": 0.71875, + "learning_rate": 0.00144103012758542, + "loss": 0.1016, + "step": 36749 + }, + { + "epoch": 0.3190076475030599, + "grad_norm": 0.859375, + "learning_rate": 0.0014410025843743447, + "loss": 0.1108, + "step": 36750 + }, + { + "epoch": 0.319016327983264, + "grad_norm": 0.2021484375, + "learning_rate": 0.0014409750407903548, + "loss": 0.1001, + "step": 36751 + }, + { + "epoch": 0.3190250084634682, + "grad_norm": 0.267578125, + "learning_rate": 0.0014409474968334797, + "loss": 0.127, + "step": 36752 + }, + { + "epoch": 0.31903368894367234, + "grad_norm": 0.341796875, + "learning_rate": 0.0014409199525037496, + "loss": 0.1123, + "step": 36753 + }, + { + "epoch": 0.31904236942387654, + "grad_norm": 0.07568359375, + "learning_rate": 0.0014408924078011941, + "loss": 0.0752, + "step": 36754 + }, + { + "epoch": 0.3190510499040807, + "grad_norm": 0.4765625, + "learning_rate": 0.0014408648627258441, + "loss": 0.0962, + "step": 36755 + }, + { + "epoch": 0.31905973038428487, + "grad_norm": 0.171875, + "learning_rate": 0.0014408373172777292, + "loss": 0.0918, + "step": 36756 + }, + { + "epoch": 0.319068410864489, + "grad_norm": 0.130859375, + "learning_rate": 0.00144080977145688, + "loss": 0.1143, + "step": 36757 + }, + { + "epoch": 0.3190770913446932, + "grad_norm": 1.546875, + "learning_rate": 0.001440782225263326, + "loss": 0.4395, + "step": 36758 + }, + { + "epoch": 0.31908577182489734, + "grad_norm": 0.333984375, + "learning_rate": 0.001440754678697098, + "loss": 0.0889, + "step": 36759 + }, + { + "epoch": 0.31909445230510153, + "grad_norm": 0.373046875, + "learning_rate": 0.001440727131758226, + "loss": 0.1211, + "step": 36760 + }, + { + "epoch": 0.31910313278530567, + "grad_norm": 0.38671875, + "learning_rate": 0.0014406995844467396, + "loss": 0.0957, + "step": 36761 + }, + { + "epoch": 0.31911181326550986, + "grad_norm": 0.296875, + "learning_rate": 0.0014406720367626693, + "loss": 0.1021, + "step": 36762 + }, + { + "epoch": 0.319120493745714, + "grad_norm": 0.67578125, + "learning_rate": 0.001440644488706045, + "loss": 0.0947, + "step": 36763 + }, + { + 
"epoch": 0.3191291742259182, + "grad_norm": 0.52734375, + "learning_rate": 0.0014406169402768972, + "loss": 0.0933, + "step": 36764 + }, + { + "epoch": 0.3191378547061223, + "grad_norm": 0.2294921875, + "learning_rate": 0.0014405893914752558, + "loss": 0.1182, + "step": 36765 + }, + { + "epoch": 0.3191465351863265, + "grad_norm": 0.32421875, + "learning_rate": 0.0014405618423011508, + "loss": 0.0806, + "step": 36766 + }, + { + "epoch": 0.31915521566653066, + "grad_norm": 0.1337890625, + "learning_rate": 0.0014405342927546125, + "loss": 0.0947, + "step": 36767 + }, + { + "epoch": 0.31916389614673485, + "grad_norm": 0.2373046875, + "learning_rate": 0.0014405067428356715, + "loss": 0.1123, + "step": 36768 + }, + { + "epoch": 0.319172576626939, + "grad_norm": 0.85546875, + "learning_rate": 0.001440479192544357, + "loss": 0.1377, + "step": 36769 + }, + { + "epoch": 0.3191812571071432, + "grad_norm": 0.06884765625, + "learning_rate": 0.0014404516418806996, + "loss": 0.083, + "step": 36770 + }, + { + "epoch": 0.3191899375873473, + "grad_norm": 0.283203125, + "learning_rate": 0.0014404240908447296, + "loss": 0.0977, + "step": 36771 + }, + { + "epoch": 0.3191986180675515, + "grad_norm": 0.177734375, + "learning_rate": 0.0014403965394364769, + "loss": 0.1016, + "step": 36772 + }, + { + "epoch": 0.31920729854775565, + "grad_norm": 0.44140625, + "learning_rate": 0.0014403689876559714, + "loss": 0.0723, + "step": 36773 + }, + { + "epoch": 0.31921597902795984, + "grad_norm": 0.333984375, + "learning_rate": 0.0014403414355032439, + "loss": 0.1079, + "step": 36774 + }, + { + "epoch": 0.319224659508164, + "grad_norm": 0.40234375, + "learning_rate": 0.0014403138829783238, + "loss": 0.1016, + "step": 36775 + }, + { + "epoch": 0.31923333998836817, + "grad_norm": 0.494140625, + "learning_rate": 0.0014402863300812415, + "loss": 0.1084, + "step": 36776 + }, + { + "epoch": 0.3192420204685723, + "grad_norm": 0.23046875, + "learning_rate": 0.0014402587768120273, + "loss": 0.0967, + "step": 36777 + }, + { + "epoch": 0.3192507009487765, + "grad_norm": 0.1923828125, + "learning_rate": 0.0014402312231707112, + "loss": 0.1128, + "step": 36778 + }, + { + "epoch": 0.31925938142898064, + "grad_norm": 0.1943359375, + "learning_rate": 0.0014402036691573234, + "loss": 0.1328, + "step": 36779 + }, + { + "epoch": 0.31926806190918483, + "grad_norm": 0.0947265625, + "learning_rate": 0.0014401761147718939, + "loss": 0.0923, + "step": 36780 + }, + { + "epoch": 0.31927674238938897, + "grad_norm": 0.482421875, + "learning_rate": 0.001440148560014453, + "loss": 0.1245, + "step": 36781 + }, + { + "epoch": 0.31928542286959316, + "grad_norm": 0.48046875, + "learning_rate": 0.001440121004885031, + "loss": 0.084, + "step": 36782 + }, + { + "epoch": 0.3192941033497973, + "grad_norm": 0.06298828125, + "learning_rate": 0.0014400934493836572, + "loss": 0.0649, + "step": 36783 + }, + { + "epoch": 0.3193027838300015, + "grad_norm": 1.1484375, + "learning_rate": 0.0014400658935103626, + "loss": 0.1309, + "step": 36784 + }, + { + "epoch": 0.31931146431020563, + "grad_norm": 0.12890625, + "learning_rate": 0.0014400383372651768, + "loss": 0.1279, + "step": 36785 + }, + { + "epoch": 0.3193201447904098, + "grad_norm": 0.4921875, + "learning_rate": 0.0014400107806481306, + "loss": 0.0942, + "step": 36786 + }, + { + "epoch": 0.31932882527061396, + "grad_norm": 0.55078125, + "learning_rate": 0.0014399832236592536, + "loss": 0.1406, + "step": 36787 + }, + { + "epoch": 0.31933750575081815, + "grad_norm": 0.10302734375, + "learning_rate": 
0.001439955666298576, + "loss": 0.0737, + "step": 36788 + }, + { + "epoch": 0.3193461862310223, + "grad_norm": 0.177734375, + "learning_rate": 0.001439928108566128, + "loss": 0.083, + "step": 36789 + }, + { + "epoch": 0.3193548667112265, + "grad_norm": 0.30859375, + "learning_rate": 0.0014399005504619397, + "loss": 0.1035, + "step": 36790 + }, + { + "epoch": 0.3193635471914306, + "grad_norm": 0.50390625, + "learning_rate": 0.0014398729919860413, + "loss": 0.123, + "step": 36791 + }, + { + "epoch": 0.3193722276716348, + "grad_norm": 0.1689453125, + "learning_rate": 0.0014398454331384632, + "loss": 0.0977, + "step": 36792 + }, + { + "epoch": 0.31938090815183895, + "grad_norm": 0.2890625, + "learning_rate": 0.0014398178739192346, + "loss": 0.1099, + "step": 36793 + }, + { + "epoch": 0.31938958863204314, + "grad_norm": 0.201171875, + "learning_rate": 0.0014397903143283867, + "loss": 0.1455, + "step": 36794 + }, + { + "epoch": 0.3193982691122473, + "grad_norm": 0.326171875, + "learning_rate": 0.001439762754365949, + "loss": 0.1445, + "step": 36795 + }, + { + "epoch": 0.3194069495924515, + "grad_norm": 0.1611328125, + "learning_rate": 0.0014397351940319521, + "loss": 0.1113, + "step": 36796 + }, + { + "epoch": 0.3194156300726556, + "grad_norm": 0.27734375, + "learning_rate": 0.0014397076333264257, + "loss": 0.1348, + "step": 36797 + }, + { + "epoch": 0.3194243105528598, + "grad_norm": 0.07421875, + "learning_rate": 0.0014396800722494, + "loss": 0.083, + "step": 36798 + }, + { + "epoch": 0.31943299103306394, + "grad_norm": 0.07177734375, + "learning_rate": 0.0014396525108009057, + "loss": 0.061, + "step": 36799 + }, + { + "epoch": 0.31944167151326813, + "grad_norm": 0.1796875, + "learning_rate": 0.0014396249489809724, + "loss": 0.0923, + "step": 36800 + }, + { + "epoch": 0.31945035199347227, + "grad_norm": 0.1201171875, + "learning_rate": 0.0014395973867896304, + "loss": 0.0996, + "step": 36801 + }, + { + "epoch": 0.31945903247367646, + "grad_norm": 1.0703125, + "learning_rate": 0.0014395698242269095, + "loss": 0.1094, + "step": 36802 + }, + { + "epoch": 0.3194677129538806, + "grad_norm": 0.08349609375, + "learning_rate": 0.0014395422612928405, + "loss": 0.1084, + "step": 36803 + }, + { + "epoch": 0.3194763934340848, + "grad_norm": 0.5, + "learning_rate": 0.0014395146979874528, + "loss": 0.0835, + "step": 36804 + }, + { + "epoch": 0.31948507391428893, + "grad_norm": 0.140625, + "learning_rate": 0.001439487134310777, + "loss": 0.1406, + "step": 36805 + }, + { + "epoch": 0.3194937543944931, + "grad_norm": 0.099609375, + "learning_rate": 0.0014394595702628432, + "loss": 0.1055, + "step": 36806 + }, + { + "epoch": 0.31950243487469726, + "grad_norm": 0.451171875, + "learning_rate": 0.001439432005843682, + "loss": 0.0791, + "step": 36807 + }, + { + "epoch": 0.31951111535490145, + "grad_norm": 0.76171875, + "learning_rate": 0.0014394044410533224, + "loss": 0.1113, + "step": 36808 + }, + { + "epoch": 0.3195197958351056, + "grad_norm": 0.29296875, + "learning_rate": 0.0014393768758917955, + "loss": 0.0957, + "step": 36809 + }, + { + "epoch": 0.3195284763153098, + "grad_norm": 0.3046875, + "learning_rate": 0.001439349310359131, + "loss": 0.1089, + "step": 36810 + }, + { + "epoch": 0.3195371567955139, + "grad_norm": 0.3515625, + "learning_rate": 0.0014393217444553593, + "loss": 0.0747, + "step": 36811 + }, + { + "epoch": 0.3195458372757181, + "grad_norm": 0.1298828125, + "learning_rate": 0.0014392941781805105, + "loss": 0.0918, + "step": 36812 + }, + { + "epoch": 0.31955451775592225, + "grad_norm": 
0.380859375, + "learning_rate": 0.0014392666115346144, + "loss": 0.0962, + "step": 36813 + }, + { + "epoch": 0.31956319823612644, + "grad_norm": 0.208984375, + "learning_rate": 0.001439239044517702, + "loss": 0.0967, + "step": 36814 + }, + { + "epoch": 0.3195718787163306, + "grad_norm": 0.734375, + "learning_rate": 0.0014392114771298026, + "loss": 0.1582, + "step": 36815 + }, + { + "epoch": 0.3195805591965348, + "grad_norm": 0.1787109375, + "learning_rate": 0.0014391839093709465, + "loss": 0.0601, + "step": 36816 + }, + { + "epoch": 0.3195892396767389, + "grad_norm": 0.482421875, + "learning_rate": 0.0014391563412411638, + "loss": 0.0991, + "step": 36817 + }, + { + "epoch": 0.3195979201569431, + "grad_norm": 0.1787109375, + "learning_rate": 0.0014391287727404853, + "loss": 0.1191, + "step": 36818 + }, + { + "epoch": 0.31960660063714724, + "grad_norm": 0.1767578125, + "learning_rate": 0.00143910120386894, + "loss": 0.1152, + "step": 36819 + }, + { + "epoch": 0.31961528111735144, + "grad_norm": 0.470703125, + "learning_rate": 0.0014390736346265596, + "loss": 0.1201, + "step": 36820 + }, + { + "epoch": 0.3196239615975556, + "grad_norm": 0.107421875, + "learning_rate": 0.0014390460650133728, + "loss": 0.1035, + "step": 36821 + }, + { + "epoch": 0.31963264207775977, + "grad_norm": 0.09521484375, + "learning_rate": 0.0014390184950294107, + "loss": 0.1143, + "step": 36822 + }, + { + "epoch": 0.3196413225579639, + "grad_norm": 0.396484375, + "learning_rate": 0.0014389909246747027, + "loss": 0.0728, + "step": 36823 + }, + { + "epoch": 0.3196500030381681, + "grad_norm": 0.380859375, + "learning_rate": 0.001438963353949279, + "loss": 0.0835, + "step": 36824 + }, + { + "epoch": 0.31965868351837223, + "grad_norm": 2.75, + "learning_rate": 0.0014389357828531707, + "loss": 0.1445, + "step": 36825 + }, + { + "epoch": 0.3196673639985764, + "grad_norm": 0.1259765625, + "learning_rate": 0.001438908211386407, + "loss": 0.0908, + "step": 36826 + }, + { + "epoch": 0.31967604447878056, + "grad_norm": 0.17578125, + "learning_rate": 0.0014388806395490186, + "loss": 0.127, + "step": 36827 + }, + { + "epoch": 0.31968472495898476, + "grad_norm": 0.333984375, + "learning_rate": 0.0014388530673410352, + "loss": 0.0859, + "step": 36828 + }, + { + "epoch": 0.3196934054391889, + "grad_norm": 0.353515625, + "learning_rate": 0.0014388254947624874, + "loss": 0.106, + "step": 36829 + }, + { + "epoch": 0.3197020859193931, + "grad_norm": 0.330078125, + "learning_rate": 0.001438797921813405, + "loss": 0.1143, + "step": 36830 + }, + { + "epoch": 0.3197107663995972, + "grad_norm": 0.421875, + "learning_rate": 0.0014387703484938187, + "loss": 0.1504, + "step": 36831 + }, + { + "epoch": 0.3197194468798014, + "grad_norm": 0.228515625, + "learning_rate": 0.0014387427748037576, + "loss": 0.0957, + "step": 36832 + }, + { + "epoch": 0.31972812736000555, + "grad_norm": 0.58984375, + "learning_rate": 0.0014387152007432524, + "loss": 0.1748, + "step": 36833 + }, + { + "epoch": 0.31973680784020975, + "grad_norm": 0.271484375, + "learning_rate": 0.001438687626312334, + "loss": 0.1084, + "step": 36834 + }, + { + "epoch": 0.3197454883204139, + "grad_norm": 0.375, + "learning_rate": 0.0014386600515110316, + "loss": 0.1211, + "step": 36835 + }, + { + "epoch": 0.3197541688006181, + "grad_norm": 0.21875, + "learning_rate": 0.0014386324763393755, + "loss": 0.1006, + "step": 36836 + }, + { + "epoch": 0.3197628492808222, + "grad_norm": 0.453125, + "learning_rate": 0.001438604900797396, + "loss": 0.0752, + "step": 36837 + }, + { + "epoch": 
0.3197715297610264, + "grad_norm": 0.306640625, + "learning_rate": 0.0014385773248851235, + "loss": 0.0928, + "step": 36838 + }, + { + "epoch": 0.31978021024123054, + "grad_norm": 0.3203125, + "learning_rate": 0.001438549748602588, + "loss": 0.0649, + "step": 36839 + }, + { + "epoch": 0.3197888907214347, + "grad_norm": 0.345703125, + "learning_rate": 0.0014385221719498192, + "loss": 0.0977, + "step": 36840 + }, + { + "epoch": 0.3197975712016389, + "grad_norm": 0.7890625, + "learning_rate": 0.001438494594926848, + "loss": 0.0967, + "step": 36841 + }, + { + "epoch": 0.319806251681843, + "grad_norm": 0.8125, + "learning_rate": 0.001438467017533704, + "loss": 0.1475, + "step": 36842 + }, + { + "epoch": 0.3198149321620472, + "grad_norm": 0.349609375, + "learning_rate": 0.0014384394397704175, + "loss": 0.1338, + "step": 36843 + }, + { + "epoch": 0.31982361264225134, + "grad_norm": 0.13671875, + "learning_rate": 0.001438411861637019, + "loss": 0.105, + "step": 36844 + }, + { + "epoch": 0.31983229312245554, + "grad_norm": 0.10791015625, + "learning_rate": 0.0014383842831335381, + "loss": 0.0977, + "step": 36845 + }, + { + "epoch": 0.3198409736026597, + "grad_norm": 0.5625, + "learning_rate": 0.0014383567042600051, + "loss": 0.1191, + "step": 36846 + }, + { + "epoch": 0.31984965408286387, + "grad_norm": 0.267578125, + "learning_rate": 0.0014383291250164507, + "loss": 0.0986, + "step": 36847 + }, + { + "epoch": 0.319858334563068, + "grad_norm": 0.302734375, + "learning_rate": 0.0014383015454029044, + "loss": 0.1191, + "step": 36848 + }, + { + "epoch": 0.3198670150432722, + "grad_norm": 0.1484375, + "learning_rate": 0.0014382739654193969, + "loss": 0.0859, + "step": 36849 + }, + { + "epoch": 0.31987569552347633, + "grad_norm": 0.1640625, + "learning_rate": 0.0014382463850659576, + "loss": 0.0913, + "step": 36850 + }, + { + "epoch": 0.3198843760036805, + "grad_norm": 0.6015625, + "learning_rate": 0.0014382188043426175, + "loss": 0.1143, + "step": 36851 + }, + { + "epoch": 0.31989305648388466, + "grad_norm": 1.0703125, + "learning_rate": 0.001438191223249406, + "loss": 0.1299, + "step": 36852 + }, + { + "epoch": 0.31990173696408886, + "grad_norm": 0.66796875, + "learning_rate": 0.001438163641786354, + "loss": 0.126, + "step": 36853 + }, + { + "epoch": 0.319910417444293, + "grad_norm": 0.1494140625, + "learning_rate": 0.0014381360599534914, + "loss": 0.0928, + "step": 36854 + }, + { + "epoch": 0.3199190979244972, + "grad_norm": 0.326171875, + "learning_rate": 0.0014381084777508482, + "loss": 0.0938, + "step": 36855 + }, + { + "epoch": 0.3199277784047013, + "grad_norm": 0.51953125, + "learning_rate": 0.0014380808951784546, + "loss": 0.0791, + "step": 36856 + }, + { + "epoch": 0.3199364588849055, + "grad_norm": 1.1484375, + "learning_rate": 0.0014380533122363408, + "loss": 0.1504, + "step": 36857 + }, + { + "epoch": 0.31994513936510965, + "grad_norm": 0.4609375, + "learning_rate": 0.0014380257289245372, + "loss": 0.126, + "step": 36858 + }, + { + "epoch": 0.31995381984531385, + "grad_norm": 0.400390625, + "learning_rate": 0.0014379981452430732, + "loss": 0.0864, + "step": 36859 + }, + { + "epoch": 0.319962500325518, + "grad_norm": 0.44140625, + "learning_rate": 0.00143797056119198, + "loss": 0.1143, + "step": 36860 + }, + { + "epoch": 0.3199711808057222, + "grad_norm": 0.2021484375, + "learning_rate": 0.0014379429767712871, + "loss": 0.0908, + "step": 36861 + }, + { + "epoch": 0.3199798612859263, + "grad_norm": 0.470703125, + "learning_rate": 0.0014379153919810246, + "loss": 0.1099, + "step": 36862 + }, + 
{ + "epoch": 0.3199885417661305, + "grad_norm": 0.2890625, + "learning_rate": 0.001437887806821223, + "loss": 0.1348, + "step": 36863 + }, + { + "epoch": 0.31999722224633464, + "grad_norm": 0.173828125, + "learning_rate": 0.0014378602212919128, + "loss": 0.1182, + "step": 36864 + }, + { + "epoch": 0.32000590272653884, + "grad_norm": 0.2265625, + "learning_rate": 0.0014378326353931235, + "loss": 0.0884, + "step": 36865 + }, + { + "epoch": 0.320014583206743, + "grad_norm": 0.10888671875, + "learning_rate": 0.0014378050491248853, + "loss": 0.1064, + "step": 36866 + }, + { + "epoch": 0.32002326368694717, + "grad_norm": 0.51171875, + "learning_rate": 0.0014377774624872287, + "loss": 0.126, + "step": 36867 + }, + { + "epoch": 0.3200319441671513, + "grad_norm": 0.275390625, + "learning_rate": 0.0014377498754801841, + "loss": 0.1055, + "step": 36868 + }, + { + "epoch": 0.3200406246473555, + "grad_norm": 0.359375, + "learning_rate": 0.0014377222881037809, + "loss": 0.1152, + "step": 36869 + }, + { + "epoch": 0.32004930512755964, + "grad_norm": 0.1669921875, + "learning_rate": 0.0014376947003580495, + "loss": 0.084, + "step": 36870 + }, + { + "epoch": 0.32005798560776383, + "grad_norm": 0.111328125, + "learning_rate": 0.0014376671122430208, + "loss": 0.0938, + "step": 36871 + }, + { + "epoch": 0.32006666608796797, + "grad_norm": 0.15625, + "learning_rate": 0.001437639523758724, + "loss": 0.1006, + "step": 36872 + }, + { + "epoch": 0.32007534656817216, + "grad_norm": 0.330078125, + "learning_rate": 0.00143761193490519, + "loss": 0.1138, + "step": 36873 + }, + { + "epoch": 0.3200840270483763, + "grad_norm": 0.189453125, + "learning_rate": 0.0014375843456824484, + "loss": 0.125, + "step": 36874 + }, + { + "epoch": 0.3200927075285805, + "grad_norm": 0.23046875, + "learning_rate": 0.0014375567560905298, + "loss": 0.123, + "step": 36875 + }, + { + "epoch": 0.3201013880087846, + "grad_norm": 0.283203125, + "learning_rate": 0.001437529166129464, + "loss": 0.1216, + "step": 36876 + }, + { + "epoch": 0.3201100684889888, + "grad_norm": 0.455078125, + "learning_rate": 0.0014375015757992818, + "loss": 0.085, + "step": 36877 + }, + { + "epoch": 0.32011874896919296, + "grad_norm": 0.1162109375, + "learning_rate": 0.0014374739851000127, + "loss": 0.0947, + "step": 36878 + }, + { + "epoch": 0.32012742944939715, + "grad_norm": 0.12255859375, + "learning_rate": 0.0014374463940316866, + "loss": 0.0918, + "step": 36879 + }, + { + "epoch": 0.3201361099296013, + "grad_norm": 0.208984375, + "learning_rate": 0.001437418802594335, + "loss": 0.0811, + "step": 36880 + }, + { + "epoch": 0.3201447904098055, + "grad_norm": 0.322265625, + "learning_rate": 0.0014373912107879868, + "loss": 0.0835, + "step": 36881 + }, + { + "epoch": 0.3201534708900096, + "grad_norm": 0.44140625, + "learning_rate": 0.001437363618612673, + "loss": 0.125, + "step": 36882 + }, + { + "epoch": 0.3201621513702138, + "grad_norm": 0.47265625, + "learning_rate": 0.001437336026068423, + "loss": 0.1069, + "step": 36883 + }, + { + "epoch": 0.32017083185041795, + "grad_norm": 0.27734375, + "learning_rate": 0.0014373084331552675, + "loss": 0.1211, + "step": 36884 + }, + { + "epoch": 0.32017951233062214, + "grad_norm": 0.1640625, + "learning_rate": 0.0014372808398732367, + "loss": 0.0972, + "step": 36885 + }, + { + "epoch": 0.3201881928108263, + "grad_norm": 0.451171875, + "learning_rate": 0.0014372532462223605, + "loss": 0.168, + "step": 36886 + }, + { + "epoch": 0.32019687329103047, + "grad_norm": 0.271484375, + "learning_rate": 0.0014372256522026693, + "loss": 
0.1221, + "step": 36887 + }, + { + "epoch": 0.3202055537712346, + "grad_norm": 0.1591796875, + "learning_rate": 0.0014371980578141933, + "loss": 0.0791, + "step": 36888 + }, + { + "epoch": 0.3202142342514388, + "grad_norm": 0.2060546875, + "learning_rate": 0.0014371704630569623, + "loss": 0.1309, + "step": 36889 + }, + { + "epoch": 0.32022291473164294, + "grad_norm": 0.435546875, + "learning_rate": 0.001437142867931007, + "loss": 0.126, + "step": 36890 + }, + { + "epoch": 0.32023159521184713, + "grad_norm": 1.7578125, + "learning_rate": 0.0014371152724363574, + "loss": 0.2266, + "step": 36891 + }, + { + "epoch": 0.32024027569205127, + "grad_norm": 0.1572265625, + "learning_rate": 0.0014370876765730433, + "loss": 0.0645, + "step": 36892 + }, + { + "epoch": 0.32024895617225546, + "grad_norm": 0.1259765625, + "learning_rate": 0.0014370600803410953, + "loss": 0.0908, + "step": 36893 + }, + { + "epoch": 0.3202576366524596, + "grad_norm": 0.12890625, + "learning_rate": 0.0014370324837405435, + "loss": 0.0933, + "step": 36894 + }, + { + "epoch": 0.3202663171326638, + "grad_norm": 0.33984375, + "learning_rate": 0.001437004886771418, + "loss": 0.0942, + "step": 36895 + }, + { + "epoch": 0.32027499761286793, + "grad_norm": 0.1240234375, + "learning_rate": 0.0014369772894337492, + "loss": 0.1299, + "step": 36896 + }, + { + "epoch": 0.3202836780930721, + "grad_norm": 0.11474609375, + "learning_rate": 0.0014369496917275668, + "loss": 0.0977, + "step": 36897 + }, + { + "epoch": 0.32029235857327626, + "grad_norm": 0.0771484375, + "learning_rate": 0.0014369220936529014, + "loss": 0.0703, + "step": 36898 + }, + { + "epoch": 0.32030103905348045, + "grad_norm": 0.1875, + "learning_rate": 0.0014368944952097833, + "loss": 0.0762, + "step": 36899 + }, + { + "epoch": 0.3203097195336846, + "grad_norm": 0.2421875, + "learning_rate": 0.001436866896398242, + "loss": 0.0894, + "step": 36900 + }, + { + "epoch": 0.3203184000138888, + "grad_norm": 0.46484375, + "learning_rate": 0.0014368392972183082, + "loss": 0.1133, + "step": 36901 + }, + { + "epoch": 0.3203270804940929, + "grad_norm": 0.28515625, + "learning_rate": 0.0014368116976700123, + "loss": 0.0938, + "step": 36902 + }, + { + "epoch": 0.3203357609742971, + "grad_norm": 0.1064453125, + "learning_rate": 0.0014367840977533841, + "loss": 0.0923, + "step": 36903 + }, + { + "epoch": 0.32034444145450125, + "grad_norm": 0.1455078125, + "learning_rate": 0.001436756497468454, + "loss": 0.1094, + "step": 36904 + }, + { + "epoch": 0.32035312193470544, + "grad_norm": 0.267578125, + "learning_rate": 0.0014367288968152516, + "loss": 0.0918, + "step": 36905 + }, + { + "epoch": 0.3203618024149096, + "grad_norm": 0.1416015625, + "learning_rate": 0.001436701295793808, + "loss": 0.1182, + "step": 36906 + }, + { + "epoch": 0.3203704828951138, + "grad_norm": 0.34375, + "learning_rate": 0.0014366736944041527, + "loss": 0.0908, + "step": 36907 + }, + { + "epoch": 0.3203791633753179, + "grad_norm": 0.2060546875, + "learning_rate": 0.0014366460926463164, + "loss": 0.1025, + "step": 36908 + }, + { + "epoch": 0.3203878438555221, + "grad_norm": 0.134765625, + "learning_rate": 0.0014366184905203283, + "loss": 0.1104, + "step": 36909 + }, + { + "epoch": 0.32039652433572624, + "grad_norm": 0.609375, + "learning_rate": 0.0014365908880262196, + "loss": 0.0747, + "step": 36910 + }, + { + "epoch": 0.32040520481593043, + "grad_norm": 0.1748046875, + "learning_rate": 0.0014365632851640203, + "loss": 0.124, + "step": 36911 + }, + { + "epoch": 0.32041388529613457, + "grad_norm": 0.255859375, + 
"learning_rate": 0.0014365356819337604, + "loss": 0.1426, + "step": 36912 + }, + { + "epoch": 0.32042256577633876, + "grad_norm": 0.8125, + "learning_rate": 0.0014365080783354705, + "loss": 0.1484, + "step": 36913 + }, + { + "epoch": 0.3204312462565429, + "grad_norm": 0.1611328125, + "learning_rate": 0.00143648047436918, + "loss": 0.1348, + "step": 36914 + }, + { + "epoch": 0.3204399267367471, + "grad_norm": 0.9453125, + "learning_rate": 0.0014364528700349195, + "loss": 0.1182, + "step": 36915 + }, + { + "epoch": 0.32044860721695123, + "grad_norm": 0.1728515625, + "learning_rate": 0.0014364252653327192, + "loss": 0.0962, + "step": 36916 + }, + { + "epoch": 0.3204572876971554, + "grad_norm": 0.3359375, + "learning_rate": 0.0014363976602626094, + "loss": 0.0728, + "step": 36917 + }, + { + "epoch": 0.32046596817735956, + "grad_norm": 0.205078125, + "learning_rate": 0.00143637005482462, + "loss": 0.085, + "step": 36918 + }, + { + "epoch": 0.32047464865756375, + "grad_norm": 0.314453125, + "learning_rate": 0.0014363424490187815, + "loss": 0.0859, + "step": 36919 + }, + { + "epoch": 0.3204833291377679, + "grad_norm": 0.1318359375, + "learning_rate": 0.001436314842845124, + "loss": 0.1387, + "step": 36920 + }, + { + "epoch": 0.3204920096179721, + "grad_norm": 0.5625, + "learning_rate": 0.0014362872363036776, + "loss": 0.1211, + "step": 36921 + }, + { + "epoch": 0.3205006900981762, + "grad_norm": 0.2177734375, + "learning_rate": 0.0014362596293944726, + "loss": 0.0928, + "step": 36922 + }, + { + "epoch": 0.3205093705783804, + "grad_norm": 0.08447265625, + "learning_rate": 0.0014362320221175389, + "loss": 0.104, + "step": 36923 + }, + { + "epoch": 0.32051805105858455, + "grad_norm": 0.2275390625, + "learning_rate": 0.0014362044144729071, + "loss": 0.1113, + "step": 36924 + }, + { + "epoch": 0.32052673153878875, + "grad_norm": 0.466796875, + "learning_rate": 0.0014361768064606072, + "loss": 0.0967, + "step": 36925 + }, + { + "epoch": 0.3205354120189929, + "grad_norm": 0.1513671875, + "learning_rate": 0.0014361491980806693, + "loss": 0.1211, + "step": 36926 + }, + { + "epoch": 0.3205440924991971, + "grad_norm": 0.1298828125, + "learning_rate": 0.0014361215893331236, + "loss": 0.1152, + "step": 36927 + }, + { + "epoch": 0.3205527729794012, + "grad_norm": 0.134765625, + "learning_rate": 0.0014360939802180008, + "loss": 0.084, + "step": 36928 + }, + { + "epoch": 0.3205614534596054, + "grad_norm": 0.2353515625, + "learning_rate": 0.0014360663707353304, + "loss": 0.084, + "step": 36929 + }, + { + "epoch": 0.32057013393980954, + "grad_norm": 1.3046875, + "learning_rate": 0.0014360387608851426, + "loss": 0.1621, + "step": 36930 + }, + { + "epoch": 0.32057881442001374, + "grad_norm": 0.134765625, + "learning_rate": 0.0014360111506674681, + "loss": 0.085, + "step": 36931 + }, + { + "epoch": 0.3205874949002179, + "grad_norm": 0.298828125, + "learning_rate": 0.001435983540082337, + "loss": 0.0859, + "step": 36932 + }, + { + "epoch": 0.32059617538042207, + "grad_norm": 0.1103515625, + "learning_rate": 0.0014359559291297792, + "loss": 0.1133, + "step": 36933 + }, + { + "epoch": 0.3206048558606262, + "grad_norm": 1.1328125, + "learning_rate": 0.0014359283178098253, + "loss": 0.1299, + "step": 36934 + }, + { + "epoch": 0.3206135363408304, + "grad_norm": 0.0859375, + "learning_rate": 0.0014359007061225048, + "loss": 0.0884, + "step": 36935 + }, + { + "epoch": 0.32062221682103453, + "grad_norm": 0.78125, + "learning_rate": 0.0014358730940678484, + "loss": 0.1074, + "step": 36936 + }, + { + "epoch": 
0.3206308973012387, + "grad_norm": 0.51171875, + "learning_rate": 0.0014358454816458866, + "loss": 0.1084, + "step": 36937 + }, + { + "epoch": 0.32063957778144286, + "grad_norm": 0.51953125, + "learning_rate": 0.0014358178688566488, + "loss": 0.0962, + "step": 36938 + }, + { + "epoch": 0.32064825826164706, + "grad_norm": 0.154296875, + "learning_rate": 0.0014357902557001658, + "loss": 0.0742, + "step": 36939 + }, + { + "epoch": 0.3206569387418512, + "grad_norm": 0.1611328125, + "learning_rate": 0.0014357626421764675, + "loss": 0.0967, + "step": 36940 + }, + { + "epoch": 0.3206656192220554, + "grad_norm": 0.421875, + "learning_rate": 0.0014357350282855846, + "loss": 0.1035, + "step": 36941 + }, + { + "epoch": 0.3206742997022595, + "grad_norm": 0.126953125, + "learning_rate": 0.0014357074140275469, + "loss": 0.1064, + "step": 36942 + }, + { + "epoch": 0.3206829801824637, + "grad_norm": 0.259765625, + "learning_rate": 0.0014356797994023842, + "loss": 0.0781, + "step": 36943 + }, + { + "epoch": 0.32069166066266785, + "grad_norm": 0.1220703125, + "learning_rate": 0.0014356521844101273, + "loss": 0.0874, + "step": 36944 + }, + { + "epoch": 0.32070034114287205, + "grad_norm": 0.154296875, + "learning_rate": 0.0014356245690508062, + "loss": 0.0967, + "step": 36945 + }, + { + "epoch": 0.3207090216230762, + "grad_norm": 0.2392578125, + "learning_rate": 0.0014355969533244512, + "loss": 0.1079, + "step": 36946 + }, + { + "epoch": 0.3207177021032804, + "grad_norm": 0.091796875, + "learning_rate": 0.0014355693372310925, + "loss": 0.0918, + "step": 36947 + }, + { + "epoch": 0.3207263825834845, + "grad_norm": 0.8671875, + "learning_rate": 0.00143554172077076, + "loss": 0.0938, + "step": 36948 + }, + { + "epoch": 0.3207350630636887, + "grad_norm": 0.158203125, + "learning_rate": 0.001435514103943484, + "loss": 0.1328, + "step": 36949 + }, + { + "epoch": 0.32074374354389285, + "grad_norm": 0.08251953125, + "learning_rate": 0.0014354864867492949, + "loss": 0.0903, + "step": 36950 + }, + { + "epoch": 0.32075242402409704, + "grad_norm": 0.1728515625, + "learning_rate": 0.001435458869188223, + "loss": 0.1299, + "step": 36951 + }, + { + "epoch": 0.3207611045043012, + "grad_norm": 0.291015625, + "learning_rate": 0.0014354312512602982, + "loss": 0.0884, + "step": 36952 + }, + { + "epoch": 0.32076978498450537, + "grad_norm": 0.201171875, + "learning_rate": 0.0014354036329655509, + "loss": 0.0903, + "step": 36953 + }, + { + "epoch": 0.3207784654647095, + "grad_norm": 0.08056640625, + "learning_rate": 0.0014353760143040112, + "loss": 0.0884, + "step": 36954 + }, + { + "epoch": 0.3207871459449137, + "grad_norm": 0.4296875, + "learning_rate": 0.0014353483952757094, + "loss": 0.1289, + "step": 36955 + }, + { + "epoch": 0.32079582642511784, + "grad_norm": 0.1884765625, + "learning_rate": 0.0014353207758806756, + "loss": 0.1621, + "step": 36956 + }, + { + "epoch": 0.32080450690532203, + "grad_norm": 0.55078125, + "learning_rate": 0.0014352931561189399, + "loss": 0.0806, + "step": 36957 + }, + { + "epoch": 0.32081318738552617, + "grad_norm": 0.345703125, + "learning_rate": 0.0014352655359905324, + "loss": 0.1328, + "step": 36958 + }, + { + "epoch": 0.32082186786573036, + "grad_norm": 0.51171875, + "learning_rate": 0.0014352379154954841, + "loss": 0.1104, + "step": 36959 + }, + { + "epoch": 0.3208305483459345, + "grad_norm": 0.296875, + "learning_rate": 0.0014352102946338246, + "loss": 0.1021, + "step": 36960 + }, + { + "epoch": 0.3208392288261387, + "grad_norm": 0.2265625, + "learning_rate": 0.0014351826734055838, + 
"loss": 0.1719, + "step": 36961 + }, + { + "epoch": 0.3208479093063428, + "grad_norm": 0.455078125, + "learning_rate": 0.0014351550518107924, + "loss": 0.1191, + "step": 36962 + }, + { + "epoch": 0.32085658978654696, + "grad_norm": 0.201171875, + "learning_rate": 0.0014351274298494807, + "loss": 0.0996, + "step": 36963 + }, + { + "epoch": 0.32086527026675116, + "grad_norm": 0.17578125, + "learning_rate": 0.0014350998075216787, + "loss": 0.0835, + "step": 36964 + }, + { + "epoch": 0.3208739507469553, + "grad_norm": 0.1396484375, + "learning_rate": 0.001435072184827416, + "loss": 0.1016, + "step": 36965 + }, + { + "epoch": 0.3208826312271595, + "grad_norm": 0.8359375, + "learning_rate": 0.0014350445617667238, + "loss": 0.0791, + "step": 36966 + }, + { + "epoch": 0.3208913117073636, + "grad_norm": 0.12060546875, + "learning_rate": 0.0014350169383396318, + "loss": 0.1396, + "step": 36967 + }, + { + "epoch": 0.3208999921875678, + "grad_norm": 0.259765625, + "learning_rate": 0.0014349893145461708, + "loss": 0.124, + "step": 36968 + }, + { + "epoch": 0.32090867266777195, + "grad_norm": 1.046875, + "learning_rate": 0.0014349616903863698, + "loss": 0.1001, + "step": 36969 + }, + { + "epoch": 0.32091735314797615, + "grad_norm": 1.0625, + "learning_rate": 0.00143493406586026, + "loss": 0.0854, + "step": 36970 + }, + { + "epoch": 0.3209260336281803, + "grad_norm": 0.5234375, + "learning_rate": 0.0014349064409678713, + "loss": 0.1021, + "step": 36971 + }, + { + "epoch": 0.3209347141083845, + "grad_norm": 0.1767578125, + "learning_rate": 0.0014348788157092342, + "loss": 0.1016, + "step": 36972 + }, + { + "epoch": 0.3209433945885886, + "grad_norm": 0.1796875, + "learning_rate": 0.0014348511900843786, + "loss": 0.123, + "step": 36973 + }, + { + "epoch": 0.3209520750687928, + "grad_norm": 0.248046875, + "learning_rate": 0.0014348235640933345, + "loss": 0.126, + "step": 36974 + }, + { + "epoch": 0.32096075554899695, + "grad_norm": 0.10546875, + "learning_rate": 0.0014347959377361327, + "loss": 0.0781, + "step": 36975 + }, + { + "epoch": 0.32096943602920114, + "grad_norm": 0.10302734375, + "learning_rate": 0.0014347683110128029, + "loss": 0.1138, + "step": 36976 + }, + { + "epoch": 0.3209781165094053, + "grad_norm": 0.427734375, + "learning_rate": 0.0014347406839233755, + "loss": 0.0957, + "step": 36977 + }, + { + "epoch": 0.32098679698960947, + "grad_norm": 0.21484375, + "learning_rate": 0.0014347130564678807, + "loss": 0.0854, + "step": 36978 + }, + { + "epoch": 0.3209954774698136, + "grad_norm": 0.21484375, + "learning_rate": 0.0014346854286463489, + "loss": 0.0835, + "step": 36979 + }, + { + "epoch": 0.3210041579500178, + "grad_norm": 0.287109375, + "learning_rate": 0.0014346578004588097, + "loss": 0.1543, + "step": 36980 + }, + { + "epoch": 0.32101283843022194, + "grad_norm": 0.61328125, + "learning_rate": 0.0014346301719052944, + "loss": 0.1445, + "step": 36981 + }, + { + "epoch": 0.32102151891042613, + "grad_norm": 0.6328125, + "learning_rate": 0.001434602542985832, + "loss": 0.0947, + "step": 36982 + }, + { + "epoch": 0.32103019939063027, + "grad_norm": 0.19921875, + "learning_rate": 0.0014345749137004537, + "loss": 0.0952, + "step": 36983 + }, + { + "epoch": 0.32103887987083446, + "grad_norm": 0.203125, + "learning_rate": 0.0014345472840491893, + "loss": 0.0869, + "step": 36984 + }, + { + "epoch": 0.3210475603510386, + "grad_norm": 0.34765625, + "learning_rate": 0.001434519654032069, + "loss": 0.0938, + "step": 36985 + }, + { + "epoch": 0.3210562408312428, + "grad_norm": 0.3984375, + 
"learning_rate": 0.0014344920236491225, + "loss": 0.1406, + "step": 36986 + }, + { + "epoch": 0.3210649213114469, + "grad_norm": 0.466796875, + "learning_rate": 0.0014344643929003807, + "loss": 0.0903, + "step": 36987 + }, + { + "epoch": 0.3210736017916511, + "grad_norm": 0.1083984375, + "learning_rate": 0.001434436761785874, + "loss": 0.082, + "step": 36988 + }, + { + "epoch": 0.32108228227185526, + "grad_norm": 0.06689453125, + "learning_rate": 0.0014344091303056324, + "loss": 0.085, + "step": 36989 + }, + { + "epoch": 0.32109096275205945, + "grad_norm": 0.5, + "learning_rate": 0.0014343814984596858, + "loss": 0.1074, + "step": 36990 + }, + { + "epoch": 0.3210996432322636, + "grad_norm": 0.32421875, + "learning_rate": 0.0014343538662480646, + "loss": 0.0718, + "step": 36991 + }, + { + "epoch": 0.3211083237124678, + "grad_norm": 0.255859375, + "learning_rate": 0.001434326233670799, + "loss": 0.1641, + "step": 36992 + }, + { + "epoch": 0.3211170041926719, + "grad_norm": 0.06689453125, + "learning_rate": 0.0014342986007279193, + "loss": 0.0811, + "step": 36993 + }, + { + "epoch": 0.3211256846728761, + "grad_norm": 0.50390625, + "learning_rate": 0.0014342709674194558, + "loss": 0.1094, + "step": 36994 + }, + { + "epoch": 0.32113436515308025, + "grad_norm": 0.1689453125, + "learning_rate": 0.0014342433337454384, + "loss": 0.0791, + "step": 36995 + }, + { + "epoch": 0.32114304563328444, + "grad_norm": 0.2021484375, + "learning_rate": 0.0014342156997058976, + "loss": 0.1191, + "step": 36996 + }, + { + "epoch": 0.3211517261134886, + "grad_norm": 0.38671875, + "learning_rate": 0.0014341880653008631, + "loss": 0.0752, + "step": 36997 + }, + { + "epoch": 0.32116040659369277, + "grad_norm": 0.1083984375, + "learning_rate": 0.0014341604305303664, + "loss": 0.0923, + "step": 36998 + }, + { + "epoch": 0.3211690870738969, + "grad_norm": 0.7421875, + "learning_rate": 0.0014341327953944364, + "loss": 0.1504, + "step": 36999 + }, + { + "epoch": 0.3211777675541011, + "grad_norm": 0.392578125, + "learning_rate": 0.0014341051598931037, + "loss": 0.0938, + "step": 37000 + }, + { + "epoch": 0.32118644803430524, + "grad_norm": 0.3125, + "learning_rate": 0.0014340775240263986, + "loss": 0.0947, + "step": 37001 + }, + { + "epoch": 0.32119512851450943, + "grad_norm": 0.3203125, + "learning_rate": 0.0014340498877943514, + "loss": 0.1328, + "step": 37002 + }, + { + "epoch": 0.32120380899471357, + "grad_norm": 0.1982421875, + "learning_rate": 0.0014340222511969928, + "loss": 0.126, + "step": 37003 + }, + { + "epoch": 0.32121248947491776, + "grad_norm": 0.671875, + "learning_rate": 0.001433994614234352, + "loss": 0.1348, + "step": 37004 + }, + { + "epoch": 0.3212211699551219, + "grad_norm": 0.91796875, + "learning_rate": 0.0014339669769064596, + "loss": 0.1133, + "step": 37005 + }, + { + "epoch": 0.3212298504353261, + "grad_norm": 0.10009765625, + "learning_rate": 0.001433939339213346, + "loss": 0.1011, + "step": 37006 + }, + { + "epoch": 0.32123853091553023, + "grad_norm": 0.1240234375, + "learning_rate": 0.0014339117011550413, + "loss": 0.0688, + "step": 37007 + }, + { + "epoch": 0.3212472113957344, + "grad_norm": 0.3046875, + "learning_rate": 0.001433884062731576, + "loss": 0.0967, + "step": 37008 + }, + { + "epoch": 0.32125589187593856, + "grad_norm": 0.0947265625, + "learning_rate": 0.00143385642394298, + "loss": 0.0791, + "step": 37009 + }, + { + "epoch": 0.32126457235614275, + "grad_norm": 0.421875, + "learning_rate": 0.0014338287847892835, + "loss": 0.0889, + "step": 37010 + }, + { + "epoch": 
0.3212732528363469, + "grad_norm": 0.5390625, + "learning_rate": 0.0014338011452705171, + "loss": 0.0967, + "step": 37011 + }, + { + "epoch": 0.3212819333165511, + "grad_norm": 0.328125, + "learning_rate": 0.001433773505386711, + "loss": 0.0757, + "step": 37012 + }, + { + "epoch": 0.3212906137967552, + "grad_norm": 0.1806640625, + "learning_rate": 0.0014337458651378944, + "loss": 0.1138, + "step": 37013 + }, + { + "epoch": 0.3212992942769594, + "grad_norm": 0.47265625, + "learning_rate": 0.0014337182245240992, + "loss": 0.1211, + "step": 37014 + }, + { + "epoch": 0.32130797475716355, + "grad_norm": 0.109375, + "learning_rate": 0.001433690583545354, + "loss": 0.1123, + "step": 37015 + }, + { + "epoch": 0.32131665523736774, + "grad_norm": 0.2734375, + "learning_rate": 0.0014336629422016902, + "loss": 0.1074, + "step": 37016 + }, + { + "epoch": 0.3213253357175719, + "grad_norm": 0.19921875, + "learning_rate": 0.001433635300493138, + "loss": 0.103, + "step": 37017 + }, + { + "epoch": 0.3213340161977761, + "grad_norm": 0.53125, + "learning_rate": 0.0014336076584197264, + "loss": 0.1172, + "step": 37018 + }, + { + "epoch": 0.3213426966779802, + "grad_norm": 0.1669921875, + "learning_rate": 0.0014335800159814875, + "loss": 0.0825, + "step": 37019 + }, + { + "epoch": 0.3213513771581844, + "grad_norm": 0.9296875, + "learning_rate": 0.0014335523731784496, + "loss": 0.1143, + "step": 37020 + }, + { + "epoch": 0.32136005763838854, + "grad_norm": 0.11474609375, + "learning_rate": 0.0014335247300106442, + "loss": 0.0957, + "step": 37021 + }, + { + "epoch": 0.32136873811859273, + "grad_norm": 0.37109375, + "learning_rate": 0.0014334970864781009, + "loss": 0.1738, + "step": 37022 + }, + { + "epoch": 0.32137741859879687, + "grad_norm": 0.2275390625, + "learning_rate": 0.0014334694425808508, + "loss": 0.1006, + "step": 37023 + }, + { + "epoch": 0.32138609907900106, + "grad_norm": 0.4296875, + "learning_rate": 0.0014334417983189233, + "loss": 0.1006, + "step": 37024 + }, + { + "epoch": 0.3213947795592052, + "grad_norm": 0.69140625, + "learning_rate": 0.0014334141536923484, + "loss": 0.1055, + "step": 37025 + }, + { + "epoch": 0.3214034600394094, + "grad_norm": 0.1904296875, + "learning_rate": 0.001433386508701157, + "loss": 0.085, + "step": 37026 + }, + { + "epoch": 0.32141214051961353, + "grad_norm": 0.1953125, + "learning_rate": 0.0014333588633453794, + "loss": 0.1162, + "step": 37027 + }, + { + "epoch": 0.3214208209998177, + "grad_norm": 0.515625, + "learning_rate": 0.0014333312176250453, + "loss": 0.1143, + "step": 37028 + }, + { + "epoch": 0.32142950148002186, + "grad_norm": 0.44921875, + "learning_rate": 0.0014333035715401851, + "loss": 0.0933, + "step": 37029 + }, + { + "epoch": 0.32143818196022605, + "grad_norm": 0.06494140625, + "learning_rate": 0.0014332759250908296, + "loss": 0.0752, + "step": 37030 + }, + { + "epoch": 0.3214468624404302, + "grad_norm": 0.4296875, + "learning_rate": 0.0014332482782770082, + "loss": 0.0977, + "step": 37031 + }, + { + "epoch": 0.3214555429206344, + "grad_norm": 0.6015625, + "learning_rate": 0.0014332206310987516, + "loss": 0.1074, + "step": 37032 + }, + { + "epoch": 0.3214642234008385, + "grad_norm": 0.29296875, + "learning_rate": 0.0014331929835560899, + "loss": 0.0854, + "step": 37033 + }, + { + "epoch": 0.3214729038810427, + "grad_norm": 0.3671875, + "learning_rate": 0.001433165335649053, + "loss": 0.0947, + "step": 37034 + }, + { + "epoch": 0.32148158436124685, + "grad_norm": 0.21875, + "learning_rate": 0.0014331376873776718, + "loss": 0.1152, + "step": 37035 + 
}, + { + "epoch": 0.32149026484145105, + "grad_norm": 0.248046875, + "learning_rate": 0.0014331100387419765, + "loss": 0.0757, + "step": 37036 + }, + { + "epoch": 0.3214989453216552, + "grad_norm": 0.6875, + "learning_rate": 0.0014330823897419967, + "loss": 0.1001, + "step": 37037 + }, + { + "epoch": 0.3215076258018594, + "grad_norm": 0.0830078125, + "learning_rate": 0.0014330547403777632, + "loss": 0.1084, + "step": 37038 + }, + { + "epoch": 0.3215163062820635, + "grad_norm": 0.13671875, + "learning_rate": 0.001433027090649306, + "loss": 0.1001, + "step": 37039 + }, + { + "epoch": 0.3215249867622677, + "grad_norm": 0.1572265625, + "learning_rate": 0.0014329994405566553, + "loss": 0.1338, + "step": 37040 + }, + { + "epoch": 0.32153366724247184, + "grad_norm": 0.435546875, + "learning_rate": 0.0014329717900998416, + "loss": 0.1172, + "step": 37041 + }, + { + "epoch": 0.32154234772267604, + "grad_norm": 0.296875, + "learning_rate": 0.0014329441392788946, + "loss": 0.1055, + "step": 37042 + }, + { + "epoch": 0.3215510282028802, + "grad_norm": 0.146484375, + "learning_rate": 0.0014329164880938453, + "loss": 0.1177, + "step": 37043 + }, + { + "epoch": 0.32155970868308437, + "grad_norm": 0.23828125, + "learning_rate": 0.0014328888365447232, + "loss": 0.083, + "step": 37044 + }, + { + "epoch": 0.3215683891632885, + "grad_norm": 0.5078125, + "learning_rate": 0.0014328611846315589, + "loss": 0.0869, + "step": 37045 + }, + { + "epoch": 0.3215770696434927, + "grad_norm": 0.1943359375, + "learning_rate": 0.001432833532354383, + "loss": 0.1152, + "step": 37046 + }, + { + "epoch": 0.32158575012369683, + "grad_norm": 0.158203125, + "learning_rate": 0.001432805879713225, + "loss": 0.1445, + "step": 37047 + }, + { + "epoch": 0.321594430603901, + "grad_norm": 2.0, + "learning_rate": 0.0014327782267081154, + "loss": 0.1699, + "step": 37048 + }, + { + "epoch": 0.32160311108410516, + "grad_norm": 0.2890625, + "learning_rate": 0.0014327505733390848, + "loss": 0.0967, + "step": 37049 + }, + { + "epoch": 0.32161179156430936, + "grad_norm": 0.25390625, + "learning_rate": 0.0014327229196061631, + "loss": 0.0767, + "step": 37050 + }, + { + "epoch": 0.3216204720445135, + "grad_norm": 0.06640625, + "learning_rate": 0.0014326952655093806, + "loss": 0.0723, + "step": 37051 + }, + { + "epoch": 0.3216291525247177, + "grad_norm": 0.0869140625, + "learning_rate": 0.0014326676110487672, + "loss": 0.0884, + "step": 37052 + }, + { + "epoch": 0.3216378330049218, + "grad_norm": 0.3984375, + "learning_rate": 0.001432639956224354, + "loss": 0.085, + "step": 37053 + }, + { + "epoch": 0.321646513485126, + "grad_norm": 0.83984375, + "learning_rate": 0.0014326123010361704, + "loss": 0.0947, + "step": 37054 + }, + { + "epoch": 0.32165519396533016, + "grad_norm": 0.28515625, + "learning_rate": 0.0014325846454842474, + "loss": 0.1191, + "step": 37055 + }, + { + "epoch": 0.32166387444553435, + "grad_norm": 0.1533203125, + "learning_rate": 0.0014325569895686143, + "loss": 0.0811, + "step": 37056 + }, + { + "epoch": 0.3216725549257385, + "grad_norm": 0.34765625, + "learning_rate": 0.0014325293332893023, + "loss": 0.1113, + "step": 37057 + }, + { + "epoch": 0.3216812354059427, + "grad_norm": 0.1162109375, + "learning_rate": 0.001432501676646341, + "loss": 0.1426, + "step": 37058 + }, + { + "epoch": 0.3216899158861468, + "grad_norm": 0.376953125, + "learning_rate": 0.001432474019639761, + "loss": 0.1167, + "step": 37059 + }, + { + "epoch": 0.321698596366351, + "grad_norm": 0.158203125, + "learning_rate": 0.0014324463622695926, + "loss": 
0.1113, + "step": 37060 + }, + { + "epoch": 0.32170727684655515, + "grad_norm": 0.16796875, + "learning_rate": 0.0014324187045358654, + "loss": 0.1133, + "step": 37061 + }, + { + "epoch": 0.32171595732675934, + "grad_norm": 0.474609375, + "learning_rate": 0.0014323910464386102, + "loss": 0.0957, + "step": 37062 + }, + { + "epoch": 0.3217246378069635, + "grad_norm": 0.62890625, + "learning_rate": 0.0014323633879778574, + "loss": 0.1138, + "step": 37063 + }, + { + "epoch": 0.32173331828716767, + "grad_norm": 0.14453125, + "learning_rate": 0.0014323357291536368, + "loss": 0.0928, + "step": 37064 + }, + { + "epoch": 0.3217419987673718, + "grad_norm": 0.10009765625, + "learning_rate": 0.0014323080699659785, + "loss": 0.1069, + "step": 37065 + }, + { + "epoch": 0.321750679247576, + "grad_norm": 0.1826171875, + "learning_rate": 0.0014322804104149136, + "loss": 0.0864, + "step": 37066 + }, + { + "epoch": 0.32175935972778014, + "grad_norm": 0.1103515625, + "learning_rate": 0.0014322527505004718, + "loss": 0.1235, + "step": 37067 + }, + { + "epoch": 0.32176804020798433, + "grad_norm": 0.283203125, + "learning_rate": 0.0014322250902226834, + "loss": 0.0786, + "step": 37068 + }, + { + "epoch": 0.32177672068818847, + "grad_norm": 0.7109375, + "learning_rate": 0.0014321974295815783, + "loss": 0.0942, + "step": 37069 + }, + { + "epoch": 0.32178540116839266, + "grad_norm": 0.1181640625, + "learning_rate": 0.0014321697685771872, + "loss": 0.1104, + "step": 37070 + }, + { + "epoch": 0.3217940816485968, + "grad_norm": 0.1962890625, + "learning_rate": 0.0014321421072095407, + "loss": 0.0732, + "step": 37071 + }, + { + "epoch": 0.321802762128801, + "grad_norm": 0.26171875, + "learning_rate": 0.001432114445478668, + "loss": 0.1035, + "step": 37072 + }, + { + "epoch": 0.3218114426090051, + "grad_norm": 0.14453125, + "learning_rate": 0.0014320867833846, + "loss": 0.1123, + "step": 37073 + }, + { + "epoch": 0.3218201230892093, + "grad_norm": 0.08349609375, + "learning_rate": 0.001432059120927367, + "loss": 0.103, + "step": 37074 + }, + { + "epoch": 0.32182880356941346, + "grad_norm": 0.212890625, + "learning_rate": 0.0014320314581069992, + "loss": 0.0679, + "step": 37075 + }, + { + "epoch": 0.32183748404961765, + "grad_norm": 0.15625, + "learning_rate": 0.001432003794923527, + "loss": 0.1094, + "step": 37076 + }, + { + "epoch": 0.3218461645298218, + "grad_norm": 0.306640625, + "learning_rate": 0.0014319761313769804, + "loss": 0.1201, + "step": 37077 + }, + { + "epoch": 0.321854845010026, + "grad_norm": 0.24609375, + "learning_rate": 0.0014319484674673892, + "loss": 0.123, + "step": 37078 + }, + { + "epoch": 0.3218635254902301, + "grad_norm": 0.490234375, + "learning_rate": 0.0014319208031947845, + "loss": 0.1094, + "step": 37079 + }, + { + "epoch": 0.3218722059704343, + "grad_norm": 0.193359375, + "learning_rate": 0.0014318931385591963, + "loss": 0.0918, + "step": 37080 + }, + { + "epoch": 0.32188088645063845, + "grad_norm": 0.423828125, + "learning_rate": 0.0014318654735606545, + "loss": 0.1016, + "step": 37081 + }, + { + "epoch": 0.32188956693084264, + "grad_norm": 0.37109375, + "learning_rate": 0.0014318378081991895, + "loss": 0.1074, + "step": 37082 + }, + { + "epoch": 0.3218982474110468, + "grad_norm": 0.09619140625, + "learning_rate": 0.0014318101424748316, + "loss": 0.0835, + "step": 37083 + }, + { + "epoch": 0.32190692789125097, + "grad_norm": 0.29296875, + "learning_rate": 0.0014317824763876116, + "loss": 0.124, + "step": 37084 + }, + { + "epoch": 0.3219156083714551, + "grad_norm": 0.11328125, + 
"learning_rate": 0.001431754809937559, + "loss": 0.0864, + "step": 37085 + }, + { + "epoch": 0.3219242888516593, + "grad_norm": 0.2890625, + "learning_rate": 0.0014317271431247044, + "loss": 0.083, + "step": 37086 + }, + { + "epoch": 0.32193296933186344, + "grad_norm": 0.2197265625, + "learning_rate": 0.0014316994759490777, + "loss": 0.0972, + "step": 37087 + }, + { + "epoch": 0.3219416498120676, + "grad_norm": 0.55078125, + "learning_rate": 0.00143167180841071, + "loss": 0.084, + "step": 37088 + }, + { + "epoch": 0.32195033029227177, + "grad_norm": 0.11376953125, + "learning_rate": 0.0014316441405096303, + "loss": 0.0967, + "step": 37089 + }, + { + "epoch": 0.3219590107724759, + "grad_norm": 0.44921875, + "learning_rate": 0.00143161647224587, + "loss": 0.1348, + "step": 37090 + }, + { + "epoch": 0.3219676912526801, + "grad_norm": 0.388671875, + "learning_rate": 0.0014315888036194589, + "loss": 0.0981, + "step": 37091 + }, + { + "epoch": 0.32197637173288424, + "grad_norm": 0.37890625, + "learning_rate": 0.0014315611346304268, + "loss": 0.0972, + "step": 37092 + }, + { + "epoch": 0.32198505221308843, + "grad_norm": 0.1103515625, + "learning_rate": 0.001431533465278805, + "loss": 0.1025, + "step": 37093 + }, + { + "epoch": 0.32199373269329257, + "grad_norm": 0.396484375, + "learning_rate": 0.001431505795564623, + "loss": 0.0869, + "step": 37094 + }, + { + "epoch": 0.32200241317349676, + "grad_norm": 0.361328125, + "learning_rate": 0.001431478125487911, + "loss": 0.0894, + "step": 37095 + }, + { + "epoch": 0.3220110936537009, + "grad_norm": 0.25390625, + "learning_rate": 0.0014314504550487, + "loss": 0.1045, + "step": 37096 + }, + { + "epoch": 0.3220197741339051, + "grad_norm": 0.37890625, + "learning_rate": 0.0014314227842470194, + "loss": 0.1113, + "step": 37097 + }, + { + "epoch": 0.3220284546141092, + "grad_norm": 0.10498046875, + "learning_rate": 0.0014313951130828998, + "loss": 0.0986, + "step": 37098 + }, + { + "epoch": 0.3220371350943134, + "grad_norm": 0.134765625, + "learning_rate": 0.0014313674415563718, + "loss": 0.0776, + "step": 37099 + }, + { + "epoch": 0.32204581557451756, + "grad_norm": 0.07763671875, + "learning_rate": 0.001431339769667465, + "loss": 0.0728, + "step": 37100 + }, + { + "epoch": 0.32205449605472175, + "grad_norm": 0.1650390625, + "learning_rate": 0.0014313120974162104, + "loss": 0.0957, + "step": 37101 + }, + { + "epoch": 0.3220631765349259, + "grad_norm": 0.69140625, + "learning_rate": 0.0014312844248026374, + "loss": 0.0957, + "step": 37102 + }, + { + "epoch": 0.3220718570151301, + "grad_norm": 0.267578125, + "learning_rate": 0.001431256751826777, + "loss": 0.1045, + "step": 37103 + }, + { + "epoch": 0.3220805374953342, + "grad_norm": 0.185546875, + "learning_rate": 0.0014312290784886587, + "loss": 0.1123, + "step": 37104 + }, + { + "epoch": 0.3220892179755384, + "grad_norm": 0.37890625, + "learning_rate": 0.001431201404788314, + "loss": 0.126, + "step": 37105 + }, + { + "epoch": 0.32209789845574255, + "grad_norm": 0.30078125, + "learning_rate": 0.0014311737307257722, + "loss": 0.1152, + "step": 37106 + }, + { + "epoch": 0.32210657893594674, + "grad_norm": 0.73828125, + "learning_rate": 0.001431146056301064, + "loss": 0.1006, + "step": 37107 + }, + { + "epoch": 0.3221152594161509, + "grad_norm": 0.1982421875, + "learning_rate": 0.0014311183815142188, + "loss": 0.1157, + "step": 37108 + }, + { + "epoch": 0.32212393989635507, + "grad_norm": 0.59765625, + "learning_rate": 0.0014310907063652678, + "loss": 0.1455, + "step": 37109 + }, + { + "epoch": 
0.3221326203765592, + "grad_norm": 0.447265625, + "learning_rate": 0.0014310630308542413, + "loss": 0.1123, + "step": 37110 + }, + { + "epoch": 0.3221413008567634, + "grad_norm": 0.33203125, + "learning_rate": 0.001431035354981169, + "loss": 0.0786, + "step": 37111 + }, + { + "epoch": 0.32214998133696754, + "grad_norm": 0.10107421875, + "learning_rate": 0.0014310076787460812, + "loss": 0.0967, + "step": 37112 + }, + { + "epoch": 0.32215866181717173, + "grad_norm": 0.39453125, + "learning_rate": 0.0014309800021490085, + "loss": 0.0845, + "step": 37113 + }, + { + "epoch": 0.32216734229737587, + "grad_norm": 0.453125, + "learning_rate": 0.0014309523251899812, + "loss": 0.2266, + "step": 37114 + }, + { + "epoch": 0.32217602277758006, + "grad_norm": 0.443359375, + "learning_rate": 0.0014309246478690293, + "loss": 0.1123, + "step": 37115 + }, + { + "epoch": 0.3221847032577842, + "grad_norm": 0.40625, + "learning_rate": 0.0014308969701861834, + "loss": 0.1147, + "step": 37116 + }, + { + "epoch": 0.3221933837379884, + "grad_norm": 0.17578125, + "learning_rate": 0.0014308692921414733, + "loss": 0.1196, + "step": 37117 + }, + { + "epoch": 0.32220206421819253, + "grad_norm": 0.150390625, + "learning_rate": 0.0014308416137349294, + "loss": 0.1196, + "step": 37118 + }, + { + "epoch": 0.3222107446983967, + "grad_norm": 0.142578125, + "learning_rate": 0.0014308139349665822, + "loss": 0.1328, + "step": 37119 + }, + { + "epoch": 0.32221942517860086, + "grad_norm": 0.79296875, + "learning_rate": 0.0014307862558364617, + "loss": 0.1445, + "step": 37120 + }, + { + "epoch": 0.32222810565880505, + "grad_norm": 0.53515625, + "learning_rate": 0.0014307585763445986, + "loss": 0.1553, + "step": 37121 + }, + { + "epoch": 0.3222367861390092, + "grad_norm": 0.33984375, + "learning_rate": 0.0014307308964910228, + "loss": 0.0938, + "step": 37122 + }, + { + "epoch": 0.3222454666192134, + "grad_norm": 0.458984375, + "learning_rate": 0.0014307032162757646, + "loss": 0.085, + "step": 37123 + }, + { + "epoch": 0.3222541470994175, + "grad_norm": 0.2734375, + "learning_rate": 0.0014306755356988546, + "loss": 0.0996, + "step": 37124 + }, + { + "epoch": 0.3222628275796217, + "grad_norm": 0.1005859375, + "learning_rate": 0.0014306478547603226, + "loss": 0.0967, + "step": 37125 + }, + { + "epoch": 0.32227150805982585, + "grad_norm": 0.1337890625, + "learning_rate": 0.001430620173460199, + "loss": 0.1069, + "step": 37126 + }, + { + "epoch": 0.32228018854003004, + "grad_norm": 0.09765625, + "learning_rate": 0.0014305924917985143, + "loss": 0.123, + "step": 37127 + }, + { + "epoch": 0.3222888690202342, + "grad_norm": 0.1552734375, + "learning_rate": 0.0014305648097752985, + "loss": 0.1143, + "step": 37128 + }, + { + "epoch": 0.3222975495004384, + "grad_norm": 0.54296875, + "learning_rate": 0.001430537127390582, + "loss": 0.0986, + "step": 37129 + }, + { + "epoch": 0.3223062299806425, + "grad_norm": 0.2265625, + "learning_rate": 0.001430509444644395, + "loss": 0.1396, + "step": 37130 + }, + { + "epoch": 0.3223149104608467, + "grad_norm": 0.06884765625, + "learning_rate": 0.0014304817615367678, + "loss": 0.0581, + "step": 37131 + }, + { + "epoch": 0.32232359094105084, + "grad_norm": 0.2177734375, + "learning_rate": 0.001430454078067731, + "loss": 0.0747, + "step": 37132 + }, + { + "epoch": 0.32233227142125503, + "grad_norm": 0.1845703125, + "learning_rate": 0.0014304263942373143, + "loss": 0.1045, + "step": 37133 + }, + { + "epoch": 0.32234095190145917, + "grad_norm": 0.08056640625, + "learning_rate": 0.0014303987100455484, + "loss": 
0.0859, + "step": 37134 + }, + { + "epoch": 0.32234963238166336, + "grad_norm": 0.31640625, + "learning_rate": 0.0014303710254924633, + "loss": 0.0884, + "step": 37135 + }, + { + "epoch": 0.3223583128618675, + "grad_norm": 0.181640625, + "learning_rate": 0.0014303433405780898, + "loss": 0.0938, + "step": 37136 + }, + { + "epoch": 0.3223669933420717, + "grad_norm": 0.10986328125, + "learning_rate": 0.0014303156553024576, + "loss": 0.0879, + "step": 37137 + }, + { + "epoch": 0.32237567382227583, + "grad_norm": 0.20703125, + "learning_rate": 0.001430287969665597, + "loss": 0.1084, + "step": 37138 + }, + { + "epoch": 0.32238435430248, + "grad_norm": 0.10595703125, + "learning_rate": 0.0014302602836675385, + "loss": 0.1113, + "step": 37139 + }, + { + "epoch": 0.32239303478268416, + "grad_norm": 0.50390625, + "learning_rate": 0.0014302325973083122, + "loss": 0.0923, + "step": 37140 + }, + { + "epoch": 0.32240171526288836, + "grad_norm": 0.423828125, + "learning_rate": 0.0014302049105879486, + "loss": 0.1289, + "step": 37141 + }, + { + "epoch": 0.3224103957430925, + "grad_norm": 0.059814453125, + "learning_rate": 0.0014301772235064778, + "loss": 0.0825, + "step": 37142 + }, + { + "epoch": 0.3224190762232967, + "grad_norm": 0.1796875, + "learning_rate": 0.0014301495360639304, + "loss": 0.0703, + "step": 37143 + }, + { + "epoch": 0.3224277567035008, + "grad_norm": 0.578125, + "learning_rate": 0.0014301218482603362, + "loss": 0.1143, + "step": 37144 + }, + { + "epoch": 0.322436437183705, + "grad_norm": 0.1396484375, + "learning_rate": 0.0014300941600957258, + "loss": 0.082, + "step": 37145 + }, + { + "epoch": 0.32244511766390915, + "grad_norm": 0.353515625, + "learning_rate": 0.0014300664715701295, + "loss": 0.1235, + "step": 37146 + }, + { + "epoch": 0.32245379814411335, + "grad_norm": 0.5, + "learning_rate": 0.0014300387826835772, + "loss": 0.0884, + "step": 37147 + }, + { + "epoch": 0.3224624786243175, + "grad_norm": 0.1533203125, + "learning_rate": 0.0014300110934360995, + "loss": 0.1309, + "step": 37148 + }, + { + "epoch": 0.3224711591045217, + "grad_norm": 0.10595703125, + "learning_rate": 0.0014299834038277268, + "loss": 0.0684, + "step": 37149 + }, + { + "epoch": 0.3224798395847258, + "grad_norm": 0.451171875, + "learning_rate": 0.0014299557138584891, + "loss": 0.0903, + "step": 37150 + }, + { + "epoch": 0.32248852006493, + "grad_norm": 0.1640625, + "learning_rate": 0.0014299280235284169, + "loss": 0.1006, + "step": 37151 + }, + { + "epoch": 0.32249720054513414, + "grad_norm": 0.09375, + "learning_rate": 0.00142990033283754, + "loss": 0.0986, + "step": 37152 + }, + { + "epoch": 0.32250588102533834, + "grad_norm": 1.0234375, + "learning_rate": 0.0014298726417858894, + "loss": 0.1133, + "step": 37153 + }, + { + "epoch": 0.3225145615055425, + "grad_norm": 0.486328125, + "learning_rate": 0.001429844950373495, + "loss": 0.1084, + "step": 37154 + }, + { + "epoch": 0.32252324198574667, + "grad_norm": 0.1923828125, + "learning_rate": 0.0014298172586003873, + "loss": 0.0776, + "step": 37155 + }, + { + "epoch": 0.3225319224659508, + "grad_norm": 0.578125, + "learning_rate": 0.001429789566466596, + "loss": 0.0859, + "step": 37156 + }, + { + "epoch": 0.322540602946155, + "grad_norm": 0.3359375, + "learning_rate": 0.001429761873972152, + "loss": 0.1436, + "step": 37157 + }, + { + "epoch": 0.32254928342635913, + "grad_norm": 0.259765625, + "learning_rate": 0.0014297341811170853, + "loss": 0.1553, + "step": 37158 + }, + { + "epoch": 0.3225579639065633, + "grad_norm": 0.193359375, + "learning_rate": 
0.001429706487901426, + "loss": 0.1348, + "step": 37159 + }, + { + "epoch": 0.32256664438676746, + "grad_norm": 1.125, + "learning_rate": 0.001429678794325205, + "loss": 0.0854, + "step": 37160 + }, + { + "epoch": 0.32257532486697166, + "grad_norm": 0.1025390625, + "learning_rate": 0.001429651100388452, + "loss": 0.0996, + "step": 37161 + }, + { + "epoch": 0.3225840053471758, + "grad_norm": 0.2216796875, + "learning_rate": 0.0014296234060911979, + "loss": 0.1143, + "step": 37162 + }, + { + "epoch": 0.32259268582738, + "grad_norm": 0.369140625, + "learning_rate": 0.0014295957114334723, + "loss": 0.126, + "step": 37163 + }, + { + "epoch": 0.3226013663075841, + "grad_norm": 0.1962890625, + "learning_rate": 0.001429568016415306, + "loss": 0.0928, + "step": 37164 + }, + { + "epoch": 0.3226100467877883, + "grad_norm": 1.1796875, + "learning_rate": 0.0014295403210367289, + "loss": 0.0928, + "step": 37165 + }, + { + "epoch": 0.32261872726799246, + "grad_norm": 0.45703125, + "learning_rate": 0.0014295126252977714, + "loss": 0.0791, + "step": 37166 + }, + { + "epoch": 0.32262740774819665, + "grad_norm": 0.142578125, + "learning_rate": 0.001429484929198464, + "loss": 0.1143, + "step": 37167 + }, + { + "epoch": 0.3226360882284008, + "grad_norm": 0.2255859375, + "learning_rate": 0.0014294572327388367, + "loss": 0.1108, + "step": 37168 + }, + { + "epoch": 0.322644768708605, + "grad_norm": 0.154296875, + "learning_rate": 0.00142942953591892, + "loss": 0.0957, + "step": 37169 + }, + { + "epoch": 0.3226534491888091, + "grad_norm": 0.1728515625, + "learning_rate": 0.0014294018387387437, + "loss": 0.0889, + "step": 37170 + }, + { + "epoch": 0.3226621296690133, + "grad_norm": 0.37890625, + "learning_rate": 0.001429374141198339, + "loss": 0.1484, + "step": 37171 + }, + { + "epoch": 0.32267081014921745, + "grad_norm": 0.298828125, + "learning_rate": 0.0014293464432977357, + "loss": 0.1211, + "step": 37172 + }, + { + "epoch": 0.32267949062942164, + "grad_norm": 0.353515625, + "learning_rate": 0.0014293187450369637, + "loss": 0.126, + "step": 37173 + }, + { + "epoch": 0.3226881711096258, + "grad_norm": 0.2138671875, + "learning_rate": 0.001429291046416054, + "loss": 0.0859, + "step": 37174 + }, + { + "epoch": 0.32269685158982997, + "grad_norm": 0.353515625, + "learning_rate": 0.0014292633474350365, + "loss": 0.0874, + "step": 37175 + }, + { + "epoch": 0.3227055320700341, + "grad_norm": 0.515625, + "learning_rate": 0.0014292356480939414, + "loss": 0.1187, + "step": 37176 + }, + { + "epoch": 0.3227142125502383, + "grad_norm": 0.1611328125, + "learning_rate": 0.0014292079483927992, + "loss": 0.1182, + "step": 37177 + }, + { + "epoch": 0.32272289303044244, + "grad_norm": 0.3046875, + "learning_rate": 0.0014291802483316404, + "loss": 0.0732, + "step": 37178 + }, + { + "epoch": 0.32273157351064663, + "grad_norm": 0.36328125, + "learning_rate": 0.0014291525479104947, + "loss": 0.0859, + "step": 37179 + }, + { + "epoch": 0.32274025399085077, + "grad_norm": 0.328125, + "learning_rate": 0.001429124847129393, + "loss": 0.1309, + "step": 37180 + }, + { + "epoch": 0.32274893447105496, + "grad_norm": 0.58984375, + "learning_rate": 0.0014290971459883651, + "loss": 0.1348, + "step": 37181 + }, + { + "epoch": 0.3227576149512591, + "grad_norm": 0.1748046875, + "learning_rate": 0.0014290694444874415, + "loss": 0.1064, + "step": 37182 + }, + { + "epoch": 0.3227662954314633, + "grad_norm": 0.31640625, + "learning_rate": 0.0014290417426266525, + "loss": 0.1289, + "step": 37183 + }, + { + "epoch": 0.3227749759116674, + "grad_norm": 
0.109375, + "learning_rate": 0.0014290140404060285, + "loss": 0.1084, + "step": 37184 + }, + { + "epoch": 0.3227836563918716, + "grad_norm": 0.279296875, + "learning_rate": 0.0014289863378255997, + "loss": 0.1089, + "step": 37185 + }, + { + "epoch": 0.32279233687207576, + "grad_norm": 0.201171875, + "learning_rate": 0.001428958634885396, + "loss": 0.1172, + "step": 37186 + }, + { + "epoch": 0.32280101735227995, + "grad_norm": 0.296875, + "learning_rate": 0.0014289309315854487, + "loss": 0.1235, + "step": 37187 + }, + { + "epoch": 0.3228096978324841, + "grad_norm": 0.1474609375, + "learning_rate": 0.001428903227925787, + "loss": 0.0786, + "step": 37188 + }, + { + "epoch": 0.3228183783126883, + "grad_norm": 0.1318359375, + "learning_rate": 0.001428875523906442, + "loss": 0.165, + "step": 37189 + }, + { + "epoch": 0.3228270587928924, + "grad_norm": 0.283203125, + "learning_rate": 0.0014288478195274433, + "loss": 0.0801, + "step": 37190 + }, + { + "epoch": 0.3228357392730966, + "grad_norm": 0.515625, + "learning_rate": 0.0014288201147888218, + "loss": 0.1104, + "step": 37191 + }, + { + "epoch": 0.32284441975330075, + "grad_norm": 0.236328125, + "learning_rate": 0.0014287924096906075, + "loss": 0.1143, + "step": 37192 + }, + { + "epoch": 0.32285310023350494, + "grad_norm": 0.390625, + "learning_rate": 0.0014287647042328308, + "loss": 0.0967, + "step": 37193 + }, + { + "epoch": 0.3228617807137091, + "grad_norm": 0.55078125, + "learning_rate": 0.001428736998415522, + "loss": 0.1299, + "step": 37194 + }, + { + "epoch": 0.32287046119391327, + "grad_norm": 0.203125, + "learning_rate": 0.0014287092922387113, + "loss": 0.0801, + "step": 37195 + }, + { + "epoch": 0.3228791416741174, + "grad_norm": 0.7578125, + "learning_rate": 0.001428681585702429, + "loss": 0.1006, + "step": 37196 + }, + { + "epoch": 0.3228878221543216, + "grad_norm": 0.6328125, + "learning_rate": 0.0014286538788067054, + "loss": 0.0957, + "step": 37197 + }, + { + "epoch": 0.32289650263452574, + "grad_norm": 0.126953125, + "learning_rate": 0.0014286261715515712, + "loss": 0.1177, + "step": 37198 + }, + { + "epoch": 0.32290518311472993, + "grad_norm": 0.49609375, + "learning_rate": 0.001428598463937056, + "loss": 0.0747, + "step": 37199 + }, + { + "epoch": 0.32291386359493407, + "grad_norm": 0.06884765625, + "learning_rate": 0.0014285707559631906, + "loss": 0.0972, + "step": 37200 + }, + { + "epoch": 0.32292254407513826, + "grad_norm": 0.13671875, + "learning_rate": 0.0014285430476300052, + "loss": 0.1064, + "step": 37201 + }, + { + "epoch": 0.3229312245553424, + "grad_norm": 0.125, + "learning_rate": 0.00142851533893753, + "loss": 0.0879, + "step": 37202 + }, + { + "epoch": 0.3229399050355466, + "grad_norm": 0.0888671875, + "learning_rate": 0.0014284876298857954, + "loss": 0.1387, + "step": 37203 + }, + { + "epoch": 0.32294858551575073, + "grad_norm": 1.8671875, + "learning_rate": 0.0014284599204748315, + "loss": 0.1357, + "step": 37204 + }, + { + "epoch": 0.3229572659959549, + "grad_norm": 0.193359375, + "learning_rate": 0.001428432210704669, + "loss": 0.1113, + "step": 37205 + }, + { + "epoch": 0.32296594647615906, + "grad_norm": 0.37109375, + "learning_rate": 0.0014284045005753378, + "loss": 0.127, + "step": 37206 + }, + { + "epoch": 0.32297462695636325, + "grad_norm": 0.71484375, + "learning_rate": 0.0014283767900868684, + "loss": 0.123, + "step": 37207 + }, + { + "epoch": 0.3229833074365674, + "grad_norm": 0.15625, + "learning_rate": 0.0014283490792392913, + "loss": 0.0874, + "step": 37208 + }, + { + "epoch": 0.3229919879167716, 
+ "grad_norm": 0.1884765625, + "learning_rate": 0.0014283213680326362, + "loss": 0.1074, + "step": 37209 + }, + { + "epoch": 0.3230006683969757, + "grad_norm": 0.2275390625, + "learning_rate": 0.0014282936564669342, + "loss": 0.1406, + "step": 37210 + }, + { + "epoch": 0.32300934887717986, + "grad_norm": 0.1728515625, + "learning_rate": 0.001428265944542215, + "loss": 0.0918, + "step": 37211 + }, + { + "epoch": 0.32301802935738405, + "grad_norm": 0.26953125, + "learning_rate": 0.001428238232258509, + "loss": 0.1016, + "step": 37212 + }, + { + "epoch": 0.3230267098375882, + "grad_norm": 0.08056640625, + "learning_rate": 0.0014282105196158465, + "loss": 0.083, + "step": 37213 + }, + { + "epoch": 0.3230353903177924, + "grad_norm": 0.4453125, + "learning_rate": 0.0014281828066142583, + "loss": 0.0996, + "step": 37214 + }, + { + "epoch": 0.3230440707979965, + "grad_norm": 0.5859375, + "learning_rate": 0.0014281550932537745, + "loss": 0.1445, + "step": 37215 + }, + { + "epoch": 0.3230527512782007, + "grad_norm": 0.6015625, + "learning_rate": 0.0014281273795344245, + "loss": 0.1211, + "step": 37216 + }, + { + "epoch": 0.32306143175840485, + "grad_norm": 0.306640625, + "learning_rate": 0.0014280996654562393, + "loss": 0.0918, + "step": 37217 + }, + { + "epoch": 0.32307011223860904, + "grad_norm": 0.333984375, + "learning_rate": 0.0014280719510192496, + "loss": 0.0928, + "step": 37218 + }, + { + "epoch": 0.3230787927188132, + "grad_norm": 0.291015625, + "learning_rate": 0.0014280442362234855, + "loss": 0.0977, + "step": 37219 + }, + { + "epoch": 0.32308747319901737, + "grad_norm": 0.283203125, + "learning_rate": 0.001428016521068977, + "loss": 0.1406, + "step": 37220 + }, + { + "epoch": 0.3230961536792215, + "grad_norm": 0.224609375, + "learning_rate": 0.0014279888055557545, + "loss": 0.1182, + "step": 37221 + }, + { + "epoch": 0.3231048341594257, + "grad_norm": 0.08056640625, + "learning_rate": 0.0014279610896838484, + "loss": 0.0942, + "step": 37222 + }, + { + "epoch": 0.32311351463962984, + "grad_norm": 0.263671875, + "learning_rate": 0.001427933373453289, + "loss": 0.0884, + "step": 37223 + }, + { + "epoch": 0.32312219511983403, + "grad_norm": 0.111328125, + "learning_rate": 0.0014279056568641067, + "loss": 0.0874, + "step": 37224 + }, + { + "epoch": 0.32313087560003817, + "grad_norm": 0.158203125, + "learning_rate": 0.0014278779399163315, + "loss": 0.0957, + "step": 37225 + }, + { + "epoch": 0.32313955608024236, + "grad_norm": 0.30859375, + "learning_rate": 0.001427850222609994, + "loss": 0.1245, + "step": 37226 + }, + { + "epoch": 0.3231482365604465, + "grad_norm": 0.447265625, + "learning_rate": 0.0014278225049451244, + "loss": 0.1069, + "step": 37227 + }, + { + "epoch": 0.3231569170406507, + "grad_norm": 0.306640625, + "learning_rate": 0.0014277947869217528, + "loss": 0.0986, + "step": 37228 + }, + { + "epoch": 0.32316559752085483, + "grad_norm": 0.26953125, + "learning_rate": 0.00142776706853991, + "loss": 0.1377, + "step": 37229 + }, + { + "epoch": 0.323174278001059, + "grad_norm": 0.1513671875, + "learning_rate": 0.001427739349799626, + "loss": 0.125, + "step": 37230 + }, + { + "epoch": 0.32318295848126316, + "grad_norm": 0.10546875, + "learning_rate": 0.0014277116307009315, + "loss": 0.0928, + "step": 37231 + }, + { + "epoch": 0.32319163896146735, + "grad_norm": 0.0947265625, + "learning_rate": 0.0014276839112438563, + "loss": 0.1001, + "step": 37232 + }, + { + "epoch": 0.3232003194416715, + "grad_norm": 0.126953125, + "learning_rate": 0.0014276561914284306, + "loss": 0.1191, + "step": 
37233 + }, + { + "epoch": 0.3232089999218757, + "grad_norm": 0.1494140625, + "learning_rate": 0.0014276284712546855, + "loss": 0.1016, + "step": 37234 + }, + { + "epoch": 0.3232176804020798, + "grad_norm": 0.2392578125, + "learning_rate": 0.0014276007507226503, + "loss": 0.1172, + "step": 37235 + }, + { + "epoch": 0.323226360882284, + "grad_norm": 0.1708984375, + "learning_rate": 0.001427573029832356, + "loss": 0.0693, + "step": 37236 + }, + { + "epoch": 0.32323504136248815, + "grad_norm": 0.2265625, + "learning_rate": 0.0014275453085838329, + "loss": 0.1123, + "step": 37237 + }, + { + "epoch": 0.32324372184269234, + "grad_norm": 0.2080078125, + "learning_rate": 0.0014275175869771111, + "loss": 0.1289, + "step": 37238 + }, + { + "epoch": 0.3232524023228965, + "grad_norm": 0.158203125, + "learning_rate": 0.001427489865012221, + "loss": 0.0967, + "step": 37239 + }, + { + "epoch": 0.3232610828031007, + "grad_norm": 0.2421875, + "learning_rate": 0.001427462142689193, + "loss": 0.1201, + "step": 37240 + }, + { + "epoch": 0.3232697632833048, + "grad_norm": 0.421875, + "learning_rate": 0.0014274344200080572, + "loss": 0.1514, + "step": 37241 + }, + { + "epoch": 0.323278443763509, + "grad_norm": 1.2109375, + "learning_rate": 0.001427406696968844, + "loss": 0.0864, + "step": 37242 + }, + { + "epoch": 0.32328712424371314, + "grad_norm": 0.224609375, + "learning_rate": 0.0014273789735715838, + "loss": 0.1104, + "step": 37243 + }, + { + "epoch": 0.32329580472391733, + "grad_norm": 0.97265625, + "learning_rate": 0.0014273512498163068, + "loss": 0.0962, + "step": 37244 + }, + { + "epoch": 0.3233044852041215, + "grad_norm": 0.427734375, + "learning_rate": 0.0014273235257030437, + "loss": 0.1191, + "step": 37245 + }, + { + "epoch": 0.32331316568432567, + "grad_norm": 0.1357421875, + "learning_rate": 0.0014272958012318242, + "loss": 0.1025, + "step": 37246 + }, + { + "epoch": 0.3233218461645298, + "grad_norm": 0.4375, + "learning_rate": 0.0014272680764026789, + "loss": 0.0967, + "step": 37247 + }, + { + "epoch": 0.323330526644734, + "grad_norm": 0.2490234375, + "learning_rate": 0.0014272403512156381, + "loss": 0.1211, + "step": 37248 + }, + { + "epoch": 0.32333920712493813, + "grad_norm": 0.275390625, + "learning_rate": 0.0014272126256707324, + "loss": 0.1055, + "step": 37249 + }, + { + "epoch": 0.3233478876051423, + "grad_norm": 0.0712890625, + "learning_rate": 0.001427184899767992, + "loss": 0.0801, + "step": 37250 + }, + { + "epoch": 0.32335656808534646, + "grad_norm": 0.2265625, + "learning_rate": 0.001427157173507447, + "loss": 0.1099, + "step": 37251 + }, + { + "epoch": 0.32336524856555066, + "grad_norm": 0.09326171875, + "learning_rate": 0.0014271294468891278, + "loss": 0.0786, + "step": 37252 + }, + { + "epoch": 0.3233739290457548, + "grad_norm": 0.498046875, + "learning_rate": 0.0014271017199130648, + "loss": 0.127, + "step": 37253 + }, + { + "epoch": 0.323382609525959, + "grad_norm": 0.19140625, + "learning_rate": 0.001427073992579288, + "loss": 0.1406, + "step": 37254 + }, + { + "epoch": 0.3233912900061631, + "grad_norm": 0.294921875, + "learning_rate": 0.0014270462648878282, + "loss": 0.0669, + "step": 37255 + }, + { + "epoch": 0.3233999704863673, + "grad_norm": 0.71875, + "learning_rate": 0.0014270185368387154, + "loss": 0.0742, + "step": 37256 + }, + { + "epoch": 0.32340865096657145, + "grad_norm": 0.1513671875, + "learning_rate": 0.00142699080843198, + "loss": 0.0957, + "step": 37257 + }, + { + "epoch": 0.32341733144677565, + "grad_norm": 0.1328125, + "learning_rate": 0.0014269630796676527, 
+ "loss": 0.1318, + "step": 37258 + }, + { + "epoch": 0.3234260119269798, + "grad_norm": 0.1875, + "learning_rate": 0.0014269353505457635, + "loss": 0.0708, + "step": 37259 + }, + { + "epoch": 0.323434692407184, + "grad_norm": 0.2412109375, + "learning_rate": 0.0014269076210663423, + "loss": 0.1201, + "step": 37260 + }, + { + "epoch": 0.3234433728873881, + "grad_norm": 0.287109375, + "learning_rate": 0.0014268798912294202, + "loss": 0.0981, + "step": 37261 + }, + { + "epoch": 0.3234520533675923, + "grad_norm": 0.1435546875, + "learning_rate": 0.0014268521610350268, + "loss": 0.0879, + "step": 37262 + }, + { + "epoch": 0.32346073384779644, + "grad_norm": 0.09375, + "learning_rate": 0.0014268244304831933, + "loss": 0.1113, + "step": 37263 + }, + { + "epoch": 0.32346941432800064, + "grad_norm": 0.29296875, + "learning_rate": 0.001426796699573949, + "loss": 0.0977, + "step": 37264 + }, + { + "epoch": 0.3234780948082048, + "grad_norm": 0.328125, + "learning_rate": 0.0014267689683073249, + "loss": 0.1299, + "step": 37265 + }, + { + "epoch": 0.32348677528840897, + "grad_norm": 0.283203125, + "learning_rate": 0.0014267412366833511, + "loss": 0.0967, + "step": 37266 + }, + { + "epoch": 0.3234954557686131, + "grad_norm": 0.345703125, + "learning_rate": 0.0014267135047020584, + "loss": 0.1074, + "step": 37267 + }, + { + "epoch": 0.3235041362488173, + "grad_norm": 0.27734375, + "learning_rate": 0.0014266857723634764, + "loss": 0.1094, + "step": 37268 + }, + { + "epoch": 0.32351281672902144, + "grad_norm": 0.4296875, + "learning_rate": 0.0014266580396676356, + "loss": 0.1113, + "step": 37269 + }, + { + "epoch": 0.32352149720922563, + "grad_norm": 0.0791015625, + "learning_rate": 0.001426630306614567, + "loss": 0.0928, + "step": 37270 + }, + { + "epoch": 0.32353017768942977, + "grad_norm": 0.1318359375, + "learning_rate": 0.0014266025732042997, + "loss": 0.0933, + "step": 37271 + }, + { + "epoch": 0.32353885816963396, + "grad_norm": 0.287109375, + "learning_rate": 0.0014265748394368654, + "loss": 0.0884, + "step": 37272 + }, + { + "epoch": 0.3235475386498381, + "grad_norm": 0.33203125, + "learning_rate": 0.0014265471053122934, + "loss": 0.1147, + "step": 37273 + }, + { + "epoch": 0.3235562191300423, + "grad_norm": 0.234375, + "learning_rate": 0.0014265193708306143, + "loss": 0.1357, + "step": 37274 + }, + { + "epoch": 0.3235648996102464, + "grad_norm": 0.462890625, + "learning_rate": 0.0014264916359918588, + "loss": 0.0957, + "step": 37275 + }, + { + "epoch": 0.3235735800904506, + "grad_norm": 0.353515625, + "learning_rate": 0.0014264639007960567, + "loss": 0.0874, + "step": 37276 + }, + { + "epoch": 0.32358226057065476, + "grad_norm": 0.326171875, + "learning_rate": 0.0014264361652432385, + "loss": 0.0967, + "step": 37277 + }, + { + "epoch": 0.32359094105085895, + "grad_norm": 0.515625, + "learning_rate": 0.001426408429333435, + "loss": 0.0947, + "step": 37278 + }, + { + "epoch": 0.3235996215310631, + "grad_norm": 0.67578125, + "learning_rate": 0.001426380693066676, + "loss": 0.1484, + "step": 37279 + }, + { + "epoch": 0.3236083020112673, + "grad_norm": 0.11279296875, + "learning_rate": 0.0014263529564429922, + "loss": 0.123, + "step": 37280 + }, + { + "epoch": 0.3236169824914714, + "grad_norm": 0.2177734375, + "learning_rate": 0.0014263252194624133, + "loss": 0.0669, + "step": 37281 + }, + { + "epoch": 0.3236256629716756, + "grad_norm": 0.08349609375, + "learning_rate": 0.0014262974821249701, + "loss": 0.0981, + "step": 37282 + }, + { + "epoch": 0.32363434345187975, + "grad_norm": 0.2353515625, + 
"learning_rate": 0.001426269744430693, + "loss": 0.0986, + "step": 37283 + }, + { + "epoch": 0.32364302393208394, + "grad_norm": 0.53125, + "learning_rate": 0.001426242006379612, + "loss": 0.0898, + "step": 37284 + }, + { + "epoch": 0.3236517044122881, + "grad_norm": 0.2119140625, + "learning_rate": 0.001426214267971758, + "loss": 0.1167, + "step": 37285 + }, + { + "epoch": 0.32366038489249227, + "grad_norm": 0.275390625, + "learning_rate": 0.0014261865292071608, + "loss": 0.1279, + "step": 37286 + }, + { + "epoch": 0.3236690653726964, + "grad_norm": 0.1162109375, + "learning_rate": 0.0014261587900858508, + "loss": 0.0933, + "step": 37287 + }, + { + "epoch": 0.3236777458529006, + "grad_norm": 0.2001953125, + "learning_rate": 0.0014261310506078586, + "loss": 0.1069, + "step": 37288 + }, + { + "epoch": 0.32368642633310474, + "grad_norm": 0.205078125, + "learning_rate": 0.0014261033107732144, + "loss": 0.1069, + "step": 37289 + }, + { + "epoch": 0.32369510681330893, + "grad_norm": 0.2138671875, + "learning_rate": 0.0014260755705819484, + "loss": 0.1465, + "step": 37290 + }, + { + "epoch": 0.32370378729351307, + "grad_norm": 0.0830078125, + "learning_rate": 0.001426047830034091, + "loss": 0.082, + "step": 37291 + }, + { + "epoch": 0.32371246777371726, + "grad_norm": 0.11865234375, + "learning_rate": 0.0014260200891296731, + "loss": 0.1221, + "step": 37292 + }, + { + "epoch": 0.3237211482539214, + "grad_norm": 0.1484375, + "learning_rate": 0.001425992347868724, + "loss": 0.1172, + "step": 37293 + }, + { + "epoch": 0.3237298287341256, + "grad_norm": 0.2890625, + "learning_rate": 0.0014259646062512747, + "loss": 0.1318, + "step": 37294 + }, + { + "epoch": 0.32373850921432973, + "grad_norm": 0.494140625, + "learning_rate": 0.0014259368642773555, + "loss": 0.0913, + "step": 37295 + }, + { + "epoch": 0.3237471896945339, + "grad_norm": 0.123046875, + "learning_rate": 0.0014259091219469965, + "loss": 0.1816, + "step": 37296 + }, + { + "epoch": 0.32375587017473806, + "grad_norm": 0.53125, + "learning_rate": 0.0014258813792602284, + "loss": 0.0908, + "step": 37297 + }, + { + "epoch": 0.32376455065494225, + "grad_norm": 0.466796875, + "learning_rate": 0.001425853636217081, + "loss": 0.0986, + "step": 37298 + }, + { + "epoch": 0.3237732311351464, + "grad_norm": 0.138671875, + "learning_rate": 0.001425825892817585, + "loss": 0.0815, + "step": 37299 + }, + { + "epoch": 0.3237819116153506, + "grad_norm": 0.48046875, + "learning_rate": 0.0014257981490617708, + "loss": 0.1104, + "step": 37300 + }, + { + "epoch": 0.3237905920955547, + "grad_norm": 0.0654296875, + "learning_rate": 0.0014257704049496687, + "loss": 0.0718, + "step": 37301 + }, + { + "epoch": 0.3237992725757589, + "grad_norm": 0.1513671875, + "learning_rate": 0.001425742660481309, + "loss": 0.1182, + "step": 37302 + }, + { + "epoch": 0.32380795305596305, + "grad_norm": 0.267578125, + "learning_rate": 0.001425714915656722, + "loss": 0.1206, + "step": 37303 + }, + { + "epoch": 0.32381663353616724, + "grad_norm": 0.10205078125, + "learning_rate": 0.0014256871704759378, + "loss": 0.1025, + "step": 37304 + }, + { + "epoch": 0.3238253140163714, + "grad_norm": 0.2451171875, + "learning_rate": 0.0014256594249389871, + "loss": 0.0864, + "step": 37305 + }, + { + "epoch": 0.3238339944965756, + "grad_norm": 0.2060546875, + "learning_rate": 0.0014256316790459006, + "loss": 0.0869, + "step": 37306 + }, + { + "epoch": 0.3238426749767797, + "grad_norm": 0.50390625, + "learning_rate": 0.0014256039327967076, + "loss": 0.1367, + "step": 37307 + }, + { + "epoch": 
0.3238513554569839, + "grad_norm": 0.244140625, + "learning_rate": 0.0014255761861914394, + "loss": 0.1016, + "step": 37308 + }, + { + "epoch": 0.32386003593718804, + "grad_norm": 0.1845703125, + "learning_rate": 0.0014255484392301258, + "loss": 0.1201, + "step": 37309 + }, + { + "epoch": 0.32386871641739223, + "grad_norm": 0.11279296875, + "learning_rate": 0.0014255206919127975, + "loss": 0.127, + "step": 37310 + }, + { + "epoch": 0.32387739689759637, + "grad_norm": 0.2431640625, + "learning_rate": 0.0014254929442394843, + "loss": 0.106, + "step": 37311 + }, + { + "epoch": 0.32388607737780056, + "grad_norm": 0.53515625, + "learning_rate": 0.0014254651962102175, + "loss": 0.1084, + "step": 37312 + }, + { + "epoch": 0.3238947578580047, + "grad_norm": 0.232421875, + "learning_rate": 0.0014254374478250263, + "loss": 0.0898, + "step": 37313 + }, + { + "epoch": 0.3239034383382089, + "grad_norm": 1.1171875, + "learning_rate": 0.0014254096990839419, + "loss": 0.1328, + "step": 37314 + }, + { + "epoch": 0.32391211881841303, + "grad_norm": 0.46875, + "learning_rate": 0.0014253819499869943, + "loss": 0.0742, + "step": 37315 + }, + { + "epoch": 0.3239207992986172, + "grad_norm": 0.10009765625, + "learning_rate": 0.0014253542005342138, + "loss": 0.084, + "step": 37316 + }, + { + "epoch": 0.32392947977882136, + "grad_norm": 0.283203125, + "learning_rate": 0.0014253264507256306, + "loss": 0.0737, + "step": 37317 + }, + { + "epoch": 0.32393816025902555, + "grad_norm": 0.212890625, + "learning_rate": 0.0014252987005612758, + "loss": 0.1641, + "step": 37318 + }, + { + "epoch": 0.3239468407392297, + "grad_norm": 0.1416015625, + "learning_rate": 0.001425270950041179, + "loss": 0.0947, + "step": 37319 + }, + { + "epoch": 0.3239555212194339, + "grad_norm": 0.30078125, + "learning_rate": 0.0014252431991653707, + "loss": 0.0908, + "step": 37320 + }, + { + "epoch": 0.323964201699638, + "grad_norm": 0.1318359375, + "learning_rate": 0.0014252154479338813, + "loss": 0.1123, + "step": 37321 + }, + { + "epoch": 0.3239728821798422, + "grad_norm": 0.1923828125, + "learning_rate": 0.0014251876963467412, + "loss": 0.1104, + "step": 37322 + }, + { + "epoch": 0.32398156266004635, + "grad_norm": 0.20703125, + "learning_rate": 0.0014251599444039806, + "loss": 0.0957, + "step": 37323 + }, + { + "epoch": 0.32399024314025054, + "grad_norm": 0.11181640625, + "learning_rate": 0.0014251321921056302, + "loss": 0.1094, + "step": 37324 + }, + { + "epoch": 0.3239989236204547, + "grad_norm": 0.4765625, + "learning_rate": 0.00142510443945172, + "loss": 0.1406, + "step": 37325 + }, + { + "epoch": 0.3240076041006589, + "grad_norm": 0.2890625, + "learning_rate": 0.0014250766864422808, + "loss": 0.1055, + "step": 37326 + }, + { + "epoch": 0.324016284580863, + "grad_norm": 0.220703125, + "learning_rate": 0.0014250489330773421, + "loss": 0.0898, + "step": 37327 + }, + { + "epoch": 0.3240249650610672, + "grad_norm": 0.193359375, + "learning_rate": 0.0014250211793569352, + "loss": 0.0752, + "step": 37328 + }, + { + "epoch": 0.32403364554127134, + "grad_norm": 0.12158203125, + "learning_rate": 0.0014249934252810897, + "loss": 0.0908, + "step": 37329 + }, + { + "epoch": 0.32404232602147554, + "grad_norm": 0.1259765625, + "learning_rate": 0.0014249656708498364, + "loss": 0.0933, + "step": 37330 + }, + { + "epoch": 0.3240510065016797, + "grad_norm": 0.482421875, + "learning_rate": 0.0014249379160632056, + "loss": 0.1113, + "step": 37331 + }, + { + "epoch": 0.32405968698188387, + "grad_norm": 0.11865234375, + "learning_rate": 0.0014249101609212275, + 
"loss": 0.1055, + "step": 37332 + }, + { + "epoch": 0.324068367462088, + "grad_norm": 0.203125, + "learning_rate": 0.0014248824054239324, + "loss": 0.249, + "step": 37333 + }, + { + "epoch": 0.32407704794229214, + "grad_norm": 0.267578125, + "learning_rate": 0.001424854649571351, + "loss": 0.1216, + "step": 37334 + }, + { + "epoch": 0.32408572842249633, + "grad_norm": 0.279296875, + "learning_rate": 0.0014248268933635135, + "loss": 0.1196, + "step": 37335 + }, + { + "epoch": 0.32409440890270047, + "grad_norm": 0.29296875, + "learning_rate": 0.0014247991368004501, + "loss": 0.0898, + "step": 37336 + }, + { + "epoch": 0.32410308938290466, + "grad_norm": 0.1533203125, + "learning_rate": 0.001424771379882191, + "loss": 0.1084, + "step": 37337 + }, + { + "epoch": 0.3241117698631088, + "grad_norm": 0.1279296875, + "learning_rate": 0.001424743622608767, + "loss": 0.1104, + "step": 37338 + }, + { + "epoch": 0.324120450343313, + "grad_norm": 0.0888671875, + "learning_rate": 0.0014247158649802086, + "loss": 0.1138, + "step": 37339 + }, + { + "epoch": 0.32412913082351713, + "grad_norm": 0.201171875, + "learning_rate": 0.0014246881069965452, + "loss": 0.0996, + "step": 37340 + }, + { + "epoch": 0.3241378113037213, + "grad_norm": 0.322265625, + "learning_rate": 0.001424660348657808, + "loss": 0.126, + "step": 37341 + }, + { + "epoch": 0.32414649178392546, + "grad_norm": 0.15625, + "learning_rate": 0.0014246325899640272, + "loss": 0.1055, + "step": 37342 + }, + { + "epoch": 0.32415517226412965, + "grad_norm": 0.5234375, + "learning_rate": 0.0014246048309152328, + "loss": 0.1582, + "step": 37343 + }, + { + "epoch": 0.3241638527443338, + "grad_norm": 0.255859375, + "learning_rate": 0.0014245770715114558, + "loss": 0.1123, + "step": 37344 + }, + { + "epoch": 0.324172533224538, + "grad_norm": 0.271484375, + "learning_rate": 0.0014245493117527263, + "loss": 0.1133, + "step": 37345 + }, + { + "epoch": 0.3241812137047421, + "grad_norm": 0.322265625, + "learning_rate": 0.001424521551639074, + "loss": 0.0952, + "step": 37346 + }, + { + "epoch": 0.3241898941849463, + "grad_norm": 0.244140625, + "learning_rate": 0.0014244937911705303, + "loss": 0.1162, + "step": 37347 + }, + { + "epoch": 0.32419857466515045, + "grad_norm": 0.181640625, + "learning_rate": 0.0014244660303471249, + "loss": 0.0903, + "step": 37348 + }, + { + "epoch": 0.32420725514535464, + "grad_norm": 0.34375, + "learning_rate": 0.0014244382691688884, + "loss": 0.0757, + "step": 37349 + }, + { + "epoch": 0.3242159356255588, + "grad_norm": 0.1025390625, + "learning_rate": 0.0014244105076358508, + "loss": 0.0894, + "step": 37350 + }, + { + "epoch": 0.324224616105763, + "grad_norm": 0.28125, + "learning_rate": 0.001424382745748043, + "loss": 0.0869, + "step": 37351 + }, + { + "epoch": 0.3242332965859671, + "grad_norm": 0.375, + "learning_rate": 0.001424354983505495, + "loss": 0.0684, + "step": 37352 + }, + { + "epoch": 0.3242419770661713, + "grad_norm": 0.3828125, + "learning_rate": 0.0014243272209082372, + "loss": 0.167, + "step": 37353 + }, + { + "epoch": 0.32425065754637544, + "grad_norm": 0.158203125, + "learning_rate": 0.0014242994579563, + "loss": 0.0928, + "step": 37354 + }, + { + "epoch": 0.32425933802657964, + "grad_norm": 0.85546875, + "learning_rate": 0.0014242716946497138, + "loss": 0.1475, + "step": 37355 + }, + { + "epoch": 0.3242680185067838, + "grad_norm": 0.1318359375, + "learning_rate": 0.001424243930988509, + "loss": 0.1104, + "step": 37356 + }, + { + "epoch": 0.32427669898698797, + "grad_norm": 0.1796875, + "learning_rate": 
0.0014242161669727163, + "loss": 0.0879, + "step": 37357 + }, + { + "epoch": 0.3242853794671921, + "grad_norm": 0.1923828125, + "learning_rate": 0.0014241884026023654, + "loss": 0.0713, + "step": 37358 + }, + { + "epoch": 0.3242940599473963, + "grad_norm": 0.09716796875, + "learning_rate": 0.0014241606378774866, + "loss": 0.0957, + "step": 37359 + }, + { + "epoch": 0.32430274042760043, + "grad_norm": 0.44921875, + "learning_rate": 0.001424132872798111, + "loss": 0.0908, + "step": 37360 + }, + { + "epoch": 0.3243114209078046, + "grad_norm": 0.10498046875, + "learning_rate": 0.001424105107364268, + "loss": 0.0771, + "step": 37361 + }, + { + "epoch": 0.32432010138800876, + "grad_norm": 0.12353515625, + "learning_rate": 0.0014240773415759892, + "loss": 0.1348, + "step": 37362 + }, + { + "epoch": 0.32432878186821296, + "grad_norm": 0.14453125, + "learning_rate": 0.0014240495754333038, + "loss": 0.0835, + "step": 37363 + }, + { + "epoch": 0.3243374623484171, + "grad_norm": 0.08935546875, + "learning_rate": 0.0014240218089362432, + "loss": 0.124, + "step": 37364 + }, + { + "epoch": 0.3243461428286213, + "grad_norm": 0.78125, + "learning_rate": 0.0014239940420848367, + "loss": 0.1357, + "step": 37365 + }, + { + "epoch": 0.3243548233088254, + "grad_norm": 0.5546875, + "learning_rate": 0.0014239662748791156, + "loss": 0.168, + "step": 37366 + }, + { + "epoch": 0.3243635037890296, + "grad_norm": 0.2333984375, + "learning_rate": 0.0014239385073191095, + "loss": 0.125, + "step": 37367 + }, + { + "epoch": 0.32437218426923375, + "grad_norm": 0.11669921875, + "learning_rate": 0.0014239107394048492, + "loss": 0.0747, + "step": 37368 + }, + { + "epoch": 0.32438086474943795, + "grad_norm": 0.10498046875, + "learning_rate": 0.001423882971136365, + "loss": 0.0894, + "step": 37369 + }, + { + "epoch": 0.3243895452296421, + "grad_norm": 0.203125, + "learning_rate": 0.0014238552025136874, + "loss": 0.1123, + "step": 37370 + }, + { + "epoch": 0.3243982257098463, + "grad_norm": 0.1611328125, + "learning_rate": 0.0014238274335368463, + "loss": 0.0815, + "step": 37371 + }, + { + "epoch": 0.3244069061900504, + "grad_norm": 0.5859375, + "learning_rate": 0.0014237996642058725, + "loss": 0.082, + "step": 37372 + }, + { + "epoch": 0.3244155866702546, + "grad_norm": 0.20703125, + "learning_rate": 0.0014237718945207965, + "loss": 0.0742, + "step": 37373 + }, + { + "epoch": 0.32442426715045874, + "grad_norm": 0.251953125, + "learning_rate": 0.0014237441244816483, + "loss": 0.1245, + "step": 37374 + }, + { + "epoch": 0.32443294763066294, + "grad_norm": 1.765625, + "learning_rate": 0.0014237163540884584, + "loss": 0.3789, + "step": 37375 + }, + { + "epoch": 0.3244416281108671, + "grad_norm": 0.4453125, + "learning_rate": 0.001423688583341257, + "loss": 0.1348, + "step": 37376 + }, + { + "epoch": 0.32445030859107127, + "grad_norm": 0.283203125, + "learning_rate": 0.0014236608122400747, + "loss": 0.0776, + "step": 37377 + }, + { + "epoch": 0.3244589890712754, + "grad_norm": 0.25390625, + "learning_rate": 0.0014236330407849417, + "loss": 0.1299, + "step": 37378 + }, + { + "epoch": 0.3244676695514796, + "grad_norm": 0.68359375, + "learning_rate": 0.001423605268975889, + "loss": 0.1123, + "step": 37379 + }, + { + "epoch": 0.32447635003168374, + "grad_norm": 0.291015625, + "learning_rate": 0.0014235774968129457, + "loss": 0.085, + "step": 37380 + }, + { + "epoch": 0.32448503051188793, + "grad_norm": 1.0625, + "learning_rate": 0.0014235497242961433, + "loss": 0.0908, + "step": 37381 + }, + { + "epoch": 0.32449371099209207, + 
"grad_norm": 0.248046875, + "learning_rate": 0.0014235219514255116, + "loss": 0.1348, + "step": 37382 + }, + { + "epoch": 0.32450239147229626, + "grad_norm": 0.51953125, + "learning_rate": 0.0014234941782010813, + "loss": 0.0732, + "step": 37383 + }, + { + "epoch": 0.3245110719525004, + "grad_norm": 0.5703125, + "learning_rate": 0.001423466404622883, + "loss": 0.0859, + "step": 37384 + }, + { + "epoch": 0.3245197524327046, + "grad_norm": 0.12255859375, + "learning_rate": 0.0014234386306909458, + "loss": 0.1143, + "step": 37385 + }, + { + "epoch": 0.3245284329129087, + "grad_norm": 1.0546875, + "learning_rate": 0.0014234108564053017, + "loss": 0.1025, + "step": 37386 + }, + { + "epoch": 0.3245371133931129, + "grad_norm": 0.93359375, + "learning_rate": 0.0014233830817659803, + "loss": 0.1162, + "step": 37387 + }, + { + "epoch": 0.32454579387331706, + "grad_norm": 0.4609375, + "learning_rate": 0.0014233553067730118, + "loss": 0.1201, + "step": 37388 + }, + { + "epoch": 0.32455447435352125, + "grad_norm": 0.251953125, + "learning_rate": 0.0014233275314264266, + "loss": 0.125, + "step": 37389 + }, + { + "epoch": 0.3245631548337254, + "grad_norm": 0.220703125, + "learning_rate": 0.0014232997557262555, + "loss": 0.0811, + "step": 37390 + }, + { + "epoch": 0.3245718353139296, + "grad_norm": 0.392578125, + "learning_rate": 0.0014232719796725287, + "loss": 0.0889, + "step": 37391 + }, + { + "epoch": 0.3245805157941337, + "grad_norm": 0.1669921875, + "learning_rate": 0.0014232442032652765, + "loss": 0.0967, + "step": 37392 + }, + { + "epoch": 0.3245891962743379, + "grad_norm": 0.11328125, + "learning_rate": 0.0014232164265045295, + "loss": 0.0889, + "step": 37393 + }, + { + "epoch": 0.32459787675454205, + "grad_norm": 0.103515625, + "learning_rate": 0.0014231886493903173, + "loss": 0.1357, + "step": 37394 + }, + { + "epoch": 0.32460655723474624, + "grad_norm": 0.2216796875, + "learning_rate": 0.0014231608719226711, + "loss": 0.1367, + "step": 37395 + }, + { + "epoch": 0.3246152377149504, + "grad_norm": 0.1396484375, + "learning_rate": 0.0014231330941016217, + "loss": 0.0889, + "step": 37396 + }, + { + "epoch": 0.32462391819515457, + "grad_norm": 0.486328125, + "learning_rate": 0.001423105315927198, + "loss": 0.1514, + "step": 37397 + }, + { + "epoch": 0.3246325986753587, + "grad_norm": 0.42578125, + "learning_rate": 0.0014230775373994312, + "loss": 0.0752, + "step": 37398 + }, + { + "epoch": 0.3246412791555629, + "grad_norm": 0.51171875, + "learning_rate": 0.0014230497585183519, + "loss": 0.1289, + "step": 37399 + }, + { + "epoch": 0.32464995963576704, + "grad_norm": 0.34375, + "learning_rate": 0.0014230219792839903, + "loss": 0.0928, + "step": 37400 + }, + { + "epoch": 0.32465864011597123, + "grad_norm": 0.12109375, + "learning_rate": 0.0014229941996963765, + "loss": 0.1162, + "step": 37401 + }, + { + "epoch": 0.32466732059617537, + "grad_norm": 0.2138671875, + "learning_rate": 0.0014229664197555412, + "loss": 0.0669, + "step": 37402 + }, + { + "epoch": 0.32467600107637956, + "grad_norm": 0.65625, + "learning_rate": 0.0014229386394615147, + "loss": 0.1309, + "step": 37403 + }, + { + "epoch": 0.3246846815565837, + "grad_norm": 0.1865234375, + "learning_rate": 0.0014229108588143273, + "loss": 0.1299, + "step": 37404 + }, + { + "epoch": 0.3246933620367879, + "grad_norm": 0.115234375, + "learning_rate": 0.0014228830778140098, + "loss": 0.082, + "step": 37405 + }, + { + "epoch": 0.32470204251699203, + "grad_norm": 0.5859375, + "learning_rate": 0.0014228552964605917, + "loss": 0.1562, + "step": 37406 + }, 
+ { + "epoch": 0.3247107229971962, + "grad_norm": 0.181640625, + "learning_rate": 0.0014228275147541041, + "loss": 0.0913, + "step": 37407 + }, + { + "epoch": 0.32471940347740036, + "grad_norm": 0.0986328125, + "learning_rate": 0.001422799732694577, + "loss": 0.0713, + "step": 37408 + }, + { + "epoch": 0.32472808395760455, + "grad_norm": 0.63671875, + "learning_rate": 0.0014227719502820414, + "loss": 0.0674, + "step": 37409 + }, + { + "epoch": 0.3247367644378087, + "grad_norm": 0.31640625, + "learning_rate": 0.001422744167516527, + "loss": 0.0718, + "step": 37410 + }, + { + "epoch": 0.3247454449180129, + "grad_norm": 0.33203125, + "learning_rate": 0.0014227163843980642, + "loss": 0.1025, + "step": 37411 + }, + { + "epoch": 0.324754125398217, + "grad_norm": 0.16015625, + "learning_rate": 0.001422688600926684, + "loss": 0.1123, + "step": 37412 + }, + { + "epoch": 0.3247628058784212, + "grad_norm": 0.4453125, + "learning_rate": 0.0014226608171024162, + "loss": 0.1689, + "step": 37413 + }, + { + "epoch": 0.32477148635862535, + "grad_norm": 0.10009765625, + "learning_rate": 0.0014226330329252914, + "loss": 0.0869, + "step": 37414 + }, + { + "epoch": 0.32478016683882954, + "grad_norm": 0.2294921875, + "learning_rate": 0.0014226052483953399, + "loss": 0.0918, + "step": 37415 + }, + { + "epoch": 0.3247888473190337, + "grad_norm": 0.37109375, + "learning_rate": 0.0014225774635125924, + "loss": 0.0884, + "step": 37416 + }, + { + "epoch": 0.3247975277992379, + "grad_norm": 0.296875, + "learning_rate": 0.0014225496782770788, + "loss": 0.0854, + "step": 37417 + }, + { + "epoch": 0.324806208279442, + "grad_norm": 0.6328125, + "learning_rate": 0.0014225218926888297, + "loss": 0.0947, + "step": 37418 + }, + { + "epoch": 0.3248148887596462, + "grad_norm": 0.3671875, + "learning_rate": 0.0014224941067478754, + "loss": 0.0898, + "step": 37419 + }, + { + "epoch": 0.32482356923985034, + "grad_norm": 0.275390625, + "learning_rate": 0.0014224663204542465, + "loss": 0.1328, + "step": 37420 + }, + { + "epoch": 0.32483224972005453, + "grad_norm": 0.10498046875, + "learning_rate": 0.0014224385338079736, + "loss": 0.1118, + "step": 37421 + }, + { + "epoch": 0.32484093020025867, + "grad_norm": 0.1123046875, + "learning_rate": 0.0014224107468090866, + "loss": 0.0928, + "step": 37422 + }, + { + "epoch": 0.32484961068046286, + "grad_norm": 0.466796875, + "learning_rate": 0.0014223829594576158, + "loss": 0.1079, + "step": 37423 + }, + { + "epoch": 0.324858291160667, + "grad_norm": 0.115234375, + "learning_rate": 0.001422355171753592, + "loss": 0.0737, + "step": 37424 + }, + { + "epoch": 0.3248669716408712, + "grad_norm": 0.2470703125, + "learning_rate": 0.0014223273836970454, + "loss": 0.166, + "step": 37425 + }, + { + "epoch": 0.32487565212107533, + "grad_norm": 0.15625, + "learning_rate": 0.0014222995952880065, + "loss": 0.1152, + "step": 37426 + }, + { + "epoch": 0.3248843326012795, + "grad_norm": 0.330078125, + "learning_rate": 0.0014222718065265054, + "loss": 0.127, + "step": 37427 + }, + { + "epoch": 0.32489301308148366, + "grad_norm": 1.078125, + "learning_rate": 0.001422244017412573, + "loss": 0.1426, + "step": 37428 + }, + { + "epoch": 0.32490169356168785, + "grad_norm": 1.0703125, + "learning_rate": 0.001422216227946239, + "loss": 0.1035, + "step": 37429 + }, + { + "epoch": 0.324910374041892, + "grad_norm": 0.46875, + "learning_rate": 0.0014221884381275345, + "loss": 0.1562, + "step": 37430 + }, + { + "epoch": 0.3249190545220962, + "grad_norm": 0.42578125, + "learning_rate": 0.0014221606479564895, + "loss": 
0.0889, + "step": 37431 + }, + { + "epoch": 0.3249277350023003, + "grad_norm": 0.466796875, + "learning_rate": 0.0014221328574331343, + "loss": 0.2793, + "step": 37432 + }, + { + "epoch": 0.3249364154825045, + "grad_norm": 0.2412109375, + "learning_rate": 0.0014221050665574994, + "loss": 0.1191, + "step": 37433 + }, + { + "epoch": 0.32494509596270865, + "grad_norm": 0.15234375, + "learning_rate": 0.0014220772753296156, + "loss": 0.1245, + "step": 37434 + }, + { + "epoch": 0.32495377644291285, + "grad_norm": 0.279296875, + "learning_rate": 0.0014220494837495128, + "loss": 0.1504, + "step": 37435 + }, + { + "epoch": 0.324962456923117, + "grad_norm": 0.478515625, + "learning_rate": 0.0014220216918172214, + "loss": 0.0879, + "step": 37436 + }, + { + "epoch": 0.3249711374033212, + "grad_norm": 0.78515625, + "learning_rate": 0.001421993899532772, + "loss": 0.126, + "step": 37437 + }, + { + "epoch": 0.3249798178835253, + "grad_norm": 0.2890625, + "learning_rate": 0.0014219661068961946, + "loss": 0.0806, + "step": 37438 + }, + { + "epoch": 0.3249884983637295, + "grad_norm": 0.10693359375, + "learning_rate": 0.0014219383139075204, + "loss": 0.1328, + "step": 37439 + }, + { + "epoch": 0.32499717884393364, + "grad_norm": 0.07568359375, + "learning_rate": 0.001421910520566779, + "loss": 0.0952, + "step": 37440 + }, + { + "epoch": 0.32500585932413784, + "grad_norm": 0.3828125, + "learning_rate": 0.001421882726874001, + "loss": 0.126, + "step": 37441 + }, + { + "epoch": 0.325014539804342, + "grad_norm": 0.08349609375, + "learning_rate": 0.0014218549328292175, + "loss": 0.1099, + "step": 37442 + }, + { + "epoch": 0.32502322028454617, + "grad_norm": 0.1767578125, + "learning_rate": 0.0014218271384324574, + "loss": 0.1543, + "step": 37443 + }, + { + "epoch": 0.3250319007647503, + "grad_norm": 0.177734375, + "learning_rate": 0.0014217993436837525, + "loss": 0.0913, + "step": 37444 + }, + { + "epoch": 0.3250405812449545, + "grad_norm": 0.298828125, + "learning_rate": 0.0014217715485831325, + "loss": 0.1123, + "step": 37445 + }, + { + "epoch": 0.32504926172515863, + "grad_norm": 0.70703125, + "learning_rate": 0.001421743753130628, + "loss": 0.1104, + "step": 37446 + }, + { + "epoch": 0.3250579422053628, + "grad_norm": 0.1142578125, + "learning_rate": 0.0014217159573262697, + "loss": 0.0781, + "step": 37447 + }, + { + "epoch": 0.32506662268556696, + "grad_norm": 0.12451171875, + "learning_rate": 0.0014216881611700872, + "loss": 0.0713, + "step": 37448 + }, + { + "epoch": 0.32507530316577116, + "grad_norm": 0.322265625, + "learning_rate": 0.0014216603646621116, + "loss": 0.0742, + "step": 37449 + }, + { + "epoch": 0.3250839836459753, + "grad_norm": 0.15625, + "learning_rate": 0.001421632567802373, + "loss": 0.0801, + "step": 37450 + }, + { + "epoch": 0.3250926641261795, + "grad_norm": 0.35546875, + "learning_rate": 0.0014216047705909018, + "loss": 0.0957, + "step": 37451 + }, + { + "epoch": 0.3251013446063836, + "grad_norm": 0.1123046875, + "learning_rate": 0.0014215769730277286, + "loss": 0.124, + "step": 37452 + }, + { + "epoch": 0.3251100250865878, + "grad_norm": 0.6875, + "learning_rate": 0.0014215491751128836, + "loss": 0.1157, + "step": 37453 + }, + { + "epoch": 0.32511870556679195, + "grad_norm": 0.25, + "learning_rate": 0.001421521376846397, + "loss": 0.1021, + "step": 37454 + }, + { + "epoch": 0.32512738604699615, + "grad_norm": 0.416015625, + "learning_rate": 0.0014214935782283, + "loss": 0.0991, + "step": 37455 + }, + { + "epoch": 0.3251360665272003, + "grad_norm": 0.2490234375, + "learning_rate": 
0.001421465779258622, + "loss": 0.0898, + "step": 37456 + }, + { + "epoch": 0.3251447470074044, + "grad_norm": 0.1435546875, + "learning_rate": 0.001421437979937394, + "loss": 0.1021, + "step": 37457 + }, + { + "epoch": 0.3251534274876086, + "grad_norm": 0.1796875, + "learning_rate": 0.0014214101802646462, + "loss": 0.0767, + "step": 37458 + }, + { + "epoch": 0.32516210796781275, + "grad_norm": 0.140625, + "learning_rate": 0.0014213823802404092, + "loss": 0.084, + "step": 37459 + }, + { + "epoch": 0.32517078844801695, + "grad_norm": 0.1708984375, + "learning_rate": 0.001421354579864713, + "loss": 0.0728, + "step": 37460 + }, + { + "epoch": 0.3251794689282211, + "grad_norm": 0.09326171875, + "learning_rate": 0.0014213267791375883, + "loss": 0.0889, + "step": 37461 + }, + { + "epoch": 0.3251881494084253, + "grad_norm": 0.1953125, + "learning_rate": 0.001421298978059066, + "loss": 0.1289, + "step": 37462 + }, + { + "epoch": 0.3251968298886294, + "grad_norm": 0.6171875, + "learning_rate": 0.0014212711766291753, + "loss": 0.0889, + "step": 37463 + }, + { + "epoch": 0.3252055103688336, + "grad_norm": 0.28125, + "learning_rate": 0.0014212433748479474, + "loss": 0.0811, + "step": 37464 + }, + { + "epoch": 0.32521419084903774, + "grad_norm": 0.734375, + "learning_rate": 0.0014212155727154128, + "loss": 0.0967, + "step": 37465 + }, + { + "epoch": 0.32522287132924194, + "grad_norm": 0.09912109375, + "learning_rate": 0.0014211877702316016, + "loss": 0.0859, + "step": 37466 + }, + { + "epoch": 0.3252315518094461, + "grad_norm": 0.283203125, + "learning_rate": 0.001421159967396544, + "loss": 0.0845, + "step": 37467 + }, + { + "epoch": 0.32524023228965027, + "grad_norm": 0.146484375, + "learning_rate": 0.001421132164210271, + "loss": 0.0947, + "step": 37468 + }, + { + "epoch": 0.3252489127698544, + "grad_norm": 0.125, + "learning_rate": 0.0014211043606728124, + "loss": 0.1221, + "step": 37469 + }, + { + "epoch": 0.3252575932500586, + "grad_norm": 0.53125, + "learning_rate": 0.0014210765567841993, + "loss": 0.1074, + "step": 37470 + }, + { + "epoch": 0.32526627373026273, + "grad_norm": 0.11767578125, + "learning_rate": 0.0014210487525444615, + "loss": 0.0986, + "step": 37471 + }, + { + "epoch": 0.3252749542104669, + "grad_norm": 0.10205078125, + "learning_rate": 0.0014210209479536295, + "loss": 0.1084, + "step": 37472 + }, + { + "epoch": 0.32528363469067106, + "grad_norm": 0.4296875, + "learning_rate": 0.001420993143011734, + "loss": 0.123, + "step": 37473 + }, + { + "epoch": 0.32529231517087526, + "grad_norm": 0.255859375, + "learning_rate": 0.0014209653377188053, + "loss": 0.0859, + "step": 37474 + }, + { + "epoch": 0.3253009956510794, + "grad_norm": 0.287109375, + "learning_rate": 0.0014209375320748735, + "loss": 0.0864, + "step": 37475 + }, + { + "epoch": 0.3253096761312836, + "grad_norm": 0.244140625, + "learning_rate": 0.0014209097260799695, + "loss": 0.0986, + "step": 37476 + }, + { + "epoch": 0.3253183566114877, + "grad_norm": 0.12158203125, + "learning_rate": 0.001420881919734123, + "loss": 0.1167, + "step": 37477 + }, + { + "epoch": 0.3253270370916919, + "grad_norm": 0.1044921875, + "learning_rate": 0.001420854113037365, + "loss": 0.0752, + "step": 37478 + }, + { + "epoch": 0.32533571757189605, + "grad_norm": 0.6328125, + "learning_rate": 0.001420826305989726, + "loss": 0.1426, + "step": 37479 + }, + { + "epoch": 0.32534439805210025, + "grad_norm": 0.33203125, + "learning_rate": 0.001420798498591236, + "loss": 0.082, + "step": 37480 + }, + { + "epoch": 0.3253530785323044, + "grad_norm": 
0.265625, + "learning_rate": 0.0014207706908419256, + "loss": 0.0894, + "step": 37481 + }, + { + "epoch": 0.3253617590125086, + "grad_norm": 0.515625, + "learning_rate": 0.0014207428827418254, + "loss": 0.1143, + "step": 37482 + }, + { + "epoch": 0.3253704394927127, + "grad_norm": 0.07275390625, + "learning_rate": 0.0014207150742909654, + "loss": 0.0703, + "step": 37483 + }, + { + "epoch": 0.3253791199729169, + "grad_norm": 0.53515625, + "learning_rate": 0.0014206872654893763, + "loss": 0.0918, + "step": 37484 + }, + { + "epoch": 0.32538780045312105, + "grad_norm": 0.09619140625, + "learning_rate": 0.0014206594563370882, + "loss": 0.082, + "step": 37485 + }, + { + "epoch": 0.32539648093332524, + "grad_norm": 0.4609375, + "learning_rate": 0.0014206316468341317, + "loss": 0.0928, + "step": 37486 + }, + { + "epoch": 0.3254051614135294, + "grad_norm": 0.205078125, + "learning_rate": 0.0014206038369805377, + "loss": 0.1167, + "step": 37487 + }, + { + "epoch": 0.32541384189373357, + "grad_norm": 0.4375, + "learning_rate": 0.001420576026776336, + "loss": 0.1436, + "step": 37488 + }, + { + "epoch": 0.3254225223739377, + "grad_norm": 0.2041015625, + "learning_rate": 0.001420548216221557, + "loss": 0.0771, + "step": 37489 + }, + { + "epoch": 0.3254312028541419, + "grad_norm": 0.1708984375, + "learning_rate": 0.0014205204053162313, + "loss": 0.0835, + "step": 37490 + }, + { + "epoch": 0.32543988333434604, + "grad_norm": 0.42578125, + "learning_rate": 0.0014204925940603894, + "loss": 0.1279, + "step": 37491 + }, + { + "epoch": 0.32544856381455023, + "grad_norm": 0.59765625, + "learning_rate": 0.0014204647824540618, + "loss": 0.1152, + "step": 37492 + }, + { + "epoch": 0.32545724429475437, + "grad_norm": 0.1494140625, + "learning_rate": 0.0014204369704972784, + "loss": 0.1621, + "step": 37493 + }, + { + "epoch": 0.32546592477495856, + "grad_norm": 0.1552734375, + "learning_rate": 0.00142040915819007, + "loss": 0.0869, + "step": 37494 + }, + { + "epoch": 0.3254746052551627, + "grad_norm": 0.28125, + "learning_rate": 0.0014203813455324669, + "loss": 0.0967, + "step": 37495 + }, + { + "epoch": 0.3254832857353669, + "grad_norm": 0.09228515625, + "learning_rate": 0.0014203535325244996, + "loss": 0.0894, + "step": 37496 + }, + { + "epoch": 0.325491966215571, + "grad_norm": 0.30859375, + "learning_rate": 0.0014203257191661987, + "loss": 0.0684, + "step": 37497 + }, + { + "epoch": 0.3255006466957752, + "grad_norm": 0.2041015625, + "learning_rate": 0.0014202979054575942, + "loss": 0.0952, + "step": 37498 + }, + { + "epoch": 0.32550932717597936, + "grad_norm": 0.142578125, + "learning_rate": 0.0014202700913987167, + "loss": 0.063, + "step": 37499 + }, + { + "epoch": 0.32551800765618355, + "grad_norm": 0.431640625, + "learning_rate": 0.0014202422769895968, + "loss": 0.0791, + "step": 37500 + }, + { + "epoch": 0.3255266881363877, + "grad_norm": 0.263671875, + "learning_rate": 0.0014202144622302649, + "loss": 0.1025, + "step": 37501 + }, + { + "epoch": 0.3255353686165919, + "grad_norm": 0.109375, + "learning_rate": 0.0014201866471207512, + "loss": 0.1011, + "step": 37502 + }, + { + "epoch": 0.325544049096796, + "grad_norm": 0.314453125, + "learning_rate": 0.0014201588316610858, + "loss": 0.0894, + "step": 37503 + }, + { + "epoch": 0.3255527295770002, + "grad_norm": 0.130859375, + "learning_rate": 0.0014201310158513001, + "loss": 0.1172, + "step": 37504 + }, + { + "epoch": 0.32556141005720435, + "grad_norm": 0.2451171875, + "learning_rate": 0.0014201031996914234, + "loss": 0.0771, + "step": 37505 + }, + { + "epoch": 
0.32557009053740854, + "grad_norm": 0.3984375, + "learning_rate": 0.0014200753831814868, + "loss": 0.1045, + "step": 37506 + }, + { + "epoch": 0.3255787710176127, + "grad_norm": 0.07958984375, + "learning_rate": 0.0014200475663215207, + "loss": 0.0962, + "step": 37507 + }, + { + "epoch": 0.32558745149781687, + "grad_norm": 0.3359375, + "learning_rate": 0.0014200197491115554, + "loss": 0.0879, + "step": 37508 + }, + { + "epoch": 0.325596131978021, + "grad_norm": 0.78515625, + "learning_rate": 0.0014199919315516212, + "loss": 0.1621, + "step": 37509 + }, + { + "epoch": 0.3256048124582252, + "grad_norm": 0.4375, + "learning_rate": 0.0014199641136417487, + "loss": 0.1523, + "step": 37510 + }, + { + "epoch": 0.32561349293842934, + "grad_norm": 0.189453125, + "learning_rate": 0.0014199362953819681, + "loss": 0.0698, + "step": 37511 + }, + { + "epoch": 0.32562217341863353, + "grad_norm": 0.171875, + "learning_rate": 0.0014199084767723103, + "loss": 0.0947, + "step": 37512 + }, + { + "epoch": 0.32563085389883767, + "grad_norm": 0.1376953125, + "learning_rate": 0.0014198806578128052, + "loss": 0.1416, + "step": 37513 + }, + { + "epoch": 0.32563953437904186, + "grad_norm": 0.255859375, + "learning_rate": 0.0014198528385034835, + "loss": 0.1953, + "step": 37514 + }, + { + "epoch": 0.325648214859246, + "grad_norm": 0.404296875, + "learning_rate": 0.001419825018844375, + "loss": 0.1279, + "step": 37515 + }, + { + "epoch": 0.3256568953394502, + "grad_norm": 0.099609375, + "learning_rate": 0.0014197971988355112, + "loss": 0.0957, + "step": 37516 + }, + { + "epoch": 0.32566557581965433, + "grad_norm": 0.12060546875, + "learning_rate": 0.0014197693784769219, + "loss": 0.1465, + "step": 37517 + }, + { + "epoch": 0.3256742562998585, + "grad_norm": 0.15234375, + "learning_rate": 0.0014197415577686379, + "loss": 0.1016, + "step": 37518 + }, + { + "epoch": 0.32568293678006266, + "grad_norm": 0.25390625, + "learning_rate": 0.001419713736710689, + "loss": 0.083, + "step": 37519 + }, + { + "epoch": 0.32569161726026685, + "grad_norm": 0.111328125, + "learning_rate": 0.0014196859153031058, + "loss": 0.1104, + "step": 37520 + }, + { + "epoch": 0.325700297740471, + "grad_norm": 0.66796875, + "learning_rate": 0.0014196580935459193, + "loss": 0.1299, + "step": 37521 + }, + { + "epoch": 0.3257089782206752, + "grad_norm": 0.10791015625, + "learning_rate": 0.0014196302714391594, + "loss": 0.0874, + "step": 37522 + }, + { + "epoch": 0.3257176587008793, + "grad_norm": 0.1396484375, + "learning_rate": 0.0014196024489828564, + "loss": 0.0654, + "step": 37523 + }, + { + "epoch": 0.3257263391810835, + "grad_norm": 0.61328125, + "learning_rate": 0.001419574626177041, + "loss": 0.2617, + "step": 37524 + }, + { + "epoch": 0.32573501966128765, + "grad_norm": 0.53125, + "learning_rate": 0.0014195468030217438, + "loss": 0.0977, + "step": 37525 + }, + { + "epoch": 0.32574370014149184, + "grad_norm": 0.341796875, + "learning_rate": 0.001419518979516995, + "loss": 0.0938, + "step": 37526 + }, + { + "epoch": 0.325752380621696, + "grad_norm": 0.2255859375, + "learning_rate": 0.0014194911556628248, + "loss": 0.1143, + "step": 37527 + }, + { + "epoch": 0.3257610611019002, + "grad_norm": 0.2119140625, + "learning_rate": 0.0014194633314592643, + "loss": 0.0752, + "step": 37528 + }, + { + "epoch": 0.3257697415821043, + "grad_norm": 0.333984375, + "learning_rate": 0.0014194355069063432, + "loss": 0.0723, + "step": 37529 + }, + { + "epoch": 0.3257784220623085, + "grad_norm": 0.078125, + "learning_rate": 0.0014194076820040922, + "loss": 0.0938, + 
"step": 37530 + }, + { + "epoch": 0.32578710254251264, + "grad_norm": 0.1943359375, + "learning_rate": 0.001419379856752542, + "loss": 0.085, + "step": 37531 + }, + { + "epoch": 0.32579578302271683, + "grad_norm": 0.17578125, + "learning_rate": 0.0014193520311517225, + "loss": 0.0996, + "step": 37532 + }, + { + "epoch": 0.32580446350292097, + "grad_norm": 0.498046875, + "learning_rate": 0.0014193242052016647, + "loss": 0.1172, + "step": 37533 + }, + { + "epoch": 0.32581314398312516, + "grad_norm": 0.0849609375, + "learning_rate": 0.0014192963789023982, + "loss": 0.1064, + "step": 37534 + }, + { + "epoch": 0.3258218244633293, + "grad_norm": 0.396484375, + "learning_rate": 0.0014192685522539544, + "loss": 0.1113, + "step": 37535 + }, + { + "epoch": 0.3258305049435335, + "grad_norm": 0.1435546875, + "learning_rate": 0.0014192407252563632, + "loss": 0.103, + "step": 37536 + }, + { + "epoch": 0.32583918542373763, + "grad_norm": 0.1298828125, + "learning_rate": 0.0014192128979096553, + "loss": 0.0776, + "step": 37537 + }, + { + "epoch": 0.3258478659039418, + "grad_norm": 0.13671875, + "learning_rate": 0.001419185070213861, + "loss": 0.0757, + "step": 37538 + }, + { + "epoch": 0.32585654638414596, + "grad_norm": 0.09716796875, + "learning_rate": 0.0014191572421690106, + "loss": 0.1094, + "step": 37539 + }, + { + "epoch": 0.32586522686435015, + "grad_norm": 0.16015625, + "learning_rate": 0.0014191294137751345, + "loss": 0.1406, + "step": 37540 + }, + { + "epoch": 0.3258739073445543, + "grad_norm": 0.1240234375, + "learning_rate": 0.0014191015850322636, + "loss": 0.1182, + "step": 37541 + }, + { + "epoch": 0.3258825878247585, + "grad_norm": 0.1318359375, + "learning_rate": 0.0014190737559404277, + "loss": 0.084, + "step": 37542 + }, + { + "epoch": 0.3258912683049626, + "grad_norm": 0.12060546875, + "learning_rate": 0.0014190459264996574, + "loss": 0.1084, + "step": 37543 + }, + { + "epoch": 0.3258999487851668, + "grad_norm": 0.10888671875, + "learning_rate": 0.0014190180967099837, + "loss": 0.0728, + "step": 37544 + }, + { + "epoch": 0.32590862926537095, + "grad_norm": 0.37890625, + "learning_rate": 0.0014189902665714363, + "loss": 0.0952, + "step": 37545 + }, + { + "epoch": 0.32591730974557515, + "grad_norm": 0.390625, + "learning_rate": 0.0014189624360840458, + "loss": 0.0869, + "step": 37546 + }, + { + "epoch": 0.3259259902257793, + "grad_norm": 0.17578125, + "learning_rate": 0.0014189346052478433, + "loss": 0.0908, + "step": 37547 + }, + { + "epoch": 0.3259346707059835, + "grad_norm": 0.84375, + "learning_rate": 0.0014189067740628582, + "loss": 0.1709, + "step": 37548 + }, + { + "epoch": 0.3259433511861876, + "grad_norm": 0.1533203125, + "learning_rate": 0.0014188789425291217, + "loss": 0.1104, + "step": 37549 + }, + { + "epoch": 0.3259520316663918, + "grad_norm": 0.33984375, + "learning_rate": 0.001418851110646664, + "loss": 0.1099, + "step": 37550 + }, + { + "epoch": 0.32596071214659594, + "grad_norm": 0.515625, + "learning_rate": 0.0014188232784155155, + "loss": 0.1113, + "step": 37551 + }, + { + "epoch": 0.32596939262680014, + "grad_norm": 0.1875, + "learning_rate": 0.0014187954458357067, + "loss": 0.0947, + "step": 37552 + }, + { + "epoch": 0.3259780731070043, + "grad_norm": 0.189453125, + "learning_rate": 0.0014187676129072677, + "loss": 0.1162, + "step": 37553 + }, + { + "epoch": 0.32598675358720847, + "grad_norm": 0.09228515625, + "learning_rate": 0.0014187397796302295, + "loss": 0.1016, + "step": 37554 + }, + { + "epoch": 0.3259954340674126, + "grad_norm": 0.25, + "learning_rate": 
0.001418711946004622, + "loss": 0.083, + "step": 37555 + }, + { + "epoch": 0.3260041145476168, + "grad_norm": 0.408203125, + "learning_rate": 0.0014186841120304764, + "loss": 0.1143, + "step": 37556 + }, + { + "epoch": 0.32601279502782093, + "grad_norm": 0.10400390625, + "learning_rate": 0.0014186562777078221, + "loss": 0.0908, + "step": 37557 + }, + { + "epoch": 0.3260214755080251, + "grad_norm": 0.322265625, + "learning_rate": 0.0014186284430366906, + "loss": 0.105, + "step": 37558 + }, + { + "epoch": 0.32603015598822926, + "grad_norm": 0.1533203125, + "learning_rate": 0.0014186006080171114, + "loss": 0.1006, + "step": 37559 + }, + { + "epoch": 0.32603883646843346, + "grad_norm": 0.5546875, + "learning_rate": 0.0014185727726491154, + "loss": 0.126, + "step": 37560 + }, + { + "epoch": 0.3260475169486376, + "grad_norm": 0.220703125, + "learning_rate": 0.0014185449369327333, + "loss": 0.062, + "step": 37561 + }, + { + "epoch": 0.3260561974288418, + "grad_norm": 0.4765625, + "learning_rate": 0.001418517100867995, + "loss": 0.085, + "step": 37562 + }, + { + "epoch": 0.3260648779090459, + "grad_norm": 0.23046875, + "learning_rate": 0.001418489264454931, + "loss": 0.1328, + "step": 37563 + }, + { + "epoch": 0.3260735583892501, + "grad_norm": 0.267578125, + "learning_rate": 0.0014184614276935722, + "loss": 0.1279, + "step": 37564 + }, + { + "epoch": 0.32608223886945426, + "grad_norm": 0.1728515625, + "learning_rate": 0.0014184335905839487, + "loss": 0.0933, + "step": 37565 + }, + { + "epoch": 0.32609091934965845, + "grad_norm": 0.2412109375, + "learning_rate": 0.001418405753126091, + "loss": 0.0791, + "step": 37566 + }, + { + "epoch": 0.3260995998298626, + "grad_norm": 0.57421875, + "learning_rate": 0.0014183779153200296, + "loss": 0.1621, + "step": 37567 + }, + { + "epoch": 0.3261082803100668, + "grad_norm": 0.177734375, + "learning_rate": 0.001418350077165795, + "loss": 0.0918, + "step": 37568 + }, + { + "epoch": 0.3261169607902709, + "grad_norm": 0.177734375, + "learning_rate": 0.0014183222386634174, + "loss": 0.1016, + "step": 37569 + }, + { + "epoch": 0.3261256412704751, + "grad_norm": 0.1787109375, + "learning_rate": 0.0014182943998129274, + "loss": 0.1133, + "step": 37570 + }, + { + "epoch": 0.32613432175067925, + "grad_norm": 0.14453125, + "learning_rate": 0.0014182665606143553, + "loss": 0.1089, + "step": 37571 + }, + { + "epoch": 0.32614300223088344, + "grad_norm": 0.220703125, + "learning_rate": 0.0014182387210677316, + "loss": 0.1133, + "step": 37572 + }, + { + "epoch": 0.3261516827110876, + "grad_norm": 0.73828125, + "learning_rate": 0.0014182108811730872, + "loss": 0.1211, + "step": 37573 + }, + { + "epoch": 0.32616036319129177, + "grad_norm": 0.10400390625, + "learning_rate": 0.001418183040930452, + "loss": 0.082, + "step": 37574 + }, + { + "epoch": 0.3261690436714959, + "grad_norm": 0.291015625, + "learning_rate": 0.0014181552003398566, + "loss": 0.1318, + "step": 37575 + }, + { + "epoch": 0.3261777241517001, + "grad_norm": 0.9296875, + "learning_rate": 0.0014181273594013315, + "loss": 0.1055, + "step": 37576 + }, + { + "epoch": 0.32618640463190424, + "grad_norm": 0.1806640625, + "learning_rate": 0.001418099518114907, + "loss": 0.0728, + "step": 37577 + }, + { + "epoch": 0.32619508511210843, + "grad_norm": 0.2177734375, + "learning_rate": 0.0014180716764806138, + "loss": 0.1279, + "step": 37578 + }, + { + "epoch": 0.32620376559231257, + "grad_norm": 0.57421875, + "learning_rate": 0.0014180438344984822, + "loss": 0.1621, + "step": 37579 + }, + { + "epoch": 0.3262124460725167, + 
"grad_norm": 0.345703125, + "learning_rate": 0.0014180159921685425, + "loss": 0.0889, + "step": 37580 + }, + { + "epoch": 0.3262211265527209, + "grad_norm": 0.62109375, + "learning_rate": 0.0014179881494908251, + "loss": 0.1328, + "step": 37581 + }, + { + "epoch": 0.32622980703292503, + "grad_norm": 0.37109375, + "learning_rate": 0.0014179603064653607, + "loss": 0.0825, + "step": 37582 + }, + { + "epoch": 0.3262384875131292, + "grad_norm": 0.095703125, + "learning_rate": 0.0014179324630921798, + "loss": 0.0786, + "step": 37583 + }, + { + "epoch": 0.32624716799333336, + "grad_norm": 0.1103515625, + "learning_rate": 0.0014179046193713127, + "loss": 0.0947, + "step": 37584 + }, + { + "epoch": 0.32625584847353756, + "grad_norm": 0.173828125, + "learning_rate": 0.00141787677530279, + "loss": 0.0801, + "step": 37585 + }, + { + "epoch": 0.3262645289537417, + "grad_norm": 0.251953125, + "learning_rate": 0.0014178489308866423, + "loss": 0.1064, + "step": 37586 + }, + { + "epoch": 0.3262732094339459, + "grad_norm": 0.2294921875, + "learning_rate": 0.0014178210861228995, + "loss": 0.0894, + "step": 37587 + }, + { + "epoch": 0.32628188991415, + "grad_norm": 0.1845703125, + "learning_rate": 0.001417793241011592, + "loss": 0.0972, + "step": 37588 + }, + { + "epoch": 0.3262905703943542, + "grad_norm": 0.158203125, + "learning_rate": 0.001417765395552751, + "loss": 0.1133, + "step": 37589 + }, + { + "epoch": 0.32629925087455836, + "grad_norm": 0.341796875, + "learning_rate": 0.0014177375497464065, + "loss": 0.0654, + "step": 37590 + }, + { + "epoch": 0.32630793135476255, + "grad_norm": 0.69140625, + "learning_rate": 0.0014177097035925888, + "loss": 0.1338, + "step": 37591 + }, + { + "epoch": 0.3263166118349667, + "grad_norm": 0.12451171875, + "learning_rate": 0.0014176818570913285, + "loss": 0.1045, + "step": 37592 + }, + { + "epoch": 0.3263252923151709, + "grad_norm": 0.3125, + "learning_rate": 0.0014176540102426565, + "loss": 0.1196, + "step": 37593 + }, + { + "epoch": 0.326333972795375, + "grad_norm": 0.1630859375, + "learning_rate": 0.0014176261630466024, + "loss": 0.0947, + "step": 37594 + }, + { + "epoch": 0.3263426532755792, + "grad_norm": 0.42578125, + "learning_rate": 0.0014175983155031975, + "loss": 0.0947, + "step": 37595 + }, + { + "epoch": 0.32635133375578335, + "grad_norm": 0.48046875, + "learning_rate": 0.0014175704676124717, + "loss": 0.1367, + "step": 37596 + }, + { + "epoch": 0.32636001423598754, + "grad_norm": 0.1396484375, + "learning_rate": 0.0014175426193744553, + "loss": 0.0825, + "step": 37597 + }, + { + "epoch": 0.3263686947161917, + "grad_norm": 0.0859375, + "learning_rate": 0.0014175147707891796, + "loss": 0.1191, + "step": 37598 + }, + { + "epoch": 0.32637737519639587, + "grad_norm": 0.318359375, + "learning_rate": 0.0014174869218566742, + "loss": 0.1089, + "step": 37599 + }, + { + "epoch": 0.3263860556766, + "grad_norm": 0.185546875, + "learning_rate": 0.0014174590725769699, + "loss": 0.126, + "step": 37600 + }, + { + "epoch": 0.3263947361568042, + "grad_norm": 0.3046875, + "learning_rate": 0.0014174312229500972, + "loss": 0.127, + "step": 37601 + }, + { + "epoch": 0.32640341663700834, + "grad_norm": 0.59375, + "learning_rate": 0.0014174033729760862, + "loss": 0.1562, + "step": 37602 + }, + { + "epoch": 0.32641209711721253, + "grad_norm": 0.1669921875, + "learning_rate": 0.0014173755226549681, + "loss": 0.1157, + "step": 37603 + }, + { + "epoch": 0.32642077759741667, + "grad_norm": 0.75, + "learning_rate": 0.0014173476719867727, + "loss": 0.0952, + "step": 37604 + }, + { + 
"epoch": 0.32642945807762086, + "grad_norm": 0.078125, + "learning_rate": 0.0014173198209715306, + "loss": 0.0791, + "step": 37605 + }, + { + "epoch": 0.326438138557825, + "grad_norm": 0.2080078125, + "learning_rate": 0.001417291969609272, + "loss": 0.1162, + "step": 37606 + }, + { + "epoch": 0.3264468190380292, + "grad_norm": 0.10009765625, + "learning_rate": 0.0014172641179000284, + "loss": 0.0962, + "step": 37607 + }, + { + "epoch": 0.3264554995182333, + "grad_norm": 0.33203125, + "learning_rate": 0.0014172362658438288, + "loss": 0.1182, + "step": 37608 + }, + { + "epoch": 0.3264641799984375, + "grad_norm": 0.16796875, + "learning_rate": 0.0014172084134407052, + "loss": 0.1104, + "step": 37609 + }, + { + "epoch": 0.32647286047864166, + "grad_norm": 0.146484375, + "learning_rate": 0.0014171805606906865, + "loss": 0.1299, + "step": 37610 + }, + { + "epoch": 0.32648154095884585, + "grad_norm": 0.169921875, + "learning_rate": 0.001417152707593804, + "loss": 0.1035, + "step": 37611 + }, + { + "epoch": 0.32649022143905, + "grad_norm": 0.357421875, + "learning_rate": 0.0014171248541500882, + "loss": 0.1289, + "step": 37612 + }, + { + "epoch": 0.3264989019192542, + "grad_norm": 0.322265625, + "learning_rate": 0.0014170970003595696, + "loss": 0.1069, + "step": 37613 + }, + { + "epoch": 0.3265075823994583, + "grad_norm": 1.53125, + "learning_rate": 0.0014170691462222782, + "loss": 0.1211, + "step": 37614 + }, + { + "epoch": 0.3265162628796625, + "grad_norm": 0.296875, + "learning_rate": 0.001417041291738245, + "loss": 0.0884, + "step": 37615 + }, + { + "epoch": 0.32652494335986665, + "grad_norm": 0.458984375, + "learning_rate": 0.0014170134369075001, + "loss": 0.1621, + "step": 37616 + }, + { + "epoch": 0.32653362384007084, + "grad_norm": 0.1767578125, + "learning_rate": 0.001416985581730074, + "loss": 0.168, + "step": 37617 + }, + { + "epoch": 0.326542304320275, + "grad_norm": 0.08984375, + "learning_rate": 0.0014169577262059972, + "loss": 0.1235, + "step": 37618 + }, + { + "epoch": 0.32655098480047917, + "grad_norm": 0.2001953125, + "learning_rate": 0.0014169298703353005, + "loss": 0.1089, + "step": 37619 + }, + { + "epoch": 0.3265596652806833, + "grad_norm": 0.2314453125, + "learning_rate": 0.0014169020141180139, + "loss": 0.0796, + "step": 37620 + }, + { + "epoch": 0.3265683457608875, + "grad_norm": 0.4140625, + "learning_rate": 0.0014168741575541677, + "loss": 0.1016, + "step": 37621 + }, + { + "epoch": 0.32657702624109164, + "grad_norm": 0.1904296875, + "learning_rate": 0.001416846300643793, + "loss": 0.1064, + "step": 37622 + }, + { + "epoch": 0.32658570672129583, + "grad_norm": 0.5, + "learning_rate": 0.0014168184433869199, + "loss": 0.0869, + "step": 37623 + }, + { + "epoch": 0.32659438720149997, + "grad_norm": 0.11865234375, + "learning_rate": 0.0014167905857835788, + "loss": 0.167, + "step": 37624 + }, + { + "epoch": 0.32660306768170416, + "grad_norm": 0.1572265625, + "learning_rate": 0.0014167627278338004, + "loss": 0.1108, + "step": 37625 + }, + { + "epoch": 0.3266117481619083, + "grad_norm": 0.146484375, + "learning_rate": 0.0014167348695376153, + "loss": 0.0879, + "step": 37626 + }, + { + "epoch": 0.3266204286421125, + "grad_norm": 0.1865234375, + "learning_rate": 0.001416707010895053, + "loss": 0.1211, + "step": 37627 + }, + { + "epoch": 0.32662910912231663, + "grad_norm": 0.10693359375, + "learning_rate": 0.0014166791519061453, + "loss": 0.1104, + "step": 37628 + }, + { + "epoch": 0.3266377896025208, + "grad_norm": 0.431640625, + "learning_rate": 0.0014166512925709216, + "loss": 
0.1201, + "step": 37629 + }, + { + "epoch": 0.32664647008272496, + "grad_norm": 0.2177734375, + "learning_rate": 0.001416623432889413, + "loss": 0.1084, + "step": 37630 + }, + { + "epoch": 0.32665515056292915, + "grad_norm": 0.09033203125, + "learning_rate": 0.00141659557286165, + "loss": 0.0903, + "step": 37631 + }, + { + "epoch": 0.3266638310431333, + "grad_norm": 0.0830078125, + "learning_rate": 0.0014165677124876623, + "loss": 0.0938, + "step": 37632 + }, + { + "epoch": 0.3266725115233375, + "grad_norm": 0.26171875, + "learning_rate": 0.0014165398517674815, + "loss": 0.1152, + "step": 37633 + }, + { + "epoch": 0.3266811920035416, + "grad_norm": 0.373046875, + "learning_rate": 0.0014165119907011373, + "loss": 0.1289, + "step": 37634 + }, + { + "epoch": 0.3266898724837458, + "grad_norm": 0.47265625, + "learning_rate": 0.0014164841292886603, + "loss": 0.082, + "step": 37635 + }, + { + "epoch": 0.32669855296394995, + "grad_norm": 0.123046875, + "learning_rate": 0.001416456267530081, + "loss": 0.1367, + "step": 37636 + }, + { + "epoch": 0.32670723344415414, + "grad_norm": 0.306640625, + "learning_rate": 0.00141642840542543, + "loss": 0.1045, + "step": 37637 + }, + { + "epoch": 0.3267159139243583, + "grad_norm": 0.1591796875, + "learning_rate": 0.0014164005429747374, + "loss": 0.0981, + "step": 37638 + }, + { + "epoch": 0.3267245944045625, + "grad_norm": 0.2451171875, + "learning_rate": 0.0014163726801780338, + "loss": 0.1064, + "step": 37639 + }, + { + "epoch": 0.3267332748847666, + "grad_norm": 0.2470703125, + "learning_rate": 0.00141634481703535, + "loss": 0.0889, + "step": 37640 + }, + { + "epoch": 0.3267419553649708, + "grad_norm": 0.1337890625, + "learning_rate": 0.0014163169535467163, + "loss": 0.1396, + "step": 37641 + }, + { + "epoch": 0.32675063584517494, + "grad_norm": 0.1474609375, + "learning_rate": 0.0014162890897121629, + "loss": 0.1504, + "step": 37642 + }, + { + "epoch": 0.32675931632537913, + "grad_norm": 0.35546875, + "learning_rate": 0.0014162612255317212, + "loss": 0.0947, + "step": 37643 + }, + { + "epoch": 0.32676799680558327, + "grad_norm": 0.1826171875, + "learning_rate": 0.00141623336100542, + "loss": 0.0996, + "step": 37644 + }, + { + "epoch": 0.32677667728578746, + "grad_norm": 0.421875, + "learning_rate": 0.0014162054961332916, + "loss": 0.1133, + "step": 37645 + }, + { + "epoch": 0.3267853577659916, + "grad_norm": 0.1845703125, + "learning_rate": 0.001416177630915365, + "loss": 0.0903, + "step": 37646 + }, + { + "epoch": 0.3267940382461958, + "grad_norm": 0.185546875, + "learning_rate": 0.0014161497653516718, + "loss": 0.0923, + "step": 37647 + }, + { + "epoch": 0.32680271872639993, + "grad_norm": 0.224609375, + "learning_rate": 0.0014161218994422416, + "loss": 0.1152, + "step": 37648 + }, + { + "epoch": 0.3268113992066041, + "grad_norm": 0.138671875, + "learning_rate": 0.001416094033187105, + "loss": 0.0864, + "step": 37649 + }, + { + "epoch": 0.32682007968680826, + "grad_norm": 0.466796875, + "learning_rate": 0.0014160661665862935, + "loss": 0.0776, + "step": 37650 + }, + { + "epoch": 0.32682876016701246, + "grad_norm": 2.0, + "learning_rate": 0.0014160382996398363, + "loss": 0.3066, + "step": 37651 + }, + { + "epoch": 0.3268374406472166, + "grad_norm": 0.34375, + "learning_rate": 0.0014160104323477645, + "loss": 0.1084, + "step": 37652 + }, + { + "epoch": 0.3268461211274208, + "grad_norm": 0.2353515625, + "learning_rate": 0.0014159825647101082, + "loss": 0.083, + "step": 37653 + }, + { + "epoch": 0.3268548016076249, + "grad_norm": 0.1845703125, + 
"learning_rate": 0.0014159546967268983, + "loss": 0.0884, + "step": 37654 + }, + { + "epoch": 0.3268634820878291, + "grad_norm": 0.076171875, + "learning_rate": 0.001415926828398165, + "loss": 0.0933, + "step": 37655 + }, + { + "epoch": 0.32687216256803325, + "grad_norm": 0.09814453125, + "learning_rate": 0.001415898959723939, + "loss": 0.1055, + "step": 37656 + }, + { + "epoch": 0.32688084304823745, + "grad_norm": 0.318359375, + "learning_rate": 0.0014158710907042505, + "loss": 0.1079, + "step": 37657 + }, + { + "epoch": 0.3268895235284416, + "grad_norm": 0.3828125, + "learning_rate": 0.0014158432213391304, + "loss": 0.0972, + "step": 37658 + }, + { + "epoch": 0.3268982040086458, + "grad_norm": 0.1904296875, + "learning_rate": 0.0014158153516286086, + "loss": 0.0879, + "step": 37659 + }, + { + "epoch": 0.3269068844888499, + "grad_norm": 0.11669921875, + "learning_rate": 0.001415787481572716, + "loss": 0.0947, + "step": 37660 + }, + { + "epoch": 0.3269155649690541, + "grad_norm": 0.158203125, + "learning_rate": 0.0014157596111714827, + "loss": 0.1074, + "step": 37661 + }, + { + "epoch": 0.32692424544925824, + "grad_norm": 0.22265625, + "learning_rate": 0.0014157317404249398, + "loss": 0.1318, + "step": 37662 + }, + { + "epoch": 0.32693292592946244, + "grad_norm": 0.2431640625, + "learning_rate": 0.0014157038693331174, + "loss": 0.1025, + "step": 37663 + }, + { + "epoch": 0.3269416064096666, + "grad_norm": 0.1953125, + "learning_rate": 0.0014156759978960459, + "loss": 0.1094, + "step": 37664 + }, + { + "epoch": 0.32695028688987077, + "grad_norm": 0.119140625, + "learning_rate": 0.0014156481261137558, + "loss": 0.0923, + "step": 37665 + }, + { + "epoch": 0.3269589673700749, + "grad_norm": 0.80078125, + "learning_rate": 0.0014156202539862776, + "loss": 0.2129, + "step": 37666 + }, + { + "epoch": 0.3269676478502791, + "grad_norm": 0.189453125, + "learning_rate": 0.001415592381513642, + "loss": 0.1045, + "step": 37667 + }, + { + "epoch": 0.32697632833048323, + "grad_norm": 0.1103515625, + "learning_rate": 0.0014155645086958793, + "loss": 0.0811, + "step": 37668 + }, + { + "epoch": 0.3269850088106874, + "grad_norm": 0.0966796875, + "learning_rate": 0.0014155366355330199, + "loss": 0.085, + "step": 37669 + }, + { + "epoch": 0.32699368929089156, + "grad_norm": 0.1484375, + "learning_rate": 0.001415508762025094, + "loss": 0.0864, + "step": 37670 + }, + { + "epoch": 0.32700236977109576, + "grad_norm": 0.10107421875, + "learning_rate": 0.001415480888172133, + "loss": 0.0874, + "step": 37671 + }, + { + "epoch": 0.3270110502512999, + "grad_norm": 0.66796875, + "learning_rate": 0.0014154530139741668, + "loss": 0.0938, + "step": 37672 + }, + { + "epoch": 0.3270197307315041, + "grad_norm": 0.69921875, + "learning_rate": 0.0014154251394312258, + "loss": 0.1055, + "step": 37673 + }, + { + "epoch": 0.3270284112117082, + "grad_norm": 0.3203125, + "learning_rate": 0.0014153972645433409, + "loss": 0.0869, + "step": 37674 + }, + { + "epoch": 0.3270370916919124, + "grad_norm": 0.328125, + "learning_rate": 0.001415369389310542, + "loss": 0.1123, + "step": 37675 + }, + { + "epoch": 0.32704577217211656, + "grad_norm": 0.482421875, + "learning_rate": 0.0014153415137328597, + "loss": 0.0986, + "step": 37676 + }, + { + "epoch": 0.32705445265232075, + "grad_norm": 0.1962890625, + "learning_rate": 0.0014153136378103248, + "loss": 0.0781, + "step": 37677 + }, + { + "epoch": 0.3270631331325249, + "grad_norm": 0.08056640625, + "learning_rate": 0.0014152857615429679, + "loss": 0.085, + "step": 37678 + }, + { + "epoch": 
0.3270718136127291, + "grad_norm": 0.73046875, + "learning_rate": 0.0014152578849308188, + "loss": 0.1123, + "step": 37679 + }, + { + "epoch": 0.3270804940929332, + "grad_norm": 0.2041015625, + "learning_rate": 0.0014152300079739086, + "loss": 0.0742, + "step": 37680 + }, + { + "epoch": 0.3270891745731374, + "grad_norm": 0.408203125, + "learning_rate": 0.0014152021306722679, + "loss": 0.1533, + "step": 37681 + }, + { + "epoch": 0.32709785505334155, + "grad_norm": 0.890625, + "learning_rate": 0.0014151742530259265, + "loss": 0.0894, + "step": 37682 + }, + { + "epoch": 0.32710653553354574, + "grad_norm": 0.1171875, + "learning_rate": 0.0014151463750349154, + "loss": 0.0825, + "step": 37683 + }, + { + "epoch": 0.3271152160137499, + "grad_norm": 0.515625, + "learning_rate": 0.001415118496699265, + "loss": 0.1016, + "step": 37684 + }, + { + "epoch": 0.32712389649395407, + "grad_norm": 0.13671875, + "learning_rate": 0.001415090618019006, + "loss": 0.1104, + "step": 37685 + }, + { + "epoch": 0.3271325769741582, + "grad_norm": 0.28515625, + "learning_rate": 0.001415062738994168, + "loss": 0.0869, + "step": 37686 + }, + { + "epoch": 0.3271412574543624, + "grad_norm": 0.24609375, + "learning_rate": 0.0014150348596247825, + "loss": 0.1582, + "step": 37687 + }, + { + "epoch": 0.32714993793456654, + "grad_norm": 0.3359375, + "learning_rate": 0.0014150069799108793, + "loss": 0.1123, + "step": 37688 + }, + { + "epoch": 0.32715861841477073, + "grad_norm": 0.53125, + "learning_rate": 0.0014149790998524898, + "loss": 0.0781, + "step": 37689 + }, + { + "epoch": 0.32716729889497487, + "grad_norm": 0.486328125, + "learning_rate": 0.0014149512194496436, + "loss": 0.085, + "step": 37690 + }, + { + "epoch": 0.32717597937517906, + "grad_norm": 0.1796875, + "learning_rate": 0.0014149233387023715, + "loss": 0.1123, + "step": 37691 + }, + { + "epoch": 0.3271846598553832, + "grad_norm": 0.328125, + "learning_rate": 0.0014148954576107039, + "loss": 0.0986, + "step": 37692 + }, + { + "epoch": 0.3271933403355874, + "grad_norm": 0.66796875, + "learning_rate": 0.0014148675761746711, + "loss": 0.0889, + "step": 37693 + }, + { + "epoch": 0.3272020208157915, + "grad_norm": 0.412109375, + "learning_rate": 0.0014148396943943042, + "loss": 0.0796, + "step": 37694 + }, + { + "epoch": 0.3272107012959957, + "grad_norm": 0.11181640625, + "learning_rate": 0.0014148118122696337, + "loss": 0.1079, + "step": 37695 + }, + { + "epoch": 0.32721938177619986, + "grad_norm": 0.1376953125, + "learning_rate": 0.0014147839298006889, + "loss": 0.0942, + "step": 37696 + }, + { + "epoch": 0.32722806225640405, + "grad_norm": 0.1064453125, + "learning_rate": 0.0014147560469875014, + "loss": 0.0918, + "step": 37697 + }, + { + "epoch": 0.3272367427366082, + "grad_norm": 0.283203125, + "learning_rate": 0.0014147281638301015, + "loss": 0.0986, + "step": 37698 + }, + { + "epoch": 0.3272454232168124, + "grad_norm": 0.1669921875, + "learning_rate": 0.0014147002803285198, + "loss": 0.1064, + "step": 37699 + }, + { + "epoch": 0.3272541036970165, + "grad_norm": 0.32421875, + "learning_rate": 0.0014146723964827863, + "loss": 0.124, + "step": 37700 + }, + { + "epoch": 0.3272627841772207, + "grad_norm": 0.08056640625, + "learning_rate": 0.001414644512292932, + "loss": 0.1006, + "step": 37701 + }, + { + "epoch": 0.32727146465742485, + "grad_norm": 0.1982421875, + "learning_rate": 0.0014146166277589872, + "loss": 0.1055, + "step": 37702 + }, + { + "epoch": 0.327280145137629, + "grad_norm": 0.353515625, + "learning_rate": 0.0014145887428809825, + "loss": 0.0786, + 
"step": 37703 + }, + { + "epoch": 0.3272888256178332, + "grad_norm": 0.06494140625, + "learning_rate": 0.0014145608576589482, + "loss": 0.0879, + "step": 37704 + }, + { + "epoch": 0.3272975060980373, + "grad_norm": 0.353515625, + "learning_rate": 0.0014145329720929142, + "loss": 0.084, + "step": 37705 + }, + { + "epoch": 0.3273061865782415, + "grad_norm": 0.1005859375, + "learning_rate": 0.0014145050861829127, + "loss": 0.1006, + "step": 37706 + }, + { + "epoch": 0.32731486705844565, + "grad_norm": 0.12451171875, + "learning_rate": 0.0014144771999289726, + "loss": 0.1094, + "step": 37707 + }, + { + "epoch": 0.32732354753864984, + "grad_norm": 0.076171875, + "learning_rate": 0.0014144493133311248, + "loss": 0.0801, + "step": 37708 + }, + { + "epoch": 0.327332228018854, + "grad_norm": 0.154296875, + "learning_rate": 0.0014144214263894003, + "loss": 0.1099, + "step": 37709 + }, + { + "epoch": 0.32734090849905817, + "grad_norm": 0.17578125, + "learning_rate": 0.0014143935391038294, + "loss": 0.0986, + "step": 37710 + }, + { + "epoch": 0.3273495889792623, + "grad_norm": 0.40234375, + "learning_rate": 0.0014143656514744418, + "loss": 0.1621, + "step": 37711 + }, + { + "epoch": 0.3273582694594665, + "grad_norm": 0.11865234375, + "learning_rate": 0.0014143377635012693, + "loss": 0.1113, + "step": 37712 + }, + { + "epoch": 0.32736694993967064, + "grad_norm": 0.099609375, + "learning_rate": 0.0014143098751843413, + "loss": 0.1011, + "step": 37713 + }, + { + "epoch": 0.32737563041987483, + "grad_norm": 0.251953125, + "learning_rate": 0.0014142819865236892, + "loss": 0.0845, + "step": 37714 + }, + { + "epoch": 0.32738431090007897, + "grad_norm": 0.5, + "learning_rate": 0.0014142540975193428, + "loss": 0.1299, + "step": 37715 + }, + { + "epoch": 0.32739299138028316, + "grad_norm": 0.1279296875, + "learning_rate": 0.0014142262081713328, + "loss": 0.1074, + "step": 37716 + }, + { + "epoch": 0.3274016718604873, + "grad_norm": 0.1376953125, + "learning_rate": 0.0014141983184796896, + "loss": 0.1279, + "step": 37717 + }, + { + "epoch": 0.3274103523406915, + "grad_norm": 0.255859375, + "learning_rate": 0.001414170428444444, + "loss": 0.082, + "step": 37718 + }, + { + "epoch": 0.32741903282089563, + "grad_norm": 0.27734375, + "learning_rate": 0.0014141425380656263, + "loss": 0.1074, + "step": 37719 + }, + { + "epoch": 0.3274277133010998, + "grad_norm": 0.21484375, + "learning_rate": 0.0014141146473432673, + "loss": 0.1318, + "step": 37720 + }, + { + "epoch": 0.32743639378130396, + "grad_norm": 0.7734375, + "learning_rate": 0.001414086756277397, + "loss": 0.1699, + "step": 37721 + }, + { + "epoch": 0.32744507426150815, + "grad_norm": 0.08447265625, + "learning_rate": 0.001414058864868046, + "loss": 0.083, + "step": 37722 + }, + { + "epoch": 0.3274537547417123, + "grad_norm": 0.41796875, + "learning_rate": 0.0014140309731152454, + "loss": 0.1133, + "step": 37723 + }, + { + "epoch": 0.3274624352219165, + "grad_norm": 0.69921875, + "learning_rate": 0.0014140030810190247, + "loss": 0.106, + "step": 37724 + }, + { + "epoch": 0.3274711157021206, + "grad_norm": 0.07080078125, + "learning_rate": 0.0014139751885794151, + "loss": 0.105, + "step": 37725 + }, + { + "epoch": 0.3274797961823248, + "grad_norm": 0.0869140625, + "learning_rate": 0.0014139472957964475, + "loss": 0.0918, + "step": 37726 + }, + { + "epoch": 0.32748847666252895, + "grad_norm": 0.29296875, + "learning_rate": 0.001413919402670151, + "loss": 0.1348, + "step": 37727 + }, + { + "epoch": 0.32749715714273314, + "grad_norm": 0.1875, + "learning_rate": 
0.0014138915092005576, + "loss": 0.0742, + "step": 37728 + }, + { + "epoch": 0.3275058376229373, + "grad_norm": 0.1044921875, + "learning_rate": 0.001413863615387697, + "loss": 0.1016, + "step": 37729 + }, + { + "epoch": 0.32751451810314147, + "grad_norm": 0.2041015625, + "learning_rate": 0.0014138357212315998, + "loss": 0.0815, + "step": 37730 + }, + { + "epoch": 0.3275231985833456, + "grad_norm": 0.455078125, + "learning_rate": 0.0014138078267322963, + "loss": 0.0918, + "step": 37731 + }, + { + "epoch": 0.3275318790635498, + "grad_norm": 0.302734375, + "learning_rate": 0.0014137799318898177, + "loss": 0.0903, + "step": 37732 + }, + { + "epoch": 0.32754055954375394, + "grad_norm": 0.11376953125, + "learning_rate": 0.001413752036704194, + "loss": 0.0986, + "step": 37733 + }, + { + "epoch": 0.32754924002395813, + "grad_norm": 0.2060546875, + "learning_rate": 0.0014137241411754555, + "loss": 0.1406, + "step": 37734 + }, + { + "epoch": 0.32755792050416227, + "grad_norm": 0.271484375, + "learning_rate": 0.0014136962453036332, + "loss": 0.1108, + "step": 37735 + }, + { + "epoch": 0.32756660098436646, + "grad_norm": 0.10498046875, + "learning_rate": 0.0014136683490887573, + "loss": 0.0781, + "step": 37736 + }, + { + "epoch": 0.3275752814645706, + "grad_norm": 0.1181640625, + "learning_rate": 0.0014136404525308588, + "loss": 0.1523, + "step": 37737 + }, + { + "epoch": 0.3275839619447748, + "grad_norm": 0.11669921875, + "learning_rate": 0.0014136125556299675, + "loss": 0.0845, + "step": 37738 + }, + { + "epoch": 0.32759264242497893, + "grad_norm": 0.234375, + "learning_rate": 0.0014135846583861141, + "loss": 0.1143, + "step": 37739 + }, + { + "epoch": 0.3276013229051831, + "grad_norm": 0.1943359375, + "learning_rate": 0.0014135567607993294, + "loss": 0.0757, + "step": 37740 + }, + { + "epoch": 0.32761000338538726, + "grad_norm": 0.318359375, + "learning_rate": 0.0014135288628696438, + "loss": 0.1191, + "step": 37741 + }, + { + "epoch": 0.32761868386559145, + "grad_norm": 0.2177734375, + "learning_rate": 0.0014135009645970878, + "loss": 0.0845, + "step": 37742 + }, + { + "epoch": 0.3276273643457956, + "grad_norm": 0.2099609375, + "learning_rate": 0.0014134730659816916, + "loss": 0.1543, + "step": 37743 + }, + { + "epoch": 0.3276360448259998, + "grad_norm": 0.1611328125, + "learning_rate": 0.0014134451670234863, + "loss": 0.0562, + "step": 37744 + }, + { + "epoch": 0.3276447253062039, + "grad_norm": 1.25, + "learning_rate": 0.0014134172677225014, + "loss": 0.1226, + "step": 37745 + }, + { + "epoch": 0.3276534057864081, + "grad_norm": 0.1162109375, + "learning_rate": 0.0014133893680787686, + "loss": 0.0977, + "step": 37746 + }, + { + "epoch": 0.32766208626661225, + "grad_norm": 0.3671875, + "learning_rate": 0.0014133614680923179, + "loss": 0.0981, + "step": 37747 + }, + { + "epoch": 0.32767076674681644, + "grad_norm": 0.11279296875, + "learning_rate": 0.0014133335677631795, + "loss": 0.106, + "step": 37748 + }, + { + "epoch": 0.3276794472270206, + "grad_norm": 0.2001953125, + "learning_rate": 0.0014133056670913846, + "loss": 0.0762, + "step": 37749 + }, + { + "epoch": 0.3276881277072248, + "grad_norm": 0.232421875, + "learning_rate": 0.0014132777660769633, + "loss": 0.1128, + "step": 37750 + }, + { + "epoch": 0.3276968081874289, + "grad_norm": 0.5859375, + "learning_rate": 0.0014132498647199461, + "loss": 0.1118, + "step": 37751 + }, + { + "epoch": 0.3277054886676331, + "grad_norm": 0.1767578125, + "learning_rate": 0.0014132219630203636, + "loss": 0.0889, + "step": 37752 + }, + { + "epoch": 
0.32771416914783724, + "grad_norm": 0.41015625, + "learning_rate": 0.0014131940609782459, + "loss": 0.1162, + "step": 37753 + }, + { + "epoch": 0.32772284962804143, + "grad_norm": 0.240234375, + "learning_rate": 0.0014131661585936241, + "loss": 0.105, + "step": 37754 + }, + { + "epoch": 0.3277315301082456, + "grad_norm": 0.490234375, + "learning_rate": 0.0014131382558665287, + "loss": 0.0977, + "step": 37755 + }, + { + "epoch": 0.32774021058844977, + "grad_norm": 0.1630859375, + "learning_rate": 0.0014131103527969897, + "loss": 0.126, + "step": 37756 + }, + { + "epoch": 0.3277488910686539, + "grad_norm": 0.2353515625, + "learning_rate": 0.0014130824493850378, + "loss": 0.1162, + "step": 37757 + }, + { + "epoch": 0.3277575715488581, + "grad_norm": 0.08740234375, + "learning_rate": 0.0014130545456307042, + "loss": 0.1387, + "step": 37758 + }, + { + "epoch": 0.32776625202906223, + "grad_norm": 0.236328125, + "learning_rate": 0.0014130266415340188, + "loss": 0.0952, + "step": 37759 + }, + { + "epoch": 0.3277749325092664, + "grad_norm": 0.40234375, + "learning_rate": 0.0014129987370950117, + "loss": 0.0835, + "step": 37760 + }, + { + "epoch": 0.32778361298947056, + "grad_norm": 0.8671875, + "learning_rate": 0.001412970832313714, + "loss": 0.1357, + "step": 37761 + }, + { + "epoch": 0.32779229346967476, + "grad_norm": 0.2470703125, + "learning_rate": 0.0014129429271901562, + "loss": 0.1045, + "step": 37762 + }, + { + "epoch": 0.3278009739498789, + "grad_norm": 0.421875, + "learning_rate": 0.0014129150217243689, + "loss": 0.0825, + "step": 37763 + }, + { + "epoch": 0.3278096544300831, + "grad_norm": 0.294921875, + "learning_rate": 0.001412887115916382, + "loss": 0.125, + "step": 37764 + }, + { + "epoch": 0.3278183349102872, + "grad_norm": 0.24609375, + "learning_rate": 0.0014128592097662268, + "loss": 0.0874, + "step": 37765 + }, + { + "epoch": 0.3278270153904914, + "grad_norm": 0.228515625, + "learning_rate": 0.0014128313032739333, + "loss": 0.0757, + "step": 37766 + }, + { + "epoch": 0.32783569587069555, + "grad_norm": 0.265625, + "learning_rate": 0.0014128033964395325, + "loss": 0.1367, + "step": 37767 + }, + { + "epoch": 0.32784437635089975, + "grad_norm": 0.2314453125, + "learning_rate": 0.0014127754892630541, + "loss": 0.0996, + "step": 37768 + }, + { + "epoch": 0.3278530568311039, + "grad_norm": 0.408203125, + "learning_rate": 0.0014127475817445295, + "loss": 0.0747, + "step": 37769 + }, + { + "epoch": 0.3278617373113081, + "grad_norm": 0.451171875, + "learning_rate": 0.0014127196738839888, + "loss": 0.1191, + "step": 37770 + }, + { + "epoch": 0.3278704177915122, + "grad_norm": 0.10498046875, + "learning_rate": 0.0014126917656814625, + "loss": 0.1055, + "step": 37771 + }, + { + "epoch": 0.3278790982717164, + "grad_norm": 0.25390625, + "learning_rate": 0.0014126638571369812, + "loss": 0.1055, + "step": 37772 + }, + { + "epoch": 0.32788777875192054, + "grad_norm": 0.11572265625, + "learning_rate": 0.0014126359482505753, + "loss": 0.1436, + "step": 37773 + }, + { + "epoch": 0.32789645923212474, + "grad_norm": 0.2412109375, + "learning_rate": 0.0014126080390222755, + "loss": 0.1143, + "step": 37774 + }, + { + "epoch": 0.3279051397123289, + "grad_norm": 0.1748046875, + "learning_rate": 0.001412580129452112, + "loss": 0.1113, + "step": 37775 + }, + { + "epoch": 0.32791382019253307, + "grad_norm": 0.1953125, + "learning_rate": 0.0014125522195401162, + "loss": 0.1387, + "step": 37776 + }, + { + "epoch": 0.3279225006727372, + "grad_norm": 0.404296875, + "learning_rate": 0.0014125243092863175, + 
"loss": 0.0918, + "step": 37777 + }, + { + "epoch": 0.3279311811529414, + "grad_norm": 0.322265625, + "learning_rate": 0.001412496398690747, + "loss": 0.0996, + "step": 37778 + }, + { + "epoch": 0.32793986163314554, + "grad_norm": 0.36328125, + "learning_rate": 0.0014124684877534351, + "loss": 0.083, + "step": 37779 + }, + { + "epoch": 0.32794854211334973, + "grad_norm": 0.1748046875, + "learning_rate": 0.0014124405764744125, + "loss": 0.0854, + "step": 37780 + }, + { + "epoch": 0.32795722259355387, + "grad_norm": 0.18359375, + "learning_rate": 0.0014124126648537097, + "loss": 0.083, + "step": 37781 + }, + { + "epoch": 0.32796590307375806, + "grad_norm": 0.2021484375, + "learning_rate": 0.0014123847528913567, + "loss": 0.0889, + "step": 37782 + }, + { + "epoch": 0.3279745835539622, + "grad_norm": 0.1806640625, + "learning_rate": 0.0014123568405873847, + "loss": 0.0864, + "step": 37783 + }, + { + "epoch": 0.3279832640341664, + "grad_norm": 0.2470703125, + "learning_rate": 0.0014123289279418237, + "loss": 0.1289, + "step": 37784 + }, + { + "epoch": 0.3279919445143705, + "grad_norm": 0.322265625, + "learning_rate": 0.0014123010149547047, + "loss": 0.126, + "step": 37785 + }, + { + "epoch": 0.3280006249945747, + "grad_norm": 0.38671875, + "learning_rate": 0.001412273101626058, + "loss": 0.1494, + "step": 37786 + }, + { + "epoch": 0.32800930547477886, + "grad_norm": 0.373046875, + "learning_rate": 0.001412245187955914, + "loss": 0.0811, + "step": 37787 + }, + { + "epoch": 0.32801798595498305, + "grad_norm": 0.181640625, + "learning_rate": 0.0014122172739443036, + "loss": 0.0698, + "step": 37788 + }, + { + "epoch": 0.3280266664351872, + "grad_norm": 0.376953125, + "learning_rate": 0.0014121893595912568, + "loss": 0.1157, + "step": 37789 + }, + { + "epoch": 0.3280353469153914, + "grad_norm": 0.06591796875, + "learning_rate": 0.0014121614448968046, + "loss": 0.0801, + "step": 37790 + }, + { + "epoch": 0.3280440273955955, + "grad_norm": 0.0986328125, + "learning_rate": 0.001412133529860977, + "loss": 0.0845, + "step": 37791 + }, + { + "epoch": 0.3280527078757997, + "grad_norm": 0.1806640625, + "learning_rate": 0.0014121056144838053, + "loss": 0.0903, + "step": 37792 + }, + { + "epoch": 0.32806138835600385, + "grad_norm": 0.390625, + "learning_rate": 0.0014120776987653193, + "loss": 0.125, + "step": 37793 + }, + { + "epoch": 0.32807006883620804, + "grad_norm": 0.080078125, + "learning_rate": 0.0014120497827055498, + "loss": 0.0859, + "step": 37794 + }, + { + "epoch": 0.3280787493164122, + "grad_norm": 0.515625, + "learning_rate": 0.0014120218663045275, + "loss": 0.1475, + "step": 37795 + }, + { + "epoch": 0.32808742979661637, + "grad_norm": 0.306640625, + "learning_rate": 0.0014119939495622828, + "loss": 0.0928, + "step": 37796 + }, + { + "epoch": 0.3280961102768205, + "grad_norm": 0.13671875, + "learning_rate": 0.0014119660324788462, + "loss": 0.1182, + "step": 37797 + }, + { + "epoch": 0.3281047907570247, + "grad_norm": 0.1142578125, + "learning_rate": 0.001411938115054248, + "loss": 0.1025, + "step": 37798 + }, + { + "epoch": 0.32811347123722884, + "grad_norm": 0.259765625, + "learning_rate": 0.0014119101972885192, + "loss": 0.0742, + "step": 37799 + }, + { + "epoch": 0.32812215171743303, + "grad_norm": 0.3671875, + "learning_rate": 0.0014118822791816903, + "loss": 0.0854, + "step": 37800 + }, + { + "epoch": 0.32813083219763717, + "grad_norm": 0.55859375, + "learning_rate": 0.0014118543607337913, + "loss": 0.0977, + "step": 37801 + }, + { + "epoch": 0.32813951267784136, + "grad_norm": 0.1484375, 
+ "learning_rate": 0.001411826441944853, + "loss": 0.0928, + "step": 37802 + }, + { + "epoch": 0.3281481931580455, + "grad_norm": 0.08056640625, + "learning_rate": 0.001411798522814906, + "loss": 0.0967, + "step": 37803 + }, + { + "epoch": 0.3281568736382497, + "grad_norm": 0.388671875, + "learning_rate": 0.001411770603343981, + "loss": 0.1187, + "step": 37804 + }, + { + "epoch": 0.32816555411845383, + "grad_norm": 0.435546875, + "learning_rate": 0.0014117426835321084, + "loss": 0.0767, + "step": 37805 + }, + { + "epoch": 0.328174234598658, + "grad_norm": 0.1923828125, + "learning_rate": 0.0014117147633793186, + "loss": 0.127, + "step": 37806 + }, + { + "epoch": 0.32818291507886216, + "grad_norm": 0.703125, + "learning_rate": 0.001411686842885642, + "loss": 0.1318, + "step": 37807 + }, + { + "epoch": 0.32819159555906635, + "grad_norm": 0.23046875, + "learning_rate": 0.0014116589220511096, + "loss": 0.1543, + "step": 37808 + }, + { + "epoch": 0.3282002760392705, + "grad_norm": 0.162109375, + "learning_rate": 0.0014116310008757515, + "loss": 0.0947, + "step": 37809 + }, + { + "epoch": 0.3282089565194747, + "grad_norm": 0.07373046875, + "learning_rate": 0.0014116030793595987, + "loss": 0.1025, + "step": 37810 + }, + { + "epoch": 0.3282176369996788, + "grad_norm": 0.7578125, + "learning_rate": 0.0014115751575026815, + "loss": 0.1279, + "step": 37811 + }, + { + "epoch": 0.328226317479883, + "grad_norm": 1.7109375, + "learning_rate": 0.0014115472353050298, + "loss": 0.1162, + "step": 37812 + }, + { + "epoch": 0.32823499796008715, + "grad_norm": 0.80078125, + "learning_rate": 0.0014115193127666752, + "loss": 0.1182, + "step": 37813 + }, + { + "epoch": 0.32824367844029134, + "grad_norm": 0.79296875, + "learning_rate": 0.0014114913898876475, + "loss": 0.1367, + "step": 37814 + }, + { + "epoch": 0.3282523589204955, + "grad_norm": 0.52734375, + "learning_rate": 0.001411463466667978, + "loss": 0.1406, + "step": 37815 + }, + { + "epoch": 0.3282610394006997, + "grad_norm": 0.11865234375, + "learning_rate": 0.0014114355431076963, + "loss": 0.1367, + "step": 37816 + }, + { + "epoch": 0.3282697198809038, + "grad_norm": 0.474609375, + "learning_rate": 0.0014114076192068335, + "loss": 0.1201, + "step": 37817 + }, + { + "epoch": 0.328278400361108, + "grad_norm": 0.294921875, + "learning_rate": 0.00141137969496542, + "loss": 0.1172, + "step": 37818 + }, + { + "epoch": 0.32828708084131214, + "grad_norm": 0.2294921875, + "learning_rate": 0.0014113517703834863, + "loss": 0.1055, + "step": 37819 + }, + { + "epoch": 0.32829576132151633, + "grad_norm": 0.212890625, + "learning_rate": 0.001411323845461063, + "loss": 0.1289, + "step": 37820 + }, + { + "epoch": 0.32830444180172047, + "grad_norm": 0.41015625, + "learning_rate": 0.0014112959201981808, + "loss": 0.0859, + "step": 37821 + }, + { + "epoch": 0.32831312228192466, + "grad_norm": 0.421875, + "learning_rate": 0.0014112679945948697, + "loss": 0.1167, + "step": 37822 + }, + { + "epoch": 0.3283218027621288, + "grad_norm": 0.09228515625, + "learning_rate": 0.0014112400686511607, + "loss": 0.0938, + "step": 37823 + }, + { + "epoch": 0.328330483242333, + "grad_norm": 0.341796875, + "learning_rate": 0.0014112121423670843, + "loss": 0.0908, + "step": 37824 + }, + { + "epoch": 0.32833916372253713, + "grad_norm": 0.125, + "learning_rate": 0.0014111842157426713, + "loss": 0.1074, + "step": 37825 + }, + { + "epoch": 0.32834784420274127, + "grad_norm": 0.3125, + "learning_rate": 0.0014111562887779512, + "loss": 0.1455, + "step": 37826 + }, + { + "epoch": 0.32835652468294546, 
+ "grad_norm": 0.072265625, + "learning_rate": 0.001411128361472956, + "loss": 0.0757, + "step": 37827 + }, + { + "epoch": 0.3283652051631496, + "grad_norm": 0.1865234375, + "learning_rate": 0.001411100433827715, + "loss": 0.1128, + "step": 37828 + }, + { + "epoch": 0.3283738856433538, + "grad_norm": 0.177734375, + "learning_rate": 0.0014110725058422595, + "loss": 0.0986, + "step": 37829 + }, + { + "epoch": 0.32838256612355793, + "grad_norm": 0.455078125, + "learning_rate": 0.0014110445775166196, + "loss": 0.0986, + "step": 37830 + }, + { + "epoch": 0.3283912466037621, + "grad_norm": 0.23046875, + "learning_rate": 0.0014110166488508263, + "loss": 0.1211, + "step": 37831 + }, + { + "epoch": 0.32839992708396626, + "grad_norm": 0.0771484375, + "learning_rate": 0.0014109887198449093, + "loss": 0.0884, + "step": 37832 + }, + { + "epoch": 0.32840860756417045, + "grad_norm": 0.2490234375, + "learning_rate": 0.0014109607904989004, + "loss": 0.1133, + "step": 37833 + }, + { + "epoch": 0.3284172880443746, + "grad_norm": 0.10009765625, + "learning_rate": 0.001410932860812829, + "loss": 0.0918, + "step": 37834 + }, + { + "epoch": 0.3284259685245788, + "grad_norm": 0.2275390625, + "learning_rate": 0.001410904930786726, + "loss": 0.0977, + "step": 37835 + }, + { + "epoch": 0.3284346490047829, + "grad_norm": 0.341796875, + "learning_rate": 0.0014108770004206226, + "loss": 0.0767, + "step": 37836 + }, + { + "epoch": 0.3284433294849871, + "grad_norm": 0.154296875, + "learning_rate": 0.0014108490697145484, + "loss": 0.1016, + "step": 37837 + }, + { + "epoch": 0.32845200996519125, + "grad_norm": 0.5390625, + "learning_rate": 0.0014108211386685346, + "loss": 0.0918, + "step": 37838 + }, + { + "epoch": 0.32846069044539544, + "grad_norm": 0.373046875, + "learning_rate": 0.001410793207282611, + "loss": 0.0742, + "step": 37839 + }, + { + "epoch": 0.3284693709255996, + "grad_norm": 0.267578125, + "learning_rate": 0.001410765275556809, + "loss": 0.0762, + "step": 37840 + }, + { + "epoch": 0.3284780514058038, + "grad_norm": 0.0986328125, + "learning_rate": 0.0014107373434911585, + "loss": 0.1108, + "step": 37841 + }, + { + "epoch": 0.3284867318860079, + "grad_norm": 0.52734375, + "learning_rate": 0.0014107094110856905, + "loss": 0.1445, + "step": 37842 + }, + { + "epoch": 0.3284954123662121, + "grad_norm": 0.240234375, + "learning_rate": 0.0014106814783404352, + "loss": 0.0933, + "step": 37843 + }, + { + "epoch": 0.32850409284641624, + "grad_norm": 0.423828125, + "learning_rate": 0.0014106535452554235, + "loss": 0.1074, + "step": 37844 + }, + { + "epoch": 0.32851277332662043, + "grad_norm": 0.291015625, + "learning_rate": 0.0014106256118306857, + "loss": 0.1211, + "step": 37845 + }, + { + "epoch": 0.32852145380682457, + "grad_norm": 0.08837890625, + "learning_rate": 0.0014105976780662524, + "loss": 0.0845, + "step": 37846 + }, + { + "epoch": 0.32853013428702876, + "grad_norm": 0.369140625, + "learning_rate": 0.0014105697439621538, + "loss": 0.084, + "step": 37847 + }, + { + "epoch": 0.3285388147672329, + "grad_norm": 0.6171875, + "learning_rate": 0.0014105418095184212, + "loss": 0.0845, + "step": 37848 + }, + { + "epoch": 0.3285474952474371, + "grad_norm": 3.46875, + "learning_rate": 0.0014105138747350847, + "loss": 0.3594, + "step": 37849 + }, + { + "epoch": 0.32855617572764123, + "grad_norm": 0.1376953125, + "learning_rate": 0.001410485939612175, + "loss": 0.1084, + "step": 37850 + }, + { + "epoch": 0.3285648562078454, + "grad_norm": 0.46484375, + "learning_rate": 0.0014104580041497223, + "loss": 0.1152, + "step": 
37851 + }, + { + "epoch": 0.32857353668804956, + "grad_norm": 0.90234375, + "learning_rate": 0.0014104300683477574, + "loss": 0.1162, + "step": 37852 + }, + { + "epoch": 0.32858221716825375, + "grad_norm": 0.4765625, + "learning_rate": 0.0014104021322063108, + "loss": 0.1367, + "step": 37853 + }, + { + "epoch": 0.3285908976484579, + "grad_norm": 0.099609375, + "learning_rate": 0.0014103741957254134, + "loss": 0.0757, + "step": 37854 + }, + { + "epoch": 0.3285995781286621, + "grad_norm": 0.265625, + "learning_rate": 0.0014103462589050953, + "loss": 0.1045, + "step": 37855 + }, + { + "epoch": 0.3286082586088662, + "grad_norm": 0.1611328125, + "learning_rate": 0.001410318321745387, + "loss": 0.1074, + "step": 37856 + }, + { + "epoch": 0.3286169390890704, + "grad_norm": 0.20703125, + "learning_rate": 0.0014102903842463192, + "loss": 0.1045, + "step": 37857 + }, + { + "epoch": 0.32862561956927455, + "grad_norm": 0.1455078125, + "learning_rate": 0.0014102624464079228, + "loss": 0.0923, + "step": 37858 + }, + { + "epoch": 0.32863430004947874, + "grad_norm": 0.1533203125, + "learning_rate": 0.0014102345082302279, + "loss": 0.1104, + "step": 37859 + }, + { + "epoch": 0.3286429805296829, + "grad_norm": 0.828125, + "learning_rate": 0.0014102065697132653, + "loss": 0.083, + "step": 37860 + }, + { + "epoch": 0.3286516610098871, + "grad_norm": 0.55859375, + "learning_rate": 0.0014101786308570652, + "loss": 0.0845, + "step": 37861 + }, + { + "epoch": 0.3286603414900912, + "grad_norm": 0.2158203125, + "learning_rate": 0.0014101506916616587, + "loss": 0.1123, + "step": 37862 + }, + { + "epoch": 0.3286690219702954, + "grad_norm": 0.09765625, + "learning_rate": 0.001410122752127076, + "loss": 0.1089, + "step": 37863 + }, + { + "epoch": 0.32867770245049954, + "grad_norm": 0.330078125, + "learning_rate": 0.0014100948122533477, + "loss": 0.1162, + "step": 37864 + }, + { + "epoch": 0.32868638293070374, + "grad_norm": 0.1455078125, + "learning_rate": 0.0014100668720405042, + "loss": 0.0938, + "step": 37865 + }, + { + "epoch": 0.3286950634109079, + "grad_norm": 0.1435546875, + "learning_rate": 0.0014100389314885766, + "loss": 0.1201, + "step": 37866 + }, + { + "epoch": 0.32870374389111207, + "grad_norm": 0.376953125, + "learning_rate": 0.0014100109905975948, + "loss": 0.166, + "step": 37867 + }, + { + "epoch": 0.3287124243713162, + "grad_norm": 0.6015625, + "learning_rate": 0.0014099830493675897, + "loss": 0.0762, + "step": 37868 + }, + { + "epoch": 0.3287211048515204, + "grad_norm": 0.34375, + "learning_rate": 0.001409955107798592, + "loss": 0.0859, + "step": 37869 + }, + { + "epoch": 0.32872978533172453, + "grad_norm": 0.25, + "learning_rate": 0.0014099271658906316, + "loss": 0.1025, + "step": 37870 + }, + { + "epoch": 0.3287384658119287, + "grad_norm": 0.341796875, + "learning_rate": 0.0014098992236437399, + "loss": 0.1084, + "step": 37871 + }, + { + "epoch": 0.32874714629213286, + "grad_norm": 0.08984375, + "learning_rate": 0.0014098712810579471, + "loss": 0.1309, + "step": 37872 + }, + { + "epoch": 0.32875582677233706, + "grad_norm": 0.10498046875, + "learning_rate": 0.0014098433381332833, + "loss": 0.0947, + "step": 37873 + }, + { + "epoch": 0.3287645072525412, + "grad_norm": 0.1015625, + "learning_rate": 0.00140981539486978, + "loss": 0.0791, + "step": 37874 + }, + { + "epoch": 0.3287731877327454, + "grad_norm": 0.197265625, + "learning_rate": 0.0014097874512674666, + "loss": 0.1162, + "step": 37875 + }, + { + "epoch": 0.3287818682129495, + "grad_norm": 0.349609375, + "learning_rate": 0.001409759507326375, 
+ "loss": 0.0854, + "step": 37876 + }, + { + "epoch": 0.3287905486931537, + "grad_norm": 0.78515625, + "learning_rate": 0.0014097315630465345, + "loss": 0.1016, + "step": 37877 + }, + { + "epoch": 0.32879922917335785, + "grad_norm": 0.1953125, + "learning_rate": 0.0014097036184279764, + "loss": 0.1162, + "step": 37878 + }, + { + "epoch": 0.32880790965356205, + "grad_norm": 0.33984375, + "learning_rate": 0.001409675673470731, + "loss": 0.0859, + "step": 37879 + }, + { + "epoch": 0.3288165901337662, + "grad_norm": 0.08447265625, + "learning_rate": 0.001409647728174829, + "loss": 0.0679, + "step": 37880 + }, + { + "epoch": 0.3288252706139704, + "grad_norm": 0.27734375, + "learning_rate": 0.0014096197825403012, + "loss": 0.0869, + "step": 37881 + }, + { + "epoch": 0.3288339510941745, + "grad_norm": 0.2275390625, + "learning_rate": 0.0014095918365671775, + "loss": 0.0879, + "step": 37882 + }, + { + "epoch": 0.3288426315743787, + "grad_norm": 0.0712890625, + "learning_rate": 0.001409563890255489, + "loss": 0.0835, + "step": 37883 + }, + { + "epoch": 0.32885131205458284, + "grad_norm": 0.39453125, + "learning_rate": 0.001409535943605266, + "loss": 0.1377, + "step": 37884 + }, + { + "epoch": 0.32885999253478704, + "grad_norm": 0.31640625, + "learning_rate": 0.0014095079966165392, + "loss": 0.0801, + "step": 37885 + }, + { + "epoch": 0.3288686730149912, + "grad_norm": 0.302734375, + "learning_rate": 0.001409480049289339, + "loss": 0.0898, + "step": 37886 + }, + { + "epoch": 0.32887735349519537, + "grad_norm": 0.265625, + "learning_rate": 0.001409452101623696, + "loss": 0.1162, + "step": 37887 + }, + { + "epoch": 0.3288860339753995, + "grad_norm": 0.1640625, + "learning_rate": 0.0014094241536196411, + "loss": 0.0742, + "step": 37888 + }, + { + "epoch": 0.3288947144556037, + "grad_norm": 0.8046875, + "learning_rate": 0.0014093962052772043, + "loss": 0.0967, + "step": 37889 + }, + { + "epoch": 0.32890339493580784, + "grad_norm": 0.5625, + "learning_rate": 0.0014093682565964165, + "loss": 0.085, + "step": 37890 + }, + { + "epoch": 0.32891207541601203, + "grad_norm": 0.421875, + "learning_rate": 0.001409340307577308, + "loss": 0.1396, + "step": 37891 + }, + { + "epoch": 0.32892075589621617, + "grad_norm": 0.142578125, + "learning_rate": 0.0014093123582199104, + "loss": 0.0967, + "step": 37892 + }, + { + "epoch": 0.32892943637642036, + "grad_norm": 0.26171875, + "learning_rate": 0.0014092844085242529, + "loss": 0.124, + "step": 37893 + }, + { + "epoch": 0.3289381168566245, + "grad_norm": 0.54296875, + "learning_rate": 0.0014092564584903666, + "loss": 0.103, + "step": 37894 + }, + { + "epoch": 0.3289467973368287, + "grad_norm": 0.11181640625, + "learning_rate": 0.001409228508118282, + "loss": 0.1152, + "step": 37895 + }, + { + "epoch": 0.3289554778170328, + "grad_norm": 0.265625, + "learning_rate": 0.0014092005574080303, + "loss": 0.0762, + "step": 37896 + }, + { + "epoch": 0.328964158297237, + "grad_norm": 1.1640625, + "learning_rate": 0.0014091726063596413, + "loss": 0.209, + "step": 37897 + }, + { + "epoch": 0.32897283877744116, + "grad_norm": 0.169921875, + "learning_rate": 0.0014091446549731454, + "loss": 0.0918, + "step": 37898 + }, + { + "epoch": 0.32898151925764535, + "grad_norm": 0.234375, + "learning_rate": 0.0014091167032485735, + "loss": 0.1094, + "step": 37899 + }, + { + "epoch": 0.3289901997378495, + "grad_norm": 0.67578125, + "learning_rate": 0.0014090887511859565, + "loss": 0.1172, + "step": 37900 + }, + { + "epoch": 0.3289988802180537, + "grad_norm": 0.1669921875, + "learning_rate": 
0.0014090607987853247, + "loss": 0.1338, + "step": 37901 + }, + { + "epoch": 0.3290075606982578, + "grad_norm": 0.27734375, + "learning_rate": 0.0014090328460467088, + "loss": 0.0986, + "step": 37902 + }, + { + "epoch": 0.329016241178462, + "grad_norm": 0.1162109375, + "learning_rate": 0.001409004892970139, + "loss": 0.0942, + "step": 37903 + }, + { + "epoch": 0.32902492165866615, + "grad_norm": 0.74609375, + "learning_rate": 0.001408976939555646, + "loss": 0.082, + "step": 37904 + }, + { + "epoch": 0.32903360213887034, + "grad_norm": 0.1171875, + "learning_rate": 0.0014089489858032608, + "loss": 0.1289, + "step": 37905 + }, + { + "epoch": 0.3290422826190745, + "grad_norm": 0.130859375, + "learning_rate": 0.0014089210317130136, + "loss": 0.0723, + "step": 37906 + }, + { + "epoch": 0.32905096309927867, + "grad_norm": 0.2421875, + "learning_rate": 0.0014088930772849343, + "loss": 0.0947, + "step": 37907 + }, + { + "epoch": 0.3290596435794828, + "grad_norm": 1.3984375, + "learning_rate": 0.001408865122519055, + "loss": 0.1084, + "step": 37908 + }, + { + "epoch": 0.329068324059687, + "grad_norm": 0.119140625, + "learning_rate": 0.001408837167415405, + "loss": 0.1045, + "step": 37909 + }, + { + "epoch": 0.32907700453989114, + "grad_norm": 0.08349609375, + "learning_rate": 0.0014088092119740154, + "loss": 0.0996, + "step": 37910 + }, + { + "epoch": 0.32908568502009533, + "grad_norm": 0.39453125, + "learning_rate": 0.0014087812561949169, + "loss": 0.104, + "step": 37911 + }, + { + "epoch": 0.32909436550029947, + "grad_norm": 0.20703125, + "learning_rate": 0.0014087533000781396, + "loss": 0.0918, + "step": 37912 + }, + { + "epoch": 0.32910304598050366, + "grad_norm": 0.1357421875, + "learning_rate": 0.0014087253436237143, + "loss": 0.1143, + "step": 37913 + }, + { + "epoch": 0.3291117264607078, + "grad_norm": 0.10205078125, + "learning_rate": 0.0014086973868316722, + "loss": 0.1406, + "step": 37914 + }, + { + "epoch": 0.329120406940912, + "grad_norm": 0.40625, + "learning_rate": 0.0014086694297020428, + "loss": 0.0933, + "step": 37915 + }, + { + "epoch": 0.32912908742111613, + "grad_norm": 0.69921875, + "learning_rate": 0.0014086414722348569, + "loss": 0.1084, + "step": 37916 + }, + { + "epoch": 0.3291377679013203, + "grad_norm": 0.11328125, + "learning_rate": 0.0014086135144301458, + "loss": 0.1299, + "step": 37917 + }, + { + "epoch": 0.32914644838152446, + "grad_norm": 0.09375, + "learning_rate": 0.001408585556287939, + "loss": 0.0903, + "step": 37918 + }, + { + "epoch": 0.32915512886172865, + "grad_norm": 0.359375, + "learning_rate": 0.0014085575978082681, + "loss": 0.1235, + "step": 37919 + }, + { + "epoch": 0.3291638093419328, + "grad_norm": 0.283203125, + "learning_rate": 0.0014085296389911635, + "loss": 0.1025, + "step": 37920 + }, + { + "epoch": 0.329172489822137, + "grad_norm": 0.298828125, + "learning_rate": 0.0014085016798366548, + "loss": 0.1025, + "step": 37921 + }, + { + "epoch": 0.3291811703023411, + "grad_norm": 0.69140625, + "learning_rate": 0.001408473720344774, + "loss": 0.1094, + "step": 37922 + }, + { + "epoch": 0.3291898507825453, + "grad_norm": 0.1484375, + "learning_rate": 0.0014084457605155506, + "loss": 0.1133, + "step": 37923 + }, + { + "epoch": 0.32919853126274945, + "grad_norm": 0.255859375, + "learning_rate": 0.0014084178003490157, + "loss": 0.0898, + "step": 37924 + }, + { + "epoch": 0.32920721174295364, + "grad_norm": 0.1533203125, + "learning_rate": 0.0014083898398451998, + "loss": 0.127, + "step": 37925 + }, + { + "epoch": 0.3292158922231578, + "grad_norm": 
0.41015625, + "learning_rate": 0.0014083618790041333, + "loss": 0.1035, + "step": 37926 + }, + { + "epoch": 0.329224572703362, + "grad_norm": 0.0859375, + "learning_rate": 0.0014083339178258467, + "loss": 0.1064, + "step": 37927 + }, + { + "epoch": 0.3292332531835661, + "grad_norm": 1.671875, + "learning_rate": 0.0014083059563103708, + "loss": 0.1396, + "step": 37928 + }, + { + "epoch": 0.3292419336637703, + "grad_norm": 0.30859375, + "learning_rate": 0.001408277994457736, + "loss": 0.1152, + "step": 37929 + }, + { + "epoch": 0.32925061414397444, + "grad_norm": 0.10888671875, + "learning_rate": 0.0014082500322679734, + "loss": 0.1113, + "step": 37930 + }, + { + "epoch": 0.32925929462417863, + "grad_norm": 0.32421875, + "learning_rate": 0.0014082220697411135, + "loss": 0.123, + "step": 37931 + }, + { + "epoch": 0.32926797510438277, + "grad_norm": 0.189453125, + "learning_rate": 0.001408194106877186, + "loss": 0.0889, + "step": 37932 + }, + { + "epoch": 0.32927665558458696, + "grad_norm": 0.1162109375, + "learning_rate": 0.0014081661436762224, + "loss": 0.0781, + "step": 37933 + }, + { + "epoch": 0.3292853360647911, + "grad_norm": 0.302734375, + "learning_rate": 0.0014081381801382526, + "loss": 0.1641, + "step": 37934 + }, + { + "epoch": 0.3292940165449953, + "grad_norm": 0.189453125, + "learning_rate": 0.0014081102162633076, + "loss": 0.1118, + "step": 37935 + }, + { + "epoch": 0.32930269702519943, + "grad_norm": 0.51171875, + "learning_rate": 0.001408082252051418, + "loss": 0.0938, + "step": 37936 + }, + { + "epoch": 0.3293113775054036, + "grad_norm": 0.3671875, + "learning_rate": 0.0014080542875026141, + "loss": 0.0977, + "step": 37937 + }, + { + "epoch": 0.32932005798560776, + "grad_norm": 0.265625, + "learning_rate": 0.0014080263226169268, + "loss": 0.0752, + "step": 37938 + }, + { + "epoch": 0.32932873846581195, + "grad_norm": 0.1298828125, + "learning_rate": 0.0014079983573943864, + "loss": 0.1201, + "step": 37939 + }, + { + "epoch": 0.3293374189460161, + "grad_norm": 0.2294921875, + "learning_rate": 0.001407970391835024, + "loss": 0.1221, + "step": 37940 + }, + { + "epoch": 0.3293460994262203, + "grad_norm": 0.26953125, + "learning_rate": 0.0014079424259388696, + "loss": 0.0996, + "step": 37941 + }, + { + "epoch": 0.3293547799064244, + "grad_norm": 0.1552734375, + "learning_rate": 0.0014079144597059538, + "loss": 0.1523, + "step": 37942 + }, + { + "epoch": 0.3293634603866286, + "grad_norm": 0.1884765625, + "learning_rate": 0.0014078864931363075, + "loss": 0.0928, + "step": 37943 + }, + { + "epoch": 0.32937214086683275, + "grad_norm": 0.29296875, + "learning_rate": 0.0014078585262299612, + "loss": 0.0752, + "step": 37944 + }, + { + "epoch": 0.32938082134703695, + "grad_norm": 0.318359375, + "learning_rate": 0.0014078305589869456, + "loss": 0.0723, + "step": 37945 + }, + { + "epoch": 0.3293895018272411, + "grad_norm": 0.15625, + "learning_rate": 0.0014078025914072909, + "loss": 0.1157, + "step": 37946 + }, + { + "epoch": 0.3293981823074453, + "grad_norm": 0.1328125, + "learning_rate": 0.0014077746234910278, + "loss": 0.1152, + "step": 37947 + }, + { + "epoch": 0.3294068627876494, + "grad_norm": 0.48828125, + "learning_rate": 0.001407746655238187, + "loss": 0.0952, + "step": 37948 + }, + { + "epoch": 0.32941554326785355, + "grad_norm": 0.21484375, + "learning_rate": 0.0014077186866487995, + "loss": 0.083, + "step": 37949 + }, + { + "epoch": 0.32942422374805774, + "grad_norm": 0.1826171875, + "learning_rate": 0.0014076907177228953, + "loss": 0.1035, + "step": 37950 + }, + { + "epoch": 
0.3294329042282619, + "grad_norm": 0.458984375, + "learning_rate": 0.0014076627484605048, + "loss": 0.1719, + "step": 37951 + }, + { + "epoch": 0.3294415847084661, + "grad_norm": 0.1376953125, + "learning_rate": 0.0014076347788616595, + "loss": 0.0854, + "step": 37952 + }, + { + "epoch": 0.3294502651886702, + "grad_norm": 0.08984375, + "learning_rate": 0.0014076068089263892, + "loss": 0.0811, + "step": 37953 + }, + { + "epoch": 0.3294589456688744, + "grad_norm": 0.11328125, + "learning_rate": 0.0014075788386547245, + "loss": 0.1118, + "step": 37954 + }, + { + "epoch": 0.32946762614907854, + "grad_norm": 0.07763671875, + "learning_rate": 0.0014075508680466962, + "loss": 0.0811, + "step": 37955 + }, + { + "epoch": 0.32947630662928273, + "grad_norm": 0.5234375, + "learning_rate": 0.0014075228971023352, + "loss": 0.0996, + "step": 37956 + }, + { + "epoch": 0.32948498710948687, + "grad_norm": 0.376953125, + "learning_rate": 0.0014074949258216713, + "loss": 0.1123, + "step": 37957 + }, + { + "epoch": 0.32949366758969106, + "grad_norm": 0.2890625, + "learning_rate": 0.001407466954204736, + "loss": 0.0825, + "step": 37958 + }, + { + "epoch": 0.3295023480698952, + "grad_norm": 0.478515625, + "learning_rate": 0.0014074389822515594, + "loss": 0.0859, + "step": 37959 + }, + { + "epoch": 0.3295110285500994, + "grad_norm": 0.2421875, + "learning_rate": 0.001407411009962172, + "loss": 0.0889, + "step": 37960 + }, + { + "epoch": 0.32951970903030353, + "grad_norm": 0.3125, + "learning_rate": 0.0014073830373366048, + "loss": 0.1465, + "step": 37961 + }, + { + "epoch": 0.3295283895105077, + "grad_norm": 0.1484375, + "learning_rate": 0.0014073550643748881, + "loss": 0.1084, + "step": 37962 + }, + { + "epoch": 0.32953706999071186, + "grad_norm": 0.373046875, + "learning_rate": 0.0014073270910770525, + "loss": 0.082, + "step": 37963 + }, + { + "epoch": 0.32954575047091605, + "grad_norm": 0.1728515625, + "learning_rate": 0.001407299117443128, + "loss": 0.0869, + "step": 37964 + }, + { + "epoch": 0.3295544309511202, + "grad_norm": 0.326171875, + "learning_rate": 0.0014072711434731464, + "loss": 0.123, + "step": 37965 + }, + { + "epoch": 0.3295631114313244, + "grad_norm": 0.1181640625, + "learning_rate": 0.0014072431691671374, + "loss": 0.1079, + "step": 37966 + }, + { + "epoch": 0.3295717919115285, + "grad_norm": 0.1572265625, + "learning_rate": 0.0014072151945251324, + "loss": 0.085, + "step": 37967 + }, + { + "epoch": 0.3295804723917327, + "grad_norm": 0.0908203125, + "learning_rate": 0.001407187219547161, + "loss": 0.127, + "step": 37968 + }, + { + "epoch": 0.32958915287193685, + "grad_norm": 0.11474609375, + "learning_rate": 0.0014071592442332543, + "loss": 0.1416, + "step": 37969 + }, + { + "epoch": 0.32959783335214105, + "grad_norm": 0.2890625, + "learning_rate": 0.0014071312685834433, + "loss": 0.1079, + "step": 37970 + }, + { + "epoch": 0.3296065138323452, + "grad_norm": 0.30078125, + "learning_rate": 0.001407103292597758, + "loss": 0.0791, + "step": 37971 + }, + { + "epoch": 0.3296151943125494, + "grad_norm": 0.25390625, + "learning_rate": 0.0014070753162762288, + "loss": 0.054, + "step": 37972 + }, + { + "epoch": 0.3296238747927535, + "grad_norm": 0.2890625, + "learning_rate": 0.0014070473396188869, + "loss": 0.1074, + "step": 37973 + }, + { + "epoch": 0.3296325552729577, + "grad_norm": 0.1865234375, + "learning_rate": 0.0014070193626257626, + "loss": 0.1465, + "step": 37974 + }, + { + "epoch": 0.32964123575316184, + "grad_norm": 0.298828125, + "learning_rate": 0.0014069913852968862, + "loss": 0.1172, + 
"step": 37975 + }, + { + "epoch": 0.32964991623336604, + "grad_norm": 0.1787109375, + "learning_rate": 0.001406963407632289, + "loss": 0.0786, + "step": 37976 + }, + { + "epoch": 0.3296585967135702, + "grad_norm": 0.1123046875, + "learning_rate": 0.0014069354296320012, + "loss": 0.1108, + "step": 37977 + }, + { + "epoch": 0.32966727719377437, + "grad_norm": 0.22265625, + "learning_rate": 0.0014069074512960535, + "loss": 0.1006, + "step": 37978 + }, + { + "epoch": 0.3296759576739785, + "grad_norm": 0.171875, + "learning_rate": 0.0014068794726244764, + "loss": 0.1021, + "step": 37979 + }, + { + "epoch": 0.3296846381541827, + "grad_norm": 0.376953125, + "learning_rate": 0.0014068514936173004, + "loss": 0.1113, + "step": 37980 + }, + { + "epoch": 0.32969331863438683, + "grad_norm": 0.13671875, + "learning_rate": 0.0014068235142745563, + "loss": 0.1025, + "step": 37981 + }, + { + "epoch": 0.329701999114591, + "grad_norm": 0.341796875, + "learning_rate": 0.0014067955345962748, + "loss": 0.0918, + "step": 37982 + }, + { + "epoch": 0.32971067959479516, + "grad_norm": 0.287109375, + "learning_rate": 0.001406767554582486, + "loss": 0.0781, + "step": 37983 + }, + { + "epoch": 0.32971936007499936, + "grad_norm": 0.0849609375, + "learning_rate": 0.001406739574233221, + "loss": 0.1338, + "step": 37984 + }, + { + "epoch": 0.3297280405552035, + "grad_norm": 0.099609375, + "learning_rate": 0.0014067115935485102, + "loss": 0.085, + "step": 37985 + }, + { + "epoch": 0.3297367210354077, + "grad_norm": 0.216796875, + "learning_rate": 0.0014066836125283836, + "loss": 0.0923, + "step": 37986 + }, + { + "epoch": 0.3297454015156118, + "grad_norm": 0.103515625, + "learning_rate": 0.0014066556311728732, + "loss": 0.0996, + "step": 37987 + }, + { + "epoch": 0.329754081995816, + "grad_norm": 0.1640625, + "learning_rate": 0.0014066276494820085, + "loss": 0.084, + "step": 37988 + }, + { + "epoch": 0.32976276247602015, + "grad_norm": 0.142578125, + "learning_rate": 0.0014065996674558206, + "loss": 0.1553, + "step": 37989 + }, + { + "epoch": 0.32977144295622435, + "grad_norm": 1.7109375, + "learning_rate": 0.0014065716850943397, + "loss": 0.2578, + "step": 37990 + }, + { + "epoch": 0.3297801234364285, + "grad_norm": 0.08837890625, + "learning_rate": 0.0014065437023975966, + "loss": 0.0903, + "step": 37991 + }, + { + "epoch": 0.3297888039166327, + "grad_norm": 0.34375, + "learning_rate": 0.0014065157193656223, + "loss": 0.0996, + "step": 37992 + }, + { + "epoch": 0.3297974843968368, + "grad_norm": 0.146484375, + "learning_rate": 0.0014064877359984468, + "loss": 0.082, + "step": 37993 + }, + { + "epoch": 0.329806164877041, + "grad_norm": 0.51953125, + "learning_rate": 0.0014064597522961007, + "loss": 0.0938, + "step": 37994 + }, + { + "epoch": 0.32981484535724515, + "grad_norm": 0.291015625, + "learning_rate": 0.0014064317682586148, + "loss": 0.064, + "step": 37995 + }, + { + "epoch": 0.32982352583744934, + "grad_norm": 0.326171875, + "learning_rate": 0.0014064037838860198, + "loss": 0.1226, + "step": 37996 + }, + { + "epoch": 0.3298322063176535, + "grad_norm": 0.1669921875, + "learning_rate": 0.0014063757991783464, + "loss": 0.0894, + "step": 37997 + }, + { + "epoch": 0.32984088679785767, + "grad_norm": 0.353515625, + "learning_rate": 0.001406347814135625, + "loss": 0.0908, + "step": 37998 + }, + { + "epoch": 0.3298495672780618, + "grad_norm": 0.3125, + "learning_rate": 0.001406319828757886, + "loss": 0.0854, + "step": 37999 + }, + { + "epoch": 0.329858247758266, + "grad_norm": 0.25390625, + "learning_rate": 
0.0014062918430451609, + "loss": 0.0898, + "step": 38000 + }, + { + "epoch": 0.32986692823847014, + "grad_norm": 0.32421875, + "learning_rate": 0.0014062638569974789, + "loss": 0.1289, + "step": 38001 + }, + { + "epoch": 0.32987560871867433, + "grad_norm": 0.1953125, + "learning_rate": 0.0014062358706148717, + "loss": 0.1299, + "step": 38002 + }, + { + "epoch": 0.32988428919887847, + "grad_norm": 0.236328125, + "learning_rate": 0.0014062078838973694, + "loss": 0.082, + "step": 38003 + }, + { + "epoch": 0.32989296967908266, + "grad_norm": 0.30078125, + "learning_rate": 0.0014061798968450026, + "loss": 0.1016, + "step": 38004 + }, + { + "epoch": 0.3299016501592868, + "grad_norm": 0.33984375, + "learning_rate": 0.0014061519094578023, + "loss": 0.1045, + "step": 38005 + }, + { + "epoch": 0.329910330639491, + "grad_norm": 0.51953125, + "learning_rate": 0.0014061239217357988, + "loss": 0.1133, + "step": 38006 + }, + { + "epoch": 0.3299190111196951, + "grad_norm": 0.1884765625, + "learning_rate": 0.001406095933679023, + "loss": 0.0679, + "step": 38007 + }, + { + "epoch": 0.3299276915998993, + "grad_norm": 0.31640625, + "learning_rate": 0.001406067945287505, + "loss": 0.0732, + "step": 38008 + }, + { + "epoch": 0.32993637208010346, + "grad_norm": 0.3359375, + "learning_rate": 0.0014060399565612757, + "loss": 0.0879, + "step": 38009 + }, + { + "epoch": 0.32994505256030765, + "grad_norm": 0.279296875, + "learning_rate": 0.001406011967500366, + "loss": 0.1484, + "step": 38010 + }, + { + "epoch": 0.3299537330405118, + "grad_norm": 0.201171875, + "learning_rate": 0.0014059839781048059, + "loss": 0.0854, + "step": 38011 + }, + { + "epoch": 0.329962413520716, + "grad_norm": 0.28125, + "learning_rate": 0.0014059559883746262, + "loss": 0.1055, + "step": 38012 + }, + { + "epoch": 0.3299710940009201, + "grad_norm": 0.361328125, + "learning_rate": 0.0014059279983098583, + "loss": 0.1201, + "step": 38013 + }, + { + "epoch": 0.3299797744811243, + "grad_norm": 0.24609375, + "learning_rate": 0.0014059000079105315, + "loss": 0.0811, + "step": 38014 + }, + { + "epoch": 0.32998845496132845, + "grad_norm": 0.291015625, + "learning_rate": 0.0014058720171766772, + "loss": 0.0942, + "step": 38015 + }, + { + "epoch": 0.32999713544153264, + "grad_norm": 0.330078125, + "learning_rate": 0.0014058440261083258, + "loss": 0.0718, + "step": 38016 + }, + { + "epoch": 0.3300058159217368, + "grad_norm": 0.1474609375, + "learning_rate": 0.0014058160347055083, + "loss": 0.0986, + "step": 38017 + }, + { + "epoch": 0.33001449640194097, + "grad_norm": 0.15234375, + "learning_rate": 0.0014057880429682547, + "loss": 0.1455, + "step": 38018 + }, + { + "epoch": 0.3300231768821451, + "grad_norm": 0.1748046875, + "learning_rate": 0.0014057600508965958, + "loss": 0.0811, + "step": 38019 + }, + { + "epoch": 0.3300318573623493, + "grad_norm": 0.6640625, + "learning_rate": 0.0014057320584905627, + "loss": 0.1064, + "step": 38020 + }, + { + "epoch": 0.33004053784255344, + "grad_norm": 0.30859375, + "learning_rate": 0.0014057040657501853, + "loss": 0.1104, + "step": 38021 + }, + { + "epoch": 0.33004921832275763, + "grad_norm": 0.12353515625, + "learning_rate": 0.0014056760726754948, + "loss": 0.1123, + "step": 38022 + }, + { + "epoch": 0.33005789880296177, + "grad_norm": 0.1591796875, + "learning_rate": 0.0014056480792665209, + "loss": 0.0781, + "step": 38023 + }, + { + "epoch": 0.33006657928316596, + "grad_norm": 0.1220703125, + "learning_rate": 0.001405620085523295, + "loss": 0.0845, + "step": 38024 + }, + { + "epoch": 0.3300752597633701, + 
"grad_norm": 0.09521484375, + "learning_rate": 0.0014055920914458483, + "loss": 0.0908, + "step": 38025 + }, + { + "epoch": 0.3300839402435743, + "grad_norm": 0.1416015625, + "learning_rate": 0.00140556409703421, + "loss": 0.1064, + "step": 38026 + }, + { + "epoch": 0.33009262072377843, + "grad_norm": 0.07373046875, + "learning_rate": 0.0014055361022884119, + "loss": 0.1045, + "step": 38027 + }, + { + "epoch": 0.3301013012039826, + "grad_norm": 0.625, + "learning_rate": 0.0014055081072084836, + "loss": 0.0962, + "step": 38028 + }, + { + "epoch": 0.33010998168418676, + "grad_norm": 0.412109375, + "learning_rate": 0.0014054801117944565, + "loss": 0.0928, + "step": 38029 + }, + { + "epoch": 0.33011866216439095, + "grad_norm": 0.30078125, + "learning_rate": 0.0014054521160463608, + "loss": 0.0859, + "step": 38030 + }, + { + "epoch": 0.3301273426445951, + "grad_norm": 0.1279296875, + "learning_rate": 0.0014054241199642275, + "loss": 0.1348, + "step": 38031 + }, + { + "epoch": 0.3301360231247993, + "grad_norm": 0.07763671875, + "learning_rate": 0.0014053961235480869, + "loss": 0.0879, + "step": 38032 + }, + { + "epoch": 0.3301447036050034, + "grad_norm": 0.2060546875, + "learning_rate": 0.0014053681267979694, + "loss": 0.1182, + "step": 38033 + }, + { + "epoch": 0.3301533840852076, + "grad_norm": 0.17578125, + "learning_rate": 0.001405340129713906, + "loss": 0.0962, + "step": 38034 + }, + { + "epoch": 0.33016206456541175, + "grad_norm": 0.158203125, + "learning_rate": 0.0014053121322959274, + "loss": 0.0786, + "step": 38035 + }, + { + "epoch": 0.33017074504561594, + "grad_norm": 0.373046875, + "learning_rate": 0.0014052841345440642, + "loss": 0.0786, + "step": 38036 + }, + { + "epoch": 0.3301794255258201, + "grad_norm": 0.369140625, + "learning_rate": 0.0014052561364583465, + "loss": 0.1006, + "step": 38037 + }, + { + "epoch": 0.3301881060060243, + "grad_norm": 0.271484375, + "learning_rate": 0.0014052281380388053, + "loss": 0.1079, + "step": 38038 + }, + { + "epoch": 0.3301967864862284, + "grad_norm": 0.69140625, + "learning_rate": 0.0014052001392854715, + "loss": 0.1143, + "step": 38039 + }, + { + "epoch": 0.3302054669664326, + "grad_norm": 0.5390625, + "learning_rate": 0.0014051721401983751, + "loss": 0.1162, + "step": 38040 + }, + { + "epoch": 0.33021414744663674, + "grad_norm": 0.09912109375, + "learning_rate": 0.0014051441407775473, + "loss": 0.106, + "step": 38041 + }, + { + "epoch": 0.33022282792684093, + "grad_norm": 0.1953125, + "learning_rate": 0.0014051161410230182, + "loss": 0.1143, + "step": 38042 + }, + { + "epoch": 0.33023150840704507, + "grad_norm": 0.431640625, + "learning_rate": 0.0014050881409348186, + "loss": 0.1211, + "step": 38043 + }, + { + "epoch": 0.33024018888724926, + "grad_norm": 0.212890625, + "learning_rate": 0.0014050601405129796, + "loss": 0.1055, + "step": 38044 + }, + { + "epoch": 0.3302488693674534, + "grad_norm": 0.11279296875, + "learning_rate": 0.0014050321397575311, + "loss": 0.1089, + "step": 38045 + }, + { + "epoch": 0.3302575498476576, + "grad_norm": 0.302734375, + "learning_rate": 0.0014050041386685038, + "loss": 0.1514, + "step": 38046 + }, + { + "epoch": 0.33026623032786173, + "grad_norm": 0.416015625, + "learning_rate": 0.001404976137245929, + "loss": 0.0918, + "step": 38047 + }, + { + "epoch": 0.3302749108080659, + "grad_norm": 0.09716796875, + "learning_rate": 0.0014049481354898368, + "loss": 0.0669, + "step": 38048 + }, + { + "epoch": 0.33028359128827006, + "grad_norm": 0.09228515625, + "learning_rate": 0.001404920133400258, + "loss": 0.0996, + 
"step": 38049 + }, + { + "epoch": 0.33029227176847425, + "grad_norm": 0.1767578125, + "learning_rate": 0.0014048921309772226, + "loss": 0.1104, + "step": 38050 + }, + { + "epoch": 0.3303009522486784, + "grad_norm": 0.259765625, + "learning_rate": 0.0014048641282207624, + "loss": 0.0996, + "step": 38051 + }, + { + "epoch": 0.3303096327288826, + "grad_norm": 0.298828125, + "learning_rate": 0.001404836125130907, + "loss": 0.127, + "step": 38052 + }, + { + "epoch": 0.3303183132090867, + "grad_norm": 0.1376953125, + "learning_rate": 0.0014048081217076873, + "loss": 0.0859, + "step": 38053 + }, + { + "epoch": 0.3303269936892909, + "grad_norm": 0.41796875, + "learning_rate": 0.0014047801179511343, + "loss": 0.0718, + "step": 38054 + }, + { + "epoch": 0.33033567416949505, + "grad_norm": 0.4453125, + "learning_rate": 0.0014047521138612783, + "loss": 0.0938, + "step": 38055 + }, + { + "epoch": 0.33034435464969925, + "grad_norm": 0.2734375, + "learning_rate": 0.0014047241094381497, + "loss": 0.0859, + "step": 38056 + }, + { + "epoch": 0.3303530351299034, + "grad_norm": 0.08203125, + "learning_rate": 0.00140469610468178, + "loss": 0.0835, + "step": 38057 + }, + { + "epoch": 0.3303617156101076, + "grad_norm": 0.07958984375, + "learning_rate": 0.0014046680995921986, + "loss": 0.0811, + "step": 38058 + }, + { + "epoch": 0.3303703960903117, + "grad_norm": 0.1728515625, + "learning_rate": 0.001404640094169437, + "loss": 0.0659, + "step": 38059 + }, + { + "epoch": 0.3303790765705159, + "grad_norm": 0.3359375, + "learning_rate": 0.0014046120884135254, + "loss": 0.3145, + "step": 38060 + }, + { + "epoch": 0.33038775705072004, + "grad_norm": 0.205078125, + "learning_rate": 0.001404584082324495, + "loss": 0.0923, + "step": 38061 + }, + { + "epoch": 0.33039643753092424, + "grad_norm": 0.345703125, + "learning_rate": 0.0014045560759023757, + "loss": 0.0767, + "step": 38062 + }, + { + "epoch": 0.3304051180111284, + "grad_norm": 0.10791015625, + "learning_rate": 0.0014045280691471983, + "loss": 0.1016, + "step": 38063 + }, + { + "epoch": 0.33041379849133257, + "grad_norm": 0.2041015625, + "learning_rate": 0.0014045000620589937, + "loss": 0.1021, + "step": 38064 + }, + { + "epoch": 0.3304224789715367, + "grad_norm": 0.62890625, + "learning_rate": 0.0014044720546377925, + "loss": 0.1309, + "step": 38065 + }, + { + "epoch": 0.3304311594517409, + "grad_norm": 0.306640625, + "learning_rate": 0.0014044440468836256, + "loss": 0.1367, + "step": 38066 + }, + { + "epoch": 0.33043983993194503, + "grad_norm": 0.462890625, + "learning_rate": 0.0014044160387965228, + "loss": 0.1104, + "step": 38067 + }, + { + "epoch": 0.3304485204121492, + "grad_norm": 0.76953125, + "learning_rate": 0.001404388030376515, + "loss": 0.1328, + "step": 38068 + }, + { + "epoch": 0.33045720089235336, + "grad_norm": 0.140625, + "learning_rate": 0.0014043600216236333, + "loss": 0.1377, + "step": 38069 + }, + { + "epoch": 0.33046588137255756, + "grad_norm": 0.1640625, + "learning_rate": 0.0014043320125379083, + "loss": 0.1133, + "step": 38070 + }, + { + "epoch": 0.3304745618527617, + "grad_norm": 0.48046875, + "learning_rate": 0.0014043040031193699, + "loss": 0.1172, + "step": 38071 + }, + { + "epoch": 0.33048324233296583, + "grad_norm": 0.1923828125, + "learning_rate": 0.0014042759933680492, + "loss": 0.1152, + "step": 38072 + }, + { + "epoch": 0.33049192281317, + "grad_norm": 0.359375, + "learning_rate": 0.0014042479832839774, + "loss": 0.1221, + "step": 38073 + }, + { + "epoch": 0.33050060329337416, + "grad_norm": 0.26953125, + "learning_rate": 
0.0014042199728671846, + "loss": 0.0742, + "step": 38074 + }, + { + "epoch": 0.33050928377357836, + "grad_norm": 0.2177734375, + "learning_rate": 0.0014041919621177008, + "loss": 0.1055, + "step": 38075 + }, + { + "epoch": 0.3305179642537825, + "grad_norm": 0.357421875, + "learning_rate": 0.0014041639510355575, + "loss": 0.1055, + "step": 38076 + }, + { + "epoch": 0.3305266447339867, + "grad_norm": 0.189453125, + "learning_rate": 0.0014041359396207854, + "loss": 0.0928, + "step": 38077 + }, + { + "epoch": 0.3305353252141908, + "grad_norm": 0.4453125, + "learning_rate": 0.0014041079278734143, + "loss": 0.0889, + "step": 38078 + }, + { + "epoch": 0.330544005694395, + "grad_norm": 0.412109375, + "learning_rate": 0.0014040799157934757, + "loss": 0.0742, + "step": 38079 + }, + { + "epoch": 0.33055268617459915, + "grad_norm": 0.26171875, + "learning_rate": 0.0014040519033809995, + "loss": 0.123, + "step": 38080 + }, + { + "epoch": 0.33056136665480335, + "grad_norm": 0.1484375, + "learning_rate": 0.0014040238906360167, + "loss": 0.1094, + "step": 38081 + }, + { + "epoch": 0.3305700471350075, + "grad_norm": 0.5, + "learning_rate": 0.0014039958775585585, + "loss": 0.1104, + "step": 38082 + }, + { + "epoch": 0.3305787276152117, + "grad_norm": 0.396484375, + "learning_rate": 0.0014039678641486546, + "loss": 0.1123, + "step": 38083 + }, + { + "epoch": 0.3305874080954158, + "grad_norm": 0.193359375, + "learning_rate": 0.0014039398504063362, + "loss": 0.1367, + "step": 38084 + }, + { + "epoch": 0.33059608857562, + "grad_norm": 0.10107421875, + "learning_rate": 0.0014039118363316334, + "loss": 0.0894, + "step": 38085 + }, + { + "epoch": 0.33060476905582414, + "grad_norm": 0.3515625, + "learning_rate": 0.0014038838219245773, + "loss": 0.123, + "step": 38086 + }, + { + "epoch": 0.33061344953602834, + "grad_norm": 0.103515625, + "learning_rate": 0.0014038558071851987, + "loss": 0.1123, + "step": 38087 + }, + { + "epoch": 0.3306221300162325, + "grad_norm": 0.478515625, + "learning_rate": 0.0014038277921135278, + "loss": 0.0913, + "step": 38088 + }, + { + "epoch": 0.33063081049643667, + "grad_norm": 0.373046875, + "learning_rate": 0.0014037997767095954, + "loss": 0.1084, + "step": 38089 + }, + { + "epoch": 0.3306394909766408, + "grad_norm": 0.703125, + "learning_rate": 0.001403771760973432, + "loss": 0.1191, + "step": 38090 + }, + { + "epoch": 0.330648171456845, + "grad_norm": 1.015625, + "learning_rate": 0.0014037437449050682, + "loss": 0.1562, + "step": 38091 + }, + { + "epoch": 0.33065685193704913, + "grad_norm": 1.1171875, + "learning_rate": 0.0014037157285045352, + "loss": 0.1123, + "step": 38092 + }, + { + "epoch": 0.3306655324172533, + "grad_norm": 0.2099609375, + "learning_rate": 0.0014036877117718633, + "loss": 0.1069, + "step": 38093 + }, + { + "epoch": 0.33067421289745746, + "grad_norm": 0.16796875, + "learning_rate": 0.0014036596947070825, + "loss": 0.0801, + "step": 38094 + }, + { + "epoch": 0.33068289337766166, + "grad_norm": 0.412109375, + "learning_rate": 0.0014036316773102246, + "loss": 0.0962, + "step": 38095 + }, + { + "epoch": 0.3306915738578658, + "grad_norm": 0.82421875, + "learning_rate": 0.0014036036595813195, + "loss": 0.0962, + "step": 38096 + }, + { + "epoch": 0.33070025433807, + "grad_norm": 0.076171875, + "learning_rate": 0.001403575641520398, + "loss": 0.1064, + "step": 38097 + }, + { + "epoch": 0.3307089348182741, + "grad_norm": 0.40234375, + "learning_rate": 0.0014035476231274906, + "loss": 0.126, + "step": 38098 + }, + { + "epoch": 0.3307176152984783, + "grad_norm": 0.3828125, 
+ "learning_rate": 0.0014035196044026282, + "loss": 0.1211, + "step": 38099 + }, + { + "epoch": 0.33072629577868246, + "grad_norm": 0.296875, + "learning_rate": 0.0014034915853458412, + "loss": 0.1621, + "step": 38100 + }, + { + "epoch": 0.33073497625888665, + "grad_norm": 0.41015625, + "learning_rate": 0.0014034635659571605, + "loss": 0.0898, + "step": 38101 + }, + { + "epoch": 0.3307436567390908, + "grad_norm": 0.5859375, + "learning_rate": 0.0014034355462366165, + "loss": 0.1226, + "step": 38102 + }, + { + "epoch": 0.330752337219295, + "grad_norm": 0.146484375, + "learning_rate": 0.0014034075261842398, + "loss": 0.083, + "step": 38103 + }, + { + "epoch": 0.3307610176994991, + "grad_norm": 0.1484375, + "learning_rate": 0.0014033795058000615, + "loss": 0.0903, + "step": 38104 + }, + { + "epoch": 0.3307696981797033, + "grad_norm": 0.197265625, + "learning_rate": 0.001403351485084112, + "loss": 0.1001, + "step": 38105 + }, + { + "epoch": 0.33077837865990745, + "grad_norm": 0.0927734375, + "learning_rate": 0.0014033234640364219, + "loss": 0.0742, + "step": 38106 + }, + { + "epoch": 0.33078705914011164, + "grad_norm": 0.08447265625, + "learning_rate": 0.0014032954426570214, + "loss": 0.0654, + "step": 38107 + }, + { + "epoch": 0.3307957396203158, + "grad_norm": 0.318359375, + "learning_rate": 0.0014032674209459417, + "loss": 0.0913, + "step": 38108 + }, + { + "epoch": 0.33080442010051997, + "grad_norm": 0.158203125, + "learning_rate": 0.0014032393989032135, + "loss": 0.1299, + "step": 38109 + }, + { + "epoch": 0.3308131005807241, + "grad_norm": 0.0712890625, + "learning_rate": 0.001403211376528867, + "loss": 0.085, + "step": 38110 + }, + { + "epoch": 0.3308217810609283, + "grad_norm": 0.25390625, + "learning_rate": 0.0014031833538229333, + "loss": 0.1387, + "step": 38111 + }, + { + "epoch": 0.33083046154113244, + "grad_norm": 0.1455078125, + "learning_rate": 0.0014031553307854428, + "loss": 0.103, + "step": 38112 + }, + { + "epoch": 0.33083914202133663, + "grad_norm": 0.27734375, + "learning_rate": 0.0014031273074164262, + "loss": 0.1211, + "step": 38113 + }, + { + "epoch": 0.33084782250154077, + "grad_norm": 0.404296875, + "learning_rate": 0.001403099283715914, + "loss": 0.0898, + "step": 38114 + }, + { + "epoch": 0.33085650298174496, + "grad_norm": 0.1552734375, + "learning_rate": 0.001403071259683937, + "loss": 0.0967, + "step": 38115 + }, + { + "epoch": 0.3308651834619491, + "grad_norm": 0.10888671875, + "learning_rate": 0.001403043235320526, + "loss": 0.0815, + "step": 38116 + }, + { + "epoch": 0.3308738639421533, + "grad_norm": 0.10791015625, + "learning_rate": 0.0014030152106257112, + "loss": 0.083, + "step": 38117 + }, + { + "epoch": 0.3308825444223574, + "grad_norm": 0.24609375, + "learning_rate": 0.0014029871855995239, + "loss": 0.1104, + "step": 38118 + }, + { + "epoch": 0.3308912249025616, + "grad_norm": 1.125, + "learning_rate": 0.001402959160241994, + "loss": 0.3477, + "step": 38119 + }, + { + "epoch": 0.33089990538276576, + "grad_norm": 0.1669921875, + "learning_rate": 0.0014029311345531523, + "loss": 0.1045, + "step": 38120 + }, + { + "epoch": 0.33090858586296995, + "grad_norm": 0.23046875, + "learning_rate": 0.00140290310853303, + "loss": 0.0894, + "step": 38121 + }, + { + "epoch": 0.3309172663431741, + "grad_norm": 0.2451171875, + "learning_rate": 0.0014028750821816576, + "loss": 0.1387, + "step": 38122 + }, + { + "epoch": 0.3309259468233783, + "grad_norm": 0.177734375, + "learning_rate": 0.0014028470554990654, + "loss": 0.1045, + "step": 38123 + }, + { + "epoch": 
0.3309346273035824, + "grad_norm": 0.64453125, + "learning_rate": 0.0014028190284852843, + "loss": 0.0815, + "step": 38124 + }, + { + "epoch": 0.3309433077837866, + "grad_norm": 0.1357421875, + "learning_rate": 0.0014027910011403446, + "loss": 0.0742, + "step": 38125 + }, + { + "epoch": 0.33095198826399075, + "grad_norm": 0.291015625, + "learning_rate": 0.0014027629734642776, + "loss": 0.0938, + "step": 38126 + }, + { + "epoch": 0.33096066874419494, + "grad_norm": 0.1357421875, + "learning_rate": 0.0014027349454571133, + "loss": 0.1338, + "step": 38127 + }, + { + "epoch": 0.3309693492243991, + "grad_norm": 0.7265625, + "learning_rate": 0.0014027069171188824, + "loss": 0.1562, + "step": 38128 + }, + { + "epoch": 0.33097802970460327, + "grad_norm": 0.1513671875, + "learning_rate": 0.001402678888449616, + "loss": 0.082, + "step": 38129 + }, + { + "epoch": 0.3309867101848074, + "grad_norm": 0.10205078125, + "learning_rate": 0.0014026508594493447, + "loss": 0.105, + "step": 38130 + }, + { + "epoch": 0.3309953906650116, + "grad_norm": 0.326171875, + "learning_rate": 0.0014026228301180987, + "loss": 0.0884, + "step": 38131 + }, + { + "epoch": 0.33100407114521574, + "grad_norm": 0.68359375, + "learning_rate": 0.0014025948004559089, + "loss": 0.103, + "step": 38132 + }, + { + "epoch": 0.33101275162541993, + "grad_norm": 0.1591796875, + "learning_rate": 0.0014025667704628062, + "loss": 0.1123, + "step": 38133 + }, + { + "epoch": 0.33102143210562407, + "grad_norm": 0.322265625, + "learning_rate": 0.001402538740138821, + "loss": 0.1006, + "step": 38134 + }, + { + "epoch": 0.33103011258582826, + "grad_norm": 0.1337890625, + "learning_rate": 0.0014025107094839838, + "loss": 0.0996, + "step": 38135 + }, + { + "epoch": 0.3310387930660324, + "grad_norm": 0.3828125, + "learning_rate": 0.0014024826784983257, + "loss": 0.0874, + "step": 38136 + }, + { + "epoch": 0.3310474735462366, + "grad_norm": 0.189453125, + "learning_rate": 0.0014024546471818767, + "loss": 0.1006, + "step": 38137 + }, + { + "epoch": 0.33105615402644073, + "grad_norm": 0.470703125, + "learning_rate": 0.0014024266155346682, + "loss": 0.1191, + "step": 38138 + }, + { + "epoch": 0.3310648345066449, + "grad_norm": 0.294921875, + "learning_rate": 0.00140239858355673, + "loss": 0.0859, + "step": 38139 + }, + { + "epoch": 0.33107351498684906, + "grad_norm": 0.5234375, + "learning_rate": 0.0014023705512480938, + "loss": 0.1445, + "step": 38140 + }, + { + "epoch": 0.33108219546705325, + "grad_norm": 0.2119140625, + "learning_rate": 0.0014023425186087894, + "loss": 0.0967, + "step": 38141 + }, + { + "epoch": 0.3310908759472574, + "grad_norm": 0.08935546875, + "learning_rate": 0.0014023144856388481, + "loss": 0.1011, + "step": 38142 + }, + { + "epoch": 0.3310995564274616, + "grad_norm": 0.2265625, + "learning_rate": 0.0014022864523382998, + "loss": 0.1387, + "step": 38143 + }, + { + "epoch": 0.3311082369076657, + "grad_norm": 0.1318359375, + "learning_rate": 0.001402258418707176, + "loss": 0.0972, + "step": 38144 + }, + { + "epoch": 0.3311169173878699, + "grad_norm": 0.08984375, + "learning_rate": 0.0014022303847455067, + "loss": 0.0903, + "step": 38145 + }, + { + "epoch": 0.33112559786807405, + "grad_norm": 0.224609375, + "learning_rate": 0.0014022023504533226, + "loss": 0.1045, + "step": 38146 + }, + { + "epoch": 0.33113427834827824, + "grad_norm": 0.302734375, + "learning_rate": 0.0014021743158306548, + "loss": 0.0801, + "step": 38147 + }, + { + "epoch": 0.3311429588284824, + "grad_norm": 0.462890625, + "learning_rate": 0.0014021462808775336, + 
"loss": 0.0806, + "step": 38148 + }, + { + "epoch": 0.3311516393086866, + "grad_norm": 0.625, + "learning_rate": 0.00140211824559399, + "loss": 0.124, + "step": 38149 + }, + { + "epoch": 0.3311603197888907, + "grad_norm": 0.1416015625, + "learning_rate": 0.0014020902099800541, + "loss": 0.0811, + "step": 38150 + }, + { + "epoch": 0.3311690002690949, + "grad_norm": 0.31640625, + "learning_rate": 0.001402062174035757, + "loss": 0.0928, + "step": 38151 + }, + { + "epoch": 0.33117768074929904, + "grad_norm": 0.203125, + "learning_rate": 0.0014020341377611295, + "loss": 0.1113, + "step": 38152 + }, + { + "epoch": 0.33118636122950323, + "grad_norm": 0.4140625, + "learning_rate": 0.0014020061011562018, + "loss": 0.1016, + "step": 38153 + }, + { + "epoch": 0.33119504170970737, + "grad_norm": 0.2021484375, + "learning_rate": 0.0014019780642210045, + "loss": 0.0996, + "step": 38154 + }, + { + "epoch": 0.33120372218991156, + "grad_norm": 0.21875, + "learning_rate": 0.0014019500269555688, + "loss": 0.0791, + "step": 38155 + }, + { + "epoch": 0.3312124026701157, + "grad_norm": 0.306640625, + "learning_rate": 0.0014019219893599251, + "loss": 0.0952, + "step": 38156 + }, + { + "epoch": 0.3312210831503199, + "grad_norm": 0.10888671875, + "learning_rate": 0.0014018939514341038, + "loss": 0.1113, + "step": 38157 + }, + { + "epoch": 0.33122976363052403, + "grad_norm": 0.0927734375, + "learning_rate": 0.0014018659131781364, + "loss": 0.0854, + "step": 38158 + }, + { + "epoch": 0.3312384441107282, + "grad_norm": 3.078125, + "learning_rate": 0.0014018378745920522, + "loss": 0.3184, + "step": 38159 + }, + { + "epoch": 0.33124712459093236, + "grad_norm": 0.38671875, + "learning_rate": 0.001401809835675883, + "loss": 0.0972, + "step": 38160 + }, + { + "epoch": 0.33125580507113656, + "grad_norm": 0.408203125, + "learning_rate": 0.0014017817964296594, + "loss": 0.0938, + "step": 38161 + }, + { + "epoch": 0.3312644855513407, + "grad_norm": 0.07861328125, + "learning_rate": 0.0014017537568534115, + "loss": 0.0977, + "step": 38162 + }, + { + "epoch": 0.3312731660315449, + "grad_norm": 0.36328125, + "learning_rate": 0.00140172571694717, + "loss": 0.1455, + "step": 38163 + }, + { + "epoch": 0.331281846511749, + "grad_norm": 0.443359375, + "learning_rate": 0.001401697676710966, + "loss": 0.0806, + "step": 38164 + }, + { + "epoch": 0.3312905269919532, + "grad_norm": 0.16015625, + "learning_rate": 0.00140166963614483, + "loss": 0.0903, + "step": 38165 + }, + { + "epoch": 0.33129920747215735, + "grad_norm": 0.2177734375, + "learning_rate": 0.0014016415952487929, + "loss": 0.0942, + "step": 38166 + }, + { + "epoch": 0.33130788795236155, + "grad_norm": 0.625, + "learning_rate": 0.0014016135540228845, + "loss": 0.1289, + "step": 38167 + }, + { + "epoch": 0.3313165684325657, + "grad_norm": 0.06591796875, + "learning_rate": 0.0014015855124671361, + "loss": 0.082, + "step": 38168 + }, + { + "epoch": 0.3313252489127699, + "grad_norm": 0.83984375, + "learning_rate": 0.0014015574705815786, + "loss": 0.1016, + "step": 38169 + }, + { + "epoch": 0.331333929392974, + "grad_norm": 0.447265625, + "learning_rate": 0.0014015294283662425, + "loss": 0.0894, + "step": 38170 + }, + { + "epoch": 0.3313426098731782, + "grad_norm": 0.345703125, + "learning_rate": 0.001401501385821158, + "loss": 0.1206, + "step": 38171 + }, + { + "epoch": 0.33135129035338234, + "grad_norm": 0.162109375, + "learning_rate": 0.0014014733429463565, + "loss": 0.127, + "step": 38172 + }, + { + "epoch": 0.33135997083358654, + "grad_norm": 0.53515625, + "learning_rate": 
0.001401445299741868, + "loss": 0.0894, + "step": 38173 + }, + { + "epoch": 0.3313686513137907, + "grad_norm": 0.11328125, + "learning_rate": 0.0014014172562077235, + "loss": 0.0947, + "step": 38174 + }, + { + "epoch": 0.33137733179399487, + "grad_norm": 0.12109375, + "learning_rate": 0.0014013892123439537, + "loss": 0.1211, + "step": 38175 + }, + { + "epoch": 0.331386012274199, + "grad_norm": 0.34765625, + "learning_rate": 0.001401361168150589, + "loss": 0.0703, + "step": 38176 + }, + { + "epoch": 0.3313946927544032, + "grad_norm": 0.1259765625, + "learning_rate": 0.0014013331236276602, + "loss": 0.0903, + "step": 38177 + }, + { + "epoch": 0.33140337323460733, + "grad_norm": 0.333984375, + "learning_rate": 0.0014013050787751982, + "loss": 0.126, + "step": 38178 + }, + { + "epoch": 0.3314120537148115, + "grad_norm": 0.061767578125, + "learning_rate": 0.0014012770335932335, + "loss": 0.0742, + "step": 38179 + }, + { + "epoch": 0.33142073419501566, + "grad_norm": 0.26953125, + "learning_rate": 0.0014012489880817966, + "loss": 0.0952, + "step": 38180 + }, + { + "epoch": 0.33142941467521986, + "grad_norm": 0.09326171875, + "learning_rate": 0.0014012209422409186, + "loss": 0.1035, + "step": 38181 + }, + { + "epoch": 0.331438095155424, + "grad_norm": 0.98828125, + "learning_rate": 0.0014011928960706297, + "loss": 0.1523, + "step": 38182 + }, + { + "epoch": 0.3314467756356282, + "grad_norm": 0.29296875, + "learning_rate": 0.001401164849570961, + "loss": 0.0977, + "step": 38183 + }, + { + "epoch": 0.3314554561158323, + "grad_norm": 0.341796875, + "learning_rate": 0.0014011368027419427, + "loss": 0.1162, + "step": 38184 + }, + { + "epoch": 0.3314641365960365, + "grad_norm": 0.359375, + "learning_rate": 0.001401108755583606, + "loss": 0.0864, + "step": 38185 + }, + { + "epoch": 0.33147281707624066, + "grad_norm": 1.3671875, + "learning_rate": 0.0014010807080959806, + "loss": 0.1768, + "step": 38186 + }, + { + "epoch": 0.33148149755644485, + "grad_norm": 0.10107421875, + "learning_rate": 0.0014010526602790988, + "loss": 0.1113, + "step": 38187 + }, + { + "epoch": 0.331490178036649, + "grad_norm": 0.2099609375, + "learning_rate": 0.0014010246121329898, + "loss": 0.0884, + "step": 38188 + }, + { + "epoch": 0.3314988585168532, + "grad_norm": 0.18359375, + "learning_rate": 0.0014009965636576848, + "loss": 0.1084, + "step": 38189 + }, + { + "epoch": 0.3315075389970573, + "grad_norm": 0.291015625, + "learning_rate": 0.0014009685148532146, + "loss": 0.1445, + "step": 38190 + }, + { + "epoch": 0.3315162194772615, + "grad_norm": 0.10986328125, + "learning_rate": 0.0014009404657196098, + "loss": 0.0894, + "step": 38191 + }, + { + "epoch": 0.33152489995746565, + "grad_norm": 0.09716796875, + "learning_rate": 0.0014009124162569012, + "loss": 0.0801, + "step": 38192 + }, + { + "epoch": 0.33153358043766984, + "grad_norm": 0.1923828125, + "learning_rate": 0.0014008843664651185, + "loss": 0.0928, + "step": 38193 + }, + { + "epoch": 0.331542260917874, + "grad_norm": 0.1357421875, + "learning_rate": 0.0014008563163442943, + "loss": 0.1064, + "step": 38194 + }, + { + "epoch": 0.3315509413980781, + "grad_norm": 0.220703125, + "learning_rate": 0.0014008282658944573, + "loss": 0.1211, + "step": 38195 + }, + { + "epoch": 0.3315596218782823, + "grad_norm": 0.2099609375, + "learning_rate": 0.0014008002151156393, + "loss": 0.082, + "step": 38196 + }, + { + "epoch": 0.33156830235848644, + "grad_norm": 0.35546875, + "learning_rate": 0.0014007721640078707, + "loss": 0.0898, + "step": 38197 + }, + { + "epoch": 
0.33157698283869064, + "grad_norm": 0.4296875, + "learning_rate": 0.0014007441125711822, + "loss": 0.127, + "step": 38198 + }, + { + "epoch": 0.3315856633188948, + "grad_norm": 0.4609375, + "learning_rate": 0.0014007160608056045, + "loss": 0.1426, + "step": 38199 + }, + { + "epoch": 0.33159434379909897, + "grad_norm": 0.236328125, + "learning_rate": 0.0014006880087111683, + "loss": 0.0835, + "step": 38200 + }, + { + "epoch": 0.3316030242793031, + "grad_norm": 0.408203125, + "learning_rate": 0.0014006599562879042, + "loss": 0.1064, + "step": 38201 + }, + { + "epoch": 0.3316117047595073, + "grad_norm": 0.51953125, + "learning_rate": 0.0014006319035358426, + "loss": 0.1133, + "step": 38202 + }, + { + "epoch": 0.33162038523971143, + "grad_norm": 0.1162109375, + "learning_rate": 0.0014006038504550146, + "loss": 0.1045, + "step": 38203 + }, + { + "epoch": 0.33162906571991563, + "grad_norm": 0.578125, + "learning_rate": 0.001400575797045451, + "loss": 0.0869, + "step": 38204 + }, + { + "epoch": 0.33163774620011977, + "grad_norm": 0.1787109375, + "learning_rate": 0.0014005477433071821, + "loss": 0.1406, + "step": 38205 + }, + { + "epoch": 0.33164642668032396, + "grad_norm": 0.08935546875, + "learning_rate": 0.0014005196892402383, + "loss": 0.0752, + "step": 38206 + }, + { + "epoch": 0.3316551071605281, + "grad_norm": 0.26171875, + "learning_rate": 0.001400491634844651, + "loss": 0.0986, + "step": 38207 + }, + { + "epoch": 0.3316637876407323, + "grad_norm": 0.220703125, + "learning_rate": 0.0014004635801204506, + "loss": 0.1011, + "step": 38208 + }, + { + "epoch": 0.3316724681209364, + "grad_norm": 0.4765625, + "learning_rate": 0.001400435525067668, + "loss": 0.1816, + "step": 38209 + }, + { + "epoch": 0.3316811486011406, + "grad_norm": 0.08984375, + "learning_rate": 0.0014004074696863332, + "loss": 0.1006, + "step": 38210 + }, + { + "epoch": 0.33168982908134476, + "grad_norm": 0.37109375, + "learning_rate": 0.0014003794139764777, + "loss": 0.1001, + "step": 38211 + }, + { + "epoch": 0.33169850956154895, + "grad_norm": 0.6875, + "learning_rate": 0.0014003513579381314, + "loss": 0.1621, + "step": 38212 + }, + { + "epoch": 0.3317071900417531, + "grad_norm": 1.3671875, + "learning_rate": 0.0014003233015713254, + "loss": 0.1504, + "step": 38213 + }, + { + "epoch": 0.3317158705219573, + "grad_norm": 0.08642578125, + "learning_rate": 0.0014002952448760903, + "loss": 0.0933, + "step": 38214 + }, + { + "epoch": 0.3317245510021614, + "grad_norm": 0.25, + "learning_rate": 0.0014002671878524573, + "loss": 0.1387, + "step": 38215 + }, + { + "epoch": 0.3317332314823656, + "grad_norm": 0.10400390625, + "learning_rate": 0.0014002391305004563, + "loss": 0.127, + "step": 38216 + }, + { + "epoch": 0.33174191196256975, + "grad_norm": 0.1162109375, + "learning_rate": 0.0014002110728201184, + "loss": 0.1562, + "step": 38217 + }, + { + "epoch": 0.33175059244277394, + "grad_norm": 0.29296875, + "learning_rate": 0.001400183014811474, + "loss": 0.1221, + "step": 38218 + }, + { + "epoch": 0.3317592729229781, + "grad_norm": 0.2392578125, + "learning_rate": 0.0014001549564745539, + "loss": 0.0962, + "step": 38219 + }, + { + "epoch": 0.33176795340318227, + "grad_norm": 0.1689453125, + "learning_rate": 0.0014001268978093891, + "loss": 0.1299, + "step": 38220 + }, + { + "epoch": 0.3317766338833864, + "grad_norm": 0.11669921875, + "learning_rate": 0.00140009883881601, + "loss": 0.0728, + "step": 38221 + }, + { + "epoch": 0.3317853143635906, + "grad_norm": 0.56640625, + "learning_rate": 0.001400070779494447, + "loss": 0.1309, + 
"step": 38222 + }, + { + "epoch": 0.33179399484379474, + "grad_norm": 0.1103515625, + "learning_rate": 0.0014000427198447313, + "loss": 0.1133, + "step": 38223 + }, + { + "epoch": 0.33180267532399893, + "grad_norm": 0.28125, + "learning_rate": 0.0014000146598668938, + "loss": 0.1279, + "step": 38224 + }, + { + "epoch": 0.33181135580420307, + "grad_norm": 0.05029296875, + "learning_rate": 0.0013999865995609642, + "loss": 0.054, + "step": 38225 + }, + { + "epoch": 0.33182003628440726, + "grad_norm": 0.36328125, + "learning_rate": 0.0013999585389269742, + "loss": 0.1162, + "step": 38226 + }, + { + "epoch": 0.3318287167646114, + "grad_norm": 0.2470703125, + "learning_rate": 0.0013999304779649538, + "loss": 0.1074, + "step": 38227 + }, + { + "epoch": 0.3318373972448156, + "grad_norm": 0.08642578125, + "learning_rate": 0.0013999024166749337, + "loss": 0.1191, + "step": 38228 + }, + { + "epoch": 0.33184607772501973, + "grad_norm": 0.70703125, + "learning_rate": 0.0013998743550569452, + "loss": 0.1055, + "step": 38229 + }, + { + "epoch": 0.3318547582052239, + "grad_norm": 0.154296875, + "learning_rate": 0.0013998462931110185, + "loss": 0.1279, + "step": 38230 + }, + { + "epoch": 0.33186343868542806, + "grad_norm": 0.25390625, + "learning_rate": 0.0013998182308371846, + "loss": 0.165, + "step": 38231 + }, + { + "epoch": 0.33187211916563225, + "grad_norm": 0.09375, + "learning_rate": 0.0013997901682354734, + "loss": 0.0771, + "step": 38232 + }, + { + "epoch": 0.3318807996458364, + "grad_norm": 0.271484375, + "learning_rate": 0.001399762105305917, + "loss": 0.1172, + "step": 38233 + }, + { + "epoch": 0.3318894801260406, + "grad_norm": 0.59375, + "learning_rate": 0.0013997340420485447, + "loss": 0.1602, + "step": 38234 + }, + { + "epoch": 0.3318981606062447, + "grad_norm": 0.2236328125, + "learning_rate": 0.0013997059784633876, + "loss": 0.1016, + "step": 38235 + }, + { + "epoch": 0.3319068410864489, + "grad_norm": 0.09228515625, + "learning_rate": 0.001399677914550477, + "loss": 0.1016, + "step": 38236 + }, + { + "epoch": 0.33191552156665305, + "grad_norm": 0.12890625, + "learning_rate": 0.001399649850309843, + "loss": 0.0991, + "step": 38237 + }, + { + "epoch": 0.33192420204685724, + "grad_norm": 0.271484375, + "learning_rate": 0.0013996217857415163, + "loss": 0.0864, + "step": 38238 + }, + { + "epoch": 0.3319328825270614, + "grad_norm": 0.86328125, + "learning_rate": 0.001399593720845528, + "loss": 0.1113, + "step": 38239 + }, + { + "epoch": 0.33194156300726557, + "grad_norm": 0.330078125, + "learning_rate": 0.0013995656556219083, + "loss": 0.1318, + "step": 38240 + }, + { + "epoch": 0.3319502434874697, + "grad_norm": 0.3046875, + "learning_rate": 0.001399537590070688, + "loss": 0.0884, + "step": 38241 + }, + { + "epoch": 0.3319589239676739, + "grad_norm": 0.4140625, + "learning_rate": 0.001399509524191898, + "loss": 0.0894, + "step": 38242 + }, + { + "epoch": 0.33196760444787804, + "grad_norm": 0.248046875, + "learning_rate": 0.0013994814579855691, + "loss": 0.0952, + "step": 38243 + }, + { + "epoch": 0.33197628492808223, + "grad_norm": 0.5078125, + "learning_rate": 0.0013994533914517314, + "loss": 0.1035, + "step": 38244 + }, + { + "epoch": 0.33198496540828637, + "grad_norm": 0.373046875, + "learning_rate": 0.001399425324590416, + "loss": 0.1709, + "step": 38245 + }, + { + "epoch": 0.33199364588849056, + "grad_norm": 0.26171875, + "learning_rate": 0.0013993972574016537, + "loss": 0.1025, + "step": 38246 + }, + { + "epoch": 0.3320023263686947, + "grad_norm": 0.1416015625, + "learning_rate": 
0.0013993691898854753, + "loss": 0.1084, + "step": 38247 + }, + { + "epoch": 0.3320110068488989, + "grad_norm": 0.0888671875, + "learning_rate": 0.001399341122041911, + "loss": 0.0874, + "step": 38248 + }, + { + "epoch": 0.33201968732910303, + "grad_norm": 0.74609375, + "learning_rate": 0.0013993130538709918, + "loss": 0.126, + "step": 38249 + }, + { + "epoch": 0.3320283678093072, + "grad_norm": 0.255859375, + "learning_rate": 0.0013992849853727482, + "loss": 0.1226, + "step": 38250 + }, + { + "epoch": 0.33203704828951136, + "grad_norm": 0.490234375, + "learning_rate": 0.0013992569165472112, + "loss": 0.1201, + "step": 38251 + }, + { + "epoch": 0.33204572876971555, + "grad_norm": 0.3671875, + "learning_rate": 0.0013992288473944113, + "loss": 0.0918, + "step": 38252 + }, + { + "epoch": 0.3320544092499197, + "grad_norm": 0.1162109375, + "learning_rate": 0.001399200777914379, + "loss": 0.0898, + "step": 38253 + }, + { + "epoch": 0.3320630897301239, + "grad_norm": 0.91796875, + "learning_rate": 0.0013991727081071456, + "loss": 0.1245, + "step": 38254 + }, + { + "epoch": 0.332071770210328, + "grad_norm": 0.39453125, + "learning_rate": 0.0013991446379727412, + "loss": 0.1045, + "step": 38255 + }, + { + "epoch": 0.3320804506905322, + "grad_norm": 0.1640625, + "learning_rate": 0.0013991165675111968, + "loss": 0.1055, + "step": 38256 + }, + { + "epoch": 0.33208913117073635, + "grad_norm": 0.09619140625, + "learning_rate": 0.001399088496722543, + "loss": 0.0864, + "step": 38257 + }, + { + "epoch": 0.33209781165094054, + "grad_norm": 0.255859375, + "learning_rate": 0.0013990604256068104, + "loss": 0.1182, + "step": 38258 + }, + { + "epoch": 0.3321064921311447, + "grad_norm": 0.083984375, + "learning_rate": 0.00139903235416403, + "loss": 0.0898, + "step": 38259 + }, + { + "epoch": 0.3321151726113489, + "grad_norm": 0.5703125, + "learning_rate": 0.0013990042823942323, + "loss": 0.1738, + "step": 38260 + }, + { + "epoch": 0.332123853091553, + "grad_norm": 0.169921875, + "learning_rate": 0.001398976210297448, + "loss": 0.0889, + "step": 38261 + }, + { + "epoch": 0.3321325335717572, + "grad_norm": 0.41015625, + "learning_rate": 0.0013989481378737074, + "loss": 0.1719, + "step": 38262 + }, + { + "epoch": 0.33214121405196134, + "grad_norm": 0.1513671875, + "learning_rate": 0.001398920065123042, + "loss": 0.1182, + "step": 38263 + }, + { + "epoch": 0.33214989453216553, + "grad_norm": 0.1748046875, + "learning_rate": 0.0013988919920454822, + "loss": 0.106, + "step": 38264 + }, + { + "epoch": 0.3321585750123697, + "grad_norm": 0.078125, + "learning_rate": 0.0013988639186410584, + "loss": 0.1084, + "step": 38265 + }, + { + "epoch": 0.33216725549257387, + "grad_norm": 0.671875, + "learning_rate": 0.0013988358449098016, + "loss": 0.1094, + "step": 38266 + }, + { + "epoch": 0.332175935972778, + "grad_norm": 0.421875, + "learning_rate": 0.0013988077708517424, + "loss": 0.1035, + "step": 38267 + }, + { + "epoch": 0.3321846164529822, + "grad_norm": 0.2490234375, + "learning_rate": 0.0013987796964669114, + "loss": 0.1201, + "step": 38268 + }, + { + "epoch": 0.33219329693318633, + "grad_norm": 0.09716796875, + "learning_rate": 0.0013987516217553396, + "loss": 0.0718, + "step": 38269 + }, + { + "epoch": 0.3322019774133905, + "grad_norm": 0.2734375, + "learning_rate": 0.0013987235467170574, + "loss": 0.1069, + "step": 38270 + }, + { + "epoch": 0.33221065789359466, + "grad_norm": 0.6328125, + "learning_rate": 0.0013986954713520955, + "loss": 0.1162, + "step": 38271 + }, + { + "epoch": 0.33221933837379886, + "grad_norm": 
0.267578125, + "learning_rate": 0.001398667395660485, + "loss": 0.0942, + "step": 38272 + }, + { + "epoch": 0.332228018854003, + "grad_norm": 0.25, + "learning_rate": 0.001398639319642256, + "loss": 0.0649, + "step": 38273 + }, + { + "epoch": 0.3322366993342072, + "grad_norm": 0.1982421875, + "learning_rate": 0.0013986112432974398, + "loss": 0.0986, + "step": 38274 + }, + { + "epoch": 0.3322453798144113, + "grad_norm": 0.5546875, + "learning_rate": 0.0013985831666260666, + "loss": 0.1152, + "step": 38275 + }, + { + "epoch": 0.3322540602946155, + "grad_norm": 0.30078125, + "learning_rate": 0.0013985550896281676, + "loss": 0.1045, + "step": 38276 + }, + { + "epoch": 0.33226274077481965, + "grad_norm": 0.322265625, + "learning_rate": 0.001398527012303773, + "loss": 0.0986, + "step": 38277 + }, + { + "epoch": 0.33227142125502385, + "grad_norm": 0.51171875, + "learning_rate": 0.0013984989346529143, + "loss": 0.1484, + "step": 38278 + }, + { + "epoch": 0.332280101735228, + "grad_norm": 0.88671875, + "learning_rate": 0.001398470856675621, + "loss": 0.0923, + "step": 38279 + }, + { + "epoch": 0.3322887822154322, + "grad_norm": 0.10205078125, + "learning_rate": 0.0013984427783719244, + "loss": 0.1084, + "step": 38280 + }, + { + "epoch": 0.3322974626956363, + "grad_norm": 0.3359375, + "learning_rate": 0.0013984146997418558, + "loss": 0.1621, + "step": 38281 + }, + { + "epoch": 0.3323061431758405, + "grad_norm": 0.11181640625, + "learning_rate": 0.001398386620785445, + "loss": 0.0806, + "step": 38282 + }, + { + "epoch": 0.33231482365604464, + "grad_norm": 1.1875, + "learning_rate": 0.0013983585415027232, + "loss": 0.1973, + "step": 38283 + }, + { + "epoch": 0.33232350413624884, + "grad_norm": 0.310546875, + "learning_rate": 0.001398330461893721, + "loss": 0.1045, + "step": 38284 + }, + { + "epoch": 0.332332184616453, + "grad_norm": 0.177734375, + "learning_rate": 0.001398302381958469, + "loss": 0.1045, + "step": 38285 + }, + { + "epoch": 0.33234086509665717, + "grad_norm": 0.3828125, + "learning_rate": 0.001398274301696998, + "loss": 0.1387, + "step": 38286 + }, + { + "epoch": 0.3323495455768613, + "grad_norm": 0.94140625, + "learning_rate": 0.0013982462211093386, + "loss": 0.0879, + "step": 38287 + }, + { + "epoch": 0.3323582260570655, + "grad_norm": 0.59765625, + "learning_rate": 0.0013982181401955218, + "loss": 0.167, + "step": 38288 + }, + { + "epoch": 0.33236690653726964, + "grad_norm": 0.1962890625, + "learning_rate": 0.001398190058955578, + "loss": 0.1001, + "step": 38289 + }, + { + "epoch": 0.33237558701747383, + "grad_norm": 0.7578125, + "learning_rate": 0.0013981619773895384, + "loss": 0.1045, + "step": 38290 + }, + { + "epoch": 0.33238426749767797, + "grad_norm": 0.376953125, + "learning_rate": 0.001398133895497433, + "loss": 0.1147, + "step": 38291 + }, + { + "epoch": 0.33239294797788216, + "grad_norm": 0.68359375, + "learning_rate": 0.0013981058132792928, + "loss": 0.0923, + "step": 38292 + }, + { + "epoch": 0.3324016284580863, + "grad_norm": 0.2197265625, + "learning_rate": 0.0013980777307351483, + "loss": 0.1006, + "step": 38293 + }, + { + "epoch": 0.3324103089382905, + "grad_norm": 0.07373046875, + "learning_rate": 0.001398049647865031, + "loss": 0.0996, + "step": 38294 + }, + { + "epoch": 0.3324189894184946, + "grad_norm": 0.34765625, + "learning_rate": 0.001398021564668971, + "loss": 0.0977, + "step": 38295 + }, + { + "epoch": 0.3324276698986988, + "grad_norm": 0.328125, + "learning_rate": 0.0013979934811469991, + "loss": 0.1396, + "step": 38296 + }, + { + "epoch": 
0.33243635037890296, + "grad_norm": 0.40234375, + "learning_rate": 0.001397965397299146, + "loss": 0.0938, + "step": 38297 + }, + { + "epoch": 0.33244503085910715, + "grad_norm": 0.296875, + "learning_rate": 0.0013979373131254422, + "loss": 0.0918, + "step": 38298 + }, + { + "epoch": 0.3324537113393113, + "grad_norm": 0.734375, + "learning_rate": 0.001397909228625919, + "loss": 0.0913, + "step": 38299 + }, + { + "epoch": 0.3324623918195155, + "grad_norm": 0.6796875, + "learning_rate": 0.0013978811438006065, + "loss": 0.0986, + "step": 38300 + }, + { + "epoch": 0.3324710722997196, + "grad_norm": 0.68359375, + "learning_rate": 0.0013978530586495354, + "loss": 0.1504, + "step": 38301 + }, + { + "epoch": 0.3324797527799238, + "grad_norm": 0.484375, + "learning_rate": 0.001397824973172737, + "loss": 0.1211, + "step": 38302 + }, + { + "epoch": 0.33248843326012795, + "grad_norm": 0.66796875, + "learning_rate": 0.0013977968873702417, + "loss": 0.1113, + "step": 38303 + }, + { + "epoch": 0.33249711374033214, + "grad_norm": 0.5703125, + "learning_rate": 0.0013977688012420802, + "loss": 0.1289, + "step": 38304 + }, + { + "epoch": 0.3325057942205363, + "grad_norm": 0.29296875, + "learning_rate": 0.0013977407147882828, + "loss": 0.0957, + "step": 38305 + }, + { + "epoch": 0.33251447470074047, + "grad_norm": 0.578125, + "learning_rate": 0.0013977126280088812, + "loss": 0.0771, + "step": 38306 + }, + { + "epoch": 0.3325231551809446, + "grad_norm": 0.1572265625, + "learning_rate": 0.0013976845409039054, + "loss": 0.1079, + "step": 38307 + }, + { + "epoch": 0.3325318356611488, + "grad_norm": 0.0908203125, + "learning_rate": 0.0013976564534733862, + "loss": 0.0815, + "step": 38308 + }, + { + "epoch": 0.33254051614135294, + "grad_norm": 0.2265625, + "learning_rate": 0.0013976283657173543, + "loss": 0.1172, + "step": 38309 + }, + { + "epoch": 0.33254919662155713, + "grad_norm": 0.1357421875, + "learning_rate": 0.0013976002776358405, + "loss": 0.0952, + "step": 38310 + }, + { + "epoch": 0.33255787710176127, + "grad_norm": 0.1533203125, + "learning_rate": 0.0013975721892288753, + "loss": 0.082, + "step": 38311 + }, + { + "epoch": 0.33256655758196546, + "grad_norm": 0.33203125, + "learning_rate": 0.00139754410049649, + "loss": 0.0869, + "step": 38312 + }, + { + "epoch": 0.3325752380621696, + "grad_norm": 0.279296875, + "learning_rate": 0.001397516011438715, + "loss": 0.1094, + "step": 38313 + }, + { + "epoch": 0.3325839185423738, + "grad_norm": 0.103515625, + "learning_rate": 0.0013974879220555805, + "loss": 0.0845, + "step": 38314 + }, + { + "epoch": 0.33259259902257793, + "grad_norm": 0.27734375, + "learning_rate": 0.001397459832347118, + "loss": 0.1348, + "step": 38315 + }, + { + "epoch": 0.3326012795027821, + "grad_norm": 0.12255859375, + "learning_rate": 0.001397431742313358, + "loss": 0.1055, + "step": 38316 + }, + { + "epoch": 0.33260995998298626, + "grad_norm": 0.212890625, + "learning_rate": 0.001397403651954331, + "loss": 0.1182, + "step": 38317 + }, + { + "epoch": 0.33261864046319045, + "grad_norm": 0.27734375, + "learning_rate": 0.0013973755612700677, + "loss": 0.0996, + "step": 38318 + }, + { + "epoch": 0.3326273209433946, + "grad_norm": 0.14453125, + "learning_rate": 0.001397347470260599, + "loss": 0.1025, + "step": 38319 + }, + { + "epoch": 0.3326360014235987, + "grad_norm": 1.4765625, + "learning_rate": 0.0013973193789259557, + "loss": 0.1387, + "step": 38320 + }, + { + "epoch": 0.3326446819038029, + "grad_norm": 0.1875, + "learning_rate": 0.0013972912872661683, + "loss": 0.1055, + "step": 38321 
+ }, + { + "epoch": 0.33265336238400706, + "grad_norm": 0.55078125, + "learning_rate": 0.0013972631952812677, + "loss": 0.0986, + "step": 38322 + }, + { + "epoch": 0.33266204286421125, + "grad_norm": 0.06982421875, + "learning_rate": 0.0013972351029712844, + "loss": 0.0918, + "step": 38323 + }, + { + "epoch": 0.3326707233444154, + "grad_norm": 0.453125, + "learning_rate": 0.0013972070103362494, + "loss": 0.1113, + "step": 38324 + }, + { + "epoch": 0.3326794038246196, + "grad_norm": 0.10302734375, + "learning_rate": 0.001397178917376193, + "loss": 0.127, + "step": 38325 + }, + { + "epoch": 0.3326880843048237, + "grad_norm": 0.10302734375, + "learning_rate": 0.0013971508240911467, + "loss": 0.0669, + "step": 38326 + }, + { + "epoch": 0.3326967647850279, + "grad_norm": 0.32421875, + "learning_rate": 0.00139712273048114, + "loss": 0.0947, + "step": 38327 + }, + { + "epoch": 0.33270544526523205, + "grad_norm": 0.296875, + "learning_rate": 0.001397094636546205, + "loss": 0.1021, + "step": 38328 + }, + { + "epoch": 0.33271412574543624, + "grad_norm": 0.16796875, + "learning_rate": 0.0013970665422863716, + "loss": 0.1221, + "step": 38329 + }, + { + "epoch": 0.3327228062256404, + "grad_norm": 0.126953125, + "learning_rate": 0.0013970384477016704, + "loss": 0.127, + "step": 38330 + }, + { + "epoch": 0.33273148670584457, + "grad_norm": 0.76171875, + "learning_rate": 0.0013970103527921327, + "loss": 0.0952, + "step": 38331 + }, + { + "epoch": 0.3327401671860487, + "grad_norm": 0.2578125, + "learning_rate": 0.0013969822575577889, + "loss": 0.1494, + "step": 38332 + }, + { + "epoch": 0.3327488476662529, + "grad_norm": 0.146484375, + "learning_rate": 0.0013969541619986699, + "loss": 0.1572, + "step": 38333 + }, + { + "epoch": 0.33275752814645704, + "grad_norm": 0.0927734375, + "learning_rate": 0.001396926066114806, + "loss": 0.0674, + "step": 38334 + }, + { + "epoch": 0.33276620862666123, + "grad_norm": 0.765625, + "learning_rate": 0.0013968979699062285, + "loss": 0.0923, + "step": 38335 + }, + { + "epoch": 0.33277488910686537, + "grad_norm": 0.66015625, + "learning_rate": 0.0013968698733729677, + "loss": 0.1357, + "step": 38336 + }, + { + "epoch": 0.33278356958706956, + "grad_norm": 0.12353515625, + "learning_rate": 0.0013968417765150545, + "loss": 0.1494, + "step": 38337 + }, + { + "epoch": 0.3327922500672737, + "grad_norm": 0.1708984375, + "learning_rate": 0.0013968136793325195, + "loss": 0.0703, + "step": 38338 + }, + { + "epoch": 0.3328009305474779, + "grad_norm": 0.365234375, + "learning_rate": 0.0013967855818253936, + "loss": 0.0811, + "step": 38339 + }, + { + "epoch": 0.33280961102768203, + "grad_norm": 0.11572265625, + "learning_rate": 0.0013967574839937073, + "loss": 0.0933, + "step": 38340 + }, + { + "epoch": 0.3328182915078862, + "grad_norm": 0.36328125, + "learning_rate": 0.0013967293858374917, + "loss": 0.082, + "step": 38341 + }, + { + "epoch": 0.33282697198809036, + "grad_norm": 0.4453125, + "learning_rate": 0.0013967012873567772, + "loss": 0.1191, + "step": 38342 + }, + { + "epoch": 0.33283565246829455, + "grad_norm": 0.59765625, + "learning_rate": 0.0013966731885515947, + "loss": 0.0923, + "step": 38343 + }, + { + "epoch": 0.3328443329484987, + "grad_norm": 0.88671875, + "learning_rate": 0.0013966450894219746, + "loss": 0.1621, + "step": 38344 + }, + { + "epoch": 0.3328530134287029, + "grad_norm": 0.115234375, + "learning_rate": 0.0013966169899679482, + "loss": 0.1104, + "step": 38345 + }, + { + "epoch": 0.332861693908907, + "grad_norm": 0.201171875, + "learning_rate": 
0.0013965888901895458, + "loss": 0.1104, + "step": 38346 + }, + { + "epoch": 0.3328703743891112, + "grad_norm": 0.1396484375, + "learning_rate": 0.0013965607900867985, + "loss": 0.0835, + "step": 38347 + }, + { + "epoch": 0.33287905486931535, + "grad_norm": 0.2392578125, + "learning_rate": 0.0013965326896597362, + "loss": 0.0649, + "step": 38348 + }, + { + "epoch": 0.33288773534951954, + "grad_norm": 0.4609375, + "learning_rate": 0.0013965045889083908, + "loss": 0.1348, + "step": 38349 + }, + { + "epoch": 0.3328964158297237, + "grad_norm": 0.0791015625, + "learning_rate": 0.0013964764878327918, + "loss": 0.0889, + "step": 38350 + }, + { + "epoch": 0.3329050963099279, + "grad_norm": 0.1533203125, + "learning_rate": 0.0013964483864329711, + "loss": 0.106, + "step": 38351 + }, + { + "epoch": 0.332913776790132, + "grad_norm": 0.2412109375, + "learning_rate": 0.0013964202847089587, + "loss": 0.1104, + "step": 38352 + }, + { + "epoch": 0.3329224572703362, + "grad_norm": 0.33984375, + "learning_rate": 0.0013963921826607856, + "loss": 0.1152, + "step": 38353 + }, + { + "epoch": 0.33293113775054034, + "grad_norm": 0.10205078125, + "learning_rate": 0.0013963640802884822, + "loss": 0.0815, + "step": 38354 + }, + { + "epoch": 0.33293981823074453, + "grad_norm": 0.1689453125, + "learning_rate": 0.00139633597759208, + "loss": 0.1167, + "step": 38355 + }, + { + "epoch": 0.33294849871094867, + "grad_norm": 0.490234375, + "learning_rate": 0.0013963078745716092, + "loss": 0.0947, + "step": 38356 + }, + { + "epoch": 0.33295717919115286, + "grad_norm": 0.58984375, + "learning_rate": 0.0013962797712271, + "loss": 0.1245, + "step": 38357 + }, + { + "epoch": 0.332965859671357, + "grad_norm": 0.51953125, + "learning_rate": 0.001396251667558584, + "loss": 0.1006, + "step": 38358 + }, + { + "epoch": 0.3329745401515612, + "grad_norm": 0.1796875, + "learning_rate": 0.0013962235635660916, + "loss": 0.0791, + "step": 38359 + }, + { + "epoch": 0.33298322063176533, + "grad_norm": 0.4453125, + "learning_rate": 0.0013961954592496535, + "loss": 0.0752, + "step": 38360 + }, + { + "epoch": 0.3329919011119695, + "grad_norm": 0.09130859375, + "learning_rate": 0.0013961673546093007, + "loss": 0.1079, + "step": 38361 + }, + { + "epoch": 0.33300058159217366, + "grad_norm": 0.09130859375, + "learning_rate": 0.0013961392496450635, + "loss": 0.0869, + "step": 38362 + }, + { + "epoch": 0.33300926207237785, + "grad_norm": 0.2431640625, + "learning_rate": 0.001396111144356973, + "loss": 0.0996, + "step": 38363 + }, + { + "epoch": 0.333017942552582, + "grad_norm": 0.169921875, + "learning_rate": 0.00139608303874506, + "loss": 0.1025, + "step": 38364 + }, + { + "epoch": 0.3330266230327862, + "grad_norm": 0.86328125, + "learning_rate": 0.0013960549328093548, + "loss": 0.1426, + "step": 38365 + }, + { + "epoch": 0.3330353035129903, + "grad_norm": 0.345703125, + "learning_rate": 0.0013960268265498883, + "loss": 0.0791, + "step": 38366 + }, + { + "epoch": 0.3330439839931945, + "grad_norm": 0.236328125, + "learning_rate": 0.0013959987199666914, + "loss": 0.1006, + "step": 38367 + }, + { + "epoch": 0.33305266447339865, + "grad_norm": 0.3828125, + "learning_rate": 0.0013959706130597948, + "loss": 0.1045, + "step": 38368 + }, + { + "epoch": 0.33306134495360284, + "grad_norm": 0.4921875, + "learning_rate": 0.001395942505829229, + "loss": 0.0879, + "step": 38369 + }, + { + "epoch": 0.333070025433807, + "grad_norm": 0.1162109375, + "learning_rate": 0.0013959143982750248, + "loss": 0.0742, + "step": 38370 + }, + { + "epoch": 0.3330787059140112, + 
"grad_norm": 0.474609375, + "learning_rate": 0.0013958862903972135, + "loss": 0.1465, + "step": 38371 + }, + { + "epoch": 0.3330873863942153, + "grad_norm": 0.55078125, + "learning_rate": 0.001395858182195825, + "loss": 0.1084, + "step": 38372 + }, + { + "epoch": 0.3330960668744195, + "grad_norm": 1.1875, + "learning_rate": 0.001395830073670891, + "loss": 0.1348, + "step": 38373 + }, + { + "epoch": 0.33310474735462364, + "grad_norm": 0.07373046875, + "learning_rate": 0.0013958019648224412, + "loss": 0.0693, + "step": 38374 + }, + { + "epoch": 0.33311342783482784, + "grad_norm": 0.263671875, + "learning_rate": 0.0013957738556505072, + "loss": 0.1387, + "step": 38375 + }, + { + "epoch": 0.333122108315032, + "grad_norm": 0.302734375, + "learning_rate": 0.001395745746155119, + "loss": 0.1001, + "step": 38376 + }, + { + "epoch": 0.33313078879523617, + "grad_norm": 0.25390625, + "learning_rate": 0.0013957176363363077, + "loss": 0.0884, + "step": 38377 + }, + { + "epoch": 0.3331394692754403, + "grad_norm": 0.12890625, + "learning_rate": 0.0013956895261941042, + "loss": 0.0933, + "step": 38378 + }, + { + "epoch": 0.3331481497556445, + "grad_norm": 0.11083984375, + "learning_rate": 0.001395661415728539, + "loss": 0.0791, + "step": 38379 + }, + { + "epoch": 0.33315683023584863, + "grad_norm": 0.10986328125, + "learning_rate": 0.0013956333049396428, + "loss": 0.1025, + "step": 38380 + }, + { + "epoch": 0.3331655107160528, + "grad_norm": 0.408203125, + "learning_rate": 0.0013956051938274466, + "loss": 0.0918, + "step": 38381 + }, + { + "epoch": 0.33317419119625696, + "grad_norm": 0.54296875, + "learning_rate": 0.001395577082391981, + "loss": 0.1006, + "step": 38382 + }, + { + "epoch": 0.33318287167646116, + "grad_norm": 0.7578125, + "learning_rate": 0.0013955489706332766, + "loss": 0.1182, + "step": 38383 + }, + { + "epoch": 0.3331915521566653, + "grad_norm": 0.2099609375, + "learning_rate": 0.0013955208585513645, + "loss": 0.0684, + "step": 38384 + }, + { + "epoch": 0.3332002326368695, + "grad_norm": 0.7109375, + "learning_rate": 0.0013954927461462753, + "loss": 0.106, + "step": 38385 + }, + { + "epoch": 0.3332089131170736, + "grad_norm": 0.408203125, + "learning_rate": 0.0013954646334180397, + "loss": 0.0957, + "step": 38386 + }, + { + "epoch": 0.3332175935972778, + "grad_norm": 0.130859375, + "learning_rate": 0.0013954365203666878, + "loss": 0.0957, + "step": 38387 + }, + { + "epoch": 0.33322627407748195, + "grad_norm": 1.0625, + "learning_rate": 0.0013954084069922514, + "loss": 0.167, + "step": 38388 + }, + { + "epoch": 0.33323495455768615, + "grad_norm": 0.1083984375, + "learning_rate": 0.001395380293294761, + "loss": 0.1182, + "step": 38389 + }, + { + "epoch": 0.3332436350378903, + "grad_norm": 0.0947265625, + "learning_rate": 0.001395352179274247, + "loss": 0.0908, + "step": 38390 + }, + { + "epoch": 0.3332523155180945, + "grad_norm": 0.1005859375, + "learning_rate": 0.0013953240649307403, + "loss": 0.0908, + "step": 38391 + }, + { + "epoch": 0.3332609959982986, + "grad_norm": 0.458984375, + "learning_rate": 0.0013952959502642715, + "loss": 0.0562, + "step": 38392 + }, + { + "epoch": 0.3332696764785028, + "grad_norm": 0.251953125, + "learning_rate": 0.0013952678352748716, + "loss": 0.0781, + "step": 38393 + }, + { + "epoch": 0.33327835695870694, + "grad_norm": 0.48046875, + "learning_rate": 0.0013952397199625713, + "loss": 0.0791, + "step": 38394 + }, + { + "epoch": 0.33328703743891114, + "grad_norm": 0.40234375, + "learning_rate": 0.001395211604327401, + "loss": 0.0986, + "step": 38395 + }, + 
{ + "epoch": 0.3332957179191153, + "grad_norm": 0.1591796875, + "learning_rate": 0.001395183488369392, + "loss": 0.0908, + "step": 38396 + }, + { + "epoch": 0.33330439839931947, + "grad_norm": 0.6015625, + "learning_rate": 0.0013951553720885747, + "loss": 0.0801, + "step": 38397 + }, + { + "epoch": 0.3333130788795236, + "grad_norm": 0.384765625, + "learning_rate": 0.0013951272554849799, + "loss": 0.1328, + "step": 38398 + }, + { + "epoch": 0.3333217593597278, + "grad_norm": 0.2490234375, + "learning_rate": 0.0013950991385586382, + "loss": 0.0923, + "step": 38399 + }, + { + "epoch": 0.33333043983993194, + "grad_norm": 0.345703125, + "learning_rate": 0.0013950710213095807, + "loss": 0.125, + "step": 38400 + }, + { + "epoch": 0.33333912032013613, + "grad_norm": 0.1298828125, + "learning_rate": 0.0013950429037378378, + "loss": 0.123, + "step": 38401 + }, + { + "epoch": 0.33334780080034027, + "grad_norm": 0.59375, + "learning_rate": 0.0013950147858434409, + "loss": 0.0869, + "step": 38402 + }, + { + "epoch": 0.33335648128054446, + "grad_norm": 0.546875, + "learning_rate": 0.0013949866676264198, + "loss": 0.0918, + "step": 38403 + }, + { + "epoch": 0.3333651617607486, + "grad_norm": 0.0751953125, + "learning_rate": 0.001394958549086806, + "loss": 0.0854, + "step": 38404 + }, + { + "epoch": 0.3333738422409528, + "grad_norm": 0.1298828125, + "learning_rate": 0.0013949304302246297, + "loss": 0.1055, + "step": 38405 + }, + { + "epoch": 0.3333825227211569, + "grad_norm": 0.4765625, + "learning_rate": 0.0013949023110399218, + "loss": 0.1074, + "step": 38406 + }, + { + "epoch": 0.3333912032013611, + "grad_norm": 0.76171875, + "learning_rate": 0.0013948741915327135, + "loss": 0.1035, + "step": 38407 + }, + { + "epoch": 0.33339988368156526, + "grad_norm": 0.431640625, + "learning_rate": 0.0013948460717030352, + "loss": 0.1123, + "step": 38408 + }, + { + "epoch": 0.33340856416176945, + "grad_norm": 0.150390625, + "learning_rate": 0.001394817951550917, + "loss": 0.082, + "step": 38409 + }, + { + "epoch": 0.3334172446419736, + "grad_norm": 0.412109375, + "learning_rate": 0.0013947898310763913, + "loss": 0.1094, + "step": 38410 + }, + { + "epoch": 0.3334259251221778, + "grad_norm": 0.4296875, + "learning_rate": 0.0013947617102794874, + "loss": 0.1299, + "step": 38411 + }, + { + "epoch": 0.3334346056023819, + "grad_norm": 0.2734375, + "learning_rate": 0.0013947335891602367, + "loss": 0.0801, + "step": 38412 + }, + { + "epoch": 0.3334432860825861, + "grad_norm": 0.1474609375, + "learning_rate": 0.0013947054677186696, + "loss": 0.1152, + "step": 38413 + }, + { + "epoch": 0.33345196656279025, + "grad_norm": 0.1611328125, + "learning_rate": 0.001394677345954817, + "loss": 0.083, + "step": 38414 + }, + { + "epoch": 0.33346064704299444, + "grad_norm": 0.23046875, + "learning_rate": 0.0013946492238687098, + "loss": 0.0957, + "step": 38415 + }, + { + "epoch": 0.3334693275231986, + "grad_norm": 1.2265625, + "learning_rate": 0.0013946211014603785, + "loss": 0.1836, + "step": 38416 + }, + { + "epoch": 0.33347800800340277, + "grad_norm": 0.248046875, + "learning_rate": 0.0013945929787298542, + "loss": 0.123, + "step": 38417 + }, + { + "epoch": 0.3334866884836069, + "grad_norm": 0.3984375, + "learning_rate": 0.001394564855677167, + "loss": 0.1377, + "step": 38418 + }, + { + "epoch": 0.3334953689638111, + "grad_norm": 0.279296875, + "learning_rate": 0.0013945367323023486, + "loss": 0.0889, + "step": 38419 + }, + { + "epoch": 0.33350404944401524, + "grad_norm": 0.45703125, + "learning_rate": 0.001394508608605429, + "loss": 
0.1221, + "step": 38420 + }, + { + "epoch": 0.33351272992421943, + "grad_norm": 0.46484375, + "learning_rate": 0.0013944804845864394, + "loss": 0.0913, + "step": 38421 + }, + { + "epoch": 0.33352141040442357, + "grad_norm": 0.15625, + "learning_rate": 0.00139445236024541, + "loss": 0.1084, + "step": 38422 + }, + { + "epoch": 0.33353009088462776, + "grad_norm": 0.279296875, + "learning_rate": 0.0013944242355823724, + "loss": 0.0811, + "step": 38423 + }, + { + "epoch": 0.3335387713648319, + "grad_norm": 0.298828125, + "learning_rate": 0.0013943961105973566, + "loss": 0.1069, + "step": 38424 + }, + { + "epoch": 0.3335474518450361, + "grad_norm": 0.65625, + "learning_rate": 0.0013943679852903939, + "loss": 0.0996, + "step": 38425 + }, + { + "epoch": 0.33355613232524023, + "grad_norm": 1.0546875, + "learning_rate": 0.0013943398596615144, + "loss": 0.1094, + "step": 38426 + }, + { + "epoch": 0.3335648128054444, + "grad_norm": 0.1513671875, + "learning_rate": 0.0013943117337107493, + "loss": 0.1338, + "step": 38427 + }, + { + "epoch": 0.33357349328564856, + "grad_norm": 0.26171875, + "learning_rate": 0.0013942836074381298, + "loss": 0.0996, + "step": 38428 + }, + { + "epoch": 0.33358217376585275, + "grad_norm": 0.16796875, + "learning_rate": 0.0013942554808436857, + "loss": 0.1299, + "step": 38429 + }, + { + "epoch": 0.3335908542460569, + "grad_norm": 0.29296875, + "learning_rate": 0.0013942273539274484, + "loss": 0.1221, + "step": 38430 + }, + { + "epoch": 0.3335995347262611, + "grad_norm": 0.087890625, + "learning_rate": 0.001394199226689448, + "loss": 0.1064, + "step": 38431 + }, + { + "epoch": 0.3336082152064652, + "grad_norm": 0.37890625, + "learning_rate": 0.0013941710991297162, + "loss": 0.2441, + "step": 38432 + }, + { + "epoch": 0.3336168956866694, + "grad_norm": 0.111328125, + "learning_rate": 0.0013941429712482835, + "loss": 0.1006, + "step": 38433 + }, + { + "epoch": 0.33362557616687355, + "grad_norm": 0.69140625, + "learning_rate": 0.0013941148430451804, + "loss": 0.0742, + "step": 38434 + }, + { + "epoch": 0.33363425664707774, + "grad_norm": 0.0625, + "learning_rate": 0.0013940867145204375, + "loss": 0.0776, + "step": 38435 + }, + { + "epoch": 0.3336429371272819, + "grad_norm": 0.443359375, + "learning_rate": 0.0013940585856740857, + "loss": 0.0938, + "step": 38436 + }, + { + "epoch": 0.3336516176074861, + "grad_norm": 0.1435546875, + "learning_rate": 0.001394030456506156, + "loss": 0.0947, + "step": 38437 + }, + { + "epoch": 0.3336602980876902, + "grad_norm": 1.625, + "learning_rate": 0.001394002327016679, + "loss": 0.1147, + "step": 38438 + }, + { + "epoch": 0.3336689785678944, + "grad_norm": 0.400390625, + "learning_rate": 0.0013939741972056857, + "loss": 0.1016, + "step": 38439 + }, + { + "epoch": 0.33367765904809854, + "grad_norm": 0.251953125, + "learning_rate": 0.0013939460670732062, + "loss": 0.0879, + "step": 38440 + }, + { + "epoch": 0.33368633952830273, + "grad_norm": 0.0791015625, + "learning_rate": 0.001393917936619272, + "loss": 0.1216, + "step": 38441 + }, + { + "epoch": 0.33369502000850687, + "grad_norm": 0.3828125, + "learning_rate": 0.0013938898058439139, + "loss": 0.0786, + "step": 38442 + }, + { + "epoch": 0.333703700488711, + "grad_norm": 0.265625, + "learning_rate": 0.0013938616747471617, + "loss": 0.1562, + "step": 38443 + }, + { + "epoch": 0.3337123809689152, + "grad_norm": 0.33203125, + "learning_rate": 0.0013938335433290468, + "loss": 0.1133, + "step": 38444 + }, + { + "epoch": 0.33372106144911934, + "grad_norm": 0.57421875, + "learning_rate": 
0.0013938054115896004, + "loss": 0.0996, + "step": 38445 + }, + { + "epoch": 0.33372974192932353, + "grad_norm": 0.10595703125, + "learning_rate": 0.0013937772795288524, + "loss": 0.084, + "step": 38446 + }, + { + "epoch": 0.33373842240952767, + "grad_norm": 0.3046875, + "learning_rate": 0.001393749147146834, + "loss": 0.1406, + "step": 38447 + }, + { + "epoch": 0.33374710288973186, + "grad_norm": 0.16796875, + "learning_rate": 0.0013937210144435762, + "loss": 0.1064, + "step": 38448 + }, + { + "epoch": 0.333755783369936, + "grad_norm": 0.1181640625, + "learning_rate": 0.0013936928814191091, + "loss": 0.0933, + "step": 38449 + }, + { + "epoch": 0.3337644638501402, + "grad_norm": 0.130859375, + "learning_rate": 0.0013936647480734645, + "loss": 0.127, + "step": 38450 + }, + { + "epoch": 0.33377314433034433, + "grad_norm": 0.1162109375, + "learning_rate": 0.0013936366144066724, + "loss": 0.1729, + "step": 38451 + }, + { + "epoch": 0.3337818248105485, + "grad_norm": 0.52734375, + "learning_rate": 0.0013936084804187633, + "loss": 0.1094, + "step": 38452 + }, + { + "epoch": 0.33379050529075266, + "grad_norm": 0.443359375, + "learning_rate": 0.0013935803461097686, + "loss": 0.1177, + "step": 38453 + }, + { + "epoch": 0.33379918577095685, + "grad_norm": 0.09912109375, + "learning_rate": 0.001393552211479719, + "loss": 0.0718, + "step": 38454 + }, + { + "epoch": 0.333807866251161, + "grad_norm": 0.5703125, + "learning_rate": 0.001393524076528645, + "loss": 0.0986, + "step": 38455 + }, + { + "epoch": 0.3338165467313652, + "grad_norm": 0.27734375, + "learning_rate": 0.0013934959412565773, + "loss": 0.1021, + "step": 38456 + }, + { + "epoch": 0.3338252272115693, + "grad_norm": 0.11279296875, + "learning_rate": 0.001393467805663547, + "loss": 0.1152, + "step": 38457 + }, + { + "epoch": 0.3338339076917735, + "grad_norm": 0.2021484375, + "learning_rate": 0.0013934396697495848, + "loss": 0.085, + "step": 38458 + }, + { + "epoch": 0.33384258817197765, + "grad_norm": 0.125, + "learning_rate": 0.0013934115335147213, + "loss": 0.1133, + "step": 38459 + }, + { + "epoch": 0.33385126865218184, + "grad_norm": 0.443359375, + "learning_rate": 0.0013933833969589876, + "loss": 0.0801, + "step": 38460 + }, + { + "epoch": 0.333859949132386, + "grad_norm": 0.11474609375, + "learning_rate": 0.0013933552600824137, + "loss": 0.0918, + "step": 38461 + }, + { + "epoch": 0.3338686296125902, + "grad_norm": 0.431640625, + "learning_rate": 0.0013933271228850311, + "loss": 0.0977, + "step": 38462 + }, + { + "epoch": 0.3338773100927943, + "grad_norm": 0.322265625, + "learning_rate": 0.0013932989853668706, + "loss": 0.125, + "step": 38463 + }, + { + "epoch": 0.3338859905729985, + "grad_norm": 0.3984375, + "learning_rate": 0.0013932708475279626, + "loss": 0.0859, + "step": 38464 + }, + { + "epoch": 0.33389467105320264, + "grad_norm": 0.1572265625, + "learning_rate": 0.001393242709368338, + "loss": 0.0889, + "step": 38465 + }, + { + "epoch": 0.33390335153340683, + "grad_norm": 0.30859375, + "learning_rate": 0.0013932145708880272, + "loss": 0.0903, + "step": 38466 + }, + { + "epoch": 0.33391203201361097, + "grad_norm": 0.63671875, + "learning_rate": 0.001393186432087062, + "loss": 0.0918, + "step": 38467 + }, + { + "epoch": 0.33392071249381516, + "grad_norm": 0.38671875, + "learning_rate": 0.0013931582929654724, + "loss": 0.1729, + "step": 38468 + }, + { + "epoch": 0.3339293929740193, + "grad_norm": 0.09326171875, + "learning_rate": 0.001393130153523289, + "loss": 0.1196, + "step": 38469 + }, + { + "epoch": 0.3339380734542235, + 
"grad_norm": 0.2431640625, + "learning_rate": 0.0013931020137605428, + "loss": 0.0874, + "step": 38470 + }, + { + "epoch": 0.33394675393442763, + "grad_norm": 0.392578125, + "learning_rate": 0.001393073873677265, + "loss": 0.085, + "step": 38471 + }, + { + "epoch": 0.3339554344146318, + "grad_norm": 0.51171875, + "learning_rate": 0.001393045733273486, + "loss": 0.3164, + "step": 38472 + }, + { + "epoch": 0.33396411489483596, + "grad_norm": 0.09814453125, + "learning_rate": 0.0013930175925492365, + "loss": 0.0981, + "step": 38473 + }, + { + "epoch": 0.33397279537504015, + "grad_norm": 0.2734375, + "learning_rate": 0.0013929894515045472, + "loss": 0.1191, + "step": 38474 + }, + { + "epoch": 0.3339814758552443, + "grad_norm": 0.310546875, + "learning_rate": 0.001392961310139449, + "loss": 0.0957, + "step": 38475 + }, + { + "epoch": 0.3339901563354485, + "grad_norm": 0.10009765625, + "learning_rate": 0.001392933168453973, + "loss": 0.1094, + "step": 38476 + }, + { + "epoch": 0.3339988368156526, + "grad_norm": 0.45703125, + "learning_rate": 0.0013929050264481495, + "loss": 0.0938, + "step": 38477 + }, + { + "epoch": 0.3340075172958568, + "grad_norm": 0.09228515625, + "learning_rate": 0.0013928768841220095, + "loss": 0.1191, + "step": 38478 + }, + { + "epoch": 0.33401619777606095, + "grad_norm": 0.15625, + "learning_rate": 0.0013928487414755838, + "loss": 0.0781, + "step": 38479 + }, + { + "epoch": 0.33402487825626515, + "grad_norm": 0.080078125, + "learning_rate": 0.0013928205985089032, + "loss": 0.0874, + "step": 38480 + }, + { + "epoch": 0.3340335587364693, + "grad_norm": 0.138671875, + "learning_rate": 0.0013927924552219982, + "loss": 0.0991, + "step": 38481 + }, + { + "epoch": 0.3340422392166735, + "grad_norm": 0.421875, + "learning_rate": 0.0013927643116148998, + "loss": 0.1074, + "step": 38482 + }, + { + "epoch": 0.3340509196968776, + "grad_norm": 0.412109375, + "learning_rate": 0.0013927361676876386, + "loss": 0.1133, + "step": 38483 + }, + { + "epoch": 0.3340596001770818, + "grad_norm": 0.44140625, + "learning_rate": 0.0013927080234402455, + "loss": 0.1152, + "step": 38484 + }, + { + "epoch": 0.33406828065728594, + "grad_norm": 0.36328125, + "learning_rate": 0.0013926798788727519, + "loss": 0.1001, + "step": 38485 + }, + { + "epoch": 0.33407696113749014, + "grad_norm": 0.79296875, + "learning_rate": 0.0013926517339851877, + "loss": 0.0869, + "step": 38486 + }, + { + "epoch": 0.3340856416176943, + "grad_norm": 0.107421875, + "learning_rate": 0.0013926235887775837, + "loss": 0.1289, + "step": 38487 + }, + { + "epoch": 0.33409432209789847, + "grad_norm": 0.3203125, + "learning_rate": 0.0013925954432499712, + "loss": 0.1182, + "step": 38488 + }, + { + "epoch": 0.3341030025781026, + "grad_norm": 0.388671875, + "learning_rate": 0.0013925672974023807, + "loss": 0.0713, + "step": 38489 + }, + { + "epoch": 0.3341116830583068, + "grad_norm": 0.400390625, + "learning_rate": 0.0013925391512348427, + "loss": 0.1045, + "step": 38490 + }, + { + "epoch": 0.33412036353851093, + "grad_norm": 0.10302734375, + "learning_rate": 0.0013925110047473886, + "loss": 0.1348, + "step": 38491 + }, + { + "epoch": 0.3341290440187151, + "grad_norm": 0.68359375, + "learning_rate": 0.0013924828579400487, + "loss": 0.1445, + "step": 38492 + }, + { + "epoch": 0.33413772449891926, + "grad_norm": 0.259765625, + "learning_rate": 0.001392454710812854, + "loss": 0.1133, + "step": 38493 + }, + { + "epoch": 0.33414640497912346, + "grad_norm": 0.1904296875, + "learning_rate": 0.001392426563365835, + "loss": 0.0991, + "step": 38494 
+ }, + { + "epoch": 0.3341550854593276, + "grad_norm": 0.12890625, + "learning_rate": 0.0013923984155990228, + "loss": 0.0918, + "step": 38495 + }, + { + "epoch": 0.3341637659395318, + "grad_norm": 0.515625, + "learning_rate": 0.0013923702675124484, + "loss": 0.0967, + "step": 38496 + }, + { + "epoch": 0.3341724464197359, + "grad_norm": 0.158203125, + "learning_rate": 0.001392342119106142, + "loss": 0.124, + "step": 38497 + }, + { + "epoch": 0.3341811268999401, + "grad_norm": 0.1533203125, + "learning_rate": 0.0013923139703801348, + "loss": 0.1094, + "step": 38498 + }, + { + "epoch": 0.33418980738014425, + "grad_norm": 0.58984375, + "learning_rate": 0.0013922858213344575, + "loss": 0.1162, + "step": 38499 + }, + { + "epoch": 0.33419848786034845, + "grad_norm": 0.263671875, + "learning_rate": 0.0013922576719691405, + "loss": 0.0771, + "step": 38500 + }, + { + "epoch": 0.3342071683405526, + "grad_norm": 0.2392578125, + "learning_rate": 0.0013922295222842153, + "loss": 0.1279, + "step": 38501 + }, + { + "epoch": 0.3342158488207568, + "grad_norm": 0.48046875, + "learning_rate": 0.001392201372279712, + "loss": 0.0952, + "step": 38502 + }, + { + "epoch": 0.3342245293009609, + "grad_norm": 0.3046875, + "learning_rate": 0.0013921732219556618, + "loss": 0.1084, + "step": 38503 + }, + { + "epoch": 0.3342332097811651, + "grad_norm": 0.2060546875, + "learning_rate": 0.0013921450713120951, + "loss": 0.0737, + "step": 38504 + }, + { + "epoch": 0.33424189026136925, + "grad_norm": 0.37109375, + "learning_rate": 0.0013921169203490432, + "loss": 0.1143, + "step": 38505 + }, + { + "epoch": 0.33425057074157344, + "grad_norm": 0.36328125, + "learning_rate": 0.0013920887690665365, + "loss": 0.1138, + "step": 38506 + }, + { + "epoch": 0.3342592512217776, + "grad_norm": 0.1689453125, + "learning_rate": 0.001392060617464606, + "loss": 0.0913, + "step": 38507 + }, + { + "epoch": 0.33426793170198177, + "grad_norm": 0.478515625, + "learning_rate": 0.0013920324655432824, + "loss": 0.1016, + "step": 38508 + }, + { + "epoch": 0.3342766121821859, + "grad_norm": 0.142578125, + "learning_rate": 0.0013920043133025964, + "loss": 0.0771, + "step": 38509 + }, + { + "epoch": 0.3342852926623901, + "grad_norm": 0.37890625, + "learning_rate": 0.001391976160742579, + "loss": 0.1211, + "step": 38510 + }, + { + "epoch": 0.33429397314259424, + "grad_norm": 0.1826171875, + "learning_rate": 0.0013919480078632607, + "loss": 0.1289, + "step": 38511 + }, + { + "epoch": 0.33430265362279843, + "grad_norm": 0.39453125, + "learning_rate": 0.0013919198546646725, + "loss": 0.1162, + "step": 38512 + }, + { + "epoch": 0.33431133410300257, + "grad_norm": 0.435546875, + "learning_rate": 0.0013918917011468454, + "loss": 0.1445, + "step": 38513 + }, + { + "epoch": 0.33432001458320676, + "grad_norm": 0.2734375, + "learning_rate": 0.0013918635473098096, + "loss": 0.1582, + "step": 38514 + }, + { + "epoch": 0.3343286950634109, + "grad_norm": 0.2119140625, + "learning_rate": 0.0013918353931535962, + "loss": 0.1123, + "step": 38515 + }, + { + "epoch": 0.3343373755436151, + "grad_norm": 0.65625, + "learning_rate": 0.0013918072386782361, + "loss": 0.1182, + "step": 38516 + }, + { + "epoch": 0.3343460560238192, + "grad_norm": 0.4296875, + "learning_rate": 0.0013917790838837599, + "loss": 0.1074, + "step": 38517 + }, + { + "epoch": 0.3343547365040234, + "grad_norm": 0.423828125, + "learning_rate": 0.0013917509287701983, + "loss": 0.1543, + "step": 38518 + }, + { + "epoch": 0.33436341698422756, + "grad_norm": 0.310546875, + "learning_rate": 
0.0013917227733375825, + "loss": 0.0889, + "step": 38519 + }, + { + "epoch": 0.33437209746443175, + "grad_norm": 0.2001953125, + "learning_rate": 0.0013916946175859433, + "loss": 0.1011, + "step": 38520 + }, + { + "epoch": 0.3343807779446359, + "grad_norm": 0.1767578125, + "learning_rate": 0.0013916664615153112, + "loss": 0.084, + "step": 38521 + }, + { + "epoch": 0.3343894584248401, + "grad_norm": 0.50390625, + "learning_rate": 0.001391638305125717, + "loss": 0.085, + "step": 38522 + }, + { + "epoch": 0.3343981389050442, + "grad_norm": 0.1787109375, + "learning_rate": 0.0013916101484171908, + "loss": 0.1123, + "step": 38523 + }, + { + "epoch": 0.3344068193852484, + "grad_norm": 0.2001953125, + "learning_rate": 0.001391581991389765, + "loss": 0.1182, + "step": 38524 + }, + { + "epoch": 0.33441549986545255, + "grad_norm": 0.09716796875, + "learning_rate": 0.0013915538340434693, + "loss": 0.0742, + "step": 38525 + }, + { + "epoch": 0.33442418034565674, + "grad_norm": 0.453125, + "learning_rate": 0.0013915256763783346, + "loss": 0.0996, + "step": 38526 + }, + { + "epoch": 0.3344328608258609, + "grad_norm": 0.486328125, + "learning_rate": 0.0013914975183943919, + "loss": 0.0786, + "step": 38527 + }, + { + "epoch": 0.33444154130606507, + "grad_norm": 0.1748046875, + "learning_rate": 0.0013914693600916718, + "loss": 0.1182, + "step": 38528 + }, + { + "epoch": 0.3344502217862692, + "grad_norm": 0.212890625, + "learning_rate": 0.0013914412014702053, + "loss": 0.0776, + "step": 38529 + }, + { + "epoch": 0.3344589022664734, + "grad_norm": 0.216796875, + "learning_rate": 0.0013914130425300227, + "loss": 0.0825, + "step": 38530 + }, + { + "epoch": 0.33446758274667754, + "grad_norm": 0.09619140625, + "learning_rate": 0.0013913848832711557, + "loss": 0.1069, + "step": 38531 + }, + { + "epoch": 0.33447626322688173, + "grad_norm": 0.60546875, + "learning_rate": 0.0013913567236936342, + "loss": 0.1348, + "step": 38532 + }, + { + "epoch": 0.33448494370708587, + "grad_norm": 0.1552734375, + "learning_rate": 0.0013913285637974894, + "loss": 0.1348, + "step": 38533 + }, + { + "epoch": 0.33449362418729006, + "grad_norm": 0.19921875, + "learning_rate": 0.0013913004035827521, + "loss": 0.1152, + "step": 38534 + }, + { + "epoch": 0.3345023046674942, + "grad_norm": 0.2314453125, + "learning_rate": 0.0013912722430494534, + "loss": 0.0986, + "step": 38535 + }, + { + "epoch": 0.3345109851476984, + "grad_norm": 0.396484375, + "learning_rate": 0.0013912440821976235, + "loss": 0.1172, + "step": 38536 + }, + { + "epoch": 0.33451966562790253, + "grad_norm": 0.69140625, + "learning_rate": 0.0013912159210272934, + "loss": 0.1138, + "step": 38537 + }, + { + "epoch": 0.3345283461081067, + "grad_norm": 0.28125, + "learning_rate": 0.001391187759538494, + "loss": 0.0938, + "step": 38538 + }, + { + "epoch": 0.33453702658831086, + "grad_norm": 0.08154296875, + "learning_rate": 0.0013911595977312559, + "loss": 0.085, + "step": 38539 + }, + { + "epoch": 0.33454570706851505, + "grad_norm": 0.18359375, + "learning_rate": 0.0013911314356056103, + "loss": 0.1108, + "step": 38540 + }, + { + "epoch": 0.3345543875487192, + "grad_norm": 0.177734375, + "learning_rate": 0.0013911032731615874, + "loss": 0.0957, + "step": 38541 + }, + { + "epoch": 0.3345630680289234, + "grad_norm": 0.193359375, + "learning_rate": 0.0013910751103992185, + "loss": 0.1562, + "step": 38542 + }, + { + "epoch": 0.3345717485091275, + "grad_norm": 0.08837890625, + "learning_rate": 0.0013910469473185343, + "loss": 0.1011, + "step": 38543 + }, + { + "epoch": 
0.3345804289893317, + "grad_norm": 0.1884765625, + "learning_rate": 0.0013910187839195653, + "loss": 0.1182, + "step": 38544 + }, + { + "epoch": 0.33458910946953585, + "grad_norm": 0.4765625, + "learning_rate": 0.001390990620202343, + "loss": 0.1035, + "step": 38545 + }, + { + "epoch": 0.33459778994974004, + "grad_norm": 0.50390625, + "learning_rate": 0.0013909624561668974, + "loss": 0.0703, + "step": 38546 + }, + { + "epoch": 0.3346064704299442, + "grad_norm": 0.08935546875, + "learning_rate": 0.0013909342918132595, + "loss": 0.1396, + "step": 38547 + }, + { + "epoch": 0.3346151509101484, + "grad_norm": 0.37109375, + "learning_rate": 0.0013909061271414605, + "loss": 0.105, + "step": 38548 + }, + { + "epoch": 0.3346238313903525, + "grad_norm": 0.2236328125, + "learning_rate": 0.0013908779621515308, + "loss": 0.0718, + "step": 38549 + }, + { + "epoch": 0.3346325118705567, + "grad_norm": 0.2099609375, + "learning_rate": 0.0013908497968435013, + "loss": 0.1045, + "step": 38550 + }, + { + "epoch": 0.33464119235076084, + "grad_norm": 0.1259765625, + "learning_rate": 0.001390821631217403, + "loss": 0.0703, + "step": 38551 + }, + { + "epoch": 0.33464987283096503, + "grad_norm": 0.2333984375, + "learning_rate": 0.0013907934652732663, + "loss": 0.1128, + "step": 38552 + }, + { + "epoch": 0.33465855331116917, + "grad_norm": 0.1396484375, + "learning_rate": 0.0013907652990111225, + "loss": 0.1592, + "step": 38553 + }, + { + "epoch": 0.33466723379137336, + "grad_norm": 0.326171875, + "learning_rate": 0.001390737132431002, + "loss": 0.105, + "step": 38554 + }, + { + "epoch": 0.3346759142715775, + "grad_norm": 0.90234375, + "learning_rate": 0.001390708965532936, + "loss": 0.1875, + "step": 38555 + }, + { + "epoch": 0.3346845947517817, + "grad_norm": 0.31640625, + "learning_rate": 0.0013906807983169545, + "loss": 0.083, + "step": 38556 + }, + { + "epoch": 0.33469327523198583, + "grad_norm": 0.1298828125, + "learning_rate": 0.0013906526307830892, + "loss": 0.0996, + "step": 38557 + }, + { + "epoch": 0.33470195571219, + "grad_norm": 0.447265625, + "learning_rate": 0.0013906244629313706, + "loss": 0.1201, + "step": 38558 + }, + { + "epoch": 0.33471063619239416, + "grad_norm": 0.07763671875, + "learning_rate": 0.0013905962947618291, + "loss": 0.0898, + "step": 38559 + }, + { + "epoch": 0.33471931667259835, + "grad_norm": 0.17578125, + "learning_rate": 0.0013905681262744963, + "loss": 0.1016, + "step": 38560 + }, + { + "epoch": 0.3347279971528025, + "grad_norm": 0.1875, + "learning_rate": 0.001390539957469402, + "loss": 0.1143, + "step": 38561 + }, + { + "epoch": 0.3347366776330067, + "grad_norm": 0.150390625, + "learning_rate": 0.0013905117883465778, + "loss": 0.1191, + "step": 38562 + }, + { + "epoch": 0.3347453581132108, + "grad_norm": 0.255859375, + "learning_rate": 0.0013904836189060547, + "loss": 0.0918, + "step": 38563 + }, + { + "epoch": 0.334754038593415, + "grad_norm": 0.2138671875, + "learning_rate": 0.0013904554491478629, + "loss": 0.1123, + "step": 38564 + }, + { + "epoch": 0.33476271907361915, + "grad_norm": 0.69140625, + "learning_rate": 0.0013904272790720331, + "loss": 0.084, + "step": 38565 + }, + { + "epoch": 0.3347713995538233, + "grad_norm": 0.1943359375, + "learning_rate": 0.0013903991086785966, + "loss": 0.1006, + "step": 38566 + }, + { + "epoch": 0.3347800800340275, + "grad_norm": 0.52734375, + "learning_rate": 0.001390370937967584, + "loss": 0.1406, + "step": 38567 + }, + { + "epoch": 0.3347887605142316, + "grad_norm": 0.0869140625, + "learning_rate": 0.001390342766939026, + "loss": 
0.0815, + "step": 38568 + }, + { + "epoch": 0.3347974409944358, + "grad_norm": 0.3046875, + "learning_rate": 0.0013903145955929538, + "loss": 0.1123, + "step": 38569 + }, + { + "epoch": 0.33480612147463995, + "grad_norm": 0.6875, + "learning_rate": 0.0013902864239293978, + "loss": 0.1416, + "step": 38570 + }, + { + "epoch": 0.33481480195484414, + "grad_norm": 0.19140625, + "learning_rate": 0.0013902582519483885, + "loss": 0.126, + "step": 38571 + }, + { + "epoch": 0.3348234824350483, + "grad_norm": 0.21875, + "learning_rate": 0.0013902300796499575, + "loss": 0.1133, + "step": 38572 + }, + { + "epoch": 0.3348321629152525, + "grad_norm": 0.11572265625, + "learning_rate": 0.001390201907034135, + "loss": 0.0991, + "step": 38573 + }, + { + "epoch": 0.3348408433954566, + "grad_norm": 0.671875, + "learning_rate": 0.0013901737341009523, + "loss": 0.1108, + "step": 38574 + }, + { + "epoch": 0.3348495238756608, + "grad_norm": 0.1923828125, + "learning_rate": 0.00139014556085044, + "loss": 0.0957, + "step": 38575 + }, + { + "epoch": 0.33485820435586494, + "grad_norm": 0.3515625, + "learning_rate": 0.0013901173872826289, + "loss": 0.1357, + "step": 38576 + }, + { + "epoch": 0.33486688483606913, + "grad_norm": 1.125, + "learning_rate": 0.0013900892133975496, + "loss": 0.1162, + "step": 38577 + }, + { + "epoch": 0.33487556531627327, + "grad_norm": 0.349609375, + "learning_rate": 0.0013900610391952334, + "loss": 0.123, + "step": 38578 + }, + { + "epoch": 0.33488424579647746, + "grad_norm": 0.4453125, + "learning_rate": 0.0013900328646757104, + "loss": 0.127, + "step": 38579 + }, + { + "epoch": 0.3348929262766816, + "grad_norm": 0.423828125, + "learning_rate": 0.0013900046898390117, + "loss": 0.2148, + "step": 38580 + }, + { + "epoch": 0.3349016067568858, + "grad_norm": 0.1484375, + "learning_rate": 0.0013899765146851686, + "loss": 0.1348, + "step": 38581 + }, + { + "epoch": 0.33491028723708993, + "grad_norm": 0.59765625, + "learning_rate": 0.0013899483392142114, + "loss": 0.1162, + "step": 38582 + }, + { + "epoch": 0.3349189677172941, + "grad_norm": 0.1904296875, + "learning_rate": 0.0013899201634261713, + "loss": 0.1055, + "step": 38583 + }, + { + "epoch": 0.33492764819749826, + "grad_norm": 0.421875, + "learning_rate": 0.0013898919873210787, + "loss": 0.125, + "step": 38584 + }, + { + "epoch": 0.33493632867770246, + "grad_norm": 0.111328125, + "learning_rate": 0.0013898638108989648, + "loss": 0.0977, + "step": 38585 + }, + { + "epoch": 0.3349450091579066, + "grad_norm": 0.41015625, + "learning_rate": 0.0013898356341598597, + "loss": 0.1113, + "step": 38586 + }, + { + "epoch": 0.3349536896381108, + "grad_norm": 0.3203125, + "learning_rate": 0.0013898074571037948, + "loss": 0.1152, + "step": 38587 + }, + { + "epoch": 0.3349623701183149, + "grad_norm": 0.515625, + "learning_rate": 0.0013897792797308013, + "loss": 0.127, + "step": 38588 + }, + { + "epoch": 0.3349710505985191, + "grad_norm": 0.240234375, + "learning_rate": 0.001389751102040909, + "loss": 0.1143, + "step": 38589 + }, + { + "epoch": 0.33497973107872325, + "grad_norm": 0.28125, + "learning_rate": 0.0013897229240341497, + "loss": 0.082, + "step": 38590 + }, + { + "epoch": 0.33498841155892745, + "grad_norm": 0.427734375, + "learning_rate": 0.0013896947457105531, + "loss": 0.0913, + "step": 38591 + }, + { + "epoch": 0.3349970920391316, + "grad_norm": 0.80078125, + "learning_rate": 0.0013896665670701512, + "loss": 0.083, + "step": 38592 + }, + { + "epoch": 0.3350057725193358, + "grad_norm": 0.08935546875, + "learning_rate": 0.0013896383881129744, 
+ "loss": 0.0923, + "step": 38593 + }, + { + "epoch": 0.3350144529995399, + "grad_norm": 0.69140625, + "learning_rate": 0.0013896102088390531, + "loss": 0.1025, + "step": 38594 + }, + { + "epoch": 0.3350231334797441, + "grad_norm": 0.52734375, + "learning_rate": 0.0013895820292484185, + "loss": 0.1152, + "step": 38595 + }, + { + "epoch": 0.33503181395994824, + "grad_norm": 0.34375, + "learning_rate": 0.0013895538493411015, + "loss": 0.1113, + "step": 38596 + }, + { + "epoch": 0.33504049444015244, + "grad_norm": 0.099609375, + "learning_rate": 0.0013895256691171325, + "loss": 0.0967, + "step": 38597 + }, + { + "epoch": 0.3350491749203566, + "grad_norm": 0.56640625, + "learning_rate": 0.0013894974885765431, + "loss": 0.126, + "step": 38598 + }, + { + "epoch": 0.33505785540056077, + "grad_norm": 0.1123046875, + "learning_rate": 0.001389469307719363, + "loss": 0.1055, + "step": 38599 + }, + { + "epoch": 0.3350665358807649, + "grad_norm": 0.2578125, + "learning_rate": 0.0013894411265456235, + "loss": 0.1001, + "step": 38600 + }, + { + "epoch": 0.3350752163609691, + "grad_norm": 0.376953125, + "learning_rate": 0.0013894129450553558, + "loss": 0.104, + "step": 38601 + }, + { + "epoch": 0.33508389684117323, + "grad_norm": 0.12890625, + "learning_rate": 0.0013893847632485906, + "loss": 0.1152, + "step": 38602 + }, + { + "epoch": 0.3350925773213774, + "grad_norm": 0.7421875, + "learning_rate": 0.0013893565811253586, + "loss": 0.1758, + "step": 38603 + }, + { + "epoch": 0.33510125780158156, + "grad_norm": 0.486328125, + "learning_rate": 0.0013893283986856904, + "loss": 0.0806, + "step": 38604 + }, + { + "epoch": 0.33510993828178576, + "grad_norm": 0.5, + "learning_rate": 0.001389300215929617, + "loss": 0.1582, + "step": 38605 + }, + { + "epoch": 0.3351186187619899, + "grad_norm": 0.6015625, + "learning_rate": 0.0013892720328571694, + "loss": 0.0708, + "step": 38606 + }, + { + "epoch": 0.3351272992421941, + "grad_norm": 0.31640625, + "learning_rate": 0.0013892438494683785, + "loss": 0.1309, + "step": 38607 + }, + { + "epoch": 0.3351359797223982, + "grad_norm": 0.134765625, + "learning_rate": 0.0013892156657632744, + "loss": 0.1348, + "step": 38608 + }, + { + "epoch": 0.3351446602026024, + "grad_norm": 0.6015625, + "learning_rate": 0.0013891874817418882, + "loss": 0.0957, + "step": 38609 + }, + { + "epoch": 0.33515334068280656, + "grad_norm": 0.1611328125, + "learning_rate": 0.0013891592974042515, + "loss": 0.0898, + "step": 38610 + }, + { + "epoch": 0.33516202116301075, + "grad_norm": 0.08642578125, + "learning_rate": 0.0013891311127503944, + "loss": 0.0684, + "step": 38611 + }, + { + "epoch": 0.3351707016432149, + "grad_norm": 0.2373046875, + "learning_rate": 0.0013891029277803475, + "loss": 0.1523, + "step": 38612 + }, + { + "epoch": 0.3351793821234191, + "grad_norm": 0.13671875, + "learning_rate": 0.0013890747424941422, + "loss": 0.1436, + "step": 38613 + }, + { + "epoch": 0.3351880626036232, + "grad_norm": 0.41796875, + "learning_rate": 0.001389046556891809, + "loss": 0.1123, + "step": 38614 + }, + { + "epoch": 0.3351967430838274, + "grad_norm": 0.3203125, + "learning_rate": 0.0013890183709733793, + "loss": 0.1055, + "step": 38615 + }, + { + "epoch": 0.33520542356403155, + "grad_norm": 0.59765625, + "learning_rate": 0.0013889901847388831, + "loss": 0.0986, + "step": 38616 + }, + { + "epoch": 0.33521410404423574, + "grad_norm": 0.1708984375, + "learning_rate": 0.0013889619981883514, + "loss": 0.1309, + "step": 38617 + }, + { + "epoch": 0.3352227845244399, + "grad_norm": 0.1201171875, + 
"learning_rate": 0.0013889338113218155, + "loss": 0.106, + "step": 38618 + }, + { + "epoch": 0.33523146500464407, + "grad_norm": 0.1279296875, + "learning_rate": 0.0013889056241393057, + "loss": 0.1006, + "step": 38619 + }, + { + "epoch": 0.3352401454848482, + "grad_norm": 0.2138671875, + "learning_rate": 0.0013888774366408532, + "loss": 0.0928, + "step": 38620 + }, + { + "epoch": 0.3352488259650524, + "grad_norm": 0.1796875, + "learning_rate": 0.0013888492488264887, + "loss": 0.1299, + "step": 38621 + }, + { + "epoch": 0.33525750644525654, + "grad_norm": 0.625, + "learning_rate": 0.0013888210606962429, + "loss": 0.1123, + "step": 38622 + }, + { + "epoch": 0.33526618692546073, + "grad_norm": 0.181640625, + "learning_rate": 0.001388792872250147, + "loss": 0.1328, + "step": 38623 + }, + { + "epoch": 0.33527486740566487, + "grad_norm": 0.12109375, + "learning_rate": 0.001388764683488231, + "loss": 0.1426, + "step": 38624 + }, + { + "epoch": 0.33528354788586906, + "grad_norm": 0.232421875, + "learning_rate": 0.0013887364944105264, + "loss": 0.1816, + "step": 38625 + }, + { + "epoch": 0.3352922283660732, + "grad_norm": 0.10400390625, + "learning_rate": 0.0013887083050170643, + "loss": 0.1465, + "step": 38626 + }, + { + "epoch": 0.3353009088462774, + "grad_norm": 0.1923828125, + "learning_rate": 0.001388680115307875, + "loss": 0.123, + "step": 38627 + }, + { + "epoch": 0.3353095893264815, + "grad_norm": 0.5625, + "learning_rate": 0.0013886519252829894, + "loss": 0.125, + "step": 38628 + }, + { + "epoch": 0.3353182698066857, + "grad_norm": 0.1357421875, + "learning_rate": 0.001388623734942438, + "loss": 0.1211, + "step": 38629 + }, + { + "epoch": 0.33532695028688986, + "grad_norm": 0.12890625, + "learning_rate": 0.0013885955442862526, + "loss": 0.1191, + "step": 38630 + }, + { + "epoch": 0.33533563076709405, + "grad_norm": 0.2734375, + "learning_rate": 0.0013885673533144635, + "loss": 0.1172, + "step": 38631 + }, + { + "epoch": 0.3353443112472982, + "grad_norm": 0.11083984375, + "learning_rate": 0.001388539162027101, + "loss": 0.0815, + "step": 38632 + }, + { + "epoch": 0.3353529917275024, + "grad_norm": 0.416015625, + "learning_rate": 0.0013885109704241969, + "loss": 0.1016, + "step": 38633 + }, + { + "epoch": 0.3353616722077065, + "grad_norm": 0.224609375, + "learning_rate": 0.0013884827785057812, + "loss": 0.0972, + "step": 38634 + }, + { + "epoch": 0.3353703526879107, + "grad_norm": 0.1806640625, + "learning_rate": 0.0013884545862718851, + "loss": 0.1562, + "step": 38635 + }, + { + "epoch": 0.33537903316811485, + "grad_norm": 0.208984375, + "learning_rate": 0.0013884263937225397, + "loss": 0.0801, + "step": 38636 + }, + { + "epoch": 0.33538771364831904, + "grad_norm": 0.12890625, + "learning_rate": 0.0013883982008577752, + "loss": 0.0874, + "step": 38637 + }, + { + "epoch": 0.3353963941285232, + "grad_norm": 0.2314453125, + "learning_rate": 0.0013883700076776227, + "loss": 0.0791, + "step": 38638 + }, + { + "epoch": 0.33540507460872737, + "grad_norm": 0.314453125, + "learning_rate": 0.0013883418141821132, + "loss": 0.1016, + "step": 38639 + }, + { + "epoch": 0.3354137550889315, + "grad_norm": 0.31640625, + "learning_rate": 0.0013883136203712773, + "loss": 0.0903, + "step": 38640 + }, + { + "epoch": 0.3354224355691357, + "grad_norm": 0.294921875, + "learning_rate": 0.0013882854262451463, + "loss": 0.127, + "step": 38641 + }, + { + "epoch": 0.33543111604933984, + "grad_norm": 0.09716796875, + "learning_rate": 0.0013882572318037506, + "loss": 0.0737, + "step": 38642 + }, + { + "epoch": 
0.33543979652954403, + "grad_norm": 0.345703125, + "learning_rate": 0.001388229037047121, + "loss": 0.0771, + "step": 38643 + }, + { + "epoch": 0.33544847700974817, + "grad_norm": 0.51953125, + "learning_rate": 0.0013882008419752886, + "loss": 0.0938, + "step": 38644 + }, + { + "epoch": 0.33545715748995236, + "grad_norm": 0.107421875, + "learning_rate": 0.001388172646588284, + "loss": 0.1001, + "step": 38645 + }, + { + "epoch": 0.3354658379701565, + "grad_norm": 0.314453125, + "learning_rate": 0.001388144450886138, + "loss": 0.0977, + "step": 38646 + }, + { + "epoch": 0.3354745184503607, + "grad_norm": 0.578125, + "learning_rate": 0.0013881162548688816, + "loss": 0.1445, + "step": 38647 + }, + { + "epoch": 0.33548319893056483, + "grad_norm": 0.1845703125, + "learning_rate": 0.0013880880585365456, + "loss": 0.0732, + "step": 38648 + }, + { + "epoch": 0.335491879410769, + "grad_norm": 0.267578125, + "learning_rate": 0.0013880598618891608, + "loss": 0.1533, + "step": 38649 + }, + { + "epoch": 0.33550055989097316, + "grad_norm": 0.236328125, + "learning_rate": 0.0013880316649267583, + "loss": 0.0811, + "step": 38650 + }, + { + "epoch": 0.33550924037117735, + "grad_norm": 0.78515625, + "learning_rate": 0.0013880034676493683, + "loss": 0.0991, + "step": 38651 + }, + { + "epoch": 0.3355179208513815, + "grad_norm": 0.61328125, + "learning_rate": 0.0013879752700570226, + "loss": 0.1367, + "step": 38652 + }, + { + "epoch": 0.3355266013315857, + "grad_norm": 0.1474609375, + "learning_rate": 0.001387947072149751, + "loss": 0.1172, + "step": 38653 + }, + { + "epoch": 0.3355352818117898, + "grad_norm": 0.328125, + "learning_rate": 0.001387918873927585, + "loss": 0.1025, + "step": 38654 + }, + { + "epoch": 0.335543962291994, + "grad_norm": 0.361328125, + "learning_rate": 0.001387890675390555, + "loss": 0.0928, + "step": 38655 + }, + { + "epoch": 0.33555264277219815, + "grad_norm": 0.267578125, + "learning_rate": 0.0013878624765386925, + "loss": 0.1328, + "step": 38656 + }, + { + "epoch": 0.33556132325240234, + "grad_norm": 0.10400390625, + "learning_rate": 0.0013878342773720276, + "loss": 0.1318, + "step": 38657 + }, + { + "epoch": 0.3355700037326065, + "grad_norm": 0.2236328125, + "learning_rate": 0.0013878060778905917, + "loss": 0.1416, + "step": 38658 + }, + { + "epoch": 0.3355786842128107, + "grad_norm": 0.25, + "learning_rate": 0.0013877778780944153, + "loss": 0.1152, + "step": 38659 + }, + { + "epoch": 0.3355873646930148, + "grad_norm": 0.376953125, + "learning_rate": 0.0013877496779835293, + "loss": 0.1006, + "step": 38660 + }, + { + "epoch": 0.335596045173219, + "grad_norm": 0.875, + "learning_rate": 0.0013877214775579645, + "loss": 0.1299, + "step": 38661 + }, + { + "epoch": 0.33560472565342314, + "grad_norm": 0.396484375, + "learning_rate": 0.0013876932768177516, + "loss": 0.0967, + "step": 38662 + }, + { + "epoch": 0.33561340613362733, + "grad_norm": 1.0, + "learning_rate": 0.0013876650757629221, + "loss": 0.1299, + "step": 38663 + }, + { + "epoch": 0.33562208661383147, + "grad_norm": 0.08935546875, + "learning_rate": 0.0013876368743935064, + "loss": 0.1001, + "step": 38664 + }, + { + "epoch": 0.33563076709403566, + "grad_norm": 0.259765625, + "learning_rate": 0.001387608672709535, + "loss": 0.0947, + "step": 38665 + }, + { + "epoch": 0.3356394475742398, + "grad_norm": 0.251953125, + "learning_rate": 0.0013875804707110394, + "loss": 0.1035, + "step": 38666 + }, + { + "epoch": 0.335648128054444, + "grad_norm": 0.6015625, + "learning_rate": 0.00138755226839805, + "loss": 0.0928, + "step": 38667 
+ }, + { + "epoch": 0.33565680853464813, + "grad_norm": 0.369140625, + "learning_rate": 0.0013875240657705975, + "loss": 0.127, + "step": 38668 + }, + { + "epoch": 0.3356654890148523, + "grad_norm": 0.208984375, + "learning_rate": 0.0013874958628287133, + "loss": 0.0771, + "step": 38669 + }, + { + "epoch": 0.33567416949505646, + "grad_norm": 0.0986328125, + "learning_rate": 0.0013874676595724278, + "loss": 0.0952, + "step": 38670 + }, + { + "epoch": 0.33568284997526066, + "grad_norm": 0.478515625, + "learning_rate": 0.0013874394560017722, + "loss": 0.0918, + "step": 38671 + }, + { + "epoch": 0.3356915304554648, + "grad_norm": 0.26953125, + "learning_rate": 0.001387411252116777, + "loss": 0.0952, + "step": 38672 + }, + { + "epoch": 0.335700210935669, + "grad_norm": 0.12109375, + "learning_rate": 0.001387383047917473, + "loss": 0.1011, + "step": 38673 + }, + { + "epoch": 0.3357088914158731, + "grad_norm": 0.470703125, + "learning_rate": 0.0013873548434038916, + "loss": 0.1387, + "step": 38674 + }, + { + "epoch": 0.3357175718960773, + "grad_norm": 0.345703125, + "learning_rate": 0.001387326638576063, + "loss": 0.0752, + "step": 38675 + }, + { + "epoch": 0.33572625237628145, + "grad_norm": 0.271484375, + "learning_rate": 0.0013872984334340182, + "loss": 0.1035, + "step": 38676 + }, + { + "epoch": 0.33573493285648565, + "grad_norm": 0.58203125, + "learning_rate": 0.0013872702279777882, + "loss": 0.1641, + "step": 38677 + }, + { + "epoch": 0.3357436133366898, + "grad_norm": 0.138671875, + "learning_rate": 0.001387242022207404, + "loss": 0.1377, + "step": 38678 + }, + { + "epoch": 0.335752293816894, + "grad_norm": 0.1591796875, + "learning_rate": 0.0013872138161228962, + "loss": 0.0801, + "step": 38679 + }, + { + "epoch": 0.3357609742970981, + "grad_norm": 0.09521484375, + "learning_rate": 0.0013871856097242956, + "loss": 0.1172, + "step": 38680 + }, + { + "epoch": 0.3357696547773023, + "grad_norm": 0.2890625, + "learning_rate": 0.001387157403011633, + "loss": 0.1211, + "step": 38681 + }, + { + "epoch": 0.33577833525750644, + "grad_norm": 0.30859375, + "learning_rate": 0.0013871291959849396, + "loss": 0.0996, + "step": 38682 + }, + { + "epoch": 0.33578701573771064, + "grad_norm": 0.443359375, + "learning_rate": 0.001387100988644246, + "loss": 0.0835, + "step": 38683 + }, + { + "epoch": 0.3357956962179148, + "grad_norm": 0.162109375, + "learning_rate": 0.0013870727809895832, + "loss": 0.0913, + "step": 38684 + }, + { + "epoch": 0.33580437669811897, + "grad_norm": 0.47265625, + "learning_rate": 0.0013870445730209816, + "loss": 0.0898, + "step": 38685 + }, + { + "epoch": 0.3358130571783231, + "grad_norm": 0.6015625, + "learning_rate": 0.001387016364738472, + "loss": 0.1133, + "step": 38686 + }, + { + "epoch": 0.3358217376585273, + "grad_norm": 0.3046875, + "learning_rate": 0.0013869881561420865, + "loss": 0.1191, + "step": 38687 + }, + { + "epoch": 0.33583041813873143, + "grad_norm": 0.1884765625, + "learning_rate": 0.001386959947231855, + "loss": 0.1021, + "step": 38688 + }, + { + "epoch": 0.33583909861893557, + "grad_norm": 0.115234375, + "learning_rate": 0.001386931738007808, + "loss": 0.1182, + "step": 38689 + }, + { + "epoch": 0.33584777909913976, + "grad_norm": 0.119140625, + "learning_rate": 0.0013869035284699766, + "loss": 0.1089, + "step": 38690 + }, + { + "epoch": 0.3358564595793439, + "grad_norm": 0.2578125, + "learning_rate": 0.0013868753186183922, + "loss": 0.1699, + "step": 38691 + }, + { + "epoch": 0.3358651400595481, + "grad_norm": 0.11083984375, + "learning_rate": 
0.0013868471084530851, + "loss": 0.0879, + "step": 38692 + }, + { + "epoch": 0.33587382053975223, + "grad_norm": 0.1875, + "learning_rate": 0.0013868188979740863, + "loss": 0.1367, + "step": 38693 + }, + { + "epoch": 0.3358825010199564, + "grad_norm": 0.392578125, + "learning_rate": 0.001386790687181427, + "loss": 0.0845, + "step": 38694 + }, + { + "epoch": 0.33589118150016056, + "grad_norm": 0.25390625, + "learning_rate": 0.0013867624760751371, + "loss": 0.1367, + "step": 38695 + }, + { + "epoch": 0.33589986198036476, + "grad_norm": 0.21484375, + "learning_rate": 0.0013867342646552486, + "loss": 0.1084, + "step": 38696 + }, + { + "epoch": 0.3359085424605689, + "grad_norm": 0.39453125, + "learning_rate": 0.0013867060529217917, + "loss": 0.0986, + "step": 38697 + }, + { + "epoch": 0.3359172229407731, + "grad_norm": 0.15234375, + "learning_rate": 0.001386677840874797, + "loss": 0.0952, + "step": 38698 + }, + { + "epoch": 0.3359259034209772, + "grad_norm": 0.1884765625, + "learning_rate": 0.001386649628514296, + "loss": 0.0737, + "step": 38699 + }, + { + "epoch": 0.3359345839011814, + "grad_norm": 0.0791015625, + "learning_rate": 0.0013866214158403194, + "loss": 0.0996, + "step": 38700 + }, + { + "epoch": 0.33594326438138555, + "grad_norm": 0.3359375, + "learning_rate": 0.001386593202852898, + "loss": 0.0635, + "step": 38701 + }, + { + "epoch": 0.33595194486158975, + "grad_norm": 0.09326171875, + "learning_rate": 0.0013865649895520625, + "loss": 0.1318, + "step": 38702 + }, + { + "epoch": 0.3359606253417939, + "grad_norm": 0.3671875, + "learning_rate": 0.0013865367759378436, + "loss": 0.0796, + "step": 38703 + }, + { + "epoch": 0.3359693058219981, + "grad_norm": 0.1025390625, + "learning_rate": 0.0013865085620102724, + "loss": 0.0903, + "step": 38704 + }, + { + "epoch": 0.3359779863022022, + "grad_norm": 0.126953125, + "learning_rate": 0.0013864803477693797, + "loss": 0.1465, + "step": 38705 + }, + { + "epoch": 0.3359866667824064, + "grad_norm": 0.09228515625, + "learning_rate": 0.0013864521332151967, + "loss": 0.1016, + "step": 38706 + }, + { + "epoch": 0.33599534726261054, + "grad_norm": 0.3671875, + "learning_rate": 0.0013864239183477534, + "loss": 0.1328, + "step": 38707 + }, + { + "epoch": 0.33600402774281474, + "grad_norm": 0.16796875, + "learning_rate": 0.0013863957031670818, + "loss": 0.127, + "step": 38708 + }, + { + "epoch": 0.3360127082230189, + "grad_norm": 0.1298828125, + "learning_rate": 0.001386367487673212, + "loss": 0.0996, + "step": 38709 + }, + { + "epoch": 0.33602138870322307, + "grad_norm": 0.36328125, + "learning_rate": 0.0013863392718661749, + "loss": 0.1045, + "step": 38710 + }, + { + "epoch": 0.3360300691834272, + "grad_norm": 0.51953125, + "learning_rate": 0.0013863110557460019, + "loss": 0.1001, + "step": 38711 + }, + { + "epoch": 0.3360387496636314, + "grad_norm": 0.13671875, + "learning_rate": 0.0013862828393127228, + "loss": 0.0864, + "step": 38712 + }, + { + "epoch": 0.33604743014383553, + "grad_norm": 0.298828125, + "learning_rate": 0.0013862546225663693, + "loss": 0.1797, + "step": 38713 + }, + { + "epoch": 0.33605611062403973, + "grad_norm": 0.15234375, + "learning_rate": 0.001386226405506972, + "loss": 0.1211, + "step": 38714 + }, + { + "epoch": 0.33606479110424387, + "grad_norm": 0.1669921875, + "learning_rate": 0.001386198188134562, + "loss": 0.127, + "step": 38715 + }, + { + "epoch": 0.33607347158444806, + "grad_norm": 0.2314453125, + "learning_rate": 0.0013861699704491696, + "loss": 0.0854, + "step": 38716 + }, + { + "epoch": 0.3360821520646522, + 
"grad_norm": 0.5546875, + "learning_rate": 0.0013861417524508267, + "loss": 0.1084, + "step": 38717 + }, + { + "epoch": 0.3360908325448564, + "grad_norm": 0.19140625, + "learning_rate": 0.0013861135341395629, + "loss": 0.1367, + "step": 38718 + }, + { + "epoch": 0.3360995130250605, + "grad_norm": 0.2197265625, + "learning_rate": 0.0013860853155154098, + "loss": 0.1465, + "step": 38719 + }, + { + "epoch": 0.3361081935052647, + "grad_norm": 0.53515625, + "learning_rate": 0.0013860570965783983, + "loss": 0.0698, + "step": 38720 + }, + { + "epoch": 0.33611687398546886, + "grad_norm": 0.3046875, + "learning_rate": 0.0013860288773285588, + "loss": 0.1357, + "step": 38721 + }, + { + "epoch": 0.33612555446567305, + "grad_norm": 0.55859375, + "learning_rate": 0.0013860006577659225, + "loss": 0.1523, + "step": 38722 + }, + { + "epoch": 0.3361342349458772, + "grad_norm": 0.1923828125, + "learning_rate": 0.0013859724378905204, + "loss": 0.0957, + "step": 38723 + }, + { + "epoch": 0.3361429154260814, + "grad_norm": 0.126953125, + "learning_rate": 0.0013859442177023824, + "loss": 0.0933, + "step": 38724 + }, + { + "epoch": 0.3361515959062855, + "grad_norm": 0.1025390625, + "learning_rate": 0.0013859159972015407, + "loss": 0.085, + "step": 38725 + }, + { + "epoch": 0.3361602763864897, + "grad_norm": 0.349609375, + "learning_rate": 0.0013858877763880256, + "loss": 0.103, + "step": 38726 + }, + { + "epoch": 0.33616895686669385, + "grad_norm": 0.07080078125, + "learning_rate": 0.0013858595552618679, + "loss": 0.0669, + "step": 38727 + }, + { + "epoch": 0.33617763734689804, + "grad_norm": 0.431640625, + "learning_rate": 0.0013858313338230984, + "loss": 0.2598, + "step": 38728 + }, + { + "epoch": 0.3361863178271022, + "grad_norm": 0.296875, + "learning_rate": 0.0013858031120717478, + "loss": 0.1611, + "step": 38729 + }, + { + "epoch": 0.33619499830730637, + "grad_norm": 0.1328125, + "learning_rate": 0.0013857748900078476, + "loss": 0.1211, + "step": 38730 + }, + { + "epoch": 0.3362036787875105, + "grad_norm": 0.2236328125, + "learning_rate": 0.001385746667631428, + "loss": 0.1523, + "step": 38731 + }, + { + "epoch": 0.3362123592677147, + "grad_norm": 0.31640625, + "learning_rate": 0.0013857184449425205, + "loss": 0.1147, + "step": 38732 + }, + { + "epoch": 0.33622103974791884, + "grad_norm": 0.119140625, + "learning_rate": 0.0013856902219411549, + "loss": 0.0947, + "step": 38733 + }, + { + "epoch": 0.33622972022812303, + "grad_norm": 0.208984375, + "learning_rate": 0.0013856619986273634, + "loss": 0.1074, + "step": 38734 + }, + { + "epoch": 0.33623840070832717, + "grad_norm": 0.318359375, + "learning_rate": 0.0013856337750011761, + "loss": 0.0859, + "step": 38735 + }, + { + "epoch": 0.33624708118853136, + "grad_norm": 0.1240234375, + "learning_rate": 0.001385605551062624, + "loss": 0.0918, + "step": 38736 + }, + { + "epoch": 0.3362557616687355, + "grad_norm": 0.138671875, + "learning_rate": 0.0013855773268117379, + "loss": 0.1133, + "step": 38737 + }, + { + "epoch": 0.3362644421489397, + "grad_norm": 0.1318359375, + "learning_rate": 0.0013855491022485486, + "loss": 0.083, + "step": 38738 + }, + { + "epoch": 0.33627312262914383, + "grad_norm": 0.349609375, + "learning_rate": 0.0013855208773730873, + "loss": 0.0938, + "step": 38739 + }, + { + "epoch": 0.336281803109348, + "grad_norm": 0.1494140625, + "learning_rate": 0.0013854926521853846, + "loss": 0.1279, + "step": 38740 + }, + { + "epoch": 0.33629048358955216, + "grad_norm": 0.11328125, + "learning_rate": 0.0013854644266854716, + "loss": 0.1201, + "step": 
38741 + }, + { + "epoch": 0.33629916406975635, + "grad_norm": 0.19921875, + "learning_rate": 0.0013854362008733784, + "loss": 0.0996, + "step": 38742 + }, + { + "epoch": 0.3363078445499605, + "grad_norm": 0.369140625, + "learning_rate": 0.0013854079747491366, + "loss": 0.1025, + "step": 38743 + }, + { + "epoch": 0.3363165250301647, + "grad_norm": 0.369140625, + "learning_rate": 0.0013853797483127774, + "loss": 0.1133, + "step": 38744 + }, + { + "epoch": 0.3363252055103688, + "grad_norm": 1.046875, + "learning_rate": 0.0013853515215643312, + "loss": 0.1738, + "step": 38745 + }, + { + "epoch": 0.336333885990573, + "grad_norm": 0.11669921875, + "learning_rate": 0.0013853232945038286, + "loss": 0.0737, + "step": 38746 + }, + { + "epoch": 0.33634256647077715, + "grad_norm": 0.34765625, + "learning_rate": 0.0013852950671313007, + "loss": 0.0771, + "step": 38747 + }, + { + "epoch": 0.33635124695098134, + "grad_norm": 0.09228515625, + "learning_rate": 0.0013852668394467784, + "loss": 0.083, + "step": 38748 + }, + { + "epoch": 0.3363599274311855, + "grad_norm": 0.369140625, + "learning_rate": 0.0013852386114502927, + "loss": 0.0923, + "step": 38749 + }, + { + "epoch": 0.33636860791138967, + "grad_norm": 0.1376953125, + "learning_rate": 0.0013852103831418743, + "loss": 0.0718, + "step": 38750 + }, + { + "epoch": 0.3363772883915938, + "grad_norm": 0.341796875, + "learning_rate": 0.001385182154521554, + "loss": 0.0933, + "step": 38751 + }, + { + "epoch": 0.336385968871798, + "grad_norm": 0.1953125, + "learning_rate": 0.001385153925589363, + "loss": 0.1182, + "step": 38752 + }, + { + "epoch": 0.33639464935200214, + "grad_norm": 0.0908203125, + "learning_rate": 0.0013851256963453315, + "loss": 0.1064, + "step": 38753 + }, + { + "epoch": 0.33640332983220633, + "grad_norm": 0.73046875, + "learning_rate": 0.0013850974667894912, + "loss": 0.1201, + "step": 38754 + }, + { + "epoch": 0.33641201031241047, + "grad_norm": 0.69140625, + "learning_rate": 0.0013850692369218726, + "loss": 0.0889, + "step": 38755 + }, + { + "epoch": 0.33642069079261466, + "grad_norm": 0.64453125, + "learning_rate": 0.0013850410067425066, + "loss": 0.0693, + "step": 38756 + }, + { + "epoch": 0.3364293712728188, + "grad_norm": 0.80859375, + "learning_rate": 0.0013850127762514237, + "loss": 0.0679, + "step": 38757 + }, + { + "epoch": 0.336438051753023, + "grad_norm": 0.53515625, + "learning_rate": 0.0013849845454486556, + "loss": 0.0947, + "step": 38758 + }, + { + "epoch": 0.33644673223322713, + "grad_norm": 0.33984375, + "learning_rate": 0.0013849563143342324, + "loss": 0.1055, + "step": 38759 + }, + { + "epoch": 0.3364554127134313, + "grad_norm": 0.16015625, + "learning_rate": 0.0013849280829081852, + "loss": 0.0986, + "step": 38760 + }, + { + "epoch": 0.33646409319363546, + "grad_norm": 0.50390625, + "learning_rate": 0.0013848998511705453, + "loss": 0.0962, + "step": 38761 + }, + { + "epoch": 0.33647277367383965, + "grad_norm": 0.48046875, + "learning_rate": 0.001384871619121343, + "loss": 0.0869, + "step": 38762 + }, + { + "epoch": 0.3364814541540438, + "grad_norm": 0.10498046875, + "learning_rate": 0.001384843386760609, + "loss": 0.1021, + "step": 38763 + }, + { + "epoch": 0.336490134634248, + "grad_norm": 0.11669921875, + "learning_rate": 0.0013848151540883746, + "loss": 0.0635, + "step": 38764 + }, + { + "epoch": 0.3364988151144521, + "grad_norm": 0.396484375, + "learning_rate": 0.001384786921104671, + "loss": 0.1108, + "step": 38765 + }, + { + "epoch": 0.3365074955946563, + "grad_norm": 0.39453125, + "learning_rate": 
0.0013847586878095288, + "loss": 0.1191, + "step": 38766 + }, + { + "epoch": 0.33651617607486045, + "grad_norm": 0.30078125, + "learning_rate": 0.0013847304542029784, + "loss": 0.105, + "step": 38767 + }, + { + "epoch": 0.33652485655506464, + "grad_norm": 0.39453125, + "learning_rate": 0.0013847022202850513, + "loss": 0.0737, + "step": 38768 + }, + { + "epoch": 0.3365335370352688, + "grad_norm": 0.1787109375, + "learning_rate": 0.001384673986055778, + "loss": 0.0723, + "step": 38769 + }, + { + "epoch": 0.336542217515473, + "grad_norm": 0.55078125, + "learning_rate": 0.0013846457515151896, + "loss": 0.0918, + "step": 38770 + }, + { + "epoch": 0.3365508979956771, + "grad_norm": 0.2490234375, + "learning_rate": 0.0013846175166633168, + "loss": 0.1064, + "step": 38771 + }, + { + "epoch": 0.3365595784758813, + "grad_norm": 0.59375, + "learning_rate": 0.0013845892815001907, + "loss": 0.1035, + "step": 38772 + }, + { + "epoch": 0.33656825895608544, + "grad_norm": 0.126953125, + "learning_rate": 0.0013845610460258419, + "loss": 0.0986, + "step": 38773 + }, + { + "epoch": 0.33657693943628963, + "grad_norm": 0.10888671875, + "learning_rate": 0.0013845328102403015, + "loss": 0.1113, + "step": 38774 + }, + { + "epoch": 0.3365856199164938, + "grad_norm": 0.26171875, + "learning_rate": 0.0013845045741436, + "loss": 0.1118, + "step": 38775 + }, + { + "epoch": 0.33659430039669797, + "grad_norm": 0.1435546875, + "learning_rate": 0.001384476337735769, + "loss": 0.0859, + "step": 38776 + }, + { + "epoch": 0.3366029808769021, + "grad_norm": 0.1572265625, + "learning_rate": 0.0013844481010168387, + "loss": 0.1191, + "step": 38777 + }, + { + "epoch": 0.3366116613571063, + "grad_norm": 0.1259765625, + "learning_rate": 0.0013844198639868402, + "loss": 0.1465, + "step": 38778 + }, + { + "epoch": 0.33662034183731043, + "grad_norm": 0.259765625, + "learning_rate": 0.0013843916266458047, + "loss": 0.0938, + "step": 38779 + }, + { + "epoch": 0.3366290223175146, + "grad_norm": 0.15625, + "learning_rate": 0.0013843633889937625, + "loss": 0.0918, + "step": 38780 + }, + { + "epoch": 0.33663770279771876, + "grad_norm": 0.1875, + "learning_rate": 0.001384335151030745, + "loss": 0.1152, + "step": 38781 + }, + { + "epoch": 0.33664638327792296, + "grad_norm": 0.107421875, + "learning_rate": 0.0013843069127567823, + "loss": 0.1357, + "step": 38782 + }, + { + "epoch": 0.3366550637581271, + "grad_norm": 0.08154296875, + "learning_rate": 0.0013842786741719065, + "loss": 0.0903, + "step": 38783 + }, + { + "epoch": 0.3366637442383313, + "grad_norm": 0.224609375, + "learning_rate": 0.0013842504352761477, + "loss": 0.0928, + "step": 38784 + }, + { + "epoch": 0.3366724247185354, + "grad_norm": 0.1376953125, + "learning_rate": 0.0013842221960695365, + "loss": 0.0835, + "step": 38785 + }, + { + "epoch": 0.3366811051987396, + "grad_norm": 0.06494140625, + "learning_rate": 0.0013841939565521044, + "loss": 0.0967, + "step": 38786 + }, + { + "epoch": 0.33668978567894375, + "grad_norm": 0.2041015625, + "learning_rate": 0.0013841657167238822, + "loss": 0.1152, + "step": 38787 + }, + { + "epoch": 0.33669846615914795, + "grad_norm": 0.1923828125, + "learning_rate": 0.0013841374765849004, + "loss": 0.1201, + "step": 38788 + }, + { + "epoch": 0.3367071466393521, + "grad_norm": 0.10009765625, + "learning_rate": 0.0013841092361351903, + "loss": 0.085, + "step": 38789 + }, + { + "epoch": 0.3367158271195563, + "grad_norm": 0.1513671875, + "learning_rate": 0.0013840809953747828, + "loss": 0.1445, + "step": 38790 + }, + { + "epoch": 0.3367245075997604, 
+ "grad_norm": 0.1416015625, + "learning_rate": 0.001384052754303708, + "loss": 0.126, + "step": 38791 + }, + { + "epoch": 0.3367331880799646, + "grad_norm": 0.31640625, + "learning_rate": 0.001384024512921998, + "loss": 0.1021, + "step": 38792 + }, + { + "epoch": 0.33674186856016874, + "grad_norm": 0.11767578125, + "learning_rate": 0.0013839962712296828, + "loss": 0.0854, + "step": 38793 + }, + { + "epoch": 0.33675054904037294, + "grad_norm": 0.216796875, + "learning_rate": 0.0013839680292267933, + "loss": 0.1196, + "step": 38794 + }, + { + "epoch": 0.3367592295205771, + "grad_norm": 0.6171875, + "learning_rate": 0.001383939786913361, + "loss": 0.2324, + "step": 38795 + }, + { + "epoch": 0.33676791000078127, + "grad_norm": 0.376953125, + "learning_rate": 0.001383911544289416, + "loss": 0.1387, + "step": 38796 + }, + { + "epoch": 0.3367765904809854, + "grad_norm": 0.310546875, + "learning_rate": 0.0013838833013549903, + "loss": 0.1426, + "step": 38797 + }, + { + "epoch": 0.3367852709611896, + "grad_norm": 0.2255859375, + "learning_rate": 0.0013838550581101135, + "loss": 0.0786, + "step": 38798 + }, + { + "epoch": 0.33679395144139374, + "grad_norm": 0.17578125, + "learning_rate": 0.0013838268145548174, + "loss": 0.1133, + "step": 38799 + }, + { + "epoch": 0.33680263192159793, + "grad_norm": 0.71484375, + "learning_rate": 0.0013837985706891323, + "loss": 0.1318, + "step": 38800 + }, + { + "epoch": 0.33681131240180207, + "grad_norm": 0.10205078125, + "learning_rate": 0.0013837703265130894, + "loss": 0.1133, + "step": 38801 + }, + { + "epoch": 0.33681999288200626, + "grad_norm": 0.140625, + "learning_rate": 0.0013837420820267195, + "loss": 0.123, + "step": 38802 + }, + { + "epoch": 0.3368286733622104, + "grad_norm": 0.5, + "learning_rate": 0.0013837138372300537, + "loss": 0.1309, + "step": 38803 + }, + { + "epoch": 0.3368373538424146, + "grad_norm": 0.23828125, + "learning_rate": 0.0013836855921231226, + "loss": 0.1045, + "step": 38804 + }, + { + "epoch": 0.3368460343226187, + "grad_norm": 0.267578125, + "learning_rate": 0.0013836573467059575, + "loss": 0.0889, + "step": 38805 + }, + { + "epoch": 0.3368547148028229, + "grad_norm": 0.5390625, + "learning_rate": 0.0013836291009785885, + "loss": 0.0928, + "step": 38806 + }, + { + "epoch": 0.33686339528302706, + "grad_norm": 0.11865234375, + "learning_rate": 0.001383600854941047, + "loss": 0.1221, + "step": 38807 + }, + { + "epoch": 0.33687207576323125, + "grad_norm": 0.11279296875, + "learning_rate": 0.0013835726085933643, + "loss": 0.0811, + "step": 38808 + }, + { + "epoch": 0.3368807562434354, + "grad_norm": 0.154296875, + "learning_rate": 0.0013835443619355705, + "loss": 0.1113, + "step": 38809 + }, + { + "epoch": 0.3368894367236396, + "grad_norm": 0.267578125, + "learning_rate": 0.0013835161149676968, + "loss": 0.083, + "step": 38810 + }, + { + "epoch": 0.3368981172038437, + "grad_norm": 0.224609375, + "learning_rate": 0.0013834878676897745, + "loss": 0.0996, + "step": 38811 + }, + { + "epoch": 0.33690679768404785, + "grad_norm": 0.6640625, + "learning_rate": 0.0013834596201018337, + "loss": 0.0977, + "step": 38812 + }, + { + "epoch": 0.33691547816425205, + "grad_norm": 0.2392578125, + "learning_rate": 0.001383431372203906, + "loss": 0.1094, + "step": 38813 + }, + { + "epoch": 0.3369241586444562, + "grad_norm": 0.396484375, + "learning_rate": 0.001383403123996022, + "loss": 0.1143, + "step": 38814 + }, + { + "epoch": 0.3369328391246604, + "grad_norm": 0.2451171875, + "learning_rate": 0.0013833748754782124, + "loss": 0.0898, + "step": 38815 + 
}, + { + "epoch": 0.3369415196048645, + "grad_norm": 0.65234375, + "learning_rate": 0.0013833466266505083, + "loss": 0.1133, + "step": 38816 + }, + { + "epoch": 0.3369502000850687, + "grad_norm": 0.2138671875, + "learning_rate": 0.001383318377512941, + "loss": 0.0693, + "step": 38817 + }, + { + "epoch": 0.33695888056527284, + "grad_norm": 0.08984375, + "learning_rate": 0.0013832901280655407, + "loss": 0.085, + "step": 38818 + }, + { + "epoch": 0.33696756104547704, + "grad_norm": 0.42578125, + "learning_rate": 0.0013832618783083384, + "loss": 0.1074, + "step": 38819 + }, + { + "epoch": 0.3369762415256812, + "grad_norm": 0.2421875, + "learning_rate": 0.0013832336282413656, + "loss": 0.1025, + "step": 38820 + }, + { + "epoch": 0.33698492200588537, + "grad_norm": 0.482421875, + "learning_rate": 0.0013832053778646523, + "loss": 0.1475, + "step": 38821 + }, + { + "epoch": 0.3369936024860895, + "grad_norm": 0.1865234375, + "learning_rate": 0.0013831771271782304, + "loss": 0.1016, + "step": 38822 + }, + { + "epoch": 0.3370022829662937, + "grad_norm": 0.10986328125, + "learning_rate": 0.0013831488761821297, + "loss": 0.1309, + "step": 38823 + }, + { + "epoch": 0.33701096344649784, + "grad_norm": 0.1796875, + "learning_rate": 0.0013831206248763816, + "loss": 0.0938, + "step": 38824 + }, + { + "epoch": 0.33701964392670203, + "grad_norm": 0.78515625, + "learning_rate": 0.0013830923732610174, + "loss": 0.166, + "step": 38825 + }, + { + "epoch": 0.33702832440690617, + "grad_norm": 0.40625, + "learning_rate": 0.0013830641213360675, + "loss": 0.0977, + "step": 38826 + }, + { + "epoch": 0.33703700488711036, + "grad_norm": 0.193359375, + "learning_rate": 0.001383035869101563, + "loss": 0.0796, + "step": 38827 + }, + { + "epoch": 0.3370456853673145, + "grad_norm": 0.173828125, + "learning_rate": 0.0013830076165575345, + "loss": 0.1104, + "step": 38828 + }, + { + "epoch": 0.3370543658475187, + "grad_norm": 0.50390625, + "learning_rate": 0.0013829793637040136, + "loss": 0.1084, + "step": 38829 + }, + { + "epoch": 0.3370630463277228, + "grad_norm": 0.10595703125, + "learning_rate": 0.0013829511105410304, + "loss": 0.0859, + "step": 38830 + }, + { + "epoch": 0.337071726807927, + "grad_norm": 0.439453125, + "learning_rate": 0.0013829228570686161, + "loss": 0.0645, + "step": 38831 + }, + { + "epoch": 0.33708040728813116, + "grad_norm": 0.0859375, + "learning_rate": 0.0013828946032868018, + "loss": 0.0737, + "step": 38832 + }, + { + "epoch": 0.33708908776833535, + "grad_norm": 0.09765625, + "learning_rate": 0.001382866349195618, + "loss": 0.1309, + "step": 38833 + }, + { + "epoch": 0.3370977682485395, + "grad_norm": 0.2275390625, + "learning_rate": 0.0013828380947950956, + "loss": 0.1289, + "step": 38834 + }, + { + "epoch": 0.3371064487287437, + "grad_norm": 0.3046875, + "learning_rate": 0.001382809840085266, + "loss": 0.0879, + "step": 38835 + }, + { + "epoch": 0.3371151292089478, + "grad_norm": 0.1513671875, + "learning_rate": 0.00138278158506616, + "loss": 0.0713, + "step": 38836 + }, + { + "epoch": 0.337123809689152, + "grad_norm": 0.56640625, + "learning_rate": 0.0013827533297378078, + "loss": 0.1221, + "step": 38837 + }, + { + "epoch": 0.33713249016935615, + "grad_norm": 0.11865234375, + "learning_rate": 0.0013827250741002414, + "loss": 0.1289, + "step": 38838 + }, + { + "epoch": 0.33714117064956034, + "grad_norm": 0.60546875, + "learning_rate": 0.0013826968181534904, + "loss": 0.1016, + "step": 38839 + }, + { + "epoch": 0.3371498511297645, + "grad_norm": 0.1103515625, + "learning_rate": 0.001382668561897587, 
+ "loss": 0.1182, + "step": 38840 + }, + { + "epoch": 0.33715853160996867, + "grad_norm": 0.58984375, + "learning_rate": 0.0013826403053325612, + "loss": 0.1377, + "step": 38841 + }, + { + "epoch": 0.3371672120901728, + "grad_norm": 0.2060546875, + "learning_rate": 0.0013826120484584444, + "loss": 0.0801, + "step": 38842 + }, + { + "epoch": 0.337175892570377, + "grad_norm": 0.0986328125, + "learning_rate": 0.0013825837912752673, + "loss": 0.0923, + "step": 38843 + }, + { + "epoch": 0.33718457305058114, + "grad_norm": 0.55078125, + "learning_rate": 0.0013825555337830607, + "loss": 0.0952, + "step": 38844 + }, + { + "epoch": 0.33719325353078533, + "grad_norm": 0.40625, + "learning_rate": 0.0013825272759818558, + "loss": 0.1309, + "step": 38845 + }, + { + "epoch": 0.33720193401098947, + "grad_norm": 0.076171875, + "learning_rate": 0.0013824990178716832, + "loss": 0.0977, + "step": 38846 + }, + { + "epoch": 0.33721061449119366, + "grad_norm": 0.5234375, + "learning_rate": 0.0013824707594525738, + "loss": 0.1143, + "step": 38847 + }, + { + "epoch": 0.3372192949713978, + "grad_norm": 0.26953125, + "learning_rate": 0.0013824425007245586, + "loss": 0.0825, + "step": 38848 + }, + { + "epoch": 0.337227975451602, + "grad_norm": 0.3984375, + "learning_rate": 0.0013824142416876688, + "loss": 0.105, + "step": 38849 + }, + { + "epoch": 0.33723665593180613, + "grad_norm": 0.1435546875, + "learning_rate": 0.0013823859823419349, + "loss": 0.0801, + "step": 38850 + }, + { + "epoch": 0.3372453364120103, + "grad_norm": 0.302734375, + "learning_rate": 0.0013823577226873879, + "loss": 0.0786, + "step": 38851 + }, + { + "epoch": 0.33725401689221446, + "grad_norm": 0.2021484375, + "learning_rate": 0.0013823294627240588, + "loss": 0.0874, + "step": 38852 + }, + { + "epoch": 0.33726269737241865, + "grad_norm": 0.2080078125, + "learning_rate": 0.0013823012024519786, + "loss": 0.0693, + "step": 38853 + }, + { + "epoch": 0.3372713778526228, + "grad_norm": 0.341796875, + "learning_rate": 0.0013822729418711776, + "loss": 0.1025, + "step": 38854 + }, + { + "epoch": 0.337280058332827, + "grad_norm": 0.515625, + "learning_rate": 0.0013822446809816874, + "loss": 0.0762, + "step": 38855 + }, + { + "epoch": 0.3372887388130311, + "grad_norm": 0.859375, + "learning_rate": 0.0013822164197835388, + "loss": 0.1348, + "step": 38856 + }, + { + "epoch": 0.3372974192932353, + "grad_norm": 0.87109375, + "learning_rate": 0.0013821881582767623, + "loss": 0.0806, + "step": 38857 + }, + { + "epoch": 0.33730609977343945, + "grad_norm": 0.11328125, + "learning_rate": 0.001382159896461389, + "loss": 0.0898, + "step": 38858 + }, + { + "epoch": 0.33731478025364364, + "grad_norm": 0.3046875, + "learning_rate": 0.00138213163433745, + "loss": 0.1045, + "step": 38859 + }, + { + "epoch": 0.3373234607338478, + "grad_norm": 0.1181640625, + "learning_rate": 0.0013821033719049762, + "loss": 0.0918, + "step": 38860 + }, + { + "epoch": 0.337332141214052, + "grad_norm": 0.201171875, + "learning_rate": 0.0013820751091639986, + "loss": 0.1016, + "step": 38861 + }, + { + "epoch": 0.3373408216942561, + "grad_norm": 0.55078125, + "learning_rate": 0.0013820468461145474, + "loss": 0.1328, + "step": 38862 + }, + { + "epoch": 0.3373495021744603, + "grad_norm": 0.283203125, + "learning_rate": 0.0013820185827566542, + "loss": 0.0996, + "step": 38863 + }, + { + "epoch": 0.33735818265466444, + "grad_norm": 0.330078125, + "learning_rate": 0.0013819903190903499, + "loss": 0.0776, + "step": 38864 + }, + { + "epoch": 0.33736686313486863, + "grad_norm": 0.8203125, + 
"learning_rate": 0.001381962055115665, + "loss": 0.1328, + "step": 38865 + }, + { + "epoch": 0.33737554361507277, + "grad_norm": 0.103515625, + "learning_rate": 0.0013819337908326306, + "loss": 0.0913, + "step": 38866 + }, + { + "epoch": 0.33738422409527696, + "grad_norm": 0.166015625, + "learning_rate": 0.0013819055262412779, + "loss": 0.0811, + "step": 38867 + }, + { + "epoch": 0.3373929045754811, + "grad_norm": 0.40234375, + "learning_rate": 0.0013818772613416372, + "loss": 0.1094, + "step": 38868 + }, + { + "epoch": 0.3374015850556853, + "grad_norm": 0.369140625, + "learning_rate": 0.00138184899613374, + "loss": 0.1191, + "step": 38869 + }, + { + "epoch": 0.33741026553588943, + "grad_norm": 0.275390625, + "learning_rate": 0.001381820730617617, + "loss": 0.0771, + "step": 38870 + }, + { + "epoch": 0.3374189460160936, + "grad_norm": 0.2412109375, + "learning_rate": 0.0013817924647932989, + "loss": 0.1172, + "step": 38871 + }, + { + "epoch": 0.33742762649629776, + "grad_norm": 0.1259765625, + "learning_rate": 0.001381764198660817, + "loss": 0.0957, + "step": 38872 + }, + { + "epoch": 0.33743630697650195, + "grad_norm": 0.10888671875, + "learning_rate": 0.001381735932220202, + "loss": 0.0928, + "step": 38873 + }, + { + "epoch": 0.3374449874567061, + "grad_norm": 0.40625, + "learning_rate": 0.0013817076654714846, + "loss": 0.0898, + "step": 38874 + }, + { + "epoch": 0.3374536679369103, + "grad_norm": 0.3515625, + "learning_rate": 0.0013816793984146962, + "loss": 0.1602, + "step": 38875 + }, + { + "epoch": 0.3374623484171144, + "grad_norm": 0.12353515625, + "learning_rate": 0.0013816511310498672, + "loss": 0.1318, + "step": 38876 + }, + { + "epoch": 0.3374710288973186, + "grad_norm": 0.447265625, + "learning_rate": 0.0013816228633770288, + "loss": 0.0889, + "step": 38877 + }, + { + "epoch": 0.33747970937752275, + "grad_norm": 0.1572265625, + "learning_rate": 0.0013815945953962118, + "loss": 0.1118, + "step": 38878 + }, + { + "epoch": 0.33748838985772694, + "grad_norm": 0.072265625, + "learning_rate": 0.0013815663271074478, + "loss": 0.1055, + "step": 38879 + }, + { + "epoch": 0.3374970703379311, + "grad_norm": 0.248046875, + "learning_rate": 0.0013815380585107662, + "loss": 0.0796, + "step": 38880 + }, + { + "epoch": 0.3375057508181353, + "grad_norm": 0.171875, + "learning_rate": 0.0013815097896061994, + "loss": 0.0933, + "step": 38881 + }, + { + "epoch": 0.3375144312983394, + "grad_norm": 0.197265625, + "learning_rate": 0.0013814815203937777, + "loss": 0.125, + "step": 38882 + }, + { + "epoch": 0.3375231117785436, + "grad_norm": 0.10498046875, + "learning_rate": 0.0013814532508735317, + "loss": 0.0879, + "step": 38883 + }, + { + "epoch": 0.33753179225874774, + "grad_norm": 0.2294921875, + "learning_rate": 0.001381424981045493, + "loss": 0.0884, + "step": 38884 + }, + { + "epoch": 0.33754047273895194, + "grad_norm": 0.60546875, + "learning_rate": 0.0013813967109096918, + "loss": 0.1152, + "step": 38885 + }, + { + "epoch": 0.3375491532191561, + "grad_norm": 0.376953125, + "learning_rate": 0.0013813684404661596, + "loss": 0.1309, + "step": 38886 + }, + { + "epoch": 0.33755783369936027, + "grad_norm": 0.3125, + "learning_rate": 0.001381340169714927, + "loss": 0.1216, + "step": 38887 + }, + { + "epoch": 0.3375665141795644, + "grad_norm": 0.11279296875, + "learning_rate": 0.0013813118986560251, + "loss": 0.085, + "step": 38888 + }, + { + "epoch": 0.3375751946597686, + "grad_norm": 0.302734375, + "learning_rate": 0.0013812836272894845, + "loss": 0.0723, + "step": 38889 + }, + { + "epoch": 
0.33758387513997273, + "grad_norm": 0.07275390625, + "learning_rate": 0.001381255355615337, + "loss": 0.1245, + "step": 38890 + }, + { + "epoch": 0.3375925556201769, + "grad_norm": 0.11083984375, + "learning_rate": 0.0013812270836336123, + "loss": 0.1143, + "step": 38891 + }, + { + "epoch": 0.33760123610038106, + "grad_norm": 0.09130859375, + "learning_rate": 0.0013811988113443421, + "loss": 0.1035, + "step": 38892 + }, + { + "epoch": 0.33760991658058526, + "grad_norm": 0.08837890625, + "learning_rate": 0.0013811705387475571, + "loss": 0.0786, + "step": 38893 + }, + { + "epoch": 0.3376185970607894, + "grad_norm": 0.16796875, + "learning_rate": 0.0013811422658432883, + "loss": 0.127, + "step": 38894 + }, + { + "epoch": 0.3376272775409936, + "grad_norm": 0.2333984375, + "learning_rate": 0.0013811139926315664, + "loss": 0.0698, + "step": 38895 + }, + { + "epoch": 0.3376359580211977, + "grad_norm": 0.326171875, + "learning_rate": 0.0013810857191124224, + "loss": 0.0859, + "step": 38896 + }, + { + "epoch": 0.3376446385014019, + "grad_norm": 0.18359375, + "learning_rate": 0.0013810574452858874, + "loss": 0.1016, + "step": 38897 + }, + { + "epoch": 0.33765331898160605, + "grad_norm": 0.1328125, + "learning_rate": 0.001381029171151992, + "loss": 0.083, + "step": 38898 + }, + { + "epoch": 0.33766199946181025, + "grad_norm": 0.1806640625, + "learning_rate": 0.0013810008967107676, + "loss": 0.1172, + "step": 38899 + }, + { + "epoch": 0.3376706799420144, + "grad_norm": 0.146484375, + "learning_rate": 0.0013809726219622448, + "loss": 0.1196, + "step": 38900 + }, + { + "epoch": 0.3376793604222186, + "grad_norm": 0.1259765625, + "learning_rate": 0.0013809443469064544, + "loss": 0.1377, + "step": 38901 + }, + { + "epoch": 0.3376880409024227, + "grad_norm": 0.279296875, + "learning_rate": 0.0013809160715434275, + "loss": 0.0859, + "step": 38902 + }, + { + "epoch": 0.3376967213826269, + "grad_norm": 0.482421875, + "learning_rate": 0.001380887795873195, + "loss": 0.1377, + "step": 38903 + }, + { + "epoch": 0.33770540186283105, + "grad_norm": 0.1298828125, + "learning_rate": 0.001380859519895788, + "loss": 0.0981, + "step": 38904 + }, + { + "epoch": 0.33771408234303524, + "grad_norm": 0.33203125, + "learning_rate": 0.0013808312436112374, + "loss": 0.0593, + "step": 38905 + }, + { + "epoch": 0.3377227628232394, + "grad_norm": 0.1298828125, + "learning_rate": 0.0013808029670195733, + "loss": 0.1152, + "step": 38906 + }, + { + "epoch": 0.33773144330344357, + "grad_norm": 0.08349609375, + "learning_rate": 0.0013807746901208279, + "loss": 0.1094, + "step": 38907 + }, + { + "epoch": 0.3377401237836477, + "grad_norm": 0.7578125, + "learning_rate": 0.0013807464129150313, + "loss": 0.1147, + "step": 38908 + }, + { + "epoch": 0.3377488042638519, + "grad_norm": 0.130859375, + "learning_rate": 0.0013807181354022148, + "loss": 0.0923, + "step": 38909 + }, + { + "epoch": 0.33775748474405604, + "grad_norm": 0.412109375, + "learning_rate": 0.0013806898575824088, + "loss": 0.1611, + "step": 38910 + }, + { + "epoch": 0.33776616522426023, + "grad_norm": 0.10546875, + "learning_rate": 0.0013806615794556447, + "loss": 0.1104, + "step": 38911 + }, + { + "epoch": 0.33777484570446437, + "grad_norm": 0.302734375, + "learning_rate": 0.001380633301021954, + "loss": 0.1016, + "step": 38912 + }, + { + "epoch": 0.33778352618466856, + "grad_norm": 0.10302734375, + "learning_rate": 0.0013806050222813662, + "loss": 0.0747, + "step": 38913 + }, + { + "epoch": 0.3377922066648727, + "grad_norm": 0.078125, + "learning_rate": 
0.0013805767432339131, + "loss": 0.1035, + "step": 38914 + }, + { + "epoch": 0.3378008871450769, + "grad_norm": 0.2255859375, + "learning_rate": 0.0013805484638796254, + "loss": 0.1113, + "step": 38915 + }, + { + "epoch": 0.337809567625281, + "grad_norm": 0.2275390625, + "learning_rate": 0.001380520184218534, + "loss": 0.0903, + "step": 38916 + }, + { + "epoch": 0.3378182481054852, + "grad_norm": 0.46875, + "learning_rate": 0.0013804919042506702, + "loss": 0.1123, + "step": 38917 + }, + { + "epoch": 0.33782692858568936, + "grad_norm": 0.39453125, + "learning_rate": 0.0013804636239760648, + "loss": 0.1289, + "step": 38918 + }, + { + "epoch": 0.33783560906589355, + "grad_norm": 0.2412109375, + "learning_rate": 0.0013804353433947486, + "loss": 0.0698, + "step": 38919 + }, + { + "epoch": 0.3378442895460977, + "grad_norm": 0.0830078125, + "learning_rate": 0.001380407062506752, + "loss": 0.1064, + "step": 38920 + }, + { + "epoch": 0.3378529700263019, + "grad_norm": 0.400390625, + "learning_rate": 0.001380378781312107, + "loss": 0.0918, + "step": 38921 + }, + { + "epoch": 0.337861650506506, + "grad_norm": 0.1064453125, + "learning_rate": 0.001380350499810844, + "loss": 0.0938, + "step": 38922 + }, + { + "epoch": 0.3378703309867102, + "grad_norm": 0.1689453125, + "learning_rate": 0.0013803222180029935, + "loss": 0.1191, + "step": 38923 + }, + { + "epoch": 0.33787901146691435, + "grad_norm": 0.197265625, + "learning_rate": 0.001380293935888587, + "loss": 0.1133, + "step": 38924 + }, + { + "epoch": 0.33788769194711854, + "grad_norm": 0.6015625, + "learning_rate": 0.0013802656534676557, + "loss": 0.0713, + "step": 38925 + }, + { + "epoch": 0.3378963724273227, + "grad_norm": 0.177734375, + "learning_rate": 0.0013802373707402296, + "loss": 0.125, + "step": 38926 + }, + { + "epoch": 0.33790505290752687, + "grad_norm": 0.16796875, + "learning_rate": 0.0013802090877063403, + "loss": 0.084, + "step": 38927 + }, + { + "epoch": 0.337913733387731, + "grad_norm": 0.1083984375, + "learning_rate": 0.0013801808043660182, + "loss": 0.1289, + "step": 38928 + }, + { + "epoch": 0.3379224138679352, + "grad_norm": 0.125, + "learning_rate": 0.0013801525207192949, + "loss": 0.0957, + "step": 38929 + }, + { + "epoch": 0.33793109434813934, + "grad_norm": 0.27734375, + "learning_rate": 0.0013801242367662008, + "loss": 0.1143, + "step": 38930 + }, + { + "epoch": 0.33793977482834353, + "grad_norm": 0.47265625, + "learning_rate": 0.0013800959525067675, + "loss": 0.126, + "step": 38931 + }, + { + "epoch": 0.33794845530854767, + "grad_norm": 0.162109375, + "learning_rate": 0.001380067667941025, + "loss": 0.0933, + "step": 38932 + }, + { + "epoch": 0.33795713578875186, + "grad_norm": 0.13671875, + "learning_rate": 0.001380039383069005, + "loss": 0.1074, + "step": 38933 + }, + { + "epoch": 0.337965816268956, + "grad_norm": 0.142578125, + "learning_rate": 0.001380011097890738, + "loss": 0.0977, + "step": 38934 + }, + { + "epoch": 0.33797449674916014, + "grad_norm": 0.2333984375, + "learning_rate": 0.001379982812406255, + "loss": 0.0806, + "step": 38935 + }, + { + "epoch": 0.33798317722936433, + "grad_norm": 0.27734375, + "learning_rate": 0.0013799545266155872, + "loss": 0.1338, + "step": 38936 + }, + { + "epoch": 0.33799185770956847, + "grad_norm": 0.11181640625, + "learning_rate": 0.0013799262405187651, + "loss": 0.0996, + "step": 38937 + }, + { + "epoch": 0.33800053818977266, + "grad_norm": 0.5078125, + "learning_rate": 0.0013798979541158201, + "loss": 0.1289, + "step": 38938 + }, + { + "epoch": 0.3380092186699768, + "grad_norm": 
0.1181640625, + "learning_rate": 0.0013798696674067828, + "loss": 0.0996, + "step": 38939 + }, + { + "epoch": 0.338017899150181, + "grad_norm": 0.314453125, + "learning_rate": 0.0013798413803916842, + "loss": 0.2041, + "step": 38940 + }, + { + "epoch": 0.3380265796303851, + "grad_norm": 0.1611328125, + "learning_rate": 0.0013798130930705553, + "loss": 0.0947, + "step": 38941 + }, + { + "epoch": 0.3380352601105893, + "grad_norm": 0.15625, + "learning_rate": 0.0013797848054434272, + "loss": 0.0781, + "step": 38942 + }, + { + "epoch": 0.33804394059079346, + "grad_norm": 0.267578125, + "learning_rate": 0.0013797565175103304, + "loss": 0.1357, + "step": 38943 + }, + { + "epoch": 0.33805262107099765, + "grad_norm": 0.421875, + "learning_rate": 0.001379728229271296, + "loss": 0.0732, + "step": 38944 + }, + { + "epoch": 0.3380613015512018, + "grad_norm": 0.2333984375, + "learning_rate": 0.0013796999407263554, + "loss": 0.1016, + "step": 38945 + }, + { + "epoch": 0.338069982031406, + "grad_norm": 0.83203125, + "learning_rate": 0.0013796716518755386, + "loss": 0.0835, + "step": 38946 + }, + { + "epoch": 0.3380786625116101, + "grad_norm": 0.41015625, + "learning_rate": 0.0013796433627188773, + "loss": 0.1367, + "step": 38947 + }, + { + "epoch": 0.3380873429918143, + "grad_norm": 2.21875, + "learning_rate": 0.0013796150732564025, + "loss": 0.209, + "step": 38948 + }, + { + "epoch": 0.33809602347201845, + "grad_norm": 0.7421875, + "learning_rate": 0.0013795867834881445, + "loss": 0.0996, + "step": 38949 + }, + { + "epoch": 0.33810470395222264, + "grad_norm": 0.25390625, + "learning_rate": 0.001379558493414135, + "loss": 0.1367, + "step": 38950 + }, + { + "epoch": 0.3381133844324268, + "grad_norm": 0.416015625, + "learning_rate": 0.001379530203034404, + "loss": 0.0732, + "step": 38951 + }, + { + "epoch": 0.33812206491263097, + "grad_norm": 0.181640625, + "learning_rate": 0.0013795019123489837, + "loss": 0.1206, + "step": 38952 + }, + { + "epoch": 0.3381307453928351, + "grad_norm": 0.078125, + "learning_rate": 0.0013794736213579035, + "loss": 0.064, + "step": 38953 + }, + { + "epoch": 0.3381394258730393, + "grad_norm": 0.43359375, + "learning_rate": 0.0013794453300611956, + "loss": 0.0908, + "step": 38954 + }, + { + "epoch": 0.33814810635324344, + "grad_norm": 0.2578125, + "learning_rate": 0.0013794170384588901, + "loss": 0.084, + "step": 38955 + }, + { + "epoch": 0.33815678683344763, + "grad_norm": 0.0732421875, + "learning_rate": 0.001379388746551019, + "loss": 0.0737, + "step": 38956 + }, + { + "epoch": 0.33816546731365177, + "grad_norm": 0.10400390625, + "learning_rate": 0.001379360454337612, + "loss": 0.0986, + "step": 38957 + }, + { + "epoch": 0.33817414779385596, + "grad_norm": 0.275390625, + "learning_rate": 0.0013793321618187006, + "loss": 0.1064, + "step": 38958 + }, + { + "epoch": 0.3381828282740601, + "grad_norm": 0.2099609375, + "learning_rate": 0.0013793038689943163, + "loss": 0.0981, + "step": 38959 + }, + { + "epoch": 0.3381915087542643, + "grad_norm": 0.326171875, + "learning_rate": 0.0013792755758644892, + "loss": 0.0801, + "step": 38960 + }, + { + "epoch": 0.33820018923446843, + "grad_norm": 0.181640625, + "learning_rate": 0.0013792472824292507, + "loss": 0.123, + "step": 38961 + }, + { + "epoch": 0.3382088697146726, + "grad_norm": 0.130859375, + "learning_rate": 0.0013792189886886307, + "loss": 0.1089, + "step": 38962 + }, + { + "epoch": 0.33821755019487676, + "grad_norm": 0.1435546875, + "learning_rate": 0.001379190694642662, + "loss": 0.0669, + "step": 38963 + }, + { + "epoch": 
0.33822623067508095, + "grad_norm": 0.251953125, + "learning_rate": 0.001379162400291374, + "loss": 0.1006, + "step": 38964 + }, + { + "epoch": 0.3382349111552851, + "grad_norm": 0.142578125, + "learning_rate": 0.0013791341056347984, + "loss": 0.1001, + "step": 38965 + }, + { + "epoch": 0.3382435916354893, + "grad_norm": 0.08544921875, + "learning_rate": 0.0013791058106729656, + "loss": 0.0801, + "step": 38966 + }, + { + "epoch": 0.3382522721156934, + "grad_norm": 0.14453125, + "learning_rate": 0.0013790775154059074, + "loss": 0.0972, + "step": 38967 + }, + { + "epoch": 0.3382609525958976, + "grad_norm": 0.37890625, + "learning_rate": 0.0013790492198336541, + "loss": 0.1641, + "step": 38968 + }, + { + "epoch": 0.33826963307610175, + "grad_norm": 0.474609375, + "learning_rate": 0.0013790209239562365, + "loss": 0.0742, + "step": 38969 + }, + { + "epoch": 0.33827831355630594, + "grad_norm": 0.294921875, + "learning_rate": 0.0013789926277736862, + "loss": 0.1045, + "step": 38970 + }, + { + "epoch": 0.3382869940365101, + "grad_norm": 0.1826171875, + "learning_rate": 0.0013789643312860332, + "loss": 0.0977, + "step": 38971 + }, + { + "epoch": 0.3382956745167143, + "grad_norm": 0.451171875, + "learning_rate": 0.0013789360344933096, + "loss": 0.123, + "step": 38972 + }, + { + "epoch": 0.3383043549969184, + "grad_norm": 0.6796875, + "learning_rate": 0.0013789077373955452, + "loss": 0.2227, + "step": 38973 + }, + { + "epoch": 0.3383130354771226, + "grad_norm": 0.2021484375, + "learning_rate": 0.001378879439992772, + "loss": 0.0811, + "step": 38974 + }, + { + "epoch": 0.33832171595732674, + "grad_norm": 0.54296875, + "learning_rate": 0.00137885114228502, + "loss": 0.127, + "step": 38975 + }, + { + "epoch": 0.33833039643753093, + "grad_norm": 0.07861328125, + "learning_rate": 0.001378822844272321, + "loss": 0.0654, + "step": 38976 + }, + { + "epoch": 0.33833907691773507, + "grad_norm": 0.6953125, + "learning_rate": 0.001378794545954705, + "loss": 0.1309, + "step": 38977 + }, + { + "epoch": 0.33834775739793926, + "grad_norm": 0.2197265625, + "learning_rate": 0.001378766247332204, + "loss": 0.083, + "step": 38978 + }, + { + "epoch": 0.3383564378781434, + "grad_norm": 0.59765625, + "learning_rate": 0.001378737948404848, + "loss": 0.1191, + "step": 38979 + }, + { + "epoch": 0.3383651183583476, + "grad_norm": 0.46484375, + "learning_rate": 0.0013787096491726686, + "loss": 0.1172, + "step": 38980 + }, + { + "epoch": 0.33837379883855173, + "grad_norm": 0.1162109375, + "learning_rate": 0.0013786813496356968, + "loss": 0.1211, + "step": 38981 + }, + { + "epoch": 0.3383824793187559, + "grad_norm": 0.10888671875, + "learning_rate": 0.0013786530497939629, + "loss": 0.1094, + "step": 38982 + }, + { + "epoch": 0.33839115979896006, + "grad_norm": 0.08984375, + "learning_rate": 0.0013786247496474981, + "loss": 0.0898, + "step": 38983 + }, + { + "epoch": 0.33839984027916425, + "grad_norm": 0.322265625, + "learning_rate": 0.0013785964491963337, + "loss": 0.0728, + "step": 38984 + }, + { + "epoch": 0.3384085207593684, + "grad_norm": 0.5625, + "learning_rate": 0.0013785681484405003, + "loss": 0.1084, + "step": 38985 + }, + { + "epoch": 0.3384172012395726, + "grad_norm": 0.20703125, + "learning_rate": 0.001378539847380029, + "loss": 0.0781, + "step": 38986 + }, + { + "epoch": 0.3384258817197767, + "grad_norm": 0.1103515625, + "learning_rate": 0.0013785115460149507, + "loss": 0.126, + "step": 38987 + }, + { + "epoch": 0.3384345621999809, + "grad_norm": 0.1396484375, + "learning_rate": 0.0013784832443452964, + "loss": 
0.0962, + "step": 38988 + }, + { + "epoch": 0.33844324268018505, + "grad_norm": 0.2197265625, + "learning_rate": 0.0013784549423710968, + "loss": 0.0996, + "step": 38989 + }, + { + "epoch": 0.33845192316038925, + "grad_norm": 0.193359375, + "learning_rate": 0.0013784266400923835, + "loss": 0.1157, + "step": 38990 + }, + { + "epoch": 0.3384606036405934, + "grad_norm": 0.1796875, + "learning_rate": 0.0013783983375091867, + "loss": 0.1562, + "step": 38991 + }, + { + "epoch": 0.3384692841207976, + "grad_norm": 0.46484375, + "learning_rate": 0.0013783700346215375, + "loss": 0.0977, + "step": 38992 + }, + { + "epoch": 0.3384779646010017, + "grad_norm": 0.1884765625, + "learning_rate": 0.001378341731429467, + "loss": 0.1064, + "step": 38993 + }, + { + "epoch": 0.3384866450812059, + "grad_norm": 0.515625, + "learning_rate": 0.0013783134279330065, + "loss": 0.0996, + "step": 38994 + }, + { + "epoch": 0.33849532556141004, + "grad_norm": 0.2099609375, + "learning_rate": 0.0013782851241321864, + "loss": 0.1035, + "step": 38995 + }, + { + "epoch": 0.33850400604161424, + "grad_norm": 0.091796875, + "learning_rate": 0.0013782568200270383, + "loss": 0.1045, + "step": 38996 + }, + { + "epoch": 0.3385126865218184, + "grad_norm": 0.27734375, + "learning_rate": 0.001378228515617592, + "loss": 0.0649, + "step": 38997 + }, + { + "epoch": 0.33852136700202257, + "grad_norm": 0.326171875, + "learning_rate": 0.0013782002109038796, + "loss": 0.1104, + "step": 38998 + }, + { + "epoch": 0.3385300474822267, + "grad_norm": 0.1650390625, + "learning_rate": 0.0013781719058859317, + "loss": 0.1143, + "step": 38999 + }, + { + "epoch": 0.3385387279624309, + "grad_norm": 0.203125, + "learning_rate": 0.0013781436005637791, + "loss": 0.0854, + "step": 39000 + }, + { + "epoch": 0.33854740844263503, + "grad_norm": 0.10107421875, + "learning_rate": 0.0013781152949374528, + "loss": 0.0923, + "step": 39001 + }, + { + "epoch": 0.3385560889228392, + "grad_norm": 0.09716796875, + "learning_rate": 0.0013780869890069839, + "loss": 0.0952, + "step": 39002 + }, + { + "epoch": 0.33856476940304336, + "grad_norm": 0.111328125, + "learning_rate": 0.0013780586827724028, + "loss": 0.0776, + "step": 39003 + }, + { + "epoch": 0.33857344988324756, + "grad_norm": 0.10498046875, + "learning_rate": 0.0013780303762337414, + "loss": 0.0889, + "step": 39004 + }, + { + "epoch": 0.3385821303634517, + "grad_norm": 0.244140625, + "learning_rate": 0.0013780020693910297, + "loss": 0.1055, + "step": 39005 + }, + { + "epoch": 0.3385908108436559, + "grad_norm": 0.1259765625, + "learning_rate": 0.0013779737622442996, + "loss": 0.0854, + "step": 39006 + }, + { + "epoch": 0.33859949132386, + "grad_norm": 0.126953125, + "learning_rate": 0.0013779454547935815, + "loss": 0.123, + "step": 39007 + }, + { + "epoch": 0.3386081718040642, + "grad_norm": 0.2255859375, + "learning_rate": 0.0013779171470389064, + "loss": 0.0811, + "step": 39008 + }, + { + "epoch": 0.33861685228426835, + "grad_norm": 0.255859375, + "learning_rate": 0.0013778888389803052, + "loss": 0.0732, + "step": 39009 + }, + { + "epoch": 0.33862553276447255, + "grad_norm": 0.302734375, + "learning_rate": 0.0013778605306178089, + "loss": 0.0801, + "step": 39010 + }, + { + "epoch": 0.3386342132446767, + "grad_norm": 0.53515625, + "learning_rate": 0.0013778322219514486, + "loss": 0.0698, + "step": 39011 + }, + { + "epoch": 0.3386428937248809, + "grad_norm": 0.197265625, + "learning_rate": 0.0013778039129812548, + "loss": 0.0967, + "step": 39012 + }, + { + "epoch": 0.338651574205085, + "grad_norm": 0.09130859375, 
+ "learning_rate": 0.0013777756037072592, + "loss": 0.0845, + "step": 39013 + }, + { + "epoch": 0.3386602546852892, + "grad_norm": 0.451171875, + "learning_rate": 0.0013777472941294922, + "loss": 0.1055, + "step": 39014 + }, + { + "epoch": 0.33866893516549335, + "grad_norm": 0.349609375, + "learning_rate": 0.0013777189842479852, + "loss": 0.1445, + "step": 39015 + }, + { + "epoch": 0.33867761564569754, + "grad_norm": 0.68359375, + "learning_rate": 0.0013776906740627687, + "loss": 0.123, + "step": 39016 + }, + { + "epoch": 0.3386862961259017, + "grad_norm": 0.55859375, + "learning_rate": 0.001377662363573874, + "loss": 0.1157, + "step": 39017 + }, + { + "epoch": 0.33869497660610587, + "grad_norm": 0.515625, + "learning_rate": 0.0013776340527813316, + "loss": 0.0811, + "step": 39018 + }, + { + "epoch": 0.33870365708631, + "grad_norm": 0.125, + "learning_rate": 0.001377605741685173, + "loss": 0.127, + "step": 39019 + }, + { + "epoch": 0.3387123375665142, + "grad_norm": 0.20703125, + "learning_rate": 0.0013775774302854289, + "loss": 0.1348, + "step": 39020 + }, + { + "epoch": 0.33872101804671834, + "grad_norm": 0.2294921875, + "learning_rate": 0.00137754911858213, + "loss": 0.106, + "step": 39021 + }, + { + "epoch": 0.33872969852692253, + "grad_norm": 0.138671875, + "learning_rate": 0.001377520806575308, + "loss": 0.1006, + "step": 39022 + }, + { + "epoch": 0.33873837900712667, + "grad_norm": 0.46875, + "learning_rate": 0.0013774924942649932, + "loss": 0.0908, + "step": 39023 + }, + { + "epoch": 0.33874705948733086, + "grad_norm": 0.7109375, + "learning_rate": 0.0013774641816512165, + "loss": 0.1299, + "step": 39024 + }, + { + "epoch": 0.338755739967535, + "grad_norm": 0.080078125, + "learning_rate": 0.0013774358687340098, + "loss": 0.1001, + "step": 39025 + }, + { + "epoch": 0.3387644204477392, + "grad_norm": 0.5546875, + "learning_rate": 0.001377407555513403, + "loss": 0.1182, + "step": 39026 + }, + { + "epoch": 0.3387731009279433, + "grad_norm": 0.208984375, + "learning_rate": 0.0013773792419894276, + "loss": 0.0967, + "step": 39027 + }, + { + "epoch": 0.3387817814081475, + "grad_norm": 0.134765625, + "learning_rate": 0.001377350928162114, + "loss": 0.1045, + "step": 39028 + }, + { + "epoch": 0.33879046188835166, + "grad_norm": 0.3515625, + "learning_rate": 0.0013773226140314943, + "loss": 0.1104, + "step": 39029 + }, + { + "epoch": 0.33879914236855585, + "grad_norm": 0.314453125, + "learning_rate": 0.0013772942995975983, + "loss": 0.0889, + "step": 39030 + }, + { + "epoch": 0.33880782284876, + "grad_norm": 0.3671875, + "learning_rate": 0.0013772659848604574, + "loss": 0.1426, + "step": 39031 + }, + { + "epoch": 0.3388165033289642, + "grad_norm": 0.35546875, + "learning_rate": 0.0013772376698201027, + "loss": 0.1074, + "step": 39032 + }, + { + "epoch": 0.3388251838091683, + "grad_norm": 0.232421875, + "learning_rate": 0.001377209354476565, + "loss": 0.0947, + "step": 39033 + }, + { + "epoch": 0.3388338642893725, + "grad_norm": 0.19921875, + "learning_rate": 0.0013771810388298756, + "loss": 0.1152, + "step": 39034 + }, + { + "epoch": 0.33884254476957665, + "grad_norm": 0.359375, + "learning_rate": 0.0013771527228800648, + "loss": 0.0825, + "step": 39035 + }, + { + "epoch": 0.33885122524978084, + "grad_norm": 0.052978515625, + "learning_rate": 0.0013771244066271642, + "loss": 0.0439, + "step": 39036 + }, + { + "epoch": 0.338859905729985, + "grad_norm": 0.322265625, + "learning_rate": 0.0013770960900712041, + "loss": 0.1001, + "step": 39037 + }, + { + "epoch": 0.33886858621018917, + 
"grad_norm": 0.16015625, + "learning_rate": 0.0013770677732122165, + "loss": 0.1162, + "step": 39038 + }, + { + "epoch": 0.3388772666903933, + "grad_norm": 0.11328125, + "learning_rate": 0.0013770394560502313, + "loss": 0.1074, + "step": 39039 + }, + { + "epoch": 0.3388859471705975, + "grad_norm": 0.302734375, + "learning_rate": 0.0013770111385852802, + "loss": 0.0859, + "step": 39040 + }, + { + "epoch": 0.33889462765080164, + "grad_norm": 0.6484375, + "learning_rate": 0.0013769828208173936, + "loss": 0.1445, + "step": 39041 + }, + { + "epoch": 0.33890330813100583, + "grad_norm": 0.408203125, + "learning_rate": 0.001376954502746603, + "loss": 0.105, + "step": 39042 + }, + { + "epoch": 0.33891198861120997, + "grad_norm": 0.5390625, + "learning_rate": 0.001376926184372939, + "loss": 0.1021, + "step": 39043 + }, + { + "epoch": 0.33892066909141416, + "grad_norm": 0.232421875, + "learning_rate": 0.0013768978656964325, + "loss": 0.1289, + "step": 39044 + }, + { + "epoch": 0.3389293495716183, + "grad_norm": 0.345703125, + "learning_rate": 0.0013768695467171147, + "loss": 0.0928, + "step": 39045 + }, + { + "epoch": 0.3389380300518225, + "grad_norm": 0.08544921875, + "learning_rate": 0.0013768412274350167, + "loss": 0.085, + "step": 39046 + }, + { + "epoch": 0.33894671053202663, + "grad_norm": 0.1650390625, + "learning_rate": 0.0013768129078501694, + "loss": 0.0986, + "step": 39047 + }, + { + "epoch": 0.3389553910122308, + "grad_norm": 0.171875, + "learning_rate": 0.0013767845879626036, + "loss": 0.126, + "step": 39048 + }, + { + "epoch": 0.33896407149243496, + "grad_norm": 0.287109375, + "learning_rate": 0.00137675626777235, + "loss": 0.1406, + "step": 39049 + }, + { + "epoch": 0.33897275197263915, + "grad_norm": 0.318359375, + "learning_rate": 0.0013767279472794403, + "loss": 0.0771, + "step": 39050 + }, + { + "epoch": 0.3389814324528433, + "grad_norm": 0.58984375, + "learning_rate": 0.0013766996264839046, + "loss": 0.0815, + "step": 39051 + }, + { + "epoch": 0.3389901129330475, + "grad_norm": 0.115234375, + "learning_rate": 0.001376671305385775, + "loss": 0.1045, + "step": 39052 + }, + { + "epoch": 0.3389987934132516, + "grad_norm": 0.15234375, + "learning_rate": 0.001376642983985081, + "loss": 0.1084, + "step": 39053 + }, + { + "epoch": 0.3390074738934558, + "grad_norm": 0.0830078125, + "learning_rate": 0.0013766146622818552, + "loss": 0.0928, + "step": 39054 + }, + { + "epoch": 0.33901615437365995, + "grad_norm": 0.1826171875, + "learning_rate": 0.0013765863402761275, + "loss": 0.1128, + "step": 39055 + }, + { + "epoch": 0.33902483485386414, + "grad_norm": 0.1904296875, + "learning_rate": 0.001376558017967929, + "loss": 0.1172, + "step": 39056 + }, + { + "epoch": 0.3390335153340683, + "grad_norm": 0.40625, + "learning_rate": 0.0013765296953572909, + "loss": 0.106, + "step": 39057 + }, + { + "epoch": 0.3390421958142724, + "grad_norm": 0.087890625, + "learning_rate": 0.0013765013724442439, + "loss": 0.1021, + "step": 39058 + }, + { + "epoch": 0.3390508762944766, + "grad_norm": 0.1201171875, + "learning_rate": 0.0013764730492288196, + "loss": 0.0898, + "step": 39059 + }, + { + "epoch": 0.33905955677468075, + "grad_norm": 0.234375, + "learning_rate": 0.001376444725711048, + "loss": 0.1201, + "step": 39060 + }, + { + "epoch": 0.33906823725488494, + "grad_norm": 0.59765625, + "learning_rate": 0.001376416401890961, + "loss": 0.0703, + "step": 39061 + }, + { + "epoch": 0.3390769177350891, + "grad_norm": 0.73046875, + "learning_rate": 0.0013763880777685888, + "loss": 0.0815, + "step": 39062 + }, + { + 
"epoch": 0.33908559821529327, + "grad_norm": 0.2021484375, + "learning_rate": 0.001376359753343963, + "loss": 0.0825, + "step": 39063 + }, + { + "epoch": 0.3390942786954974, + "grad_norm": 0.154296875, + "learning_rate": 0.0013763314286171144, + "loss": 0.1123, + "step": 39064 + }, + { + "epoch": 0.3391029591757016, + "grad_norm": 0.357421875, + "learning_rate": 0.0013763031035880738, + "loss": 0.1123, + "step": 39065 + }, + { + "epoch": 0.33911163965590574, + "grad_norm": 0.1484375, + "learning_rate": 0.0013762747782568722, + "loss": 0.0981, + "step": 39066 + }, + { + "epoch": 0.33912032013610993, + "grad_norm": 0.181640625, + "learning_rate": 0.0013762464526235407, + "loss": 0.1396, + "step": 39067 + }, + { + "epoch": 0.33912900061631407, + "grad_norm": 0.1103515625, + "learning_rate": 0.00137621812668811, + "loss": 0.1089, + "step": 39068 + }, + { + "epoch": 0.33913768109651826, + "grad_norm": 0.328125, + "learning_rate": 0.001376189800450612, + "loss": 0.1221, + "step": 39069 + }, + { + "epoch": 0.3391463615767224, + "grad_norm": 0.291015625, + "learning_rate": 0.0013761614739110763, + "loss": 0.1074, + "step": 39070 + }, + { + "epoch": 0.3391550420569266, + "grad_norm": 2.234375, + "learning_rate": 0.0013761331470695346, + "loss": 0.1436, + "step": 39071 + }, + { + "epoch": 0.33916372253713073, + "grad_norm": 0.37890625, + "learning_rate": 0.0013761048199260183, + "loss": 0.1123, + "step": 39072 + }, + { + "epoch": 0.3391724030173349, + "grad_norm": 0.8671875, + "learning_rate": 0.0013760764924805576, + "loss": 0.1406, + "step": 39073 + }, + { + "epoch": 0.33918108349753906, + "grad_norm": 0.421875, + "learning_rate": 0.0013760481647331838, + "loss": 0.1094, + "step": 39074 + }, + { + "epoch": 0.33918976397774325, + "grad_norm": 0.1103515625, + "learning_rate": 0.001376019836683928, + "loss": 0.0801, + "step": 39075 + }, + { + "epoch": 0.3391984444579474, + "grad_norm": 0.984375, + "learning_rate": 0.001375991508332821, + "loss": 0.1133, + "step": 39076 + }, + { + "epoch": 0.3392071249381516, + "grad_norm": 0.267578125, + "learning_rate": 0.001375963179679894, + "loss": 0.127, + "step": 39077 + }, + { + "epoch": 0.3392158054183557, + "grad_norm": 0.69921875, + "learning_rate": 0.0013759348507251775, + "loss": 0.1187, + "step": 39078 + }, + { + "epoch": 0.3392244858985599, + "grad_norm": 0.2158203125, + "learning_rate": 0.0013759065214687027, + "loss": 0.0903, + "step": 39079 + }, + { + "epoch": 0.33923316637876405, + "grad_norm": 0.1416015625, + "learning_rate": 0.001375878191910501, + "loss": 0.0859, + "step": 39080 + }, + { + "epoch": 0.33924184685896824, + "grad_norm": 0.1572265625, + "learning_rate": 0.0013758498620506027, + "loss": 0.1177, + "step": 39081 + }, + { + "epoch": 0.3392505273391724, + "grad_norm": 0.8046875, + "learning_rate": 0.0013758215318890397, + "loss": 0.0864, + "step": 39082 + }, + { + "epoch": 0.3392592078193766, + "grad_norm": 0.10888671875, + "learning_rate": 0.001375793201425842, + "loss": 0.1299, + "step": 39083 + }, + { + "epoch": 0.3392678882995807, + "grad_norm": 0.27734375, + "learning_rate": 0.0013757648706610411, + "loss": 0.0752, + "step": 39084 + }, + { + "epoch": 0.3392765687797849, + "grad_norm": 0.087890625, + "learning_rate": 0.001375736539594668, + "loss": 0.1069, + "step": 39085 + }, + { + "epoch": 0.33928524925998904, + "grad_norm": 0.212890625, + "learning_rate": 0.0013757082082267536, + "loss": 0.1104, + "step": 39086 + }, + { + "epoch": 0.33929392974019323, + "grad_norm": 0.376953125, + "learning_rate": 0.0013756798765573283, + "loss": 
0.1123, + "step": 39087 + }, + { + "epoch": 0.33930261022039737, + "grad_norm": 0.29296875, + "learning_rate": 0.0013756515445864242, + "loss": 0.1094, + "step": 39088 + }, + { + "epoch": 0.33931129070060156, + "grad_norm": 0.279296875, + "learning_rate": 0.0013756232123140716, + "loss": 0.1138, + "step": 39089 + }, + { + "epoch": 0.3393199711808057, + "grad_norm": 0.150390625, + "learning_rate": 0.0013755948797403016, + "loss": 0.1172, + "step": 39090 + }, + { + "epoch": 0.3393286516610099, + "grad_norm": 0.515625, + "learning_rate": 0.0013755665468651451, + "loss": 0.0986, + "step": 39091 + }, + { + "epoch": 0.33933733214121403, + "grad_norm": 0.2890625, + "learning_rate": 0.001375538213688633, + "loss": 0.0874, + "step": 39092 + }, + { + "epoch": 0.3393460126214182, + "grad_norm": 0.498046875, + "learning_rate": 0.0013755098802107966, + "loss": 0.1201, + "step": 39093 + }, + { + "epoch": 0.33935469310162236, + "grad_norm": 0.1455078125, + "learning_rate": 0.001375481546431667, + "loss": 0.1133, + "step": 39094 + }, + { + "epoch": 0.33936337358182656, + "grad_norm": 0.2041015625, + "learning_rate": 0.0013754532123512746, + "loss": 0.1113, + "step": 39095 + }, + { + "epoch": 0.3393720540620307, + "grad_norm": 0.1064453125, + "learning_rate": 0.0013754248779696508, + "loss": 0.0566, + "step": 39096 + }, + { + "epoch": 0.3393807345422349, + "grad_norm": 0.296875, + "learning_rate": 0.0013753965432868262, + "loss": 0.1309, + "step": 39097 + }, + { + "epoch": 0.339389415022439, + "grad_norm": 0.05859375, + "learning_rate": 0.0013753682083028325, + "loss": 0.0786, + "step": 39098 + }, + { + "epoch": 0.3393980955026432, + "grad_norm": 0.232421875, + "learning_rate": 0.0013753398730177002, + "loss": 0.0742, + "step": 39099 + }, + { + "epoch": 0.33940677598284735, + "grad_norm": 0.484375, + "learning_rate": 0.00137531153743146, + "loss": 0.1182, + "step": 39100 + }, + { + "epoch": 0.33941545646305155, + "grad_norm": 0.353515625, + "learning_rate": 0.0013752832015441437, + "loss": 0.1187, + "step": 39101 + }, + { + "epoch": 0.3394241369432557, + "grad_norm": 0.32421875, + "learning_rate": 0.0013752548653557816, + "loss": 0.1309, + "step": 39102 + }, + { + "epoch": 0.3394328174234599, + "grad_norm": 0.1357421875, + "learning_rate": 0.0013752265288664049, + "loss": 0.1011, + "step": 39103 + }, + { + "epoch": 0.339441497903664, + "grad_norm": 0.234375, + "learning_rate": 0.0013751981920760447, + "loss": 0.0972, + "step": 39104 + }, + { + "epoch": 0.3394501783838682, + "grad_norm": 0.1875, + "learning_rate": 0.001375169854984732, + "loss": 0.1016, + "step": 39105 + }, + { + "epoch": 0.33945885886407234, + "grad_norm": 0.2890625, + "learning_rate": 0.0013751415175924976, + "loss": 0.0635, + "step": 39106 + }, + { + "epoch": 0.33946753934427654, + "grad_norm": 0.359375, + "learning_rate": 0.0013751131798993724, + "loss": 0.0811, + "step": 39107 + }, + { + "epoch": 0.3394762198244807, + "grad_norm": 0.55859375, + "learning_rate": 0.0013750848419053876, + "loss": 0.0825, + "step": 39108 + }, + { + "epoch": 0.33948490030468487, + "grad_norm": 0.251953125, + "learning_rate": 0.001375056503610574, + "loss": 0.0752, + "step": 39109 + }, + { + "epoch": 0.339493580784889, + "grad_norm": 0.96875, + "learning_rate": 0.0013750281650149632, + "loss": 0.1055, + "step": 39110 + }, + { + "epoch": 0.3395022612650932, + "grad_norm": 0.1943359375, + "learning_rate": 0.0013749998261185854, + "loss": 0.1523, + "step": 39111 + }, + { + "epoch": 0.33951094174529733, + "grad_norm": 0.146484375, + "learning_rate": 
0.001374971486921472, + "loss": 0.0713, + "step": 39112 + }, + { + "epoch": 0.3395196222255015, + "grad_norm": 0.3203125, + "learning_rate": 0.001374943147423654, + "loss": 0.1533, + "step": 39113 + }, + { + "epoch": 0.33952830270570566, + "grad_norm": 0.33203125, + "learning_rate": 0.0013749148076251621, + "loss": 0.1406, + "step": 39114 + }, + { + "epoch": 0.33953698318590986, + "grad_norm": 0.1630859375, + "learning_rate": 0.0013748864675260277, + "loss": 0.0967, + "step": 39115 + }, + { + "epoch": 0.339545663666114, + "grad_norm": 0.1142578125, + "learning_rate": 0.0013748581271262816, + "loss": 0.1299, + "step": 39116 + }, + { + "epoch": 0.3395543441463182, + "grad_norm": 0.400390625, + "learning_rate": 0.0013748297864259545, + "loss": 0.1367, + "step": 39117 + }, + { + "epoch": 0.3395630246265223, + "grad_norm": 0.546875, + "learning_rate": 0.0013748014454250776, + "loss": 0.1416, + "step": 39118 + }, + { + "epoch": 0.3395717051067265, + "grad_norm": 0.328125, + "learning_rate": 0.0013747731041236825, + "loss": 0.1602, + "step": 39119 + }, + { + "epoch": 0.33958038558693066, + "grad_norm": 0.310546875, + "learning_rate": 0.0013747447625217994, + "loss": 0.1025, + "step": 39120 + }, + { + "epoch": 0.33958906606713485, + "grad_norm": 0.51171875, + "learning_rate": 0.0013747164206194596, + "loss": 0.0957, + "step": 39121 + }, + { + "epoch": 0.339597746547339, + "grad_norm": 0.46484375, + "learning_rate": 0.001374688078416694, + "loss": 0.1094, + "step": 39122 + }, + { + "epoch": 0.3396064270275432, + "grad_norm": 0.2158203125, + "learning_rate": 0.0013746597359135338, + "loss": 0.0957, + "step": 39123 + }, + { + "epoch": 0.3396151075077473, + "grad_norm": 0.09765625, + "learning_rate": 0.0013746313931100097, + "loss": 0.1011, + "step": 39124 + }, + { + "epoch": 0.3396237879879515, + "grad_norm": 0.1728515625, + "learning_rate": 0.0013746030500061527, + "loss": 0.0815, + "step": 39125 + }, + { + "epoch": 0.33963246846815565, + "grad_norm": 0.07861328125, + "learning_rate": 0.001374574706601994, + "loss": 0.0981, + "step": 39126 + }, + { + "epoch": 0.33964114894835984, + "grad_norm": 0.1015625, + "learning_rate": 0.0013745463628975648, + "loss": 0.1084, + "step": 39127 + }, + { + "epoch": 0.339649829428564, + "grad_norm": 0.08251953125, + "learning_rate": 0.0013745180188928954, + "loss": 0.0898, + "step": 39128 + }, + { + "epoch": 0.33965850990876817, + "grad_norm": 0.375, + "learning_rate": 0.0013744896745880174, + "loss": 0.0918, + "step": 39129 + }, + { + "epoch": 0.3396671903889723, + "grad_norm": 0.09716796875, + "learning_rate": 0.0013744613299829615, + "loss": 0.0767, + "step": 39130 + }, + { + "epoch": 0.3396758708691765, + "grad_norm": 0.33984375, + "learning_rate": 0.001374432985077759, + "loss": 0.0947, + "step": 39131 + }, + { + "epoch": 0.33968455134938064, + "grad_norm": 0.46484375, + "learning_rate": 0.0013744046398724405, + "loss": 0.127, + "step": 39132 + }, + { + "epoch": 0.33969323182958483, + "grad_norm": 0.431640625, + "learning_rate": 0.0013743762943670378, + "loss": 0.0762, + "step": 39133 + }, + { + "epoch": 0.33970191230978897, + "grad_norm": 0.2041015625, + "learning_rate": 0.0013743479485615808, + "loss": 0.1182, + "step": 39134 + }, + { + "epoch": 0.33971059278999316, + "grad_norm": 0.3203125, + "learning_rate": 0.0013743196024561008, + "loss": 0.1309, + "step": 39135 + }, + { + "epoch": 0.3397192732701973, + "grad_norm": 0.39453125, + "learning_rate": 0.0013742912560506296, + "loss": 0.1099, + "step": 39136 + }, + { + "epoch": 0.3397279537504015, + 
"grad_norm": 0.08837890625, + "learning_rate": 0.001374262909345197, + "loss": 0.063, + "step": 39137 + }, + { + "epoch": 0.3397366342306056, + "grad_norm": 0.482421875, + "learning_rate": 0.0013742345623398349, + "loss": 0.1084, + "step": 39138 + }, + { + "epoch": 0.3397453147108098, + "grad_norm": 0.146484375, + "learning_rate": 0.001374206215034574, + "loss": 0.1172, + "step": 39139 + }, + { + "epoch": 0.33975399519101396, + "grad_norm": 0.11181640625, + "learning_rate": 0.0013741778674294454, + "loss": 0.1309, + "step": 39140 + }, + { + "epoch": 0.33976267567121815, + "grad_norm": 0.189453125, + "learning_rate": 0.0013741495195244797, + "loss": 0.0879, + "step": 39141 + }, + { + "epoch": 0.3397713561514223, + "grad_norm": 0.44140625, + "learning_rate": 0.0013741211713197087, + "loss": 0.0859, + "step": 39142 + }, + { + "epoch": 0.3397800366316265, + "grad_norm": 0.2421875, + "learning_rate": 0.0013740928228151625, + "loss": 0.0938, + "step": 39143 + }, + { + "epoch": 0.3397887171118306, + "grad_norm": 0.37109375, + "learning_rate": 0.0013740644740108728, + "loss": 0.0889, + "step": 39144 + }, + { + "epoch": 0.3397973975920348, + "grad_norm": 0.333984375, + "learning_rate": 0.00137403612490687, + "loss": 0.1196, + "step": 39145 + }, + { + "epoch": 0.33980607807223895, + "grad_norm": 0.130859375, + "learning_rate": 0.0013740077755031855, + "loss": 0.105, + "step": 39146 + }, + { + "epoch": 0.33981475855244314, + "grad_norm": 0.37890625, + "learning_rate": 0.0013739794257998502, + "loss": 0.1357, + "step": 39147 + }, + { + "epoch": 0.3398234390326473, + "grad_norm": 0.125, + "learning_rate": 0.001373951075796895, + "loss": 0.1572, + "step": 39148 + }, + { + "epoch": 0.33983211951285147, + "grad_norm": 0.1494140625, + "learning_rate": 0.001373922725494351, + "loss": 0.1221, + "step": 39149 + }, + { + "epoch": 0.3398407999930556, + "grad_norm": 0.1533203125, + "learning_rate": 0.0013738943748922497, + "loss": 0.0801, + "step": 39150 + }, + { + "epoch": 0.3398494804732598, + "grad_norm": 0.2890625, + "learning_rate": 0.0013738660239906216, + "loss": 0.1143, + "step": 39151 + }, + { + "epoch": 0.33985816095346394, + "grad_norm": 0.1298828125, + "learning_rate": 0.0013738376727894972, + "loss": 0.0957, + "step": 39152 + }, + { + "epoch": 0.33986684143366813, + "grad_norm": 0.486328125, + "learning_rate": 0.0013738093212889082, + "loss": 0.1484, + "step": 39153 + }, + { + "epoch": 0.33987552191387227, + "grad_norm": 0.1103515625, + "learning_rate": 0.0013737809694888859, + "loss": 0.1172, + "step": 39154 + }, + { + "epoch": 0.33988420239407646, + "grad_norm": 0.11669921875, + "learning_rate": 0.0013737526173894603, + "loss": 0.1089, + "step": 39155 + }, + { + "epoch": 0.3398928828742806, + "grad_norm": 0.1298828125, + "learning_rate": 0.001373724264990663, + "loss": 0.0801, + "step": 39156 + }, + { + "epoch": 0.3399015633544848, + "grad_norm": 0.291015625, + "learning_rate": 0.001373695912292525, + "loss": 0.0967, + "step": 39157 + }, + { + "epoch": 0.33991024383468893, + "grad_norm": 0.373046875, + "learning_rate": 0.0013736675592950773, + "loss": 0.1289, + "step": 39158 + }, + { + "epoch": 0.3399189243148931, + "grad_norm": 0.232421875, + "learning_rate": 0.001373639205998351, + "loss": 0.1172, + "step": 39159 + }, + { + "epoch": 0.33992760479509726, + "grad_norm": 0.333984375, + "learning_rate": 0.001373610852402377, + "loss": 0.0928, + "step": 39160 + }, + { + "epoch": 0.33993628527530145, + "grad_norm": 0.68359375, + "learning_rate": 0.001373582498507186, + "loss": 0.0859, + "step": 39161 
+ }, + { + "epoch": 0.3399449657555056, + "grad_norm": 0.12060546875, + "learning_rate": 0.0013735541443128097, + "loss": 0.1289, + "step": 39162 + }, + { + "epoch": 0.3399536462357098, + "grad_norm": 0.12451171875, + "learning_rate": 0.0013735257898192786, + "loss": 0.085, + "step": 39163 + }, + { + "epoch": 0.3399623267159139, + "grad_norm": 0.314453125, + "learning_rate": 0.0013734974350266237, + "loss": 0.1245, + "step": 39164 + }, + { + "epoch": 0.3399710071961181, + "grad_norm": 0.55078125, + "learning_rate": 0.001373469079934876, + "loss": 0.1069, + "step": 39165 + }, + { + "epoch": 0.33997968767632225, + "grad_norm": 0.166015625, + "learning_rate": 0.0013734407245440665, + "loss": 0.1216, + "step": 39166 + }, + { + "epoch": 0.33998836815652644, + "grad_norm": 0.3515625, + "learning_rate": 0.0013734123688542265, + "loss": 0.1104, + "step": 39167 + }, + { + "epoch": 0.3399970486367306, + "grad_norm": 0.27734375, + "learning_rate": 0.0013733840128653872, + "loss": 0.0786, + "step": 39168 + }, + { + "epoch": 0.3400057291169348, + "grad_norm": 0.087890625, + "learning_rate": 0.001373355656577579, + "loss": 0.1152, + "step": 39169 + }, + { + "epoch": 0.3400144095971389, + "grad_norm": 0.27734375, + "learning_rate": 0.0013733272999908329, + "loss": 0.1123, + "step": 39170 + }, + { + "epoch": 0.3400230900773431, + "grad_norm": 0.32421875, + "learning_rate": 0.0013732989431051803, + "loss": 0.0903, + "step": 39171 + }, + { + "epoch": 0.34003177055754724, + "grad_norm": 0.09423828125, + "learning_rate": 0.0013732705859206523, + "loss": 0.0781, + "step": 39172 + }, + { + "epoch": 0.34004045103775143, + "grad_norm": 0.10986328125, + "learning_rate": 0.0013732422284372797, + "loss": 0.1113, + "step": 39173 + }, + { + "epoch": 0.34004913151795557, + "grad_norm": 0.173828125, + "learning_rate": 0.0013732138706550932, + "loss": 0.1387, + "step": 39174 + }, + { + "epoch": 0.34005781199815976, + "grad_norm": 0.8359375, + "learning_rate": 0.0013731855125741243, + "loss": 0.1484, + "step": 39175 + }, + { + "epoch": 0.3400664924783639, + "grad_norm": 0.1767578125, + "learning_rate": 0.0013731571541944036, + "loss": 0.104, + "step": 39176 + }, + { + "epoch": 0.3400751729585681, + "grad_norm": 0.224609375, + "learning_rate": 0.0013731287955159625, + "loss": 0.0879, + "step": 39177 + }, + { + "epoch": 0.34008385343877223, + "grad_norm": 0.224609375, + "learning_rate": 0.0013731004365388317, + "loss": 0.1011, + "step": 39178 + }, + { + "epoch": 0.3400925339189764, + "grad_norm": 0.484375, + "learning_rate": 0.0013730720772630427, + "loss": 0.1201, + "step": 39179 + }, + { + "epoch": 0.34010121439918056, + "grad_norm": 0.4140625, + "learning_rate": 0.0013730437176886264, + "loss": 0.0811, + "step": 39180 + }, + { + "epoch": 0.3401098948793847, + "grad_norm": 0.21484375, + "learning_rate": 0.0013730153578156133, + "loss": 0.0913, + "step": 39181 + }, + { + "epoch": 0.3401185753595889, + "grad_norm": 0.169921875, + "learning_rate": 0.0013729869976440343, + "loss": 0.1045, + "step": 39182 + }, + { + "epoch": 0.34012725583979303, + "grad_norm": 0.203125, + "learning_rate": 0.0013729586371739213, + "loss": 0.1191, + "step": 39183 + }, + { + "epoch": 0.3401359363199972, + "grad_norm": 0.28125, + "learning_rate": 0.0013729302764053047, + "loss": 0.0947, + "step": 39184 + }, + { + "epoch": 0.34014461680020136, + "grad_norm": 0.0869140625, + "learning_rate": 0.0013729019153382153, + "loss": 0.0908, + "step": 39185 + }, + { + "epoch": 0.34015329728040555, + "grad_norm": 0.470703125, + "learning_rate": 
0.0013728735539726851, + "loss": 0.0908, + "step": 39186 + }, + { + "epoch": 0.3401619777606097, + "grad_norm": 0.330078125, + "learning_rate": 0.001372845192308744, + "loss": 0.083, + "step": 39187 + }, + { + "epoch": 0.3401706582408139, + "grad_norm": 0.2734375, + "learning_rate": 0.0013728168303464238, + "loss": 0.082, + "step": 39188 + }, + { + "epoch": 0.340179338721018, + "grad_norm": 0.2470703125, + "learning_rate": 0.0013727884680857553, + "loss": 0.1216, + "step": 39189 + }, + { + "epoch": 0.3401880192012222, + "grad_norm": 0.09716796875, + "learning_rate": 0.0013727601055267692, + "loss": 0.1123, + "step": 39190 + }, + { + "epoch": 0.34019669968142635, + "grad_norm": 0.134765625, + "learning_rate": 0.0013727317426694967, + "loss": 0.1416, + "step": 39191 + }, + { + "epoch": 0.34020538016163054, + "grad_norm": 0.48046875, + "learning_rate": 0.0013727033795139687, + "loss": 0.1289, + "step": 39192 + }, + { + "epoch": 0.3402140606418347, + "grad_norm": 0.19140625, + "learning_rate": 0.001372675016060217, + "loss": 0.1387, + "step": 39193 + }, + { + "epoch": 0.3402227411220389, + "grad_norm": 0.47265625, + "learning_rate": 0.0013726466523082714, + "loss": 0.1167, + "step": 39194 + }, + { + "epoch": 0.340231421602243, + "grad_norm": 0.404296875, + "learning_rate": 0.0013726182882581638, + "loss": 0.1045, + "step": 39195 + }, + { + "epoch": 0.3402401020824472, + "grad_norm": 0.22265625, + "learning_rate": 0.0013725899239099247, + "loss": 0.0757, + "step": 39196 + }, + { + "epoch": 0.34024878256265134, + "grad_norm": 0.48828125, + "learning_rate": 0.0013725615592635855, + "loss": 0.106, + "step": 39197 + }, + { + "epoch": 0.34025746304285553, + "grad_norm": 0.2451171875, + "learning_rate": 0.0013725331943191773, + "loss": 0.0898, + "step": 39198 + }, + { + "epoch": 0.34026614352305967, + "grad_norm": 0.28515625, + "learning_rate": 0.0013725048290767307, + "loss": 0.1104, + "step": 39199 + }, + { + "epoch": 0.34027482400326386, + "grad_norm": 0.361328125, + "learning_rate": 0.0013724764635362767, + "loss": 0.1138, + "step": 39200 + }, + { + "epoch": 0.340283504483468, + "grad_norm": 0.1259765625, + "learning_rate": 0.0013724480976978468, + "loss": 0.0898, + "step": 39201 + }, + { + "epoch": 0.3402921849636722, + "grad_norm": 0.310546875, + "learning_rate": 0.001372419731561472, + "loss": 0.0913, + "step": 39202 + }, + { + "epoch": 0.34030086544387633, + "grad_norm": 0.15625, + "learning_rate": 0.0013723913651271823, + "loss": 0.0718, + "step": 39203 + }, + { + "epoch": 0.3403095459240805, + "grad_norm": 0.189453125, + "learning_rate": 0.0013723629983950101, + "loss": 0.0713, + "step": 39204 + }, + { + "epoch": 0.34031822640428466, + "grad_norm": 0.123046875, + "learning_rate": 0.0013723346313649857, + "loss": 0.127, + "step": 39205 + }, + { + "epoch": 0.34032690688448886, + "grad_norm": 0.30078125, + "learning_rate": 0.0013723062640371405, + "loss": 0.1738, + "step": 39206 + }, + { + "epoch": 0.340335587364693, + "grad_norm": 0.361328125, + "learning_rate": 0.0013722778964115048, + "loss": 0.1094, + "step": 39207 + }, + { + "epoch": 0.3403442678448972, + "grad_norm": 0.302734375, + "learning_rate": 0.0013722495284881103, + "loss": 0.0752, + "step": 39208 + }, + { + "epoch": 0.3403529483251013, + "grad_norm": 0.1826171875, + "learning_rate": 0.0013722211602669877, + "loss": 0.0986, + "step": 39209 + }, + { + "epoch": 0.3403616288053055, + "grad_norm": 0.447265625, + "learning_rate": 0.0013721927917481683, + "loss": 0.0845, + "step": 39210 + }, + { + "epoch": 0.34037030928550965, + 
"grad_norm": 0.1728515625, + "learning_rate": 0.0013721644229316829, + "loss": 0.1182, + "step": 39211 + }, + { + "epoch": 0.34037898976571385, + "grad_norm": 1.0546875, + "learning_rate": 0.0013721360538175627, + "loss": 0.1719, + "step": 39212 + }, + { + "epoch": 0.340387670245918, + "grad_norm": 0.10009765625, + "learning_rate": 0.0013721076844058382, + "loss": 0.1387, + "step": 39213 + }, + { + "epoch": 0.3403963507261222, + "grad_norm": 0.34375, + "learning_rate": 0.001372079314696541, + "loss": 0.1562, + "step": 39214 + }, + { + "epoch": 0.3404050312063263, + "grad_norm": 0.4296875, + "learning_rate": 0.001372050944689702, + "loss": 0.1172, + "step": 39215 + }, + { + "epoch": 0.3404137116865305, + "grad_norm": 0.12890625, + "learning_rate": 0.0013720225743853523, + "loss": 0.0669, + "step": 39216 + }, + { + "epoch": 0.34042239216673464, + "grad_norm": 0.1328125, + "learning_rate": 0.0013719942037835227, + "loss": 0.1143, + "step": 39217 + }, + { + "epoch": 0.34043107264693884, + "grad_norm": 0.08740234375, + "learning_rate": 0.0013719658328842442, + "loss": 0.0898, + "step": 39218 + }, + { + "epoch": 0.340439753127143, + "grad_norm": 0.314453125, + "learning_rate": 0.0013719374616875482, + "loss": 0.1123, + "step": 39219 + }, + { + "epoch": 0.34044843360734717, + "grad_norm": 0.19140625, + "learning_rate": 0.0013719090901934653, + "loss": 0.0957, + "step": 39220 + }, + { + "epoch": 0.3404571140875513, + "grad_norm": 0.267578125, + "learning_rate": 0.0013718807184020265, + "loss": 0.085, + "step": 39221 + }, + { + "epoch": 0.3404657945677555, + "grad_norm": 0.138671875, + "learning_rate": 0.0013718523463132635, + "loss": 0.1309, + "step": 39222 + }, + { + "epoch": 0.34047447504795963, + "grad_norm": 0.15625, + "learning_rate": 0.0013718239739272066, + "loss": 0.0869, + "step": 39223 + }, + { + "epoch": 0.34048315552816383, + "grad_norm": 0.54296875, + "learning_rate": 0.001371795601243887, + "loss": 0.1309, + "step": 39224 + }, + { + "epoch": 0.34049183600836797, + "grad_norm": 0.1796875, + "learning_rate": 0.001371767228263336, + "loss": 0.082, + "step": 39225 + }, + { + "epoch": 0.34050051648857216, + "grad_norm": 0.11376953125, + "learning_rate": 0.0013717388549855842, + "loss": 0.082, + "step": 39226 + }, + { + "epoch": 0.3405091969687763, + "grad_norm": 0.169921875, + "learning_rate": 0.0013717104814106632, + "loss": 0.1084, + "step": 39227 + }, + { + "epoch": 0.3405178774489805, + "grad_norm": 0.1552734375, + "learning_rate": 0.0013716821075386034, + "loss": 0.0977, + "step": 39228 + }, + { + "epoch": 0.3405265579291846, + "grad_norm": 0.10791015625, + "learning_rate": 0.0013716537333694366, + "loss": 0.0723, + "step": 39229 + }, + { + "epoch": 0.3405352384093888, + "grad_norm": 0.2294921875, + "learning_rate": 0.0013716253589031928, + "loss": 0.0938, + "step": 39230 + }, + { + "epoch": 0.34054391888959296, + "grad_norm": 0.357421875, + "learning_rate": 0.0013715969841399038, + "loss": 0.1182, + "step": 39231 + }, + { + "epoch": 0.34055259936979715, + "grad_norm": 0.07666015625, + "learning_rate": 0.0013715686090796004, + "loss": 0.0942, + "step": 39232 + }, + { + "epoch": 0.3405612798500013, + "grad_norm": 0.236328125, + "learning_rate": 0.001371540233722314, + "loss": 0.0747, + "step": 39233 + }, + { + "epoch": 0.3405699603302055, + "grad_norm": 0.55078125, + "learning_rate": 0.0013715118580680744, + "loss": 0.1064, + "step": 39234 + }, + { + "epoch": 0.3405786408104096, + "grad_norm": 0.2314453125, + "learning_rate": 0.0013714834821169145, + "loss": 0.127, + "step": 39235 + 
}, + { + "epoch": 0.3405873212906138, + "grad_norm": 0.453125, + "learning_rate": 0.0013714551058688637, + "loss": 0.0903, + "step": 39236 + }, + { + "epoch": 0.34059600177081795, + "grad_norm": 0.29296875, + "learning_rate": 0.001371426729323954, + "loss": 0.0752, + "step": 39237 + }, + { + "epoch": 0.34060468225102214, + "grad_norm": 0.240234375, + "learning_rate": 0.001371398352482216, + "loss": 0.0728, + "step": 39238 + }, + { + "epoch": 0.3406133627312263, + "grad_norm": 0.259765625, + "learning_rate": 0.0013713699753436807, + "loss": 0.0806, + "step": 39239 + }, + { + "epoch": 0.34062204321143047, + "grad_norm": 0.251953125, + "learning_rate": 0.0013713415979083796, + "loss": 0.1104, + "step": 39240 + }, + { + "epoch": 0.3406307236916346, + "grad_norm": 0.328125, + "learning_rate": 0.0013713132201763432, + "loss": 0.1025, + "step": 39241 + }, + { + "epoch": 0.3406394041718388, + "grad_norm": 0.146484375, + "learning_rate": 0.0013712848421476027, + "loss": 0.0918, + "step": 39242 + }, + { + "epoch": 0.34064808465204294, + "grad_norm": 0.5859375, + "learning_rate": 0.0013712564638221892, + "loss": 0.123, + "step": 39243 + }, + { + "epoch": 0.34065676513224713, + "grad_norm": 0.265625, + "learning_rate": 0.0013712280852001338, + "loss": 0.1172, + "step": 39244 + }, + { + "epoch": 0.34066544561245127, + "grad_norm": 0.38671875, + "learning_rate": 0.0013711997062814674, + "loss": 0.1299, + "step": 39245 + }, + { + "epoch": 0.34067412609265546, + "grad_norm": 0.09326171875, + "learning_rate": 0.0013711713270662211, + "loss": 0.1416, + "step": 39246 + }, + { + "epoch": 0.3406828065728596, + "grad_norm": 0.5234375, + "learning_rate": 0.0013711429475544259, + "loss": 0.123, + "step": 39247 + }, + { + "epoch": 0.3406914870530638, + "grad_norm": 0.07275390625, + "learning_rate": 0.0013711145677461128, + "loss": 0.0713, + "step": 39248 + }, + { + "epoch": 0.34070016753326793, + "grad_norm": 0.2490234375, + "learning_rate": 0.0013710861876413127, + "loss": 0.0889, + "step": 39249 + }, + { + "epoch": 0.3407088480134721, + "grad_norm": 0.380859375, + "learning_rate": 0.0013710578072400573, + "loss": 0.1299, + "step": 39250 + }, + { + "epoch": 0.34071752849367626, + "grad_norm": 0.291015625, + "learning_rate": 0.001371029426542377, + "loss": 0.1143, + "step": 39251 + }, + { + "epoch": 0.34072620897388045, + "grad_norm": 0.41796875, + "learning_rate": 0.001371001045548303, + "loss": 0.0654, + "step": 39252 + }, + { + "epoch": 0.3407348894540846, + "grad_norm": 0.55859375, + "learning_rate": 0.0013709726642578658, + "loss": 0.1006, + "step": 39253 + }, + { + "epoch": 0.3407435699342888, + "grad_norm": 0.29296875, + "learning_rate": 0.0013709442826710975, + "loss": 0.0996, + "step": 39254 + }, + { + "epoch": 0.3407522504144929, + "grad_norm": 0.1845703125, + "learning_rate": 0.0013709159007880285, + "loss": 0.1055, + "step": 39255 + }, + { + "epoch": 0.3407609308946971, + "grad_norm": 0.1279296875, + "learning_rate": 0.0013708875186086897, + "loss": 0.1196, + "step": 39256 + }, + { + "epoch": 0.34076961137490125, + "grad_norm": 0.1064453125, + "learning_rate": 0.0013708591361331127, + "loss": 0.1084, + "step": 39257 + }, + { + "epoch": 0.34077829185510544, + "grad_norm": 0.85546875, + "learning_rate": 0.001370830753361328, + "loss": 0.1064, + "step": 39258 + }, + { + "epoch": 0.3407869723353096, + "grad_norm": 0.46875, + "learning_rate": 0.001370802370293367, + "loss": 0.0767, + "step": 39259 + }, + { + "epoch": 0.34079565281551377, + "grad_norm": 0.5859375, + "learning_rate": 0.0013707739869292604, + 
"loss": 0.1328, + "step": 39260 + }, + { + "epoch": 0.3408043332957179, + "grad_norm": 0.18359375, + "learning_rate": 0.0013707456032690398, + "loss": 0.1289, + "step": 39261 + }, + { + "epoch": 0.3408130137759221, + "grad_norm": 0.087890625, + "learning_rate": 0.0013707172193127355, + "loss": 0.0608, + "step": 39262 + }, + { + "epoch": 0.34082169425612624, + "grad_norm": 0.1591796875, + "learning_rate": 0.0013706888350603792, + "loss": 0.123, + "step": 39263 + }, + { + "epoch": 0.34083037473633043, + "grad_norm": 0.78515625, + "learning_rate": 0.0013706604505120015, + "loss": 0.1016, + "step": 39264 + }, + { + "epoch": 0.34083905521653457, + "grad_norm": 0.330078125, + "learning_rate": 0.0013706320656676333, + "loss": 0.1162, + "step": 39265 + }, + { + "epoch": 0.34084773569673876, + "grad_norm": 0.8046875, + "learning_rate": 0.0013706036805273066, + "loss": 0.0938, + "step": 39266 + }, + { + "epoch": 0.3408564161769429, + "grad_norm": 0.21484375, + "learning_rate": 0.0013705752950910516, + "loss": 0.0972, + "step": 39267 + }, + { + "epoch": 0.3408650966571471, + "grad_norm": 0.1337890625, + "learning_rate": 0.0013705469093588992, + "loss": 0.1147, + "step": 39268 + }, + { + "epoch": 0.34087377713735123, + "grad_norm": 0.703125, + "learning_rate": 0.0013705185233308808, + "loss": 0.1543, + "step": 39269 + }, + { + "epoch": 0.3408824576175554, + "grad_norm": 0.1884765625, + "learning_rate": 0.0013704901370070277, + "loss": 0.1245, + "step": 39270 + }, + { + "epoch": 0.34089113809775956, + "grad_norm": 0.1806640625, + "learning_rate": 0.0013704617503873704, + "loss": 0.0898, + "step": 39271 + }, + { + "epoch": 0.34089981857796375, + "grad_norm": 0.84375, + "learning_rate": 0.0013704333634719404, + "loss": 0.1211, + "step": 39272 + }, + { + "epoch": 0.3409084990581679, + "grad_norm": 0.14453125, + "learning_rate": 0.0013704049762607684, + "loss": 0.1211, + "step": 39273 + }, + { + "epoch": 0.3409171795383721, + "grad_norm": 0.71484375, + "learning_rate": 0.0013703765887538855, + "loss": 0.0674, + "step": 39274 + }, + { + "epoch": 0.3409258600185762, + "grad_norm": 0.251953125, + "learning_rate": 0.001370348200951323, + "loss": 0.1064, + "step": 39275 + }, + { + "epoch": 0.3409345404987804, + "grad_norm": 0.3515625, + "learning_rate": 0.0013703198128531117, + "loss": 0.1191, + "step": 39276 + }, + { + "epoch": 0.34094322097898455, + "grad_norm": 0.2314453125, + "learning_rate": 0.0013702914244592829, + "loss": 0.0977, + "step": 39277 + }, + { + "epoch": 0.34095190145918874, + "grad_norm": 0.11767578125, + "learning_rate": 0.0013702630357698671, + "loss": 0.084, + "step": 39278 + }, + { + "epoch": 0.3409605819393929, + "grad_norm": 0.400390625, + "learning_rate": 0.0013702346467848962, + "loss": 0.0889, + "step": 39279 + }, + { + "epoch": 0.3409692624195971, + "grad_norm": 0.1884765625, + "learning_rate": 0.0013702062575044003, + "loss": 0.0674, + "step": 39280 + }, + { + "epoch": 0.3409779428998012, + "grad_norm": 0.53125, + "learning_rate": 0.001370177867928411, + "loss": 0.1221, + "step": 39281 + }, + { + "epoch": 0.3409866233800054, + "grad_norm": 0.44140625, + "learning_rate": 0.001370149478056959, + "loss": 0.1152, + "step": 39282 + }, + { + "epoch": 0.34099530386020954, + "grad_norm": 2.046875, + "learning_rate": 0.001370121087890076, + "loss": 0.1167, + "step": 39283 + }, + { + "epoch": 0.34100398434041373, + "grad_norm": 0.1201171875, + "learning_rate": 0.0013700926974277924, + "loss": 0.1138, + "step": 39284 + }, + { + "epoch": 0.3410126648206179, + "grad_norm": 0.1240234375, + 
"learning_rate": 0.0013700643066701396, + "loss": 0.1182, + "step": 39285 + }, + { + "epoch": 0.34102134530082207, + "grad_norm": 0.328125, + "learning_rate": 0.0013700359156171486, + "loss": 0.0869, + "step": 39286 + }, + { + "epoch": 0.3410300257810262, + "grad_norm": 0.154296875, + "learning_rate": 0.0013700075242688502, + "loss": 0.084, + "step": 39287 + }, + { + "epoch": 0.3410387062612304, + "grad_norm": 0.81640625, + "learning_rate": 0.001369979132625276, + "loss": 0.123, + "step": 39288 + }, + { + "epoch": 0.34104738674143453, + "grad_norm": 0.310546875, + "learning_rate": 0.0013699507406864564, + "loss": 0.082, + "step": 39289 + }, + { + "epoch": 0.3410560672216387, + "grad_norm": 0.47265625, + "learning_rate": 0.0013699223484524225, + "loss": 0.0664, + "step": 39290 + }, + { + "epoch": 0.34106474770184286, + "grad_norm": 1.390625, + "learning_rate": 0.0013698939559232057, + "loss": 0.1133, + "step": 39291 + }, + { + "epoch": 0.34107342818204706, + "grad_norm": 0.33203125, + "learning_rate": 0.001369865563098837, + "loss": 0.1035, + "step": 39292 + }, + { + "epoch": 0.3410821086622512, + "grad_norm": 0.09765625, + "learning_rate": 0.0013698371699793475, + "loss": 0.0986, + "step": 39293 + }, + { + "epoch": 0.3410907891424554, + "grad_norm": 0.44140625, + "learning_rate": 0.0013698087765647681, + "loss": 0.0742, + "step": 39294 + }, + { + "epoch": 0.3410994696226595, + "grad_norm": 0.0810546875, + "learning_rate": 0.0013697803828551299, + "loss": 0.0747, + "step": 39295 + }, + { + "epoch": 0.3411081501028637, + "grad_norm": 0.6484375, + "learning_rate": 0.0013697519888504637, + "loss": 0.0762, + "step": 39296 + }, + { + "epoch": 0.34111683058306785, + "grad_norm": 0.392578125, + "learning_rate": 0.0013697235945508012, + "loss": 0.0684, + "step": 39297 + }, + { + "epoch": 0.34112551106327205, + "grad_norm": 0.20703125, + "learning_rate": 0.0013696951999561727, + "loss": 0.125, + "step": 39298 + }, + { + "epoch": 0.3411341915434762, + "grad_norm": 0.2412109375, + "learning_rate": 0.0013696668050666094, + "loss": 0.0894, + "step": 39299 + }, + { + "epoch": 0.3411428720236804, + "grad_norm": 0.1025390625, + "learning_rate": 0.0013696384098821426, + "loss": 0.1055, + "step": 39300 + }, + { + "epoch": 0.3411515525038845, + "grad_norm": 0.1787109375, + "learning_rate": 0.0013696100144028037, + "loss": 0.1279, + "step": 39301 + }, + { + "epoch": 0.3411602329840887, + "grad_norm": 0.140625, + "learning_rate": 0.0013695816186286229, + "loss": 0.0957, + "step": 39302 + }, + { + "epoch": 0.34116891346429284, + "grad_norm": 0.3359375, + "learning_rate": 0.001369553222559632, + "loss": 0.0918, + "step": 39303 + }, + { + "epoch": 0.341177593944497, + "grad_norm": 0.75, + "learning_rate": 0.0013695248261958617, + "loss": 0.1426, + "step": 39304 + }, + { + "epoch": 0.3411862744247012, + "grad_norm": 0.12158203125, + "learning_rate": 0.0013694964295373432, + "loss": 0.0889, + "step": 39305 + }, + { + "epoch": 0.3411949549049053, + "grad_norm": 0.6953125, + "learning_rate": 0.001369468032584107, + "loss": 0.1133, + "step": 39306 + }, + { + "epoch": 0.3412036353851095, + "grad_norm": 0.287109375, + "learning_rate": 0.001369439635336185, + "loss": 0.0752, + "step": 39307 + }, + { + "epoch": 0.34121231586531364, + "grad_norm": 0.27734375, + "learning_rate": 0.0013694112377936076, + "loss": 0.0933, + "step": 39308 + }, + { + "epoch": 0.34122099634551784, + "grad_norm": 0.07275390625, + "learning_rate": 0.0013693828399564062, + "loss": 0.084, + "step": 39309 + }, + { + "epoch": 0.341229676825722, + 
"grad_norm": 0.14453125, + "learning_rate": 0.001369354441824612, + "loss": 0.0874, + "step": 39310 + }, + { + "epoch": 0.34123835730592617, + "grad_norm": 0.1650390625, + "learning_rate": 0.0013693260433982558, + "loss": 0.1318, + "step": 39311 + }, + { + "epoch": 0.3412470377861303, + "grad_norm": 0.2109375, + "learning_rate": 0.0013692976446773684, + "loss": 0.1016, + "step": 39312 + }, + { + "epoch": 0.3412557182663345, + "grad_norm": 0.1748046875, + "learning_rate": 0.0013692692456619812, + "loss": 0.0879, + "step": 39313 + }, + { + "epoch": 0.34126439874653863, + "grad_norm": 0.11962890625, + "learning_rate": 0.0013692408463521254, + "loss": 0.1182, + "step": 39314 + }, + { + "epoch": 0.3412730792267428, + "grad_norm": 0.275390625, + "learning_rate": 0.0013692124467478317, + "loss": 0.1211, + "step": 39315 + }, + { + "epoch": 0.34128175970694696, + "grad_norm": 0.2431640625, + "learning_rate": 0.0013691840468491314, + "loss": 0.1045, + "step": 39316 + }, + { + "epoch": 0.34129044018715116, + "grad_norm": 0.11181640625, + "learning_rate": 0.0013691556466560555, + "loss": 0.123, + "step": 39317 + }, + { + "epoch": 0.3412991206673553, + "grad_norm": 0.171875, + "learning_rate": 0.0013691272461686349, + "loss": 0.1001, + "step": 39318 + }, + { + "epoch": 0.3413078011475595, + "grad_norm": 0.08251953125, + "learning_rate": 0.001369098845386901, + "loss": 0.0713, + "step": 39319 + }, + { + "epoch": 0.3413164816277636, + "grad_norm": 0.341796875, + "learning_rate": 0.0013690704443108843, + "loss": 0.0718, + "step": 39320 + }, + { + "epoch": 0.3413251621079678, + "grad_norm": 0.4375, + "learning_rate": 0.0013690420429406163, + "loss": 0.1108, + "step": 39321 + }, + { + "epoch": 0.34133384258817195, + "grad_norm": 0.10546875, + "learning_rate": 0.001369013641276128, + "loss": 0.1387, + "step": 39322 + }, + { + "epoch": 0.34134252306837615, + "grad_norm": 0.10595703125, + "learning_rate": 0.0013689852393174507, + "loss": 0.1123, + "step": 39323 + }, + { + "epoch": 0.3413512035485803, + "grad_norm": 0.4140625, + "learning_rate": 0.001368956837064615, + "loss": 0.0752, + "step": 39324 + }, + { + "epoch": 0.3413598840287845, + "grad_norm": 0.07275390625, + "learning_rate": 0.0013689284345176518, + "loss": 0.0918, + "step": 39325 + }, + { + "epoch": 0.3413685645089886, + "grad_norm": 0.2177734375, + "learning_rate": 0.001368900031676593, + "loss": 0.1074, + "step": 39326 + }, + { + "epoch": 0.3413772449891928, + "grad_norm": 0.12060546875, + "learning_rate": 0.0013688716285414691, + "loss": 0.1064, + "step": 39327 + }, + { + "epoch": 0.34138592546939694, + "grad_norm": 0.076171875, + "learning_rate": 0.001368843225112311, + "loss": 0.1025, + "step": 39328 + }, + { + "epoch": 0.34139460594960114, + "grad_norm": 0.11083984375, + "learning_rate": 0.0013688148213891502, + "loss": 0.0967, + "step": 39329 + }, + { + "epoch": 0.3414032864298053, + "grad_norm": 0.2275390625, + "learning_rate": 0.0013687864173720172, + "loss": 0.0986, + "step": 39330 + }, + { + "epoch": 0.34141196691000947, + "grad_norm": 0.490234375, + "learning_rate": 0.0013687580130609437, + "loss": 0.0957, + "step": 39331 + }, + { + "epoch": 0.3414206473902136, + "grad_norm": 0.1435546875, + "learning_rate": 0.0013687296084559607, + "loss": 0.1143, + "step": 39332 + }, + { + "epoch": 0.3414293278704178, + "grad_norm": 0.625, + "learning_rate": 0.0013687012035570986, + "loss": 0.1289, + "step": 39333 + }, + { + "epoch": 0.34143800835062194, + "grad_norm": 0.1455078125, + "learning_rate": 0.001368672798364389, + "loss": 0.1006, + "step": 
39334 + }, + { + "epoch": 0.34144668883082613, + "grad_norm": 0.333984375, + "learning_rate": 0.0013686443928778629, + "loss": 0.0996, + "step": 39335 + }, + { + "epoch": 0.34145536931103027, + "grad_norm": 0.59765625, + "learning_rate": 0.0013686159870975516, + "loss": 0.1113, + "step": 39336 + }, + { + "epoch": 0.34146404979123446, + "grad_norm": 0.296875, + "learning_rate": 0.0013685875810234857, + "loss": 0.126, + "step": 39337 + }, + { + "epoch": 0.3414727302714386, + "grad_norm": 0.0849609375, + "learning_rate": 0.0013685591746556963, + "loss": 0.1104, + "step": 39338 + }, + { + "epoch": 0.3414814107516428, + "grad_norm": 0.26171875, + "learning_rate": 0.0013685307679942146, + "loss": 0.0806, + "step": 39339 + }, + { + "epoch": 0.3414900912318469, + "grad_norm": 0.291015625, + "learning_rate": 0.0013685023610390719, + "loss": 0.125, + "step": 39340 + }, + { + "epoch": 0.3414987717120511, + "grad_norm": 0.1806640625, + "learning_rate": 0.001368473953790299, + "loss": 0.0815, + "step": 39341 + }, + { + "epoch": 0.34150745219225526, + "grad_norm": 0.255859375, + "learning_rate": 0.0013684455462479271, + "loss": 0.0908, + "step": 39342 + }, + { + "epoch": 0.34151613267245945, + "grad_norm": 0.30859375, + "learning_rate": 0.001368417138411987, + "loss": 0.0874, + "step": 39343 + }, + { + "epoch": 0.3415248131526636, + "grad_norm": 0.427734375, + "learning_rate": 0.0013683887302825101, + "loss": 0.25, + "step": 39344 + }, + { + "epoch": 0.3415334936328678, + "grad_norm": 0.75390625, + "learning_rate": 0.0013683603218595272, + "loss": 0.0835, + "step": 39345 + }, + { + "epoch": 0.3415421741130719, + "grad_norm": 0.294921875, + "learning_rate": 0.0013683319131430696, + "loss": 0.0898, + "step": 39346 + }, + { + "epoch": 0.3415508545932761, + "grad_norm": 0.1181640625, + "learning_rate": 0.0013683035041331681, + "loss": 0.1279, + "step": 39347 + }, + { + "epoch": 0.34155953507348025, + "grad_norm": 0.11083984375, + "learning_rate": 0.001368275094829854, + "loss": 0.1084, + "step": 39348 + }, + { + "epoch": 0.34156821555368444, + "grad_norm": 0.578125, + "learning_rate": 0.001368246685233158, + "loss": 0.1064, + "step": 39349 + }, + { + "epoch": 0.3415768960338886, + "grad_norm": 0.193359375, + "learning_rate": 0.001368218275343112, + "loss": 0.0933, + "step": 39350 + }, + { + "epoch": 0.34158557651409277, + "grad_norm": 0.1533203125, + "learning_rate": 0.0013681898651597463, + "loss": 0.124, + "step": 39351 + }, + { + "epoch": 0.3415942569942969, + "grad_norm": 0.27734375, + "learning_rate": 0.0013681614546830923, + "loss": 0.1113, + "step": 39352 + }, + { + "epoch": 0.3416029374745011, + "grad_norm": 0.451171875, + "learning_rate": 0.0013681330439131807, + "loss": 0.1016, + "step": 39353 + }, + { + "epoch": 0.34161161795470524, + "grad_norm": 0.1689453125, + "learning_rate": 0.001368104632850043, + "loss": 0.0947, + "step": 39354 + }, + { + "epoch": 0.34162029843490943, + "grad_norm": 0.3359375, + "learning_rate": 0.0013680762214937098, + "loss": 0.1113, + "step": 39355 + }, + { + "epoch": 0.34162897891511357, + "grad_norm": 0.9296875, + "learning_rate": 0.0013680478098442128, + "loss": 0.1006, + "step": 39356 + }, + { + "epoch": 0.34163765939531776, + "grad_norm": 0.16796875, + "learning_rate": 0.0013680193979015828, + "loss": 0.0791, + "step": 39357 + }, + { + "epoch": 0.3416463398755219, + "grad_norm": 0.0810546875, + "learning_rate": 0.0013679909856658505, + "loss": 0.0864, + "step": 39358 + }, + { + "epoch": 0.3416550203557261, + "grad_norm": 0.216796875, + "learning_rate": 
0.0013679625731370475, + "loss": 0.1328, + "step": 39359 + }, + { + "epoch": 0.34166370083593023, + "grad_norm": 0.298828125, + "learning_rate": 0.0013679341603152046, + "loss": 0.1279, + "step": 39360 + }, + { + "epoch": 0.3416723813161344, + "grad_norm": 0.474609375, + "learning_rate": 0.001367905747200353, + "loss": 0.1006, + "step": 39361 + }, + { + "epoch": 0.34168106179633856, + "grad_norm": 0.279296875, + "learning_rate": 0.0013678773337925236, + "loss": 0.1064, + "step": 39362 + }, + { + "epoch": 0.34168974227654275, + "grad_norm": 0.298828125, + "learning_rate": 0.0013678489200917477, + "loss": 0.0933, + "step": 39363 + }, + { + "epoch": 0.3416984227567469, + "grad_norm": 0.515625, + "learning_rate": 0.001367820506098056, + "loss": 0.1138, + "step": 39364 + }, + { + "epoch": 0.3417071032369511, + "grad_norm": 0.2265625, + "learning_rate": 0.0013677920918114801, + "loss": 0.0928, + "step": 39365 + }, + { + "epoch": 0.3417157837171552, + "grad_norm": 0.435546875, + "learning_rate": 0.0013677636772320507, + "loss": 0.1045, + "step": 39366 + }, + { + "epoch": 0.3417244641973594, + "grad_norm": 0.3203125, + "learning_rate": 0.0013677352623597992, + "loss": 0.1309, + "step": 39367 + }, + { + "epoch": 0.34173314467756355, + "grad_norm": 0.318359375, + "learning_rate": 0.001367706847194756, + "loss": 0.1001, + "step": 39368 + }, + { + "epoch": 0.34174182515776774, + "grad_norm": 0.0986328125, + "learning_rate": 0.0013676784317369528, + "loss": 0.1162, + "step": 39369 + }, + { + "epoch": 0.3417505056379719, + "grad_norm": 0.60546875, + "learning_rate": 0.0013676500159864205, + "loss": 0.106, + "step": 39370 + }, + { + "epoch": 0.3417591861181761, + "grad_norm": 0.2021484375, + "learning_rate": 0.0013676215999431903, + "loss": 0.0918, + "step": 39371 + }, + { + "epoch": 0.3417678665983802, + "grad_norm": 0.56640625, + "learning_rate": 0.001367593183607293, + "loss": 0.1934, + "step": 39372 + }, + { + "epoch": 0.3417765470785844, + "grad_norm": 0.197265625, + "learning_rate": 0.0013675647669787597, + "loss": 0.0791, + "step": 39373 + }, + { + "epoch": 0.34178522755878854, + "grad_norm": 2.5625, + "learning_rate": 0.001367536350057622, + "loss": 0.1348, + "step": 39374 + }, + { + "epoch": 0.34179390803899273, + "grad_norm": 0.1025390625, + "learning_rate": 0.00136750793284391, + "loss": 0.0767, + "step": 39375 + }, + { + "epoch": 0.34180258851919687, + "grad_norm": 0.22265625, + "learning_rate": 0.0013674795153376556, + "loss": 0.1006, + "step": 39376 + }, + { + "epoch": 0.34181126899940106, + "grad_norm": 0.349609375, + "learning_rate": 0.0013674510975388897, + "loss": 0.1387, + "step": 39377 + }, + { + "epoch": 0.3418199494796052, + "grad_norm": 0.1796875, + "learning_rate": 0.0013674226794476432, + "loss": 0.1064, + "step": 39378 + }, + { + "epoch": 0.3418286299598094, + "grad_norm": 0.404296875, + "learning_rate": 0.0013673942610639472, + "loss": 0.0996, + "step": 39379 + }, + { + "epoch": 0.34183731044001353, + "grad_norm": 0.216796875, + "learning_rate": 0.001367365842387833, + "loss": 0.1211, + "step": 39380 + }, + { + "epoch": 0.3418459909202177, + "grad_norm": 0.177734375, + "learning_rate": 0.0013673374234193314, + "loss": 0.1084, + "step": 39381 + }, + { + "epoch": 0.34185467140042186, + "grad_norm": 0.74609375, + "learning_rate": 0.0013673090041584736, + "loss": 0.0801, + "step": 39382 + }, + { + "epoch": 0.34186335188062605, + "grad_norm": 0.1337890625, + "learning_rate": 0.0013672805846052908, + "loss": 0.1113, + "step": 39383 + }, + { + "epoch": 0.3418720323608302, + 
"grad_norm": 0.1630859375, + "learning_rate": 0.001367252164759814, + "loss": 0.0996, + "step": 39384 + }, + { + "epoch": 0.3418807128410344, + "grad_norm": 0.263671875, + "learning_rate": 0.0013672237446220746, + "loss": 0.0835, + "step": 39385 + }, + { + "epoch": 0.3418893933212385, + "grad_norm": 0.138671875, + "learning_rate": 0.0013671953241921027, + "loss": 0.0938, + "step": 39386 + }, + { + "epoch": 0.3418980738014427, + "grad_norm": 0.6015625, + "learning_rate": 0.0013671669034699301, + "loss": 0.1396, + "step": 39387 + }, + { + "epoch": 0.34190675428164685, + "grad_norm": 0.1591796875, + "learning_rate": 0.001367138482455588, + "loss": 0.1104, + "step": 39388 + }, + { + "epoch": 0.34191543476185104, + "grad_norm": 0.1796875, + "learning_rate": 0.0013671100611491072, + "loss": 0.1016, + "step": 39389 + }, + { + "epoch": 0.3419241152420552, + "grad_norm": 0.4296875, + "learning_rate": 0.0013670816395505187, + "loss": 0.106, + "step": 39390 + }, + { + "epoch": 0.3419327957222594, + "grad_norm": 0.8515625, + "learning_rate": 0.001367053217659854, + "loss": 0.1201, + "step": 39391 + }, + { + "epoch": 0.3419414762024635, + "grad_norm": 0.23046875, + "learning_rate": 0.001367024795477144, + "loss": 0.0928, + "step": 39392 + }, + { + "epoch": 0.3419501566826677, + "grad_norm": 0.298828125, + "learning_rate": 0.0013669963730024196, + "loss": 0.1035, + "step": 39393 + }, + { + "epoch": 0.34195883716287184, + "grad_norm": 0.16015625, + "learning_rate": 0.0013669679502357117, + "loss": 0.0703, + "step": 39394 + }, + { + "epoch": 0.34196751764307604, + "grad_norm": 0.3203125, + "learning_rate": 0.0013669395271770518, + "loss": 0.1094, + "step": 39395 + }, + { + "epoch": 0.3419761981232802, + "grad_norm": 0.5625, + "learning_rate": 0.0013669111038264708, + "loss": 0.1162, + "step": 39396 + }, + { + "epoch": 0.34198487860348437, + "grad_norm": 0.1767578125, + "learning_rate": 0.0013668826801839999, + "loss": 0.1152, + "step": 39397 + }, + { + "epoch": 0.3419935590836885, + "grad_norm": 0.578125, + "learning_rate": 0.0013668542562496704, + "loss": 0.0962, + "step": 39398 + }, + { + "epoch": 0.3420022395638927, + "grad_norm": 0.8359375, + "learning_rate": 0.0013668258320235125, + "loss": 0.1279, + "step": 39399 + }, + { + "epoch": 0.34201092004409683, + "grad_norm": 0.408203125, + "learning_rate": 0.0013667974075055583, + "loss": 0.0928, + "step": 39400 + }, + { + "epoch": 0.342019600524301, + "grad_norm": 0.12255859375, + "learning_rate": 0.0013667689826958383, + "loss": 0.0854, + "step": 39401 + }, + { + "epoch": 0.34202828100450516, + "grad_norm": 0.07763671875, + "learning_rate": 0.0013667405575943838, + "loss": 0.0747, + "step": 39402 + }, + { + "epoch": 0.34203696148470936, + "grad_norm": 0.298828125, + "learning_rate": 0.0013667121322012258, + "loss": 0.1289, + "step": 39403 + }, + { + "epoch": 0.3420456419649135, + "grad_norm": 0.2001953125, + "learning_rate": 0.0013666837065163956, + "loss": 0.0674, + "step": 39404 + }, + { + "epoch": 0.3420543224451177, + "grad_norm": 0.087890625, + "learning_rate": 0.0013666552805399238, + "loss": 0.0928, + "step": 39405 + }, + { + "epoch": 0.3420630029253218, + "grad_norm": 0.60546875, + "learning_rate": 0.001366626854271842, + "loss": 0.105, + "step": 39406 + }, + { + "epoch": 0.342071683405526, + "grad_norm": 0.3984375, + "learning_rate": 0.001366598427712181, + "loss": 0.0972, + "step": 39407 + }, + { + "epoch": 0.34208036388573015, + "grad_norm": 0.443359375, + "learning_rate": 0.0013665700008609718, + "loss": 0.1025, + "step": 39408 + }, + { + 
"epoch": 0.34208904436593435, + "grad_norm": 0.515625, + "learning_rate": 0.001366541573718246, + "loss": 0.2051, + "step": 39409 + }, + { + "epoch": 0.3420977248461385, + "grad_norm": 0.625, + "learning_rate": 0.0013665131462840343, + "loss": 0.1152, + "step": 39410 + }, + { + "epoch": 0.3421064053263427, + "grad_norm": 0.90625, + "learning_rate": 0.0013664847185583678, + "loss": 0.1289, + "step": 39411 + }, + { + "epoch": 0.3421150858065468, + "grad_norm": 0.302734375, + "learning_rate": 0.0013664562905412775, + "loss": 0.0981, + "step": 39412 + }, + { + "epoch": 0.342123766286751, + "grad_norm": 0.322265625, + "learning_rate": 0.0013664278622327946, + "loss": 0.123, + "step": 39413 + }, + { + "epoch": 0.34213244676695515, + "grad_norm": 0.34765625, + "learning_rate": 0.0013663994336329502, + "loss": 0.125, + "step": 39414 + }, + { + "epoch": 0.34214112724715934, + "grad_norm": 0.291015625, + "learning_rate": 0.0013663710047417755, + "loss": 0.0947, + "step": 39415 + }, + { + "epoch": 0.3421498077273635, + "grad_norm": 0.1572265625, + "learning_rate": 0.0013663425755593012, + "loss": 0.0869, + "step": 39416 + }, + { + "epoch": 0.34215848820756767, + "grad_norm": 0.16015625, + "learning_rate": 0.0013663141460855588, + "loss": 0.1133, + "step": 39417 + }, + { + "epoch": 0.3421671686877718, + "grad_norm": 0.50390625, + "learning_rate": 0.0013662857163205793, + "loss": 0.0889, + "step": 39418 + }, + { + "epoch": 0.342175849167976, + "grad_norm": 0.1669921875, + "learning_rate": 0.0013662572862643938, + "loss": 0.1055, + "step": 39419 + }, + { + "epoch": 0.34218452964818014, + "grad_norm": 0.251953125, + "learning_rate": 0.0013662288559170334, + "loss": 0.0757, + "step": 39420 + }, + { + "epoch": 0.34219321012838433, + "grad_norm": 0.08544921875, + "learning_rate": 0.001366200425278529, + "loss": 0.082, + "step": 39421 + }, + { + "epoch": 0.34220189060858847, + "grad_norm": 0.15625, + "learning_rate": 0.0013661719943489118, + "loss": 0.1562, + "step": 39422 + }, + { + "epoch": 0.34221057108879266, + "grad_norm": 0.625, + "learning_rate": 0.001366143563128213, + "loss": 0.1123, + "step": 39423 + }, + { + "epoch": 0.3422192515689968, + "grad_norm": 0.279296875, + "learning_rate": 0.0013661151316164634, + "loss": 0.0791, + "step": 39424 + }, + { + "epoch": 0.342227932049201, + "grad_norm": 0.330078125, + "learning_rate": 0.0013660866998136943, + "loss": 0.1064, + "step": 39425 + }, + { + "epoch": 0.3422366125294051, + "grad_norm": 0.2041015625, + "learning_rate": 0.001366058267719937, + "loss": 0.1006, + "step": 39426 + }, + { + "epoch": 0.3422452930096093, + "grad_norm": 0.19921875, + "learning_rate": 0.0013660298353352222, + "loss": 0.0703, + "step": 39427 + }, + { + "epoch": 0.34225397348981346, + "grad_norm": 0.0869140625, + "learning_rate": 0.0013660014026595812, + "loss": 0.083, + "step": 39428 + }, + { + "epoch": 0.3422626539700176, + "grad_norm": 0.0859375, + "learning_rate": 0.001365972969693045, + "loss": 0.0742, + "step": 39429 + }, + { + "epoch": 0.3422713344502218, + "grad_norm": 0.09814453125, + "learning_rate": 0.0013659445364356448, + "loss": 0.0986, + "step": 39430 + }, + { + "epoch": 0.3422800149304259, + "grad_norm": 0.263671875, + "learning_rate": 0.001365916102887412, + "loss": 0.1455, + "step": 39431 + }, + { + "epoch": 0.3422886954106301, + "grad_norm": 0.73046875, + "learning_rate": 0.0013658876690483771, + "loss": 0.1318, + "step": 39432 + }, + { + "epoch": 0.34229737589083425, + "grad_norm": 0.2353515625, + "learning_rate": 0.001365859234918571, + "loss": 0.1045, + 
"step": 39433 + }, + { + "epoch": 0.34230605637103845, + "grad_norm": 0.12158203125, + "learning_rate": 0.0013658308004980257, + "loss": 0.0874, + "step": 39434 + }, + { + "epoch": 0.3423147368512426, + "grad_norm": 0.1650390625, + "learning_rate": 0.0013658023657867715, + "loss": 0.0732, + "step": 39435 + }, + { + "epoch": 0.3423234173314468, + "grad_norm": 0.9296875, + "learning_rate": 0.0013657739307848403, + "loss": 0.1602, + "step": 39436 + }, + { + "epoch": 0.3423320978116509, + "grad_norm": 0.08935546875, + "learning_rate": 0.0013657454954922625, + "loss": 0.1055, + "step": 39437 + }, + { + "epoch": 0.3423407782918551, + "grad_norm": 0.19921875, + "learning_rate": 0.0013657170599090695, + "loss": 0.0713, + "step": 39438 + }, + { + "epoch": 0.34234945877205925, + "grad_norm": 0.12109375, + "learning_rate": 0.001365688624035292, + "loss": 0.0859, + "step": 39439 + }, + { + "epoch": 0.34235813925226344, + "grad_norm": 0.275390625, + "learning_rate": 0.0013656601878709617, + "loss": 0.1533, + "step": 39440 + }, + { + "epoch": 0.3423668197324676, + "grad_norm": 0.33203125, + "learning_rate": 0.0013656317514161096, + "loss": 0.0879, + "step": 39441 + }, + { + "epoch": 0.34237550021267177, + "grad_norm": 0.1240234375, + "learning_rate": 0.0013656033146707662, + "loss": 0.0835, + "step": 39442 + }, + { + "epoch": 0.3423841806928759, + "grad_norm": 0.236328125, + "learning_rate": 0.0013655748776349632, + "loss": 0.0957, + "step": 39443 + }, + { + "epoch": 0.3423928611730801, + "grad_norm": 0.166015625, + "learning_rate": 0.0013655464403087313, + "loss": 0.1094, + "step": 39444 + }, + { + "epoch": 0.34240154165328424, + "grad_norm": 0.1044921875, + "learning_rate": 0.0013655180026921022, + "loss": 0.127, + "step": 39445 + }, + { + "epoch": 0.34241022213348843, + "grad_norm": 0.1123046875, + "learning_rate": 0.0013654895647851062, + "loss": 0.0708, + "step": 39446 + }, + { + "epoch": 0.34241890261369257, + "grad_norm": 0.44921875, + "learning_rate": 0.001365461126587775, + "loss": 0.0825, + "step": 39447 + }, + { + "epoch": 0.34242758309389676, + "grad_norm": 0.12255859375, + "learning_rate": 0.0013654326881001396, + "loss": 0.0972, + "step": 39448 + }, + { + "epoch": 0.3424362635741009, + "grad_norm": 0.703125, + "learning_rate": 0.001365404249322231, + "loss": 0.0898, + "step": 39449 + }, + { + "epoch": 0.3424449440543051, + "grad_norm": 0.173828125, + "learning_rate": 0.0013653758102540798, + "loss": 0.0781, + "step": 39450 + }, + { + "epoch": 0.3424536245345092, + "grad_norm": 0.23046875, + "learning_rate": 0.0013653473708957183, + "loss": 0.1118, + "step": 39451 + }, + { + "epoch": 0.3424623050147134, + "grad_norm": 0.267578125, + "learning_rate": 0.0013653189312471765, + "loss": 0.0918, + "step": 39452 + }, + { + "epoch": 0.34247098549491756, + "grad_norm": 0.10107421875, + "learning_rate": 0.0013652904913084858, + "loss": 0.0889, + "step": 39453 + }, + { + "epoch": 0.34247966597512175, + "grad_norm": 0.94140625, + "learning_rate": 0.0013652620510796775, + "loss": 0.124, + "step": 39454 + }, + { + "epoch": 0.3424883464553259, + "grad_norm": 1.0703125, + "learning_rate": 0.001365233610560783, + "loss": 0.127, + "step": 39455 + }, + { + "epoch": 0.3424970269355301, + "grad_norm": 0.251953125, + "learning_rate": 0.0013652051697518326, + "loss": 0.1074, + "step": 39456 + }, + { + "epoch": 0.3425057074157342, + "grad_norm": 0.478515625, + "learning_rate": 0.0013651767286528577, + "loss": 0.1074, + "step": 39457 + }, + { + "epoch": 0.3425143878959384, + "grad_norm": 0.203125, + 
"learning_rate": 0.00136514828726389, + "loss": 0.0991, + "step": 39458 + }, + { + "epoch": 0.34252306837614255, + "grad_norm": 0.09423828125, + "learning_rate": 0.0013651198455849598, + "loss": 0.0732, + "step": 39459 + }, + { + "epoch": 0.34253174885634674, + "grad_norm": 0.66015625, + "learning_rate": 0.0013650914036160986, + "loss": 0.127, + "step": 39460 + }, + { + "epoch": 0.3425404293365509, + "grad_norm": 0.2265625, + "learning_rate": 0.0013650629613573373, + "loss": 0.1426, + "step": 39461 + }, + { + "epoch": 0.34254910981675507, + "grad_norm": 0.2001953125, + "learning_rate": 0.0013650345188087074, + "loss": 0.1084, + "step": 39462 + }, + { + "epoch": 0.3425577902969592, + "grad_norm": 0.3046875, + "learning_rate": 0.0013650060759702394, + "loss": 0.0693, + "step": 39463 + }, + { + "epoch": 0.3425664707771634, + "grad_norm": 0.0966796875, + "learning_rate": 0.0013649776328419649, + "loss": 0.0835, + "step": 39464 + }, + { + "epoch": 0.34257515125736754, + "grad_norm": 0.259765625, + "learning_rate": 0.001364949189423915, + "loss": 0.0811, + "step": 39465 + }, + { + "epoch": 0.34258383173757173, + "grad_norm": 0.408203125, + "learning_rate": 0.0013649207457161203, + "loss": 0.085, + "step": 39466 + }, + { + "epoch": 0.34259251221777587, + "grad_norm": 0.1474609375, + "learning_rate": 0.0013648923017186125, + "loss": 0.1094, + "step": 39467 + }, + { + "epoch": 0.34260119269798006, + "grad_norm": 0.09033203125, + "learning_rate": 0.0013648638574314225, + "loss": 0.1104, + "step": 39468 + }, + { + "epoch": 0.3426098731781842, + "grad_norm": 0.1904296875, + "learning_rate": 0.001364835412854581, + "loss": 0.1367, + "step": 39469 + }, + { + "epoch": 0.3426185536583884, + "grad_norm": 0.2158203125, + "learning_rate": 0.0013648069679881199, + "loss": 0.1279, + "step": 39470 + }, + { + "epoch": 0.34262723413859253, + "grad_norm": 0.173828125, + "learning_rate": 0.00136477852283207, + "loss": 0.0947, + "step": 39471 + }, + { + "epoch": 0.3426359146187967, + "grad_norm": 0.10595703125, + "learning_rate": 0.0013647500773864617, + "loss": 0.0898, + "step": 39472 + }, + { + "epoch": 0.34264459509900086, + "grad_norm": 0.2890625, + "learning_rate": 0.001364721631651327, + "loss": 0.0776, + "step": 39473 + }, + { + "epoch": 0.34265327557920505, + "grad_norm": 0.40625, + "learning_rate": 0.0013646931856266967, + "loss": 0.1436, + "step": 39474 + }, + { + "epoch": 0.3426619560594092, + "grad_norm": 0.1748046875, + "learning_rate": 0.0013646647393126016, + "loss": 0.1094, + "step": 39475 + }, + { + "epoch": 0.3426706365396134, + "grad_norm": 0.1201171875, + "learning_rate": 0.0013646362927090734, + "loss": 0.1221, + "step": 39476 + }, + { + "epoch": 0.3426793170198175, + "grad_norm": 0.0966796875, + "learning_rate": 0.0013646078458161434, + "loss": 0.0903, + "step": 39477 + }, + { + "epoch": 0.3426879975000217, + "grad_norm": 0.6796875, + "learning_rate": 0.0013645793986338415, + "loss": 0.1211, + "step": 39478 + }, + { + "epoch": 0.34269667798022585, + "grad_norm": 0.294921875, + "learning_rate": 0.0013645509511621997, + "loss": 0.1123, + "step": 39479 + }, + { + "epoch": 0.34270535846043004, + "grad_norm": 0.11376953125, + "learning_rate": 0.0013645225034012495, + "loss": 0.0869, + "step": 39480 + }, + { + "epoch": 0.3427140389406342, + "grad_norm": 0.3203125, + "learning_rate": 0.0013644940553510208, + "loss": 0.0962, + "step": 39481 + }, + { + "epoch": 0.3427227194208384, + "grad_norm": 0.40625, + "learning_rate": 0.0013644656070115454, + "loss": 0.123, + "step": 39482 + }, + { + "epoch": 
0.3427313999010425, + "grad_norm": 0.58203125, + "learning_rate": 0.0013644371583828546, + "loss": 0.0894, + "step": 39483 + }, + { + "epoch": 0.3427400803812467, + "grad_norm": 0.166015625, + "learning_rate": 0.0013644087094649795, + "loss": 0.1123, + "step": 39484 + }, + { + "epoch": 0.34274876086145084, + "grad_norm": 0.263671875, + "learning_rate": 0.0013643802602579507, + "loss": 0.0649, + "step": 39485 + }, + { + "epoch": 0.34275744134165503, + "grad_norm": 0.2412109375, + "learning_rate": 0.0013643518107618, + "loss": 0.1035, + "step": 39486 + }, + { + "epoch": 0.34276612182185917, + "grad_norm": 0.23046875, + "learning_rate": 0.0013643233609765576, + "loss": 0.1094, + "step": 39487 + }, + { + "epoch": 0.34277480230206336, + "grad_norm": 0.265625, + "learning_rate": 0.0013642949109022556, + "loss": 0.1426, + "step": 39488 + }, + { + "epoch": 0.3427834827822675, + "grad_norm": 0.0986328125, + "learning_rate": 0.0013642664605389243, + "loss": 0.127, + "step": 39489 + }, + { + "epoch": 0.3427921632624717, + "grad_norm": 0.2490234375, + "learning_rate": 0.0013642380098865953, + "loss": 0.0732, + "step": 39490 + }, + { + "epoch": 0.34280084374267583, + "grad_norm": 0.72265625, + "learning_rate": 0.0013642095589452997, + "loss": 0.1035, + "step": 39491 + }, + { + "epoch": 0.34280952422288, + "grad_norm": 0.0908203125, + "learning_rate": 0.001364181107715068, + "loss": 0.1543, + "step": 39492 + }, + { + "epoch": 0.34281820470308416, + "grad_norm": 0.111328125, + "learning_rate": 0.0013641526561959326, + "loss": 0.1533, + "step": 39493 + }, + { + "epoch": 0.34282688518328835, + "grad_norm": 0.373046875, + "learning_rate": 0.0013641242043879234, + "loss": 0.0957, + "step": 39494 + }, + { + "epoch": 0.3428355656634925, + "grad_norm": 0.2177734375, + "learning_rate": 0.0013640957522910717, + "loss": 0.0796, + "step": 39495 + }, + { + "epoch": 0.3428442461436967, + "grad_norm": 0.09326171875, + "learning_rate": 0.0013640672999054096, + "loss": 0.1211, + "step": 39496 + }, + { + "epoch": 0.3428529266239008, + "grad_norm": 0.5390625, + "learning_rate": 0.001364038847230967, + "loss": 0.0869, + "step": 39497 + }, + { + "epoch": 0.342861607104105, + "grad_norm": 0.26953125, + "learning_rate": 0.0013640103942677752, + "loss": 0.127, + "step": 39498 + }, + { + "epoch": 0.34287028758430915, + "grad_norm": 0.322265625, + "learning_rate": 0.001363981941015866, + "loss": 0.0903, + "step": 39499 + }, + { + "epoch": 0.34287896806451335, + "grad_norm": 0.205078125, + "learning_rate": 0.00136395348747527, + "loss": 0.0879, + "step": 39500 + }, + { + "epoch": 0.3428876485447175, + "grad_norm": 0.11865234375, + "learning_rate": 0.0013639250336460185, + "loss": 0.0859, + "step": 39501 + }, + { + "epoch": 0.3428963290249217, + "grad_norm": 0.15234375, + "learning_rate": 0.0013638965795281423, + "loss": 0.1123, + "step": 39502 + }, + { + "epoch": 0.3429050095051258, + "grad_norm": 0.1591796875, + "learning_rate": 0.0013638681251216729, + "loss": 0.1147, + "step": 39503 + }, + { + "epoch": 0.34291368998533, + "grad_norm": 0.08203125, + "learning_rate": 0.0013638396704266414, + "loss": 0.085, + "step": 39504 + }, + { + "epoch": 0.34292237046553414, + "grad_norm": 0.5625, + "learning_rate": 0.0013638112154430787, + "loss": 0.1172, + "step": 39505 + }, + { + "epoch": 0.34293105094573834, + "grad_norm": 0.142578125, + "learning_rate": 0.001363782760171016, + "loss": 0.1162, + "step": 39506 + }, + { + "epoch": 0.3429397314259425, + "grad_norm": 0.2041015625, + "learning_rate": 0.0013637543046104846, + "loss": 0.0825, + 
"step": 39507 + }, + { + "epoch": 0.34294841190614667, + "grad_norm": 0.1767578125, + "learning_rate": 0.0013637258487615154, + "loss": 0.0869, + "step": 39508 + }, + { + "epoch": 0.3429570923863508, + "grad_norm": 0.498046875, + "learning_rate": 0.0013636973926241397, + "loss": 0.1108, + "step": 39509 + }, + { + "epoch": 0.342965772866555, + "grad_norm": 0.10791015625, + "learning_rate": 0.001363668936198388, + "loss": 0.0884, + "step": 39510 + }, + { + "epoch": 0.34297445334675913, + "grad_norm": 0.341796875, + "learning_rate": 0.0013636404794842923, + "loss": 0.1123, + "step": 39511 + }, + { + "epoch": 0.3429831338269633, + "grad_norm": 0.609375, + "learning_rate": 0.001363612022481883, + "loss": 0.0859, + "step": 39512 + }, + { + "epoch": 0.34299181430716746, + "grad_norm": 0.3828125, + "learning_rate": 0.001363583565191192, + "loss": 0.123, + "step": 39513 + }, + { + "epoch": 0.34300049478737166, + "grad_norm": 0.279296875, + "learning_rate": 0.0013635551076122496, + "loss": 0.083, + "step": 39514 + }, + { + "epoch": 0.3430091752675758, + "grad_norm": 0.1142578125, + "learning_rate": 0.0013635266497450875, + "loss": 0.0879, + "step": 39515 + }, + { + "epoch": 0.34301785574778, + "grad_norm": 0.234375, + "learning_rate": 0.0013634981915897366, + "loss": 0.083, + "step": 39516 + }, + { + "epoch": 0.3430265362279841, + "grad_norm": 0.1416015625, + "learning_rate": 0.0013634697331462282, + "loss": 0.1602, + "step": 39517 + }, + { + "epoch": 0.3430352167081883, + "grad_norm": 0.28515625, + "learning_rate": 0.0013634412744145932, + "loss": 0.1021, + "step": 39518 + }, + { + "epoch": 0.34304389718839245, + "grad_norm": 0.59375, + "learning_rate": 0.0013634128153948626, + "loss": 0.0986, + "step": 39519 + }, + { + "epoch": 0.34305257766859665, + "grad_norm": 1.1953125, + "learning_rate": 0.0013633843560870676, + "loss": 0.1123, + "step": 39520 + }, + { + "epoch": 0.3430612581488008, + "grad_norm": 0.177734375, + "learning_rate": 0.0013633558964912397, + "loss": 0.1357, + "step": 39521 + }, + { + "epoch": 0.343069938629005, + "grad_norm": 0.2138671875, + "learning_rate": 0.0013633274366074096, + "loss": 0.1631, + "step": 39522 + }, + { + "epoch": 0.3430786191092091, + "grad_norm": 0.2158203125, + "learning_rate": 0.0013632989764356088, + "loss": 0.1084, + "step": 39523 + }, + { + "epoch": 0.3430872995894133, + "grad_norm": 0.29296875, + "learning_rate": 0.001363270515975868, + "loss": 0.1133, + "step": 39524 + }, + { + "epoch": 0.34309598006961745, + "grad_norm": 0.56640625, + "learning_rate": 0.0013632420552282182, + "loss": 0.104, + "step": 39525 + }, + { + "epoch": 0.34310466054982164, + "grad_norm": 0.212890625, + "learning_rate": 0.0013632135941926914, + "loss": 0.1113, + "step": 39526 + }, + { + "epoch": 0.3431133410300258, + "grad_norm": 0.330078125, + "learning_rate": 0.0013631851328693178, + "loss": 0.0918, + "step": 39527 + }, + { + "epoch": 0.34312202151022997, + "grad_norm": 0.138671875, + "learning_rate": 0.0013631566712581289, + "loss": 0.106, + "step": 39528 + }, + { + "epoch": 0.3431307019904341, + "grad_norm": 0.76953125, + "learning_rate": 0.001363128209359156, + "loss": 0.1436, + "step": 39529 + }, + { + "epoch": 0.3431393824706383, + "grad_norm": 0.259765625, + "learning_rate": 0.00136309974717243, + "loss": 0.1191, + "step": 39530 + }, + { + "epoch": 0.34314806295084244, + "grad_norm": 0.1259765625, + "learning_rate": 0.001363071284697982, + "loss": 0.1299, + "step": 39531 + }, + { + "epoch": 0.34315674343104663, + "grad_norm": 0.2412109375, + "learning_rate": 
0.0013630428219358435, + "loss": 0.1221, + "step": 39532 + }, + { + "epoch": 0.34316542391125077, + "grad_norm": 0.11328125, + "learning_rate": 0.0013630143588860449, + "loss": 0.0869, + "step": 39533 + }, + { + "epoch": 0.34317410439145496, + "grad_norm": 0.09326171875, + "learning_rate": 0.0013629858955486178, + "loss": 0.0762, + "step": 39534 + }, + { + "epoch": 0.3431827848716591, + "grad_norm": 0.1484375, + "learning_rate": 0.0013629574319235936, + "loss": 0.1504, + "step": 39535 + }, + { + "epoch": 0.3431914653518633, + "grad_norm": 0.41015625, + "learning_rate": 0.0013629289680110027, + "loss": 0.1133, + "step": 39536 + }, + { + "epoch": 0.3432001458320674, + "grad_norm": 0.2080078125, + "learning_rate": 0.001362900503810877, + "loss": 0.084, + "step": 39537 + }, + { + "epoch": 0.3432088263122716, + "grad_norm": 0.58984375, + "learning_rate": 0.0013628720393232471, + "loss": 0.0903, + "step": 39538 + }, + { + "epoch": 0.34321750679247576, + "grad_norm": 0.240234375, + "learning_rate": 0.0013628435745481444, + "loss": 0.1216, + "step": 39539 + }, + { + "epoch": 0.34322618727267995, + "grad_norm": 0.28125, + "learning_rate": 0.0013628151094856, + "loss": 0.0947, + "step": 39540 + }, + { + "epoch": 0.3432348677528841, + "grad_norm": 0.185546875, + "learning_rate": 0.0013627866441356443, + "loss": 0.0767, + "step": 39541 + }, + { + "epoch": 0.3432435482330883, + "grad_norm": 0.4609375, + "learning_rate": 0.0013627581784983094, + "loss": 0.1016, + "step": 39542 + }, + { + "epoch": 0.3432522287132924, + "grad_norm": 0.359375, + "learning_rate": 0.0013627297125736264, + "loss": 0.084, + "step": 39543 + }, + { + "epoch": 0.3432609091934966, + "grad_norm": 0.162109375, + "learning_rate": 0.0013627012463616262, + "loss": 0.1094, + "step": 39544 + }, + { + "epoch": 0.34326958967370075, + "grad_norm": 0.341796875, + "learning_rate": 0.0013626727798623395, + "loss": 0.1387, + "step": 39545 + }, + { + "epoch": 0.34327827015390494, + "grad_norm": 0.298828125, + "learning_rate": 0.0013626443130757978, + "loss": 0.1406, + "step": 39546 + }, + { + "epoch": 0.3432869506341091, + "grad_norm": 0.51953125, + "learning_rate": 0.0013626158460020323, + "loss": 0.1406, + "step": 39547 + }, + { + "epoch": 0.34329563111431327, + "grad_norm": 0.1240234375, + "learning_rate": 0.0013625873786410744, + "loss": 0.1064, + "step": 39548 + }, + { + "epoch": 0.3433043115945174, + "grad_norm": 0.08837890625, + "learning_rate": 0.0013625589109929543, + "loss": 0.1035, + "step": 39549 + }, + { + "epoch": 0.3433129920747216, + "grad_norm": 0.26953125, + "learning_rate": 0.0013625304430577038, + "loss": 0.1406, + "step": 39550 + }, + { + "epoch": 0.34332167255492574, + "grad_norm": 0.412109375, + "learning_rate": 0.0013625019748353545, + "loss": 0.165, + "step": 39551 + }, + { + "epoch": 0.3433303530351299, + "grad_norm": 0.73046875, + "learning_rate": 0.0013624735063259366, + "loss": 0.103, + "step": 39552 + }, + { + "epoch": 0.34333903351533407, + "grad_norm": 0.353515625, + "learning_rate": 0.0013624450375294816, + "loss": 0.0986, + "step": 39553 + }, + { + "epoch": 0.3433477139955382, + "grad_norm": 0.84375, + "learning_rate": 0.0013624165684460207, + "loss": 0.0786, + "step": 39554 + }, + { + "epoch": 0.3433563944757424, + "grad_norm": 0.2021484375, + "learning_rate": 0.001362388099075585, + "loss": 0.0845, + "step": 39555 + }, + { + "epoch": 0.34336507495594654, + "grad_norm": 0.326171875, + "learning_rate": 0.0013623596294182058, + "loss": 0.1211, + "step": 39556 + }, + { + "epoch": 0.34337375543615073, + 
"grad_norm": 0.15234375, + "learning_rate": 0.001362331159473914, + "loss": 0.1689, + "step": 39557 + }, + { + "epoch": 0.34338243591635487, + "grad_norm": 0.1259765625, + "learning_rate": 0.0013623026892427403, + "loss": 0.0986, + "step": 39558 + }, + { + "epoch": 0.34339111639655906, + "grad_norm": 0.337890625, + "learning_rate": 0.0013622742187247168, + "loss": 0.0923, + "step": 39559 + }, + { + "epoch": 0.3433997968767632, + "grad_norm": 0.90625, + "learning_rate": 0.0013622457479198738, + "loss": 0.1123, + "step": 39560 + }, + { + "epoch": 0.3434084773569674, + "grad_norm": 0.4296875, + "learning_rate": 0.0013622172768282433, + "loss": 0.0972, + "step": 39561 + }, + { + "epoch": 0.3434171578371715, + "grad_norm": 0.201171875, + "learning_rate": 0.0013621888054498555, + "loss": 0.1465, + "step": 39562 + }, + { + "epoch": 0.3434258383173757, + "grad_norm": 0.11279296875, + "learning_rate": 0.0013621603337847419, + "loss": 0.1143, + "step": 39563 + }, + { + "epoch": 0.34343451879757986, + "grad_norm": 0.1513671875, + "learning_rate": 0.0013621318618329342, + "loss": 0.0796, + "step": 39564 + }, + { + "epoch": 0.34344319927778405, + "grad_norm": 0.515625, + "learning_rate": 0.0013621033895944627, + "loss": 0.0938, + "step": 39565 + }, + { + "epoch": 0.3434518797579882, + "grad_norm": 0.341796875, + "learning_rate": 0.0013620749170693588, + "loss": 0.0884, + "step": 39566 + }, + { + "epoch": 0.3434605602381924, + "grad_norm": 0.251953125, + "learning_rate": 0.0013620464442576537, + "loss": 0.0957, + "step": 39567 + }, + { + "epoch": 0.3434692407183965, + "grad_norm": 0.1083984375, + "learning_rate": 0.0013620179711593787, + "loss": 0.1055, + "step": 39568 + }, + { + "epoch": 0.3434779211986007, + "grad_norm": 0.11572265625, + "learning_rate": 0.0013619894977745647, + "loss": 0.0859, + "step": 39569 + }, + { + "epoch": 0.34348660167880485, + "grad_norm": 0.078125, + "learning_rate": 0.0013619610241032431, + "loss": 0.0859, + "step": 39570 + }, + { + "epoch": 0.34349528215900904, + "grad_norm": 0.474609375, + "learning_rate": 0.0013619325501454446, + "loss": 0.0688, + "step": 39571 + }, + { + "epoch": 0.3435039626392132, + "grad_norm": 0.1064453125, + "learning_rate": 0.0013619040759012006, + "loss": 0.1318, + "step": 39572 + }, + { + "epoch": 0.34351264311941737, + "grad_norm": 0.25, + "learning_rate": 0.0013618756013705424, + "loss": 0.1221, + "step": 39573 + }, + { + "epoch": 0.3435213235996215, + "grad_norm": 0.322265625, + "learning_rate": 0.0013618471265535012, + "loss": 0.0864, + "step": 39574 + }, + { + "epoch": 0.3435300040798257, + "grad_norm": 0.267578125, + "learning_rate": 0.0013618186514501077, + "loss": 0.1113, + "step": 39575 + }, + { + "epoch": 0.34353868456002984, + "grad_norm": 0.58984375, + "learning_rate": 0.001361790176060393, + "loss": 0.1328, + "step": 39576 + }, + { + "epoch": 0.34354736504023403, + "grad_norm": 0.236328125, + "learning_rate": 0.001361761700384389, + "loss": 0.0894, + "step": 39577 + }, + { + "epoch": 0.34355604552043817, + "grad_norm": 0.049072265625, + "learning_rate": 0.0013617332244221256, + "loss": 0.0537, + "step": 39578 + }, + { + "epoch": 0.34356472600064236, + "grad_norm": 0.259765625, + "learning_rate": 0.0013617047481736353, + "loss": 0.0918, + "step": 39579 + }, + { + "epoch": 0.3435734064808465, + "grad_norm": 0.1201171875, + "learning_rate": 0.0013616762716389483, + "loss": 0.0781, + "step": 39580 + }, + { + "epoch": 0.3435820869610507, + "grad_norm": 0.171875, + "learning_rate": 0.0013616477948180964, + "loss": 0.0933, + "step": 39581 
+ }, + { + "epoch": 0.34359076744125483, + "grad_norm": 0.421875, + "learning_rate": 0.0013616193177111104, + "loss": 0.127, + "step": 39582 + }, + { + "epoch": 0.343599447921459, + "grad_norm": 2.015625, + "learning_rate": 0.001361590840318021, + "loss": 0.1982, + "step": 39583 + }, + { + "epoch": 0.34360812840166316, + "grad_norm": 0.166015625, + "learning_rate": 0.00136156236263886, + "loss": 0.0952, + "step": 39584 + }, + { + "epoch": 0.34361680888186735, + "grad_norm": 0.83984375, + "learning_rate": 0.0013615338846736584, + "loss": 0.1553, + "step": 39585 + }, + { + "epoch": 0.3436254893620715, + "grad_norm": 0.0634765625, + "learning_rate": 0.0013615054064224474, + "loss": 0.0884, + "step": 39586 + }, + { + "epoch": 0.3436341698422757, + "grad_norm": 0.65234375, + "learning_rate": 0.001361476927885258, + "loss": 0.126, + "step": 39587 + }, + { + "epoch": 0.3436428503224798, + "grad_norm": 0.71484375, + "learning_rate": 0.0013614484490621207, + "loss": 0.1348, + "step": 39588 + }, + { + "epoch": 0.343651530802684, + "grad_norm": 0.07763671875, + "learning_rate": 0.0013614199699530677, + "loss": 0.0762, + "step": 39589 + }, + { + "epoch": 0.34366021128288815, + "grad_norm": 0.1494140625, + "learning_rate": 0.0013613914905581303, + "loss": 0.0908, + "step": 39590 + }, + { + "epoch": 0.34366889176309234, + "grad_norm": 0.1845703125, + "learning_rate": 0.0013613630108773387, + "loss": 0.1025, + "step": 39591 + }, + { + "epoch": 0.3436775722432965, + "grad_norm": 0.09130859375, + "learning_rate": 0.0013613345309107243, + "loss": 0.1035, + "step": 39592 + }, + { + "epoch": 0.3436862527235007, + "grad_norm": 0.1025390625, + "learning_rate": 0.0013613060506583183, + "loss": 0.0884, + "step": 39593 + }, + { + "epoch": 0.3436949332037048, + "grad_norm": 0.138671875, + "learning_rate": 0.0013612775701201522, + "loss": 0.0981, + "step": 39594 + }, + { + "epoch": 0.343703613683909, + "grad_norm": 0.16796875, + "learning_rate": 0.0013612490892962567, + "loss": 0.0801, + "step": 39595 + }, + { + "epoch": 0.34371229416411314, + "grad_norm": 0.5703125, + "learning_rate": 0.0013612206081866636, + "loss": 0.1416, + "step": 39596 + }, + { + "epoch": 0.34372097464431733, + "grad_norm": 0.1015625, + "learning_rate": 0.0013611921267914031, + "loss": 0.0986, + "step": 39597 + }, + { + "epoch": 0.34372965512452147, + "grad_norm": 0.5, + "learning_rate": 0.0013611636451105068, + "loss": 0.1348, + "step": 39598 + }, + { + "epoch": 0.34373833560472566, + "grad_norm": 0.55078125, + "learning_rate": 0.001361135163144006, + "loss": 0.1348, + "step": 39599 + }, + { + "epoch": 0.3437470160849298, + "grad_norm": 0.431640625, + "learning_rate": 0.0013611066808919318, + "loss": 0.1465, + "step": 39600 + }, + { + "epoch": 0.343755696565134, + "grad_norm": 0.123046875, + "learning_rate": 0.0013610781983543153, + "loss": 0.0952, + "step": 39601 + }, + { + "epoch": 0.34376437704533813, + "grad_norm": 0.06884765625, + "learning_rate": 0.0013610497155311873, + "loss": 0.0718, + "step": 39602 + }, + { + "epoch": 0.3437730575255423, + "grad_norm": 0.322265625, + "learning_rate": 0.0013610212324225792, + "loss": 0.1387, + "step": 39603 + }, + { + "epoch": 0.34378173800574646, + "grad_norm": 0.423828125, + "learning_rate": 0.0013609927490285223, + "loss": 0.1436, + "step": 39604 + }, + { + "epoch": 0.34379041848595066, + "grad_norm": 0.45703125, + "learning_rate": 0.001360964265349048, + "loss": 0.1504, + "step": 39605 + }, + { + "epoch": 0.3437990989661548, + "grad_norm": 0.3125, + "learning_rate": 0.0013609357813841865, + 
"loss": 0.1475, + "step": 39606 + }, + { + "epoch": 0.343807779446359, + "grad_norm": 0.82421875, + "learning_rate": 0.00136090729713397, + "loss": 0.1191, + "step": 39607 + }, + { + "epoch": 0.3438164599265631, + "grad_norm": 0.7421875, + "learning_rate": 0.0013608788125984291, + "loss": 0.1235, + "step": 39608 + }, + { + "epoch": 0.3438251404067673, + "grad_norm": 0.271484375, + "learning_rate": 0.001360850327777595, + "loss": 0.0742, + "step": 39609 + }, + { + "epoch": 0.34383382088697145, + "grad_norm": 0.1884765625, + "learning_rate": 0.0013608218426714989, + "loss": 0.1147, + "step": 39610 + }, + { + "epoch": 0.34384250136717565, + "grad_norm": 0.07470703125, + "learning_rate": 0.001360793357280172, + "loss": 0.0688, + "step": 39611 + }, + { + "epoch": 0.3438511818473798, + "grad_norm": 0.1357421875, + "learning_rate": 0.0013607648716036454, + "loss": 0.123, + "step": 39612 + }, + { + "epoch": 0.343859862327584, + "grad_norm": 0.134765625, + "learning_rate": 0.0013607363856419504, + "loss": 0.0815, + "step": 39613 + }, + { + "epoch": 0.3438685428077881, + "grad_norm": 0.140625, + "learning_rate": 0.0013607078993951177, + "loss": 0.1006, + "step": 39614 + }, + { + "epoch": 0.3438772232879923, + "grad_norm": 0.376953125, + "learning_rate": 0.0013606794128631788, + "loss": 0.1123, + "step": 39615 + }, + { + "epoch": 0.34388590376819644, + "grad_norm": 0.48046875, + "learning_rate": 0.001360650926046165, + "loss": 0.1562, + "step": 39616 + }, + { + "epoch": 0.34389458424840064, + "grad_norm": 0.400390625, + "learning_rate": 0.001360622438944107, + "loss": 0.0947, + "step": 39617 + }, + { + "epoch": 0.3439032647286048, + "grad_norm": 0.412109375, + "learning_rate": 0.0013605939515570365, + "loss": 0.0806, + "step": 39618 + }, + { + "epoch": 0.34391194520880897, + "grad_norm": 0.1455078125, + "learning_rate": 0.0013605654638849842, + "loss": 0.1123, + "step": 39619 + }, + { + "epoch": 0.3439206256890131, + "grad_norm": 0.25, + "learning_rate": 0.0013605369759279815, + "loss": 0.0962, + "step": 39620 + }, + { + "epoch": 0.3439293061692173, + "grad_norm": 0.24609375, + "learning_rate": 0.0013605084876860596, + "loss": 0.0957, + "step": 39621 + }, + { + "epoch": 0.34393798664942143, + "grad_norm": 0.5390625, + "learning_rate": 0.0013604799991592496, + "loss": 0.104, + "step": 39622 + }, + { + "epoch": 0.3439466671296256, + "grad_norm": 0.54296875, + "learning_rate": 0.0013604515103475823, + "loss": 0.1064, + "step": 39623 + }, + { + "epoch": 0.34395534760982976, + "grad_norm": 0.369140625, + "learning_rate": 0.0013604230212510891, + "loss": 0.0947, + "step": 39624 + }, + { + "epoch": 0.34396402809003396, + "grad_norm": 0.28515625, + "learning_rate": 0.0013603945318698015, + "loss": 0.126, + "step": 39625 + }, + { + "epoch": 0.3439727085702381, + "grad_norm": 0.373046875, + "learning_rate": 0.00136036604220375, + "loss": 0.1445, + "step": 39626 + }, + { + "epoch": 0.3439813890504423, + "grad_norm": 0.271484375, + "learning_rate": 0.001360337552252966, + "loss": 0.1191, + "step": 39627 + }, + { + "epoch": 0.3439900695306464, + "grad_norm": 0.255859375, + "learning_rate": 0.001360309062017481, + "loss": 0.1035, + "step": 39628 + }, + { + "epoch": 0.3439987500108506, + "grad_norm": 0.1298828125, + "learning_rate": 0.0013602805714973264, + "loss": 0.0869, + "step": 39629 + }, + { + "epoch": 0.34400743049105476, + "grad_norm": 0.255859375, + "learning_rate": 0.0013602520806925325, + "loss": 0.084, + "step": 39630 + }, + { + "epoch": 0.34401611097125895, + "grad_norm": 0.451171875, + 
"learning_rate": 0.0013602235896031309, + "loss": 0.0859, + "step": 39631 + }, + { + "epoch": 0.3440247914514631, + "grad_norm": 0.1337890625, + "learning_rate": 0.0013601950982291525, + "loss": 0.0752, + "step": 39632 + }, + { + "epoch": 0.3440334719316673, + "grad_norm": 0.76171875, + "learning_rate": 0.0013601666065706287, + "loss": 0.1387, + "step": 39633 + }, + { + "epoch": 0.3440421524118714, + "grad_norm": 0.380859375, + "learning_rate": 0.0013601381146275907, + "loss": 0.0732, + "step": 39634 + }, + { + "epoch": 0.3440508328920756, + "grad_norm": 0.10693359375, + "learning_rate": 0.0013601096224000694, + "loss": 0.1108, + "step": 39635 + }, + { + "epoch": 0.34405951337227975, + "grad_norm": 0.76953125, + "learning_rate": 0.0013600811298880961, + "loss": 0.0977, + "step": 39636 + }, + { + "epoch": 0.34406819385248394, + "grad_norm": 0.4609375, + "learning_rate": 0.0013600526370917021, + "loss": 0.1602, + "step": 39637 + }, + { + "epoch": 0.3440768743326881, + "grad_norm": 0.115234375, + "learning_rate": 0.0013600241440109185, + "loss": 0.1016, + "step": 39638 + }, + { + "epoch": 0.34408555481289227, + "grad_norm": 0.2470703125, + "learning_rate": 0.0013599956506457763, + "loss": 0.1025, + "step": 39639 + }, + { + "epoch": 0.3440942352930964, + "grad_norm": 0.326171875, + "learning_rate": 0.001359967156996307, + "loss": 0.0879, + "step": 39640 + }, + { + "epoch": 0.3441029157733006, + "grad_norm": 0.79296875, + "learning_rate": 0.0013599386630625413, + "loss": 0.1157, + "step": 39641 + }, + { + "epoch": 0.34411159625350474, + "grad_norm": 0.34375, + "learning_rate": 0.0013599101688445106, + "loss": 0.1045, + "step": 39642 + }, + { + "epoch": 0.34412027673370893, + "grad_norm": 0.09765625, + "learning_rate": 0.0013598816743422462, + "loss": 0.1162, + "step": 39643 + }, + { + "epoch": 0.34412895721391307, + "grad_norm": 0.15234375, + "learning_rate": 0.001359853179555779, + "loss": 0.1128, + "step": 39644 + }, + { + "epoch": 0.34413763769411726, + "grad_norm": 0.42578125, + "learning_rate": 0.0013598246844851402, + "loss": 0.0889, + "step": 39645 + }, + { + "epoch": 0.3441463181743214, + "grad_norm": 0.357421875, + "learning_rate": 0.0013597961891303608, + "loss": 0.1523, + "step": 39646 + }, + { + "epoch": 0.3441549986545256, + "grad_norm": 0.1552734375, + "learning_rate": 0.0013597676934914726, + "loss": 0.0981, + "step": 39647 + }, + { + "epoch": 0.3441636791347297, + "grad_norm": 0.09521484375, + "learning_rate": 0.001359739197568506, + "loss": 0.0996, + "step": 39648 + }, + { + "epoch": 0.3441723596149339, + "grad_norm": 0.1767578125, + "learning_rate": 0.0013597107013614928, + "loss": 0.1426, + "step": 39649 + }, + { + "epoch": 0.34418104009513806, + "grad_norm": 0.1689453125, + "learning_rate": 0.001359682204870464, + "loss": 0.1094, + "step": 39650 + }, + { + "epoch": 0.34418972057534225, + "grad_norm": 0.85546875, + "learning_rate": 0.0013596537080954505, + "loss": 0.126, + "step": 39651 + }, + { + "epoch": 0.3441984010555464, + "grad_norm": 0.5703125, + "learning_rate": 0.0013596252110364836, + "loss": 0.1045, + "step": 39652 + }, + { + "epoch": 0.3442070815357506, + "grad_norm": 0.11572265625, + "learning_rate": 0.0013595967136935945, + "loss": 0.1079, + "step": 39653 + }, + { + "epoch": 0.3442157620159547, + "grad_norm": 0.671875, + "learning_rate": 0.0013595682160668142, + "loss": 0.1006, + "step": 39654 + }, + { + "epoch": 0.3442244424961589, + "grad_norm": 0.08984375, + "learning_rate": 0.0013595397181561737, + "loss": 0.0776, + "step": 39655 + }, + { + "epoch": 
0.34423312297636305, + "grad_norm": 0.37890625, + "learning_rate": 0.0013595112199617052, + "loss": 0.1201, + "step": 39656 + }, + { + "epoch": 0.34424180345656724, + "grad_norm": 0.1552734375, + "learning_rate": 0.0013594827214834387, + "loss": 0.0845, + "step": 39657 + }, + { + "epoch": 0.3442504839367714, + "grad_norm": 0.162109375, + "learning_rate": 0.0013594542227214057, + "loss": 0.1318, + "step": 39658 + }, + { + "epoch": 0.34425916441697557, + "grad_norm": 0.326171875, + "learning_rate": 0.0013594257236756375, + "loss": 0.1133, + "step": 39659 + }, + { + "epoch": 0.3442678448971797, + "grad_norm": 0.09814453125, + "learning_rate": 0.0013593972243461656, + "loss": 0.0884, + "step": 39660 + }, + { + "epoch": 0.3442765253773839, + "grad_norm": 0.06982421875, + "learning_rate": 0.0013593687247330207, + "loss": 0.0771, + "step": 39661 + }, + { + "epoch": 0.34428520585758804, + "grad_norm": 1.1484375, + "learning_rate": 0.0013593402248362337, + "loss": 0.1953, + "step": 39662 + }, + { + "epoch": 0.34429388633779223, + "grad_norm": 0.470703125, + "learning_rate": 0.0013593117246558364, + "loss": 0.1001, + "step": 39663 + }, + { + "epoch": 0.34430256681799637, + "grad_norm": 1.515625, + "learning_rate": 0.0013592832241918595, + "loss": 0.1113, + "step": 39664 + }, + { + "epoch": 0.34431124729820056, + "grad_norm": 0.234375, + "learning_rate": 0.0013592547234443344, + "loss": 0.1035, + "step": 39665 + }, + { + "epoch": 0.3443199277784047, + "grad_norm": 0.13671875, + "learning_rate": 0.0013592262224132923, + "loss": 0.0986, + "step": 39666 + }, + { + "epoch": 0.3443286082586089, + "grad_norm": 0.12060546875, + "learning_rate": 0.001359197721098764, + "loss": 0.0918, + "step": 39667 + }, + { + "epoch": 0.34433728873881303, + "grad_norm": 0.2412109375, + "learning_rate": 0.0013591692195007816, + "loss": 0.0693, + "step": 39668 + }, + { + "epoch": 0.3443459692190172, + "grad_norm": 0.515625, + "learning_rate": 0.0013591407176193754, + "loss": 0.0952, + "step": 39669 + }, + { + "epoch": 0.34435464969922136, + "grad_norm": 0.94921875, + "learning_rate": 0.0013591122154545767, + "loss": 0.1094, + "step": 39670 + }, + { + "epoch": 0.34436333017942555, + "grad_norm": 0.453125, + "learning_rate": 0.0013590837130064166, + "loss": 0.1279, + "step": 39671 + }, + { + "epoch": 0.3443720106596297, + "grad_norm": 0.357421875, + "learning_rate": 0.0013590552102749267, + "loss": 0.1016, + "step": 39672 + }, + { + "epoch": 0.3443806911398339, + "grad_norm": 0.388671875, + "learning_rate": 0.001359026707260138, + "loss": 0.0884, + "step": 39673 + }, + { + "epoch": 0.344389371620038, + "grad_norm": 0.7265625, + "learning_rate": 0.0013589982039620813, + "loss": 0.0942, + "step": 39674 + }, + { + "epoch": 0.34439805210024216, + "grad_norm": 0.52734375, + "learning_rate": 0.001358969700380788, + "loss": 0.1245, + "step": 39675 + }, + { + "epoch": 0.34440673258044635, + "grad_norm": 0.6015625, + "learning_rate": 0.0013589411965162896, + "loss": 0.0728, + "step": 39676 + }, + { + "epoch": 0.3444154130606505, + "grad_norm": 0.41015625, + "learning_rate": 0.001358912692368617, + "loss": 0.1084, + "step": 39677 + }, + { + "epoch": 0.3444240935408547, + "grad_norm": 0.8984375, + "learning_rate": 0.0013588841879378013, + "loss": 0.1729, + "step": 39678 + }, + { + "epoch": 0.3444327740210588, + "grad_norm": 0.34765625, + "learning_rate": 0.001358855683223874, + "loss": 0.0972, + "step": 39679 + }, + { + "epoch": 0.344441454501263, + "grad_norm": 0.2412109375, + "learning_rate": 0.0013588271782268655, + "loss": 0.1494, + 
"step": 39680 + }, + { + "epoch": 0.34445013498146715, + "grad_norm": 0.11279296875, + "learning_rate": 0.0013587986729468077, + "loss": 0.0947, + "step": 39681 + }, + { + "epoch": 0.34445881546167134, + "grad_norm": 0.11328125, + "learning_rate": 0.0013587701673837317, + "loss": 0.1338, + "step": 39682 + }, + { + "epoch": 0.3444674959418755, + "grad_norm": 0.185546875, + "learning_rate": 0.0013587416615376683, + "loss": 0.0869, + "step": 39683 + }, + { + "epoch": 0.34447617642207967, + "grad_norm": 0.1669921875, + "learning_rate": 0.001358713155408649, + "loss": 0.0825, + "step": 39684 + }, + { + "epoch": 0.3444848569022838, + "grad_norm": 0.1025390625, + "learning_rate": 0.0013586846489967049, + "loss": 0.1133, + "step": 39685 + }, + { + "epoch": 0.344493537382488, + "grad_norm": 0.283203125, + "learning_rate": 0.001358656142301867, + "loss": 0.1279, + "step": 39686 + }, + { + "epoch": 0.34450221786269214, + "grad_norm": 0.8828125, + "learning_rate": 0.0013586276353241668, + "loss": 0.1543, + "step": 39687 + }, + { + "epoch": 0.34451089834289633, + "grad_norm": 0.126953125, + "learning_rate": 0.0013585991280636353, + "loss": 0.1045, + "step": 39688 + }, + { + "epoch": 0.34451957882310047, + "grad_norm": 0.11767578125, + "learning_rate": 0.0013585706205203037, + "loss": 0.127, + "step": 39689 + }, + { + "epoch": 0.34452825930330466, + "grad_norm": 0.193359375, + "learning_rate": 0.001358542112694203, + "loss": 0.0527, + "step": 39690 + }, + { + "epoch": 0.3445369397835088, + "grad_norm": 0.1162109375, + "learning_rate": 0.001358513604585365, + "loss": 0.1006, + "step": 39691 + }, + { + "epoch": 0.344545620263713, + "grad_norm": 0.193359375, + "learning_rate": 0.0013584850961938198, + "loss": 0.1025, + "step": 39692 + }, + { + "epoch": 0.34455430074391713, + "grad_norm": 0.087890625, + "learning_rate": 0.0013584565875195995, + "loss": 0.0791, + "step": 39693 + }, + { + "epoch": 0.3445629812241213, + "grad_norm": 0.2138671875, + "learning_rate": 0.0013584280785627347, + "loss": 0.0732, + "step": 39694 + }, + { + "epoch": 0.34457166170432546, + "grad_norm": 0.5234375, + "learning_rate": 0.001358399569323257, + "loss": 0.1191, + "step": 39695 + }, + { + "epoch": 0.34458034218452965, + "grad_norm": 0.2216796875, + "learning_rate": 0.0013583710598011976, + "loss": 0.0967, + "step": 39696 + }, + { + "epoch": 0.3445890226647338, + "grad_norm": 0.55078125, + "learning_rate": 0.0013583425499965873, + "loss": 0.1357, + "step": 39697 + }, + { + "epoch": 0.344597703144938, + "grad_norm": 0.796875, + "learning_rate": 0.0013583140399094575, + "loss": 0.1221, + "step": 39698 + }, + { + "epoch": 0.3446063836251421, + "grad_norm": 0.111328125, + "learning_rate": 0.0013582855295398396, + "loss": 0.0967, + "step": 39699 + }, + { + "epoch": 0.3446150641053463, + "grad_norm": 0.62890625, + "learning_rate": 0.0013582570188877643, + "loss": 0.1069, + "step": 39700 + }, + { + "epoch": 0.34462374458555045, + "grad_norm": 0.267578125, + "learning_rate": 0.0013582285079532627, + "loss": 0.1094, + "step": 39701 + }, + { + "epoch": 0.34463242506575464, + "grad_norm": 0.1572265625, + "learning_rate": 0.0013581999967363667, + "loss": 0.0845, + "step": 39702 + }, + { + "epoch": 0.3446411055459588, + "grad_norm": 0.11962890625, + "learning_rate": 0.0013581714852371068, + "loss": 0.1289, + "step": 39703 + }, + { + "epoch": 0.344649786026163, + "grad_norm": 0.384765625, + "learning_rate": 0.0013581429734555147, + "loss": 0.1113, + "step": 39704 + }, + { + "epoch": 0.3446584665063671, + "grad_norm": 0.2890625, + 
"learning_rate": 0.001358114461391621, + "loss": 0.0928, + "step": 39705 + }, + { + "epoch": 0.3446671469865713, + "grad_norm": 0.55859375, + "learning_rate": 0.0013580859490454577, + "loss": 0.1074, + "step": 39706 + }, + { + "epoch": 0.34467582746677544, + "grad_norm": 0.1123046875, + "learning_rate": 0.001358057436417055, + "loss": 0.1216, + "step": 39707 + }, + { + "epoch": 0.34468450794697963, + "grad_norm": 0.0888671875, + "learning_rate": 0.001358028923506445, + "loss": 0.0845, + "step": 39708 + }, + { + "epoch": 0.34469318842718377, + "grad_norm": 0.455078125, + "learning_rate": 0.001358000410313658, + "loss": 0.1123, + "step": 39709 + }, + { + "epoch": 0.34470186890738796, + "grad_norm": 0.482421875, + "learning_rate": 0.0013579718968387258, + "loss": 0.0957, + "step": 39710 + }, + { + "epoch": 0.3447105493875921, + "grad_norm": 0.11328125, + "learning_rate": 0.0013579433830816794, + "loss": 0.1387, + "step": 39711 + }, + { + "epoch": 0.3447192298677963, + "grad_norm": 0.21875, + "learning_rate": 0.0013579148690425502, + "loss": 0.0947, + "step": 39712 + }, + { + "epoch": 0.34472791034800043, + "grad_norm": 0.1396484375, + "learning_rate": 0.0013578863547213686, + "loss": 0.1055, + "step": 39713 + }, + { + "epoch": 0.3447365908282046, + "grad_norm": 0.427734375, + "learning_rate": 0.001357857840118167, + "loss": 0.1128, + "step": 39714 + }, + { + "epoch": 0.34474527130840876, + "grad_norm": 0.2109375, + "learning_rate": 0.0013578293252329756, + "loss": 0.1211, + "step": 39715 + }, + { + "epoch": 0.34475395178861296, + "grad_norm": 0.1220703125, + "learning_rate": 0.0013578008100658262, + "loss": 0.1235, + "step": 39716 + }, + { + "epoch": 0.3447626322688171, + "grad_norm": 0.6640625, + "learning_rate": 0.0013577722946167495, + "loss": 0.1465, + "step": 39717 + }, + { + "epoch": 0.3447713127490213, + "grad_norm": 0.076171875, + "learning_rate": 0.0013577437788857766, + "loss": 0.0854, + "step": 39718 + }, + { + "epoch": 0.3447799932292254, + "grad_norm": 0.3828125, + "learning_rate": 0.0013577152628729394, + "loss": 0.0996, + "step": 39719 + }, + { + "epoch": 0.3447886737094296, + "grad_norm": 0.6796875, + "learning_rate": 0.0013576867465782687, + "loss": 0.0879, + "step": 39720 + }, + { + "epoch": 0.34479735418963375, + "grad_norm": 0.328125, + "learning_rate": 0.0013576582300017955, + "loss": 0.1191, + "step": 39721 + }, + { + "epoch": 0.34480603466983795, + "grad_norm": 0.126953125, + "learning_rate": 0.0013576297131435511, + "loss": 0.1191, + "step": 39722 + }, + { + "epoch": 0.3448147151500421, + "grad_norm": 0.1884765625, + "learning_rate": 0.0013576011960035668, + "loss": 0.0884, + "step": 39723 + }, + { + "epoch": 0.3448233956302463, + "grad_norm": 0.09228515625, + "learning_rate": 0.0013575726785818735, + "loss": 0.0791, + "step": 39724 + }, + { + "epoch": 0.3448320761104504, + "grad_norm": 0.4375, + "learning_rate": 0.0013575441608785026, + "loss": 0.1172, + "step": 39725 + }, + { + "epoch": 0.3448407565906546, + "grad_norm": 0.1240234375, + "learning_rate": 0.0013575156428934855, + "loss": 0.1426, + "step": 39726 + }, + { + "epoch": 0.34484943707085874, + "grad_norm": 0.1416015625, + "learning_rate": 0.001357487124626853, + "loss": 0.0952, + "step": 39727 + }, + { + "epoch": 0.34485811755106294, + "grad_norm": 0.173828125, + "learning_rate": 0.0013574586060786367, + "loss": 0.1172, + "step": 39728 + }, + { + "epoch": 0.3448667980312671, + "grad_norm": 0.059326171875, + "learning_rate": 0.0013574300872488676, + "loss": 0.052, + "step": 39729 + }, + { + "epoch": 
0.34487547851147127, + "grad_norm": 0.0732421875, + "learning_rate": 0.0013574015681375767, + "loss": 0.0718, + "step": 39730 + }, + { + "epoch": 0.3448841589916754, + "grad_norm": 0.08251953125, + "learning_rate": 0.001357373048744795, + "loss": 0.0918, + "step": 39731 + }, + { + "epoch": 0.3448928394718796, + "grad_norm": 0.091796875, + "learning_rate": 0.0013573445290705542, + "loss": 0.0918, + "step": 39732 + }, + { + "epoch": 0.34490151995208373, + "grad_norm": 0.1982421875, + "learning_rate": 0.001357316009114885, + "loss": 0.0918, + "step": 39733 + }, + { + "epoch": 0.34491020043228793, + "grad_norm": 0.51171875, + "learning_rate": 0.0013572874888778193, + "loss": 0.1016, + "step": 39734 + }, + { + "epoch": 0.34491888091249207, + "grad_norm": 0.294921875, + "learning_rate": 0.001357258968359388, + "loss": 0.0996, + "step": 39735 + }, + { + "epoch": 0.34492756139269626, + "grad_norm": 0.67578125, + "learning_rate": 0.0013572304475596216, + "loss": 0.1133, + "step": 39736 + }, + { + "epoch": 0.3449362418729004, + "grad_norm": 0.5546875, + "learning_rate": 0.0013572019264785522, + "loss": 0.082, + "step": 39737 + }, + { + "epoch": 0.3449449223531046, + "grad_norm": 0.19921875, + "learning_rate": 0.0013571734051162107, + "loss": 0.0884, + "step": 39738 + }, + { + "epoch": 0.3449536028333087, + "grad_norm": 0.1787109375, + "learning_rate": 0.0013571448834726284, + "loss": 0.1162, + "step": 39739 + }, + { + "epoch": 0.3449622833135129, + "grad_norm": 0.40234375, + "learning_rate": 0.0013571163615478357, + "loss": 0.1064, + "step": 39740 + }, + { + "epoch": 0.34497096379371706, + "grad_norm": 0.2470703125, + "learning_rate": 0.0013570878393418648, + "loss": 0.0967, + "step": 39741 + }, + { + "epoch": 0.34497964427392125, + "grad_norm": 0.4765625, + "learning_rate": 0.0013570593168547463, + "loss": 0.0947, + "step": 39742 + }, + { + "epoch": 0.3449883247541254, + "grad_norm": 0.11376953125, + "learning_rate": 0.0013570307940865118, + "loss": 0.084, + "step": 39743 + }, + { + "epoch": 0.3449970052343296, + "grad_norm": 0.2236328125, + "learning_rate": 0.001357002271037192, + "loss": 0.0825, + "step": 39744 + }, + { + "epoch": 0.3450056857145337, + "grad_norm": 0.466796875, + "learning_rate": 0.0013569737477068188, + "loss": 0.1045, + "step": 39745 + }, + { + "epoch": 0.3450143661947379, + "grad_norm": 0.5, + "learning_rate": 0.0013569452240954228, + "loss": 0.1021, + "step": 39746 + }, + { + "epoch": 0.34502304667494205, + "grad_norm": 0.1103515625, + "learning_rate": 0.0013569167002030352, + "loss": 0.0776, + "step": 39747 + }, + { + "epoch": 0.34503172715514624, + "grad_norm": 0.10595703125, + "learning_rate": 0.0013568881760296877, + "loss": 0.0967, + "step": 39748 + }, + { + "epoch": 0.3450404076353504, + "grad_norm": 0.318359375, + "learning_rate": 0.0013568596515754108, + "loss": 0.0996, + "step": 39749 + }, + { + "epoch": 0.34504908811555457, + "grad_norm": 0.193359375, + "learning_rate": 0.0013568311268402362, + "loss": 0.1152, + "step": 39750 + }, + { + "epoch": 0.3450577685957587, + "grad_norm": 0.09130859375, + "learning_rate": 0.0013568026018241947, + "loss": 0.0889, + "step": 39751 + }, + { + "epoch": 0.3450664490759629, + "grad_norm": 0.5234375, + "learning_rate": 0.001356774076527318, + "loss": 0.1484, + "step": 39752 + }, + { + "epoch": 0.34507512955616704, + "grad_norm": 0.427734375, + "learning_rate": 0.001356745550949637, + "loss": 0.123, + "step": 39753 + }, + { + "epoch": 0.34508381003637123, + "grad_norm": 0.1181640625, + "learning_rate": 0.001356717025091183, + "loss": 
0.0957, + "step": 39754 + }, + { + "epoch": 0.34509249051657537, + "grad_norm": 0.0849609375, + "learning_rate": 0.0013566884989519871, + "loss": 0.0771, + "step": 39755 + }, + { + "epoch": 0.34510117099677956, + "grad_norm": 0.1279296875, + "learning_rate": 0.0013566599725320806, + "loss": 0.1045, + "step": 39756 + }, + { + "epoch": 0.3451098514769837, + "grad_norm": 0.314453125, + "learning_rate": 0.0013566314458314942, + "loss": 0.1279, + "step": 39757 + }, + { + "epoch": 0.3451185319571879, + "grad_norm": 0.107421875, + "learning_rate": 0.0013566029188502597, + "loss": 0.1006, + "step": 39758 + }, + { + "epoch": 0.34512721243739203, + "grad_norm": 0.21875, + "learning_rate": 0.0013565743915884083, + "loss": 0.1084, + "step": 39759 + }, + { + "epoch": 0.3451358929175962, + "grad_norm": 0.181640625, + "learning_rate": 0.0013565458640459706, + "loss": 0.1543, + "step": 39760 + }, + { + "epoch": 0.34514457339780036, + "grad_norm": 0.390625, + "learning_rate": 0.0013565173362229785, + "loss": 0.1016, + "step": 39761 + }, + { + "epoch": 0.34515325387800455, + "grad_norm": 0.12109375, + "learning_rate": 0.001356488808119463, + "loss": 0.0815, + "step": 39762 + }, + { + "epoch": 0.3451619343582087, + "grad_norm": 0.55859375, + "learning_rate": 0.0013564602797354547, + "loss": 0.127, + "step": 39763 + }, + { + "epoch": 0.3451706148384129, + "grad_norm": 0.0966796875, + "learning_rate": 0.0013564317510709857, + "loss": 0.0981, + "step": 39764 + }, + { + "epoch": 0.345179295318617, + "grad_norm": 0.302734375, + "learning_rate": 0.0013564032221260872, + "loss": 0.0957, + "step": 39765 + }, + { + "epoch": 0.3451879757988212, + "grad_norm": 0.1611328125, + "learning_rate": 0.001356374692900789, + "loss": 0.0908, + "step": 39766 + }, + { + "epoch": 0.34519665627902535, + "grad_norm": 0.2099609375, + "learning_rate": 0.0013563461633951238, + "loss": 0.1064, + "step": 39767 + }, + { + "epoch": 0.34520533675922954, + "grad_norm": 0.2119140625, + "learning_rate": 0.0013563176336091225, + "loss": 0.0845, + "step": 39768 + }, + { + "epoch": 0.3452140172394337, + "grad_norm": 0.8046875, + "learning_rate": 0.001356289103542816, + "loss": 0.0894, + "step": 39769 + }, + { + "epoch": 0.34522269771963787, + "grad_norm": 0.1630859375, + "learning_rate": 0.0013562605731962355, + "loss": 0.1465, + "step": 39770 + }, + { + "epoch": 0.345231378199842, + "grad_norm": 0.27734375, + "learning_rate": 0.001356232042569412, + "loss": 0.0732, + "step": 39771 + }, + { + "epoch": 0.3452400586800462, + "grad_norm": 0.2216796875, + "learning_rate": 0.0013562035116623772, + "loss": 0.1035, + "step": 39772 + }, + { + "epoch": 0.34524873916025034, + "grad_norm": 0.1298828125, + "learning_rate": 0.0013561749804751621, + "loss": 0.1143, + "step": 39773 + }, + { + "epoch": 0.34525741964045453, + "grad_norm": 0.474609375, + "learning_rate": 0.001356146449007798, + "loss": 0.1504, + "step": 39774 + }, + { + "epoch": 0.34526610012065867, + "grad_norm": 0.62109375, + "learning_rate": 0.0013561179172603159, + "loss": 0.1328, + "step": 39775 + }, + { + "epoch": 0.34527478060086286, + "grad_norm": 0.48046875, + "learning_rate": 0.0013560893852327472, + "loss": 0.168, + "step": 39776 + }, + { + "epoch": 0.345283461081067, + "grad_norm": 0.48828125, + "learning_rate": 0.001356060852925123, + "loss": 0.1387, + "step": 39777 + }, + { + "epoch": 0.3452921415612712, + "grad_norm": 0.10595703125, + "learning_rate": 0.0013560323203374745, + "loss": 0.1074, + "step": 39778 + }, + { + "epoch": 0.34530082204147533, + "grad_norm": 0.75390625, + 
"learning_rate": 0.0013560037874698326, + "loss": 0.1348, + "step": 39779 + }, + { + "epoch": 0.3453095025216795, + "grad_norm": 0.408203125, + "learning_rate": 0.0013559752543222288, + "loss": 0.1406, + "step": 39780 + }, + { + "epoch": 0.34531818300188366, + "grad_norm": 0.625, + "learning_rate": 0.0013559467208946947, + "loss": 0.127, + "step": 39781 + }, + { + "epoch": 0.34532686348208785, + "grad_norm": 0.25390625, + "learning_rate": 0.0013559181871872607, + "loss": 0.0898, + "step": 39782 + }, + { + "epoch": 0.345335543962292, + "grad_norm": 0.1591796875, + "learning_rate": 0.0013558896531999586, + "loss": 0.1289, + "step": 39783 + }, + { + "epoch": 0.3453442244424962, + "grad_norm": 0.134765625, + "learning_rate": 0.0013558611189328196, + "loss": 0.0996, + "step": 39784 + }, + { + "epoch": 0.3453529049227003, + "grad_norm": 0.458984375, + "learning_rate": 0.0013558325843858747, + "loss": 0.0972, + "step": 39785 + }, + { + "epoch": 0.3453615854029045, + "grad_norm": 0.26953125, + "learning_rate": 0.0013558040495591553, + "loss": 0.0977, + "step": 39786 + }, + { + "epoch": 0.34537026588310865, + "grad_norm": 0.1455078125, + "learning_rate": 0.001355775514452692, + "loss": 0.1064, + "step": 39787 + }, + { + "epoch": 0.34537894636331284, + "grad_norm": 0.255859375, + "learning_rate": 0.0013557469790665163, + "loss": 0.0996, + "step": 39788 + }, + { + "epoch": 0.345387626843517, + "grad_norm": 0.2294921875, + "learning_rate": 0.00135571844340066, + "loss": 0.123, + "step": 39789 + }, + { + "epoch": 0.3453963073237212, + "grad_norm": 0.1171875, + "learning_rate": 0.0013556899074551535, + "loss": 0.1016, + "step": 39790 + }, + { + "epoch": 0.3454049878039253, + "grad_norm": 0.20703125, + "learning_rate": 0.0013556613712300287, + "loss": 0.0757, + "step": 39791 + }, + { + "epoch": 0.3454136682841295, + "grad_norm": 0.31640625, + "learning_rate": 0.0013556328347253163, + "loss": 0.0884, + "step": 39792 + }, + { + "epoch": 0.34542234876433364, + "grad_norm": 0.52734375, + "learning_rate": 0.0013556042979410478, + "loss": 0.0801, + "step": 39793 + }, + { + "epoch": 0.34543102924453783, + "grad_norm": 0.08984375, + "learning_rate": 0.0013555757608772541, + "loss": 0.0752, + "step": 39794 + }, + { + "epoch": 0.345439709724742, + "grad_norm": 0.6328125, + "learning_rate": 0.0013555472235339664, + "loss": 0.0981, + "step": 39795 + }, + { + "epoch": 0.34544839020494617, + "grad_norm": 0.255859375, + "learning_rate": 0.0013555186859112165, + "loss": 0.1191, + "step": 39796 + }, + { + "epoch": 0.3454570706851503, + "grad_norm": 0.30859375, + "learning_rate": 0.0013554901480090351, + "loss": 0.1504, + "step": 39797 + }, + { + "epoch": 0.34546575116535444, + "grad_norm": 0.2099609375, + "learning_rate": 0.0013554616098274534, + "loss": 0.1318, + "step": 39798 + }, + { + "epoch": 0.34547443164555863, + "grad_norm": 0.1669921875, + "learning_rate": 0.0013554330713665028, + "loss": 0.1006, + "step": 39799 + }, + { + "epoch": 0.34548311212576277, + "grad_norm": 0.11865234375, + "learning_rate": 0.0013554045326262146, + "loss": 0.1064, + "step": 39800 + }, + { + "epoch": 0.34549179260596696, + "grad_norm": 0.2373046875, + "learning_rate": 0.0013553759936066195, + "loss": 0.0967, + "step": 39801 + }, + { + "epoch": 0.3455004730861711, + "grad_norm": 0.240234375, + "learning_rate": 0.0013553474543077492, + "loss": 0.1367, + "step": 39802 + }, + { + "epoch": 0.3455091535663753, + "grad_norm": 0.158203125, + "learning_rate": 0.0013553189147296348, + "loss": 0.123, + "step": 39803 + }, + { + "epoch": 
0.34551783404657943, + "grad_norm": 0.2060546875, + "learning_rate": 0.0013552903748723073, + "loss": 0.0894, + "step": 39804 + }, + { + "epoch": 0.3455265145267836, + "grad_norm": 0.07666015625, + "learning_rate": 0.0013552618347357982, + "loss": 0.0879, + "step": 39805 + }, + { + "epoch": 0.34553519500698776, + "grad_norm": 0.5546875, + "learning_rate": 0.0013552332943201387, + "loss": 0.0801, + "step": 39806 + }, + { + "epoch": 0.34554387548719195, + "grad_norm": 0.55078125, + "learning_rate": 0.00135520475362536, + "loss": 0.1406, + "step": 39807 + }, + { + "epoch": 0.3455525559673961, + "grad_norm": 0.44140625, + "learning_rate": 0.0013551762126514928, + "loss": 0.085, + "step": 39808 + }, + { + "epoch": 0.3455612364476003, + "grad_norm": 0.1982421875, + "learning_rate": 0.001355147671398569, + "loss": 0.0884, + "step": 39809 + }, + { + "epoch": 0.3455699169278044, + "grad_norm": 0.146484375, + "learning_rate": 0.0013551191298666195, + "loss": 0.0977, + "step": 39810 + }, + { + "epoch": 0.3455785974080086, + "grad_norm": 0.1005859375, + "learning_rate": 0.0013550905880556753, + "loss": 0.1338, + "step": 39811 + }, + { + "epoch": 0.34558727788821275, + "grad_norm": 1.40625, + "learning_rate": 0.0013550620459657685, + "loss": 0.1147, + "step": 39812 + }, + { + "epoch": 0.34559595836841694, + "grad_norm": 0.6875, + "learning_rate": 0.001355033503596929, + "loss": 0.1348, + "step": 39813 + }, + { + "epoch": 0.3456046388486211, + "grad_norm": 0.06787109375, + "learning_rate": 0.0013550049609491893, + "loss": 0.0737, + "step": 39814 + }, + { + "epoch": 0.3456133193288253, + "grad_norm": 0.267578125, + "learning_rate": 0.0013549764180225796, + "loss": 0.125, + "step": 39815 + }, + { + "epoch": 0.3456219998090294, + "grad_norm": 0.2470703125, + "learning_rate": 0.0013549478748171317, + "loss": 0.0938, + "step": 39816 + }, + { + "epoch": 0.3456306802892336, + "grad_norm": 0.8359375, + "learning_rate": 0.0013549193313328765, + "loss": 0.1206, + "step": 39817 + }, + { + "epoch": 0.34563936076943774, + "grad_norm": 0.34765625, + "learning_rate": 0.0013548907875698454, + "loss": 0.1221, + "step": 39818 + }, + { + "epoch": 0.34564804124964194, + "grad_norm": 0.490234375, + "learning_rate": 0.0013548622435280695, + "loss": 0.0786, + "step": 39819 + }, + { + "epoch": 0.3456567217298461, + "grad_norm": 0.427734375, + "learning_rate": 0.0013548336992075802, + "loss": 0.0874, + "step": 39820 + }, + { + "epoch": 0.34566540221005027, + "grad_norm": 0.1376953125, + "learning_rate": 0.0013548051546084085, + "loss": 0.1113, + "step": 39821 + }, + { + "epoch": 0.3456740826902544, + "grad_norm": 0.2353515625, + "learning_rate": 0.0013547766097305855, + "loss": 0.0938, + "step": 39822 + }, + { + "epoch": 0.3456827631704586, + "grad_norm": 0.1533203125, + "learning_rate": 0.001354748064574143, + "loss": 0.1045, + "step": 39823 + }, + { + "epoch": 0.34569144365066273, + "grad_norm": 0.21875, + "learning_rate": 0.001354719519139112, + "loss": 0.1172, + "step": 39824 + }, + { + "epoch": 0.3457001241308669, + "grad_norm": 0.2392578125, + "learning_rate": 0.0013546909734255233, + "loss": 0.1094, + "step": 39825 + }, + { + "epoch": 0.34570880461107106, + "grad_norm": 0.169921875, + "learning_rate": 0.001354662427433408, + "loss": 0.082, + "step": 39826 + }, + { + "epoch": 0.34571748509127526, + "grad_norm": 1.0078125, + "learning_rate": 0.0013546338811627984, + "loss": 0.1406, + "step": 39827 + }, + { + "epoch": 0.3457261655714794, + "grad_norm": 0.23828125, + "learning_rate": 0.0013546053346137244, + "loss": 0.1377, 
+ "step": 39828 + }, + { + "epoch": 0.3457348460516836, + "grad_norm": 0.10302734375, + "learning_rate": 0.0013545767877862183, + "loss": 0.0859, + "step": 39829 + }, + { + "epoch": 0.3457435265318877, + "grad_norm": 0.115234375, + "learning_rate": 0.0013545482406803107, + "loss": 0.1084, + "step": 39830 + }, + { + "epoch": 0.3457522070120919, + "grad_norm": 0.140625, + "learning_rate": 0.0013545196932960326, + "loss": 0.0771, + "step": 39831 + }, + { + "epoch": 0.34576088749229605, + "grad_norm": 0.60546875, + "learning_rate": 0.0013544911456334161, + "loss": 0.127, + "step": 39832 + }, + { + "epoch": 0.34576956797250025, + "grad_norm": 0.609375, + "learning_rate": 0.0013544625976924917, + "loss": 0.085, + "step": 39833 + }, + { + "epoch": 0.3457782484527044, + "grad_norm": 0.263671875, + "learning_rate": 0.001354434049473291, + "loss": 0.0947, + "step": 39834 + }, + { + "epoch": 0.3457869289329086, + "grad_norm": 0.2080078125, + "learning_rate": 0.001354405500975845, + "loss": 0.1182, + "step": 39835 + }, + { + "epoch": 0.3457956094131127, + "grad_norm": 0.1904296875, + "learning_rate": 0.0013543769522001848, + "loss": 0.0942, + "step": 39836 + }, + { + "epoch": 0.3458042898933169, + "grad_norm": 0.265625, + "learning_rate": 0.0013543484031463418, + "loss": 0.1016, + "step": 39837 + }, + { + "epoch": 0.34581297037352104, + "grad_norm": 0.40625, + "learning_rate": 0.001354319853814347, + "loss": 0.1104, + "step": 39838 + }, + { + "epoch": 0.34582165085372524, + "grad_norm": 1.0234375, + "learning_rate": 0.001354291304204232, + "loss": 0.1279, + "step": 39839 + }, + { + "epoch": 0.3458303313339294, + "grad_norm": 0.4375, + "learning_rate": 0.0013542627543160282, + "loss": 0.1006, + "step": 39840 + }, + { + "epoch": 0.34583901181413357, + "grad_norm": 0.310546875, + "learning_rate": 0.0013542342041497663, + "loss": 0.0728, + "step": 39841 + }, + { + "epoch": 0.3458476922943377, + "grad_norm": 0.22265625, + "learning_rate": 0.0013542056537054774, + "loss": 0.1123, + "step": 39842 + }, + { + "epoch": 0.3458563727745419, + "grad_norm": 0.154296875, + "learning_rate": 0.0013541771029831934, + "loss": 0.0967, + "step": 39843 + }, + { + "epoch": 0.34586505325474604, + "grad_norm": 0.287109375, + "learning_rate": 0.001354148551982945, + "loss": 0.1309, + "step": 39844 + }, + { + "epoch": 0.34587373373495023, + "grad_norm": 0.1220703125, + "learning_rate": 0.0013541200007047634, + "loss": 0.0884, + "step": 39845 + }, + { + "epoch": 0.34588241421515437, + "grad_norm": 0.1875, + "learning_rate": 0.0013540914491486802, + "loss": 0.0825, + "step": 39846 + }, + { + "epoch": 0.34589109469535856, + "grad_norm": 0.232421875, + "learning_rate": 0.001354062897314726, + "loss": 0.1104, + "step": 39847 + }, + { + "epoch": 0.3458997751755627, + "grad_norm": 0.255859375, + "learning_rate": 0.0013540343452029327, + "loss": 0.124, + "step": 39848 + }, + { + "epoch": 0.3459084556557669, + "grad_norm": 0.2451171875, + "learning_rate": 0.0013540057928133314, + "loss": 0.082, + "step": 39849 + }, + { + "epoch": 0.345917136135971, + "grad_norm": 0.63671875, + "learning_rate": 0.0013539772401459532, + "loss": 0.085, + "step": 39850 + }, + { + "epoch": 0.3459258166161752, + "grad_norm": 0.267578125, + "learning_rate": 0.0013539486872008292, + "loss": 0.1133, + "step": 39851 + }, + { + "epoch": 0.34593449709637936, + "grad_norm": 0.6328125, + "learning_rate": 0.0013539201339779906, + "loss": 0.0708, + "step": 39852 + }, + { + "epoch": 0.34594317757658355, + "grad_norm": 0.333984375, + "learning_rate": 
0.001353891580477469, + "loss": 0.1089, + "step": 39853 + }, + { + "epoch": 0.3459518580567877, + "grad_norm": 0.921875, + "learning_rate": 0.0013538630266992952, + "loss": 0.1162, + "step": 39854 + }, + { + "epoch": 0.3459605385369919, + "grad_norm": 0.349609375, + "learning_rate": 0.0013538344726435009, + "loss": 0.0801, + "step": 39855 + }, + { + "epoch": 0.345969219017196, + "grad_norm": 0.66015625, + "learning_rate": 0.0013538059183101163, + "loss": 0.0918, + "step": 39856 + }, + { + "epoch": 0.3459778994974002, + "grad_norm": 0.91796875, + "learning_rate": 0.0013537773636991739, + "loss": 0.1133, + "step": 39857 + }, + { + "epoch": 0.34598657997760435, + "grad_norm": 0.1513671875, + "learning_rate": 0.0013537488088107047, + "loss": 0.0938, + "step": 39858 + }, + { + "epoch": 0.34599526045780854, + "grad_norm": 0.27734375, + "learning_rate": 0.001353720253644739, + "loss": 0.1089, + "step": 39859 + }, + { + "epoch": 0.3460039409380127, + "grad_norm": 0.265625, + "learning_rate": 0.001353691698201309, + "loss": 0.0801, + "step": 39860 + }, + { + "epoch": 0.34601262141821687, + "grad_norm": 0.1767578125, + "learning_rate": 0.0013536631424804454, + "loss": 0.083, + "step": 39861 + }, + { + "epoch": 0.346021301898421, + "grad_norm": 0.296875, + "learning_rate": 0.0013536345864821797, + "loss": 0.1729, + "step": 39862 + }, + { + "epoch": 0.3460299823786252, + "grad_norm": 0.111328125, + "learning_rate": 0.0013536060302065432, + "loss": 0.1279, + "step": 39863 + }, + { + "epoch": 0.34603866285882934, + "grad_norm": 0.2421875, + "learning_rate": 0.0013535774736535666, + "loss": 0.064, + "step": 39864 + }, + { + "epoch": 0.34604734333903353, + "grad_norm": 0.12451171875, + "learning_rate": 0.0013535489168232818, + "loss": 0.0825, + "step": 39865 + }, + { + "epoch": 0.34605602381923767, + "grad_norm": 0.34765625, + "learning_rate": 0.0013535203597157194, + "loss": 0.1016, + "step": 39866 + }, + { + "epoch": 0.34606470429944186, + "grad_norm": 0.271484375, + "learning_rate": 0.001353491802330911, + "loss": 0.0757, + "step": 39867 + }, + { + "epoch": 0.346073384779646, + "grad_norm": 0.57421875, + "learning_rate": 0.0013534632446688879, + "loss": 0.1094, + "step": 39868 + }, + { + "epoch": 0.3460820652598502, + "grad_norm": 0.28515625, + "learning_rate": 0.0013534346867296813, + "loss": 0.166, + "step": 39869 + }, + { + "epoch": 0.34609074574005433, + "grad_norm": 0.1650390625, + "learning_rate": 0.0013534061285133223, + "loss": 0.123, + "step": 39870 + }, + { + "epoch": 0.3460994262202585, + "grad_norm": 0.296875, + "learning_rate": 0.001353377570019842, + "loss": 0.1016, + "step": 39871 + }, + { + "epoch": 0.34610810670046266, + "grad_norm": 0.28125, + "learning_rate": 0.001353349011249272, + "loss": 0.1426, + "step": 39872 + }, + { + "epoch": 0.34611678718066685, + "grad_norm": 0.10400390625, + "learning_rate": 0.0013533204522016432, + "loss": 0.0771, + "step": 39873 + }, + { + "epoch": 0.346125467660871, + "grad_norm": 1.03125, + "learning_rate": 0.001353291892876987, + "loss": 0.5586, + "step": 39874 + }, + { + "epoch": 0.3461341481410752, + "grad_norm": 0.283203125, + "learning_rate": 0.0013532633332753344, + "loss": 0.0991, + "step": 39875 + }, + { + "epoch": 0.3461428286212793, + "grad_norm": 0.62109375, + "learning_rate": 0.0013532347733967167, + "loss": 0.1123, + "step": 39876 + }, + { + "epoch": 0.3461515091014835, + "grad_norm": 0.515625, + "learning_rate": 0.0013532062132411659, + "loss": 0.1377, + "step": 39877 + }, + { + "epoch": 0.34616018958168765, + "grad_norm": 0.16796875, + 
"learning_rate": 0.001353177652808712, + "loss": 0.0757, + "step": 39878 + }, + { + "epoch": 0.34616887006189184, + "grad_norm": 0.349609375, + "learning_rate": 0.0013531490920993872, + "loss": 0.103, + "step": 39879 + }, + { + "epoch": 0.346177550542096, + "grad_norm": 0.16796875, + "learning_rate": 0.0013531205311132222, + "loss": 0.1172, + "step": 39880 + }, + { + "epoch": 0.3461862310223002, + "grad_norm": 0.96484375, + "learning_rate": 0.0013530919698502483, + "loss": 0.0981, + "step": 39881 + }, + { + "epoch": 0.3461949115025043, + "grad_norm": 0.11572265625, + "learning_rate": 0.0013530634083104972, + "loss": 0.0991, + "step": 39882 + }, + { + "epoch": 0.3462035919827085, + "grad_norm": 0.2265625, + "learning_rate": 0.0013530348464939992, + "loss": 0.0957, + "step": 39883 + }, + { + "epoch": 0.34621227246291264, + "grad_norm": 0.1904296875, + "learning_rate": 0.0013530062844007867, + "loss": 0.0869, + "step": 39884 + }, + { + "epoch": 0.34622095294311683, + "grad_norm": 0.263671875, + "learning_rate": 0.0013529777220308897, + "loss": 0.0918, + "step": 39885 + }, + { + "epoch": 0.34622963342332097, + "grad_norm": 0.2578125, + "learning_rate": 0.0013529491593843404, + "loss": 0.1221, + "step": 39886 + }, + { + "epoch": 0.34623831390352516, + "grad_norm": 0.09033203125, + "learning_rate": 0.0013529205964611696, + "loss": 0.0884, + "step": 39887 + }, + { + "epoch": 0.3462469943837293, + "grad_norm": 0.35546875, + "learning_rate": 0.0013528920332614087, + "loss": 0.1992, + "step": 39888 + }, + { + "epoch": 0.3462556748639335, + "grad_norm": 0.57421875, + "learning_rate": 0.001352863469785089, + "loss": 0.0659, + "step": 39889 + }, + { + "epoch": 0.34626435534413763, + "grad_norm": 0.21875, + "learning_rate": 0.0013528349060322414, + "loss": 0.1162, + "step": 39890 + }, + { + "epoch": 0.3462730358243418, + "grad_norm": 0.248046875, + "learning_rate": 0.0013528063420028974, + "loss": 0.0962, + "step": 39891 + }, + { + "epoch": 0.34628171630454596, + "grad_norm": 0.302734375, + "learning_rate": 0.0013527777776970883, + "loss": 0.0864, + "step": 39892 + }, + { + "epoch": 0.34629039678475015, + "grad_norm": 0.1845703125, + "learning_rate": 0.001352749213114845, + "loss": 0.1133, + "step": 39893 + }, + { + "epoch": 0.3462990772649543, + "grad_norm": 0.361328125, + "learning_rate": 0.0013527206482561993, + "loss": 0.1211, + "step": 39894 + }, + { + "epoch": 0.3463077577451585, + "grad_norm": 0.298828125, + "learning_rate": 0.0013526920831211818, + "loss": 0.166, + "step": 39895 + }, + { + "epoch": 0.3463164382253626, + "grad_norm": 0.234375, + "learning_rate": 0.001352663517709824, + "loss": 0.1084, + "step": 39896 + }, + { + "epoch": 0.3463251187055668, + "grad_norm": 0.2109375, + "learning_rate": 0.0013526349520221572, + "loss": 0.1025, + "step": 39897 + }, + { + "epoch": 0.34633379918577095, + "grad_norm": 0.2197265625, + "learning_rate": 0.0013526063860582126, + "loss": 0.0962, + "step": 39898 + }, + { + "epoch": 0.34634247966597514, + "grad_norm": 0.4375, + "learning_rate": 0.0013525778198180218, + "loss": 0.1553, + "step": 39899 + }, + { + "epoch": 0.3463511601461793, + "grad_norm": 0.328125, + "learning_rate": 0.0013525492533016153, + "loss": 0.0894, + "step": 39900 + }, + { + "epoch": 0.3463598406263835, + "grad_norm": 0.1025390625, + "learning_rate": 0.0013525206865090248, + "loss": 0.0923, + "step": 39901 + }, + { + "epoch": 0.3463685211065876, + "grad_norm": 0.1494140625, + "learning_rate": 0.0013524921194402817, + "loss": 0.0776, + "step": 39902 + }, + { + "epoch": 
0.3463772015867918, + "grad_norm": 0.224609375, + "learning_rate": 0.001352463552095417, + "loss": 0.1094, + "step": 39903 + }, + { + "epoch": 0.34638588206699594, + "grad_norm": 0.23046875, + "learning_rate": 0.0013524349844744616, + "loss": 0.0898, + "step": 39904 + }, + { + "epoch": 0.34639456254720014, + "grad_norm": 0.458984375, + "learning_rate": 0.0013524064165774472, + "loss": 0.1855, + "step": 39905 + }, + { + "epoch": 0.3464032430274043, + "grad_norm": 0.37890625, + "learning_rate": 0.0013523778484044052, + "loss": 0.1099, + "step": 39906 + }, + { + "epoch": 0.34641192350760847, + "grad_norm": 0.142578125, + "learning_rate": 0.0013523492799553665, + "loss": 0.1055, + "step": 39907 + }, + { + "epoch": 0.3464206039878126, + "grad_norm": 0.4140625, + "learning_rate": 0.0013523207112303623, + "loss": 0.1104, + "step": 39908 + }, + { + "epoch": 0.3464292844680168, + "grad_norm": 0.173828125, + "learning_rate": 0.0013522921422294241, + "loss": 0.0801, + "step": 39909 + }, + { + "epoch": 0.34643796494822093, + "grad_norm": 0.1484375, + "learning_rate": 0.0013522635729525828, + "loss": 0.085, + "step": 39910 + }, + { + "epoch": 0.3464466454284251, + "grad_norm": 0.1796875, + "learning_rate": 0.00135223500339987, + "loss": 0.1118, + "step": 39911 + }, + { + "epoch": 0.34645532590862926, + "grad_norm": 0.60546875, + "learning_rate": 0.001352206433571317, + "loss": 0.1152, + "step": 39912 + }, + { + "epoch": 0.34646400638883346, + "grad_norm": 0.396484375, + "learning_rate": 0.0013521778634669544, + "loss": 0.1006, + "step": 39913 + }, + { + "epoch": 0.3464726868690376, + "grad_norm": 0.119140625, + "learning_rate": 0.001352149293086814, + "loss": 0.1011, + "step": 39914 + }, + { + "epoch": 0.3464813673492418, + "grad_norm": 0.7734375, + "learning_rate": 0.0013521207224309268, + "loss": 0.1377, + "step": 39915 + }, + { + "epoch": 0.3464900478294459, + "grad_norm": 0.369140625, + "learning_rate": 0.0013520921514993242, + "loss": 0.063, + "step": 39916 + }, + { + "epoch": 0.3464987283096501, + "grad_norm": 0.380859375, + "learning_rate": 0.0013520635802920376, + "loss": 0.0957, + "step": 39917 + }, + { + "epoch": 0.34650740878985425, + "grad_norm": 0.1181640625, + "learning_rate": 0.0013520350088090981, + "loss": 0.0723, + "step": 39918 + }, + { + "epoch": 0.34651608927005845, + "grad_norm": 0.099609375, + "learning_rate": 0.001352006437050537, + "loss": 0.126, + "step": 39919 + }, + { + "epoch": 0.3465247697502626, + "grad_norm": 0.380859375, + "learning_rate": 0.0013519778650163852, + "loss": 0.1289, + "step": 39920 + }, + { + "epoch": 0.3465334502304667, + "grad_norm": 0.421875, + "learning_rate": 0.0013519492927066743, + "loss": 0.0938, + "step": 39921 + }, + { + "epoch": 0.3465421307106709, + "grad_norm": 0.58984375, + "learning_rate": 0.0013519207201214355, + "loss": 0.0859, + "step": 39922 + }, + { + "epoch": 0.34655081119087505, + "grad_norm": 0.2490234375, + "learning_rate": 0.0013518921472606997, + "loss": 0.1006, + "step": 39923 + }, + { + "epoch": 0.34655949167107925, + "grad_norm": 0.61328125, + "learning_rate": 0.0013518635741244984, + "loss": 0.1289, + "step": 39924 + }, + { + "epoch": 0.3465681721512834, + "grad_norm": 0.15625, + "learning_rate": 0.0013518350007128632, + "loss": 0.0747, + "step": 39925 + }, + { + "epoch": 0.3465768526314876, + "grad_norm": 0.58203125, + "learning_rate": 0.0013518064270258247, + "loss": 0.0923, + "step": 39926 + }, + { + "epoch": 0.3465855331116917, + "grad_norm": 0.87890625, + "learning_rate": 0.0013517778530634148, + "loss": 0.1699, + "step": 
39927 + }, + { + "epoch": 0.3465942135918959, + "grad_norm": 0.431640625, + "learning_rate": 0.0013517492788256646, + "loss": 0.0928, + "step": 39928 + }, + { + "epoch": 0.34660289407210004, + "grad_norm": 0.1025390625, + "learning_rate": 0.0013517207043126048, + "loss": 0.0981, + "step": 39929 + }, + { + "epoch": 0.34661157455230424, + "grad_norm": 0.349609375, + "learning_rate": 0.001351692129524267, + "loss": 0.1455, + "step": 39930 + }, + { + "epoch": 0.3466202550325084, + "grad_norm": 0.2138671875, + "learning_rate": 0.0013516635544606826, + "loss": 0.0898, + "step": 39931 + }, + { + "epoch": 0.34662893551271257, + "grad_norm": 0.3046875, + "learning_rate": 0.0013516349791218827, + "loss": 0.0781, + "step": 39932 + }, + { + "epoch": 0.3466376159929167, + "grad_norm": 0.2177734375, + "learning_rate": 0.0013516064035078983, + "loss": 0.1133, + "step": 39933 + }, + { + "epoch": 0.3466462964731209, + "grad_norm": 0.26953125, + "learning_rate": 0.0013515778276187614, + "loss": 0.0977, + "step": 39934 + }, + { + "epoch": 0.34665497695332503, + "grad_norm": 0.228515625, + "learning_rate": 0.0013515492514545025, + "loss": 0.1138, + "step": 39935 + }, + { + "epoch": 0.3466636574335292, + "grad_norm": 0.265625, + "learning_rate": 0.0013515206750151529, + "loss": 0.1162, + "step": 39936 + }, + { + "epoch": 0.34667233791373336, + "grad_norm": 0.40625, + "learning_rate": 0.0013514920983007446, + "loss": 0.0967, + "step": 39937 + }, + { + "epoch": 0.34668101839393756, + "grad_norm": 0.302734375, + "learning_rate": 0.001351463521311308, + "loss": 0.0869, + "step": 39938 + }, + { + "epoch": 0.3466896988741417, + "grad_norm": 0.8125, + "learning_rate": 0.0013514349440468746, + "loss": 0.1147, + "step": 39939 + }, + { + "epoch": 0.3466983793543459, + "grad_norm": 1.1484375, + "learning_rate": 0.001351406366507476, + "loss": 0.1426, + "step": 39940 + }, + { + "epoch": 0.34670705983455, + "grad_norm": 0.35546875, + "learning_rate": 0.0013513777886931429, + "loss": 0.1064, + "step": 39941 + }, + { + "epoch": 0.3467157403147542, + "grad_norm": 0.384765625, + "learning_rate": 0.001351349210603907, + "loss": 0.1196, + "step": 39942 + }, + { + "epoch": 0.34672442079495835, + "grad_norm": 0.734375, + "learning_rate": 0.0013513206322397992, + "loss": 0.1289, + "step": 39943 + }, + { + "epoch": 0.34673310127516255, + "grad_norm": 0.373046875, + "learning_rate": 0.001351292053600851, + "loss": 0.1045, + "step": 39944 + }, + { + "epoch": 0.3467417817553667, + "grad_norm": 0.1728515625, + "learning_rate": 0.0013512634746870938, + "loss": 0.083, + "step": 39945 + }, + { + "epoch": 0.3467504622355709, + "grad_norm": 0.1015625, + "learning_rate": 0.0013512348954985585, + "loss": 0.0928, + "step": 39946 + }, + { + "epoch": 0.346759142715775, + "grad_norm": 0.119140625, + "learning_rate": 0.0013512063160352763, + "loss": 0.0986, + "step": 39947 + }, + { + "epoch": 0.3467678231959792, + "grad_norm": 0.490234375, + "learning_rate": 0.001351177736297279, + "loss": 0.1074, + "step": 39948 + }, + { + "epoch": 0.34677650367618335, + "grad_norm": 0.30078125, + "learning_rate": 0.0013511491562845973, + "loss": 0.1133, + "step": 39949 + }, + { + "epoch": 0.34678518415638754, + "grad_norm": 0.1953125, + "learning_rate": 0.0013511205759972626, + "loss": 0.1025, + "step": 39950 + }, + { + "epoch": 0.3467938646365917, + "grad_norm": 0.1357421875, + "learning_rate": 0.0013510919954353065, + "loss": 0.0913, + "step": 39951 + }, + { + "epoch": 0.34680254511679587, + "grad_norm": 0.2060546875, + "learning_rate": 0.0013510634145987596, + 
"loss": 0.1001, + "step": 39952 + }, + { + "epoch": 0.346811225597, + "grad_norm": 0.2099609375, + "learning_rate": 0.0013510348334876536, + "loss": 0.085, + "step": 39953 + }, + { + "epoch": 0.3468199060772042, + "grad_norm": 0.412109375, + "learning_rate": 0.00135100625210202, + "loss": 0.1113, + "step": 39954 + }, + { + "epoch": 0.34682858655740834, + "grad_norm": 0.1337890625, + "learning_rate": 0.0013509776704418892, + "loss": 0.0879, + "step": 39955 + }, + { + "epoch": 0.34683726703761253, + "grad_norm": 0.091796875, + "learning_rate": 0.0013509490885072932, + "loss": 0.1182, + "step": 39956 + }, + { + "epoch": 0.34684594751781667, + "grad_norm": 0.306640625, + "learning_rate": 0.0013509205062982633, + "loss": 0.0889, + "step": 39957 + }, + { + "epoch": 0.34685462799802086, + "grad_norm": 0.392578125, + "learning_rate": 0.0013508919238148305, + "loss": 0.0864, + "step": 39958 + }, + { + "epoch": 0.346863308478225, + "grad_norm": 0.9921875, + "learning_rate": 0.0013508633410570258, + "loss": 0.1079, + "step": 39959 + }, + { + "epoch": 0.3468719889584292, + "grad_norm": 0.1337890625, + "learning_rate": 0.0013508347580248807, + "loss": 0.082, + "step": 39960 + }, + { + "epoch": 0.3468806694386333, + "grad_norm": 1.3359375, + "learning_rate": 0.0013508061747184268, + "loss": 0.1084, + "step": 39961 + }, + { + "epoch": 0.3468893499188375, + "grad_norm": 0.35546875, + "learning_rate": 0.0013507775911376946, + "loss": 0.1182, + "step": 39962 + }, + { + "epoch": 0.34689803039904166, + "grad_norm": 0.09619140625, + "learning_rate": 0.001350749007282716, + "loss": 0.1079, + "step": 39963 + }, + { + "epoch": 0.34690671087924585, + "grad_norm": 0.265625, + "learning_rate": 0.0013507204231535221, + "loss": 0.105, + "step": 39964 + }, + { + "epoch": 0.34691539135945, + "grad_norm": 0.14453125, + "learning_rate": 0.0013506918387501442, + "loss": 0.1045, + "step": 39965 + }, + { + "epoch": 0.3469240718396542, + "grad_norm": 0.58984375, + "learning_rate": 0.0013506632540726132, + "loss": 0.1113, + "step": 39966 + }, + { + "epoch": 0.3469327523198583, + "grad_norm": 0.265625, + "learning_rate": 0.001350634669120961, + "loss": 0.0718, + "step": 39967 + }, + { + "epoch": 0.3469414328000625, + "grad_norm": 0.53515625, + "learning_rate": 0.0013506060838952183, + "loss": 0.1592, + "step": 39968 + }, + { + "epoch": 0.34695011328026665, + "grad_norm": 0.51171875, + "learning_rate": 0.0013505774983954165, + "loss": 0.0918, + "step": 39969 + }, + { + "epoch": 0.34695879376047084, + "grad_norm": 0.4765625, + "learning_rate": 0.001350548912621587, + "loss": 0.1143, + "step": 39970 + }, + { + "epoch": 0.346967474240675, + "grad_norm": 0.07421875, + "learning_rate": 0.0013505203265737609, + "loss": 0.0869, + "step": 39971 + }, + { + "epoch": 0.34697615472087917, + "grad_norm": 0.212890625, + "learning_rate": 0.0013504917402519694, + "loss": 0.084, + "step": 39972 + }, + { + "epoch": 0.3469848352010833, + "grad_norm": 0.09716796875, + "learning_rate": 0.0013504631536562439, + "loss": 0.0864, + "step": 39973 + }, + { + "epoch": 0.3469935156812875, + "grad_norm": 0.95703125, + "learning_rate": 0.001350434566786616, + "loss": 0.1006, + "step": 39974 + }, + { + "epoch": 0.34700219616149164, + "grad_norm": 0.58203125, + "learning_rate": 0.0013504059796431165, + "loss": 0.1006, + "step": 39975 + }, + { + "epoch": 0.34701087664169583, + "grad_norm": 0.216796875, + "learning_rate": 0.0013503773922257766, + "loss": 0.1543, + "step": 39976 + }, + { + "epoch": 0.34701955712189997, + "grad_norm": 0.43359375, + "learning_rate": 
0.0013503488045346279, + "loss": 0.1895, + "step": 39977 + }, + { + "epoch": 0.34702823760210416, + "grad_norm": 0.203125, + "learning_rate": 0.0013503202165697015, + "loss": 0.0615, + "step": 39978 + }, + { + "epoch": 0.3470369180823083, + "grad_norm": 0.09619140625, + "learning_rate": 0.0013502916283310287, + "loss": 0.0977, + "step": 39979 + }, + { + "epoch": 0.3470455985625125, + "grad_norm": 0.55078125, + "learning_rate": 0.0013502630398186406, + "loss": 0.1768, + "step": 39980 + }, + { + "epoch": 0.34705427904271663, + "grad_norm": 0.107421875, + "learning_rate": 0.0013502344510325686, + "loss": 0.1006, + "step": 39981 + }, + { + "epoch": 0.3470629595229208, + "grad_norm": 0.37890625, + "learning_rate": 0.001350205861972844, + "loss": 0.1079, + "step": 39982 + }, + { + "epoch": 0.34707164000312496, + "grad_norm": 0.1328125, + "learning_rate": 0.0013501772726394981, + "loss": 0.1162, + "step": 39983 + }, + { + "epoch": 0.34708032048332915, + "grad_norm": 0.189453125, + "learning_rate": 0.001350148683032562, + "loss": 0.0645, + "step": 39984 + }, + { + "epoch": 0.3470890009635333, + "grad_norm": 0.212890625, + "learning_rate": 0.0013501200931520672, + "loss": 0.1377, + "step": 39985 + }, + { + "epoch": 0.3470976814437375, + "grad_norm": 0.4375, + "learning_rate": 0.0013500915029980446, + "loss": 0.0967, + "step": 39986 + }, + { + "epoch": 0.3471063619239416, + "grad_norm": 0.640625, + "learning_rate": 0.0013500629125705258, + "loss": 0.1016, + "step": 39987 + }, + { + "epoch": 0.3471150424041458, + "grad_norm": 0.5625, + "learning_rate": 0.001350034321869542, + "loss": 0.1143, + "step": 39988 + }, + { + "epoch": 0.34712372288434995, + "grad_norm": 0.0859375, + "learning_rate": 0.0013500057308951248, + "loss": 0.0608, + "step": 39989 + }, + { + "epoch": 0.34713240336455414, + "grad_norm": 0.2119140625, + "learning_rate": 0.0013499771396473044, + "loss": 0.0889, + "step": 39990 + }, + { + "epoch": 0.3471410838447583, + "grad_norm": 0.10986328125, + "learning_rate": 0.0013499485481261134, + "loss": 0.1069, + "step": 39991 + }, + { + "epoch": 0.3471497643249625, + "grad_norm": 0.6796875, + "learning_rate": 0.0013499199563315818, + "loss": 0.0938, + "step": 39992 + }, + { + "epoch": 0.3471584448051666, + "grad_norm": 0.314453125, + "learning_rate": 0.0013498913642637418, + "loss": 0.126, + "step": 39993 + }, + { + "epoch": 0.3471671252853708, + "grad_norm": 0.408203125, + "learning_rate": 0.0013498627719226244, + "loss": 0.1016, + "step": 39994 + }, + { + "epoch": 0.34717580576557494, + "grad_norm": 0.08544921875, + "learning_rate": 0.0013498341793082608, + "loss": 0.1206, + "step": 39995 + }, + { + "epoch": 0.34718448624577913, + "grad_norm": 0.48828125, + "learning_rate": 0.001349805586420682, + "loss": 0.1064, + "step": 39996 + }, + { + "epoch": 0.34719316672598327, + "grad_norm": 0.107421875, + "learning_rate": 0.00134977699325992, + "loss": 0.0762, + "step": 39997 + }, + { + "epoch": 0.34720184720618746, + "grad_norm": 0.29296875, + "learning_rate": 0.0013497483998260054, + "loss": 0.0957, + "step": 39998 + }, + { + "epoch": 0.3472105276863916, + "grad_norm": 0.27734375, + "learning_rate": 0.0013497198061189696, + "loss": 0.1299, + "step": 39999 + }, + { + "epoch": 0.3472192081665958, + "grad_norm": 0.1494140625, + "learning_rate": 0.0013496912121388446, + "loss": 0.0752, + "step": 40000 + }, + { + "epoch": 0.34722788864679993, + "grad_norm": 0.234375, + "learning_rate": 0.0013496626178856601, + "loss": 0.1001, + "step": 40001 + }, + { + "epoch": 0.3472365691270041, + "grad_norm": 
0.50390625, + "learning_rate": 0.0013496340233594489, + "loss": 0.1064, + "step": 40002 + }, + { + "epoch": 0.34724524960720826, + "grad_norm": 0.44140625, + "learning_rate": 0.0013496054285602418, + "loss": 0.2148, + "step": 40003 + }, + { + "epoch": 0.34725393008741245, + "grad_norm": 0.201171875, + "learning_rate": 0.0013495768334880693, + "loss": 0.0737, + "step": 40004 + }, + { + "epoch": 0.3472626105676166, + "grad_norm": 0.1943359375, + "learning_rate": 0.001349548238142964, + "loss": 0.0986, + "step": 40005 + }, + { + "epoch": 0.3472712910478208, + "grad_norm": 0.32421875, + "learning_rate": 0.0013495196425249564, + "loss": 0.0801, + "step": 40006 + }, + { + "epoch": 0.3472799715280249, + "grad_norm": 0.5859375, + "learning_rate": 0.0013494910466340774, + "loss": 0.0923, + "step": 40007 + }, + { + "epoch": 0.3472886520082291, + "grad_norm": 0.1689453125, + "learning_rate": 0.001349462450470359, + "loss": 0.084, + "step": 40008 + }, + { + "epoch": 0.34729733248843325, + "grad_norm": 0.12451171875, + "learning_rate": 0.0013494338540338323, + "loss": 0.1123, + "step": 40009 + }, + { + "epoch": 0.34730601296863745, + "grad_norm": 0.14453125, + "learning_rate": 0.0013494052573245284, + "loss": 0.1318, + "step": 40010 + }, + { + "epoch": 0.3473146934488416, + "grad_norm": 0.142578125, + "learning_rate": 0.0013493766603424786, + "loss": 0.1157, + "step": 40011 + }, + { + "epoch": 0.3473233739290458, + "grad_norm": 0.2138671875, + "learning_rate": 0.0013493480630877143, + "loss": 0.0908, + "step": 40012 + }, + { + "epoch": 0.3473320544092499, + "grad_norm": 0.341796875, + "learning_rate": 0.0013493194655602669, + "loss": 0.0825, + "step": 40013 + }, + { + "epoch": 0.3473407348894541, + "grad_norm": 0.232421875, + "learning_rate": 0.0013492908677601675, + "loss": 0.0947, + "step": 40014 + }, + { + "epoch": 0.34734941536965824, + "grad_norm": 0.6328125, + "learning_rate": 0.001349262269687447, + "loss": 0.0947, + "step": 40015 + }, + { + "epoch": 0.34735809584986244, + "grad_norm": 0.451171875, + "learning_rate": 0.001349233671342137, + "loss": 0.0879, + "step": 40016 + }, + { + "epoch": 0.3473667763300666, + "grad_norm": 0.419921875, + "learning_rate": 0.0013492050727242691, + "loss": 0.1143, + "step": 40017 + }, + { + "epoch": 0.34737545681027077, + "grad_norm": 0.146484375, + "learning_rate": 0.0013491764738338744, + "loss": 0.1196, + "step": 40018 + }, + { + "epoch": 0.3473841372904749, + "grad_norm": 0.376953125, + "learning_rate": 0.0013491478746709838, + "loss": 0.104, + "step": 40019 + }, + { + "epoch": 0.3473928177706791, + "grad_norm": 0.83203125, + "learning_rate": 0.0013491192752356287, + "loss": 0.1387, + "step": 40020 + }, + { + "epoch": 0.34740149825088323, + "grad_norm": 0.1552734375, + "learning_rate": 0.0013490906755278405, + "loss": 0.1221, + "step": 40021 + }, + { + "epoch": 0.3474101787310874, + "grad_norm": 0.19921875, + "learning_rate": 0.0013490620755476509, + "loss": 0.1016, + "step": 40022 + }, + { + "epoch": 0.34741885921129156, + "grad_norm": 0.2216796875, + "learning_rate": 0.0013490334752950903, + "loss": 0.0879, + "step": 40023 + }, + { + "epoch": 0.34742753969149576, + "grad_norm": 0.2255859375, + "learning_rate": 0.0013490048747701905, + "loss": 0.0913, + "step": 40024 + }, + { + "epoch": 0.3474362201716999, + "grad_norm": 0.1689453125, + "learning_rate": 0.001348976273972983, + "loss": 0.1133, + "step": 40025 + }, + { + "epoch": 0.3474449006519041, + "grad_norm": 0.3125, + "learning_rate": 0.0013489476729034986, + "loss": 0.0688, + "step": 40026 + }, + { + 
"epoch": 0.3474535811321082, + "grad_norm": 0.11962890625, + "learning_rate": 0.001348919071561769, + "loss": 0.1016, + "step": 40027 + }, + { + "epoch": 0.3474622616123124, + "grad_norm": 0.0966796875, + "learning_rate": 0.0013488904699478247, + "loss": 0.1177, + "step": 40028 + }, + { + "epoch": 0.34747094209251655, + "grad_norm": 0.083984375, + "learning_rate": 0.001348861868061698, + "loss": 0.083, + "step": 40029 + }, + { + "epoch": 0.34747962257272075, + "grad_norm": 0.16015625, + "learning_rate": 0.0013488332659034193, + "loss": 0.1094, + "step": 40030 + }, + { + "epoch": 0.3474883030529249, + "grad_norm": 0.1474609375, + "learning_rate": 0.0013488046634730208, + "loss": 0.0967, + "step": 40031 + }, + { + "epoch": 0.3474969835331291, + "grad_norm": 0.1640625, + "learning_rate": 0.0013487760607705328, + "loss": 0.1128, + "step": 40032 + }, + { + "epoch": 0.3475056640133332, + "grad_norm": 0.0712890625, + "learning_rate": 0.0013487474577959875, + "loss": 0.0938, + "step": 40033 + }, + { + "epoch": 0.3475143444935374, + "grad_norm": 0.1806640625, + "learning_rate": 0.0013487188545494152, + "loss": 0.1006, + "step": 40034 + }, + { + "epoch": 0.34752302497374155, + "grad_norm": 0.3125, + "learning_rate": 0.001348690251030848, + "loss": 0.0645, + "step": 40035 + }, + { + "epoch": 0.34753170545394574, + "grad_norm": 0.10791015625, + "learning_rate": 0.0013486616472403168, + "loss": 0.1152, + "step": 40036 + }, + { + "epoch": 0.3475403859341499, + "grad_norm": 0.12109375, + "learning_rate": 0.0013486330431778531, + "loss": 0.0918, + "step": 40037 + }, + { + "epoch": 0.34754906641435407, + "grad_norm": 0.5234375, + "learning_rate": 0.001348604438843488, + "loss": 0.0854, + "step": 40038 + }, + { + "epoch": 0.3475577468945582, + "grad_norm": 0.126953125, + "learning_rate": 0.0013485758342372527, + "loss": 0.0713, + "step": 40039 + }, + { + "epoch": 0.3475664273747624, + "grad_norm": 0.5703125, + "learning_rate": 0.0013485472293591784, + "loss": 0.1328, + "step": 40040 + }, + { + "epoch": 0.34757510785496654, + "grad_norm": 0.318359375, + "learning_rate": 0.0013485186242092968, + "loss": 0.0898, + "step": 40041 + }, + { + "epoch": 0.34758378833517073, + "grad_norm": 0.21875, + "learning_rate": 0.0013484900187876391, + "loss": 0.0781, + "step": 40042 + }, + { + "epoch": 0.34759246881537487, + "grad_norm": 0.390625, + "learning_rate": 0.0013484614130942364, + "loss": 0.1143, + "step": 40043 + }, + { + "epoch": 0.347601149295579, + "grad_norm": 0.275390625, + "learning_rate": 0.0013484328071291202, + "loss": 0.0815, + "step": 40044 + }, + { + "epoch": 0.3476098297757832, + "grad_norm": 0.83203125, + "learning_rate": 0.001348404200892321, + "loss": 0.1055, + "step": 40045 + }, + { + "epoch": 0.34761851025598733, + "grad_norm": 0.0986328125, + "learning_rate": 0.0013483755943838713, + "loss": 0.0938, + "step": 40046 + }, + { + "epoch": 0.3476271907361915, + "grad_norm": 0.22265625, + "learning_rate": 0.0013483469876038015, + "loss": 0.0859, + "step": 40047 + }, + { + "epoch": 0.34763587121639566, + "grad_norm": 0.2490234375, + "learning_rate": 0.0013483183805521432, + "loss": 0.1309, + "step": 40048 + }, + { + "epoch": 0.34764455169659986, + "grad_norm": 0.494140625, + "learning_rate": 0.0013482897732289276, + "loss": 0.1196, + "step": 40049 + }, + { + "epoch": 0.347653232176804, + "grad_norm": 0.1708984375, + "learning_rate": 0.0013482611656341864, + "loss": 0.085, + "step": 40050 + }, + { + "epoch": 0.3476619126570082, + "grad_norm": 0.185546875, + "learning_rate": 0.0013482325577679499, + 
"loss": 0.0781, + "step": 40051 + }, + { + "epoch": 0.3476705931372123, + "grad_norm": 0.1826171875, + "learning_rate": 0.0013482039496302504, + "loss": 0.0757, + "step": 40052 + }, + { + "epoch": 0.3476792736174165, + "grad_norm": 0.375, + "learning_rate": 0.001348175341221119, + "loss": 0.1348, + "step": 40053 + }, + { + "epoch": 0.34768795409762066, + "grad_norm": 0.259765625, + "learning_rate": 0.0013481467325405863, + "loss": 0.1084, + "step": 40054 + }, + { + "epoch": 0.34769663457782485, + "grad_norm": 0.375, + "learning_rate": 0.0013481181235886844, + "loss": 0.126, + "step": 40055 + }, + { + "epoch": 0.347705315058029, + "grad_norm": 0.138671875, + "learning_rate": 0.0013480895143654441, + "loss": 0.1846, + "step": 40056 + }, + { + "epoch": 0.3477139955382332, + "grad_norm": 0.076171875, + "learning_rate": 0.0013480609048708966, + "loss": 0.0996, + "step": 40057 + }, + { + "epoch": 0.3477226760184373, + "grad_norm": 0.1708984375, + "learning_rate": 0.001348032295105074, + "loss": 0.1025, + "step": 40058 + }, + { + "epoch": 0.3477313564986415, + "grad_norm": 0.2138671875, + "learning_rate": 0.0013480036850680066, + "loss": 0.1426, + "step": 40059 + }, + { + "epoch": 0.34774003697884565, + "grad_norm": 0.546875, + "learning_rate": 0.001347975074759726, + "loss": 0.0879, + "step": 40060 + }, + { + "epoch": 0.34774871745904984, + "grad_norm": 0.09130859375, + "learning_rate": 0.001347946464180264, + "loss": 0.1016, + "step": 40061 + }, + { + "epoch": 0.347757397939254, + "grad_norm": 0.134765625, + "learning_rate": 0.001347917853329651, + "loss": 0.0908, + "step": 40062 + }, + { + "epoch": 0.34776607841945817, + "grad_norm": 1.28125, + "learning_rate": 0.0013478892422079192, + "loss": 0.1309, + "step": 40063 + }, + { + "epoch": 0.3477747588996623, + "grad_norm": 0.330078125, + "learning_rate": 0.0013478606308150991, + "loss": 0.0898, + "step": 40064 + }, + { + "epoch": 0.3477834393798665, + "grad_norm": 0.271484375, + "learning_rate": 0.0013478320191512225, + "loss": 0.1113, + "step": 40065 + }, + { + "epoch": 0.34779211986007064, + "grad_norm": 0.1904296875, + "learning_rate": 0.0013478034072163206, + "loss": 0.0977, + "step": 40066 + }, + { + "epoch": 0.34780080034027483, + "grad_norm": 0.462890625, + "learning_rate": 0.0013477747950104244, + "loss": 0.0928, + "step": 40067 + }, + { + "epoch": 0.34780948082047897, + "grad_norm": 0.33984375, + "learning_rate": 0.0013477461825335654, + "loss": 0.0869, + "step": 40068 + }, + { + "epoch": 0.34781816130068316, + "grad_norm": 0.318359375, + "learning_rate": 0.001347717569785775, + "loss": 0.1104, + "step": 40069 + }, + { + "epoch": 0.3478268417808873, + "grad_norm": 0.380859375, + "learning_rate": 0.0013476889567670845, + "loss": 0.166, + "step": 40070 + }, + { + "epoch": 0.3478355222610915, + "grad_norm": 0.12890625, + "learning_rate": 0.001347660343477525, + "loss": 0.0972, + "step": 40071 + }, + { + "epoch": 0.3478442027412956, + "grad_norm": 0.67578125, + "learning_rate": 0.001347631729917128, + "loss": 0.1572, + "step": 40072 + }, + { + "epoch": 0.3478528832214998, + "grad_norm": 0.462890625, + "learning_rate": 0.0013476031160859245, + "loss": 0.0605, + "step": 40073 + }, + { + "epoch": 0.34786156370170396, + "grad_norm": 0.4375, + "learning_rate": 0.001347574501983946, + "loss": 0.123, + "step": 40074 + }, + { + "epoch": 0.34787024418190815, + "grad_norm": 0.34375, + "learning_rate": 0.0013475458876112236, + "loss": 0.0938, + "step": 40075 + }, + { + "epoch": 0.3478789246621123, + "grad_norm": 0.1328125, + "learning_rate": 
0.001347517272967789, + "loss": 0.084, + "step": 40076 + }, + { + "epoch": 0.3478876051423165, + "grad_norm": 0.09033203125, + "learning_rate": 0.001347488658053673, + "loss": 0.0869, + "step": 40077 + }, + { + "epoch": 0.3478962856225206, + "grad_norm": 0.703125, + "learning_rate": 0.001347460042868907, + "loss": 0.1641, + "step": 40078 + }, + { + "epoch": 0.3479049661027248, + "grad_norm": 0.099609375, + "learning_rate": 0.0013474314274135228, + "loss": 0.0688, + "step": 40079 + }, + { + "epoch": 0.34791364658292895, + "grad_norm": 0.1494140625, + "learning_rate": 0.001347402811687551, + "loss": 0.1221, + "step": 40080 + }, + { + "epoch": 0.34792232706313314, + "grad_norm": 0.10986328125, + "learning_rate": 0.0013473741956910232, + "loss": 0.1074, + "step": 40081 + }, + { + "epoch": 0.3479310075433373, + "grad_norm": 0.26171875, + "learning_rate": 0.001347345579423971, + "loss": 0.1582, + "step": 40082 + }, + { + "epoch": 0.34793968802354147, + "grad_norm": 0.173828125, + "learning_rate": 0.0013473169628864252, + "loss": 0.123, + "step": 40083 + }, + { + "epoch": 0.3479483685037456, + "grad_norm": 0.6796875, + "learning_rate": 0.0013472883460784177, + "loss": 0.1289, + "step": 40084 + }, + { + "epoch": 0.3479570489839498, + "grad_norm": 0.212890625, + "learning_rate": 0.0013472597289999788, + "loss": 0.0928, + "step": 40085 + }, + { + "epoch": 0.34796572946415394, + "grad_norm": 0.365234375, + "learning_rate": 0.0013472311116511403, + "loss": 0.0815, + "step": 40086 + }, + { + "epoch": 0.34797440994435813, + "grad_norm": 0.1298828125, + "learning_rate": 0.0013472024940319339, + "loss": 0.0889, + "step": 40087 + }, + { + "epoch": 0.34798309042456227, + "grad_norm": 0.392578125, + "learning_rate": 0.0013471738761423905, + "loss": 0.0938, + "step": 40088 + }, + { + "epoch": 0.34799177090476646, + "grad_norm": 0.11083984375, + "learning_rate": 0.001347145257982542, + "loss": 0.0972, + "step": 40089 + }, + { + "epoch": 0.3480004513849706, + "grad_norm": 0.314453125, + "learning_rate": 0.0013471166395524182, + "loss": 0.0928, + "step": 40090 + }, + { + "epoch": 0.3480091318651748, + "grad_norm": 0.146484375, + "learning_rate": 0.0013470880208520518, + "loss": 0.1797, + "step": 40091 + }, + { + "epoch": 0.34801781234537893, + "grad_norm": 0.1396484375, + "learning_rate": 0.0013470594018814737, + "loss": 0.1182, + "step": 40092 + }, + { + "epoch": 0.3480264928255831, + "grad_norm": 0.89453125, + "learning_rate": 0.0013470307826407152, + "loss": 0.0874, + "step": 40093 + }, + { + "epoch": 0.34803517330578726, + "grad_norm": 0.208984375, + "learning_rate": 0.0013470021631298071, + "loss": 0.1406, + "step": 40094 + }, + { + "epoch": 0.34804385378599145, + "grad_norm": 0.1796875, + "learning_rate": 0.0013469735433487818, + "loss": 0.124, + "step": 40095 + }, + { + "epoch": 0.3480525342661956, + "grad_norm": 0.3515625, + "learning_rate": 0.0013469449232976695, + "loss": 0.0908, + "step": 40096 + }, + { + "epoch": 0.3480612147463998, + "grad_norm": 0.30859375, + "learning_rate": 0.0013469163029765018, + "loss": 0.0986, + "step": 40097 + }, + { + "epoch": 0.3480698952266039, + "grad_norm": 0.30078125, + "learning_rate": 0.0013468876823853108, + "loss": 0.1182, + "step": 40098 + }, + { + "epoch": 0.3480785757068081, + "grad_norm": 0.80859375, + "learning_rate": 0.0013468590615241265, + "loss": 0.1177, + "step": 40099 + }, + { + "epoch": 0.34808725618701225, + "grad_norm": 0.6328125, + "learning_rate": 0.0013468304403929812, + "loss": 0.1875, + "step": 40100 + }, + { + "epoch": 0.34809593666721644, + 
"grad_norm": 0.404296875, + "learning_rate": 0.0013468018189919057, + "loss": 0.1133, + "step": 40101 + }, + { + "epoch": 0.3481046171474206, + "grad_norm": 0.361328125, + "learning_rate": 0.0013467731973209316, + "loss": 0.0972, + "step": 40102 + }, + { + "epoch": 0.3481132976276248, + "grad_norm": 0.103515625, + "learning_rate": 0.0013467445753800897, + "loss": 0.1523, + "step": 40103 + }, + { + "epoch": 0.3481219781078289, + "grad_norm": 0.51953125, + "learning_rate": 0.001346715953169412, + "loss": 0.1123, + "step": 40104 + }, + { + "epoch": 0.3481306585880331, + "grad_norm": 0.482421875, + "learning_rate": 0.0013466873306889294, + "loss": 0.0947, + "step": 40105 + }, + { + "epoch": 0.34813933906823724, + "grad_norm": 0.376953125, + "learning_rate": 0.0013466587079386729, + "loss": 0.0991, + "step": 40106 + }, + { + "epoch": 0.34814801954844143, + "grad_norm": 0.4375, + "learning_rate": 0.0013466300849186744, + "loss": 0.0986, + "step": 40107 + }, + { + "epoch": 0.34815670002864557, + "grad_norm": 0.494140625, + "learning_rate": 0.0013466014616289649, + "loss": 0.0752, + "step": 40108 + }, + { + "epoch": 0.34816538050884976, + "grad_norm": 0.41796875, + "learning_rate": 0.0013465728380695756, + "loss": 0.0942, + "step": 40109 + }, + { + "epoch": 0.3481740609890539, + "grad_norm": 0.1611328125, + "learning_rate": 0.0013465442142405382, + "loss": 0.1025, + "step": 40110 + }, + { + "epoch": 0.3481827414692581, + "grad_norm": 0.37890625, + "learning_rate": 0.0013465155901418837, + "loss": 0.0894, + "step": 40111 + }, + { + "epoch": 0.34819142194946223, + "grad_norm": 0.11328125, + "learning_rate": 0.0013464869657736434, + "loss": 0.1318, + "step": 40112 + }, + { + "epoch": 0.3482001024296664, + "grad_norm": 0.58203125, + "learning_rate": 0.0013464583411358485, + "loss": 0.1641, + "step": 40113 + }, + { + "epoch": 0.34820878290987056, + "grad_norm": 0.341796875, + "learning_rate": 0.001346429716228531, + "loss": 0.0947, + "step": 40114 + }, + { + "epoch": 0.34821746339007476, + "grad_norm": 0.18359375, + "learning_rate": 0.001346401091051721, + "loss": 0.085, + "step": 40115 + }, + { + "epoch": 0.3482261438702789, + "grad_norm": 0.328125, + "learning_rate": 0.001346372465605451, + "loss": 0.0913, + "step": 40116 + }, + { + "epoch": 0.3482348243504831, + "grad_norm": 0.1630859375, + "learning_rate": 0.0013463438398897513, + "loss": 0.1123, + "step": 40117 + }, + { + "epoch": 0.3482435048306872, + "grad_norm": 1.203125, + "learning_rate": 0.001346315213904654, + "loss": 0.3711, + "step": 40118 + }, + { + "epoch": 0.3482521853108914, + "grad_norm": 0.10107421875, + "learning_rate": 0.0013462865876501902, + "loss": 0.104, + "step": 40119 + }, + { + "epoch": 0.34826086579109555, + "grad_norm": 0.609375, + "learning_rate": 0.0013462579611263907, + "loss": 0.0977, + "step": 40120 + }, + { + "epoch": 0.34826954627129975, + "grad_norm": 0.11376953125, + "learning_rate": 0.0013462293343332874, + "loss": 0.0723, + "step": 40121 + }, + { + "epoch": 0.3482782267515039, + "grad_norm": 0.546875, + "learning_rate": 0.0013462007072709114, + "loss": 0.0811, + "step": 40122 + }, + { + "epoch": 0.3482869072317081, + "grad_norm": 0.205078125, + "learning_rate": 0.001346172079939294, + "loss": 0.083, + "step": 40123 + }, + { + "epoch": 0.3482955877119122, + "grad_norm": 0.142578125, + "learning_rate": 0.0013461434523384665, + "loss": 0.0986, + "step": 40124 + }, + { + "epoch": 0.3483042681921164, + "grad_norm": 0.3984375, + "learning_rate": 0.0013461148244684603, + "loss": 0.1113, + "step": 40125 + }, + { + 
"epoch": 0.34831294867232054, + "grad_norm": 0.255859375, + "learning_rate": 0.0013460861963293064, + "loss": 0.1211, + "step": 40126 + }, + { + "epoch": 0.34832162915252474, + "grad_norm": 0.515625, + "learning_rate": 0.0013460575679210365, + "loss": 0.1484, + "step": 40127 + }, + { + "epoch": 0.3483303096327289, + "grad_norm": 0.400390625, + "learning_rate": 0.0013460289392436817, + "loss": 0.1104, + "step": 40128 + }, + { + "epoch": 0.34833899011293307, + "grad_norm": 0.396484375, + "learning_rate": 0.001346000310297273, + "loss": 0.1084, + "step": 40129 + }, + { + "epoch": 0.3483476705931372, + "grad_norm": 0.1337890625, + "learning_rate": 0.0013459716810818427, + "loss": 0.1719, + "step": 40130 + }, + { + "epoch": 0.3483563510733414, + "grad_norm": 0.2421875, + "learning_rate": 0.0013459430515974213, + "loss": 0.0635, + "step": 40131 + }, + { + "epoch": 0.34836503155354553, + "grad_norm": 0.0712890625, + "learning_rate": 0.0013459144218440403, + "loss": 0.0732, + "step": 40132 + }, + { + "epoch": 0.3483737120337497, + "grad_norm": 0.1259765625, + "learning_rate": 0.0013458857918217306, + "loss": 0.0913, + "step": 40133 + }, + { + "epoch": 0.34838239251395386, + "grad_norm": 0.09521484375, + "learning_rate": 0.0013458571615305242, + "loss": 0.0645, + "step": 40134 + }, + { + "epoch": 0.34839107299415806, + "grad_norm": 0.11328125, + "learning_rate": 0.001345828530970452, + "loss": 0.1079, + "step": 40135 + }, + { + "epoch": 0.3483997534743622, + "grad_norm": 1.8828125, + "learning_rate": 0.0013457999001415455, + "loss": 0.1484, + "step": 40136 + }, + { + "epoch": 0.3484084339545664, + "grad_norm": 0.3515625, + "learning_rate": 0.0013457712690438358, + "loss": 0.0986, + "step": 40137 + }, + { + "epoch": 0.3484171144347705, + "grad_norm": 0.65234375, + "learning_rate": 0.0013457426376773543, + "loss": 0.1006, + "step": 40138 + }, + { + "epoch": 0.3484257949149747, + "grad_norm": 0.1044921875, + "learning_rate": 0.0013457140060421327, + "loss": 0.1079, + "step": 40139 + }, + { + "epoch": 0.34843447539517886, + "grad_norm": 0.27734375, + "learning_rate": 0.0013456853741382016, + "loss": 0.105, + "step": 40140 + }, + { + "epoch": 0.34844315587538305, + "grad_norm": 0.466796875, + "learning_rate": 0.0013456567419655927, + "loss": 0.1172, + "step": 40141 + }, + { + "epoch": 0.3484518363555872, + "grad_norm": 0.546875, + "learning_rate": 0.0013456281095243375, + "loss": 0.1055, + "step": 40142 + }, + { + "epoch": 0.3484605168357914, + "grad_norm": 0.3671875, + "learning_rate": 0.001345599476814467, + "loss": 0.0938, + "step": 40143 + }, + { + "epoch": 0.3484691973159955, + "grad_norm": 0.08447265625, + "learning_rate": 0.0013455708438360125, + "loss": 0.0684, + "step": 40144 + }, + { + "epoch": 0.3484778777961997, + "grad_norm": 0.69921875, + "learning_rate": 0.0013455422105890054, + "loss": 0.0811, + "step": 40145 + }, + { + "epoch": 0.34848655827640385, + "grad_norm": 0.326171875, + "learning_rate": 0.001345513577073477, + "loss": 0.1143, + "step": 40146 + }, + { + "epoch": 0.34849523875660804, + "grad_norm": 0.10400390625, + "learning_rate": 0.0013454849432894588, + "loss": 0.1016, + "step": 40147 + }, + { + "epoch": 0.3485039192368122, + "grad_norm": 0.166015625, + "learning_rate": 0.0013454563092369818, + "loss": 0.0957, + "step": 40148 + }, + { + "epoch": 0.34851259971701637, + "grad_norm": 0.2890625, + "learning_rate": 0.0013454276749160776, + "loss": 0.1426, + "step": 40149 + }, + { + "epoch": 0.3485212801972205, + "grad_norm": 0.55859375, + "learning_rate": 0.0013453990403267775, + 
"loss": 0.0791, + "step": 40150 + }, + { + "epoch": 0.3485299606774247, + "grad_norm": 1.0078125, + "learning_rate": 0.0013453704054691123, + "loss": 0.1396, + "step": 40151 + }, + { + "epoch": 0.34853864115762884, + "grad_norm": 1.21875, + "learning_rate": 0.001345341770343114, + "loss": 0.1299, + "step": 40152 + }, + { + "epoch": 0.34854732163783303, + "grad_norm": 0.1142578125, + "learning_rate": 0.0013453131349488137, + "loss": 0.0903, + "step": 40153 + }, + { + "epoch": 0.34855600211803717, + "grad_norm": 0.25, + "learning_rate": 0.0013452844992862423, + "loss": 0.1182, + "step": 40154 + }, + { + "epoch": 0.34856468259824136, + "grad_norm": 0.91015625, + "learning_rate": 0.0013452558633554315, + "loss": 0.1094, + "step": 40155 + }, + { + "epoch": 0.3485733630784455, + "grad_norm": 0.3203125, + "learning_rate": 0.001345227227156413, + "loss": 0.0674, + "step": 40156 + }, + { + "epoch": 0.3485820435586497, + "grad_norm": 0.228515625, + "learning_rate": 0.0013451985906892175, + "loss": 0.0889, + "step": 40157 + }, + { + "epoch": 0.3485907240388538, + "grad_norm": 0.515625, + "learning_rate": 0.0013451699539538764, + "loss": 0.085, + "step": 40158 + }, + { + "epoch": 0.348599404519058, + "grad_norm": 0.1259765625, + "learning_rate": 0.001345141316950421, + "loss": 0.1152, + "step": 40159 + }, + { + "epoch": 0.34860808499926216, + "grad_norm": 0.71875, + "learning_rate": 0.001345112679678883, + "loss": 0.085, + "step": 40160 + }, + { + "epoch": 0.34861676547946635, + "grad_norm": 0.208984375, + "learning_rate": 0.0013450840421392934, + "loss": 0.0923, + "step": 40161 + }, + { + "epoch": 0.3486254459596705, + "grad_norm": 0.7734375, + "learning_rate": 0.0013450554043316833, + "loss": 0.1025, + "step": 40162 + }, + { + "epoch": 0.3486341264398747, + "grad_norm": 0.7578125, + "learning_rate": 0.0013450267662560847, + "loss": 0.1216, + "step": 40163 + }, + { + "epoch": 0.3486428069200788, + "grad_norm": 0.470703125, + "learning_rate": 0.0013449981279125283, + "loss": 0.0898, + "step": 40164 + }, + { + "epoch": 0.348651487400283, + "grad_norm": 0.1259765625, + "learning_rate": 0.0013449694893010453, + "loss": 0.1133, + "step": 40165 + }, + { + "epoch": 0.34866016788048715, + "grad_norm": 0.353515625, + "learning_rate": 0.0013449408504216677, + "loss": 0.0801, + "step": 40166 + }, + { + "epoch": 0.3486688483606913, + "grad_norm": 0.17578125, + "learning_rate": 0.0013449122112744267, + "loss": 0.1167, + "step": 40167 + }, + { + "epoch": 0.3486775288408955, + "grad_norm": 0.17578125, + "learning_rate": 0.0013448835718593531, + "loss": 0.0981, + "step": 40168 + }, + { + "epoch": 0.3486862093210996, + "grad_norm": 0.5625, + "learning_rate": 0.0013448549321764787, + "loss": 0.1118, + "step": 40169 + }, + { + "epoch": 0.3486948898013038, + "grad_norm": 0.470703125, + "learning_rate": 0.0013448262922258344, + "loss": 0.1035, + "step": 40170 + }, + { + "epoch": 0.34870357028150795, + "grad_norm": 0.400390625, + "learning_rate": 0.001344797652007452, + "loss": 0.1084, + "step": 40171 + }, + { + "epoch": 0.34871225076171214, + "grad_norm": 0.98046875, + "learning_rate": 0.0013447690115213622, + "loss": 0.0947, + "step": 40172 + }, + { + "epoch": 0.3487209312419163, + "grad_norm": 0.09033203125, + "learning_rate": 0.0013447403707675968, + "loss": 0.083, + "step": 40173 + }, + { + "epoch": 0.34872961172212047, + "grad_norm": 0.5390625, + "learning_rate": 0.001344711729746187, + "loss": 0.1348, + "step": 40174 + }, + { + "epoch": 0.3487382922023246, + "grad_norm": 0.126953125, + "learning_rate": 
0.0013446830884571643, + "loss": 0.0767, + "step": 40175 + }, + { + "epoch": 0.3487469726825288, + "grad_norm": 0.1884765625, + "learning_rate": 0.00134465444690056, + "loss": 0.0801, + "step": 40176 + }, + { + "epoch": 0.34875565316273294, + "grad_norm": 0.86328125, + "learning_rate": 0.001344625805076405, + "loss": 0.1055, + "step": 40177 + }, + { + "epoch": 0.34876433364293713, + "grad_norm": 0.369140625, + "learning_rate": 0.0013445971629847309, + "loss": 0.1162, + "step": 40178 + }, + { + "epoch": 0.34877301412314127, + "grad_norm": 0.1337890625, + "learning_rate": 0.001344568520625569, + "loss": 0.1426, + "step": 40179 + }, + { + "epoch": 0.34878169460334546, + "grad_norm": 0.6796875, + "learning_rate": 0.0013445398779989507, + "loss": 0.0859, + "step": 40180 + }, + { + "epoch": 0.3487903750835496, + "grad_norm": 0.56640625, + "learning_rate": 0.0013445112351049072, + "loss": 0.0811, + "step": 40181 + }, + { + "epoch": 0.3487990555637538, + "grad_norm": 0.1591796875, + "learning_rate": 0.00134448259194347, + "loss": 0.1172, + "step": 40182 + }, + { + "epoch": 0.3488077360439579, + "grad_norm": 0.1337890625, + "learning_rate": 0.00134445394851467, + "loss": 0.1006, + "step": 40183 + }, + { + "epoch": 0.3488164165241621, + "grad_norm": 0.27734375, + "learning_rate": 0.0013444253048185393, + "loss": 0.0767, + "step": 40184 + }, + { + "epoch": 0.34882509700436626, + "grad_norm": 0.365234375, + "learning_rate": 0.0013443966608551084, + "loss": 0.1318, + "step": 40185 + }, + { + "epoch": 0.34883377748457045, + "grad_norm": 0.08837890625, + "learning_rate": 0.0013443680166244092, + "loss": 0.0747, + "step": 40186 + }, + { + "epoch": 0.3488424579647746, + "grad_norm": 0.6171875, + "learning_rate": 0.001344339372126473, + "loss": 0.123, + "step": 40187 + }, + { + "epoch": 0.3488511384449788, + "grad_norm": 1.15625, + "learning_rate": 0.0013443107273613307, + "loss": 0.0996, + "step": 40188 + }, + { + "epoch": 0.3488598189251829, + "grad_norm": 0.134765625, + "learning_rate": 0.0013442820823290136, + "loss": 0.1113, + "step": 40189 + }, + { + "epoch": 0.3488684994053871, + "grad_norm": 0.259765625, + "learning_rate": 0.0013442534370295535, + "loss": 0.0835, + "step": 40190 + }, + { + "epoch": 0.34887717988559125, + "grad_norm": 0.1259765625, + "learning_rate": 0.0013442247914629817, + "loss": 0.0811, + "step": 40191 + }, + { + "epoch": 0.34888586036579544, + "grad_norm": 0.51953125, + "learning_rate": 0.0013441961456293289, + "loss": 0.124, + "step": 40192 + }, + { + "epoch": 0.3488945408459996, + "grad_norm": 0.291015625, + "learning_rate": 0.0013441674995286273, + "loss": 0.0884, + "step": 40193 + }, + { + "epoch": 0.34890322132620377, + "grad_norm": 0.7109375, + "learning_rate": 0.0013441388531609073, + "loss": 0.0835, + "step": 40194 + }, + { + "epoch": 0.3489119018064079, + "grad_norm": 0.61328125, + "learning_rate": 0.0013441102065262012, + "loss": 0.0889, + "step": 40195 + }, + { + "epoch": 0.3489205822866121, + "grad_norm": 0.3984375, + "learning_rate": 0.0013440815596245399, + "loss": 0.1074, + "step": 40196 + }, + { + "epoch": 0.34892926276681624, + "grad_norm": 0.0869140625, + "learning_rate": 0.0013440529124559544, + "loss": 0.0747, + "step": 40197 + }, + { + "epoch": 0.34893794324702043, + "grad_norm": 0.263671875, + "learning_rate": 0.0013440242650204764, + "loss": 0.1357, + "step": 40198 + }, + { + "epoch": 0.34894662372722457, + "grad_norm": 0.126953125, + "learning_rate": 0.0013439956173181369, + "loss": 0.1035, + "step": 40199 + }, + { + "epoch": 0.34895530420742876, + 
"grad_norm": 0.25, + "learning_rate": 0.0013439669693489679, + "loss": 0.082, + "step": 40200 + }, + { + "epoch": 0.3489639846876329, + "grad_norm": 0.2109375, + "learning_rate": 0.0013439383211130002, + "loss": 0.1182, + "step": 40201 + }, + { + "epoch": 0.3489726651678371, + "grad_norm": 0.36328125, + "learning_rate": 0.0013439096726102647, + "loss": 0.0781, + "step": 40202 + }, + { + "epoch": 0.34898134564804123, + "grad_norm": 0.37109375, + "learning_rate": 0.0013438810238407934, + "loss": 0.0879, + "step": 40203 + }, + { + "epoch": 0.3489900261282454, + "grad_norm": 0.119140625, + "learning_rate": 0.0013438523748046179, + "loss": 0.0742, + "step": 40204 + }, + { + "epoch": 0.34899870660844956, + "grad_norm": 0.375, + "learning_rate": 0.001343823725501769, + "loss": 0.0933, + "step": 40205 + }, + { + "epoch": 0.34900738708865375, + "grad_norm": 0.140625, + "learning_rate": 0.0013437950759322779, + "loss": 0.1416, + "step": 40206 + }, + { + "epoch": 0.3490160675688579, + "grad_norm": 0.1025390625, + "learning_rate": 0.001343766426096176, + "loss": 0.0796, + "step": 40207 + }, + { + "epoch": 0.3490247480490621, + "grad_norm": 0.11474609375, + "learning_rate": 0.0013437377759934952, + "loss": 0.1367, + "step": 40208 + }, + { + "epoch": 0.3490334285292662, + "grad_norm": 0.55859375, + "learning_rate": 0.0013437091256242664, + "loss": 0.1206, + "step": 40209 + }, + { + "epoch": 0.3490421090094704, + "grad_norm": 0.169921875, + "learning_rate": 0.001343680474988521, + "loss": 0.1279, + "step": 40210 + }, + { + "epoch": 0.34905078948967455, + "grad_norm": 0.67578125, + "learning_rate": 0.00134365182408629, + "loss": 0.1025, + "step": 40211 + }, + { + "epoch": 0.34905946996987874, + "grad_norm": 0.29296875, + "learning_rate": 0.001343623172917605, + "loss": 0.0903, + "step": 40212 + }, + { + "epoch": 0.3490681504500829, + "grad_norm": 0.515625, + "learning_rate": 0.0013435945214824978, + "loss": 0.1055, + "step": 40213 + }, + { + "epoch": 0.3490768309302871, + "grad_norm": 0.193359375, + "learning_rate": 0.001343565869780999, + "loss": 0.0869, + "step": 40214 + }, + { + "epoch": 0.3490855114104912, + "grad_norm": 0.59765625, + "learning_rate": 0.0013435372178131403, + "loss": 0.0859, + "step": 40215 + }, + { + "epoch": 0.3490941918906954, + "grad_norm": 0.365234375, + "learning_rate": 0.001343508565578953, + "loss": 0.1094, + "step": 40216 + }, + { + "epoch": 0.34910287237089954, + "grad_norm": 0.07861328125, + "learning_rate": 0.0013434799130784686, + "loss": 0.0796, + "step": 40217 + }, + { + "epoch": 0.34911155285110373, + "grad_norm": 0.2255859375, + "learning_rate": 0.0013434512603117178, + "loss": 0.0767, + "step": 40218 + }, + { + "epoch": 0.34912023333130787, + "grad_norm": 0.5625, + "learning_rate": 0.0013434226072787327, + "loss": 0.084, + "step": 40219 + }, + { + "epoch": 0.34912891381151206, + "grad_norm": 0.119140625, + "learning_rate": 0.0013433939539795438, + "loss": 0.1045, + "step": 40220 + }, + { + "epoch": 0.3491375942917162, + "grad_norm": 0.2216796875, + "learning_rate": 0.0013433653004141833, + "loss": 0.0684, + "step": 40221 + }, + { + "epoch": 0.3491462747719204, + "grad_norm": 0.2177734375, + "learning_rate": 0.001343336646582682, + "loss": 0.0977, + "step": 40222 + }, + { + "epoch": 0.34915495525212453, + "grad_norm": 0.515625, + "learning_rate": 0.0013433079924850715, + "loss": 0.1328, + "step": 40223 + }, + { + "epoch": 0.3491636357323287, + "grad_norm": 0.33203125, + "learning_rate": 0.001343279338121383, + "loss": 0.0947, + "step": 40224 + }, + { + "epoch": 
0.34917231621253286, + "grad_norm": 0.130859375, + "learning_rate": 0.001343250683491648, + "loss": 0.1289, + "step": 40225 + }, + { + "epoch": 0.34918099669273706, + "grad_norm": 0.08349609375, + "learning_rate": 0.0013432220285958976, + "loss": 0.0889, + "step": 40226 + }, + { + "epoch": 0.3491896771729412, + "grad_norm": 0.435546875, + "learning_rate": 0.0013431933734341633, + "loss": 0.0986, + "step": 40227 + }, + { + "epoch": 0.3491983576531454, + "grad_norm": 0.859375, + "learning_rate": 0.001343164718006476, + "loss": 0.0991, + "step": 40228 + }, + { + "epoch": 0.3492070381333495, + "grad_norm": 0.138671875, + "learning_rate": 0.001343136062312868, + "loss": 0.0913, + "step": 40229 + }, + { + "epoch": 0.3492157186135537, + "grad_norm": 0.1650390625, + "learning_rate": 0.0013431074063533699, + "loss": 0.0942, + "step": 40230 + }, + { + "epoch": 0.34922439909375785, + "grad_norm": 0.2294921875, + "learning_rate": 0.001343078750128013, + "loss": 0.0811, + "step": 40231 + }, + { + "epoch": 0.34923307957396205, + "grad_norm": 0.181640625, + "learning_rate": 0.0013430500936368291, + "loss": 0.0947, + "step": 40232 + }, + { + "epoch": 0.3492417600541662, + "grad_norm": 0.7109375, + "learning_rate": 0.0013430214368798487, + "loss": 0.1289, + "step": 40233 + }, + { + "epoch": 0.3492504405343704, + "grad_norm": 0.08642578125, + "learning_rate": 0.0013429927798571041, + "loss": 0.1016, + "step": 40234 + }, + { + "epoch": 0.3492591210145745, + "grad_norm": 0.14453125, + "learning_rate": 0.0013429641225686266, + "loss": 0.1055, + "step": 40235 + }, + { + "epoch": 0.3492678014947787, + "grad_norm": 0.326171875, + "learning_rate": 0.001342935465014447, + "loss": 0.1104, + "step": 40236 + }, + { + "epoch": 0.34927648197498284, + "grad_norm": 0.353515625, + "learning_rate": 0.0013429068071945964, + "loss": 0.1025, + "step": 40237 + }, + { + "epoch": 0.34928516245518704, + "grad_norm": 0.734375, + "learning_rate": 0.0013428781491091068, + "loss": 0.2461, + "step": 40238 + }, + { + "epoch": 0.3492938429353912, + "grad_norm": 0.2041015625, + "learning_rate": 0.0013428494907580094, + "loss": 0.1367, + "step": 40239 + }, + { + "epoch": 0.34930252341559537, + "grad_norm": 0.1826171875, + "learning_rate": 0.0013428208321413352, + "loss": 0.1201, + "step": 40240 + }, + { + "epoch": 0.3493112038957995, + "grad_norm": 0.3671875, + "learning_rate": 0.0013427921732591162, + "loss": 0.1172, + "step": 40241 + }, + { + "epoch": 0.3493198843760037, + "grad_norm": 0.12060546875, + "learning_rate": 0.0013427635141113829, + "loss": 0.1201, + "step": 40242 + }, + { + "epoch": 0.34932856485620783, + "grad_norm": 0.69140625, + "learning_rate": 0.0013427348546981672, + "loss": 0.0918, + "step": 40243 + }, + { + "epoch": 0.34933724533641203, + "grad_norm": 0.240234375, + "learning_rate": 0.0013427061950195007, + "loss": 0.104, + "step": 40244 + }, + { + "epoch": 0.34934592581661617, + "grad_norm": 0.1396484375, + "learning_rate": 0.001342677535075414, + "loss": 0.1299, + "step": 40245 + }, + { + "epoch": 0.34935460629682036, + "grad_norm": 0.10107421875, + "learning_rate": 0.0013426488748659387, + "loss": 0.1118, + "step": 40246 + }, + { + "epoch": 0.3493632867770245, + "grad_norm": 0.09912109375, + "learning_rate": 0.0013426202143911063, + "loss": 0.1357, + "step": 40247 + }, + { + "epoch": 0.3493719672572287, + "grad_norm": 0.33203125, + "learning_rate": 0.0013425915536509484, + "loss": 0.1045, + "step": 40248 + }, + { + "epoch": 0.3493806477374328, + "grad_norm": 0.1796875, + "learning_rate": 0.0013425628926454956, + 
"loss": 0.0928, + "step": 40249 + }, + { + "epoch": 0.349389328217637, + "grad_norm": 0.103515625, + "learning_rate": 0.0013425342313747798, + "loss": 0.0967, + "step": 40250 + }, + { + "epoch": 0.34939800869784116, + "grad_norm": 0.431640625, + "learning_rate": 0.0013425055698388323, + "loss": 0.0942, + "step": 40251 + }, + { + "epoch": 0.34940668917804535, + "grad_norm": 0.177734375, + "learning_rate": 0.001342476908037684, + "loss": 0.1357, + "step": 40252 + }, + { + "epoch": 0.3494153696582495, + "grad_norm": 0.322265625, + "learning_rate": 0.0013424482459713672, + "loss": 0.1079, + "step": 40253 + }, + { + "epoch": 0.3494240501384537, + "grad_norm": 0.134765625, + "learning_rate": 0.0013424195836399123, + "loss": 0.1201, + "step": 40254 + }, + { + "epoch": 0.3494327306186578, + "grad_norm": 0.11083984375, + "learning_rate": 0.001342390921043351, + "loss": 0.0903, + "step": 40255 + }, + { + "epoch": 0.349441411098862, + "grad_norm": 0.35546875, + "learning_rate": 0.0013423622581817147, + "loss": 0.1523, + "step": 40256 + }, + { + "epoch": 0.34945009157906615, + "grad_norm": 0.66015625, + "learning_rate": 0.0013423335950550346, + "loss": 0.1069, + "step": 40257 + }, + { + "epoch": 0.34945877205927034, + "grad_norm": 0.53125, + "learning_rate": 0.0013423049316633423, + "loss": 0.084, + "step": 40258 + }, + { + "epoch": 0.3494674525394745, + "grad_norm": 0.15234375, + "learning_rate": 0.0013422762680066684, + "loss": 0.0894, + "step": 40259 + }, + { + "epoch": 0.34947613301967867, + "grad_norm": 0.1123046875, + "learning_rate": 0.0013422476040850454, + "loss": 0.0977, + "step": 40260 + }, + { + "epoch": 0.3494848134998828, + "grad_norm": 0.11279296875, + "learning_rate": 0.0013422189398985037, + "loss": 0.0942, + "step": 40261 + }, + { + "epoch": 0.349493493980087, + "grad_norm": 0.451171875, + "learning_rate": 0.0013421902754470751, + "loss": 0.0962, + "step": 40262 + }, + { + "epoch": 0.34950217446029114, + "grad_norm": 0.279296875, + "learning_rate": 0.001342161610730791, + "loss": 0.1094, + "step": 40263 + }, + { + "epoch": 0.34951085494049533, + "grad_norm": 0.1337890625, + "learning_rate": 0.0013421329457496826, + "loss": 0.1216, + "step": 40264 + }, + { + "epoch": 0.34951953542069947, + "grad_norm": 0.9296875, + "learning_rate": 0.001342104280503781, + "loss": 0.1123, + "step": 40265 + }, + { + "epoch": 0.34952821590090366, + "grad_norm": 0.271484375, + "learning_rate": 0.0013420756149931184, + "loss": 0.1143, + "step": 40266 + }, + { + "epoch": 0.3495368963811078, + "grad_norm": 0.369140625, + "learning_rate": 0.001342046949217725, + "loss": 0.0854, + "step": 40267 + }, + { + "epoch": 0.349545576861312, + "grad_norm": 0.19140625, + "learning_rate": 0.0013420182831776325, + "loss": 0.1367, + "step": 40268 + }, + { + "epoch": 0.34955425734151613, + "grad_norm": 0.177734375, + "learning_rate": 0.001341989616872873, + "loss": 0.1113, + "step": 40269 + }, + { + "epoch": 0.3495629378217203, + "grad_norm": 0.181640625, + "learning_rate": 0.001341960950303477, + "loss": 0.0947, + "step": 40270 + }, + { + "epoch": 0.34957161830192446, + "grad_norm": 0.166015625, + "learning_rate": 0.0013419322834694763, + "loss": 0.1133, + "step": 40271 + }, + { + "epoch": 0.34958029878212865, + "grad_norm": 0.1123046875, + "learning_rate": 0.001341903616370902, + "loss": 0.0947, + "step": 40272 + }, + { + "epoch": 0.3495889792623328, + "grad_norm": 0.41796875, + "learning_rate": 0.0013418749490077856, + "loss": 0.1221, + "step": 40273 + }, + { + "epoch": 0.349597659742537, + "grad_norm": 0.2451171875, + 
"learning_rate": 0.0013418462813801582, + "loss": 0.0718, + "step": 40274 + }, + { + "epoch": 0.3496063402227411, + "grad_norm": 0.384765625, + "learning_rate": 0.0013418176134880515, + "loss": 0.1006, + "step": 40275 + }, + { + "epoch": 0.3496150207029453, + "grad_norm": 0.400390625, + "learning_rate": 0.0013417889453314967, + "loss": 0.1396, + "step": 40276 + }, + { + "epoch": 0.34962370118314945, + "grad_norm": 0.2177734375, + "learning_rate": 0.0013417602769105251, + "loss": 0.125, + "step": 40277 + }, + { + "epoch": 0.34963238166335364, + "grad_norm": 0.1416015625, + "learning_rate": 0.001341731608225168, + "loss": 0.0708, + "step": 40278 + }, + { + "epoch": 0.3496410621435578, + "grad_norm": 0.5703125, + "learning_rate": 0.0013417029392754568, + "loss": 0.1055, + "step": 40279 + }, + { + "epoch": 0.34964974262376197, + "grad_norm": 0.14453125, + "learning_rate": 0.001341674270061423, + "loss": 0.125, + "step": 40280 + }, + { + "epoch": 0.3496584231039661, + "grad_norm": 0.1015625, + "learning_rate": 0.0013416456005830976, + "loss": 0.0732, + "step": 40281 + }, + { + "epoch": 0.3496671035841703, + "grad_norm": 0.1748046875, + "learning_rate": 0.0013416169308405129, + "loss": 0.1191, + "step": 40282 + }, + { + "epoch": 0.34967578406437444, + "grad_norm": 0.1904296875, + "learning_rate": 0.0013415882608336992, + "loss": 0.1562, + "step": 40283 + }, + { + "epoch": 0.34968446454457863, + "grad_norm": 0.11669921875, + "learning_rate": 0.0013415595905626881, + "loss": 0.0811, + "step": 40284 + }, + { + "epoch": 0.34969314502478277, + "grad_norm": 0.11474609375, + "learning_rate": 0.001341530920027511, + "loss": 0.1094, + "step": 40285 + }, + { + "epoch": 0.34970182550498696, + "grad_norm": 0.0751953125, + "learning_rate": 0.0013415022492281993, + "loss": 0.0562, + "step": 40286 + }, + { + "epoch": 0.3497105059851911, + "grad_norm": 0.265625, + "learning_rate": 0.0013414735781647846, + "loss": 0.0679, + "step": 40287 + }, + { + "epoch": 0.3497191864653953, + "grad_norm": 0.1845703125, + "learning_rate": 0.0013414449068372978, + "loss": 0.0991, + "step": 40288 + }, + { + "epoch": 0.34972786694559943, + "grad_norm": 0.38671875, + "learning_rate": 0.0013414162352457706, + "loss": 0.1367, + "step": 40289 + }, + { + "epoch": 0.34973654742580357, + "grad_norm": 0.05908203125, + "learning_rate": 0.001341387563390234, + "loss": 0.0762, + "step": 40290 + }, + { + "epoch": 0.34974522790600776, + "grad_norm": 0.1064453125, + "learning_rate": 0.0013413588912707202, + "loss": 0.125, + "step": 40291 + }, + { + "epoch": 0.3497539083862119, + "grad_norm": 0.1572265625, + "learning_rate": 0.0013413302188872594, + "loss": 0.1357, + "step": 40292 + }, + { + "epoch": 0.3497625888664161, + "grad_norm": 0.10888671875, + "learning_rate": 0.0013413015462398837, + "loss": 0.1084, + "step": 40293 + }, + { + "epoch": 0.34977126934662023, + "grad_norm": 0.0908203125, + "learning_rate": 0.0013412728733286242, + "loss": 0.1016, + "step": 40294 + }, + { + "epoch": 0.3497799498268244, + "grad_norm": 0.1650390625, + "learning_rate": 0.0013412442001535124, + "loss": 0.1309, + "step": 40295 + }, + { + "epoch": 0.34978863030702856, + "grad_norm": 0.88671875, + "learning_rate": 0.0013412155267145795, + "loss": 0.1406, + "step": 40296 + }, + { + "epoch": 0.34979731078723275, + "grad_norm": 0.212890625, + "learning_rate": 0.0013411868530118567, + "loss": 0.0967, + "step": 40297 + }, + { + "epoch": 0.3498059912674369, + "grad_norm": 0.1513671875, + "learning_rate": 0.001341158179045376, + "loss": 0.0898, + "step": 40298 + }, + { + 
"epoch": 0.3498146717476411, + "grad_norm": 0.09521484375, + "learning_rate": 0.001341129504815168, + "loss": 0.0938, + "step": 40299 + }, + { + "epoch": 0.3498233522278452, + "grad_norm": 0.1435546875, + "learning_rate": 0.0013411008303212647, + "loss": 0.1162, + "step": 40300 + }, + { + "epoch": 0.3498320327080494, + "grad_norm": 0.166015625, + "learning_rate": 0.0013410721555636967, + "loss": 0.0815, + "step": 40301 + }, + { + "epoch": 0.34984071318825355, + "grad_norm": 0.326171875, + "learning_rate": 0.001341043480542496, + "loss": 0.1562, + "step": 40302 + }, + { + "epoch": 0.34984939366845774, + "grad_norm": 0.28515625, + "learning_rate": 0.001341014805257694, + "loss": 0.0908, + "step": 40303 + }, + { + "epoch": 0.3498580741486619, + "grad_norm": 0.10107421875, + "learning_rate": 0.0013409861297093218, + "loss": 0.0781, + "step": 40304 + }, + { + "epoch": 0.3498667546288661, + "grad_norm": 0.2333984375, + "learning_rate": 0.0013409574538974108, + "loss": 0.1006, + "step": 40305 + }, + { + "epoch": 0.3498754351090702, + "grad_norm": 0.447265625, + "learning_rate": 0.0013409287778219918, + "loss": 0.1025, + "step": 40306 + }, + { + "epoch": 0.3498841155892744, + "grad_norm": 0.1279296875, + "learning_rate": 0.0013409001014830974, + "loss": 0.1006, + "step": 40307 + }, + { + "epoch": 0.34989279606947854, + "grad_norm": 0.4453125, + "learning_rate": 0.001340871424880758, + "loss": 0.0747, + "step": 40308 + }, + { + "epoch": 0.34990147654968273, + "grad_norm": 0.625, + "learning_rate": 0.001340842748015005, + "loss": 0.1523, + "step": 40309 + }, + { + "epoch": 0.34991015702988687, + "grad_norm": 0.2314453125, + "learning_rate": 0.00134081407088587, + "loss": 0.0879, + "step": 40310 + }, + { + "epoch": 0.34991883751009106, + "grad_norm": 0.150390625, + "learning_rate": 0.0013407853934933847, + "loss": 0.1377, + "step": 40311 + }, + { + "epoch": 0.3499275179902952, + "grad_norm": 0.287109375, + "learning_rate": 0.0013407567158375802, + "loss": 0.0669, + "step": 40312 + }, + { + "epoch": 0.3499361984704994, + "grad_norm": 0.73046875, + "learning_rate": 0.0013407280379184874, + "loss": 0.084, + "step": 40313 + }, + { + "epoch": 0.34994487895070353, + "grad_norm": 0.55078125, + "learning_rate": 0.0013406993597361383, + "loss": 0.1406, + "step": 40314 + }, + { + "epoch": 0.3499535594309077, + "grad_norm": 0.16796875, + "learning_rate": 0.001340670681290564, + "loss": 0.0845, + "step": 40315 + }, + { + "epoch": 0.34996223991111186, + "grad_norm": 0.48828125, + "learning_rate": 0.0013406420025817959, + "loss": 0.1025, + "step": 40316 + }, + { + "epoch": 0.34997092039131605, + "grad_norm": 0.65625, + "learning_rate": 0.001340613323609865, + "loss": 0.1494, + "step": 40317 + }, + { + "epoch": 0.3499796008715202, + "grad_norm": 0.380859375, + "learning_rate": 0.0013405846443748028, + "loss": 0.1167, + "step": 40318 + }, + { + "epoch": 0.3499882813517244, + "grad_norm": 0.42578125, + "learning_rate": 0.0013405559648766415, + "loss": 0.1943, + "step": 40319 + }, + { + "epoch": 0.3499969618319285, + "grad_norm": 0.578125, + "learning_rate": 0.0013405272851154113, + "loss": 0.1201, + "step": 40320 + }, + { + "epoch": 0.3500056423121327, + "grad_norm": 0.279296875, + "learning_rate": 0.0013404986050911446, + "loss": 0.0967, + "step": 40321 + }, + { + "epoch": 0.35001432279233685, + "grad_norm": 0.2578125, + "learning_rate": 0.0013404699248038717, + "loss": 0.0786, + "step": 40322 + }, + { + "epoch": 0.35002300327254104, + "grad_norm": 0.1953125, + "learning_rate": 0.0013404412442536246, + "loss": 
0.0684, + "step": 40323 + }, + { + "epoch": 0.3500316837527452, + "grad_norm": 0.1708984375, + "learning_rate": 0.0013404125634404347, + "loss": 0.1152, + "step": 40324 + }, + { + "epoch": 0.3500403642329494, + "grad_norm": 0.1845703125, + "learning_rate": 0.0013403838823643334, + "loss": 0.0967, + "step": 40325 + }, + { + "epoch": 0.3500490447131535, + "grad_norm": 0.1865234375, + "learning_rate": 0.0013403552010253519, + "loss": 0.127, + "step": 40326 + }, + { + "epoch": 0.3500577251933577, + "grad_norm": 0.78125, + "learning_rate": 0.001340326519423521, + "loss": 0.1006, + "step": 40327 + }, + { + "epoch": 0.35006640567356184, + "grad_norm": 0.85546875, + "learning_rate": 0.001340297837558873, + "loss": 0.1196, + "step": 40328 + }, + { + "epoch": 0.35007508615376604, + "grad_norm": 0.2451171875, + "learning_rate": 0.001340269155431439, + "loss": 0.0884, + "step": 40329 + }, + { + "epoch": 0.3500837666339702, + "grad_norm": 0.08984375, + "learning_rate": 0.0013402404730412502, + "loss": 0.085, + "step": 40330 + }, + { + "epoch": 0.35009244711417437, + "grad_norm": 0.1826171875, + "learning_rate": 0.0013402117903883381, + "loss": 0.124, + "step": 40331 + }, + { + "epoch": 0.3501011275943785, + "grad_norm": 0.0908203125, + "learning_rate": 0.001340183107472734, + "loss": 0.0845, + "step": 40332 + }, + { + "epoch": 0.3501098080745827, + "grad_norm": 0.86328125, + "learning_rate": 0.001340154424294469, + "loss": 0.1465, + "step": 40333 + }, + { + "epoch": 0.35011848855478683, + "grad_norm": 0.12060546875, + "learning_rate": 0.001340125740853575, + "loss": 0.0889, + "step": 40334 + }, + { + "epoch": 0.350127169034991, + "grad_norm": 0.17578125, + "learning_rate": 0.001340097057150083, + "loss": 0.1328, + "step": 40335 + }, + { + "epoch": 0.35013584951519516, + "grad_norm": 0.48828125, + "learning_rate": 0.0013400683731840243, + "loss": 0.0918, + "step": 40336 + }, + { + "epoch": 0.35014452999539936, + "grad_norm": 0.2265625, + "learning_rate": 0.0013400396889554306, + "loss": 0.0869, + "step": 40337 + }, + { + "epoch": 0.3501532104756035, + "grad_norm": 0.470703125, + "learning_rate": 0.001340011004464333, + "loss": 0.1064, + "step": 40338 + }, + { + "epoch": 0.3501618909558077, + "grad_norm": 0.08984375, + "learning_rate": 0.0013399823197107633, + "loss": 0.082, + "step": 40339 + }, + { + "epoch": 0.3501705714360118, + "grad_norm": 0.79296875, + "learning_rate": 0.0013399536346947523, + "loss": 0.0752, + "step": 40340 + }, + { + "epoch": 0.350179251916216, + "grad_norm": 0.08837890625, + "learning_rate": 0.0013399249494163315, + "loss": 0.0771, + "step": 40341 + }, + { + "epoch": 0.35018793239642015, + "grad_norm": 0.17578125, + "learning_rate": 0.0013398962638755325, + "loss": 0.0918, + "step": 40342 + }, + { + "epoch": 0.35019661287662435, + "grad_norm": 0.185546875, + "learning_rate": 0.0013398675780723866, + "loss": 0.1318, + "step": 40343 + }, + { + "epoch": 0.3502052933568285, + "grad_norm": 0.3828125, + "learning_rate": 0.0013398388920069252, + "loss": 0.1328, + "step": 40344 + }, + { + "epoch": 0.3502139738370327, + "grad_norm": 0.1669921875, + "learning_rate": 0.0013398102056791793, + "loss": 0.126, + "step": 40345 + }, + { + "epoch": 0.3502226543172368, + "grad_norm": 0.1640625, + "learning_rate": 0.0013397815190891807, + "loss": 0.0918, + "step": 40346 + }, + { + "epoch": 0.350231334797441, + "grad_norm": 0.1669921875, + "learning_rate": 0.0013397528322369606, + "loss": 0.0967, + "step": 40347 + }, + { + "epoch": 0.35024001527764514, + "grad_norm": 0.07275390625, + 
"learning_rate": 0.0013397241451225505, + "loss": 0.0664, + "step": 40348 + }, + { + "epoch": 0.35024869575784934, + "grad_norm": 0.2265625, + "learning_rate": 0.0013396954577459814, + "loss": 0.1152, + "step": 40349 + }, + { + "epoch": 0.3502573762380535, + "grad_norm": 0.146484375, + "learning_rate": 0.0013396667701072852, + "loss": 0.0835, + "step": 40350 + }, + { + "epoch": 0.35026605671825767, + "grad_norm": 0.119140625, + "learning_rate": 0.0013396380822064933, + "loss": 0.1387, + "step": 40351 + }, + { + "epoch": 0.3502747371984618, + "grad_norm": 0.451171875, + "learning_rate": 0.0013396093940436363, + "loss": 0.1108, + "step": 40352 + }, + { + "epoch": 0.350283417678666, + "grad_norm": 0.3828125, + "learning_rate": 0.0013395807056187463, + "loss": 0.104, + "step": 40353 + }, + { + "epoch": 0.35029209815887014, + "grad_norm": 0.578125, + "learning_rate": 0.0013395520169318545, + "loss": 0.1147, + "step": 40354 + }, + { + "epoch": 0.35030077863907433, + "grad_norm": 0.123046875, + "learning_rate": 0.001339523327982992, + "loss": 0.1206, + "step": 40355 + }, + { + "epoch": 0.35030945911927847, + "grad_norm": 0.09765625, + "learning_rate": 0.0013394946387721906, + "loss": 0.1133, + "step": 40356 + }, + { + "epoch": 0.35031813959948266, + "grad_norm": 0.322265625, + "learning_rate": 0.0013394659492994812, + "loss": 0.1064, + "step": 40357 + }, + { + "epoch": 0.3503268200796868, + "grad_norm": 1.0, + "learning_rate": 0.0013394372595648956, + "loss": 0.1309, + "step": 40358 + }, + { + "epoch": 0.350335500559891, + "grad_norm": 0.4140625, + "learning_rate": 0.001339408569568465, + "loss": 0.1221, + "step": 40359 + }, + { + "epoch": 0.3503441810400951, + "grad_norm": 0.201171875, + "learning_rate": 0.001339379879310221, + "loss": 0.126, + "step": 40360 + }, + { + "epoch": 0.3503528615202993, + "grad_norm": 0.142578125, + "learning_rate": 0.0013393511887901945, + "loss": 0.0957, + "step": 40361 + }, + { + "epoch": 0.35036154200050346, + "grad_norm": 0.5234375, + "learning_rate": 0.0013393224980084173, + "loss": 0.1167, + "step": 40362 + }, + { + "epoch": 0.35037022248070765, + "grad_norm": 0.142578125, + "learning_rate": 0.0013392938069649203, + "loss": 0.1357, + "step": 40363 + }, + { + "epoch": 0.3503789029609118, + "grad_norm": 0.2412109375, + "learning_rate": 0.0013392651156597356, + "loss": 0.0815, + "step": 40364 + }, + { + "epoch": 0.350387583441116, + "grad_norm": 0.44140625, + "learning_rate": 0.0013392364240928938, + "loss": 0.0869, + "step": 40365 + }, + { + "epoch": 0.3503962639213201, + "grad_norm": 0.1494140625, + "learning_rate": 0.0013392077322644269, + "loss": 0.1006, + "step": 40366 + }, + { + "epoch": 0.3504049444015243, + "grad_norm": 0.23828125, + "learning_rate": 0.0013391790401743656, + "loss": 0.0688, + "step": 40367 + }, + { + "epoch": 0.35041362488172845, + "grad_norm": 0.255859375, + "learning_rate": 0.0013391503478227423, + "loss": 0.106, + "step": 40368 + }, + { + "epoch": 0.35042230536193264, + "grad_norm": 0.333984375, + "learning_rate": 0.0013391216552095873, + "loss": 0.1113, + "step": 40369 + }, + { + "epoch": 0.3504309858421368, + "grad_norm": 0.55859375, + "learning_rate": 0.0013390929623349332, + "loss": 0.0918, + "step": 40370 + }, + { + "epoch": 0.35043966632234097, + "grad_norm": 0.337890625, + "learning_rate": 0.0013390642691988096, + "loss": 0.1426, + "step": 40371 + }, + { + "epoch": 0.3504483468025451, + "grad_norm": 0.369140625, + "learning_rate": 0.0013390355758012495, + "loss": 0.1108, + "step": 40372 + }, + { + "epoch": 0.3504570272827493, + 
"grad_norm": 0.34375, + "learning_rate": 0.0013390068821422835, + "loss": 0.1191, + "step": 40373 + }, + { + "epoch": 0.35046570776295344, + "grad_norm": 0.115234375, + "learning_rate": 0.0013389781882219435, + "loss": 0.0898, + "step": 40374 + }, + { + "epoch": 0.35047438824315763, + "grad_norm": 0.1416015625, + "learning_rate": 0.00133894949404026, + "loss": 0.1309, + "step": 40375 + }, + { + "epoch": 0.35048306872336177, + "grad_norm": 0.53515625, + "learning_rate": 0.001338920799597265, + "loss": 0.0781, + "step": 40376 + }, + { + "epoch": 0.35049174920356596, + "grad_norm": 0.14453125, + "learning_rate": 0.0013388921048929903, + "loss": 0.1367, + "step": 40377 + }, + { + "epoch": 0.3505004296837701, + "grad_norm": 0.11669921875, + "learning_rate": 0.0013388634099274664, + "loss": 0.0957, + "step": 40378 + }, + { + "epoch": 0.3505091101639743, + "grad_norm": 0.208984375, + "learning_rate": 0.0013388347147007252, + "loss": 0.125, + "step": 40379 + }, + { + "epoch": 0.35051779064417843, + "grad_norm": 0.08642578125, + "learning_rate": 0.0013388060192127977, + "loss": 0.0684, + "step": 40380 + }, + { + "epoch": 0.3505264711243826, + "grad_norm": 0.2158203125, + "learning_rate": 0.0013387773234637156, + "loss": 0.1172, + "step": 40381 + }, + { + "epoch": 0.35053515160458676, + "grad_norm": 0.408203125, + "learning_rate": 0.0013387486274535107, + "loss": 0.1128, + "step": 40382 + }, + { + "epoch": 0.35054383208479095, + "grad_norm": 0.08349609375, + "learning_rate": 0.0013387199311822133, + "loss": 0.0762, + "step": 40383 + }, + { + "epoch": 0.3505525125649951, + "grad_norm": 0.271484375, + "learning_rate": 0.0013386912346498555, + "loss": 0.1211, + "step": 40384 + }, + { + "epoch": 0.3505611930451993, + "grad_norm": 0.298828125, + "learning_rate": 0.0013386625378564686, + "loss": 0.1387, + "step": 40385 + }, + { + "epoch": 0.3505698735254034, + "grad_norm": 0.3984375, + "learning_rate": 0.0013386338408020839, + "loss": 0.1289, + "step": 40386 + }, + { + "epoch": 0.3505785540056076, + "grad_norm": 0.412109375, + "learning_rate": 0.001338605143486733, + "loss": 0.0889, + "step": 40387 + }, + { + "epoch": 0.35058723448581175, + "grad_norm": 0.06884765625, + "learning_rate": 0.0013385764459104468, + "loss": 0.1001, + "step": 40388 + }, + { + "epoch": 0.35059591496601594, + "grad_norm": 0.109375, + "learning_rate": 0.0013385477480732572, + "loss": 0.1172, + "step": 40389 + }, + { + "epoch": 0.3506045954462201, + "grad_norm": 0.10546875, + "learning_rate": 0.0013385190499751955, + "loss": 0.0825, + "step": 40390 + }, + { + "epoch": 0.3506132759264243, + "grad_norm": 0.326171875, + "learning_rate": 0.001338490351616293, + "loss": 0.1631, + "step": 40391 + }, + { + "epoch": 0.3506219564066284, + "grad_norm": 0.2158203125, + "learning_rate": 0.0013384616529965808, + "loss": 0.104, + "step": 40392 + }, + { + "epoch": 0.3506306368868326, + "grad_norm": 0.20703125, + "learning_rate": 0.0013384329541160902, + "loss": 0.0947, + "step": 40393 + }, + { + "epoch": 0.35063931736703674, + "grad_norm": 0.0859375, + "learning_rate": 0.0013384042549748536, + "loss": 0.1079, + "step": 40394 + }, + { + "epoch": 0.35064799784724093, + "grad_norm": 0.1669921875, + "learning_rate": 0.001338375555572901, + "loss": 0.1523, + "step": 40395 + }, + { + "epoch": 0.35065667832744507, + "grad_norm": 0.150390625, + "learning_rate": 0.001338346855910265, + "loss": 0.1006, + "step": 40396 + }, + { + "epoch": 0.35066535880764926, + "grad_norm": 0.2421875, + "learning_rate": 0.0013383181559869762, + "loss": 0.0898, + "step": 
40397 + }, + { + "epoch": 0.3506740392878534, + "grad_norm": 2.1875, + "learning_rate": 0.0013382894558030662, + "loss": 0.1416, + "step": 40398 + }, + { + "epoch": 0.3506827197680576, + "grad_norm": 0.3671875, + "learning_rate": 0.0013382607553585667, + "loss": 0.0869, + "step": 40399 + }, + { + "epoch": 0.35069140024826173, + "grad_norm": 0.08251953125, + "learning_rate": 0.0013382320546535085, + "loss": 0.0815, + "step": 40400 + }, + { + "epoch": 0.3507000807284659, + "grad_norm": 0.2001953125, + "learning_rate": 0.0013382033536879234, + "loss": 0.1162, + "step": 40401 + }, + { + "epoch": 0.35070876120867006, + "grad_norm": 0.2470703125, + "learning_rate": 0.001338174652461843, + "loss": 0.104, + "step": 40402 + }, + { + "epoch": 0.35071744168887425, + "grad_norm": 0.1748046875, + "learning_rate": 0.001338145950975298, + "loss": 0.1221, + "step": 40403 + }, + { + "epoch": 0.3507261221690784, + "grad_norm": 0.08056640625, + "learning_rate": 0.0013381172492283203, + "loss": 0.0664, + "step": 40404 + }, + { + "epoch": 0.3507348026492826, + "grad_norm": 0.984375, + "learning_rate": 0.001338088547220941, + "loss": 0.1797, + "step": 40405 + }, + { + "epoch": 0.3507434831294867, + "grad_norm": 0.302734375, + "learning_rate": 0.0013380598449531918, + "loss": 0.1123, + "step": 40406 + }, + { + "epoch": 0.3507521636096909, + "grad_norm": 0.25390625, + "learning_rate": 0.001338031142425104, + "loss": 0.1016, + "step": 40407 + }, + { + "epoch": 0.35076084408989505, + "grad_norm": 0.46484375, + "learning_rate": 0.001338002439636709, + "loss": 0.1016, + "step": 40408 + }, + { + "epoch": 0.35076952457009924, + "grad_norm": 0.251953125, + "learning_rate": 0.0013379737365880379, + "loss": 0.0879, + "step": 40409 + }, + { + "epoch": 0.3507782050503034, + "grad_norm": 0.34375, + "learning_rate": 0.0013379450332791222, + "loss": 0.1094, + "step": 40410 + }, + { + "epoch": 0.3507868855305076, + "grad_norm": 0.2001953125, + "learning_rate": 0.0013379163297099937, + "loss": 0.1211, + "step": 40411 + }, + { + "epoch": 0.3507955660107117, + "grad_norm": 0.2021484375, + "learning_rate": 0.001337887625880683, + "loss": 0.0815, + "step": 40412 + }, + { + "epoch": 0.35080424649091585, + "grad_norm": 0.72265625, + "learning_rate": 0.0013378589217912225, + "loss": 0.1211, + "step": 40413 + }, + { + "epoch": 0.35081292697112004, + "grad_norm": 0.15625, + "learning_rate": 0.0013378302174416425, + "loss": 0.0708, + "step": 40414 + }, + { + "epoch": 0.3508216074513242, + "grad_norm": 0.76171875, + "learning_rate": 0.0013378015128319755, + "loss": 0.1172, + "step": 40415 + }, + { + "epoch": 0.3508302879315284, + "grad_norm": 0.15625, + "learning_rate": 0.0013377728079622519, + "loss": 0.0957, + "step": 40416 + }, + { + "epoch": 0.3508389684117325, + "grad_norm": 0.2275390625, + "learning_rate": 0.001337744102832504, + "loss": 0.0898, + "step": 40417 + }, + { + "epoch": 0.3508476488919367, + "grad_norm": 0.240234375, + "learning_rate": 0.0013377153974427625, + "loss": 0.0918, + "step": 40418 + }, + { + "epoch": 0.35085632937214084, + "grad_norm": 0.291015625, + "learning_rate": 0.0013376866917930589, + "loss": 0.0752, + "step": 40419 + }, + { + "epoch": 0.35086500985234503, + "grad_norm": 0.412109375, + "learning_rate": 0.0013376579858834247, + "loss": 0.1104, + "step": 40420 + }, + { + "epoch": 0.35087369033254917, + "grad_norm": 0.65234375, + "learning_rate": 0.0013376292797138915, + "loss": 0.0977, + "step": 40421 + }, + { + "epoch": 0.35088237081275336, + "grad_norm": 0.1474609375, + "learning_rate": 
0.0013376005732844902, + "loss": 0.1621, + "step": 40422 + }, + { + "epoch": 0.3508910512929575, + "grad_norm": 0.1103515625, + "learning_rate": 0.0013375718665952527, + "loss": 0.1172, + "step": 40423 + }, + { + "epoch": 0.3508997317731617, + "grad_norm": 0.318359375, + "learning_rate": 0.0013375431596462102, + "loss": 0.0918, + "step": 40424 + }, + { + "epoch": 0.35090841225336583, + "grad_norm": 0.1162109375, + "learning_rate": 0.0013375144524373941, + "loss": 0.1118, + "step": 40425 + }, + { + "epoch": 0.35091709273357, + "grad_norm": 0.26953125, + "learning_rate": 0.0013374857449688358, + "loss": 0.0977, + "step": 40426 + }, + { + "epoch": 0.35092577321377416, + "grad_norm": 0.10107421875, + "learning_rate": 0.0013374570372405663, + "loss": 0.0938, + "step": 40427 + }, + { + "epoch": 0.35093445369397835, + "grad_norm": 0.1181640625, + "learning_rate": 0.0013374283292526177, + "loss": 0.0879, + "step": 40428 + }, + { + "epoch": 0.3509431341741825, + "grad_norm": 0.23046875, + "learning_rate": 0.001337399621005021, + "loss": 0.1074, + "step": 40429 + }, + { + "epoch": 0.3509518146543867, + "grad_norm": 0.26171875, + "learning_rate": 0.0013373709124978078, + "loss": 0.0942, + "step": 40430 + }, + { + "epoch": 0.3509604951345908, + "grad_norm": 0.091796875, + "learning_rate": 0.0013373422037310089, + "loss": 0.1279, + "step": 40431 + }, + { + "epoch": 0.350969175614795, + "grad_norm": 0.234375, + "learning_rate": 0.001337313494704656, + "loss": 0.0947, + "step": 40432 + }, + { + "epoch": 0.35097785609499915, + "grad_norm": 0.55859375, + "learning_rate": 0.001337284785418781, + "loss": 0.1064, + "step": 40433 + }, + { + "epoch": 0.35098653657520335, + "grad_norm": 0.1201171875, + "learning_rate": 0.0013372560758734152, + "loss": 0.0981, + "step": 40434 + }, + { + "epoch": 0.3509952170554075, + "grad_norm": 0.349609375, + "learning_rate": 0.0013372273660685891, + "loss": 0.1084, + "step": 40435 + }, + { + "epoch": 0.3510038975356117, + "grad_norm": 0.53125, + "learning_rate": 0.001337198656004335, + "loss": 0.1992, + "step": 40436 + }, + { + "epoch": 0.3510125780158158, + "grad_norm": 0.140625, + "learning_rate": 0.001337169945680684, + "loss": 0.1182, + "step": 40437 + }, + { + "epoch": 0.35102125849602, + "grad_norm": 0.11572265625, + "learning_rate": 0.0013371412350976677, + "loss": 0.0972, + "step": 40438 + }, + { + "epoch": 0.35102993897622414, + "grad_norm": 0.2001953125, + "learning_rate": 0.0013371125242553174, + "loss": 0.082, + "step": 40439 + }, + { + "epoch": 0.35103861945642834, + "grad_norm": 0.216796875, + "learning_rate": 0.001337083813153664, + "loss": 0.103, + "step": 40440 + }, + { + "epoch": 0.3510472999366325, + "grad_norm": 0.62890625, + "learning_rate": 0.0013370551017927396, + "loss": 0.0942, + "step": 40441 + }, + { + "epoch": 0.35105598041683667, + "grad_norm": 0.470703125, + "learning_rate": 0.001337026390172575, + "loss": 0.1211, + "step": 40442 + }, + { + "epoch": 0.3510646608970408, + "grad_norm": 0.302734375, + "learning_rate": 0.0013369976782932021, + "loss": 0.1094, + "step": 40443 + }, + { + "epoch": 0.351073341377245, + "grad_norm": 0.1708984375, + "learning_rate": 0.001336968966154652, + "loss": 0.1162, + "step": 40444 + }, + { + "epoch": 0.35108202185744913, + "grad_norm": 0.248046875, + "learning_rate": 0.0013369402537569562, + "loss": 0.0942, + "step": 40445 + }, + { + "epoch": 0.3510907023376533, + "grad_norm": 0.328125, + "learning_rate": 0.0013369115411001463, + "loss": 0.0918, + "step": 40446 + }, + { + "epoch": 0.35109938281785746, + "grad_norm": 
0.59375, + "learning_rate": 0.0013368828281842533, + "loss": 0.2051, + "step": 40447 + }, + { + "epoch": 0.35110806329806166, + "grad_norm": 0.1787109375, + "learning_rate": 0.0013368541150093089, + "loss": 0.1113, + "step": 40448 + }, + { + "epoch": 0.3511167437782658, + "grad_norm": 0.10888671875, + "learning_rate": 0.0013368254015753442, + "loss": 0.126, + "step": 40449 + }, + { + "epoch": 0.35112542425847, + "grad_norm": 0.130859375, + "learning_rate": 0.0013367966878823911, + "loss": 0.1006, + "step": 40450 + }, + { + "epoch": 0.3511341047386741, + "grad_norm": 0.267578125, + "learning_rate": 0.0013367679739304805, + "loss": 0.1079, + "step": 40451 + }, + { + "epoch": 0.3511427852188783, + "grad_norm": 0.302734375, + "learning_rate": 0.001336739259719644, + "loss": 0.1992, + "step": 40452 + }, + { + "epoch": 0.35115146569908245, + "grad_norm": 0.2451171875, + "learning_rate": 0.001336710545249913, + "loss": 0.1338, + "step": 40453 + }, + { + "epoch": 0.35116014617928665, + "grad_norm": 0.2041015625, + "learning_rate": 0.0013366818305213192, + "loss": 0.1157, + "step": 40454 + }, + { + "epoch": 0.3511688266594908, + "grad_norm": 0.1064453125, + "learning_rate": 0.0013366531155338931, + "loss": 0.1074, + "step": 40455 + }, + { + "epoch": 0.351177507139695, + "grad_norm": 0.18359375, + "learning_rate": 0.0013366244002876676, + "loss": 0.0747, + "step": 40456 + }, + { + "epoch": 0.3511861876198991, + "grad_norm": 2.4375, + "learning_rate": 0.0013365956847826723, + "loss": 0.2891, + "step": 40457 + }, + { + "epoch": 0.3511948681001033, + "grad_norm": 0.1005859375, + "learning_rate": 0.0013365669690189401, + "loss": 0.1279, + "step": 40458 + }, + { + "epoch": 0.35120354858030745, + "grad_norm": 0.302734375, + "learning_rate": 0.0013365382529965015, + "loss": 0.0591, + "step": 40459 + }, + { + "epoch": 0.35121222906051164, + "grad_norm": 0.65625, + "learning_rate": 0.0013365095367153885, + "loss": 0.1099, + "step": 40460 + }, + { + "epoch": 0.3512209095407158, + "grad_norm": 0.4453125, + "learning_rate": 0.001336480820175632, + "loss": 0.123, + "step": 40461 + }, + { + "epoch": 0.35122959002091997, + "grad_norm": 0.1328125, + "learning_rate": 0.0013364521033772637, + "loss": 0.063, + "step": 40462 + }, + { + "epoch": 0.3512382705011241, + "grad_norm": 0.1865234375, + "learning_rate": 0.0013364233863203147, + "loss": 0.0776, + "step": 40463 + }, + { + "epoch": 0.3512469509813283, + "grad_norm": 0.416015625, + "learning_rate": 0.001336394669004817, + "loss": 0.0894, + "step": 40464 + }, + { + "epoch": 0.35125563146153244, + "grad_norm": 0.2578125, + "learning_rate": 0.0013363659514308017, + "loss": 0.0845, + "step": 40465 + }, + { + "epoch": 0.35126431194173663, + "grad_norm": 0.0966796875, + "learning_rate": 0.0013363372335983, + "loss": 0.0742, + "step": 40466 + }, + { + "epoch": 0.35127299242194077, + "grad_norm": 1.140625, + "learning_rate": 0.0013363085155073434, + "loss": 0.168, + "step": 40467 + }, + { + "epoch": 0.35128167290214496, + "grad_norm": 0.1435546875, + "learning_rate": 0.0013362797971579632, + "loss": 0.083, + "step": 40468 + }, + { + "epoch": 0.3512903533823491, + "grad_norm": 0.31640625, + "learning_rate": 0.0013362510785501914, + "loss": 0.0771, + "step": 40469 + }, + { + "epoch": 0.3512990338625533, + "grad_norm": 0.6796875, + "learning_rate": 0.0013362223596840586, + "loss": 0.1738, + "step": 40470 + }, + { + "epoch": 0.3513077143427574, + "grad_norm": 0.0859375, + "learning_rate": 0.0013361936405595965, + "loss": 0.0679, + "step": 40471 + }, + { + "epoch": 
0.3513163948229616, + "grad_norm": 0.30078125, + "learning_rate": 0.0013361649211768368, + "loss": 0.1162, + "step": 40472 + }, + { + "epoch": 0.35132507530316576, + "grad_norm": 0.25390625, + "learning_rate": 0.0013361362015358107, + "loss": 0.1289, + "step": 40473 + }, + { + "epoch": 0.35133375578336995, + "grad_norm": 0.205078125, + "learning_rate": 0.0013361074816365494, + "loss": 0.1279, + "step": 40474 + }, + { + "epoch": 0.3513424362635741, + "grad_norm": 0.134765625, + "learning_rate": 0.0013360787614790845, + "loss": 0.123, + "step": 40475 + }, + { + "epoch": 0.3513511167437783, + "grad_norm": 0.7109375, + "learning_rate": 0.0013360500410634479, + "loss": 0.1211, + "step": 40476 + }, + { + "epoch": 0.3513597972239824, + "grad_norm": 0.6796875, + "learning_rate": 0.0013360213203896703, + "loss": 0.1045, + "step": 40477 + }, + { + "epoch": 0.3513684777041866, + "grad_norm": 0.08544921875, + "learning_rate": 0.0013359925994577831, + "loss": 0.0903, + "step": 40478 + }, + { + "epoch": 0.35137715818439075, + "grad_norm": 0.466796875, + "learning_rate": 0.001335963878267818, + "loss": 0.082, + "step": 40479 + }, + { + "epoch": 0.35138583866459494, + "grad_norm": 0.345703125, + "learning_rate": 0.0013359351568198064, + "loss": 0.0801, + "step": 40480 + }, + { + "epoch": 0.3513945191447991, + "grad_norm": 0.2373046875, + "learning_rate": 0.0013359064351137794, + "loss": 0.0913, + "step": 40481 + }, + { + "epoch": 0.35140319962500327, + "grad_norm": 0.1494140625, + "learning_rate": 0.0013358777131497693, + "loss": 0.0894, + "step": 40482 + }, + { + "epoch": 0.3514118801052074, + "grad_norm": 0.15625, + "learning_rate": 0.0013358489909278061, + "loss": 0.1104, + "step": 40483 + }, + { + "epoch": 0.3514205605854116, + "grad_norm": 0.21875, + "learning_rate": 0.0013358202684479227, + "loss": 0.1123, + "step": 40484 + }, + { + "epoch": 0.35142924106561574, + "grad_norm": 0.1875, + "learning_rate": 0.0013357915457101494, + "loss": 0.1602, + "step": 40485 + }, + { + "epoch": 0.35143792154581993, + "grad_norm": 0.92578125, + "learning_rate": 0.0013357628227145181, + "loss": 0.1338, + "step": 40486 + }, + { + "epoch": 0.35144660202602407, + "grad_norm": 0.25, + "learning_rate": 0.0013357340994610604, + "loss": 0.0645, + "step": 40487 + }, + { + "epoch": 0.35145528250622826, + "grad_norm": 0.23046875, + "learning_rate": 0.0013357053759498068, + "loss": 0.1162, + "step": 40488 + }, + { + "epoch": 0.3514639629864324, + "grad_norm": 0.8125, + "learning_rate": 0.0013356766521807897, + "loss": 0.0801, + "step": 40489 + }, + { + "epoch": 0.3514726434666366, + "grad_norm": 0.48828125, + "learning_rate": 0.0013356479281540403, + "loss": 0.1006, + "step": 40490 + }, + { + "epoch": 0.35148132394684073, + "grad_norm": 0.55859375, + "learning_rate": 0.0013356192038695895, + "loss": 0.0957, + "step": 40491 + }, + { + "epoch": 0.3514900044270449, + "grad_norm": 0.2392578125, + "learning_rate": 0.0013355904793274694, + "loss": 0.1143, + "step": 40492 + }, + { + "epoch": 0.35149868490724906, + "grad_norm": 0.287109375, + "learning_rate": 0.001335561754527711, + "loss": 0.1367, + "step": 40493 + }, + { + "epoch": 0.35150736538745325, + "grad_norm": 0.1494140625, + "learning_rate": 0.0013355330294703457, + "loss": 0.0767, + "step": 40494 + }, + { + "epoch": 0.3515160458676574, + "grad_norm": 0.1689453125, + "learning_rate": 0.0013355043041554055, + "loss": 0.1143, + "step": 40495 + }, + { + "epoch": 0.3515247263478616, + "grad_norm": 0.5859375, + "learning_rate": 0.0013354755785829208, + "loss": 0.1201, + "step": 
40496 + }, + { + "epoch": 0.3515334068280657, + "grad_norm": 0.06396484375, + "learning_rate": 0.0013354468527529235, + "loss": 0.0718, + "step": 40497 + }, + { + "epoch": 0.3515420873082699, + "grad_norm": 0.53125, + "learning_rate": 0.0013354181266654454, + "loss": 0.0957, + "step": 40498 + }, + { + "epoch": 0.35155076778847405, + "grad_norm": 0.1259765625, + "learning_rate": 0.0013353894003205174, + "loss": 0.1064, + "step": 40499 + }, + { + "epoch": 0.35155944826867824, + "grad_norm": 0.6171875, + "learning_rate": 0.001335360673718171, + "loss": 0.1045, + "step": 40500 + }, + { + "epoch": 0.3515681287488824, + "grad_norm": 0.203125, + "learning_rate": 0.0013353319468584377, + "loss": 0.1084, + "step": 40501 + }, + { + "epoch": 0.3515768092290866, + "grad_norm": 0.1318359375, + "learning_rate": 0.001335303219741349, + "loss": 0.1235, + "step": 40502 + }, + { + "epoch": 0.3515854897092907, + "grad_norm": 0.234375, + "learning_rate": 0.0013352744923669365, + "loss": 0.0957, + "step": 40503 + }, + { + "epoch": 0.3515941701894949, + "grad_norm": 0.54296875, + "learning_rate": 0.0013352457647352312, + "loss": 0.1074, + "step": 40504 + }, + { + "epoch": 0.35160285066969904, + "grad_norm": 0.470703125, + "learning_rate": 0.0013352170368462644, + "loss": 0.0903, + "step": 40505 + }, + { + "epoch": 0.35161153114990323, + "grad_norm": 0.2890625, + "learning_rate": 0.001335188308700068, + "loss": 0.1191, + "step": 40506 + }, + { + "epoch": 0.35162021163010737, + "grad_norm": 0.625, + "learning_rate": 0.0013351595802966734, + "loss": 0.0938, + "step": 40507 + }, + { + "epoch": 0.35162889211031156, + "grad_norm": 0.2470703125, + "learning_rate": 0.0013351308516361114, + "loss": 0.1914, + "step": 40508 + }, + { + "epoch": 0.3516375725905157, + "grad_norm": 0.42578125, + "learning_rate": 0.001335102122718414, + "loss": 0.1318, + "step": 40509 + }, + { + "epoch": 0.3516462530707199, + "grad_norm": 0.1396484375, + "learning_rate": 0.0013350733935436122, + "loss": 0.1016, + "step": 40510 + }, + { + "epoch": 0.35165493355092403, + "grad_norm": 0.142578125, + "learning_rate": 0.001335044664111738, + "loss": 0.0967, + "step": 40511 + }, + { + "epoch": 0.3516636140311282, + "grad_norm": 0.36328125, + "learning_rate": 0.0013350159344228225, + "loss": 0.0869, + "step": 40512 + }, + { + "epoch": 0.35167229451133236, + "grad_norm": 0.162109375, + "learning_rate": 0.0013349872044768972, + "loss": 0.0874, + "step": 40513 + }, + { + "epoch": 0.35168097499153655, + "grad_norm": 0.28515625, + "learning_rate": 0.0013349584742739931, + "loss": 0.0918, + "step": 40514 + }, + { + "epoch": 0.3516896554717407, + "grad_norm": 0.2490234375, + "learning_rate": 0.0013349297438141421, + "loss": 0.0776, + "step": 40515 + }, + { + "epoch": 0.3516983359519449, + "grad_norm": 0.40234375, + "learning_rate": 0.0013349010130973755, + "loss": 0.1201, + "step": 40516 + }, + { + "epoch": 0.351707016432149, + "grad_norm": 0.14453125, + "learning_rate": 0.0013348722821237247, + "loss": 0.1162, + "step": 40517 + }, + { + "epoch": 0.3517156969123532, + "grad_norm": 0.400390625, + "learning_rate": 0.0013348435508932209, + "loss": 0.1035, + "step": 40518 + }, + { + "epoch": 0.35172437739255735, + "grad_norm": 0.17578125, + "learning_rate": 0.0013348148194058957, + "loss": 0.0986, + "step": 40519 + }, + { + "epoch": 0.35173305787276155, + "grad_norm": 0.455078125, + "learning_rate": 0.0013347860876617807, + "loss": 0.1069, + "step": 40520 + }, + { + "epoch": 0.3517417383529657, + "grad_norm": 0.416015625, + "learning_rate": 
0.0013347573556609074, + "loss": 0.0933, + "step": 40521 + }, + { + "epoch": 0.3517504188331699, + "grad_norm": 0.0986328125, + "learning_rate": 0.0013347286234033065, + "loss": 0.1182, + "step": 40522 + }, + { + "epoch": 0.351759099313374, + "grad_norm": 0.1611328125, + "learning_rate": 0.0013346998908890103, + "loss": 0.1045, + "step": 40523 + }, + { + "epoch": 0.3517677797935782, + "grad_norm": 0.330078125, + "learning_rate": 0.0013346711581180496, + "loss": 0.125, + "step": 40524 + }, + { + "epoch": 0.35177646027378234, + "grad_norm": 0.392578125, + "learning_rate": 0.0013346424250904564, + "loss": 0.1182, + "step": 40525 + }, + { + "epoch": 0.35178514075398654, + "grad_norm": 0.255859375, + "learning_rate": 0.0013346136918062613, + "loss": 0.1279, + "step": 40526 + }, + { + "epoch": 0.3517938212341907, + "grad_norm": 0.84765625, + "learning_rate": 0.0013345849582654962, + "loss": 0.1099, + "step": 40527 + }, + { + "epoch": 0.35180250171439487, + "grad_norm": 0.255859375, + "learning_rate": 0.0013345562244681927, + "loss": 0.0889, + "step": 40528 + }, + { + "epoch": 0.351811182194599, + "grad_norm": 0.208984375, + "learning_rate": 0.0013345274904143823, + "loss": 0.0884, + "step": 40529 + }, + { + "epoch": 0.3518198626748032, + "grad_norm": 0.32421875, + "learning_rate": 0.0013344987561040957, + "loss": 0.0762, + "step": 40530 + }, + { + "epoch": 0.35182854315500733, + "grad_norm": 0.1904296875, + "learning_rate": 0.001334470021537365, + "loss": 0.0972, + "step": 40531 + }, + { + "epoch": 0.3518372236352115, + "grad_norm": 0.2490234375, + "learning_rate": 0.0013344412867142212, + "loss": 0.1221, + "step": 40532 + }, + { + "epoch": 0.35184590411541566, + "grad_norm": 0.1318359375, + "learning_rate": 0.0013344125516346966, + "loss": 0.0933, + "step": 40533 + }, + { + "epoch": 0.35185458459561986, + "grad_norm": 0.26171875, + "learning_rate": 0.0013343838162988213, + "loss": 0.1104, + "step": 40534 + }, + { + "epoch": 0.351863265075824, + "grad_norm": 0.333984375, + "learning_rate": 0.0013343550807066273, + "loss": 0.1123, + "step": 40535 + }, + { + "epoch": 0.35187194555602813, + "grad_norm": 0.318359375, + "learning_rate": 0.0013343263448581464, + "loss": 0.1748, + "step": 40536 + }, + { + "epoch": 0.3518806260362323, + "grad_norm": 0.1435546875, + "learning_rate": 0.0013342976087534098, + "loss": 0.0894, + "step": 40537 + }, + { + "epoch": 0.35188930651643646, + "grad_norm": 0.1533203125, + "learning_rate": 0.0013342688723924489, + "loss": 0.0889, + "step": 40538 + }, + { + "epoch": 0.35189798699664065, + "grad_norm": 0.35546875, + "learning_rate": 0.0013342401357752946, + "loss": 0.1201, + "step": 40539 + }, + { + "epoch": 0.3519066674768448, + "grad_norm": 0.146484375, + "learning_rate": 0.001334211398901979, + "loss": 0.1006, + "step": 40540 + }, + { + "epoch": 0.351915347957049, + "grad_norm": 0.1787109375, + "learning_rate": 0.0013341826617725336, + "loss": 0.1074, + "step": 40541 + }, + { + "epoch": 0.3519240284372531, + "grad_norm": 0.3984375, + "learning_rate": 0.0013341539243869895, + "loss": 0.1797, + "step": 40542 + }, + { + "epoch": 0.3519327089174573, + "grad_norm": 0.1376953125, + "learning_rate": 0.0013341251867453783, + "loss": 0.1133, + "step": 40543 + }, + { + "epoch": 0.35194138939766145, + "grad_norm": 0.11865234375, + "learning_rate": 0.0013340964488477308, + "loss": 0.1104, + "step": 40544 + }, + { + "epoch": 0.35195006987786565, + "grad_norm": 0.083984375, + "learning_rate": 0.0013340677106940795, + "loss": 0.0986, + "step": 40545 + }, + { + "epoch": 
0.3519587503580698, + "grad_norm": 0.1689453125, + "learning_rate": 0.001334038972284455, + "loss": 0.1069, + "step": 40546 + }, + { + "epoch": 0.351967430838274, + "grad_norm": 0.3046875, + "learning_rate": 0.0013340102336188888, + "loss": 0.127, + "step": 40547 + }, + { + "epoch": 0.3519761113184781, + "grad_norm": 0.107421875, + "learning_rate": 0.001333981494697413, + "loss": 0.0874, + "step": 40548 + }, + { + "epoch": 0.3519847917986823, + "grad_norm": 0.11376953125, + "learning_rate": 0.0013339527555200582, + "loss": 0.1147, + "step": 40549 + }, + { + "epoch": 0.35199347227888644, + "grad_norm": 0.12890625, + "learning_rate": 0.0013339240160868565, + "loss": 0.0869, + "step": 40550 + }, + { + "epoch": 0.35200215275909064, + "grad_norm": 0.4453125, + "learning_rate": 0.0013338952763978385, + "loss": 0.1084, + "step": 40551 + }, + { + "epoch": 0.3520108332392948, + "grad_norm": 0.1943359375, + "learning_rate": 0.0013338665364530365, + "loss": 0.0986, + "step": 40552 + }, + { + "epoch": 0.35201951371949897, + "grad_norm": 0.462890625, + "learning_rate": 0.0013338377962524815, + "loss": 0.1162, + "step": 40553 + }, + { + "epoch": 0.3520281941997031, + "grad_norm": 0.177734375, + "learning_rate": 0.001333809055796205, + "loss": 0.1416, + "step": 40554 + }, + { + "epoch": 0.3520368746799073, + "grad_norm": 0.212890625, + "learning_rate": 0.0013337803150842386, + "loss": 0.0957, + "step": 40555 + }, + { + "epoch": 0.35204555516011143, + "grad_norm": 0.431640625, + "learning_rate": 0.0013337515741166136, + "loss": 0.1377, + "step": 40556 + }, + { + "epoch": 0.3520542356403156, + "grad_norm": 0.337890625, + "learning_rate": 0.001333722832893361, + "loss": 0.1035, + "step": 40557 + }, + { + "epoch": 0.35206291612051976, + "grad_norm": 1.234375, + "learning_rate": 0.0013336940914145125, + "loss": 0.0854, + "step": 40558 + }, + { + "epoch": 0.35207159660072396, + "grad_norm": 0.10302734375, + "learning_rate": 0.0013336653496801001, + "loss": 0.1025, + "step": 40559 + }, + { + "epoch": 0.3520802770809281, + "grad_norm": 0.09228515625, + "learning_rate": 0.0013336366076901547, + "loss": 0.106, + "step": 40560 + }, + { + "epoch": 0.3520889575611323, + "grad_norm": 0.2041015625, + "learning_rate": 0.0013336078654447075, + "loss": 0.1055, + "step": 40561 + }, + { + "epoch": 0.3520976380413364, + "grad_norm": 0.51953125, + "learning_rate": 0.0013335791229437907, + "loss": 0.1221, + "step": 40562 + }, + { + "epoch": 0.3521063185215406, + "grad_norm": 0.376953125, + "learning_rate": 0.001333550380187435, + "loss": 0.1157, + "step": 40563 + }, + { + "epoch": 0.35211499900174476, + "grad_norm": 0.7734375, + "learning_rate": 0.0013335216371756725, + "loss": 0.1445, + "step": 40564 + }, + { + "epoch": 0.35212367948194895, + "grad_norm": 0.53125, + "learning_rate": 0.0013334928939085337, + "loss": 0.1123, + "step": 40565 + }, + { + "epoch": 0.3521323599621531, + "grad_norm": 0.7578125, + "learning_rate": 0.001333464150386051, + "loss": 0.1025, + "step": 40566 + }, + { + "epoch": 0.3521410404423573, + "grad_norm": 0.384765625, + "learning_rate": 0.001333435406608255, + "loss": 0.103, + "step": 40567 + }, + { + "epoch": 0.3521497209225614, + "grad_norm": 0.302734375, + "learning_rate": 0.001333406662575178, + "loss": 0.0986, + "step": 40568 + }, + { + "epoch": 0.3521584014027656, + "grad_norm": 0.1611328125, + "learning_rate": 0.0013333779182868506, + "loss": 0.0703, + "step": 40569 + }, + { + "epoch": 0.35216708188296975, + "grad_norm": 0.134765625, + "learning_rate": 0.0013333491737433048, + "loss": 0.0938, 
+ "step": 40570 + }, + { + "epoch": 0.35217576236317394, + "grad_norm": 1.171875, + "learning_rate": 0.0013333204289445717, + "loss": 0.3965, + "step": 40571 + }, + { + "epoch": 0.3521844428433781, + "grad_norm": 0.451171875, + "learning_rate": 0.0013332916838906831, + "loss": 0.0854, + "step": 40572 + }, + { + "epoch": 0.35219312332358227, + "grad_norm": 0.1513671875, + "learning_rate": 0.0013332629385816703, + "loss": 0.1006, + "step": 40573 + }, + { + "epoch": 0.3522018038037864, + "grad_norm": 0.93359375, + "learning_rate": 0.0013332341930175644, + "loss": 0.1309, + "step": 40574 + }, + { + "epoch": 0.3522104842839906, + "grad_norm": 0.37109375, + "learning_rate": 0.0013332054471983975, + "loss": 0.1484, + "step": 40575 + }, + { + "epoch": 0.35221916476419474, + "grad_norm": 0.26953125, + "learning_rate": 0.0013331767011242003, + "loss": 0.0908, + "step": 40576 + }, + { + "epoch": 0.35222784524439893, + "grad_norm": 0.2158203125, + "learning_rate": 0.0013331479547950044, + "loss": 0.0996, + "step": 40577 + }, + { + "epoch": 0.35223652572460307, + "grad_norm": 0.10400390625, + "learning_rate": 0.0013331192082108412, + "loss": 0.1055, + "step": 40578 + }, + { + "epoch": 0.35224520620480726, + "grad_norm": 0.640625, + "learning_rate": 0.001333090461371743, + "loss": 0.0986, + "step": 40579 + }, + { + "epoch": 0.3522538866850114, + "grad_norm": 0.53125, + "learning_rate": 0.0013330617142777402, + "loss": 0.084, + "step": 40580 + }, + { + "epoch": 0.3522625671652156, + "grad_norm": 0.48046875, + "learning_rate": 0.0013330329669288647, + "loss": 0.1064, + "step": 40581 + }, + { + "epoch": 0.3522712476454197, + "grad_norm": 0.0908203125, + "learning_rate": 0.001333004219325148, + "loss": 0.084, + "step": 40582 + }, + { + "epoch": 0.3522799281256239, + "grad_norm": 0.09033203125, + "learning_rate": 0.001332975471466621, + "loss": 0.104, + "step": 40583 + }, + { + "epoch": 0.35228860860582806, + "grad_norm": 0.333984375, + "learning_rate": 0.0013329467233533156, + "loss": 0.0933, + "step": 40584 + }, + { + "epoch": 0.35229728908603225, + "grad_norm": 0.3359375, + "learning_rate": 0.0013329179749852636, + "loss": 0.1006, + "step": 40585 + }, + { + "epoch": 0.3523059695662364, + "grad_norm": 0.181640625, + "learning_rate": 0.0013328892263624957, + "loss": 0.1055, + "step": 40586 + }, + { + "epoch": 0.3523146500464406, + "grad_norm": 0.53125, + "learning_rate": 0.0013328604774850434, + "loss": 0.125, + "step": 40587 + }, + { + "epoch": 0.3523233305266447, + "grad_norm": 0.365234375, + "learning_rate": 0.0013328317283529388, + "loss": 0.0806, + "step": 40588 + }, + { + "epoch": 0.3523320110068489, + "grad_norm": 0.7421875, + "learning_rate": 0.001332802978966213, + "loss": 0.1079, + "step": 40589 + }, + { + "epoch": 0.35234069148705305, + "grad_norm": 0.07763671875, + "learning_rate": 0.001332774229324897, + "loss": 0.0947, + "step": 40590 + }, + { + "epoch": 0.35234937196725724, + "grad_norm": 0.57421875, + "learning_rate": 0.0013327454794290225, + "loss": 0.1934, + "step": 40591 + }, + { + "epoch": 0.3523580524474614, + "grad_norm": 0.369140625, + "learning_rate": 0.0013327167292786213, + "loss": 0.0884, + "step": 40592 + }, + { + "epoch": 0.35236673292766557, + "grad_norm": 0.470703125, + "learning_rate": 0.0013326879788737246, + "loss": 0.0874, + "step": 40593 + }, + { + "epoch": 0.3523754134078697, + "grad_norm": 0.298828125, + "learning_rate": 0.0013326592282143642, + "loss": 0.0869, + "step": 40594 + }, + { + "epoch": 0.3523840938880739, + "grad_norm": 0.11279296875, + "learning_rate": 
0.0013326304773005704, + "loss": 0.063, + "step": 40595 + }, + { + "epoch": 0.35239277436827804, + "grad_norm": 0.1357421875, + "learning_rate": 0.0013326017261323757, + "loss": 0.0752, + "step": 40596 + }, + { + "epoch": 0.35240145484848223, + "grad_norm": 0.1376953125, + "learning_rate": 0.0013325729747098113, + "loss": 0.0889, + "step": 40597 + }, + { + "epoch": 0.35241013532868637, + "grad_norm": 0.302734375, + "learning_rate": 0.0013325442230329083, + "loss": 0.0859, + "step": 40598 + }, + { + "epoch": 0.35241881580889056, + "grad_norm": 0.49609375, + "learning_rate": 0.001332515471101699, + "loss": 0.0718, + "step": 40599 + }, + { + "epoch": 0.3524274962890947, + "grad_norm": 0.486328125, + "learning_rate": 0.0013324867189162138, + "loss": 0.1367, + "step": 40600 + }, + { + "epoch": 0.3524361767692989, + "grad_norm": 0.10986328125, + "learning_rate": 0.0013324579664764848, + "loss": 0.0996, + "step": 40601 + }, + { + "epoch": 0.35244485724950303, + "grad_norm": 0.2734375, + "learning_rate": 0.0013324292137825433, + "loss": 0.0576, + "step": 40602 + }, + { + "epoch": 0.3524535377297072, + "grad_norm": 0.5, + "learning_rate": 0.0013324004608344206, + "loss": 0.0869, + "step": 40603 + }, + { + "epoch": 0.35246221820991136, + "grad_norm": 0.169921875, + "learning_rate": 0.0013323717076321484, + "loss": 0.0947, + "step": 40604 + }, + { + "epoch": 0.35247089869011555, + "grad_norm": 0.2041015625, + "learning_rate": 0.001332342954175758, + "loss": 0.0977, + "step": 40605 + }, + { + "epoch": 0.3524795791703197, + "grad_norm": 0.369140625, + "learning_rate": 0.0013323142004652807, + "loss": 0.1025, + "step": 40606 + }, + { + "epoch": 0.3524882596505239, + "grad_norm": 0.283203125, + "learning_rate": 0.0013322854465007482, + "loss": 0.0908, + "step": 40607 + }, + { + "epoch": 0.352496940130728, + "grad_norm": 0.2099609375, + "learning_rate": 0.0013322566922821914, + "loss": 0.0742, + "step": 40608 + }, + { + "epoch": 0.3525056206109322, + "grad_norm": 0.091796875, + "learning_rate": 0.0013322279378096425, + "loss": 0.1182, + "step": 40609 + }, + { + "epoch": 0.35251430109113635, + "grad_norm": 0.1318359375, + "learning_rate": 0.001332199183083133, + "loss": 0.0854, + "step": 40610 + }, + { + "epoch": 0.35252298157134054, + "grad_norm": 0.291015625, + "learning_rate": 0.0013321704281026936, + "loss": 0.1147, + "step": 40611 + }, + { + "epoch": 0.3525316620515447, + "grad_norm": 0.455078125, + "learning_rate": 0.0013321416728683559, + "loss": 0.1357, + "step": 40612 + }, + { + "epoch": 0.3525403425317489, + "grad_norm": 0.1474609375, + "learning_rate": 0.0013321129173801515, + "loss": 0.1533, + "step": 40613 + }, + { + "epoch": 0.352549023011953, + "grad_norm": 0.1328125, + "learning_rate": 0.0013320841616381125, + "loss": 0.1113, + "step": 40614 + }, + { + "epoch": 0.3525577034921572, + "grad_norm": 0.447265625, + "learning_rate": 0.0013320554056422693, + "loss": 0.083, + "step": 40615 + }, + { + "epoch": 0.35256638397236134, + "grad_norm": 0.11474609375, + "learning_rate": 0.0013320266493926538, + "loss": 0.083, + "step": 40616 + }, + { + "epoch": 0.35257506445256553, + "grad_norm": 0.14453125, + "learning_rate": 0.0013319978928892976, + "loss": 0.1367, + "step": 40617 + }, + { + "epoch": 0.35258374493276967, + "grad_norm": 0.578125, + "learning_rate": 0.001331969136132232, + "loss": 0.1025, + "step": 40618 + }, + { + "epoch": 0.35259242541297386, + "grad_norm": 0.1279296875, + "learning_rate": 0.0013319403791214883, + "loss": 0.1299, + "step": 40619 + }, + { + "epoch": 0.352601105893178, + 
"grad_norm": 0.41015625, + "learning_rate": 0.0013319116218570984, + "loss": 0.0986, + "step": 40620 + }, + { + "epoch": 0.3526097863733822, + "grad_norm": 0.322265625, + "learning_rate": 0.001331882864339093, + "loss": 0.1035, + "step": 40621 + }, + { + "epoch": 0.35261846685358633, + "grad_norm": 0.5234375, + "learning_rate": 0.0013318541065675043, + "loss": 0.1045, + "step": 40622 + }, + { + "epoch": 0.3526271473337905, + "grad_norm": 0.234375, + "learning_rate": 0.0013318253485423632, + "loss": 0.1133, + "step": 40623 + }, + { + "epoch": 0.35263582781399466, + "grad_norm": 0.4375, + "learning_rate": 0.0013317965902637013, + "loss": 0.1328, + "step": 40624 + }, + { + "epoch": 0.35264450829419886, + "grad_norm": 0.40625, + "learning_rate": 0.0013317678317315506, + "loss": 0.0981, + "step": 40625 + }, + { + "epoch": 0.352653188774403, + "grad_norm": 0.103515625, + "learning_rate": 0.0013317390729459416, + "loss": 0.166, + "step": 40626 + }, + { + "epoch": 0.3526618692546072, + "grad_norm": 0.484375, + "learning_rate": 0.0013317103139069064, + "loss": 0.1055, + "step": 40627 + }, + { + "epoch": 0.3526705497348113, + "grad_norm": 0.318359375, + "learning_rate": 0.0013316815546144762, + "loss": 0.1074, + "step": 40628 + }, + { + "epoch": 0.3526792302150155, + "grad_norm": 0.0771484375, + "learning_rate": 0.001331652795068683, + "loss": 0.1191, + "step": 40629 + }, + { + "epoch": 0.35268791069521965, + "grad_norm": 0.18359375, + "learning_rate": 0.0013316240352695572, + "loss": 0.1035, + "step": 40630 + }, + { + "epoch": 0.35269659117542385, + "grad_norm": 0.146484375, + "learning_rate": 0.0013315952752171313, + "loss": 0.1104, + "step": 40631 + }, + { + "epoch": 0.352705271655628, + "grad_norm": 0.33984375, + "learning_rate": 0.001331566514911436, + "loss": 0.1191, + "step": 40632 + }, + { + "epoch": 0.3527139521358322, + "grad_norm": 0.9140625, + "learning_rate": 0.001331537754352503, + "loss": 0.1348, + "step": 40633 + }, + { + "epoch": 0.3527226326160363, + "grad_norm": 0.56640625, + "learning_rate": 0.0013315089935403638, + "loss": 0.1143, + "step": 40634 + }, + { + "epoch": 0.3527313130962405, + "grad_norm": 0.103515625, + "learning_rate": 0.00133148023247505, + "loss": 0.1309, + "step": 40635 + }, + { + "epoch": 0.35273999357644464, + "grad_norm": 0.8984375, + "learning_rate": 0.001331451471156593, + "loss": 0.2793, + "step": 40636 + }, + { + "epoch": 0.35274867405664884, + "grad_norm": 0.2255859375, + "learning_rate": 0.0013314227095850238, + "loss": 0.1138, + "step": 40637 + }, + { + "epoch": 0.352757354536853, + "grad_norm": 0.11279296875, + "learning_rate": 0.0013313939477603745, + "loss": 0.0737, + "step": 40638 + }, + { + "epoch": 0.35276603501705717, + "grad_norm": 0.162109375, + "learning_rate": 0.001331365185682676, + "loss": 0.1021, + "step": 40639 + }, + { + "epoch": 0.3527747154972613, + "grad_norm": 0.177734375, + "learning_rate": 0.0013313364233519603, + "loss": 0.0947, + "step": 40640 + }, + { + "epoch": 0.3527833959774655, + "grad_norm": 0.45703125, + "learning_rate": 0.0013313076607682584, + "loss": 0.125, + "step": 40641 + }, + { + "epoch": 0.35279207645766963, + "grad_norm": 0.09033203125, + "learning_rate": 0.0013312788979316019, + "loss": 0.062, + "step": 40642 + }, + { + "epoch": 0.3528007569378738, + "grad_norm": 0.10888671875, + "learning_rate": 0.001331250134842022, + "loss": 0.0977, + "step": 40643 + }, + { + "epoch": 0.35280943741807796, + "grad_norm": 0.244140625, + "learning_rate": 0.0013312213714995508, + "loss": 0.1035, + "step": 40644 + }, + { + "epoch": 
0.35281811789828216, + "grad_norm": 0.107421875, + "learning_rate": 0.001331192607904219, + "loss": 0.0996, + "step": 40645 + }, + { + "epoch": 0.3528267983784863, + "grad_norm": 1.2578125, + "learning_rate": 0.0013311638440560592, + "loss": 0.1162, + "step": 40646 + }, + { + "epoch": 0.3528354788586905, + "grad_norm": 0.28125, + "learning_rate": 0.0013311350799551014, + "loss": 0.1143, + "step": 40647 + }, + { + "epoch": 0.3528441593388946, + "grad_norm": 0.365234375, + "learning_rate": 0.001331106315601378, + "loss": 0.0898, + "step": 40648 + }, + { + "epoch": 0.3528528398190988, + "grad_norm": 0.267578125, + "learning_rate": 0.0013310775509949204, + "loss": 0.1104, + "step": 40649 + }, + { + "epoch": 0.35286152029930296, + "grad_norm": 0.271484375, + "learning_rate": 0.0013310487861357596, + "loss": 0.1309, + "step": 40650 + }, + { + "epoch": 0.35287020077950715, + "grad_norm": 0.1083984375, + "learning_rate": 0.0013310200210239274, + "loss": 0.0938, + "step": 40651 + }, + { + "epoch": 0.3528788812597113, + "grad_norm": 0.2060546875, + "learning_rate": 0.001330991255659455, + "loss": 0.0928, + "step": 40652 + }, + { + "epoch": 0.3528875617399155, + "grad_norm": 0.51953125, + "learning_rate": 0.0013309624900423743, + "loss": 0.0728, + "step": 40653 + }, + { + "epoch": 0.3528962422201196, + "grad_norm": 0.57421875, + "learning_rate": 0.0013309337241727164, + "loss": 0.125, + "step": 40654 + }, + { + "epoch": 0.3529049227003238, + "grad_norm": 0.2890625, + "learning_rate": 0.0013309049580505126, + "loss": 0.0786, + "step": 40655 + }, + { + "epoch": 0.35291360318052795, + "grad_norm": 0.1298828125, + "learning_rate": 0.0013308761916757947, + "loss": 0.0923, + "step": 40656 + }, + { + "epoch": 0.35292228366073214, + "grad_norm": 0.4921875, + "learning_rate": 0.0013308474250485942, + "loss": 0.1094, + "step": 40657 + }, + { + "epoch": 0.3529309641409363, + "grad_norm": 0.265625, + "learning_rate": 0.0013308186581689428, + "loss": 0.1143, + "step": 40658 + }, + { + "epoch": 0.35293964462114047, + "grad_norm": 0.09423828125, + "learning_rate": 0.001330789891036871, + "loss": 0.1055, + "step": 40659 + }, + { + "epoch": 0.3529483251013446, + "grad_norm": 0.2021484375, + "learning_rate": 0.0013307611236524108, + "loss": 0.0957, + "step": 40660 + }, + { + "epoch": 0.35295700558154874, + "grad_norm": 0.57421875, + "learning_rate": 0.0013307323560155942, + "loss": 0.1226, + "step": 40661 + }, + { + "epoch": 0.35296568606175294, + "grad_norm": 0.12890625, + "learning_rate": 0.001330703588126452, + "loss": 0.0625, + "step": 40662 + }, + { + "epoch": 0.3529743665419571, + "grad_norm": 0.71875, + "learning_rate": 0.0013306748199850157, + "loss": 0.1221, + "step": 40663 + }, + { + "epoch": 0.35298304702216127, + "grad_norm": 0.2333984375, + "learning_rate": 0.0013306460515913167, + "loss": 0.0996, + "step": 40664 + }, + { + "epoch": 0.3529917275023654, + "grad_norm": 0.0986328125, + "learning_rate": 0.0013306172829453868, + "loss": 0.1006, + "step": 40665 + }, + { + "epoch": 0.3530004079825696, + "grad_norm": 0.388671875, + "learning_rate": 0.0013305885140472574, + "loss": 0.0752, + "step": 40666 + }, + { + "epoch": 0.35300908846277373, + "grad_norm": 0.38671875, + "learning_rate": 0.00133055974489696, + "loss": 0.0703, + "step": 40667 + }, + { + "epoch": 0.3530177689429779, + "grad_norm": 0.4140625, + "learning_rate": 0.0013305309754945258, + "loss": 0.0786, + "step": 40668 + }, + { + "epoch": 0.35302644942318206, + "grad_norm": 0.423828125, + "learning_rate": 0.001330502205839986, + "loss": 0.1016, + 
"step": 40669 + }, + { + "epoch": 0.35303512990338626, + "grad_norm": 0.11279296875, + "learning_rate": 0.001330473435933373, + "loss": 0.0957, + "step": 40670 + }, + { + "epoch": 0.3530438103835904, + "grad_norm": 0.18359375, + "learning_rate": 0.0013304446657747175, + "loss": 0.0786, + "step": 40671 + }, + { + "epoch": 0.3530524908637946, + "grad_norm": 0.4140625, + "learning_rate": 0.0013304158953640513, + "loss": 0.084, + "step": 40672 + }, + { + "epoch": 0.3530611713439987, + "grad_norm": 0.2001953125, + "learning_rate": 0.0013303871247014054, + "loss": 0.0762, + "step": 40673 + }, + { + "epoch": 0.3530698518242029, + "grad_norm": 0.138671875, + "learning_rate": 0.0013303583537868117, + "loss": 0.1006, + "step": 40674 + }, + { + "epoch": 0.35307853230440706, + "grad_norm": 0.384765625, + "learning_rate": 0.0013303295826203017, + "loss": 0.1147, + "step": 40675 + }, + { + "epoch": 0.35308721278461125, + "grad_norm": 0.447265625, + "learning_rate": 0.0013303008112019068, + "loss": 0.1016, + "step": 40676 + }, + { + "epoch": 0.3530958932648154, + "grad_norm": 0.17578125, + "learning_rate": 0.0013302720395316585, + "loss": 0.0815, + "step": 40677 + }, + { + "epoch": 0.3531045737450196, + "grad_norm": 0.98828125, + "learning_rate": 0.0013302432676095878, + "loss": 0.0708, + "step": 40678 + }, + { + "epoch": 0.3531132542252237, + "grad_norm": 0.138671875, + "learning_rate": 0.0013302144954357267, + "loss": 0.1162, + "step": 40679 + }, + { + "epoch": 0.3531219347054279, + "grad_norm": 0.6484375, + "learning_rate": 0.0013301857230101067, + "loss": 0.1201, + "step": 40680 + }, + { + "epoch": 0.35313061518563205, + "grad_norm": 0.255859375, + "learning_rate": 0.0013301569503327585, + "loss": 0.1094, + "step": 40681 + }, + { + "epoch": 0.35313929566583624, + "grad_norm": 0.1259765625, + "learning_rate": 0.0013301281774037146, + "loss": 0.106, + "step": 40682 + }, + { + "epoch": 0.3531479761460404, + "grad_norm": 0.400390625, + "learning_rate": 0.0013300994042230057, + "loss": 0.1543, + "step": 40683 + }, + { + "epoch": 0.35315665662624457, + "grad_norm": 0.10400390625, + "learning_rate": 0.0013300706307906639, + "loss": 0.085, + "step": 40684 + }, + { + "epoch": 0.3531653371064487, + "grad_norm": 0.44140625, + "learning_rate": 0.00133004185710672, + "loss": 0.0776, + "step": 40685 + }, + { + "epoch": 0.3531740175866529, + "grad_norm": 0.85546875, + "learning_rate": 0.0013300130831712057, + "loss": 0.1367, + "step": 40686 + }, + { + "epoch": 0.35318269806685704, + "grad_norm": 0.1962890625, + "learning_rate": 0.0013299843089841527, + "loss": 0.1621, + "step": 40687 + }, + { + "epoch": 0.35319137854706123, + "grad_norm": 0.0888671875, + "learning_rate": 0.0013299555345455925, + "loss": 0.0801, + "step": 40688 + }, + { + "epoch": 0.35320005902726537, + "grad_norm": 0.302734375, + "learning_rate": 0.0013299267598555562, + "loss": 0.1089, + "step": 40689 + }, + { + "epoch": 0.35320873950746956, + "grad_norm": 0.185546875, + "learning_rate": 0.0013298979849140755, + "loss": 0.1001, + "step": 40690 + }, + { + "epoch": 0.3532174199876737, + "grad_norm": 0.482421875, + "learning_rate": 0.0013298692097211818, + "loss": 0.0981, + "step": 40691 + }, + { + "epoch": 0.3532261004678779, + "grad_norm": 0.10986328125, + "learning_rate": 0.0013298404342769064, + "loss": 0.105, + "step": 40692 + }, + { + "epoch": 0.35323478094808203, + "grad_norm": 0.2412109375, + "learning_rate": 0.0013298116585812813, + "loss": 0.0879, + "step": 40693 + }, + { + "epoch": 0.3532434614282862, + "grad_norm": 0.1044921875, + 
"learning_rate": 0.0013297828826343374, + "loss": 0.0742, + "step": 40694 + }, + { + "epoch": 0.35325214190849036, + "grad_norm": 0.52734375, + "learning_rate": 0.0013297541064361063, + "loss": 0.0908, + "step": 40695 + }, + { + "epoch": 0.35326082238869455, + "grad_norm": 0.1083984375, + "learning_rate": 0.00132972532998662, + "loss": 0.105, + "step": 40696 + }, + { + "epoch": 0.3532695028688987, + "grad_norm": 0.15234375, + "learning_rate": 0.001329696553285909, + "loss": 0.1084, + "step": 40697 + }, + { + "epoch": 0.3532781833491029, + "grad_norm": 0.1298828125, + "learning_rate": 0.0013296677763340056, + "loss": 0.0703, + "step": 40698 + }, + { + "epoch": 0.353286863829307, + "grad_norm": 0.1708984375, + "learning_rate": 0.0013296389991309407, + "loss": 0.1104, + "step": 40699 + }, + { + "epoch": 0.3532955443095112, + "grad_norm": 0.205078125, + "learning_rate": 0.0013296102216767466, + "loss": 0.1143, + "step": 40700 + }, + { + "epoch": 0.35330422478971535, + "grad_norm": 0.1298828125, + "learning_rate": 0.0013295814439714538, + "loss": 0.0723, + "step": 40701 + }, + { + "epoch": 0.35331290526991954, + "grad_norm": 2.4375, + "learning_rate": 0.001329552666015094, + "loss": 0.2119, + "step": 40702 + }, + { + "epoch": 0.3533215857501237, + "grad_norm": 0.140625, + "learning_rate": 0.001329523887807699, + "loss": 0.1055, + "step": 40703 + }, + { + "epoch": 0.35333026623032787, + "grad_norm": 0.43359375, + "learning_rate": 0.0013294951093493004, + "loss": 0.0869, + "step": 40704 + }, + { + "epoch": 0.353338946710532, + "grad_norm": 0.125, + "learning_rate": 0.0013294663306399291, + "loss": 0.0815, + "step": 40705 + }, + { + "epoch": 0.3533476271907362, + "grad_norm": 0.2890625, + "learning_rate": 0.001329437551679617, + "loss": 0.0874, + "step": 40706 + }, + { + "epoch": 0.35335630767094034, + "grad_norm": 0.142578125, + "learning_rate": 0.0013294087724683957, + "loss": 0.1367, + "step": 40707 + }, + { + "epoch": 0.35336498815114453, + "grad_norm": 0.330078125, + "learning_rate": 0.001329379993006296, + "loss": 0.0942, + "step": 40708 + }, + { + "epoch": 0.35337366863134867, + "grad_norm": 0.28515625, + "learning_rate": 0.0013293512132933499, + "loss": 0.1396, + "step": 40709 + }, + { + "epoch": 0.35338234911155286, + "grad_norm": 0.44140625, + "learning_rate": 0.0013293224333295888, + "loss": 0.1143, + "step": 40710 + }, + { + "epoch": 0.353391029591757, + "grad_norm": 0.2412109375, + "learning_rate": 0.0013292936531150439, + "loss": 0.1035, + "step": 40711 + }, + { + "epoch": 0.3533997100719612, + "grad_norm": 0.142578125, + "learning_rate": 0.0013292648726497469, + "loss": 0.124, + "step": 40712 + }, + { + "epoch": 0.35340839055216533, + "grad_norm": 0.271484375, + "learning_rate": 0.0013292360919337296, + "loss": 0.0869, + "step": 40713 + }, + { + "epoch": 0.3534170710323695, + "grad_norm": 0.1962890625, + "learning_rate": 0.0013292073109670234, + "loss": 0.0859, + "step": 40714 + }, + { + "epoch": 0.35342575151257366, + "grad_norm": 0.162109375, + "learning_rate": 0.0013291785297496592, + "loss": 0.0879, + "step": 40715 + }, + { + "epoch": 0.35343443199277785, + "grad_norm": 0.3828125, + "learning_rate": 0.0013291497482816686, + "loss": 0.0801, + "step": 40716 + }, + { + "epoch": 0.353443112472982, + "grad_norm": 0.2431640625, + "learning_rate": 0.0013291209665630835, + "loss": 0.1416, + "step": 40717 + }, + { + "epoch": 0.3534517929531862, + "grad_norm": 0.216796875, + "learning_rate": 0.0013290921845939352, + "loss": 0.2256, + "step": 40718 + }, + { + "epoch": 0.3534604734333903, + 
"grad_norm": 0.138671875, + "learning_rate": 0.001329063402374255, + "loss": 0.0791, + "step": 40719 + }, + { + "epoch": 0.3534691539135945, + "grad_norm": 0.1357421875, + "learning_rate": 0.0013290346199040747, + "loss": 0.082, + "step": 40720 + }, + { + "epoch": 0.35347783439379865, + "grad_norm": 0.30078125, + "learning_rate": 0.0013290058371834253, + "loss": 0.1113, + "step": 40721 + }, + { + "epoch": 0.35348651487400284, + "grad_norm": 0.126953125, + "learning_rate": 0.0013289770542123385, + "loss": 0.1064, + "step": 40722 + }, + { + "epoch": 0.353495195354207, + "grad_norm": 0.255859375, + "learning_rate": 0.001328948270990846, + "loss": 0.125, + "step": 40723 + }, + { + "epoch": 0.3535038758344112, + "grad_norm": 0.2265625, + "learning_rate": 0.0013289194875189794, + "loss": 0.084, + "step": 40724 + }, + { + "epoch": 0.3535125563146153, + "grad_norm": 0.1044921875, + "learning_rate": 0.0013288907037967695, + "loss": 0.124, + "step": 40725 + }, + { + "epoch": 0.3535212367948195, + "grad_norm": 0.5390625, + "learning_rate": 0.0013288619198242482, + "loss": 0.1357, + "step": 40726 + }, + { + "epoch": 0.35352991727502364, + "grad_norm": 0.076171875, + "learning_rate": 0.0013288331356014471, + "loss": 0.0952, + "step": 40727 + }, + { + "epoch": 0.35353859775522783, + "grad_norm": 0.11181640625, + "learning_rate": 0.0013288043511283977, + "loss": 0.1143, + "step": 40728 + }, + { + "epoch": 0.35354727823543197, + "grad_norm": 0.203125, + "learning_rate": 0.0013287755664051308, + "loss": 0.1543, + "step": 40729 + }, + { + "epoch": 0.35355595871563616, + "grad_norm": 0.09765625, + "learning_rate": 0.0013287467814316787, + "loss": 0.1309, + "step": 40730 + }, + { + "epoch": 0.3535646391958403, + "grad_norm": 0.103515625, + "learning_rate": 0.0013287179962080725, + "loss": 0.0762, + "step": 40731 + }, + { + "epoch": 0.3535733196760445, + "grad_norm": 0.11328125, + "learning_rate": 0.0013286892107343435, + "loss": 0.1045, + "step": 40732 + }, + { + "epoch": 0.35358200015624863, + "grad_norm": 0.1142578125, + "learning_rate": 0.001328660425010524, + "loss": 0.1299, + "step": 40733 + }, + { + "epoch": 0.3535906806364528, + "grad_norm": 0.8046875, + "learning_rate": 0.0013286316390366442, + "loss": 0.1191, + "step": 40734 + }, + { + "epoch": 0.35359936111665696, + "grad_norm": 0.275390625, + "learning_rate": 0.0013286028528127368, + "loss": 0.1289, + "step": 40735 + }, + { + "epoch": 0.35360804159686116, + "grad_norm": 0.267578125, + "learning_rate": 0.0013285740663388323, + "loss": 0.0806, + "step": 40736 + }, + { + "epoch": 0.3536167220770653, + "grad_norm": 0.1396484375, + "learning_rate": 0.0013285452796149633, + "loss": 0.0771, + "step": 40737 + }, + { + "epoch": 0.3536254025572695, + "grad_norm": 0.82421875, + "learning_rate": 0.00132851649264116, + "loss": 0.1211, + "step": 40738 + }, + { + "epoch": 0.3536340830374736, + "grad_norm": 0.1748046875, + "learning_rate": 0.0013284877054174546, + "loss": 0.0859, + "step": 40739 + }, + { + "epoch": 0.3536427635176778, + "grad_norm": 0.123046875, + "learning_rate": 0.0013284589179438787, + "loss": 0.0874, + "step": 40740 + }, + { + "epoch": 0.35365144399788195, + "grad_norm": 0.271484375, + "learning_rate": 0.0013284301302204633, + "loss": 0.0933, + "step": 40741 + }, + { + "epoch": 0.35366012447808615, + "grad_norm": 0.1328125, + "learning_rate": 0.0013284013422472404, + "loss": 0.1011, + "step": 40742 + }, + { + "epoch": 0.3536688049582903, + "grad_norm": 0.45703125, + "learning_rate": 0.0013283725540242412, + "loss": 0.0889, + "step": 40743 + }, 
+ { + "epoch": 0.3536774854384945, + "grad_norm": 0.49609375, + "learning_rate": 0.001328343765551497, + "loss": 0.1621, + "step": 40744 + }, + { + "epoch": 0.3536861659186986, + "grad_norm": 0.1015625, + "learning_rate": 0.0013283149768290397, + "loss": 0.1216, + "step": 40745 + }, + { + "epoch": 0.3536948463989028, + "grad_norm": 0.416015625, + "learning_rate": 0.0013282861878569003, + "loss": 0.126, + "step": 40746 + }, + { + "epoch": 0.35370352687910694, + "grad_norm": 0.25, + "learning_rate": 0.0013282573986351108, + "loss": 0.127, + "step": 40747 + }, + { + "epoch": 0.35371220735931114, + "grad_norm": 0.1044921875, + "learning_rate": 0.0013282286091637024, + "loss": 0.0898, + "step": 40748 + }, + { + "epoch": 0.3537208878395153, + "grad_norm": 0.17578125, + "learning_rate": 0.0013281998194427066, + "loss": 0.127, + "step": 40749 + }, + { + "epoch": 0.35372956831971947, + "grad_norm": 0.322265625, + "learning_rate": 0.0013281710294721547, + "loss": 0.1211, + "step": 40750 + }, + { + "epoch": 0.3537382487999236, + "grad_norm": 0.158203125, + "learning_rate": 0.0013281422392520785, + "loss": 0.0703, + "step": 40751 + }, + { + "epoch": 0.3537469292801278, + "grad_norm": 0.12109375, + "learning_rate": 0.0013281134487825095, + "loss": 0.0928, + "step": 40752 + }, + { + "epoch": 0.35375560976033193, + "grad_norm": 0.62890625, + "learning_rate": 0.0013280846580634791, + "loss": 0.1035, + "step": 40753 + }, + { + "epoch": 0.35376429024053613, + "grad_norm": 0.1669921875, + "learning_rate": 0.001328055867095019, + "loss": 0.062, + "step": 40754 + }, + { + "epoch": 0.35377297072074027, + "grad_norm": 0.259765625, + "learning_rate": 0.0013280270758771599, + "loss": 0.1523, + "step": 40755 + }, + { + "epoch": 0.35378165120094446, + "grad_norm": 0.69140625, + "learning_rate": 0.0013279982844099341, + "loss": 0.1055, + "step": 40756 + }, + { + "epoch": 0.3537903316811486, + "grad_norm": 0.10009765625, + "learning_rate": 0.0013279694926933727, + "loss": 0.1089, + "step": 40757 + }, + { + "epoch": 0.3537990121613528, + "grad_norm": 0.205078125, + "learning_rate": 0.0013279407007275074, + "loss": 0.1138, + "step": 40758 + }, + { + "epoch": 0.3538076926415569, + "grad_norm": 0.1455078125, + "learning_rate": 0.0013279119085123693, + "loss": 0.0874, + "step": 40759 + }, + { + "epoch": 0.3538163731217611, + "grad_norm": 0.2275390625, + "learning_rate": 0.00132788311604799, + "loss": 0.1089, + "step": 40760 + }, + { + "epoch": 0.35382505360196526, + "grad_norm": 0.15625, + "learning_rate": 0.0013278543233344018, + "loss": 0.127, + "step": 40761 + }, + { + "epoch": 0.35383373408216945, + "grad_norm": 0.32421875, + "learning_rate": 0.0013278255303716352, + "loss": 0.1367, + "step": 40762 + }, + { + "epoch": 0.3538424145623736, + "grad_norm": 0.2041015625, + "learning_rate": 0.0013277967371597222, + "loss": 0.1162, + "step": 40763 + }, + { + "epoch": 0.3538510950425778, + "grad_norm": 1.2421875, + "learning_rate": 0.0013277679436986937, + "loss": 0.1943, + "step": 40764 + }, + { + "epoch": 0.3538597755227819, + "grad_norm": 0.578125, + "learning_rate": 0.001327739149988582, + "loss": 0.1104, + "step": 40765 + }, + { + "epoch": 0.3538684560029861, + "grad_norm": 0.2734375, + "learning_rate": 0.0013277103560294181, + "loss": 0.0732, + "step": 40766 + }, + { + "epoch": 0.35387713648319025, + "grad_norm": 0.1279296875, + "learning_rate": 0.0013276815618212335, + "loss": 0.0874, + "step": 40767 + }, + { + "epoch": 0.35388581696339444, + "grad_norm": 0.34765625, + "learning_rate": 0.0013276527673640598, + "loss": 
0.0752, + "step": 40768 + }, + { + "epoch": 0.3538944974435986, + "grad_norm": 0.19140625, + "learning_rate": 0.0013276239726579282, + "loss": 0.0874, + "step": 40769 + }, + { + "epoch": 0.35390317792380277, + "grad_norm": 0.1279296875, + "learning_rate": 0.0013275951777028709, + "loss": 0.0947, + "step": 40770 + }, + { + "epoch": 0.3539118584040069, + "grad_norm": 0.490234375, + "learning_rate": 0.0013275663824989187, + "loss": 0.1045, + "step": 40771 + }, + { + "epoch": 0.3539205388842111, + "grad_norm": 0.173828125, + "learning_rate": 0.0013275375870461033, + "loss": 0.1133, + "step": 40772 + }, + { + "epoch": 0.35392921936441524, + "grad_norm": 0.49609375, + "learning_rate": 0.0013275087913444561, + "loss": 0.1074, + "step": 40773 + }, + { + "epoch": 0.35393789984461943, + "grad_norm": 0.494140625, + "learning_rate": 0.001327479995394009, + "loss": 0.1357, + "step": 40774 + }, + { + "epoch": 0.35394658032482357, + "grad_norm": 0.1318359375, + "learning_rate": 0.0013274511991947931, + "loss": 0.105, + "step": 40775 + }, + { + "epoch": 0.35395526080502776, + "grad_norm": 0.365234375, + "learning_rate": 0.00132742240274684, + "loss": 0.0986, + "step": 40776 + }, + { + "epoch": 0.3539639412852319, + "grad_norm": 0.1826171875, + "learning_rate": 0.001327393606050181, + "loss": 0.0972, + "step": 40777 + }, + { + "epoch": 0.3539726217654361, + "grad_norm": 0.1533203125, + "learning_rate": 0.001327364809104848, + "loss": 0.0742, + "step": 40778 + }, + { + "epoch": 0.35398130224564023, + "grad_norm": 0.263671875, + "learning_rate": 0.001327336011910872, + "loss": 0.1416, + "step": 40779 + }, + { + "epoch": 0.3539899827258444, + "grad_norm": 0.099609375, + "learning_rate": 0.0013273072144682852, + "loss": 0.1016, + "step": 40780 + }, + { + "epoch": 0.35399866320604856, + "grad_norm": 0.310546875, + "learning_rate": 0.001327278416777118, + "loss": 0.1104, + "step": 40781 + }, + { + "epoch": 0.35400734368625275, + "grad_norm": 0.52734375, + "learning_rate": 0.0013272496188374027, + "loss": 0.1436, + "step": 40782 + }, + { + "epoch": 0.3540160241664569, + "grad_norm": 0.8046875, + "learning_rate": 0.001327220820649171, + "loss": 0.1309, + "step": 40783 + }, + { + "epoch": 0.354024704646661, + "grad_norm": 0.1337890625, + "learning_rate": 0.0013271920222124541, + "loss": 0.0859, + "step": 40784 + }, + { + "epoch": 0.3540333851268652, + "grad_norm": 0.65234375, + "learning_rate": 0.001327163223527283, + "loss": 0.0874, + "step": 40785 + }, + { + "epoch": 0.35404206560706936, + "grad_norm": 0.19921875, + "learning_rate": 0.0013271344245936898, + "loss": 0.1162, + "step": 40786 + }, + { + "epoch": 0.35405074608727355, + "grad_norm": 0.1435546875, + "learning_rate": 0.0013271056254117058, + "loss": 0.084, + "step": 40787 + }, + { + "epoch": 0.3540594265674777, + "grad_norm": 0.248046875, + "learning_rate": 0.0013270768259813625, + "loss": 0.165, + "step": 40788 + }, + { + "epoch": 0.3540681070476819, + "grad_norm": 0.11865234375, + "learning_rate": 0.001327048026302691, + "loss": 0.0898, + "step": 40789 + }, + { + "epoch": 0.354076787527886, + "grad_norm": 0.2470703125, + "learning_rate": 0.0013270192263757238, + "loss": 0.0737, + "step": 40790 + }, + { + "epoch": 0.3540854680080902, + "grad_norm": 0.49609375, + "learning_rate": 0.0013269904262004915, + "loss": 0.0986, + "step": 40791 + }, + { + "epoch": 0.35409414848829435, + "grad_norm": 0.345703125, + "learning_rate": 0.0013269616257770259, + "loss": 0.1074, + "step": 40792 + }, + { + "epoch": 0.35410282896849854, + "grad_norm": 0.33203125, + 
"learning_rate": 0.0013269328251053587, + "loss": 0.0938, + "step": 40793 + }, + { + "epoch": 0.3541115094487027, + "grad_norm": 0.79296875, + "learning_rate": 0.0013269040241855208, + "loss": 0.2158, + "step": 40794 + }, + { + "epoch": 0.35412018992890687, + "grad_norm": 1.0234375, + "learning_rate": 0.0013268752230175442, + "loss": 0.1279, + "step": 40795 + }, + { + "epoch": 0.354128870409111, + "grad_norm": 0.439453125, + "learning_rate": 0.0013268464216014604, + "loss": 0.1348, + "step": 40796 + }, + { + "epoch": 0.3541375508893152, + "grad_norm": 0.17578125, + "learning_rate": 0.0013268176199373006, + "loss": 0.084, + "step": 40797 + }, + { + "epoch": 0.35414623136951934, + "grad_norm": 0.1806640625, + "learning_rate": 0.0013267888180250966, + "loss": 0.1289, + "step": 40798 + }, + { + "epoch": 0.35415491184972353, + "grad_norm": 0.255859375, + "learning_rate": 0.0013267600158648797, + "loss": 0.1123, + "step": 40799 + }, + { + "epoch": 0.35416359232992767, + "grad_norm": 0.224609375, + "learning_rate": 0.0013267312134566814, + "loss": 0.127, + "step": 40800 + }, + { + "epoch": 0.35417227281013186, + "grad_norm": 0.1689453125, + "learning_rate": 0.0013267024108005335, + "loss": 0.1523, + "step": 40801 + }, + { + "epoch": 0.354180953290336, + "grad_norm": 0.07958984375, + "learning_rate": 0.0013266736078964669, + "loss": 0.0869, + "step": 40802 + }, + { + "epoch": 0.3541896337705402, + "grad_norm": 0.1796875, + "learning_rate": 0.0013266448047445135, + "loss": 0.1377, + "step": 40803 + }, + { + "epoch": 0.35419831425074433, + "grad_norm": 0.1640625, + "learning_rate": 0.001326616001344705, + "loss": 0.062, + "step": 40804 + }, + { + "epoch": 0.3542069947309485, + "grad_norm": 0.09814453125, + "learning_rate": 0.0013265871976970722, + "loss": 0.0903, + "step": 40805 + }, + { + "epoch": 0.35421567521115266, + "grad_norm": 0.10107421875, + "learning_rate": 0.0013265583938016474, + "loss": 0.0918, + "step": 40806 + }, + { + "epoch": 0.35422435569135685, + "grad_norm": 0.10205078125, + "learning_rate": 0.0013265295896584617, + "loss": 0.0806, + "step": 40807 + }, + { + "epoch": 0.354233036171561, + "grad_norm": 0.2890625, + "learning_rate": 0.0013265007852675463, + "loss": 0.0981, + "step": 40808 + }, + { + "epoch": 0.3542417166517652, + "grad_norm": 0.4375, + "learning_rate": 0.0013264719806289338, + "loss": 0.1021, + "step": 40809 + }, + { + "epoch": 0.3542503971319693, + "grad_norm": 0.1640625, + "learning_rate": 0.0013264431757426542, + "loss": 0.0923, + "step": 40810 + }, + { + "epoch": 0.3542590776121735, + "grad_norm": 0.640625, + "learning_rate": 0.0013264143706087402, + "loss": 0.0962, + "step": 40811 + }, + { + "epoch": 0.35426775809237765, + "grad_norm": 0.166015625, + "learning_rate": 0.0013263855652272227, + "loss": 0.0713, + "step": 40812 + }, + { + "epoch": 0.35427643857258184, + "grad_norm": 0.455078125, + "learning_rate": 0.0013263567595981334, + "loss": 0.0962, + "step": 40813 + }, + { + "epoch": 0.354285119052786, + "grad_norm": 0.0908203125, + "learning_rate": 0.0013263279537215037, + "loss": 0.1094, + "step": 40814 + }, + { + "epoch": 0.3542937995329902, + "grad_norm": 0.28125, + "learning_rate": 0.0013262991475973653, + "loss": 0.1377, + "step": 40815 + }, + { + "epoch": 0.3543024800131943, + "grad_norm": 0.412109375, + "learning_rate": 0.0013262703412257493, + "loss": 0.0928, + "step": 40816 + }, + { + "epoch": 0.3543111604933985, + "grad_norm": 0.11181640625, + "learning_rate": 0.0013262415346066872, + "loss": 0.0947, + "step": 40817 + }, + { + "epoch": 
0.35431984097360264, + "grad_norm": 0.1806640625, + "learning_rate": 0.0013262127277402112, + "loss": 0.1562, + "step": 40818 + }, + { + "epoch": 0.35432852145380683, + "grad_norm": 0.0830078125, + "learning_rate": 0.0013261839206263524, + "loss": 0.0859, + "step": 40819 + }, + { + "epoch": 0.35433720193401097, + "grad_norm": 0.3515625, + "learning_rate": 0.0013261551132651421, + "loss": 0.0713, + "step": 40820 + }, + { + "epoch": 0.35434588241421516, + "grad_norm": 0.88671875, + "learning_rate": 0.0013261263056566117, + "loss": 0.1758, + "step": 40821 + }, + { + "epoch": 0.3543545628944193, + "grad_norm": 0.259765625, + "learning_rate": 0.0013260974978007937, + "loss": 0.1069, + "step": 40822 + }, + { + "epoch": 0.3543632433746235, + "grad_norm": 0.10888671875, + "learning_rate": 0.0013260686896977183, + "loss": 0.0918, + "step": 40823 + }, + { + "epoch": 0.35437192385482763, + "grad_norm": 0.251953125, + "learning_rate": 0.001326039881347418, + "loss": 0.0752, + "step": 40824 + }, + { + "epoch": 0.3543806043350318, + "grad_norm": 0.2392578125, + "learning_rate": 0.0013260110727499233, + "loss": 0.0928, + "step": 40825 + }, + { + "epoch": 0.35438928481523596, + "grad_norm": 0.341796875, + "learning_rate": 0.0013259822639052669, + "loss": 0.0825, + "step": 40826 + }, + { + "epoch": 0.35439796529544015, + "grad_norm": 0.1904296875, + "learning_rate": 0.001325953454813479, + "loss": 0.0986, + "step": 40827 + }, + { + "epoch": 0.3544066457756443, + "grad_norm": 0.37890625, + "learning_rate": 0.0013259246454745924, + "loss": 0.1064, + "step": 40828 + }, + { + "epoch": 0.3544153262558485, + "grad_norm": 0.1552734375, + "learning_rate": 0.0013258958358886375, + "loss": 0.0791, + "step": 40829 + }, + { + "epoch": 0.3544240067360526, + "grad_norm": 0.2490234375, + "learning_rate": 0.0013258670260556465, + "loss": 0.1406, + "step": 40830 + }, + { + "epoch": 0.3544326872162568, + "grad_norm": 1.0390625, + "learning_rate": 0.0013258382159756507, + "loss": 0.1357, + "step": 40831 + }, + { + "epoch": 0.35444136769646095, + "grad_norm": 0.30859375, + "learning_rate": 0.001325809405648682, + "loss": 0.0986, + "step": 40832 + }, + { + "epoch": 0.35445004817666514, + "grad_norm": 0.58203125, + "learning_rate": 0.0013257805950747713, + "loss": 0.1084, + "step": 40833 + }, + { + "epoch": 0.3544587286568693, + "grad_norm": 0.16015625, + "learning_rate": 0.0013257517842539502, + "loss": 0.0884, + "step": 40834 + }, + { + "epoch": 0.3544674091370735, + "grad_norm": 0.099609375, + "learning_rate": 0.0013257229731862507, + "loss": 0.0796, + "step": 40835 + }, + { + "epoch": 0.3544760896172776, + "grad_norm": 0.1669921875, + "learning_rate": 0.0013256941618717034, + "loss": 0.1074, + "step": 40836 + }, + { + "epoch": 0.3544847700974818, + "grad_norm": 0.349609375, + "learning_rate": 0.0013256653503103404, + "loss": 0.1235, + "step": 40837 + }, + { + "epoch": 0.35449345057768594, + "grad_norm": 0.35546875, + "learning_rate": 0.0013256365385021934, + "loss": 0.1104, + "step": 40838 + }, + { + "epoch": 0.35450213105789014, + "grad_norm": 0.08447265625, + "learning_rate": 0.0013256077264472937, + "loss": 0.1035, + "step": 40839 + }, + { + "epoch": 0.3545108115380943, + "grad_norm": 0.0947265625, + "learning_rate": 0.0013255789141456728, + "loss": 0.1377, + "step": 40840 + }, + { + "epoch": 0.35451949201829847, + "grad_norm": 0.22265625, + "learning_rate": 0.001325550101597362, + "loss": 0.0854, + "step": 40841 + }, + { + "epoch": 0.3545281724985026, + "grad_norm": 0.396484375, + "learning_rate": 0.0013255212888023933, 
+ "loss": 0.0854, + "step": 40842 + }, + { + "epoch": 0.3545368529787068, + "grad_norm": 0.1845703125, + "learning_rate": 0.0013254924757607977, + "loss": 0.1787, + "step": 40843 + }, + { + "epoch": 0.35454553345891093, + "grad_norm": 0.1953125, + "learning_rate": 0.0013254636624726072, + "loss": 0.0918, + "step": 40844 + }, + { + "epoch": 0.3545542139391151, + "grad_norm": 0.60546875, + "learning_rate": 0.0013254348489378524, + "loss": 0.1143, + "step": 40845 + }, + { + "epoch": 0.35456289441931926, + "grad_norm": 0.12451171875, + "learning_rate": 0.001325406035156566, + "loss": 0.1011, + "step": 40846 + }, + { + "epoch": 0.35457157489952346, + "grad_norm": 0.1552734375, + "learning_rate": 0.0013253772211287787, + "loss": 0.123, + "step": 40847 + }, + { + "epoch": 0.3545802553797276, + "grad_norm": 0.30859375, + "learning_rate": 0.0013253484068545221, + "loss": 0.0981, + "step": 40848 + }, + { + "epoch": 0.3545889358599318, + "grad_norm": 0.1298828125, + "learning_rate": 0.0013253195923338284, + "loss": 0.1377, + "step": 40849 + }, + { + "epoch": 0.3545976163401359, + "grad_norm": 0.087890625, + "learning_rate": 0.0013252907775667283, + "loss": 0.0977, + "step": 40850 + }, + { + "epoch": 0.3546062968203401, + "grad_norm": 0.3203125, + "learning_rate": 0.0013252619625532531, + "loss": 0.1367, + "step": 40851 + }, + { + "epoch": 0.35461497730054425, + "grad_norm": 0.275390625, + "learning_rate": 0.0013252331472934356, + "loss": 0.1221, + "step": 40852 + }, + { + "epoch": 0.35462365778074845, + "grad_norm": 0.357421875, + "learning_rate": 0.0013252043317873061, + "loss": 0.1064, + "step": 40853 + }, + { + "epoch": 0.3546323382609526, + "grad_norm": 0.158203125, + "learning_rate": 0.0013251755160348965, + "loss": 0.1143, + "step": 40854 + }, + { + "epoch": 0.3546410187411568, + "grad_norm": 0.232421875, + "learning_rate": 0.0013251467000362385, + "loss": 0.0947, + "step": 40855 + }, + { + "epoch": 0.3546496992213609, + "grad_norm": 0.0927734375, + "learning_rate": 0.0013251178837913632, + "loss": 0.0752, + "step": 40856 + }, + { + "epoch": 0.3546583797015651, + "grad_norm": 0.478515625, + "learning_rate": 0.0013250890673003027, + "loss": 0.0869, + "step": 40857 + }, + { + "epoch": 0.35466706018176924, + "grad_norm": 0.275390625, + "learning_rate": 0.001325060250563088, + "loss": 0.1035, + "step": 40858 + }, + { + "epoch": 0.35467574066197344, + "grad_norm": 0.35546875, + "learning_rate": 0.0013250314335797506, + "loss": 0.1357, + "step": 40859 + }, + { + "epoch": 0.3546844211421776, + "grad_norm": 0.1513671875, + "learning_rate": 0.0013250026163503226, + "loss": 0.1064, + "step": 40860 + }, + { + "epoch": 0.35469310162238177, + "grad_norm": 0.107421875, + "learning_rate": 0.001324973798874835, + "loss": 0.1094, + "step": 40861 + }, + { + "epoch": 0.3547017821025859, + "grad_norm": 0.2099609375, + "learning_rate": 0.0013249449811533194, + "loss": 0.082, + "step": 40862 + }, + { + "epoch": 0.3547104625827901, + "grad_norm": 0.52734375, + "learning_rate": 0.0013249161631858075, + "loss": 0.0757, + "step": 40863 + }, + { + "epoch": 0.35471914306299424, + "grad_norm": 0.287109375, + "learning_rate": 0.0013248873449723305, + "loss": 0.0854, + "step": 40864 + }, + { + "epoch": 0.35472782354319843, + "grad_norm": 0.271484375, + "learning_rate": 0.0013248585265129198, + "loss": 0.1104, + "step": 40865 + }, + { + "epoch": 0.35473650402340257, + "grad_norm": 0.470703125, + "learning_rate": 0.0013248297078076075, + "loss": 0.0894, + "step": 40866 + }, + { + "epoch": 0.35474518450360676, + "grad_norm": 
0.357421875, + "learning_rate": 0.0013248008888564248, + "loss": 0.0894, + "step": 40867 + }, + { + "epoch": 0.3547538649838109, + "grad_norm": 0.080078125, + "learning_rate": 0.001324772069659403, + "loss": 0.0918, + "step": 40868 + }, + { + "epoch": 0.3547625454640151, + "grad_norm": 0.1884765625, + "learning_rate": 0.0013247432502165742, + "loss": 0.0977, + "step": 40869 + }, + { + "epoch": 0.3547712259442192, + "grad_norm": 0.16015625, + "learning_rate": 0.0013247144305279695, + "loss": 0.1162, + "step": 40870 + }, + { + "epoch": 0.3547799064244234, + "grad_norm": 2.578125, + "learning_rate": 0.0013246856105936205, + "loss": 0.1504, + "step": 40871 + }, + { + "epoch": 0.35478858690462756, + "grad_norm": 0.162109375, + "learning_rate": 0.0013246567904135584, + "loss": 0.0811, + "step": 40872 + }, + { + "epoch": 0.35479726738483175, + "grad_norm": 0.3515625, + "learning_rate": 0.001324627969987815, + "loss": 0.1123, + "step": 40873 + }, + { + "epoch": 0.3548059478650359, + "grad_norm": 0.123046875, + "learning_rate": 0.001324599149316422, + "loss": 0.0957, + "step": 40874 + }, + { + "epoch": 0.3548146283452401, + "grad_norm": 0.09814453125, + "learning_rate": 0.001324570328399411, + "loss": 0.0845, + "step": 40875 + }, + { + "epoch": 0.3548233088254442, + "grad_norm": 0.123046875, + "learning_rate": 0.0013245415072368127, + "loss": 0.083, + "step": 40876 + }, + { + "epoch": 0.3548319893056484, + "grad_norm": 0.0810546875, + "learning_rate": 0.0013245126858286596, + "loss": 0.1074, + "step": 40877 + }, + { + "epoch": 0.35484066978585255, + "grad_norm": 0.291015625, + "learning_rate": 0.0013244838641749827, + "loss": 0.1494, + "step": 40878 + }, + { + "epoch": 0.35484935026605674, + "grad_norm": 0.126953125, + "learning_rate": 0.0013244550422758137, + "loss": 0.0649, + "step": 40879 + }, + { + "epoch": 0.3548580307462609, + "grad_norm": 0.095703125, + "learning_rate": 0.0013244262201311838, + "loss": 0.0854, + "step": 40880 + }, + { + "epoch": 0.35486671122646507, + "grad_norm": 0.08544921875, + "learning_rate": 0.001324397397741125, + "loss": 0.0889, + "step": 40881 + }, + { + "epoch": 0.3548753917066692, + "grad_norm": 0.1240234375, + "learning_rate": 0.0013243685751056684, + "loss": 0.0918, + "step": 40882 + }, + { + "epoch": 0.3548840721868734, + "grad_norm": 0.45703125, + "learning_rate": 0.001324339752224846, + "loss": 0.0918, + "step": 40883 + }, + { + "epoch": 0.35489275266707754, + "grad_norm": 0.1591796875, + "learning_rate": 0.0013243109290986886, + "loss": 0.1172, + "step": 40884 + }, + { + "epoch": 0.35490143314728173, + "grad_norm": 0.0927734375, + "learning_rate": 0.0013242821057272282, + "loss": 0.1074, + "step": 40885 + }, + { + "epoch": 0.35491011362748587, + "grad_norm": 0.392578125, + "learning_rate": 0.0013242532821104965, + "loss": 0.1128, + "step": 40886 + }, + { + "epoch": 0.35491879410769006, + "grad_norm": 0.6015625, + "learning_rate": 0.0013242244582485246, + "loss": 0.1143, + "step": 40887 + }, + { + "epoch": 0.3549274745878942, + "grad_norm": 0.703125, + "learning_rate": 0.0013241956341413443, + "loss": 0.1309, + "step": 40888 + }, + { + "epoch": 0.3549361550680984, + "grad_norm": 0.55078125, + "learning_rate": 0.001324166809788987, + "loss": 0.1172, + "step": 40889 + }, + { + "epoch": 0.35494483554830253, + "grad_norm": 0.25390625, + "learning_rate": 0.0013241379851914841, + "loss": 0.1016, + "step": 40890 + }, + { + "epoch": 0.3549535160285067, + "grad_norm": 0.56640625, + "learning_rate": 0.0013241091603488676, + "loss": 0.0986, + "step": 40891 + }, + { + 
"epoch": 0.35496219650871086, + "grad_norm": 0.203125, + "learning_rate": 0.0013240803352611685, + "loss": 0.0708, + "step": 40892 + }, + { + "epoch": 0.35497087698891505, + "grad_norm": 1.15625, + "learning_rate": 0.0013240515099284184, + "loss": 0.1025, + "step": 40893 + }, + { + "epoch": 0.3549795574691192, + "grad_norm": 0.66015625, + "learning_rate": 0.0013240226843506489, + "loss": 0.0972, + "step": 40894 + }, + { + "epoch": 0.3549882379493234, + "grad_norm": 0.205078125, + "learning_rate": 0.0013239938585278919, + "loss": 0.1143, + "step": 40895 + }, + { + "epoch": 0.3549969184295275, + "grad_norm": 0.1611328125, + "learning_rate": 0.0013239650324601785, + "loss": 0.0791, + "step": 40896 + }, + { + "epoch": 0.3550055989097317, + "grad_norm": 0.224609375, + "learning_rate": 0.0013239362061475402, + "loss": 0.1162, + "step": 40897 + }, + { + "epoch": 0.35501427938993585, + "grad_norm": 0.1611328125, + "learning_rate": 0.0013239073795900085, + "loss": 0.1152, + "step": 40898 + }, + { + "epoch": 0.35502295987014004, + "grad_norm": 0.2392578125, + "learning_rate": 0.001323878552787615, + "loss": 0.1035, + "step": 40899 + }, + { + "epoch": 0.3550316403503442, + "grad_norm": 0.609375, + "learning_rate": 0.0013238497257403916, + "loss": 0.1562, + "step": 40900 + }, + { + "epoch": 0.3550403208305484, + "grad_norm": 0.08740234375, + "learning_rate": 0.0013238208984483696, + "loss": 0.0942, + "step": 40901 + }, + { + "epoch": 0.3550490013107525, + "grad_norm": 0.259765625, + "learning_rate": 0.0013237920709115798, + "loss": 0.124, + "step": 40902 + }, + { + "epoch": 0.3550576817909567, + "grad_norm": 0.19140625, + "learning_rate": 0.001323763243130055, + "loss": 0.1084, + "step": 40903 + }, + { + "epoch": 0.35506636227116084, + "grad_norm": 0.1455078125, + "learning_rate": 0.0013237344151038255, + "loss": 0.0942, + "step": 40904 + }, + { + "epoch": 0.35507504275136503, + "grad_norm": 0.08984375, + "learning_rate": 0.001323705586832924, + "loss": 0.0825, + "step": 40905 + }, + { + "epoch": 0.35508372323156917, + "grad_norm": 0.5546875, + "learning_rate": 0.001323676758317381, + "loss": 0.1055, + "step": 40906 + }, + { + "epoch": 0.3550924037117733, + "grad_norm": 0.2451171875, + "learning_rate": 0.0013236479295572286, + "loss": 0.1279, + "step": 40907 + }, + { + "epoch": 0.3551010841919775, + "grad_norm": 0.2041015625, + "learning_rate": 0.0013236191005524982, + "loss": 0.1162, + "step": 40908 + }, + { + "epoch": 0.35510976467218164, + "grad_norm": 0.74609375, + "learning_rate": 0.0013235902713032216, + "loss": 0.0688, + "step": 40909 + }, + { + "epoch": 0.35511844515238583, + "grad_norm": 0.169921875, + "learning_rate": 0.0013235614418094298, + "loss": 0.1719, + "step": 40910 + }, + { + "epoch": 0.35512712563258997, + "grad_norm": 0.90234375, + "learning_rate": 0.0013235326120711543, + "loss": 0.1396, + "step": 40911 + }, + { + "epoch": 0.35513580611279416, + "grad_norm": 0.11279296875, + "learning_rate": 0.001323503782088427, + "loss": 0.0928, + "step": 40912 + }, + { + "epoch": 0.3551444865929983, + "grad_norm": 0.259765625, + "learning_rate": 0.0013234749518612795, + "loss": 0.1201, + "step": 40913 + }, + { + "epoch": 0.3551531670732025, + "grad_norm": 0.283203125, + "learning_rate": 0.0013234461213897431, + "loss": 0.082, + "step": 40914 + }, + { + "epoch": 0.35516184755340663, + "grad_norm": 0.119140625, + "learning_rate": 0.0013234172906738493, + "loss": 0.0942, + "step": 40915 + }, + { + "epoch": 0.3551705280336108, + "grad_norm": 0.1435546875, + "learning_rate": 0.0013233884597136296, 
+ "loss": 0.0859, + "step": 40916 + }, + { + "epoch": 0.35517920851381496, + "grad_norm": 0.09033203125, + "learning_rate": 0.0013233596285091161, + "loss": 0.0742, + "step": 40917 + }, + { + "epoch": 0.35518788899401915, + "grad_norm": 0.15234375, + "learning_rate": 0.0013233307970603396, + "loss": 0.1055, + "step": 40918 + }, + { + "epoch": 0.3551965694742233, + "grad_norm": 0.263671875, + "learning_rate": 0.0013233019653673317, + "loss": 0.1099, + "step": 40919 + }, + { + "epoch": 0.3552052499544275, + "grad_norm": 0.07275390625, + "learning_rate": 0.0013232731334301242, + "loss": 0.0669, + "step": 40920 + }, + { + "epoch": 0.3552139304346316, + "grad_norm": 0.20703125, + "learning_rate": 0.001323244301248749, + "loss": 0.1123, + "step": 40921 + }, + { + "epoch": 0.3552226109148358, + "grad_norm": 0.328125, + "learning_rate": 0.0013232154688232365, + "loss": 0.1328, + "step": 40922 + }, + { + "epoch": 0.35523129139503995, + "grad_norm": 0.66796875, + "learning_rate": 0.0013231866361536194, + "loss": 0.1465, + "step": 40923 + }, + { + "epoch": 0.35523997187524414, + "grad_norm": 0.150390625, + "learning_rate": 0.0013231578032399283, + "loss": 0.0869, + "step": 40924 + }, + { + "epoch": 0.3552486523554483, + "grad_norm": 1.046875, + "learning_rate": 0.0013231289700821956, + "loss": 0.1162, + "step": 40925 + }, + { + "epoch": 0.3552573328356525, + "grad_norm": 0.77734375, + "learning_rate": 0.0013231001366804523, + "loss": 0.1152, + "step": 40926 + }, + { + "epoch": 0.3552660133158566, + "grad_norm": 0.1572265625, + "learning_rate": 0.00132307130303473, + "loss": 0.125, + "step": 40927 + }, + { + "epoch": 0.3552746937960608, + "grad_norm": 0.07177734375, + "learning_rate": 0.0013230424691450604, + "loss": 0.0723, + "step": 40928 + }, + { + "epoch": 0.35528337427626494, + "grad_norm": 0.326171875, + "learning_rate": 0.0013230136350114747, + "loss": 0.1748, + "step": 40929 + }, + { + "epoch": 0.35529205475646913, + "grad_norm": 0.318359375, + "learning_rate": 0.0013229848006340048, + "loss": 0.1152, + "step": 40930 + }, + { + "epoch": 0.35530073523667327, + "grad_norm": 0.0732421875, + "learning_rate": 0.0013229559660126822, + "loss": 0.0786, + "step": 40931 + }, + { + "epoch": 0.35530941571687746, + "grad_norm": 0.0927734375, + "learning_rate": 0.0013229271311475381, + "loss": 0.1064, + "step": 40932 + }, + { + "epoch": 0.3553180961970816, + "grad_norm": 0.21875, + "learning_rate": 0.001322898296038604, + "loss": 0.0898, + "step": 40933 + }, + { + "epoch": 0.3553267766772858, + "grad_norm": 0.2333984375, + "learning_rate": 0.001322869460685912, + "loss": 0.103, + "step": 40934 + }, + { + "epoch": 0.35533545715748993, + "grad_norm": 0.07958984375, + "learning_rate": 0.0013228406250894932, + "loss": 0.082, + "step": 40935 + }, + { + "epoch": 0.3553441376376941, + "grad_norm": 0.095703125, + "learning_rate": 0.0013228117892493795, + "loss": 0.125, + "step": 40936 + }, + { + "epoch": 0.35535281811789826, + "grad_norm": 0.07080078125, + "learning_rate": 0.001322782953165602, + "loss": 0.0703, + "step": 40937 + }, + { + "epoch": 0.35536149859810245, + "grad_norm": 0.1328125, + "learning_rate": 0.0013227541168381923, + "loss": 0.0825, + "step": 40938 + }, + { + "epoch": 0.3553701790783066, + "grad_norm": 0.205078125, + "learning_rate": 0.0013227252802671821, + "loss": 0.1074, + "step": 40939 + }, + { + "epoch": 0.3553788595585108, + "grad_norm": 0.1259765625, + "learning_rate": 0.001322696443452603, + "loss": 0.1143, + "step": 40940 + }, + { + "epoch": 0.3553875400387149, + "grad_norm": 0.1875, + 
"learning_rate": 0.0013226676063944863, + "loss": 0.0757, + "step": 40941 + }, + { + "epoch": 0.3553962205189191, + "grad_norm": 0.130859375, + "learning_rate": 0.0013226387690928635, + "loss": 0.1123, + "step": 40942 + }, + { + "epoch": 0.35540490099912325, + "grad_norm": 0.341796875, + "learning_rate": 0.0013226099315477664, + "loss": 0.0991, + "step": 40943 + }, + { + "epoch": 0.35541358147932745, + "grad_norm": 0.396484375, + "learning_rate": 0.0013225810937592267, + "loss": 0.1543, + "step": 40944 + }, + { + "epoch": 0.3554222619595316, + "grad_norm": 0.625, + "learning_rate": 0.0013225522557272754, + "loss": 0.1084, + "step": 40945 + }, + { + "epoch": 0.3554309424397358, + "grad_norm": 0.0810546875, + "learning_rate": 0.0013225234174519443, + "loss": 0.085, + "step": 40946 + }, + { + "epoch": 0.3554396229199399, + "grad_norm": 0.53125, + "learning_rate": 0.001322494578933265, + "loss": 0.0806, + "step": 40947 + }, + { + "epoch": 0.3554483034001441, + "grad_norm": 0.609375, + "learning_rate": 0.001322465740171269, + "loss": 0.1094, + "step": 40948 + }, + { + "epoch": 0.35545698388034824, + "grad_norm": 0.15234375, + "learning_rate": 0.0013224369011659875, + "loss": 0.0952, + "step": 40949 + }, + { + "epoch": 0.35546566436055244, + "grad_norm": 0.09765625, + "learning_rate": 0.0013224080619174525, + "loss": 0.1084, + "step": 40950 + }, + { + "epoch": 0.3554743448407566, + "grad_norm": 0.404296875, + "learning_rate": 0.0013223792224256957, + "loss": 0.0796, + "step": 40951 + }, + { + "epoch": 0.35548302532096077, + "grad_norm": 0.2490234375, + "learning_rate": 0.001322350382690748, + "loss": 0.0952, + "step": 40952 + }, + { + "epoch": 0.3554917058011649, + "grad_norm": 0.1337890625, + "learning_rate": 0.0013223215427126413, + "loss": 0.1157, + "step": 40953 + }, + { + "epoch": 0.3555003862813691, + "grad_norm": 0.1494140625, + "learning_rate": 0.0013222927024914071, + "loss": 0.0752, + "step": 40954 + }, + { + "epoch": 0.35550906676157323, + "grad_norm": 0.373046875, + "learning_rate": 0.001322263862027077, + "loss": 0.0947, + "step": 40955 + }, + { + "epoch": 0.3555177472417774, + "grad_norm": 0.291015625, + "learning_rate": 0.0013222350213196827, + "loss": 0.1074, + "step": 40956 + }, + { + "epoch": 0.35552642772198156, + "grad_norm": 0.3203125, + "learning_rate": 0.0013222061803692554, + "loss": 0.127, + "step": 40957 + }, + { + "epoch": 0.35553510820218576, + "grad_norm": 0.1474609375, + "learning_rate": 0.0013221773391758264, + "loss": 0.1523, + "step": 40958 + }, + { + "epoch": 0.3555437886823899, + "grad_norm": 0.4140625, + "learning_rate": 0.001322148497739428, + "loss": 0.1201, + "step": 40959 + }, + { + "epoch": 0.3555524691625941, + "grad_norm": 0.375, + "learning_rate": 0.0013221196560600913, + "loss": 0.0859, + "step": 40960 + }, + { + "epoch": 0.3555611496427982, + "grad_norm": 0.43359375, + "learning_rate": 0.0013220908141378476, + "loss": 0.0928, + "step": 40961 + }, + { + "epoch": 0.3555698301230024, + "grad_norm": 0.189453125, + "learning_rate": 0.001322061971972729, + "loss": 0.0791, + "step": 40962 + }, + { + "epoch": 0.35557851060320655, + "grad_norm": 0.2490234375, + "learning_rate": 0.0013220331295647665, + "loss": 0.0898, + "step": 40963 + }, + { + "epoch": 0.35558719108341075, + "grad_norm": 0.294921875, + "learning_rate": 0.001322004286913992, + "loss": 0.0977, + "step": 40964 + }, + { + "epoch": 0.3555958715636149, + "grad_norm": 0.3046875, + "learning_rate": 0.0013219754440204371, + "loss": 0.0898, + "step": 40965 + }, + { + "epoch": 0.3556045520438191, + 
"grad_norm": 0.69921875, + "learning_rate": 0.0013219466008841333, + "loss": 0.1777, + "step": 40966 + }, + { + "epoch": 0.3556132325240232, + "grad_norm": 0.92578125, + "learning_rate": 0.0013219177575051115, + "loss": 0.1143, + "step": 40967 + }, + { + "epoch": 0.3556219130042274, + "grad_norm": 0.158203125, + "learning_rate": 0.0013218889138834043, + "loss": 0.0967, + "step": 40968 + }, + { + "epoch": 0.35563059348443155, + "grad_norm": 0.107421875, + "learning_rate": 0.0013218600700190429, + "loss": 0.0879, + "step": 40969 + }, + { + "epoch": 0.35563927396463574, + "grad_norm": 0.2734375, + "learning_rate": 0.001321831225912058, + "loss": 0.1719, + "step": 40970 + }, + { + "epoch": 0.3556479544448399, + "grad_norm": 0.234375, + "learning_rate": 0.0013218023815624823, + "loss": 0.0742, + "step": 40971 + }, + { + "epoch": 0.35565663492504407, + "grad_norm": 0.3671875, + "learning_rate": 0.0013217735369703465, + "loss": 0.0728, + "step": 40972 + }, + { + "epoch": 0.3556653154052482, + "grad_norm": 0.19921875, + "learning_rate": 0.0013217446921356827, + "loss": 0.1123, + "step": 40973 + }, + { + "epoch": 0.3556739958854524, + "grad_norm": 0.1689453125, + "learning_rate": 0.0013217158470585223, + "loss": 0.0986, + "step": 40974 + }, + { + "epoch": 0.35568267636565654, + "grad_norm": 0.5, + "learning_rate": 0.0013216870017388963, + "loss": 0.123, + "step": 40975 + }, + { + "epoch": 0.35569135684586073, + "grad_norm": 0.25390625, + "learning_rate": 0.0013216581561768372, + "loss": 0.0669, + "step": 40976 + }, + { + "epoch": 0.35570003732606487, + "grad_norm": 0.353515625, + "learning_rate": 0.0013216293103723762, + "loss": 0.1455, + "step": 40977 + }, + { + "epoch": 0.35570871780626906, + "grad_norm": 0.189453125, + "learning_rate": 0.0013216004643255444, + "loss": 0.1494, + "step": 40978 + }, + { + "epoch": 0.3557173982864732, + "grad_norm": 0.11279296875, + "learning_rate": 0.0013215716180363737, + "loss": 0.0737, + "step": 40979 + }, + { + "epoch": 0.3557260787666774, + "grad_norm": 0.1337890625, + "learning_rate": 0.0013215427715048958, + "loss": 0.1162, + "step": 40980 + }, + { + "epoch": 0.3557347592468815, + "grad_norm": 0.69921875, + "learning_rate": 0.0013215139247311416, + "loss": 0.1064, + "step": 40981 + }, + { + "epoch": 0.3557434397270857, + "grad_norm": 0.166015625, + "learning_rate": 0.0013214850777151434, + "loss": 0.1167, + "step": 40982 + }, + { + "epoch": 0.35575212020728986, + "grad_norm": 0.09033203125, + "learning_rate": 0.0013214562304569323, + "loss": 0.0864, + "step": 40983 + }, + { + "epoch": 0.35576080068749405, + "grad_norm": 0.1953125, + "learning_rate": 0.0013214273829565404, + "loss": 0.0854, + "step": 40984 + }, + { + "epoch": 0.3557694811676982, + "grad_norm": 0.134765625, + "learning_rate": 0.0013213985352139987, + "loss": 0.0811, + "step": 40985 + }, + { + "epoch": 0.3557781616479024, + "grad_norm": 0.486328125, + "learning_rate": 0.0013213696872293385, + "loss": 0.0845, + "step": 40986 + }, + { + "epoch": 0.3557868421281065, + "grad_norm": 1.0546875, + "learning_rate": 0.0013213408390025923, + "loss": 0.1367, + "step": 40987 + }, + { + "epoch": 0.3557955226083107, + "grad_norm": 0.09326171875, + "learning_rate": 0.001321311990533791, + "loss": 0.0918, + "step": 40988 + }, + { + "epoch": 0.35580420308851485, + "grad_norm": 0.3203125, + "learning_rate": 0.0013212831418229656, + "loss": 0.1016, + "step": 40989 + }, + { + "epoch": 0.35581288356871904, + "grad_norm": 0.255859375, + "learning_rate": 0.0013212542928701485, + "loss": 0.0957, + "step": 40990 + }, + 
{ + "epoch": 0.3558215640489232, + "grad_norm": 0.2578125, + "learning_rate": 0.0013212254436753713, + "loss": 0.123, + "step": 40991 + }, + { + "epoch": 0.35583024452912737, + "grad_norm": 0.41015625, + "learning_rate": 0.0013211965942386654, + "loss": 0.1426, + "step": 40992 + }, + { + "epoch": 0.3558389250093315, + "grad_norm": 0.2294921875, + "learning_rate": 0.0013211677445600619, + "loss": 0.1196, + "step": 40993 + }, + { + "epoch": 0.3558476054895357, + "grad_norm": 0.4296875, + "learning_rate": 0.0013211388946395928, + "loss": 0.126, + "step": 40994 + }, + { + "epoch": 0.35585628596973984, + "grad_norm": 0.384765625, + "learning_rate": 0.0013211100444772896, + "loss": 0.167, + "step": 40995 + }, + { + "epoch": 0.35586496644994403, + "grad_norm": 0.25, + "learning_rate": 0.001321081194073184, + "loss": 0.1104, + "step": 40996 + }, + { + "epoch": 0.35587364693014817, + "grad_norm": 0.3828125, + "learning_rate": 0.0013210523434273069, + "loss": 0.0898, + "step": 40997 + }, + { + "epoch": 0.35588232741035236, + "grad_norm": 0.341796875, + "learning_rate": 0.0013210234925396904, + "loss": 0.0859, + "step": 40998 + }, + { + "epoch": 0.3558910078905565, + "grad_norm": 0.39453125, + "learning_rate": 0.0013209946414103658, + "loss": 0.1094, + "step": 40999 + }, + { + "epoch": 0.3558996883707607, + "grad_norm": 0.111328125, + "learning_rate": 0.001320965790039365, + "loss": 0.1025, + "step": 41000 + }, + { + "epoch": 0.35590836885096483, + "grad_norm": 0.48828125, + "learning_rate": 0.0013209369384267193, + "loss": 0.1299, + "step": 41001 + }, + { + "epoch": 0.355917049331169, + "grad_norm": 0.09765625, + "learning_rate": 0.0013209080865724601, + "loss": 0.0688, + "step": 41002 + }, + { + "epoch": 0.35592572981137316, + "grad_norm": 0.1201171875, + "learning_rate": 0.0013208792344766194, + "loss": 0.0938, + "step": 41003 + }, + { + "epoch": 0.35593441029157735, + "grad_norm": 0.251953125, + "learning_rate": 0.0013208503821392284, + "loss": 0.127, + "step": 41004 + }, + { + "epoch": 0.3559430907717815, + "grad_norm": 0.1689453125, + "learning_rate": 0.0013208215295603188, + "loss": 0.1289, + "step": 41005 + }, + { + "epoch": 0.3559517712519857, + "grad_norm": 0.28515625, + "learning_rate": 0.0013207926767399218, + "loss": 0.1128, + "step": 41006 + }, + { + "epoch": 0.3559604517321898, + "grad_norm": 0.12158203125, + "learning_rate": 0.0013207638236780697, + "loss": 0.084, + "step": 41007 + }, + { + "epoch": 0.355969132212394, + "grad_norm": 0.2177734375, + "learning_rate": 0.0013207349703747934, + "loss": 0.1011, + "step": 41008 + }, + { + "epoch": 0.35597781269259815, + "grad_norm": 0.076171875, + "learning_rate": 0.0013207061168301244, + "loss": 0.0952, + "step": 41009 + }, + { + "epoch": 0.35598649317280234, + "grad_norm": 0.4296875, + "learning_rate": 0.0013206772630440947, + "loss": 0.1143, + "step": 41010 + }, + { + "epoch": 0.3559951736530065, + "grad_norm": 0.251953125, + "learning_rate": 0.0013206484090167356, + "loss": 0.0996, + "step": 41011 + }, + { + "epoch": 0.3560038541332107, + "grad_norm": 0.255859375, + "learning_rate": 0.001320619554748079, + "loss": 0.1162, + "step": 41012 + }, + { + "epoch": 0.3560125346134148, + "grad_norm": 0.46484375, + "learning_rate": 0.0013205907002381558, + "loss": 0.1152, + "step": 41013 + }, + { + "epoch": 0.356021215093619, + "grad_norm": 0.80859375, + "learning_rate": 0.0013205618454869984, + "loss": 0.1055, + "step": 41014 + }, + { + "epoch": 0.35602989557382314, + "grad_norm": 0.2041015625, + "learning_rate": 0.0013205329904946372, + "loss": 
0.083, + "step": 41015 + }, + { + "epoch": 0.35603857605402733, + "grad_norm": 0.2890625, + "learning_rate": 0.001320504135261105, + "loss": 0.0781, + "step": 41016 + }, + { + "epoch": 0.35604725653423147, + "grad_norm": 0.25, + "learning_rate": 0.0013204752797864325, + "loss": 0.1328, + "step": 41017 + }, + { + "epoch": 0.35605593701443566, + "grad_norm": 0.19140625, + "learning_rate": 0.0013204464240706515, + "loss": 0.0918, + "step": 41018 + }, + { + "epoch": 0.3560646174946398, + "grad_norm": 0.890625, + "learning_rate": 0.0013204175681137938, + "loss": 0.0884, + "step": 41019 + }, + { + "epoch": 0.356073297974844, + "grad_norm": 0.384765625, + "learning_rate": 0.0013203887119158906, + "loss": 0.1211, + "step": 41020 + }, + { + "epoch": 0.35608197845504813, + "grad_norm": 0.55859375, + "learning_rate": 0.0013203598554769739, + "loss": 0.1211, + "step": 41021 + }, + { + "epoch": 0.3560906589352523, + "grad_norm": 0.11328125, + "learning_rate": 0.0013203309987970748, + "loss": 0.1162, + "step": 41022 + }, + { + "epoch": 0.35609933941545646, + "grad_norm": 1.1640625, + "learning_rate": 0.001320302141876225, + "loss": 0.1416, + "step": 41023 + }, + { + "epoch": 0.35610801989566065, + "grad_norm": 0.0810546875, + "learning_rate": 0.001320273284714456, + "loss": 0.082, + "step": 41024 + }, + { + "epoch": 0.3561167003758648, + "grad_norm": 0.5234375, + "learning_rate": 0.0013202444273117998, + "loss": 0.1147, + "step": 41025 + }, + { + "epoch": 0.356125380856069, + "grad_norm": 0.1220703125, + "learning_rate": 0.0013202155696682874, + "loss": 0.1055, + "step": 41026 + }, + { + "epoch": 0.3561340613362731, + "grad_norm": 0.11083984375, + "learning_rate": 0.0013201867117839502, + "loss": 0.1211, + "step": 41027 + }, + { + "epoch": 0.3561427418164773, + "grad_norm": 0.27734375, + "learning_rate": 0.001320157853658821, + "loss": 0.0854, + "step": 41028 + }, + { + "epoch": 0.35615142229668145, + "grad_norm": 0.14453125, + "learning_rate": 0.0013201289952929295, + "loss": 0.1035, + "step": 41029 + }, + { + "epoch": 0.3561601027768856, + "grad_norm": 0.365234375, + "learning_rate": 0.0013201001366863086, + "loss": 0.0898, + "step": 41030 + }, + { + "epoch": 0.3561687832570898, + "grad_norm": 0.1357421875, + "learning_rate": 0.0013200712778389896, + "loss": 0.084, + "step": 41031 + }, + { + "epoch": 0.3561774637372939, + "grad_norm": 0.6171875, + "learning_rate": 0.001320042418751004, + "loss": 0.0996, + "step": 41032 + }, + { + "epoch": 0.3561861442174981, + "grad_norm": 0.78125, + "learning_rate": 0.0013200135594223833, + "loss": 0.0898, + "step": 41033 + }, + { + "epoch": 0.35619482469770225, + "grad_norm": 0.1484375, + "learning_rate": 0.001319984699853159, + "loss": 0.0811, + "step": 41034 + }, + { + "epoch": 0.35620350517790644, + "grad_norm": 0.158203125, + "learning_rate": 0.0013199558400433626, + "loss": 0.0938, + "step": 41035 + }, + { + "epoch": 0.3562121856581106, + "grad_norm": 0.515625, + "learning_rate": 0.0013199269799930259, + "loss": 0.1099, + "step": 41036 + }, + { + "epoch": 0.3562208661383148, + "grad_norm": 0.25390625, + "learning_rate": 0.0013198981197021804, + "loss": 0.1113, + "step": 41037 + }, + { + "epoch": 0.3562295466185189, + "grad_norm": 0.32421875, + "learning_rate": 0.0013198692591708577, + "loss": 0.1211, + "step": 41038 + }, + { + "epoch": 0.3562382270987231, + "grad_norm": 0.69140625, + "learning_rate": 0.0013198403983990892, + "loss": 0.082, + "step": 41039 + }, + { + "epoch": 0.35624690757892724, + "grad_norm": 0.50390625, + "learning_rate": 
0.0013198115373869066, + "loss": 0.1104, + "step": 41040 + }, + { + "epoch": 0.35625558805913143, + "grad_norm": 0.451171875, + "learning_rate": 0.0013197826761343413, + "loss": 0.1182, + "step": 41041 + }, + { + "epoch": 0.35626426853933557, + "grad_norm": 0.08935546875, + "learning_rate": 0.001319753814641425, + "loss": 0.1001, + "step": 41042 + }, + { + "epoch": 0.35627294901953976, + "grad_norm": 0.396484375, + "learning_rate": 0.0013197249529081892, + "loss": 0.0854, + "step": 41043 + }, + { + "epoch": 0.3562816294997439, + "grad_norm": 0.09912109375, + "learning_rate": 0.0013196960909346656, + "loss": 0.1074, + "step": 41044 + }, + { + "epoch": 0.3562903099799481, + "grad_norm": 0.490234375, + "learning_rate": 0.0013196672287208856, + "loss": 0.1455, + "step": 41045 + }, + { + "epoch": 0.35629899046015223, + "grad_norm": 0.1044921875, + "learning_rate": 0.0013196383662668807, + "loss": 0.0991, + "step": 41046 + }, + { + "epoch": 0.3563076709403564, + "grad_norm": 0.1669921875, + "learning_rate": 0.0013196095035726827, + "loss": 0.0859, + "step": 41047 + }, + { + "epoch": 0.35631635142056056, + "grad_norm": 0.259765625, + "learning_rate": 0.001319580640638323, + "loss": 0.1084, + "step": 41048 + }, + { + "epoch": 0.35632503190076475, + "grad_norm": 0.173828125, + "learning_rate": 0.0013195517774638333, + "loss": 0.0801, + "step": 41049 + }, + { + "epoch": 0.3563337123809689, + "grad_norm": 0.40625, + "learning_rate": 0.001319522914049245, + "loss": 0.0889, + "step": 41050 + }, + { + "epoch": 0.3563423928611731, + "grad_norm": 0.203125, + "learning_rate": 0.0013194940503945898, + "loss": 0.0942, + "step": 41051 + }, + { + "epoch": 0.3563510733413772, + "grad_norm": 0.48046875, + "learning_rate": 0.0013194651864998992, + "loss": 0.1016, + "step": 41052 + }, + { + "epoch": 0.3563597538215814, + "grad_norm": 0.228515625, + "learning_rate": 0.0013194363223652048, + "loss": 0.1064, + "step": 41053 + }, + { + "epoch": 0.35636843430178555, + "grad_norm": 0.19921875, + "learning_rate": 0.001319407457990538, + "loss": 0.0996, + "step": 41054 + }, + { + "epoch": 0.35637711478198975, + "grad_norm": 0.69921875, + "learning_rate": 0.0013193785933759306, + "loss": 0.1162, + "step": 41055 + }, + { + "epoch": 0.3563857952621939, + "grad_norm": 0.224609375, + "learning_rate": 0.0013193497285214142, + "loss": 0.1172, + "step": 41056 + }, + { + "epoch": 0.3563944757423981, + "grad_norm": 0.8828125, + "learning_rate": 0.00131932086342702, + "loss": 0.1104, + "step": 41057 + }, + { + "epoch": 0.3564031562226022, + "grad_norm": 0.251953125, + "learning_rate": 0.0013192919980927796, + "loss": 0.1084, + "step": 41058 + }, + { + "epoch": 0.3564118367028064, + "grad_norm": 0.14453125, + "learning_rate": 0.001319263132518725, + "loss": 0.0996, + "step": 41059 + }, + { + "epoch": 0.35642051718301054, + "grad_norm": 0.193359375, + "learning_rate": 0.0013192342667048879, + "loss": 0.0898, + "step": 41060 + }, + { + "epoch": 0.35642919766321474, + "grad_norm": 0.427734375, + "learning_rate": 0.001319205400651299, + "loss": 0.1348, + "step": 41061 + }, + { + "epoch": 0.3564378781434189, + "grad_norm": 0.10009765625, + "learning_rate": 0.0013191765343579904, + "loss": 0.1396, + "step": 41062 + }, + { + "epoch": 0.35644655862362307, + "grad_norm": 0.318359375, + "learning_rate": 0.0013191476678249938, + "loss": 0.0947, + "step": 41063 + }, + { + "epoch": 0.3564552391038272, + "grad_norm": 0.478515625, + "learning_rate": 0.0013191188010523408, + "loss": 0.1245, + "step": 41064 + }, + { + "epoch": 0.3564639195840314, + 
"grad_norm": 0.2216796875, + "learning_rate": 0.0013190899340400626, + "loss": 0.0923, + "step": 41065 + }, + { + "epoch": 0.35647260006423553, + "grad_norm": 0.2392578125, + "learning_rate": 0.0013190610667881907, + "loss": 0.0898, + "step": 41066 + }, + { + "epoch": 0.3564812805444397, + "grad_norm": 0.08984375, + "learning_rate": 0.001319032199296757, + "loss": 0.0688, + "step": 41067 + }, + { + "epoch": 0.35648996102464386, + "grad_norm": 0.16796875, + "learning_rate": 0.0013190033315657933, + "loss": 0.1328, + "step": 41068 + }, + { + "epoch": 0.35649864150484806, + "grad_norm": 0.1962890625, + "learning_rate": 0.0013189744635953306, + "loss": 0.0796, + "step": 41069 + }, + { + "epoch": 0.3565073219850522, + "grad_norm": 0.54296875, + "learning_rate": 0.0013189455953854007, + "loss": 0.1016, + "step": 41070 + }, + { + "epoch": 0.3565160024652564, + "grad_norm": 0.306640625, + "learning_rate": 0.001318916726936035, + "loss": 0.0835, + "step": 41071 + }, + { + "epoch": 0.3565246829454605, + "grad_norm": 0.66015625, + "learning_rate": 0.0013188878582472657, + "loss": 0.0776, + "step": 41072 + }, + { + "epoch": 0.3565333634256647, + "grad_norm": 0.3046875, + "learning_rate": 0.0013188589893191238, + "loss": 0.1099, + "step": 41073 + }, + { + "epoch": 0.35654204390586886, + "grad_norm": 0.2890625, + "learning_rate": 0.001318830120151641, + "loss": 0.1143, + "step": 41074 + }, + { + "epoch": 0.35655072438607305, + "grad_norm": 0.087890625, + "learning_rate": 0.0013188012507448486, + "loss": 0.0947, + "step": 41075 + }, + { + "epoch": 0.3565594048662772, + "grad_norm": 0.26953125, + "learning_rate": 0.0013187723810987786, + "loss": 0.1074, + "step": 41076 + }, + { + "epoch": 0.3565680853464814, + "grad_norm": 0.453125, + "learning_rate": 0.0013187435112134622, + "loss": 0.0996, + "step": 41077 + }, + { + "epoch": 0.3565767658266855, + "grad_norm": 0.3203125, + "learning_rate": 0.0013187146410889314, + "loss": 0.1543, + "step": 41078 + }, + { + "epoch": 0.3565854463068897, + "grad_norm": 0.1533203125, + "learning_rate": 0.0013186857707252178, + "loss": 0.1064, + "step": 41079 + }, + { + "epoch": 0.35659412678709385, + "grad_norm": 0.12890625, + "learning_rate": 0.0013186569001223526, + "loss": 0.0835, + "step": 41080 + }, + { + "epoch": 0.35660280726729804, + "grad_norm": 0.65625, + "learning_rate": 0.0013186280292803673, + "loss": 0.1719, + "step": 41081 + }, + { + "epoch": 0.3566114877475022, + "grad_norm": 0.166015625, + "learning_rate": 0.0013185991581992937, + "loss": 0.0962, + "step": 41082 + }, + { + "epoch": 0.35662016822770637, + "grad_norm": 0.66015625, + "learning_rate": 0.0013185702868791637, + "loss": 0.1172, + "step": 41083 + }, + { + "epoch": 0.3566288487079105, + "grad_norm": 0.55859375, + "learning_rate": 0.001318541415320008, + "loss": 0.0933, + "step": 41084 + }, + { + "epoch": 0.3566375291881147, + "grad_norm": 0.423828125, + "learning_rate": 0.001318512543521859, + "loss": 0.085, + "step": 41085 + }, + { + "epoch": 0.35664620966831884, + "grad_norm": 0.59375, + "learning_rate": 0.0013184836714847476, + "loss": 0.126, + "step": 41086 + }, + { + "epoch": 0.35665489014852303, + "grad_norm": 0.388671875, + "learning_rate": 0.001318454799208706, + "loss": 0.1113, + "step": 41087 + }, + { + "epoch": 0.35666357062872717, + "grad_norm": 0.142578125, + "learning_rate": 0.0013184259266937658, + "loss": 0.1143, + "step": 41088 + }, + { + "epoch": 0.35667225110893136, + "grad_norm": 0.2734375, + "learning_rate": 0.001318397053939958, + "loss": 0.083, + "step": 41089 + }, + { + 
"epoch": 0.3566809315891355, + "grad_norm": 0.11474609375, + "learning_rate": 0.0013183681809473145, + "loss": 0.0967, + "step": 41090 + }, + { + "epoch": 0.3566896120693397, + "grad_norm": 0.318359375, + "learning_rate": 0.0013183393077158666, + "loss": 0.1157, + "step": 41091 + }, + { + "epoch": 0.3566982925495438, + "grad_norm": 0.369140625, + "learning_rate": 0.0013183104342456467, + "loss": 0.083, + "step": 41092 + }, + { + "epoch": 0.356706973029748, + "grad_norm": 0.21484375, + "learning_rate": 0.0013182815605366851, + "loss": 0.123, + "step": 41093 + }, + { + "epoch": 0.35671565350995216, + "grad_norm": 0.2158203125, + "learning_rate": 0.0013182526865890146, + "loss": 0.0854, + "step": 41094 + }, + { + "epoch": 0.35672433399015635, + "grad_norm": 0.51953125, + "learning_rate": 0.001318223812402666, + "loss": 0.1143, + "step": 41095 + }, + { + "epoch": 0.3567330144703605, + "grad_norm": 0.12451171875, + "learning_rate": 0.0013181949379776712, + "loss": 0.1055, + "step": 41096 + }, + { + "epoch": 0.3567416949505647, + "grad_norm": 0.48046875, + "learning_rate": 0.0013181660633140612, + "loss": 0.0874, + "step": 41097 + }, + { + "epoch": 0.3567503754307688, + "grad_norm": 0.080078125, + "learning_rate": 0.0013181371884118686, + "loss": 0.1025, + "step": 41098 + }, + { + "epoch": 0.356759055910973, + "grad_norm": 0.2216796875, + "learning_rate": 0.0013181083132711247, + "loss": 0.0972, + "step": 41099 + }, + { + "epoch": 0.35676773639117715, + "grad_norm": 0.251953125, + "learning_rate": 0.0013180794378918604, + "loss": 0.0908, + "step": 41100 + }, + { + "epoch": 0.35677641687138134, + "grad_norm": 0.67578125, + "learning_rate": 0.0013180505622741078, + "loss": 0.1074, + "step": 41101 + }, + { + "epoch": 0.3567850973515855, + "grad_norm": 0.3046875, + "learning_rate": 0.0013180216864178984, + "loss": 0.0835, + "step": 41102 + }, + { + "epoch": 0.35679377783178967, + "grad_norm": 0.169921875, + "learning_rate": 0.0013179928103232636, + "loss": 0.0586, + "step": 41103 + }, + { + "epoch": 0.3568024583119938, + "grad_norm": 0.1240234375, + "learning_rate": 0.0013179639339902353, + "loss": 0.1104, + "step": 41104 + }, + { + "epoch": 0.356811138792198, + "grad_norm": 0.337890625, + "learning_rate": 0.001317935057418845, + "loss": 0.0781, + "step": 41105 + }, + { + "epoch": 0.35681981927240214, + "grad_norm": 0.291015625, + "learning_rate": 0.001317906180609124, + "loss": 0.1113, + "step": 41106 + }, + { + "epoch": 0.35682849975260633, + "grad_norm": 0.2109375, + "learning_rate": 0.0013178773035611043, + "loss": 0.1318, + "step": 41107 + }, + { + "epoch": 0.35683718023281047, + "grad_norm": 0.2099609375, + "learning_rate": 0.001317848426274817, + "loss": 0.0869, + "step": 41108 + }, + { + "epoch": 0.35684586071301466, + "grad_norm": 0.07763671875, + "learning_rate": 0.0013178195487502941, + "loss": 0.0835, + "step": 41109 + }, + { + "epoch": 0.3568545411932188, + "grad_norm": 0.54296875, + "learning_rate": 0.0013177906709875668, + "loss": 0.1328, + "step": 41110 + }, + { + "epoch": 0.356863221673423, + "grad_norm": 0.51171875, + "learning_rate": 0.0013177617929866672, + "loss": 0.0918, + "step": 41111 + }, + { + "epoch": 0.35687190215362713, + "grad_norm": 0.10302734375, + "learning_rate": 0.0013177329147476267, + "loss": 0.0903, + "step": 41112 + }, + { + "epoch": 0.3568805826338313, + "grad_norm": 0.73828125, + "learning_rate": 0.0013177040362704765, + "loss": 0.0889, + "step": 41113 + }, + { + "epoch": 0.35688926311403546, + "grad_norm": 0.1640625, + "learning_rate": 0.001317675157555248, 
+ "loss": 0.0908, + "step": 41114 + }, + { + "epoch": 0.35689794359423965, + "grad_norm": 0.318359375, + "learning_rate": 0.0013176462786019735, + "loss": 0.1064, + "step": 41115 + }, + { + "epoch": 0.3569066240744438, + "grad_norm": 0.291015625, + "learning_rate": 0.0013176173994106847, + "loss": 0.0986, + "step": 41116 + }, + { + "epoch": 0.356915304554648, + "grad_norm": 0.265625, + "learning_rate": 0.0013175885199814127, + "loss": 0.1064, + "step": 41117 + }, + { + "epoch": 0.3569239850348521, + "grad_norm": 0.10546875, + "learning_rate": 0.0013175596403141889, + "loss": 0.0693, + "step": 41118 + }, + { + "epoch": 0.3569326655150563, + "grad_norm": 0.09423828125, + "learning_rate": 0.0013175307604090451, + "loss": 0.1104, + "step": 41119 + }, + { + "epoch": 0.35694134599526045, + "grad_norm": 0.1689453125, + "learning_rate": 0.001317501880266013, + "loss": 0.0889, + "step": 41120 + }, + { + "epoch": 0.35695002647546464, + "grad_norm": 0.16015625, + "learning_rate": 0.0013174729998851242, + "loss": 0.0942, + "step": 41121 + }, + { + "epoch": 0.3569587069556688, + "grad_norm": 0.1474609375, + "learning_rate": 0.0013174441192664104, + "loss": 0.4062, + "step": 41122 + }, + { + "epoch": 0.356967387435873, + "grad_norm": 0.134765625, + "learning_rate": 0.0013174152384099027, + "loss": 0.0942, + "step": 41123 + }, + { + "epoch": 0.3569760679160771, + "grad_norm": 0.240234375, + "learning_rate": 0.0013173863573156328, + "loss": 0.1074, + "step": 41124 + }, + { + "epoch": 0.3569847483962813, + "grad_norm": 0.12890625, + "learning_rate": 0.0013173574759836328, + "loss": 0.0762, + "step": 41125 + }, + { + "epoch": 0.35699342887648544, + "grad_norm": 0.09375, + "learning_rate": 0.0013173285944139336, + "loss": 0.0972, + "step": 41126 + }, + { + "epoch": 0.35700210935668963, + "grad_norm": 0.1318359375, + "learning_rate": 0.0013172997126065676, + "loss": 0.1465, + "step": 41127 + }, + { + "epoch": 0.35701078983689377, + "grad_norm": 0.296875, + "learning_rate": 0.0013172708305615654, + "loss": 0.165, + "step": 41128 + }, + { + "epoch": 0.35701947031709796, + "grad_norm": 0.376953125, + "learning_rate": 0.0013172419482789595, + "loss": 0.0986, + "step": 41129 + }, + { + "epoch": 0.3570281507973021, + "grad_norm": 0.23828125, + "learning_rate": 0.0013172130657587808, + "loss": 0.1348, + "step": 41130 + }, + { + "epoch": 0.3570368312775063, + "grad_norm": 0.244140625, + "learning_rate": 0.001317184183001061, + "loss": 0.105, + "step": 41131 + }, + { + "epoch": 0.35704551175771043, + "grad_norm": 0.283203125, + "learning_rate": 0.0013171553000058322, + "loss": 0.0972, + "step": 41132 + }, + { + "epoch": 0.3570541922379146, + "grad_norm": 0.380859375, + "learning_rate": 0.0013171264167731256, + "loss": 0.1025, + "step": 41133 + }, + { + "epoch": 0.35706287271811876, + "grad_norm": 0.7734375, + "learning_rate": 0.0013170975333029727, + "loss": 0.3789, + "step": 41134 + }, + { + "epoch": 0.35707155319832296, + "grad_norm": 0.3125, + "learning_rate": 0.0013170686495954053, + "loss": 0.125, + "step": 41135 + }, + { + "epoch": 0.3570802336785271, + "grad_norm": 0.19140625, + "learning_rate": 0.0013170397656504544, + "loss": 0.0869, + "step": 41136 + }, + { + "epoch": 0.3570889141587313, + "grad_norm": 0.41015625, + "learning_rate": 0.0013170108814681526, + "loss": 0.127, + "step": 41137 + }, + { + "epoch": 0.3570975946389354, + "grad_norm": 0.37109375, + "learning_rate": 0.0013169819970485308, + "loss": 0.1006, + "step": 41138 + }, + { + "epoch": 0.3571062751191396, + "grad_norm": 0.1513671875, + 
"learning_rate": 0.001316953112391621, + "loss": 0.0908, + "step": 41139 + }, + { + "epoch": 0.35711495559934375, + "grad_norm": 0.0966796875, + "learning_rate": 0.001316924227497454, + "loss": 0.0723, + "step": 41140 + }, + { + "epoch": 0.35712363607954795, + "grad_norm": 0.11962890625, + "learning_rate": 0.0013168953423660624, + "loss": 0.1035, + "step": 41141 + }, + { + "epoch": 0.3571323165597521, + "grad_norm": 0.130859375, + "learning_rate": 0.001316866456997477, + "loss": 0.1094, + "step": 41142 + }, + { + "epoch": 0.3571409970399563, + "grad_norm": 0.1298828125, + "learning_rate": 0.0013168375713917297, + "loss": 0.1494, + "step": 41143 + }, + { + "epoch": 0.3571496775201604, + "grad_norm": 0.15625, + "learning_rate": 0.001316808685548852, + "loss": 0.0898, + "step": 41144 + }, + { + "epoch": 0.3571583580003646, + "grad_norm": 0.158203125, + "learning_rate": 0.001316779799468876, + "loss": 0.126, + "step": 41145 + }, + { + "epoch": 0.35716703848056874, + "grad_norm": 0.15234375, + "learning_rate": 0.001316750913151833, + "loss": 0.1162, + "step": 41146 + }, + { + "epoch": 0.35717571896077294, + "grad_norm": 0.08984375, + "learning_rate": 0.001316722026597754, + "loss": 0.0786, + "step": 41147 + }, + { + "epoch": 0.3571843994409771, + "grad_norm": 0.61328125, + "learning_rate": 0.0013166931398066713, + "loss": 0.0884, + "step": 41148 + }, + { + "epoch": 0.35719307992118127, + "grad_norm": 0.6640625, + "learning_rate": 0.001316664252778616, + "loss": 0.1064, + "step": 41149 + }, + { + "epoch": 0.3572017604013854, + "grad_norm": 0.2373046875, + "learning_rate": 0.0013166353655136198, + "loss": 0.0928, + "step": 41150 + }, + { + "epoch": 0.3572104408815896, + "grad_norm": 0.921875, + "learning_rate": 0.0013166064780117152, + "loss": 0.2695, + "step": 41151 + }, + { + "epoch": 0.35721912136179373, + "grad_norm": 0.7421875, + "learning_rate": 0.0013165775902729323, + "loss": 0.0845, + "step": 41152 + }, + { + "epoch": 0.35722780184199787, + "grad_norm": 0.330078125, + "learning_rate": 0.0013165487022973033, + "loss": 0.0869, + "step": 41153 + }, + { + "epoch": 0.35723648232220206, + "grad_norm": 0.2333984375, + "learning_rate": 0.0013165198140848602, + "loss": 0.1348, + "step": 41154 + }, + { + "epoch": 0.3572451628024062, + "grad_norm": 0.09033203125, + "learning_rate": 0.0013164909256356344, + "loss": 0.082, + "step": 41155 + }, + { + "epoch": 0.3572538432826104, + "grad_norm": 0.302734375, + "learning_rate": 0.001316462036949657, + "loss": 0.0688, + "step": 41156 + }, + { + "epoch": 0.35726252376281453, + "grad_norm": 0.5546875, + "learning_rate": 0.0013164331480269601, + "loss": 0.0684, + "step": 41157 + }, + { + "epoch": 0.3572712042430187, + "grad_norm": 0.11376953125, + "learning_rate": 0.0013164042588675756, + "loss": 0.1113, + "step": 41158 + }, + { + "epoch": 0.35727988472322286, + "grad_norm": 0.1318359375, + "learning_rate": 0.0013163753694715342, + "loss": 0.0928, + "step": 41159 + }, + { + "epoch": 0.35728856520342706, + "grad_norm": 0.107421875, + "learning_rate": 0.001316346479838868, + "loss": 0.0908, + "step": 41160 + }, + { + "epoch": 0.3572972456836312, + "grad_norm": 0.458984375, + "learning_rate": 0.001316317589969609, + "loss": 0.1177, + "step": 41161 + }, + { + "epoch": 0.3573059261638354, + "grad_norm": 1.6953125, + "learning_rate": 0.0013162886998637875, + "loss": 0.1582, + "step": 41162 + }, + { + "epoch": 0.3573146066440395, + "grad_norm": 0.220703125, + "learning_rate": 0.0013162598095214364, + "loss": 0.124, + "step": 41163 + }, + { + "epoch": 
0.3573232871242437, + "grad_norm": 0.2890625, + "learning_rate": 0.0013162309189425868, + "loss": 0.1279, + "step": 41164 + }, + { + "epoch": 0.35733196760444785, + "grad_norm": 0.8515625, + "learning_rate": 0.00131620202812727, + "loss": 0.1143, + "step": 41165 + }, + { + "epoch": 0.35734064808465205, + "grad_norm": 0.09619140625, + "learning_rate": 0.0013161731370755186, + "loss": 0.1064, + "step": 41166 + }, + { + "epoch": 0.3573493285648562, + "grad_norm": 0.609375, + "learning_rate": 0.001316144245787363, + "loss": 0.1436, + "step": 41167 + }, + { + "epoch": 0.3573580090450604, + "grad_norm": 0.365234375, + "learning_rate": 0.0013161153542628355, + "loss": 0.082, + "step": 41168 + }, + { + "epoch": 0.3573666895252645, + "grad_norm": 0.66015625, + "learning_rate": 0.0013160864625019674, + "loss": 0.0947, + "step": 41169 + }, + { + "epoch": 0.3573753700054687, + "grad_norm": 0.0693359375, + "learning_rate": 0.0013160575705047904, + "loss": 0.0898, + "step": 41170 + }, + { + "epoch": 0.35738405048567284, + "grad_norm": 0.326171875, + "learning_rate": 0.001316028678271336, + "loss": 0.1309, + "step": 41171 + }, + { + "epoch": 0.35739273096587704, + "grad_norm": 0.15625, + "learning_rate": 0.001315999785801636, + "loss": 0.124, + "step": 41172 + }, + { + "epoch": 0.3574014114460812, + "grad_norm": 0.14453125, + "learning_rate": 0.0013159708930957218, + "loss": 0.103, + "step": 41173 + }, + { + "epoch": 0.35741009192628537, + "grad_norm": 0.373046875, + "learning_rate": 0.001315942000153625, + "loss": 0.1113, + "step": 41174 + }, + { + "epoch": 0.3574187724064895, + "grad_norm": 0.173828125, + "learning_rate": 0.0013159131069753773, + "loss": 0.1318, + "step": 41175 + }, + { + "epoch": 0.3574274528866937, + "grad_norm": 0.08203125, + "learning_rate": 0.0013158842135610105, + "loss": 0.082, + "step": 41176 + }, + { + "epoch": 0.35743613336689783, + "grad_norm": 0.1171875, + "learning_rate": 0.001315855319910556, + "loss": 0.1055, + "step": 41177 + }, + { + "epoch": 0.357444813847102, + "grad_norm": 0.1318359375, + "learning_rate": 0.0013158264260240454, + "loss": 0.1289, + "step": 41178 + }, + { + "epoch": 0.35745349432730616, + "grad_norm": 0.30859375, + "learning_rate": 0.0013157975319015097, + "loss": 0.125, + "step": 41179 + }, + { + "epoch": 0.35746217480751036, + "grad_norm": 0.3671875, + "learning_rate": 0.0013157686375429817, + "loss": 0.1377, + "step": 41180 + }, + { + "epoch": 0.3574708552877145, + "grad_norm": 0.11279296875, + "learning_rate": 0.0013157397429484918, + "loss": 0.1011, + "step": 41181 + }, + { + "epoch": 0.3574795357679187, + "grad_norm": 0.1279296875, + "learning_rate": 0.0013157108481180724, + "loss": 0.084, + "step": 41182 + }, + { + "epoch": 0.3574882162481228, + "grad_norm": 2.15625, + "learning_rate": 0.0013156819530517547, + "loss": 0.3945, + "step": 41183 + }, + { + "epoch": 0.357496896728327, + "grad_norm": 0.138671875, + "learning_rate": 0.0013156530577495711, + "loss": 0.1328, + "step": 41184 + }, + { + "epoch": 0.35750557720853116, + "grad_norm": 0.43359375, + "learning_rate": 0.001315624162211552, + "loss": 0.1377, + "step": 41185 + }, + { + "epoch": 0.35751425768873535, + "grad_norm": 0.416015625, + "learning_rate": 0.0013155952664377297, + "loss": 0.0996, + "step": 41186 + }, + { + "epoch": 0.3575229381689395, + "grad_norm": 0.6796875, + "learning_rate": 0.0013155663704281359, + "loss": 0.0874, + "step": 41187 + }, + { + "epoch": 0.3575316186491437, + "grad_norm": 0.357421875, + "learning_rate": 0.0013155374741828014, + "loss": 0.1406, + "step": 41188 
+ }, + { + "epoch": 0.3575402991293478, + "grad_norm": 0.439453125, + "learning_rate": 0.0013155085777017586, + "loss": 0.1021, + "step": 41189 + }, + { + "epoch": 0.357548979609552, + "grad_norm": 0.6953125, + "learning_rate": 0.001315479680985039, + "loss": 0.2051, + "step": 41190 + }, + { + "epoch": 0.35755766008975615, + "grad_norm": 0.255859375, + "learning_rate": 0.001315450784032674, + "loss": 0.0601, + "step": 41191 + }, + { + "epoch": 0.35756634056996034, + "grad_norm": 0.1962890625, + "learning_rate": 0.001315421886844695, + "loss": 0.0869, + "step": 41192 + }, + { + "epoch": 0.3575750210501645, + "grad_norm": 0.1494140625, + "learning_rate": 0.001315392989421134, + "loss": 0.0864, + "step": 41193 + }, + { + "epoch": 0.35758370153036867, + "grad_norm": 0.1884765625, + "learning_rate": 0.001315364091762023, + "loss": 0.1172, + "step": 41194 + }, + { + "epoch": 0.3575923820105728, + "grad_norm": 0.1669921875, + "learning_rate": 0.0013153351938673924, + "loss": 0.082, + "step": 41195 + }, + { + "epoch": 0.357601062490777, + "grad_norm": 0.2373046875, + "learning_rate": 0.0013153062957372745, + "loss": 0.1162, + "step": 41196 + }, + { + "epoch": 0.35760974297098114, + "grad_norm": 0.345703125, + "learning_rate": 0.0013152773973717012, + "loss": 0.0654, + "step": 41197 + }, + { + "epoch": 0.35761842345118533, + "grad_norm": 0.09326171875, + "learning_rate": 0.0013152484987707035, + "loss": 0.1191, + "step": 41198 + }, + { + "epoch": 0.35762710393138947, + "grad_norm": 0.1240234375, + "learning_rate": 0.0013152195999343134, + "loss": 0.0771, + "step": 41199 + }, + { + "epoch": 0.35763578441159366, + "grad_norm": 2.609375, + "learning_rate": 0.0013151907008625626, + "loss": 0.2383, + "step": 41200 + }, + { + "epoch": 0.3576444648917978, + "grad_norm": 0.46484375, + "learning_rate": 0.0013151618015554817, + "loss": 0.1162, + "step": 41201 + }, + { + "epoch": 0.357653145372002, + "grad_norm": 0.515625, + "learning_rate": 0.001315132902013104, + "loss": 0.0986, + "step": 41202 + }, + { + "epoch": 0.35766182585220613, + "grad_norm": 0.2060546875, + "learning_rate": 0.0013151040022354597, + "loss": 0.1104, + "step": 41203 + }, + { + "epoch": 0.3576705063324103, + "grad_norm": 0.33203125, + "learning_rate": 0.0013150751022225813, + "loss": 0.082, + "step": 41204 + }, + { + "epoch": 0.35767918681261446, + "grad_norm": 0.396484375, + "learning_rate": 0.0013150462019744995, + "loss": 0.1289, + "step": 41205 + }, + { + "epoch": 0.35768786729281865, + "grad_norm": 0.150390625, + "learning_rate": 0.0013150173014912468, + "loss": 0.0864, + "step": 41206 + }, + { + "epoch": 0.3576965477730228, + "grad_norm": 0.6796875, + "learning_rate": 0.0013149884007728544, + "loss": 0.1084, + "step": 41207 + }, + { + "epoch": 0.357705228253227, + "grad_norm": 0.22265625, + "learning_rate": 0.0013149594998193538, + "loss": 0.0923, + "step": 41208 + }, + { + "epoch": 0.3577139087334311, + "grad_norm": 0.6796875, + "learning_rate": 0.0013149305986307762, + "loss": 0.125, + "step": 41209 + }, + { + "epoch": 0.3577225892136353, + "grad_norm": 0.27734375, + "learning_rate": 0.0013149016972071543, + "loss": 0.1113, + "step": 41210 + }, + { + "epoch": 0.35773126969383945, + "grad_norm": 0.349609375, + "learning_rate": 0.0013148727955485191, + "loss": 0.1328, + "step": 41211 + }, + { + "epoch": 0.35773995017404364, + "grad_norm": 0.2890625, + "learning_rate": 0.0013148438936549023, + "loss": 0.1035, + "step": 41212 + }, + { + "epoch": 0.3577486306542478, + "grad_norm": 0.345703125, + "learning_rate": 
0.0013148149915263353, + "loss": 0.1221, + "step": 41213 + }, + { + "epoch": 0.35775731113445197, + "grad_norm": 0.208984375, + "learning_rate": 0.0013147860891628497, + "loss": 0.0923, + "step": 41214 + }, + { + "epoch": 0.3577659916146561, + "grad_norm": 0.12353515625, + "learning_rate": 0.0013147571865644776, + "loss": 0.0776, + "step": 41215 + }, + { + "epoch": 0.3577746720948603, + "grad_norm": 0.62890625, + "learning_rate": 0.0013147282837312501, + "loss": 0.1191, + "step": 41216 + }, + { + "epoch": 0.35778335257506444, + "grad_norm": 0.322265625, + "learning_rate": 0.001314699380663199, + "loss": 0.1016, + "step": 41217 + }, + { + "epoch": 0.35779203305526863, + "grad_norm": 0.73828125, + "learning_rate": 0.0013146704773603557, + "loss": 0.1182, + "step": 41218 + }, + { + "epoch": 0.35780071353547277, + "grad_norm": 0.36328125, + "learning_rate": 0.0013146415738227522, + "loss": 0.0854, + "step": 41219 + }, + { + "epoch": 0.35780939401567696, + "grad_norm": 0.0966796875, + "learning_rate": 0.00131461267005042, + "loss": 0.0825, + "step": 41220 + }, + { + "epoch": 0.3578180744958811, + "grad_norm": 0.30078125, + "learning_rate": 0.0013145837660433902, + "loss": 0.1011, + "step": 41221 + }, + { + "epoch": 0.3578267549760853, + "grad_norm": 0.291015625, + "learning_rate": 0.0013145548618016952, + "loss": 0.1079, + "step": 41222 + }, + { + "epoch": 0.35783543545628943, + "grad_norm": 0.09716796875, + "learning_rate": 0.0013145259573253659, + "loss": 0.0923, + "step": 41223 + }, + { + "epoch": 0.3578441159364936, + "grad_norm": 0.16015625, + "learning_rate": 0.0013144970526144347, + "loss": 0.0801, + "step": 41224 + }, + { + "epoch": 0.35785279641669776, + "grad_norm": 0.3203125, + "learning_rate": 0.0013144681476689325, + "loss": 0.106, + "step": 41225 + }, + { + "epoch": 0.35786147689690195, + "grad_norm": 0.318359375, + "learning_rate": 0.001314439242488891, + "loss": 0.0933, + "step": 41226 + }, + { + "epoch": 0.3578701573771061, + "grad_norm": 0.15234375, + "learning_rate": 0.0013144103370743424, + "loss": 0.1309, + "step": 41227 + }, + { + "epoch": 0.3578788378573103, + "grad_norm": 0.5390625, + "learning_rate": 0.0013143814314253174, + "loss": 0.127, + "step": 41228 + }, + { + "epoch": 0.3578875183375144, + "grad_norm": 0.1103515625, + "learning_rate": 0.0013143525255418483, + "loss": 0.0996, + "step": 41229 + }, + { + "epoch": 0.3578961988177186, + "grad_norm": 0.388671875, + "learning_rate": 0.0013143236194239664, + "loss": 0.0977, + "step": 41230 + }, + { + "epoch": 0.35790487929792275, + "grad_norm": 0.0966796875, + "learning_rate": 0.0013142947130717034, + "loss": 0.1074, + "step": 41231 + }, + { + "epoch": 0.35791355977812694, + "grad_norm": 0.181640625, + "learning_rate": 0.001314265806485091, + "loss": 0.166, + "step": 41232 + }, + { + "epoch": 0.3579222402583311, + "grad_norm": 0.3515625, + "learning_rate": 0.0013142368996641608, + "loss": 0.0776, + "step": 41233 + }, + { + "epoch": 0.3579309207385353, + "grad_norm": 0.64453125, + "learning_rate": 0.0013142079926089442, + "loss": 0.0991, + "step": 41234 + }, + { + "epoch": 0.3579396012187394, + "grad_norm": 0.1923828125, + "learning_rate": 0.001314179085319473, + "loss": 0.1006, + "step": 41235 + }, + { + "epoch": 0.3579482816989436, + "grad_norm": 0.1298828125, + "learning_rate": 0.001314150177795779, + "loss": 0.1162, + "step": 41236 + }, + { + "epoch": 0.35795696217914774, + "grad_norm": 0.2109375, + "learning_rate": 0.0013141212700378932, + "loss": 0.1021, + "step": 41237 + }, + { + "epoch": 0.35796564265935193, + 
"grad_norm": 0.609375, + "learning_rate": 0.001314092362045848, + "loss": 0.1006, + "step": 41238 + }, + { + "epoch": 0.35797432313955607, + "grad_norm": 0.27734375, + "learning_rate": 0.0013140634538196742, + "loss": 0.1021, + "step": 41239 + }, + { + "epoch": 0.35798300361976026, + "grad_norm": 0.42578125, + "learning_rate": 0.001314034545359404, + "loss": 0.1064, + "step": 41240 + }, + { + "epoch": 0.3579916840999644, + "grad_norm": 0.53515625, + "learning_rate": 0.001314005636665069, + "loss": 0.0869, + "step": 41241 + }, + { + "epoch": 0.3580003645801686, + "grad_norm": 0.458984375, + "learning_rate": 0.0013139767277367004, + "loss": 0.1123, + "step": 41242 + }, + { + "epoch": 0.35800904506037273, + "grad_norm": 0.244140625, + "learning_rate": 0.0013139478185743303, + "loss": 0.1084, + "step": 41243 + }, + { + "epoch": 0.3580177255405769, + "grad_norm": 0.47265625, + "learning_rate": 0.00131391890917799, + "loss": 0.0884, + "step": 41244 + }, + { + "epoch": 0.35802640602078106, + "grad_norm": 0.38671875, + "learning_rate": 0.001313889999547711, + "loss": 0.1055, + "step": 41245 + }, + { + "epoch": 0.35803508650098526, + "grad_norm": 0.326171875, + "learning_rate": 0.0013138610896835255, + "loss": 0.082, + "step": 41246 + }, + { + "epoch": 0.3580437669811894, + "grad_norm": 0.53125, + "learning_rate": 0.0013138321795854644, + "loss": 0.1025, + "step": 41247 + }, + { + "epoch": 0.3580524474613936, + "grad_norm": 0.1640625, + "learning_rate": 0.0013138032692535597, + "loss": 0.1221, + "step": 41248 + }, + { + "epoch": 0.3580611279415977, + "grad_norm": 1.328125, + "learning_rate": 0.0013137743586878429, + "loss": 0.2695, + "step": 41249 + }, + { + "epoch": 0.3580698084218019, + "grad_norm": 0.39453125, + "learning_rate": 0.0013137454478883456, + "loss": 0.1367, + "step": 41250 + }, + { + "epoch": 0.35807848890200605, + "grad_norm": 0.1240234375, + "learning_rate": 0.0013137165368550998, + "loss": 0.0977, + "step": 41251 + }, + { + "epoch": 0.35808716938221025, + "grad_norm": 0.41796875, + "learning_rate": 0.0013136876255881364, + "loss": 0.1055, + "step": 41252 + }, + { + "epoch": 0.3580958498624144, + "grad_norm": 0.2119140625, + "learning_rate": 0.001313658714087488, + "loss": 0.0952, + "step": 41253 + }, + { + "epoch": 0.3581045303426186, + "grad_norm": 0.51171875, + "learning_rate": 0.0013136298023531854, + "loss": 0.1211, + "step": 41254 + }, + { + "epoch": 0.3581132108228227, + "grad_norm": 0.1650390625, + "learning_rate": 0.0013136008903852604, + "loss": 0.1016, + "step": 41255 + }, + { + "epoch": 0.3581218913030269, + "grad_norm": 0.11083984375, + "learning_rate": 0.0013135719781837446, + "loss": 0.0898, + "step": 41256 + }, + { + "epoch": 0.35813057178323104, + "grad_norm": 0.283203125, + "learning_rate": 0.0013135430657486694, + "loss": 0.0776, + "step": 41257 + }, + { + "epoch": 0.35813925226343524, + "grad_norm": 0.1220703125, + "learning_rate": 0.0013135141530800673, + "loss": 0.1025, + "step": 41258 + }, + { + "epoch": 0.3581479327436394, + "grad_norm": 0.25, + "learning_rate": 0.0013134852401779692, + "loss": 0.1157, + "step": 41259 + }, + { + "epoch": 0.35815661322384357, + "grad_norm": 0.07861328125, + "learning_rate": 0.0013134563270424068, + "loss": 0.0776, + "step": 41260 + }, + { + "epoch": 0.3581652937040477, + "grad_norm": 0.26953125, + "learning_rate": 0.0013134274136734115, + "loss": 0.0908, + "step": 41261 + }, + { + "epoch": 0.3581739741842519, + "grad_norm": 1.40625, + "learning_rate": 0.0013133985000710153, + "loss": 0.2148, + "step": 41262 + }, + { + 
"epoch": 0.35818265466445603, + "grad_norm": 0.1015625, + "learning_rate": 0.00131336958623525, + "loss": 0.1006, + "step": 41263 + }, + { + "epoch": 0.35819133514466023, + "grad_norm": 0.07421875, + "learning_rate": 0.0013133406721661468, + "loss": 0.0986, + "step": 41264 + }, + { + "epoch": 0.35820001562486437, + "grad_norm": 0.2236328125, + "learning_rate": 0.0013133117578637372, + "loss": 0.1025, + "step": 41265 + }, + { + "epoch": 0.35820869610506856, + "grad_norm": 0.212890625, + "learning_rate": 0.0013132828433280533, + "loss": 0.124, + "step": 41266 + }, + { + "epoch": 0.3582173765852727, + "grad_norm": 0.11865234375, + "learning_rate": 0.0013132539285591262, + "loss": 0.1025, + "step": 41267 + }, + { + "epoch": 0.3582260570654769, + "grad_norm": 0.09130859375, + "learning_rate": 0.0013132250135569881, + "loss": 0.1113, + "step": 41268 + }, + { + "epoch": 0.358234737545681, + "grad_norm": 0.2734375, + "learning_rate": 0.0013131960983216702, + "loss": 0.1211, + "step": 41269 + }, + { + "epoch": 0.3582434180258852, + "grad_norm": 0.5859375, + "learning_rate": 0.0013131671828532042, + "loss": 0.1357, + "step": 41270 + }, + { + "epoch": 0.35825209850608936, + "grad_norm": 0.36328125, + "learning_rate": 0.0013131382671516222, + "loss": 0.0918, + "step": 41271 + }, + { + "epoch": 0.35826077898629355, + "grad_norm": 0.296875, + "learning_rate": 0.001313109351216955, + "loss": 0.127, + "step": 41272 + }, + { + "epoch": 0.3582694594664977, + "grad_norm": 0.16015625, + "learning_rate": 0.0013130804350492346, + "loss": 0.1211, + "step": 41273 + }, + { + "epoch": 0.3582781399467019, + "grad_norm": 0.07861328125, + "learning_rate": 0.0013130515186484928, + "loss": 0.0771, + "step": 41274 + }, + { + "epoch": 0.358286820426906, + "grad_norm": 0.71484375, + "learning_rate": 0.0013130226020147608, + "loss": 0.0625, + "step": 41275 + }, + { + "epoch": 0.35829550090711015, + "grad_norm": 0.2216796875, + "learning_rate": 0.001312993685148071, + "loss": 0.1289, + "step": 41276 + }, + { + "epoch": 0.35830418138731435, + "grad_norm": 0.2412109375, + "learning_rate": 0.001312964768048454, + "loss": 0.1348, + "step": 41277 + }, + { + "epoch": 0.3583128618675185, + "grad_norm": 0.671875, + "learning_rate": 0.001312935850715942, + "loss": 0.1143, + "step": 41278 + }, + { + "epoch": 0.3583215423477227, + "grad_norm": 0.81640625, + "learning_rate": 0.0013129069331505668, + "loss": 0.1211, + "step": 41279 + }, + { + "epoch": 0.3583302228279268, + "grad_norm": 0.546875, + "learning_rate": 0.0013128780153523595, + "loss": 0.1104, + "step": 41280 + }, + { + "epoch": 0.358338903308131, + "grad_norm": 0.349609375, + "learning_rate": 0.0013128490973213522, + "loss": 0.1133, + "step": 41281 + }, + { + "epoch": 0.35834758378833514, + "grad_norm": 0.13671875, + "learning_rate": 0.0013128201790575764, + "loss": 0.082, + "step": 41282 + }, + { + "epoch": 0.35835626426853934, + "grad_norm": 0.078125, + "learning_rate": 0.0013127912605610634, + "loss": 0.0967, + "step": 41283 + }, + { + "epoch": 0.3583649447487435, + "grad_norm": 0.18359375, + "learning_rate": 0.0013127623418318453, + "loss": 0.1084, + "step": 41284 + }, + { + "epoch": 0.35837362522894767, + "grad_norm": 0.09375, + "learning_rate": 0.0013127334228699534, + "loss": 0.0811, + "step": 41285 + }, + { + "epoch": 0.3583823057091518, + "grad_norm": 0.224609375, + "learning_rate": 0.0013127045036754193, + "loss": 0.0811, + "step": 41286 + }, + { + "epoch": 0.358390986189356, + "grad_norm": 0.06640625, + "learning_rate": 0.0013126755842482749, + "loss": 0.0806, + 
"step": 41287 + }, + { + "epoch": 0.35839966666956014, + "grad_norm": 0.19921875, + "learning_rate": 0.0013126466645885515, + "loss": 0.0723, + "step": 41288 + }, + { + "epoch": 0.35840834714976433, + "grad_norm": 0.6328125, + "learning_rate": 0.001312617744696281, + "loss": 0.1201, + "step": 41289 + }, + { + "epoch": 0.35841702762996847, + "grad_norm": 0.13671875, + "learning_rate": 0.0013125888245714952, + "loss": 0.124, + "step": 41290 + }, + { + "epoch": 0.35842570811017266, + "grad_norm": 0.380859375, + "learning_rate": 0.001312559904214225, + "loss": 0.1182, + "step": 41291 + }, + { + "epoch": 0.3584343885903768, + "grad_norm": 0.1689453125, + "learning_rate": 0.001312530983624503, + "loss": 0.0879, + "step": 41292 + }, + { + "epoch": 0.358443069070581, + "grad_norm": 0.34375, + "learning_rate": 0.00131250206280236, + "loss": 0.0845, + "step": 41293 + }, + { + "epoch": 0.3584517495507851, + "grad_norm": 0.3359375, + "learning_rate": 0.0013124731417478279, + "loss": 0.126, + "step": 41294 + }, + { + "epoch": 0.3584604300309893, + "grad_norm": 0.474609375, + "learning_rate": 0.0013124442204609386, + "loss": 0.0947, + "step": 41295 + }, + { + "epoch": 0.35846911051119346, + "grad_norm": 0.103515625, + "learning_rate": 0.001312415298941723, + "loss": 0.1152, + "step": 41296 + }, + { + "epoch": 0.35847779099139765, + "grad_norm": 0.1337890625, + "learning_rate": 0.0013123863771902135, + "loss": 0.1338, + "step": 41297 + }, + { + "epoch": 0.3584864714716018, + "grad_norm": 0.41796875, + "learning_rate": 0.0013123574552064417, + "loss": 0.1152, + "step": 41298 + }, + { + "epoch": 0.358495151951806, + "grad_norm": 0.33203125, + "learning_rate": 0.0013123285329904387, + "loss": 0.123, + "step": 41299 + }, + { + "epoch": 0.3585038324320101, + "grad_norm": 0.1337890625, + "learning_rate": 0.0013122996105422362, + "loss": 0.084, + "step": 41300 + }, + { + "epoch": 0.3585125129122143, + "grad_norm": 0.154296875, + "learning_rate": 0.0013122706878618664, + "loss": 0.0898, + "step": 41301 + }, + { + "epoch": 0.35852119339241845, + "grad_norm": 0.173828125, + "learning_rate": 0.0013122417649493606, + "loss": 0.1118, + "step": 41302 + }, + { + "epoch": 0.35852987387262264, + "grad_norm": 0.1904296875, + "learning_rate": 0.0013122128418047504, + "loss": 0.0649, + "step": 41303 + }, + { + "epoch": 0.3585385543528268, + "grad_norm": 0.373046875, + "learning_rate": 0.0013121839184280673, + "loss": 0.0952, + "step": 41304 + }, + { + "epoch": 0.35854723483303097, + "grad_norm": 1.3359375, + "learning_rate": 0.001312154994819343, + "loss": 0.2168, + "step": 41305 + }, + { + "epoch": 0.3585559153132351, + "grad_norm": 0.22265625, + "learning_rate": 0.0013121260709786089, + "loss": 0.1133, + "step": 41306 + }, + { + "epoch": 0.3585645957934393, + "grad_norm": 0.298828125, + "learning_rate": 0.0013120971469058975, + "loss": 0.0742, + "step": 41307 + }, + { + "epoch": 0.35857327627364344, + "grad_norm": 0.310546875, + "learning_rate": 0.0013120682226012395, + "loss": 0.0869, + "step": 41308 + }, + { + "epoch": 0.35858195675384763, + "grad_norm": 0.181640625, + "learning_rate": 0.001312039298064667, + "loss": 0.0898, + "step": 41309 + }, + { + "epoch": 0.35859063723405177, + "grad_norm": 0.0751953125, + "learning_rate": 0.0013120103732962114, + "loss": 0.0908, + "step": 41310 + }, + { + "epoch": 0.35859931771425596, + "grad_norm": 0.392578125, + "learning_rate": 0.0013119814482959048, + "loss": 0.1836, + "step": 41311 + }, + { + "epoch": 0.3586079981944601, + "grad_norm": 0.0966796875, + "learning_rate": 
0.0013119525230637782, + "loss": 0.085, + "step": 41312 + }, + { + "epoch": 0.3586166786746643, + "grad_norm": 0.40625, + "learning_rate": 0.0013119235975998633, + "loss": 0.1104, + "step": 41313 + }, + { + "epoch": 0.35862535915486843, + "grad_norm": 0.1103515625, + "learning_rate": 0.0013118946719041923, + "loss": 0.1001, + "step": 41314 + }, + { + "epoch": 0.3586340396350726, + "grad_norm": 0.11181640625, + "learning_rate": 0.0013118657459767961, + "loss": 0.1006, + "step": 41315 + }, + { + "epoch": 0.35864272011527676, + "grad_norm": 0.11279296875, + "learning_rate": 0.0013118368198177068, + "loss": 0.0762, + "step": 41316 + }, + { + "epoch": 0.35865140059548095, + "grad_norm": 0.267578125, + "learning_rate": 0.001311807893426956, + "loss": 0.1025, + "step": 41317 + }, + { + "epoch": 0.3586600810756851, + "grad_norm": 0.8984375, + "learning_rate": 0.0013117789668045754, + "loss": 0.168, + "step": 41318 + }, + { + "epoch": 0.3586687615558893, + "grad_norm": 0.279296875, + "learning_rate": 0.0013117500399505964, + "loss": 0.1367, + "step": 41319 + }, + { + "epoch": 0.3586774420360934, + "grad_norm": 0.310546875, + "learning_rate": 0.0013117211128650507, + "loss": 0.1201, + "step": 41320 + }, + { + "epoch": 0.3586861225162976, + "grad_norm": 0.6875, + "learning_rate": 0.0013116921855479704, + "loss": 0.1099, + "step": 41321 + }, + { + "epoch": 0.35869480299650175, + "grad_norm": 0.2470703125, + "learning_rate": 0.0013116632579993862, + "loss": 0.1328, + "step": 41322 + }, + { + "epoch": 0.35870348347670594, + "grad_norm": 0.22265625, + "learning_rate": 0.0013116343302193303, + "loss": 0.1387, + "step": 41323 + }, + { + "epoch": 0.3587121639569101, + "grad_norm": 0.1298828125, + "learning_rate": 0.0013116054022078345, + "loss": 0.0957, + "step": 41324 + }, + { + "epoch": 0.3587208444371143, + "grad_norm": 0.13671875, + "learning_rate": 0.0013115764739649299, + "loss": 0.0933, + "step": 41325 + }, + { + "epoch": 0.3587295249173184, + "grad_norm": 0.068359375, + "learning_rate": 0.0013115475454906485, + "loss": 0.0615, + "step": 41326 + }, + { + "epoch": 0.3587382053975226, + "grad_norm": 0.1259765625, + "learning_rate": 0.0013115186167850222, + "loss": 0.0986, + "step": 41327 + }, + { + "epoch": 0.35874688587772674, + "grad_norm": 0.3203125, + "learning_rate": 0.001311489687848082, + "loss": 0.1133, + "step": 41328 + }, + { + "epoch": 0.35875556635793093, + "grad_norm": 0.33203125, + "learning_rate": 0.00131146075867986, + "loss": 0.1631, + "step": 41329 + }, + { + "epoch": 0.35876424683813507, + "grad_norm": 0.3125, + "learning_rate": 0.0013114318292803875, + "loss": 0.1357, + "step": 41330 + }, + { + "epoch": 0.35877292731833926, + "grad_norm": 0.1904296875, + "learning_rate": 0.0013114028996496965, + "loss": 0.1211, + "step": 41331 + }, + { + "epoch": 0.3587816077985434, + "grad_norm": 0.32421875, + "learning_rate": 0.0013113739697878186, + "loss": 0.0854, + "step": 41332 + }, + { + "epoch": 0.3587902882787476, + "grad_norm": 0.423828125, + "learning_rate": 0.0013113450396947852, + "loss": 0.1094, + "step": 41333 + }, + { + "epoch": 0.35879896875895173, + "grad_norm": 0.3203125, + "learning_rate": 0.0013113161093706279, + "loss": 0.0825, + "step": 41334 + }, + { + "epoch": 0.3588076492391559, + "grad_norm": 0.41015625, + "learning_rate": 0.0013112871788153787, + "loss": 0.1235, + "step": 41335 + }, + { + "epoch": 0.35881632971936006, + "grad_norm": 0.484375, + "learning_rate": 0.001311258248029069, + "loss": 0.0854, + "step": 41336 + }, + { + "epoch": 0.35882501019956425, + "grad_norm": 
1.5546875, + "learning_rate": 0.0013112293170117301, + "loss": 0.1465, + "step": 41337 + }, + { + "epoch": 0.3588336906797684, + "grad_norm": 0.5390625, + "learning_rate": 0.0013112003857633943, + "loss": 0.0801, + "step": 41338 + }, + { + "epoch": 0.3588423711599726, + "grad_norm": 0.2236328125, + "learning_rate": 0.0013111714542840926, + "loss": 0.103, + "step": 41339 + }, + { + "epoch": 0.3588510516401767, + "grad_norm": 0.259765625, + "learning_rate": 0.0013111425225738576, + "loss": 0.1182, + "step": 41340 + }, + { + "epoch": 0.3588597321203809, + "grad_norm": 0.15234375, + "learning_rate": 0.0013111135906327199, + "loss": 0.0894, + "step": 41341 + }, + { + "epoch": 0.35886841260058505, + "grad_norm": 0.1630859375, + "learning_rate": 0.0013110846584607114, + "loss": 0.0801, + "step": 41342 + }, + { + "epoch": 0.35887709308078924, + "grad_norm": 0.6953125, + "learning_rate": 0.0013110557260578642, + "loss": 0.1387, + "step": 41343 + }, + { + "epoch": 0.3588857735609934, + "grad_norm": 0.09423828125, + "learning_rate": 0.0013110267934242094, + "loss": 0.0957, + "step": 41344 + }, + { + "epoch": 0.3588944540411976, + "grad_norm": 0.5390625, + "learning_rate": 0.001310997860559779, + "loss": 0.1309, + "step": 41345 + }, + { + "epoch": 0.3589031345214017, + "grad_norm": 0.50390625, + "learning_rate": 0.0013109689274646047, + "loss": 0.1406, + "step": 41346 + }, + { + "epoch": 0.3589118150016059, + "grad_norm": 0.53125, + "learning_rate": 0.0013109399941387176, + "loss": 0.0859, + "step": 41347 + }, + { + "epoch": 0.35892049548181004, + "grad_norm": 0.09716796875, + "learning_rate": 0.0013109110605821496, + "loss": 0.0928, + "step": 41348 + }, + { + "epoch": 0.35892917596201424, + "grad_norm": 0.1591796875, + "learning_rate": 0.0013108821267949328, + "loss": 0.0786, + "step": 41349 + }, + { + "epoch": 0.3589378564422184, + "grad_norm": 0.20703125, + "learning_rate": 0.0013108531927770986, + "loss": 0.1338, + "step": 41350 + }, + { + "epoch": 0.35894653692242257, + "grad_norm": 0.107421875, + "learning_rate": 0.0013108242585286782, + "loss": 0.1426, + "step": 41351 + }, + { + "epoch": 0.3589552174026267, + "grad_norm": 0.14453125, + "learning_rate": 0.0013107953240497035, + "loss": 0.0952, + "step": 41352 + }, + { + "epoch": 0.3589638978828309, + "grad_norm": 0.2314453125, + "learning_rate": 0.0013107663893402064, + "loss": 0.124, + "step": 41353 + }, + { + "epoch": 0.35897257836303503, + "grad_norm": 0.263671875, + "learning_rate": 0.0013107374544002182, + "loss": 0.0801, + "step": 41354 + }, + { + "epoch": 0.3589812588432392, + "grad_norm": 0.478515625, + "learning_rate": 0.0013107085192297708, + "loss": 0.0908, + "step": 41355 + }, + { + "epoch": 0.35898993932344336, + "grad_norm": 0.1064453125, + "learning_rate": 0.0013106795838288955, + "loss": 0.0977, + "step": 41356 + }, + { + "epoch": 0.35899861980364756, + "grad_norm": 0.1796875, + "learning_rate": 0.0013106506481976245, + "loss": 0.1104, + "step": 41357 + }, + { + "epoch": 0.3590073002838517, + "grad_norm": 0.4140625, + "learning_rate": 0.001310621712335989, + "loss": 0.1089, + "step": 41358 + }, + { + "epoch": 0.3590159807640559, + "grad_norm": 0.251953125, + "learning_rate": 0.0013105927762440208, + "loss": 0.1182, + "step": 41359 + }, + { + "epoch": 0.35902466124426, + "grad_norm": 0.126953125, + "learning_rate": 0.0013105638399217513, + "loss": 0.125, + "step": 41360 + }, + { + "epoch": 0.3590333417244642, + "grad_norm": 0.333984375, + "learning_rate": 0.0013105349033692127, + "loss": 0.0859, + "step": 41361 + }, + { + "epoch": 
0.35904202220466835, + "grad_norm": 0.765625, + "learning_rate": 0.001310505966586436, + "loss": 0.0913, + "step": 41362 + }, + { + "epoch": 0.35905070268487255, + "grad_norm": 0.173828125, + "learning_rate": 0.0013104770295734535, + "loss": 0.0752, + "step": 41363 + }, + { + "epoch": 0.3590593831650767, + "grad_norm": 0.126953125, + "learning_rate": 0.0013104480923302956, + "loss": 0.0986, + "step": 41364 + }, + { + "epoch": 0.3590680636452809, + "grad_norm": 0.2578125, + "learning_rate": 0.0013104191548569954, + "loss": 0.126, + "step": 41365 + }, + { + "epoch": 0.359076744125485, + "grad_norm": 0.25390625, + "learning_rate": 0.0013103902171535842, + "loss": 0.0889, + "step": 41366 + }, + { + "epoch": 0.3590854246056892, + "grad_norm": 0.5859375, + "learning_rate": 0.0013103612792200931, + "loss": 0.1816, + "step": 41367 + }, + { + "epoch": 0.35909410508589334, + "grad_norm": 0.38671875, + "learning_rate": 0.0013103323410565545, + "loss": 0.0977, + "step": 41368 + }, + { + "epoch": 0.35910278556609754, + "grad_norm": 0.30859375, + "learning_rate": 0.001310303402662999, + "loss": 0.1299, + "step": 41369 + }, + { + "epoch": 0.3591114660463017, + "grad_norm": 0.1640625, + "learning_rate": 0.0013102744640394594, + "loss": 0.1211, + "step": 41370 + }, + { + "epoch": 0.35912014652650587, + "grad_norm": 0.3671875, + "learning_rate": 0.0013102455251859664, + "loss": 0.0752, + "step": 41371 + }, + { + "epoch": 0.35912882700671, + "grad_norm": 0.1552734375, + "learning_rate": 0.0013102165861025522, + "loss": 0.1465, + "step": 41372 + }, + { + "epoch": 0.3591375074869142, + "grad_norm": 0.296875, + "learning_rate": 0.0013101876467892482, + "loss": 0.085, + "step": 41373 + }, + { + "epoch": 0.35914618796711834, + "grad_norm": 0.94140625, + "learning_rate": 0.0013101587072460862, + "loss": 0.1475, + "step": 41374 + }, + { + "epoch": 0.35915486844732253, + "grad_norm": 0.1337890625, + "learning_rate": 0.001310129767473098, + "loss": 0.1064, + "step": 41375 + }, + { + "epoch": 0.35916354892752667, + "grad_norm": 0.99609375, + "learning_rate": 0.001310100827470315, + "loss": 0.0933, + "step": 41376 + }, + { + "epoch": 0.35917222940773086, + "grad_norm": 0.2197265625, + "learning_rate": 0.0013100718872377688, + "loss": 0.1045, + "step": 41377 + }, + { + "epoch": 0.359180909887935, + "grad_norm": 0.1259765625, + "learning_rate": 0.0013100429467754909, + "loss": 0.0913, + "step": 41378 + }, + { + "epoch": 0.3591895903681392, + "grad_norm": 0.248046875, + "learning_rate": 0.0013100140060835134, + "loss": 0.0981, + "step": 41379 + }, + { + "epoch": 0.3591982708483433, + "grad_norm": 0.2294921875, + "learning_rate": 0.001309985065161868, + "loss": 0.0933, + "step": 41380 + }, + { + "epoch": 0.3592069513285475, + "grad_norm": 0.0947265625, + "learning_rate": 0.001309956124010586, + "loss": 0.0938, + "step": 41381 + }, + { + "epoch": 0.35921563180875166, + "grad_norm": 0.47265625, + "learning_rate": 0.0013099271826296987, + "loss": 0.1001, + "step": 41382 + }, + { + "epoch": 0.35922431228895585, + "grad_norm": 0.54296875, + "learning_rate": 0.0013098982410192385, + "loss": 0.1377, + "step": 41383 + }, + { + "epoch": 0.35923299276916, + "grad_norm": 0.11083984375, + "learning_rate": 0.0013098692991792367, + "loss": 0.1309, + "step": 41384 + }, + { + "epoch": 0.3592416732493642, + "grad_norm": 0.302734375, + "learning_rate": 0.001309840357109725, + "loss": 0.1299, + "step": 41385 + }, + { + "epoch": 0.3592503537295683, + "grad_norm": 0.228515625, + "learning_rate": 0.001309811414810735, + "loss": 0.0957, + 
"step": 41386 + }, + { + "epoch": 0.3592590342097725, + "grad_norm": 0.345703125, + "learning_rate": 0.0013097824722822983, + "loss": 0.1357, + "step": 41387 + }, + { + "epoch": 0.35926771468997665, + "grad_norm": 0.173828125, + "learning_rate": 0.001309753529524447, + "loss": 0.0845, + "step": 41388 + }, + { + "epoch": 0.35927639517018084, + "grad_norm": 0.390625, + "learning_rate": 0.001309724586537212, + "loss": 0.0762, + "step": 41389 + }, + { + "epoch": 0.359285075650385, + "grad_norm": 0.25390625, + "learning_rate": 0.0013096956433206258, + "loss": 0.0977, + "step": 41390 + }, + { + "epoch": 0.35929375613058917, + "grad_norm": 0.0712890625, + "learning_rate": 0.001309666699874719, + "loss": 0.0679, + "step": 41391 + }, + { + "epoch": 0.3593024366107933, + "grad_norm": 0.30859375, + "learning_rate": 0.0013096377561995242, + "loss": 0.1045, + "step": 41392 + }, + { + "epoch": 0.3593111170909975, + "grad_norm": 0.103515625, + "learning_rate": 0.0013096088122950724, + "loss": 0.1133, + "step": 41393 + }, + { + "epoch": 0.35931979757120164, + "grad_norm": 0.6484375, + "learning_rate": 0.0013095798681613962, + "loss": 0.1924, + "step": 41394 + }, + { + "epoch": 0.35932847805140583, + "grad_norm": 0.1533203125, + "learning_rate": 0.001309550923798526, + "loss": 0.0815, + "step": 41395 + }, + { + "epoch": 0.35933715853160997, + "grad_norm": 0.341796875, + "learning_rate": 0.0013095219792064942, + "loss": 0.1367, + "step": 41396 + }, + { + "epoch": 0.35934583901181416, + "grad_norm": 0.15234375, + "learning_rate": 0.0013094930343853326, + "loss": 0.1094, + "step": 41397 + }, + { + "epoch": 0.3593545194920183, + "grad_norm": 0.234375, + "learning_rate": 0.0013094640893350723, + "loss": 0.0537, + "step": 41398 + }, + { + "epoch": 0.35936319997222244, + "grad_norm": 0.1484375, + "learning_rate": 0.001309435144055745, + "loss": 0.1396, + "step": 41399 + }, + { + "epoch": 0.35937188045242663, + "grad_norm": 0.1630859375, + "learning_rate": 0.001309406198547383, + "loss": 0.1211, + "step": 41400 + }, + { + "epoch": 0.35938056093263077, + "grad_norm": 0.2373046875, + "learning_rate": 0.0013093772528100173, + "loss": 0.123, + "step": 41401 + }, + { + "epoch": 0.35938924141283496, + "grad_norm": 2.84375, + "learning_rate": 0.0013093483068436795, + "loss": 0.5703, + "step": 41402 + }, + { + "epoch": 0.3593979218930391, + "grad_norm": 0.58984375, + "learning_rate": 0.0013093193606484022, + "loss": 0.1201, + "step": 41403 + }, + { + "epoch": 0.3594066023732433, + "grad_norm": 0.408203125, + "learning_rate": 0.001309290414224216, + "loss": 0.0923, + "step": 41404 + }, + { + "epoch": 0.3594152828534474, + "grad_norm": 0.3828125, + "learning_rate": 0.001309261467571153, + "loss": 0.0996, + "step": 41405 + }, + { + "epoch": 0.3594239633336516, + "grad_norm": 0.205078125, + "learning_rate": 0.0013092325206892448, + "loss": 0.0806, + "step": 41406 + }, + { + "epoch": 0.35943264381385576, + "grad_norm": 1.96875, + "learning_rate": 0.001309203573578523, + "loss": 0.3047, + "step": 41407 + }, + { + "epoch": 0.35944132429405995, + "grad_norm": 5.0, + "learning_rate": 0.0013091746262390194, + "loss": 0.2598, + "step": 41408 + }, + { + "epoch": 0.3594500047742641, + "grad_norm": 0.455078125, + "learning_rate": 0.0013091456786707657, + "loss": 0.0708, + "step": 41409 + }, + { + "epoch": 0.3594586852544683, + "grad_norm": 0.09912109375, + "learning_rate": 0.0013091167308737935, + "loss": 0.1484, + "step": 41410 + }, + { + "epoch": 0.3594673657346724, + "grad_norm": 0.60546875, + "learning_rate": 0.0013090877828481338, 
+ "loss": 0.1011, + "step": 41411 + }, + { + "epoch": 0.3594760462148766, + "grad_norm": 0.16015625, + "learning_rate": 0.0013090588345938193, + "loss": 0.1152, + "step": 41412 + }, + { + "epoch": 0.35948472669508075, + "grad_norm": 0.35546875, + "learning_rate": 0.0013090298861108808, + "loss": 0.1387, + "step": 41413 + }, + { + "epoch": 0.35949340717528494, + "grad_norm": 0.267578125, + "learning_rate": 0.0013090009373993507, + "loss": 0.0913, + "step": 41414 + }, + { + "epoch": 0.3595020876554891, + "grad_norm": 0.259765625, + "learning_rate": 0.0013089719884592605, + "loss": 0.0723, + "step": 41415 + }, + { + "epoch": 0.35951076813569327, + "grad_norm": 0.08642578125, + "learning_rate": 0.0013089430392906416, + "loss": 0.0928, + "step": 41416 + }, + { + "epoch": 0.3595194486158974, + "grad_norm": 0.369140625, + "learning_rate": 0.0013089140898935258, + "loss": 0.084, + "step": 41417 + }, + { + "epoch": 0.3595281290961016, + "grad_norm": 0.11181640625, + "learning_rate": 0.0013088851402679443, + "loss": 0.124, + "step": 41418 + }, + { + "epoch": 0.35953680957630574, + "grad_norm": 0.1669921875, + "learning_rate": 0.0013088561904139294, + "loss": 0.1064, + "step": 41419 + }, + { + "epoch": 0.35954549005650993, + "grad_norm": 0.671875, + "learning_rate": 0.0013088272403315125, + "loss": 0.0913, + "step": 41420 + }, + { + "epoch": 0.35955417053671407, + "grad_norm": 0.1142578125, + "learning_rate": 0.001308798290020725, + "loss": 0.123, + "step": 41421 + }, + { + "epoch": 0.35956285101691826, + "grad_norm": 1.0078125, + "learning_rate": 0.0013087693394815989, + "loss": 0.0942, + "step": 41422 + }, + { + "epoch": 0.3595715314971224, + "grad_norm": 0.1337890625, + "learning_rate": 0.001308740388714166, + "loss": 0.126, + "step": 41423 + }, + { + "epoch": 0.3595802119773266, + "grad_norm": 0.3671875, + "learning_rate": 0.0013087114377184578, + "loss": 0.1196, + "step": 41424 + }, + { + "epoch": 0.35958889245753073, + "grad_norm": 0.17578125, + "learning_rate": 0.0013086824864945055, + "loss": 0.1055, + "step": 41425 + }, + { + "epoch": 0.3595975729377349, + "grad_norm": 0.2392578125, + "learning_rate": 0.0013086535350423417, + "loss": 0.0894, + "step": 41426 + }, + { + "epoch": 0.35960625341793906, + "grad_norm": 0.216796875, + "learning_rate": 0.0013086245833619971, + "loss": 0.1377, + "step": 41427 + }, + { + "epoch": 0.35961493389814325, + "grad_norm": 0.2373046875, + "learning_rate": 0.0013085956314535041, + "loss": 0.0771, + "step": 41428 + }, + { + "epoch": 0.3596236143783474, + "grad_norm": 0.1767578125, + "learning_rate": 0.001308566679316894, + "loss": 0.0942, + "step": 41429 + }, + { + "epoch": 0.3596322948585516, + "grad_norm": 0.2041015625, + "learning_rate": 0.0013085377269521983, + "loss": 0.0908, + "step": 41430 + }, + { + "epoch": 0.3596409753387557, + "grad_norm": 0.134765625, + "learning_rate": 0.0013085087743594489, + "loss": 0.1055, + "step": 41431 + }, + { + "epoch": 0.3596496558189599, + "grad_norm": 0.25390625, + "learning_rate": 0.0013084798215386776, + "loss": 0.0791, + "step": 41432 + }, + { + "epoch": 0.35965833629916405, + "grad_norm": 0.68359375, + "learning_rate": 0.001308450868489916, + "loss": 0.0928, + "step": 41433 + }, + { + "epoch": 0.35966701677936824, + "grad_norm": 0.70703125, + "learning_rate": 0.0013084219152131953, + "loss": 0.1221, + "step": 41434 + }, + { + "epoch": 0.3596756972595724, + "grad_norm": 0.2138671875, + "learning_rate": 0.0013083929617085477, + "loss": 0.0781, + "step": 41435 + }, + { + "epoch": 0.3596843777397766, + "grad_norm": 
0.189453125, + "learning_rate": 0.0013083640079760049, + "loss": 0.123, + "step": 41436 + }, + { + "epoch": 0.3596930582199807, + "grad_norm": 0.16015625, + "learning_rate": 0.001308335054015598, + "loss": 0.124, + "step": 41437 + }, + { + "epoch": 0.3597017387001849, + "grad_norm": 0.099609375, + "learning_rate": 0.001308306099827359, + "loss": 0.0947, + "step": 41438 + }, + { + "epoch": 0.35971041918038904, + "grad_norm": 0.2734375, + "learning_rate": 0.00130827714541132, + "loss": 0.0742, + "step": 41439 + }, + { + "epoch": 0.35971909966059323, + "grad_norm": 0.39453125, + "learning_rate": 0.0013082481907675122, + "loss": 0.0918, + "step": 41440 + }, + { + "epoch": 0.35972778014079737, + "grad_norm": 0.177734375, + "learning_rate": 0.001308219235895967, + "loss": 0.166, + "step": 41441 + }, + { + "epoch": 0.35973646062100156, + "grad_norm": 1.0703125, + "learning_rate": 0.0013081902807967165, + "loss": 0.0781, + "step": 41442 + }, + { + "epoch": 0.3597451411012057, + "grad_norm": 0.1357421875, + "learning_rate": 0.001308161325469792, + "loss": 0.0938, + "step": 41443 + }, + { + "epoch": 0.3597538215814099, + "grad_norm": 0.13671875, + "learning_rate": 0.0013081323699152257, + "loss": 0.0659, + "step": 41444 + }, + { + "epoch": 0.35976250206161403, + "grad_norm": 0.11181640625, + "learning_rate": 0.0013081034141330487, + "loss": 0.1162, + "step": 41445 + }, + { + "epoch": 0.3597711825418182, + "grad_norm": 0.1806640625, + "learning_rate": 0.0013080744581232934, + "loss": 0.0869, + "step": 41446 + }, + { + "epoch": 0.35977986302202236, + "grad_norm": 0.1640625, + "learning_rate": 0.0013080455018859908, + "loss": 0.0967, + "step": 41447 + }, + { + "epoch": 0.35978854350222655, + "grad_norm": 0.06787109375, + "learning_rate": 0.0013080165454211726, + "loss": 0.0898, + "step": 41448 + }, + { + "epoch": 0.3597972239824307, + "grad_norm": 0.2451171875, + "learning_rate": 0.0013079875887288708, + "loss": 0.1455, + "step": 41449 + }, + { + "epoch": 0.3598059044626349, + "grad_norm": 0.16015625, + "learning_rate": 0.0013079586318091169, + "loss": 0.0977, + "step": 41450 + }, + { + "epoch": 0.359814584942839, + "grad_norm": 0.2353515625, + "learning_rate": 0.001307929674661942, + "loss": 0.1172, + "step": 41451 + }, + { + "epoch": 0.3598232654230432, + "grad_norm": 0.3203125, + "learning_rate": 0.0013079007172873789, + "loss": 0.126, + "step": 41452 + }, + { + "epoch": 0.35983194590324735, + "grad_norm": 0.287109375, + "learning_rate": 0.0013078717596854586, + "loss": 0.1201, + "step": 41453 + }, + { + "epoch": 0.35984062638345155, + "grad_norm": 0.14453125, + "learning_rate": 0.0013078428018562132, + "loss": 0.166, + "step": 41454 + }, + { + "epoch": 0.3598493068636557, + "grad_norm": 0.28125, + "learning_rate": 0.0013078138437996737, + "loss": 0.0874, + "step": 41455 + }, + { + "epoch": 0.3598579873438599, + "grad_norm": 2.0625, + "learning_rate": 0.001307784885515872, + "loss": 0.125, + "step": 41456 + }, + { + "epoch": 0.359866667824064, + "grad_norm": 0.10009765625, + "learning_rate": 0.00130775592700484, + "loss": 0.123, + "step": 41457 + }, + { + "epoch": 0.3598753483042682, + "grad_norm": 0.212890625, + "learning_rate": 0.0013077269682666094, + "loss": 0.0933, + "step": 41458 + }, + { + "epoch": 0.35988402878447234, + "grad_norm": 0.21484375, + "learning_rate": 0.0013076980093012114, + "loss": 0.0928, + "step": 41459 + }, + { + "epoch": 0.35989270926467654, + "grad_norm": 0.48828125, + "learning_rate": 0.001307669050108678, + "loss": 0.123, + "step": 41460 + }, + { + "epoch": 
0.3599013897448807, + "grad_norm": 0.330078125, + "learning_rate": 0.0013076400906890408, + "loss": 0.0713, + "step": 41461 + }, + { + "epoch": 0.35991007022508487, + "grad_norm": 0.76953125, + "learning_rate": 0.0013076111310423317, + "loss": 0.1138, + "step": 41462 + }, + { + "epoch": 0.359918750705289, + "grad_norm": 0.61328125, + "learning_rate": 0.0013075821711685822, + "loss": 0.0933, + "step": 41463 + }, + { + "epoch": 0.3599274311854932, + "grad_norm": 0.228515625, + "learning_rate": 0.0013075532110678242, + "loss": 0.1328, + "step": 41464 + }, + { + "epoch": 0.35993611166569733, + "grad_norm": 0.15625, + "learning_rate": 0.0013075242507400886, + "loss": 0.0781, + "step": 41465 + }, + { + "epoch": 0.3599447921459015, + "grad_norm": 0.16015625, + "learning_rate": 0.001307495290185408, + "loss": 0.0859, + "step": 41466 + }, + { + "epoch": 0.35995347262610566, + "grad_norm": 0.1103515625, + "learning_rate": 0.0013074663294038134, + "loss": 0.1123, + "step": 41467 + }, + { + "epoch": 0.35996215310630986, + "grad_norm": 0.5390625, + "learning_rate": 0.001307437368395337, + "loss": 0.0986, + "step": 41468 + }, + { + "epoch": 0.359970833586514, + "grad_norm": 0.11376953125, + "learning_rate": 0.00130740840716001, + "loss": 0.0771, + "step": 41469 + }, + { + "epoch": 0.3599795140667182, + "grad_norm": 0.404296875, + "learning_rate": 0.0013073794456978643, + "loss": 0.082, + "step": 41470 + }, + { + "epoch": 0.3599881945469223, + "grad_norm": 0.51953125, + "learning_rate": 0.0013073504840089316, + "loss": 0.1328, + "step": 41471 + }, + { + "epoch": 0.3599968750271265, + "grad_norm": 0.1396484375, + "learning_rate": 0.0013073215220932436, + "loss": 0.1074, + "step": 41472 + }, + { + "epoch": 0.36000555550733065, + "grad_norm": 0.400390625, + "learning_rate": 0.0013072925599508316, + "loss": 0.0874, + "step": 41473 + }, + { + "epoch": 0.36001423598753485, + "grad_norm": 0.8125, + "learning_rate": 0.0013072635975817277, + "loss": 0.125, + "step": 41474 + }, + { + "epoch": 0.360022916467739, + "grad_norm": 0.337890625, + "learning_rate": 0.0013072346349859638, + "loss": 0.1099, + "step": 41475 + }, + { + "epoch": 0.3600315969479432, + "grad_norm": 0.130859375, + "learning_rate": 0.001307205672163571, + "loss": 0.1045, + "step": 41476 + }, + { + "epoch": 0.3600402774281473, + "grad_norm": 0.1376953125, + "learning_rate": 0.001307176709114581, + "loss": 0.0967, + "step": 41477 + }, + { + "epoch": 0.3600489579083515, + "grad_norm": 0.2412109375, + "learning_rate": 0.001307147745839026, + "loss": 0.0806, + "step": 41478 + }, + { + "epoch": 0.36005763838855565, + "grad_norm": 0.09619140625, + "learning_rate": 0.0013071187823369371, + "loss": 0.1162, + "step": 41479 + }, + { + "epoch": 0.36006631886875984, + "grad_norm": 0.6015625, + "learning_rate": 0.0013070898186083463, + "loss": 0.1113, + "step": 41480 + }, + { + "epoch": 0.360074999348964, + "grad_norm": 0.271484375, + "learning_rate": 0.001307060854653285, + "loss": 0.1172, + "step": 41481 + }, + { + "epoch": 0.36008367982916817, + "grad_norm": 0.1826171875, + "learning_rate": 0.001307031890471785, + "loss": 0.0884, + "step": 41482 + }, + { + "epoch": 0.3600923603093723, + "grad_norm": 0.158203125, + "learning_rate": 0.0013070029260638784, + "loss": 0.1396, + "step": 41483 + }, + { + "epoch": 0.3601010407895765, + "grad_norm": 0.11328125, + "learning_rate": 0.0013069739614295964, + "loss": 0.1074, + "step": 41484 + }, + { + "epoch": 0.36010972126978064, + "grad_norm": 0.1337890625, + "learning_rate": 0.0013069449965689708, + "loss": 0.1221, + 
"step": 41485 + }, + { + "epoch": 0.36011840174998483, + "grad_norm": 0.34375, + "learning_rate": 0.0013069160314820329, + "loss": 0.126, + "step": 41486 + }, + { + "epoch": 0.36012708223018897, + "grad_norm": 0.1259765625, + "learning_rate": 0.0013068870661688153, + "loss": 0.1074, + "step": 41487 + }, + { + "epoch": 0.36013576271039316, + "grad_norm": 0.3046875, + "learning_rate": 0.001306858100629349, + "loss": 0.1328, + "step": 41488 + }, + { + "epoch": 0.3601444431905973, + "grad_norm": 0.4140625, + "learning_rate": 0.0013068291348636656, + "loss": 0.0903, + "step": 41489 + }, + { + "epoch": 0.3601531236708015, + "grad_norm": 0.2431640625, + "learning_rate": 0.0013068001688717966, + "loss": 0.127, + "step": 41490 + }, + { + "epoch": 0.3601618041510056, + "grad_norm": 0.26171875, + "learning_rate": 0.0013067712026537746, + "loss": 0.1245, + "step": 41491 + }, + { + "epoch": 0.3601704846312098, + "grad_norm": 0.2373046875, + "learning_rate": 0.0013067422362096307, + "loss": 0.1035, + "step": 41492 + }, + { + "epoch": 0.36017916511141396, + "grad_norm": 0.421875, + "learning_rate": 0.0013067132695393966, + "loss": 0.1562, + "step": 41493 + }, + { + "epoch": 0.36018784559161815, + "grad_norm": 0.07568359375, + "learning_rate": 0.0013066843026431035, + "loss": 0.1128, + "step": 41494 + }, + { + "epoch": 0.3601965260718223, + "grad_norm": 0.0986328125, + "learning_rate": 0.0013066553355207842, + "loss": 0.0752, + "step": 41495 + }, + { + "epoch": 0.3602052065520265, + "grad_norm": 0.1376953125, + "learning_rate": 0.0013066263681724693, + "loss": 0.1016, + "step": 41496 + }, + { + "epoch": 0.3602138870322306, + "grad_norm": 0.1669921875, + "learning_rate": 0.001306597400598191, + "loss": 0.0928, + "step": 41497 + }, + { + "epoch": 0.3602225675124348, + "grad_norm": 0.47265625, + "learning_rate": 0.0013065684327979808, + "loss": 0.1143, + "step": 41498 + }, + { + "epoch": 0.36023124799263895, + "grad_norm": 0.50390625, + "learning_rate": 0.0013065394647718705, + "loss": 0.0791, + "step": 41499 + }, + { + "epoch": 0.36023992847284314, + "grad_norm": 0.142578125, + "learning_rate": 0.0013065104965198917, + "loss": 0.1797, + "step": 41500 + }, + { + "epoch": 0.3602486089530473, + "grad_norm": 0.2734375, + "learning_rate": 0.0013064815280420766, + "loss": 0.126, + "step": 41501 + }, + { + "epoch": 0.36025728943325147, + "grad_norm": 0.326171875, + "learning_rate": 0.001306452559338456, + "loss": 0.0796, + "step": 41502 + }, + { + "epoch": 0.3602659699134556, + "grad_norm": 0.5859375, + "learning_rate": 0.0013064235904090618, + "loss": 0.1143, + "step": 41503 + }, + { + "epoch": 0.3602746503936598, + "grad_norm": 0.25, + "learning_rate": 0.0013063946212539263, + "loss": 0.0854, + "step": 41504 + }, + { + "epoch": 0.36028333087386394, + "grad_norm": 0.3125, + "learning_rate": 0.0013063656518730806, + "loss": 0.1167, + "step": 41505 + }, + { + "epoch": 0.36029201135406813, + "grad_norm": 0.08203125, + "learning_rate": 0.0013063366822665566, + "loss": 0.0723, + "step": 41506 + }, + { + "epoch": 0.36030069183427227, + "grad_norm": 0.25390625, + "learning_rate": 0.0013063077124343856, + "loss": 0.0806, + "step": 41507 + }, + { + "epoch": 0.36030937231447646, + "grad_norm": 0.109375, + "learning_rate": 0.0013062787423765996, + "loss": 0.0923, + "step": 41508 + }, + { + "epoch": 0.3603180527946806, + "grad_norm": 0.2333984375, + "learning_rate": 0.0013062497720932305, + "loss": 0.0713, + "step": 41509 + }, + { + "epoch": 0.3603267332748848, + "grad_norm": 1.1015625, + "learning_rate": 
0.0013062208015843096, + "loss": 0.0923, + "step": 41510 + }, + { + "epoch": 0.36033541375508893, + "grad_norm": 0.1884765625, + "learning_rate": 0.0013061918308498689, + "loss": 0.1006, + "step": 41511 + }, + { + "epoch": 0.3603440942352931, + "grad_norm": 0.353515625, + "learning_rate": 0.0013061628598899397, + "loss": 0.0952, + "step": 41512 + }, + { + "epoch": 0.36035277471549726, + "grad_norm": 0.484375, + "learning_rate": 0.0013061338887045539, + "loss": 0.0845, + "step": 41513 + }, + { + "epoch": 0.36036145519570145, + "grad_norm": 0.1103515625, + "learning_rate": 0.0013061049172937436, + "loss": 0.082, + "step": 41514 + }, + { + "epoch": 0.3603701356759056, + "grad_norm": 0.56640625, + "learning_rate": 0.0013060759456575395, + "loss": 0.0938, + "step": 41515 + }, + { + "epoch": 0.3603788161561098, + "grad_norm": 0.2236328125, + "learning_rate": 0.0013060469737959741, + "loss": 0.1328, + "step": 41516 + }, + { + "epoch": 0.3603874966363139, + "grad_norm": 0.1201171875, + "learning_rate": 0.0013060180017090788, + "loss": 0.127, + "step": 41517 + }, + { + "epoch": 0.3603961771165181, + "grad_norm": 0.1201171875, + "learning_rate": 0.0013059890293968853, + "loss": 0.127, + "step": 41518 + }, + { + "epoch": 0.36040485759672225, + "grad_norm": 0.26953125, + "learning_rate": 0.0013059600568594255, + "loss": 0.0835, + "step": 41519 + }, + { + "epoch": 0.36041353807692644, + "grad_norm": 0.5234375, + "learning_rate": 0.0013059310840967304, + "loss": 0.123, + "step": 41520 + }, + { + "epoch": 0.3604222185571306, + "grad_norm": 0.1611328125, + "learning_rate": 0.0013059021111088325, + "loss": 0.082, + "step": 41521 + }, + { + "epoch": 0.3604308990373347, + "grad_norm": 0.07080078125, + "learning_rate": 0.001305873137895763, + "loss": 0.0942, + "step": 41522 + }, + { + "epoch": 0.3604395795175389, + "grad_norm": 0.158203125, + "learning_rate": 0.0013058441644575541, + "loss": 0.0947, + "step": 41523 + }, + { + "epoch": 0.36044825999774305, + "grad_norm": 0.357421875, + "learning_rate": 0.0013058151907942368, + "loss": 0.1143, + "step": 41524 + }, + { + "epoch": 0.36045694047794724, + "grad_norm": 0.359375, + "learning_rate": 0.001305786216905843, + "loss": 0.083, + "step": 41525 + }, + { + "epoch": 0.3604656209581514, + "grad_norm": 0.103515625, + "learning_rate": 0.0013057572427924047, + "loss": 0.1216, + "step": 41526 + }, + { + "epoch": 0.36047430143835557, + "grad_norm": 0.279296875, + "learning_rate": 0.001305728268453953, + "loss": 0.1094, + "step": 41527 + }, + { + "epoch": 0.3604829819185597, + "grad_norm": 0.119140625, + "learning_rate": 0.0013056992938905204, + "loss": 0.0825, + "step": 41528 + }, + { + "epoch": 0.3604916623987639, + "grad_norm": 0.3984375, + "learning_rate": 0.001305670319102138, + "loss": 0.1221, + "step": 41529 + }, + { + "epoch": 0.36050034287896804, + "grad_norm": 0.1591796875, + "learning_rate": 0.0013056413440888377, + "loss": 0.1016, + "step": 41530 + }, + { + "epoch": 0.36050902335917223, + "grad_norm": 0.1689453125, + "learning_rate": 0.001305612368850651, + "loss": 0.0879, + "step": 41531 + }, + { + "epoch": 0.36051770383937637, + "grad_norm": 0.6640625, + "learning_rate": 0.0013055833933876097, + "loss": 0.1084, + "step": 41532 + }, + { + "epoch": 0.36052638431958056, + "grad_norm": 3.03125, + "learning_rate": 0.0013055544176997457, + "loss": 0.4355, + "step": 41533 + }, + { + "epoch": 0.3605350647997847, + "grad_norm": 0.3828125, + "learning_rate": 0.0013055254417870903, + "loss": 0.1055, + "step": 41534 + }, + { + "epoch": 0.3605437452799889, + 
"grad_norm": 0.2216796875, + "learning_rate": 0.0013054964656496752, + "loss": 0.1162, + "step": 41535 + }, + { + "epoch": 0.36055242576019303, + "grad_norm": 0.40625, + "learning_rate": 0.0013054674892875324, + "loss": 0.127, + "step": 41536 + }, + { + "epoch": 0.3605611062403972, + "grad_norm": 0.1806640625, + "learning_rate": 0.0013054385127006934, + "loss": 0.1484, + "step": 41537 + }, + { + "epoch": 0.36056978672060136, + "grad_norm": 0.1982421875, + "learning_rate": 0.00130540953588919, + "loss": 0.0938, + "step": 41538 + }, + { + "epoch": 0.36057846720080555, + "grad_norm": 0.35546875, + "learning_rate": 0.0013053805588530538, + "loss": 0.0781, + "step": 41539 + }, + { + "epoch": 0.3605871476810097, + "grad_norm": 0.9375, + "learning_rate": 0.0013053515815923168, + "loss": 0.0986, + "step": 41540 + }, + { + "epoch": 0.3605958281612139, + "grad_norm": 0.43359375, + "learning_rate": 0.0013053226041070101, + "loss": 0.1084, + "step": 41541 + }, + { + "epoch": 0.360604508641418, + "grad_norm": 0.0859375, + "learning_rate": 0.0013052936263971654, + "loss": 0.0908, + "step": 41542 + }, + { + "epoch": 0.3606131891216222, + "grad_norm": 0.1337890625, + "learning_rate": 0.001305264648462815, + "loss": 0.0806, + "step": 41543 + }, + { + "epoch": 0.36062186960182635, + "grad_norm": 0.462890625, + "learning_rate": 0.0013052356703039903, + "loss": 0.1465, + "step": 41544 + }, + { + "epoch": 0.36063055008203054, + "grad_norm": 0.228515625, + "learning_rate": 0.0013052066919207231, + "loss": 0.1406, + "step": 41545 + }, + { + "epoch": 0.3606392305622347, + "grad_norm": 0.09228515625, + "learning_rate": 0.0013051777133130445, + "loss": 0.1035, + "step": 41546 + }, + { + "epoch": 0.3606479110424389, + "grad_norm": 0.5859375, + "learning_rate": 0.0013051487344809868, + "loss": 0.1045, + "step": 41547 + }, + { + "epoch": 0.360656591522643, + "grad_norm": 0.11474609375, + "learning_rate": 0.0013051197554245816, + "loss": 0.0908, + "step": 41548 + }, + { + "epoch": 0.3606652720028472, + "grad_norm": 0.61328125, + "learning_rate": 0.0013050907761438606, + "loss": 0.0967, + "step": 41549 + }, + { + "epoch": 0.36067395248305134, + "grad_norm": 0.22265625, + "learning_rate": 0.0013050617966388553, + "loss": 0.104, + "step": 41550 + }, + { + "epoch": 0.36068263296325553, + "grad_norm": 0.1416015625, + "learning_rate": 0.0013050328169095975, + "loss": 0.0845, + "step": 41551 + }, + { + "epoch": 0.36069131344345967, + "grad_norm": 0.2294921875, + "learning_rate": 0.001305003836956119, + "loss": 0.0703, + "step": 41552 + }, + { + "epoch": 0.36069999392366386, + "grad_norm": 0.1259765625, + "learning_rate": 0.0013049748567784513, + "loss": 0.0762, + "step": 41553 + }, + { + "epoch": 0.360708674403868, + "grad_norm": 0.08740234375, + "learning_rate": 0.0013049458763766263, + "loss": 0.0674, + "step": 41554 + }, + { + "epoch": 0.3607173548840722, + "grad_norm": 0.70703125, + "learning_rate": 0.0013049168957506752, + "loss": 0.1221, + "step": 41555 + }, + { + "epoch": 0.36072603536427633, + "grad_norm": 0.59765625, + "learning_rate": 0.0013048879149006303, + "loss": 0.0967, + "step": 41556 + }, + { + "epoch": 0.3607347158444805, + "grad_norm": 0.326171875, + "learning_rate": 0.0013048589338265232, + "loss": 0.1104, + "step": 41557 + }, + { + "epoch": 0.36074339632468466, + "grad_norm": 0.2265625, + "learning_rate": 0.0013048299525283851, + "loss": 0.1084, + "step": 41558 + }, + { + "epoch": 0.36075207680488885, + "grad_norm": 0.1328125, + "learning_rate": 0.0013048009710062483, + "loss": 0.0913, + "step": 41559 + 
}, + { + "epoch": 0.360760757285093, + "grad_norm": 0.21484375, + "learning_rate": 0.0013047719892601442, + "loss": 0.1143, + "step": 41560 + }, + { + "epoch": 0.3607694377652972, + "grad_norm": 0.29296875, + "learning_rate": 0.0013047430072901046, + "loss": 0.1211, + "step": 41561 + }, + { + "epoch": 0.3607781182455013, + "grad_norm": 0.4765625, + "learning_rate": 0.0013047140250961608, + "loss": 0.0996, + "step": 41562 + }, + { + "epoch": 0.3607867987257055, + "grad_norm": 0.23046875, + "learning_rate": 0.001304685042678345, + "loss": 0.0742, + "step": 41563 + }, + { + "epoch": 0.36079547920590965, + "grad_norm": 0.3984375, + "learning_rate": 0.0013046560600366891, + "loss": 0.1069, + "step": 41564 + }, + { + "epoch": 0.36080415968611385, + "grad_norm": 0.0947265625, + "learning_rate": 0.001304627077171224, + "loss": 0.1016, + "step": 41565 + }, + { + "epoch": 0.360812840166318, + "grad_norm": 0.08837890625, + "learning_rate": 0.001304598094081982, + "loss": 0.0869, + "step": 41566 + }, + { + "epoch": 0.3608215206465222, + "grad_norm": 0.455078125, + "learning_rate": 0.0013045691107689945, + "loss": 0.0923, + "step": 41567 + }, + { + "epoch": 0.3608302011267263, + "grad_norm": 0.44921875, + "learning_rate": 0.001304540127232293, + "loss": 0.0884, + "step": 41568 + }, + { + "epoch": 0.3608388816069305, + "grad_norm": 0.474609375, + "learning_rate": 0.0013045111434719099, + "loss": 0.1084, + "step": 41569 + }, + { + "epoch": 0.36084756208713464, + "grad_norm": 0.306640625, + "learning_rate": 0.0013044821594878765, + "loss": 0.0967, + "step": 41570 + }, + { + "epoch": 0.36085624256733884, + "grad_norm": 0.224609375, + "learning_rate": 0.0013044531752802243, + "loss": 0.085, + "step": 41571 + }, + { + "epoch": 0.360864923047543, + "grad_norm": 0.380859375, + "learning_rate": 0.0013044241908489848, + "loss": 0.1162, + "step": 41572 + }, + { + "epoch": 0.36087360352774717, + "grad_norm": 0.0927734375, + "learning_rate": 0.0013043952061941907, + "loss": 0.1084, + "step": 41573 + }, + { + "epoch": 0.3608822840079513, + "grad_norm": 0.48046875, + "learning_rate": 0.001304366221315873, + "loss": 0.1064, + "step": 41574 + }, + { + "epoch": 0.3608909644881555, + "grad_norm": 0.56640625, + "learning_rate": 0.0013043372362140633, + "loss": 0.0962, + "step": 41575 + }, + { + "epoch": 0.36089964496835963, + "grad_norm": 0.16015625, + "learning_rate": 0.0013043082508887936, + "loss": 0.0942, + "step": 41576 + }, + { + "epoch": 0.3609083254485638, + "grad_norm": 0.2353515625, + "learning_rate": 0.001304279265340095, + "loss": 0.0708, + "step": 41577 + }, + { + "epoch": 0.36091700592876796, + "grad_norm": 0.318359375, + "learning_rate": 0.0013042502795680001, + "loss": 0.2168, + "step": 41578 + }, + { + "epoch": 0.36092568640897216, + "grad_norm": 0.275390625, + "learning_rate": 0.0013042212935725401, + "loss": 0.1162, + "step": 41579 + }, + { + "epoch": 0.3609343668891763, + "grad_norm": 0.28125, + "learning_rate": 0.0013041923073537469, + "loss": 0.0986, + "step": 41580 + }, + { + "epoch": 0.3609430473693805, + "grad_norm": 0.353515625, + "learning_rate": 0.001304163320911652, + "loss": 0.1416, + "step": 41581 + }, + { + "epoch": 0.3609517278495846, + "grad_norm": 0.142578125, + "learning_rate": 0.001304134334246287, + "loss": 0.0967, + "step": 41582 + }, + { + "epoch": 0.3609604083297888, + "grad_norm": 0.1533203125, + "learning_rate": 0.001304105347357684, + "loss": 0.0835, + "step": 41583 + }, + { + "epoch": 0.36096908880999296, + "grad_norm": 0.1083984375, + "learning_rate": 0.001304076360245874, + 
"loss": 0.1074, + "step": 41584 + }, + { + "epoch": 0.36097776929019715, + "grad_norm": 0.2109375, + "learning_rate": 0.0013040473729108897, + "loss": 0.1016, + "step": 41585 + }, + { + "epoch": 0.3609864497704013, + "grad_norm": 0.44921875, + "learning_rate": 0.001304018385352762, + "loss": 0.0913, + "step": 41586 + }, + { + "epoch": 0.3609951302506055, + "grad_norm": 0.1845703125, + "learning_rate": 0.0013039893975715226, + "loss": 0.1123, + "step": 41587 + }, + { + "epoch": 0.3610038107308096, + "grad_norm": 0.1865234375, + "learning_rate": 0.0013039604095672038, + "loss": 0.0928, + "step": 41588 + }, + { + "epoch": 0.3610124912110138, + "grad_norm": 0.1015625, + "learning_rate": 0.001303931421339837, + "loss": 0.0996, + "step": 41589 + }, + { + "epoch": 0.36102117169121795, + "grad_norm": 0.283203125, + "learning_rate": 0.0013039024328894535, + "loss": 0.1133, + "step": 41590 + }, + { + "epoch": 0.36102985217142214, + "grad_norm": 0.140625, + "learning_rate": 0.0013038734442160859, + "loss": 0.083, + "step": 41591 + }, + { + "epoch": 0.3610385326516263, + "grad_norm": 0.29296875, + "learning_rate": 0.001303844455319765, + "loss": 0.0938, + "step": 41592 + }, + { + "epoch": 0.36104721313183047, + "grad_norm": 0.400390625, + "learning_rate": 0.0013038154662005231, + "loss": 0.1514, + "step": 41593 + }, + { + "epoch": 0.3610558936120346, + "grad_norm": 0.1455078125, + "learning_rate": 0.0013037864768583915, + "loss": 0.1123, + "step": 41594 + }, + { + "epoch": 0.3610645740922388, + "grad_norm": 0.2314453125, + "learning_rate": 0.0013037574872934018, + "loss": 0.0859, + "step": 41595 + }, + { + "epoch": 0.36107325457244294, + "grad_norm": 0.2158203125, + "learning_rate": 0.0013037284975055863, + "loss": 0.1309, + "step": 41596 + }, + { + "epoch": 0.36108193505264713, + "grad_norm": 0.25, + "learning_rate": 0.0013036995074949766, + "loss": 0.0737, + "step": 41597 + }, + { + "epoch": 0.36109061553285127, + "grad_norm": 0.08251953125, + "learning_rate": 0.0013036705172616034, + "loss": 0.0894, + "step": 41598 + }, + { + "epoch": 0.36109929601305546, + "grad_norm": 0.1533203125, + "learning_rate": 0.0013036415268055, + "loss": 0.0986, + "step": 41599 + }, + { + "epoch": 0.3611079764932596, + "grad_norm": 0.11279296875, + "learning_rate": 0.001303612536126697, + "loss": 0.0801, + "step": 41600 + }, + { + "epoch": 0.3611166569734638, + "grad_norm": 0.45703125, + "learning_rate": 0.0013035835452252262, + "loss": 0.1211, + "step": 41601 + }, + { + "epoch": 0.3611253374536679, + "grad_norm": 0.09619140625, + "learning_rate": 0.00130355455410112, + "loss": 0.0889, + "step": 41602 + }, + { + "epoch": 0.3611340179338721, + "grad_norm": 0.36328125, + "learning_rate": 0.001303525562754409, + "loss": 0.1191, + "step": 41603 + }, + { + "epoch": 0.36114269841407626, + "grad_norm": 0.26171875, + "learning_rate": 0.0013034965711851256, + "loss": 0.0977, + "step": 41604 + }, + { + "epoch": 0.36115137889428045, + "grad_norm": 0.578125, + "learning_rate": 0.0013034675793933018, + "loss": 0.1147, + "step": 41605 + }, + { + "epoch": 0.3611600593744846, + "grad_norm": 0.08984375, + "learning_rate": 0.0013034385873789687, + "loss": 0.0771, + "step": 41606 + }, + { + "epoch": 0.3611687398546888, + "grad_norm": 0.21484375, + "learning_rate": 0.0013034095951421582, + "loss": 0.0806, + "step": 41607 + }, + { + "epoch": 0.3611774203348929, + "grad_norm": 0.59765625, + "learning_rate": 0.0013033806026829019, + "loss": 0.1367, + "step": 41608 + }, + { + "epoch": 0.3611861008150971, + "grad_norm": 1.65625, + 
"learning_rate": 0.0013033516100012317, + "loss": 0.1719, + "step": 41609 + }, + { + "epoch": 0.36119478129530125, + "grad_norm": 0.09375, + "learning_rate": 0.0013033226170971796, + "loss": 0.1245, + "step": 41610 + }, + { + "epoch": 0.36120346177550544, + "grad_norm": 0.1689453125, + "learning_rate": 0.0013032936239707764, + "loss": 0.1016, + "step": 41611 + }, + { + "epoch": 0.3612121422557096, + "grad_norm": 0.13671875, + "learning_rate": 0.0013032646306220545, + "loss": 0.0947, + "step": 41612 + }, + { + "epoch": 0.36122082273591377, + "grad_norm": 0.1533203125, + "learning_rate": 0.0013032356370510458, + "loss": 0.1152, + "step": 41613 + }, + { + "epoch": 0.3612295032161179, + "grad_norm": 0.49609375, + "learning_rate": 0.0013032066432577811, + "loss": 0.1001, + "step": 41614 + }, + { + "epoch": 0.3612381836963221, + "grad_norm": 0.236328125, + "learning_rate": 0.001303177649242293, + "loss": 0.1133, + "step": 41615 + }, + { + "epoch": 0.36124686417652624, + "grad_norm": 0.546875, + "learning_rate": 0.001303148655004613, + "loss": 0.0815, + "step": 41616 + }, + { + "epoch": 0.36125554465673043, + "grad_norm": 0.337890625, + "learning_rate": 0.0013031196605447725, + "loss": 0.1367, + "step": 41617 + }, + { + "epoch": 0.36126422513693457, + "grad_norm": 0.08154296875, + "learning_rate": 0.0013030906658628035, + "loss": 0.125, + "step": 41618 + }, + { + "epoch": 0.36127290561713876, + "grad_norm": 0.310546875, + "learning_rate": 0.0013030616709587375, + "loss": 0.1338, + "step": 41619 + }, + { + "epoch": 0.3612815860973429, + "grad_norm": 0.1669921875, + "learning_rate": 0.001303032675832606, + "loss": 0.0903, + "step": 41620 + }, + { + "epoch": 0.3612902665775471, + "grad_norm": 0.19140625, + "learning_rate": 0.0013030036804844416, + "loss": 0.1104, + "step": 41621 + }, + { + "epoch": 0.36129894705775123, + "grad_norm": 0.10009765625, + "learning_rate": 0.0013029746849142752, + "loss": 0.0996, + "step": 41622 + }, + { + "epoch": 0.3613076275379554, + "grad_norm": 0.177734375, + "learning_rate": 0.0013029456891221385, + "loss": 0.0913, + "step": 41623 + }, + { + "epoch": 0.36131630801815956, + "grad_norm": 0.177734375, + "learning_rate": 0.0013029166931080637, + "loss": 0.0986, + "step": 41624 + }, + { + "epoch": 0.36132498849836375, + "grad_norm": 0.26171875, + "learning_rate": 0.001302887696872082, + "loss": 0.0791, + "step": 41625 + }, + { + "epoch": 0.3613336689785679, + "grad_norm": 1.328125, + "learning_rate": 0.0013028587004142256, + "loss": 0.3125, + "step": 41626 + }, + { + "epoch": 0.3613423494587721, + "grad_norm": 0.0927734375, + "learning_rate": 0.001302829703734526, + "loss": 0.1172, + "step": 41627 + }, + { + "epoch": 0.3613510299389762, + "grad_norm": 0.083984375, + "learning_rate": 0.0013028007068330149, + "loss": 0.1279, + "step": 41628 + }, + { + "epoch": 0.3613597104191804, + "grad_norm": 0.1982421875, + "learning_rate": 0.0013027717097097238, + "loss": 0.0845, + "step": 41629 + }, + { + "epoch": 0.36136839089938455, + "grad_norm": 0.1806640625, + "learning_rate": 0.0013027427123646846, + "loss": 0.1025, + "step": 41630 + }, + { + "epoch": 0.36137707137958874, + "grad_norm": 0.306640625, + "learning_rate": 0.0013027137147979292, + "loss": 0.0967, + "step": 41631 + }, + { + "epoch": 0.3613857518597929, + "grad_norm": 0.5, + "learning_rate": 0.001302684717009489, + "loss": 0.126, + "step": 41632 + }, + { + "epoch": 0.3613944323399971, + "grad_norm": 0.19921875, + "learning_rate": 0.001302655718999396, + "loss": 0.0596, + "step": 41633 + }, + { + "epoch": 
0.3614031128202012, + "grad_norm": 0.076171875, + "learning_rate": 0.0013026267207676815, + "loss": 0.0576, + "step": 41634 + }, + { + "epoch": 0.3614117933004054, + "grad_norm": 0.1875, + "learning_rate": 0.001302597722314378, + "loss": 0.1318, + "step": 41635 + }, + { + "epoch": 0.36142047378060954, + "grad_norm": 0.376953125, + "learning_rate": 0.001302568723639516, + "loss": 0.106, + "step": 41636 + }, + { + "epoch": 0.36142915426081373, + "grad_norm": 0.15625, + "learning_rate": 0.001302539724743128, + "loss": 0.084, + "step": 41637 + }, + { + "epoch": 0.36143783474101787, + "grad_norm": 0.294921875, + "learning_rate": 0.0013025107256252459, + "loss": 0.0728, + "step": 41638 + }, + { + "epoch": 0.36144651522122206, + "grad_norm": 0.29296875, + "learning_rate": 0.001302481726285901, + "loss": 0.105, + "step": 41639 + }, + { + "epoch": 0.3614551957014262, + "grad_norm": 0.54296875, + "learning_rate": 0.0013024527267251251, + "loss": 0.085, + "step": 41640 + }, + { + "epoch": 0.3614638761816304, + "grad_norm": 0.3984375, + "learning_rate": 0.0013024237269429499, + "loss": 0.1465, + "step": 41641 + }, + { + "epoch": 0.36147255666183453, + "grad_norm": 0.3125, + "learning_rate": 0.0013023947269394072, + "loss": 0.0918, + "step": 41642 + }, + { + "epoch": 0.3614812371420387, + "grad_norm": 0.1484375, + "learning_rate": 0.0013023657267145286, + "loss": 0.1074, + "step": 41643 + }, + { + "epoch": 0.36148991762224286, + "grad_norm": 0.318359375, + "learning_rate": 0.001302336726268346, + "loss": 0.1084, + "step": 41644 + }, + { + "epoch": 0.361498598102447, + "grad_norm": 0.1533203125, + "learning_rate": 0.0013023077256008912, + "loss": 0.0728, + "step": 41645 + }, + { + "epoch": 0.3615072785826512, + "grad_norm": 0.140625, + "learning_rate": 0.0013022787247121952, + "loss": 0.1064, + "step": 41646 + }, + { + "epoch": 0.36151595906285533, + "grad_norm": 0.380859375, + "learning_rate": 0.0013022497236022906, + "loss": 0.1001, + "step": 41647 + }, + { + "epoch": 0.3615246395430595, + "grad_norm": 0.0830078125, + "learning_rate": 0.0013022207222712088, + "loss": 0.0928, + "step": 41648 + }, + { + "epoch": 0.36153332002326366, + "grad_norm": 0.279296875, + "learning_rate": 0.001302191720718981, + "loss": 0.0884, + "step": 41649 + }, + { + "epoch": 0.36154200050346785, + "grad_norm": 0.443359375, + "learning_rate": 0.0013021627189456397, + "loss": 0.1338, + "step": 41650 + }, + { + "epoch": 0.361550680983672, + "grad_norm": 0.6484375, + "learning_rate": 0.001302133716951216, + "loss": 0.1104, + "step": 41651 + }, + { + "epoch": 0.3615593614638762, + "grad_norm": 0.25390625, + "learning_rate": 0.0013021047147357423, + "loss": 0.0913, + "step": 41652 + }, + { + "epoch": 0.3615680419440803, + "grad_norm": 0.2109375, + "learning_rate": 0.00130207571229925, + "loss": 0.1157, + "step": 41653 + }, + { + "epoch": 0.3615767224242845, + "grad_norm": 0.27734375, + "learning_rate": 0.0013020467096417703, + "loss": 0.083, + "step": 41654 + }, + { + "epoch": 0.36158540290448865, + "grad_norm": 0.30078125, + "learning_rate": 0.0013020177067633357, + "loss": 0.1113, + "step": 41655 + }, + { + "epoch": 0.36159408338469284, + "grad_norm": 0.275390625, + "learning_rate": 0.0013019887036639776, + "loss": 0.1133, + "step": 41656 + }, + { + "epoch": 0.361602763864897, + "grad_norm": 0.2236328125, + "learning_rate": 0.0013019597003437273, + "loss": 0.1016, + "step": 41657 + }, + { + "epoch": 0.3616114443451012, + "grad_norm": 0.2451171875, + "learning_rate": 0.0013019306968026173, + "loss": 0.0703, + "step": 41658 + }, + 
{ + "epoch": 0.3616201248253053, + "grad_norm": 0.07421875, + "learning_rate": 0.0013019016930406785, + "loss": 0.104, + "step": 41659 + }, + { + "epoch": 0.3616288053055095, + "grad_norm": 0.1865234375, + "learning_rate": 0.0013018726890579432, + "loss": 0.0918, + "step": 41660 + }, + { + "epoch": 0.36163748578571364, + "grad_norm": 0.27734375, + "learning_rate": 0.0013018436848544431, + "loss": 0.0986, + "step": 41661 + }, + { + "epoch": 0.36164616626591783, + "grad_norm": 0.07470703125, + "learning_rate": 0.0013018146804302099, + "loss": 0.1035, + "step": 41662 + }, + { + "epoch": 0.36165484674612197, + "grad_norm": 0.65234375, + "learning_rate": 0.001301785675785275, + "loss": 0.125, + "step": 41663 + }, + { + "epoch": 0.36166352722632616, + "grad_norm": 0.36328125, + "learning_rate": 0.0013017566709196701, + "loss": 0.0977, + "step": 41664 + }, + { + "epoch": 0.3616722077065303, + "grad_norm": 0.107421875, + "learning_rate": 0.0013017276658334274, + "loss": 0.106, + "step": 41665 + }, + { + "epoch": 0.3616808881867345, + "grad_norm": 0.10302734375, + "learning_rate": 0.0013016986605265783, + "loss": 0.1064, + "step": 41666 + }, + { + "epoch": 0.36168956866693863, + "grad_norm": 0.15234375, + "learning_rate": 0.0013016696549991546, + "loss": 0.0835, + "step": 41667 + }, + { + "epoch": 0.3616982491471428, + "grad_norm": 0.1328125, + "learning_rate": 0.0013016406492511881, + "loss": 0.1025, + "step": 41668 + }, + { + "epoch": 0.36170692962734696, + "grad_norm": 0.4140625, + "learning_rate": 0.0013016116432827103, + "loss": 0.1006, + "step": 41669 + }, + { + "epoch": 0.36171561010755116, + "grad_norm": 0.294921875, + "learning_rate": 0.001301582637093753, + "loss": 0.0713, + "step": 41670 + }, + { + "epoch": 0.3617242905877553, + "grad_norm": 0.4453125, + "learning_rate": 0.001301553630684348, + "loss": 0.1006, + "step": 41671 + }, + { + "epoch": 0.3617329710679595, + "grad_norm": 0.349609375, + "learning_rate": 0.0013015246240545269, + "loss": 0.1221, + "step": 41672 + }, + { + "epoch": 0.3617416515481636, + "grad_norm": 0.111328125, + "learning_rate": 0.0013014956172043217, + "loss": 0.1001, + "step": 41673 + }, + { + "epoch": 0.3617503320283678, + "grad_norm": 0.349609375, + "learning_rate": 0.0013014666101337639, + "loss": 0.1201, + "step": 41674 + }, + { + "epoch": 0.36175901250857195, + "grad_norm": 0.259765625, + "learning_rate": 0.0013014376028428851, + "loss": 0.0903, + "step": 41675 + }, + { + "epoch": 0.36176769298877615, + "grad_norm": 0.1767578125, + "learning_rate": 0.001301408595331717, + "loss": 0.0698, + "step": 41676 + }, + { + "epoch": 0.3617763734689803, + "grad_norm": 0.119140625, + "learning_rate": 0.001301379587600292, + "loss": 0.1504, + "step": 41677 + }, + { + "epoch": 0.3617850539491845, + "grad_norm": 0.482421875, + "learning_rate": 0.001301350579648641, + "loss": 0.125, + "step": 41678 + }, + { + "epoch": 0.3617937344293886, + "grad_norm": 0.18359375, + "learning_rate": 0.001301321571476796, + "loss": 0.1738, + "step": 41679 + }, + { + "epoch": 0.3618024149095928, + "grad_norm": 0.1376953125, + "learning_rate": 0.0013012925630847888, + "loss": 0.1318, + "step": 41680 + }, + { + "epoch": 0.36181109538979694, + "grad_norm": 0.6171875, + "learning_rate": 0.001301263554472651, + "loss": 0.0806, + "step": 41681 + }, + { + "epoch": 0.36181977587000114, + "grad_norm": 0.10498046875, + "learning_rate": 0.0013012345456404144, + "loss": 0.104, + "step": 41682 + }, + { + "epoch": 0.3618284563502053, + "grad_norm": 0.1025390625, + "learning_rate": 0.0013012055365881111, + 
"loss": 0.1064, + "step": 41683 + }, + { + "epoch": 0.36183713683040947, + "grad_norm": 0.28515625, + "learning_rate": 0.001301176527315772, + "loss": 0.1885, + "step": 41684 + }, + { + "epoch": 0.3618458173106136, + "grad_norm": 0.1845703125, + "learning_rate": 0.0013011475178234294, + "loss": 0.0957, + "step": 41685 + }, + { + "epoch": 0.3618544977908178, + "grad_norm": 0.07373046875, + "learning_rate": 0.0013011185081111148, + "loss": 0.0786, + "step": 41686 + }, + { + "epoch": 0.36186317827102193, + "grad_norm": 0.27734375, + "learning_rate": 0.0013010894981788602, + "loss": 0.0874, + "step": 41687 + }, + { + "epoch": 0.3618718587512261, + "grad_norm": 0.0869140625, + "learning_rate": 0.0013010604880266973, + "loss": 0.0913, + "step": 41688 + }, + { + "epoch": 0.36188053923143026, + "grad_norm": 0.51171875, + "learning_rate": 0.0013010314776546572, + "loss": 0.1377, + "step": 41689 + }, + { + "epoch": 0.36188921971163446, + "grad_norm": 0.173828125, + "learning_rate": 0.0013010024670627725, + "loss": 0.0913, + "step": 41690 + }, + { + "epoch": 0.3618979001918386, + "grad_norm": 0.1962890625, + "learning_rate": 0.0013009734562510742, + "loss": 0.1079, + "step": 41691 + }, + { + "epoch": 0.3619065806720428, + "grad_norm": 0.09423828125, + "learning_rate": 0.0013009444452195948, + "loss": 0.1035, + "step": 41692 + }, + { + "epoch": 0.3619152611522469, + "grad_norm": 0.09619140625, + "learning_rate": 0.0013009154339683649, + "loss": 0.105, + "step": 41693 + }, + { + "epoch": 0.3619239416324511, + "grad_norm": 0.1708984375, + "learning_rate": 0.0013008864224974173, + "loss": 0.0874, + "step": 41694 + }, + { + "epoch": 0.36193262211265526, + "grad_norm": 0.09375, + "learning_rate": 0.0013008574108067834, + "loss": 0.1064, + "step": 41695 + }, + { + "epoch": 0.36194130259285945, + "grad_norm": 0.255859375, + "learning_rate": 0.001300828398896495, + "loss": 0.1426, + "step": 41696 + }, + { + "epoch": 0.3619499830730636, + "grad_norm": 0.1552734375, + "learning_rate": 0.0013007993867665834, + "loss": 0.0742, + "step": 41697 + }, + { + "epoch": 0.3619586635532678, + "grad_norm": 0.1513671875, + "learning_rate": 0.0013007703744170806, + "loss": 0.0947, + "step": 41698 + }, + { + "epoch": 0.3619673440334719, + "grad_norm": 1.203125, + "learning_rate": 0.0013007413618480186, + "loss": 0.1367, + "step": 41699 + }, + { + "epoch": 0.3619760245136761, + "grad_norm": 0.6640625, + "learning_rate": 0.0013007123490594284, + "loss": 0.0811, + "step": 41700 + }, + { + "epoch": 0.36198470499388025, + "grad_norm": 0.2734375, + "learning_rate": 0.0013006833360513425, + "loss": 0.1001, + "step": 41701 + }, + { + "epoch": 0.36199338547408444, + "grad_norm": 0.185546875, + "learning_rate": 0.001300654322823792, + "loss": 0.1016, + "step": 41702 + }, + { + "epoch": 0.3620020659542886, + "grad_norm": 0.1953125, + "learning_rate": 0.0013006253093768095, + "loss": 0.1484, + "step": 41703 + }, + { + "epoch": 0.36201074643449277, + "grad_norm": 0.203125, + "learning_rate": 0.001300596295710426, + "loss": 0.1299, + "step": 41704 + }, + { + "epoch": 0.3620194269146969, + "grad_norm": 0.2041015625, + "learning_rate": 0.0013005672818246733, + "loss": 0.0942, + "step": 41705 + }, + { + "epoch": 0.3620281073949011, + "grad_norm": 0.330078125, + "learning_rate": 0.0013005382677195832, + "loss": 0.1367, + "step": 41706 + }, + { + "epoch": 0.36203678787510524, + "grad_norm": 0.1884765625, + "learning_rate": 0.0013005092533951875, + "loss": 0.0703, + "step": 41707 + }, + { + "epoch": 0.36204546835530943, + "grad_norm": 
0.10107421875, + "learning_rate": 0.0013004802388515177, + "loss": 0.1133, + "step": 41708 + }, + { + "epoch": 0.36205414883551357, + "grad_norm": 0.291015625, + "learning_rate": 0.001300451224088606, + "loss": 0.168, + "step": 41709 + }, + { + "epoch": 0.36206282931571776, + "grad_norm": 0.224609375, + "learning_rate": 0.001300422209106484, + "loss": 0.0859, + "step": 41710 + }, + { + "epoch": 0.3620715097959219, + "grad_norm": 0.31640625, + "learning_rate": 0.0013003931939051824, + "loss": 0.0962, + "step": 41711 + }, + { + "epoch": 0.3620801902761261, + "grad_norm": 0.34765625, + "learning_rate": 0.0013003641784847348, + "loss": 0.0874, + "step": 41712 + }, + { + "epoch": 0.36208887075633023, + "grad_norm": 0.60546875, + "learning_rate": 0.0013003351628451717, + "loss": 0.0928, + "step": 41713 + }, + { + "epoch": 0.3620975512365344, + "grad_norm": 0.154296875, + "learning_rate": 0.0013003061469865252, + "loss": 0.0903, + "step": 41714 + }, + { + "epoch": 0.36210623171673856, + "grad_norm": 0.22265625, + "learning_rate": 0.0013002771309088266, + "loss": 0.1211, + "step": 41715 + }, + { + "epoch": 0.36211491219694275, + "grad_norm": 0.10302734375, + "learning_rate": 0.001300248114612108, + "loss": 0.0791, + "step": 41716 + }, + { + "epoch": 0.3621235926771469, + "grad_norm": 0.166015625, + "learning_rate": 0.001300219098096401, + "loss": 0.1182, + "step": 41717 + }, + { + "epoch": 0.3621322731573511, + "grad_norm": 0.451171875, + "learning_rate": 0.0013001900813617378, + "loss": 0.0962, + "step": 41718 + }, + { + "epoch": 0.3621409536375552, + "grad_norm": 0.408203125, + "learning_rate": 0.0013001610644081492, + "loss": 0.0981, + "step": 41719 + }, + { + "epoch": 0.3621496341177594, + "grad_norm": 0.90234375, + "learning_rate": 0.0013001320472356675, + "loss": 0.1367, + "step": 41720 + }, + { + "epoch": 0.36215831459796355, + "grad_norm": 0.083984375, + "learning_rate": 0.0013001030298443246, + "loss": 0.0664, + "step": 41721 + }, + { + "epoch": 0.36216699507816774, + "grad_norm": 0.267578125, + "learning_rate": 0.0013000740122341523, + "loss": 0.0869, + "step": 41722 + }, + { + "epoch": 0.3621756755583719, + "grad_norm": 0.330078125, + "learning_rate": 0.0013000449944051817, + "loss": 0.0991, + "step": 41723 + }, + { + "epoch": 0.36218435603857607, + "grad_norm": 0.0732421875, + "learning_rate": 0.0013000159763574448, + "loss": 0.0674, + "step": 41724 + }, + { + "epoch": 0.3621930365187802, + "grad_norm": 0.294921875, + "learning_rate": 0.0012999869580909737, + "loss": 0.0811, + "step": 41725 + }, + { + "epoch": 0.3622017169989844, + "grad_norm": 0.12255859375, + "learning_rate": 0.0012999579396057999, + "loss": 0.1045, + "step": 41726 + }, + { + "epoch": 0.36221039747918854, + "grad_norm": 0.2421875, + "learning_rate": 0.001299928920901955, + "loss": 0.1035, + "step": 41727 + }, + { + "epoch": 0.36221907795939273, + "grad_norm": 0.298828125, + "learning_rate": 0.0012998999019794711, + "loss": 0.1216, + "step": 41728 + }, + { + "epoch": 0.36222775843959687, + "grad_norm": 0.294921875, + "learning_rate": 0.0012998708828383789, + "loss": 0.0674, + "step": 41729 + }, + { + "epoch": 0.36223643891980106, + "grad_norm": 0.60546875, + "learning_rate": 0.0012998418634787116, + "loss": 0.1045, + "step": 41730 + }, + { + "epoch": 0.3622451194000052, + "grad_norm": 0.53125, + "learning_rate": 0.0012998128439005, + "loss": 0.0981, + "step": 41731 + }, + { + "epoch": 0.3622537998802094, + "grad_norm": 0.5078125, + "learning_rate": 0.001299783824103776, + "loss": 0.1084, + "step": 41732 + }, + { + 
"epoch": 0.36226248036041353, + "grad_norm": 0.17578125, + "learning_rate": 0.0012997548040885717, + "loss": 0.0781, + "step": 41733 + }, + { + "epoch": 0.3622711608406177, + "grad_norm": 0.1376953125, + "learning_rate": 0.0012997257838549185, + "loss": 0.1104, + "step": 41734 + }, + { + "epoch": 0.36227984132082186, + "grad_norm": 0.298828125, + "learning_rate": 0.0012996967634028482, + "loss": 0.1182, + "step": 41735 + }, + { + "epoch": 0.36228852180102605, + "grad_norm": 0.94140625, + "learning_rate": 0.0012996677427323923, + "loss": 0.0586, + "step": 41736 + }, + { + "epoch": 0.3622972022812302, + "grad_norm": 0.1865234375, + "learning_rate": 0.0012996387218435828, + "loss": 0.0889, + "step": 41737 + }, + { + "epoch": 0.3623058827614344, + "grad_norm": 0.16796875, + "learning_rate": 0.0012996097007364513, + "loss": 0.1406, + "step": 41738 + }, + { + "epoch": 0.3623145632416385, + "grad_norm": 0.6796875, + "learning_rate": 0.00129958067941103, + "loss": 0.1221, + "step": 41739 + }, + { + "epoch": 0.3623232437218427, + "grad_norm": 0.51953125, + "learning_rate": 0.00129955165786735, + "loss": 0.1045, + "step": 41740 + }, + { + "epoch": 0.36233192420204685, + "grad_norm": 0.10302734375, + "learning_rate": 0.0012995226361054433, + "loss": 0.103, + "step": 41741 + }, + { + "epoch": 0.36234060468225104, + "grad_norm": 0.09716796875, + "learning_rate": 0.0012994936141253418, + "loss": 0.0752, + "step": 41742 + }, + { + "epoch": 0.3623492851624552, + "grad_norm": 0.1630859375, + "learning_rate": 0.001299464591927077, + "loss": 0.0728, + "step": 41743 + }, + { + "epoch": 0.3623579656426594, + "grad_norm": 0.1474609375, + "learning_rate": 0.0012994355695106806, + "loss": 0.1084, + "step": 41744 + }, + { + "epoch": 0.3623666461228635, + "grad_norm": 0.33984375, + "learning_rate": 0.0012994065468761847, + "loss": 0.085, + "step": 41745 + }, + { + "epoch": 0.3623753266030677, + "grad_norm": 0.271484375, + "learning_rate": 0.0012993775240236205, + "loss": 0.0991, + "step": 41746 + }, + { + "epoch": 0.36238400708327184, + "grad_norm": 0.271484375, + "learning_rate": 0.0012993485009530205, + "loss": 0.0762, + "step": 41747 + }, + { + "epoch": 0.36239268756347603, + "grad_norm": 0.09521484375, + "learning_rate": 0.0012993194776644154, + "loss": 0.0801, + "step": 41748 + }, + { + "epoch": 0.36240136804368017, + "grad_norm": 0.27734375, + "learning_rate": 0.0012992904541578377, + "loss": 0.0933, + "step": 41749 + }, + { + "epoch": 0.36241004852388436, + "grad_norm": 0.23046875, + "learning_rate": 0.001299261430433319, + "loss": 0.0933, + "step": 41750 + }, + { + "epoch": 0.3624187290040885, + "grad_norm": 0.291015625, + "learning_rate": 0.0012992324064908909, + "loss": 0.0938, + "step": 41751 + }, + { + "epoch": 0.3624274094842927, + "grad_norm": 0.181640625, + "learning_rate": 0.0012992033823305855, + "loss": 0.1157, + "step": 41752 + }, + { + "epoch": 0.36243608996449683, + "grad_norm": 0.2236328125, + "learning_rate": 0.0012991743579524341, + "loss": 0.104, + "step": 41753 + }, + { + "epoch": 0.362444770444701, + "grad_norm": 0.76953125, + "learning_rate": 0.0012991453333564688, + "loss": 0.0898, + "step": 41754 + }, + { + "epoch": 0.36245345092490516, + "grad_norm": 0.353515625, + "learning_rate": 0.0012991163085427208, + "loss": 0.1494, + "step": 41755 + }, + { + "epoch": 0.36246213140510936, + "grad_norm": 0.10791015625, + "learning_rate": 0.0012990872835112227, + "loss": 0.1128, + "step": 41756 + }, + { + "epoch": 0.3624708118853135, + "grad_norm": 0.283203125, + "learning_rate": 
0.0012990582582620052, + "loss": 0.1021, + "step": 41757 + }, + { + "epoch": 0.3624794923655177, + "grad_norm": 0.55078125, + "learning_rate": 0.0012990292327951008, + "loss": 0.0967, + "step": 41758 + }, + { + "epoch": 0.3624881728457218, + "grad_norm": 0.23046875, + "learning_rate": 0.0012990002071105412, + "loss": 0.0693, + "step": 41759 + }, + { + "epoch": 0.362496853325926, + "grad_norm": 0.171875, + "learning_rate": 0.0012989711812083577, + "loss": 0.0869, + "step": 41760 + }, + { + "epoch": 0.36250553380613015, + "grad_norm": 0.3046875, + "learning_rate": 0.0012989421550885824, + "loss": 0.0898, + "step": 41761 + }, + { + "epoch": 0.36251421428633435, + "grad_norm": 0.212890625, + "learning_rate": 0.0012989131287512467, + "loss": 0.0845, + "step": 41762 + }, + { + "epoch": 0.3625228947665385, + "grad_norm": 0.1435546875, + "learning_rate": 0.0012988841021963828, + "loss": 0.1157, + "step": 41763 + }, + { + "epoch": 0.3625315752467427, + "grad_norm": 0.12060546875, + "learning_rate": 0.0012988550754240227, + "loss": 0.1177, + "step": 41764 + }, + { + "epoch": 0.3625402557269468, + "grad_norm": 0.51953125, + "learning_rate": 0.001298826048434197, + "loss": 0.0801, + "step": 41765 + }, + { + "epoch": 0.362548936207151, + "grad_norm": 0.48828125, + "learning_rate": 0.0012987970212269384, + "loss": 0.1035, + "step": 41766 + }, + { + "epoch": 0.36255761668735514, + "grad_norm": 0.2265625, + "learning_rate": 0.0012987679938022785, + "loss": 0.1157, + "step": 41767 + }, + { + "epoch": 0.36256629716755934, + "grad_norm": 0.337890625, + "learning_rate": 0.0012987389661602484, + "loss": 0.0791, + "step": 41768 + }, + { + "epoch": 0.3625749776477635, + "grad_norm": 0.5859375, + "learning_rate": 0.0012987099383008809, + "loss": 0.1113, + "step": 41769 + }, + { + "epoch": 0.3625836581279676, + "grad_norm": 0.1318359375, + "learning_rate": 0.001298680910224207, + "loss": 0.1055, + "step": 41770 + }, + { + "epoch": 0.3625923386081718, + "grad_norm": 0.095703125, + "learning_rate": 0.0012986518819302585, + "loss": 0.1191, + "step": 41771 + }, + { + "epoch": 0.36260101908837594, + "grad_norm": 0.5, + "learning_rate": 0.0012986228534190673, + "loss": 0.1143, + "step": 41772 + }, + { + "epoch": 0.36260969956858013, + "grad_norm": 0.22265625, + "learning_rate": 0.0012985938246906656, + "loss": 0.168, + "step": 41773 + }, + { + "epoch": 0.3626183800487843, + "grad_norm": 0.076171875, + "learning_rate": 0.0012985647957450843, + "loss": 0.0811, + "step": 41774 + }, + { + "epoch": 0.36262706052898847, + "grad_norm": 0.396484375, + "learning_rate": 0.0012985357665823553, + "loss": 0.127, + "step": 41775 + }, + { + "epoch": 0.3626357410091926, + "grad_norm": 0.08544921875, + "learning_rate": 0.0012985067372025113, + "loss": 0.0967, + "step": 41776 + }, + { + "epoch": 0.3626444214893968, + "grad_norm": 0.1650390625, + "learning_rate": 0.0012984777076055826, + "loss": 0.0845, + "step": 41777 + }, + { + "epoch": 0.36265310196960093, + "grad_norm": 0.1494140625, + "learning_rate": 0.001298448677791602, + "loss": 0.0913, + "step": 41778 + }, + { + "epoch": 0.3626617824498051, + "grad_norm": 0.474609375, + "learning_rate": 0.0012984196477606007, + "loss": 0.1123, + "step": 41779 + }, + { + "epoch": 0.36267046293000926, + "grad_norm": 0.486328125, + "learning_rate": 0.0012983906175126107, + "loss": 0.085, + "step": 41780 + }, + { + "epoch": 0.36267914341021346, + "grad_norm": 0.1787109375, + "learning_rate": 0.001298361587047664, + "loss": 0.127, + "step": 41781 + }, + { + "epoch": 0.3626878238904176, + "grad_norm": 
0.6015625, + "learning_rate": 0.0012983325563657915, + "loss": 0.1445, + "step": 41782 + }, + { + "epoch": 0.3626965043706218, + "grad_norm": 0.240234375, + "learning_rate": 0.0012983035254670262, + "loss": 0.0903, + "step": 41783 + }, + { + "epoch": 0.3627051848508259, + "grad_norm": 0.11181640625, + "learning_rate": 0.0012982744943513984, + "loss": 0.0977, + "step": 41784 + }, + { + "epoch": 0.3627138653310301, + "grad_norm": 0.390625, + "learning_rate": 0.001298245463018941, + "loss": 0.0811, + "step": 41785 + }, + { + "epoch": 0.36272254581123425, + "grad_norm": 0.1591796875, + "learning_rate": 0.001298216431469685, + "loss": 0.1387, + "step": 41786 + }, + { + "epoch": 0.36273122629143845, + "grad_norm": 0.51953125, + "learning_rate": 0.0012981873997036633, + "loss": 0.1226, + "step": 41787 + }, + { + "epoch": 0.3627399067716426, + "grad_norm": 0.1533203125, + "learning_rate": 0.0012981583677209058, + "loss": 0.1094, + "step": 41788 + }, + { + "epoch": 0.3627485872518468, + "grad_norm": 0.08056640625, + "learning_rate": 0.001298129335521446, + "loss": 0.1006, + "step": 41789 + }, + { + "epoch": 0.3627572677320509, + "grad_norm": 0.09228515625, + "learning_rate": 0.0012981003031053149, + "loss": 0.0693, + "step": 41790 + }, + { + "epoch": 0.3627659482122551, + "grad_norm": 0.400390625, + "learning_rate": 0.0012980712704725441, + "loss": 0.207, + "step": 41791 + }, + { + "epoch": 0.36277462869245924, + "grad_norm": 0.1611328125, + "learning_rate": 0.0012980422376231656, + "loss": 0.085, + "step": 41792 + }, + { + "epoch": 0.36278330917266344, + "grad_norm": 0.46484375, + "learning_rate": 0.001298013204557211, + "loss": 0.0938, + "step": 41793 + }, + { + "epoch": 0.3627919896528676, + "grad_norm": 0.439453125, + "learning_rate": 0.001297984171274712, + "loss": 0.0962, + "step": 41794 + }, + { + "epoch": 0.36280067013307177, + "grad_norm": 0.408203125, + "learning_rate": 0.001297955137775701, + "loss": 0.125, + "step": 41795 + }, + { + "epoch": 0.3628093506132759, + "grad_norm": 0.337890625, + "learning_rate": 0.0012979261040602086, + "loss": 0.0869, + "step": 41796 + }, + { + "epoch": 0.3628180310934801, + "grad_norm": 0.099609375, + "learning_rate": 0.0012978970701282675, + "loss": 0.1201, + "step": 41797 + }, + { + "epoch": 0.36282671157368424, + "grad_norm": 0.244140625, + "learning_rate": 0.0012978680359799091, + "loss": 0.0947, + "step": 41798 + }, + { + "epoch": 0.36283539205388843, + "grad_norm": 0.57421875, + "learning_rate": 0.0012978390016151653, + "loss": 0.083, + "step": 41799 + }, + { + "epoch": 0.36284407253409257, + "grad_norm": 0.7734375, + "learning_rate": 0.0012978099670340677, + "loss": 0.0938, + "step": 41800 + }, + { + "epoch": 0.36285275301429676, + "grad_norm": 0.15625, + "learning_rate": 0.0012977809322366478, + "loss": 0.1504, + "step": 41801 + }, + { + "epoch": 0.3628614334945009, + "grad_norm": 0.1953125, + "learning_rate": 0.001297751897222938, + "loss": 0.0864, + "step": 41802 + }, + { + "epoch": 0.3628701139747051, + "grad_norm": 0.462890625, + "learning_rate": 0.0012977228619929697, + "loss": 0.1025, + "step": 41803 + }, + { + "epoch": 0.3628787944549092, + "grad_norm": 0.51171875, + "learning_rate": 0.0012976938265467745, + "loss": 0.0967, + "step": 41804 + }, + { + "epoch": 0.3628874749351134, + "grad_norm": 0.09375, + "learning_rate": 0.0012976647908843845, + "loss": 0.0864, + "step": 41805 + }, + { + "epoch": 0.36289615541531756, + "grad_norm": 0.25, + "learning_rate": 0.0012976357550058307, + "loss": 0.1348, + "step": 41806 + }, + { + "epoch": 
0.36290483589552175, + "grad_norm": 0.447265625, + "learning_rate": 0.0012976067189111459, + "loss": 0.085, + "step": 41807 + }, + { + "epoch": 0.3629135163757259, + "grad_norm": 0.2021484375, + "learning_rate": 0.0012975776826003613, + "loss": 0.0771, + "step": 41808 + }, + { + "epoch": 0.3629221968559301, + "grad_norm": 0.12060546875, + "learning_rate": 0.0012975486460735086, + "loss": 0.0781, + "step": 41809 + }, + { + "epoch": 0.3629308773361342, + "grad_norm": 0.66796875, + "learning_rate": 0.0012975196093306198, + "loss": 0.1201, + "step": 41810 + }, + { + "epoch": 0.3629395578163384, + "grad_norm": 0.369140625, + "learning_rate": 0.0012974905723717265, + "loss": 0.1123, + "step": 41811 + }, + { + "epoch": 0.36294823829654255, + "grad_norm": 0.404296875, + "learning_rate": 0.0012974615351968603, + "loss": 0.1289, + "step": 41812 + }, + { + "epoch": 0.36295691877674674, + "grad_norm": 0.30859375, + "learning_rate": 0.0012974324978060534, + "loss": 0.1094, + "step": 41813 + }, + { + "epoch": 0.3629655992569509, + "grad_norm": 0.150390625, + "learning_rate": 0.0012974034601993371, + "loss": 0.1006, + "step": 41814 + }, + { + "epoch": 0.36297427973715507, + "grad_norm": 0.31640625, + "learning_rate": 0.0012973744223767434, + "loss": 0.1035, + "step": 41815 + }, + { + "epoch": 0.3629829602173592, + "grad_norm": 0.37890625, + "learning_rate": 0.001297345384338304, + "loss": 0.0684, + "step": 41816 + }, + { + "epoch": 0.3629916406975634, + "grad_norm": 0.21484375, + "learning_rate": 0.0012973163460840507, + "loss": 0.0928, + "step": 41817 + }, + { + "epoch": 0.36300032117776754, + "grad_norm": 0.138671875, + "learning_rate": 0.0012972873076140153, + "loss": 0.1035, + "step": 41818 + }, + { + "epoch": 0.36300900165797173, + "grad_norm": 0.1005859375, + "learning_rate": 0.001297258268928229, + "loss": 0.0938, + "step": 41819 + }, + { + "epoch": 0.36301768213817587, + "grad_norm": 0.546875, + "learning_rate": 0.0012972292300267242, + "loss": 0.0859, + "step": 41820 + }, + { + "epoch": 0.36302636261838006, + "grad_norm": 0.298828125, + "learning_rate": 0.001297200190909533, + "loss": 0.0933, + "step": 41821 + }, + { + "epoch": 0.3630350430985842, + "grad_norm": 0.11669921875, + "learning_rate": 0.0012971711515766865, + "loss": 0.1367, + "step": 41822 + }, + { + "epoch": 0.3630437235787884, + "grad_norm": 0.349609375, + "learning_rate": 0.0012971421120282162, + "loss": 0.168, + "step": 41823 + }, + { + "epoch": 0.36305240405899253, + "grad_norm": 0.427734375, + "learning_rate": 0.0012971130722641542, + "loss": 0.1118, + "step": 41824 + }, + { + "epoch": 0.3630610845391967, + "grad_norm": 0.26953125, + "learning_rate": 0.0012970840322845325, + "loss": 0.0991, + "step": 41825 + }, + { + "epoch": 0.36306976501940086, + "grad_norm": 0.5078125, + "learning_rate": 0.0012970549920893826, + "loss": 0.125, + "step": 41826 + }, + { + "epoch": 0.36307844549960505, + "grad_norm": 0.8203125, + "learning_rate": 0.001297025951678736, + "loss": 0.0977, + "step": 41827 + }, + { + "epoch": 0.3630871259798092, + "grad_norm": 0.93359375, + "learning_rate": 0.0012969969110526256, + "loss": 0.1465, + "step": 41828 + }, + { + "epoch": 0.3630958064600134, + "grad_norm": 0.1748046875, + "learning_rate": 0.0012969678702110818, + "loss": 0.1045, + "step": 41829 + }, + { + "epoch": 0.3631044869402175, + "grad_norm": 0.42578125, + "learning_rate": 0.0012969388291541371, + "loss": 0.0898, + "step": 41830 + }, + { + "epoch": 0.3631131674204217, + "grad_norm": 0.259765625, + "learning_rate": 0.001296909787881823, + "loss": 
0.0898, + "step": 41831 + }, + { + "epoch": 0.36312184790062585, + "grad_norm": 0.1689453125, + "learning_rate": 0.0012968807463941713, + "loss": 0.1172, + "step": 41832 + }, + { + "epoch": 0.36313052838083004, + "grad_norm": 0.06396484375, + "learning_rate": 0.0012968517046912137, + "loss": 0.0825, + "step": 41833 + }, + { + "epoch": 0.3631392088610342, + "grad_norm": 0.287109375, + "learning_rate": 0.001296822662772982, + "loss": 0.0898, + "step": 41834 + }, + { + "epoch": 0.3631478893412384, + "grad_norm": 0.1240234375, + "learning_rate": 0.0012967936206395082, + "loss": 0.0728, + "step": 41835 + }, + { + "epoch": 0.3631565698214425, + "grad_norm": 0.2080078125, + "learning_rate": 0.0012967645782908236, + "loss": 0.1069, + "step": 41836 + }, + { + "epoch": 0.3631652503016467, + "grad_norm": 0.330078125, + "learning_rate": 0.0012967355357269604, + "loss": 0.0957, + "step": 41837 + }, + { + "epoch": 0.36317393078185084, + "grad_norm": 0.337890625, + "learning_rate": 0.0012967064929479502, + "loss": 0.1084, + "step": 41838 + }, + { + "epoch": 0.36318261126205503, + "grad_norm": 0.275390625, + "learning_rate": 0.0012966774499538246, + "loss": 0.0742, + "step": 41839 + }, + { + "epoch": 0.36319129174225917, + "grad_norm": 0.25, + "learning_rate": 0.0012966484067446153, + "loss": 0.1162, + "step": 41840 + }, + { + "epoch": 0.36319997222246336, + "grad_norm": 0.125, + "learning_rate": 0.0012966193633203544, + "loss": 0.1074, + "step": 41841 + }, + { + "epoch": 0.3632086527026675, + "grad_norm": 0.18359375, + "learning_rate": 0.001296590319681074, + "loss": 0.0908, + "step": 41842 + }, + { + "epoch": 0.3632173331828717, + "grad_norm": 0.1103515625, + "learning_rate": 0.0012965612758268049, + "loss": 0.1108, + "step": 41843 + }, + { + "epoch": 0.36322601366307583, + "grad_norm": 0.322265625, + "learning_rate": 0.0012965322317575795, + "loss": 0.0747, + "step": 41844 + }, + { + "epoch": 0.36323469414328, + "grad_norm": 0.1708984375, + "learning_rate": 0.0012965031874734292, + "loss": 0.1328, + "step": 41845 + }, + { + "epoch": 0.36324337462348416, + "grad_norm": 0.1630859375, + "learning_rate": 0.001296474142974386, + "loss": 0.1245, + "step": 41846 + }, + { + "epoch": 0.36325205510368835, + "grad_norm": 0.2255859375, + "learning_rate": 0.0012964450982604818, + "loss": 0.1357, + "step": 41847 + }, + { + "epoch": 0.3632607355838925, + "grad_norm": 0.83984375, + "learning_rate": 0.0012964160533317482, + "loss": 0.1172, + "step": 41848 + }, + { + "epoch": 0.3632694160640967, + "grad_norm": 0.2373046875, + "learning_rate": 0.001296387008188217, + "loss": 0.0967, + "step": 41849 + }, + { + "epoch": 0.3632780965443008, + "grad_norm": 0.083984375, + "learning_rate": 0.0012963579628299197, + "loss": 0.0845, + "step": 41850 + }, + { + "epoch": 0.363286777024505, + "grad_norm": 0.2158203125, + "learning_rate": 0.0012963289172568883, + "loss": 0.0957, + "step": 41851 + }, + { + "epoch": 0.36329545750470915, + "grad_norm": 0.5078125, + "learning_rate": 0.001296299871469155, + "loss": 0.084, + "step": 41852 + }, + { + "epoch": 0.36330413798491334, + "grad_norm": 0.08349609375, + "learning_rate": 0.0012962708254667503, + "loss": 0.083, + "step": 41853 + }, + { + "epoch": 0.3633128184651175, + "grad_norm": 0.478515625, + "learning_rate": 0.0012962417792497071, + "loss": 0.1177, + "step": 41854 + }, + { + "epoch": 0.3633214989453217, + "grad_norm": 0.10009765625, + "learning_rate": 0.001296212732818057, + "loss": 0.0884, + "step": 41855 + }, + { + "epoch": 0.3633301794255258, + "grad_norm": 0.08984375, + 
"learning_rate": 0.0012961836861718318, + "loss": 0.0679, + "step": 41856 + }, + { + "epoch": 0.36333885990573, + "grad_norm": 0.07275390625, + "learning_rate": 0.0012961546393110625, + "loss": 0.084, + "step": 41857 + }, + { + "epoch": 0.36334754038593414, + "grad_norm": 0.072265625, + "learning_rate": 0.0012961255922357819, + "loss": 0.0825, + "step": 41858 + }, + { + "epoch": 0.36335622086613834, + "grad_norm": 0.28515625, + "learning_rate": 0.001296096544946021, + "loss": 0.0869, + "step": 41859 + }, + { + "epoch": 0.3633649013463425, + "grad_norm": 0.25, + "learning_rate": 0.0012960674974418125, + "loss": 0.1055, + "step": 41860 + }, + { + "epoch": 0.36337358182654667, + "grad_norm": 0.470703125, + "learning_rate": 0.0012960384497231869, + "loss": 0.1025, + "step": 41861 + }, + { + "epoch": 0.3633822623067508, + "grad_norm": 0.294921875, + "learning_rate": 0.0012960094017901767, + "loss": 0.0869, + "step": 41862 + }, + { + "epoch": 0.363390942786955, + "grad_norm": 0.49609375, + "learning_rate": 0.0012959803536428136, + "loss": 0.1426, + "step": 41863 + }, + { + "epoch": 0.36339962326715913, + "grad_norm": 0.1298828125, + "learning_rate": 0.0012959513052811294, + "loss": 0.0889, + "step": 41864 + }, + { + "epoch": 0.3634083037473633, + "grad_norm": 0.5546875, + "learning_rate": 0.0012959222567051555, + "loss": 0.1021, + "step": 41865 + }, + { + "epoch": 0.36341698422756746, + "grad_norm": 0.58984375, + "learning_rate": 0.0012958932079149242, + "loss": 0.1758, + "step": 41866 + }, + { + "epoch": 0.36342566470777166, + "grad_norm": 0.400390625, + "learning_rate": 0.0012958641589104671, + "loss": 0.1387, + "step": 41867 + }, + { + "epoch": 0.3634343451879758, + "grad_norm": 0.1748046875, + "learning_rate": 0.001295835109691816, + "loss": 0.1514, + "step": 41868 + }, + { + "epoch": 0.36344302566818, + "grad_norm": 0.1220703125, + "learning_rate": 0.0012958060602590025, + "loss": 0.0703, + "step": 41869 + }, + { + "epoch": 0.3634517061483841, + "grad_norm": 0.1083984375, + "learning_rate": 0.0012957770106120582, + "loss": 0.0884, + "step": 41870 + }, + { + "epoch": 0.3634603866285883, + "grad_norm": 0.421875, + "learning_rate": 0.0012957479607510155, + "loss": 0.1118, + "step": 41871 + }, + { + "epoch": 0.36346906710879245, + "grad_norm": 0.0703125, + "learning_rate": 0.0012957189106759053, + "loss": 0.0811, + "step": 41872 + }, + { + "epoch": 0.36347774758899665, + "grad_norm": 0.15234375, + "learning_rate": 0.0012956898603867601, + "loss": 0.1035, + "step": 41873 + }, + { + "epoch": 0.3634864280692008, + "grad_norm": 0.0986328125, + "learning_rate": 0.001295660809883611, + "loss": 0.1045, + "step": 41874 + }, + { + "epoch": 0.363495108549405, + "grad_norm": 0.10595703125, + "learning_rate": 0.0012956317591664907, + "loss": 0.0972, + "step": 41875 + }, + { + "epoch": 0.3635037890296091, + "grad_norm": 0.419921875, + "learning_rate": 0.0012956027082354305, + "loss": 0.0947, + "step": 41876 + }, + { + "epoch": 0.3635124695098133, + "grad_norm": 0.1005859375, + "learning_rate": 0.0012955736570904619, + "loss": 0.0732, + "step": 41877 + }, + { + "epoch": 0.36352114999001744, + "grad_norm": 0.11083984375, + "learning_rate": 0.001295544605731617, + "loss": 0.0957, + "step": 41878 + }, + { + "epoch": 0.36352983047022164, + "grad_norm": 0.1357421875, + "learning_rate": 0.001295515554158927, + "loss": 0.084, + "step": 41879 + }, + { + "epoch": 0.3635385109504258, + "grad_norm": 0.201171875, + "learning_rate": 0.0012954865023724245, + "loss": 0.1074, + "step": 41880 + }, + { + "epoch": 
0.36354719143062997, + "grad_norm": 0.65234375, + "learning_rate": 0.001295457450372141, + "loss": 0.106, + "step": 41881 + }, + { + "epoch": 0.3635558719108341, + "grad_norm": 0.0830078125, + "learning_rate": 0.001295428398158108, + "loss": 0.0869, + "step": 41882 + }, + { + "epoch": 0.3635645523910383, + "grad_norm": 0.388671875, + "learning_rate": 0.0012953993457303571, + "loss": 0.1445, + "step": 41883 + }, + { + "epoch": 0.36357323287124244, + "grad_norm": 0.09375, + "learning_rate": 0.001295370293088921, + "loss": 0.084, + "step": 41884 + }, + { + "epoch": 0.36358191335144663, + "grad_norm": 0.55078125, + "learning_rate": 0.0012953412402338306, + "loss": 0.1406, + "step": 41885 + }, + { + "epoch": 0.36359059383165077, + "grad_norm": 0.287109375, + "learning_rate": 0.0012953121871651183, + "loss": 0.1113, + "step": 41886 + }, + { + "epoch": 0.36359927431185496, + "grad_norm": 0.34765625, + "learning_rate": 0.0012952831338828156, + "loss": 0.0674, + "step": 41887 + }, + { + "epoch": 0.3636079547920591, + "grad_norm": 0.8359375, + "learning_rate": 0.0012952540803869534, + "loss": 0.0938, + "step": 41888 + }, + { + "epoch": 0.3636166352722633, + "grad_norm": 0.1171875, + "learning_rate": 0.001295225026677565, + "loss": 0.0791, + "step": 41889 + }, + { + "epoch": 0.3636253157524674, + "grad_norm": 0.345703125, + "learning_rate": 0.001295195972754681, + "loss": 0.1035, + "step": 41890 + }, + { + "epoch": 0.3636339962326716, + "grad_norm": 0.70703125, + "learning_rate": 0.001295166918618334, + "loss": 0.0913, + "step": 41891 + }, + { + "epoch": 0.36364267671287576, + "grad_norm": 0.25390625, + "learning_rate": 0.001295137864268555, + "loss": 0.1196, + "step": 41892 + }, + { + "epoch": 0.3636513571930799, + "grad_norm": 0.0859375, + "learning_rate": 0.0012951088097053762, + "loss": 0.0947, + "step": 41893 + }, + { + "epoch": 0.3636600376732841, + "grad_norm": 0.1748046875, + "learning_rate": 0.0012950797549288295, + "loss": 0.063, + "step": 41894 + }, + { + "epoch": 0.3636687181534882, + "grad_norm": 0.189453125, + "learning_rate": 0.0012950506999389467, + "loss": 0.0977, + "step": 41895 + }, + { + "epoch": 0.3636773986336924, + "grad_norm": 0.2265625, + "learning_rate": 0.001295021644735759, + "loss": 0.1172, + "step": 41896 + }, + { + "epoch": 0.36368607911389655, + "grad_norm": 0.318359375, + "learning_rate": 0.0012949925893192987, + "loss": 0.1074, + "step": 41897 + }, + { + "epoch": 0.36369475959410075, + "grad_norm": 0.20703125, + "learning_rate": 0.0012949635336895976, + "loss": 0.0889, + "step": 41898 + }, + { + "epoch": 0.3637034400743049, + "grad_norm": 0.6953125, + "learning_rate": 0.001294934477846687, + "loss": 0.1006, + "step": 41899 + }, + { + "epoch": 0.3637121205545091, + "grad_norm": 0.30859375, + "learning_rate": 0.0012949054217905993, + "loss": 0.1289, + "step": 41900 + }, + { + "epoch": 0.3637208010347132, + "grad_norm": 0.3515625, + "learning_rate": 0.0012948763655213657, + "loss": 0.1152, + "step": 41901 + }, + { + "epoch": 0.3637294815149174, + "grad_norm": 0.76171875, + "learning_rate": 0.0012948473090390183, + "loss": 0.123, + "step": 41902 + }, + { + "epoch": 0.36373816199512155, + "grad_norm": 0.400390625, + "learning_rate": 0.001294818252343589, + "loss": 0.1226, + "step": 41903 + }, + { + "epoch": 0.36374684247532574, + "grad_norm": 0.255859375, + "learning_rate": 0.0012947891954351093, + "loss": 0.1235, + "step": 41904 + }, + { + "epoch": 0.3637555229555299, + "grad_norm": 0.23046875, + "learning_rate": 0.001294760138313611, + "loss": 0.0879, + "step": 41905 + 
}, + { + "epoch": 0.36376420343573407, + "grad_norm": 0.107421875, + "learning_rate": 0.0012947310809791259, + "loss": 0.0962, + "step": 41906 + }, + { + "epoch": 0.3637728839159382, + "grad_norm": 0.7109375, + "learning_rate": 0.001294702023431686, + "loss": 0.083, + "step": 41907 + }, + { + "epoch": 0.3637815643961424, + "grad_norm": 0.46875, + "learning_rate": 0.0012946729656713228, + "loss": 0.1309, + "step": 41908 + }, + { + "epoch": 0.36379024487634654, + "grad_norm": 0.6875, + "learning_rate": 0.0012946439076980678, + "loss": 0.1309, + "step": 41909 + }, + { + "epoch": 0.36379892535655073, + "grad_norm": 0.1650390625, + "learning_rate": 0.0012946148495119535, + "loss": 0.1079, + "step": 41910 + }, + { + "epoch": 0.36380760583675487, + "grad_norm": 0.28125, + "learning_rate": 0.0012945857911130113, + "loss": 0.0562, + "step": 41911 + }, + { + "epoch": 0.36381628631695906, + "grad_norm": 0.1142578125, + "learning_rate": 0.001294556732501273, + "loss": 0.1348, + "step": 41912 + }, + { + "epoch": 0.3638249667971632, + "grad_norm": 0.10791015625, + "learning_rate": 0.0012945276736767706, + "loss": 0.1104, + "step": 41913 + }, + { + "epoch": 0.3638336472773674, + "grad_norm": 0.267578125, + "learning_rate": 0.0012944986146395353, + "loss": 0.1016, + "step": 41914 + }, + { + "epoch": 0.3638423277575715, + "grad_norm": 0.169921875, + "learning_rate": 0.0012944695553895992, + "loss": 0.084, + "step": 41915 + }, + { + "epoch": 0.3638510082377757, + "grad_norm": 0.296875, + "learning_rate": 0.0012944404959269943, + "loss": 0.125, + "step": 41916 + }, + { + "epoch": 0.36385968871797986, + "grad_norm": 0.2197265625, + "learning_rate": 0.0012944114362517525, + "loss": 0.0928, + "step": 41917 + }, + { + "epoch": 0.36386836919818405, + "grad_norm": 0.478515625, + "learning_rate": 0.0012943823763639048, + "loss": 0.1475, + "step": 41918 + }, + { + "epoch": 0.3638770496783882, + "grad_norm": 0.337890625, + "learning_rate": 0.0012943533162634838, + "loss": 0.1211, + "step": 41919 + }, + { + "epoch": 0.3638857301585924, + "grad_norm": 0.314453125, + "learning_rate": 0.0012943242559505209, + "loss": 0.082, + "step": 41920 + }, + { + "epoch": 0.3638944106387965, + "grad_norm": 0.1552734375, + "learning_rate": 0.0012942951954250477, + "loss": 0.0879, + "step": 41921 + }, + { + "epoch": 0.3639030911190007, + "grad_norm": 0.5390625, + "learning_rate": 0.0012942661346870962, + "loss": 0.1045, + "step": 41922 + }, + { + "epoch": 0.36391177159920485, + "grad_norm": 0.328125, + "learning_rate": 0.001294237073736698, + "loss": 0.1147, + "step": 41923 + }, + { + "epoch": 0.36392045207940904, + "grad_norm": 0.08154296875, + "learning_rate": 0.0012942080125738855, + "loss": 0.0986, + "step": 41924 + }, + { + "epoch": 0.3639291325596132, + "grad_norm": 0.427734375, + "learning_rate": 0.0012941789511986897, + "loss": 0.1074, + "step": 41925 + }, + { + "epoch": 0.36393781303981737, + "grad_norm": 0.1494140625, + "learning_rate": 0.001294149889611143, + "loss": 0.1143, + "step": 41926 + }, + { + "epoch": 0.3639464935200215, + "grad_norm": 0.1865234375, + "learning_rate": 0.0012941208278112765, + "loss": 0.1699, + "step": 41927 + }, + { + "epoch": 0.3639551740002257, + "grad_norm": 0.54296875, + "learning_rate": 0.001294091765799123, + "loss": 0.1504, + "step": 41928 + }, + { + "epoch": 0.36396385448042984, + "grad_norm": 0.1376953125, + "learning_rate": 0.0012940627035747132, + "loss": 0.123, + "step": 41929 + }, + { + "epoch": 0.36397253496063403, + "grad_norm": 0.255859375, + "learning_rate": 0.0012940336411380792, + 
"loss": 0.123, + "step": 41930 + }, + { + "epoch": 0.36398121544083817, + "grad_norm": 0.2294921875, + "learning_rate": 0.001294004578489253, + "loss": 0.1641, + "step": 41931 + }, + { + "epoch": 0.36398989592104236, + "grad_norm": 0.380859375, + "learning_rate": 0.0012939755156282666, + "loss": 0.0947, + "step": 41932 + }, + { + "epoch": 0.3639985764012465, + "grad_norm": 0.150390625, + "learning_rate": 0.0012939464525551514, + "loss": 0.1289, + "step": 41933 + }, + { + "epoch": 0.3640072568814507, + "grad_norm": 0.08740234375, + "learning_rate": 0.0012939173892699393, + "loss": 0.0884, + "step": 41934 + }, + { + "epoch": 0.36401593736165483, + "grad_norm": 0.10888671875, + "learning_rate": 0.0012938883257726619, + "loss": 0.0996, + "step": 41935 + }, + { + "epoch": 0.364024617841859, + "grad_norm": 0.10302734375, + "learning_rate": 0.0012938592620633513, + "loss": 0.0913, + "step": 41936 + }, + { + "epoch": 0.36403329832206316, + "grad_norm": 0.373046875, + "learning_rate": 0.001293830198142039, + "loss": 0.1885, + "step": 41937 + }, + { + "epoch": 0.36404197880226735, + "grad_norm": 0.294921875, + "learning_rate": 0.001293801134008757, + "loss": 0.0977, + "step": 41938 + }, + { + "epoch": 0.3640506592824715, + "grad_norm": 0.361328125, + "learning_rate": 0.001293772069663537, + "loss": 0.0996, + "step": 41939 + }, + { + "epoch": 0.3640593397626757, + "grad_norm": 0.240234375, + "learning_rate": 0.0012937430051064105, + "loss": 0.0762, + "step": 41940 + }, + { + "epoch": 0.3640680202428798, + "grad_norm": 0.28515625, + "learning_rate": 0.0012937139403374099, + "loss": 0.1191, + "step": 41941 + }, + { + "epoch": 0.364076700723084, + "grad_norm": 0.1015625, + "learning_rate": 0.0012936848753565663, + "loss": 0.103, + "step": 41942 + }, + { + "epoch": 0.36408538120328815, + "grad_norm": 0.322265625, + "learning_rate": 0.0012936558101639122, + "loss": 0.0913, + "step": 41943 + }, + { + "epoch": 0.36409406168349234, + "grad_norm": 0.33203125, + "learning_rate": 0.0012936267447594787, + "loss": 0.1104, + "step": 41944 + }, + { + "epoch": 0.3641027421636965, + "grad_norm": 0.515625, + "learning_rate": 0.0012935976791432983, + "loss": 0.0957, + "step": 41945 + }, + { + "epoch": 0.3641114226439007, + "grad_norm": 0.171875, + "learning_rate": 0.001293568613315402, + "loss": 0.0977, + "step": 41946 + }, + { + "epoch": 0.3641201031241048, + "grad_norm": 0.09814453125, + "learning_rate": 0.0012935395472758226, + "loss": 0.0923, + "step": 41947 + }, + { + "epoch": 0.364128783604309, + "grad_norm": 0.2421875, + "learning_rate": 0.0012935104810245905, + "loss": 0.0859, + "step": 41948 + }, + { + "epoch": 0.36413746408451314, + "grad_norm": 0.50390625, + "learning_rate": 0.0012934814145617384, + "loss": 0.1084, + "step": 41949 + }, + { + "epoch": 0.36414614456471733, + "grad_norm": 0.1552734375, + "learning_rate": 0.0012934523478872982, + "loss": 0.1089, + "step": 41950 + }, + { + "epoch": 0.36415482504492147, + "grad_norm": 0.439453125, + "learning_rate": 0.0012934232810013011, + "loss": 0.106, + "step": 41951 + }, + { + "epoch": 0.36416350552512566, + "grad_norm": 0.123046875, + "learning_rate": 0.0012933942139037795, + "loss": 0.0942, + "step": 41952 + }, + { + "epoch": 0.3641721860053298, + "grad_norm": 0.53125, + "learning_rate": 0.0012933651465947646, + "loss": 0.1172, + "step": 41953 + }, + { + "epoch": 0.364180866485534, + "grad_norm": 0.1689453125, + "learning_rate": 0.0012933360790742887, + "loss": 0.0859, + "step": 41954 + }, + { + "epoch": 0.36418954696573813, + "grad_norm": 0.279296875, + 
"learning_rate": 0.0012933070113423835, + "loss": 0.1064, + "step": 41955 + }, + { + "epoch": 0.3641982274459423, + "grad_norm": 0.1884765625, + "learning_rate": 0.0012932779433990804, + "loss": 0.125, + "step": 41956 + }, + { + "epoch": 0.36420690792614646, + "grad_norm": 0.255859375, + "learning_rate": 0.0012932488752444115, + "loss": 0.0991, + "step": 41957 + }, + { + "epoch": 0.36421558840635065, + "grad_norm": 0.1474609375, + "learning_rate": 0.0012932198068784087, + "loss": 0.0869, + "step": 41958 + }, + { + "epoch": 0.3642242688865548, + "grad_norm": 0.185546875, + "learning_rate": 0.0012931907383011032, + "loss": 0.1143, + "step": 41959 + }, + { + "epoch": 0.364232949366759, + "grad_norm": 0.66015625, + "learning_rate": 0.0012931616695125276, + "loss": 0.0898, + "step": 41960 + }, + { + "epoch": 0.3642416298469631, + "grad_norm": 0.23828125, + "learning_rate": 0.0012931326005127133, + "loss": 0.0801, + "step": 41961 + }, + { + "epoch": 0.3642503103271673, + "grad_norm": 0.0947265625, + "learning_rate": 0.001293103531301692, + "loss": 0.0747, + "step": 41962 + }, + { + "epoch": 0.36425899080737145, + "grad_norm": 0.5390625, + "learning_rate": 0.0012930744618794955, + "loss": 0.1641, + "step": 41963 + }, + { + "epoch": 0.36426767128757565, + "grad_norm": 0.380859375, + "learning_rate": 0.0012930453922461557, + "loss": 0.1309, + "step": 41964 + }, + { + "epoch": 0.3642763517677798, + "grad_norm": 0.765625, + "learning_rate": 0.0012930163224017044, + "loss": 0.0972, + "step": 41965 + }, + { + "epoch": 0.364285032247984, + "grad_norm": 0.1826171875, + "learning_rate": 0.0012929872523461735, + "loss": 0.1084, + "step": 41966 + }, + { + "epoch": 0.3642937127281881, + "grad_norm": 0.1943359375, + "learning_rate": 0.0012929581820795945, + "loss": 0.0923, + "step": 41967 + }, + { + "epoch": 0.3643023932083923, + "grad_norm": 0.34765625, + "learning_rate": 0.0012929291116019993, + "loss": 0.0815, + "step": 41968 + }, + { + "epoch": 0.36431107368859644, + "grad_norm": 0.375, + "learning_rate": 0.0012929000409134198, + "loss": 0.0977, + "step": 41969 + }, + { + "epoch": 0.36431975416880064, + "grad_norm": 0.302734375, + "learning_rate": 0.0012928709700138876, + "loss": 0.1172, + "step": 41970 + }, + { + "epoch": 0.3643284346490048, + "grad_norm": 0.1962890625, + "learning_rate": 0.0012928418989034346, + "loss": 0.0854, + "step": 41971 + }, + { + "epoch": 0.36433711512920897, + "grad_norm": 0.0908203125, + "learning_rate": 0.0012928128275820927, + "loss": 0.1494, + "step": 41972 + }, + { + "epoch": 0.3643457956094131, + "grad_norm": 0.3125, + "learning_rate": 0.0012927837560498935, + "loss": 0.0762, + "step": 41973 + }, + { + "epoch": 0.3643544760896173, + "grad_norm": 0.119140625, + "learning_rate": 0.0012927546843068689, + "loss": 0.1084, + "step": 41974 + }, + { + "epoch": 0.36436315656982143, + "grad_norm": 0.271484375, + "learning_rate": 0.0012927256123530509, + "loss": 0.1118, + "step": 41975 + }, + { + "epoch": 0.3643718370500256, + "grad_norm": 0.302734375, + "learning_rate": 0.0012926965401884709, + "loss": 0.0815, + "step": 41976 + }, + { + "epoch": 0.36438051753022976, + "grad_norm": 0.1728515625, + "learning_rate": 0.001292667467813161, + "loss": 0.1045, + "step": 41977 + }, + { + "epoch": 0.36438919801043396, + "grad_norm": 0.72265625, + "learning_rate": 0.0012926383952271525, + "loss": 0.1162, + "step": 41978 + }, + { + "epoch": 0.3643978784906381, + "grad_norm": 0.34375, + "learning_rate": 0.0012926093224304776, + "loss": 0.1011, + "step": 41979 + }, + { + "epoch": 
0.3644065589708423, + "grad_norm": 0.1669921875, + "learning_rate": 0.0012925802494231683, + "loss": 0.0752, + "step": 41980 + }, + { + "epoch": 0.3644152394510464, + "grad_norm": 0.1767578125, + "learning_rate": 0.0012925511762052558, + "loss": 0.1025, + "step": 41981 + }, + { + "epoch": 0.3644239199312506, + "grad_norm": 0.50390625, + "learning_rate": 0.0012925221027767727, + "loss": 0.1006, + "step": 41982 + }, + { + "epoch": 0.36443260041145475, + "grad_norm": 0.17578125, + "learning_rate": 0.00129249302913775, + "loss": 0.1094, + "step": 41983 + }, + { + "epoch": 0.36444128089165895, + "grad_norm": 0.359375, + "learning_rate": 0.0012924639552882198, + "loss": 0.0781, + "step": 41984 + }, + { + "epoch": 0.3644499613718631, + "grad_norm": 0.1337890625, + "learning_rate": 0.0012924348812282137, + "loss": 0.0776, + "step": 41985 + }, + { + "epoch": 0.3644586418520673, + "grad_norm": 0.11572265625, + "learning_rate": 0.0012924058069577642, + "loss": 0.1001, + "step": 41986 + }, + { + "epoch": 0.3644673223322714, + "grad_norm": 0.10498046875, + "learning_rate": 0.0012923767324769025, + "loss": 0.0806, + "step": 41987 + }, + { + "epoch": 0.3644760028124756, + "grad_norm": 0.62109375, + "learning_rate": 0.00129234765778566, + "loss": 0.1504, + "step": 41988 + }, + { + "epoch": 0.36448468329267975, + "grad_norm": 0.36328125, + "learning_rate": 0.0012923185828840696, + "loss": 0.1211, + "step": 41989 + }, + { + "epoch": 0.36449336377288394, + "grad_norm": 0.1015625, + "learning_rate": 0.0012922895077721623, + "loss": 0.125, + "step": 41990 + }, + { + "epoch": 0.3645020442530881, + "grad_norm": 0.7578125, + "learning_rate": 0.0012922604324499699, + "loss": 0.0967, + "step": 41991 + }, + { + "epoch": 0.36451072473329227, + "grad_norm": 0.458984375, + "learning_rate": 0.0012922313569175246, + "loss": 0.1016, + "step": 41992 + }, + { + "epoch": 0.3645194052134964, + "grad_norm": 0.337890625, + "learning_rate": 0.001292202281174858, + "loss": 0.0942, + "step": 41993 + }, + { + "epoch": 0.3645280856937006, + "grad_norm": 0.69140625, + "learning_rate": 0.001292173205222002, + "loss": 0.083, + "step": 41994 + }, + { + "epoch": 0.36453676617390474, + "grad_norm": 0.263671875, + "learning_rate": 0.001292144129058988, + "loss": 0.0845, + "step": 41995 + }, + { + "epoch": 0.36454544665410893, + "grad_norm": 0.259765625, + "learning_rate": 0.0012921150526858476, + "loss": 0.082, + "step": 41996 + }, + { + "epoch": 0.36455412713431307, + "grad_norm": 0.1796875, + "learning_rate": 0.0012920859761026138, + "loss": 0.1172, + "step": 41997 + }, + { + "epoch": 0.36456280761451726, + "grad_norm": 0.0947265625, + "learning_rate": 0.001292056899309317, + "loss": 0.1055, + "step": 41998 + }, + { + "epoch": 0.3645714880947214, + "grad_norm": 0.4296875, + "learning_rate": 0.0012920278223059905, + "loss": 0.1245, + "step": 41999 + }, + { + "epoch": 0.3645801685749256, + "grad_norm": 0.181640625, + "learning_rate": 0.0012919987450926646, + "loss": 0.1045, + "step": 42000 + }, + { + "epoch": 0.3645888490551297, + "grad_norm": 0.2265625, + "learning_rate": 0.0012919696676693723, + "loss": 0.0977, + "step": 42001 + }, + { + "epoch": 0.3645975295353339, + "grad_norm": 0.59765625, + "learning_rate": 0.0012919405900361446, + "loss": 0.1064, + "step": 42002 + }, + { + "epoch": 0.36460621001553806, + "grad_norm": 0.75, + "learning_rate": 0.0012919115121930135, + "loss": 0.1289, + "step": 42003 + }, + { + "epoch": 0.36461489049574225, + "grad_norm": 0.12255859375, + "learning_rate": 0.0012918824341400109, + "loss": 0.0928, + 
"step": 42004 + }, + { + "epoch": 0.3646235709759464, + "grad_norm": 0.73046875, + "learning_rate": 0.0012918533558771684, + "loss": 0.0889, + "step": 42005 + }, + { + "epoch": 0.3646322514561506, + "grad_norm": 0.23828125, + "learning_rate": 0.0012918242774045182, + "loss": 0.0957, + "step": 42006 + }, + { + "epoch": 0.3646409319363547, + "grad_norm": 0.51171875, + "learning_rate": 0.0012917951987220914, + "loss": 0.1069, + "step": 42007 + }, + { + "epoch": 0.3646496124165589, + "grad_norm": 0.185546875, + "learning_rate": 0.0012917661198299207, + "loss": 0.1079, + "step": 42008 + }, + { + "epoch": 0.36465829289676305, + "grad_norm": 0.1640625, + "learning_rate": 0.0012917370407280373, + "loss": 0.0913, + "step": 42009 + }, + { + "epoch": 0.36466697337696724, + "grad_norm": 0.07666015625, + "learning_rate": 0.0012917079614164731, + "loss": 0.0781, + "step": 42010 + }, + { + "epoch": 0.3646756538571714, + "grad_norm": 0.11181640625, + "learning_rate": 0.00129167888189526, + "loss": 0.0889, + "step": 42011 + }, + { + "epoch": 0.36468433433737557, + "grad_norm": 0.50390625, + "learning_rate": 0.0012916498021644301, + "loss": 0.1309, + "step": 42012 + }, + { + "epoch": 0.3646930148175797, + "grad_norm": 0.12255859375, + "learning_rate": 0.0012916207222240144, + "loss": 0.0938, + "step": 42013 + }, + { + "epoch": 0.3647016952977839, + "grad_norm": 0.3359375, + "learning_rate": 0.0012915916420740453, + "loss": 0.0781, + "step": 42014 + }, + { + "epoch": 0.36471037577798804, + "grad_norm": 0.32421875, + "learning_rate": 0.0012915625617145546, + "loss": 0.0684, + "step": 42015 + }, + { + "epoch": 0.3647190562581922, + "grad_norm": 0.462890625, + "learning_rate": 0.0012915334811455738, + "loss": 0.103, + "step": 42016 + }, + { + "epoch": 0.36472773673839637, + "grad_norm": 0.1357421875, + "learning_rate": 0.0012915044003671348, + "loss": 0.103, + "step": 42017 + }, + { + "epoch": 0.3647364172186005, + "grad_norm": 0.166015625, + "learning_rate": 0.0012914753193792694, + "loss": 0.084, + "step": 42018 + }, + { + "epoch": 0.3647450976988047, + "grad_norm": 0.291015625, + "learning_rate": 0.0012914462381820098, + "loss": 0.0898, + "step": 42019 + }, + { + "epoch": 0.36475377817900884, + "grad_norm": 0.09423828125, + "learning_rate": 0.0012914171567753874, + "loss": 0.1094, + "step": 42020 + }, + { + "epoch": 0.36476245865921303, + "grad_norm": 0.353515625, + "learning_rate": 0.001291388075159434, + "loss": 0.0767, + "step": 42021 + }, + { + "epoch": 0.36477113913941717, + "grad_norm": 0.5078125, + "learning_rate": 0.0012913589933341814, + "loss": 0.0942, + "step": 42022 + }, + { + "epoch": 0.36477981961962136, + "grad_norm": 0.341796875, + "learning_rate": 0.0012913299112996615, + "loss": 0.1094, + "step": 42023 + }, + { + "epoch": 0.3647885000998255, + "grad_norm": 0.09228515625, + "learning_rate": 0.0012913008290559064, + "loss": 0.0894, + "step": 42024 + }, + { + "epoch": 0.3647971805800297, + "grad_norm": 0.1708984375, + "learning_rate": 0.001291271746602947, + "loss": 0.1309, + "step": 42025 + }, + { + "epoch": 0.3648058610602338, + "grad_norm": 0.130859375, + "learning_rate": 0.001291242663940816, + "loss": 0.0869, + "step": 42026 + }, + { + "epoch": 0.364814541540438, + "grad_norm": 0.462890625, + "learning_rate": 0.0012912135810695447, + "loss": 0.1128, + "step": 42027 + }, + { + "epoch": 0.36482322202064216, + "grad_norm": 0.1533203125, + "learning_rate": 0.0012911844979891654, + "loss": 0.1055, + "step": 42028 + }, + { + "epoch": 0.36483190250084635, + "grad_norm": 0.51953125, + 
"learning_rate": 0.0012911554146997095, + "loss": 0.0889, + "step": 42029 + }, + { + "epoch": 0.3648405829810505, + "grad_norm": 0.12890625, + "learning_rate": 0.0012911263312012087, + "loss": 0.0923, + "step": 42030 + }, + { + "epoch": 0.3648492634612547, + "grad_norm": 0.4765625, + "learning_rate": 0.0012910972474936952, + "loss": 0.0986, + "step": 42031 + }, + { + "epoch": 0.3648579439414588, + "grad_norm": 0.310546875, + "learning_rate": 0.001291068163577201, + "loss": 0.1289, + "step": 42032 + }, + { + "epoch": 0.364866624421663, + "grad_norm": 0.51953125, + "learning_rate": 0.001291039079451757, + "loss": 0.0732, + "step": 42033 + }, + { + "epoch": 0.36487530490186715, + "grad_norm": 0.474609375, + "learning_rate": 0.001291009995117396, + "loss": 0.1235, + "step": 42034 + }, + { + "epoch": 0.36488398538207134, + "grad_norm": 0.095703125, + "learning_rate": 0.0012909809105741486, + "loss": 0.0669, + "step": 42035 + }, + { + "epoch": 0.3648926658622755, + "grad_norm": 0.431640625, + "learning_rate": 0.001290951825822048, + "loss": 0.1514, + "step": 42036 + }, + { + "epoch": 0.36490134634247967, + "grad_norm": 0.12451171875, + "learning_rate": 0.001290922740861125, + "loss": 0.0898, + "step": 42037 + }, + { + "epoch": 0.3649100268226838, + "grad_norm": 0.130859375, + "learning_rate": 0.001290893655691412, + "loss": 0.1016, + "step": 42038 + }, + { + "epoch": 0.364918707302888, + "grad_norm": 0.10595703125, + "learning_rate": 0.0012908645703129402, + "loss": 0.0898, + "step": 42039 + }, + { + "epoch": 0.36492738778309214, + "grad_norm": 0.1533203125, + "learning_rate": 0.0012908354847257421, + "loss": 0.124, + "step": 42040 + }, + { + "epoch": 0.36493606826329633, + "grad_norm": 0.1103515625, + "learning_rate": 0.0012908063989298493, + "loss": 0.0742, + "step": 42041 + }, + { + "epoch": 0.36494474874350047, + "grad_norm": 0.357421875, + "learning_rate": 0.0012907773129252934, + "loss": 0.0737, + "step": 42042 + }, + { + "epoch": 0.36495342922370466, + "grad_norm": 0.5, + "learning_rate": 0.001290748226712106, + "loss": 0.1543, + "step": 42043 + }, + { + "epoch": 0.3649621097039088, + "grad_norm": 0.1845703125, + "learning_rate": 0.0012907191402903194, + "loss": 0.0918, + "step": 42044 + }, + { + "epoch": 0.364970790184113, + "grad_norm": 0.271484375, + "learning_rate": 0.0012906900536599653, + "loss": 0.1279, + "step": 42045 + }, + { + "epoch": 0.36497947066431713, + "grad_norm": 0.11865234375, + "learning_rate": 0.0012906609668210754, + "loss": 0.1172, + "step": 42046 + }, + { + "epoch": 0.3649881511445213, + "grad_norm": 0.51953125, + "learning_rate": 0.0012906318797736814, + "loss": 0.1309, + "step": 42047 + }, + { + "epoch": 0.36499683162472546, + "grad_norm": 0.1103515625, + "learning_rate": 0.0012906027925178151, + "loss": 0.0884, + "step": 42048 + }, + { + "epoch": 0.36500551210492965, + "grad_norm": 0.1728515625, + "learning_rate": 0.001290573705053509, + "loss": 0.0986, + "step": 42049 + }, + { + "epoch": 0.3650141925851338, + "grad_norm": 0.44921875, + "learning_rate": 0.0012905446173807939, + "loss": 0.0942, + "step": 42050 + }, + { + "epoch": 0.365022873065338, + "grad_norm": 0.494140625, + "learning_rate": 0.0012905155294997022, + "loss": 0.0693, + "step": 42051 + }, + { + "epoch": 0.3650315535455421, + "grad_norm": 0.09326171875, + "learning_rate": 0.0012904864414102655, + "loss": 0.0781, + "step": 42052 + }, + { + "epoch": 0.3650402340257463, + "grad_norm": 0.345703125, + "learning_rate": 0.0012904573531125156, + "loss": 0.0835, + "step": 42053 + }, + { + "epoch": 
0.36504891450595045, + "grad_norm": 0.1630859375, + "learning_rate": 0.0012904282646064849, + "loss": 0.0806, + "step": 42054 + }, + { + "epoch": 0.36505759498615464, + "grad_norm": 0.3125, + "learning_rate": 0.0012903991758922042, + "loss": 0.0664, + "step": 42055 + }, + { + "epoch": 0.3650662754663588, + "grad_norm": 0.3203125, + "learning_rate": 0.0012903700869697056, + "loss": 0.0879, + "step": 42056 + }, + { + "epoch": 0.365074955946563, + "grad_norm": 0.1220703125, + "learning_rate": 0.0012903409978390218, + "loss": 0.0962, + "step": 42057 + }, + { + "epoch": 0.3650836364267671, + "grad_norm": 0.314453125, + "learning_rate": 0.0012903119085001839, + "loss": 0.1338, + "step": 42058 + }, + { + "epoch": 0.3650923169069713, + "grad_norm": 0.69921875, + "learning_rate": 0.0012902828189532234, + "loss": 0.1387, + "step": 42059 + }, + { + "epoch": 0.36510099738717544, + "grad_norm": 0.3515625, + "learning_rate": 0.0012902537291981729, + "loss": 0.1641, + "step": 42060 + }, + { + "epoch": 0.36510967786737963, + "grad_norm": 0.380859375, + "learning_rate": 0.0012902246392350633, + "loss": 0.127, + "step": 42061 + }, + { + "epoch": 0.36511835834758377, + "grad_norm": 0.1572265625, + "learning_rate": 0.001290195549063927, + "loss": 0.1465, + "step": 42062 + }, + { + "epoch": 0.36512703882778796, + "grad_norm": 0.330078125, + "learning_rate": 0.0012901664586847958, + "loss": 0.1172, + "step": 42063 + }, + { + "epoch": 0.3651357193079921, + "grad_norm": 0.455078125, + "learning_rate": 0.0012901373680977014, + "loss": 0.1318, + "step": 42064 + }, + { + "epoch": 0.3651443997881963, + "grad_norm": 0.0771484375, + "learning_rate": 0.0012901082773026757, + "loss": 0.0713, + "step": 42065 + }, + { + "epoch": 0.36515308026840043, + "grad_norm": 0.57421875, + "learning_rate": 0.0012900791862997501, + "loss": 0.083, + "step": 42066 + }, + { + "epoch": 0.3651617607486046, + "grad_norm": 0.55078125, + "learning_rate": 0.0012900500950889572, + "loss": 0.1387, + "step": 42067 + }, + { + "epoch": 0.36517044122880876, + "grad_norm": 0.11865234375, + "learning_rate": 0.0012900210036703283, + "loss": 0.123, + "step": 42068 + }, + { + "epoch": 0.36517912170901295, + "grad_norm": 0.1376953125, + "learning_rate": 0.001289991912043895, + "loss": 0.1152, + "step": 42069 + }, + { + "epoch": 0.3651878021892171, + "grad_norm": 0.1201171875, + "learning_rate": 0.0012899628202096896, + "loss": 0.0977, + "step": 42070 + }, + { + "epoch": 0.3651964826694213, + "grad_norm": 0.09521484375, + "learning_rate": 0.0012899337281677438, + "loss": 0.0581, + "step": 42071 + }, + { + "epoch": 0.3652051631496254, + "grad_norm": 0.361328125, + "learning_rate": 0.0012899046359180892, + "loss": 0.1172, + "step": 42072 + }, + { + "epoch": 0.3652138436298296, + "grad_norm": 0.26171875, + "learning_rate": 0.0012898755434607577, + "loss": 0.1113, + "step": 42073 + }, + { + "epoch": 0.36522252411003375, + "grad_norm": 0.1435546875, + "learning_rate": 0.0012898464507957812, + "loss": 0.0732, + "step": 42074 + }, + { + "epoch": 0.36523120459023795, + "grad_norm": 0.35546875, + "learning_rate": 0.0012898173579231913, + "loss": 0.0811, + "step": 42075 + }, + { + "epoch": 0.3652398850704421, + "grad_norm": 0.0947265625, + "learning_rate": 0.0012897882648430204, + "loss": 0.0762, + "step": 42076 + }, + { + "epoch": 0.3652485655506463, + "grad_norm": 0.24609375, + "learning_rate": 0.0012897591715552996, + "loss": 0.0991, + "step": 42077 + }, + { + "epoch": 0.3652572460308504, + "grad_norm": 0.21875, + "learning_rate": 0.0012897300780600612, + "loss": 
0.1113, + "step": 42078 + }, + { + "epoch": 0.3652659265110546, + "grad_norm": 0.40234375, + "learning_rate": 0.001289700984357337, + "loss": 0.1211, + "step": 42079 + }, + { + "epoch": 0.36527460699125874, + "grad_norm": 0.11279296875, + "learning_rate": 0.0012896718904471583, + "loss": 0.0918, + "step": 42080 + }, + { + "epoch": 0.36528328747146294, + "grad_norm": 0.291015625, + "learning_rate": 0.0012896427963295573, + "loss": 0.054, + "step": 42081 + }, + { + "epoch": 0.3652919679516671, + "grad_norm": 0.205078125, + "learning_rate": 0.001289613702004566, + "loss": 0.083, + "step": 42082 + }, + { + "epoch": 0.36530064843187127, + "grad_norm": 0.07958984375, + "learning_rate": 0.0012895846074722156, + "loss": 0.1079, + "step": 42083 + }, + { + "epoch": 0.3653093289120754, + "grad_norm": 0.1943359375, + "learning_rate": 0.0012895555127325387, + "loss": 0.0894, + "step": 42084 + }, + { + "epoch": 0.3653180093922796, + "grad_norm": 0.6875, + "learning_rate": 0.0012895264177855666, + "loss": 0.1211, + "step": 42085 + }, + { + "epoch": 0.36532668987248373, + "grad_norm": 0.314453125, + "learning_rate": 0.001289497322631331, + "loss": 0.1396, + "step": 42086 + }, + { + "epoch": 0.3653353703526879, + "grad_norm": 0.65625, + "learning_rate": 0.0012894682272698646, + "loss": 0.106, + "step": 42087 + }, + { + "epoch": 0.36534405083289206, + "grad_norm": 0.419921875, + "learning_rate": 0.0012894391317011982, + "loss": 0.0977, + "step": 42088 + }, + { + "epoch": 0.36535273131309626, + "grad_norm": 0.11767578125, + "learning_rate": 0.0012894100359253642, + "loss": 0.1025, + "step": 42089 + }, + { + "epoch": 0.3653614117933004, + "grad_norm": 0.12890625, + "learning_rate": 0.0012893809399423942, + "loss": 0.1201, + "step": 42090 + }, + { + "epoch": 0.3653700922735046, + "grad_norm": 0.154296875, + "learning_rate": 0.0012893518437523197, + "loss": 0.1426, + "step": 42091 + }, + { + "epoch": 0.3653787727537087, + "grad_norm": 0.50390625, + "learning_rate": 0.0012893227473551733, + "loss": 0.1245, + "step": 42092 + }, + { + "epoch": 0.3653874532339129, + "grad_norm": 0.2041015625, + "learning_rate": 0.001289293650750986, + "loss": 0.1108, + "step": 42093 + }, + { + "epoch": 0.36539613371411706, + "grad_norm": 0.17578125, + "learning_rate": 0.0012892645539397903, + "loss": 0.1348, + "step": 42094 + }, + { + "epoch": 0.36540481419432125, + "grad_norm": 1.1640625, + "learning_rate": 0.0012892354569216174, + "loss": 0.1885, + "step": 42095 + }, + { + "epoch": 0.3654134946745254, + "grad_norm": 0.240234375, + "learning_rate": 0.0012892063596965, + "loss": 0.0723, + "step": 42096 + }, + { + "epoch": 0.3654221751547296, + "grad_norm": 0.359375, + "learning_rate": 0.001289177262264469, + "loss": 0.0723, + "step": 42097 + }, + { + "epoch": 0.3654308556349337, + "grad_norm": 0.53515625, + "learning_rate": 0.0012891481646255568, + "loss": 0.1045, + "step": 42098 + }, + { + "epoch": 0.3654395361151379, + "grad_norm": 0.46875, + "learning_rate": 0.0012891190667797948, + "loss": 0.1064, + "step": 42099 + }, + { + "epoch": 0.36544821659534205, + "grad_norm": 0.1298828125, + "learning_rate": 0.001289089968727215, + "loss": 0.125, + "step": 42100 + }, + { + "epoch": 0.36545689707554624, + "grad_norm": 0.259765625, + "learning_rate": 0.0012890608704678496, + "loss": 0.1172, + "step": 42101 + }, + { + "epoch": 0.3654655775557504, + "grad_norm": 0.3203125, + "learning_rate": 0.0012890317720017294, + "loss": 0.1367, + "step": 42102 + }, + { + "epoch": 0.36547425803595457, + "grad_norm": 0.09521484375, + "learning_rate": 
0.0012890026733288875, + "loss": 0.0586, + "step": 42103 + }, + { + "epoch": 0.3654829385161587, + "grad_norm": 0.134765625, + "learning_rate": 0.0012889735744493547, + "loss": 0.1074, + "step": 42104 + }, + { + "epoch": 0.3654916189963629, + "grad_norm": 0.15625, + "learning_rate": 0.0012889444753631633, + "loss": 0.0977, + "step": 42105 + }, + { + "epoch": 0.36550029947656704, + "grad_norm": 0.166015625, + "learning_rate": 0.0012889153760703455, + "loss": 0.0942, + "step": 42106 + }, + { + "epoch": 0.36550897995677123, + "grad_norm": 0.9140625, + "learning_rate": 0.0012888862765709323, + "loss": 0.1025, + "step": 42107 + }, + { + "epoch": 0.36551766043697537, + "grad_norm": 0.09130859375, + "learning_rate": 0.0012888571768649559, + "loss": 0.1021, + "step": 42108 + }, + { + "epoch": 0.36552634091717956, + "grad_norm": 0.34375, + "learning_rate": 0.0012888280769524484, + "loss": 0.1094, + "step": 42109 + }, + { + "epoch": 0.3655350213973837, + "grad_norm": 0.306640625, + "learning_rate": 0.0012887989768334412, + "loss": 0.0864, + "step": 42110 + }, + { + "epoch": 0.3655437018775879, + "grad_norm": 0.54296875, + "learning_rate": 0.0012887698765079662, + "loss": 0.1514, + "step": 42111 + }, + { + "epoch": 0.365552382357792, + "grad_norm": 0.12255859375, + "learning_rate": 0.0012887407759760554, + "loss": 0.1309, + "step": 42112 + }, + { + "epoch": 0.3655610628379962, + "grad_norm": 0.2412109375, + "learning_rate": 0.0012887116752377404, + "loss": 0.0732, + "step": 42113 + }, + { + "epoch": 0.36556974331820036, + "grad_norm": 0.55859375, + "learning_rate": 0.0012886825742930532, + "loss": 0.1084, + "step": 42114 + }, + { + "epoch": 0.36557842379840455, + "grad_norm": 0.38671875, + "learning_rate": 0.0012886534731420256, + "loss": 0.0806, + "step": 42115 + }, + { + "epoch": 0.3655871042786087, + "grad_norm": 0.2431640625, + "learning_rate": 0.0012886243717846896, + "loss": 0.1084, + "step": 42116 + }, + { + "epoch": 0.3655957847588129, + "grad_norm": 0.84765625, + "learning_rate": 0.0012885952702210764, + "loss": 0.1621, + "step": 42117 + }, + { + "epoch": 0.365604465239017, + "grad_norm": 0.51953125, + "learning_rate": 0.0012885661684512184, + "loss": 0.1006, + "step": 42118 + }, + { + "epoch": 0.3656131457192212, + "grad_norm": 0.15234375, + "learning_rate": 0.0012885370664751474, + "loss": 0.0991, + "step": 42119 + }, + { + "epoch": 0.36562182619942535, + "grad_norm": 0.224609375, + "learning_rate": 0.0012885079642928953, + "loss": 0.0859, + "step": 42120 + }, + { + "epoch": 0.36563050667962954, + "grad_norm": 0.099609375, + "learning_rate": 0.0012884788619044933, + "loss": 0.0771, + "step": 42121 + }, + { + "epoch": 0.3656391871598337, + "grad_norm": 0.109375, + "learning_rate": 0.0012884497593099737, + "loss": 0.0874, + "step": 42122 + }, + { + "epoch": 0.36564786764003787, + "grad_norm": 0.2734375, + "learning_rate": 0.0012884206565093683, + "loss": 0.1152, + "step": 42123 + }, + { + "epoch": 0.365656548120242, + "grad_norm": 0.4765625, + "learning_rate": 0.0012883915535027092, + "loss": 0.1426, + "step": 42124 + }, + { + "epoch": 0.3656652286004462, + "grad_norm": 0.197265625, + "learning_rate": 0.0012883624502900278, + "loss": 0.0908, + "step": 42125 + }, + { + "epoch": 0.36567390908065034, + "grad_norm": 0.150390625, + "learning_rate": 0.0012883333468713559, + "loss": 0.1094, + "step": 42126 + }, + { + "epoch": 0.36568258956085453, + "grad_norm": 0.1220703125, + "learning_rate": 0.0012883042432467256, + "loss": 0.084, + "step": 42127 + }, + { + "epoch": 0.36569127004105867, + 
"grad_norm": 0.361328125, + "learning_rate": 0.0012882751394161687, + "loss": 0.1279, + "step": 42128 + }, + { + "epoch": 0.36569995052126286, + "grad_norm": 0.9140625, + "learning_rate": 0.0012882460353797168, + "loss": 0.1846, + "step": 42129 + }, + { + "epoch": 0.365708631001467, + "grad_norm": 0.1875, + "learning_rate": 0.0012882169311374018, + "loss": 0.0879, + "step": 42130 + }, + { + "epoch": 0.3657173114816712, + "grad_norm": 1.0625, + "learning_rate": 0.0012881878266892556, + "loss": 0.1299, + "step": 42131 + }, + { + "epoch": 0.36572599196187533, + "grad_norm": 0.1572265625, + "learning_rate": 0.00128815872203531, + "loss": 0.124, + "step": 42132 + }, + { + "epoch": 0.3657346724420795, + "grad_norm": 0.28515625, + "learning_rate": 0.001288129617175597, + "loss": 0.0884, + "step": 42133 + }, + { + "epoch": 0.36574335292228366, + "grad_norm": 0.193359375, + "learning_rate": 0.0012881005121101478, + "loss": 0.1191, + "step": 42134 + }, + { + "epoch": 0.36575203340248785, + "grad_norm": 0.2138671875, + "learning_rate": 0.0012880714068389954, + "loss": 0.1523, + "step": 42135 + }, + { + "epoch": 0.365760713882692, + "grad_norm": 0.310546875, + "learning_rate": 0.0012880423013621706, + "loss": 0.1201, + "step": 42136 + }, + { + "epoch": 0.3657693943628962, + "grad_norm": 0.373046875, + "learning_rate": 0.0012880131956797057, + "loss": 0.123, + "step": 42137 + }, + { + "epoch": 0.3657780748431003, + "grad_norm": 0.5703125, + "learning_rate": 0.0012879840897916319, + "loss": 0.0767, + "step": 42138 + }, + { + "epoch": 0.36578675532330446, + "grad_norm": 0.2431640625, + "learning_rate": 0.0012879549836979822, + "loss": 0.0996, + "step": 42139 + }, + { + "epoch": 0.36579543580350865, + "grad_norm": 0.435546875, + "learning_rate": 0.0012879258773987876, + "loss": 0.0757, + "step": 42140 + }, + { + "epoch": 0.3658041162837128, + "grad_norm": 0.498046875, + "learning_rate": 0.0012878967708940797, + "loss": 0.1113, + "step": 42141 + }, + { + "epoch": 0.365812796763917, + "grad_norm": 0.0869140625, + "learning_rate": 0.001287867664183891, + "loss": 0.1426, + "step": 42142 + }, + { + "epoch": 0.3658214772441211, + "grad_norm": 0.296875, + "learning_rate": 0.001287838557268253, + "loss": 0.0703, + "step": 42143 + }, + { + "epoch": 0.3658301577243253, + "grad_norm": 0.11376953125, + "learning_rate": 0.0012878094501471977, + "loss": 0.0757, + "step": 42144 + }, + { + "epoch": 0.36583883820452945, + "grad_norm": 0.267578125, + "learning_rate": 0.001287780342820757, + "loss": 0.0859, + "step": 42145 + }, + { + "epoch": 0.36584751868473364, + "grad_norm": 0.0927734375, + "learning_rate": 0.0012877512352889623, + "loss": 0.0928, + "step": 42146 + }, + { + "epoch": 0.3658561991649378, + "grad_norm": 0.19921875, + "learning_rate": 0.0012877221275518452, + "loss": 0.0649, + "step": 42147 + }, + { + "epoch": 0.36586487964514197, + "grad_norm": 0.240234375, + "learning_rate": 0.0012876930196094385, + "loss": 0.0889, + "step": 42148 + }, + { + "epoch": 0.3658735601253461, + "grad_norm": 0.1689453125, + "learning_rate": 0.0012876639114617733, + "loss": 0.0869, + "step": 42149 + }, + { + "epoch": 0.3658822406055503, + "grad_norm": 0.2578125, + "learning_rate": 0.001287634803108882, + "loss": 0.166, + "step": 42150 + }, + { + "epoch": 0.36589092108575444, + "grad_norm": 0.40234375, + "learning_rate": 0.0012876056945507956, + "loss": 0.1108, + "step": 42151 + }, + { + "epoch": 0.36589960156595863, + "grad_norm": 0.2041015625, + "learning_rate": 0.0012875765857875469, + "loss": 0.0952, + "step": 42152 + }, + { + 
"epoch": 0.36590828204616277, + "grad_norm": 0.345703125, + "learning_rate": 0.001287547476819167, + "loss": 0.0869, + "step": 42153 + }, + { + "epoch": 0.36591696252636696, + "grad_norm": 0.1884765625, + "learning_rate": 0.001287518367645688, + "loss": 0.0864, + "step": 42154 + }, + { + "epoch": 0.3659256430065711, + "grad_norm": 0.2265625, + "learning_rate": 0.0012874892582671422, + "loss": 0.1108, + "step": 42155 + }, + { + "epoch": 0.3659343234867753, + "grad_norm": 0.28125, + "learning_rate": 0.0012874601486835605, + "loss": 0.123, + "step": 42156 + }, + { + "epoch": 0.36594300396697943, + "grad_norm": 0.427734375, + "learning_rate": 0.0012874310388949753, + "loss": 0.1064, + "step": 42157 + }, + { + "epoch": 0.3659516844471836, + "grad_norm": 0.3203125, + "learning_rate": 0.0012874019289014183, + "loss": 0.0703, + "step": 42158 + }, + { + "epoch": 0.36596036492738776, + "grad_norm": 0.30078125, + "learning_rate": 0.0012873728187029216, + "loss": 0.0884, + "step": 42159 + }, + { + "epoch": 0.36596904540759195, + "grad_norm": 0.18359375, + "learning_rate": 0.0012873437082995163, + "loss": 0.0898, + "step": 42160 + }, + { + "epoch": 0.3659777258877961, + "grad_norm": 0.2041015625, + "learning_rate": 0.001287314597691235, + "loss": 0.1147, + "step": 42161 + }, + { + "epoch": 0.3659864063680003, + "grad_norm": 0.1142578125, + "learning_rate": 0.0012872854868781094, + "loss": 0.106, + "step": 42162 + }, + { + "epoch": 0.3659950868482044, + "grad_norm": 0.091796875, + "learning_rate": 0.0012872563758601712, + "loss": 0.0796, + "step": 42163 + }, + { + "epoch": 0.3660037673284086, + "grad_norm": 0.220703125, + "learning_rate": 0.0012872272646374519, + "loss": 0.0728, + "step": 42164 + }, + { + "epoch": 0.36601244780861275, + "grad_norm": 0.62109375, + "learning_rate": 0.0012871981532099842, + "loss": 0.3008, + "step": 42165 + }, + { + "epoch": 0.36602112828881694, + "grad_norm": 0.11376953125, + "learning_rate": 0.001287169041577799, + "loss": 0.1025, + "step": 42166 + }, + { + "epoch": 0.3660298087690211, + "grad_norm": 0.66015625, + "learning_rate": 0.0012871399297409289, + "loss": 0.2266, + "step": 42167 + }, + { + "epoch": 0.3660384892492253, + "grad_norm": 0.271484375, + "learning_rate": 0.001287110817699405, + "loss": 0.0859, + "step": 42168 + }, + { + "epoch": 0.3660471697294294, + "grad_norm": 0.18359375, + "learning_rate": 0.0012870817054532594, + "loss": 0.1074, + "step": 42169 + }, + { + "epoch": 0.3660558502096336, + "grad_norm": 0.12353515625, + "learning_rate": 0.0012870525930025248, + "loss": 0.1309, + "step": 42170 + }, + { + "epoch": 0.36606453068983774, + "grad_norm": 0.1455078125, + "learning_rate": 0.0012870234803472316, + "loss": 0.105, + "step": 42171 + }, + { + "epoch": 0.36607321117004193, + "grad_norm": 0.61328125, + "learning_rate": 0.0012869943674874125, + "loss": 0.0918, + "step": 42172 + }, + { + "epoch": 0.36608189165024607, + "grad_norm": 0.1103515625, + "learning_rate": 0.0012869652544230993, + "loss": 0.0957, + "step": 42173 + }, + { + "epoch": 0.36609057213045026, + "grad_norm": 0.09765625, + "learning_rate": 0.0012869361411543237, + "loss": 0.1367, + "step": 42174 + }, + { + "epoch": 0.3660992526106544, + "grad_norm": 0.466796875, + "learning_rate": 0.0012869070276811175, + "loss": 0.0752, + "step": 42175 + }, + { + "epoch": 0.3661079330908586, + "grad_norm": 0.08984375, + "learning_rate": 0.0012868779140035125, + "loss": 0.0952, + "step": 42176 + }, + { + "epoch": 0.36611661357106273, + "grad_norm": 0.11181640625, + "learning_rate": 0.0012868488001215406, 
+ "loss": 0.0898, + "step": 42177 + }, + { + "epoch": 0.3661252940512669, + "grad_norm": 0.40625, + "learning_rate": 0.0012868196860352339, + "loss": 0.1445, + "step": 42178 + }, + { + "epoch": 0.36613397453147106, + "grad_norm": 0.08642578125, + "learning_rate": 0.001286790571744624, + "loss": 0.1172, + "step": 42179 + }, + { + "epoch": 0.36614265501167526, + "grad_norm": 0.150390625, + "learning_rate": 0.0012867614572497424, + "loss": 0.0908, + "step": 42180 + }, + { + "epoch": 0.3661513354918794, + "grad_norm": 0.357421875, + "learning_rate": 0.0012867323425506214, + "loss": 0.1011, + "step": 42181 + }, + { + "epoch": 0.3661600159720836, + "grad_norm": 0.1806640625, + "learning_rate": 0.0012867032276472927, + "loss": 0.1328, + "step": 42182 + }, + { + "epoch": 0.3661686964522877, + "grad_norm": 0.212890625, + "learning_rate": 0.0012866741125397885, + "loss": 0.1118, + "step": 42183 + }, + { + "epoch": 0.3661773769324919, + "grad_norm": 0.125, + "learning_rate": 0.0012866449972281399, + "loss": 0.0845, + "step": 42184 + }, + { + "epoch": 0.36618605741269605, + "grad_norm": 0.189453125, + "learning_rate": 0.0012866158817123794, + "loss": 0.1074, + "step": 42185 + }, + { + "epoch": 0.36619473789290025, + "grad_norm": 0.337890625, + "learning_rate": 0.0012865867659925382, + "loss": 0.1055, + "step": 42186 + }, + { + "epoch": 0.3662034183731044, + "grad_norm": 0.173828125, + "learning_rate": 0.0012865576500686489, + "loss": 0.0889, + "step": 42187 + }, + { + "epoch": 0.3662120988533086, + "grad_norm": 0.1943359375, + "learning_rate": 0.001286528533940743, + "loss": 0.083, + "step": 42188 + }, + { + "epoch": 0.3662207793335127, + "grad_norm": 0.37109375, + "learning_rate": 0.0012864994176088518, + "loss": 0.1045, + "step": 42189 + }, + { + "epoch": 0.3662294598137169, + "grad_norm": 0.38671875, + "learning_rate": 0.001286470301073008, + "loss": 0.0884, + "step": 42190 + }, + { + "epoch": 0.36623814029392104, + "grad_norm": 0.462890625, + "learning_rate": 0.001286441184333243, + "loss": 0.084, + "step": 42191 + }, + { + "epoch": 0.36624682077412524, + "grad_norm": 0.37109375, + "learning_rate": 0.001286412067389589, + "loss": 0.0752, + "step": 42192 + }, + { + "epoch": 0.3662555012543294, + "grad_norm": 0.220703125, + "learning_rate": 0.0012863829502420773, + "loss": 0.1328, + "step": 42193 + }, + { + "epoch": 0.36626418173453357, + "grad_norm": 0.11572265625, + "learning_rate": 0.00128635383289074, + "loss": 0.1406, + "step": 42194 + }, + { + "epoch": 0.3662728622147377, + "grad_norm": 0.111328125, + "learning_rate": 0.0012863247153356088, + "loss": 0.125, + "step": 42195 + }, + { + "epoch": 0.3662815426949419, + "grad_norm": 0.2431640625, + "learning_rate": 0.0012862955975767158, + "loss": 0.1172, + "step": 42196 + }, + { + "epoch": 0.36629022317514603, + "grad_norm": 0.609375, + "learning_rate": 0.0012862664796140931, + "loss": 0.1138, + "step": 42197 + }, + { + "epoch": 0.3662989036553502, + "grad_norm": 0.447265625, + "learning_rate": 0.0012862373614477716, + "loss": 0.1143, + "step": 42198 + }, + { + "epoch": 0.36630758413555436, + "grad_norm": 0.2099609375, + "learning_rate": 0.001286208243077784, + "loss": 0.1016, + "step": 42199 + }, + { + "epoch": 0.36631626461575856, + "grad_norm": 0.09326171875, + "learning_rate": 0.001286179124504162, + "loss": 0.0811, + "step": 42200 + }, + { + "epoch": 0.3663249450959627, + "grad_norm": 0.193359375, + "learning_rate": 0.0012861500057269372, + "loss": 0.1143, + "step": 42201 + }, + { + "epoch": 0.3663336255761669, + "grad_norm": 0.1875, + 
"learning_rate": 0.0012861208867461413, + "loss": 0.1064, + "step": 42202 + }, + { + "epoch": 0.366342306056371, + "grad_norm": 0.369140625, + "learning_rate": 0.0012860917675618067, + "loss": 0.1113, + "step": 42203 + }, + { + "epoch": 0.3663509865365752, + "grad_norm": 0.169921875, + "learning_rate": 0.0012860626481739647, + "loss": 0.1245, + "step": 42204 + }, + { + "epoch": 0.36635966701677936, + "grad_norm": 0.103515625, + "learning_rate": 0.001286033528582648, + "loss": 0.0801, + "step": 42205 + }, + { + "epoch": 0.36636834749698355, + "grad_norm": 0.2353515625, + "learning_rate": 0.0012860044087878873, + "loss": 0.125, + "step": 42206 + }, + { + "epoch": 0.3663770279771877, + "grad_norm": 0.419921875, + "learning_rate": 0.001285975288789715, + "loss": 0.083, + "step": 42207 + }, + { + "epoch": 0.3663857084573919, + "grad_norm": 0.08935546875, + "learning_rate": 0.0012859461685881628, + "loss": 0.0957, + "step": 42208 + }, + { + "epoch": 0.366394388937596, + "grad_norm": 0.37109375, + "learning_rate": 0.0012859170481832626, + "loss": 0.123, + "step": 42209 + }, + { + "epoch": 0.3664030694178002, + "grad_norm": 0.1416015625, + "learning_rate": 0.0012858879275750468, + "loss": 0.0688, + "step": 42210 + }, + { + "epoch": 0.36641174989800435, + "grad_norm": 0.5078125, + "learning_rate": 0.0012858588067635466, + "loss": 0.1201, + "step": 42211 + }, + { + "epoch": 0.36642043037820854, + "grad_norm": 0.119140625, + "learning_rate": 0.0012858296857487935, + "loss": 0.0879, + "step": 42212 + }, + { + "epoch": 0.3664291108584127, + "grad_norm": 0.32421875, + "learning_rate": 0.0012858005645308204, + "loss": 0.1235, + "step": 42213 + }, + { + "epoch": 0.36643779133861687, + "grad_norm": 0.20703125, + "learning_rate": 0.0012857714431096586, + "loss": 0.1035, + "step": 42214 + }, + { + "epoch": 0.366446471818821, + "grad_norm": 0.30078125, + "learning_rate": 0.00128574232148534, + "loss": 0.125, + "step": 42215 + }, + { + "epoch": 0.3664551522990252, + "grad_norm": 0.123046875, + "learning_rate": 0.0012857131996578959, + "loss": 0.1162, + "step": 42216 + }, + { + "epoch": 0.36646383277922934, + "grad_norm": 0.2890625, + "learning_rate": 0.0012856840776273588, + "loss": 0.1348, + "step": 42217 + }, + { + "epoch": 0.36647251325943353, + "grad_norm": 0.119140625, + "learning_rate": 0.0012856549553937604, + "loss": 0.1318, + "step": 42218 + }, + { + "epoch": 0.36648119373963767, + "grad_norm": 0.2138671875, + "learning_rate": 0.0012856258329571324, + "loss": 0.1045, + "step": 42219 + }, + { + "epoch": 0.36648987421984186, + "grad_norm": 0.1220703125, + "learning_rate": 0.001285596710317507, + "loss": 0.1211, + "step": 42220 + }, + { + "epoch": 0.366498554700046, + "grad_norm": 0.1552734375, + "learning_rate": 0.001285567587474916, + "loss": 0.0771, + "step": 42221 + }, + { + "epoch": 0.3665072351802502, + "grad_norm": 0.07421875, + "learning_rate": 0.001285538464429391, + "loss": 0.0913, + "step": 42222 + }, + { + "epoch": 0.36651591566045433, + "grad_norm": 0.1474609375, + "learning_rate": 0.0012855093411809637, + "loss": 0.1465, + "step": 42223 + }, + { + "epoch": 0.3665245961406585, + "grad_norm": 0.67578125, + "learning_rate": 0.0012854802177296664, + "loss": 0.104, + "step": 42224 + }, + { + "epoch": 0.36653327662086266, + "grad_norm": 1.078125, + "learning_rate": 0.0012854510940755303, + "loss": 0.1904, + "step": 42225 + }, + { + "epoch": 0.36654195710106685, + "grad_norm": 0.259765625, + "learning_rate": 0.0012854219702185881, + "loss": 0.0996, + "step": 42226 + }, + { + "epoch": 
0.366550637581271, + "grad_norm": 0.55078125, + "learning_rate": 0.0012853928461588713, + "loss": 0.1094, + "step": 42227 + }, + { + "epoch": 0.3665593180614752, + "grad_norm": 0.2236328125, + "learning_rate": 0.0012853637218964112, + "loss": 0.0791, + "step": 42228 + }, + { + "epoch": 0.3665679985416793, + "grad_norm": 0.359375, + "learning_rate": 0.0012853345974312403, + "loss": 0.0874, + "step": 42229 + }, + { + "epoch": 0.3665766790218835, + "grad_norm": 0.244140625, + "learning_rate": 0.0012853054727633903, + "loss": 0.166, + "step": 42230 + }, + { + "epoch": 0.36658535950208765, + "grad_norm": 0.357421875, + "learning_rate": 0.0012852763478928932, + "loss": 0.1064, + "step": 42231 + }, + { + "epoch": 0.36659403998229184, + "grad_norm": 0.765625, + "learning_rate": 0.0012852472228197806, + "loss": 0.5273, + "step": 42232 + }, + { + "epoch": 0.366602720462496, + "grad_norm": 0.2109375, + "learning_rate": 0.0012852180975440842, + "loss": 0.1108, + "step": 42233 + }, + { + "epoch": 0.36661140094270017, + "grad_norm": 0.15234375, + "learning_rate": 0.0012851889720658365, + "loss": 0.1055, + "step": 42234 + }, + { + "epoch": 0.3666200814229043, + "grad_norm": 0.453125, + "learning_rate": 0.0012851598463850687, + "loss": 0.1006, + "step": 42235 + }, + { + "epoch": 0.3666287619031085, + "grad_norm": 0.251953125, + "learning_rate": 0.0012851307205018127, + "loss": 0.1035, + "step": 42236 + }, + { + "epoch": 0.36663744238331264, + "grad_norm": 0.091796875, + "learning_rate": 0.0012851015944161006, + "loss": 0.1299, + "step": 42237 + }, + { + "epoch": 0.36664612286351683, + "grad_norm": 0.2490234375, + "learning_rate": 0.0012850724681279639, + "loss": 0.1416, + "step": 42238 + }, + { + "epoch": 0.36665480334372097, + "grad_norm": 0.11572265625, + "learning_rate": 0.0012850433416374352, + "loss": 0.1318, + "step": 42239 + }, + { + "epoch": 0.36666348382392516, + "grad_norm": 0.095703125, + "learning_rate": 0.0012850142149445458, + "loss": 0.084, + "step": 42240 + }, + { + "epoch": 0.3666721643041293, + "grad_norm": 0.10791015625, + "learning_rate": 0.0012849850880493276, + "loss": 0.1084, + "step": 42241 + }, + { + "epoch": 0.3666808447843335, + "grad_norm": 0.6328125, + "learning_rate": 0.0012849559609518124, + "loss": 0.1216, + "step": 42242 + }, + { + "epoch": 0.36668952526453763, + "grad_norm": 0.431640625, + "learning_rate": 0.0012849268336520324, + "loss": 0.1191, + "step": 42243 + }, + { + "epoch": 0.3666982057447418, + "grad_norm": 0.9921875, + "learning_rate": 0.0012848977061500188, + "loss": 0.0879, + "step": 42244 + }, + { + "epoch": 0.36670688622494596, + "grad_norm": 0.140625, + "learning_rate": 0.001284868578445804, + "loss": 0.1172, + "step": 42245 + }, + { + "epoch": 0.36671556670515015, + "grad_norm": 0.224609375, + "learning_rate": 0.0012848394505394199, + "loss": 0.1084, + "step": 42246 + }, + { + "epoch": 0.3667242471853543, + "grad_norm": 0.08837890625, + "learning_rate": 0.0012848103224308981, + "loss": 0.1118, + "step": 42247 + }, + { + "epoch": 0.3667329276655585, + "grad_norm": 0.169921875, + "learning_rate": 0.0012847811941202702, + "loss": 0.1104, + "step": 42248 + }, + { + "epoch": 0.3667416081457626, + "grad_norm": 0.09765625, + "learning_rate": 0.0012847520656075683, + "loss": 0.1079, + "step": 42249 + }, + { + "epoch": 0.3667502886259668, + "grad_norm": 0.27734375, + "learning_rate": 0.001284722936892825, + "loss": 0.2832, + "step": 42250 + }, + { + "epoch": 0.36675896910617095, + "grad_norm": 0.2421875, + "learning_rate": 0.0012846938079760708, + "loss": 0.085, + 
"step": 42251 + }, + { + "epoch": 0.36676764958637514, + "grad_norm": 0.251953125, + "learning_rate": 0.0012846646788573386, + "loss": 0.0957, + "step": 42252 + }, + { + "epoch": 0.3667763300665793, + "grad_norm": 0.15234375, + "learning_rate": 0.0012846355495366598, + "loss": 0.1211, + "step": 42253 + }, + { + "epoch": 0.3667850105467835, + "grad_norm": 0.12890625, + "learning_rate": 0.0012846064200140663, + "loss": 0.1094, + "step": 42254 + }, + { + "epoch": 0.3667936910269876, + "grad_norm": 0.2890625, + "learning_rate": 0.00128457729028959, + "loss": 0.1143, + "step": 42255 + }, + { + "epoch": 0.3668023715071918, + "grad_norm": 0.1474609375, + "learning_rate": 0.0012845481603632628, + "loss": 0.1099, + "step": 42256 + }, + { + "epoch": 0.36681105198739594, + "grad_norm": 0.3984375, + "learning_rate": 0.001284519030235116, + "loss": 0.0801, + "step": 42257 + }, + { + "epoch": 0.36681973246760013, + "grad_norm": 0.10888671875, + "learning_rate": 0.001284489899905183, + "loss": 0.0996, + "step": 42258 + }, + { + "epoch": 0.36682841294780427, + "grad_norm": 0.251953125, + "learning_rate": 0.0012844607693734936, + "loss": 0.1133, + "step": 42259 + }, + { + "epoch": 0.36683709342800846, + "grad_norm": 0.126953125, + "learning_rate": 0.001284431638640081, + "loss": 0.0742, + "step": 42260 + }, + { + "epoch": 0.3668457739082126, + "grad_norm": 0.56640625, + "learning_rate": 0.001284402507704977, + "loss": 0.082, + "step": 42261 + }, + { + "epoch": 0.36685445438841674, + "grad_norm": 0.1708984375, + "learning_rate": 0.0012843733765682131, + "loss": 0.1143, + "step": 42262 + }, + { + "epoch": 0.36686313486862093, + "grad_norm": 0.16796875, + "learning_rate": 0.0012843442452298212, + "loss": 0.127, + "step": 42263 + }, + { + "epoch": 0.36687181534882507, + "grad_norm": 0.2294921875, + "learning_rate": 0.001284315113689833, + "loss": 0.085, + "step": 42264 + }, + { + "epoch": 0.36688049582902926, + "grad_norm": 0.20703125, + "learning_rate": 0.001284285981948281, + "loss": 0.1108, + "step": 42265 + }, + { + "epoch": 0.3668891763092334, + "grad_norm": 0.1162109375, + "learning_rate": 0.0012842568500051961, + "loss": 0.1143, + "step": 42266 + }, + { + "epoch": 0.3668978567894376, + "grad_norm": 0.220703125, + "learning_rate": 0.001284227717860611, + "loss": 0.1001, + "step": 42267 + }, + { + "epoch": 0.36690653726964173, + "grad_norm": 0.1435546875, + "learning_rate": 0.001284198585514557, + "loss": 0.0854, + "step": 42268 + }, + { + "epoch": 0.3669152177498459, + "grad_norm": 0.0712890625, + "learning_rate": 0.0012841694529670664, + "loss": 0.0771, + "step": 42269 + }, + { + "epoch": 0.36692389823005006, + "grad_norm": 0.33203125, + "learning_rate": 0.001284140320218171, + "loss": 0.0928, + "step": 42270 + }, + { + "epoch": 0.36693257871025425, + "grad_norm": 0.3984375, + "learning_rate": 0.0012841111872679022, + "loss": 0.1147, + "step": 42271 + }, + { + "epoch": 0.3669412591904584, + "grad_norm": 0.66796875, + "learning_rate": 0.0012840820541162922, + "loss": 0.0996, + "step": 42272 + }, + { + "epoch": 0.3669499396706626, + "grad_norm": 0.83203125, + "learning_rate": 0.001284052920763373, + "loss": 0.0972, + "step": 42273 + }, + { + "epoch": 0.3669586201508667, + "grad_norm": 0.1025390625, + "learning_rate": 0.0012840237872091761, + "loss": 0.0962, + "step": 42274 + }, + { + "epoch": 0.3669673006310709, + "grad_norm": 0.55859375, + "learning_rate": 0.001283994653453734, + "loss": 0.0913, + "step": 42275 + }, + { + "epoch": 0.36697598111127505, + "grad_norm": 0.48828125, + "learning_rate": 
0.0012839655194970777, + "loss": 0.1582, + "step": 42276 + }, + { + "epoch": 0.36698466159147924, + "grad_norm": 0.3203125, + "learning_rate": 0.0012839363853392393, + "loss": 0.106, + "step": 42277 + }, + { + "epoch": 0.3669933420716834, + "grad_norm": 0.1318359375, + "learning_rate": 0.0012839072509802513, + "loss": 0.0913, + "step": 42278 + }, + { + "epoch": 0.3670020225518876, + "grad_norm": 0.3125, + "learning_rate": 0.0012838781164201448, + "loss": 0.1108, + "step": 42279 + }, + { + "epoch": 0.3670107030320917, + "grad_norm": 0.6015625, + "learning_rate": 0.001283848981658952, + "loss": 0.1328, + "step": 42280 + }, + { + "epoch": 0.3670193835122959, + "grad_norm": 0.2490234375, + "learning_rate": 0.0012838198466967046, + "loss": 0.0845, + "step": 42281 + }, + { + "epoch": 0.36702806399250004, + "grad_norm": 0.734375, + "learning_rate": 0.0012837907115334348, + "loss": 0.1279, + "step": 42282 + }, + { + "epoch": 0.36703674447270423, + "grad_norm": 0.65234375, + "learning_rate": 0.001283761576169174, + "loss": 0.0977, + "step": 42283 + }, + { + "epoch": 0.3670454249529084, + "grad_norm": 0.373046875, + "learning_rate": 0.0012837324406039547, + "loss": 0.103, + "step": 42284 + }, + { + "epoch": 0.36705410543311257, + "grad_norm": 0.54296875, + "learning_rate": 0.001283703304837808, + "loss": 0.1138, + "step": 42285 + }, + { + "epoch": 0.3670627859133167, + "grad_norm": 0.2109375, + "learning_rate": 0.0012836741688707662, + "loss": 0.104, + "step": 42286 + }, + { + "epoch": 0.3670714663935209, + "grad_norm": 0.2578125, + "learning_rate": 0.001283645032702861, + "loss": 0.1553, + "step": 42287 + }, + { + "epoch": 0.36708014687372503, + "grad_norm": 0.1591796875, + "learning_rate": 0.0012836158963341249, + "loss": 0.1226, + "step": 42288 + }, + { + "epoch": 0.3670888273539292, + "grad_norm": 0.62109375, + "learning_rate": 0.0012835867597645887, + "loss": 0.0957, + "step": 42289 + }, + { + "epoch": 0.36709750783413336, + "grad_norm": 0.1591796875, + "learning_rate": 0.0012835576229942847, + "loss": 0.0986, + "step": 42290 + }, + { + "epoch": 0.36710618831433756, + "grad_norm": 0.408203125, + "learning_rate": 0.0012835284860232452, + "loss": 0.1025, + "step": 42291 + }, + { + "epoch": 0.3671148687945417, + "grad_norm": 0.1572265625, + "learning_rate": 0.0012834993488515017, + "loss": 0.1289, + "step": 42292 + }, + { + "epoch": 0.3671235492747459, + "grad_norm": 0.125, + "learning_rate": 0.0012834702114790856, + "loss": 0.127, + "step": 42293 + }, + { + "epoch": 0.36713222975495, + "grad_norm": 0.09814453125, + "learning_rate": 0.0012834410739060297, + "loss": 0.0796, + "step": 42294 + }, + { + "epoch": 0.3671409102351542, + "grad_norm": 0.25, + "learning_rate": 0.0012834119361323651, + "loss": 0.1064, + "step": 42295 + }, + { + "epoch": 0.36714959071535835, + "grad_norm": 0.2041015625, + "learning_rate": 0.0012833827981581238, + "loss": 0.125, + "step": 42296 + }, + { + "epoch": 0.36715827119556255, + "grad_norm": 0.150390625, + "learning_rate": 0.0012833536599833385, + "loss": 0.0996, + "step": 42297 + }, + { + "epoch": 0.3671669516757667, + "grad_norm": 0.3203125, + "learning_rate": 0.0012833245216080397, + "loss": 0.1021, + "step": 42298 + }, + { + "epoch": 0.3671756321559709, + "grad_norm": 0.1376953125, + "learning_rate": 0.0012832953830322605, + "loss": 0.0874, + "step": 42299 + }, + { + "epoch": 0.367184312636175, + "grad_norm": 0.228515625, + "learning_rate": 0.001283266244256032, + "loss": 0.0771, + "step": 42300 + }, + { + "epoch": 0.3671929931163792, + "grad_norm": 1.109375, + 
"learning_rate": 0.0012832371052793863, + "loss": 0.1064, + "step": 42301 + }, + { + "epoch": 0.36720167359658334, + "grad_norm": 0.75390625, + "learning_rate": 0.0012832079661023552, + "loss": 0.0903, + "step": 42302 + }, + { + "epoch": 0.36721035407678754, + "grad_norm": 0.7109375, + "learning_rate": 0.0012831788267249707, + "loss": 0.1055, + "step": 42303 + }, + { + "epoch": 0.3672190345569917, + "grad_norm": 0.1640625, + "learning_rate": 0.0012831496871472646, + "loss": 0.1006, + "step": 42304 + }, + { + "epoch": 0.36722771503719587, + "grad_norm": 0.123046875, + "learning_rate": 0.0012831205473692686, + "loss": 0.127, + "step": 42305 + }, + { + "epoch": 0.3672363955174, + "grad_norm": 0.326171875, + "learning_rate": 0.0012830914073910147, + "loss": 0.1221, + "step": 42306 + }, + { + "epoch": 0.3672450759976042, + "grad_norm": 0.1630859375, + "learning_rate": 0.0012830622672125346, + "loss": 0.1064, + "step": 42307 + }, + { + "epoch": 0.36725375647780834, + "grad_norm": 0.1201171875, + "learning_rate": 0.0012830331268338612, + "loss": 0.1523, + "step": 42308 + }, + { + "epoch": 0.36726243695801253, + "grad_norm": 0.1650390625, + "learning_rate": 0.001283003986255025, + "loss": 0.0986, + "step": 42309 + }, + { + "epoch": 0.36727111743821667, + "grad_norm": 0.09521484375, + "learning_rate": 0.0012829748454760582, + "loss": 0.1021, + "step": 42310 + }, + { + "epoch": 0.36727979791842086, + "grad_norm": 0.138671875, + "learning_rate": 0.001282945704496993, + "loss": 0.0898, + "step": 42311 + }, + { + "epoch": 0.367288478398625, + "grad_norm": 0.2314453125, + "learning_rate": 0.0012829165633178612, + "loss": 0.1299, + "step": 42312 + }, + { + "epoch": 0.3672971588788292, + "grad_norm": 0.76171875, + "learning_rate": 0.0012828874219386948, + "loss": 0.1152, + "step": 42313 + }, + { + "epoch": 0.3673058393590333, + "grad_norm": 0.62890625, + "learning_rate": 0.001282858280359525, + "loss": 0.103, + "step": 42314 + }, + { + "epoch": 0.3673145198392375, + "grad_norm": 0.197265625, + "learning_rate": 0.0012828291385803842, + "loss": 0.1162, + "step": 42315 + }, + { + "epoch": 0.36732320031944166, + "grad_norm": 0.388671875, + "learning_rate": 0.0012827999966013044, + "loss": 0.0933, + "step": 42316 + }, + { + "epoch": 0.36733188079964585, + "grad_norm": 0.412109375, + "learning_rate": 0.0012827708544223173, + "loss": 0.1221, + "step": 42317 + }, + { + "epoch": 0.36734056127985, + "grad_norm": 0.0947265625, + "learning_rate": 0.0012827417120434546, + "loss": 0.1201, + "step": 42318 + }, + { + "epoch": 0.3673492417600542, + "grad_norm": 0.15625, + "learning_rate": 0.0012827125694647487, + "loss": 0.167, + "step": 42319 + }, + { + "epoch": 0.3673579222402583, + "grad_norm": 0.82421875, + "learning_rate": 0.0012826834266862304, + "loss": 0.1855, + "step": 42320 + }, + { + "epoch": 0.3673666027204625, + "grad_norm": 0.05419921875, + "learning_rate": 0.0012826542837079326, + "loss": 0.0874, + "step": 42321 + }, + { + "epoch": 0.36737528320066665, + "grad_norm": 0.51171875, + "learning_rate": 0.001282625140529887, + "loss": 0.0996, + "step": 42322 + }, + { + "epoch": 0.36738396368087084, + "grad_norm": 0.62109375, + "learning_rate": 0.0012825959971521248, + "loss": 0.1279, + "step": 42323 + }, + { + "epoch": 0.367392644161075, + "grad_norm": 0.1728515625, + "learning_rate": 0.0012825668535746788, + "loss": 0.1152, + "step": 42324 + }, + { + "epoch": 0.36740132464127917, + "grad_norm": 0.3046875, + "learning_rate": 0.0012825377097975801, + "loss": 0.103, + "step": 42325 + }, + { + "epoch": 
0.3674100051214833, + "grad_norm": 0.291015625, + "learning_rate": 0.0012825085658208612, + "loss": 0.1001, + "step": 42326 + }, + { + "epoch": 0.3674186856016875, + "grad_norm": 0.470703125, + "learning_rate": 0.0012824794216445537, + "loss": 0.2178, + "step": 42327 + }, + { + "epoch": 0.36742736608189164, + "grad_norm": 0.29296875, + "learning_rate": 0.0012824502772686892, + "loss": 0.1025, + "step": 42328 + }, + { + "epoch": 0.36743604656209583, + "grad_norm": 0.08447265625, + "learning_rate": 0.0012824211326932999, + "loss": 0.0938, + "step": 42329 + }, + { + "epoch": 0.36744472704229997, + "grad_norm": 1.03125, + "learning_rate": 0.0012823919879184178, + "loss": 0.0884, + "step": 42330 + }, + { + "epoch": 0.36745340752250416, + "grad_norm": 0.173828125, + "learning_rate": 0.0012823628429440743, + "loss": 0.1172, + "step": 42331 + }, + { + "epoch": 0.3674620880027083, + "grad_norm": 0.412109375, + "learning_rate": 0.0012823336977703017, + "loss": 0.1123, + "step": 42332 + }, + { + "epoch": 0.3674707684829125, + "grad_norm": 0.1923828125, + "learning_rate": 0.0012823045523971317, + "loss": 0.0967, + "step": 42333 + }, + { + "epoch": 0.36747944896311663, + "grad_norm": 0.357421875, + "learning_rate": 0.001282275406824596, + "loss": 0.0908, + "step": 42334 + }, + { + "epoch": 0.3674881294433208, + "grad_norm": 0.443359375, + "learning_rate": 0.0012822462610527269, + "loss": 0.082, + "step": 42335 + }, + { + "epoch": 0.36749680992352496, + "grad_norm": 0.32421875, + "learning_rate": 0.0012822171150815558, + "loss": 0.1094, + "step": 42336 + }, + { + "epoch": 0.36750549040372915, + "grad_norm": 0.2392578125, + "learning_rate": 0.001282187968911115, + "loss": 0.0957, + "step": 42337 + }, + { + "epoch": 0.3675141708839333, + "grad_norm": 0.18359375, + "learning_rate": 0.0012821588225414358, + "loss": 0.123, + "step": 42338 + }, + { + "epoch": 0.3675228513641375, + "grad_norm": 0.1552734375, + "learning_rate": 0.0012821296759725508, + "loss": 0.0767, + "step": 42339 + }, + { + "epoch": 0.3675315318443416, + "grad_norm": 0.0849609375, + "learning_rate": 0.0012821005292044916, + "loss": 0.1089, + "step": 42340 + }, + { + "epoch": 0.3675402123245458, + "grad_norm": 0.48828125, + "learning_rate": 0.0012820713822372899, + "loss": 0.1133, + "step": 42341 + }, + { + "epoch": 0.36754889280474995, + "grad_norm": 0.87890625, + "learning_rate": 0.0012820422350709776, + "loss": 0.0879, + "step": 42342 + }, + { + "epoch": 0.36755757328495414, + "grad_norm": 0.28515625, + "learning_rate": 0.0012820130877055867, + "loss": 0.083, + "step": 42343 + }, + { + "epoch": 0.3675662537651583, + "grad_norm": 0.107421875, + "learning_rate": 0.0012819839401411487, + "loss": 0.1104, + "step": 42344 + }, + { + "epoch": 0.3675749342453625, + "grad_norm": 0.48046875, + "learning_rate": 0.001281954792377696, + "loss": 0.0957, + "step": 42345 + }, + { + "epoch": 0.3675836147255666, + "grad_norm": 0.640625, + "learning_rate": 0.0012819256444152605, + "loss": 0.1133, + "step": 42346 + }, + { + "epoch": 0.3675922952057708, + "grad_norm": 0.48828125, + "learning_rate": 0.0012818964962538737, + "loss": 0.1064, + "step": 42347 + }, + { + "epoch": 0.36760097568597494, + "grad_norm": 0.455078125, + "learning_rate": 0.0012818673478935678, + "loss": 0.1309, + "step": 42348 + }, + { + "epoch": 0.36760965616617913, + "grad_norm": 0.1318359375, + "learning_rate": 0.0012818381993343743, + "loss": 0.0918, + "step": 42349 + }, + { + "epoch": 0.36761833664638327, + "grad_norm": 0.1220703125, + "learning_rate": 0.001281809050576325, + "loss": 
0.0894, + "step": 42350 + }, + { + "epoch": 0.36762701712658746, + "grad_norm": 0.125, + "learning_rate": 0.0012817799016194525, + "loss": 0.1621, + "step": 42351 + }, + { + "epoch": 0.3676356976067916, + "grad_norm": 0.55078125, + "learning_rate": 0.0012817507524637884, + "loss": 0.0928, + "step": 42352 + }, + { + "epoch": 0.3676443780869958, + "grad_norm": 0.859375, + "learning_rate": 0.0012817216031093639, + "loss": 0.1201, + "step": 42353 + }, + { + "epoch": 0.36765305856719993, + "grad_norm": 0.32421875, + "learning_rate": 0.0012816924535562115, + "loss": 0.083, + "step": 42354 + }, + { + "epoch": 0.3676617390474041, + "grad_norm": 0.119140625, + "learning_rate": 0.0012816633038043631, + "loss": 0.0781, + "step": 42355 + }, + { + "epoch": 0.36767041952760826, + "grad_norm": 0.33984375, + "learning_rate": 0.0012816341538538503, + "loss": 0.0742, + "step": 42356 + }, + { + "epoch": 0.36767910000781245, + "grad_norm": 0.103515625, + "learning_rate": 0.0012816050037047053, + "loss": 0.0806, + "step": 42357 + }, + { + "epoch": 0.3676877804880166, + "grad_norm": 0.52734375, + "learning_rate": 0.0012815758533569596, + "loss": 0.1011, + "step": 42358 + }, + { + "epoch": 0.3676964609682208, + "grad_norm": 0.189453125, + "learning_rate": 0.0012815467028106453, + "loss": 0.0688, + "step": 42359 + }, + { + "epoch": 0.3677051414484249, + "grad_norm": 0.2236328125, + "learning_rate": 0.0012815175520657946, + "loss": 0.0737, + "step": 42360 + }, + { + "epoch": 0.3677138219286291, + "grad_norm": 0.2294921875, + "learning_rate": 0.0012814884011224388, + "loss": 0.1631, + "step": 42361 + }, + { + "epoch": 0.36772250240883325, + "grad_norm": 0.34375, + "learning_rate": 0.0012814592499806098, + "loss": 0.0967, + "step": 42362 + }, + { + "epoch": 0.36773118288903744, + "grad_norm": 0.298828125, + "learning_rate": 0.0012814300986403395, + "loss": 0.0952, + "step": 42363 + }, + { + "epoch": 0.3677398633692416, + "grad_norm": 0.32421875, + "learning_rate": 0.0012814009471016606, + "loss": 0.0952, + "step": 42364 + }, + { + "epoch": 0.3677485438494458, + "grad_norm": 0.578125, + "learning_rate": 0.0012813717953646042, + "loss": 0.1406, + "step": 42365 + }, + { + "epoch": 0.3677572243296499, + "grad_norm": 0.30859375, + "learning_rate": 0.0012813426434292023, + "loss": 0.1123, + "step": 42366 + }, + { + "epoch": 0.3677659048098541, + "grad_norm": 0.494140625, + "learning_rate": 0.0012813134912954866, + "loss": 0.0889, + "step": 42367 + }, + { + "epoch": 0.36777458529005824, + "grad_norm": 0.2373046875, + "learning_rate": 0.0012812843389634892, + "loss": 0.1357, + "step": 42368 + }, + { + "epoch": 0.36778326577026244, + "grad_norm": 0.1376953125, + "learning_rate": 0.0012812551864332421, + "loss": 0.0947, + "step": 42369 + }, + { + "epoch": 0.3677919462504666, + "grad_norm": 0.5234375, + "learning_rate": 0.0012812260337047771, + "loss": 0.1162, + "step": 42370 + }, + { + "epoch": 0.36780062673067077, + "grad_norm": 0.1298828125, + "learning_rate": 0.001281196880778126, + "loss": 0.0967, + "step": 42371 + }, + { + "epoch": 0.3678093072108749, + "grad_norm": 0.8203125, + "learning_rate": 0.001281167727653321, + "loss": 0.1504, + "step": 42372 + }, + { + "epoch": 0.3678179876910791, + "grad_norm": 0.421875, + "learning_rate": 0.0012811385743303932, + "loss": 0.0923, + "step": 42373 + }, + { + "epoch": 0.36782666817128323, + "grad_norm": 0.26171875, + "learning_rate": 0.001281109420809375, + "loss": 0.0894, + "step": 42374 + }, + { + "epoch": 0.3678353486514874, + "grad_norm": 0.10302734375, + "learning_rate": 
0.0012810802670902986, + "loss": 0.0986, + "step": 42375 + }, + { + "epoch": 0.36784402913169156, + "grad_norm": 0.25390625, + "learning_rate": 0.001281051113173195, + "loss": 0.126, + "step": 42376 + }, + { + "epoch": 0.36785270961189576, + "grad_norm": 0.103515625, + "learning_rate": 0.001281021959058097, + "loss": 0.1069, + "step": 42377 + }, + { + "epoch": 0.3678613900920999, + "grad_norm": 0.169921875, + "learning_rate": 0.0012809928047450363, + "loss": 0.0703, + "step": 42378 + }, + { + "epoch": 0.3678700705723041, + "grad_norm": 0.2890625, + "learning_rate": 0.0012809636502340445, + "loss": 0.0835, + "step": 42379 + }, + { + "epoch": 0.3678787510525082, + "grad_norm": 0.5234375, + "learning_rate": 0.001280934495525153, + "loss": 0.0718, + "step": 42380 + }, + { + "epoch": 0.3678874315327124, + "grad_norm": 0.259765625, + "learning_rate": 0.001280905340618395, + "loss": 0.0894, + "step": 42381 + }, + { + "epoch": 0.36789611201291655, + "grad_norm": 0.208984375, + "learning_rate": 0.0012808761855138012, + "loss": 0.0986, + "step": 42382 + }, + { + "epoch": 0.36790479249312075, + "grad_norm": 0.1162109375, + "learning_rate": 0.0012808470302114042, + "loss": 0.1011, + "step": 42383 + }, + { + "epoch": 0.3679134729733249, + "grad_norm": 0.75, + "learning_rate": 0.0012808178747112354, + "loss": 0.0752, + "step": 42384 + }, + { + "epoch": 0.367922153453529, + "grad_norm": 0.1123046875, + "learning_rate": 0.0012807887190133267, + "loss": 0.1064, + "step": 42385 + }, + { + "epoch": 0.3679308339337332, + "grad_norm": 0.36328125, + "learning_rate": 0.0012807595631177107, + "loss": 0.1953, + "step": 42386 + }, + { + "epoch": 0.36793951441393735, + "grad_norm": 0.1279296875, + "learning_rate": 0.0012807304070244186, + "loss": 0.1172, + "step": 42387 + }, + { + "epoch": 0.36794819489414154, + "grad_norm": 0.384765625, + "learning_rate": 0.0012807012507334823, + "loss": 0.0986, + "step": 42388 + }, + { + "epoch": 0.3679568753743457, + "grad_norm": 0.1669921875, + "learning_rate": 0.0012806720942449337, + "loss": 0.083, + "step": 42389 + }, + { + "epoch": 0.3679655558545499, + "grad_norm": 0.453125, + "learning_rate": 0.001280642937558805, + "loss": 0.0776, + "step": 42390 + }, + { + "epoch": 0.367974236334754, + "grad_norm": 0.71875, + "learning_rate": 0.0012806137806751277, + "loss": 0.1611, + "step": 42391 + }, + { + "epoch": 0.3679829168149582, + "grad_norm": 0.1904296875, + "learning_rate": 0.001280584623593934, + "loss": 0.1099, + "step": 42392 + }, + { + "epoch": 0.36799159729516234, + "grad_norm": 0.1533203125, + "learning_rate": 0.0012805554663152558, + "loss": 0.085, + "step": 42393 + }, + { + "epoch": 0.36800027777536654, + "grad_norm": 0.083984375, + "learning_rate": 0.0012805263088391248, + "loss": 0.1025, + "step": 42394 + }, + { + "epoch": 0.3680089582555707, + "grad_norm": 0.302734375, + "learning_rate": 0.001280497151165573, + "loss": 0.1079, + "step": 42395 + }, + { + "epoch": 0.36801763873577487, + "grad_norm": 0.07373046875, + "learning_rate": 0.001280467993294632, + "loss": 0.083, + "step": 42396 + }, + { + "epoch": 0.368026319215979, + "grad_norm": 0.470703125, + "learning_rate": 0.0012804388352263341, + "loss": 0.0771, + "step": 42397 + }, + { + "epoch": 0.3680349996961832, + "grad_norm": 0.09228515625, + "learning_rate": 0.001280409676960711, + "loss": 0.0767, + "step": 42398 + }, + { + "epoch": 0.36804368017638733, + "grad_norm": 0.2734375, + "learning_rate": 0.0012803805184977946, + "loss": 0.1069, + "step": 42399 + }, + { + "epoch": 0.3680523606565915, + "grad_norm": 
0.93359375, + "learning_rate": 0.0012803513598376166, + "loss": 0.2041, + "step": 42400 + }, + { + "epoch": 0.36806104113679566, + "grad_norm": 0.09814453125, + "learning_rate": 0.0012803222009802092, + "loss": 0.0732, + "step": 42401 + }, + { + "epoch": 0.36806972161699986, + "grad_norm": 1.9375, + "learning_rate": 0.0012802930419256039, + "loss": 0.1006, + "step": 42402 + }, + { + "epoch": 0.368078402097204, + "grad_norm": 0.11328125, + "learning_rate": 0.0012802638826738331, + "loss": 0.1211, + "step": 42403 + }, + { + "epoch": 0.3680870825774082, + "grad_norm": 0.259765625, + "learning_rate": 0.0012802347232249286, + "loss": 0.0781, + "step": 42404 + }, + { + "epoch": 0.3680957630576123, + "grad_norm": 0.13671875, + "learning_rate": 0.0012802055635789218, + "loss": 0.0977, + "step": 42405 + }, + { + "epoch": 0.3681044435378165, + "grad_norm": 0.80078125, + "learning_rate": 0.001280176403735845, + "loss": 0.1318, + "step": 42406 + }, + { + "epoch": 0.36811312401802065, + "grad_norm": 0.2080078125, + "learning_rate": 0.00128014724369573, + "loss": 0.083, + "step": 42407 + }, + { + "epoch": 0.36812180449822485, + "grad_norm": 0.244140625, + "learning_rate": 0.0012801180834586087, + "loss": 0.1123, + "step": 42408 + }, + { + "epoch": 0.368130484978429, + "grad_norm": 0.27734375, + "learning_rate": 0.0012800889230245131, + "loss": 0.0908, + "step": 42409 + }, + { + "epoch": 0.3681391654586332, + "grad_norm": 0.123046875, + "learning_rate": 0.0012800597623934746, + "loss": 0.167, + "step": 42410 + }, + { + "epoch": 0.3681478459388373, + "grad_norm": 0.439453125, + "learning_rate": 0.0012800306015655256, + "loss": 0.085, + "step": 42411 + }, + { + "epoch": 0.3681565264190415, + "grad_norm": 0.53515625, + "learning_rate": 0.001280001440540698, + "loss": 0.1797, + "step": 42412 + }, + { + "epoch": 0.36816520689924565, + "grad_norm": 0.55859375, + "learning_rate": 0.0012799722793190235, + "loss": 0.1216, + "step": 42413 + }, + { + "epoch": 0.36817388737944984, + "grad_norm": 0.158203125, + "learning_rate": 0.001279943117900534, + "loss": 0.0889, + "step": 42414 + }, + { + "epoch": 0.368182567859654, + "grad_norm": 0.7734375, + "learning_rate": 0.001279913956285261, + "loss": 0.125, + "step": 42415 + }, + { + "epoch": 0.36819124833985817, + "grad_norm": 0.1787109375, + "learning_rate": 0.0012798847944732374, + "loss": 0.0996, + "step": 42416 + }, + { + "epoch": 0.3681999288200623, + "grad_norm": 0.0966796875, + "learning_rate": 0.0012798556324644943, + "loss": 0.1143, + "step": 42417 + }, + { + "epoch": 0.3682086093002665, + "grad_norm": 0.1572265625, + "learning_rate": 0.0012798264702590639, + "loss": 0.0977, + "step": 42418 + }, + { + "epoch": 0.36821728978047064, + "grad_norm": 0.10302734375, + "learning_rate": 0.0012797973078569776, + "loss": 0.1206, + "step": 42419 + }, + { + "epoch": 0.36822597026067483, + "grad_norm": 0.388671875, + "learning_rate": 0.0012797681452582676, + "loss": 0.1504, + "step": 42420 + }, + { + "epoch": 0.36823465074087897, + "grad_norm": 0.07763671875, + "learning_rate": 0.0012797389824629661, + "loss": 0.082, + "step": 42421 + }, + { + "epoch": 0.36824333122108316, + "grad_norm": 0.369140625, + "learning_rate": 0.001279709819471105, + "loss": 0.1797, + "step": 42422 + }, + { + "epoch": 0.3682520117012873, + "grad_norm": 0.41015625, + "learning_rate": 0.001279680656282716, + "loss": 0.0996, + "step": 42423 + }, + { + "epoch": 0.3682606921814915, + "grad_norm": 0.443359375, + "learning_rate": 0.0012796514928978305, + "loss": 0.0718, + "step": 42424 + }, + { + "epoch": 
0.3682693726616956, + "grad_norm": 0.14453125, + "learning_rate": 0.001279622329316481, + "loss": 0.1167, + "step": 42425 + }, + { + "epoch": 0.3682780531418998, + "grad_norm": 0.3203125, + "learning_rate": 0.001279593165538699, + "loss": 0.1025, + "step": 42426 + }, + { + "epoch": 0.36828673362210396, + "grad_norm": 0.1787109375, + "learning_rate": 0.0012795640015645171, + "loss": 0.1138, + "step": 42427 + }, + { + "epoch": 0.36829541410230815, + "grad_norm": 0.208984375, + "learning_rate": 0.0012795348373939664, + "loss": 0.1387, + "step": 42428 + }, + { + "epoch": 0.3683040945825123, + "grad_norm": 0.10009765625, + "learning_rate": 0.0012795056730270792, + "loss": 0.0933, + "step": 42429 + }, + { + "epoch": 0.3683127750627165, + "grad_norm": 0.203125, + "learning_rate": 0.0012794765084638868, + "loss": 0.0962, + "step": 42430 + }, + { + "epoch": 0.3683214555429206, + "grad_norm": 0.421875, + "learning_rate": 0.001279447343704422, + "loss": 0.1211, + "step": 42431 + }, + { + "epoch": 0.3683301360231248, + "grad_norm": 0.1494140625, + "learning_rate": 0.0012794181787487162, + "loss": 0.0825, + "step": 42432 + }, + { + "epoch": 0.36833881650332895, + "grad_norm": 0.4375, + "learning_rate": 0.0012793890135968016, + "loss": 0.1108, + "step": 42433 + }, + { + "epoch": 0.36834749698353314, + "grad_norm": 0.10205078125, + "learning_rate": 0.0012793598482487099, + "loss": 0.0903, + "step": 42434 + }, + { + "epoch": 0.3683561774637373, + "grad_norm": 0.6484375, + "learning_rate": 0.0012793306827044728, + "loss": 0.1045, + "step": 42435 + }, + { + "epoch": 0.36836485794394147, + "grad_norm": 0.3203125, + "learning_rate": 0.0012793015169641222, + "loss": 0.1035, + "step": 42436 + }, + { + "epoch": 0.3683735384241456, + "grad_norm": 0.310546875, + "learning_rate": 0.0012792723510276903, + "loss": 0.1035, + "step": 42437 + }, + { + "epoch": 0.3683822189043498, + "grad_norm": 0.48828125, + "learning_rate": 0.001279243184895209, + "loss": 0.1172, + "step": 42438 + }, + { + "epoch": 0.36839089938455394, + "grad_norm": 0.1396484375, + "learning_rate": 0.0012792140185667099, + "loss": 0.0962, + "step": 42439 + }, + { + "epoch": 0.36839957986475813, + "grad_norm": 0.1796875, + "learning_rate": 0.0012791848520422248, + "loss": 0.0796, + "step": 42440 + }, + { + "epoch": 0.36840826034496227, + "grad_norm": 0.12451171875, + "learning_rate": 0.001279155685321786, + "loss": 0.1367, + "step": 42441 + }, + { + "epoch": 0.36841694082516646, + "grad_norm": 0.076171875, + "learning_rate": 0.0012791265184054256, + "loss": 0.0918, + "step": 42442 + }, + { + "epoch": 0.3684256213053706, + "grad_norm": 0.2021484375, + "learning_rate": 0.0012790973512931748, + "loss": 0.0996, + "step": 42443 + }, + { + "epoch": 0.3684343017855748, + "grad_norm": 0.345703125, + "learning_rate": 0.0012790681839850656, + "loss": 0.1006, + "step": 42444 + }, + { + "epoch": 0.36844298226577893, + "grad_norm": 0.3125, + "learning_rate": 0.0012790390164811305, + "loss": 0.0776, + "step": 42445 + }, + { + "epoch": 0.3684516627459831, + "grad_norm": 0.177734375, + "learning_rate": 0.001279009848781401, + "loss": 0.1045, + "step": 42446 + }, + { + "epoch": 0.36846034322618726, + "grad_norm": 0.130859375, + "learning_rate": 0.0012789806808859092, + "loss": 0.0884, + "step": 42447 + }, + { + "epoch": 0.36846902370639145, + "grad_norm": 0.08203125, + "learning_rate": 0.0012789515127946862, + "loss": 0.0869, + "step": 42448 + }, + { + "epoch": 0.3684777041865956, + "grad_norm": 0.0927734375, + "learning_rate": 0.001278922344507765, + "loss": 0.0947, + 
"step": 42449 + }, + { + "epoch": 0.3684863846667998, + "grad_norm": 0.443359375, + "learning_rate": 0.0012788931760251768, + "loss": 0.125, + "step": 42450 + }, + { + "epoch": 0.3684950651470039, + "grad_norm": 0.1904296875, + "learning_rate": 0.0012788640073469538, + "loss": 0.1201, + "step": 42451 + }, + { + "epoch": 0.3685037456272081, + "grad_norm": 0.53515625, + "learning_rate": 0.001278834838473128, + "loss": 0.127, + "step": 42452 + }, + { + "epoch": 0.36851242610741225, + "grad_norm": 0.09765625, + "learning_rate": 0.0012788056694037308, + "loss": 0.1025, + "step": 42453 + }, + { + "epoch": 0.36852110658761644, + "grad_norm": 0.146484375, + "learning_rate": 0.0012787765001387942, + "loss": 0.0952, + "step": 42454 + }, + { + "epoch": 0.3685297870678206, + "grad_norm": 0.287109375, + "learning_rate": 0.0012787473306783508, + "loss": 0.0967, + "step": 42455 + }, + { + "epoch": 0.3685384675480248, + "grad_norm": 0.5, + "learning_rate": 0.0012787181610224318, + "loss": 0.1328, + "step": 42456 + }, + { + "epoch": 0.3685471480282289, + "grad_norm": 0.11865234375, + "learning_rate": 0.0012786889911710695, + "loss": 0.0815, + "step": 42457 + }, + { + "epoch": 0.3685558285084331, + "grad_norm": 0.1611328125, + "learning_rate": 0.0012786598211242952, + "loss": 0.0986, + "step": 42458 + }, + { + "epoch": 0.36856450898863724, + "grad_norm": 0.56640625, + "learning_rate": 0.0012786306508821414, + "loss": 0.1094, + "step": 42459 + }, + { + "epoch": 0.36857318946884143, + "grad_norm": 0.26171875, + "learning_rate": 0.0012786014804446399, + "loss": 0.0889, + "step": 42460 + }, + { + "epoch": 0.36858186994904557, + "grad_norm": 0.271484375, + "learning_rate": 0.0012785723098118226, + "loss": 0.0991, + "step": 42461 + }, + { + "epoch": 0.36859055042924976, + "grad_norm": 0.10498046875, + "learning_rate": 0.001278543138983721, + "loss": 0.1099, + "step": 42462 + }, + { + "epoch": 0.3685992309094539, + "grad_norm": 0.388671875, + "learning_rate": 0.0012785139679603675, + "loss": 0.0708, + "step": 42463 + }, + { + "epoch": 0.3686079113896581, + "grad_norm": 0.0888671875, + "learning_rate": 0.0012784847967417942, + "loss": 0.0737, + "step": 42464 + }, + { + "epoch": 0.36861659186986223, + "grad_norm": 0.423828125, + "learning_rate": 0.0012784556253280317, + "loss": 0.0806, + "step": 42465 + }, + { + "epoch": 0.3686252723500664, + "grad_norm": 0.1767578125, + "learning_rate": 0.0012784264537191135, + "loss": 0.1011, + "step": 42466 + }, + { + "epoch": 0.36863395283027056, + "grad_norm": 0.2734375, + "learning_rate": 0.0012783972819150706, + "loss": 0.1104, + "step": 42467 + }, + { + "epoch": 0.36864263331047475, + "grad_norm": 0.2060546875, + "learning_rate": 0.0012783681099159349, + "loss": 0.083, + "step": 42468 + }, + { + "epoch": 0.3686513137906789, + "grad_norm": 0.08154296875, + "learning_rate": 0.0012783389377217388, + "loss": 0.084, + "step": 42469 + }, + { + "epoch": 0.3686599942708831, + "grad_norm": 0.671875, + "learning_rate": 0.0012783097653325138, + "loss": 0.1084, + "step": 42470 + }, + { + "epoch": 0.3686686747510872, + "grad_norm": 0.2431640625, + "learning_rate": 0.001278280592748292, + "loss": 0.0894, + "step": 42471 + }, + { + "epoch": 0.3686773552312914, + "grad_norm": 0.216796875, + "learning_rate": 0.001278251419969105, + "loss": 0.1084, + "step": 42472 + }, + { + "epoch": 0.36868603571149555, + "grad_norm": 0.640625, + "learning_rate": 0.0012782222469949853, + "loss": 0.1055, + "step": 42473 + }, + { + "epoch": 0.36869471619169975, + "grad_norm": 0.0947265625, + "learning_rate": 
0.0012781930738259642, + "loss": 0.0615, + "step": 42474 + }, + { + "epoch": 0.3687033966719039, + "grad_norm": 0.1484375, + "learning_rate": 0.0012781639004620737, + "loss": 0.0664, + "step": 42475 + }, + { + "epoch": 0.3687120771521081, + "grad_norm": 0.65625, + "learning_rate": 0.0012781347269033463, + "loss": 0.1196, + "step": 42476 + }, + { + "epoch": 0.3687207576323122, + "grad_norm": 0.703125, + "learning_rate": 0.0012781055531498131, + "loss": 0.0596, + "step": 42477 + }, + { + "epoch": 0.3687294381125164, + "grad_norm": 0.330078125, + "learning_rate": 0.0012780763792015063, + "loss": 0.1035, + "step": 42478 + }, + { + "epoch": 0.36873811859272054, + "grad_norm": 0.44140625, + "learning_rate": 0.0012780472050584579, + "loss": 0.0796, + "step": 42479 + }, + { + "epoch": 0.36874679907292474, + "grad_norm": 0.2734375, + "learning_rate": 0.0012780180307206998, + "loss": 0.1226, + "step": 42480 + }, + { + "epoch": 0.3687554795531289, + "grad_norm": 0.2373046875, + "learning_rate": 0.0012779888561882638, + "loss": 0.0669, + "step": 42481 + }, + { + "epoch": 0.36876416003333307, + "grad_norm": 0.1259765625, + "learning_rate": 0.0012779596814611817, + "loss": 0.1299, + "step": 42482 + }, + { + "epoch": 0.3687728405135372, + "grad_norm": 0.138671875, + "learning_rate": 0.0012779305065394862, + "loss": 0.1143, + "step": 42483 + }, + { + "epoch": 0.3687815209937414, + "grad_norm": 0.12109375, + "learning_rate": 0.001277901331423208, + "loss": 0.1172, + "step": 42484 + }, + { + "epoch": 0.36879020147394553, + "grad_norm": 0.140625, + "learning_rate": 0.0012778721561123795, + "loss": 0.0659, + "step": 42485 + }, + { + "epoch": 0.3687988819541497, + "grad_norm": 0.1787109375, + "learning_rate": 0.0012778429806070332, + "loss": 0.0957, + "step": 42486 + }, + { + "epoch": 0.36880756243435386, + "grad_norm": 0.2314453125, + "learning_rate": 0.0012778138049072, + "loss": 0.085, + "step": 42487 + }, + { + "epoch": 0.36881624291455806, + "grad_norm": 0.78125, + "learning_rate": 0.0012777846290129124, + "loss": 0.1016, + "step": 42488 + }, + { + "epoch": 0.3688249233947622, + "grad_norm": 0.4765625, + "learning_rate": 0.001277755452924202, + "loss": 0.0894, + "step": 42489 + }, + { + "epoch": 0.3688336038749664, + "grad_norm": 0.42578125, + "learning_rate": 0.0012777262766411016, + "loss": 0.0825, + "step": 42490 + }, + { + "epoch": 0.3688422843551705, + "grad_norm": 0.1376953125, + "learning_rate": 0.001277697100163642, + "loss": 0.1118, + "step": 42491 + }, + { + "epoch": 0.3688509648353747, + "grad_norm": 0.1064453125, + "learning_rate": 0.0012776679234918556, + "loss": 0.0825, + "step": 42492 + }, + { + "epoch": 0.36885964531557885, + "grad_norm": 0.353515625, + "learning_rate": 0.001277638746625774, + "loss": 0.0933, + "step": 42493 + }, + { + "epoch": 0.36886832579578305, + "grad_norm": 0.1865234375, + "learning_rate": 0.0012776095695654294, + "loss": 0.1426, + "step": 42494 + }, + { + "epoch": 0.3688770062759872, + "grad_norm": 0.671875, + "learning_rate": 0.0012775803923108539, + "loss": 0.0815, + "step": 42495 + }, + { + "epoch": 0.3688856867561914, + "grad_norm": 0.1787109375, + "learning_rate": 0.001277551214862079, + "loss": 0.0815, + "step": 42496 + }, + { + "epoch": 0.3688943672363955, + "grad_norm": 0.1689453125, + "learning_rate": 0.0012775220372191367, + "loss": 0.1143, + "step": 42497 + }, + { + "epoch": 0.3689030477165997, + "grad_norm": 0.10546875, + "learning_rate": 0.0012774928593820589, + "loss": 0.0645, + "step": 42498 + }, + { + "epoch": 0.36891172819680385, + "grad_norm": 
0.1484375, + "learning_rate": 0.0012774636813508778, + "loss": 0.1069, + "step": 42499 + }, + { + "epoch": 0.36892040867700804, + "grad_norm": 0.6875, + "learning_rate": 0.001277434503125625, + "loss": 0.1387, + "step": 42500 + }, + { + "epoch": 0.3689290891572122, + "grad_norm": 0.55859375, + "learning_rate": 0.0012774053247063323, + "loss": 0.0786, + "step": 42501 + }, + { + "epoch": 0.36893776963741637, + "grad_norm": 0.404296875, + "learning_rate": 0.001277376146093032, + "loss": 0.0859, + "step": 42502 + }, + { + "epoch": 0.3689464501176205, + "grad_norm": 0.09423828125, + "learning_rate": 0.0012773469672857558, + "loss": 0.0791, + "step": 42503 + }, + { + "epoch": 0.3689551305978247, + "grad_norm": 0.11865234375, + "learning_rate": 0.0012773177882845358, + "loss": 0.0918, + "step": 42504 + }, + { + "epoch": 0.36896381107802884, + "grad_norm": 0.1142578125, + "learning_rate": 0.0012772886090894034, + "loss": 0.1118, + "step": 42505 + }, + { + "epoch": 0.36897249155823303, + "grad_norm": 0.37890625, + "learning_rate": 0.0012772594297003907, + "loss": 0.1162, + "step": 42506 + }, + { + "epoch": 0.36898117203843717, + "grad_norm": 1.015625, + "learning_rate": 0.00127723025011753, + "loss": 0.0791, + "step": 42507 + }, + { + "epoch": 0.3689898525186413, + "grad_norm": 0.5, + "learning_rate": 0.001277201070340853, + "loss": 0.1396, + "step": 42508 + }, + { + "epoch": 0.3689985329988455, + "grad_norm": 0.8515625, + "learning_rate": 0.001277171890370392, + "loss": 0.0996, + "step": 42509 + }, + { + "epoch": 0.36900721347904963, + "grad_norm": 0.443359375, + "learning_rate": 0.0012771427102061777, + "loss": 0.1172, + "step": 42510 + }, + { + "epoch": 0.3690158939592538, + "grad_norm": 0.283203125, + "learning_rate": 0.001277113529848243, + "loss": 0.1016, + "step": 42511 + }, + { + "epoch": 0.36902457443945796, + "grad_norm": 0.365234375, + "learning_rate": 0.00127708434929662, + "loss": 0.123, + "step": 42512 + }, + { + "epoch": 0.36903325491966216, + "grad_norm": 0.353515625, + "learning_rate": 0.0012770551685513398, + "loss": 0.085, + "step": 42513 + }, + { + "epoch": 0.3690419353998663, + "grad_norm": 0.1552734375, + "learning_rate": 0.0012770259876124348, + "loss": 0.1006, + "step": 42514 + }, + { + "epoch": 0.3690506158800705, + "grad_norm": 0.228515625, + "learning_rate": 0.0012769968064799372, + "loss": 0.1182, + "step": 42515 + }, + { + "epoch": 0.3690592963602746, + "grad_norm": 0.1806640625, + "learning_rate": 0.001276967625153878, + "loss": 0.0781, + "step": 42516 + }, + { + "epoch": 0.3690679768404788, + "grad_norm": 0.142578125, + "learning_rate": 0.00127693844363429, + "loss": 0.0688, + "step": 42517 + }, + { + "epoch": 0.36907665732068295, + "grad_norm": 0.11328125, + "learning_rate": 0.001276909261921205, + "loss": 0.0732, + "step": 42518 + }, + { + "epoch": 0.36908533780088715, + "grad_norm": 0.419921875, + "learning_rate": 0.001276880080014654, + "loss": 0.084, + "step": 42519 + }, + { + "epoch": 0.3690940182810913, + "grad_norm": 0.35546875, + "learning_rate": 0.0012768508979146703, + "loss": 0.0928, + "step": 42520 + }, + { + "epoch": 0.3691026987612955, + "grad_norm": 0.09228515625, + "learning_rate": 0.0012768217156212847, + "loss": 0.1006, + "step": 42521 + }, + { + "epoch": 0.3691113792414996, + "grad_norm": 0.42578125, + "learning_rate": 0.0012767925331345295, + "loss": 0.0723, + "step": 42522 + }, + { + "epoch": 0.3691200597217038, + "grad_norm": 0.10498046875, + "learning_rate": 0.001276763350454437, + "loss": 0.0884, + "step": 42523 + }, + { + "epoch": 
0.36912874020190795, + "grad_norm": 0.40234375, + "learning_rate": 0.0012767341675810385, + "loss": 0.1777, + "step": 42524 + }, + { + "epoch": 0.36913742068211214, + "grad_norm": 0.671875, + "learning_rate": 0.001276704984514366, + "loss": 0.0957, + "step": 42525 + }, + { + "epoch": 0.3691461011623163, + "grad_norm": 0.240234375, + "learning_rate": 0.001276675801254452, + "loss": 0.1001, + "step": 42526 + }, + { + "epoch": 0.36915478164252047, + "grad_norm": 0.328125, + "learning_rate": 0.0012766466178013274, + "loss": 0.082, + "step": 42527 + }, + { + "epoch": 0.3691634621227246, + "grad_norm": 1.2734375, + "learning_rate": 0.0012766174341550254, + "loss": 0.1602, + "step": 42528 + }, + { + "epoch": 0.3691721426029288, + "grad_norm": 0.34375, + "learning_rate": 0.0012765882503155768, + "loss": 0.1426, + "step": 42529 + }, + { + "epoch": 0.36918082308313294, + "grad_norm": 0.185546875, + "learning_rate": 0.0012765590662830143, + "loss": 0.1084, + "step": 42530 + }, + { + "epoch": 0.36918950356333713, + "grad_norm": 0.11865234375, + "learning_rate": 0.0012765298820573694, + "loss": 0.0967, + "step": 42531 + }, + { + "epoch": 0.36919818404354127, + "grad_norm": 0.2294921875, + "learning_rate": 0.0012765006976386737, + "loss": 0.0986, + "step": 42532 + }, + { + "epoch": 0.36920686452374546, + "grad_norm": 0.10302734375, + "learning_rate": 0.0012764715130269598, + "loss": 0.0913, + "step": 42533 + }, + { + "epoch": 0.3692155450039496, + "grad_norm": 0.19921875, + "learning_rate": 0.0012764423282222593, + "loss": 0.1162, + "step": 42534 + }, + { + "epoch": 0.3692242254841538, + "grad_norm": 0.2578125, + "learning_rate": 0.0012764131432246037, + "loss": 0.0996, + "step": 42535 + }, + { + "epoch": 0.3692329059643579, + "grad_norm": 0.1689453125, + "learning_rate": 0.0012763839580340254, + "loss": 0.1211, + "step": 42536 + }, + { + "epoch": 0.3692415864445621, + "grad_norm": 0.45703125, + "learning_rate": 0.001276354772650557, + "loss": 0.1426, + "step": 42537 + }, + { + "epoch": 0.36925026692476626, + "grad_norm": 0.06689453125, + "learning_rate": 0.001276325587074229, + "loss": 0.0649, + "step": 42538 + }, + { + "epoch": 0.36925894740497045, + "grad_norm": 0.189453125, + "learning_rate": 0.0012762964013050741, + "loss": 0.0952, + "step": 42539 + }, + { + "epoch": 0.3692676278851746, + "grad_norm": 0.1875, + "learning_rate": 0.0012762672153431244, + "loss": 0.0908, + "step": 42540 + }, + { + "epoch": 0.3692763083653788, + "grad_norm": 0.1728515625, + "learning_rate": 0.0012762380291884113, + "loss": 0.0835, + "step": 42541 + }, + { + "epoch": 0.3692849888455829, + "grad_norm": 0.5, + "learning_rate": 0.0012762088428409668, + "loss": 0.0918, + "step": 42542 + }, + { + "epoch": 0.3692936693257871, + "grad_norm": 0.1162109375, + "learning_rate": 0.0012761796563008233, + "loss": 0.124, + "step": 42543 + }, + { + "epoch": 0.36930234980599125, + "grad_norm": 0.35546875, + "learning_rate": 0.0012761504695680118, + "loss": 0.0933, + "step": 42544 + }, + { + "epoch": 0.36931103028619544, + "grad_norm": 0.33984375, + "learning_rate": 0.0012761212826425652, + "loss": 0.1309, + "step": 42545 + }, + { + "epoch": 0.3693197107663996, + "grad_norm": 0.5703125, + "learning_rate": 0.001276092095524515, + "loss": 0.0835, + "step": 42546 + }, + { + "epoch": 0.36932839124660377, + "grad_norm": 0.59375, + "learning_rate": 0.0012760629082138933, + "loss": 0.0811, + "step": 42547 + }, + { + "epoch": 0.3693370717268079, + "grad_norm": 0.48828125, + "learning_rate": 0.0012760337207107317, + "loss": 0.1064, + "step": 
42548 + }, + { + "epoch": 0.3693457522070121, + "grad_norm": 0.19140625, + "learning_rate": 0.001276004533015062, + "loss": 0.0962, + "step": 42549 + }, + { + "epoch": 0.36935443268721624, + "grad_norm": 0.1328125, + "learning_rate": 0.0012759753451269168, + "loss": 0.1045, + "step": 42550 + }, + { + "epoch": 0.36936311316742043, + "grad_norm": 0.11865234375, + "learning_rate": 0.0012759461570463274, + "loss": 0.0771, + "step": 42551 + }, + { + "epoch": 0.36937179364762457, + "grad_norm": 0.609375, + "learning_rate": 0.0012759169687733259, + "loss": 0.1104, + "step": 42552 + }, + { + "epoch": 0.36938047412782876, + "grad_norm": 0.10791015625, + "learning_rate": 0.0012758877803079442, + "loss": 0.0645, + "step": 42553 + }, + { + "epoch": 0.3693891546080329, + "grad_norm": 0.51953125, + "learning_rate": 0.0012758585916502144, + "loss": 0.1162, + "step": 42554 + }, + { + "epoch": 0.3693978350882371, + "grad_norm": 0.57421875, + "learning_rate": 0.001275829402800168, + "loss": 0.0957, + "step": 42555 + }, + { + "epoch": 0.36940651556844123, + "grad_norm": 0.326171875, + "learning_rate": 0.0012758002137578377, + "loss": 0.126, + "step": 42556 + }, + { + "epoch": 0.3694151960486454, + "grad_norm": 0.25390625, + "learning_rate": 0.0012757710245232547, + "loss": 0.1494, + "step": 42557 + }, + { + "epoch": 0.36942387652884956, + "grad_norm": 0.396484375, + "learning_rate": 0.0012757418350964511, + "loss": 0.0977, + "step": 42558 + }, + { + "epoch": 0.36943255700905375, + "grad_norm": 0.14453125, + "learning_rate": 0.0012757126454774588, + "loss": 0.1318, + "step": 42559 + }, + { + "epoch": 0.3694412374892579, + "grad_norm": 0.1259765625, + "learning_rate": 0.00127568345566631, + "loss": 0.1143, + "step": 42560 + }, + { + "epoch": 0.3694499179694621, + "grad_norm": 0.130859375, + "learning_rate": 0.0012756542656630365, + "loss": 0.083, + "step": 42561 + }, + { + "epoch": 0.3694585984496662, + "grad_norm": 0.1123046875, + "learning_rate": 0.0012756250754676697, + "loss": 0.0811, + "step": 42562 + }, + { + "epoch": 0.3694672789298704, + "grad_norm": 1.0859375, + "learning_rate": 0.0012755958850802422, + "loss": 0.1104, + "step": 42563 + }, + { + "epoch": 0.36947595941007455, + "grad_norm": 0.1259765625, + "learning_rate": 0.0012755666945007856, + "loss": 0.1211, + "step": 42564 + }, + { + "epoch": 0.36948463989027874, + "grad_norm": 0.38671875, + "learning_rate": 0.0012755375037293319, + "loss": 0.0938, + "step": 42565 + }, + { + "epoch": 0.3694933203704829, + "grad_norm": 0.78125, + "learning_rate": 0.001275508312765913, + "loss": 0.0957, + "step": 42566 + }, + { + "epoch": 0.3695020008506871, + "grad_norm": 0.263671875, + "learning_rate": 0.001275479121610561, + "loss": 0.1172, + "step": 42567 + }, + { + "epoch": 0.3695106813308912, + "grad_norm": 0.1650390625, + "learning_rate": 0.001275449930263308, + "loss": 0.0952, + "step": 42568 + }, + { + "epoch": 0.3695193618110954, + "grad_norm": 0.1953125, + "learning_rate": 0.0012754207387241849, + "loss": 0.1504, + "step": 42569 + }, + { + "epoch": 0.36952804229129954, + "grad_norm": 0.06982421875, + "learning_rate": 0.0012753915469932244, + "loss": 0.0708, + "step": 42570 + }, + { + "epoch": 0.36953672277150373, + "grad_norm": 0.46484375, + "learning_rate": 0.0012753623550704586, + "loss": 0.0977, + "step": 42571 + }, + { + "epoch": 0.36954540325170787, + "grad_norm": 0.255859375, + "learning_rate": 0.0012753331629559192, + "loss": 0.1484, + "step": 42572 + }, + { + "epoch": 0.36955408373191206, + "grad_norm": 0.1953125, + "learning_rate": 
0.0012753039706496381, + "loss": 0.0928, + "step": 42573 + }, + { + "epoch": 0.3695627642121162, + "grad_norm": 0.10400390625, + "learning_rate": 0.0012752747781516471, + "loss": 0.0742, + "step": 42574 + }, + { + "epoch": 0.3695714446923204, + "grad_norm": 0.2236328125, + "learning_rate": 0.001275245585461978, + "loss": 0.1523, + "step": 42575 + }, + { + "epoch": 0.36958012517252453, + "grad_norm": 0.244140625, + "learning_rate": 0.0012752163925806635, + "loss": 0.1162, + "step": 42576 + }, + { + "epoch": 0.3695888056527287, + "grad_norm": 0.17578125, + "learning_rate": 0.0012751871995077347, + "loss": 0.0923, + "step": 42577 + }, + { + "epoch": 0.36959748613293286, + "grad_norm": 0.2041015625, + "learning_rate": 0.0012751580062432238, + "loss": 0.1035, + "step": 42578 + }, + { + "epoch": 0.36960616661313705, + "grad_norm": 0.1494140625, + "learning_rate": 0.0012751288127871626, + "loss": 0.1245, + "step": 42579 + }, + { + "epoch": 0.3696148470933412, + "grad_norm": 0.2216796875, + "learning_rate": 0.0012750996191395834, + "loss": 0.1289, + "step": 42580 + }, + { + "epoch": 0.3696235275735454, + "grad_norm": 0.42578125, + "learning_rate": 0.0012750704253005178, + "loss": 0.1113, + "step": 42581 + }, + { + "epoch": 0.3696322080537495, + "grad_norm": 0.09716796875, + "learning_rate": 0.0012750412312699977, + "loss": 0.0801, + "step": 42582 + }, + { + "epoch": 0.3696408885339537, + "grad_norm": 0.275390625, + "learning_rate": 0.0012750120370480554, + "loss": 0.0771, + "step": 42583 + }, + { + "epoch": 0.36964956901415785, + "grad_norm": 0.255859375, + "learning_rate": 0.0012749828426347222, + "loss": 0.1299, + "step": 42584 + }, + { + "epoch": 0.36965824949436205, + "grad_norm": 0.365234375, + "learning_rate": 0.0012749536480300308, + "loss": 0.1123, + "step": 42585 + }, + { + "epoch": 0.3696669299745662, + "grad_norm": 0.11767578125, + "learning_rate": 0.0012749244532340126, + "loss": 0.126, + "step": 42586 + }, + { + "epoch": 0.3696756104547704, + "grad_norm": 0.5234375, + "learning_rate": 0.0012748952582466995, + "loss": 0.0864, + "step": 42587 + }, + { + "epoch": 0.3696842909349745, + "grad_norm": 0.361328125, + "learning_rate": 0.0012748660630681236, + "loss": 0.1377, + "step": 42588 + }, + { + "epoch": 0.3696929714151787, + "grad_norm": 0.1533203125, + "learning_rate": 0.0012748368676983167, + "loss": 0.0806, + "step": 42589 + }, + { + "epoch": 0.36970165189538284, + "grad_norm": 0.2255859375, + "learning_rate": 0.0012748076721373113, + "loss": 0.1064, + "step": 42590 + }, + { + "epoch": 0.36971033237558704, + "grad_norm": 0.275390625, + "learning_rate": 0.0012747784763851384, + "loss": 0.0776, + "step": 42591 + }, + { + "epoch": 0.3697190128557912, + "grad_norm": 0.322265625, + "learning_rate": 0.0012747492804418303, + "loss": 0.0933, + "step": 42592 + }, + { + "epoch": 0.36972769333599537, + "grad_norm": 0.3125, + "learning_rate": 0.0012747200843074191, + "loss": 0.1055, + "step": 42593 + }, + { + "epoch": 0.3697363738161995, + "grad_norm": 0.9921875, + "learning_rate": 0.001274690887981937, + "loss": 0.4219, + "step": 42594 + }, + { + "epoch": 0.3697450542964037, + "grad_norm": 0.2099609375, + "learning_rate": 0.0012746616914654152, + "loss": 0.1816, + "step": 42595 + }, + { + "epoch": 0.36975373477660783, + "grad_norm": 0.07666015625, + "learning_rate": 0.001274632494757886, + "loss": 0.0654, + "step": 42596 + }, + { + "epoch": 0.369762415256812, + "grad_norm": 0.27734375, + "learning_rate": 0.0012746032978593815, + "loss": 0.0615, + "step": 42597 + }, + { + "epoch": 
0.36977109573701616, + "grad_norm": 0.287109375, + "learning_rate": 0.0012745741007699337, + "loss": 0.1089, + "step": 42598 + }, + { + "epoch": 0.36977977621722036, + "grad_norm": 0.373046875, + "learning_rate": 0.001274544903489574, + "loss": 0.0879, + "step": 42599 + }, + { + "epoch": 0.3697884566974245, + "grad_norm": 0.134765625, + "learning_rate": 0.0012745157060183347, + "loss": 0.0688, + "step": 42600 + }, + { + "epoch": 0.3697971371776287, + "grad_norm": 0.2041015625, + "learning_rate": 0.0012744865083562475, + "loss": 0.1084, + "step": 42601 + }, + { + "epoch": 0.3698058176578328, + "grad_norm": 0.6640625, + "learning_rate": 0.0012744573105033445, + "loss": 0.1484, + "step": 42602 + }, + { + "epoch": 0.369814498138037, + "grad_norm": 0.1328125, + "learning_rate": 0.0012744281124596577, + "loss": 0.1045, + "step": 42603 + }, + { + "epoch": 0.36982317861824116, + "grad_norm": 0.76953125, + "learning_rate": 0.001274398914225219, + "loss": 0.1143, + "step": 42604 + }, + { + "epoch": 0.36983185909844535, + "grad_norm": 0.3203125, + "learning_rate": 0.0012743697158000599, + "loss": 0.0718, + "step": 42605 + }, + { + "epoch": 0.3698405395786495, + "grad_norm": 0.1142578125, + "learning_rate": 0.0012743405171842129, + "loss": 0.0859, + "step": 42606 + }, + { + "epoch": 0.3698492200588537, + "grad_norm": 0.5078125, + "learning_rate": 0.00127431131837771, + "loss": 0.0908, + "step": 42607 + }, + { + "epoch": 0.3698579005390578, + "grad_norm": 0.09814453125, + "learning_rate": 0.0012742821193805825, + "loss": 0.126, + "step": 42608 + }, + { + "epoch": 0.369866581019262, + "grad_norm": 0.55859375, + "learning_rate": 0.0012742529201928627, + "loss": 0.1133, + "step": 42609 + }, + { + "epoch": 0.36987526149946615, + "grad_norm": 0.0791015625, + "learning_rate": 0.0012742237208145828, + "loss": 0.0801, + "step": 42610 + }, + { + "epoch": 0.36988394197967034, + "grad_norm": 0.1845703125, + "learning_rate": 0.0012741945212457744, + "loss": 0.0996, + "step": 42611 + }, + { + "epoch": 0.3698926224598745, + "grad_norm": 0.25, + "learning_rate": 0.0012741653214864695, + "loss": 0.1133, + "step": 42612 + }, + { + "epoch": 0.36990130294007867, + "grad_norm": 0.142578125, + "learning_rate": 0.0012741361215366997, + "loss": 0.1021, + "step": 42613 + }, + { + "epoch": 0.3699099834202828, + "grad_norm": 0.08837890625, + "learning_rate": 0.0012741069213964976, + "loss": 0.082, + "step": 42614 + }, + { + "epoch": 0.369918663900487, + "grad_norm": 0.091796875, + "learning_rate": 0.0012740777210658945, + "loss": 0.0894, + "step": 42615 + }, + { + "epoch": 0.36992734438069114, + "grad_norm": 0.23046875, + "learning_rate": 0.0012740485205449228, + "loss": 0.1123, + "step": 42616 + }, + { + "epoch": 0.36993602486089533, + "grad_norm": 0.322265625, + "learning_rate": 0.0012740193198336144, + "loss": 0.0854, + "step": 42617 + }, + { + "epoch": 0.36994470534109947, + "grad_norm": 0.08544921875, + "learning_rate": 0.001273990118932001, + "loss": 0.0791, + "step": 42618 + }, + { + "epoch": 0.36995338582130366, + "grad_norm": 0.181640625, + "learning_rate": 0.0012739609178401147, + "loss": 0.0684, + "step": 42619 + }, + { + "epoch": 0.3699620663015078, + "grad_norm": 0.421875, + "learning_rate": 0.0012739317165579868, + "loss": 0.0977, + "step": 42620 + }, + { + "epoch": 0.369970746781712, + "grad_norm": 0.435546875, + "learning_rate": 0.00127390251508565, + "loss": 0.0889, + "step": 42621 + }, + { + "epoch": 0.3699794272619161, + "grad_norm": 0.13671875, + "learning_rate": 0.0012738733134231364, + "loss": 0.0718, + 
"step": 42622 + }, + { + "epoch": 0.3699881077421203, + "grad_norm": 0.193359375, + "learning_rate": 0.0012738441115704774, + "loss": 0.1357, + "step": 42623 + }, + { + "epoch": 0.36999678822232446, + "grad_norm": 0.6015625, + "learning_rate": 0.001273814909527705, + "loss": 0.1143, + "step": 42624 + }, + { + "epoch": 0.37000546870252865, + "grad_norm": 0.671875, + "learning_rate": 0.0012737857072948513, + "loss": 0.1064, + "step": 42625 + }, + { + "epoch": 0.3700141491827328, + "grad_norm": 0.78125, + "learning_rate": 0.0012737565048719483, + "loss": 0.1025, + "step": 42626 + }, + { + "epoch": 0.370022829662937, + "grad_norm": 0.12109375, + "learning_rate": 0.0012737273022590274, + "loss": 0.0991, + "step": 42627 + }, + { + "epoch": 0.3700315101431411, + "grad_norm": 0.2275390625, + "learning_rate": 0.0012736980994561213, + "loss": 0.085, + "step": 42628 + }, + { + "epoch": 0.3700401906233453, + "grad_norm": 0.296875, + "learning_rate": 0.0012736688964632614, + "loss": 0.1494, + "step": 42629 + }, + { + "epoch": 0.37004887110354945, + "grad_norm": 0.138671875, + "learning_rate": 0.00127363969328048, + "loss": 0.1064, + "step": 42630 + }, + { + "epoch": 0.3700575515837536, + "grad_norm": 0.0947265625, + "learning_rate": 0.0012736104899078087, + "loss": 0.0889, + "step": 42631 + }, + { + "epoch": 0.3700662320639578, + "grad_norm": 0.30859375, + "learning_rate": 0.0012735812863452793, + "loss": 0.1816, + "step": 42632 + }, + { + "epoch": 0.3700749125441619, + "grad_norm": 0.71875, + "learning_rate": 0.0012735520825929243, + "loss": 0.1133, + "step": 42633 + }, + { + "epoch": 0.3700835930243661, + "grad_norm": 0.546875, + "learning_rate": 0.0012735228786507756, + "loss": 0.1045, + "step": 42634 + }, + { + "epoch": 0.37009227350457025, + "grad_norm": 0.14453125, + "learning_rate": 0.0012734936745188645, + "loss": 0.1045, + "step": 42635 + }, + { + "epoch": 0.37010095398477444, + "grad_norm": 0.1376953125, + "learning_rate": 0.0012734644701972236, + "loss": 0.1445, + "step": 42636 + }, + { + "epoch": 0.3701096344649786, + "grad_norm": 0.494140625, + "learning_rate": 0.0012734352656858844, + "loss": 0.0947, + "step": 42637 + }, + { + "epoch": 0.37011831494518277, + "grad_norm": 0.154296875, + "learning_rate": 0.001273406060984879, + "loss": 0.1201, + "step": 42638 + }, + { + "epoch": 0.3701269954253869, + "grad_norm": 0.34375, + "learning_rate": 0.0012733768560942394, + "loss": 0.127, + "step": 42639 + }, + { + "epoch": 0.3701356759055911, + "grad_norm": 0.3125, + "learning_rate": 0.0012733476510139972, + "loss": 0.1084, + "step": 42640 + }, + { + "epoch": 0.37014435638579524, + "grad_norm": 0.5078125, + "learning_rate": 0.0012733184457441847, + "loss": 0.1406, + "step": 42641 + }, + { + "epoch": 0.37015303686599943, + "grad_norm": 0.0869140625, + "learning_rate": 0.0012732892402848338, + "loss": 0.0796, + "step": 42642 + }, + { + "epoch": 0.37016171734620357, + "grad_norm": 0.1689453125, + "learning_rate": 0.0012732600346359766, + "loss": 0.0918, + "step": 42643 + }, + { + "epoch": 0.37017039782640776, + "grad_norm": 0.6015625, + "learning_rate": 0.0012732308287976446, + "loss": 0.1235, + "step": 42644 + }, + { + "epoch": 0.3701790783066119, + "grad_norm": 0.166015625, + "learning_rate": 0.0012732016227698702, + "loss": 0.1069, + "step": 42645 + }, + { + "epoch": 0.3701877587868161, + "grad_norm": 0.78125, + "learning_rate": 0.001273172416552685, + "loss": 0.0918, + "step": 42646 + }, + { + "epoch": 0.3701964392670202, + "grad_norm": 0.58203125, + "learning_rate": 0.001273143210146121, + 
"loss": 0.0703, + "step": 42647 + }, + { + "epoch": 0.3702051197472244, + "grad_norm": 0.337890625, + "learning_rate": 0.0012731140035502101, + "loss": 0.0693, + "step": 42648 + }, + { + "epoch": 0.37021380022742856, + "grad_norm": 0.34375, + "learning_rate": 0.0012730847967649846, + "loss": 0.0791, + "step": 42649 + }, + { + "epoch": 0.37022248070763275, + "grad_norm": 0.4921875, + "learning_rate": 0.001273055589790476, + "loss": 0.0845, + "step": 42650 + }, + { + "epoch": 0.3702311611878369, + "grad_norm": 0.1943359375, + "learning_rate": 0.0012730263826267164, + "loss": 0.0864, + "step": 42651 + }, + { + "epoch": 0.3702398416680411, + "grad_norm": 0.0830078125, + "learning_rate": 0.0012729971752737377, + "loss": 0.0864, + "step": 42652 + }, + { + "epoch": 0.3702485221482452, + "grad_norm": 0.0927734375, + "learning_rate": 0.001272967967731572, + "loss": 0.0957, + "step": 42653 + }, + { + "epoch": 0.3702572026284494, + "grad_norm": 0.447265625, + "learning_rate": 0.001272938760000251, + "loss": 0.1738, + "step": 42654 + }, + { + "epoch": 0.37026588310865355, + "grad_norm": 0.30078125, + "learning_rate": 0.0012729095520798068, + "loss": 0.0957, + "step": 42655 + }, + { + "epoch": 0.37027456358885774, + "grad_norm": 0.12158203125, + "learning_rate": 0.0012728803439702716, + "loss": 0.0732, + "step": 42656 + }, + { + "epoch": 0.3702832440690619, + "grad_norm": 0.0859375, + "learning_rate": 0.0012728511356716766, + "loss": 0.1104, + "step": 42657 + }, + { + "epoch": 0.37029192454926607, + "grad_norm": 0.1630859375, + "learning_rate": 0.0012728219271840545, + "loss": 0.126, + "step": 42658 + }, + { + "epoch": 0.3703006050294702, + "grad_norm": 0.337890625, + "learning_rate": 0.0012727927185074367, + "loss": 0.0952, + "step": 42659 + }, + { + "epoch": 0.3703092855096744, + "grad_norm": 0.49609375, + "learning_rate": 0.0012727635096418557, + "loss": 0.0854, + "step": 42660 + }, + { + "epoch": 0.37031796598987854, + "grad_norm": 0.177734375, + "learning_rate": 0.0012727343005873428, + "loss": 0.1006, + "step": 42661 + }, + { + "epoch": 0.37032664647008273, + "grad_norm": 0.267578125, + "learning_rate": 0.0012727050913439305, + "loss": 0.0898, + "step": 42662 + }, + { + "epoch": 0.37033532695028687, + "grad_norm": 1.796875, + "learning_rate": 0.0012726758819116505, + "loss": 0.3652, + "step": 42663 + }, + { + "epoch": 0.37034400743049106, + "grad_norm": 0.057373046875, + "learning_rate": 0.0012726466722905345, + "loss": 0.0654, + "step": 42664 + }, + { + "epoch": 0.3703526879106952, + "grad_norm": 0.1669921875, + "learning_rate": 0.001272617462480615, + "loss": 0.0674, + "step": 42665 + }, + { + "epoch": 0.3703613683908994, + "grad_norm": 0.263671875, + "learning_rate": 0.0012725882524819235, + "loss": 0.104, + "step": 42666 + }, + { + "epoch": 0.37037004887110353, + "grad_norm": 0.51171875, + "learning_rate": 0.001272559042294492, + "loss": 0.1201, + "step": 42667 + }, + { + "epoch": 0.3703787293513077, + "grad_norm": 0.51953125, + "learning_rate": 0.0012725298319183527, + "loss": 0.0762, + "step": 42668 + }, + { + "epoch": 0.37038740983151186, + "grad_norm": 0.0849609375, + "learning_rate": 0.0012725006213535372, + "loss": 0.125, + "step": 42669 + }, + { + "epoch": 0.37039609031171605, + "grad_norm": 0.14453125, + "learning_rate": 0.0012724714106000777, + "loss": 0.0991, + "step": 42670 + }, + { + "epoch": 0.3704047707919202, + "grad_norm": 0.125, + "learning_rate": 0.001272442199658006, + "loss": 0.1553, + "step": 42671 + }, + { + "epoch": 0.3704134512721244, + "grad_norm": 0.13671875, + 
"learning_rate": 0.001272412988527354, + "loss": 0.1123, + "step": 42672 + }, + { + "epoch": 0.3704221317523285, + "grad_norm": 0.3125, + "learning_rate": 0.0012723837772081541, + "loss": 0.0952, + "step": 42673 + }, + { + "epoch": 0.3704308122325327, + "grad_norm": 0.416015625, + "learning_rate": 0.0012723545657004375, + "loss": 0.0908, + "step": 42674 + }, + { + "epoch": 0.37043949271273685, + "grad_norm": 0.083984375, + "learning_rate": 0.0012723253540042367, + "loss": 0.0732, + "step": 42675 + }, + { + "epoch": 0.37044817319294104, + "grad_norm": 0.08642578125, + "learning_rate": 0.0012722961421195835, + "loss": 0.084, + "step": 42676 + }, + { + "epoch": 0.3704568536731452, + "grad_norm": 0.6328125, + "learning_rate": 0.0012722669300465103, + "loss": 0.127, + "step": 42677 + }, + { + "epoch": 0.3704655341533494, + "grad_norm": 0.1142578125, + "learning_rate": 0.0012722377177850476, + "loss": 0.1094, + "step": 42678 + }, + { + "epoch": 0.3704742146335535, + "grad_norm": 0.07568359375, + "learning_rate": 0.001272208505335229, + "loss": 0.0874, + "step": 42679 + }, + { + "epoch": 0.3704828951137577, + "grad_norm": 1.046875, + "learning_rate": 0.0012721792926970853, + "loss": 0.0908, + "step": 42680 + }, + { + "epoch": 0.37049157559396184, + "grad_norm": 0.30078125, + "learning_rate": 0.0012721500798706493, + "loss": 0.0811, + "step": 42681 + }, + { + "epoch": 0.37050025607416603, + "grad_norm": 0.162109375, + "learning_rate": 0.0012721208668559525, + "loss": 0.1172, + "step": 42682 + }, + { + "epoch": 0.37050893655437017, + "grad_norm": 2.75, + "learning_rate": 0.0012720916536530268, + "loss": 0.2559, + "step": 42683 + }, + { + "epoch": 0.37051761703457436, + "grad_norm": 0.166015625, + "learning_rate": 0.0012720624402619046, + "loss": 0.126, + "step": 42684 + }, + { + "epoch": 0.3705262975147785, + "grad_norm": 0.2734375, + "learning_rate": 0.001272033226682617, + "loss": 0.1025, + "step": 42685 + }, + { + "epoch": 0.3705349779949827, + "grad_norm": 0.431640625, + "learning_rate": 0.0012720040129151969, + "loss": 0.1113, + "step": 42686 + }, + { + "epoch": 0.37054365847518683, + "grad_norm": 0.64453125, + "learning_rate": 0.0012719747989596753, + "loss": 0.085, + "step": 42687 + }, + { + "epoch": 0.370552338955391, + "grad_norm": 0.2412109375, + "learning_rate": 0.0012719455848160851, + "loss": 0.0815, + "step": 42688 + }, + { + "epoch": 0.37056101943559516, + "grad_norm": 0.263671875, + "learning_rate": 0.0012719163704844572, + "loss": 0.1338, + "step": 42689 + }, + { + "epoch": 0.37056969991579936, + "grad_norm": 0.193359375, + "learning_rate": 0.0012718871559648248, + "loss": 0.1172, + "step": 42690 + }, + { + "epoch": 0.3705783803960035, + "grad_norm": 0.62890625, + "learning_rate": 0.0012718579412572186, + "loss": 0.1211, + "step": 42691 + }, + { + "epoch": 0.3705870608762077, + "grad_norm": 0.2060546875, + "learning_rate": 0.0012718287263616718, + "loss": 0.0869, + "step": 42692 + }, + { + "epoch": 0.3705957413564118, + "grad_norm": 0.2470703125, + "learning_rate": 0.0012717995112782152, + "loss": 0.1143, + "step": 42693 + }, + { + "epoch": 0.370604421836616, + "grad_norm": 0.2578125, + "learning_rate": 0.0012717702960068813, + "loss": 0.1074, + "step": 42694 + }, + { + "epoch": 0.37061310231682015, + "grad_norm": 0.2412109375, + "learning_rate": 0.0012717410805477022, + "loss": 0.0674, + "step": 42695 + }, + { + "epoch": 0.37062178279702435, + "grad_norm": 0.296875, + "learning_rate": 0.0012717118649007095, + "loss": 0.0918, + "step": 42696 + }, + { + "epoch": 0.3706304632772285, 
+ "grad_norm": 0.07666015625, + "learning_rate": 0.0012716826490659354, + "loss": 0.0771, + "step": 42697 + }, + { + "epoch": 0.3706391437574327, + "grad_norm": 0.19140625, + "learning_rate": 0.0012716534330434112, + "loss": 0.1113, + "step": 42698 + }, + { + "epoch": 0.3706478242376368, + "grad_norm": 0.283203125, + "learning_rate": 0.00127162421683317, + "loss": 0.0747, + "step": 42699 + }, + { + "epoch": 0.370656504717841, + "grad_norm": 1.5859375, + "learning_rate": 0.001271595000435243, + "loss": 0.1582, + "step": 42700 + }, + { + "epoch": 0.37066518519804514, + "grad_norm": 0.1591796875, + "learning_rate": 0.0012715657838496622, + "loss": 0.1162, + "step": 42701 + }, + { + "epoch": 0.37067386567824934, + "grad_norm": 0.09130859375, + "learning_rate": 0.0012715365670764596, + "loss": 0.0938, + "step": 42702 + }, + { + "epoch": 0.3706825461584535, + "grad_norm": 0.1884765625, + "learning_rate": 0.0012715073501156674, + "loss": 0.0923, + "step": 42703 + }, + { + "epoch": 0.37069122663865767, + "grad_norm": 1.6015625, + "learning_rate": 0.001271478132967317, + "loss": 0.2852, + "step": 42704 + }, + { + "epoch": 0.3706999071188618, + "grad_norm": 0.255859375, + "learning_rate": 0.001271448915631441, + "loss": 0.0918, + "step": 42705 + }, + { + "epoch": 0.370708587599066, + "grad_norm": 0.08203125, + "learning_rate": 0.0012714196981080711, + "loss": 0.0952, + "step": 42706 + }, + { + "epoch": 0.37071726807927013, + "grad_norm": 0.267578125, + "learning_rate": 0.001271390480397239, + "loss": 0.1211, + "step": 42707 + }, + { + "epoch": 0.3707259485594743, + "grad_norm": 0.353515625, + "learning_rate": 0.0012713612624989767, + "loss": 0.0898, + "step": 42708 + }, + { + "epoch": 0.37073462903967846, + "grad_norm": 0.1181640625, + "learning_rate": 0.0012713320444133164, + "loss": 0.0894, + "step": 42709 + }, + { + "epoch": 0.37074330951988266, + "grad_norm": 0.1337890625, + "learning_rate": 0.00127130282614029, + "loss": 0.1011, + "step": 42710 + }, + { + "epoch": 0.3707519900000868, + "grad_norm": 0.1689453125, + "learning_rate": 0.0012712736076799297, + "loss": 0.0933, + "step": 42711 + }, + { + "epoch": 0.370760670480291, + "grad_norm": 0.130859375, + "learning_rate": 0.0012712443890322666, + "loss": 0.1064, + "step": 42712 + }, + { + "epoch": 0.3707693509604951, + "grad_norm": 0.2119140625, + "learning_rate": 0.0012712151701973332, + "loss": 0.0991, + "step": 42713 + }, + { + "epoch": 0.3707780314406993, + "grad_norm": 0.291015625, + "learning_rate": 0.001271185951175162, + "loss": 0.0928, + "step": 42714 + }, + { + "epoch": 0.37078671192090346, + "grad_norm": 0.234375, + "learning_rate": 0.0012711567319657845, + "loss": 0.1035, + "step": 42715 + }, + { + "epoch": 0.37079539240110765, + "grad_norm": 0.2197265625, + "learning_rate": 0.001271127512569232, + "loss": 0.104, + "step": 42716 + }, + { + "epoch": 0.3708040728813118, + "grad_norm": 0.07177734375, + "learning_rate": 0.0012710982929855373, + "loss": 0.0815, + "step": 42717 + }, + { + "epoch": 0.370812753361516, + "grad_norm": 0.2109375, + "learning_rate": 0.001271069073214732, + "loss": 0.1118, + "step": 42718 + }, + { + "epoch": 0.3708214338417201, + "grad_norm": 0.1025390625, + "learning_rate": 0.0012710398532568478, + "loss": 0.0713, + "step": 42719 + }, + { + "epoch": 0.3708301143219243, + "grad_norm": 0.2275390625, + "learning_rate": 0.0012710106331119179, + "loss": 0.1055, + "step": 42720 + }, + { + "epoch": 0.37083879480212845, + "grad_norm": 0.30078125, + "learning_rate": 0.0012709814127799725, + "loss": 0.0903, + "step": 
42721 + }, + { + "epoch": 0.37084747528233264, + "grad_norm": 0.130859375, + "learning_rate": 0.0012709521922610446, + "loss": 0.1035, + "step": 42722 + }, + { + "epoch": 0.3708561557625368, + "grad_norm": 0.6875, + "learning_rate": 0.0012709229715551662, + "loss": 0.125, + "step": 42723 + }, + { + "epoch": 0.37086483624274097, + "grad_norm": 0.10498046875, + "learning_rate": 0.0012708937506623688, + "loss": 0.0947, + "step": 42724 + }, + { + "epoch": 0.3708735167229451, + "grad_norm": 0.359375, + "learning_rate": 0.0012708645295826848, + "loss": 0.0718, + "step": 42725 + }, + { + "epoch": 0.3708821972031493, + "grad_norm": 0.1650390625, + "learning_rate": 0.0012708353083161455, + "loss": 0.1123, + "step": 42726 + }, + { + "epoch": 0.37089087768335344, + "grad_norm": 0.54296875, + "learning_rate": 0.0012708060868627834, + "loss": 0.085, + "step": 42727 + }, + { + "epoch": 0.37089955816355763, + "grad_norm": 0.546875, + "learning_rate": 0.0012707768652226305, + "loss": 0.0981, + "step": 42728 + }, + { + "epoch": 0.37090823864376177, + "grad_norm": 0.18359375, + "learning_rate": 0.0012707476433957187, + "loss": 0.1084, + "step": 42729 + }, + { + "epoch": 0.37091691912396596, + "grad_norm": 0.322265625, + "learning_rate": 0.0012707184213820793, + "loss": 0.1021, + "step": 42730 + }, + { + "epoch": 0.3709255996041701, + "grad_norm": 0.10888671875, + "learning_rate": 0.0012706891991817453, + "loss": 0.126, + "step": 42731 + }, + { + "epoch": 0.3709342800843743, + "grad_norm": 0.11181640625, + "learning_rate": 0.0012706599767947479, + "loss": 0.0679, + "step": 42732 + }, + { + "epoch": 0.37094296056457843, + "grad_norm": 0.06640625, + "learning_rate": 0.0012706307542211196, + "loss": 0.0718, + "step": 42733 + }, + { + "epoch": 0.3709516410447826, + "grad_norm": 0.205078125, + "learning_rate": 0.001270601531460892, + "loss": 0.0898, + "step": 42734 + }, + { + "epoch": 0.37096032152498676, + "grad_norm": 0.322265625, + "learning_rate": 0.0012705723085140968, + "loss": 0.1289, + "step": 42735 + }, + { + "epoch": 0.37096900200519095, + "grad_norm": 0.08056640625, + "learning_rate": 0.0012705430853807668, + "loss": 0.0938, + "step": 42736 + }, + { + "epoch": 0.3709776824853951, + "grad_norm": 1.015625, + "learning_rate": 0.0012705138620609329, + "loss": 0.1299, + "step": 42737 + }, + { + "epoch": 0.3709863629655993, + "grad_norm": 0.146484375, + "learning_rate": 0.001270484638554628, + "loss": 0.1123, + "step": 42738 + }, + { + "epoch": 0.3709950434458034, + "grad_norm": 0.1435546875, + "learning_rate": 0.001270455414861883, + "loss": 0.1206, + "step": 42739 + }, + { + "epoch": 0.3710037239260076, + "grad_norm": 0.1396484375, + "learning_rate": 0.0012704261909827312, + "loss": 0.0811, + "step": 42740 + }, + { + "epoch": 0.37101240440621175, + "grad_norm": 0.185546875, + "learning_rate": 0.0012703969669172039, + "loss": 0.0977, + "step": 42741 + }, + { + "epoch": 0.37102108488641594, + "grad_norm": 0.298828125, + "learning_rate": 0.0012703677426653328, + "loss": 0.0933, + "step": 42742 + }, + { + "epoch": 0.3710297653666201, + "grad_norm": 0.10546875, + "learning_rate": 0.00127033851822715, + "loss": 0.0781, + "step": 42743 + }, + { + "epoch": 0.37103844584682427, + "grad_norm": 0.30859375, + "learning_rate": 0.0012703092936026878, + "loss": 0.1123, + "step": 42744 + }, + { + "epoch": 0.3710471263270284, + "grad_norm": 0.1787109375, + "learning_rate": 0.001270280068791978, + "loss": 0.1387, + "step": 42745 + }, + { + "epoch": 0.3710558068072326, + "grad_norm": 0.115234375, + "learning_rate": 
0.0012702508437950522, + "loss": 0.0996, + "step": 42746 + }, + { + "epoch": 0.37106448728743674, + "grad_norm": 0.3203125, + "learning_rate": 0.0012702216186119427, + "loss": 0.0767, + "step": 42747 + }, + { + "epoch": 0.37107316776764093, + "grad_norm": 0.337890625, + "learning_rate": 0.0012701923932426813, + "loss": 0.0752, + "step": 42748 + }, + { + "epoch": 0.37108184824784507, + "grad_norm": 0.28125, + "learning_rate": 0.0012701631676873002, + "loss": 0.0864, + "step": 42749 + }, + { + "epoch": 0.37109052872804926, + "grad_norm": 0.267578125, + "learning_rate": 0.0012701339419458313, + "loss": 0.1572, + "step": 42750 + }, + { + "epoch": 0.3710992092082534, + "grad_norm": 0.6640625, + "learning_rate": 0.0012701047160183063, + "loss": 0.1104, + "step": 42751 + }, + { + "epoch": 0.3711078896884576, + "grad_norm": 0.1279296875, + "learning_rate": 0.0012700754899047575, + "loss": 0.1221, + "step": 42752 + }, + { + "epoch": 0.37111657016866173, + "grad_norm": 0.34765625, + "learning_rate": 0.0012700462636052166, + "loss": 0.0942, + "step": 42753 + }, + { + "epoch": 0.37112525064886587, + "grad_norm": 0.330078125, + "learning_rate": 0.0012700170371197157, + "loss": 0.0898, + "step": 42754 + }, + { + "epoch": 0.37113393112907006, + "grad_norm": 0.078125, + "learning_rate": 0.0012699878104482868, + "loss": 0.0815, + "step": 42755 + }, + { + "epoch": 0.3711426116092742, + "grad_norm": 0.1796875, + "learning_rate": 0.0012699585835909614, + "loss": 0.0825, + "step": 42756 + }, + { + "epoch": 0.3711512920894784, + "grad_norm": 0.173828125, + "learning_rate": 0.0012699293565477721, + "loss": 0.082, + "step": 42757 + }, + { + "epoch": 0.37115997256968253, + "grad_norm": 0.373046875, + "learning_rate": 0.0012699001293187506, + "loss": 0.1074, + "step": 42758 + }, + { + "epoch": 0.3711686530498867, + "grad_norm": 0.1552734375, + "learning_rate": 0.0012698709019039291, + "loss": 0.1406, + "step": 42759 + }, + { + "epoch": 0.37117733353009086, + "grad_norm": 0.265625, + "learning_rate": 0.0012698416743033391, + "loss": 0.0811, + "step": 42760 + }, + { + "epoch": 0.37118601401029505, + "grad_norm": 0.39453125, + "learning_rate": 0.0012698124465170128, + "loss": 0.0864, + "step": 42761 + }, + { + "epoch": 0.3711946944904992, + "grad_norm": 0.376953125, + "learning_rate": 0.0012697832185449823, + "loss": 0.1084, + "step": 42762 + }, + { + "epoch": 0.3712033749707034, + "grad_norm": 0.15625, + "learning_rate": 0.0012697539903872795, + "loss": 0.1113, + "step": 42763 + }, + { + "epoch": 0.3712120554509075, + "grad_norm": 0.59765625, + "learning_rate": 0.001269724762043936, + "loss": 0.1245, + "step": 42764 + }, + { + "epoch": 0.3712207359311117, + "grad_norm": 0.5546875, + "learning_rate": 0.001269695533514984, + "loss": 0.1152, + "step": 42765 + }, + { + "epoch": 0.37122941641131585, + "grad_norm": 0.337890625, + "learning_rate": 0.0012696663048004555, + "loss": 0.0928, + "step": 42766 + }, + { + "epoch": 0.37123809689152004, + "grad_norm": 0.1826171875, + "learning_rate": 0.0012696370759003829, + "loss": 0.0815, + "step": 42767 + }, + { + "epoch": 0.3712467773717242, + "grad_norm": 0.86328125, + "learning_rate": 0.0012696078468147974, + "loss": 0.105, + "step": 42768 + }, + { + "epoch": 0.37125545785192837, + "grad_norm": 0.279296875, + "learning_rate": 0.0012695786175437312, + "loss": 0.1089, + "step": 42769 + }, + { + "epoch": 0.3712641383321325, + "grad_norm": 0.306640625, + "learning_rate": 0.0012695493880872167, + "loss": 0.1064, + "step": 42770 + }, + { + "epoch": 0.3712728188123367, + "grad_norm": 
0.439453125, + "learning_rate": 0.0012695201584452856, + "loss": 0.0688, + "step": 42771 + }, + { + "epoch": 0.37128149929254084, + "grad_norm": 0.3671875, + "learning_rate": 0.0012694909286179697, + "loss": 0.0811, + "step": 42772 + }, + { + "epoch": 0.37129017977274503, + "grad_norm": 0.2373046875, + "learning_rate": 0.001269461698605301, + "loss": 0.1006, + "step": 42773 + }, + { + "epoch": 0.37129886025294917, + "grad_norm": 0.7265625, + "learning_rate": 0.0012694324684073112, + "loss": 0.166, + "step": 42774 + }, + { + "epoch": 0.37130754073315336, + "grad_norm": 0.65625, + "learning_rate": 0.0012694032380240329, + "loss": 0.127, + "step": 42775 + }, + { + "epoch": 0.3713162212133575, + "grad_norm": 0.73828125, + "learning_rate": 0.0012693740074554978, + "loss": 0.2246, + "step": 42776 + }, + { + "epoch": 0.3713249016935617, + "grad_norm": 0.80859375, + "learning_rate": 0.0012693447767017378, + "loss": 0.1504, + "step": 42777 + }, + { + "epoch": 0.37133358217376583, + "grad_norm": 0.58984375, + "learning_rate": 0.0012693155457627848, + "loss": 0.1221, + "step": 42778 + }, + { + "epoch": 0.37134226265397, + "grad_norm": 0.73828125, + "learning_rate": 0.0012692863146386711, + "loss": 0.0703, + "step": 42779 + }, + { + "epoch": 0.37135094313417416, + "grad_norm": 0.609375, + "learning_rate": 0.0012692570833294284, + "loss": 0.1338, + "step": 42780 + }, + { + "epoch": 0.37135962361437835, + "grad_norm": 0.11083984375, + "learning_rate": 0.0012692278518350883, + "loss": 0.1064, + "step": 42781 + }, + { + "epoch": 0.3713683040945825, + "grad_norm": 0.0966796875, + "learning_rate": 0.0012691986201556836, + "loss": 0.1064, + "step": 42782 + }, + { + "epoch": 0.3713769845747867, + "grad_norm": 0.140625, + "learning_rate": 0.0012691693882912456, + "loss": 0.0747, + "step": 42783 + }, + { + "epoch": 0.3713856650549908, + "grad_norm": 0.2216796875, + "learning_rate": 0.0012691401562418066, + "loss": 0.1162, + "step": 42784 + }, + { + "epoch": 0.371394345535195, + "grad_norm": 0.62890625, + "learning_rate": 0.0012691109240073983, + "loss": 0.1133, + "step": 42785 + }, + { + "epoch": 0.37140302601539915, + "grad_norm": 0.173828125, + "learning_rate": 0.0012690816915880531, + "loss": 0.1099, + "step": 42786 + }, + { + "epoch": 0.37141170649560334, + "grad_norm": 0.50390625, + "learning_rate": 0.0012690524589838027, + "loss": 0.0996, + "step": 42787 + }, + { + "epoch": 0.3714203869758075, + "grad_norm": 0.470703125, + "learning_rate": 0.0012690232261946789, + "loss": 0.1367, + "step": 42788 + }, + { + "epoch": 0.3714290674560117, + "grad_norm": 0.11767578125, + "learning_rate": 0.001268993993220714, + "loss": 0.0986, + "step": 42789 + }, + { + "epoch": 0.3714377479362158, + "grad_norm": 0.1982421875, + "learning_rate": 0.0012689647600619396, + "loss": 0.0918, + "step": 42790 + }, + { + "epoch": 0.37144642841642, + "grad_norm": 0.5859375, + "learning_rate": 0.001268935526718388, + "loss": 0.1348, + "step": 42791 + }, + { + "epoch": 0.37145510889662414, + "grad_norm": 0.11083984375, + "learning_rate": 0.001268906293190091, + "loss": 0.0918, + "step": 42792 + }, + { + "epoch": 0.37146378937682833, + "grad_norm": 0.10302734375, + "learning_rate": 0.0012688770594770807, + "loss": 0.0942, + "step": 42793 + }, + { + "epoch": 0.3714724698570325, + "grad_norm": 0.265625, + "learning_rate": 0.0012688478255793887, + "loss": 0.1172, + "step": 42794 + }, + { + "epoch": 0.37148115033723667, + "grad_norm": 0.326171875, + "learning_rate": 0.0012688185914970474, + "loss": 0.1504, + "step": 42795 + }, + { + "epoch": 
0.3714898308174408, + "grad_norm": 0.251953125, + "learning_rate": 0.0012687893572300888, + "loss": 0.1162, + "step": 42796 + }, + { + "epoch": 0.371498511297645, + "grad_norm": 0.82421875, + "learning_rate": 0.0012687601227785446, + "loss": 0.1562, + "step": 42797 + }, + { + "epoch": 0.37150719177784913, + "grad_norm": 0.51171875, + "learning_rate": 0.001268730888142447, + "loss": 0.0698, + "step": 42798 + }, + { + "epoch": 0.3715158722580533, + "grad_norm": 0.12451171875, + "learning_rate": 0.0012687016533218275, + "loss": 0.1162, + "step": 42799 + }, + { + "epoch": 0.37152455273825746, + "grad_norm": 0.37109375, + "learning_rate": 0.001268672418316719, + "loss": 0.0693, + "step": 42800 + }, + { + "epoch": 0.37153323321846166, + "grad_norm": 0.28125, + "learning_rate": 0.0012686431831271524, + "loss": 0.1128, + "step": 42801 + }, + { + "epoch": 0.3715419136986658, + "grad_norm": 0.443359375, + "learning_rate": 0.0012686139477531603, + "loss": 0.1523, + "step": 42802 + }, + { + "epoch": 0.37155059417887, + "grad_norm": 0.17578125, + "learning_rate": 0.001268584712194774, + "loss": 0.1182, + "step": 42803 + }, + { + "epoch": 0.3715592746590741, + "grad_norm": 0.373046875, + "learning_rate": 0.001268555476452027, + "loss": 0.0898, + "step": 42804 + }, + { + "epoch": 0.3715679551392783, + "grad_norm": 0.5546875, + "learning_rate": 0.0012685262405249494, + "loss": 0.1318, + "step": 42805 + }, + { + "epoch": 0.37157663561948245, + "grad_norm": 3.15625, + "learning_rate": 0.0012684970044135743, + "loss": 0.2578, + "step": 42806 + }, + { + "epoch": 0.37158531609968665, + "grad_norm": 0.24609375, + "learning_rate": 0.0012684677681179335, + "loss": 0.0815, + "step": 42807 + }, + { + "epoch": 0.3715939965798908, + "grad_norm": 0.296875, + "learning_rate": 0.0012684385316380587, + "loss": 0.1504, + "step": 42808 + }, + { + "epoch": 0.371602677060095, + "grad_norm": 0.6953125, + "learning_rate": 0.0012684092949739824, + "loss": 0.0923, + "step": 42809 + }, + { + "epoch": 0.3716113575402991, + "grad_norm": 0.1787109375, + "learning_rate": 0.001268380058125736, + "loss": 0.1104, + "step": 42810 + }, + { + "epoch": 0.3716200380205033, + "grad_norm": 0.39453125, + "learning_rate": 0.0012683508210933516, + "loss": 0.1309, + "step": 42811 + }, + { + "epoch": 0.37162871850070744, + "grad_norm": 0.302734375, + "learning_rate": 0.0012683215838768613, + "loss": 0.0825, + "step": 42812 + }, + { + "epoch": 0.37163739898091164, + "grad_norm": 0.373046875, + "learning_rate": 0.0012682923464762972, + "loss": 0.1055, + "step": 42813 + }, + { + "epoch": 0.3716460794611158, + "grad_norm": 0.484375, + "learning_rate": 0.001268263108891691, + "loss": 0.1543, + "step": 42814 + }, + { + "epoch": 0.37165475994131997, + "grad_norm": 0.203125, + "learning_rate": 0.001268233871123075, + "loss": 0.0928, + "step": 42815 + }, + { + "epoch": 0.3716634404215241, + "grad_norm": 0.333984375, + "learning_rate": 0.0012682046331704807, + "loss": 0.1074, + "step": 42816 + }, + { + "epoch": 0.3716721209017283, + "grad_norm": 0.193359375, + "learning_rate": 0.0012681753950339403, + "loss": 0.0947, + "step": 42817 + }, + { + "epoch": 0.37168080138193244, + "grad_norm": 0.1748046875, + "learning_rate": 0.0012681461567134859, + "loss": 0.1279, + "step": 42818 + }, + { + "epoch": 0.37168948186213663, + "grad_norm": 0.6484375, + "learning_rate": 0.0012681169182091497, + "loss": 0.0986, + "step": 42819 + }, + { + "epoch": 0.37169816234234077, + "grad_norm": 0.208984375, + "learning_rate": 0.0012680876795209634, + "loss": 0.0991, + "step": 42820 
+ }, + { + "epoch": 0.37170684282254496, + "grad_norm": 0.2421875, + "learning_rate": 0.0012680584406489582, + "loss": 0.0898, + "step": 42821 + }, + { + "epoch": 0.3717155233027491, + "grad_norm": 0.1728515625, + "learning_rate": 0.0012680292015931675, + "loss": 0.0752, + "step": 42822 + }, + { + "epoch": 0.3717242037829533, + "grad_norm": 0.1640625, + "learning_rate": 0.001267999962353622, + "loss": 0.0991, + "step": 42823 + }, + { + "epoch": 0.3717328842631574, + "grad_norm": 0.3671875, + "learning_rate": 0.001267970722930355, + "loss": 0.1016, + "step": 42824 + }, + { + "epoch": 0.3717415647433616, + "grad_norm": 0.271484375, + "learning_rate": 0.001267941483323397, + "loss": 0.0864, + "step": 42825 + }, + { + "epoch": 0.37175024522356576, + "grad_norm": 0.1728515625, + "learning_rate": 0.001267912243532781, + "loss": 0.0869, + "step": 42826 + }, + { + "epoch": 0.37175892570376995, + "grad_norm": 0.224609375, + "learning_rate": 0.001267883003558539, + "loss": 0.083, + "step": 42827 + }, + { + "epoch": 0.3717676061839741, + "grad_norm": 0.1220703125, + "learning_rate": 0.0012678537634007025, + "loss": 0.126, + "step": 42828 + }, + { + "epoch": 0.3717762866641783, + "grad_norm": 0.5703125, + "learning_rate": 0.0012678245230593034, + "loss": 0.1191, + "step": 42829 + }, + { + "epoch": 0.3717849671443824, + "grad_norm": 0.2451171875, + "learning_rate": 0.001267795282534374, + "loss": 0.0884, + "step": 42830 + }, + { + "epoch": 0.3717936476245866, + "grad_norm": 0.2119140625, + "learning_rate": 0.0012677660418259466, + "loss": 0.0776, + "step": 42831 + }, + { + "epoch": 0.37180232810479075, + "grad_norm": 0.453125, + "learning_rate": 0.0012677368009340524, + "loss": 0.0884, + "step": 42832 + }, + { + "epoch": 0.37181100858499494, + "grad_norm": 0.1845703125, + "learning_rate": 0.0012677075598587237, + "loss": 0.0742, + "step": 42833 + }, + { + "epoch": 0.3718196890651991, + "grad_norm": 0.5859375, + "learning_rate": 0.0012676783185999927, + "loss": 0.1348, + "step": 42834 + }, + { + "epoch": 0.37182836954540327, + "grad_norm": 0.328125, + "learning_rate": 0.0012676490771578912, + "loss": 0.1836, + "step": 42835 + }, + { + "epoch": 0.3718370500256074, + "grad_norm": 0.287109375, + "learning_rate": 0.0012676198355324514, + "loss": 0.0918, + "step": 42836 + }, + { + "epoch": 0.3718457305058116, + "grad_norm": 0.1201171875, + "learning_rate": 0.0012675905937237048, + "loss": 0.0996, + "step": 42837 + }, + { + "epoch": 0.37185441098601574, + "grad_norm": 0.5390625, + "learning_rate": 0.0012675613517316834, + "loss": 0.0967, + "step": 42838 + }, + { + "epoch": 0.37186309146621993, + "grad_norm": 0.08642578125, + "learning_rate": 0.00126753210955642, + "loss": 0.0776, + "step": 42839 + }, + { + "epoch": 0.37187177194642407, + "grad_norm": 0.388671875, + "learning_rate": 0.0012675028671979456, + "loss": 0.083, + "step": 42840 + }, + { + "epoch": 0.37188045242662826, + "grad_norm": 0.72265625, + "learning_rate": 0.0012674736246562927, + "loss": 0.0933, + "step": 42841 + }, + { + "epoch": 0.3718891329068324, + "grad_norm": 0.326171875, + "learning_rate": 0.0012674443819314931, + "loss": 0.1377, + "step": 42842 + }, + { + "epoch": 0.3718978133870366, + "grad_norm": 0.287109375, + "learning_rate": 0.0012674151390235787, + "loss": 0.0742, + "step": 42843 + }, + { + "epoch": 0.37190649386724073, + "grad_norm": 0.1259765625, + "learning_rate": 0.0012673858959325818, + "loss": 0.0972, + "step": 42844 + }, + { + "epoch": 0.3719151743474449, + "grad_norm": 0.134765625, + "learning_rate": 
0.0012673566526585345, + "loss": 0.0967, + "step": 42845 + }, + { + "epoch": 0.37192385482764906, + "grad_norm": 0.1875, + "learning_rate": 0.0012673274092014682, + "loss": 0.0874, + "step": 42846 + }, + { + "epoch": 0.37193253530785325, + "grad_norm": 0.1962890625, + "learning_rate": 0.001267298165561415, + "loss": 0.1245, + "step": 42847 + }, + { + "epoch": 0.3719412157880574, + "grad_norm": 0.427734375, + "learning_rate": 0.0012672689217384072, + "loss": 0.0757, + "step": 42848 + }, + { + "epoch": 0.3719498962682616, + "grad_norm": 0.6171875, + "learning_rate": 0.0012672396777324766, + "loss": 0.0967, + "step": 42849 + }, + { + "epoch": 0.3719585767484657, + "grad_norm": 0.26953125, + "learning_rate": 0.0012672104335436554, + "loss": 0.0728, + "step": 42850 + }, + { + "epoch": 0.3719672572286699, + "grad_norm": 0.2197265625, + "learning_rate": 0.0012671811891719752, + "loss": 0.123, + "step": 42851 + }, + { + "epoch": 0.37197593770887405, + "grad_norm": 0.451171875, + "learning_rate": 0.001267151944617468, + "loss": 0.1006, + "step": 42852 + }, + { + "epoch": 0.37198461818907824, + "grad_norm": 0.2392578125, + "learning_rate": 0.0012671226998801658, + "loss": 0.0693, + "step": 42853 + }, + { + "epoch": 0.3719932986692824, + "grad_norm": 0.4921875, + "learning_rate": 0.0012670934549601014, + "loss": 0.0884, + "step": 42854 + }, + { + "epoch": 0.3720019791494866, + "grad_norm": 0.20703125, + "learning_rate": 0.0012670642098573058, + "loss": 0.1387, + "step": 42855 + }, + { + "epoch": 0.3720106596296907, + "grad_norm": 0.77734375, + "learning_rate": 0.0012670349645718109, + "loss": 0.1182, + "step": 42856 + }, + { + "epoch": 0.3720193401098949, + "grad_norm": 0.30859375, + "learning_rate": 0.0012670057191036498, + "loss": 0.1016, + "step": 42857 + }, + { + "epoch": 0.37202802059009904, + "grad_norm": 0.31640625, + "learning_rate": 0.0012669764734528533, + "loss": 0.127, + "step": 42858 + }, + { + "epoch": 0.37203670107030323, + "grad_norm": 0.29296875, + "learning_rate": 0.0012669472276194537, + "loss": 0.084, + "step": 42859 + }, + { + "epoch": 0.37204538155050737, + "grad_norm": 2.328125, + "learning_rate": 0.0012669179816034832, + "loss": 0.1406, + "step": 42860 + }, + { + "epoch": 0.37205406203071156, + "grad_norm": 0.75, + "learning_rate": 0.0012668887354049738, + "loss": 0.0889, + "step": 42861 + }, + { + "epoch": 0.3720627425109157, + "grad_norm": 0.1181640625, + "learning_rate": 0.0012668594890239576, + "loss": 0.1025, + "step": 42862 + }, + { + "epoch": 0.3720714229911199, + "grad_norm": 0.1767578125, + "learning_rate": 0.001266830242460466, + "loss": 0.2637, + "step": 42863 + }, + { + "epoch": 0.37208010347132403, + "grad_norm": 0.11376953125, + "learning_rate": 0.0012668009957145314, + "loss": 0.1133, + "step": 42864 + }, + { + "epoch": 0.3720887839515282, + "grad_norm": 0.361328125, + "learning_rate": 0.0012667717487861861, + "loss": 0.1152, + "step": 42865 + }, + { + "epoch": 0.37209746443173236, + "grad_norm": 0.263671875, + "learning_rate": 0.0012667425016754615, + "loss": 0.0869, + "step": 42866 + }, + { + "epoch": 0.37210614491193655, + "grad_norm": 1.1640625, + "learning_rate": 0.0012667132543823898, + "loss": 0.1162, + "step": 42867 + }, + { + "epoch": 0.3721148253921407, + "grad_norm": 0.1396484375, + "learning_rate": 0.001266684006907003, + "loss": 0.1182, + "step": 42868 + }, + { + "epoch": 0.3721235058723449, + "grad_norm": 0.73828125, + "learning_rate": 0.0012666547592493332, + "loss": 0.0752, + "step": 42869 + }, + { + "epoch": 0.372132186352549, + "grad_norm": 
0.2236328125, + "learning_rate": 0.001266625511409412, + "loss": 0.0811, + "step": 42870 + }, + { + "epoch": 0.3721408668327532, + "grad_norm": 0.154296875, + "learning_rate": 0.0012665962633872715, + "loss": 0.0977, + "step": 42871 + }, + { + "epoch": 0.37214954731295735, + "grad_norm": 0.1572265625, + "learning_rate": 0.0012665670151829442, + "loss": 0.082, + "step": 42872 + }, + { + "epoch": 0.37215822779316154, + "grad_norm": 0.63671875, + "learning_rate": 0.0012665377667964615, + "loss": 0.1079, + "step": 42873 + }, + { + "epoch": 0.3721669082733657, + "grad_norm": 0.57421875, + "learning_rate": 0.0012665085182278558, + "loss": 0.0879, + "step": 42874 + }, + { + "epoch": 0.3721755887535699, + "grad_norm": 1.0078125, + "learning_rate": 0.001266479269477159, + "loss": 0.1182, + "step": 42875 + }, + { + "epoch": 0.372184269233774, + "grad_norm": 0.41015625, + "learning_rate": 0.0012664500205444027, + "loss": 0.1201, + "step": 42876 + }, + { + "epoch": 0.37219294971397815, + "grad_norm": 0.466796875, + "learning_rate": 0.001266420771429619, + "loss": 0.084, + "step": 42877 + }, + { + "epoch": 0.37220163019418234, + "grad_norm": 0.12890625, + "learning_rate": 0.0012663915221328404, + "loss": 0.1025, + "step": 42878 + }, + { + "epoch": 0.3722103106743865, + "grad_norm": 0.1708984375, + "learning_rate": 0.0012663622726540983, + "loss": 0.0889, + "step": 42879 + }, + { + "epoch": 0.3722189911545907, + "grad_norm": 0.4375, + "learning_rate": 0.001266333022993425, + "loss": 0.1094, + "step": 42880 + }, + { + "epoch": 0.3722276716347948, + "grad_norm": 0.177734375, + "learning_rate": 0.0012663037731508525, + "loss": 0.0859, + "step": 42881 + }, + { + "epoch": 0.372236352114999, + "grad_norm": 0.1484375, + "learning_rate": 0.0012662745231264124, + "loss": 0.1436, + "step": 42882 + }, + { + "epoch": 0.37224503259520314, + "grad_norm": 0.62890625, + "learning_rate": 0.0012662452729201373, + "loss": 0.1094, + "step": 42883 + }, + { + "epoch": 0.37225371307540733, + "grad_norm": 0.228515625, + "learning_rate": 0.0012662160225320588, + "loss": 0.0796, + "step": 42884 + }, + { + "epoch": 0.37226239355561147, + "grad_norm": 0.24609375, + "learning_rate": 0.0012661867719622089, + "loss": 0.0957, + "step": 42885 + }, + { + "epoch": 0.37227107403581566, + "grad_norm": 0.82421875, + "learning_rate": 0.0012661575212106198, + "loss": 0.1172, + "step": 42886 + }, + { + "epoch": 0.3722797545160198, + "grad_norm": 0.0927734375, + "learning_rate": 0.001266128270277323, + "loss": 0.0762, + "step": 42887 + }, + { + "epoch": 0.372288434996224, + "grad_norm": 0.10986328125, + "learning_rate": 0.001266099019162351, + "loss": 0.1045, + "step": 42888 + }, + { + "epoch": 0.37229711547642813, + "grad_norm": 0.099609375, + "learning_rate": 0.0012660697678657357, + "loss": 0.1172, + "step": 42889 + }, + { + "epoch": 0.3723057959566323, + "grad_norm": 0.2373046875, + "learning_rate": 0.0012660405163875089, + "loss": 0.0859, + "step": 42890 + }, + { + "epoch": 0.37231447643683646, + "grad_norm": 0.11279296875, + "learning_rate": 0.0012660112647277027, + "loss": 0.0752, + "step": 42891 + }, + { + "epoch": 0.37232315691704065, + "grad_norm": 0.421875, + "learning_rate": 0.001265982012886349, + "loss": 0.0903, + "step": 42892 + }, + { + "epoch": 0.3723318373972448, + "grad_norm": 0.92578125, + "learning_rate": 0.00126595276086348, + "loss": 0.2002, + "step": 42893 + }, + { + "epoch": 0.372340517877449, + "grad_norm": 0.10546875, + "learning_rate": 0.0012659235086591274, + "loss": 0.1279, + "step": 42894 + }, + { + "epoch": 
0.3723491983576531, + "grad_norm": 0.416015625, + "learning_rate": 0.0012658942562733236, + "loss": 0.1934, + "step": 42895 + }, + { + "epoch": 0.3723578788378573, + "grad_norm": 0.291015625, + "learning_rate": 0.0012658650037061002, + "loss": 0.0918, + "step": 42896 + }, + { + "epoch": 0.37236655931806145, + "grad_norm": 0.2421875, + "learning_rate": 0.0012658357509574894, + "loss": 0.1064, + "step": 42897 + }, + { + "epoch": 0.37237523979826564, + "grad_norm": 0.2890625, + "learning_rate": 0.001265806498027523, + "loss": 0.1147, + "step": 42898 + }, + { + "epoch": 0.3723839202784698, + "grad_norm": 0.376953125, + "learning_rate": 0.0012657772449162327, + "loss": 0.1074, + "step": 42899 + }, + { + "epoch": 0.372392600758674, + "grad_norm": 0.42578125, + "learning_rate": 0.0012657479916236515, + "loss": 0.1533, + "step": 42900 + }, + { + "epoch": 0.3724012812388781, + "grad_norm": 0.482421875, + "learning_rate": 0.0012657187381498103, + "loss": 0.0996, + "step": 42901 + }, + { + "epoch": 0.3724099617190823, + "grad_norm": 0.1689453125, + "learning_rate": 0.001265689484494742, + "loss": 0.1006, + "step": 42902 + }, + { + "epoch": 0.37241864219928644, + "grad_norm": 0.0888671875, + "learning_rate": 0.001265660230658478, + "loss": 0.064, + "step": 42903 + }, + { + "epoch": 0.37242732267949064, + "grad_norm": 0.0986328125, + "learning_rate": 0.0012656309766410506, + "loss": 0.0947, + "step": 42904 + }, + { + "epoch": 0.3724360031596948, + "grad_norm": 0.146484375, + "learning_rate": 0.0012656017224424916, + "loss": 0.0757, + "step": 42905 + }, + { + "epoch": 0.37244468363989897, + "grad_norm": 0.439453125, + "learning_rate": 0.001265572468062833, + "loss": 0.1084, + "step": 42906 + }, + { + "epoch": 0.3724533641201031, + "grad_norm": 0.287109375, + "learning_rate": 0.0012655432135021069, + "loss": 0.1426, + "step": 42907 + }, + { + "epoch": 0.3724620446003073, + "grad_norm": 0.482421875, + "learning_rate": 0.001265513958760345, + "loss": 0.0938, + "step": 42908 + }, + { + "epoch": 0.37247072508051143, + "grad_norm": 0.388671875, + "learning_rate": 0.00126548470383758, + "loss": 0.1055, + "step": 42909 + }, + { + "epoch": 0.3724794055607156, + "grad_norm": 0.376953125, + "learning_rate": 0.001265455448733843, + "loss": 0.0947, + "step": 42910 + }, + { + "epoch": 0.37248808604091976, + "grad_norm": 0.220703125, + "learning_rate": 0.0012654261934491665, + "loss": 0.1074, + "step": 42911 + }, + { + "epoch": 0.37249676652112396, + "grad_norm": 0.291015625, + "learning_rate": 0.001265396937983582, + "loss": 0.1221, + "step": 42912 + }, + { + "epoch": 0.3725054470013281, + "grad_norm": 0.248046875, + "learning_rate": 0.0012653676823371227, + "loss": 0.1475, + "step": 42913 + }, + { + "epoch": 0.3725141274815323, + "grad_norm": 0.09716796875, + "learning_rate": 0.0012653384265098194, + "loss": 0.0962, + "step": 42914 + }, + { + "epoch": 0.3725228079617364, + "grad_norm": 0.0966796875, + "learning_rate": 0.0012653091705017044, + "loss": 0.1367, + "step": 42915 + }, + { + "epoch": 0.3725314884419406, + "grad_norm": 0.119140625, + "learning_rate": 0.0012652799143128097, + "loss": 0.127, + "step": 42916 + }, + { + "epoch": 0.37254016892214475, + "grad_norm": 0.11279296875, + "learning_rate": 0.0012652506579431675, + "loss": 0.0674, + "step": 42917 + }, + { + "epoch": 0.37254884940234895, + "grad_norm": 0.1083984375, + "learning_rate": 0.0012652214013928098, + "loss": 0.1162, + "step": 42918 + }, + { + "epoch": 0.3725575298825531, + "grad_norm": 0.1357421875, + "learning_rate": 0.0012651921446617683, + 
"loss": 0.0933, + "step": 42919 + }, + { + "epoch": 0.3725662103627573, + "grad_norm": 0.55078125, + "learning_rate": 0.001265162887750075, + "loss": 0.1201, + "step": 42920 + }, + { + "epoch": 0.3725748908429614, + "grad_norm": 0.34375, + "learning_rate": 0.0012651336306577622, + "loss": 0.0918, + "step": 42921 + }, + { + "epoch": 0.3725835713231656, + "grad_norm": 0.294921875, + "learning_rate": 0.0012651043733848618, + "loss": 0.0825, + "step": 42922 + }, + { + "epoch": 0.37259225180336975, + "grad_norm": 0.1357421875, + "learning_rate": 0.001265075115931406, + "loss": 0.0913, + "step": 42923 + }, + { + "epoch": 0.37260093228357394, + "grad_norm": 0.09228515625, + "learning_rate": 0.0012650458582974262, + "loss": 0.0781, + "step": 42924 + }, + { + "epoch": 0.3726096127637781, + "grad_norm": 0.39453125, + "learning_rate": 0.0012650166004829547, + "loss": 0.085, + "step": 42925 + }, + { + "epoch": 0.37261829324398227, + "grad_norm": 0.1572265625, + "learning_rate": 0.0012649873424880236, + "loss": 0.0986, + "step": 42926 + }, + { + "epoch": 0.3726269737241864, + "grad_norm": 0.201171875, + "learning_rate": 0.001264958084312665, + "loss": 0.0898, + "step": 42927 + }, + { + "epoch": 0.3726356542043906, + "grad_norm": 0.6328125, + "learning_rate": 0.0012649288259569105, + "loss": 0.085, + "step": 42928 + }, + { + "epoch": 0.37264433468459474, + "grad_norm": 0.4609375, + "learning_rate": 0.0012648995674207923, + "loss": 0.1279, + "step": 42929 + }, + { + "epoch": 0.37265301516479893, + "grad_norm": 0.4765625, + "learning_rate": 0.0012648703087043422, + "loss": 0.25, + "step": 42930 + }, + { + "epoch": 0.37266169564500307, + "grad_norm": 0.1298828125, + "learning_rate": 0.0012648410498075928, + "loss": 0.1074, + "step": 42931 + }, + { + "epoch": 0.37267037612520726, + "grad_norm": 0.138671875, + "learning_rate": 0.0012648117907305757, + "loss": 0.1172, + "step": 42932 + }, + { + "epoch": 0.3726790566054114, + "grad_norm": 0.189453125, + "learning_rate": 0.0012647825314733225, + "loss": 0.1152, + "step": 42933 + }, + { + "epoch": 0.3726877370856156, + "grad_norm": 0.203125, + "learning_rate": 0.001264753272035866, + "loss": 0.0967, + "step": 42934 + }, + { + "epoch": 0.3726964175658197, + "grad_norm": 0.11767578125, + "learning_rate": 0.0012647240124182377, + "loss": 0.0898, + "step": 42935 + }, + { + "epoch": 0.3727050980460239, + "grad_norm": 0.25390625, + "learning_rate": 0.0012646947526204697, + "loss": 0.0894, + "step": 42936 + }, + { + "epoch": 0.37271377852622806, + "grad_norm": 0.119140625, + "learning_rate": 0.0012646654926425941, + "loss": 0.0825, + "step": 42937 + }, + { + "epoch": 0.37272245900643225, + "grad_norm": 6.8125, + "learning_rate": 0.0012646362324846426, + "loss": 0.3359, + "step": 42938 + }, + { + "epoch": 0.3727311394866364, + "grad_norm": 0.353515625, + "learning_rate": 0.0012646069721466474, + "loss": 0.1001, + "step": 42939 + }, + { + "epoch": 0.3727398199668406, + "grad_norm": 0.134765625, + "learning_rate": 0.0012645777116286406, + "loss": 0.1138, + "step": 42940 + }, + { + "epoch": 0.3727485004470447, + "grad_norm": 0.11962890625, + "learning_rate": 0.0012645484509306542, + "loss": 0.127, + "step": 42941 + }, + { + "epoch": 0.3727571809272489, + "grad_norm": 0.123046875, + "learning_rate": 0.0012645191900527196, + "loss": 0.1245, + "step": 42942 + }, + { + "epoch": 0.37276586140745305, + "grad_norm": 0.333984375, + "learning_rate": 0.0012644899289948697, + "loss": 0.1182, + "step": 42943 + }, + { + "epoch": 0.37277454188765724, + "grad_norm": 0.26171875, + 
"learning_rate": 0.001264460667757136, + "loss": 0.1357, + "step": 42944 + }, + { + "epoch": 0.3727832223678614, + "grad_norm": 0.138671875, + "learning_rate": 0.0012644314063395507, + "loss": 0.0884, + "step": 42945 + }, + { + "epoch": 0.37279190284806557, + "grad_norm": 0.26953125, + "learning_rate": 0.0012644021447421453, + "loss": 0.1201, + "step": 42946 + }, + { + "epoch": 0.3728005833282697, + "grad_norm": 0.2080078125, + "learning_rate": 0.0012643728829649526, + "loss": 0.1094, + "step": 42947 + }, + { + "epoch": 0.3728092638084739, + "grad_norm": 0.12255859375, + "learning_rate": 0.0012643436210080037, + "loss": 0.1377, + "step": 42948 + }, + { + "epoch": 0.37281794428867804, + "grad_norm": 0.2255859375, + "learning_rate": 0.0012643143588713314, + "loss": 0.0996, + "step": 42949 + }, + { + "epoch": 0.37282662476888223, + "grad_norm": 0.19140625, + "learning_rate": 0.0012642850965549676, + "loss": 0.0981, + "step": 42950 + }, + { + "epoch": 0.37283530524908637, + "grad_norm": 0.1962890625, + "learning_rate": 0.0012642558340589436, + "loss": 0.1133, + "step": 42951 + }, + { + "epoch": 0.37284398572929056, + "grad_norm": 0.1552734375, + "learning_rate": 0.0012642265713832923, + "loss": 0.1011, + "step": 42952 + }, + { + "epoch": 0.3728526662094947, + "grad_norm": 0.2412109375, + "learning_rate": 0.0012641973085280454, + "loss": 0.0898, + "step": 42953 + }, + { + "epoch": 0.3728613466896989, + "grad_norm": 0.3125, + "learning_rate": 0.0012641680454932346, + "loss": 0.0771, + "step": 42954 + }, + { + "epoch": 0.37287002716990303, + "grad_norm": 0.271484375, + "learning_rate": 0.001264138782278892, + "loss": 0.0776, + "step": 42955 + }, + { + "epoch": 0.3728787076501072, + "grad_norm": 0.208984375, + "learning_rate": 0.0012641095188850498, + "loss": 0.0996, + "step": 42956 + }, + { + "epoch": 0.37288738813031136, + "grad_norm": 0.1953125, + "learning_rate": 0.0012640802553117397, + "loss": 0.0884, + "step": 42957 + }, + { + "epoch": 0.37289606861051555, + "grad_norm": 0.2197265625, + "learning_rate": 0.001264050991558994, + "loss": 0.1436, + "step": 42958 + }, + { + "epoch": 0.3729047490907197, + "grad_norm": 0.3515625, + "learning_rate": 0.0012640217276268443, + "loss": 0.1064, + "step": 42959 + }, + { + "epoch": 0.3729134295709239, + "grad_norm": 0.28515625, + "learning_rate": 0.0012639924635153234, + "loss": 0.1035, + "step": 42960 + }, + { + "epoch": 0.372922110051128, + "grad_norm": 0.234375, + "learning_rate": 0.0012639631992244628, + "loss": 0.0928, + "step": 42961 + }, + { + "epoch": 0.3729307905313322, + "grad_norm": 0.435546875, + "learning_rate": 0.0012639339347542942, + "loss": 0.0947, + "step": 42962 + }, + { + "epoch": 0.37293947101153635, + "grad_norm": 0.439453125, + "learning_rate": 0.0012639046701048502, + "loss": 0.1016, + "step": 42963 + }, + { + "epoch": 0.37294815149174054, + "grad_norm": 0.42578125, + "learning_rate": 0.0012638754052761623, + "loss": 0.1172, + "step": 42964 + }, + { + "epoch": 0.3729568319719447, + "grad_norm": 0.337890625, + "learning_rate": 0.0012638461402682628, + "loss": 0.0928, + "step": 42965 + }, + { + "epoch": 0.3729655124521489, + "grad_norm": 0.123046875, + "learning_rate": 0.0012638168750811836, + "loss": 0.1055, + "step": 42966 + }, + { + "epoch": 0.372974192932353, + "grad_norm": 0.1552734375, + "learning_rate": 0.0012637876097149566, + "loss": 0.1123, + "step": 42967 + }, + { + "epoch": 0.3729828734125572, + "grad_norm": 0.55859375, + "learning_rate": 0.001263758344169614, + "loss": 0.0977, + "step": 42968 + }, + { + "epoch": 
0.37299155389276134, + "grad_norm": 0.07470703125, + "learning_rate": 0.0012637290784451877, + "loss": 0.0713, + "step": 42969 + }, + { + "epoch": 0.37300023437296553, + "grad_norm": 0.33984375, + "learning_rate": 0.00126369981254171, + "loss": 0.0742, + "step": 42970 + }, + { + "epoch": 0.37300891485316967, + "grad_norm": 0.376953125, + "learning_rate": 0.0012636705464592127, + "loss": 0.0996, + "step": 42971 + }, + { + "epoch": 0.37301759533337386, + "grad_norm": 0.2294921875, + "learning_rate": 0.0012636412801977272, + "loss": 0.1367, + "step": 42972 + }, + { + "epoch": 0.373026275813578, + "grad_norm": 0.310546875, + "learning_rate": 0.0012636120137572867, + "loss": 0.1211, + "step": 42973 + }, + { + "epoch": 0.3730349562937822, + "grad_norm": 0.255859375, + "learning_rate": 0.0012635827471379221, + "loss": 0.1094, + "step": 42974 + }, + { + "epoch": 0.37304363677398633, + "grad_norm": 0.1708984375, + "learning_rate": 0.0012635534803396662, + "loss": 0.0898, + "step": 42975 + }, + { + "epoch": 0.3730523172541905, + "grad_norm": 0.4140625, + "learning_rate": 0.0012635242133625503, + "loss": 0.0674, + "step": 42976 + }, + { + "epoch": 0.37306099773439466, + "grad_norm": 0.251953125, + "learning_rate": 0.0012634949462066068, + "loss": 0.1602, + "step": 42977 + }, + { + "epoch": 0.37306967821459885, + "grad_norm": 0.482421875, + "learning_rate": 0.0012634656788718679, + "loss": 0.1182, + "step": 42978 + }, + { + "epoch": 0.373078358694803, + "grad_norm": 0.11767578125, + "learning_rate": 0.0012634364113583657, + "loss": 0.0952, + "step": 42979 + }, + { + "epoch": 0.3730870391750072, + "grad_norm": 0.392578125, + "learning_rate": 0.0012634071436661313, + "loss": 0.1201, + "step": 42980 + }, + { + "epoch": 0.3730957196552113, + "grad_norm": 0.07763671875, + "learning_rate": 0.0012633778757951974, + "loss": 0.0654, + "step": 42981 + }, + { + "epoch": 0.3731044001354155, + "grad_norm": 1.25, + "learning_rate": 0.0012633486077455963, + "loss": 0.1934, + "step": 42982 + }, + { + "epoch": 0.37311308061561965, + "grad_norm": 0.455078125, + "learning_rate": 0.0012633193395173593, + "loss": 0.123, + "step": 42983 + }, + { + "epoch": 0.37312176109582385, + "grad_norm": 0.08837890625, + "learning_rate": 0.0012632900711105188, + "loss": 0.0801, + "step": 42984 + }, + { + "epoch": 0.373130441576028, + "grad_norm": 0.2216796875, + "learning_rate": 0.0012632608025251067, + "loss": 0.1094, + "step": 42985 + }, + { + "epoch": 0.3731391220562322, + "grad_norm": 0.2265625, + "learning_rate": 0.0012632315337611551, + "loss": 0.0938, + "step": 42986 + }, + { + "epoch": 0.3731478025364363, + "grad_norm": 0.15625, + "learning_rate": 0.0012632022648186958, + "loss": 0.1123, + "step": 42987 + }, + { + "epoch": 0.3731564830166405, + "grad_norm": 0.083984375, + "learning_rate": 0.0012631729956977611, + "loss": 0.0664, + "step": 42988 + }, + { + "epoch": 0.37316516349684464, + "grad_norm": 0.1640625, + "learning_rate": 0.0012631437263983828, + "loss": 0.1074, + "step": 42989 + }, + { + "epoch": 0.37317384397704884, + "grad_norm": 0.115234375, + "learning_rate": 0.0012631144569205928, + "loss": 0.0898, + "step": 42990 + }, + { + "epoch": 0.373182524457253, + "grad_norm": 0.466796875, + "learning_rate": 0.0012630851872644235, + "loss": 0.1025, + "step": 42991 + }, + { + "epoch": 0.37319120493745717, + "grad_norm": 0.298828125, + "learning_rate": 0.001263055917429907, + "loss": 0.1074, + "step": 42992 + }, + { + "epoch": 0.3731998854176613, + "grad_norm": 0.1748046875, + "learning_rate": 0.0012630266474170746, + "loss": 
0.1138, + "step": 42993 + }, + { + "epoch": 0.3732085658978655, + "grad_norm": 0.1552734375, + "learning_rate": 0.0012629973772259589, + "loss": 0.1025, + "step": 42994 + }, + { + "epoch": 0.37321724637806963, + "grad_norm": 0.265625, + "learning_rate": 0.0012629681068565914, + "loss": 0.1211, + "step": 42995 + }, + { + "epoch": 0.3732259268582738, + "grad_norm": 0.10791015625, + "learning_rate": 0.0012629388363090046, + "loss": 0.103, + "step": 42996 + }, + { + "epoch": 0.37323460733847796, + "grad_norm": 0.126953125, + "learning_rate": 0.0012629095655832304, + "loss": 0.0781, + "step": 42997 + }, + { + "epoch": 0.37324328781868216, + "grad_norm": 0.11279296875, + "learning_rate": 0.0012628802946793006, + "loss": 0.1221, + "step": 42998 + }, + { + "epoch": 0.3732519682988863, + "grad_norm": 0.23828125, + "learning_rate": 0.0012628510235972474, + "loss": 0.0806, + "step": 42999 + }, + { + "epoch": 0.3732606487790905, + "grad_norm": 0.1318359375, + "learning_rate": 0.0012628217523371031, + "loss": 0.0801, + "step": 43000 + }, + { + "epoch": 0.3732693292592946, + "grad_norm": 0.087890625, + "learning_rate": 0.001262792480898899, + "loss": 0.1045, + "step": 43001 + }, + { + "epoch": 0.37327800973949876, + "grad_norm": 0.65234375, + "learning_rate": 0.0012627632092826674, + "loss": 0.1455, + "step": 43002 + }, + { + "epoch": 0.37328669021970295, + "grad_norm": 0.1240234375, + "learning_rate": 0.0012627339374884407, + "loss": 0.1006, + "step": 43003 + }, + { + "epoch": 0.3732953706999071, + "grad_norm": 0.314453125, + "learning_rate": 0.0012627046655162503, + "loss": 0.1143, + "step": 43004 + }, + { + "epoch": 0.3733040511801113, + "grad_norm": 0.57421875, + "learning_rate": 0.0012626753933661287, + "loss": 0.1216, + "step": 43005 + }, + { + "epoch": 0.3733127316603154, + "grad_norm": 0.1806640625, + "learning_rate": 0.001262646121038108, + "loss": 0.1094, + "step": 43006 + }, + { + "epoch": 0.3733214121405196, + "grad_norm": 0.2451171875, + "learning_rate": 0.0012626168485322196, + "loss": 0.0991, + "step": 43007 + }, + { + "epoch": 0.37333009262072375, + "grad_norm": 0.1630859375, + "learning_rate": 0.0012625875758484962, + "loss": 0.1338, + "step": 43008 + }, + { + "epoch": 0.37333877310092795, + "grad_norm": 0.1357421875, + "learning_rate": 0.001262558302986969, + "loss": 0.1309, + "step": 43009 + }, + { + "epoch": 0.3733474535811321, + "grad_norm": 0.6875, + "learning_rate": 0.0012625290299476707, + "loss": 0.0967, + "step": 43010 + }, + { + "epoch": 0.3733561340613363, + "grad_norm": 0.2138671875, + "learning_rate": 0.001262499756730633, + "loss": 0.1289, + "step": 43011 + }, + { + "epoch": 0.3733648145415404, + "grad_norm": 0.2119140625, + "learning_rate": 0.0012624704833358884, + "loss": 0.1064, + "step": 43012 + }, + { + "epoch": 0.3733734950217446, + "grad_norm": 0.08740234375, + "learning_rate": 0.0012624412097634683, + "loss": 0.0981, + "step": 43013 + }, + { + "epoch": 0.37338217550194874, + "grad_norm": 0.1220703125, + "learning_rate": 0.0012624119360134048, + "loss": 0.0767, + "step": 43014 + }, + { + "epoch": 0.37339085598215294, + "grad_norm": 0.498046875, + "learning_rate": 0.0012623826620857303, + "loss": 0.1094, + "step": 43015 + }, + { + "epoch": 0.3733995364623571, + "grad_norm": 0.349609375, + "learning_rate": 0.001262353387980476, + "loss": 0.1006, + "step": 43016 + }, + { + "epoch": 0.37340821694256127, + "grad_norm": 0.2333984375, + "learning_rate": 0.0012623241136976748, + "loss": 0.1348, + "step": 43017 + }, + { + "epoch": 0.3734168974227654, + "grad_norm": 
0.14453125, + "learning_rate": 0.0012622948392373588, + "loss": 0.0854, + "step": 43018 + }, + { + "epoch": 0.3734255779029696, + "grad_norm": 0.68359375, + "learning_rate": 0.0012622655645995592, + "loss": 0.0762, + "step": 43019 + }, + { + "epoch": 0.37343425838317373, + "grad_norm": 0.330078125, + "learning_rate": 0.0012622362897843082, + "loss": 0.1084, + "step": 43020 + }, + { + "epoch": 0.3734429388633779, + "grad_norm": 0.158203125, + "learning_rate": 0.0012622070147916384, + "loss": 0.0923, + "step": 43021 + }, + { + "epoch": 0.37345161934358206, + "grad_norm": 0.451171875, + "learning_rate": 0.0012621777396215814, + "loss": 0.1211, + "step": 43022 + }, + { + "epoch": 0.37346029982378626, + "grad_norm": 0.38671875, + "learning_rate": 0.0012621484642741693, + "loss": 0.1543, + "step": 43023 + }, + { + "epoch": 0.3734689803039904, + "grad_norm": 0.52734375, + "learning_rate": 0.001262119188749434, + "loss": 0.0991, + "step": 43024 + }, + { + "epoch": 0.3734776607841946, + "grad_norm": 0.8046875, + "learning_rate": 0.0012620899130474073, + "loss": 0.0923, + "step": 43025 + }, + { + "epoch": 0.3734863412643987, + "grad_norm": 0.2451171875, + "learning_rate": 0.0012620606371681218, + "loss": 0.1113, + "step": 43026 + }, + { + "epoch": 0.3734950217446029, + "grad_norm": 0.134765625, + "learning_rate": 0.0012620313611116094, + "loss": 0.1055, + "step": 43027 + }, + { + "epoch": 0.37350370222480705, + "grad_norm": 0.1484375, + "learning_rate": 0.0012620020848779017, + "loss": 0.1089, + "step": 43028 + }, + { + "epoch": 0.37351238270501125, + "grad_norm": 0.1240234375, + "learning_rate": 0.001261972808467031, + "loss": 0.1016, + "step": 43029 + }, + { + "epoch": 0.3735210631852154, + "grad_norm": 0.65625, + "learning_rate": 0.001261943531879029, + "loss": 0.1016, + "step": 43030 + }, + { + "epoch": 0.3735297436654196, + "grad_norm": 0.1748046875, + "learning_rate": 0.0012619142551139285, + "loss": 0.1157, + "step": 43031 + }, + { + "epoch": 0.3735384241456237, + "grad_norm": 0.63671875, + "learning_rate": 0.0012618849781717608, + "loss": 0.1426, + "step": 43032 + }, + { + "epoch": 0.3735471046258279, + "grad_norm": 0.318359375, + "learning_rate": 0.0012618557010525577, + "loss": 0.0781, + "step": 43033 + }, + { + "epoch": 0.37355578510603205, + "grad_norm": 0.23046875, + "learning_rate": 0.0012618264237563522, + "loss": 0.125, + "step": 43034 + }, + { + "epoch": 0.37356446558623624, + "grad_norm": 0.2314453125, + "learning_rate": 0.0012617971462831757, + "loss": 0.0874, + "step": 43035 + }, + { + "epoch": 0.3735731460664404, + "grad_norm": 0.1162109375, + "learning_rate": 0.0012617678686330598, + "loss": 0.1143, + "step": 43036 + }, + { + "epoch": 0.37358182654664457, + "grad_norm": 0.212890625, + "learning_rate": 0.0012617385908060372, + "loss": 0.0928, + "step": 43037 + }, + { + "epoch": 0.3735905070268487, + "grad_norm": 0.083984375, + "learning_rate": 0.00126170931280214, + "loss": 0.0977, + "step": 43038 + }, + { + "epoch": 0.3735991875070529, + "grad_norm": 0.357421875, + "learning_rate": 0.0012616800346214, + "loss": 0.1162, + "step": 43039 + }, + { + "epoch": 0.37360786798725704, + "grad_norm": 0.46875, + "learning_rate": 0.0012616507562638489, + "loss": 0.1172, + "step": 43040 + }, + { + "epoch": 0.37361654846746123, + "grad_norm": 0.85546875, + "learning_rate": 0.001261621477729519, + "loss": 0.1719, + "step": 43041 + }, + { + "epoch": 0.37362522894766537, + "grad_norm": 0.08935546875, + "learning_rate": 0.001261592199018442, + "loss": 0.0947, + "step": 43042 + }, + { + "epoch": 
0.37363390942786956, + "grad_norm": 0.6640625, + "learning_rate": 0.0012615629201306506, + "loss": 0.1328, + "step": 43043 + }, + { + "epoch": 0.3736425899080737, + "grad_norm": 0.1669921875, + "learning_rate": 0.001261533641066176, + "loss": 0.1201, + "step": 43044 + }, + { + "epoch": 0.3736512703882779, + "grad_norm": 0.443359375, + "learning_rate": 0.001261504361825051, + "loss": 0.1133, + "step": 43045 + }, + { + "epoch": 0.373659950868482, + "grad_norm": 0.1650390625, + "learning_rate": 0.001261475082407307, + "loss": 0.0679, + "step": 43046 + }, + { + "epoch": 0.3736686313486862, + "grad_norm": 0.08203125, + "learning_rate": 0.0012614458028129766, + "loss": 0.1045, + "step": 43047 + }, + { + "epoch": 0.37367731182889036, + "grad_norm": 0.1845703125, + "learning_rate": 0.0012614165230420912, + "loss": 0.1055, + "step": 43048 + }, + { + "epoch": 0.37368599230909455, + "grad_norm": 0.09375, + "learning_rate": 0.0012613872430946836, + "loss": 0.0806, + "step": 43049 + }, + { + "epoch": 0.3736946727892987, + "grad_norm": 0.64453125, + "learning_rate": 0.0012613579629707848, + "loss": 0.1055, + "step": 43050 + }, + { + "epoch": 0.3737033532695029, + "grad_norm": 0.1923828125, + "learning_rate": 0.0012613286826704275, + "loss": 0.1162, + "step": 43051 + }, + { + "epoch": 0.373712033749707, + "grad_norm": 0.2216796875, + "learning_rate": 0.0012612994021936434, + "loss": 0.0991, + "step": 43052 + }, + { + "epoch": 0.3737207142299112, + "grad_norm": 0.130859375, + "learning_rate": 0.001261270121540465, + "loss": 0.0791, + "step": 43053 + }, + { + "epoch": 0.37372939471011535, + "grad_norm": 0.1943359375, + "learning_rate": 0.0012612408407109241, + "loss": 0.1143, + "step": 43054 + }, + { + "epoch": 0.37373807519031954, + "grad_norm": 0.5390625, + "learning_rate": 0.0012612115597050522, + "loss": 0.0757, + "step": 43055 + }, + { + "epoch": 0.3737467556705237, + "grad_norm": 0.189453125, + "learning_rate": 0.001261182278522882, + "loss": 0.0996, + "step": 43056 + }, + { + "epoch": 0.37375543615072787, + "grad_norm": 0.400390625, + "learning_rate": 0.0012611529971644454, + "loss": 0.0781, + "step": 43057 + }, + { + "epoch": 0.373764116630932, + "grad_norm": 0.0927734375, + "learning_rate": 0.0012611237156297743, + "loss": 0.1182, + "step": 43058 + }, + { + "epoch": 0.3737727971111362, + "grad_norm": 0.53125, + "learning_rate": 0.0012610944339189005, + "loss": 0.0996, + "step": 43059 + }, + { + "epoch": 0.37378147759134034, + "grad_norm": 0.6015625, + "learning_rate": 0.0012610651520318565, + "loss": 0.1045, + "step": 43060 + }, + { + "epoch": 0.37379015807154453, + "grad_norm": 0.26171875, + "learning_rate": 0.001261035869968674, + "loss": 0.0967, + "step": 43061 + }, + { + "epoch": 0.37379883855174867, + "grad_norm": 1.09375, + "learning_rate": 0.0012610065877293848, + "loss": 0.1221, + "step": 43062 + }, + { + "epoch": 0.37380751903195286, + "grad_norm": 0.310546875, + "learning_rate": 0.0012609773053140214, + "loss": 0.1064, + "step": 43063 + }, + { + "epoch": 0.373816199512157, + "grad_norm": 0.2490234375, + "learning_rate": 0.0012609480227226157, + "loss": 0.1221, + "step": 43064 + }, + { + "epoch": 0.3738248799923612, + "grad_norm": 0.4296875, + "learning_rate": 0.0012609187399551997, + "loss": 0.0854, + "step": 43065 + }, + { + "epoch": 0.37383356047256533, + "grad_norm": 0.10693359375, + "learning_rate": 0.0012608894570118054, + "loss": 0.1089, + "step": 43066 + }, + { + "epoch": 0.3738422409527695, + "grad_norm": 0.7265625, + "learning_rate": 0.0012608601738924648, + "loss": 0.1318, + 
"step": 43067 + }, + { + "epoch": 0.37385092143297366, + "grad_norm": 0.78515625, + "learning_rate": 0.0012608308905972098, + "loss": 0.1523, + "step": 43068 + }, + { + "epoch": 0.37385960191317785, + "grad_norm": 0.828125, + "learning_rate": 0.0012608016071260727, + "loss": 0.1387, + "step": 43069 + }, + { + "epoch": 0.373868282393382, + "grad_norm": 0.32421875, + "learning_rate": 0.0012607723234790854, + "loss": 0.1006, + "step": 43070 + }, + { + "epoch": 0.3738769628735862, + "grad_norm": 0.119140625, + "learning_rate": 0.0012607430396562797, + "loss": 0.1084, + "step": 43071 + }, + { + "epoch": 0.3738856433537903, + "grad_norm": 0.1884765625, + "learning_rate": 0.001260713755657688, + "loss": 0.0703, + "step": 43072 + }, + { + "epoch": 0.3738943238339945, + "grad_norm": 0.2216796875, + "learning_rate": 0.001260684471483342, + "loss": 0.1445, + "step": 43073 + }, + { + "epoch": 0.37390300431419865, + "grad_norm": 0.1728515625, + "learning_rate": 0.001260655187133274, + "loss": 0.0903, + "step": 43074 + }, + { + "epoch": 0.37391168479440284, + "grad_norm": 0.310546875, + "learning_rate": 0.0012606259026075159, + "loss": 0.127, + "step": 43075 + }, + { + "epoch": 0.373920365274607, + "grad_norm": 0.12109375, + "learning_rate": 0.0012605966179060997, + "loss": 0.1177, + "step": 43076 + }, + { + "epoch": 0.3739290457548112, + "grad_norm": 0.4765625, + "learning_rate": 0.0012605673330290572, + "loss": 0.0791, + "step": 43077 + }, + { + "epoch": 0.3739377262350153, + "grad_norm": 0.224609375, + "learning_rate": 0.0012605380479764213, + "loss": 0.1064, + "step": 43078 + }, + { + "epoch": 0.3739464067152195, + "grad_norm": 0.333984375, + "learning_rate": 0.0012605087627482231, + "loss": 0.1123, + "step": 43079 + }, + { + "epoch": 0.37395508719542364, + "grad_norm": 0.197265625, + "learning_rate": 0.0012604794773444946, + "loss": 0.0791, + "step": 43080 + }, + { + "epoch": 0.37396376767562783, + "grad_norm": 0.2392578125, + "learning_rate": 0.0012604501917652684, + "loss": 0.0967, + "step": 43081 + }, + { + "epoch": 0.37397244815583197, + "grad_norm": 0.3359375, + "learning_rate": 0.0012604209060105762, + "loss": 0.0957, + "step": 43082 + }, + { + "epoch": 0.37398112863603616, + "grad_norm": 0.81640625, + "learning_rate": 0.0012603916200804501, + "loss": 0.1328, + "step": 43083 + }, + { + "epoch": 0.3739898091162403, + "grad_norm": 0.06689453125, + "learning_rate": 0.0012603623339749225, + "loss": 0.1064, + "step": 43084 + }, + { + "epoch": 0.3739984895964445, + "grad_norm": 0.359375, + "learning_rate": 0.0012603330476940245, + "loss": 0.1309, + "step": 43085 + }, + { + "epoch": 0.37400717007664863, + "grad_norm": 0.271484375, + "learning_rate": 0.0012603037612377891, + "loss": 0.0771, + "step": 43086 + }, + { + "epoch": 0.3740158505568528, + "grad_norm": 0.28515625, + "learning_rate": 0.001260274474606248, + "loss": 0.0791, + "step": 43087 + }, + { + "epoch": 0.37402453103705696, + "grad_norm": 0.60546875, + "learning_rate": 0.0012602451877994328, + "loss": 0.1973, + "step": 43088 + }, + { + "epoch": 0.37403321151726115, + "grad_norm": 0.36328125, + "learning_rate": 0.0012602159008173758, + "loss": 0.1328, + "step": 43089 + }, + { + "epoch": 0.3740418919974653, + "grad_norm": 0.23046875, + "learning_rate": 0.0012601866136601097, + "loss": 0.1133, + "step": 43090 + }, + { + "epoch": 0.3740505724776695, + "grad_norm": 1.1953125, + "learning_rate": 0.0012601573263276652, + "loss": 0.1211, + "step": 43091 + }, + { + "epoch": 0.3740592529578736, + "grad_norm": 0.78125, + "learning_rate": 
0.0012601280388200755, + "loss": 0.0923, + "step": 43092 + }, + { + "epoch": 0.3740679334380778, + "grad_norm": 0.1806640625, + "learning_rate": 0.001260098751137372, + "loss": 0.0723, + "step": 43093 + }, + { + "epoch": 0.37407661391828195, + "grad_norm": 0.1240234375, + "learning_rate": 0.001260069463279587, + "loss": 0.127, + "step": 43094 + }, + { + "epoch": 0.37408529439848615, + "grad_norm": 0.169921875, + "learning_rate": 0.0012600401752467525, + "loss": 0.1328, + "step": 43095 + }, + { + "epoch": 0.3740939748786903, + "grad_norm": 0.09765625, + "learning_rate": 0.0012600108870389004, + "loss": 0.085, + "step": 43096 + }, + { + "epoch": 0.3741026553588945, + "grad_norm": 0.08544921875, + "learning_rate": 0.0012599815986560627, + "loss": 0.0903, + "step": 43097 + }, + { + "epoch": 0.3741113358390986, + "grad_norm": 0.30859375, + "learning_rate": 0.0012599523100982714, + "loss": 0.1172, + "step": 43098 + }, + { + "epoch": 0.3741200163193028, + "grad_norm": 0.5234375, + "learning_rate": 0.0012599230213655593, + "loss": 0.1016, + "step": 43099 + }, + { + "epoch": 0.37412869679950694, + "grad_norm": 0.1533203125, + "learning_rate": 0.0012598937324579571, + "loss": 0.125, + "step": 43100 + }, + { + "epoch": 0.37413737727971114, + "grad_norm": 0.0830078125, + "learning_rate": 0.0012598644433754979, + "loss": 0.1094, + "step": 43101 + }, + { + "epoch": 0.3741460577599153, + "grad_norm": 0.42578125, + "learning_rate": 0.001259835154118213, + "loss": 0.0972, + "step": 43102 + }, + { + "epoch": 0.37415473824011947, + "grad_norm": 0.84765625, + "learning_rate": 0.001259805864686135, + "loss": 0.1602, + "step": 43103 + }, + { + "epoch": 0.3741634187203236, + "grad_norm": 0.302734375, + "learning_rate": 0.0012597765750792958, + "loss": 0.0874, + "step": 43104 + }, + { + "epoch": 0.3741720992005278, + "grad_norm": 0.2060546875, + "learning_rate": 0.0012597472852977274, + "loss": 0.1172, + "step": 43105 + }, + { + "epoch": 0.37418077968073193, + "grad_norm": 0.3984375, + "learning_rate": 0.0012597179953414614, + "loss": 0.1055, + "step": 43106 + }, + { + "epoch": 0.3741894601609361, + "grad_norm": 0.79296875, + "learning_rate": 0.0012596887052105305, + "loss": 0.124, + "step": 43107 + }, + { + "epoch": 0.37419814064114026, + "grad_norm": 0.171875, + "learning_rate": 0.0012596594149049665, + "loss": 0.0869, + "step": 43108 + }, + { + "epoch": 0.37420682112134446, + "grad_norm": 0.10888671875, + "learning_rate": 0.0012596301244248013, + "loss": 0.1025, + "step": 43109 + }, + { + "epoch": 0.3742155016015486, + "grad_norm": 0.1708984375, + "learning_rate": 0.0012596008337700667, + "loss": 0.1123, + "step": 43110 + }, + { + "epoch": 0.3742241820817528, + "grad_norm": 0.181640625, + "learning_rate": 0.0012595715429407955, + "loss": 0.0723, + "step": 43111 + }, + { + "epoch": 0.3742328625619569, + "grad_norm": 0.078125, + "learning_rate": 0.0012595422519370188, + "loss": 0.0781, + "step": 43112 + }, + { + "epoch": 0.3742415430421611, + "grad_norm": 0.10205078125, + "learning_rate": 0.0012595129607587692, + "loss": 0.0967, + "step": 43113 + }, + { + "epoch": 0.37425022352236526, + "grad_norm": 0.267578125, + "learning_rate": 0.0012594836694060788, + "loss": 0.0957, + "step": 43114 + }, + { + "epoch": 0.37425890400256945, + "grad_norm": 0.11474609375, + "learning_rate": 0.001259454377878979, + "loss": 0.126, + "step": 43115 + }, + { + "epoch": 0.3742675844827736, + "grad_norm": 0.33203125, + "learning_rate": 0.0012594250861775026, + "loss": 0.0801, + "step": 43116 + }, + { + "epoch": 0.3742762649629778, + 
"grad_norm": 0.2255859375, + "learning_rate": 0.0012593957943016817, + "loss": 0.0884, + "step": 43117 + }, + { + "epoch": 0.3742849454431819, + "grad_norm": 0.76171875, + "learning_rate": 0.0012593665022515477, + "loss": 0.0957, + "step": 43118 + }, + { + "epoch": 0.3742936259233861, + "grad_norm": 0.2265625, + "learning_rate": 0.0012593372100271325, + "loss": 0.1128, + "step": 43119 + }, + { + "epoch": 0.37430230640359025, + "grad_norm": 0.314453125, + "learning_rate": 0.001259307917628469, + "loss": 0.0747, + "step": 43120 + }, + { + "epoch": 0.37431098688379444, + "grad_norm": 0.423828125, + "learning_rate": 0.0012592786250555884, + "loss": 0.0942, + "step": 43121 + }, + { + "epoch": 0.3743196673639986, + "grad_norm": 0.271484375, + "learning_rate": 0.0012592493323085234, + "loss": 0.1152, + "step": 43122 + }, + { + "epoch": 0.37432834784420277, + "grad_norm": 0.1982421875, + "learning_rate": 0.0012592200393873056, + "loss": 0.1367, + "step": 43123 + }, + { + "epoch": 0.3743370283244069, + "grad_norm": 0.23046875, + "learning_rate": 0.0012591907462919671, + "loss": 0.1172, + "step": 43124 + }, + { + "epoch": 0.37434570880461104, + "grad_norm": 0.298828125, + "learning_rate": 0.00125916145302254, + "loss": 0.0962, + "step": 43125 + }, + { + "epoch": 0.37435438928481524, + "grad_norm": 0.93359375, + "learning_rate": 0.0012591321595790566, + "loss": 0.1357, + "step": 43126 + }, + { + "epoch": 0.3743630697650194, + "grad_norm": 0.64453125, + "learning_rate": 0.0012591028659615484, + "loss": 0.0986, + "step": 43127 + }, + { + "epoch": 0.37437175024522357, + "grad_norm": 0.212890625, + "learning_rate": 0.0012590735721700475, + "loss": 0.1484, + "step": 43128 + }, + { + "epoch": 0.3743804307254277, + "grad_norm": 0.177734375, + "learning_rate": 0.0012590442782045865, + "loss": 0.0938, + "step": 43129 + }, + { + "epoch": 0.3743891112056319, + "grad_norm": 0.10302734375, + "learning_rate": 0.0012590149840651969, + "loss": 0.1006, + "step": 43130 + }, + { + "epoch": 0.37439779168583603, + "grad_norm": 0.306640625, + "learning_rate": 0.001258985689751911, + "loss": 0.0889, + "step": 43131 + }, + { + "epoch": 0.3744064721660402, + "grad_norm": 0.64453125, + "learning_rate": 0.0012589563952647605, + "loss": 0.1289, + "step": 43132 + }, + { + "epoch": 0.37441515264624436, + "grad_norm": 0.26171875, + "learning_rate": 0.0012589271006037779, + "loss": 0.1348, + "step": 43133 + }, + { + "epoch": 0.37442383312644856, + "grad_norm": 0.26953125, + "learning_rate": 0.001258897805768995, + "loss": 0.1172, + "step": 43134 + }, + { + "epoch": 0.3744325136066527, + "grad_norm": 0.09423828125, + "learning_rate": 0.001258868510760444, + "loss": 0.0879, + "step": 43135 + }, + { + "epoch": 0.3744411940868569, + "grad_norm": 0.5390625, + "learning_rate": 0.0012588392155781563, + "loss": 0.1299, + "step": 43136 + }, + { + "epoch": 0.374449874567061, + "grad_norm": 0.1005859375, + "learning_rate": 0.0012588099202221648, + "loss": 0.0908, + "step": 43137 + }, + { + "epoch": 0.3744585550472652, + "grad_norm": 0.412109375, + "learning_rate": 0.0012587806246925008, + "loss": 0.0791, + "step": 43138 + }, + { + "epoch": 0.37446723552746936, + "grad_norm": 0.1669921875, + "learning_rate": 0.0012587513289891974, + "loss": 0.0928, + "step": 43139 + }, + { + "epoch": 0.37447591600767355, + "grad_norm": 0.08642578125, + "learning_rate": 0.0012587220331122854, + "loss": 0.1001, + "step": 43140 + }, + { + "epoch": 0.3744845964878777, + "grad_norm": 0.26953125, + "learning_rate": 0.001258692737061797, + "loss": 0.0938, + "step": 
43141 + }, + { + "epoch": 0.3744932769680819, + "grad_norm": 0.3203125, + "learning_rate": 0.0012586634408377653, + "loss": 0.0781, + "step": 43142 + }, + { + "epoch": 0.374501957448286, + "grad_norm": 0.1689453125, + "learning_rate": 0.0012586341444402213, + "loss": 0.0854, + "step": 43143 + }, + { + "epoch": 0.3745106379284902, + "grad_norm": 0.33203125, + "learning_rate": 0.0012586048478691978, + "loss": 0.083, + "step": 43144 + }, + { + "epoch": 0.37451931840869435, + "grad_norm": 0.41796875, + "learning_rate": 0.0012585755511247258, + "loss": 0.0845, + "step": 43145 + }, + { + "epoch": 0.37452799888889854, + "grad_norm": 0.81640625, + "learning_rate": 0.0012585462542068384, + "loss": 0.1035, + "step": 43146 + }, + { + "epoch": 0.3745366793691027, + "grad_norm": 0.10205078125, + "learning_rate": 0.001258516957115567, + "loss": 0.0864, + "step": 43147 + }, + { + "epoch": 0.37454535984930687, + "grad_norm": 0.10986328125, + "learning_rate": 0.0012584876598509443, + "loss": 0.0859, + "step": 43148 + }, + { + "epoch": 0.374554040329511, + "grad_norm": 0.15234375, + "learning_rate": 0.0012584583624130011, + "loss": 0.1309, + "step": 43149 + }, + { + "epoch": 0.3745627208097152, + "grad_norm": 0.6953125, + "learning_rate": 0.0012584290648017704, + "loss": 0.126, + "step": 43150 + }, + { + "epoch": 0.37457140128991934, + "grad_norm": 0.1201171875, + "learning_rate": 0.0012583997670172844, + "loss": 0.1387, + "step": 43151 + }, + { + "epoch": 0.37458008177012353, + "grad_norm": 0.1103515625, + "learning_rate": 0.0012583704690595748, + "loss": 0.0815, + "step": 43152 + }, + { + "epoch": 0.37458876225032767, + "grad_norm": 0.466796875, + "learning_rate": 0.0012583411709286736, + "loss": 0.1367, + "step": 43153 + }, + { + "epoch": 0.37459744273053186, + "grad_norm": 0.1552734375, + "learning_rate": 0.0012583118726246126, + "loss": 0.1553, + "step": 43154 + }, + { + "epoch": 0.374606123210736, + "grad_norm": 0.166015625, + "learning_rate": 0.0012582825741474242, + "loss": 0.1143, + "step": 43155 + }, + { + "epoch": 0.3746148036909402, + "grad_norm": 0.310546875, + "learning_rate": 0.0012582532754971405, + "loss": 0.0781, + "step": 43156 + }, + { + "epoch": 0.3746234841711443, + "grad_norm": 0.1376953125, + "learning_rate": 0.0012582239766737934, + "loss": 0.124, + "step": 43157 + }, + { + "epoch": 0.3746321646513485, + "grad_norm": 0.1435546875, + "learning_rate": 0.0012581946776774146, + "loss": 0.0908, + "step": 43158 + }, + { + "epoch": 0.37464084513155266, + "grad_norm": 0.515625, + "learning_rate": 0.0012581653785080369, + "loss": 0.106, + "step": 43159 + }, + { + "epoch": 0.37464952561175685, + "grad_norm": 0.25390625, + "learning_rate": 0.0012581360791656918, + "loss": 0.0713, + "step": 43160 + }, + { + "epoch": 0.374658206091961, + "grad_norm": 0.57421875, + "learning_rate": 0.0012581067796504114, + "loss": 0.1309, + "step": 43161 + }, + { + "epoch": 0.3746668865721652, + "grad_norm": 0.10498046875, + "learning_rate": 0.001258077479962228, + "loss": 0.0972, + "step": 43162 + }, + { + "epoch": 0.3746755670523693, + "grad_norm": 0.12890625, + "learning_rate": 0.001258048180101173, + "loss": 0.1045, + "step": 43163 + }, + { + "epoch": 0.3746842475325735, + "grad_norm": 0.345703125, + "learning_rate": 0.001258018880067279, + "loss": 0.1045, + "step": 43164 + }, + { + "epoch": 0.37469292801277765, + "grad_norm": 0.12890625, + "learning_rate": 0.0012579895798605785, + "loss": 0.0894, + "step": 43165 + }, + { + "epoch": 0.37470160849298184, + "grad_norm": 0.54296875, + "learning_rate": 
0.0012579602794811023, + "loss": 0.1084, + "step": 43166 + }, + { + "epoch": 0.374710288973186, + "grad_norm": 0.3125, + "learning_rate": 0.0012579309789288835, + "loss": 0.083, + "step": 43167 + }, + { + "epoch": 0.37471896945339017, + "grad_norm": 0.10693359375, + "learning_rate": 0.0012579016782039534, + "loss": 0.1055, + "step": 43168 + }, + { + "epoch": 0.3747276499335943, + "grad_norm": 0.1005859375, + "learning_rate": 0.0012578723773063447, + "loss": 0.1113, + "step": 43169 + }, + { + "epoch": 0.3747363304137985, + "grad_norm": 0.306640625, + "learning_rate": 0.0012578430762360889, + "loss": 0.0977, + "step": 43170 + }, + { + "epoch": 0.37474501089400264, + "grad_norm": 0.251953125, + "learning_rate": 0.0012578137749932185, + "loss": 0.103, + "step": 43171 + }, + { + "epoch": 0.37475369137420683, + "grad_norm": 1.1796875, + "learning_rate": 0.001257784473577765, + "loss": 0.0898, + "step": 43172 + }, + { + "epoch": 0.37476237185441097, + "grad_norm": 0.498046875, + "learning_rate": 0.0012577551719897613, + "loss": 0.1099, + "step": 43173 + }, + { + "epoch": 0.37477105233461516, + "grad_norm": 0.1845703125, + "learning_rate": 0.0012577258702292388, + "loss": 0.0732, + "step": 43174 + }, + { + "epoch": 0.3747797328148193, + "grad_norm": 0.72265625, + "learning_rate": 0.001257696568296229, + "loss": 0.1133, + "step": 43175 + }, + { + "epoch": 0.3747884132950235, + "grad_norm": 0.1435546875, + "learning_rate": 0.0012576672661907652, + "loss": 0.1016, + "step": 43176 + }, + { + "epoch": 0.37479709377522763, + "grad_norm": 0.8359375, + "learning_rate": 0.001257637963912879, + "loss": 0.1338, + "step": 43177 + }, + { + "epoch": 0.3748057742554318, + "grad_norm": 0.11767578125, + "learning_rate": 0.001257608661462602, + "loss": 0.1055, + "step": 43178 + }, + { + "epoch": 0.37481445473563596, + "grad_norm": 0.197265625, + "learning_rate": 0.0012575793588399664, + "loss": 0.0977, + "step": 43179 + }, + { + "epoch": 0.37482313521584015, + "grad_norm": 0.37109375, + "learning_rate": 0.0012575500560450046, + "loss": 0.1201, + "step": 43180 + }, + { + "epoch": 0.3748318156960443, + "grad_norm": 0.396484375, + "learning_rate": 0.0012575207530777486, + "loss": 0.1201, + "step": 43181 + }, + { + "epoch": 0.3748404961762485, + "grad_norm": 0.5390625, + "learning_rate": 0.0012574914499382302, + "loss": 0.1226, + "step": 43182 + }, + { + "epoch": 0.3748491766564526, + "grad_norm": 0.19140625, + "learning_rate": 0.0012574621466264811, + "loss": 0.166, + "step": 43183 + }, + { + "epoch": 0.3748578571366568, + "grad_norm": 0.3359375, + "learning_rate": 0.0012574328431425338, + "loss": 0.0986, + "step": 43184 + }, + { + "epoch": 0.37486653761686095, + "grad_norm": 0.17578125, + "learning_rate": 0.0012574035394864206, + "loss": 0.1348, + "step": 43185 + }, + { + "epoch": 0.37487521809706514, + "grad_norm": 0.275390625, + "learning_rate": 0.0012573742356581734, + "loss": 0.1064, + "step": 43186 + }, + { + "epoch": 0.3748838985772693, + "grad_norm": 0.5859375, + "learning_rate": 0.0012573449316578239, + "loss": 0.1074, + "step": 43187 + }, + { + "epoch": 0.3748925790574735, + "grad_norm": 0.578125, + "learning_rate": 0.0012573156274854043, + "loss": 0.0693, + "step": 43188 + }, + { + "epoch": 0.3749012595376776, + "grad_norm": 0.1845703125, + "learning_rate": 0.0012572863231409465, + "loss": 0.0938, + "step": 43189 + }, + { + "epoch": 0.3749099400178818, + "grad_norm": 0.1337890625, + "learning_rate": 0.0012572570186244832, + "loss": 0.1348, + "step": 43190 + }, + { + "epoch": 0.37491862049808594, + 
"grad_norm": 0.09765625, + "learning_rate": 0.0012572277139360458, + "loss": 0.1016, + "step": 43191 + }, + { + "epoch": 0.37492730097829013, + "grad_norm": 0.421875, + "learning_rate": 0.0012571984090756665, + "loss": 0.126, + "step": 43192 + }, + { + "epoch": 0.37493598145849427, + "grad_norm": 1.0703125, + "learning_rate": 0.0012571691040433773, + "loss": 0.127, + "step": 43193 + }, + { + "epoch": 0.37494466193869846, + "grad_norm": 0.58203125, + "learning_rate": 0.0012571397988392103, + "loss": 0.0591, + "step": 43194 + }, + { + "epoch": 0.3749533424189026, + "grad_norm": 0.6640625, + "learning_rate": 0.0012571104934631981, + "loss": 0.1055, + "step": 43195 + }, + { + "epoch": 0.3749620228991068, + "grad_norm": 0.275390625, + "learning_rate": 0.0012570811879153716, + "loss": 0.1094, + "step": 43196 + }, + { + "epoch": 0.37497070337931093, + "grad_norm": 0.83203125, + "learning_rate": 0.0012570518821957634, + "loss": 0.2031, + "step": 43197 + }, + { + "epoch": 0.3749793838595151, + "grad_norm": 0.54296875, + "learning_rate": 0.0012570225763044059, + "loss": 0.0864, + "step": 43198 + }, + { + "epoch": 0.37498806433971926, + "grad_norm": 0.119140625, + "learning_rate": 0.0012569932702413311, + "loss": 0.084, + "step": 43199 + }, + { + "epoch": 0.37499674481992346, + "grad_norm": 0.578125, + "learning_rate": 0.0012569639640065704, + "loss": 0.1045, + "step": 43200 + }, + { + "epoch": 0.3750054253001276, + "grad_norm": 0.32421875, + "learning_rate": 0.0012569346576001565, + "loss": 0.1216, + "step": 43201 + }, + { + "epoch": 0.3750141057803318, + "grad_norm": 0.1767578125, + "learning_rate": 0.0012569053510221213, + "loss": 0.0933, + "step": 43202 + }, + { + "epoch": 0.3750227862605359, + "grad_norm": 0.267578125, + "learning_rate": 0.0012568760442724968, + "loss": 0.1025, + "step": 43203 + }, + { + "epoch": 0.3750314667407401, + "grad_norm": 0.271484375, + "learning_rate": 0.0012568467373513148, + "loss": 0.0713, + "step": 43204 + }, + { + "epoch": 0.37504014722094425, + "grad_norm": 0.326171875, + "learning_rate": 0.0012568174302586077, + "loss": 0.1133, + "step": 43205 + }, + { + "epoch": 0.37504882770114845, + "grad_norm": 0.0947265625, + "learning_rate": 0.0012567881229944072, + "loss": 0.0972, + "step": 43206 + }, + { + "epoch": 0.3750575081813526, + "grad_norm": 0.37109375, + "learning_rate": 0.0012567588155587455, + "loss": 0.1084, + "step": 43207 + }, + { + "epoch": 0.3750661886615568, + "grad_norm": 0.515625, + "learning_rate": 0.001256729507951655, + "loss": 0.1001, + "step": 43208 + }, + { + "epoch": 0.3750748691417609, + "grad_norm": 0.66796875, + "learning_rate": 0.0012567002001731675, + "loss": 0.1001, + "step": 43209 + }, + { + "epoch": 0.3750835496219651, + "grad_norm": 0.232421875, + "learning_rate": 0.0012566708922233146, + "loss": 0.0918, + "step": 43210 + }, + { + "epoch": 0.37509223010216924, + "grad_norm": 0.5, + "learning_rate": 0.0012566415841021293, + "loss": 0.0771, + "step": 43211 + }, + { + "epoch": 0.37510091058237344, + "grad_norm": 0.48828125, + "learning_rate": 0.0012566122758096428, + "loss": 0.0996, + "step": 43212 + }, + { + "epoch": 0.3751095910625776, + "grad_norm": 0.13671875, + "learning_rate": 0.0012565829673458876, + "loss": 0.0811, + "step": 43213 + }, + { + "epoch": 0.37511827154278177, + "grad_norm": 0.1259765625, + "learning_rate": 0.0012565536587108954, + "loss": 0.0688, + "step": 43214 + }, + { + "epoch": 0.3751269520229859, + "grad_norm": 0.1376953125, + "learning_rate": 0.0012565243499046987, + "loss": 0.0952, + "step": 43215 + }, + { + 
"epoch": 0.3751356325031901, + "grad_norm": 0.1572265625, + "learning_rate": 0.0012564950409273295, + "loss": 0.1279, + "step": 43216 + }, + { + "epoch": 0.37514431298339423, + "grad_norm": 0.703125, + "learning_rate": 0.0012564657317788192, + "loss": 0.106, + "step": 43217 + }, + { + "epoch": 0.3751529934635984, + "grad_norm": 0.22265625, + "learning_rate": 0.0012564364224592007, + "loss": 0.123, + "step": 43218 + }, + { + "epoch": 0.37516167394380256, + "grad_norm": 0.244140625, + "learning_rate": 0.0012564071129685054, + "loss": 0.0981, + "step": 43219 + }, + { + "epoch": 0.37517035442400676, + "grad_norm": 0.103515625, + "learning_rate": 0.0012563778033067659, + "loss": 0.0698, + "step": 43220 + }, + { + "epoch": 0.3751790349042109, + "grad_norm": 0.80078125, + "learning_rate": 0.0012563484934740136, + "loss": 0.0884, + "step": 43221 + }, + { + "epoch": 0.3751877153844151, + "grad_norm": 0.53125, + "learning_rate": 0.0012563191834702814, + "loss": 0.1465, + "step": 43222 + }, + { + "epoch": 0.3751963958646192, + "grad_norm": 0.29296875, + "learning_rate": 0.0012562898732956006, + "loss": 0.1006, + "step": 43223 + }, + { + "epoch": 0.3752050763448234, + "grad_norm": 0.1962890625, + "learning_rate": 0.0012562605629500036, + "loss": 0.1079, + "step": 43224 + }, + { + "epoch": 0.37521375682502756, + "grad_norm": 0.208984375, + "learning_rate": 0.0012562312524335227, + "loss": 0.0894, + "step": 43225 + }, + { + "epoch": 0.37522243730523175, + "grad_norm": 0.2216796875, + "learning_rate": 0.0012562019417461893, + "loss": 0.1006, + "step": 43226 + }, + { + "epoch": 0.3752311177854359, + "grad_norm": 0.0966796875, + "learning_rate": 0.0012561726308880355, + "loss": 0.104, + "step": 43227 + }, + { + "epoch": 0.3752397982656401, + "grad_norm": 0.4140625, + "learning_rate": 0.0012561433198590942, + "loss": 0.1016, + "step": 43228 + }, + { + "epoch": 0.3752484787458442, + "grad_norm": 1.140625, + "learning_rate": 0.0012561140086593968, + "loss": 0.1191, + "step": 43229 + }, + { + "epoch": 0.3752571592260484, + "grad_norm": 0.298828125, + "learning_rate": 0.0012560846972889752, + "loss": 0.103, + "step": 43230 + }, + { + "epoch": 0.37526583970625255, + "grad_norm": 0.470703125, + "learning_rate": 0.0012560553857478623, + "loss": 0.0693, + "step": 43231 + }, + { + "epoch": 0.37527452018645674, + "grad_norm": 0.099609375, + "learning_rate": 0.0012560260740360893, + "loss": 0.084, + "step": 43232 + }, + { + "epoch": 0.3752832006666609, + "grad_norm": 0.287109375, + "learning_rate": 0.0012559967621536881, + "loss": 0.1143, + "step": 43233 + }, + { + "epoch": 0.37529188114686507, + "grad_norm": 0.1123046875, + "learning_rate": 0.0012559674501006917, + "loss": 0.1113, + "step": 43234 + }, + { + "epoch": 0.3753005616270692, + "grad_norm": 0.25390625, + "learning_rate": 0.0012559381378771312, + "loss": 0.1006, + "step": 43235 + }, + { + "epoch": 0.3753092421072734, + "grad_norm": 0.2275390625, + "learning_rate": 0.0012559088254830396, + "loss": 0.1045, + "step": 43236 + }, + { + "epoch": 0.37531792258747754, + "grad_norm": 0.142578125, + "learning_rate": 0.0012558795129184478, + "loss": 0.1182, + "step": 43237 + }, + { + "epoch": 0.37532660306768173, + "grad_norm": 0.60546875, + "learning_rate": 0.0012558502001833892, + "loss": 0.0913, + "step": 43238 + }, + { + "epoch": 0.37533528354788587, + "grad_norm": 0.33203125, + "learning_rate": 0.001255820887277895, + "loss": 0.0903, + "step": 43239 + }, + { + "epoch": 0.37534396402809006, + "grad_norm": 0.2734375, + "learning_rate": 0.001255791574201997, + 
"loss": 0.0977, + "step": 43240 + }, + { + "epoch": 0.3753526445082942, + "grad_norm": 0.1376953125, + "learning_rate": 0.001255762260955728, + "loss": 0.1113, + "step": 43241 + }, + { + "epoch": 0.3753613249884984, + "grad_norm": 0.67578125, + "learning_rate": 0.00125573294753912, + "loss": 0.1099, + "step": 43242 + }, + { + "epoch": 0.37537000546870253, + "grad_norm": 0.08349609375, + "learning_rate": 0.0012557036339522043, + "loss": 0.1064, + "step": 43243 + }, + { + "epoch": 0.3753786859489067, + "grad_norm": 0.08349609375, + "learning_rate": 0.0012556743201950138, + "loss": 0.084, + "step": 43244 + }, + { + "epoch": 0.37538736642911086, + "grad_norm": 0.1025390625, + "learning_rate": 0.0012556450062675797, + "loss": 0.0996, + "step": 43245 + }, + { + "epoch": 0.37539604690931505, + "grad_norm": 0.2578125, + "learning_rate": 0.001255615692169935, + "loss": 0.1152, + "step": 43246 + }, + { + "epoch": 0.3754047273895192, + "grad_norm": 0.41015625, + "learning_rate": 0.001255586377902111, + "loss": 0.1484, + "step": 43247 + }, + { + "epoch": 0.3754134078697233, + "grad_norm": 0.65234375, + "learning_rate": 0.0012555570634641404, + "loss": 0.1279, + "step": 43248 + }, + { + "epoch": 0.3754220883499275, + "grad_norm": 0.486328125, + "learning_rate": 0.0012555277488560548, + "loss": 0.1074, + "step": 43249 + }, + { + "epoch": 0.37543076883013166, + "grad_norm": 0.296875, + "learning_rate": 0.0012554984340778866, + "loss": 0.0771, + "step": 43250 + }, + { + "epoch": 0.37543944931033585, + "grad_norm": 0.2001953125, + "learning_rate": 0.0012554691191296674, + "loss": 0.0781, + "step": 43251 + }, + { + "epoch": 0.37544812979054, + "grad_norm": 0.1689453125, + "learning_rate": 0.0012554398040114296, + "loss": 0.0928, + "step": 43252 + }, + { + "epoch": 0.3754568102707442, + "grad_norm": 0.1201171875, + "learning_rate": 0.001255410488723205, + "loss": 0.0957, + "step": 43253 + }, + { + "epoch": 0.3754654907509483, + "grad_norm": 0.07763671875, + "learning_rate": 0.0012553811732650259, + "loss": 0.0723, + "step": 43254 + }, + { + "epoch": 0.3754741712311525, + "grad_norm": 0.1630859375, + "learning_rate": 0.0012553518576369244, + "loss": 0.1221, + "step": 43255 + }, + { + "epoch": 0.37548285171135665, + "grad_norm": 0.08984375, + "learning_rate": 0.0012553225418389326, + "loss": 0.0957, + "step": 43256 + }, + { + "epoch": 0.37549153219156084, + "grad_norm": 0.3671875, + "learning_rate": 0.001255293225871082, + "loss": 0.0923, + "step": 43257 + }, + { + "epoch": 0.375500212671765, + "grad_norm": 0.31640625, + "learning_rate": 0.001255263909733405, + "loss": 0.0962, + "step": 43258 + }, + { + "epoch": 0.37550889315196917, + "grad_norm": 0.10693359375, + "learning_rate": 0.001255234593425934, + "loss": 0.1006, + "step": 43259 + }, + { + "epoch": 0.3755175736321733, + "grad_norm": 0.265625, + "learning_rate": 0.001255205276948701, + "loss": 0.0879, + "step": 43260 + }, + { + "epoch": 0.3755262541123775, + "grad_norm": 0.466796875, + "learning_rate": 0.0012551759603017377, + "loss": 0.1299, + "step": 43261 + }, + { + "epoch": 0.37553493459258164, + "grad_norm": 0.1728515625, + "learning_rate": 0.001255146643485076, + "loss": 0.0806, + "step": 43262 + }, + { + "epoch": 0.37554361507278583, + "grad_norm": 0.51171875, + "learning_rate": 0.0012551173264987486, + "loss": 0.0898, + "step": 43263 + }, + { + "epoch": 0.37555229555298997, + "grad_norm": 0.337890625, + "learning_rate": 0.001255088009342787, + "loss": 0.085, + "step": 43264 + }, + { + "epoch": 0.37556097603319416, + "grad_norm": 0.3046875, + 
"learning_rate": 0.0012550586920172233, + "loss": 0.0957, + "step": 43265 + }, + { + "epoch": 0.3755696565133983, + "grad_norm": 0.447265625, + "learning_rate": 0.00125502937452209, + "loss": 0.0835, + "step": 43266 + }, + { + "epoch": 0.3755783369936025, + "grad_norm": 0.21875, + "learning_rate": 0.0012550000568574192, + "loss": 0.0713, + "step": 43267 + }, + { + "epoch": 0.37558701747380663, + "grad_norm": 0.25390625, + "learning_rate": 0.0012549707390232423, + "loss": 0.1152, + "step": 43268 + }, + { + "epoch": 0.3755956979540108, + "grad_norm": 0.09716796875, + "learning_rate": 0.0012549414210195919, + "loss": 0.0889, + "step": 43269 + }, + { + "epoch": 0.37560437843421496, + "grad_norm": 0.50390625, + "learning_rate": 0.0012549121028464997, + "loss": 0.1035, + "step": 43270 + }, + { + "epoch": 0.37561305891441915, + "grad_norm": 0.09228515625, + "learning_rate": 0.0012548827845039981, + "loss": 0.0898, + "step": 43271 + }, + { + "epoch": 0.3756217393946233, + "grad_norm": 0.13671875, + "learning_rate": 0.0012548534659921188, + "loss": 0.1484, + "step": 43272 + }, + { + "epoch": 0.3756304198748275, + "grad_norm": 0.42578125, + "learning_rate": 0.0012548241473108943, + "loss": 0.0986, + "step": 43273 + }, + { + "epoch": 0.3756391003550316, + "grad_norm": 0.1181640625, + "learning_rate": 0.0012547948284603566, + "loss": 0.082, + "step": 43274 + }, + { + "epoch": 0.3756477808352358, + "grad_norm": 0.349609375, + "learning_rate": 0.0012547655094405368, + "loss": 0.0986, + "step": 43275 + }, + { + "epoch": 0.37565646131543995, + "grad_norm": 0.2177734375, + "learning_rate": 0.0012547361902514684, + "loss": 0.1094, + "step": 43276 + }, + { + "epoch": 0.37566514179564414, + "grad_norm": 0.36328125, + "learning_rate": 0.001254706870893183, + "loss": 0.0806, + "step": 43277 + }, + { + "epoch": 0.3756738222758483, + "grad_norm": 0.0771484375, + "learning_rate": 0.0012546775513657125, + "loss": 0.0859, + "step": 43278 + }, + { + "epoch": 0.37568250275605247, + "grad_norm": 0.28515625, + "learning_rate": 0.0012546482316690883, + "loss": 0.0869, + "step": 43279 + }, + { + "epoch": 0.3756911832362566, + "grad_norm": 0.08203125, + "learning_rate": 0.0012546189118033439, + "loss": 0.0674, + "step": 43280 + }, + { + "epoch": 0.3756998637164608, + "grad_norm": 0.265625, + "learning_rate": 0.0012545895917685102, + "loss": 0.0977, + "step": 43281 + }, + { + "epoch": 0.37570854419666494, + "grad_norm": 0.25, + "learning_rate": 0.0012545602715646196, + "loss": 0.1152, + "step": 43282 + }, + { + "epoch": 0.37571722467686913, + "grad_norm": 0.2099609375, + "learning_rate": 0.0012545309511917042, + "loss": 0.126, + "step": 43283 + }, + { + "epoch": 0.37572590515707327, + "grad_norm": 0.22265625, + "learning_rate": 0.0012545016306497964, + "loss": 0.1045, + "step": 43284 + }, + { + "epoch": 0.37573458563727746, + "grad_norm": 0.1806640625, + "learning_rate": 0.0012544723099389274, + "loss": 0.1084, + "step": 43285 + }, + { + "epoch": 0.3757432661174816, + "grad_norm": 0.0791015625, + "learning_rate": 0.00125444298905913, + "loss": 0.083, + "step": 43286 + }, + { + "epoch": 0.3757519465976858, + "grad_norm": 0.201171875, + "learning_rate": 0.001254413668010436, + "loss": 0.0869, + "step": 43287 + }, + { + "epoch": 0.37576062707788993, + "grad_norm": 0.19921875, + "learning_rate": 0.001254384346792878, + "loss": 0.1221, + "step": 43288 + }, + { + "epoch": 0.3757693075580941, + "grad_norm": 0.369140625, + "learning_rate": 0.001254355025406487, + "loss": 0.0928, + "step": 43289 + }, + { + "epoch": 
0.37577798803829826, + "grad_norm": 0.32421875, + "learning_rate": 0.001254325703851296, + "loss": 0.0996, + "step": 43290 + }, + { + "epoch": 0.37578666851850245, + "grad_norm": 0.341796875, + "learning_rate": 0.0012542963821273367, + "loss": 0.0913, + "step": 43291 + }, + { + "epoch": 0.3757953489987066, + "grad_norm": 0.09814453125, + "learning_rate": 0.0012542670602346412, + "loss": 0.1035, + "step": 43292 + }, + { + "epoch": 0.3758040294789108, + "grad_norm": 0.466796875, + "learning_rate": 0.0012542377381732416, + "loss": 0.1201, + "step": 43293 + }, + { + "epoch": 0.3758127099591149, + "grad_norm": 0.5546875, + "learning_rate": 0.0012542084159431694, + "loss": 0.1143, + "step": 43294 + }, + { + "epoch": 0.3758213904393191, + "grad_norm": 0.2080078125, + "learning_rate": 0.001254179093544458, + "loss": 0.085, + "step": 43295 + }, + { + "epoch": 0.37583007091952325, + "grad_norm": 0.57421875, + "learning_rate": 0.001254149770977138, + "loss": 0.1797, + "step": 43296 + }, + { + "epoch": 0.37583875139972744, + "grad_norm": 0.51953125, + "learning_rate": 0.0012541204482412428, + "loss": 0.0918, + "step": 43297 + }, + { + "epoch": 0.3758474318799316, + "grad_norm": 0.16796875, + "learning_rate": 0.0012540911253368031, + "loss": 0.1367, + "step": 43298 + }, + { + "epoch": 0.3758561123601358, + "grad_norm": 0.197265625, + "learning_rate": 0.0012540618022638525, + "loss": 0.1006, + "step": 43299 + }, + { + "epoch": 0.3758647928403399, + "grad_norm": 0.298828125, + "learning_rate": 0.0012540324790224213, + "loss": 0.0947, + "step": 43300 + }, + { + "epoch": 0.3758734733205441, + "grad_norm": 0.10400390625, + "learning_rate": 0.001254003155612543, + "loss": 0.1167, + "step": 43301 + }, + { + "epoch": 0.37588215380074824, + "grad_norm": 0.2470703125, + "learning_rate": 0.001253973832034249, + "loss": 0.0898, + "step": 43302 + }, + { + "epoch": 0.37589083428095243, + "grad_norm": 0.37109375, + "learning_rate": 0.0012539445082875716, + "loss": 0.1055, + "step": 43303 + }, + { + "epoch": 0.3758995147611566, + "grad_norm": 0.064453125, + "learning_rate": 0.0012539151843725426, + "loss": 0.0752, + "step": 43304 + }, + { + "epoch": 0.37590819524136077, + "grad_norm": 0.357421875, + "learning_rate": 0.0012538858602891946, + "loss": 0.0913, + "step": 43305 + }, + { + "epoch": 0.3759168757215649, + "grad_norm": 0.1259765625, + "learning_rate": 0.0012538565360375594, + "loss": 0.0913, + "step": 43306 + }, + { + "epoch": 0.3759255562017691, + "grad_norm": 0.140625, + "learning_rate": 0.0012538272116176688, + "loss": 0.0708, + "step": 43307 + }, + { + "epoch": 0.37593423668197323, + "grad_norm": 0.7421875, + "learning_rate": 0.001253797887029555, + "loss": 0.0864, + "step": 43308 + }, + { + "epoch": 0.3759429171621774, + "grad_norm": 0.68359375, + "learning_rate": 0.0012537685622732503, + "loss": 0.1357, + "step": 43309 + }, + { + "epoch": 0.37595159764238156, + "grad_norm": 0.349609375, + "learning_rate": 0.0012537392373487863, + "loss": 0.1045, + "step": 43310 + }, + { + "epoch": 0.37596027812258576, + "grad_norm": 0.158203125, + "learning_rate": 0.0012537099122561957, + "loss": 0.1055, + "step": 43311 + }, + { + "epoch": 0.3759689586027899, + "grad_norm": 0.38671875, + "learning_rate": 0.0012536805869955102, + "loss": 0.0898, + "step": 43312 + }, + { + "epoch": 0.3759776390829941, + "grad_norm": 0.482421875, + "learning_rate": 0.001253651261566762, + "loss": 0.0791, + "step": 43313 + }, + { + "epoch": 0.3759863195631982, + "grad_norm": 0.2158203125, + "learning_rate": 0.0012536219359699828, + "loss": 
0.0854, + "step": 43314 + }, + { + "epoch": 0.3759950000434024, + "grad_norm": 0.39453125, + "learning_rate": 0.0012535926102052051, + "loss": 0.0728, + "step": 43315 + }, + { + "epoch": 0.37600368052360655, + "grad_norm": 0.39453125, + "learning_rate": 0.001253563284272461, + "loss": 0.0845, + "step": 43316 + }, + { + "epoch": 0.37601236100381075, + "grad_norm": 0.09375, + "learning_rate": 0.0012535339581717822, + "loss": 0.1143, + "step": 43317 + }, + { + "epoch": 0.3760210414840149, + "grad_norm": 0.494140625, + "learning_rate": 0.0012535046319032011, + "loss": 0.0874, + "step": 43318 + }, + { + "epoch": 0.3760297219642191, + "grad_norm": 0.30078125, + "learning_rate": 0.0012534753054667493, + "loss": 0.0908, + "step": 43319 + }, + { + "epoch": 0.3760384024444232, + "grad_norm": 0.16015625, + "learning_rate": 0.0012534459788624598, + "loss": 0.0747, + "step": 43320 + }, + { + "epoch": 0.3760470829246274, + "grad_norm": 0.45703125, + "learning_rate": 0.0012534166520903637, + "loss": 0.084, + "step": 43321 + }, + { + "epoch": 0.37605576340483154, + "grad_norm": 0.953125, + "learning_rate": 0.0012533873251504935, + "loss": 0.1748, + "step": 43322 + }, + { + "epoch": 0.37606444388503574, + "grad_norm": 0.478515625, + "learning_rate": 0.001253357998042881, + "loss": 0.1309, + "step": 43323 + }, + { + "epoch": 0.3760731243652399, + "grad_norm": 0.18359375, + "learning_rate": 0.0012533286707675589, + "loss": 0.085, + "step": 43324 + }, + { + "epoch": 0.37608180484544407, + "grad_norm": 0.15625, + "learning_rate": 0.0012532993433245587, + "loss": 0.0918, + "step": 43325 + }, + { + "epoch": 0.3760904853256482, + "grad_norm": 0.26953125, + "learning_rate": 0.0012532700157139127, + "loss": 0.127, + "step": 43326 + }, + { + "epoch": 0.3760991658058524, + "grad_norm": 0.09716796875, + "learning_rate": 0.0012532406879356528, + "loss": 0.0947, + "step": 43327 + }, + { + "epoch": 0.37610784628605654, + "grad_norm": 0.310546875, + "learning_rate": 0.0012532113599898113, + "loss": 0.1582, + "step": 43328 + }, + { + "epoch": 0.37611652676626073, + "grad_norm": 0.0751953125, + "learning_rate": 0.00125318203187642, + "loss": 0.082, + "step": 43329 + }, + { + "epoch": 0.37612520724646487, + "grad_norm": 0.328125, + "learning_rate": 0.0012531527035955115, + "loss": 0.126, + "step": 43330 + }, + { + "epoch": 0.37613388772666906, + "grad_norm": 0.41796875, + "learning_rate": 0.001253123375147117, + "loss": 0.1201, + "step": 43331 + }, + { + "epoch": 0.3761425682068732, + "grad_norm": 0.5, + "learning_rate": 0.0012530940465312692, + "loss": 0.084, + "step": 43332 + }, + { + "epoch": 0.3761512486870774, + "grad_norm": 0.126953125, + "learning_rate": 0.001253064717748, + "loss": 0.0786, + "step": 43333 + }, + { + "epoch": 0.3761599291672815, + "grad_norm": 0.1103515625, + "learning_rate": 0.001253035388797342, + "loss": 0.1182, + "step": 43334 + }, + { + "epoch": 0.3761686096474857, + "grad_norm": 0.49609375, + "learning_rate": 0.0012530060596793262, + "loss": 0.1016, + "step": 43335 + }, + { + "epoch": 0.37617729012768986, + "grad_norm": 0.455078125, + "learning_rate": 0.0012529767303939858, + "loss": 0.127, + "step": 43336 + }, + { + "epoch": 0.37618597060789405, + "grad_norm": 0.287109375, + "learning_rate": 0.001252947400941352, + "loss": 0.0996, + "step": 43337 + }, + { + "epoch": 0.3761946510880982, + "grad_norm": 0.361328125, + "learning_rate": 0.0012529180713214576, + "loss": 0.0796, + "step": 43338 + }, + { + "epoch": 0.3762033315683024, + "grad_norm": 0.1416015625, + "learning_rate": 
0.001252888741534334, + "loss": 0.1084, + "step": 43339 + }, + { + "epoch": 0.3762120120485065, + "grad_norm": 0.15234375, + "learning_rate": 0.0012528594115800136, + "loss": 0.1006, + "step": 43340 + }, + { + "epoch": 0.3762206925287107, + "grad_norm": 0.44921875, + "learning_rate": 0.0012528300814585282, + "loss": 0.1108, + "step": 43341 + }, + { + "epoch": 0.37622937300891485, + "grad_norm": 0.12451171875, + "learning_rate": 0.0012528007511699104, + "loss": 0.0952, + "step": 43342 + }, + { + "epoch": 0.37623805348911904, + "grad_norm": 0.412109375, + "learning_rate": 0.001252771420714192, + "loss": 0.0579, + "step": 43343 + }, + { + "epoch": 0.3762467339693232, + "grad_norm": 0.4765625, + "learning_rate": 0.001252742090091405, + "loss": 0.0806, + "step": 43344 + }, + { + "epoch": 0.37625541444952737, + "grad_norm": 0.08642578125, + "learning_rate": 0.0012527127593015817, + "loss": 0.126, + "step": 43345 + }, + { + "epoch": 0.3762640949297315, + "grad_norm": 0.37109375, + "learning_rate": 0.0012526834283447539, + "loss": 0.1465, + "step": 43346 + }, + { + "epoch": 0.3762727754099357, + "grad_norm": 0.07373046875, + "learning_rate": 0.0012526540972209537, + "loss": 0.0767, + "step": 43347 + }, + { + "epoch": 0.37628145589013984, + "grad_norm": 0.2451171875, + "learning_rate": 0.0012526247659302133, + "loss": 0.0938, + "step": 43348 + }, + { + "epoch": 0.37629013637034403, + "grad_norm": 0.212890625, + "learning_rate": 0.0012525954344725647, + "loss": 0.1094, + "step": 43349 + }, + { + "epoch": 0.37629881685054817, + "grad_norm": 0.416015625, + "learning_rate": 0.0012525661028480403, + "loss": 0.125, + "step": 43350 + }, + { + "epoch": 0.37630749733075236, + "grad_norm": 0.1708984375, + "learning_rate": 0.0012525367710566717, + "loss": 0.0991, + "step": 43351 + }, + { + "epoch": 0.3763161778109565, + "grad_norm": 0.275390625, + "learning_rate": 0.0012525074390984912, + "loss": 0.0791, + "step": 43352 + }, + { + "epoch": 0.3763248582911607, + "grad_norm": 0.2373046875, + "learning_rate": 0.0012524781069735307, + "loss": 0.1348, + "step": 43353 + }, + { + "epoch": 0.37633353877136483, + "grad_norm": 0.134765625, + "learning_rate": 0.0012524487746818228, + "loss": 0.0908, + "step": 43354 + }, + { + "epoch": 0.376342219251569, + "grad_norm": 0.11767578125, + "learning_rate": 0.001252419442223399, + "loss": 0.0972, + "step": 43355 + }, + { + "epoch": 0.37635089973177316, + "grad_norm": 0.0810546875, + "learning_rate": 0.0012523901095982917, + "loss": 0.0918, + "step": 43356 + }, + { + "epoch": 0.37635958021197735, + "grad_norm": 1.046875, + "learning_rate": 0.0012523607768065326, + "loss": 0.1426, + "step": 43357 + }, + { + "epoch": 0.3763682606921815, + "grad_norm": 0.1357421875, + "learning_rate": 0.0012523314438481544, + "loss": 0.1147, + "step": 43358 + }, + { + "epoch": 0.3763769411723857, + "grad_norm": 0.1220703125, + "learning_rate": 0.0012523021107231887, + "loss": 0.0986, + "step": 43359 + }, + { + "epoch": 0.3763856216525898, + "grad_norm": 0.419921875, + "learning_rate": 0.0012522727774316672, + "loss": 0.0938, + "step": 43360 + }, + { + "epoch": 0.376394302132794, + "grad_norm": 0.365234375, + "learning_rate": 0.0012522434439736232, + "loss": 0.1138, + "step": 43361 + }, + { + "epoch": 0.37640298261299815, + "grad_norm": 0.11328125, + "learning_rate": 0.0012522141103490877, + "loss": 0.1377, + "step": 43362 + }, + { + "epoch": 0.37641166309320234, + "grad_norm": 0.318359375, + "learning_rate": 0.0012521847765580932, + "loss": 0.0918, + "step": 43363 + }, + { + "epoch": 
0.3764203435734065, + "grad_norm": 0.09765625, + "learning_rate": 0.0012521554426006718, + "loss": 0.105, + "step": 43364 + }, + { + "epoch": 0.3764290240536107, + "grad_norm": 0.1748046875, + "learning_rate": 0.0012521261084768556, + "loss": 0.1143, + "step": 43365 + }, + { + "epoch": 0.3764377045338148, + "grad_norm": 0.080078125, + "learning_rate": 0.0012520967741866762, + "loss": 0.0854, + "step": 43366 + }, + { + "epoch": 0.376446385014019, + "grad_norm": 0.328125, + "learning_rate": 0.0012520674397301663, + "loss": 0.0967, + "step": 43367 + }, + { + "epoch": 0.37645506549422314, + "grad_norm": 0.0810546875, + "learning_rate": 0.0012520381051073577, + "loss": 0.0913, + "step": 43368 + }, + { + "epoch": 0.37646374597442733, + "grad_norm": 0.451171875, + "learning_rate": 0.0012520087703182825, + "loss": 0.1104, + "step": 43369 + }, + { + "epoch": 0.37647242645463147, + "grad_norm": 0.06103515625, + "learning_rate": 0.0012519794353629727, + "loss": 0.082, + "step": 43370 + }, + { + "epoch": 0.3764811069348356, + "grad_norm": 0.1015625, + "learning_rate": 0.0012519501002414605, + "loss": 0.0913, + "step": 43371 + }, + { + "epoch": 0.3764897874150398, + "grad_norm": 0.341796875, + "learning_rate": 0.001251920764953778, + "loss": 0.1108, + "step": 43372 + }, + { + "epoch": 0.37649846789524394, + "grad_norm": 0.2119140625, + "learning_rate": 0.0012518914294999573, + "loss": 0.1006, + "step": 43373 + }, + { + "epoch": 0.37650714837544813, + "grad_norm": 0.154296875, + "learning_rate": 0.0012518620938800303, + "loss": 0.1357, + "step": 43374 + }, + { + "epoch": 0.37651582885565227, + "grad_norm": 0.1025390625, + "learning_rate": 0.0012518327580940292, + "loss": 0.0859, + "step": 43375 + }, + { + "epoch": 0.37652450933585646, + "grad_norm": 0.1494140625, + "learning_rate": 0.0012518034221419862, + "loss": 0.0967, + "step": 43376 + }, + { + "epoch": 0.3765331898160606, + "grad_norm": 0.3203125, + "learning_rate": 0.0012517740860239332, + "loss": 0.0908, + "step": 43377 + }, + { + "epoch": 0.3765418702962648, + "grad_norm": 0.28125, + "learning_rate": 0.0012517447497399023, + "loss": 0.1611, + "step": 43378 + }, + { + "epoch": 0.37655055077646893, + "grad_norm": 0.94921875, + "learning_rate": 0.0012517154132899258, + "loss": 0.1226, + "step": 43379 + }, + { + "epoch": 0.3765592312566731, + "grad_norm": 0.1748046875, + "learning_rate": 0.0012516860766740353, + "loss": 0.1064, + "step": 43380 + }, + { + "epoch": 0.37656791173687726, + "grad_norm": 0.2060546875, + "learning_rate": 0.0012516567398922634, + "loss": 0.0732, + "step": 43381 + }, + { + "epoch": 0.37657659221708145, + "grad_norm": 0.259765625, + "learning_rate": 0.0012516274029446418, + "loss": 0.1367, + "step": 43382 + }, + { + "epoch": 0.3765852726972856, + "grad_norm": 0.10888671875, + "learning_rate": 0.001251598065831203, + "loss": 0.104, + "step": 43383 + }, + { + "epoch": 0.3765939531774898, + "grad_norm": 0.1513671875, + "learning_rate": 0.0012515687285519785, + "loss": 0.1064, + "step": 43384 + }, + { + "epoch": 0.3766026336576939, + "grad_norm": 0.400390625, + "learning_rate": 0.0012515393911070012, + "loss": 0.1143, + "step": 43385 + }, + { + "epoch": 0.3766113141378981, + "grad_norm": 0.236328125, + "learning_rate": 0.0012515100534963024, + "loss": 0.1182, + "step": 43386 + }, + { + "epoch": 0.37661999461810225, + "grad_norm": 0.263671875, + "learning_rate": 0.0012514807157199145, + "loss": 0.1172, + "step": 43387 + }, + { + "epoch": 0.37662867509830644, + "grad_norm": 0.52734375, + "learning_rate": 0.0012514513777778693, + 
"loss": 0.1001, + "step": 43388 + }, + { + "epoch": 0.3766373555785106, + "grad_norm": 0.126953125, + "learning_rate": 0.0012514220396701997, + "loss": 0.0981, + "step": 43389 + }, + { + "epoch": 0.3766460360587148, + "grad_norm": 0.091796875, + "learning_rate": 0.0012513927013969369, + "loss": 0.0732, + "step": 43390 + }, + { + "epoch": 0.3766547165389189, + "grad_norm": 0.38671875, + "learning_rate": 0.0012513633629581131, + "loss": 0.0894, + "step": 43391 + }, + { + "epoch": 0.3766633970191231, + "grad_norm": 0.1396484375, + "learning_rate": 0.0012513340243537608, + "loss": 0.1338, + "step": 43392 + }, + { + "epoch": 0.37667207749932724, + "grad_norm": 0.1337890625, + "learning_rate": 0.001251304685583912, + "loss": 0.0967, + "step": 43393 + }, + { + "epoch": 0.37668075797953143, + "grad_norm": 0.609375, + "learning_rate": 0.0012512753466485988, + "loss": 0.1504, + "step": 43394 + }, + { + "epoch": 0.37668943845973557, + "grad_norm": 0.142578125, + "learning_rate": 0.0012512460075478529, + "loss": 0.1025, + "step": 43395 + }, + { + "epoch": 0.37669811893993976, + "grad_norm": 0.251953125, + "learning_rate": 0.0012512166682817065, + "loss": 0.0664, + "step": 43396 + }, + { + "epoch": 0.3767067994201439, + "grad_norm": 0.302734375, + "learning_rate": 0.001251187328850192, + "loss": 0.1011, + "step": 43397 + }, + { + "epoch": 0.3767154799003481, + "grad_norm": 0.59765625, + "learning_rate": 0.0012511579892533415, + "loss": 0.0718, + "step": 43398 + }, + { + "epoch": 0.37672416038055223, + "grad_norm": 0.62109375, + "learning_rate": 0.0012511286494911865, + "loss": 0.1367, + "step": 43399 + }, + { + "epoch": 0.3767328408607564, + "grad_norm": 0.77734375, + "learning_rate": 0.0012510993095637597, + "loss": 0.0967, + "step": 43400 + }, + { + "epoch": 0.37674152134096056, + "grad_norm": 0.08056640625, + "learning_rate": 0.0012510699694710929, + "loss": 0.0654, + "step": 43401 + }, + { + "epoch": 0.37675020182116475, + "grad_norm": 0.25390625, + "learning_rate": 0.0012510406292132183, + "loss": 0.1113, + "step": 43402 + }, + { + "epoch": 0.3767588823013689, + "grad_norm": 0.486328125, + "learning_rate": 0.0012510112887901684, + "loss": 0.0938, + "step": 43403 + }, + { + "epoch": 0.3767675627815731, + "grad_norm": 0.154296875, + "learning_rate": 0.001250981948201974, + "loss": 0.0752, + "step": 43404 + }, + { + "epoch": 0.3767762432617772, + "grad_norm": 0.7734375, + "learning_rate": 0.0012509526074486685, + "loss": 0.0938, + "step": 43405 + }, + { + "epoch": 0.3767849237419814, + "grad_norm": 0.498046875, + "learning_rate": 0.0012509232665302834, + "loss": 0.1172, + "step": 43406 + }, + { + "epoch": 0.37679360422218555, + "grad_norm": 0.263671875, + "learning_rate": 0.001250893925446851, + "loss": 0.106, + "step": 43407 + }, + { + "epoch": 0.37680228470238974, + "grad_norm": 0.1865234375, + "learning_rate": 0.001250864584198403, + "loss": 0.0981, + "step": 43408 + }, + { + "epoch": 0.3768109651825939, + "grad_norm": 0.16796875, + "learning_rate": 0.0012508352427849717, + "loss": 0.1348, + "step": 43409 + }, + { + "epoch": 0.3768196456627981, + "grad_norm": 0.0966796875, + "learning_rate": 0.0012508059012065896, + "loss": 0.1035, + "step": 43410 + }, + { + "epoch": 0.3768283261430022, + "grad_norm": 0.255859375, + "learning_rate": 0.0012507765594632883, + "loss": 0.0618, + "step": 43411 + }, + { + "epoch": 0.3768370066232064, + "grad_norm": 1.0390625, + "learning_rate": 0.0012507472175551, + "loss": 0.1167, + "step": 43412 + }, + { + "epoch": 0.37684568710341054, + "grad_norm": 0.126953125, + 
"learning_rate": 0.0012507178754820568, + "loss": 0.1025, + "step": 43413 + }, + { + "epoch": 0.37685436758361474, + "grad_norm": 0.1142578125, + "learning_rate": 0.0012506885332441908, + "loss": 0.0791, + "step": 43414 + }, + { + "epoch": 0.3768630480638189, + "grad_norm": 0.294921875, + "learning_rate": 0.001250659190841534, + "loss": 0.084, + "step": 43415 + }, + { + "epoch": 0.37687172854402307, + "grad_norm": 0.142578125, + "learning_rate": 0.0012506298482741188, + "loss": 0.0791, + "step": 43416 + }, + { + "epoch": 0.3768804090242272, + "grad_norm": 0.1201171875, + "learning_rate": 0.001250600505541977, + "loss": 0.1074, + "step": 43417 + }, + { + "epoch": 0.3768890895044314, + "grad_norm": 0.63671875, + "learning_rate": 0.0012505711626451406, + "loss": 0.1118, + "step": 43418 + }, + { + "epoch": 0.37689776998463553, + "grad_norm": 0.1474609375, + "learning_rate": 0.0012505418195836418, + "loss": 0.0623, + "step": 43419 + }, + { + "epoch": 0.3769064504648397, + "grad_norm": 0.51953125, + "learning_rate": 0.001250512476357513, + "loss": 0.0942, + "step": 43420 + }, + { + "epoch": 0.37691513094504386, + "grad_norm": 0.1123046875, + "learning_rate": 0.0012504831329667855, + "loss": 0.1016, + "step": 43421 + }, + { + "epoch": 0.37692381142524806, + "grad_norm": 0.8125, + "learning_rate": 0.0012504537894114922, + "loss": 0.1328, + "step": 43422 + }, + { + "epoch": 0.3769324919054522, + "grad_norm": 0.1318359375, + "learning_rate": 0.0012504244456916652, + "loss": 0.084, + "step": 43423 + }, + { + "epoch": 0.3769411723856564, + "grad_norm": 0.09423828125, + "learning_rate": 0.001250395101807336, + "loss": 0.1445, + "step": 43424 + }, + { + "epoch": 0.3769498528658605, + "grad_norm": 0.10107421875, + "learning_rate": 0.0012503657577585372, + "loss": 0.0869, + "step": 43425 + }, + { + "epoch": 0.3769585333460647, + "grad_norm": 0.15234375, + "learning_rate": 0.0012503364135453003, + "loss": 0.1104, + "step": 43426 + }, + { + "epoch": 0.37696721382626885, + "grad_norm": 0.12255859375, + "learning_rate": 0.001250307069167658, + "loss": 0.0908, + "step": 43427 + }, + { + "epoch": 0.37697589430647305, + "grad_norm": 0.1201171875, + "learning_rate": 0.001250277724625642, + "loss": 0.1152, + "step": 43428 + }, + { + "epoch": 0.3769845747866772, + "grad_norm": 0.3203125, + "learning_rate": 0.0012502483799192844, + "loss": 0.1211, + "step": 43429 + }, + { + "epoch": 0.3769932552668814, + "grad_norm": 0.447265625, + "learning_rate": 0.0012502190350486178, + "loss": 0.0981, + "step": 43430 + }, + { + "epoch": 0.3770019357470855, + "grad_norm": 0.2578125, + "learning_rate": 0.0012501896900136739, + "loss": 0.1182, + "step": 43431 + }, + { + "epoch": 0.3770106162272897, + "grad_norm": 0.37890625, + "learning_rate": 0.0012501603448144846, + "loss": 0.1143, + "step": 43432 + }, + { + "epoch": 0.37701929670749385, + "grad_norm": 0.640625, + "learning_rate": 0.0012501309994510823, + "loss": 0.0928, + "step": 43433 + }, + { + "epoch": 0.37702797718769804, + "grad_norm": 0.78515625, + "learning_rate": 0.001250101653923499, + "loss": 0.1172, + "step": 43434 + }, + { + "epoch": 0.3770366576679022, + "grad_norm": 0.443359375, + "learning_rate": 0.0012500723082317668, + "loss": 0.0908, + "step": 43435 + }, + { + "epoch": 0.37704533814810637, + "grad_norm": 0.44921875, + "learning_rate": 0.0012500429623759177, + "loss": 0.1309, + "step": 43436 + }, + { + "epoch": 0.3770540186283105, + "grad_norm": 0.1904296875, + "learning_rate": 0.001250013616355984, + "loss": 0.0933, + "step": 43437 + }, + { + "epoch": 
0.3770626991085147, + "grad_norm": 0.53125, + "learning_rate": 0.0012499842701719974, + "loss": 0.1094, + "step": 43438 + }, + { + "epoch": 0.37707137958871884, + "grad_norm": 0.2353515625, + "learning_rate": 0.0012499549238239903, + "loss": 0.1299, + "step": 43439 + }, + { + "epoch": 0.37708006006892303, + "grad_norm": 0.50390625, + "learning_rate": 0.0012499255773119951, + "loss": 0.1797, + "step": 43440 + }, + { + "epoch": 0.37708874054912717, + "grad_norm": 0.1171875, + "learning_rate": 0.0012498962306360434, + "loss": 0.0703, + "step": 43441 + }, + { + "epoch": 0.37709742102933136, + "grad_norm": 0.71875, + "learning_rate": 0.0012498668837961676, + "loss": 0.1245, + "step": 43442 + }, + { + "epoch": 0.3771061015095355, + "grad_norm": 0.337890625, + "learning_rate": 0.001249837536792399, + "loss": 0.0903, + "step": 43443 + }, + { + "epoch": 0.3771147819897397, + "grad_norm": 0.216796875, + "learning_rate": 0.0012498081896247708, + "loss": 0.126, + "step": 43444 + }, + { + "epoch": 0.3771234624699438, + "grad_norm": 0.2451171875, + "learning_rate": 0.0012497788422933145, + "loss": 0.0693, + "step": 43445 + }, + { + "epoch": 0.377132142950148, + "grad_norm": 0.25390625, + "learning_rate": 0.0012497494947980626, + "loss": 0.0996, + "step": 43446 + }, + { + "epoch": 0.37714082343035216, + "grad_norm": 0.28125, + "learning_rate": 0.0012497201471390466, + "loss": 0.0981, + "step": 43447 + }, + { + "epoch": 0.37714950391055635, + "grad_norm": 0.4453125, + "learning_rate": 0.0012496907993162986, + "loss": 0.1426, + "step": 43448 + }, + { + "epoch": 0.3771581843907605, + "grad_norm": 0.076171875, + "learning_rate": 0.0012496614513298515, + "loss": 0.0806, + "step": 43449 + }, + { + "epoch": 0.3771668648709647, + "grad_norm": 0.095703125, + "learning_rate": 0.0012496321031797365, + "loss": 0.0913, + "step": 43450 + }, + { + "epoch": 0.3771755453511688, + "grad_norm": 0.1552734375, + "learning_rate": 0.0012496027548659866, + "loss": 0.126, + "step": 43451 + }, + { + "epoch": 0.377184225831373, + "grad_norm": 0.95703125, + "learning_rate": 0.0012495734063886327, + "loss": 0.1094, + "step": 43452 + }, + { + "epoch": 0.37719290631157715, + "grad_norm": 0.263671875, + "learning_rate": 0.001249544057747708, + "loss": 0.0991, + "step": 43453 + }, + { + "epoch": 0.37720158679178134, + "grad_norm": 0.193359375, + "learning_rate": 0.0012495147089432444, + "loss": 0.0776, + "step": 43454 + }, + { + "epoch": 0.3772102672719855, + "grad_norm": 0.09033203125, + "learning_rate": 0.0012494853599752736, + "loss": 0.1035, + "step": 43455 + }, + { + "epoch": 0.37721894775218967, + "grad_norm": 0.0830078125, + "learning_rate": 0.0012494560108438273, + "loss": 0.1367, + "step": 43456 + }, + { + "epoch": 0.3772276282323938, + "grad_norm": 0.392578125, + "learning_rate": 0.0012494266615489386, + "loss": 0.0859, + "step": 43457 + }, + { + "epoch": 0.377236308712598, + "grad_norm": 0.10986328125, + "learning_rate": 0.001249397312090639, + "loss": 0.1309, + "step": 43458 + }, + { + "epoch": 0.37724498919280214, + "grad_norm": 0.0810546875, + "learning_rate": 0.001249367962468961, + "loss": 0.084, + "step": 43459 + }, + { + "epoch": 0.37725366967300633, + "grad_norm": 0.224609375, + "learning_rate": 0.0012493386126839364, + "loss": 0.1064, + "step": 43460 + }, + { + "epoch": 0.37726235015321047, + "grad_norm": 0.10791015625, + "learning_rate": 0.001249309262735597, + "loss": 0.0859, + "step": 43461 + }, + { + "epoch": 0.37727103063341466, + "grad_norm": 0.51953125, + "learning_rate": 0.0012492799126239756, + "loss": 
0.0972, + "step": 43462 + }, + { + "epoch": 0.3772797111136188, + "grad_norm": 0.5625, + "learning_rate": 0.0012492505623491038, + "loss": 0.1406, + "step": 43463 + }, + { + "epoch": 0.377288391593823, + "grad_norm": 0.2099609375, + "learning_rate": 0.001249221211911014, + "loss": 0.1055, + "step": 43464 + }, + { + "epoch": 0.37729707207402713, + "grad_norm": 0.1826171875, + "learning_rate": 0.001249191861309738, + "loss": 0.0737, + "step": 43465 + }, + { + "epoch": 0.3773057525542313, + "grad_norm": 0.0927734375, + "learning_rate": 0.0012491625105453077, + "loss": 0.1172, + "step": 43466 + }, + { + "epoch": 0.37731443303443546, + "grad_norm": 0.11572265625, + "learning_rate": 0.0012491331596177558, + "loss": 0.0928, + "step": 43467 + }, + { + "epoch": 0.37732311351463965, + "grad_norm": 0.228515625, + "learning_rate": 0.0012491038085271143, + "loss": 0.0859, + "step": 43468 + }, + { + "epoch": 0.3773317939948438, + "grad_norm": 0.412109375, + "learning_rate": 0.0012490744572734146, + "loss": 0.1104, + "step": 43469 + }, + { + "epoch": 0.377340474475048, + "grad_norm": 0.314453125, + "learning_rate": 0.0012490451058566898, + "loss": 0.1006, + "step": 43470 + }, + { + "epoch": 0.3773491549552521, + "grad_norm": 0.1435546875, + "learning_rate": 0.0012490157542769716, + "loss": 0.0884, + "step": 43471 + }, + { + "epoch": 0.3773578354354563, + "grad_norm": 0.1005859375, + "learning_rate": 0.0012489864025342916, + "loss": 0.0898, + "step": 43472 + }, + { + "epoch": 0.37736651591566045, + "grad_norm": 0.81640625, + "learning_rate": 0.0012489570506286826, + "loss": 0.0972, + "step": 43473 + }, + { + "epoch": 0.37737519639586464, + "grad_norm": 0.15625, + "learning_rate": 0.001248927698560176, + "loss": 0.0986, + "step": 43474 + }, + { + "epoch": 0.3773838768760688, + "grad_norm": 0.6171875, + "learning_rate": 0.0012488983463288046, + "loss": 0.1045, + "step": 43475 + }, + { + "epoch": 0.377392557356273, + "grad_norm": 0.412109375, + "learning_rate": 0.0012488689939346002, + "loss": 0.0933, + "step": 43476 + }, + { + "epoch": 0.3774012378364771, + "grad_norm": 0.33984375, + "learning_rate": 0.0012488396413775948, + "loss": 0.0664, + "step": 43477 + }, + { + "epoch": 0.3774099183166813, + "grad_norm": 0.171875, + "learning_rate": 0.0012488102886578207, + "loss": 0.1035, + "step": 43478 + }, + { + "epoch": 0.37741859879688544, + "grad_norm": 0.2578125, + "learning_rate": 0.0012487809357753102, + "loss": 0.1396, + "step": 43479 + }, + { + "epoch": 0.37742727927708963, + "grad_norm": 0.349609375, + "learning_rate": 0.0012487515827300946, + "loss": 0.1084, + "step": 43480 + }, + { + "epoch": 0.37743595975729377, + "grad_norm": 0.3046875, + "learning_rate": 0.0012487222295222069, + "loss": 0.1367, + "step": 43481 + }, + { + "epoch": 0.37744464023749796, + "grad_norm": 0.2216796875, + "learning_rate": 0.0012486928761516786, + "loss": 0.0977, + "step": 43482 + }, + { + "epoch": 0.3774533207177021, + "grad_norm": 0.1005859375, + "learning_rate": 0.0012486635226185419, + "loss": 0.1455, + "step": 43483 + }, + { + "epoch": 0.3774620011979063, + "grad_norm": 0.251953125, + "learning_rate": 0.0012486341689228293, + "loss": 0.0938, + "step": 43484 + }, + { + "epoch": 0.37747068167811043, + "grad_norm": 0.87109375, + "learning_rate": 0.0012486048150645722, + "loss": 0.0952, + "step": 43485 + }, + { + "epoch": 0.3774793621583146, + "grad_norm": 0.1943359375, + "learning_rate": 0.0012485754610438034, + "loss": 0.0801, + "step": 43486 + }, + { + "epoch": 0.37748804263851876, + "grad_norm": 0.212890625, + 
"learning_rate": 0.0012485461068605546, + "loss": 0.1221, + "step": 43487 + }, + { + "epoch": 0.37749672311872295, + "grad_norm": 0.4453125, + "learning_rate": 0.0012485167525148582, + "loss": 0.1416, + "step": 43488 + }, + { + "epoch": 0.3775054035989271, + "grad_norm": 0.1884765625, + "learning_rate": 0.001248487398006746, + "loss": 0.0981, + "step": 43489 + }, + { + "epoch": 0.3775140840791313, + "grad_norm": 0.1865234375, + "learning_rate": 0.0012484580433362502, + "loss": 0.0996, + "step": 43490 + }, + { + "epoch": 0.3775227645593354, + "grad_norm": 0.322265625, + "learning_rate": 0.0012484286885034027, + "loss": 0.1216, + "step": 43491 + }, + { + "epoch": 0.3775314450395396, + "grad_norm": 0.134765625, + "learning_rate": 0.0012483993335082363, + "loss": 0.083, + "step": 43492 + }, + { + "epoch": 0.37754012551974375, + "grad_norm": 0.1748046875, + "learning_rate": 0.0012483699783507822, + "loss": 0.0618, + "step": 43493 + }, + { + "epoch": 0.3775488059999479, + "grad_norm": 0.16015625, + "learning_rate": 0.0012483406230310733, + "loss": 0.1279, + "step": 43494 + }, + { + "epoch": 0.3775574864801521, + "grad_norm": 0.11376953125, + "learning_rate": 0.0012483112675491408, + "loss": 0.1084, + "step": 43495 + }, + { + "epoch": 0.3775661669603562, + "grad_norm": 0.10888671875, + "learning_rate": 0.0012482819119050176, + "loss": 0.1396, + "step": 43496 + }, + { + "epoch": 0.3775748474405604, + "grad_norm": 0.408203125, + "learning_rate": 0.0012482525560987355, + "loss": 0.1543, + "step": 43497 + }, + { + "epoch": 0.37758352792076455, + "grad_norm": 0.2060546875, + "learning_rate": 0.0012482232001303267, + "loss": 0.1084, + "step": 43498 + }, + { + "epoch": 0.37759220840096874, + "grad_norm": 0.1123046875, + "learning_rate": 0.0012481938439998232, + "loss": 0.0693, + "step": 43499 + }, + { + "epoch": 0.3776008888811729, + "grad_norm": 0.2373046875, + "learning_rate": 0.0012481644877072571, + "loss": 0.0928, + "step": 43500 + }, + { + "epoch": 0.3776095693613771, + "grad_norm": 0.78125, + "learning_rate": 0.0012481351312526605, + "loss": 0.0967, + "step": 43501 + }, + { + "epoch": 0.3776182498415812, + "grad_norm": 0.328125, + "learning_rate": 0.0012481057746360658, + "loss": 0.1494, + "step": 43502 + }, + { + "epoch": 0.3776269303217854, + "grad_norm": 0.09716796875, + "learning_rate": 0.0012480764178575045, + "loss": 0.0947, + "step": 43503 + }, + { + "epoch": 0.37763561080198954, + "grad_norm": 0.302734375, + "learning_rate": 0.0012480470609170095, + "loss": 0.0908, + "step": 43504 + }, + { + "epoch": 0.37764429128219373, + "grad_norm": 0.458984375, + "learning_rate": 0.001248017703814612, + "loss": 0.1182, + "step": 43505 + }, + { + "epoch": 0.37765297176239787, + "grad_norm": 0.419921875, + "learning_rate": 0.0012479883465503448, + "loss": 0.1152, + "step": 43506 + }, + { + "epoch": 0.37766165224260206, + "grad_norm": 0.12060546875, + "learning_rate": 0.0012479589891242395, + "loss": 0.0781, + "step": 43507 + }, + { + "epoch": 0.3776703327228062, + "grad_norm": 0.2041015625, + "learning_rate": 0.0012479296315363287, + "loss": 0.1416, + "step": 43508 + }, + { + "epoch": 0.3776790132030104, + "grad_norm": 0.28515625, + "learning_rate": 0.0012479002737866443, + "loss": 0.1309, + "step": 43509 + }, + { + "epoch": 0.37768769368321453, + "grad_norm": 0.1201171875, + "learning_rate": 0.0012478709158752184, + "loss": 0.085, + "step": 43510 + }, + { + "epoch": 0.3776963741634187, + "grad_norm": 0.1640625, + "learning_rate": 0.0012478415578020831, + "loss": 0.1064, + "step": 43511 + }, + { + 
"epoch": 0.37770505464362286, + "grad_norm": 0.2890625, + "learning_rate": 0.0012478121995672707, + "loss": 0.0613, + "step": 43512 + }, + { + "epoch": 0.37771373512382705, + "grad_norm": 0.392578125, + "learning_rate": 0.0012477828411708125, + "loss": 0.0776, + "step": 43513 + }, + { + "epoch": 0.3777224156040312, + "grad_norm": 0.1162109375, + "learning_rate": 0.0012477534826127415, + "loss": 0.1104, + "step": 43514 + }, + { + "epoch": 0.3777310960842354, + "grad_norm": 0.142578125, + "learning_rate": 0.0012477241238930895, + "loss": 0.0781, + "step": 43515 + }, + { + "epoch": 0.3777397765644395, + "grad_norm": 0.138671875, + "learning_rate": 0.0012476947650118888, + "loss": 0.1099, + "step": 43516 + }, + { + "epoch": 0.3777484570446437, + "grad_norm": 0.33984375, + "learning_rate": 0.001247665405969171, + "loss": 0.0723, + "step": 43517 + }, + { + "epoch": 0.37775713752484785, + "grad_norm": 0.142578125, + "learning_rate": 0.0012476360467649687, + "loss": 0.1074, + "step": 43518 + }, + { + "epoch": 0.37776581800505205, + "grad_norm": 0.173828125, + "learning_rate": 0.0012476066873993142, + "loss": 0.0698, + "step": 43519 + }, + { + "epoch": 0.3777744984852562, + "grad_norm": 0.2890625, + "learning_rate": 0.0012475773278722388, + "loss": 0.0815, + "step": 43520 + }, + { + "epoch": 0.3777831789654604, + "grad_norm": 0.09228515625, + "learning_rate": 0.0012475479681837752, + "loss": 0.0796, + "step": 43521 + }, + { + "epoch": 0.3777918594456645, + "grad_norm": 0.359375, + "learning_rate": 0.0012475186083339552, + "loss": 0.123, + "step": 43522 + }, + { + "epoch": 0.3778005399258687, + "grad_norm": 0.203125, + "learning_rate": 0.0012474892483228114, + "loss": 0.0752, + "step": 43523 + }, + { + "epoch": 0.37780922040607284, + "grad_norm": 0.298828125, + "learning_rate": 0.0012474598881503753, + "loss": 0.0928, + "step": 43524 + }, + { + "epoch": 0.37781790088627704, + "grad_norm": 0.24609375, + "learning_rate": 0.0012474305278166791, + "loss": 0.0879, + "step": 43525 + }, + { + "epoch": 0.3778265813664812, + "grad_norm": 0.41796875, + "learning_rate": 0.0012474011673217554, + "loss": 0.0996, + "step": 43526 + }, + { + "epoch": 0.37783526184668537, + "grad_norm": 0.1513671875, + "learning_rate": 0.001247371806665636, + "loss": 0.0903, + "step": 43527 + }, + { + "epoch": 0.3778439423268895, + "grad_norm": 0.400390625, + "learning_rate": 0.0012473424458483528, + "loss": 0.0947, + "step": 43528 + }, + { + "epoch": 0.3778526228070937, + "grad_norm": 0.103515625, + "learning_rate": 0.0012473130848699384, + "loss": 0.1045, + "step": 43529 + }, + { + "epoch": 0.37786130328729783, + "grad_norm": 0.21875, + "learning_rate": 0.0012472837237304244, + "loss": 0.0679, + "step": 43530 + }, + { + "epoch": 0.377869983767502, + "grad_norm": 0.2265625, + "learning_rate": 0.0012472543624298432, + "loss": 0.0977, + "step": 43531 + }, + { + "epoch": 0.37787866424770616, + "grad_norm": 0.87890625, + "learning_rate": 0.0012472250009682268, + "loss": 0.1475, + "step": 43532 + }, + { + "epoch": 0.37788734472791036, + "grad_norm": 0.1005859375, + "learning_rate": 0.0012471956393456075, + "loss": 0.0864, + "step": 43533 + }, + { + "epoch": 0.3778960252081145, + "grad_norm": 0.1669921875, + "learning_rate": 0.0012471662775620168, + "loss": 0.0713, + "step": 43534 + }, + { + "epoch": 0.3779047056883187, + "grad_norm": 0.396484375, + "learning_rate": 0.0012471369156174877, + "loss": 0.1299, + "step": 43535 + }, + { + "epoch": 0.3779133861685228, + "grad_norm": 0.1611328125, + "learning_rate": 0.001247107553512052, + 
"loss": 0.0918, + "step": 43536 + }, + { + "epoch": 0.377922066648727, + "grad_norm": 0.392578125, + "learning_rate": 0.0012470781912457417, + "loss": 0.0903, + "step": 43537 + }, + { + "epoch": 0.37793074712893115, + "grad_norm": 0.84375, + "learning_rate": 0.0012470488288185884, + "loss": 0.1123, + "step": 43538 + }, + { + "epoch": 0.37793942760913535, + "grad_norm": 0.14453125, + "learning_rate": 0.001247019466230625, + "loss": 0.082, + "step": 43539 + }, + { + "epoch": 0.3779481080893395, + "grad_norm": 0.19921875, + "learning_rate": 0.0012469901034818836, + "loss": 0.1138, + "step": 43540 + }, + { + "epoch": 0.3779567885695437, + "grad_norm": 0.5703125, + "learning_rate": 0.0012469607405723958, + "loss": 0.1162, + "step": 43541 + }, + { + "epoch": 0.3779654690497478, + "grad_norm": 0.2216796875, + "learning_rate": 0.0012469313775021937, + "loss": 0.1133, + "step": 43542 + }, + { + "epoch": 0.377974149529952, + "grad_norm": 0.39453125, + "learning_rate": 0.0012469020142713099, + "loss": 0.0957, + "step": 43543 + }, + { + "epoch": 0.37798283001015615, + "grad_norm": 0.1611328125, + "learning_rate": 0.001246872650879776, + "loss": 0.1021, + "step": 43544 + }, + { + "epoch": 0.37799151049036034, + "grad_norm": 0.5234375, + "learning_rate": 0.0012468432873276247, + "loss": 0.1055, + "step": 43545 + }, + { + "epoch": 0.3780001909705645, + "grad_norm": 0.2333984375, + "learning_rate": 0.0012468139236148877, + "loss": 0.1123, + "step": 43546 + }, + { + "epoch": 0.37800887145076867, + "grad_norm": 0.1171875, + "learning_rate": 0.001246784559741597, + "loss": 0.123, + "step": 43547 + }, + { + "epoch": 0.3780175519309728, + "grad_norm": 0.447265625, + "learning_rate": 0.0012467551957077855, + "loss": 0.1123, + "step": 43548 + }, + { + "epoch": 0.378026232411177, + "grad_norm": 0.1181640625, + "learning_rate": 0.0012467258315134841, + "loss": 0.0869, + "step": 43549 + }, + { + "epoch": 0.37803491289138114, + "grad_norm": 0.5234375, + "learning_rate": 0.001246696467158726, + "loss": 0.0957, + "step": 43550 + }, + { + "epoch": 0.37804359337158533, + "grad_norm": 0.08984375, + "learning_rate": 0.0012466671026435422, + "loss": 0.0767, + "step": 43551 + }, + { + "epoch": 0.37805227385178947, + "grad_norm": 1.109375, + "learning_rate": 0.0012466377379679661, + "loss": 0.1953, + "step": 43552 + }, + { + "epoch": 0.37806095433199366, + "grad_norm": 0.11328125, + "learning_rate": 0.0012466083731320287, + "loss": 0.1074, + "step": 43553 + }, + { + "epoch": 0.3780696348121978, + "grad_norm": 0.11572265625, + "learning_rate": 0.001246579008135763, + "loss": 0.104, + "step": 43554 + }, + { + "epoch": 0.378078315292402, + "grad_norm": 0.240234375, + "learning_rate": 0.0012465496429792005, + "loss": 0.0928, + "step": 43555 + }, + { + "epoch": 0.3780869957726061, + "grad_norm": 0.14453125, + "learning_rate": 0.0012465202776623734, + "loss": 0.1035, + "step": 43556 + }, + { + "epoch": 0.3780956762528103, + "grad_norm": 0.216796875, + "learning_rate": 0.001246490912185314, + "loss": 0.1016, + "step": 43557 + }, + { + "epoch": 0.37810435673301446, + "grad_norm": 0.28515625, + "learning_rate": 0.0012464615465480546, + "loss": 0.1055, + "step": 43558 + }, + { + "epoch": 0.37811303721321865, + "grad_norm": 0.416015625, + "learning_rate": 0.001246432180750627, + "loss": 0.0776, + "step": 43559 + }, + { + "epoch": 0.3781217176934228, + "grad_norm": 0.302734375, + "learning_rate": 0.0012464028147930629, + "loss": 0.0957, + "step": 43560 + }, + { + "epoch": 0.378130398173627, + "grad_norm": 0.197265625, + "learning_rate": 
0.0012463734486753953, + "loss": 0.1221, + "step": 43561 + }, + { + "epoch": 0.3781390786538311, + "grad_norm": 0.08740234375, + "learning_rate": 0.0012463440823976557, + "loss": 0.1309, + "step": 43562 + }, + { + "epoch": 0.3781477591340353, + "grad_norm": 0.13671875, + "learning_rate": 0.0012463147159598763, + "loss": 0.1123, + "step": 43563 + }, + { + "epoch": 0.37815643961423945, + "grad_norm": 0.111328125, + "learning_rate": 0.0012462853493620896, + "loss": 0.106, + "step": 43564 + }, + { + "epoch": 0.37816512009444364, + "grad_norm": 0.22265625, + "learning_rate": 0.0012462559826043275, + "loss": 0.0996, + "step": 43565 + }, + { + "epoch": 0.3781738005746478, + "grad_norm": 0.27734375, + "learning_rate": 0.001246226615686622, + "loss": 0.084, + "step": 43566 + }, + { + "epoch": 0.37818248105485197, + "grad_norm": 0.2431640625, + "learning_rate": 0.0012461972486090052, + "loss": 0.0986, + "step": 43567 + }, + { + "epoch": 0.3781911615350561, + "grad_norm": 0.232421875, + "learning_rate": 0.0012461678813715095, + "loss": 0.1055, + "step": 43568 + }, + { + "epoch": 0.3781998420152603, + "grad_norm": 0.09765625, + "learning_rate": 0.0012461385139741663, + "loss": 0.1113, + "step": 43569 + }, + { + "epoch": 0.37820852249546444, + "grad_norm": 0.08837890625, + "learning_rate": 0.0012461091464170085, + "loss": 0.1035, + "step": 43570 + }, + { + "epoch": 0.37821720297566863, + "grad_norm": 0.1484375, + "learning_rate": 0.001246079778700068, + "loss": 0.0708, + "step": 43571 + }, + { + "epoch": 0.37822588345587277, + "grad_norm": 0.11962890625, + "learning_rate": 0.0012460504108233767, + "loss": 0.0957, + "step": 43572 + }, + { + "epoch": 0.37823456393607696, + "grad_norm": 0.158203125, + "learning_rate": 0.0012460210427869666, + "loss": 0.1348, + "step": 43573 + }, + { + "epoch": 0.3782432444162811, + "grad_norm": 0.38671875, + "learning_rate": 0.001245991674590871, + "loss": 0.0791, + "step": 43574 + }, + { + "epoch": 0.3782519248964853, + "grad_norm": 0.4921875, + "learning_rate": 0.0012459623062351201, + "loss": 0.1128, + "step": 43575 + }, + { + "epoch": 0.37826060537668943, + "grad_norm": 0.376953125, + "learning_rate": 0.0012459329377197477, + "loss": 0.0537, + "step": 43576 + }, + { + "epoch": 0.3782692858568936, + "grad_norm": 0.6015625, + "learning_rate": 0.0012459035690447848, + "loss": 0.123, + "step": 43577 + }, + { + "epoch": 0.37827796633709776, + "grad_norm": 0.310546875, + "learning_rate": 0.001245874200210264, + "loss": 0.084, + "step": 43578 + }, + { + "epoch": 0.37828664681730195, + "grad_norm": 0.15625, + "learning_rate": 0.0012458448312162177, + "loss": 0.1143, + "step": 43579 + }, + { + "epoch": 0.3782953272975061, + "grad_norm": 0.1240234375, + "learning_rate": 0.0012458154620626776, + "loss": 0.0728, + "step": 43580 + }, + { + "epoch": 0.3783040077777103, + "grad_norm": 0.16796875, + "learning_rate": 0.0012457860927496756, + "loss": 0.1084, + "step": 43581 + }, + { + "epoch": 0.3783126882579144, + "grad_norm": 0.302734375, + "learning_rate": 0.0012457567232772443, + "loss": 0.0864, + "step": 43582 + }, + { + "epoch": 0.3783213687381186, + "grad_norm": 1.0546875, + "learning_rate": 0.0012457273536454158, + "loss": 0.1016, + "step": 43583 + }, + { + "epoch": 0.37833004921832275, + "grad_norm": 0.0869140625, + "learning_rate": 0.0012456979838542217, + "loss": 0.0913, + "step": 43584 + }, + { + "epoch": 0.37833872969852694, + "grad_norm": 0.12451171875, + "learning_rate": 0.0012456686139036947, + "loss": 0.0981, + "step": 43585 + }, + { + "epoch": 0.3783474101787311, + 
"grad_norm": 0.1396484375, + "learning_rate": 0.0012456392437938666, + "loss": 0.127, + "step": 43586 + }, + { + "epoch": 0.3783560906589353, + "grad_norm": 0.69921875, + "learning_rate": 0.0012456098735247696, + "loss": 0.1025, + "step": 43587 + }, + { + "epoch": 0.3783647711391394, + "grad_norm": 0.400390625, + "learning_rate": 0.001245580503096436, + "loss": 0.1289, + "step": 43588 + }, + { + "epoch": 0.3783734516193436, + "grad_norm": 0.1806640625, + "learning_rate": 0.0012455511325088976, + "loss": 0.0762, + "step": 43589 + }, + { + "epoch": 0.37838213209954774, + "grad_norm": 0.1416015625, + "learning_rate": 0.0012455217617621866, + "loss": 0.1162, + "step": 43590 + }, + { + "epoch": 0.37839081257975193, + "grad_norm": 0.203125, + "learning_rate": 0.0012454923908563353, + "loss": 0.0952, + "step": 43591 + }, + { + "epoch": 0.37839949305995607, + "grad_norm": 0.5078125, + "learning_rate": 0.0012454630197913756, + "loss": 0.0938, + "step": 43592 + }, + { + "epoch": 0.37840817354016026, + "grad_norm": 0.27734375, + "learning_rate": 0.0012454336485673398, + "loss": 0.1152, + "step": 43593 + }, + { + "epoch": 0.3784168540203644, + "grad_norm": 0.265625, + "learning_rate": 0.0012454042771842602, + "loss": 0.0981, + "step": 43594 + }, + { + "epoch": 0.3784255345005686, + "grad_norm": 0.2890625, + "learning_rate": 0.0012453749056421683, + "loss": 0.0903, + "step": 43595 + }, + { + "epoch": 0.37843421498077273, + "grad_norm": 0.1435546875, + "learning_rate": 0.0012453455339410968, + "loss": 0.1079, + "step": 43596 + }, + { + "epoch": 0.3784428954609769, + "grad_norm": 0.2060546875, + "learning_rate": 0.0012453161620810777, + "loss": 0.0859, + "step": 43597 + }, + { + "epoch": 0.37845157594118106, + "grad_norm": 0.34765625, + "learning_rate": 0.001245286790062143, + "loss": 0.1138, + "step": 43598 + }, + { + "epoch": 0.37846025642138525, + "grad_norm": 0.56640625, + "learning_rate": 0.0012452574178843246, + "loss": 0.1069, + "step": 43599 + }, + { + "epoch": 0.3784689369015894, + "grad_norm": 0.173828125, + "learning_rate": 0.0012452280455476548, + "loss": 0.0957, + "step": 43600 + }, + { + "epoch": 0.3784776173817936, + "grad_norm": 0.404296875, + "learning_rate": 0.001245198673052166, + "loss": 0.0977, + "step": 43601 + }, + { + "epoch": 0.3784862978619977, + "grad_norm": 0.205078125, + "learning_rate": 0.0012451693003978901, + "loss": 0.126, + "step": 43602 + }, + { + "epoch": 0.3784949783422019, + "grad_norm": 0.13671875, + "learning_rate": 0.001245139927584859, + "loss": 0.1064, + "step": 43603 + }, + { + "epoch": 0.37850365882240605, + "grad_norm": 0.087890625, + "learning_rate": 0.0012451105546131055, + "loss": 0.0869, + "step": 43604 + }, + { + "epoch": 0.37851233930261025, + "grad_norm": 0.322265625, + "learning_rate": 0.001245081181482661, + "loss": 0.0918, + "step": 43605 + }, + { + "epoch": 0.3785210197828144, + "grad_norm": 0.1279296875, + "learning_rate": 0.001245051808193558, + "loss": 0.085, + "step": 43606 + }, + { + "epoch": 0.3785297002630186, + "grad_norm": 0.69921875, + "learning_rate": 0.0012450224347458285, + "loss": 0.0762, + "step": 43607 + }, + { + "epoch": 0.3785383807432227, + "grad_norm": 0.4453125, + "learning_rate": 0.0012449930611395047, + "loss": 0.0801, + "step": 43608 + }, + { + "epoch": 0.3785470612234269, + "grad_norm": 0.0927734375, + "learning_rate": 0.0012449636873746184, + "loss": 0.1436, + "step": 43609 + }, + { + "epoch": 0.37855574170363104, + "grad_norm": 0.44140625, + "learning_rate": 0.0012449343134512023, + "loss": 0.1157, + "step": 43610 + }, + 
{ + "epoch": 0.37856442218383524, + "grad_norm": 0.3828125, + "learning_rate": 0.0012449049393692879, + "loss": 0.0903, + "step": 43611 + }, + { + "epoch": 0.3785731026640394, + "grad_norm": 0.67578125, + "learning_rate": 0.001244875565128908, + "loss": 0.1055, + "step": 43612 + }, + { + "epoch": 0.37858178314424357, + "grad_norm": 0.3515625, + "learning_rate": 0.0012448461907300942, + "loss": 0.1108, + "step": 43613 + }, + { + "epoch": 0.3785904636244477, + "grad_norm": 0.458984375, + "learning_rate": 0.0012448168161728788, + "loss": 0.1309, + "step": 43614 + }, + { + "epoch": 0.3785991441046519, + "grad_norm": 0.1689453125, + "learning_rate": 0.001244787441457294, + "loss": 0.0986, + "step": 43615 + }, + { + "epoch": 0.37860782458485603, + "grad_norm": 0.80078125, + "learning_rate": 0.0012447580665833715, + "loss": 0.1104, + "step": 43616 + }, + { + "epoch": 0.37861650506506017, + "grad_norm": 0.12451171875, + "learning_rate": 0.0012447286915511439, + "loss": 0.0781, + "step": 43617 + }, + { + "epoch": 0.37862518554526436, + "grad_norm": 0.36328125, + "learning_rate": 0.0012446993163606436, + "loss": 0.1099, + "step": 43618 + }, + { + "epoch": 0.3786338660254685, + "grad_norm": 0.119140625, + "learning_rate": 0.0012446699410119015, + "loss": 0.1216, + "step": 43619 + }, + { + "epoch": 0.3786425465056727, + "grad_norm": 0.1728515625, + "learning_rate": 0.0012446405655049512, + "loss": 0.1016, + "step": 43620 + }, + { + "epoch": 0.37865122698587683, + "grad_norm": 0.306640625, + "learning_rate": 0.0012446111898398236, + "loss": 0.1074, + "step": 43621 + }, + { + "epoch": 0.378659907466081, + "grad_norm": 0.1669921875, + "learning_rate": 0.001244581814016552, + "loss": 0.0913, + "step": 43622 + }, + { + "epoch": 0.37866858794628516, + "grad_norm": 0.42578125, + "learning_rate": 0.0012445524380351675, + "loss": 0.126, + "step": 43623 + }, + { + "epoch": 0.37867726842648936, + "grad_norm": 0.458984375, + "learning_rate": 0.0012445230618957027, + "loss": 0.1094, + "step": 43624 + }, + { + "epoch": 0.3786859489066935, + "grad_norm": 0.1953125, + "learning_rate": 0.0012444936855981897, + "loss": 0.0854, + "step": 43625 + }, + { + "epoch": 0.3786946293868977, + "grad_norm": 0.494140625, + "learning_rate": 0.0012444643091426607, + "loss": 0.0771, + "step": 43626 + }, + { + "epoch": 0.3787033098671018, + "grad_norm": 0.142578125, + "learning_rate": 0.0012444349325291474, + "loss": 0.1035, + "step": 43627 + }, + { + "epoch": 0.378711990347306, + "grad_norm": 0.1416015625, + "learning_rate": 0.0012444055557576826, + "loss": 0.1001, + "step": 43628 + }, + { + "epoch": 0.37872067082751015, + "grad_norm": 0.11279296875, + "learning_rate": 0.0012443761788282976, + "loss": 0.0869, + "step": 43629 + }, + { + "epoch": 0.37872935130771435, + "grad_norm": 0.1748046875, + "learning_rate": 0.0012443468017410248, + "loss": 0.0771, + "step": 43630 + }, + { + "epoch": 0.3787380317879185, + "grad_norm": 0.4453125, + "learning_rate": 0.0012443174244958972, + "loss": 0.1455, + "step": 43631 + }, + { + "epoch": 0.3787467122681227, + "grad_norm": 0.22265625, + "learning_rate": 0.0012442880470929462, + "loss": 0.1167, + "step": 43632 + }, + { + "epoch": 0.3787553927483268, + "grad_norm": 0.169921875, + "learning_rate": 0.0012442586695322036, + "loss": 0.0791, + "step": 43633 + }, + { + "epoch": 0.378764073228531, + "grad_norm": 0.78515625, + "learning_rate": 0.001244229291813702, + "loss": 0.1201, + "step": 43634 + }, + { + "epoch": 0.37877275370873514, + "grad_norm": 0.197265625, + "learning_rate": 
0.0012441999139374733, + "loss": 0.0732, + "step": 43635 + }, + { + "epoch": 0.37878143418893934, + "grad_norm": 0.10107421875, + "learning_rate": 0.00124417053590355, + "loss": 0.0767, + "step": 43636 + }, + { + "epoch": 0.3787901146691435, + "grad_norm": 0.09716796875, + "learning_rate": 0.0012441411577119638, + "loss": 0.0967, + "step": 43637 + }, + { + "epoch": 0.37879879514934767, + "grad_norm": 0.1474609375, + "learning_rate": 0.0012441117793627469, + "loss": 0.1064, + "step": 43638 + }, + { + "epoch": 0.3788074756295518, + "grad_norm": 0.0751953125, + "learning_rate": 0.0012440824008559314, + "loss": 0.0684, + "step": 43639 + }, + { + "epoch": 0.378816156109756, + "grad_norm": 0.337890625, + "learning_rate": 0.0012440530221915498, + "loss": 0.0874, + "step": 43640 + }, + { + "epoch": 0.37882483658996013, + "grad_norm": 0.4765625, + "learning_rate": 0.0012440236433696344, + "loss": 0.1621, + "step": 43641 + }, + { + "epoch": 0.3788335170701643, + "grad_norm": 0.75390625, + "learning_rate": 0.0012439942643902163, + "loss": 0.0967, + "step": 43642 + }, + { + "epoch": 0.37884219755036846, + "grad_norm": 0.28125, + "learning_rate": 0.0012439648852533283, + "loss": 0.0679, + "step": 43643 + }, + { + "epoch": 0.37885087803057266, + "grad_norm": 0.3046875, + "learning_rate": 0.0012439355059590026, + "loss": 0.1709, + "step": 43644 + }, + { + "epoch": 0.3788595585107768, + "grad_norm": 0.298828125, + "learning_rate": 0.0012439061265072712, + "loss": 0.1143, + "step": 43645 + }, + { + "epoch": 0.378868238990981, + "grad_norm": 0.1533203125, + "learning_rate": 0.001243876746898166, + "loss": 0.1104, + "step": 43646 + }, + { + "epoch": 0.3788769194711851, + "grad_norm": 0.28515625, + "learning_rate": 0.00124384736713172, + "loss": 0.1406, + "step": 43647 + }, + { + "epoch": 0.3788855999513893, + "grad_norm": 0.248046875, + "learning_rate": 0.001243817987207964, + "loss": 0.127, + "step": 43648 + }, + { + "epoch": 0.37889428043159346, + "grad_norm": 0.546875, + "learning_rate": 0.0012437886071269308, + "loss": 0.1045, + "step": 43649 + }, + { + "epoch": 0.37890296091179765, + "grad_norm": 0.2412109375, + "learning_rate": 0.0012437592268886531, + "loss": 0.1069, + "step": 43650 + }, + { + "epoch": 0.3789116413920018, + "grad_norm": 0.099609375, + "learning_rate": 0.0012437298464931619, + "loss": 0.1201, + "step": 43651 + }, + { + "epoch": 0.378920321872206, + "grad_norm": 0.369140625, + "learning_rate": 0.0012437004659404903, + "loss": 0.0601, + "step": 43652 + }, + { + "epoch": 0.3789290023524101, + "grad_norm": 0.08544921875, + "learning_rate": 0.0012436710852306695, + "loss": 0.0781, + "step": 43653 + }, + { + "epoch": 0.3789376828326143, + "grad_norm": 0.193359375, + "learning_rate": 0.001243641704363733, + "loss": 0.0762, + "step": 43654 + }, + { + "epoch": 0.37894636331281845, + "grad_norm": 0.39453125, + "learning_rate": 0.0012436123233397114, + "loss": 0.1016, + "step": 43655 + }, + { + "epoch": 0.37895504379302264, + "grad_norm": 0.412109375, + "learning_rate": 0.0012435829421586378, + "loss": 0.125, + "step": 43656 + }, + { + "epoch": 0.3789637242732268, + "grad_norm": 0.2412109375, + "learning_rate": 0.001243553560820544, + "loss": 0.1055, + "step": 43657 + }, + { + "epoch": 0.37897240475343097, + "grad_norm": 0.1708984375, + "learning_rate": 0.0012435241793254622, + "loss": 0.0811, + "step": 43658 + }, + { + "epoch": 0.3789810852336351, + "grad_norm": 0.08935546875, + "learning_rate": 0.0012434947976734242, + "loss": 0.0967, + "step": 43659 + }, + { + "epoch": 0.3789897657138393, + 
"grad_norm": 0.18359375, + "learning_rate": 0.0012434654158644627, + "loss": 0.1445, + "step": 43660 + }, + { + "epoch": 0.37899844619404344, + "grad_norm": 0.416015625, + "learning_rate": 0.0012434360338986097, + "loss": 0.1338, + "step": 43661 + }, + { + "epoch": 0.37900712667424763, + "grad_norm": 0.16015625, + "learning_rate": 0.0012434066517758972, + "loss": 0.0713, + "step": 43662 + }, + { + "epoch": 0.37901580715445177, + "grad_norm": 0.279296875, + "learning_rate": 0.0012433772694963572, + "loss": 0.0791, + "step": 43663 + }, + { + "epoch": 0.37902448763465596, + "grad_norm": 0.4609375, + "learning_rate": 0.0012433478870600221, + "loss": 0.0889, + "step": 43664 + }, + { + "epoch": 0.3790331681148601, + "grad_norm": 0.796875, + "learning_rate": 0.001243318504466924, + "loss": 0.1177, + "step": 43665 + }, + { + "epoch": 0.3790418485950643, + "grad_norm": 0.111328125, + "learning_rate": 0.0012432891217170948, + "loss": 0.1016, + "step": 43666 + }, + { + "epoch": 0.3790505290752684, + "grad_norm": 0.09130859375, + "learning_rate": 0.0012432597388105666, + "loss": 0.1133, + "step": 43667 + }, + { + "epoch": 0.3790592095554726, + "grad_norm": 0.09423828125, + "learning_rate": 0.0012432303557473718, + "loss": 0.0874, + "step": 43668 + }, + { + "epoch": 0.37906789003567676, + "grad_norm": 0.1337890625, + "learning_rate": 0.0012432009725275423, + "loss": 0.1094, + "step": 43669 + }, + { + "epoch": 0.37907657051588095, + "grad_norm": 0.314453125, + "learning_rate": 0.0012431715891511107, + "loss": 0.1133, + "step": 43670 + }, + { + "epoch": 0.3790852509960851, + "grad_norm": 0.17578125, + "learning_rate": 0.0012431422056181087, + "loss": 0.0889, + "step": 43671 + }, + { + "epoch": 0.3790939314762893, + "grad_norm": 0.2060546875, + "learning_rate": 0.0012431128219285684, + "loss": 0.0977, + "step": 43672 + }, + { + "epoch": 0.3791026119564934, + "grad_norm": 0.1064453125, + "learning_rate": 0.0012430834380825222, + "loss": 0.0564, + "step": 43673 + }, + { + "epoch": 0.3791112924366976, + "grad_norm": 0.30078125, + "learning_rate": 0.001243054054080002, + "loss": 0.1055, + "step": 43674 + }, + { + "epoch": 0.37911997291690175, + "grad_norm": 0.11083984375, + "learning_rate": 0.0012430246699210402, + "loss": 0.0859, + "step": 43675 + }, + { + "epoch": 0.37912865339710594, + "grad_norm": 0.197265625, + "learning_rate": 0.0012429952856056687, + "loss": 0.123, + "step": 43676 + }, + { + "epoch": 0.3791373338773101, + "grad_norm": 0.10400390625, + "learning_rate": 0.0012429659011339196, + "loss": 0.0811, + "step": 43677 + }, + { + "epoch": 0.37914601435751427, + "grad_norm": 0.1943359375, + "learning_rate": 0.0012429365165058251, + "loss": 0.1309, + "step": 43678 + }, + { + "epoch": 0.3791546948377184, + "grad_norm": 0.5859375, + "learning_rate": 0.0012429071317214175, + "loss": 0.104, + "step": 43679 + }, + { + "epoch": 0.3791633753179226, + "grad_norm": 0.75, + "learning_rate": 0.001242877746780729, + "loss": 0.0918, + "step": 43680 + }, + { + "epoch": 0.37917205579812674, + "grad_norm": 0.455078125, + "learning_rate": 0.0012428483616837912, + "loss": 0.0972, + "step": 43681 + }, + { + "epoch": 0.37918073627833093, + "grad_norm": 0.271484375, + "learning_rate": 0.0012428189764306366, + "loss": 0.1055, + "step": 43682 + }, + { + "epoch": 0.37918941675853507, + "grad_norm": 0.326171875, + "learning_rate": 0.0012427895910212974, + "loss": 0.0801, + "step": 43683 + }, + { + "epoch": 0.37919809723873926, + "grad_norm": 0.25, + "learning_rate": 0.0012427602054558059, + "loss": 0.1025, + "step": 43684 + 
}, + { + "epoch": 0.3792067777189434, + "grad_norm": 0.734375, + "learning_rate": 0.0012427308197341936, + "loss": 0.0947, + "step": 43685 + }, + { + "epoch": 0.3792154581991476, + "grad_norm": 0.234375, + "learning_rate": 0.0012427014338564932, + "loss": 0.1348, + "step": 43686 + }, + { + "epoch": 0.37922413867935173, + "grad_norm": 0.0908203125, + "learning_rate": 0.0012426720478227364, + "loss": 0.0923, + "step": 43687 + }, + { + "epoch": 0.3792328191595559, + "grad_norm": 0.212890625, + "learning_rate": 0.0012426426616329559, + "loss": 0.0859, + "step": 43688 + }, + { + "epoch": 0.37924149963976006, + "grad_norm": 0.216796875, + "learning_rate": 0.0012426132752871835, + "loss": 0.0962, + "step": 43689 + }, + { + "epoch": 0.37925018011996425, + "grad_norm": 0.1435546875, + "learning_rate": 0.0012425838887854513, + "loss": 0.1318, + "step": 43690 + }, + { + "epoch": 0.3792588606001684, + "grad_norm": 0.400390625, + "learning_rate": 0.0012425545021277913, + "loss": 0.1152, + "step": 43691 + }, + { + "epoch": 0.3792675410803726, + "grad_norm": 0.333984375, + "learning_rate": 0.0012425251153142362, + "loss": 0.0889, + "step": 43692 + }, + { + "epoch": 0.3792762215605767, + "grad_norm": 0.1494140625, + "learning_rate": 0.0012424957283448178, + "loss": 0.0874, + "step": 43693 + }, + { + "epoch": 0.3792849020407809, + "grad_norm": 1.09375, + "learning_rate": 0.0012424663412195678, + "loss": 0.0854, + "step": 43694 + }, + { + "epoch": 0.37929358252098505, + "grad_norm": 0.361328125, + "learning_rate": 0.0012424369539385189, + "loss": 0.1104, + "step": 43695 + }, + { + "epoch": 0.37930226300118924, + "grad_norm": 0.1513671875, + "learning_rate": 0.0012424075665017035, + "loss": 0.0977, + "step": 43696 + }, + { + "epoch": 0.3793109434813934, + "grad_norm": 0.51171875, + "learning_rate": 0.0012423781789091528, + "loss": 0.1445, + "step": 43697 + }, + { + "epoch": 0.3793196239615976, + "grad_norm": 0.1865234375, + "learning_rate": 0.0012423487911608995, + "loss": 0.1016, + "step": 43698 + }, + { + "epoch": 0.3793283044418017, + "grad_norm": 0.1591796875, + "learning_rate": 0.001242319403256976, + "loss": 0.1143, + "step": 43699 + }, + { + "epoch": 0.3793369849220059, + "grad_norm": 1.4921875, + "learning_rate": 0.0012422900151974141, + "loss": 0.2656, + "step": 43700 + }, + { + "epoch": 0.37934566540221004, + "grad_norm": 0.494140625, + "learning_rate": 0.0012422606269822459, + "loss": 0.0771, + "step": 43701 + }, + { + "epoch": 0.37935434588241423, + "grad_norm": 0.09228515625, + "learning_rate": 0.0012422312386115037, + "loss": 0.1055, + "step": 43702 + }, + { + "epoch": 0.37936302636261837, + "grad_norm": 0.515625, + "learning_rate": 0.0012422018500852195, + "loss": 0.1426, + "step": 43703 + }, + { + "epoch": 0.37937170684282256, + "grad_norm": 0.10009765625, + "learning_rate": 0.0012421724614034254, + "loss": 0.0923, + "step": 43704 + }, + { + "epoch": 0.3793803873230267, + "grad_norm": 0.19921875, + "learning_rate": 0.0012421430725661536, + "loss": 0.1279, + "step": 43705 + }, + { + "epoch": 0.3793890678032309, + "grad_norm": 0.0849609375, + "learning_rate": 0.0012421136835734361, + "loss": 0.0894, + "step": 43706 + }, + { + "epoch": 0.37939774828343503, + "grad_norm": 0.12451171875, + "learning_rate": 0.0012420842944253054, + "loss": 0.0913, + "step": 43707 + }, + { + "epoch": 0.3794064287636392, + "grad_norm": 0.22265625, + "learning_rate": 0.0012420549051217937, + "loss": 0.1147, + "step": 43708 + }, + { + "epoch": 0.37941510924384336, + "grad_norm": 0.6953125, + "learning_rate": 
0.0012420255156629328, + "loss": 0.0859, + "step": 43709 + }, + { + "epoch": 0.37942378972404756, + "grad_norm": 0.640625, + "learning_rate": 0.001241996126048755, + "loss": 0.0908, + "step": 43710 + }, + { + "epoch": 0.3794324702042517, + "grad_norm": 0.09814453125, + "learning_rate": 0.0012419667362792918, + "loss": 0.0889, + "step": 43711 + }, + { + "epoch": 0.3794411506844559, + "grad_norm": 0.26171875, + "learning_rate": 0.0012419373463545763, + "loss": 0.0957, + "step": 43712 + }, + { + "epoch": 0.37944983116466, + "grad_norm": 0.4921875, + "learning_rate": 0.0012419079562746404, + "loss": 0.1484, + "step": 43713 + }, + { + "epoch": 0.3794585116448642, + "grad_norm": 0.1298828125, + "learning_rate": 0.001241878566039516, + "loss": 0.0654, + "step": 43714 + }, + { + "epoch": 0.37946719212506835, + "grad_norm": 0.08056640625, + "learning_rate": 0.0012418491756492351, + "loss": 0.0781, + "step": 43715 + }, + { + "epoch": 0.37947587260527255, + "grad_norm": 0.22265625, + "learning_rate": 0.0012418197851038304, + "loss": 0.1104, + "step": 43716 + }, + { + "epoch": 0.3794845530854767, + "grad_norm": 1.109375, + "learning_rate": 0.0012417903944033336, + "loss": 0.1328, + "step": 43717 + }, + { + "epoch": 0.3794932335656809, + "grad_norm": 0.322265625, + "learning_rate": 0.001241761003547777, + "loss": 0.1045, + "step": 43718 + }, + { + "epoch": 0.379501914045885, + "grad_norm": 0.2099609375, + "learning_rate": 0.0012417316125371927, + "loss": 0.0947, + "step": 43719 + }, + { + "epoch": 0.3795105945260892, + "grad_norm": 0.259765625, + "learning_rate": 0.0012417022213716126, + "loss": 0.0962, + "step": 43720 + }, + { + "epoch": 0.37951927500629334, + "grad_norm": 0.11962890625, + "learning_rate": 0.0012416728300510692, + "loss": 0.0874, + "step": 43721 + }, + { + "epoch": 0.37952795548649754, + "grad_norm": 0.515625, + "learning_rate": 0.0012416434385755946, + "loss": 0.0645, + "step": 43722 + }, + { + "epoch": 0.3795366359667017, + "grad_norm": 0.265625, + "learning_rate": 0.001241614046945221, + "loss": 0.0913, + "step": 43723 + }, + { + "epoch": 0.37954531644690587, + "grad_norm": 0.474609375, + "learning_rate": 0.0012415846551599799, + "loss": 0.084, + "step": 43724 + }, + { + "epoch": 0.37955399692711, + "grad_norm": 0.13671875, + "learning_rate": 0.0012415552632199043, + "loss": 0.0928, + "step": 43725 + }, + { + "epoch": 0.3795626774073142, + "grad_norm": 0.12060546875, + "learning_rate": 0.0012415258711250261, + "loss": 0.1035, + "step": 43726 + }, + { + "epoch": 0.37957135788751833, + "grad_norm": 0.19921875, + "learning_rate": 0.001241496478875377, + "loss": 0.1143, + "step": 43727 + }, + { + "epoch": 0.3795800383677225, + "grad_norm": 0.458984375, + "learning_rate": 0.0012414670864709899, + "loss": 0.1182, + "step": 43728 + }, + { + "epoch": 0.37958871884792666, + "grad_norm": 0.88671875, + "learning_rate": 0.001241437693911896, + "loss": 0.1074, + "step": 43729 + }, + { + "epoch": 0.37959739932813086, + "grad_norm": 0.271484375, + "learning_rate": 0.0012414083011981283, + "loss": 0.0981, + "step": 43730 + }, + { + "epoch": 0.379606079808335, + "grad_norm": 0.10498046875, + "learning_rate": 0.001241378908329719, + "loss": 0.0825, + "step": 43731 + }, + { + "epoch": 0.3796147602885392, + "grad_norm": 0.1875, + "learning_rate": 0.0012413495153066993, + "loss": 0.082, + "step": 43732 + }, + { + "epoch": 0.3796234407687433, + "grad_norm": 0.57421875, + "learning_rate": 0.0012413201221291017, + "loss": 0.1162, + "step": 43733 + }, + { + "epoch": 0.3796321212489475, + "grad_norm": 
0.0859375, + "learning_rate": 0.0012412907287969593, + "loss": 0.0967, + "step": 43734 + }, + { + "epoch": 0.37964080172915166, + "grad_norm": 0.1806640625, + "learning_rate": 0.0012412613353103025, + "loss": 0.0981, + "step": 43735 + }, + { + "epoch": 0.37964948220935585, + "grad_norm": 0.1533203125, + "learning_rate": 0.0012412319416691654, + "loss": 0.0811, + "step": 43736 + }, + { + "epoch": 0.37965816268956, + "grad_norm": 0.1787109375, + "learning_rate": 0.0012412025478735784, + "loss": 0.1387, + "step": 43737 + }, + { + "epoch": 0.3796668431697642, + "grad_norm": 0.57421875, + "learning_rate": 0.001241173153923575, + "loss": 0.0977, + "step": 43738 + }, + { + "epoch": 0.3796755236499683, + "grad_norm": 0.171875, + "learning_rate": 0.0012411437598191864, + "loss": 0.0889, + "step": 43739 + }, + { + "epoch": 0.37968420413017245, + "grad_norm": 0.10107421875, + "learning_rate": 0.0012411143655604454, + "loss": 0.1094, + "step": 43740 + }, + { + "epoch": 0.37969288461037665, + "grad_norm": 0.2431640625, + "learning_rate": 0.0012410849711473836, + "loss": 0.0889, + "step": 43741 + }, + { + "epoch": 0.3797015650905808, + "grad_norm": 0.1396484375, + "learning_rate": 0.0012410555765800333, + "loss": 0.0752, + "step": 43742 + }, + { + "epoch": 0.379710245570785, + "grad_norm": 0.1962890625, + "learning_rate": 0.001241026181858427, + "loss": 0.1445, + "step": 43743 + }, + { + "epoch": 0.3797189260509891, + "grad_norm": 0.12060546875, + "learning_rate": 0.0012409967869825963, + "loss": 0.1035, + "step": 43744 + }, + { + "epoch": 0.3797276065311933, + "grad_norm": 0.0654296875, + "learning_rate": 0.0012409673919525739, + "loss": 0.0752, + "step": 43745 + }, + { + "epoch": 0.37973628701139744, + "grad_norm": 0.0966796875, + "learning_rate": 0.0012409379967683917, + "loss": 0.0942, + "step": 43746 + }, + { + "epoch": 0.37974496749160164, + "grad_norm": 0.58984375, + "learning_rate": 0.0012409086014300817, + "loss": 0.123, + "step": 43747 + }, + { + "epoch": 0.3797536479718058, + "grad_norm": 0.25390625, + "learning_rate": 0.001240879205937676, + "loss": 0.1133, + "step": 43748 + }, + { + "epoch": 0.37976232845200997, + "grad_norm": 0.451171875, + "learning_rate": 0.0012408498102912072, + "loss": 0.1006, + "step": 43749 + }, + { + "epoch": 0.3797710089322141, + "grad_norm": 0.1416015625, + "learning_rate": 0.001240820414490707, + "loss": 0.1177, + "step": 43750 + }, + { + "epoch": 0.3797796894124183, + "grad_norm": 0.55859375, + "learning_rate": 0.001240791018536208, + "loss": 0.0898, + "step": 43751 + }, + { + "epoch": 0.37978836989262243, + "grad_norm": 0.158203125, + "learning_rate": 0.0012407616224277419, + "loss": 0.1055, + "step": 43752 + }, + { + "epoch": 0.37979705037282663, + "grad_norm": 0.10498046875, + "learning_rate": 0.0012407322261653408, + "loss": 0.0752, + "step": 43753 + }, + { + "epoch": 0.37980573085303077, + "grad_norm": 0.1826171875, + "learning_rate": 0.0012407028297490373, + "loss": 0.1475, + "step": 43754 + }, + { + "epoch": 0.37981441133323496, + "grad_norm": 0.1728515625, + "learning_rate": 0.0012406734331788629, + "loss": 0.0928, + "step": 43755 + }, + { + "epoch": 0.3798230918134391, + "grad_norm": 0.068359375, + "learning_rate": 0.0012406440364548506, + "loss": 0.0747, + "step": 43756 + }, + { + "epoch": 0.3798317722936433, + "grad_norm": 0.220703125, + "learning_rate": 0.0012406146395770319, + "loss": 0.0894, + "step": 43757 + }, + { + "epoch": 0.3798404527738474, + "grad_norm": 0.416015625, + "learning_rate": 0.0012405852425454395, + "loss": 0.1309, + "step": 43758 
+ }, + { + "epoch": 0.3798491332540516, + "grad_norm": 0.212890625, + "learning_rate": 0.0012405558453601044, + "loss": 0.1543, + "step": 43759 + }, + { + "epoch": 0.37985781373425576, + "grad_norm": 0.09423828125, + "learning_rate": 0.0012405264480210602, + "loss": 0.083, + "step": 43760 + }, + { + "epoch": 0.37986649421445995, + "grad_norm": 0.349609375, + "learning_rate": 0.0012404970505283384, + "loss": 0.1357, + "step": 43761 + }, + { + "epoch": 0.3798751746946641, + "grad_norm": 0.6484375, + "learning_rate": 0.0012404676528819708, + "loss": 0.1143, + "step": 43762 + }, + { + "epoch": 0.3798838551748683, + "grad_norm": 0.1591796875, + "learning_rate": 0.00124043825508199, + "loss": 0.1001, + "step": 43763 + }, + { + "epoch": 0.3798925356550724, + "grad_norm": 0.310546875, + "learning_rate": 0.0012404088571284278, + "loss": 0.0879, + "step": 43764 + }, + { + "epoch": 0.3799012161352766, + "grad_norm": 0.08740234375, + "learning_rate": 0.0012403794590213168, + "loss": 0.0967, + "step": 43765 + }, + { + "epoch": 0.37990989661548075, + "grad_norm": 0.1513671875, + "learning_rate": 0.001240350060760689, + "loss": 0.0698, + "step": 43766 + }, + { + "epoch": 0.37991857709568494, + "grad_norm": 0.10693359375, + "learning_rate": 0.0012403206623465763, + "loss": 0.0952, + "step": 43767 + }, + { + "epoch": 0.3799272575758891, + "grad_norm": 0.0830078125, + "learning_rate": 0.0012402912637790114, + "loss": 0.0771, + "step": 43768 + }, + { + "epoch": 0.37993593805609327, + "grad_norm": 0.5859375, + "learning_rate": 0.0012402618650580257, + "loss": 0.1191, + "step": 43769 + }, + { + "epoch": 0.3799446185362974, + "grad_norm": 0.134765625, + "learning_rate": 0.0012402324661836522, + "loss": 0.0718, + "step": 43770 + }, + { + "epoch": 0.3799532990165016, + "grad_norm": 0.2138671875, + "learning_rate": 0.0012402030671559224, + "loss": 0.1089, + "step": 43771 + }, + { + "epoch": 0.37996197949670574, + "grad_norm": 0.154296875, + "learning_rate": 0.0012401736679748684, + "loss": 0.1157, + "step": 43772 + }, + { + "epoch": 0.37997065997690993, + "grad_norm": 0.3515625, + "learning_rate": 0.0012401442686405227, + "loss": 0.0889, + "step": 43773 + }, + { + "epoch": 0.37997934045711407, + "grad_norm": 0.09765625, + "learning_rate": 0.0012401148691529172, + "loss": 0.1064, + "step": 43774 + }, + { + "epoch": 0.37998802093731826, + "grad_norm": 0.158203125, + "learning_rate": 0.0012400854695120843, + "loss": 0.1016, + "step": 43775 + }, + { + "epoch": 0.3799967014175224, + "grad_norm": 0.2333984375, + "learning_rate": 0.001240056069718056, + "loss": 0.0957, + "step": 43776 + }, + { + "epoch": 0.3800053818977266, + "grad_norm": 0.36328125, + "learning_rate": 0.0012400266697708644, + "loss": 0.1133, + "step": 43777 + }, + { + "epoch": 0.38001406237793073, + "grad_norm": 0.330078125, + "learning_rate": 0.001239997269670542, + "loss": 0.0859, + "step": 43778 + }, + { + "epoch": 0.3800227428581349, + "grad_norm": 0.10107421875, + "learning_rate": 0.0012399678694171206, + "loss": 0.1055, + "step": 43779 + }, + { + "epoch": 0.38003142333833906, + "grad_norm": 0.2373046875, + "learning_rate": 0.0012399384690106323, + "loss": 0.0645, + "step": 43780 + }, + { + "epoch": 0.38004010381854325, + "grad_norm": 0.94140625, + "learning_rate": 0.0012399090684511096, + "loss": 0.1221, + "step": 43781 + }, + { + "epoch": 0.3800487842987474, + "grad_norm": 0.52734375, + "learning_rate": 0.0012398796677385844, + "loss": 0.1348, + "step": 43782 + }, + { + "epoch": 0.3800574647789516, + "grad_norm": 0.1748046875, + "learning_rate": 
0.0012398502668730888, + "loss": 0.1035, + "step": 43783 + }, + { + "epoch": 0.3800661452591557, + "grad_norm": 0.10009765625, + "learning_rate": 0.001239820865854655, + "loss": 0.0913, + "step": 43784 + }, + { + "epoch": 0.3800748257393599, + "grad_norm": 0.1103515625, + "learning_rate": 0.0012397914646833153, + "loss": 0.1152, + "step": 43785 + }, + { + "epoch": 0.38008350621956405, + "grad_norm": 0.83984375, + "learning_rate": 0.0012397620633591018, + "loss": 0.1758, + "step": 43786 + }, + { + "epoch": 0.38009218669976824, + "grad_norm": 0.09716796875, + "learning_rate": 0.0012397326618820467, + "loss": 0.1035, + "step": 43787 + }, + { + "epoch": 0.3801008671799724, + "grad_norm": 0.298828125, + "learning_rate": 0.0012397032602521818, + "loss": 0.084, + "step": 43788 + }, + { + "epoch": 0.38010954766017657, + "grad_norm": 0.1474609375, + "learning_rate": 0.0012396738584695395, + "loss": 0.082, + "step": 43789 + }, + { + "epoch": 0.3801182281403807, + "grad_norm": 0.126953125, + "learning_rate": 0.001239644456534152, + "loss": 0.1426, + "step": 43790 + }, + { + "epoch": 0.3801269086205849, + "grad_norm": 0.5234375, + "learning_rate": 0.0012396150544460515, + "loss": 0.1245, + "step": 43791 + }, + { + "epoch": 0.38013558910078904, + "grad_norm": 0.12353515625, + "learning_rate": 0.0012395856522052701, + "loss": 0.1328, + "step": 43792 + }, + { + "epoch": 0.38014426958099323, + "grad_norm": 0.1923828125, + "learning_rate": 0.00123955624981184, + "loss": 0.0786, + "step": 43793 + }, + { + "epoch": 0.38015295006119737, + "grad_norm": 0.28125, + "learning_rate": 0.0012395268472657932, + "loss": 0.085, + "step": 43794 + }, + { + "epoch": 0.38016163054140156, + "grad_norm": 0.18359375, + "learning_rate": 0.001239497444567162, + "loss": 0.0889, + "step": 43795 + }, + { + "epoch": 0.3801703110216057, + "grad_norm": 0.13671875, + "learning_rate": 0.0012394680417159782, + "loss": 0.167, + "step": 43796 + }, + { + "epoch": 0.3801789915018099, + "grad_norm": 0.09814453125, + "learning_rate": 0.0012394386387122747, + "loss": 0.0874, + "step": 43797 + }, + { + "epoch": 0.38018767198201403, + "grad_norm": 0.7109375, + "learning_rate": 0.0012394092355560828, + "loss": 0.1143, + "step": 43798 + }, + { + "epoch": 0.3801963524622182, + "grad_norm": 0.365234375, + "learning_rate": 0.0012393798322474351, + "loss": 0.0996, + "step": 43799 + }, + { + "epoch": 0.38020503294242236, + "grad_norm": 0.08056640625, + "learning_rate": 0.0012393504287863642, + "loss": 0.0996, + "step": 43800 + }, + { + "epoch": 0.38021371342262655, + "grad_norm": 0.10498046875, + "learning_rate": 0.0012393210251729016, + "loss": 0.0791, + "step": 43801 + }, + { + "epoch": 0.3802223939028307, + "grad_norm": 0.1025390625, + "learning_rate": 0.001239291621407079, + "loss": 0.123, + "step": 43802 + }, + { + "epoch": 0.3802310743830349, + "grad_norm": 0.1884765625, + "learning_rate": 0.0012392622174889297, + "loss": 0.0762, + "step": 43803 + }, + { + "epoch": 0.380239754863239, + "grad_norm": 0.435546875, + "learning_rate": 0.0012392328134184852, + "loss": 0.1221, + "step": 43804 + }, + { + "epoch": 0.3802484353434432, + "grad_norm": 0.09619140625, + "learning_rate": 0.0012392034091957779, + "loss": 0.0938, + "step": 43805 + }, + { + "epoch": 0.38025711582364735, + "grad_norm": 1.375, + "learning_rate": 0.0012391740048208395, + "loss": 0.125, + "step": 43806 + }, + { + "epoch": 0.38026579630385154, + "grad_norm": 0.1015625, + "learning_rate": 0.0012391446002937027, + "loss": 0.1113, + "step": 43807 + }, + { + "epoch": 0.3802744767840557, + 
"grad_norm": 0.455078125, + "learning_rate": 0.0012391151956143995, + "loss": 0.1279, + "step": 43808 + }, + { + "epoch": 0.3802831572642599, + "grad_norm": 0.177734375, + "learning_rate": 0.0012390857907829623, + "loss": 0.1104, + "step": 43809 + }, + { + "epoch": 0.380291837744464, + "grad_norm": 0.08154296875, + "learning_rate": 0.0012390563857994229, + "loss": 0.0859, + "step": 43810 + }, + { + "epoch": 0.3803005182246682, + "grad_norm": 0.11474609375, + "learning_rate": 0.001239026980663813, + "loss": 0.1045, + "step": 43811 + }, + { + "epoch": 0.38030919870487234, + "grad_norm": 0.1318359375, + "learning_rate": 0.0012389975753761655, + "loss": 0.1016, + "step": 43812 + }, + { + "epoch": 0.38031787918507654, + "grad_norm": 0.12255859375, + "learning_rate": 0.0012389681699365126, + "loss": 0.1104, + "step": 43813 + }, + { + "epoch": 0.3803265596652807, + "grad_norm": 0.294921875, + "learning_rate": 0.001238938764344886, + "loss": 0.1152, + "step": 43814 + }, + { + "epoch": 0.38033524014548487, + "grad_norm": 0.193359375, + "learning_rate": 0.001238909358601318, + "loss": 0.1582, + "step": 43815 + }, + { + "epoch": 0.380343920625689, + "grad_norm": 0.69921875, + "learning_rate": 0.0012388799527058407, + "loss": 0.0923, + "step": 43816 + }, + { + "epoch": 0.3803526011058932, + "grad_norm": 0.1201171875, + "learning_rate": 0.001238850546658487, + "loss": 0.1001, + "step": 43817 + }, + { + "epoch": 0.38036128158609733, + "grad_norm": 0.486328125, + "learning_rate": 0.001238821140459288, + "loss": 0.0767, + "step": 43818 + }, + { + "epoch": 0.3803699620663015, + "grad_norm": 1.296875, + "learning_rate": 0.0012387917341082758, + "loss": 0.1177, + "step": 43819 + }, + { + "epoch": 0.38037864254650566, + "grad_norm": 0.1943359375, + "learning_rate": 0.0012387623276054838, + "loss": 0.0859, + "step": 43820 + }, + { + "epoch": 0.38038732302670986, + "grad_norm": 0.357421875, + "learning_rate": 0.0012387329209509433, + "loss": 0.0854, + "step": 43821 + }, + { + "epoch": 0.380396003506914, + "grad_norm": 0.515625, + "learning_rate": 0.0012387035141446863, + "loss": 0.0864, + "step": 43822 + }, + { + "epoch": 0.3804046839871182, + "grad_norm": 0.169921875, + "learning_rate": 0.0012386741071867453, + "loss": 0.061, + "step": 43823 + }, + { + "epoch": 0.3804133644673223, + "grad_norm": 0.1328125, + "learning_rate": 0.0012386447000771524, + "loss": 0.0825, + "step": 43824 + }, + { + "epoch": 0.3804220449475265, + "grad_norm": 0.294921875, + "learning_rate": 0.0012386152928159397, + "loss": 0.1045, + "step": 43825 + }, + { + "epoch": 0.38043072542773065, + "grad_norm": 0.51953125, + "learning_rate": 0.0012385858854031396, + "loss": 0.084, + "step": 43826 + }, + { + "epoch": 0.38043940590793485, + "grad_norm": 0.1435546875, + "learning_rate": 0.001238556477838784, + "loss": 0.1138, + "step": 43827 + }, + { + "epoch": 0.380448086388139, + "grad_norm": 0.33203125, + "learning_rate": 0.0012385270701229048, + "loss": 0.0791, + "step": 43828 + }, + { + "epoch": 0.3804567668683432, + "grad_norm": 0.423828125, + "learning_rate": 0.0012384976622555346, + "loss": 0.0718, + "step": 43829 + }, + { + "epoch": 0.3804654473485473, + "grad_norm": 0.291015625, + "learning_rate": 0.0012384682542367056, + "loss": 0.1396, + "step": 43830 + }, + { + "epoch": 0.3804741278287515, + "grad_norm": 0.298828125, + "learning_rate": 0.0012384388460664494, + "loss": 0.0933, + "step": 43831 + }, + { + "epoch": 0.38048280830895564, + "grad_norm": 0.2734375, + "learning_rate": 0.001238409437744799, + "loss": 0.1016, + "step": 43832 + 
}, + { + "epoch": 0.38049148878915984, + "grad_norm": 0.328125, + "learning_rate": 0.0012383800292717862, + "loss": 0.1025, + "step": 43833 + }, + { + "epoch": 0.380500169269364, + "grad_norm": 0.2490234375, + "learning_rate": 0.0012383506206474426, + "loss": 0.0713, + "step": 43834 + }, + { + "epoch": 0.38050884974956817, + "grad_norm": 0.1435546875, + "learning_rate": 0.0012383212118718015, + "loss": 0.124, + "step": 43835 + }, + { + "epoch": 0.3805175302297723, + "grad_norm": 0.1650390625, + "learning_rate": 0.001238291802944894, + "loss": 0.1147, + "step": 43836 + }, + { + "epoch": 0.3805262107099765, + "grad_norm": 0.244140625, + "learning_rate": 0.0012382623938667525, + "loss": 0.0908, + "step": 43837 + }, + { + "epoch": 0.38053489119018064, + "grad_norm": 0.3125, + "learning_rate": 0.0012382329846374097, + "loss": 0.0884, + "step": 43838 + }, + { + "epoch": 0.38054357167038483, + "grad_norm": 0.302734375, + "learning_rate": 0.001238203575256897, + "loss": 0.1045, + "step": 43839 + }, + { + "epoch": 0.38055225215058897, + "grad_norm": 0.75390625, + "learning_rate": 0.001238174165725247, + "loss": 0.125, + "step": 43840 + }, + { + "epoch": 0.38056093263079316, + "grad_norm": 0.1552734375, + "learning_rate": 0.0012381447560424921, + "loss": 0.1299, + "step": 43841 + }, + { + "epoch": 0.3805696131109973, + "grad_norm": 0.232421875, + "learning_rate": 0.0012381153462086639, + "loss": 0.0972, + "step": 43842 + }, + { + "epoch": 0.3805782935912015, + "grad_norm": 0.158203125, + "learning_rate": 0.001238085936223795, + "loss": 0.1001, + "step": 43843 + }, + { + "epoch": 0.3805869740714056, + "grad_norm": 0.2392578125, + "learning_rate": 0.0012380565260879173, + "loss": 0.0928, + "step": 43844 + }, + { + "epoch": 0.3805956545516098, + "grad_norm": 0.166015625, + "learning_rate": 0.0012380271158010632, + "loss": 0.1128, + "step": 43845 + }, + { + "epoch": 0.38060433503181396, + "grad_norm": 0.146484375, + "learning_rate": 0.0012379977053632646, + "loss": 0.1094, + "step": 43846 + }, + { + "epoch": 0.38061301551201815, + "grad_norm": 0.08837890625, + "learning_rate": 0.0012379682947745538, + "loss": 0.1084, + "step": 43847 + }, + { + "epoch": 0.3806216959922223, + "grad_norm": 0.11328125, + "learning_rate": 0.001237938884034963, + "loss": 0.082, + "step": 43848 + }, + { + "epoch": 0.3806303764724265, + "grad_norm": 0.3515625, + "learning_rate": 0.0012379094731445241, + "loss": 0.1025, + "step": 43849 + }, + { + "epoch": 0.3806390569526306, + "grad_norm": 0.6953125, + "learning_rate": 0.0012378800621032698, + "loss": 0.082, + "step": 43850 + }, + { + "epoch": 0.3806477374328348, + "grad_norm": 0.255859375, + "learning_rate": 0.0012378506509112318, + "loss": 0.1172, + "step": 43851 + }, + { + "epoch": 0.38065641791303895, + "grad_norm": 0.23046875, + "learning_rate": 0.0012378212395684422, + "loss": 0.1152, + "step": 43852 + }, + { + "epoch": 0.38066509839324314, + "grad_norm": 0.74609375, + "learning_rate": 0.0012377918280749337, + "loss": 0.1084, + "step": 43853 + }, + { + "epoch": 0.3806737788734473, + "grad_norm": 0.107421875, + "learning_rate": 0.0012377624164307377, + "loss": 0.1113, + "step": 43854 + }, + { + "epoch": 0.38068245935365147, + "grad_norm": 0.208984375, + "learning_rate": 0.0012377330046358871, + "loss": 0.1094, + "step": 43855 + }, + { + "epoch": 0.3806911398338556, + "grad_norm": 0.65234375, + "learning_rate": 0.0012377035926904138, + "loss": 0.1328, + "step": 43856 + }, + { + "epoch": 0.3806998203140598, + "grad_norm": 0.1650390625, + "learning_rate": 
0.0012376741805943498, + "loss": 0.084, + "step": 43857 + }, + { + "epoch": 0.38070850079426394, + "grad_norm": 0.11181640625, + "learning_rate": 0.0012376447683477274, + "loss": 0.1016, + "step": 43858 + }, + { + "epoch": 0.38071718127446813, + "grad_norm": 0.4296875, + "learning_rate": 0.001237615355950579, + "loss": 0.0894, + "step": 43859 + }, + { + "epoch": 0.38072586175467227, + "grad_norm": 0.53515625, + "learning_rate": 0.0012375859434029359, + "loss": 0.1216, + "step": 43860 + }, + { + "epoch": 0.38073454223487646, + "grad_norm": 0.22265625, + "learning_rate": 0.0012375565307048313, + "loss": 0.0986, + "step": 43861 + }, + { + "epoch": 0.3807432227150806, + "grad_norm": 0.5, + "learning_rate": 0.001237527117856297, + "loss": 0.1094, + "step": 43862 + }, + { + "epoch": 0.38075190319528474, + "grad_norm": 0.333984375, + "learning_rate": 0.001237497704857365, + "loss": 0.1084, + "step": 43863 + }, + { + "epoch": 0.38076058367548893, + "grad_norm": 0.283203125, + "learning_rate": 0.0012374682917080675, + "loss": 0.1143, + "step": 43864 + }, + { + "epoch": 0.38076926415569307, + "grad_norm": 0.51171875, + "learning_rate": 0.001237438878408437, + "loss": 0.0835, + "step": 43865 + }, + { + "epoch": 0.38077794463589726, + "grad_norm": 0.484375, + "learning_rate": 0.0012374094649585052, + "loss": 0.1196, + "step": 43866 + }, + { + "epoch": 0.3807866251161014, + "grad_norm": 0.123046875, + "learning_rate": 0.0012373800513583045, + "loss": 0.1064, + "step": 43867 + }, + { + "epoch": 0.3807953055963056, + "grad_norm": 0.1279296875, + "learning_rate": 0.0012373506376078672, + "loss": 0.0898, + "step": 43868 + }, + { + "epoch": 0.3808039860765097, + "grad_norm": 0.20703125, + "learning_rate": 0.0012373212237072248, + "loss": 0.0898, + "step": 43869 + }, + { + "epoch": 0.3808126665567139, + "grad_norm": 0.345703125, + "learning_rate": 0.0012372918096564104, + "loss": 0.1016, + "step": 43870 + }, + { + "epoch": 0.38082134703691806, + "grad_norm": 0.076171875, + "learning_rate": 0.0012372623954554555, + "loss": 0.085, + "step": 43871 + }, + { + "epoch": 0.38083002751712225, + "grad_norm": 0.12255859375, + "learning_rate": 0.001237232981104393, + "loss": 0.1777, + "step": 43872 + }, + { + "epoch": 0.3808387079973264, + "grad_norm": 0.134765625, + "learning_rate": 0.0012372035666032542, + "loss": 0.1455, + "step": 43873 + }, + { + "epoch": 0.3808473884775306, + "grad_norm": 0.404296875, + "learning_rate": 0.0012371741519520715, + "loss": 0.085, + "step": 43874 + }, + { + "epoch": 0.3808560689577347, + "grad_norm": 0.11962890625, + "learning_rate": 0.0012371447371508778, + "loss": 0.1143, + "step": 43875 + }, + { + "epoch": 0.3808647494379389, + "grad_norm": 0.18359375, + "learning_rate": 0.001237115322199704, + "loss": 0.0593, + "step": 43876 + }, + { + "epoch": 0.38087342991814305, + "grad_norm": 1.4296875, + "learning_rate": 0.001237085907098583, + "loss": 0.1523, + "step": 43877 + }, + { + "epoch": 0.38088211039834724, + "grad_norm": 0.2490234375, + "learning_rate": 0.0012370564918475475, + "loss": 0.0962, + "step": 43878 + }, + { + "epoch": 0.3808907908785514, + "grad_norm": 0.2373046875, + "learning_rate": 0.0012370270764466285, + "loss": 0.0791, + "step": 43879 + }, + { + "epoch": 0.38089947135875557, + "grad_norm": 0.1416015625, + "learning_rate": 0.0012369976608958588, + "loss": 0.1104, + "step": 43880 + }, + { + "epoch": 0.3809081518389597, + "grad_norm": 0.1298828125, + "learning_rate": 0.0012369682451952705, + "loss": 0.0977, + "step": 43881 + }, + { + "epoch": 0.3809168323191639, + 
"grad_norm": 0.373046875, + "learning_rate": 0.0012369388293448965, + "loss": 0.1177, + "step": 43882 + }, + { + "epoch": 0.38092551279936804, + "grad_norm": 0.419921875, + "learning_rate": 0.0012369094133447675, + "loss": 0.1445, + "step": 43883 + }, + { + "epoch": 0.38093419327957223, + "grad_norm": 0.38671875, + "learning_rate": 0.0012368799971949165, + "loss": 0.0967, + "step": 43884 + }, + { + "epoch": 0.38094287375977637, + "grad_norm": 0.1787109375, + "learning_rate": 0.001236850580895376, + "loss": 0.0938, + "step": 43885 + }, + { + "epoch": 0.38095155423998056, + "grad_norm": 0.333984375, + "learning_rate": 0.0012368211644461773, + "loss": 0.0747, + "step": 43886 + }, + { + "epoch": 0.3809602347201847, + "grad_norm": 0.0966796875, + "learning_rate": 0.0012367917478473534, + "loss": 0.0942, + "step": 43887 + }, + { + "epoch": 0.3809689152003889, + "grad_norm": 0.115234375, + "learning_rate": 0.0012367623310989358, + "loss": 0.1064, + "step": 43888 + }, + { + "epoch": 0.38097759568059303, + "grad_norm": 0.16015625, + "learning_rate": 0.001236732914200957, + "loss": 0.1084, + "step": 43889 + }, + { + "epoch": 0.3809862761607972, + "grad_norm": 0.2236328125, + "learning_rate": 0.0012367034971534493, + "loss": 0.0913, + "step": 43890 + }, + { + "epoch": 0.38099495664100136, + "grad_norm": 0.3671875, + "learning_rate": 0.0012366740799564446, + "loss": 0.1162, + "step": 43891 + }, + { + "epoch": 0.38100363712120555, + "grad_norm": 0.2177734375, + "learning_rate": 0.0012366446626099752, + "loss": 0.0918, + "step": 43892 + }, + { + "epoch": 0.3810123176014097, + "grad_norm": 0.326171875, + "learning_rate": 0.001236615245114073, + "loss": 0.1157, + "step": 43893 + }, + { + "epoch": 0.3810209980816139, + "grad_norm": 0.1376953125, + "learning_rate": 0.0012365858274687711, + "loss": 0.0742, + "step": 43894 + }, + { + "epoch": 0.381029678561818, + "grad_norm": 0.1416015625, + "learning_rate": 0.0012365564096741006, + "loss": 0.1011, + "step": 43895 + }, + { + "epoch": 0.3810383590420222, + "grad_norm": 0.07470703125, + "learning_rate": 0.001236526991730094, + "loss": 0.1055, + "step": 43896 + }, + { + "epoch": 0.38104703952222635, + "grad_norm": 0.71484375, + "learning_rate": 0.0012364975736367833, + "loss": 0.1309, + "step": 43897 + }, + { + "epoch": 0.38105572000243054, + "grad_norm": 0.130859375, + "learning_rate": 0.0012364681553942015, + "loss": 0.0918, + "step": 43898 + }, + { + "epoch": 0.3810644004826347, + "grad_norm": 0.310546875, + "learning_rate": 0.0012364387370023798, + "loss": 0.0913, + "step": 43899 + }, + { + "epoch": 0.3810730809628389, + "grad_norm": 0.126953125, + "learning_rate": 0.0012364093184613508, + "loss": 0.125, + "step": 43900 + }, + { + "epoch": 0.381081761443043, + "grad_norm": 0.447265625, + "learning_rate": 0.0012363798997711464, + "loss": 0.1016, + "step": 43901 + }, + { + "epoch": 0.3810904419232472, + "grad_norm": 0.62109375, + "learning_rate": 0.0012363504809317992, + "loss": 0.1201, + "step": 43902 + }, + { + "epoch": 0.38109912240345134, + "grad_norm": 0.298828125, + "learning_rate": 0.0012363210619433412, + "loss": 0.1025, + "step": 43903 + }, + { + "epoch": 0.38110780288365553, + "grad_norm": 0.322265625, + "learning_rate": 0.0012362916428058047, + "loss": 0.0806, + "step": 43904 + }, + { + "epoch": 0.38111648336385967, + "grad_norm": 0.421875, + "learning_rate": 0.0012362622235192215, + "loss": 0.1025, + "step": 43905 + }, + { + "epoch": 0.38112516384406386, + "grad_norm": 0.66796875, + "learning_rate": 0.001236232804083624, + "loss": 0.0898, + "step": 
43906 + }, + { + "epoch": 0.381133844324268, + "grad_norm": 0.15234375, + "learning_rate": 0.001236203384499044, + "loss": 0.1001, + "step": 43907 + }, + { + "epoch": 0.3811425248044722, + "grad_norm": 0.2578125, + "learning_rate": 0.0012361739647655146, + "loss": 0.0884, + "step": 43908 + }, + { + "epoch": 0.38115120528467633, + "grad_norm": 0.080078125, + "learning_rate": 0.001236144544883067, + "loss": 0.103, + "step": 43909 + }, + { + "epoch": 0.3811598857648805, + "grad_norm": 0.61328125, + "learning_rate": 0.0012361151248517339, + "loss": 0.0796, + "step": 43910 + }, + { + "epoch": 0.38116856624508466, + "grad_norm": 0.1357421875, + "learning_rate": 0.0012360857046715475, + "loss": 0.0918, + "step": 43911 + }, + { + "epoch": 0.38117724672528885, + "grad_norm": 0.1494140625, + "learning_rate": 0.0012360562843425399, + "loss": 0.1377, + "step": 43912 + }, + { + "epoch": 0.381185927205493, + "grad_norm": 0.53515625, + "learning_rate": 0.0012360268638647426, + "loss": 0.0767, + "step": 43913 + }, + { + "epoch": 0.3811946076856972, + "grad_norm": 0.1318359375, + "learning_rate": 0.0012359974432381892, + "loss": 0.1143, + "step": 43914 + }, + { + "epoch": 0.3812032881659013, + "grad_norm": 0.419921875, + "learning_rate": 0.0012359680224629105, + "loss": 0.1133, + "step": 43915 + }, + { + "epoch": 0.3812119686461055, + "grad_norm": 0.12890625, + "learning_rate": 0.0012359386015389391, + "loss": 0.0737, + "step": 43916 + }, + { + "epoch": 0.38122064912630965, + "grad_norm": 0.28515625, + "learning_rate": 0.0012359091804663076, + "loss": 0.1079, + "step": 43917 + }, + { + "epoch": 0.38122932960651384, + "grad_norm": 0.515625, + "learning_rate": 0.0012358797592450477, + "loss": 0.0996, + "step": 43918 + }, + { + "epoch": 0.381238010086718, + "grad_norm": 0.443359375, + "learning_rate": 0.0012358503378751917, + "loss": 0.0776, + "step": 43919 + }, + { + "epoch": 0.3812466905669222, + "grad_norm": 0.296875, + "learning_rate": 0.001235820916356772, + "loss": 0.1177, + "step": 43920 + }, + { + "epoch": 0.3812553710471263, + "grad_norm": 0.076171875, + "learning_rate": 0.0012357914946898205, + "loss": 0.0615, + "step": 43921 + }, + { + "epoch": 0.3812640515273305, + "grad_norm": 0.6171875, + "learning_rate": 0.0012357620728743696, + "loss": 0.0669, + "step": 43922 + }, + { + "epoch": 0.38127273200753464, + "grad_norm": 0.2734375, + "learning_rate": 0.0012357326509104512, + "loss": 0.1309, + "step": 43923 + }, + { + "epoch": 0.38128141248773884, + "grad_norm": 0.9453125, + "learning_rate": 0.0012357032287980973, + "loss": 0.0762, + "step": 43924 + }, + { + "epoch": 0.381290092967943, + "grad_norm": 0.80859375, + "learning_rate": 0.0012356738065373409, + "loss": 0.1172, + "step": 43925 + }, + { + "epoch": 0.38129877344814717, + "grad_norm": 0.107421875, + "learning_rate": 0.0012356443841282138, + "loss": 0.0811, + "step": 43926 + }, + { + "epoch": 0.3813074539283513, + "grad_norm": 0.130859375, + "learning_rate": 0.0012356149615707474, + "loss": 0.0723, + "step": 43927 + }, + { + "epoch": 0.3813161344085555, + "grad_norm": 0.1962890625, + "learning_rate": 0.0012355855388649745, + "loss": 0.0908, + "step": 43928 + }, + { + "epoch": 0.38132481488875963, + "grad_norm": 0.1787109375, + "learning_rate": 0.001235556116010928, + "loss": 0.0908, + "step": 43929 + }, + { + "epoch": 0.3813334953689638, + "grad_norm": 0.134765625, + "learning_rate": 0.0012355266930086388, + "loss": 0.0796, + "step": 43930 + }, + { + "epoch": 0.38134217584916796, + "grad_norm": 0.28515625, + "learning_rate": 
0.0012354972698581399, + "loss": 0.1348, + "step": 43931 + }, + { + "epoch": 0.38135085632937216, + "grad_norm": 0.16796875, + "learning_rate": 0.001235467846559463, + "loss": 0.1201, + "step": 43932 + }, + { + "epoch": 0.3813595368095763, + "grad_norm": 0.322265625, + "learning_rate": 0.0012354384231126408, + "loss": 0.0801, + "step": 43933 + }, + { + "epoch": 0.3813682172897805, + "grad_norm": 0.6171875, + "learning_rate": 0.0012354089995177048, + "loss": 0.1436, + "step": 43934 + }, + { + "epoch": 0.3813768977699846, + "grad_norm": 0.1865234375, + "learning_rate": 0.001235379575774688, + "loss": 0.1064, + "step": 43935 + }, + { + "epoch": 0.3813855782501888, + "grad_norm": 0.251953125, + "learning_rate": 0.0012353501518836216, + "loss": 0.0889, + "step": 43936 + }, + { + "epoch": 0.38139425873039295, + "grad_norm": 0.220703125, + "learning_rate": 0.0012353207278445386, + "loss": 0.0781, + "step": 43937 + }, + { + "epoch": 0.38140293921059715, + "grad_norm": 0.4140625, + "learning_rate": 0.001235291303657471, + "loss": 0.0889, + "step": 43938 + }, + { + "epoch": 0.3814116196908013, + "grad_norm": 0.2734375, + "learning_rate": 0.0012352618793224509, + "loss": 0.0811, + "step": 43939 + }, + { + "epoch": 0.3814203001710055, + "grad_norm": 0.25, + "learning_rate": 0.0012352324548395102, + "loss": 0.123, + "step": 43940 + }, + { + "epoch": 0.3814289806512096, + "grad_norm": 0.388671875, + "learning_rate": 0.0012352030302086815, + "loss": 0.0806, + "step": 43941 + }, + { + "epoch": 0.3814376611314138, + "grad_norm": 0.30078125, + "learning_rate": 0.0012351736054299967, + "loss": 0.0942, + "step": 43942 + }, + { + "epoch": 0.38144634161161795, + "grad_norm": 0.08251953125, + "learning_rate": 0.001235144180503488, + "loss": 0.0581, + "step": 43943 + }, + { + "epoch": 0.38145502209182214, + "grad_norm": 0.40625, + "learning_rate": 0.001235114755429188, + "loss": 0.1025, + "step": 43944 + }, + { + "epoch": 0.3814637025720263, + "grad_norm": 0.6796875, + "learning_rate": 0.001235085330207128, + "loss": 0.0996, + "step": 43945 + }, + { + "epoch": 0.38147238305223047, + "grad_norm": 0.396484375, + "learning_rate": 0.001235055904837341, + "loss": 0.0928, + "step": 43946 + }, + { + "epoch": 0.3814810635324346, + "grad_norm": 0.4296875, + "learning_rate": 0.001235026479319859, + "loss": 0.0879, + "step": 43947 + }, + { + "epoch": 0.3814897440126388, + "grad_norm": 0.1845703125, + "learning_rate": 0.0012349970536547137, + "loss": 0.0908, + "step": 43948 + }, + { + "epoch": 0.38149842449284294, + "grad_norm": 1.8203125, + "learning_rate": 0.001234967627841938, + "loss": 0.332, + "step": 43949 + }, + { + "epoch": 0.38150710497304713, + "grad_norm": 0.09228515625, + "learning_rate": 0.0012349382018815636, + "loss": 0.1123, + "step": 43950 + }, + { + "epoch": 0.38151578545325127, + "grad_norm": 0.52734375, + "learning_rate": 0.001234908775773623, + "loss": 0.0898, + "step": 43951 + }, + { + "epoch": 0.38152446593345546, + "grad_norm": 0.1240234375, + "learning_rate": 0.001234879349518148, + "loss": 0.0889, + "step": 43952 + }, + { + "epoch": 0.3815331464136596, + "grad_norm": 0.1806640625, + "learning_rate": 0.0012348499231151708, + "loss": 0.0791, + "step": 43953 + }, + { + "epoch": 0.3815418268938638, + "grad_norm": 0.07373046875, + "learning_rate": 0.001234820496564724, + "loss": 0.0498, + "step": 43954 + }, + { + "epoch": 0.3815505073740679, + "grad_norm": 0.224609375, + "learning_rate": 0.0012347910698668396, + "loss": 0.1055, + "step": 43955 + }, + { + "epoch": 0.3815591878542721, + "grad_norm": 
0.353515625, + "learning_rate": 0.0012347616430215497, + "loss": 0.0718, + "step": 43956 + }, + { + "epoch": 0.38156786833447626, + "grad_norm": 0.2265625, + "learning_rate": 0.001234732216028886, + "loss": 0.085, + "step": 43957 + }, + { + "epoch": 0.38157654881468045, + "grad_norm": 0.38671875, + "learning_rate": 0.0012347027888888816, + "loss": 0.1309, + "step": 43958 + }, + { + "epoch": 0.3815852292948846, + "grad_norm": 0.232421875, + "learning_rate": 0.0012346733616015683, + "loss": 0.0747, + "step": 43959 + }, + { + "epoch": 0.3815939097750888, + "grad_norm": 0.10888671875, + "learning_rate": 0.0012346439341669783, + "loss": 0.1084, + "step": 43960 + }, + { + "epoch": 0.3816025902552929, + "grad_norm": 0.5078125, + "learning_rate": 0.0012346145065851438, + "loss": 0.1016, + "step": 43961 + }, + { + "epoch": 0.3816112707354971, + "grad_norm": 0.671875, + "learning_rate": 0.0012345850788560962, + "loss": 0.1035, + "step": 43962 + }, + { + "epoch": 0.38161995121570125, + "grad_norm": 0.1923828125, + "learning_rate": 0.001234555650979869, + "loss": 0.0898, + "step": 43963 + }, + { + "epoch": 0.38162863169590544, + "grad_norm": 0.142578125, + "learning_rate": 0.0012345262229564937, + "loss": 0.104, + "step": 43964 + }, + { + "epoch": 0.3816373121761096, + "grad_norm": 0.1640625, + "learning_rate": 0.0012344967947860023, + "loss": 0.123, + "step": 43965 + }, + { + "epoch": 0.38164599265631377, + "grad_norm": 0.40625, + "learning_rate": 0.0012344673664684272, + "loss": 0.083, + "step": 43966 + }, + { + "epoch": 0.3816546731365179, + "grad_norm": 0.11279296875, + "learning_rate": 0.0012344379380038008, + "loss": 0.1182, + "step": 43967 + }, + { + "epoch": 0.3816633536167221, + "grad_norm": 0.337890625, + "learning_rate": 0.0012344085093921554, + "loss": 0.105, + "step": 43968 + }, + { + "epoch": 0.38167203409692624, + "grad_norm": 0.310546875, + "learning_rate": 0.0012343790806335223, + "loss": 0.1074, + "step": 43969 + }, + { + "epoch": 0.38168071457713043, + "grad_norm": 5.03125, + "learning_rate": 0.0012343496517279347, + "loss": 0.3477, + "step": 43970 + }, + { + "epoch": 0.38168939505733457, + "grad_norm": 0.26171875, + "learning_rate": 0.001234320222675424, + "loss": 0.0723, + "step": 43971 + }, + { + "epoch": 0.38169807553753876, + "grad_norm": 0.18359375, + "learning_rate": 0.001234290793476023, + "loss": 0.1006, + "step": 43972 + }, + { + "epoch": 0.3817067560177429, + "grad_norm": 0.3671875, + "learning_rate": 0.0012342613641297636, + "loss": 0.0928, + "step": 43973 + }, + { + "epoch": 0.3817154364979471, + "grad_norm": 0.2216796875, + "learning_rate": 0.0012342319346366774, + "loss": 0.1104, + "step": 43974 + }, + { + "epoch": 0.38172411697815123, + "grad_norm": 0.416015625, + "learning_rate": 0.001234202504996798, + "loss": 0.0923, + "step": 43975 + }, + { + "epoch": 0.3817327974583554, + "grad_norm": 0.107421875, + "learning_rate": 0.0012341730752101563, + "loss": 0.0986, + "step": 43976 + }, + { + "epoch": 0.38174147793855956, + "grad_norm": 0.921875, + "learning_rate": 0.001234143645276785, + "loss": 0.0986, + "step": 43977 + }, + { + "epoch": 0.38175015841876375, + "grad_norm": 0.11962890625, + "learning_rate": 0.0012341142151967163, + "loss": 0.1084, + "step": 43978 + }, + { + "epoch": 0.3817588388989679, + "grad_norm": 0.2001953125, + "learning_rate": 0.0012340847849699821, + "loss": 0.1045, + "step": 43979 + }, + { + "epoch": 0.3817675193791721, + "grad_norm": 0.146484375, + "learning_rate": 0.001234055354596615, + "loss": 0.1055, + "step": 43980 + }, + { + "epoch": 
0.3817761998593762, + "grad_norm": 0.59375, + "learning_rate": 0.001234025924076647, + "loss": 0.1309, + "step": 43981 + }, + { + "epoch": 0.3817848803395804, + "grad_norm": 0.82421875, + "learning_rate": 0.0012339964934101103, + "loss": 0.0845, + "step": 43982 + }, + { + "epoch": 0.38179356081978455, + "grad_norm": 0.62890625, + "learning_rate": 0.001233967062597037, + "loss": 0.0713, + "step": 43983 + }, + { + "epoch": 0.38180224129998874, + "grad_norm": 0.3671875, + "learning_rate": 0.0012339376316374592, + "loss": 0.0879, + "step": 43984 + }, + { + "epoch": 0.3818109217801929, + "grad_norm": 0.74609375, + "learning_rate": 0.0012339082005314091, + "loss": 0.1396, + "step": 43985 + }, + { + "epoch": 0.381819602260397, + "grad_norm": 0.12060546875, + "learning_rate": 0.001233878769278919, + "loss": 0.1113, + "step": 43986 + }, + { + "epoch": 0.3818282827406012, + "grad_norm": 0.06591796875, + "learning_rate": 0.0012338493378800215, + "loss": 0.0488, + "step": 43987 + }, + { + "epoch": 0.38183696322080535, + "grad_norm": 0.61328125, + "learning_rate": 0.001233819906334748, + "loss": 0.1055, + "step": 43988 + }, + { + "epoch": 0.38184564370100954, + "grad_norm": 0.1826171875, + "learning_rate": 0.0012337904746431315, + "loss": 0.082, + "step": 43989 + }, + { + "epoch": 0.3818543241812137, + "grad_norm": 0.373046875, + "learning_rate": 0.0012337610428052033, + "loss": 0.1133, + "step": 43990 + }, + { + "epoch": 0.38186300466141787, + "grad_norm": 0.11376953125, + "learning_rate": 0.0012337316108209965, + "loss": 0.1016, + "step": 43991 + }, + { + "epoch": 0.381871685141622, + "grad_norm": 0.1044921875, + "learning_rate": 0.0012337021786905425, + "loss": 0.1177, + "step": 43992 + }, + { + "epoch": 0.3818803656218262, + "grad_norm": 0.169921875, + "learning_rate": 0.0012336727464138737, + "loss": 0.0967, + "step": 43993 + }, + { + "epoch": 0.38188904610203034, + "grad_norm": 0.8125, + "learning_rate": 0.0012336433139910224, + "loss": 0.167, + "step": 43994 + }, + { + "epoch": 0.38189772658223453, + "grad_norm": 0.0966796875, + "learning_rate": 0.001233613881422021, + "loss": 0.0781, + "step": 43995 + }, + { + "epoch": 0.38190640706243867, + "grad_norm": 0.072265625, + "learning_rate": 0.0012335844487069015, + "loss": 0.123, + "step": 43996 + }, + { + "epoch": 0.38191508754264286, + "grad_norm": 0.2041015625, + "learning_rate": 0.0012335550158456957, + "loss": 0.0947, + "step": 43997 + }, + { + "epoch": 0.381923768022847, + "grad_norm": 0.373046875, + "learning_rate": 0.0012335255828384367, + "loss": 0.1416, + "step": 43998 + }, + { + "epoch": 0.3819324485030512, + "grad_norm": 0.79296875, + "learning_rate": 0.0012334961496851555, + "loss": 0.1162, + "step": 43999 + }, + { + "epoch": 0.38194112898325533, + "grad_norm": 0.07568359375, + "learning_rate": 0.0012334667163858854, + "loss": 0.0962, + "step": 44000 + }, + { + "epoch": 0.3819498094634595, + "grad_norm": 0.07666015625, + "learning_rate": 0.0012334372829406577, + "loss": 0.0952, + "step": 44001 + }, + { + "epoch": 0.38195848994366366, + "grad_norm": 0.146484375, + "learning_rate": 0.0012334078493495054, + "loss": 0.1025, + "step": 44002 + }, + { + "epoch": 0.38196717042386785, + "grad_norm": 0.166015625, + "learning_rate": 0.00123337841561246, + "loss": 0.1055, + "step": 44003 + }, + { + "epoch": 0.381975850904072, + "grad_norm": 0.12060546875, + "learning_rate": 0.001233348981729554, + "loss": 0.0645, + "step": 44004 + }, + { + "epoch": 0.3819845313842762, + "grad_norm": 0.447265625, + "learning_rate": 0.0012333195477008197, + "loss": 
0.0898, + "step": 44005 + }, + { + "epoch": 0.3819932118644803, + "grad_norm": 0.6875, + "learning_rate": 0.0012332901135262888, + "loss": 0.1074, + "step": 44006 + }, + { + "epoch": 0.3820018923446845, + "grad_norm": 0.1611328125, + "learning_rate": 0.0012332606792059943, + "loss": 0.0894, + "step": 44007 + }, + { + "epoch": 0.38201057282488865, + "grad_norm": 0.185546875, + "learning_rate": 0.0012332312447399679, + "loss": 0.1016, + "step": 44008 + }, + { + "epoch": 0.38201925330509284, + "grad_norm": 0.1484375, + "learning_rate": 0.0012332018101282414, + "loss": 0.123, + "step": 44009 + }, + { + "epoch": 0.382027933785297, + "grad_norm": 0.09326171875, + "learning_rate": 0.0012331723753708477, + "loss": 0.0879, + "step": 44010 + }, + { + "epoch": 0.3820366142655012, + "grad_norm": 0.69140625, + "learning_rate": 0.0012331429404678186, + "loss": 0.0625, + "step": 44011 + }, + { + "epoch": 0.3820452947457053, + "grad_norm": 0.271484375, + "learning_rate": 0.0012331135054191866, + "loss": 0.0962, + "step": 44012 + }, + { + "epoch": 0.3820539752259095, + "grad_norm": 0.70703125, + "learning_rate": 0.0012330840702249836, + "loss": 0.0898, + "step": 44013 + }, + { + "epoch": 0.38206265570611364, + "grad_norm": 0.11474609375, + "learning_rate": 0.0012330546348852413, + "loss": 0.0869, + "step": 44014 + }, + { + "epoch": 0.38207133618631783, + "grad_norm": 0.5625, + "learning_rate": 0.0012330251993999928, + "loss": 0.0811, + "step": 44015 + }, + { + "epoch": 0.38208001666652197, + "grad_norm": 0.13671875, + "learning_rate": 0.0012329957637692702, + "loss": 0.1338, + "step": 44016 + }, + { + "epoch": 0.38208869714672616, + "grad_norm": 0.193359375, + "learning_rate": 0.0012329663279931054, + "loss": 0.1455, + "step": 44017 + }, + { + "epoch": 0.3820973776269303, + "grad_norm": 0.18359375, + "learning_rate": 0.0012329368920715302, + "loss": 0.1001, + "step": 44018 + }, + { + "epoch": 0.3821060581071345, + "grad_norm": 0.22265625, + "learning_rate": 0.0012329074560045777, + "loss": 0.0859, + "step": 44019 + }, + { + "epoch": 0.38211473858733863, + "grad_norm": 0.2314453125, + "learning_rate": 0.0012328780197922795, + "loss": 0.1221, + "step": 44020 + }, + { + "epoch": 0.3821234190675428, + "grad_norm": 0.1123046875, + "learning_rate": 0.001232848583434668, + "loss": 0.0933, + "step": 44021 + }, + { + "epoch": 0.38213209954774696, + "grad_norm": 0.34375, + "learning_rate": 0.001232819146931775, + "loss": 0.0894, + "step": 44022 + }, + { + "epoch": 0.38214078002795115, + "grad_norm": 0.328125, + "learning_rate": 0.0012327897102836329, + "loss": 0.1113, + "step": 44023 + }, + { + "epoch": 0.3821494605081553, + "grad_norm": 0.1904296875, + "learning_rate": 0.0012327602734902741, + "loss": 0.0747, + "step": 44024 + }, + { + "epoch": 0.3821581409883595, + "grad_norm": 0.2734375, + "learning_rate": 0.0012327308365517305, + "loss": 0.1299, + "step": 44025 + }, + { + "epoch": 0.3821668214685636, + "grad_norm": 0.421875, + "learning_rate": 0.0012327013994680347, + "loss": 0.1074, + "step": 44026 + }, + { + "epoch": 0.3821755019487678, + "grad_norm": 0.11474609375, + "learning_rate": 0.0012326719622392187, + "loss": 0.085, + "step": 44027 + }, + { + "epoch": 0.38218418242897195, + "grad_norm": 0.259765625, + "learning_rate": 0.0012326425248653146, + "loss": 0.1328, + "step": 44028 + }, + { + "epoch": 0.38219286290917615, + "grad_norm": 0.31640625, + "learning_rate": 0.0012326130873463543, + "loss": 0.0752, + "step": 44029 + }, + { + "epoch": 0.3822015433893803, + "grad_norm": 0.2734375, + "learning_rate": 
0.0012325836496823708, + "loss": 0.1064, + "step": 44030 + }, + { + "epoch": 0.3822102238695845, + "grad_norm": 0.302734375, + "learning_rate": 0.0012325542118733952, + "loss": 0.1279, + "step": 44031 + }, + { + "epoch": 0.3822189043497886, + "grad_norm": 0.283203125, + "learning_rate": 0.0012325247739194609, + "loss": 0.1099, + "step": 44032 + }, + { + "epoch": 0.3822275848299928, + "grad_norm": 0.423828125, + "learning_rate": 0.0012324953358205993, + "loss": 0.106, + "step": 44033 + }, + { + "epoch": 0.38223626531019694, + "grad_norm": 0.26171875, + "learning_rate": 0.0012324658975768428, + "loss": 0.0967, + "step": 44034 + }, + { + "epoch": 0.38224494579040114, + "grad_norm": 0.2177734375, + "learning_rate": 0.0012324364591882234, + "loss": 0.1152, + "step": 44035 + }, + { + "epoch": 0.3822536262706053, + "grad_norm": 0.36328125, + "learning_rate": 0.0012324070206547737, + "loss": 0.0972, + "step": 44036 + }, + { + "epoch": 0.38226230675080947, + "grad_norm": 0.224609375, + "learning_rate": 0.0012323775819765258, + "loss": 0.0762, + "step": 44037 + }, + { + "epoch": 0.3822709872310136, + "grad_norm": 0.259765625, + "learning_rate": 0.0012323481431535113, + "loss": 0.123, + "step": 44038 + }, + { + "epoch": 0.3822796677112178, + "grad_norm": 0.5234375, + "learning_rate": 0.0012323187041857633, + "loss": 0.0923, + "step": 44039 + }, + { + "epoch": 0.38228834819142193, + "grad_norm": 0.546875, + "learning_rate": 0.0012322892650733133, + "loss": 0.1108, + "step": 44040 + }, + { + "epoch": 0.3822970286716261, + "grad_norm": 0.296875, + "learning_rate": 0.001232259825816194, + "loss": 0.0615, + "step": 44041 + }, + { + "epoch": 0.38230570915183026, + "grad_norm": 0.455078125, + "learning_rate": 0.0012322303864144369, + "loss": 0.1328, + "step": 44042 + }, + { + "epoch": 0.38231438963203446, + "grad_norm": 0.1572265625, + "learning_rate": 0.0012322009468680748, + "loss": 0.127, + "step": 44043 + }, + { + "epoch": 0.3823230701122386, + "grad_norm": 0.42578125, + "learning_rate": 0.0012321715071771397, + "loss": 0.1123, + "step": 44044 + }, + { + "epoch": 0.3823317505924428, + "grad_norm": 0.87109375, + "learning_rate": 0.0012321420673416642, + "loss": 0.1426, + "step": 44045 + }, + { + "epoch": 0.3823404310726469, + "grad_norm": 0.3359375, + "learning_rate": 0.00123211262736168, + "loss": 0.1128, + "step": 44046 + }, + { + "epoch": 0.3823491115528511, + "grad_norm": 0.19140625, + "learning_rate": 0.0012320831872372194, + "loss": 0.1182, + "step": 44047 + }, + { + "epoch": 0.38235779203305525, + "grad_norm": 0.384765625, + "learning_rate": 0.0012320537469683141, + "loss": 0.0977, + "step": 44048 + }, + { + "epoch": 0.38236647251325945, + "grad_norm": 0.1396484375, + "learning_rate": 0.0012320243065549975, + "loss": 0.1006, + "step": 44049 + }, + { + "epoch": 0.3823751529934636, + "grad_norm": 0.1650390625, + "learning_rate": 0.001231994865997301, + "loss": 0.1074, + "step": 44050 + }, + { + "epoch": 0.3823838334736678, + "grad_norm": 0.4375, + "learning_rate": 0.0012319654252952565, + "loss": 0.1172, + "step": 44051 + }, + { + "epoch": 0.3823925139538719, + "grad_norm": 0.423828125, + "learning_rate": 0.001231935984448897, + "loss": 0.0581, + "step": 44052 + }, + { + "epoch": 0.3824011944340761, + "grad_norm": 0.158203125, + "learning_rate": 0.0012319065434582537, + "loss": 0.125, + "step": 44053 + }, + { + "epoch": 0.38240987491428025, + "grad_norm": 0.44140625, + "learning_rate": 0.00123187710232336, + "loss": 0.0669, + "step": 44054 + }, + { + "epoch": 0.38241855539448444, + "grad_norm": 
0.453125, + "learning_rate": 0.0012318476610442472, + "loss": 0.1011, + "step": 44055 + }, + { + "epoch": 0.3824272358746886, + "grad_norm": 0.162109375, + "learning_rate": 0.001231818219620948, + "loss": 0.1289, + "step": 44056 + }, + { + "epoch": 0.38243591635489277, + "grad_norm": 0.28125, + "learning_rate": 0.0012317887780534942, + "loss": 0.0928, + "step": 44057 + }, + { + "epoch": 0.3824445968350969, + "grad_norm": 0.162109375, + "learning_rate": 0.0012317593363419179, + "loss": 0.0957, + "step": 44058 + }, + { + "epoch": 0.3824532773153011, + "grad_norm": 0.1201171875, + "learning_rate": 0.001231729894486252, + "loss": 0.1143, + "step": 44059 + }, + { + "epoch": 0.38246195779550524, + "grad_norm": 0.154296875, + "learning_rate": 0.0012317004524865284, + "loss": 0.1055, + "step": 44060 + }, + { + "epoch": 0.38247063827570943, + "grad_norm": 0.1376953125, + "learning_rate": 0.0012316710103427789, + "loss": 0.1035, + "step": 44061 + }, + { + "epoch": 0.38247931875591357, + "grad_norm": 0.057861328125, + "learning_rate": 0.0012316415680550354, + "loss": 0.0674, + "step": 44062 + }, + { + "epoch": 0.38248799923611776, + "grad_norm": 0.466796875, + "learning_rate": 0.0012316121256233314, + "loss": 0.1035, + "step": 44063 + }, + { + "epoch": 0.3824966797163219, + "grad_norm": 0.12109375, + "learning_rate": 0.0012315826830476984, + "loss": 0.0972, + "step": 44064 + }, + { + "epoch": 0.3825053601965261, + "grad_norm": 0.3984375, + "learning_rate": 0.0012315532403281684, + "loss": 0.127, + "step": 44065 + }, + { + "epoch": 0.3825140406767302, + "grad_norm": 0.12158203125, + "learning_rate": 0.0012315237974647732, + "loss": 0.0811, + "step": 44066 + }, + { + "epoch": 0.3825227211569344, + "grad_norm": 0.302734375, + "learning_rate": 0.0012314943544575463, + "loss": 0.1211, + "step": 44067 + }, + { + "epoch": 0.38253140163713856, + "grad_norm": 0.125, + "learning_rate": 0.001231464911306519, + "loss": 0.0894, + "step": 44068 + }, + { + "epoch": 0.38254008211734275, + "grad_norm": 0.2451171875, + "learning_rate": 0.0012314354680117235, + "loss": 0.0645, + "step": 44069 + }, + { + "epoch": 0.3825487625975469, + "grad_norm": 0.314453125, + "learning_rate": 0.001231406024573192, + "loss": 0.1221, + "step": 44070 + }, + { + "epoch": 0.3825574430777511, + "grad_norm": 0.10546875, + "learning_rate": 0.001231376580990957, + "loss": 0.1035, + "step": 44071 + }, + { + "epoch": 0.3825661235579552, + "grad_norm": 0.1826171875, + "learning_rate": 0.0012313471372650505, + "loss": 0.123, + "step": 44072 + }, + { + "epoch": 0.3825748040381594, + "grad_norm": 0.60546875, + "learning_rate": 0.001231317693395505, + "loss": 0.0996, + "step": 44073 + }, + { + "epoch": 0.38258348451836355, + "grad_norm": 0.18359375, + "learning_rate": 0.001231288249382352, + "loss": 0.0659, + "step": 44074 + }, + { + "epoch": 0.38259216499856774, + "grad_norm": 0.48046875, + "learning_rate": 0.0012312588052256245, + "loss": 0.126, + "step": 44075 + }, + { + "epoch": 0.3826008454787719, + "grad_norm": 0.400390625, + "learning_rate": 0.0012312293609253544, + "loss": 0.0889, + "step": 44076 + }, + { + "epoch": 0.38260952595897607, + "grad_norm": 0.4296875, + "learning_rate": 0.0012311999164815737, + "loss": 0.1157, + "step": 44077 + }, + { + "epoch": 0.3826182064391802, + "grad_norm": 0.56640625, + "learning_rate": 0.0012311704718943148, + "loss": 0.2324, + "step": 44078 + }, + { + "epoch": 0.3826268869193844, + "grad_norm": 0.322265625, + "learning_rate": 0.0012311410271636097, + "loss": 0.0977, + "step": 44079 + }, + { + "epoch": 
0.38263556739958854, + "grad_norm": 0.400390625, + "learning_rate": 0.001231111582289491, + "loss": 0.1016, + "step": 44080 + }, + { + "epoch": 0.38264424787979273, + "grad_norm": 0.41015625, + "learning_rate": 0.00123108213727199, + "loss": 0.0845, + "step": 44081 + }, + { + "epoch": 0.38265292835999687, + "grad_norm": 0.279296875, + "learning_rate": 0.0012310526921111402, + "loss": 0.1416, + "step": 44082 + }, + { + "epoch": 0.38266160884020106, + "grad_norm": 0.24609375, + "learning_rate": 0.0012310232468069726, + "loss": 0.1533, + "step": 44083 + }, + { + "epoch": 0.3826702893204052, + "grad_norm": 0.373046875, + "learning_rate": 0.0012309938013595205, + "loss": 0.0869, + "step": 44084 + }, + { + "epoch": 0.3826789698006094, + "grad_norm": 0.2021484375, + "learning_rate": 0.0012309643557688154, + "loss": 0.0898, + "step": 44085 + }, + { + "epoch": 0.38268765028081353, + "grad_norm": 0.470703125, + "learning_rate": 0.0012309349100348896, + "loss": 0.0767, + "step": 44086 + }, + { + "epoch": 0.3826963307610177, + "grad_norm": 0.1533203125, + "learning_rate": 0.0012309054641577752, + "loss": 0.0713, + "step": 44087 + }, + { + "epoch": 0.38270501124122186, + "grad_norm": 0.66796875, + "learning_rate": 0.0012308760181375046, + "loss": 0.1104, + "step": 44088 + }, + { + "epoch": 0.38271369172142605, + "grad_norm": 0.5703125, + "learning_rate": 0.0012308465719741105, + "loss": 0.084, + "step": 44089 + }, + { + "epoch": 0.3827223722016302, + "grad_norm": 0.197265625, + "learning_rate": 0.0012308171256676238, + "loss": 0.0962, + "step": 44090 + }, + { + "epoch": 0.3827310526818344, + "grad_norm": 0.1474609375, + "learning_rate": 0.001230787679218078, + "loss": 0.1279, + "step": 44091 + }, + { + "epoch": 0.3827397331620385, + "grad_norm": 0.06884765625, + "learning_rate": 0.0012307582326255043, + "loss": 0.0781, + "step": 44092 + }, + { + "epoch": 0.3827484136422427, + "grad_norm": 0.40234375, + "learning_rate": 0.0012307287858899353, + "loss": 0.1138, + "step": 44093 + }, + { + "epoch": 0.38275709412244685, + "grad_norm": 0.1630859375, + "learning_rate": 0.0012306993390114038, + "loss": 0.0747, + "step": 44094 + }, + { + "epoch": 0.38276577460265104, + "grad_norm": 0.1376953125, + "learning_rate": 0.0012306698919899413, + "loss": 0.0864, + "step": 44095 + }, + { + "epoch": 0.3827744550828552, + "grad_norm": 0.3203125, + "learning_rate": 0.0012306404448255799, + "loss": 0.1035, + "step": 44096 + }, + { + "epoch": 0.3827831355630594, + "grad_norm": 0.09814453125, + "learning_rate": 0.0012306109975183525, + "loss": 0.0933, + "step": 44097 + }, + { + "epoch": 0.3827918160432635, + "grad_norm": 0.087890625, + "learning_rate": 0.0012305815500682905, + "loss": 0.0835, + "step": 44098 + }, + { + "epoch": 0.3828004965234677, + "grad_norm": 0.1689453125, + "learning_rate": 0.0012305521024754266, + "loss": 0.1084, + "step": 44099 + }, + { + "epoch": 0.38280917700367184, + "grad_norm": 0.146484375, + "learning_rate": 0.0012305226547397927, + "loss": 0.1123, + "step": 44100 + }, + { + "epoch": 0.38281785748387603, + "grad_norm": 0.25, + "learning_rate": 0.0012304932068614215, + "loss": 0.1338, + "step": 44101 + }, + { + "epoch": 0.38282653796408017, + "grad_norm": 0.431640625, + "learning_rate": 0.0012304637588403447, + "loss": 0.0859, + "step": 44102 + }, + { + "epoch": 0.38283521844428436, + "grad_norm": 1.1171875, + "learning_rate": 0.001230434310676595, + "loss": 0.1094, + "step": 44103 + }, + { + "epoch": 0.3828438989244885, + "grad_norm": 0.287109375, + "learning_rate": 0.0012304048623702037, + 
"loss": 0.1021, + "step": 44104 + }, + { + "epoch": 0.3828525794046927, + "grad_norm": 2.234375, + "learning_rate": 0.0012303754139212043, + "loss": 0.3027, + "step": 44105 + }, + { + "epoch": 0.38286125988489683, + "grad_norm": 0.302734375, + "learning_rate": 0.0012303459653296275, + "loss": 0.125, + "step": 44106 + }, + { + "epoch": 0.382869940365101, + "grad_norm": 0.30078125, + "learning_rate": 0.001230316516595507, + "loss": 0.1162, + "step": 44107 + }, + { + "epoch": 0.38287862084530516, + "grad_norm": 0.1064453125, + "learning_rate": 0.0012302870677188742, + "loss": 0.0752, + "step": 44108 + }, + { + "epoch": 0.38288730132550935, + "grad_norm": 0.30078125, + "learning_rate": 0.0012302576186997611, + "loss": 0.1182, + "step": 44109 + }, + { + "epoch": 0.3828959818057135, + "grad_norm": 0.1865234375, + "learning_rate": 0.0012302281695382002, + "loss": 0.085, + "step": 44110 + }, + { + "epoch": 0.38290466228591763, + "grad_norm": 0.31640625, + "learning_rate": 0.001230198720234224, + "loss": 0.0781, + "step": 44111 + }, + { + "epoch": 0.3829133427661218, + "grad_norm": 0.146484375, + "learning_rate": 0.0012301692707878647, + "loss": 0.1123, + "step": 44112 + }, + { + "epoch": 0.38292202324632596, + "grad_norm": 0.1865234375, + "learning_rate": 0.0012301398211991537, + "loss": 0.0889, + "step": 44113 + }, + { + "epoch": 0.38293070372653015, + "grad_norm": 0.1552734375, + "learning_rate": 0.001230110371468124, + "loss": 0.0781, + "step": 44114 + }, + { + "epoch": 0.3829393842067343, + "grad_norm": 0.10546875, + "learning_rate": 0.0012300809215948076, + "loss": 0.1328, + "step": 44115 + }, + { + "epoch": 0.3829480646869385, + "grad_norm": 0.177734375, + "learning_rate": 0.0012300514715792368, + "loss": 0.0913, + "step": 44116 + }, + { + "epoch": 0.3829567451671426, + "grad_norm": 0.1640625, + "learning_rate": 0.0012300220214214435, + "loss": 0.0947, + "step": 44117 + }, + { + "epoch": 0.3829654256473468, + "grad_norm": 0.28125, + "learning_rate": 0.0012299925711214599, + "loss": 0.1064, + "step": 44118 + }, + { + "epoch": 0.38297410612755095, + "grad_norm": 0.2197265625, + "learning_rate": 0.0012299631206793183, + "loss": 0.1006, + "step": 44119 + }, + { + "epoch": 0.38298278660775514, + "grad_norm": 0.23828125, + "learning_rate": 0.0012299336700950514, + "loss": 0.0981, + "step": 44120 + }, + { + "epoch": 0.3829914670879593, + "grad_norm": 0.353515625, + "learning_rate": 0.0012299042193686907, + "loss": 0.1069, + "step": 44121 + }, + { + "epoch": 0.3830001475681635, + "grad_norm": 0.18359375, + "learning_rate": 0.0012298747685002686, + "loss": 0.064, + "step": 44122 + }, + { + "epoch": 0.3830088280483676, + "grad_norm": 0.70703125, + "learning_rate": 0.0012298453174898179, + "loss": 0.1221, + "step": 44123 + }, + { + "epoch": 0.3830175085285718, + "grad_norm": 0.173828125, + "learning_rate": 0.0012298158663373697, + "loss": 0.0879, + "step": 44124 + }, + { + "epoch": 0.38302618900877594, + "grad_norm": 0.09912109375, + "learning_rate": 0.0012297864150429573, + "loss": 0.1123, + "step": 44125 + }, + { + "epoch": 0.38303486948898013, + "grad_norm": 0.232421875, + "learning_rate": 0.001229756963606612, + "loss": 0.0938, + "step": 44126 + }, + { + "epoch": 0.38304354996918427, + "grad_norm": 0.337890625, + "learning_rate": 0.0012297275120283669, + "loss": 0.1182, + "step": 44127 + }, + { + "epoch": 0.38305223044938846, + "grad_norm": 0.3125, + "learning_rate": 0.0012296980603082534, + "loss": 0.1055, + "step": 44128 + }, + { + "epoch": 0.3830609109295926, + "grad_norm": 0.1552734375, + 
"learning_rate": 0.001229668608446304, + "loss": 0.1157, + "step": 44129 + }, + { + "epoch": 0.3830695914097968, + "grad_norm": 0.08447265625, + "learning_rate": 0.001229639156442551, + "loss": 0.1074, + "step": 44130 + }, + { + "epoch": 0.38307827189000093, + "grad_norm": 0.1650390625, + "learning_rate": 0.001229609704297027, + "loss": 0.0928, + "step": 44131 + }, + { + "epoch": 0.3830869523702051, + "grad_norm": 0.3359375, + "learning_rate": 0.0012295802520097634, + "loss": 0.1025, + "step": 44132 + }, + { + "epoch": 0.38309563285040926, + "grad_norm": 0.06787109375, + "learning_rate": 0.0012295507995807931, + "loss": 0.0835, + "step": 44133 + }, + { + "epoch": 0.38310431333061346, + "grad_norm": 0.1083984375, + "learning_rate": 0.0012295213470101475, + "loss": 0.0796, + "step": 44134 + }, + { + "epoch": 0.3831129938108176, + "grad_norm": 0.216796875, + "learning_rate": 0.0012294918942978596, + "loss": 0.1113, + "step": 44135 + }, + { + "epoch": 0.3831216742910218, + "grad_norm": 0.3984375, + "learning_rate": 0.0012294624414439612, + "loss": 0.0874, + "step": 44136 + }, + { + "epoch": 0.3831303547712259, + "grad_norm": 0.609375, + "learning_rate": 0.0012294329884484849, + "loss": 0.1367, + "step": 44137 + }, + { + "epoch": 0.3831390352514301, + "grad_norm": 0.31640625, + "learning_rate": 0.001229403535311462, + "loss": 0.0688, + "step": 44138 + }, + { + "epoch": 0.38314771573163425, + "grad_norm": 0.34765625, + "learning_rate": 0.0012293740820329257, + "loss": 0.1201, + "step": 44139 + }, + { + "epoch": 0.38315639621183845, + "grad_norm": 0.11279296875, + "learning_rate": 0.0012293446286129079, + "loss": 0.0674, + "step": 44140 + }, + { + "epoch": 0.3831650766920426, + "grad_norm": 0.328125, + "learning_rate": 0.0012293151750514411, + "loss": 0.1328, + "step": 44141 + }, + { + "epoch": 0.3831737571722468, + "grad_norm": 0.18359375, + "learning_rate": 0.0012292857213485567, + "loss": 0.1113, + "step": 44142 + }, + { + "epoch": 0.3831824376524509, + "grad_norm": 0.330078125, + "learning_rate": 0.0012292562675042874, + "loss": 0.1074, + "step": 44143 + }, + { + "epoch": 0.3831911181326551, + "grad_norm": 0.65625, + "learning_rate": 0.0012292268135186656, + "loss": 0.0713, + "step": 44144 + }, + { + "epoch": 0.38319979861285924, + "grad_norm": 0.1318359375, + "learning_rate": 0.0012291973593917232, + "loss": 0.0972, + "step": 44145 + }, + { + "epoch": 0.38320847909306344, + "grad_norm": 0.078125, + "learning_rate": 0.0012291679051234926, + "loss": 0.1113, + "step": 44146 + }, + { + "epoch": 0.3832171595732676, + "grad_norm": 0.1884765625, + "learning_rate": 0.0012291384507140055, + "loss": 0.084, + "step": 44147 + }, + { + "epoch": 0.38322584005347177, + "grad_norm": 0.2421875, + "learning_rate": 0.0012291089961632949, + "loss": 0.1064, + "step": 44148 + }, + { + "epoch": 0.3832345205336759, + "grad_norm": 0.73828125, + "learning_rate": 0.0012290795414713924, + "loss": 0.0889, + "step": 44149 + }, + { + "epoch": 0.3832432010138801, + "grad_norm": 0.19921875, + "learning_rate": 0.0012290500866383308, + "loss": 0.1021, + "step": 44150 + }, + { + "epoch": 0.38325188149408423, + "grad_norm": 0.41015625, + "learning_rate": 0.0012290206316641417, + "loss": 0.1855, + "step": 44151 + }, + { + "epoch": 0.3832605619742884, + "grad_norm": 0.53125, + "learning_rate": 0.0012289911765488578, + "loss": 0.0952, + "step": 44152 + }, + { + "epoch": 0.38326924245449256, + "grad_norm": 0.87890625, + "learning_rate": 0.0012289617212925107, + "loss": 0.1177, + "step": 44153 + }, + { + "epoch": 
0.38327792293469676, + "grad_norm": 0.19140625, + "learning_rate": 0.0012289322658951335, + "loss": 0.1123, + "step": 44154 + }, + { + "epoch": 0.3832866034149009, + "grad_norm": 0.11474609375, + "learning_rate": 0.0012289028103567576, + "loss": 0.1196, + "step": 44155 + }, + { + "epoch": 0.3832952838951051, + "grad_norm": 0.1640625, + "learning_rate": 0.0012288733546774156, + "loss": 0.1138, + "step": 44156 + }, + { + "epoch": 0.3833039643753092, + "grad_norm": 0.150390625, + "learning_rate": 0.0012288438988571398, + "loss": 0.0898, + "step": 44157 + }, + { + "epoch": 0.3833126448555134, + "grad_norm": 0.41796875, + "learning_rate": 0.0012288144428959618, + "loss": 0.0918, + "step": 44158 + }, + { + "epoch": 0.38332132533571756, + "grad_norm": 0.267578125, + "learning_rate": 0.0012287849867939145, + "loss": 0.1235, + "step": 44159 + }, + { + "epoch": 0.38333000581592175, + "grad_norm": 0.1982421875, + "learning_rate": 0.00122875553055103, + "loss": 0.1348, + "step": 44160 + }, + { + "epoch": 0.3833386862961259, + "grad_norm": 0.1376953125, + "learning_rate": 0.0012287260741673403, + "loss": 0.0938, + "step": 44161 + }, + { + "epoch": 0.3833473667763301, + "grad_norm": 0.498046875, + "learning_rate": 0.0012286966176428776, + "loss": 0.082, + "step": 44162 + }, + { + "epoch": 0.3833560472565342, + "grad_norm": 0.1005859375, + "learning_rate": 0.0012286671609776745, + "loss": 0.1123, + "step": 44163 + }, + { + "epoch": 0.3833647277367384, + "grad_norm": 0.2412109375, + "learning_rate": 0.0012286377041717627, + "loss": 0.1104, + "step": 44164 + }, + { + "epoch": 0.38337340821694255, + "grad_norm": 0.37890625, + "learning_rate": 0.0012286082472251745, + "loss": 0.1133, + "step": 44165 + }, + { + "epoch": 0.38338208869714674, + "grad_norm": 0.19921875, + "learning_rate": 0.0012285787901379426, + "loss": 0.1084, + "step": 44166 + }, + { + "epoch": 0.3833907691773509, + "grad_norm": 0.498046875, + "learning_rate": 0.0012285493329100986, + "loss": 0.127, + "step": 44167 + }, + { + "epoch": 0.38339944965755507, + "grad_norm": 0.2060546875, + "learning_rate": 0.0012285198755416751, + "loss": 0.0718, + "step": 44168 + }, + { + "epoch": 0.3834081301377592, + "grad_norm": 0.294921875, + "learning_rate": 0.001228490418032704, + "loss": 0.0684, + "step": 44169 + }, + { + "epoch": 0.3834168106179634, + "grad_norm": 0.306640625, + "learning_rate": 0.001228460960383218, + "loss": 0.0933, + "step": 44170 + }, + { + "epoch": 0.38342549109816754, + "grad_norm": 0.416015625, + "learning_rate": 0.0012284315025932488, + "loss": 0.0713, + "step": 44171 + }, + { + "epoch": 0.38343417157837173, + "grad_norm": 0.16796875, + "learning_rate": 0.001228402044662829, + "loss": 0.0864, + "step": 44172 + }, + { + "epoch": 0.38344285205857587, + "grad_norm": 0.087890625, + "learning_rate": 0.0012283725865919906, + "loss": 0.1011, + "step": 44173 + }, + { + "epoch": 0.38345153253878006, + "grad_norm": 0.421875, + "learning_rate": 0.001228343128380766, + "loss": 0.0732, + "step": 44174 + }, + { + "epoch": 0.3834602130189842, + "grad_norm": 0.275390625, + "learning_rate": 0.001228313670029187, + "loss": 0.083, + "step": 44175 + }, + { + "epoch": 0.3834688934991884, + "grad_norm": 0.1591796875, + "learning_rate": 0.001228284211537286, + "loss": 0.0962, + "step": 44176 + }, + { + "epoch": 0.3834775739793925, + "grad_norm": 0.1728515625, + "learning_rate": 0.0012282547529050957, + "loss": 0.0913, + "step": 44177 + }, + { + "epoch": 0.3834862544595967, + "grad_norm": 0.65234375, + "learning_rate": 0.0012282252941326478, + "loss": 
0.1279, + "step": 44178 + }, + { + "epoch": 0.38349493493980086, + "grad_norm": 0.2255859375, + "learning_rate": 0.0012281958352199744, + "loss": 0.1196, + "step": 44179 + }, + { + "epoch": 0.38350361542000505, + "grad_norm": 0.439453125, + "learning_rate": 0.0012281663761671085, + "loss": 0.1396, + "step": 44180 + }, + { + "epoch": 0.3835122959002092, + "grad_norm": 0.08544921875, + "learning_rate": 0.0012281369169740814, + "loss": 0.085, + "step": 44181 + }, + { + "epoch": 0.3835209763804134, + "grad_norm": 0.5703125, + "learning_rate": 0.0012281074576409258, + "loss": 0.0732, + "step": 44182 + }, + { + "epoch": 0.3835296568606175, + "grad_norm": 0.1845703125, + "learning_rate": 0.0012280779981676737, + "loss": 0.1138, + "step": 44183 + }, + { + "epoch": 0.3835383373408217, + "grad_norm": 0.150390625, + "learning_rate": 0.0012280485385543576, + "loss": 0.207, + "step": 44184 + }, + { + "epoch": 0.38354701782102585, + "grad_norm": 0.75390625, + "learning_rate": 0.0012280190788010095, + "loss": 0.0918, + "step": 44185 + }, + { + "epoch": 0.38355569830123004, + "grad_norm": 0.318359375, + "learning_rate": 0.0012279896189076615, + "loss": 0.0947, + "step": 44186 + }, + { + "epoch": 0.3835643787814342, + "grad_norm": 0.212890625, + "learning_rate": 0.001227960158874346, + "loss": 0.127, + "step": 44187 + }, + { + "epoch": 0.38357305926163837, + "grad_norm": 0.34375, + "learning_rate": 0.0012279306987010952, + "loss": 0.1631, + "step": 44188 + }, + { + "epoch": 0.3835817397418425, + "grad_norm": 0.08154296875, + "learning_rate": 0.0012279012383879413, + "loss": 0.0737, + "step": 44189 + }, + { + "epoch": 0.3835904202220467, + "grad_norm": 0.6328125, + "learning_rate": 0.001227871777934917, + "loss": 0.1279, + "step": 44190 + }, + { + "epoch": 0.38359910070225084, + "grad_norm": 0.56640625, + "learning_rate": 0.0012278423173420533, + "loss": 0.0869, + "step": 44191 + }, + { + "epoch": 0.38360778118245503, + "grad_norm": 0.400390625, + "learning_rate": 0.0012278128566093835, + "loss": 0.1143, + "step": 44192 + }, + { + "epoch": 0.38361646166265917, + "grad_norm": 0.1318359375, + "learning_rate": 0.0012277833957369396, + "loss": 0.0776, + "step": 44193 + }, + { + "epoch": 0.38362514214286336, + "grad_norm": 0.1806640625, + "learning_rate": 0.0012277539347247538, + "loss": 0.0879, + "step": 44194 + }, + { + "epoch": 0.3836338226230675, + "grad_norm": 0.392578125, + "learning_rate": 0.0012277244735728577, + "loss": 0.0918, + "step": 44195 + }, + { + "epoch": 0.3836425031032717, + "grad_norm": 0.1728515625, + "learning_rate": 0.0012276950122812843, + "loss": 0.1221, + "step": 44196 + }, + { + "epoch": 0.38365118358347583, + "grad_norm": 0.388671875, + "learning_rate": 0.0012276655508500656, + "loss": 0.1162, + "step": 44197 + }, + { + "epoch": 0.38365986406368, + "grad_norm": 0.1337890625, + "learning_rate": 0.0012276360892792338, + "loss": 0.1602, + "step": 44198 + }, + { + "epoch": 0.38366854454388416, + "grad_norm": 0.1416015625, + "learning_rate": 0.0012276066275688211, + "loss": 0.0728, + "step": 44199 + }, + { + "epoch": 0.38367722502408835, + "grad_norm": 0.427734375, + "learning_rate": 0.0012275771657188595, + "loss": 0.1211, + "step": 44200 + }, + { + "epoch": 0.3836859055042925, + "grad_norm": 0.0791015625, + "learning_rate": 0.001227547703729382, + "loss": 0.0996, + "step": 44201 + }, + { + "epoch": 0.3836945859844967, + "grad_norm": 0.56640625, + "learning_rate": 0.0012275182416004197, + "loss": 0.0996, + "step": 44202 + }, + { + "epoch": 0.3837032664647008, + "grad_norm": 0.1923828125, 
+ "learning_rate": 0.0012274887793320055, + "loss": 0.1113, + "step": 44203 + }, + { + "epoch": 0.383711946944905, + "grad_norm": 0.2158203125, + "learning_rate": 0.0012274593169241716, + "loss": 0.083, + "step": 44204 + }, + { + "epoch": 0.38372062742510915, + "grad_norm": 0.384765625, + "learning_rate": 0.00122742985437695, + "loss": 0.0933, + "step": 44205 + }, + { + "epoch": 0.38372930790531334, + "grad_norm": 0.3359375, + "learning_rate": 0.0012274003916903726, + "loss": 0.0835, + "step": 44206 + }, + { + "epoch": 0.3837379883855175, + "grad_norm": 0.85546875, + "learning_rate": 0.0012273709288644725, + "loss": 0.1748, + "step": 44207 + }, + { + "epoch": 0.3837466688657217, + "grad_norm": 0.14453125, + "learning_rate": 0.0012273414658992815, + "loss": 0.0996, + "step": 44208 + }, + { + "epoch": 0.3837553493459258, + "grad_norm": 0.244140625, + "learning_rate": 0.0012273120027948318, + "loss": 0.1074, + "step": 44209 + }, + { + "epoch": 0.38376402982613, + "grad_norm": 0.11865234375, + "learning_rate": 0.0012272825395511556, + "loss": 0.0728, + "step": 44210 + }, + { + "epoch": 0.38377271030633414, + "grad_norm": 0.2236328125, + "learning_rate": 0.001227253076168285, + "loss": 0.0693, + "step": 44211 + }, + { + "epoch": 0.38378139078653833, + "grad_norm": 0.11376953125, + "learning_rate": 0.0012272236126462525, + "loss": 0.1104, + "step": 44212 + }, + { + "epoch": 0.38379007126674247, + "grad_norm": 0.138671875, + "learning_rate": 0.00122719414898509, + "loss": 0.0737, + "step": 44213 + }, + { + "epoch": 0.38379875174694666, + "grad_norm": 0.28515625, + "learning_rate": 0.0012271646851848299, + "loss": 0.0825, + "step": 44214 + }, + { + "epoch": 0.3838074322271508, + "grad_norm": 0.5546875, + "learning_rate": 0.0012271352212455044, + "loss": 0.0884, + "step": 44215 + }, + { + "epoch": 0.383816112707355, + "grad_norm": 0.25390625, + "learning_rate": 0.001227105757167146, + "loss": 0.0869, + "step": 44216 + }, + { + "epoch": 0.38382479318755913, + "grad_norm": 0.85546875, + "learning_rate": 0.0012270762929497863, + "loss": 0.1543, + "step": 44217 + }, + { + "epoch": 0.3838334736677633, + "grad_norm": 0.2158203125, + "learning_rate": 0.001227046828593458, + "loss": 0.1074, + "step": 44218 + }, + { + "epoch": 0.38384215414796746, + "grad_norm": 0.259765625, + "learning_rate": 0.0012270173640981932, + "loss": 0.1357, + "step": 44219 + }, + { + "epoch": 0.38385083462817166, + "grad_norm": 0.306640625, + "learning_rate": 0.0012269878994640242, + "loss": 0.106, + "step": 44220 + }, + { + "epoch": 0.3838595151083758, + "grad_norm": 0.1279296875, + "learning_rate": 0.001226958434690983, + "loss": 0.1182, + "step": 44221 + }, + { + "epoch": 0.38386819558858, + "grad_norm": 0.2080078125, + "learning_rate": 0.0012269289697791023, + "loss": 0.1143, + "step": 44222 + }, + { + "epoch": 0.3838768760687841, + "grad_norm": 0.07470703125, + "learning_rate": 0.0012268995047284136, + "loss": 0.0835, + "step": 44223 + }, + { + "epoch": 0.3838855565489883, + "grad_norm": 0.52734375, + "learning_rate": 0.0012268700395389495, + "loss": 0.0991, + "step": 44224 + }, + { + "epoch": 0.38389423702919245, + "grad_norm": 0.404296875, + "learning_rate": 0.0012268405742107426, + "loss": 0.1021, + "step": 44225 + }, + { + "epoch": 0.38390291750939665, + "grad_norm": 0.458984375, + "learning_rate": 0.0012268111087438242, + "loss": 0.127, + "step": 44226 + }, + { + "epoch": 0.3839115979896008, + "grad_norm": 0.10888671875, + "learning_rate": 0.0012267816431382275, + "loss": 0.1113, + "step": 44227 + }, + { + "epoch": 
0.383920278469805, + "grad_norm": 0.1923828125, + "learning_rate": 0.0012267521773939843, + "loss": 0.1108, + "step": 44228 + }, + { + "epoch": 0.3839289589500091, + "grad_norm": 0.232421875, + "learning_rate": 0.001226722711511127, + "loss": 0.0928, + "step": 44229 + }, + { + "epoch": 0.3839376394302133, + "grad_norm": 0.5234375, + "learning_rate": 0.001226693245489687, + "loss": 0.084, + "step": 44230 + }, + { + "epoch": 0.38394631991041744, + "grad_norm": 0.08984375, + "learning_rate": 0.0012266637793296978, + "loss": 0.1025, + "step": 44231 + }, + { + "epoch": 0.38395500039062164, + "grad_norm": 0.25390625, + "learning_rate": 0.0012266343130311907, + "loss": 0.0742, + "step": 44232 + }, + { + "epoch": 0.3839636808708258, + "grad_norm": 0.10400390625, + "learning_rate": 0.0012266048465941985, + "loss": 0.1348, + "step": 44233 + }, + { + "epoch": 0.3839723613510299, + "grad_norm": 0.244140625, + "learning_rate": 0.0012265753800187525, + "loss": 0.1182, + "step": 44234 + }, + { + "epoch": 0.3839810418312341, + "grad_norm": 1.8359375, + "learning_rate": 0.001226545913304886, + "loss": 0.1143, + "step": 44235 + }, + { + "epoch": 0.38398972231143824, + "grad_norm": 0.41015625, + "learning_rate": 0.001226516446452631, + "loss": 0.0894, + "step": 44236 + }, + { + "epoch": 0.38399840279164243, + "grad_norm": 0.578125, + "learning_rate": 0.0012264869794620192, + "loss": 0.1045, + "step": 44237 + }, + { + "epoch": 0.38400708327184657, + "grad_norm": 0.10888671875, + "learning_rate": 0.0012264575123330833, + "loss": 0.0952, + "step": 44238 + }, + { + "epoch": 0.38401576375205076, + "grad_norm": 0.1796875, + "learning_rate": 0.0012264280450658552, + "loss": 0.0996, + "step": 44239 + }, + { + "epoch": 0.3840244442322549, + "grad_norm": 0.306640625, + "learning_rate": 0.0012263985776603673, + "loss": 0.0864, + "step": 44240 + }, + { + "epoch": 0.3840331247124591, + "grad_norm": 0.09765625, + "learning_rate": 0.0012263691101166518, + "loss": 0.0908, + "step": 44241 + }, + { + "epoch": 0.38404180519266323, + "grad_norm": 0.400390625, + "learning_rate": 0.0012263396424347412, + "loss": 0.0889, + "step": 44242 + }, + { + "epoch": 0.3840504856728674, + "grad_norm": 0.11865234375, + "learning_rate": 0.0012263101746146671, + "loss": 0.1387, + "step": 44243 + }, + { + "epoch": 0.38405916615307156, + "grad_norm": 0.171875, + "learning_rate": 0.0012262807066564622, + "loss": 0.0815, + "step": 44244 + }, + { + "epoch": 0.38406784663327576, + "grad_norm": 0.11767578125, + "learning_rate": 0.0012262512385601586, + "loss": 0.0864, + "step": 44245 + }, + { + "epoch": 0.3840765271134799, + "grad_norm": 0.67578125, + "learning_rate": 0.001226221770325789, + "loss": 0.0664, + "step": 44246 + }, + { + "epoch": 0.3840852075936841, + "grad_norm": 0.298828125, + "learning_rate": 0.0012261923019533845, + "loss": 0.123, + "step": 44247 + }, + { + "epoch": 0.3840938880738882, + "grad_norm": 1.0625, + "learning_rate": 0.0012261628334429783, + "loss": 0.1113, + "step": 44248 + }, + { + "epoch": 0.3841025685540924, + "grad_norm": 0.318359375, + "learning_rate": 0.0012261333647946024, + "loss": 0.0913, + "step": 44249 + }, + { + "epoch": 0.38411124903429655, + "grad_norm": 0.17578125, + "learning_rate": 0.0012261038960082891, + "loss": 0.0967, + "step": 44250 + }, + { + "epoch": 0.38411992951450075, + "grad_norm": 0.2001953125, + "learning_rate": 0.00122607442708407, + "loss": 0.124, + "step": 44251 + }, + { + "epoch": 0.3841286099947049, + "grad_norm": 0.1259765625, + "learning_rate": 0.001226044958021978, + "loss": 0.0864, + 
"step": 44252 + }, + { + "epoch": 0.3841372904749091, + "grad_norm": 0.06494140625, + "learning_rate": 0.0012260154888220453, + "loss": 0.0737, + "step": 44253 + }, + { + "epoch": 0.3841459709551132, + "grad_norm": 0.12060546875, + "learning_rate": 0.0012259860194843038, + "loss": 0.0942, + "step": 44254 + }, + { + "epoch": 0.3841546514353174, + "grad_norm": 0.41015625, + "learning_rate": 0.001225956550008786, + "loss": 0.0928, + "step": 44255 + }, + { + "epoch": 0.38416333191552154, + "grad_norm": 0.1591796875, + "learning_rate": 0.0012259270803955235, + "loss": 0.0791, + "step": 44256 + }, + { + "epoch": 0.38417201239572574, + "grad_norm": 0.1494140625, + "learning_rate": 0.0012258976106445498, + "loss": 0.1074, + "step": 44257 + }, + { + "epoch": 0.3841806928759299, + "grad_norm": 0.77734375, + "learning_rate": 0.001225868140755896, + "loss": 0.124, + "step": 44258 + }, + { + "epoch": 0.38418937335613407, + "grad_norm": 0.275390625, + "learning_rate": 0.001225838670729595, + "loss": 0.0957, + "step": 44259 + }, + { + "epoch": 0.3841980538363382, + "grad_norm": 0.080078125, + "learning_rate": 0.0012258092005656782, + "loss": 0.0776, + "step": 44260 + }, + { + "epoch": 0.3842067343165424, + "grad_norm": 0.0712890625, + "learning_rate": 0.0012257797302641787, + "loss": 0.0732, + "step": 44261 + }, + { + "epoch": 0.38421541479674653, + "grad_norm": 0.10546875, + "learning_rate": 0.0012257502598251283, + "loss": 0.0986, + "step": 44262 + }, + { + "epoch": 0.38422409527695073, + "grad_norm": 0.09814453125, + "learning_rate": 0.0012257207892485589, + "loss": 0.1367, + "step": 44263 + }, + { + "epoch": 0.38423277575715487, + "grad_norm": 0.306640625, + "learning_rate": 0.0012256913185345036, + "loss": 0.1562, + "step": 44264 + }, + { + "epoch": 0.38424145623735906, + "grad_norm": 0.1337890625, + "learning_rate": 0.0012256618476829942, + "loss": 0.0918, + "step": 44265 + }, + { + "epoch": 0.3842501367175632, + "grad_norm": 0.13671875, + "learning_rate": 0.0012256323766940626, + "loss": 0.1221, + "step": 44266 + }, + { + "epoch": 0.3842588171977674, + "grad_norm": 0.10498046875, + "learning_rate": 0.0012256029055677416, + "loss": 0.0938, + "step": 44267 + }, + { + "epoch": 0.3842674976779715, + "grad_norm": 0.279296875, + "learning_rate": 0.001225573434304063, + "loss": 0.0977, + "step": 44268 + }, + { + "epoch": 0.3842761781581757, + "grad_norm": 0.1650390625, + "learning_rate": 0.001225543962903059, + "loss": 0.082, + "step": 44269 + }, + { + "epoch": 0.38428485863837986, + "grad_norm": 10.4375, + "learning_rate": 0.0012255144913647625, + "loss": 0.3105, + "step": 44270 + }, + { + "epoch": 0.38429353911858405, + "grad_norm": 0.66015625, + "learning_rate": 0.001225485019689205, + "loss": 0.1006, + "step": 44271 + }, + { + "epoch": 0.3843022195987882, + "grad_norm": 0.205078125, + "learning_rate": 0.0012254555478764188, + "loss": 0.1074, + "step": 44272 + }, + { + "epoch": 0.3843109000789924, + "grad_norm": 0.0859375, + "learning_rate": 0.0012254260759264363, + "loss": 0.1055, + "step": 44273 + }, + { + "epoch": 0.3843195805591965, + "grad_norm": 0.29296875, + "learning_rate": 0.0012253966038392897, + "loss": 0.1006, + "step": 44274 + }, + { + "epoch": 0.3843282610394007, + "grad_norm": 0.1259765625, + "learning_rate": 0.0012253671316150114, + "loss": 0.1211, + "step": 44275 + }, + { + "epoch": 0.38433694151960485, + "grad_norm": 0.15234375, + "learning_rate": 0.0012253376592536336, + "loss": 0.0913, + "step": 44276 + }, + { + "epoch": 0.38434562199980904, + "grad_norm": 0.06494140625, + 
"learning_rate": 0.0012253081867551884, + "loss": 0.0708, + "step": 44277 + }, + { + "epoch": 0.3843543024800132, + "grad_norm": 0.388671875, + "learning_rate": 0.001225278714119708, + "loss": 0.1104, + "step": 44278 + }, + { + "epoch": 0.38436298296021737, + "grad_norm": 0.3671875, + "learning_rate": 0.0012252492413472246, + "loss": 0.0918, + "step": 44279 + }, + { + "epoch": 0.3843716634404215, + "grad_norm": 0.390625, + "learning_rate": 0.0012252197684377707, + "loss": 0.1133, + "step": 44280 + }, + { + "epoch": 0.3843803439206257, + "grad_norm": 0.09130859375, + "learning_rate": 0.0012251902953913782, + "loss": 0.0728, + "step": 44281 + }, + { + "epoch": 0.38438902440082984, + "grad_norm": 0.251953125, + "learning_rate": 0.0012251608222080793, + "loss": 0.1143, + "step": 44282 + }, + { + "epoch": 0.38439770488103403, + "grad_norm": 0.2158203125, + "learning_rate": 0.0012251313488879064, + "loss": 0.0806, + "step": 44283 + }, + { + "epoch": 0.38440638536123817, + "grad_norm": 1.0234375, + "learning_rate": 0.001225101875430892, + "loss": 0.0908, + "step": 44284 + }, + { + "epoch": 0.38441506584144236, + "grad_norm": 0.1357421875, + "learning_rate": 0.001225072401837068, + "loss": 0.1055, + "step": 44285 + }, + { + "epoch": 0.3844237463216465, + "grad_norm": 0.302734375, + "learning_rate": 0.0012250429281064667, + "loss": 0.1289, + "step": 44286 + }, + { + "epoch": 0.3844324268018507, + "grad_norm": 0.171875, + "learning_rate": 0.0012250134542391203, + "loss": 0.0796, + "step": 44287 + }, + { + "epoch": 0.38444110728205483, + "grad_norm": 0.271484375, + "learning_rate": 0.001224983980235061, + "loss": 0.1152, + "step": 44288 + }, + { + "epoch": 0.384449787762259, + "grad_norm": 0.3359375, + "learning_rate": 0.0012249545060943213, + "loss": 0.1992, + "step": 44289 + }, + { + "epoch": 0.38445846824246316, + "grad_norm": 0.1201171875, + "learning_rate": 0.001224925031816933, + "loss": 0.0771, + "step": 44290 + }, + { + "epoch": 0.38446714872266735, + "grad_norm": 0.2080078125, + "learning_rate": 0.0012248955574029289, + "loss": 0.0923, + "step": 44291 + }, + { + "epoch": 0.3844758292028715, + "grad_norm": 0.384765625, + "learning_rate": 0.0012248660828523405, + "loss": 0.1123, + "step": 44292 + }, + { + "epoch": 0.3844845096830757, + "grad_norm": 0.43359375, + "learning_rate": 0.0012248366081652005, + "loss": 0.1006, + "step": 44293 + }, + { + "epoch": 0.3844931901632798, + "grad_norm": 0.5, + "learning_rate": 0.0012248071333415412, + "loss": 0.0884, + "step": 44294 + }, + { + "epoch": 0.384501870643484, + "grad_norm": 0.29296875, + "learning_rate": 0.0012247776583813946, + "loss": 0.124, + "step": 44295 + }, + { + "epoch": 0.38451055112368815, + "grad_norm": 0.2890625, + "learning_rate": 0.001224748183284793, + "loss": 0.0933, + "step": 44296 + }, + { + "epoch": 0.38451923160389234, + "grad_norm": 0.498046875, + "learning_rate": 0.001224718708051769, + "loss": 0.1177, + "step": 44297 + }, + { + "epoch": 0.3845279120840965, + "grad_norm": 0.65234375, + "learning_rate": 0.0012246892326823542, + "loss": 0.0752, + "step": 44298 + }, + { + "epoch": 0.38453659256430067, + "grad_norm": 0.1640625, + "learning_rate": 0.0012246597571765809, + "loss": 0.064, + "step": 44299 + }, + { + "epoch": 0.3845452730445048, + "grad_norm": 0.0927734375, + "learning_rate": 0.001224630281534482, + "loss": 0.1123, + "step": 44300 + }, + { + "epoch": 0.384553953524709, + "grad_norm": 0.318359375, + "learning_rate": 0.001224600805756089, + "loss": 0.1094, + "step": 44301 + }, + { + "epoch": 0.38456263400491314, + 
"grad_norm": 0.3046875, + "learning_rate": 0.0012245713298414346, + "loss": 0.1211, + "step": 44302 + }, + { + "epoch": 0.38457131448511733, + "grad_norm": 0.228515625, + "learning_rate": 0.0012245418537905506, + "loss": 0.1084, + "step": 44303 + }, + { + "epoch": 0.38457999496532147, + "grad_norm": 0.05322265625, + "learning_rate": 0.0012245123776034696, + "loss": 0.0679, + "step": 44304 + }, + { + "epoch": 0.38458867544552566, + "grad_norm": 0.478515625, + "learning_rate": 0.001224482901280224, + "loss": 0.1299, + "step": 44305 + }, + { + "epoch": 0.3845973559257298, + "grad_norm": 0.498046875, + "learning_rate": 0.0012244534248208457, + "loss": 0.1191, + "step": 44306 + }, + { + "epoch": 0.384606036405934, + "grad_norm": 0.275390625, + "learning_rate": 0.001224423948225367, + "loss": 0.1133, + "step": 44307 + }, + { + "epoch": 0.38461471688613813, + "grad_norm": 0.7109375, + "learning_rate": 0.00122439447149382, + "loss": 0.0781, + "step": 44308 + }, + { + "epoch": 0.3846233973663423, + "grad_norm": 0.0859375, + "learning_rate": 0.0012243649946262372, + "loss": 0.0713, + "step": 44309 + }, + { + "epoch": 0.38463207784654646, + "grad_norm": 0.5546875, + "learning_rate": 0.0012243355176226505, + "loss": 0.0981, + "step": 44310 + }, + { + "epoch": 0.38464075832675065, + "grad_norm": 0.21875, + "learning_rate": 0.0012243060404830927, + "loss": 0.0938, + "step": 44311 + }, + { + "epoch": 0.3846494388069548, + "grad_norm": 0.1484375, + "learning_rate": 0.0012242765632075952, + "loss": 0.1001, + "step": 44312 + }, + { + "epoch": 0.384658119287159, + "grad_norm": 0.1904296875, + "learning_rate": 0.001224247085796191, + "loss": 0.1113, + "step": 44313 + }, + { + "epoch": 0.3846667997673631, + "grad_norm": 0.07958984375, + "learning_rate": 0.001224217608248912, + "loss": 0.123, + "step": 44314 + }, + { + "epoch": 0.3846754802475673, + "grad_norm": 0.435546875, + "learning_rate": 0.0012241881305657908, + "loss": 0.1064, + "step": 44315 + }, + { + "epoch": 0.38468416072777145, + "grad_norm": 0.2021484375, + "learning_rate": 0.0012241586527468589, + "loss": 0.1035, + "step": 44316 + }, + { + "epoch": 0.38469284120797564, + "grad_norm": 0.55078125, + "learning_rate": 0.001224129174792149, + "loss": 0.105, + "step": 44317 + }, + { + "epoch": 0.3847015216881798, + "grad_norm": 0.484375, + "learning_rate": 0.0012240996967016935, + "loss": 0.1006, + "step": 44318 + }, + { + "epoch": 0.384710202168384, + "grad_norm": 0.08154296875, + "learning_rate": 0.0012240702184755246, + "loss": 0.0859, + "step": 44319 + }, + { + "epoch": 0.3847188826485881, + "grad_norm": 0.1328125, + "learning_rate": 0.0012240407401136737, + "loss": 0.1152, + "step": 44320 + }, + { + "epoch": 0.3847275631287923, + "grad_norm": 0.103515625, + "learning_rate": 0.0012240112616161744, + "loss": 0.1133, + "step": 44321 + }, + { + "epoch": 0.38473624360899644, + "grad_norm": 0.07177734375, + "learning_rate": 0.001223981782983058, + "loss": 0.084, + "step": 44322 + }, + { + "epoch": 0.38474492408920064, + "grad_norm": 0.4296875, + "learning_rate": 0.0012239523042143568, + "loss": 0.0928, + "step": 44323 + }, + { + "epoch": 0.3847536045694048, + "grad_norm": 0.0849609375, + "learning_rate": 0.0012239228253101036, + "loss": 0.0723, + "step": 44324 + }, + { + "epoch": 0.38476228504960897, + "grad_norm": 0.5390625, + "learning_rate": 0.0012238933462703299, + "loss": 0.1045, + "step": 44325 + }, + { + "epoch": 0.3847709655298131, + "grad_norm": 0.291015625, + "learning_rate": 0.0012238638670950684, + "loss": 0.0854, + "step": 44326 + }, + { + 
"epoch": 0.3847796460100173, + "grad_norm": 0.1376953125, + "learning_rate": 0.0012238343877843513, + "loss": 0.0986, + "step": 44327 + }, + { + "epoch": 0.38478832649022143, + "grad_norm": 0.1728515625, + "learning_rate": 0.001223804908338211, + "loss": 0.1064, + "step": 44328 + }, + { + "epoch": 0.3847970069704256, + "grad_norm": 0.189453125, + "learning_rate": 0.0012237754287566791, + "loss": 0.0752, + "step": 44329 + }, + { + "epoch": 0.38480568745062976, + "grad_norm": 0.318359375, + "learning_rate": 0.0012237459490397884, + "loss": 0.1289, + "step": 44330 + }, + { + "epoch": 0.38481436793083396, + "grad_norm": 0.24609375, + "learning_rate": 0.0012237164691875709, + "loss": 0.105, + "step": 44331 + }, + { + "epoch": 0.3848230484110381, + "grad_norm": 0.2421875, + "learning_rate": 0.0012236869892000592, + "loss": 0.1621, + "step": 44332 + }, + { + "epoch": 0.3848317288912423, + "grad_norm": 0.8671875, + "learning_rate": 0.001223657509077285, + "loss": 0.1182, + "step": 44333 + }, + { + "epoch": 0.3848404093714464, + "grad_norm": 0.1357421875, + "learning_rate": 0.001223628028819281, + "loss": 0.0986, + "step": 44334 + }, + { + "epoch": 0.3848490898516506, + "grad_norm": 0.111328125, + "learning_rate": 0.0012235985484260793, + "loss": 0.106, + "step": 44335 + }, + { + "epoch": 0.38485777033185475, + "grad_norm": 0.4296875, + "learning_rate": 0.0012235690678977118, + "loss": 0.1484, + "step": 44336 + }, + { + "epoch": 0.38486645081205895, + "grad_norm": 0.177734375, + "learning_rate": 0.0012235395872342112, + "loss": 0.0815, + "step": 44337 + }, + { + "epoch": 0.3848751312922631, + "grad_norm": 0.150390625, + "learning_rate": 0.0012235101064356096, + "loss": 0.0889, + "step": 44338 + }, + { + "epoch": 0.3848838117724673, + "grad_norm": 0.07958984375, + "learning_rate": 0.001223480625501939, + "loss": 0.0884, + "step": 44339 + }, + { + "epoch": 0.3848924922526714, + "grad_norm": 0.4375, + "learning_rate": 0.0012234511444332318, + "loss": 0.1025, + "step": 44340 + }, + { + "epoch": 0.3849011727328756, + "grad_norm": 0.361328125, + "learning_rate": 0.0012234216632295207, + "loss": 0.0864, + "step": 44341 + }, + { + "epoch": 0.38490985321307974, + "grad_norm": 0.0927734375, + "learning_rate": 0.001223392181890837, + "loss": 0.1143, + "step": 44342 + }, + { + "epoch": 0.38491853369328394, + "grad_norm": 0.20703125, + "learning_rate": 0.0012233627004172138, + "loss": 0.1475, + "step": 44343 + }, + { + "epoch": 0.3849272141734881, + "grad_norm": 0.251953125, + "learning_rate": 0.0012233332188086833, + "loss": 0.1045, + "step": 44344 + }, + { + "epoch": 0.38493589465369227, + "grad_norm": 0.2080078125, + "learning_rate": 0.001223303737065277, + "loss": 0.0859, + "step": 44345 + }, + { + "epoch": 0.3849445751338964, + "grad_norm": 0.0771484375, + "learning_rate": 0.0012232742551870276, + "loss": 0.0776, + "step": 44346 + }, + { + "epoch": 0.3849532556141006, + "grad_norm": 0.203125, + "learning_rate": 0.0012232447731739676, + "loss": 0.1035, + "step": 44347 + }, + { + "epoch": 0.38496193609430474, + "grad_norm": 0.36328125, + "learning_rate": 0.0012232152910261287, + "loss": 0.0859, + "step": 44348 + }, + { + "epoch": 0.38497061657450893, + "grad_norm": 0.1181640625, + "learning_rate": 0.0012231858087435434, + "loss": 0.0859, + "step": 44349 + }, + { + "epoch": 0.38497929705471307, + "grad_norm": 0.115234375, + "learning_rate": 0.0012231563263262442, + "loss": 0.082, + "step": 44350 + }, + { + "epoch": 0.38498797753491726, + "grad_norm": 0.8515625, + "learning_rate": 0.0012231268437742626, + 
"loss": 0.1157, + "step": 44351 + }, + { + "epoch": 0.3849966580151214, + "grad_norm": 0.1826171875, + "learning_rate": 0.0012230973610876316, + "loss": 0.1289, + "step": 44352 + }, + { + "epoch": 0.3850053384953256, + "grad_norm": 0.08935546875, + "learning_rate": 0.0012230678782663837, + "loss": 0.1113, + "step": 44353 + }, + { + "epoch": 0.3850140189755297, + "grad_norm": 0.515625, + "learning_rate": 0.00122303839531055, + "loss": 0.1006, + "step": 44354 + }, + { + "epoch": 0.3850226994557339, + "grad_norm": 0.1142578125, + "learning_rate": 0.0012230089122201634, + "loss": 0.1157, + "step": 44355 + }, + { + "epoch": 0.38503137993593806, + "grad_norm": 0.142578125, + "learning_rate": 0.0012229794289952563, + "loss": 0.0903, + "step": 44356 + }, + { + "epoch": 0.3850400604161422, + "grad_norm": 0.13671875, + "learning_rate": 0.0012229499456358607, + "loss": 0.1348, + "step": 44357 + }, + { + "epoch": 0.3850487408963464, + "grad_norm": 0.3046875, + "learning_rate": 0.0012229204621420089, + "loss": 0.1123, + "step": 44358 + }, + { + "epoch": 0.3850574213765505, + "grad_norm": 0.0771484375, + "learning_rate": 0.001222890978513733, + "loss": 0.0845, + "step": 44359 + }, + { + "epoch": 0.3850661018567547, + "grad_norm": 0.1982421875, + "learning_rate": 0.0012228614947510652, + "loss": 0.1074, + "step": 44360 + }, + { + "epoch": 0.38507478233695885, + "grad_norm": 0.1904296875, + "learning_rate": 0.0012228320108540382, + "loss": 0.1318, + "step": 44361 + }, + { + "epoch": 0.38508346281716305, + "grad_norm": 0.2451171875, + "learning_rate": 0.001222802526822684, + "loss": 0.1211, + "step": 44362 + }, + { + "epoch": 0.3850921432973672, + "grad_norm": 0.1630859375, + "learning_rate": 0.001222773042657035, + "loss": 0.0718, + "step": 44363 + }, + { + "epoch": 0.3851008237775714, + "grad_norm": 0.416015625, + "learning_rate": 0.0012227435583571228, + "loss": 0.0859, + "step": 44364 + }, + { + "epoch": 0.3851095042577755, + "grad_norm": 0.73828125, + "learning_rate": 0.0012227140739229803, + "loss": 0.0767, + "step": 44365 + }, + { + "epoch": 0.3851181847379797, + "grad_norm": 0.10205078125, + "learning_rate": 0.0012226845893546395, + "loss": 0.0786, + "step": 44366 + }, + { + "epoch": 0.38512686521818384, + "grad_norm": 0.330078125, + "learning_rate": 0.0012226551046521329, + "loss": 0.0781, + "step": 44367 + }, + { + "epoch": 0.38513554569838804, + "grad_norm": 0.2734375, + "learning_rate": 0.0012226256198154919, + "loss": 0.0767, + "step": 44368 + }, + { + "epoch": 0.3851442261785922, + "grad_norm": 0.10302734375, + "learning_rate": 0.0012225961348447498, + "loss": 0.0928, + "step": 44369 + }, + { + "epoch": 0.38515290665879637, + "grad_norm": 0.74609375, + "learning_rate": 0.0012225666497399383, + "loss": 0.1113, + "step": 44370 + }, + { + "epoch": 0.3851615871390005, + "grad_norm": 0.171875, + "learning_rate": 0.00122253716450109, + "loss": 0.0981, + "step": 44371 + }, + { + "epoch": 0.3851702676192047, + "grad_norm": 0.5234375, + "learning_rate": 0.0012225076791282366, + "loss": 0.1221, + "step": 44372 + }, + { + "epoch": 0.38517894809940884, + "grad_norm": 0.0869140625, + "learning_rate": 0.0012224781936214107, + "loss": 0.0884, + "step": 44373 + }, + { + "epoch": 0.38518762857961303, + "grad_norm": 0.73046875, + "learning_rate": 0.0012224487079806445, + "loss": 0.0815, + "step": 44374 + }, + { + "epoch": 0.38519630905981717, + "grad_norm": 0.46484375, + "learning_rate": 0.0012224192222059704, + "loss": 0.0962, + "step": 44375 + }, + { + "epoch": 0.38520498954002136, + "grad_norm": 
0.197265625, + "learning_rate": 0.0012223897362974205, + "loss": 0.084, + "step": 44376 + }, + { + "epoch": 0.3852136700202255, + "grad_norm": 0.1962890625, + "learning_rate": 0.0012223602502550268, + "loss": 0.0986, + "step": 44377 + }, + { + "epoch": 0.3852223505004297, + "grad_norm": 0.7421875, + "learning_rate": 0.001222330764078822, + "loss": 0.1055, + "step": 44378 + }, + { + "epoch": 0.3852310309806338, + "grad_norm": 0.1513671875, + "learning_rate": 0.0012223012777688377, + "loss": 0.1426, + "step": 44379 + }, + { + "epoch": 0.385239711460838, + "grad_norm": 0.2216796875, + "learning_rate": 0.0012222717913251068, + "loss": 0.0747, + "step": 44380 + }, + { + "epoch": 0.38524839194104216, + "grad_norm": 0.515625, + "learning_rate": 0.0012222423047476615, + "loss": 0.1348, + "step": 44381 + }, + { + "epoch": 0.38525707242124635, + "grad_norm": 0.3671875, + "learning_rate": 0.0012222128180365336, + "loss": 0.0996, + "step": 44382 + }, + { + "epoch": 0.3852657529014505, + "grad_norm": 0.203125, + "learning_rate": 0.0012221833311917556, + "loss": 0.1377, + "step": 44383 + }, + { + "epoch": 0.3852744333816547, + "grad_norm": 0.28515625, + "learning_rate": 0.00122215384421336, + "loss": 0.1113, + "step": 44384 + }, + { + "epoch": 0.3852831138618588, + "grad_norm": 0.0859375, + "learning_rate": 0.0012221243571013784, + "loss": 0.1201, + "step": 44385 + }, + { + "epoch": 0.385291794342063, + "grad_norm": 0.357421875, + "learning_rate": 0.0012220948698558438, + "loss": 0.085, + "step": 44386 + }, + { + "epoch": 0.38530047482226715, + "grad_norm": 1.109375, + "learning_rate": 0.001222065382476788, + "loss": 0.1055, + "step": 44387 + }, + { + "epoch": 0.38530915530247134, + "grad_norm": 0.1064453125, + "learning_rate": 0.001222035894964243, + "loss": 0.1133, + "step": 44388 + }, + { + "epoch": 0.3853178357826755, + "grad_norm": 0.228515625, + "learning_rate": 0.001222006407318242, + "loss": 0.0771, + "step": 44389 + }, + { + "epoch": 0.38532651626287967, + "grad_norm": 0.058349609375, + "learning_rate": 0.001221976919538816, + "loss": 0.0522, + "step": 44390 + }, + { + "epoch": 0.3853351967430838, + "grad_norm": 0.30078125, + "learning_rate": 0.001221947431625998, + "loss": 0.0869, + "step": 44391 + }, + { + "epoch": 0.385343877223288, + "grad_norm": 0.2490234375, + "learning_rate": 0.0012219179435798205, + "loss": 0.0791, + "step": 44392 + }, + { + "epoch": 0.38535255770349214, + "grad_norm": 0.1884765625, + "learning_rate": 0.001221888455400315, + "loss": 0.0703, + "step": 44393 + }, + { + "epoch": 0.38536123818369633, + "grad_norm": 0.1396484375, + "learning_rate": 0.0012218589670875142, + "loss": 0.1084, + "step": 44394 + }, + { + "epoch": 0.38536991866390047, + "grad_norm": 0.10986328125, + "learning_rate": 0.0012218294786414505, + "loss": 0.0874, + "step": 44395 + }, + { + "epoch": 0.38537859914410466, + "grad_norm": 0.73046875, + "learning_rate": 0.001221799990062156, + "loss": 0.1143, + "step": 44396 + }, + { + "epoch": 0.3853872796243088, + "grad_norm": 0.1630859375, + "learning_rate": 0.0012217705013496624, + "loss": 0.0859, + "step": 44397 + }, + { + "epoch": 0.385395960104513, + "grad_norm": 0.400390625, + "learning_rate": 0.0012217410125040028, + "loss": 0.0879, + "step": 44398 + }, + { + "epoch": 0.38540464058471713, + "grad_norm": 0.21875, + "learning_rate": 0.0012217115235252083, + "loss": 0.1006, + "step": 44399 + }, + { + "epoch": 0.3854133210649213, + "grad_norm": 0.26953125, + "learning_rate": 0.0012216820344133129, + "loss": 0.1328, + "step": 44400 + }, + { + "epoch": 
0.38542200154512546, + "grad_norm": 0.0927734375, + "learning_rate": 0.0012216525451683473, + "loss": 0.0845, + "step": 44401 + }, + { + "epoch": 0.38543068202532965, + "grad_norm": 0.1611328125, + "learning_rate": 0.0012216230557903446, + "loss": 0.1172, + "step": 44402 + }, + { + "epoch": 0.3854393625055338, + "grad_norm": 0.318359375, + "learning_rate": 0.0012215935662793363, + "loss": 0.1006, + "step": 44403 + }, + { + "epoch": 0.385448042985738, + "grad_norm": 0.3515625, + "learning_rate": 0.0012215640766353555, + "loss": 0.0889, + "step": 44404 + }, + { + "epoch": 0.3854567234659421, + "grad_norm": 0.9375, + "learning_rate": 0.001221534586858434, + "loss": 0.1465, + "step": 44405 + }, + { + "epoch": 0.3854654039461463, + "grad_norm": 0.66796875, + "learning_rate": 0.001221505096948604, + "loss": 0.1123, + "step": 44406 + }, + { + "epoch": 0.38547408442635045, + "grad_norm": 0.1396484375, + "learning_rate": 0.0012214756069058978, + "loss": 0.0767, + "step": 44407 + }, + { + "epoch": 0.38548276490655464, + "grad_norm": 0.2314453125, + "learning_rate": 0.0012214461167303478, + "loss": 0.1025, + "step": 44408 + }, + { + "epoch": 0.3854914453867588, + "grad_norm": 0.10498046875, + "learning_rate": 0.0012214166264219862, + "loss": 0.1074, + "step": 44409 + }, + { + "epoch": 0.385500125866963, + "grad_norm": 0.150390625, + "learning_rate": 0.001221387135980845, + "loss": 0.0781, + "step": 44410 + }, + { + "epoch": 0.3855088063471671, + "grad_norm": 0.6484375, + "learning_rate": 0.0012213576454069568, + "loss": 0.1025, + "step": 44411 + }, + { + "epoch": 0.3855174868273713, + "grad_norm": 0.1640625, + "learning_rate": 0.001221328154700354, + "loss": 0.0869, + "step": 44412 + }, + { + "epoch": 0.38552616730757544, + "grad_norm": 0.43359375, + "learning_rate": 0.001221298663861068, + "loss": 0.1016, + "step": 44413 + }, + { + "epoch": 0.38553484778777963, + "grad_norm": 1.0703125, + "learning_rate": 0.001221269172889132, + "loss": 0.0869, + "step": 44414 + }, + { + "epoch": 0.38554352826798377, + "grad_norm": 0.107421875, + "learning_rate": 0.0012212396817845776, + "loss": 0.0825, + "step": 44415 + }, + { + "epoch": 0.38555220874818796, + "grad_norm": 0.3046875, + "learning_rate": 0.0012212101905474373, + "loss": 0.0869, + "step": 44416 + }, + { + "epoch": 0.3855608892283921, + "grad_norm": 0.1328125, + "learning_rate": 0.0012211806991777434, + "loss": 0.0859, + "step": 44417 + }, + { + "epoch": 0.3855695697085963, + "grad_norm": 0.484375, + "learning_rate": 0.0012211512076755282, + "loss": 0.0791, + "step": 44418 + }, + { + "epoch": 0.38557825018880043, + "grad_norm": 0.09814453125, + "learning_rate": 0.001221121716040824, + "loss": 0.1006, + "step": 44419 + }, + { + "epoch": 0.3855869306690046, + "grad_norm": 0.4140625, + "learning_rate": 0.0012210922242736626, + "loss": 0.0962, + "step": 44420 + }, + { + "epoch": 0.38559561114920876, + "grad_norm": 0.3046875, + "learning_rate": 0.001221062732374077, + "loss": 0.1074, + "step": 44421 + }, + { + "epoch": 0.38560429162941295, + "grad_norm": 0.08740234375, + "learning_rate": 0.0012210332403420985, + "loss": 0.0908, + "step": 44422 + }, + { + "epoch": 0.3856129721096171, + "grad_norm": 0.2041015625, + "learning_rate": 0.0012210037481777602, + "loss": 0.1631, + "step": 44423 + }, + { + "epoch": 0.3856216525898213, + "grad_norm": 0.2158203125, + "learning_rate": 0.001220974255881094, + "loss": 0.1182, + "step": 44424 + }, + { + "epoch": 0.3856303330700254, + "grad_norm": 0.3046875, + "learning_rate": 0.001220944763452132, + "loss": 0.082, + 
"step": 44425 + }, + { + "epoch": 0.3856390135502296, + "grad_norm": 0.40234375, + "learning_rate": 0.0012209152708909066, + "loss": 0.126, + "step": 44426 + }, + { + "epoch": 0.38564769403043375, + "grad_norm": 0.15625, + "learning_rate": 0.0012208857781974504, + "loss": 0.0723, + "step": 44427 + }, + { + "epoch": 0.38565637451063794, + "grad_norm": 0.71484375, + "learning_rate": 0.001220856285371795, + "loss": 0.1074, + "step": 44428 + }, + { + "epoch": 0.3856650549908421, + "grad_norm": 0.10595703125, + "learning_rate": 0.0012208267924139731, + "loss": 0.0679, + "step": 44429 + }, + { + "epoch": 0.3856737354710463, + "grad_norm": 0.1298828125, + "learning_rate": 0.0012207972993240168, + "loss": 0.0928, + "step": 44430 + }, + { + "epoch": 0.3856824159512504, + "grad_norm": 0.3203125, + "learning_rate": 0.0012207678061019584, + "loss": 0.1328, + "step": 44431 + }, + { + "epoch": 0.3856910964314546, + "grad_norm": 0.2138671875, + "learning_rate": 0.0012207383127478303, + "loss": 0.1001, + "step": 44432 + }, + { + "epoch": 0.38569977691165874, + "grad_norm": 0.52734375, + "learning_rate": 0.0012207088192616644, + "loss": 0.1104, + "step": 44433 + }, + { + "epoch": 0.38570845739186294, + "grad_norm": 0.267578125, + "learning_rate": 0.0012206793256434932, + "loss": 0.0859, + "step": 44434 + }, + { + "epoch": 0.3857171378720671, + "grad_norm": 0.1455078125, + "learning_rate": 0.0012206498318933488, + "loss": 0.104, + "step": 44435 + }, + { + "epoch": 0.38572581835227127, + "grad_norm": 0.56640625, + "learning_rate": 0.0012206203380112637, + "loss": 0.1133, + "step": 44436 + }, + { + "epoch": 0.3857344988324754, + "grad_norm": 0.1484375, + "learning_rate": 0.00122059084399727, + "loss": 0.1572, + "step": 44437 + }, + { + "epoch": 0.3857431793126796, + "grad_norm": 0.6953125, + "learning_rate": 0.0012205613498513998, + "loss": 0.125, + "step": 44438 + }, + { + "epoch": 0.38575185979288373, + "grad_norm": 0.14453125, + "learning_rate": 0.0012205318555736856, + "loss": 0.0908, + "step": 44439 + }, + { + "epoch": 0.3857605402730879, + "grad_norm": 0.59765625, + "learning_rate": 0.0012205023611641598, + "loss": 0.1318, + "step": 44440 + }, + { + "epoch": 0.38576922075329206, + "grad_norm": 0.5859375, + "learning_rate": 0.0012204728666228544, + "loss": 0.1235, + "step": 44441 + }, + { + "epoch": 0.38577790123349626, + "grad_norm": 0.138671875, + "learning_rate": 0.0012204433719498013, + "loss": 0.0977, + "step": 44442 + }, + { + "epoch": 0.3857865817137004, + "grad_norm": 0.11669921875, + "learning_rate": 0.0012204138771450335, + "loss": 0.0864, + "step": 44443 + }, + { + "epoch": 0.3857952621939046, + "grad_norm": 0.158203125, + "learning_rate": 0.0012203843822085828, + "loss": 0.1201, + "step": 44444 + }, + { + "epoch": 0.3858039426741087, + "grad_norm": 0.78125, + "learning_rate": 0.0012203548871404814, + "loss": 0.0977, + "step": 44445 + }, + { + "epoch": 0.3858126231543129, + "grad_norm": 0.19140625, + "learning_rate": 0.0012203253919407617, + "loss": 0.0942, + "step": 44446 + }, + { + "epoch": 0.38582130363451705, + "grad_norm": 0.232421875, + "learning_rate": 0.001220295896609456, + "loss": 0.1045, + "step": 44447 + }, + { + "epoch": 0.38582998411472125, + "grad_norm": 0.08251953125, + "learning_rate": 0.0012202664011465969, + "loss": 0.1069, + "step": 44448 + }, + { + "epoch": 0.3858386645949254, + "grad_norm": 0.80078125, + "learning_rate": 0.001220236905552216, + "loss": 0.0938, + "step": 44449 + }, + { + "epoch": 0.3858473450751296, + "grad_norm": 0.91015625, + "learning_rate": 
0.0012202074098263459, + "loss": 0.124, + "step": 44450 + }, + { + "epoch": 0.3858560255553337, + "grad_norm": 0.34765625, + "learning_rate": 0.0012201779139690185, + "loss": 0.1172, + "step": 44451 + }, + { + "epoch": 0.3858647060355379, + "grad_norm": 0.5, + "learning_rate": 0.0012201484179802666, + "loss": 0.0747, + "step": 44452 + }, + { + "epoch": 0.38587338651574205, + "grad_norm": 0.361328125, + "learning_rate": 0.0012201189218601224, + "loss": 0.1582, + "step": 44453 + }, + { + "epoch": 0.38588206699594624, + "grad_norm": 0.1904296875, + "learning_rate": 0.0012200894256086173, + "loss": 0.0869, + "step": 44454 + }, + { + "epoch": 0.3858907474761504, + "grad_norm": 0.6953125, + "learning_rate": 0.0012200599292257848, + "loss": 0.0918, + "step": 44455 + }, + { + "epoch": 0.38589942795635457, + "grad_norm": 0.330078125, + "learning_rate": 0.001220030432711656, + "loss": 0.1006, + "step": 44456 + }, + { + "epoch": 0.3859081084365587, + "grad_norm": 0.10107421875, + "learning_rate": 0.0012200009360662644, + "loss": 0.1006, + "step": 44457 + }, + { + "epoch": 0.3859167889167629, + "grad_norm": 0.1328125, + "learning_rate": 0.0012199714392896414, + "loss": 0.1108, + "step": 44458 + }, + { + "epoch": 0.38592546939696704, + "grad_norm": 0.6953125, + "learning_rate": 0.001219941942381819, + "loss": 0.1953, + "step": 44459 + }, + { + "epoch": 0.38593414987717123, + "grad_norm": 0.2333984375, + "learning_rate": 0.0012199124453428304, + "loss": 0.1045, + "step": 44460 + }, + { + "epoch": 0.38594283035737537, + "grad_norm": 0.43359375, + "learning_rate": 0.001219882948172707, + "loss": 0.0957, + "step": 44461 + }, + { + "epoch": 0.38595151083757956, + "grad_norm": 0.11572265625, + "learning_rate": 0.0012198534508714816, + "loss": 0.1182, + "step": 44462 + }, + { + "epoch": 0.3859601913177837, + "grad_norm": 0.1357421875, + "learning_rate": 0.001219823953439186, + "loss": 0.1279, + "step": 44463 + }, + { + "epoch": 0.3859688717979879, + "grad_norm": 0.2080078125, + "learning_rate": 0.0012197944558758529, + "loss": 0.1235, + "step": 44464 + }, + { + "epoch": 0.385977552278192, + "grad_norm": 0.6484375, + "learning_rate": 0.0012197649581815145, + "loss": 0.0767, + "step": 44465 + }, + { + "epoch": 0.3859862327583962, + "grad_norm": 0.267578125, + "learning_rate": 0.0012197354603562028, + "loss": 0.1055, + "step": 44466 + }, + { + "epoch": 0.38599491323860036, + "grad_norm": 0.51171875, + "learning_rate": 0.0012197059623999505, + "loss": 0.0947, + "step": 44467 + }, + { + "epoch": 0.38600359371880455, + "grad_norm": 0.8515625, + "learning_rate": 0.0012196764643127892, + "loss": 0.1445, + "step": 44468 + }, + { + "epoch": 0.3860122741990087, + "grad_norm": 0.498046875, + "learning_rate": 0.0012196469660947515, + "loss": 0.0879, + "step": 44469 + }, + { + "epoch": 0.3860209546792129, + "grad_norm": 0.419921875, + "learning_rate": 0.00121961746774587, + "loss": 0.0977, + "step": 44470 + }, + { + "epoch": 0.386029635159417, + "grad_norm": 0.306640625, + "learning_rate": 0.0012195879692661765, + "loss": 0.1221, + "step": 44471 + }, + { + "epoch": 0.3860383156396212, + "grad_norm": 0.10986328125, + "learning_rate": 0.001219558470655703, + "loss": 0.104, + "step": 44472 + }, + { + "epoch": 0.38604699611982535, + "grad_norm": 4.875, + "learning_rate": 0.0012195289719144826, + "loss": 0.7227, + "step": 44473 + }, + { + "epoch": 0.38605567660002954, + "grad_norm": 0.158203125, + "learning_rate": 0.0012194994730425467, + "loss": 0.1016, + "step": 44474 + }, + { + "epoch": 0.3860643570802337, + "grad_norm": 
0.2294921875, + "learning_rate": 0.0012194699740399285, + "loss": 0.0859, + "step": 44475 + }, + { + "epoch": 0.38607303756043787, + "grad_norm": 0.43359375, + "learning_rate": 0.0012194404749066592, + "loss": 0.0986, + "step": 44476 + }, + { + "epoch": 0.386081718040642, + "grad_norm": 0.154296875, + "learning_rate": 0.001219410975642772, + "loss": 0.1099, + "step": 44477 + }, + { + "epoch": 0.3860903985208462, + "grad_norm": 0.421875, + "learning_rate": 0.0012193814762482987, + "loss": 0.1152, + "step": 44478 + }, + { + "epoch": 0.38609907900105034, + "grad_norm": 0.310546875, + "learning_rate": 0.0012193519767232717, + "loss": 0.1152, + "step": 44479 + }, + { + "epoch": 0.3861077594812545, + "grad_norm": 0.0888671875, + "learning_rate": 0.0012193224770677231, + "loss": 0.0996, + "step": 44480 + }, + { + "epoch": 0.38611643996145867, + "grad_norm": 0.267578125, + "learning_rate": 0.0012192929772816851, + "loss": 0.0898, + "step": 44481 + }, + { + "epoch": 0.3861251204416628, + "grad_norm": 0.29296875, + "learning_rate": 0.00121926347736519, + "loss": 0.0752, + "step": 44482 + }, + { + "epoch": 0.386133800921867, + "grad_norm": 0.7734375, + "learning_rate": 0.0012192339773182705, + "loss": 0.166, + "step": 44483 + }, + { + "epoch": 0.38614248140207114, + "grad_norm": 0.447265625, + "learning_rate": 0.0012192044771409582, + "loss": 0.1201, + "step": 44484 + }, + { + "epoch": 0.38615116188227533, + "grad_norm": 0.322265625, + "learning_rate": 0.0012191749768332857, + "loss": 0.0776, + "step": 44485 + }, + { + "epoch": 0.38615984236247947, + "grad_norm": 0.1435546875, + "learning_rate": 0.0012191454763952855, + "loss": 0.1025, + "step": 44486 + }, + { + "epoch": 0.38616852284268366, + "grad_norm": 0.53515625, + "learning_rate": 0.0012191159758269893, + "loss": 0.0918, + "step": 44487 + }, + { + "epoch": 0.3861772033228878, + "grad_norm": 0.236328125, + "learning_rate": 0.00121908647512843, + "loss": 0.084, + "step": 44488 + }, + { + "epoch": 0.386185883803092, + "grad_norm": 0.1015625, + "learning_rate": 0.001219056974299639, + "loss": 0.127, + "step": 44489 + }, + { + "epoch": 0.3861945642832961, + "grad_norm": 0.46484375, + "learning_rate": 0.0012190274733406494, + "loss": 0.0938, + "step": 44490 + }, + { + "epoch": 0.3862032447635003, + "grad_norm": 0.076171875, + "learning_rate": 0.0012189979722514932, + "loss": 0.0879, + "step": 44491 + }, + { + "epoch": 0.38621192524370446, + "grad_norm": 0.07958984375, + "learning_rate": 0.0012189684710322026, + "loss": 0.083, + "step": 44492 + }, + { + "epoch": 0.38622060572390865, + "grad_norm": 0.373046875, + "learning_rate": 0.0012189389696828099, + "loss": 0.0879, + "step": 44493 + }, + { + "epoch": 0.3862292862041128, + "grad_norm": 0.193359375, + "learning_rate": 0.001218909468203347, + "loss": 0.1191, + "step": 44494 + }, + { + "epoch": 0.386237966684317, + "grad_norm": 0.1474609375, + "learning_rate": 0.0012188799665938468, + "loss": 0.1201, + "step": 44495 + }, + { + "epoch": 0.3862466471645211, + "grad_norm": 0.46875, + "learning_rate": 0.0012188504648543413, + "loss": 0.0947, + "step": 44496 + }, + { + "epoch": 0.3862553276447253, + "grad_norm": 0.30859375, + "learning_rate": 0.0012188209629848627, + "loss": 0.0757, + "step": 44497 + }, + { + "epoch": 0.38626400812492945, + "grad_norm": 0.1787109375, + "learning_rate": 0.0012187914609854428, + "loss": 0.0737, + "step": 44498 + }, + { + "epoch": 0.38627268860513364, + "grad_norm": 0.1650390625, + "learning_rate": 0.0012187619588561148, + "loss": 0.0986, + "step": 44499 + }, + { + "epoch": 
0.3862813690853378, + "grad_norm": 0.08935546875, + "learning_rate": 0.0012187324565969104, + "loss": 0.1084, + "step": 44500 + }, + { + "epoch": 0.38629004956554197, + "grad_norm": 0.1865234375, + "learning_rate": 0.0012187029542078624, + "loss": 0.1211, + "step": 44501 + }, + { + "epoch": 0.3862987300457461, + "grad_norm": 0.466796875, + "learning_rate": 0.001218673451689002, + "loss": 0.0952, + "step": 44502 + }, + { + "epoch": 0.3863074105259503, + "grad_norm": 0.11865234375, + "learning_rate": 0.0012186439490403623, + "loss": 0.1289, + "step": 44503 + }, + { + "epoch": 0.38631609100615444, + "grad_norm": 0.2119140625, + "learning_rate": 0.0012186144462619753, + "loss": 0.104, + "step": 44504 + }, + { + "epoch": 0.38632477148635863, + "grad_norm": 0.494140625, + "learning_rate": 0.0012185849433538733, + "loss": 0.0796, + "step": 44505 + }, + { + "epoch": 0.38633345196656277, + "grad_norm": 0.10546875, + "learning_rate": 0.0012185554403160888, + "loss": 0.0762, + "step": 44506 + }, + { + "epoch": 0.38634213244676696, + "grad_norm": 0.10693359375, + "learning_rate": 0.0012185259371486536, + "loss": 0.0708, + "step": 44507 + }, + { + "epoch": 0.3863508129269711, + "grad_norm": 0.44140625, + "learning_rate": 0.0012184964338516004, + "loss": 0.0918, + "step": 44508 + }, + { + "epoch": 0.3863594934071753, + "grad_norm": 0.076171875, + "learning_rate": 0.0012184669304249613, + "loss": 0.1025, + "step": 44509 + }, + { + "epoch": 0.38636817388737943, + "grad_norm": 0.1748046875, + "learning_rate": 0.0012184374268687686, + "loss": 0.0732, + "step": 44510 + }, + { + "epoch": 0.3863768543675836, + "grad_norm": 0.11328125, + "learning_rate": 0.0012184079231830542, + "loss": 0.0977, + "step": 44511 + }, + { + "epoch": 0.38638553484778776, + "grad_norm": 0.349609375, + "learning_rate": 0.0012183784193678512, + "loss": 0.127, + "step": 44512 + }, + { + "epoch": 0.38639421532799195, + "grad_norm": 0.96875, + "learning_rate": 0.0012183489154231908, + "loss": 0.1074, + "step": 44513 + }, + { + "epoch": 0.3864028958081961, + "grad_norm": 0.162109375, + "learning_rate": 0.0012183194113491059, + "loss": 0.0713, + "step": 44514 + }, + { + "epoch": 0.3864115762884003, + "grad_norm": 0.46875, + "learning_rate": 0.0012182899071456288, + "loss": 0.0879, + "step": 44515 + }, + { + "epoch": 0.3864202567686044, + "grad_norm": 0.2197265625, + "learning_rate": 0.0012182604028127917, + "loss": 0.1128, + "step": 44516 + }, + { + "epoch": 0.3864289372488086, + "grad_norm": 0.146484375, + "learning_rate": 0.0012182308983506268, + "loss": 0.1069, + "step": 44517 + }, + { + "epoch": 0.38643761772901275, + "grad_norm": 0.2890625, + "learning_rate": 0.0012182013937591663, + "loss": 0.1035, + "step": 44518 + }, + { + "epoch": 0.38644629820921694, + "grad_norm": 0.4375, + "learning_rate": 0.0012181718890384426, + "loss": 0.0913, + "step": 44519 + }, + { + "epoch": 0.3864549786894211, + "grad_norm": 0.94921875, + "learning_rate": 0.0012181423841884876, + "loss": 0.0898, + "step": 44520 + }, + { + "epoch": 0.3864636591696253, + "grad_norm": 0.30078125, + "learning_rate": 0.0012181128792093342, + "loss": 0.0967, + "step": 44521 + }, + { + "epoch": 0.3864723396498294, + "grad_norm": 0.404296875, + "learning_rate": 0.001218083374101014, + "loss": 0.1055, + "step": 44522 + }, + { + "epoch": 0.3864810201300336, + "grad_norm": 0.1953125, + "learning_rate": 0.0012180538688635599, + "loss": 0.0952, + "step": 44523 + }, + { + "epoch": 0.38648970061023774, + "grad_norm": 0.16796875, + "learning_rate": 0.0012180243634970036, + "loss": 
0.1914, + "step": 44524 + }, + { + "epoch": 0.38649838109044193, + "grad_norm": 0.0869140625, + "learning_rate": 0.001217994858001378, + "loss": 0.1099, + "step": 44525 + }, + { + "epoch": 0.38650706157064607, + "grad_norm": 0.1982421875, + "learning_rate": 0.001217965352376715, + "loss": 0.0688, + "step": 44526 + }, + { + "epoch": 0.38651574205085026, + "grad_norm": 0.408203125, + "learning_rate": 0.0012179358466230465, + "loss": 0.1689, + "step": 44527 + }, + { + "epoch": 0.3865244225310544, + "grad_norm": 0.298828125, + "learning_rate": 0.0012179063407404052, + "loss": 0.1279, + "step": 44528 + }, + { + "epoch": 0.3865331030112586, + "grad_norm": 0.2353515625, + "learning_rate": 0.0012178768347288233, + "loss": 0.0732, + "step": 44529 + }, + { + "epoch": 0.38654178349146273, + "grad_norm": 0.166015625, + "learning_rate": 0.0012178473285883332, + "loss": 0.1133, + "step": 44530 + }, + { + "epoch": 0.3865504639716669, + "grad_norm": 0.26171875, + "learning_rate": 0.001217817822318967, + "loss": 0.1309, + "step": 44531 + }, + { + "epoch": 0.38655914445187106, + "grad_norm": 0.34375, + "learning_rate": 0.001217788315920757, + "loss": 0.1035, + "step": 44532 + }, + { + "epoch": 0.38656782493207525, + "grad_norm": 0.1865234375, + "learning_rate": 0.0012177588093937352, + "loss": 0.1152, + "step": 44533 + }, + { + "epoch": 0.3865765054122794, + "grad_norm": 0.1455078125, + "learning_rate": 0.0012177293027379346, + "loss": 0.083, + "step": 44534 + }, + { + "epoch": 0.3865851858924836, + "grad_norm": 0.609375, + "learning_rate": 0.0012176997959533868, + "loss": 0.1416, + "step": 44535 + }, + { + "epoch": 0.3865938663726877, + "grad_norm": 0.193359375, + "learning_rate": 0.0012176702890401244, + "loss": 0.1064, + "step": 44536 + }, + { + "epoch": 0.3866025468528919, + "grad_norm": 0.41015625, + "learning_rate": 0.0012176407819981792, + "loss": 0.0728, + "step": 44537 + }, + { + "epoch": 0.38661122733309605, + "grad_norm": 0.625, + "learning_rate": 0.001217611274827584, + "loss": 0.1245, + "step": 44538 + }, + { + "epoch": 0.38661990781330025, + "grad_norm": 0.392578125, + "learning_rate": 0.0012175817675283709, + "loss": 0.0928, + "step": 44539 + }, + { + "epoch": 0.3866285882935044, + "grad_norm": 0.2734375, + "learning_rate": 0.0012175522601005722, + "loss": 0.1631, + "step": 44540 + }, + { + "epoch": 0.3866372687737086, + "grad_norm": 0.10498046875, + "learning_rate": 0.00121752275254422, + "loss": 0.0967, + "step": 44541 + }, + { + "epoch": 0.3866459492539127, + "grad_norm": 0.40234375, + "learning_rate": 0.0012174932448593466, + "loss": 0.0874, + "step": 44542 + }, + { + "epoch": 0.3866546297341169, + "grad_norm": 0.427734375, + "learning_rate": 0.0012174637370459846, + "loss": 0.0923, + "step": 44543 + }, + { + "epoch": 0.38666331021432104, + "grad_norm": 1.109375, + "learning_rate": 0.001217434229104166, + "loss": 0.0947, + "step": 44544 + }, + { + "epoch": 0.38667199069452524, + "grad_norm": 0.09716796875, + "learning_rate": 0.0012174047210339227, + "loss": 0.1582, + "step": 44545 + }, + { + "epoch": 0.3866806711747294, + "grad_norm": 0.1240234375, + "learning_rate": 0.0012173752128352877, + "loss": 0.1045, + "step": 44546 + }, + { + "epoch": 0.38668935165493357, + "grad_norm": 0.421875, + "learning_rate": 0.0012173457045082932, + "loss": 0.0957, + "step": 44547 + }, + { + "epoch": 0.3866980321351377, + "grad_norm": 0.08154296875, + "learning_rate": 0.001217316196052971, + "loss": 0.0996, + "step": 44548 + }, + { + "epoch": 0.3867067126153419, + "grad_norm": 0.6796875, + "learning_rate": 
0.0012172866874693533, + "loss": 0.0771, + "step": 44549 + }, + { + "epoch": 0.38671539309554603, + "grad_norm": 0.96875, + "learning_rate": 0.0012172571787574726, + "loss": 0.1055, + "step": 44550 + }, + { + "epoch": 0.3867240735757502, + "grad_norm": 0.06298828125, + "learning_rate": 0.0012172276699173614, + "loss": 0.0845, + "step": 44551 + }, + { + "epoch": 0.38673275405595436, + "grad_norm": 0.1484375, + "learning_rate": 0.0012171981609490517, + "loss": 0.0874, + "step": 44552 + }, + { + "epoch": 0.38674143453615856, + "grad_norm": 0.08984375, + "learning_rate": 0.0012171686518525763, + "loss": 0.1133, + "step": 44553 + }, + { + "epoch": 0.3867501150163627, + "grad_norm": 0.3203125, + "learning_rate": 0.0012171391426279664, + "loss": 0.1289, + "step": 44554 + }, + { + "epoch": 0.3867587954965669, + "grad_norm": 0.41015625, + "learning_rate": 0.0012171096332752552, + "loss": 0.1387, + "step": 44555 + }, + { + "epoch": 0.386767475976771, + "grad_norm": 0.10986328125, + "learning_rate": 0.0012170801237944746, + "loss": 0.0869, + "step": 44556 + }, + { + "epoch": 0.3867761564569752, + "grad_norm": 0.392578125, + "learning_rate": 0.001217050614185657, + "loss": 0.123, + "step": 44557 + }, + { + "epoch": 0.38678483693717935, + "grad_norm": 0.2109375, + "learning_rate": 0.0012170211044488343, + "loss": 0.0767, + "step": 44558 + }, + { + "epoch": 0.38679351741738355, + "grad_norm": 0.330078125, + "learning_rate": 0.0012169915945840397, + "loss": 0.1074, + "step": 44559 + }, + { + "epoch": 0.3868021978975877, + "grad_norm": 0.267578125, + "learning_rate": 0.0012169620845913045, + "loss": 0.1279, + "step": 44560 + }, + { + "epoch": 0.3868108783777919, + "grad_norm": 0.1708984375, + "learning_rate": 0.0012169325744706611, + "loss": 0.1084, + "step": 44561 + }, + { + "epoch": 0.386819558857996, + "grad_norm": 0.27734375, + "learning_rate": 0.0012169030642221424, + "loss": 0.1289, + "step": 44562 + }, + { + "epoch": 0.3868282393382002, + "grad_norm": 0.21484375, + "learning_rate": 0.00121687355384578, + "loss": 0.0957, + "step": 44563 + }, + { + "epoch": 0.38683691981840435, + "grad_norm": 0.337890625, + "learning_rate": 0.0012168440433416065, + "loss": 0.1123, + "step": 44564 + }, + { + "epoch": 0.38684560029860854, + "grad_norm": 0.130859375, + "learning_rate": 0.001216814532709654, + "loss": 0.0879, + "step": 44565 + }, + { + "epoch": 0.3868542807788127, + "grad_norm": 0.0869140625, + "learning_rate": 0.0012167850219499553, + "loss": 0.1064, + "step": 44566 + }, + { + "epoch": 0.38686296125901687, + "grad_norm": 0.12109375, + "learning_rate": 0.0012167555110625416, + "loss": 0.1182, + "step": 44567 + }, + { + "epoch": 0.386871641739221, + "grad_norm": 0.427734375, + "learning_rate": 0.0012167260000474462, + "loss": 0.0938, + "step": 44568 + }, + { + "epoch": 0.3868803222194252, + "grad_norm": 0.1298828125, + "learning_rate": 0.0012166964889047012, + "loss": 0.0757, + "step": 44569 + }, + { + "epoch": 0.38688900269962934, + "grad_norm": 0.232421875, + "learning_rate": 0.0012166669776343384, + "loss": 0.1182, + "step": 44570 + }, + { + "epoch": 0.38689768317983353, + "grad_norm": 0.427734375, + "learning_rate": 0.0012166374662363902, + "loss": 0.0996, + "step": 44571 + }, + { + "epoch": 0.38690636366003767, + "grad_norm": 0.32421875, + "learning_rate": 0.0012166079547108893, + "loss": 0.124, + "step": 44572 + }, + { + "epoch": 0.38691504414024186, + "grad_norm": 0.2421875, + "learning_rate": 0.0012165784430578674, + "loss": 0.0913, + "step": 44573 + }, + { + "epoch": 0.386923724620446, + 
"grad_norm": 0.384765625, + "learning_rate": 0.0012165489312773578, + "loss": 0.1396, + "step": 44574 + }, + { + "epoch": 0.3869324051006502, + "grad_norm": 0.53125, + "learning_rate": 0.0012165194193693913, + "loss": 0.1201, + "step": 44575 + }, + { + "epoch": 0.3869410855808543, + "grad_norm": 0.435546875, + "learning_rate": 0.001216489907334001, + "loss": 0.2461, + "step": 44576 + }, + { + "epoch": 0.3869497660610585, + "grad_norm": 0.51953125, + "learning_rate": 0.0012164603951712195, + "loss": 0.1465, + "step": 44577 + }, + { + "epoch": 0.38695844654126266, + "grad_norm": 0.28125, + "learning_rate": 0.0012164308828810785, + "loss": 0.1216, + "step": 44578 + }, + { + "epoch": 0.38696712702146685, + "grad_norm": 0.1767578125, + "learning_rate": 0.0012164013704636099, + "loss": 0.0718, + "step": 44579 + }, + { + "epoch": 0.386975807501671, + "grad_norm": 0.55859375, + "learning_rate": 0.0012163718579188472, + "loss": 0.0811, + "step": 44580 + }, + { + "epoch": 0.3869844879818752, + "grad_norm": 0.2490234375, + "learning_rate": 0.0012163423452468216, + "loss": 0.1089, + "step": 44581 + }, + { + "epoch": 0.3869931684620793, + "grad_norm": 0.10791015625, + "learning_rate": 0.001216312832447566, + "loss": 0.0737, + "step": 44582 + }, + { + "epoch": 0.3870018489422835, + "grad_norm": 0.337890625, + "learning_rate": 0.0012162833195211123, + "loss": 0.0742, + "step": 44583 + }, + { + "epoch": 0.38701052942248765, + "grad_norm": 0.06787109375, + "learning_rate": 0.0012162538064674927, + "loss": 0.062, + "step": 44584 + }, + { + "epoch": 0.38701920990269184, + "grad_norm": 0.67578125, + "learning_rate": 0.00121622429328674, + "loss": 0.1582, + "step": 44585 + }, + { + "epoch": 0.387027890382896, + "grad_norm": 0.349609375, + "learning_rate": 0.0012161947799788863, + "loss": 0.1348, + "step": 44586 + }, + { + "epoch": 0.38703657086310017, + "grad_norm": 0.09423828125, + "learning_rate": 0.0012161652665439633, + "loss": 0.084, + "step": 44587 + }, + { + "epoch": 0.3870452513433043, + "grad_norm": 0.208984375, + "learning_rate": 0.0012161357529820041, + "loss": 0.1133, + "step": 44588 + }, + { + "epoch": 0.3870539318235085, + "grad_norm": 0.6953125, + "learning_rate": 0.0012161062392930402, + "loss": 0.1426, + "step": 44589 + }, + { + "epoch": 0.38706261230371264, + "grad_norm": 0.1396484375, + "learning_rate": 0.0012160767254771044, + "loss": 0.0991, + "step": 44590 + }, + { + "epoch": 0.38707129278391683, + "grad_norm": 0.2451171875, + "learning_rate": 0.0012160472115342288, + "loss": 0.1084, + "step": 44591 + }, + { + "epoch": 0.38707997326412097, + "grad_norm": 0.19921875, + "learning_rate": 0.0012160176974644458, + "loss": 0.123, + "step": 44592 + }, + { + "epoch": 0.38708865374432516, + "grad_norm": 0.1083984375, + "learning_rate": 0.0012159881832677878, + "loss": 0.0957, + "step": 44593 + }, + { + "epoch": 0.3870973342245293, + "grad_norm": 0.10302734375, + "learning_rate": 0.0012159586689442865, + "loss": 0.0654, + "step": 44594 + }, + { + "epoch": 0.3871060147047335, + "grad_norm": 0.86328125, + "learning_rate": 0.0012159291544939748, + "loss": 0.1133, + "step": 44595 + }, + { + "epoch": 0.38711469518493763, + "grad_norm": 0.5078125, + "learning_rate": 0.0012158996399168848, + "loss": 0.1572, + "step": 44596 + }, + { + "epoch": 0.3871233756651418, + "grad_norm": 0.2890625, + "learning_rate": 0.0012158701252130482, + "loss": 0.1123, + "step": 44597 + }, + { + "epoch": 0.38713205614534596, + "grad_norm": 0.482421875, + "learning_rate": 0.0012158406103824982, + "loss": 0.1289, + "step": 44598 
+ }, + { + "epoch": 0.38714073662555015, + "grad_norm": 0.1025390625, + "learning_rate": 0.0012158110954252664, + "loss": 0.1104, + "step": 44599 + }, + { + "epoch": 0.3871494171057543, + "grad_norm": 0.1884765625, + "learning_rate": 0.0012157815803413854, + "loss": 0.1113, + "step": 44600 + }, + { + "epoch": 0.3871580975859585, + "grad_norm": 0.3671875, + "learning_rate": 0.0012157520651308875, + "loss": 0.1338, + "step": 44601 + }, + { + "epoch": 0.3871667780661626, + "grad_norm": 0.1533203125, + "learning_rate": 0.001215722549793805, + "loss": 0.1045, + "step": 44602 + }, + { + "epoch": 0.38717545854636676, + "grad_norm": 0.45703125, + "learning_rate": 0.0012156930343301697, + "loss": 0.084, + "step": 44603 + }, + { + "epoch": 0.38718413902657095, + "grad_norm": 0.330078125, + "learning_rate": 0.0012156635187400146, + "loss": 0.1035, + "step": 44604 + }, + { + "epoch": 0.3871928195067751, + "grad_norm": 0.41015625, + "learning_rate": 0.0012156340030233714, + "loss": 0.0903, + "step": 44605 + }, + { + "epoch": 0.3872014999869793, + "grad_norm": 0.09228515625, + "learning_rate": 0.0012156044871802728, + "loss": 0.0981, + "step": 44606 + }, + { + "epoch": 0.3872101804671834, + "grad_norm": 0.3125, + "learning_rate": 0.0012155749712107506, + "loss": 0.0771, + "step": 44607 + }, + { + "epoch": 0.3872188609473876, + "grad_norm": 0.44921875, + "learning_rate": 0.0012155454551148375, + "loss": 0.0996, + "step": 44608 + }, + { + "epoch": 0.38722754142759175, + "grad_norm": 0.29296875, + "learning_rate": 0.0012155159388925654, + "loss": 0.0811, + "step": 44609 + }, + { + "epoch": 0.38723622190779594, + "grad_norm": 0.439453125, + "learning_rate": 0.001215486422543967, + "loss": 0.1172, + "step": 44610 + }, + { + "epoch": 0.3872449023880001, + "grad_norm": 1.03125, + "learning_rate": 0.0012154569060690743, + "loss": 0.082, + "step": 44611 + }, + { + "epoch": 0.38725358286820427, + "grad_norm": 0.7265625, + "learning_rate": 0.00121542738946792, + "loss": 0.126, + "step": 44612 + }, + { + "epoch": 0.3872622633484084, + "grad_norm": 0.2099609375, + "learning_rate": 0.0012153978727405357, + "loss": 0.085, + "step": 44613 + }, + { + "epoch": 0.3872709438286126, + "grad_norm": 0.2001953125, + "learning_rate": 0.001215368355886954, + "loss": 0.0947, + "step": 44614 + }, + { + "epoch": 0.38727962430881674, + "grad_norm": 0.11767578125, + "learning_rate": 0.0012153388389072072, + "loss": 0.1162, + "step": 44615 + }, + { + "epoch": 0.38728830478902093, + "grad_norm": 0.150390625, + "learning_rate": 0.001215309321801328, + "loss": 0.1157, + "step": 44616 + }, + { + "epoch": 0.38729698526922507, + "grad_norm": 0.45703125, + "learning_rate": 0.0012152798045693479, + "loss": 0.1201, + "step": 44617 + }, + { + "epoch": 0.38730566574942926, + "grad_norm": 0.4921875, + "learning_rate": 0.0012152502872112994, + "loss": 0.0859, + "step": 44618 + }, + { + "epoch": 0.3873143462296334, + "grad_norm": 0.1484375, + "learning_rate": 0.001215220769727215, + "loss": 0.1016, + "step": 44619 + }, + { + "epoch": 0.3873230267098376, + "grad_norm": 0.228515625, + "learning_rate": 0.001215191252117127, + "loss": 0.0732, + "step": 44620 + }, + { + "epoch": 0.38733170719004173, + "grad_norm": 0.09716796875, + "learning_rate": 0.0012151617343810677, + "loss": 0.1475, + "step": 44621 + }, + { + "epoch": 0.3873403876702459, + "grad_norm": 0.1337890625, + "learning_rate": 0.0012151322165190692, + "loss": 0.082, + "step": 44622 + }, + { + "epoch": 0.38734906815045006, + "grad_norm": 0.35546875, + "learning_rate": 0.0012151026985311635, 
+ "loss": 0.1455, + "step": 44623 + }, + { + "epoch": 0.38735774863065425, + "grad_norm": 0.376953125, + "learning_rate": 0.0012150731804173836, + "loss": 0.1592, + "step": 44624 + }, + { + "epoch": 0.3873664291108584, + "grad_norm": 0.15625, + "learning_rate": 0.0012150436621777615, + "loss": 0.123, + "step": 44625 + }, + { + "epoch": 0.3873751095910626, + "grad_norm": 0.134765625, + "learning_rate": 0.0012150141438123292, + "loss": 0.0771, + "step": 44626 + }, + { + "epoch": 0.3873837900712667, + "grad_norm": 0.3984375, + "learning_rate": 0.0012149846253211188, + "loss": 0.1201, + "step": 44627 + }, + { + "epoch": 0.3873924705514709, + "grad_norm": 0.55078125, + "learning_rate": 0.0012149551067041633, + "loss": 0.0967, + "step": 44628 + }, + { + "epoch": 0.38740115103167505, + "grad_norm": 0.1748046875, + "learning_rate": 0.0012149255879614948, + "loss": 0.0889, + "step": 44629 + }, + { + "epoch": 0.38740983151187924, + "grad_norm": 0.091796875, + "learning_rate": 0.001214896069093145, + "loss": 0.1318, + "step": 44630 + }, + { + "epoch": 0.3874185119920834, + "grad_norm": 0.455078125, + "learning_rate": 0.0012148665500991468, + "loss": 0.1064, + "step": 44631 + }, + { + "epoch": 0.3874271924722876, + "grad_norm": 0.4140625, + "learning_rate": 0.001214837030979532, + "loss": 0.103, + "step": 44632 + }, + { + "epoch": 0.3874358729524917, + "grad_norm": 0.1865234375, + "learning_rate": 0.0012148075117343336, + "loss": 0.0718, + "step": 44633 + }, + { + "epoch": 0.3874445534326959, + "grad_norm": 0.1396484375, + "learning_rate": 0.0012147779923635832, + "loss": 0.0898, + "step": 44634 + }, + { + "epoch": 0.38745323391290004, + "grad_norm": 0.1357421875, + "learning_rate": 0.0012147484728673135, + "loss": 0.1074, + "step": 44635 + }, + { + "epoch": 0.38746191439310423, + "grad_norm": 0.19921875, + "learning_rate": 0.0012147189532455563, + "loss": 0.0977, + "step": 44636 + }, + { + "epoch": 0.38747059487330837, + "grad_norm": 0.67578125, + "learning_rate": 0.0012146894334983446, + "loss": 0.124, + "step": 44637 + }, + { + "epoch": 0.38747927535351256, + "grad_norm": 0.271484375, + "learning_rate": 0.0012146599136257095, + "loss": 0.1045, + "step": 44638 + }, + { + "epoch": 0.3874879558337167, + "grad_norm": 0.248046875, + "learning_rate": 0.0012146303936276847, + "loss": 0.0679, + "step": 44639 + }, + { + "epoch": 0.3874966363139209, + "grad_norm": 1.9296875, + "learning_rate": 0.0012146008735043016, + "loss": 0.4277, + "step": 44640 + }, + { + "epoch": 0.38750531679412503, + "grad_norm": 0.07958984375, + "learning_rate": 0.0012145713532555928, + "loss": 0.085, + "step": 44641 + }, + { + "epoch": 0.3875139972743292, + "grad_norm": 0.359375, + "learning_rate": 0.0012145418328815903, + "loss": 0.1084, + "step": 44642 + }, + { + "epoch": 0.38752267775453336, + "grad_norm": 0.283203125, + "learning_rate": 0.0012145123123823267, + "loss": 0.1729, + "step": 44643 + }, + { + "epoch": 0.38753135823473756, + "grad_norm": 0.126953125, + "learning_rate": 0.0012144827917578342, + "loss": 0.1211, + "step": 44644 + }, + { + "epoch": 0.3875400387149417, + "grad_norm": 0.19140625, + "learning_rate": 0.0012144532710081448, + "loss": 0.1006, + "step": 44645 + }, + { + "epoch": 0.3875487191951459, + "grad_norm": 0.2021484375, + "learning_rate": 0.0012144237501332913, + "loss": 0.0918, + "step": 44646 + }, + { + "epoch": 0.38755739967535, + "grad_norm": 0.8828125, + "learning_rate": 0.0012143942291333056, + "loss": 0.0879, + "step": 44647 + }, + { + "epoch": 0.3875660801555542, + "grad_norm": 0.296875, + 
"learning_rate": 0.00121436470800822, + "loss": 0.0781, + "step": 44648 + }, + { + "epoch": 0.38757476063575835, + "grad_norm": 0.1337890625, + "learning_rate": 0.0012143351867580669, + "loss": 0.0752, + "step": 44649 + }, + { + "epoch": 0.38758344111596255, + "grad_norm": 0.38671875, + "learning_rate": 0.0012143056653828787, + "loss": 0.0933, + "step": 44650 + }, + { + "epoch": 0.3875921215961667, + "grad_norm": 0.64453125, + "learning_rate": 0.0012142761438826877, + "loss": 0.0879, + "step": 44651 + }, + { + "epoch": 0.3876008020763709, + "grad_norm": 0.2275390625, + "learning_rate": 0.0012142466222575257, + "loss": 0.0967, + "step": 44652 + }, + { + "epoch": 0.387609482556575, + "grad_norm": 0.2080078125, + "learning_rate": 0.0012142171005074252, + "loss": 0.1484, + "step": 44653 + }, + { + "epoch": 0.3876181630367792, + "grad_norm": 0.0888671875, + "learning_rate": 0.001214187578632419, + "loss": 0.0923, + "step": 44654 + }, + { + "epoch": 0.38762684351698334, + "grad_norm": 0.11669921875, + "learning_rate": 0.001214158056632539, + "loss": 0.1084, + "step": 44655 + }, + { + "epoch": 0.38763552399718754, + "grad_norm": 0.55859375, + "learning_rate": 0.001214128534507817, + "loss": 0.1055, + "step": 44656 + }, + { + "epoch": 0.3876442044773917, + "grad_norm": 0.1220703125, + "learning_rate": 0.001214099012258286, + "loss": 0.0869, + "step": 44657 + }, + { + "epoch": 0.38765288495759587, + "grad_norm": 0.380859375, + "learning_rate": 0.001214069489883978, + "loss": 0.0806, + "step": 44658 + }, + { + "epoch": 0.3876615654378, + "grad_norm": 0.29296875, + "learning_rate": 0.0012140399673849253, + "loss": 0.0752, + "step": 44659 + }, + { + "epoch": 0.3876702459180042, + "grad_norm": 0.3046875, + "learning_rate": 0.0012140104447611604, + "loss": 0.0947, + "step": 44660 + }, + { + "epoch": 0.38767892639820833, + "grad_norm": 0.1767578125, + "learning_rate": 0.0012139809220127154, + "loss": 0.0669, + "step": 44661 + }, + { + "epoch": 0.3876876068784125, + "grad_norm": 0.2353515625, + "learning_rate": 0.0012139513991396222, + "loss": 0.0732, + "step": 44662 + }, + { + "epoch": 0.38769628735861666, + "grad_norm": 0.279296875, + "learning_rate": 0.0012139218761419138, + "loss": 0.1357, + "step": 44663 + }, + { + "epoch": 0.38770496783882086, + "grad_norm": 0.24609375, + "learning_rate": 0.001213892353019622, + "loss": 0.1113, + "step": 44664 + }, + { + "epoch": 0.387713648319025, + "grad_norm": 0.138671875, + "learning_rate": 0.0012138628297727794, + "loss": 0.0884, + "step": 44665 + }, + { + "epoch": 0.3877223287992292, + "grad_norm": 1.3046875, + "learning_rate": 0.0012138333064014178, + "loss": 0.1465, + "step": 44666 + }, + { + "epoch": 0.3877310092794333, + "grad_norm": 0.52734375, + "learning_rate": 0.0012138037829055699, + "loss": 0.1582, + "step": 44667 + }, + { + "epoch": 0.3877396897596375, + "grad_norm": 0.1259765625, + "learning_rate": 0.001213774259285268, + "loss": 0.1279, + "step": 44668 + }, + { + "epoch": 0.38774837023984166, + "grad_norm": 0.091796875, + "learning_rate": 0.0012137447355405444, + "loss": 0.1138, + "step": 44669 + }, + { + "epoch": 0.38775705072004585, + "grad_norm": 0.46875, + "learning_rate": 0.0012137152116714312, + "loss": 0.1089, + "step": 44670 + }, + { + "epoch": 0.38776573120025, + "grad_norm": 0.3671875, + "learning_rate": 0.0012136856876779605, + "loss": 0.0991, + "step": 44671 + }, + { + "epoch": 0.3877744116804542, + "grad_norm": 0.09521484375, + "learning_rate": 0.0012136561635601652, + "loss": 0.1279, + "step": 44672 + }, + { + "epoch": 
0.3877830921606583, + "grad_norm": 0.220703125, + "learning_rate": 0.001213626639318077, + "loss": 0.127, + "step": 44673 + }, + { + "epoch": 0.3877917726408625, + "grad_norm": 0.201171875, + "learning_rate": 0.0012135971149517285, + "loss": 0.1191, + "step": 44674 + }, + { + "epoch": 0.38780045312106665, + "grad_norm": 0.1123046875, + "learning_rate": 0.001213567590461152, + "loss": 0.1318, + "step": 44675 + }, + { + "epoch": 0.38780913360127084, + "grad_norm": 0.150390625, + "learning_rate": 0.0012135380658463792, + "loss": 0.0854, + "step": 44676 + }, + { + "epoch": 0.387817814081475, + "grad_norm": 0.2041015625, + "learning_rate": 0.0012135085411074437, + "loss": 0.1025, + "step": 44677 + }, + { + "epoch": 0.38782649456167917, + "grad_norm": 0.51953125, + "learning_rate": 0.0012134790162443762, + "loss": 0.1006, + "step": 44678 + }, + { + "epoch": 0.3878351750418833, + "grad_norm": 0.349609375, + "learning_rate": 0.00121344949125721, + "loss": 0.1055, + "step": 44679 + }, + { + "epoch": 0.3878438555220875, + "grad_norm": 0.6484375, + "learning_rate": 0.0012134199661459773, + "loss": 0.1348, + "step": 44680 + }, + { + "epoch": 0.38785253600229164, + "grad_norm": 0.498046875, + "learning_rate": 0.0012133904409107102, + "loss": 0.1201, + "step": 44681 + }, + { + "epoch": 0.38786121648249583, + "grad_norm": 0.28125, + "learning_rate": 0.001213360915551441, + "loss": 0.1289, + "step": 44682 + }, + { + "epoch": 0.38786989696269997, + "grad_norm": 0.197265625, + "learning_rate": 0.001213331390068202, + "loss": 0.0889, + "step": 44683 + }, + { + "epoch": 0.38787857744290416, + "grad_norm": 0.17578125, + "learning_rate": 0.0012133018644610253, + "loss": 0.0933, + "step": 44684 + }, + { + "epoch": 0.3878872579231083, + "grad_norm": 0.1181640625, + "learning_rate": 0.0012132723387299437, + "loss": 0.0952, + "step": 44685 + }, + { + "epoch": 0.3878959384033125, + "grad_norm": 0.314453125, + "learning_rate": 0.001213242812874989, + "loss": 0.1172, + "step": 44686 + }, + { + "epoch": 0.3879046188835166, + "grad_norm": 0.12109375, + "learning_rate": 0.0012132132868961935, + "loss": 0.0713, + "step": 44687 + }, + { + "epoch": 0.3879132993637208, + "grad_norm": 0.27734375, + "learning_rate": 0.0012131837607935896, + "loss": 0.1245, + "step": 44688 + }, + { + "epoch": 0.38792197984392496, + "grad_norm": 1.2734375, + "learning_rate": 0.00121315423456721, + "loss": 0.1064, + "step": 44689 + }, + { + "epoch": 0.38793066032412915, + "grad_norm": 0.498046875, + "learning_rate": 0.0012131247082170864, + "loss": 0.0996, + "step": 44690 + }, + { + "epoch": 0.3879393408043333, + "grad_norm": 0.44140625, + "learning_rate": 0.0012130951817432517, + "loss": 0.106, + "step": 44691 + }, + { + "epoch": 0.3879480212845375, + "grad_norm": 0.55078125, + "learning_rate": 0.0012130656551457373, + "loss": 0.1562, + "step": 44692 + }, + { + "epoch": 0.3879567017647416, + "grad_norm": 0.287109375, + "learning_rate": 0.0012130361284245763, + "loss": 0.1211, + "step": 44693 + }, + { + "epoch": 0.3879653822449458, + "grad_norm": 0.2275390625, + "learning_rate": 0.0012130066015798007, + "loss": 0.1016, + "step": 44694 + }, + { + "epoch": 0.38797406272514995, + "grad_norm": 0.251953125, + "learning_rate": 0.0012129770746114425, + "loss": 0.084, + "step": 44695 + }, + { + "epoch": 0.38798274320535414, + "grad_norm": 0.50390625, + "learning_rate": 0.0012129475475195343, + "loss": 0.1123, + "step": 44696 + }, + { + "epoch": 0.3879914236855583, + "grad_norm": 0.08251953125, + "learning_rate": 0.0012129180203041086, + "loss": 0.0625, + 
"step": 44697 + }, + { + "epoch": 0.38800010416576247, + "grad_norm": 1.4375, + "learning_rate": 0.0012128884929651974, + "loss": 0.0957, + "step": 44698 + }, + { + "epoch": 0.3880087846459666, + "grad_norm": 0.328125, + "learning_rate": 0.001212858965502833, + "loss": 0.1152, + "step": 44699 + }, + { + "epoch": 0.3880174651261708, + "grad_norm": 0.625, + "learning_rate": 0.0012128294379170478, + "loss": 0.1162, + "step": 44700 + }, + { + "epoch": 0.38802614560637494, + "grad_norm": 0.1015625, + "learning_rate": 0.001212799910207874, + "loss": 0.1011, + "step": 44701 + }, + { + "epoch": 0.38803482608657913, + "grad_norm": 0.26953125, + "learning_rate": 0.0012127703823753438, + "loss": 0.1069, + "step": 44702 + }, + { + "epoch": 0.38804350656678327, + "grad_norm": 0.130859375, + "learning_rate": 0.0012127408544194897, + "loss": 0.1094, + "step": 44703 + }, + { + "epoch": 0.38805218704698746, + "grad_norm": 0.25, + "learning_rate": 0.0012127113263403439, + "loss": 0.0879, + "step": 44704 + }, + { + "epoch": 0.3880608675271916, + "grad_norm": 0.62890625, + "learning_rate": 0.0012126817981379385, + "loss": 0.0991, + "step": 44705 + }, + { + "epoch": 0.3880695480073958, + "grad_norm": 0.169921875, + "learning_rate": 0.0012126522698123062, + "loss": 0.1113, + "step": 44706 + }, + { + "epoch": 0.38807822848759993, + "grad_norm": 0.1298828125, + "learning_rate": 0.0012126227413634786, + "loss": 0.1055, + "step": 44707 + }, + { + "epoch": 0.3880869089678041, + "grad_norm": 0.3203125, + "learning_rate": 0.0012125932127914892, + "loss": 0.0796, + "step": 44708 + }, + { + "epoch": 0.38809558944800826, + "grad_norm": 0.09814453125, + "learning_rate": 0.0012125636840963692, + "loss": 0.0869, + "step": 44709 + }, + { + "epoch": 0.38810426992821245, + "grad_norm": 0.09228515625, + "learning_rate": 0.0012125341552781512, + "loss": 0.0728, + "step": 44710 + }, + { + "epoch": 0.3881129504084166, + "grad_norm": 0.259765625, + "learning_rate": 0.0012125046263368675, + "loss": 0.1387, + "step": 44711 + }, + { + "epoch": 0.3881216308886208, + "grad_norm": 0.546875, + "learning_rate": 0.0012124750972725507, + "loss": 0.0967, + "step": 44712 + }, + { + "epoch": 0.3881303113688249, + "grad_norm": 0.1875, + "learning_rate": 0.0012124455680852326, + "loss": 0.1357, + "step": 44713 + }, + { + "epoch": 0.3881389918490291, + "grad_norm": 0.24609375, + "learning_rate": 0.0012124160387749457, + "loss": 0.0825, + "step": 44714 + }, + { + "epoch": 0.38814767232923325, + "grad_norm": 0.373046875, + "learning_rate": 0.0012123865093417224, + "loss": 0.0708, + "step": 44715 + }, + { + "epoch": 0.38815635280943744, + "grad_norm": 0.1669921875, + "learning_rate": 0.0012123569797855948, + "loss": 0.0728, + "step": 44716 + }, + { + "epoch": 0.3881650332896416, + "grad_norm": 0.1171875, + "learning_rate": 0.0012123274501065955, + "loss": 0.1006, + "step": 44717 + }, + { + "epoch": 0.3881737137698458, + "grad_norm": 0.15234375, + "learning_rate": 0.0012122979203047564, + "loss": 0.1021, + "step": 44718 + }, + { + "epoch": 0.3881823942500499, + "grad_norm": 0.35546875, + "learning_rate": 0.0012122683903801103, + "loss": 0.1152, + "step": 44719 + }, + { + "epoch": 0.3881910747302541, + "grad_norm": 0.419921875, + "learning_rate": 0.0012122388603326888, + "loss": 0.1328, + "step": 44720 + }, + { + "epoch": 0.38819975521045824, + "grad_norm": 0.3046875, + "learning_rate": 0.0012122093301625246, + "loss": 0.1582, + "step": 44721 + }, + { + "epoch": 0.38820843569066243, + "grad_norm": 0.5234375, + "learning_rate": 0.0012121797998696502, + 
"loss": 0.1235, + "step": 44722 + }, + { + "epoch": 0.38821711617086657, + "grad_norm": 0.08251953125, + "learning_rate": 0.0012121502694540974, + "loss": 0.0728, + "step": 44723 + }, + { + "epoch": 0.38822579665107076, + "grad_norm": 0.1513671875, + "learning_rate": 0.0012121207389158989, + "loss": 0.0884, + "step": 44724 + }, + { + "epoch": 0.3882344771312749, + "grad_norm": 0.482421875, + "learning_rate": 0.0012120912082550868, + "loss": 0.2598, + "step": 44725 + }, + { + "epoch": 0.38824315761147904, + "grad_norm": 0.375, + "learning_rate": 0.0012120616774716935, + "loss": 0.0981, + "step": 44726 + }, + { + "epoch": 0.38825183809168323, + "grad_norm": 0.234375, + "learning_rate": 0.001212032146565751, + "loss": 0.0908, + "step": 44727 + }, + { + "epoch": 0.38826051857188737, + "grad_norm": 0.265625, + "learning_rate": 0.001212002615537292, + "loss": 0.1113, + "step": 44728 + }, + { + "epoch": 0.38826919905209156, + "grad_norm": 0.07177734375, + "learning_rate": 0.0012119730843863485, + "loss": 0.0732, + "step": 44729 + }, + { + "epoch": 0.3882778795322957, + "grad_norm": 0.5, + "learning_rate": 0.0012119435531129533, + "loss": 0.083, + "step": 44730 + }, + { + "epoch": 0.3882865600124999, + "grad_norm": 0.2216796875, + "learning_rate": 0.0012119140217171378, + "loss": 0.0757, + "step": 44731 + }, + { + "epoch": 0.38829524049270403, + "grad_norm": 0.27734375, + "learning_rate": 0.0012118844901989351, + "loss": 0.0977, + "step": 44732 + }, + { + "epoch": 0.3883039209729082, + "grad_norm": 0.9296875, + "learning_rate": 0.001211854958558377, + "loss": 0.1089, + "step": 44733 + }, + { + "epoch": 0.38831260145311236, + "grad_norm": 0.19921875, + "learning_rate": 0.0012118254267954962, + "loss": 0.1001, + "step": 44734 + }, + { + "epoch": 0.38832128193331655, + "grad_norm": 0.123046875, + "learning_rate": 0.001211795894910325, + "loss": 0.1006, + "step": 44735 + }, + { + "epoch": 0.3883299624135207, + "grad_norm": 0.10498046875, + "learning_rate": 0.0012117663629028952, + "loss": 0.0991, + "step": 44736 + }, + { + "epoch": 0.3883386428937249, + "grad_norm": 0.10791015625, + "learning_rate": 0.0012117368307732394, + "loss": 0.125, + "step": 44737 + }, + { + "epoch": 0.388347323373929, + "grad_norm": 0.1796875, + "learning_rate": 0.00121170729852139, + "loss": 0.0664, + "step": 44738 + }, + { + "epoch": 0.3883560038541332, + "grad_norm": 0.318359375, + "learning_rate": 0.0012116777661473792, + "loss": 0.1113, + "step": 44739 + }, + { + "epoch": 0.38836468433433735, + "grad_norm": 0.435546875, + "learning_rate": 0.0012116482336512388, + "loss": 0.0728, + "step": 44740 + }, + { + "epoch": 0.38837336481454154, + "grad_norm": 0.2275390625, + "learning_rate": 0.001211618701033002, + "loss": 0.126, + "step": 44741 + }, + { + "epoch": 0.3883820452947457, + "grad_norm": 0.12255859375, + "learning_rate": 0.0012115891682927006, + "loss": 0.1221, + "step": 44742 + }, + { + "epoch": 0.3883907257749499, + "grad_norm": 0.06884765625, + "learning_rate": 0.001211559635430367, + "loss": 0.082, + "step": 44743 + }, + { + "epoch": 0.388399406255154, + "grad_norm": 0.369140625, + "learning_rate": 0.0012115301024460332, + "loss": 0.1348, + "step": 44744 + }, + { + "epoch": 0.3884080867353582, + "grad_norm": 0.486328125, + "learning_rate": 0.0012115005693397322, + "loss": 0.0752, + "step": 44745 + }, + { + "epoch": 0.38841676721556234, + "grad_norm": 0.150390625, + "learning_rate": 0.0012114710361114955, + "loss": 0.0884, + "step": 44746 + }, + { + "epoch": 0.38842544769576653, + "grad_norm": 0.24609375, + 
"learning_rate": 0.0012114415027613562, + "loss": 0.124, + "step": 44747 + }, + { + "epoch": 0.38843412817597067, + "grad_norm": 0.2216796875, + "learning_rate": 0.0012114119692893455, + "loss": 0.064, + "step": 44748 + }, + { + "epoch": 0.38844280865617486, + "grad_norm": 0.189453125, + "learning_rate": 0.0012113824356954968, + "loss": 0.0771, + "step": 44749 + }, + { + "epoch": 0.388451489136379, + "grad_norm": 0.21875, + "learning_rate": 0.001211352901979842, + "loss": 0.0879, + "step": 44750 + }, + { + "epoch": 0.3884601696165832, + "grad_norm": 0.259765625, + "learning_rate": 0.0012113233681424129, + "loss": 0.0801, + "step": 44751 + }, + { + "epoch": 0.38846885009678733, + "grad_norm": 0.07861328125, + "learning_rate": 0.0012112938341832427, + "loss": 0.0742, + "step": 44752 + }, + { + "epoch": 0.3884775305769915, + "grad_norm": 0.08154296875, + "learning_rate": 0.0012112643001023628, + "loss": 0.1123, + "step": 44753 + }, + { + "epoch": 0.38848621105719566, + "grad_norm": 0.1328125, + "learning_rate": 0.001211234765899806, + "loss": 0.0762, + "step": 44754 + }, + { + "epoch": 0.38849489153739986, + "grad_norm": 0.29296875, + "learning_rate": 0.0012112052315756047, + "loss": 0.0825, + "step": 44755 + }, + { + "epoch": 0.388503572017604, + "grad_norm": 0.345703125, + "learning_rate": 0.001211175697129791, + "loss": 0.0903, + "step": 44756 + }, + { + "epoch": 0.3885122524978082, + "grad_norm": 0.10498046875, + "learning_rate": 0.001211146162562397, + "loss": 0.1055, + "step": 44757 + }, + { + "epoch": 0.3885209329780123, + "grad_norm": 0.431640625, + "learning_rate": 0.0012111166278734558, + "loss": 0.1104, + "step": 44758 + }, + { + "epoch": 0.3885296134582165, + "grad_norm": 0.10498046875, + "learning_rate": 0.001211087093062999, + "loss": 0.0957, + "step": 44759 + }, + { + "epoch": 0.38853829393842065, + "grad_norm": 0.203125, + "learning_rate": 0.0012110575581310587, + "loss": 0.0913, + "step": 44760 + }, + { + "epoch": 0.38854697441862485, + "grad_norm": 0.51953125, + "learning_rate": 0.0012110280230776677, + "loss": 0.1191, + "step": 44761 + }, + { + "epoch": 0.388555654898829, + "grad_norm": 0.1201171875, + "learning_rate": 0.0012109984879028581, + "loss": 0.0991, + "step": 44762 + }, + { + "epoch": 0.3885643353790332, + "grad_norm": 0.49609375, + "learning_rate": 0.001210968952606662, + "loss": 0.1245, + "step": 44763 + }, + { + "epoch": 0.3885730158592373, + "grad_norm": 0.265625, + "learning_rate": 0.0012109394171891119, + "loss": 0.1338, + "step": 44764 + }, + { + "epoch": 0.3885816963394415, + "grad_norm": 0.396484375, + "learning_rate": 0.0012109098816502407, + "loss": 0.0879, + "step": 44765 + }, + { + "epoch": 0.38859037681964564, + "grad_norm": 0.2890625, + "learning_rate": 0.0012108803459900796, + "loss": 0.0996, + "step": 44766 + }, + { + "epoch": 0.38859905729984984, + "grad_norm": 0.150390625, + "learning_rate": 0.0012108508102086615, + "loss": 0.1211, + "step": 44767 + }, + { + "epoch": 0.388607737780054, + "grad_norm": 0.07958984375, + "learning_rate": 0.0012108212743060188, + "loss": 0.0723, + "step": 44768 + }, + { + "epoch": 0.38861641826025817, + "grad_norm": 0.57421875, + "learning_rate": 0.0012107917382821836, + "loss": 0.1133, + "step": 44769 + }, + { + "epoch": 0.3886250987404623, + "grad_norm": 0.2890625, + "learning_rate": 0.001210762202137188, + "loss": 0.1475, + "step": 44770 + }, + { + "epoch": 0.3886337792206665, + "grad_norm": 0.6875, + "learning_rate": 0.0012107326658710646, + "loss": 0.0991, + "step": 44771 + }, + { + "epoch": 0.38864245970087063, 
+ "grad_norm": 0.287109375, + "learning_rate": 0.0012107031294838456, + "loss": 0.127, + "step": 44772 + }, + { + "epoch": 0.38865114018107483, + "grad_norm": 0.072265625, + "learning_rate": 0.0012106735929755634, + "loss": 0.082, + "step": 44773 + }, + { + "epoch": 0.38865982066127897, + "grad_norm": 0.1376953125, + "learning_rate": 0.0012106440563462502, + "loss": 0.167, + "step": 44774 + }, + { + "epoch": 0.38866850114148316, + "grad_norm": 0.3671875, + "learning_rate": 0.0012106145195959384, + "loss": 0.0942, + "step": 44775 + }, + { + "epoch": 0.3886771816216873, + "grad_norm": 0.416015625, + "learning_rate": 0.00121058498272466, + "loss": 0.1299, + "step": 44776 + }, + { + "epoch": 0.3886858621018915, + "grad_norm": 0.8046875, + "learning_rate": 0.001210555445732448, + "loss": 0.1387, + "step": 44777 + }, + { + "epoch": 0.3886945425820956, + "grad_norm": 0.640625, + "learning_rate": 0.001210525908619334, + "loss": 0.1445, + "step": 44778 + }, + { + "epoch": 0.3887032230622998, + "grad_norm": 0.2392578125, + "learning_rate": 0.0012104963713853501, + "loss": 0.0835, + "step": 44779 + }, + { + "epoch": 0.38871190354250396, + "grad_norm": 0.130859375, + "learning_rate": 0.0012104668340305294, + "loss": 0.0806, + "step": 44780 + }, + { + "epoch": 0.38872058402270815, + "grad_norm": 0.486328125, + "learning_rate": 0.0012104372965549037, + "loss": 0.0796, + "step": 44781 + }, + { + "epoch": 0.3887292645029123, + "grad_norm": 0.380859375, + "learning_rate": 0.0012104077589585056, + "loss": 0.0776, + "step": 44782 + }, + { + "epoch": 0.3887379449831165, + "grad_norm": 0.326171875, + "learning_rate": 0.001210378221241367, + "loss": 0.1406, + "step": 44783 + }, + { + "epoch": 0.3887466254633206, + "grad_norm": 0.69921875, + "learning_rate": 0.0012103486834035206, + "loss": 0.1455, + "step": 44784 + }, + { + "epoch": 0.3887553059435248, + "grad_norm": 0.13671875, + "learning_rate": 0.0012103191454449986, + "loss": 0.0869, + "step": 44785 + }, + { + "epoch": 0.38876398642372895, + "grad_norm": 0.1337890625, + "learning_rate": 0.0012102896073658332, + "loss": 0.104, + "step": 44786 + }, + { + "epoch": 0.38877266690393314, + "grad_norm": 0.380859375, + "learning_rate": 0.0012102600691660567, + "loss": 0.1172, + "step": 44787 + }, + { + "epoch": 0.3887813473841373, + "grad_norm": 0.10009765625, + "learning_rate": 0.0012102305308457012, + "loss": 0.0967, + "step": 44788 + }, + { + "epoch": 0.38879002786434147, + "grad_norm": 0.59375, + "learning_rate": 0.0012102009924047996, + "loss": 0.0898, + "step": 44789 + }, + { + "epoch": 0.3887987083445456, + "grad_norm": 0.11474609375, + "learning_rate": 0.0012101714538433842, + "loss": 0.0938, + "step": 44790 + }, + { + "epoch": 0.3888073888247498, + "grad_norm": 0.06494140625, + "learning_rate": 0.0012101419151614862, + "loss": 0.0806, + "step": 44791 + }, + { + "epoch": 0.38881606930495394, + "grad_norm": 0.16796875, + "learning_rate": 0.001210112376359139, + "loss": 0.1138, + "step": 44792 + }, + { + "epoch": 0.38882474978515813, + "grad_norm": 0.1435546875, + "learning_rate": 0.0012100828374363743, + "loss": 0.0752, + "step": 44793 + }, + { + "epoch": 0.38883343026536227, + "grad_norm": 0.2890625, + "learning_rate": 0.0012100532983932248, + "loss": 0.1064, + "step": 44794 + }, + { + "epoch": 0.38884211074556646, + "grad_norm": 0.07470703125, + "learning_rate": 0.0012100237592297229, + "loss": 0.0688, + "step": 44795 + }, + { + "epoch": 0.3888507912257706, + "grad_norm": 0.1201171875, + "learning_rate": 0.0012099942199459005, + "loss": 0.0928, + "step": 
44796 + }, + { + "epoch": 0.3888594717059748, + "grad_norm": 0.400390625, + "learning_rate": 0.00120996468054179, + "loss": 0.0918, + "step": 44797 + }, + { + "epoch": 0.38886815218617893, + "grad_norm": 0.2431640625, + "learning_rate": 0.0012099351410174242, + "loss": 0.0879, + "step": 44798 + }, + { + "epoch": 0.3888768326663831, + "grad_norm": 0.267578125, + "learning_rate": 0.0012099056013728344, + "loss": 0.0625, + "step": 44799 + }, + { + "epoch": 0.38888551314658726, + "grad_norm": 0.2001953125, + "learning_rate": 0.0012098760616080539, + "loss": 0.0757, + "step": 44800 + }, + { + "epoch": 0.38889419362679145, + "grad_norm": 0.419921875, + "learning_rate": 0.001209846521723114, + "loss": 0.0845, + "step": 44801 + }, + { + "epoch": 0.3889028741069956, + "grad_norm": 0.78125, + "learning_rate": 0.0012098169817180483, + "loss": 0.0938, + "step": 44802 + }, + { + "epoch": 0.3889115545871998, + "grad_norm": 0.69921875, + "learning_rate": 0.001209787441592888, + "loss": 0.1143, + "step": 44803 + }, + { + "epoch": 0.3889202350674039, + "grad_norm": 0.458984375, + "learning_rate": 0.001209757901347666, + "loss": 0.0933, + "step": 44804 + }, + { + "epoch": 0.3889289155476081, + "grad_norm": 0.2353515625, + "learning_rate": 0.0012097283609824143, + "loss": 0.1133, + "step": 44805 + }, + { + "epoch": 0.38893759602781225, + "grad_norm": 0.455078125, + "learning_rate": 0.0012096988204971656, + "loss": 0.0952, + "step": 44806 + }, + { + "epoch": 0.38894627650801644, + "grad_norm": 0.34375, + "learning_rate": 0.0012096692798919517, + "loss": 0.0947, + "step": 44807 + }, + { + "epoch": 0.3889549569882206, + "grad_norm": 0.3359375, + "learning_rate": 0.001209639739166805, + "loss": 0.0728, + "step": 44808 + }, + { + "epoch": 0.38896363746842477, + "grad_norm": 0.5625, + "learning_rate": 0.0012096101983217577, + "loss": 0.1001, + "step": 44809 + }, + { + "epoch": 0.3889723179486289, + "grad_norm": 0.384765625, + "learning_rate": 0.001209580657356843, + "loss": 0.1094, + "step": 44810 + }, + { + "epoch": 0.3889809984288331, + "grad_norm": 0.1123046875, + "learning_rate": 0.0012095511162720921, + "loss": 0.0908, + "step": 44811 + }, + { + "epoch": 0.38898967890903724, + "grad_norm": 0.54296875, + "learning_rate": 0.001209521575067538, + "loss": 0.0977, + "step": 44812 + }, + { + "epoch": 0.38899835938924143, + "grad_norm": 0.259765625, + "learning_rate": 0.0012094920337432122, + "loss": 0.0957, + "step": 44813 + }, + { + "epoch": 0.38900703986944557, + "grad_norm": 0.1064453125, + "learning_rate": 0.001209462492299148, + "loss": 0.0884, + "step": 44814 + }, + { + "epoch": 0.38901572034964976, + "grad_norm": 0.291015625, + "learning_rate": 0.0012094329507353774, + "loss": 0.1045, + "step": 44815 + }, + { + "epoch": 0.3890244008298539, + "grad_norm": 0.2412109375, + "learning_rate": 0.0012094034090519323, + "loss": 0.1123, + "step": 44816 + }, + { + "epoch": 0.3890330813100581, + "grad_norm": 0.44140625, + "learning_rate": 0.0012093738672488456, + "loss": 0.0688, + "step": 44817 + }, + { + "epoch": 0.38904176179026223, + "grad_norm": 0.3046875, + "learning_rate": 0.0012093443253261486, + "loss": 0.0786, + "step": 44818 + }, + { + "epoch": 0.3890504422704664, + "grad_norm": 0.2265625, + "learning_rate": 0.001209314783283875, + "loss": 0.0576, + "step": 44819 + }, + { + "epoch": 0.38905912275067056, + "grad_norm": 0.2392578125, + "learning_rate": 0.0012092852411220557, + "loss": 0.0786, + "step": 44820 + }, + { + "epoch": 0.38906780323087475, + "grad_norm": 0.1796875, + "learning_rate": 
0.0012092556988407244, + "loss": 0.1172, + "step": 44821 + }, + { + "epoch": 0.3890764837110789, + "grad_norm": 0.091796875, + "learning_rate": 0.0012092261564399123, + "loss": 0.1309, + "step": 44822 + }, + { + "epoch": 0.3890851641912831, + "grad_norm": 0.353515625, + "learning_rate": 0.0012091966139196521, + "loss": 0.082, + "step": 44823 + }, + { + "epoch": 0.3890938446714872, + "grad_norm": 1.453125, + "learning_rate": 0.0012091670712799764, + "loss": 0.1328, + "step": 44824 + }, + { + "epoch": 0.3891025251516914, + "grad_norm": 0.1259765625, + "learning_rate": 0.0012091375285209175, + "loss": 0.1006, + "step": 44825 + }, + { + "epoch": 0.38911120563189555, + "grad_norm": 0.130859375, + "learning_rate": 0.0012091079856425067, + "loss": 0.1011, + "step": 44826 + }, + { + "epoch": 0.38911988611209974, + "grad_norm": 0.162109375, + "learning_rate": 0.0012090784426447773, + "loss": 0.1108, + "step": 44827 + }, + { + "epoch": 0.3891285665923039, + "grad_norm": 0.421875, + "learning_rate": 0.0012090488995277618, + "loss": 0.1006, + "step": 44828 + }, + { + "epoch": 0.3891372470725081, + "grad_norm": 0.162109375, + "learning_rate": 0.0012090193562914915, + "loss": 0.084, + "step": 44829 + }, + { + "epoch": 0.3891459275527122, + "grad_norm": 0.4921875, + "learning_rate": 0.0012089898129359995, + "loss": 0.1006, + "step": 44830 + }, + { + "epoch": 0.3891546080329164, + "grad_norm": 0.328125, + "learning_rate": 0.0012089602694613179, + "loss": 0.1094, + "step": 44831 + }, + { + "epoch": 0.38916328851312054, + "grad_norm": 0.2451171875, + "learning_rate": 0.0012089307258674787, + "loss": 0.2109, + "step": 44832 + }, + { + "epoch": 0.38917196899332474, + "grad_norm": 0.30859375, + "learning_rate": 0.0012089011821545149, + "loss": 0.1318, + "step": 44833 + }, + { + "epoch": 0.3891806494735289, + "grad_norm": 0.61328125, + "learning_rate": 0.0012088716383224583, + "loss": 0.0854, + "step": 44834 + }, + { + "epoch": 0.38918932995373307, + "grad_norm": 0.51171875, + "learning_rate": 0.001208842094371341, + "loss": 0.1133, + "step": 44835 + }, + { + "epoch": 0.3891980104339372, + "grad_norm": 0.1943359375, + "learning_rate": 0.001208812550301196, + "loss": 0.1035, + "step": 44836 + }, + { + "epoch": 0.3892066909141414, + "grad_norm": 0.302734375, + "learning_rate": 0.001208783006112055, + "loss": 0.0918, + "step": 44837 + }, + { + "epoch": 0.38921537139434553, + "grad_norm": 0.421875, + "learning_rate": 0.0012087534618039507, + "loss": 0.0947, + "step": 44838 + }, + { + "epoch": 0.3892240518745497, + "grad_norm": 0.1025390625, + "learning_rate": 0.0012087239173769154, + "loss": 0.1602, + "step": 44839 + }, + { + "epoch": 0.38923273235475386, + "grad_norm": 0.10205078125, + "learning_rate": 0.0012086943728309807, + "loss": 0.0879, + "step": 44840 + }, + { + "epoch": 0.38924141283495806, + "grad_norm": 0.2275390625, + "learning_rate": 0.0012086648281661799, + "loss": 0.0718, + "step": 44841 + }, + { + "epoch": 0.3892500933151622, + "grad_norm": 0.97265625, + "learning_rate": 0.0012086352833825449, + "loss": 0.1201, + "step": 44842 + }, + { + "epoch": 0.3892587737953664, + "grad_norm": 0.57421875, + "learning_rate": 0.0012086057384801078, + "loss": 0.1211, + "step": 44843 + }, + { + "epoch": 0.3892674542755705, + "grad_norm": 0.1708984375, + "learning_rate": 0.0012085761934589013, + "loss": 0.0703, + "step": 44844 + }, + { + "epoch": 0.3892761347557747, + "grad_norm": 0.37109375, + "learning_rate": 0.0012085466483189571, + "loss": 0.085, + "step": 44845 + }, + { + "epoch": 0.38928481523597885, + 
"grad_norm": 0.2236328125, + "learning_rate": 0.0012085171030603084, + "loss": 0.1074, + "step": 44846 + }, + { + "epoch": 0.38929349571618305, + "grad_norm": 0.18359375, + "learning_rate": 0.0012084875576829869, + "loss": 0.0801, + "step": 44847 + }, + { + "epoch": 0.3893021761963872, + "grad_norm": 0.25, + "learning_rate": 0.0012084580121870248, + "loss": 0.0723, + "step": 44848 + }, + { + "epoch": 0.3893108566765913, + "grad_norm": 0.3125, + "learning_rate": 0.0012084284665724546, + "loss": 0.1021, + "step": 44849 + }, + { + "epoch": 0.3893195371567955, + "grad_norm": 0.310546875, + "learning_rate": 0.001208398920839309, + "loss": 0.0889, + "step": 44850 + }, + { + "epoch": 0.38932821763699965, + "grad_norm": 0.330078125, + "learning_rate": 0.0012083693749876199, + "loss": 0.1084, + "step": 44851 + }, + { + "epoch": 0.38933689811720384, + "grad_norm": 0.1875, + "learning_rate": 0.0012083398290174195, + "loss": 0.0869, + "step": 44852 + }, + { + "epoch": 0.389345578597408, + "grad_norm": 0.5390625, + "learning_rate": 0.0012083102829287405, + "loss": 0.0918, + "step": 44853 + }, + { + "epoch": 0.3893542590776122, + "grad_norm": 0.1787109375, + "learning_rate": 0.0012082807367216149, + "loss": 0.1201, + "step": 44854 + }, + { + "epoch": 0.3893629395578163, + "grad_norm": 0.447265625, + "learning_rate": 0.001208251190396075, + "loss": 0.1089, + "step": 44855 + }, + { + "epoch": 0.3893716200380205, + "grad_norm": 0.162109375, + "learning_rate": 0.0012082216439521533, + "loss": 0.0938, + "step": 44856 + }, + { + "epoch": 0.38938030051822464, + "grad_norm": 0.1708984375, + "learning_rate": 0.001208192097389882, + "loss": 0.0742, + "step": 44857 + }, + { + "epoch": 0.38938898099842884, + "grad_norm": 0.95703125, + "learning_rate": 0.0012081625507092935, + "loss": 0.0889, + "step": 44858 + }, + { + "epoch": 0.389397661478633, + "grad_norm": 0.076171875, + "learning_rate": 0.00120813300391042, + "loss": 0.0791, + "step": 44859 + }, + { + "epoch": 0.38940634195883717, + "grad_norm": 0.478515625, + "learning_rate": 0.0012081034569932942, + "loss": 0.0957, + "step": 44860 + }, + { + "epoch": 0.3894150224390413, + "grad_norm": 0.703125, + "learning_rate": 0.0012080739099579478, + "loss": 0.0884, + "step": 44861 + }, + { + "epoch": 0.3894237029192455, + "grad_norm": 0.103515625, + "learning_rate": 0.0012080443628044134, + "loss": 0.084, + "step": 44862 + }, + { + "epoch": 0.38943238339944963, + "grad_norm": 0.1259765625, + "learning_rate": 0.0012080148155327233, + "loss": 0.0708, + "step": 44863 + }, + { + "epoch": 0.3894410638796538, + "grad_norm": 0.11474609375, + "learning_rate": 0.00120798526814291, + "loss": 0.0752, + "step": 44864 + }, + { + "epoch": 0.38944974435985796, + "grad_norm": 0.2109375, + "learning_rate": 0.0012079557206350056, + "loss": 0.0698, + "step": 44865 + }, + { + "epoch": 0.38945842484006216, + "grad_norm": 0.412109375, + "learning_rate": 0.001207926173009042, + "loss": 0.1221, + "step": 44866 + }, + { + "epoch": 0.3894671053202663, + "grad_norm": 0.2041015625, + "learning_rate": 0.0012078966252650526, + "loss": 0.1182, + "step": 44867 + }, + { + "epoch": 0.3894757858004705, + "grad_norm": 1.3359375, + "learning_rate": 0.0012078670774030686, + "loss": 0.1113, + "step": 44868 + }, + { + "epoch": 0.3894844662806746, + "grad_norm": 0.2578125, + "learning_rate": 0.0012078375294231228, + "loss": 0.0801, + "step": 44869 + }, + { + "epoch": 0.3894931467608788, + "grad_norm": 0.44140625, + "learning_rate": 0.0012078079813252476, + "loss": 0.103, + "step": 44870 + }, + { + "epoch": 
0.38950182724108295, + "grad_norm": 0.337890625, + "learning_rate": 0.0012077784331094754, + "loss": 0.0869, + "step": 44871 + }, + { + "epoch": 0.38951050772128715, + "grad_norm": 0.1123046875, + "learning_rate": 0.0012077488847758381, + "loss": 0.1211, + "step": 44872 + }, + { + "epoch": 0.3895191882014913, + "grad_norm": 0.11669921875, + "learning_rate": 0.0012077193363243686, + "loss": 0.1069, + "step": 44873 + }, + { + "epoch": 0.3895278686816955, + "grad_norm": 0.2109375, + "learning_rate": 0.0012076897877550984, + "loss": 0.1348, + "step": 44874 + }, + { + "epoch": 0.3895365491618996, + "grad_norm": 0.1787109375, + "learning_rate": 0.0012076602390680606, + "loss": 0.0771, + "step": 44875 + }, + { + "epoch": 0.3895452296421038, + "grad_norm": 0.11669921875, + "learning_rate": 0.001207630690263287, + "loss": 0.1167, + "step": 44876 + }, + { + "epoch": 0.38955391012230794, + "grad_norm": 0.369140625, + "learning_rate": 0.00120760114134081, + "loss": 0.1641, + "step": 44877 + }, + { + "epoch": 0.38956259060251214, + "grad_norm": 0.08642578125, + "learning_rate": 0.001207571592300662, + "loss": 0.104, + "step": 44878 + }, + { + "epoch": 0.3895712710827163, + "grad_norm": 0.357421875, + "learning_rate": 0.0012075420431428756, + "loss": 0.0623, + "step": 44879 + }, + { + "epoch": 0.38957995156292047, + "grad_norm": 0.6875, + "learning_rate": 0.0012075124938674827, + "loss": 0.0977, + "step": 44880 + }, + { + "epoch": 0.3895886320431246, + "grad_norm": 0.251953125, + "learning_rate": 0.0012074829444745157, + "loss": 0.0786, + "step": 44881 + }, + { + "epoch": 0.3895973125233288, + "grad_norm": 0.1201171875, + "learning_rate": 0.001207453394964007, + "loss": 0.0732, + "step": 44882 + }, + { + "epoch": 0.38960599300353294, + "grad_norm": 0.12451171875, + "learning_rate": 0.001207423845335989, + "loss": 0.0757, + "step": 44883 + }, + { + "epoch": 0.38961467348373713, + "grad_norm": 0.6796875, + "learning_rate": 0.0012073942955904938, + "loss": 0.0972, + "step": 44884 + }, + { + "epoch": 0.38962335396394127, + "grad_norm": 0.3515625, + "learning_rate": 0.0012073647457275538, + "loss": 0.0913, + "step": 44885 + }, + { + "epoch": 0.38963203444414546, + "grad_norm": 0.33984375, + "learning_rate": 0.0012073351957472011, + "loss": 0.0835, + "step": 44886 + }, + { + "epoch": 0.3896407149243496, + "grad_norm": 0.19921875, + "learning_rate": 0.0012073056456494687, + "loss": 0.0718, + "step": 44887 + }, + { + "epoch": 0.3896493954045538, + "grad_norm": 0.3125, + "learning_rate": 0.0012072760954343882, + "loss": 0.1172, + "step": 44888 + }, + { + "epoch": 0.3896580758847579, + "grad_norm": 0.42578125, + "learning_rate": 0.0012072465451019921, + "loss": 0.1167, + "step": 44889 + }, + { + "epoch": 0.3896667563649621, + "grad_norm": 0.08544921875, + "learning_rate": 0.001207216994652313, + "loss": 0.0952, + "step": 44890 + }, + { + "epoch": 0.38967543684516626, + "grad_norm": 0.3125, + "learning_rate": 0.001207187444085383, + "loss": 0.0898, + "step": 44891 + }, + { + "epoch": 0.38968411732537045, + "grad_norm": 0.271484375, + "learning_rate": 0.0012071578934012342, + "loss": 0.0894, + "step": 44892 + }, + { + "epoch": 0.3896927978055746, + "grad_norm": 0.482421875, + "learning_rate": 0.0012071283425998994, + "loss": 0.1123, + "step": 44893 + }, + { + "epoch": 0.3897014782857788, + "grad_norm": 0.126953125, + "learning_rate": 0.0012070987916814105, + "loss": 0.1152, + "step": 44894 + }, + { + "epoch": 0.3897101587659829, + "grad_norm": 0.126953125, + "learning_rate": 0.0012070692406457998, + "loss": 0.0874, 
+ "step": 44895 + }, + { + "epoch": 0.3897188392461871, + "grad_norm": 0.166015625, + "learning_rate": 0.0012070396894931, + "loss": 0.0957, + "step": 44896 + }, + { + "epoch": 0.38972751972639125, + "grad_norm": 1.0, + "learning_rate": 0.001207010138223343, + "loss": 0.126, + "step": 44897 + }, + { + "epoch": 0.38973620020659544, + "grad_norm": 0.74609375, + "learning_rate": 0.0012069805868365616, + "loss": 0.0796, + "step": 44898 + }, + { + "epoch": 0.3897448806867996, + "grad_norm": 0.27734375, + "learning_rate": 0.0012069510353327876, + "loss": 0.1152, + "step": 44899 + }, + { + "epoch": 0.38975356116700377, + "grad_norm": 0.60546875, + "learning_rate": 0.0012069214837120536, + "loss": 0.0962, + "step": 44900 + }, + { + "epoch": 0.3897622416472079, + "grad_norm": 0.29296875, + "learning_rate": 0.001206891931974392, + "loss": 0.1045, + "step": 44901 + }, + { + "epoch": 0.3897709221274121, + "grad_norm": 0.2265625, + "learning_rate": 0.001206862380119835, + "loss": 0.1201, + "step": 44902 + }, + { + "epoch": 0.38977960260761624, + "grad_norm": 0.2060546875, + "learning_rate": 0.0012068328281484152, + "loss": 0.0908, + "step": 44903 + }, + { + "epoch": 0.38978828308782043, + "grad_norm": 0.2890625, + "learning_rate": 0.0012068032760601639, + "loss": 0.1309, + "step": 44904 + }, + { + "epoch": 0.38979696356802457, + "grad_norm": 0.54296875, + "learning_rate": 0.0012067737238551147, + "loss": 0.1157, + "step": 44905 + }, + { + "epoch": 0.38980564404822876, + "grad_norm": 0.484375, + "learning_rate": 0.001206744171533299, + "loss": 0.0986, + "step": 44906 + }, + { + "epoch": 0.3898143245284329, + "grad_norm": 0.28515625, + "learning_rate": 0.0012067146190947499, + "loss": 0.1011, + "step": 44907 + }, + { + "epoch": 0.3898230050086371, + "grad_norm": 0.349609375, + "learning_rate": 0.0012066850665394988, + "loss": 0.0898, + "step": 44908 + }, + { + "epoch": 0.38983168548884123, + "grad_norm": 0.291015625, + "learning_rate": 0.0012066555138675788, + "loss": 0.0752, + "step": 44909 + }, + { + "epoch": 0.3898403659690454, + "grad_norm": 0.083984375, + "learning_rate": 0.001206625961079022, + "loss": 0.0889, + "step": 44910 + }, + { + "epoch": 0.38984904644924956, + "grad_norm": 0.283203125, + "learning_rate": 0.0012065964081738605, + "loss": 0.0918, + "step": 44911 + }, + { + "epoch": 0.38985772692945375, + "grad_norm": 0.265625, + "learning_rate": 0.001206566855152127, + "loss": 0.0933, + "step": 44912 + }, + { + "epoch": 0.3898664074096579, + "grad_norm": 0.349609375, + "learning_rate": 0.0012065373020138533, + "loss": 0.1318, + "step": 44913 + }, + { + "epoch": 0.3898750878898621, + "grad_norm": 0.171875, + "learning_rate": 0.001206507748759072, + "loss": 0.1011, + "step": 44914 + }, + { + "epoch": 0.3898837683700662, + "grad_norm": 0.1806640625, + "learning_rate": 0.0012064781953878157, + "loss": 0.1182, + "step": 44915 + }, + { + "epoch": 0.3898924488502704, + "grad_norm": 0.193359375, + "learning_rate": 0.0012064486419001161, + "loss": 0.0723, + "step": 44916 + }, + { + "epoch": 0.38990112933047455, + "grad_norm": 0.1982421875, + "learning_rate": 0.001206419088296006, + "loss": 0.0947, + "step": 44917 + }, + { + "epoch": 0.38990980981067874, + "grad_norm": 0.2177734375, + "learning_rate": 0.0012063895345755177, + "loss": 0.0825, + "step": 44918 + }, + { + "epoch": 0.3899184902908829, + "grad_norm": 0.12451171875, + "learning_rate": 0.0012063599807386834, + "loss": 0.0918, + "step": 44919 + }, + { + "epoch": 0.3899271707710871, + "grad_norm": 0.11181640625, + "learning_rate": 
0.0012063304267855355, + "loss": 0.1035, + "step": 44920 + }, + { + "epoch": 0.3899358512512912, + "grad_norm": 0.11962890625, + "learning_rate": 0.001206300872716106, + "loss": 0.0923, + "step": 44921 + }, + { + "epoch": 0.3899445317314954, + "grad_norm": 0.162109375, + "learning_rate": 0.0012062713185304278, + "loss": 0.1167, + "step": 44922 + }, + { + "epoch": 0.38995321221169954, + "grad_norm": 0.263671875, + "learning_rate": 0.0012062417642285323, + "loss": 0.0874, + "step": 44923 + }, + { + "epoch": 0.38996189269190373, + "grad_norm": 0.2060546875, + "learning_rate": 0.001206212209810453, + "loss": 0.0801, + "step": 44924 + }, + { + "epoch": 0.38997057317210787, + "grad_norm": 0.11328125, + "learning_rate": 0.0012061826552762212, + "loss": 0.1328, + "step": 44925 + }, + { + "epoch": 0.38997925365231206, + "grad_norm": 0.134765625, + "learning_rate": 0.0012061531006258698, + "loss": 0.1064, + "step": 44926 + }, + { + "epoch": 0.3899879341325162, + "grad_norm": 0.1943359375, + "learning_rate": 0.001206123545859431, + "loss": 0.0864, + "step": 44927 + }, + { + "epoch": 0.3899966146127204, + "grad_norm": 0.40625, + "learning_rate": 0.001206093990976937, + "loss": 0.0898, + "step": 44928 + }, + { + "epoch": 0.39000529509292453, + "grad_norm": 0.455078125, + "learning_rate": 0.0012060644359784202, + "loss": 0.207, + "step": 44929 + }, + { + "epoch": 0.3900139755731287, + "grad_norm": 0.1494140625, + "learning_rate": 0.001206034880863913, + "loss": 0.1006, + "step": 44930 + }, + { + "epoch": 0.39002265605333286, + "grad_norm": 0.234375, + "learning_rate": 0.0012060053256334477, + "loss": 0.0869, + "step": 44931 + }, + { + "epoch": 0.39003133653353705, + "grad_norm": 0.0712890625, + "learning_rate": 0.0012059757702870566, + "loss": 0.0483, + "step": 44932 + }, + { + "epoch": 0.3900400170137412, + "grad_norm": 0.48046875, + "learning_rate": 0.001205946214824772, + "loss": 0.1104, + "step": 44933 + }, + { + "epoch": 0.3900486974939454, + "grad_norm": 0.54296875, + "learning_rate": 0.001205916659246626, + "loss": 0.0918, + "step": 44934 + }, + { + "epoch": 0.3900573779741495, + "grad_norm": 0.10888671875, + "learning_rate": 0.0012058871035526514, + "loss": 0.1221, + "step": 44935 + }, + { + "epoch": 0.3900660584543537, + "grad_norm": 0.3828125, + "learning_rate": 0.00120585754774288, + "loss": 0.082, + "step": 44936 + }, + { + "epoch": 0.39007473893455785, + "grad_norm": 0.271484375, + "learning_rate": 0.0012058279918173444, + "loss": 0.0845, + "step": 44937 + }, + { + "epoch": 0.39008341941476204, + "grad_norm": 0.29296875, + "learning_rate": 0.0012057984357760772, + "loss": 0.105, + "step": 44938 + }, + { + "epoch": 0.3900920998949662, + "grad_norm": 0.1708984375, + "learning_rate": 0.00120576887961911, + "loss": 0.0908, + "step": 44939 + }, + { + "epoch": 0.3901007803751704, + "grad_norm": 0.373046875, + "learning_rate": 0.0012057393233464758, + "loss": 0.104, + "step": 44940 + }, + { + "epoch": 0.3901094608553745, + "grad_norm": 0.1806640625, + "learning_rate": 0.0012057097669582067, + "loss": 0.1216, + "step": 44941 + }, + { + "epoch": 0.3901181413355787, + "grad_norm": 0.34375, + "learning_rate": 0.001205680210454335, + "loss": 0.0889, + "step": 44942 + }, + { + "epoch": 0.39012682181578284, + "grad_norm": 0.197265625, + "learning_rate": 0.0012056506538348929, + "loss": 0.1475, + "step": 44943 + }, + { + "epoch": 0.39013550229598704, + "grad_norm": 0.10498046875, + "learning_rate": 0.0012056210970999126, + "loss": 0.1445, + "step": 44944 + }, + { + "epoch": 0.3901441827761912, + 
"grad_norm": 0.2421875, + "learning_rate": 0.0012055915402494266, + "loss": 0.0771, + "step": 44945 + }, + { + "epoch": 0.39015286325639537, + "grad_norm": 0.26953125, + "learning_rate": 0.0012055619832834676, + "loss": 0.1309, + "step": 44946 + }, + { + "epoch": 0.3901615437365995, + "grad_norm": 0.80078125, + "learning_rate": 0.0012055324262020676, + "loss": 0.1338, + "step": 44947 + }, + { + "epoch": 0.3901702242168037, + "grad_norm": 0.10888671875, + "learning_rate": 0.0012055028690052593, + "loss": 0.0811, + "step": 44948 + }, + { + "epoch": 0.39017890469700783, + "grad_norm": 0.10498046875, + "learning_rate": 0.001205473311693074, + "loss": 0.0845, + "step": 44949 + }, + { + "epoch": 0.390187585177212, + "grad_norm": 0.52734375, + "learning_rate": 0.001205443754265545, + "loss": 0.1162, + "step": 44950 + }, + { + "epoch": 0.39019626565741616, + "grad_norm": 0.171875, + "learning_rate": 0.0012054141967227045, + "loss": 0.0898, + "step": 44951 + }, + { + "epoch": 0.39020494613762036, + "grad_norm": 0.1728515625, + "learning_rate": 0.001205384639064584, + "loss": 0.1055, + "step": 44952 + }, + { + "epoch": 0.3902136266178245, + "grad_norm": 0.19921875, + "learning_rate": 0.0012053550812912166, + "loss": 0.0991, + "step": 44953 + }, + { + "epoch": 0.3902223070980287, + "grad_norm": 0.291015625, + "learning_rate": 0.0012053255234026347, + "loss": 0.1035, + "step": 44954 + }, + { + "epoch": 0.3902309875782328, + "grad_norm": 0.267578125, + "learning_rate": 0.0012052959653988703, + "loss": 0.082, + "step": 44955 + }, + { + "epoch": 0.390239668058437, + "grad_norm": 0.43359375, + "learning_rate": 0.0012052664072799558, + "loss": 0.0947, + "step": 44956 + }, + { + "epoch": 0.39024834853864115, + "grad_norm": 0.330078125, + "learning_rate": 0.0012052368490459236, + "loss": 0.0913, + "step": 44957 + }, + { + "epoch": 0.39025702901884535, + "grad_norm": 0.1201171875, + "learning_rate": 0.001205207290696806, + "loss": 0.0776, + "step": 44958 + }, + { + "epoch": 0.3902657094990495, + "grad_norm": 0.203125, + "learning_rate": 0.0012051777322326354, + "loss": 0.0996, + "step": 44959 + }, + { + "epoch": 0.3902743899792537, + "grad_norm": 0.19140625, + "learning_rate": 0.0012051481736534435, + "loss": 0.0977, + "step": 44960 + }, + { + "epoch": 0.3902830704594578, + "grad_norm": 0.63671875, + "learning_rate": 0.0012051186149592637, + "loss": 0.0684, + "step": 44961 + }, + { + "epoch": 0.390291750939662, + "grad_norm": 0.1494140625, + "learning_rate": 0.0012050890561501276, + "loss": 0.0864, + "step": 44962 + }, + { + "epoch": 0.39030043141986615, + "grad_norm": 0.58984375, + "learning_rate": 0.0012050594972260675, + "loss": 0.1064, + "step": 44963 + }, + { + "epoch": 0.39030911190007034, + "grad_norm": 0.2021484375, + "learning_rate": 0.001205029938187116, + "loss": 0.0977, + "step": 44964 + }, + { + "epoch": 0.3903177923802745, + "grad_norm": 0.61328125, + "learning_rate": 0.0012050003790333053, + "loss": 0.1348, + "step": 44965 + }, + { + "epoch": 0.39032647286047867, + "grad_norm": 0.07666015625, + "learning_rate": 0.0012049708197646681, + "loss": 0.063, + "step": 44966 + }, + { + "epoch": 0.3903351533406828, + "grad_norm": 0.33203125, + "learning_rate": 0.001204941260381236, + "loss": 0.125, + "step": 44967 + }, + { + "epoch": 0.390343833820887, + "grad_norm": 0.1669921875, + "learning_rate": 0.0012049117008830418, + "loss": 0.0835, + "step": 44968 + }, + { + "epoch": 0.39035251430109114, + "grad_norm": 0.14453125, + "learning_rate": 0.0012048821412701178, + "loss": 0.084, + "step": 44969 + }, + { 
+ "epoch": 0.39036119478129533, + "grad_norm": 0.1767578125, + "learning_rate": 0.0012048525815424962, + "loss": 0.0938, + "step": 44970 + }, + { + "epoch": 0.39036987526149947, + "grad_norm": 0.1572265625, + "learning_rate": 0.0012048230217002096, + "loss": 0.1035, + "step": 44971 + }, + { + "epoch": 0.3903785557417036, + "grad_norm": 0.1552734375, + "learning_rate": 0.00120479346174329, + "loss": 0.1348, + "step": 44972 + }, + { + "epoch": 0.3903872362219078, + "grad_norm": 0.263671875, + "learning_rate": 0.0012047639016717698, + "loss": 0.1377, + "step": 44973 + }, + { + "epoch": 0.39039591670211193, + "grad_norm": 0.58984375, + "learning_rate": 0.0012047343414856814, + "loss": 0.1177, + "step": 44974 + }, + { + "epoch": 0.3904045971823161, + "grad_norm": 0.408203125, + "learning_rate": 0.001204704781185057, + "loss": 0.0928, + "step": 44975 + }, + { + "epoch": 0.39041327766252026, + "grad_norm": 0.6015625, + "learning_rate": 0.001204675220769929, + "loss": 0.1504, + "step": 44976 + }, + { + "epoch": 0.39042195814272446, + "grad_norm": 0.40625, + "learning_rate": 0.0012046456602403297, + "loss": 0.0947, + "step": 44977 + }, + { + "epoch": 0.3904306386229286, + "grad_norm": 0.94921875, + "learning_rate": 0.0012046160995962919, + "loss": 0.0879, + "step": 44978 + }, + { + "epoch": 0.3904393191031328, + "grad_norm": 0.10009765625, + "learning_rate": 0.001204586538837847, + "loss": 0.0972, + "step": 44979 + }, + { + "epoch": 0.3904479995833369, + "grad_norm": 0.1435546875, + "learning_rate": 0.0012045569779650283, + "loss": 0.0801, + "step": 44980 + }, + { + "epoch": 0.3904566800635411, + "grad_norm": 0.318359375, + "learning_rate": 0.0012045274169778675, + "loss": 0.0938, + "step": 44981 + }, + { + "epoch": 0.39046536054374525, + "grad_norm": 0.1826171875, + "learning_rate": 0.0012044978558763968, + "loss": 0.1221, + "step": 44982 + }, + { + "epoch": 0.39047404102394945, + "grad_norm": 0.3828125, + "learning_rate": 0.0012044682946606492, + "loss": 0.1309, + "step": 44983 + }, + { + "epoch": 0.3904827215041536, + "grad_norm": 0.212890625, + "learning_rate": 0.0012044387333306563, + "loss": 0.126, + "step": 44984 + }, + { + "epoch": 0.3904914019843578, + "grad_norm": 0.23046875, + "learning_rate": 0.0012044091718864509, + "loss": 0.0879, + "step": 44985 + }, + { + "epoch": 0.3905000824645619, + "grad_norm": 0.8359375, + "learning_rate": 0.0012043796103280652, + "loss": 0.1045, + "step": 44986 + }, + { + "epoch": 0.3905087629447661, + "grad_norm": 0.11572265625, + "learning_rate": 0.0012043500486555317, + "loss": 0.1152, + "step": 44987 + }, + { + "epoch": 0.39051744342497025, + "grad_norm": 0.486328125, + "learning_rate": 0.0012043204868688825, + "loss": 0.1021, + "step": 44988 + }, + { + "epoch": 0.39052612390517444, + "grad_norm": 0.2431640625, + "learning_rate": 0.00120429092496815, + "loss": 0.0947, + "step": 44989 + }, + { + "epoch": 0.3905348043853786, + "grad_norm": 0.59375, + "learning_rate": 0.001204261362953366, + "loss": 0.1094, + "step": 44990 + }, + { + "epoch": 0.39054348486558277, + "grad_norm": 0.10205078125, + "learning_rate": 0.001204231800824564, + "loss": 0.1152, + "step": 44991 + }, + { + "epoch": 0.3905521653457869, + "grad_norm": 0.267578125, + "learning_rate": 0.0012042022385817756, + "loss": 0.1011, + "step": 44992 + }, + { + "epoch": 0.3905608458259911, + "grad_norm": 0.353515625, + "learning_rate": 0.0012041726762250327, + "loss": 0.1113, + "step": 44993 + }, + { + "epoch": 0.39056952630619524, + "grad_norm": 0.36328125, + "learning_rate": 0.0012041431137543685, + 
"loss": 0.1162, + "step": 44994 + }, + { + "epoch": 0.39057820678639943, + "grad_norm": 0.21484375, + "learning_rate": 0.001204113551169815, + "loss": 0.1064, + "step": 44995 + }, + { + "epoch": 0.39058688726660357, + "grad_norm": 0.06591796875, + "learning_rate": 0.0012040839884714042, + "loss": 0.0791, + "step": 44996 + }, + { + "epoch": 0.39059556774680776, + "grad_norm": 0.298828125, + "learning_rate": 0.001204054425659169, + "loss": 0.1201, + "step": 44997 + }, + { + "epoch": 0.3906042482270119, + "grad_norm": 0.265625, + "learning_rate": 0.0012040248627331414, + "loss": 0.082, + "step": 44998 + }, + { + "epoch": 0.3906129287072161, + "grad_norm": 0.33203125, + "learning_rate": 0.0012039952996933537, + "loss": 0.1074, + "step": 44999 + }, + { + "epoch": 0.3906216091874202, + "grad_norm": 0.2255859375, + "learning_rate": 0.0012039657365398384, + "loss": 0.1235, + "step": 45000 + }, + { + "epoch": 0.3906302896676244, + "grad_norm": 0.0927734375, + "learning_rate": 0.0012039361732726277, + "loss": 0.0752, + "step": 45001 + }, + { + "epoch": 0.39063897014782856, + "grad_norm": 0.26171875, + "learning_rate": 0.0012039066098917537, + "loss": 0.1113, + "step": 45002 + }, + { + "epoch": 0.39064765062803275, + "grad_norm": 0.259765625, + "learning_rate": 0.001203877046397249, + "loss": 0.0957, + "step": 45003 + }, + { + "epoch": 0.3906563311082369, + "grad_norm": 0.158203125, + "learning_rate": 0.0012038474827891464, + "loss": 0.1133, + "step": 45004 + }, + { + "epoch": 0.3906650115884411, + "grad_norm": 0.318359375, + "learning_rate": 0.0012038179190674773, + "loss": 0.1309, + "step": 45005 + }, + { + "epoch": 0.3906736920686452, + "grad_norm": 0.326171875, + "learning_rate": 0.0012037883552322747, + "loss": 0.0889, + "step": 45006 + }, + { + "epoch": 0.3906823725488494, + "grad_norm": 0.294921875, + "learning_rate": 0.001203758791283571, + "loss": 0.0908, + "step": 45007 + }, + { + "epoch": 0.39069105302905355, + "grad_norm": 0.2060546875, + "learning_rate": 0.0012037292272213977, + "loss": 0.0889, + "step": 45008 + }, + { + "epoch": 0.39069973350925774, + "grad_norm": 0.333984375, + "learning_rate": 0.001203699663045788, + "loss": 0.0957, + "step": 45009 + }, + { + "epoch": 0.3907084139894619, + "grad_norm": 0.076171875, + "learning_rate": 0.0012036700987567738, + "loss": 0.0845, + "step": 45010 + }, + { + "epoch": 0.39071709446966607, + "grad_norm": 0.1884765625, + "learning_rate": 0.0012036405343543875, + "loss": 0.0654, + "step": 45011 + }, + { + "epoch": 0.3907257749498702, + "grad_norm": 0.40234375, + "learning_rate": 0.0012036109698386616, + "loss": 0.1455, + "step": 45012 + }, + { + "epoch": 0.3907344554300744, + "grad_norm": 0.87890625, + "learning_rate": 0.0012035814052096278, + "loss": 0.0869, + "step": 45013 + }, + { + "epoch": 0.39074313591027854, + "grad_norm": 0.154296875, + "learning_rate": 0.0012035518404673196, + "loss": 0.1152, + "step": 45014 + }, + { + "epoch": 0.39075181639048273, + "grad_norm": 0.27734375, + "learning_rate": 0.0012035222756117685, + "loss": 0.0947, + "step": 45015 + }, + { + "epoch": 0.39076049687068687, + "grad_norm": 0.80078125, + "learning_rate": 0.0012034927106430067, + "loss": 0.1162, + "step": 45016 + }, + { + "epoch": 0.39076917735089106, + "grad_norm": 0.2890625, + "learning_rate": 0.0012034631455610672, + "loss": 0.1318, + "step": 45017 + }, + { + "epoch": 0.3907778578310952, + "grad_norm": 0.28125, + "learning_rate": 0.001203433580365982, + "loss": 0.0967, + "step": 45018 + }, + { + "epoch": 0.3907865383112994, + "grad_norm": 0.37109375, + 
"learning_rate": 0.0012034040150577829, + "loss": 0.1377, + "step": 45019 + }, + { + "epoch": 0.39079521879150353, + "grad_norm": 0.2041015625, + "learning_rate": 0.001203374449636503, + "loss": 0.1021, + "step": 45020 + }, + { + "epoch": 0.3908038992717077, + "grad_norm": 0.236328125, + "learning_rate": 0.0012033448841021744, + "loss": 0.0977, + "step": 45021 + }, + { + "epoch": 0.39081257975191186, + "grad_norm": 0.146484375, + "learning_rate": 0.0012033153184548293, + "loss": 0.0874, + "step": 45022 + }, + { + "epoch": 0.39082126023211605, + "grad_norm": 0.19921875, + "learning_rate": 0.0012032857526945, + "loss": 0.0898, + "step": 45023 + }, + { + "epoch": 0.3908299407123202, + "grad_norm": 0.091796875, + "learning_rate": 0.0012032561868212193, + "loss": 0.0962, + "step": 45024 + }, + { + "epoch": 0.3908386211925244, + "grad_norm": 0.1357421875, + "learning_rate": 0.0012032266208350192, + "loss": 0.1006, + "step": 45025 + }, + { + "epoch": 0.3908473016727285, + "grad_norm": 0.384765625, + "learning_rate": 0.0012031970547359317, + "loss": 0.0908, + "step": 45026 + }, + { + "epoch": 0.3908559821529327, + "grad_norm": 0.1982421875, + "learning_rate": 0.0012031674885239897, + "loss": 0.1094, + "step": 45027 + }, + { + "epoch": 0.39086466263313685, + "grad_norm": 0.298828125, + "learning_rate": 0.0012031379221992251, + "loss": 0.0742, + "step": 45028 + }, + { + "epoch": 0.39087334311334104, + "grad_norm": 0.470703125, + "learning_rate": 0.0012031083557616704, + "loss": 0.123, + "step": 45029 + }, + { + "epoch": 0.3908820235935452, + "grad_norm": 0.2177734375, + "learning_rate": 0.0012030787892113582, + "loss": 0.063, + "step": 45030 + }, + { + "epoch": 0.3908907040737494, + "grad_norm": 0.341796875, + "learning_rate": 0.0012030492225483204, + "loss": 0.1094, + "step": 45031 + }, + { + "epoch": 0.3908993845539535, + "grad_norm": 0.138671875, + "learning_rate": 0.0012030196557725893, + "loss": 0.1191, + "step": 45032 + }, + { + "epoch": 0.3909080650341577, + "grad_norm": 0.39453125, + "learning_rate": 0.0012029900888841977, + "loss": 0.1104, + "step": 45033 + }, + { + "epoch": 0.39091674551436184, + "grad_norm": 0.0830078125, + "learning_rate": 0.0012029605218831776, + "loss": 0.0928, + "step": 45034 + }, + { + "epoch": 0.39092542599456603, + "grad_norm": 0.11474609375, + "learning_rate": 0.0012029309547695614, + "loss": 0.0918, + "step": 45035 + }, + { + "epoch": 0.39093410647477017, + "grad_norm": 0.294921875, + "learning_rate": 0.0012029013875433816, + "loss": 0.1318, + "step": 45036 + }, + { + "epoch": 0.39094278695497436, + "grad_norm": 0.443359375, + "learning_rate": 0.0012028718202046708, + "loss": 0.0781, + "step": 45037 + }, + { + "epoch": 0.3909514674351785, + "grad_norm": 0.349609375, + "learning_rate": 0.00120284225275346, + "loss": 0.1064, + "step": 45038 + }, + { + "epoch": 0.3909601479153827, + "grad_norm": 0.2119140625, + "learning_rate": 0.0012028126851897829, + "loss": 0.085, + "step": 45039 + }, + { + "epoch": 0.39096882839558683, + "grad_norm": 0.12451171875, + "learning_rate": 0.0012027831175136716, + "loss": 0.0835, + "step": 45040 + }, + { + "epoch": 0.390977508875791, + "grad_norm": 0.140625, + "learning_rate": 0.0012027535497251577, + "loss": 0.0601, + "step": 45041 + }, + { + "epoch": 0.39098618935599516, + "grad_norm": 0.1533203125, + "learning_rate": 0.0012027239818242746, + "loss": 0.0938, + "step": 45042 + }, + { + "epoch": 0.39099486983619935, + "grad_norm": 0.1708984375, + "learning_rate": 0.001202694413811054, + "loss": 0.1128, + "step": 45043 + }, + { + 
"epoch": 0.3910035503164035, + "grad_norm": 0.10400390625, + "learning_rate": 0.001202664845685528, + "loss": 0.0938, + "step": 45044 + }, + { + "epoch": 0.3910122307966077, + "grad_norm": 0.0576171875, + "learning_rate": 0.0012026352774477298, + "loss": 0.064, + "step": 45045 + }, + { + "epoch": 0.3910209112768118, + "grad_norm": 0.6875, + "learning_rate": 0.001202605709097691, + "loss": 0.1279, + "step": 45046 + }, + { + "epoch": 0.391029591757016, + "grad_norm": 0.28515625, + "learning_rate": 0.001202576140635444, + "loss": 0.0811, + "step": 45047 + }, + { + "epoch": 0.39103827223722015, + "grad_norm": 0.1796875, + "learning_rate": 0.001202546572061021, + "loss": 0.0957, + "step": 45048 + }, + { + "epoch": 0.39104695271742435, + "grad_norm": 0.3359375, + "learning_rate": 0.001202517003374455, + "loss": 0.0928, + "step": 45049 + }, + { + "epoch": 0.3910556331976285, + "grad_norm": 0.109375, + "learning_rate": 0.0012024874345757776, + "loss": 0.0996, + "step": 45050 + }, + { + "epoch": 0.3910643136778327, + "grad_norm": 0.08935546875, + "learning_rate": 0.0012024578656650217, + "loss": 0.0874, + "step": 45051 + }, + { + "epoch": 0.3910729941580368, + "grad_norm": 0.158203125, + "learning_rate": 0.0012024282966422194, + "loss": 0.1279, + "step": 45052 + }, + { + "epoch": 0.391081674638241, + "grad_norm": 0.31640625, + "learning_rate": 0.0012023987275074036, + "loss": 0.0923, + "step": 45053 + }, + { + "epoch": 0.39109035511844514, + "grad_norm": 0.40625, + "learning_rate": 0.0012023691582606054, + "loss": 0.1162, + "step": 45054 + }, + { + "epoch": 0.39109903559864934, + "grad_norm": 0.33984375, + "learning_rate": 0.001202339588901858, + "loss": 0.0991, + "step": 45055 + }, + { + "epoch": 0.3911077160788535, + "grad_norm": 0.1171875, + "learning_rate": 0.0012023100194311935, + "loss": 0.1001, + "step": 45056 + }, + { + "epoch": 0.39111639655905767, + "grad_norm": 0.427734375, + "learning_rate": 0.0012022804498486447, + "loss": 0.1367, + "step": 45057 + }, + { + "epoch": 0.3911250770392618, + "grad_norm": 0.2216796875, + "learning_rate": 0.0012022508801542433, + "loss": 0.0864, + "step": 45058 + }, + { + "epoch": 0.391133757519466, + "grad_norm": 0.091796875, + "learning_rate": 0.0012022213103480214, + "loss": 0.082, + "step": 45059 + }, + { + "epoch": 0.39114243799967013, + "grad_norm": 0.177734375, + "learning_rate": 0.0012021917404300125, + "loss": 0.1348, + "step": 45060 + }, + { + "epoch": 0.3911511184798743, + "grad_norm": 0.34765625, + "learning_rate": 0.0012021621704002476, + "loss": 0.0981, + "step": 45061 + }, + { + "epoch": 0.39115979896007846, + "grad_norm": 0.146484375, + "learning_rate": 0.0012021326002587603, + "loss": 0.0732, + "step": 45062 + }, + { + "epoch": 0.39116847944028266, + "grad_norm": 0.279296875, + "learning_rate": 0.001202103030005582, + "loss": 0.1162, + "step": 45063 + }, + { + "epoch": 0.3911771599204868, + "grad_norm": 0.12890625, + "learning_rate": 0.0012020734596407455, + "loss": 0.0762, + "step": 45064 + }, + { + "epoch": 0.391185840400691, + "grad_norm": 0.18359375, + "learning_rate": 0.0012020438891642829, + "loss": 0.1006, + "step": 45065 + }, + { + "epoch": 0.3911945208808951, + "grad_norm": 0.453125, + "learning_rate": 0.0012020143185762268, + "loss": 0.062, + "step": 45066 + }, + { + "epoch": 0.3912032013610993, + "grad_norm": 0.5859375, + "learning_rate": 0.0012019847478766093, + "loss": 0.0938, + "step": 45067 + }, + { + "epoch": 0.39121188184130345, + "grad_norm": 0.431640625, + "learning_rate": 0.0012019551770654625, + "loss": 0.0757, + "step": 
45068 + }, + { + "epoch": 0.39122056232150765, + "grad_norm": 0.44140625, + "learning_rate": 0.0012019256061428195, + "loss": 0.0703, + "step": 45069 + }, + { + "epoch": 0.3912292428017118, + "grad_norm": 0.412109375, + "learning_rate": 0.0012018960351087118, + "loss": 0.1279, + "step": 45070 + }, + { + "epoch": 0.391237923281916, + "grad_norm": 0.60546875, + "learning_rate": 0.0012018664639631724, + "loss": 0.1436, + "step": 45071 + }, + { + "epoch": 0.3912466037621201, + "grad_norm": 0.21875, + "learning_rate": 0.0012018368927062336, + "loss": 0.0996, + "step": 45072 + }, + { + "epoch": 0.3912552842423243, + "grad_norm": 0.453125, + "learning_rate": 0.0012018073213379268, + "loss": 0.1201, + "step": 45073 + }, + { + "epoch": 0.39126396472252845, + "grad_norm": 0.474609375, + "learning_rate": 0.0012017777498582855, + "loss": 0.0977, + "step": 45074 + }, + { + "epoch": 0.39127264520273264, + "grad_norm": 0.1640625, + "learning_rate": 0.0012017481782673415, + "loss": 0.0933, + "step": 45075 + }, + { + "epoch": 0.3912813256829368, + "grad_norm": 0.31640625, + "learning_rate": 0.0012017186065651276, + "loss": 0.1738, + "step": 45076 + }, + { + "epoch": 0.39129000616314097, + "grad_norm": 0.51953125, + "learning_rate": 0.0012016890347516752, + "loss": 0.0972, + "step": 45077 + }, + { + "epoch": 0.3912986866433451, + "grad_norm": 0.2578125, + "learning_rate": 0.0012016594628270173, + "loss": 0.0947, + "step": 45078 + }, + { + "epoch": 0.3913073671235493, + "grad_norm": 0.263671875, + "learning_rate": 0.0012016298907911863, + "loss": 0.0693, + "step": 45079 + }, + { + "epoch": 0.39131604760375344, + "grad_norm": 0.38671875, + "learning_rate": 0.0012016003186442142, + "loss": 0.0938, + "step": 45080 + }, + { + "epoch": 0.39132472808395763, + "grad_norm": 0.330078125, + "learning_rate": 0.0012015707463861333, + "loss": 0.0664, + "step": 45081 + }, + { + "epoch": 0.39133340856416177, + "grad_norm": 0.5625, + "learning_rate": 0.0012015411740169768, + "loss": 0.1543, + "step": 45082 + }, + { + "epoch": 0.39134208904436596, + "grad_norm": 0.376953125, + "learning_rate": 0.001201511601536776, + "loss": 0.1113, + "step": 45083 + }, + { + "epoch": 0.3913507695245701, + "grad_norm": 0.08056640625, + "learning_rate": 0.0012014820289455639, + "loss": 0.0815, + "step": 45084 + }, + { + "epoch": 0.3913594500047743, + "grad_norm": 0.12255859375, + "learning_rate": 0.0012014524562433723, + "loss": 0.1367, + "step": 45085 + }, + { + "epoch": 0.3913681304849784, + "grad_norm": 0.15625, + "learning_rate": 0.0012014228834302339, + "loss": 0.1094, + "step": 45086 + }, + { + "epoch": 0.3913768109651826, + "grad_norm": 0.361328125, + "learning_rate": 0.001201393310506181, + "loss": 0.1025, + "step": 45087 + }, + { + "epoch": 0.39138549144538676, + "grad_norm": 0.146484375, + "learning_rate": 0.0012013637374712457, + "loss": 0.0791, + "step": 45088 + }, + { + "epoch": 0.39139417192559095, + "grad_norm": 0.427734375, + "learning_rate": 0.0012013341643254609, + "loss": 0.1465, + "step": 45089 + }, + { + "epoch": 0.3914028524057951, + "grad_norm": 0.19921875, + "learning_rate": 0.0012013045910688582, + "loss": 0.123, + "step": 45090 + }, + { + "epoch": 0.3914115328859993, + "grad_norm": 0.07470703125, + "learning_rate": 0.0012012750177014704, + "loss": 0.0913, + "step": 45091 + }, + { + "epoch": 0.3914202133662034, + "grad_norm": 0.3515625, + "learning_rate": 0.00120124544422333, + "loss": 0.082, + "step": 45092 + }, + { + "epoch": 0.3914288938464076, + "grad_norm": 0.185546875, + "learning_rate": 0.001201215870634469, + 
"loss": 0.0942, + "step": 45093 + }, + { + "epoch": 0.39143757432661175, + "grad_norm": 0.271484375, + "learning_rate": 0.00120118629693492, + "loss": 0.0928, + "step": 45094 + }, + { + "epoch": 0.3914462548068159, + "grad_norm": 0.08349609375, + "learning_rate": 0.001201156723124715, + "loss": 0.0771, + "step": 45095 + }, + { + "epoch": 0.3914549352870201, + "grad_norm": 0.4375, + "learning_rate": 0.0012011271492038864, + "loss": 0.0967, + "step": 45096 + }, + { + "epoch": 0.3914636157672242, + "grad_norm": 0.04443359375, + "learning_rate": 0.001201097575172467, + "loss": 0.0449, + "step": 45097 + }, + { + "epoch": 0.3914722962474284, + "grad_norm": 0.1953125, + "learning_rate": 0.0012010680010304886, + "loss": 0.0967, + "step": 45098 + }, + { + "epoch": 0.39148097672763255, + "grad_norm": 0.0771484375, + "learning_rate": 0.0012010384267779836, + "loss": 0.0674, + "step": 45099 + }, + { + "epoch": 0.39148965720783674, + "grad_norm": 0.40625, + "learning_rate": 0.0012010088524149847, + "loss": 0.1484, + "step": 45100 + }, + { + "epoch": 0.3914983376880409, + "grad_norm": 0.26953125, + "learning_rate": 0.001200979277941524, + "loss": 0.1201, + "step": 45101 + }, + { + "epoch": 0.39150701816824507, + "grad_norm": 0.306640625, + "learning_rate": 0.001200949703357634, + "loss": 0.1367, + "step": 45102 + }, + { + "epoch": 0.3915156986484492, + "grad_norm": 0.490234375, + "learning_rate": 0.0012009201286633468, + "loss": 0.1021, + "step": 45103 + }, + { + "epoch": 0.3915243791286534, + "grad_norm": 0.396484375, + "learning_rate": 0.0012008905538586946, + "loss": 0.1055, + "step": 45104 + }, + { + "epoch": 0.39153305960885754, + "grad_norm": 0.1376953125, + "learning_rate": 0.0012008609789437106, + "loss": 0.0825, + "step": 45105 + }, + { + "epoch": 0.39154174008906173, + "grad_norm": 1.015625, + "learning_rate": 0.0012008314039184263, + "loss": 0.1152, + "step": 45106 + }, + { + "epoch": 0.39155042056926587, + "grad_norm": 0.34765625, + "learning_rate": 0.0012008018287828742, + "loss": 0.1074, + "step": 45107 + }, + { + "epoch": 0.39155910104947006, + "grad_norm": 0.27734375, + "learning_rate": 0.0012007722535370864, + "loss": 0.0645, + "step": 45108 + }, + { + "epoch": 0.3915677815296742, + "grad_norm": 0.73046875, + "learning_rate": 0.001200742678181096, + "loss": 0.127, + "step": 45109 + }, + { + "epoch": 0.3915764620098784, + "grad_norm": 0.1455078125, + "learning_rate": 0.001200713102714935, + "loss": 0.0859, + "step": 45110 + }, + { + "epoch": 0.3915851424900825, + "grad_norm": 0.1259765625, + "learning_rate": 0.0012006835271386354, + "loss": 0.123, + "step": 45111 + }, + { + "epoch": 0.3915938229702867, + "grad_norm": 0.49609375, + "learning_rate": 0.0012006539514522298, + "loss": 0.0767, + "step": 45112 + }, + { + "epoch": 0.39160250345049086, + "grad_norm": 0.416015625, + "learning_rate": 0.0012006243756557508, + "loss": 0.0908, + "step": 45113 + }, + { + "epoch": 0.39161118393069505, + "grad_norm": 0.486328125, + "learning_rate": 0.0012005947997492303, + "loss": 0.1069, + "step": 45114 + }, + { + "epoch": 0.3916198644108992, + "grad_norm": 0.125, + "learning_rate": 0.0012005652237327012, + "loss": 0.0674, + "step": 45115 + }, + { + "epoch": 0.3916285448911034, + "grad_norm": 0.6171875, + "learning_rate": 0.001200535647606195, + "loss": 0.083, + "step": 45116 + }, + { + "epoch": 0.3916372253713075, + "grad_norm": 0.08056640625, + "learning_rate": 0.0012005060713697446, + "loss": 0.0752, + "step": 45117 + }, + { + "epoch": 0.3916459058515117, + "grad_norm": 0.416015625, + 
"learning_rate": 0.0012004764950233823, + "loss": 0.0825, + "step": 45118 + }, + { + "epoch": 0.39165458633171585, + "grad_norm": 0.1337890625, + "learning_rate": 0.0012004469185671407, + "loss": 0.1016, + "step": 45119 + }, + { + "epoch": 0.39166326681192004, + "grad_norm": 0.54296875, + "learning_rate": 0.0012004173420010517, + "loss": 0.0732, + "step": 45120 + }, + { + "epoch": 0.3916719472921242, + "grad_norm": 0.421875, + "learning_rate": 0.0012003877653251477, + "loss": 0.0762, + "step": 45121 + }, + { + "epoch": 0.39168062777232837, + "grad_norm": 0.1806640625, + "learning_rate": 0.0012003581885394613, + "loss": 0.0996, + "step": 45122 + }, + { + "epoch": 0.3916893082525325, + "grad_norm": 0.154296875, + "learning_rate": 0.0012003286116440246, + "loss": 0.1445, + "step": 45123 + }, + { + "epoch": 0.3916979887327367, + "grad_norm": 0.1025390625, + "learning_rate": 0.0012002990346388698, + "loss": 0.1089, + "step": 45124 + }, + { + "epoch": 0.39170666921294084, + "grad_norm": 0.306640625, + "learning_rate": 0.0012002694575240298, + "loss": 0.0879, + "step": 45125 + }, + { + "epoch": 0.39171534969314503, + "grad_norm": 0.326171875, + "learning_rate": 0.0012002398802995367, + "loss": 0.0781, + "step": 45126 + }, + { + "epoch": 0.39172403017334917, + "grad_norm": 0.291015625, + "learning_rate": 0.0012002103029654222, + "loss": 0.1309, + "step": 45127 + }, + { + "epoch": 0.39173271065355336, + "grad_norm": 0.314453125, + "learning_rate": 0.0012001807255217199, + "loss": 0.1147, + "step": 45128 + }, + { + "epoch": 0.3917413911337575, + "grad_norm": 0.11962890625, + "learning_rate": 0.001200151147968461, + "loss": 0.0977, + "step": 45129 + }, + { + "epoch": 0.3917500716139617, + "grad_norm": 0.12109375, + "learning_rate": 0.0012001215703056787, + "loss": 0.085, + "step": 45130 + }, + { + "epoch": 0.39175875209416583, + "grad_norm": 0.53515625, + "learning_rate": 0.0012000919925334048, + "loss": 0.1309, + "step": 45131 + }, + { + "epoch": 0.39176743257437, + "grad_norm": 0.890625, + "learning_rate": 0.001200062414651672, + "loss": 0.0991, + "step": 45132 + }, + { + "epoch": 0.39177611305457416, + "grad_norm": 0.279296875, + "learning_rate": 0.0012000328366605119, + "loss": 0.0825, + "step": 45133 + }, + { + "epoch": 0.39178479353477835, + "grad_norm": 0.76953125, + "learning_rate": 0.0012000032585599576, + "loss": 0.103, + "step": 45134 + }, + { + "epoch": 0.3917934740149825, + "grad_norm": 0.28515625, + "learning_rate": 0.0011999736803500417, + "loss": 0.1758, + "step": 45135 + }, + { + "epoch": 0.3918021544951867, + "grad_norm": 0.2236328125, + "learning_rate": 0.0011999441020307956, + "loss": 0.1011, + "step": 45136 + }, + { + "epoch": 0.3918108349753908, + "grad_norm": 0.98046875, + "learning_rate": 0.0011999145236022523, + "loss": 0.1484, + "step": 45137 + }, + { + "epoch": 0.391819515455595, + "grad_norm": 0.142578125, + "learning_rate": 0.0011998849450644438, + "loss": 0.125, + "step": 45138 + }, + { + "epoch": 0.39182819593579915, + "grad_norm": 0.375, + "learning_rate": 0.0011998553664174028, + "loss": 0.085, + "step": 45139 + }, + { + "epoch": 0.39183687641600334, + "grad_norm": 0.12353515625, + "learning_rate": 0.0011998257876611614, + "loss": 0.1074, + "step": 45140 + }, + { + "epoch": 0.3918455568962075, + "grad_norm": 0.310546875, + "learning_rate": 0.0011997962087957523, + "loss": 0.0884, + "step": 45141 + }, + { + "epoch": 0.3918542373764117, + "grad_norm": 0.49609375, + "learning_rate": 0.001199766629821207, + "loss": 0.1162, + "step": 45142 + }, + { + "epoch": 
0.3918629178566158, + "grad_norm": 0.1220703125, + "learning_rate": 0.0011997370507375589, + "loss": 0.0786, + "step": 45143 + }, + { + "epoch": 0.39187159833682, + "grad_norm": 0.1630859375, + "learning_rate": 0.0011997074715448398, + "loss": 0.1191, + "step": 45144 + }, + { + "epoch": 0.39188027881702414, + "grad_norm": 0.08837890625, + "learning_rate": 0.001199677892243082, + "loss": 0.1279, + "step": 45145 + }, + { + "epoch": 0.39188895929722833, + "grad_norm": 1.03125, + "learning_rate": 0.001199648312832318, + "loss": 0.1187, + "step": 45146 + }, + { + "epoch": 0.39189763977743247, + "grad_norm": 0.326171875, + "learning_rate": 0.0011996187333125799, + "loss": 0.1079, + "step": 45147 + }, + { + "epoch": 0.39190632025763666, + "grad_norm": 0.15234375, + "learning_rate": 0.0011995891536839003, + "loss": 0.0825, + "step": 45148 + }, + { + "epoch": 0.3919150007378408, + "grad_norm": 0.11572265625, + "learning_rate": 0.0011995595739463117, + "loss": 0.0806, + "step": 45149 + }, + { + "epoch": 0.391923681218045, + "grad_norm": 0.125, + "learning_rate": 0.001199529994099846, + "loss": 0.0811, + "step": 45150 + }, + { + "epoch": 0.39193236169824913, + "grad_norm": 0.59765625, + "learning_rate": 0.001199500414144536, + "loss": 0.0864, + "step": 45151 + }, + { + "epoch": 0.3919410421784533, + "grad_norm": 0.2001953125, + "learning_rate": 0.001199470834080414, + "loss": 0.1182, + "step": 45152 + }, + { + "epoch": 0.39194972265865746, + "grad_norm": 0.3828125, + "learning_rate": 0.001199441253907512, + "loss": 0.0859, + "step": 45153 + }, + { + "epoch": 0.39195840313886166, + "grad_norm": 0.7890625, + "learning_rate": 0.0011994116736258624, + "loss": 0.1201, + "step": 45154 + }, + { + "epoch": 0.3919670836190658, + "grad_norm": 0.2080078125, + "learning_rate": 0.0011993820932354978, + "loss": 0.0591, + "step": 45155 + }, + { + "epoch": 0.39197576409927, + "grad_norm": 0.306640625, + "learning_rate": 0.0011993525127364504, + "loss": 0.0664, + "step": 45156 + }, + { + "epoch": 0.3919844445794741, + "grad_norm": 0.234375, + "learning_rate": 0.0011993229321287527, + "loss": 0.1143, + "step": 45157 + }, + { + "epoch": 0.3919931250596783, + "grad_norm": 0.06591796875, + "learning_rate": 0.0011992933514124368, + "loss": 0.0579, + "step": 45158 + }, + { + "epoch": 0.39200180553988245, + "grad_norm": 0.09228515625, + "learning_rate": 0.0011992637705875353, + "loss": 0.0703, + "step": 45159 + }, + { + "epoch": 0.39201048602008665, + "grad_norm": 0.408203125, + "learning_rate": 0.0011992341896540804, + "loss": 0.0728, + "step": 45160 + }, + { + "epoch": 0.3920191665002908, + "grad_norm": 0.625, + "learning_rate": 0.0011992046086121044, + "loss": 0.1191, + "step": 45161 + }, + { + "epoch": 0.392027846980495, + "grad_norm": 0.2080078125, + "learning_rate": 0.0011991750274616398, + "loss": 0.1045, + "step": 45162 + }, + { + "epoch": 0.3920365274606991, + "grad_norm": 0.08056640625, + "learning_rate": 0.0011991454462027189, + "loss": 0.0864, + "step": 45163 + }, + { + "epoch": 0.3920452079409033, + "grad_norm": 0.1552734375, + "learning_rate": 0.0011991158648353738, + "loss": 0.0757, + "step": 45164 + }, + { + "epoch": 0.39205388842110744, + "grad_norm": 0.2275390625, + "learning_rate": 0.0011990862833596374, + "loss": 0.1455, + "step": 45165 + }, + { + "epoch": 0.39206256890131164, + "grad_norm": 0.345703125, + "learning_rate": 0.0011990567017755415, + "loss": 0.1011, + "step": 45166 + }, + { + "epoch": 0.3920712493815158, + "grad_norm": 0.291015625, + "learning_rate": 0.0011990271200831189, + "loss": 0.106, 
+ "step": 45167 + }, + { + "epoch": 0.39207992986171997, + "grad_norm": 1.203125, + "learning_rate": 0.0011989975382824015, + "loss": 0.1436, + "step": 45168 + }, + { + "epoch": 0.3920886103419241, + "grad_norm": 0.07373046875, + "learning_rate": 0.001198967956373422, + "loss": 0.0747, + "step": 45169 + }, + { + "epoch": 0.3920972908221283, + "grad_norm": 0.1376953125, + "learning_rate": 0.0011989383743562128, + "loss": 0.1094, + "step": 45170 + }, + { + "epoch": 0.39210597130233243, + "grad_norm": 0.365234375, + "learning_rate": 0.001198908792230806, + "loss": 0.0732, + "step": 45171 + }, + { + "epoch": 0.3921146517825366, + "grad_norm": 0.1845703125, + "learning_rate": 0.0011988792099972337, + "loss": 0.0869, + "step": 45172 + }, + { + "epoch": 0.39212333226274076, + "grad_norm": 0.337890625, + "learning_rate": 0.001198849627655529, + "loss": 0.1113, + "step": 45173 + }, + { + "epoch": 0.39213201274294496, + "grad_norm": 1.09375, + "learning_rate": 0.0011988200452057237, + "loss": 0.1123, + "step": 45174 + }, + { + "epoch": 0.3921406932231491, + "grad_norm": 0.259765625, + "learning_rate": 0.00119879046264785, + "loss": 0.1191, + "step": 45175 + }, + { + "epoch": 0.3921493737033533, + "grad_norm": 0.1591796875, + "learning_rate": 0.0011987608799819407, + "loss": 0.0742, + "step": 45176 + }, + { + "epoch": 0.3921580541835574, + "grad_norm": 0.2197265625, + "learning_rate": 0.0011987312972080281, + "loss": 0.0728, + "step": 45177 + }, + { + "epoch": 0.3921667346637616, + "grad_norm": 0.15625, + "learning_rate": 0.0011987017143261444, + "loss": 0.084, + "step": 45178 + }, + { + "epoch": 0.39217541514396576, + "grad_norm": 0.0927734375, + "learning_rate": 0.001198672131336322, + "loss": 0.1055, + "step": 45179 + }, + { + "epoch": 0.39218409562416995, + "grad_norm": 0.375, + "learning_rate": 0.0011986425482385934, + "loss": 0.0942, + "step": 45180 + }, + { + "epoch": 0.3921927761043741, + "grad_norm": 0.60546875, + "learning_rate": 0.0011986129650329905, + "loss": 0.1201, + "step": 45181 + }, + { + "epoch": 0.3922014565845783, + "grad_norm": 0.091796875, + "learning_rate": 0.001198583381719546, + "loss": 0.1094, + "step": 45182 + }, + { + "epoch": 0.3922101370647824, + "grad_norm": 0.21875, + "learning_rate": 0.0011985537982982927, + "loss": 0.166, + "step": 45183 + }, + { + "epoch": 0.3922188175449866, + "grad_norm": 0.1484375, + "learning_rate": 0.0011985242147692617, + "loss": 0.0796, + "step": 45184 + }, + { + "epoch": 0.39222749802519075, + "grad_norm": 0.546875, + "learning_rate": 0.001198494631132486, + "loss": 0.0825, + "step": 45185 + }, + { + "epoch": 0.39223617850539494, + "grad_norm": 0.74609375, + "learning_rate": 0.0011984650473879986, + "loss": 0.1094, + "step": 45186 + }, + { + "epoch": 0.3922448589855991, + "grad_norm": 0.30078125, + "learning_rate": 0.0011984354635358311, + "loss": 0.0791, + "step": 45187 + }, + { + "epoch": 0.39225353946580327, + "grad_norm": 0.2060546875, + "learning_rate": 0.0011984058795760162, + "loss": 0.0811, + "step": 45188 + }, + { + "epoch": 0.3922622199460074, + "grad_norm": 0.388671875, + "learning_rate": 0.001198376295508586, + "loss": 0.0815, + "step": 45189 + }, + { + "epoch": 0.3922709004262116, + "grad_norm": 0.3828125, + "learning_rate": 0.0011983467113335729, + "loss": 0.0967, + "step": 45190 + }, + { + "epoch": 0.39227958090641574, + "grad_norm": 0.53515625, + "learning_rate": 0.0011983171270510096, + "loss": 0.0801, + "step": 45191 + }, + { + "epoch": 0.39228826138661993, + "grad_norm": 0.30859375, + "learning_rate": 
0.001198287542660928, + "loss": 0.0962, + "step": 45192 + }, + { + "epoch": 0.39229694186682407, + "grad_norm": 0.54296875, + "learning_rate": 0.0011982579581633604, + "loss": 0.1025, + "step": 45193 + }, + { + "epoch": 0.39230562234702826, + "grad_norm": 0.294921875, + "learning_rate": 0.0011982283735583394, + "loss": 0.1738, + "step": 45194 + }, + { + "epoch": 0.3923143028272324, + "grad_norm": 0.515625, + "learning_rate": 0.0011981987888458974, + "loss": 0.0874, + "step": 45195 + }, + { + "epoch": 0.3923229833074366, + "grad_norm": 0.486328125, + "learning_rate": 0.0011981692040260669, + "loss": 0.1426, + "step": 45196 + }, + { + "epoch": 0.3923316637876407, + "grad_norm": 0.11328125, + "learning_rate": 0.00119813961909888, + "loss": 0.1035, + "step": 45197 + }, + { + "epoch": 0.3923403442678449, + "grad_norm": 0.2333984375, + "learning_rate": 0.001198110034064369, + "loss": 0.1001, + "step": 45198 + }, + { + "epoch": 0.39234902474804906, + "grad_norm": 0.365234375, + "learning_rate": 0.0011980804489225662, + "loss": 0.1445, + "step": 45199 + }, + { + "epoch": 0.39235770522825325, + "grad_norm": 0.57421875, + "learning_rate": 0.0011980508636735044, + "loss": 0.1055, + "step": 45200 + }, + { + "epoch": 0.3923663857084574, + "grad_norm": 0.27734375, + "learning_rate": 0.0011980212783172153, + "loss": 0.0918, + "step": 45201 + }, + { + "epoch": 0.3923750661886616, + "grad_norm": 0.3984375, + "learning_rate": 0.0011979916928537317, + "loss": 0.084, + "step": 45202 + }, + { + "epoch": 0.3923837466688657, + "grad_norm": 0.11279296875, + "learning_rate": 0.0011979621072830862, + "loss": 0.0776, + "step": 45203 + }, + { + "epoch": 0.3923924271490699, + "grad_norm": 0.447265625, + "learning_rate": 0.0011979325216053102, + "loss": 0.1201, + "step": 45204 + }, + { + "epoch": 0.39240110762927405, + "grad_norm": 0.078125, + "learning_rate": 0.0011979029358204372, + "loss": 0.085, + "step": 45205 + }, + { + "epoch": 0.39240978810947824, + "grad_norm": 0.390625, + "learning_rate": 0.0011978733499284986, + "loss": 0.0762, + "step": 45206 + }, + { + "epoch": 0.3924184685896824, + "grad_norm": 0.65234375, + "learning_rate": 0.0011978437639295274, + "loss": 0.1543, + "step": 45207 + }, + { + "epoch": 0.39242714906988657, + "grad_norm": 0.11962890625, + "learning_rate": 0.0011978141778235557, + "loss": 0.1064, + "step": 45208 + }, + { + "epoch": 0.3924358295500907, + "grad_norm": 0.71484375, + "learning_rate": 0.001197784591610616, + "loss": 0.1006, + "step": 45209 + }, + { + "epoch": 0.3924445100302949, + "grad_norm": 0.34765625, + "learning_rate": 0.0011977550052907407, + "loss": 0.0864, + "step": 45210 + }, + { + "epoch": 0.39245319051049904, + "grad_norm": 0.1357421875, + "learning_rate": 0.0011977254188639615, + "loss": 0.0825, + "step": 45211 + }, + { + "epoch": 0.39246187099070323, + "grad_norm": 0.96484375, + "learning_rate": 0.0011976958323303118, + "loss": 0.0879, + "step": 45212 + }, + { + "epoch": 0.39247055147090737, + "grad_norm": 0.365234375, + "learning_rate": 0.0011976662456898227, + "loss": 0.105, + "step": 45213 + }, + { + "epoch": 0.39247923195111156, + "grad_norm": 0.1162109375, + "learning_rate": 0.001197636658942528, + "loss": 0.0737, + "step": 45214 + }, + { + "epoch": 0.3924879124313157, + "grad_norm": 0.47265625, + "learning_rate": 0.0011976070720884587, + "loss": 0.0938, + "step": 45215 + }, + { + "epoch": 0.3924965929115199, + "grad_norm": 0.11669921875, + "learning_rate": 0.001197577485127648, + "loss": 0.0947, + "step": 45216 + }, + { + "epoch": 0.39250527339172403, + 
"grad_norm": 0.09765625, + "learning_rate": 0.001197547898060128, + "loss": 0.0874, + "step": 45217 + }, + { + "epoch": 0.39251395387192817, + "grad_norm": 0.5546875, + "learning_rate": 0.0011975183108859314, + "loss": 0.1445, + "step": 45218 + }, + { + "epoch": 0.39252263435213236, + "grad_norm": 0.0908203125, + "learning_rate": 0.00119748872360509, + "loss": 0.0918, + "step": 45219 + }, + { + "epoch": 0.3925313148323365, + "grad_norm": 0.4921875, + "learning_rate": 0.0011974591362176365, + "loss": 0.1973, + "step": 45220 + }, + { + "epoch": 0.3925399953125407, + "grad_norm": 0.87109375, + "learning_rate": 0.001197429548723603, + "loss": 0.1104, + "step": 45221 + }, + { + "epoch": 0.39254867579274483, + "grad_norm": 0.283203125, + "learning_rate": 0.001197399961123022, + "loss": 0.0962, + "step": 45222 + }, + { + "epoch": 0.392557356272949, + "grad_norm": 0.2158203125, + "learning_rate": 0.001197370373415926, + "loss": 0.1245, + "step": 45223 + }, + { + "epoch": 0.39256603675315316, + "grad_norm": 0.205078125, + "learning_rate": 0.001197340785602347, + "loss": 0.0815, + "step": 45224 + }, + { + "epoch": 0.39257471723335735, + "grad_norm": 0.44921875, + "learning_rate": 0.0011973111976823177, + "loss": 0.0879, + "step": 45225 + }, + { + "epoch": 0.3925833977135615, + "grad_norm": 0.208984375, + "learning_rate": 0.0011972816096558706, + "loss": 0.0698, + "step": 45226 + }, + { + "epoch": 0.3925920781937657, + "grad_norm": 0.46875, + "learning_rate": 0.0011972520215230373, + "loss": 0.1162, + "step": 45227 + }, + { + "epoch": 0.3926007586739698, + "grad_norm": 0.98828125, + "learning_rate": 0.0011972224332838508, + "loss": 0.105, + "step": 45228 + }, + { + "epoch": 0.392609439154174, + "grad_norm": 0.703125, + "learning_rate": 0.0011971928449383436, + "loss": 0.1191, + "step": 45229 + }, + { + "epoch": 0.39261811963437815, + "grad_norm": 0.1044921875, + "learning_rate": 0.0011971632564865477, + "loss": 0.1025, + "step": 45230 + }, + { + "epoch": 0.39262680011458234, + "grad_norm": 0.1494140625, + "learning_rate": 0.0011971336679284955, + "loss": 0.0732, + "step": 45231 + }, + { + "epoch": 0.3926354805947865, + "grad_norm": 0.095703125, + "learning_rate": 0.0011971040792642192, + "loss": 0.0903, + "step": 45232 + }, + { + "epoch": 0.39264416107499067, + "grad_norm": 0.275390625, + "learning_rate": 0.001197074490493751, + "loss": 0.1035, + "step": 45233 + }, + { + "epoch": 0.3926528415551948, + "grad_norm": 0.6171875, + "learning_rate": 0.0011970449016171242, + "loss": 0.1133, + "step": 45234 + }, + { + "epoch": 0.392661522035399, + "grad_norm": 0.30078125, + "learning_rate": 0.0011970153126343705, + "loss": 0.1133, + "step": 45235 + }, + { + "epoch": 0.39267020251560314, + "grad_norm": 0.4921875, + "learning_rate": 0.0011969857235455221, + "loss": 0.0938, + "step": 45236 + }, + { + "epoch": 0.39267888299580733, + "grad_norm": 0.39453125, + "learning_rate": 0.0011969561343506117, + "loss": 0.0938, + "step": 45237 + }, + { + "epoch": 0.39268756347601147, + "grad_norm": 0.828125, + "learning_rate": 0.0011969265450496713, + "loss": 0.1436, + "step": 45238 + }, + { + "epoch": 0.39269624395621566, + "grad_norm": 0.6640625, + "learning_rate": 0.0011968969556427338, + "loss": 0.25, + "step": 45239 + }, + { + "epoch": 0.3927049244364198, + "grad_norm": 0.130859375, + "learning_rate": 0.0011968673661298312, + "loss": 0.0918, + "step": 45240 + }, + { + "epoch": 0.392713604916624, + "grad_norm": 0.201171875, + "learning_rate": 0.0011968377765109958, + "loss": 0.1152, + "step": 45241 + }, + { + "epoch": 
0.39272228539682813, + "grad_norm": 0.34765625, + "learning_rate": 0.0011968081867862602, + "loss": 0.0991, + "step": 45242 + }, + { + "epoch": 0.3927309658770323, + "grad_norm": 0.09765625, + "learning_rate": 0.0011967785969556565, + "loss": 0.0801, + "step": 45243 + }, + { + "epoch": 0.39273964635723646, + "grad_norm": 0.146484375, + "learning_rate": 0.001196749007019217, + "loss": 0.1094, + "step": 45244 + }, + { + "epoch": 0.39274832683744065, + "grad_norm": 0.73828125, + "learning_rate": 0.0011967194169769744, + "loss": 0.0908, + "step": 45245 + }, + { + "epoch": 0.3927570073176448, + "grad_norm": 0.76171875, + "learning_rate": 0.001196689826828961, + "loss": 0.1016, + "step": 45246 + }, + { + "epoch": 0.392765687797849, + "grad_norm": 0.08251953125, + "learning_rate": 0.0011966602365752089, + "loss": 0.0903, + "step": 45247 + }, + { + "epoch": 0.3927743682780531, + "grad_norm": 0.43359375, + "learning_rate": 0.0011966306462157508, + "loss": 0.1104, + "step": 45248 + }, + { + "epoch": 0.3927830487582573, + "grad_norm": 0.240234375, + "learning_rate": 0.0011966010557506188, + "loss": 0.1021, + "step": 45249 + }, + { + "epoch": 0.39279172923846145, + "grad_norm": 0.2119140625, + "learning_rate": 0.0011965714651798452, + "loss": 0.1006, + "step": 45250 + }, + { + "epoch": 0.39280040971866564, + "grad_norm": 0.1572265625, + "learning_rate": 0.0011965418745034627, + "loss": 0.1016, + "step": 45251 + }, + { + "epoch": 0.3928090901988698, + "grad_norm": 0.35546875, + "learning_rate": 0.0011965122837215032, + "loss": 0.0928, + "step": 45252 + }, + { + "epoch": 0.392817770679074, + "grad_norm": 0.10107421875, + "learning_rate": 0.0011964826928339994, + "loss": 0.1289, + "step": 45253 + }, + { + "epoch": 0.3928264511592781, + "grad_norm": 0.1513671875, + "learning_rate": 0.0011964531018409837, + "loss": 0.0913, + "step": 45254 + }, + { + "epoch": 0.3928351316394823, + "grad_norm": 0.765625, + "learning_rate": 0.0011964235107424883, + "loss": 0.1514, + "step": 45255 + }, + { + "epoch": 0.39284381211968644, + "grad_norm": 0.546875, + "learning_rate": 0.0011963939195385456, + "loss": 0.1045, + "step": 45256 + }, + { + "epoch": 0.39285249259989063, + "grad_norm": 0.08056640625, + "learning_rate": 0.001196364328229188, + "loss": 0.1211, + "step": 45257 + }, + { + "epoch": 0.39286117308009477, + "grad_norm": 0.1513671875, + "learning_rate": 0.0011963347368144476, + "loss": 0.085, + "step": 45258 + }, + { + "epoch": 0.39286985356029896, + "grad_norm": 0.263671875, + "learning_rate": 0.0011963051452943572, + "loss": 0.0962, + "step": 45259 + }, + { + "epoch": 0.3928785340405031, + "grad_norm": 0.166015625, + "learning_rate": 0.001196275553668949, + "loss": 0.1016, + "step": 45260 + }, + { + "epoch": 0.3928872145207073, + "grad_norm": 0.21875, + "learning_rate": 0.001196245961938255, + "loss": 0.0859, + "step": 45261 + }, + { + "epoch": 0.39289589500091143, + "grad_norm": 0.306640625, + "learning_rate": 0.0011962163701023081, + "loss": 0.0923, + "step": 45262 + }, + { + "epoch": 0.3929045754811156, + "grad_norm": 0.80859375, + "learning_rate": 0.0011961867781611405, + "loss": 0.0923, + "step": 45263 + }, + { + "epoch": 0.39291325596131976, + "grad_norm": 0.283203125, + "learning_rate": 0.0011961571861147842, + "loss": 0.0869, + "step": 45264 + }, + { + "epoch": 0.39292193644152396, + "grad_norm": 0.212890625, + "learning_rate": 0.0011961275939632722, + "loss": 0.0894, + "step": 45265 + }, + { + "epoch": 0.3929306169217281, + "grad_norm": 0.3359375, + "learning_rate": 0.0011960980017066364, + "loss": 
0.1172, + "step": 45266 + }, + { + "epoch": 0.3929392974019323, + "grad_norm": 0.1572265625, + "learning_rate": 0.0011960684093449089, + "loss": 0.0967, + "step": 45267 + }, + { + "epoch": 0.3929479778821364, + "grad_norm": 0.5390625, + "learning_rate": 0.0011960388168781228, + "loss": 0.1514, + "step": 45268 + }, + { + "epoch": 0.3929566583623406, + "grad_norm": 0.3984375, + "learning_rate": 0.0011960092243063101, + "loss": 0.0977, + "step": 45269 + }, + { + "epoch": 0.39296533884254475, + "grad_norm": 0.21484375, + "learning_rate": 0.001195979631629503, + "loss": 0.1021, + "step": 45270 + }, + { + "epoch": 0.39297401932274895, + "grad_norm": 0.474609375, + "learning_rate": 0.001195950038847734, + "loss": 0.0991, + "step": 45271 + }, + { + "epoch": 0.3929826998029531, + "grad_norm": 0.103515625, + "learning_rate": 0.0011959204459610355, + "loss": 0.1001, + "step": 45272 + }, + { + "epoch": 0.3929913802831573, + "grad_norm": 0.1494140625, + "learning_rate": 0.0011958908529694399, + "loss": 0.1289, + "step": 45273 + }, + { + "epoch": 0.3930000607633614, + "grad_norm": 0.67578125, + "learning_rate": 0.0011958612598729794, + "loss": 0.1245, + "step": 45274 + }, + { + "epoch": 0.3930087412435656, + "grad_norm": 0.21875, + "learning_rate": 0.0011958316666716869, + "loss": 0.0791, + "step": 45275 + }, + { + "epoch": 0.39301742172376974, + "grad_norm": 0.79296875, + "learning_rate": 0.001195802073365594, + "loss": 0.1309, + "step": 45276 + }, + { + "epoch": 0.39302610220397394, + "grad_norm": 0.09716796875, + "learning_rate": 0.0011957724799547332, + "loss": 0.0752, + "step": 45277 + }, + { + "epoch": 0.3930347826841781, + "grad_norm": 0.181640625, + "learning_rate": 0.0011957428864391372, + "loss": 0.127, + "step": 45278 + }, + { + "epoch": 0.39304346316438227, + "grad_norm": 0.154296875, + "learning_rate": 0.0011957132928188383, + "loss": 0.0664, + "step": 45279 + }, + { + "epoch": 0.3930521436445864, + "grad_norm": 0.28125, + "learning_rate": 0.0011956836990938689, + "loss": 0.0986, + "step": 45280 + }, + { + "epoch": 0.3930608241247906, + "grad_norm": 0.310546875, + "learning_rate": 0.0011956541052642608, + "loss": 0.1045, + "step": 45281 + }, + { + "epoch": 0.39306950460499473, + "grad_norm": 0.0849609375, + "learning_rate": 0.001195624511330047, + "loss": 0.1016, + "step": 45282 + }, + { + "epoch": 0.39307818508519893, + "grad_norm": 0.263671875, + "learning_rate": 0.00119559491729126, + "loss": 0.1079, + "step": 45283 + }, + { + "epoch": 0.39308686556540307, + "grad_norm": 0.357421875, + "learning_rate": 0.0011955653231479316, + "loss": 0.0674, + "step": 45284 + }, + { + "epoch": 0.39309554604560726, + "grad_norm": 1.21875, + "learning_rate": 0.0011955357289000943, + "loss": 0.1416, + "step": 45285 + }, + { + "epoch": 0.3931042265258114, + "grad_norm": 0.126953125, + "learning_rate": 0.0011955061345477807, + "loss": 0.0972, + "step": 45286 + }, + { + "epoch": 0.3931129070060156, + "grad_norm": 0.1357421875, + "learning_rate": 0.0011954765400910233, + "loss": 0.0874, + "step": 45287 + }, + { + "epoch": 0.3931215874862197, + "grad_norm": 0.2890625, + "learning_rate": 0.0011954469455298536, + "loss": 0.125, + "step": 45288 + }, + { + "epoch": 0.3931302679664239, + "grad_norm": 0.291015625, + "learning_rate": 0.001195417350864305, + "loss": 0.103, + "step": 45289 + }, + { + "epoch": 0.39313894844662806, + "grad_norm": 0.08935546875, + "learning_rate": 0.0011953877560944089, + "loss": 0.1064, + "step": 45290 + }, + { + "epoch": 0.39314762892683225, + "grad_norm": 0.10595703125, + 
"learning_rate": 0.0011953581612201987, + "loss": 0.1104, + "step": 45291 + }, + { + "epoch": 0.3931563094070364, + "grad_norm": 0.14453125, + "learning_rate": 0.0011953285662417061, + "loss": 0.0898, + "step": 45292 + }, + { + "epoch": 0.3931649898872406, + "grad_norm": 0.2275390625, + "learning_rate": 0.0011952989711589634, + "loss": 0.0972, + "step": 45293 + }, + { + "epoch": 0.3931736703674447, + "grad_norm": 0.5625, + "learning_rate": 0.0011952693759720034, + "loss": 0.1133, + "step": 45294 + }, + { + "epoch": 0.3931823508476489, + "grad_norm": 0.08642578125, + "learning_rate": 0.0011952397806808584, + "loss": 0.0767, + "step": 45295 + }, + { + "epoch": 0.39319103132785305, + "grad_norm": 0.4296875, + "learning_rate": 0.0011952101852855603, + "loss": 0.0752, + "step": 45296 + }, + { + "epoch": 0.39319971180805724, + "grad_norm": 0.173828125, + "learning_rate": 0.001195180589786142, + "loss": 0.1094, + "step": 45297 + }, + { + "epoch": 0.3932083922882614, + "grad_norm": 0.279296875, + "learning_rate": 0.0011951509941826356, + "loss": 0.1113, + "step": 45298 + }, + { + "epoch": 0.39321707276846557, + "grad_norm": 0.197265625, + "learning_rate": 0.0011951213984750734, + "loss": 0.1279, + "step": 45299 + }, + { + "epoch": 0.3932257532486697, + "grad_norm": 0.42578125, + "learning_rate": 0.0011950918026634879, + "loss": 0.1025, + "step": 45300 + }, + { + "epoch": 0.3932344337288739, + "grad_norm": 0.255859375, + "learning_rate": 0.0011950622067479113, + "loss": 0.123, + "step": 45301 + }, + { + "epoch": 0.39324311420907804, + "grad_norm": 0.53125, + "learning_rate": 0.0011950326107283762, + "loss": 0.1143, + "step": 45302 + }, + { + "epoch": 0.39325179468928223, + "grad_norm": 0.1865234375, + "learning_rate": 0.001195003014604915, + "loss": 0.0693, + "step": 45303 + }, + { + "epoch": 0.39326047516948637, + "grad_norm": 0.1484375, + "learning_rate": 0.0011949734183775598, + "loss": 0.1338, + "step": 45304 + }, + { + "epoch": 0.39326915564969056, + "grad_norm": 0.2421875, + "learning_rate": 0.001194943822046343, + "loss": 0.1006, + "step": 45305 + }, + { + "epoch": 0.3932778361298947, + "grad_norm": 0.10986328125, + "learning_rate": 0.0011949142256112973, + "loss": 0.0771, + "step": 45306 + }, + { + "epoch": 0.3932865166100989, + "grad_norm": 0.3203125, + "learning_rate": 0.0011948846290724547, + "loss": 0.0957, + "step": 45307 + }, + { + "epoch": 0.39329519709030303, + "grad_norm": 0.369140625, + "learning_rate": 0.0011948550324298477, + "loss": 0.1064, + "step": 45308 + }, + { + "epoch": 0.3933038775705072, + "grad_norm": 0.3203125, + "learning_rate": 0.0011948254356835086, + "loss": 0.0854, + "step": 45309 + }, + { + "epoch": 0.39331255805071136, + "grad_norm": 0.31640625, + "learning_rate": 0.00119479583883347, + "loss": 0.1553, + "step": 45310 + }, + { + "epoch": 0.39332123853091555, + "grad_norm": 0.53125, + "learning_rate": 0.0011947662418797637, + "loss": 0.1191, + "step": 45311 + }, + { + "epoch": 0.3933299190111197, + "grad_norm": 0.421875, + "learning_rate": 0.001194736644822423, + "loss": 0.0986, + "step": 45312 + }, + { + "epoch": 0.3933385994913239, + "grad_norm": 0.6171875, + "learning_rate": 0.0011947070476614797, + "loss": 0.0913, + "step": 45313 + }, + { + "epoch": 0.393347279971528, + "grad_norm": 0.0986328125, + "learning_rate": 0.001194677450396966, + "loss": 0.1133, + "step": 45314 + }, + { + "epoch": 0.3933559604517322, + "grad_norm": 0.244140625, + "learning_rate": 0.0011946478530289143, + "loss": 0.104, + "step": 45315 + }, + { + "epoch": 0.39336464093193635, + 
"grad_norm": 0.24609375, + "learning_rate": 0.0011946182555573573, + "loss": 0.083, + "step": 45316 + }, + { + "epoch": 0.39337332141214054, + "grad_norm": 0.1806640625, + "learning_rate": 0.0011945886579823272, + "loss": 0.0874, + "step": 45317 + }, + { + "epoch": 0.3933820018923447, + "grad_norm": 0.431640625, + "learning_rate": 0.001194559060303856, + "loss": 0.0767, + "step": 45318 + }, + { + "epoch": 0.39339068237254887, + "grad_norm": 0.390625, + "learning_rate": 0.0011945294625219768, + "loss": 0.167, + "step": 45319 + }, + { + "epoch": 0.393399362852753, + "grad_norm": 0.314453125, + "learning_rate": 0.0011944998646367215, + "loss": 0.0708, + "step": 45320 + }, + { + "epoch": 0.3934080433329572, + "grad_norm": 0.384765625, + "learning_rate": 0.0011944702666481228, + "loss": 0.085, + "step": 45321 + }, + { + "epoch": 0.39341672381316134, + "grad_norm": 0.154296875, + "learning_rate": 0.0011944406685562125, + "loss": 0.1152, + "step": 45322 + }, + { + "epoch": 0.39342540429336553, + "grad_norm": 0.166015625, + "learning_rate": 0.0011944110703610237, + "loss": 0.1279, + "step": 45323 + }, + { + "epoch": 0.39343408477356967, + "grad_norm": 0.2001953125, + "learning_rate": 0.001194381472062588, + "loss": 0.0903, + "step": 45324 + }, + { + "epoch": 0.39344276525377386, + "grad_norm": 0.1376953125, + "learning_rate": 0.0011943518736609385, + "loss": 0.1113, + "step": 45325 + }, + { + "epoch": 0.393451445733978, + "grad_norm": 0.236328125, + "learning_rate": 0.001194322275156107, + "loss": 0.0991, + "step": 45326 + }, + { + "epoch": 0.3934601262141822, + "grad_norm": 0.30859375, + "learning_rate": 0.001194292676548126, + "loss": 0.0972, + "step": 45327 + }, + { + "epoch": 0.39346880669438633, + "grad_norm": 0.2099609375, + "learning_rate": 0.0011942630778370277, + "loss": 0.1079, + "step": 45328 + }, + { + "epoch": 0.3934774871745905, + "grad_norm": 0.1982421875, + "learning_rate": 0.0011942334790228453, + "loss": 0.085, + "step": 45329 + }, + { + "epoch": 0.39348616765479466, + "grad_norm": 0.33203125, + "learning_rate": 0.0011942038801056104, + "loss": 0.1113, + "step": 45330 + }, + { + "epoch": 0.39349484813499885, + "grad_norm": 0.1533203125, + "learning_rate": 0.0011941742810853556, + "loss": 0.0952, + "step": 45331 + }, + { + "epoch": 0.393503528615203, + "grad_norm": 0.1650390625, + "learning_rate": 0.0011941446819621127, + "loss": 0.125, + "step": 45332 + }, + { + "epoch": 0.3935122090954072, + "grad_norm": 0.53125, + "learning_rate": 0.001194115082735915, + "loss": 0.1064, + "step": 45333 + }, + { + "epoch": 0.3935208895756113, + "grad_norm": 0.1259765625, + "learning_rate": 0.0011940854834067947, + "loss": 0.1045, + "step": 45334 + }, + { + "epoch": 0.3935295700558155, + "grad_norm": 0.25390625, + "learning_rate": 0.0011940558839747836, + "loss": 0.0996, + "step": 45335 + }, + { + "epoch": 0.39353825053601965, + "grad_norm": 0.37109375, + "learning_rate": 0.0011940262844399143, + "loss": 0.1177, + "step": 45336 + }, + { + "epoch": 0.39354693101622384, + "grad_norm": 0.56640625, + "learning_rate": 0.0011939966848022196, + "loss": 0.0762, + "step": 45337 + }, + { + "epoch": 0.393555611496428, + "grad_norm": 0.140625, + "learning_rate": 0.0011939670850617313, + "loss": 0.0801, + "step": 45338 + }, + { + "epoch": 0.3935642919766322, + "grad_norm": 0.75390625, + "learning_rate": 0.001193937485218482, + "loss": 0.0977, + "step": 45339 + }, + { + "epoch": 0.3935729724568363, + "grad_norm": 0.201171875, + "learning_rate": 0.001193907885272504, + "loss": 0.1079, + "step": 45340 + }, + { + 
"epoch": 0.3935816529370405, + "grad_norm": 0.71875, + "learning_rate": 0.0011938782852238302, + "loss": 0.0898, + "step": 45341 + }, + { + "epoch": 0.39359033341724464, + "grad_norm": 0.087890625, + "learning_rate": 0.0011938486850724922, + "loss": 0.0923, + "step": 45342 + }, + { + "epoch": 0.3935990138974488, + "grad_norm": 0.5625, + "learning_rate": 0.001193819084818523, + "loss": 0.127, + "step": 45343 + }, + { + "epoch": 0.393607694377653, + "grad_norm": 0.333984375, + "learning_rate": 0.001193789484461954, + "loss": 0.084, + "step": 45344 + }, + { + "epoch": 0.3936163748578571, + "grad_norm": 0.185546875, + "learning_rate": 0.0011937598840028186, + "loss": 0.1162, + "step": 45345 + }, + { + "epoch": 0.3936250553380613, + "grad_norm": 0.345703125, + "learning_rate": 0.001193730283441149, + "loss": 0.1021, + "step": 45346 + }, + { + "epoch": 0.39363373581826544, + "grad_norm": 0.2158203125, + "learning_rate": 0.0011937006827769771, + "loss": 0.127, + "step": 45347 + }, + { + "epoch": 0.39364241629846963, + "grad_norm": 0.197265625, + "learning_rate": 0.0011936710820103357, + "loss": 0.1309, + "step": 45348 + }, + { + "epoch": 0.39365109677867377, + "grad_norm": 0.8046875, + "learning_rate": 0.0011936414811412567, + "loss": 0.1289, + "step": 45349 + }, + { + "epoch": 0.39365977725887796, + "grad_norm": 0.37890625, + "learning_rate": 0.0011936118801697728, + "loss": 0.124, + "step": 45350 + }, + { + "epoch": 0.3936684577390821, + "grad_norm": 0.349609375, + "learning_rate": 0.0011935822790959168, + "loss": 0.0713, + "step": 45351 + }, + { + "epoch": 0.3936771382192863, + "grad_norm": 0.2890625, + "learning_rate": 0.0011935526779197204, + "loss": 0.1221, + "step": 45352 + }, + { + "epoch": 0.39368581869949043, + "grad_norm": 0.099609375, + "learning_rate": 0.0011935230766412159, + "loss": 0.0894, + "step": 45353 + }, + { + "epoch": 0.3936944991796946, + "grad_norm": 0.43359375, + "learning_rate": 0.0011934934752604366, + "loss": 0.1016, + "step": 45354 + }, + { + "epoch": 0.39370317965989876, + "grad_norm": 0.1982421875, + "learning_rate": 0.001193463873777414, + "loss": 0.1533, + "step": 45355 + }, + { + "epoch": 0.39371186014010295, + "grad_norm": 0.357421875, + "learning_rate": 0.0011934342721921804, + "loss": 0.0889, + "step": 45356 + }, + { + "epoch": 0.3937205406203071, + "grad_norm": 0.240234375, + "learning_rate": 0.0011934046705047686, + "loss": 0.0962, + "step": 45357 + }, + { + "epoch": 0.3937292211005113, + "grad_norm": 0.4296875, + "learning_rate": 0.0011933750687152108, + "loss": 0.1226, + "step": 45358 + }, + { + "epoch": 0.3937379015807154, + "grad_norm": 0.126953125, + "learning_rate": 0.0011933454668235396, + "loss": 0.1055, + "step": 45359 + }, + { + "epoch": 0.3937465820609196, + "grad_norm": 0.181640625, + "learning_rate": 0.0011933158648297873, + "loss": 0.1689, + "step": 45360 + }, + { + "epoch": 0.39375526254112375, + "grad_norm": 0.27734375, + "learning_rate": 0.001193286262733986, + "loss": 0.1328, + "step": 45361 + }, + { + "epoch": 0.39376394302132794, + "grad_norm": 0.474609375, + "learning_rate": 0.0011932566605361683, + "loss": 0.0825, + "step": 45362 + }, + { + "epoch": 0.3937726235015321, + "grad_norm": 0.26171875, + "learning_rate": 0.0011932270582363667, + "loss": 0.0864, + "step": 45363 + }, + { + "epoch": 0.3937813039817363, + "grad_norm": 0.318359375, + "learning_rate": 0.0011931974558346133, + "loss": 0.0898, + "step": 45364 + }, + { + "epoch": 0.3937899844619404, + "grad_norm": 0.271484375, + "learning_rate": 0.0011931678533309404, + "loss": 
0.0781, + "step": 45365 + }, + { + "epoch": 0.3937986649421446, + "grad_norm": 0.10498046875, + "learning_rate": 0.0011931382507253804, + "loss": 0.1162, + "step": 45366 + }, + { + "epoch": 0.39380734542234874, + "grad_norm": 0.259765625, + "learning_rate": 0.0011931086480179661, + "loss": 0.083, + "step": 45367 + }, + { + "epoch": 0.39381602590255294, + "grad_norm": 0.1640625, + "learning_rate": 0.0011930790452087292, + "loss": 0.0859, + "step": 45368 + }, + { + "epoch": 0.3938247063827571, + "grad_norm": 0.1728515625, + "learning_rate": 0.0011930494422977029, + "loss": 0.1064, + "step": 45369 + }, + { + "epoch": 0.39383338686296127, + "grad_norm": 0.2890625, + "learning_rate": 0.0011930198392849191, + "loss": 0.0967, + "step": 45370 + }, + { + "epoch": 0.3938420673431654, + "grad_norm": 0.2734375, + "learning_rate": 0.0011929902361704097, + "loss": 0.0654, + "step": 45371 + }, + { + "epoch": 0.3938507478233696, + "grad_norm": 0.69140625, + "learning_rate": 0.0011929606329542082, + "loss": 0.082, + "step": 45372 + }, + { + "epoch": 0.39385942830357373, + "grad_norm": 0.48828125, + "learning_rate": 0.001192931029636346, + "loss": 0.1768, + "step": 45373 + }, + { + "epoch": 0.3938681087837779, + "grad_norm": 0.984375, + "learning_rate": 0.0011929014262168562, + "loss": 0.0908, + "step": 45374 + }, + { + "epoch": 0.39387678926398206, + "grad_norm": 0.74609375, + "learning_rate": 0.0011928718226957703, + "loss": 0.1426, + "step": 45375 + }, + { + "epoch": 0.39388546974418626, + "grad_norm": 0.1435546875, + "learning_rate": 0.0011928422190731214, + "loss": 0.1035, + "step": 45376 + }, + { + "epoch": 0.3938941502243904, + "grad_norm": 0.451171875, + "learning_rate": 0.0011928126153489416, + "loss": 0.1289, + "step": 45377 + }, + { + "epoch": 0.3939028307045946, + "grad_norm": 0.0849609375, + "learning_rate": 0.0011927830115232635, + "loss": 0.082, + "step": 45378 + }, + { + "epoch": 0.3939115111847987, + "grad_norm": 0.1484375, + "learning_rate": 0.001192753407596119, + "loss": 0.0894, + "step": 45379 + }, + { + "epoch": 0.3939201916650029, + "grad_norm": 0.515625, + "learning_rate": 0.0011927238035675408, + "loss": 0.1328, + "step": 45380 + }, + { + "epoch": 0.39392887214520705, + "grad_norm": 1.046875, + "learning_rate": 0.0011926941994375613, + "loss": 0.1113, + "step": 45381 + }, + { + "epoch": 0.39393755262541125, + "grad_norm": 0.310546875, + "learning_rate": 0.001192664595206213, + "loss": 0.0967, + "step": 45382 + }, + { + "epoch": 0.3939462331056154, + "grad_norm": 0.0849609375, + "learning_rate": 0.0011926349908735281, + "loss": 0.0645, + "step": 45383 + }, + { + "epoch": 0.3939549135858196, + "grad_norm": 0.353515625, + "learning_rate": 0.0011926053864395386, + "loss": 0.0957, + "step": 45384 + }, + { + "epoch": 0.3939635940660237, + "grad_norm": 0.408203125, + "learning_rate": 0.0011925757819042774, + "loss": 0.1143, + "step": 45385 + }, + { + "epoch": 0.3939722745462279, + "grad_norm": 0.490234375, + "learning_rate": 0.0011925461772677767, + "loss": 0.127, + "step": 45386 + }, + { + "epoch": 0.39398095502643204, + "grad_norm": 0.087890625, + "learning_rate": 0.0011925165725300692, + "loss": 0.1084, + "step": 45387 + }, + { + "epoch": 0.39398963550663624, + "grad_norm": 0.890625, + "learning_rate": 0.0011924869676911865, + "loss": 0.1167, + "step": 45388 + }, + { + "epoch": 0.3939983159868404, + "grad_norm": 0.1494140625, + "learning_rate": 0.0011924573627511616, + "loss": 0.0791, + "step": 45389 + }, + { + "epoch": 0.39400699646704457, + "grad_norm": 0.1318359375, + 
"learning_rate": 0.001192427757710027, + "loss": 0.1094, + "step": 45390 + }, + { + "epoch": 0.3940156769472487, + "grad_norm": 0.1669921875, + "learning_rate": 0.0011923981525678146, + "loss": 0.1006, + "step": 45391 + }, + { + "epoch": 0.3940243574274529, + "grad_norm": 0.10205078125, + "learning_rate": 0.0011923685473245566, + "loss": 0.0947, + "step": 45392 + }, + { + "epoch": 0.39403303790765704, + "grad_norm": 0.12158203125, + "learning_rate": 0.0011923389419802863, + "loss": 0.127, + "step": 45393 + }, + { + "epoch": 0.39404171838786123, + "grad_norm": 0.2890625, + "learning_rate": 0.0011923093365350354, + "loss": 0.0913, + "step": 45394 + }, + { + "epoch": 0.39405039886806537, + "grad_norm": 0.09033203125, + "learning_rate": 0.0011922797309888362, + "loss": 0.1006, + "step": 45395 + }, + { + "epoch": 0.39405907934826956, + "grad_norm": 0.71484375, + "learning_rate": 0.0011922501253417215, + "loss": 0.085, + "step": 45396 + }, + { + "epoch": 0.3940677598284737, + "grad_norm": 0.359375, + "learning_rate": 0.0011922205195937232, + "loss": 0.0986, + "step": 45397 + }, + { + "epoch": 0.3940764403086779, + "grad_norm": 0.11572265625, + "learning_rate": 0.0011921909137448741, + "loss": 0.0889, + "step": 45398 + }, + { + "epoch": 0.394085120788882, + "grad_norm": 0.087890625, + "learning_rate": 0.0011921613077952064, + "loss": 0.0962, + "step": 45399 + }, + { + "epoch": 0.3940938012690862, + "grad_norm": 0.1826171875, + "learning_rate": 0.0011921317017447523, + "loss": 0.0625, + "step": 45400 + }, + { + "epoch": 0.39410248174929036, + "grad_norm": 0.271484375, + "learning_rate": 0.0011921020955935446, + "loss": 0.0625, + "step": 45401 + }, + { + "epoch": 0.39411116222949455, + "grad_norm": 0.65234375, + "learning_rate": 0.0011920724893416154, + "loss": 0.1089, + "step": 45402 + }, + { + "epoch": 0.3941198427096987, + "grad_norm": 0.146484375, + "learning_rate": 0.0011920428829889972, + "loss": 0.0752, + "step": 45403 + }, + { + "epoch": 0.3941285231899029, + "grad_norm": 0.828125, + "learning_rate": 0.001192013276535722, + "loss": 0.0957, + "step": 45404 + }, + { + "epoch": 0.394137203670107, + "grad_norm": 0.53515625, + "learning_rate": 0.0011919836699818226, + "loss": 0.1553, + "step": 45405 + }, + { + "epoch": 0.3941458841503112, + "grad_norm": 0.390625, + "learning_rate": 0.0011919540633273309, + "loss": 0.1357, + "step": 45406 + }, + { + "epoch": 0.39415456463051535, + "grad_norm": 0.51953125, + "learning_rate": 0.0011919244565722802, + "loss": 0.1094, + "step": 45407 + }, + { + "epoch": 0.39416324511071954, + "grad_norm": 0.287109375, + "learning_rate": 0.001191894849716702, + "loss": 0.0928, + "step": 45408 + }, + { + "epoch": 0.3941719255909237, + "grad_norm": 0.72265625, + "learning_rate": 0.001191865242760629, + "loss": 0.1211, + "step": 45409 + }, + { + "epoch": 0.39418060607112787, + "grad_norm": 0.4609375, + "learning_rate": 0.0011918356357040936, + "loss": 0.0923, + "step": 45410 + }, + { + "epoch": 0.394189286551332, + "grad_norm": 0.2109375, + "learning_rate": 0.0011918060285471279, + "loss": 0.0928, + "step": 45411 + }, + { + "epoch": 0.3941979670315362, + "grad_norm": 0.09326171875, + "learning_rate": 0.0011917764212897649, + "loss": 0.0996, + "step": 45412 + }, + { + "epoch": 0.39420664751174034, + "grad_norm": 0.33203125, + "learning_rate": 0.0011917468139320364, + "loss": 0.0933, + "step": 45413 + }, + { + "epoch": 0.39421532799194453, + "grad_norm": 0.1162109375, + "learning_rate": 0.001191717206473975, + "loss": 0.0903, + "step": 45414 + }, + { + "epoch": 
0.39422400847214867, + "grad_norm": 0.4140625, + "learning_rate": 0.0011916875989156127, + "loss": 0.1279, + "step": 45415 + }, + { + "epoch": 0.39423268895235286, + "grad_norm": 0.7421875, + "learning_rate": 0.0011916579912569826, + "loss": 0.1221, + "step": 45416 + }, + { + "epoch": 0.394241369432557, + "grad_norm": 0.1494140625, + "learning_rate": 0.0011916283834981169, + "loss": 0.1309, + "step": 45417 + }, + { + "epoch": 0.3942500499127612, + "grad_norm": 0.1494140625, + "learning_rate": 0.001191598775639047, + "loss": 0.1221, + "step": 45418 + }, + { + "epoch": 0.39425873039296533, + "grad_norm": 0.193359375, + "learning_rate": 0.0011915691676798066, + "loss": 0.126, + "step": 45419 + }, + { + "epoch": 0.3942674108731695, + "grad_norm": 0.59765625, + "learning_rate": 0.0011915395596204276, + "loss": 0.1025, + "step": 45420 + }, + { + "epoch": 0.39427609135337366, + "grad_norm": 0.205078125, + "learning_rate": 0.0011915099514609423, + "loss": 0.1128, + "step": 45421 + }, + { + "epoch": 0.39428477183357785, + "grad_norm": 0.2021484375, + "learning_rate": 0.001191480343201383, + "loss": 0.0869, + "step": 45422 + }, + { + "epoch": 0.394293452313782, + "grad_norm": 0.3671875, + "learning_rate": 0.001191450734841782, + "loss": 0.1826, + "step": 45423 + }, + { + "epoch": 0.3943021327939862, + "grad_norm": 0.08984375, + "learning_rate": 0.0011914211263821719, + "loss": 0.0625, + "step": 45424 + }, + { + "epoch": 0.3943108132741903, + "grad_norm": 0.232421875, + "learning_rate": 0.0011913915178225851, + "loss": 0.0835, + "step": 45425 + }, + { + "epoch": 0.3943194937543945, + "grad_norm": 1.4453125, + "learning_rate": 0.001191361909163054, + "loss": 0.1719, + "step": 45426 + }, + { + "epoch": 0.39432817423459865, + "grad_norm": 0.1689453125, + "learning_rate": 0.0011913323004036106, + "loss": 0.1094, + "step": 45427 + }, + { + "epoch": 0.39433685471480284, + "grad_norm": 0.23046875, + "learning_rate": 0.001191302691544288, + "loss": 0.0981, + "step": 45428 + }, + { + "epoch": 0.394345535195007, + "grad_norm": 0.1533203125, + "learning_rate": 0.001191273082585118, + "loss": 0.082, + "step": 45429 + }, + { + "epoch": 0.3943542156752112, + "grad_norm": 0.416015625, + "learning_rate": 0.0011912434735261333, + "loss": 0.0781, + "step": 45430 + }, + { + "epoch": 0.3943628961554153, + "grad_norm": 0.1220703125, + "learning_rate": 0.0011912138643673657, + "loss": 0.061, + "step": 45431 + }, + { + "epoch": 0.3943715766356195, + "grad_norm": 0.115234375, + "learning_rate": 0.0011911842551088482, + "loss": 0.1113, + "step": 45432 + }, + { + "epoch": 0.39438025711582364, + "grad_norm": 0.1689453125, + "learning_rate": 0.0011911546457506127, + "loss": 0.0898, + "step": 45433 + }, + { + "epoch": 0.39438893759602783, + "grad_norm": 0.546875, + "learning_rate": 0.0011911250362926923, + "loss": 0.1211, + "step": 45434 + }, + { + "epoch": 0.39439761807623197, + "grad_norm": 0.7578125, + "learning_rate": 0.0011910954267351187, + "loss": 0.0913, + "step": 45435 + }, + { + "epoch": 0.39440629855643616, + "grad_norm": 0.9765625, + "learning_rate": 0.0011910658170779242, + "loss": 0.1211, + "step": 45436 + }, + { + "epoch": 0.3944149790366403, + "grad_norm": 0.421875, + "learning_rate": 0.001191036207321142, + "loss": 0.1436, + "step": 45437 + }, + { + "epoch": 0.3944236595168445, + "grad_norm": 0.515625, + "learning_rate": 0.0011910065974648036, + "loss": 0.124, + "step": 45438 + }, + { + "epoch": 0.39443233999704863, + "grad_norm": 0.5390625, + "learning_rate": 0.0011909769875089423, + "loss": 0.106, + "step": 
45439 + }, + { + "epoch": 0.3944410204772528, + "grad_norm": 0.2080078125, + "learning_rate": 0.0011909473774535894, + "loss": 0.1104, + "step": 45440 + }, + { + "epoch": 0.39444970095745696, + "grad_norm": 0.126953125, + "learning_rate": 0.001190917767298778, + "loss": 0.0942, + "step": 45441 + }, + { + "epoch": 0.39445838143766115, + "grad_norm": 0.435546875, + "learning_rate": 0.0011908881570445403, + "loss": 0.1299, + "step": 45442 + }, + { + "epoch": 0.3944670619178653, + "grad_norm": 0.5078125, + "learning_rate": 0.0011908585466909086, + "loss": 0.1572, + "step": 45443 + }, + { + "epoch": 0.3944757423980695, + "grad_norm": 0.37109375, + "learning_rate": 0.0011908289362379154, + "loss": 0.082, + "step": 45444 + }, + { + "epoch": 0.3944844228782736, + "grad_norm": 0.162109375, + "learning_rate": 0.0011907993256855928, + "loss": 0.1279, + "step": 45445 + }, + { + "epoch": 0.3944931033584778, + "grad_norm": 0.1494140625, + "learning_rate": 0.0011907697150339738, + "loss": 0.0918, + "step": 45446 + }, + { + "epoch": 0.39450178383868195, + "grad_norm": 0.283203125, + "learning_rate": 0.0011907401042830902, + "loss": 0.1406, + "step": 45447 + }, + { + "epoch": 0.39451046431888614, + "grad_norm": 0.265625, + "learning_rate": 0.0011907104934329748, + "loss": 0.0889, + "step": 45448 + }, + { + "epoch": 0.3945191447990903, + "grad_norm": 0.095703125, + "learning_rate": 0.0011906808824836597, + "loss": 0.1279, + "step": 45449 + }, + { + "epoch": 0.3945278252792945, + "grad_norm": 0.306640625, + "learning_rate": 0.0011906512714351773, + "loss": 0.0762, + "step": 45450 + }, + { + "epoch": 0.3945365057594986, + "grad_norm": 0.1611328125, + "learning_rate": 0.00119062166028756, + "loss": 0.125, + "step": 45451 + }, + { + "epoch": 0.3945451862397028, + "grad_norm": 0.3359375, + "learning_rate": 0.0011905920490408405, + "loss": 0.1118, + "step": 45452 + }, + { + "epoch": 0.39455386671990694, + "grad_norm": 0.345703125, + "learning_rate": 0.0011905624376950504, + "loss": 0.0967, + "step": 45453 + }, + { + "epoch": 0.39456254720011114, + "grad_norm": 0.11962890625, + "learning_rate": 0.0011905328262502225, + "loss": 0.0664, + "step": 45454 + }, + { + "epoch": 0.3945712276803153, + "grad_norm": 0.5078125, + "learning_rate": 0.0011905032147063897, + "loss": 0.1211, + "step": 45455 + }, + { + "epoch": 0.39457990816051947, + "grad_norm": 0.50390625, + "learning_rate": 0.001190473603063584, + "loss": 0.1016, + "step": 45456 + }, + { + "epoch": 0.3945885886407236, + "grad_norm": 0.51171875, + "learning_rate": 0.0011904439913218371, + "loss": 0.1113, + "step": 45457 + }, + { + "epoch": 0.3945972691209278, + "grad_norm": 0.19921875, + "learning_rate": 0.001190414379481183, + "loss": 0.0859, + "step": 45458 + }, + { + "epoch": 0.39460594960113193, + "grad_norm": 0.138671875, + "learning_rate": 0.0011903847675416525, + "loss": 0.1289, + "step": 45459 + }, + { + "epoch": 0.3946146300813361, + "grad_norm": 0.1767578125, + "learning_rate": 0.0011903551555032786, + "loss": 0.0923, + "step": 45460 + }, + { + "epoch": 0.39462331056154026, + "grad_norm": 0.54296875, + "learning_rate": 0.0011903255433660933, + "loss": 0.166, + "step": 45461 + }, + { + "epoch": 0.39463199104174446, + "grad_norm": 0.478515625, + "learning_rate": 0.0011902959311301299, + "loss": 0.0859, + "step": 45462 + }, + { + "epoch": 0.3946406715219486, + "grad_norm": 0.1123046875, + "learning_rate": 0.00119026631879542, + "loss": 0.105, + "step": 45463 + }, + { + "epoch": 0.3946493520021528, + "grad_norm": 0.09716796875, + "learning_rate": 
0.0011902367063619961, + "loss": 0.125, + "step": 45464 + }, + { + "epoch": 0.3946580324823569, + "grad_norm": 0.0654296875, + "learning_rate": 0.0011902070938298913, + "loss": 0.0669, + "step": 45465 + }, + { + "epoch": 0.39466671296256106, + "grad_norm": 0.1708984375, + "learning_rate": 0.0011901774811991367, + "loss": 0.125, + "step": 45466 + }, + { + "epoch": 0.39467539344276525, + "grad_norm": 0.1689453125, + "learning_rate": 0.0011901478684697656, + "loss": 0.0898, + "step": 45467 + }, + { + "epoch": 0.3946840739229694, + "grad_norm": 0.23828125, + "learning_rate": 0.0011901182556418102, + "loss": 0.1172, + "step": 45468 + }, + { + "epoch": 0.3946927544031736, + "grad_norm": 0.466796875, + "learning_rate": 0.0011900886427153026, + "loss": 0.0776, + "step": 45469 + }, + { + "epoch": 0.3947014348833777, + "grad_norm": 0.173828125, + "learning_rate": 0.0011900590296902757, + "loss": 0.1079, + "step": 45470 + }, + { + "epoch": 0.3947101153635819, + "grad_norm": 0.09130859375, + "learning_rate": 0.0011900294165667613, + "loss": 0.0942, + "step": 45471 + }, + { + "epoch": 0.39471879584378605, + "grad_norm": 0.07666015625, + "learning_rate": 0.0011899998033447926, + "loss": 0.0918, + "step": 45472 + }, + { + "epoch": 0.39472747632399025, + "grad_norm": 0.2236328125, + "learning_rate": 0.0011899701900244006, + "loss": 0.0918, + "step": 45473 + }, + { + "epoch": 0.3947361568041944, + "grad_norm": 0.49609375, + "learning_rate": 0.0011899405766056194, + "loss": 0.0742, + "step": 45474 + }, + { + "epoch": 0.3947448372843986, + "grad_norm": 0.12255859375, + "learning_rate": 0.00118991096308848, + "loss": 0.0874, + "step": 45475 + }, + { + "epoch": 0.3947535177646027, + "grad_norm": 0.25, + "learning_rate": 0.0011898813494730157, + "loss": 0.0996, + "step": 45476 + }, + { + "epoch": 0.3947621982448069, + "grad_norm": 0.1875, + "learning_rate": 0.0011898517357592582, + "loss": 0.0908, + "step": 45477 + }, + { + "epoch": 0.39477087872501104, + "grad_norm": 1.046875, + "learning_rate": 0.0011898221219472404, + "loss": 0.1206, + "step": 45478 + }, + { + "epoch": 0.39477955920521524, + "grad_norm": 0.0908203125, + "learning_rate": 0.0011897925080369945, + "loss": 0.083, + "step": 45479 + }, + { + "epoch": 0.3947882396854194, + "grad_norm": 0.26171875, + "learning_rate": 0.0011897628940285528, + "loss": 0.1553, + "step": 45480 + }, + { + "epoch": 0.39479692016562357, + "grad_norm": 0.1650390625, + "learning_rate": 0.0011897332799219477, + "loss": 0.0781, + "step": 45481 + }, + { + "epoch": 0.3948056006458277, + "grad_norm": 0.2158203125, + "learning_rate": 0.0011897036657172115, + "loss": 0.124, + "step": 45482 + }, + { + "epoch": 0.3948142811260319, + "grad_norm": 0.1962890625, + "learning_rate": 0.0011896740514143766, + "loss": 0.1001, + "step": 45483 + }, + { + "epoch": 0.39482296160623603, + "grad_norm": 0.10888671875, + "learning_rate": 0.001189644437013476, + "loss": 0.0781, + "step": 45484 + }, + { + "epoch": 0.3948316420864402, + "grad_norm": 0.1240234375, + "learning_rate": 0.0011896148225145416, + "loss": 0.1152, + "step": 45485 + }, + { + "epoch": 0.39484032256664436, + "grad_norm": 0.1552734375, + "learning_rate": 0.0011895852079176054, + "loss": 0.1191, + "step": 45486 + }, + { + "epoch": 0.39484900304684856, + "grad_norm": 0.1220703125, + "learning_rate": 0.0011895555932227003, + "loss": 0.0938, + "step": 45487 + }, + { + "epoch": 0.3948576835270527, + "grad_norm": 0.125, + "learning_rate": 0.0011895259784298587, + "loss": 0.0977, + "step": 45488 + }, + { + "epoch": 0.3948663640072569, + 
"grad_norm": 0.130859375, + "learning_rate": 0.0011894963635391128, + "loss": 0.0791, + "step": 45489 + }, + { + "epoch": 0.394875044487461, + "grad_norm": 0.357421875, + "learning_rate": 0.001189466748550495, + "loss": 0.0825, + "step": 45490 + }, + { + "epoch": 0.3948837249676652, + "grad_norm": 0.6328125, + "learning_rate": 0.0011894371334640374, + "loss": 0.1455, + "step": 45491 + }, + { + "epoch": 0.39489240544786935, + "grad_norm": 0.57421875, + "learning_rate": 0.001189407518279773, + "loss": 0.1221, + "step": 45492 + }, + { + "epoch": 0.39490108592807355, + "grad_norm": 0.1923828125, + "learning_rate": 0.0011893779029977334, + "loss": 0.0854, + "step": 45493 + }, + { + "epoch": 0.3949097664082777, + "grad_norm": 0.212890625, + "learning_rate": 0.0011893482876179523, + "loss": 0.0938, + "step": 45494 + }, + { + "epoch": 0.3949184468884819, + "grad_norm": 0.271484375, + "learning_rate": 0.0011893186721404604, + "loss": 0.0903, + "step": 45495 + }, + { + "epoch": 0.394927127368686, + "grad_norm": 0.494140625, + "learning_rate": 0.0011892890565652916, + "loss": 0.1133, + "step": 45496 + }, + { + "epoch": 0.3949358078488902, + "grad_norm": 0.28515625, + "learning_rate": 0.0011892594408924775, + "loss": 0.1191, + "step": 45497 + }, + { + "epoch": 0.39494448832909435, + "grad_norm": 0.400390625, + "learning_rate": 0.0011892298251220503, + "loss": 0.1113, + "step": 45498 + }, + { + "epoch": 0.39495316880929854, + "grad_norm": 0.447265625, + "learning_rate": 0.001189200209254043, + "loss": 0.1338, + "step": 45499 + }, + { + "epoch": 0.3949618492895027, + "grad_norm": 0.142578125, + "learning_rate": 0.0011891705932884876, + "loss": 0.0732, + "step": 45500 + }, + { + "epoch": 0.39497052976970687, + "grad_norm": 0.74609375, + "learning_rate": 0.0011891409772254167, + "loss": 0.1123, + "step": 45501 + }, + { + "epoch": 0.394979210249911, + "grad_norm": 0.50390625, + "learning_rate": 0.0011891113610648621, + "loss": 0.0933, + "step": 45502 + }, + { + "epoch": 0.3949878907301152, + "grad_norm": 0.1962890625, + "learning_rate": 0.001189081744806857, + "loss": 0.0781, + "step": 45503 + }, + { + "epoch": 0.39499657121031934, + "grad_norm": 0.271484375, + "learning_rate": 0.0011890521284514334, + "loss": 0.1001, + "step": 45504 + }, + { + "epoch": 0.39500525169052353, + "grad_norm": 0.80859375, + "learning_rate": 0.001189022511998624, + "loss": 0.0815, + "step": 45505 + }, + { + "epoch": 0.39501393217072767, + "grad_norm": 0.248046875, + "learning_rate": 0.0011889928954484609, + "loss": 0.1621, + "step": 45506 + }, + { + "epoch": 0.39502261265093186, + "grad_norm": 0.275390625, + "learning_rate": 0.0011889632788009761, + "loss": 0.0986, + "step": 45507 + }, + { + "epoch": 0.395031293131136, + "grad_norm": 3.109375, + "learning_rate": 0.0011889336620562025, + "loss": 0.2256, + "step": 45508 + }, + { + "epoch": 0.3950399736113402, + "grad_norm": 0.8203125, + "learning_rate": 0.0011889040452141726, + "loss": 0.1396, + "step": 45509 + }, + { + "epoch": 0.3950486540915443, + "grad_norm": 0.2490234375, + "learning_rate": 0.0011888744282749185, + "loss": 0.0796, + "step": 45510 + }, + { + "epoch": 0.3950573345717485, + "grad_norm": 0.142578125, + "learning_rate": 0.0011888448112384723, + "loss": 0.1104, + "step": 45511 + }, + { + "epoch": 0.39506601505195266, + "grad_norm": 0.875, + "learning_rate": 0.0011888151941048673, + "loss": 0.125, + "step": 45512 + }, + { + "epoch": 0.39507469553215685, + "grad_norm": 0.70703125, + "learning_rate": 0.0011887855768741352, + "loss": 0.0928, + "step": 45513 + }, + { + 
"epoch": 0.395083376012361, + "grad_norm": 0.349609375, + "learning_rate": 0.0011887559595463082, + "loss": 0.1167, + "step": 45514 + }, + { + "epoch": 0.3950920564925652, + "grad_norm": 0.0791015625, + "learning_rate": 0.0011887263421214195, + "loss": 0.0723, + "step": 45515 + }, + { + "epoch": 0.3951007369727693, + "grad_norm": 0.24609375, + "learning_rate": 0.0011886967245995006, + "loss": 0.0791, + "step": 45516 + }, + { + "epoch": 0.3951094174529735, + "grad_norm": 0.423828125, + "learning_rate": 0.0011886671069805846, + "loss": 0.0669, + "step": 45517 + }, + { + "epoch": 0.39511809793317765, + "grad_norm": 0.13671875, + "learning_rate": 0.0011886374892647032, + "loss": 0.0972, + "step": 45518 + }, + { + "epoch": 0.39512677841338184, + "grad_norm": 0.1435546875, + "learning_rate": 0.0011886078714518895, + "loss": 0.1025, + "step": 45519 + }, + { + "epoch": 0.395135458893586, + "grad_norm": 0.1767578125, + "learning_rate": 0.0011885782535421751, + "loss": 0.0913, + "step": 45520 + }, + { + "epoch": 0.39514413937379017, + "grad_norm": 0.1611328125, + "learning_rate": 0.0011885486355355938, + "loss": 0.0913, + "step": 45521 + }, + { + "epoch": 0.3951528198539943, + "grad_norm": 0.1953125, + "learning_rate": 0.001188519017432176, + "loss": 0.1011, + "step": 45522 + }, + { + "epoch": 0.3951615003341985, + "grad_norm": 0.1533203125, + "learning_rate": 0.0011884893992319557, + "loss": 0.1021, + "step": 45523 + }, + { + "epoch": 0.39517018081440264, + "grad_norm": 0.1435546875, + "learning_rate": 0.0011884597809349647, + "loss": 0.1006, + "step": 45524 + }, + { + "epoch": 0.39517886129460683, + "grad_norm": 0.56640625, + "learning_rate": 0.0011884301625412353, + "loss": 0.0918, + "step": 45525 + }, + { + "epoch": 0.39518754177481097, + "grad_norm": 0.734375, + "learning_rate": 0.0011884005440508, + "loss": 0.0957, + "step": 45526 + }, + { + "epoch": 0.39519622225501516, + "grad_norm": 0.451171875, + "learning_rate": 0.0011883709254636911, + "loss": 0.0879, + "step": 45527 + }, + { + "epoch": 0.3952049027352193, + "grad_norm": 0.126953125, + "learning_rate": 0.0011883413067799414, + "loss": 0.1309, + "step": 45528 + }, + { + "epoch": 0.3952135832154235, + "grad_norm": 0.06982421875, + "learning_rate": 0.0011883116879995827, + "loss": 0.0684, + "step": 45529 + }, + { + "epoch": 0.39522226369562763, + "grad_norm": 0.123046875, + "learning_rate": 0.0011882820691226476, + "loss": 0.0708, + "step": 45530 + }, + { + "epoch": 0.3952309441758318, + "grad_norm": 0.236328125, + "learning_rate": 0.0011882524501491687, + "loss": 0.0933, + "step": 45531 + }, + { + "epoch": 0.39523962465603596, + "grad_norm": 0.08447265625, + "learning_rate": 0.0011882228310791782, + "loss": 0.1328, + "step": 45532 + }, + { + "epoch": 0.39524830513624015, + "grad_norm": 0.330078125, + "learning_rate": 0.0011881932119127089, + "loss": 0.0723, + "step": 45533 + }, + { + "epoch": 0.3952569856164443, + "grad_norm": 0.1611328125, + "learning_rate": 0.0011881635926497924, + "loss": 0.1138, + "step": 45534 + }, + { + "epoch": 0.3952656660966485, + "grad_norm": 0.13671875, + "learning_rate": 0.0011881339732904615, + "loss": 0.082, + "step": 45535 + }, + { + "epoch": 0.3952743465768526, + "grad_norm": 0.1796875, + "learning_rate": 0.001188104353834749, + "loss": 0.1357, + "step": 45536 + }, + { + "epoch": 0.3952830270570568, + "grad_norm": 0.419921875, + "learning_rate": 0.0011880747342826866, + "loss": 0.0786, + "step": 45537 + }, + { + "epoch": 0.39529170753726095, + "grad_norm": 0.2890625, + "learning_rate": 
0.0011880451146343072, + "loss": 0.0967, + "step": 45538 + }, + { + "epoch": 0.39530038801746514, + "grad_norm": 0.275390625, + "learning_rate": 0.0011880154948896427, + "loss": 0.1084, + "step": 45539 + }, + { + "epoch": 0.3953090684976693, + "grad_norm": 0.345703125, + "learning_rate": 0.0011879858750487257, + "loss": 0.1025, + "step": 45540 + }, + { + "epoch": 0.3953177489778735, + "grad_norm": 0.345703125, + "learning_rate": 0.0011879562551115887, + "loss": 0.1025, + "step": 45541 + }, + { + "epoch": 0.3953264294580776, + "grad_norm": 0.2109375, + "learning_rate": 0.0011879266350782645, + "loss": 0.1064, + "step": 45542 + }, + { + "epoch": 0.3953351099382818, + "grad_norm": 0.18359375, + "learning_rate": 0.0011878970149487847, + "loss": 0.0713, + "step": 45543 + }, + { + "epoch": 0.39534379041848594, + "grad_norm": 0.337890625, + "learning_rate": 0.001187867394723182, + "loss": 0.125, + "step": 45544 + }, + { + "epoch": 0.39535247089869013, + "grad_norm": 0.2412109375, + "learning_rate": 0.001187837774401489, + "loss": 0.0967, + "step": 45545 + }, + { + "epoch": 0.39536115137889427, + "grad_norm": 0.271484375, + "learning_rate": 0.0011878081539837381, + "loss": 0.1182, + "step": 45546 + }, + { + "epoch": 0.39536983185909846, + "grad_norm": 0.169921875, + "learning_rate": 0.0011877785334699614, + "loss": 0.1221, + "step": 45547 + }, + { + "epoch": 0.3953785123393026, + "grad_norm": 0.138671875, + "learning_rate": 0.0011877489128601913, + "loss": 0.1006, + "step": 45548 + }, + { + "epoch": 0.3953871928195068, + "grad_norm": 0.1494140625, + "learning_rate": 0.0011877192921544602, + "loss": 0.1162, + "step": 45549 + }, + { + "epoch": 0.39539587329971093, + "grad_norm": 0.447265625, + "learning_rate": 0.0011876896713528006, + "loss": 0.1396, + "step": 45550 + }, + { + "epoch": 0.3954045537799151, + "grad_norm": 0.1591796875, + "learning_rate": 0.0011876600504552453, + "loss": 0.1084, + "step": 45551 + }, + { + "epoch": 0.39541323426011926, + "grad_norm": 0.3671875, + "learning_rate": 0.0011876304294618255, + "loss": 0.0815, + "step": 45552 + }, + { + "epoch": 0.39542191474032345, + "grad_norm": 0.171875, + "learning_rate": 0.0011876008083725753, + "loss": 0.0781, + "step": 45553 + }, + { + "epoch": 0.3954305952205276, + "grad_norm": 0.0849609375, + "learning_rate": 0.0011875711871875256, + "loss": 0.0811, + "step": 45554 + }, + { + "epoch": 0.3954392757007318, + "grad_norm": 0.345703125, + "learning_rate": 0.0011875415659067098, + "loss": 0.106, + "step": 45555 + }, + { + "epoch": 0.3954479561809359, + "grad_norm": 0.1923828125, + "learning_rate": 0.0011875119445301596, + "loss": 0.0781, + "step": 45556 + }, + { + "epoch": 0.3954566366611401, + "grad_norm": 0.1875, + "learning_rate": 0.0011874823230579078, + "loss": 0.0977, + "step": 45557 + }, + { + "epoch": 0.39546531714134425, + "grad_norm": 0.6328125, + "learning_rate": 0.0011874527014899862, + "loss": 0.1426, + "step": 45558 + }, + { + "epoch": 0.39547399762154845, + "grad_norm": 0.69921875, + "learning_rate": 0.0011874230798264278, + "loss": 0.103, + "step": 45559 + }, + { + "epoch": 0.3954826781017526, + "grad_norm": 0.263671875, + "learning_rate": 0.0011873934580672652, + "loss": 0.0928, + "step": 45560 + }, + { + "epoch": 0.3954913585819568, + "grad_norm": 0.72265625, + "learning_rate": 0.00118736383621253, + "loss": 0.0889, + "step": 45561 + }, + { + "epoch": 0.3955000390621609, + "grad_norm": 0.34765625, + "learning_rate": 0.0011873342142622555, + "loss": 0.0845, + "step": 45562 + }, + { + "epoch": 0.3955087195423651, + 
"grad_norm": 0.376953125, + "learning_rate": 0.001187304592216473, + "loss": 0.0981, + "step": 45563 + }, + { + "epoch": 0.39551740002256924, + "grad_norm": 0.375, + "learning_rate": 0.0011872749700752162, + "loss": 0.0869, + "step": 45564 + }, + { + "epoch": 0.39552608050277344, + "grad_norm": 0.14453125, + "learning_rate": 0.0011872453478385164, + "loss": 0.1133, + "step": 45565 + }, + { + "epoch": 0.3955347609829776, + "grad_norm": 0.2197265625, + "learning_rate": 0.0011872157255064062, + "loss": 0.1069, + "step": 45566 + }, + { + "epoch": 0.39554344146318177, + "grad_norm": 0.0927734375, + "learning_rate": 0.0011871861030789187, + "loss": 0.0986, + "step": 45567 + }, + { + "epoch": 0.3955521219433859, + "grad_norm": 0.15625, + "learning_rate": 0.0011871564805560854, + "loss": 0.1484, + "step": 45568 + }, + { + "epoch": 0.3955608024235901, + "grad_norm": 0.625, + "learning_rate": 0.0011871268579379394, + "loss": 0.1318, + "step": 45569 + }, + { + "epoch": 0.39556948290379423, + "grad_norm": 0.6015625, + "learning_rate": 0.0011870972352245122, + "loss": 0.1021, + "step": 45570 + }, + { + "epoch": 0.3955781633839984, + "grad_norm": 0.373046875, + "learning_rate": 0.0011870676124158372, + "loss": 0.0811, + "step": 45571 + }, + { + "epoch": 0.39558684386420256, + "grad_norm": 1.140625, + "learning_rate": 0.0011870379895119465, + "loss": 0.125, + "step": 45572 + }, + { + "epoch": 0.39559552434440676, + "grad_norm": 0.34375, + "learning_rate": 0.0011870083665128722, + "loss": 0.0815, + "step": 45573 + }, + { + "epoch": 0.3956042048246109, + "grad_norm": 0.142578125, + "learning_rate": 0.0011869787434186467, + "loss": 0.1064, + "step": 45574 + }, + { + "epoch": 0.3956128853048151, + "grad_norm": 0.08740234375, + "learning_rate": 0.0011869491202293025, + "loss": 0.1006, + "step": 45575 + }, + { + "epoch": 0.3956215657850192, + "grad_norm": 0.232421875, + "learning_rate": 0.0011869194969448727, + "loss": 0.0913, + "step": 45576 + }, + { + "epoch": 0.3956302462652234, + "grad_norm": 0.306640625, + "learning_rate": 0.0011868898735653886, + "loss": 0.0806, + "step": 45577 + }, + { + "epoch": 0.39563892674542755, + "grad_norm": 0.81640625, + "learning_rate": 0.0011868602500908828, + "loss": 0.1055, + "step": 45578 + }, + { + "epoch": 0.39564760722563175, + "grad_norm": 0.75, + "learning_rate": 0.0011868306265213882, + "loss": 0.1055, + "step": 45579 + }, + { + "epoch": 0.3956562877058359, + "grad_norm": 0.140625, + "learning_rate": 0.0011868010028569368, + "loss": 0.0623, + "step": 45580 + }, + { + "epoch": 0.3956649681860401, + "grad_norm": 0.5390625, + "learning_rate": 0.0011867713790975614, + "loss": 0.0811, + "step": 45581 + }, + { + "epoch": 0.3956736486662442, + "grad_norm": 0.72265625, + "learning_rate": 0.0011867417552432937, + "loss": 0.1001, + "step": 45582 + }, + { + "epoch": 0.3956823291464484, + "grad_norm": 0.27734375, + "learning_rate": 0.0011867121312941669, + "loss": 0.0928, + "step": 45583 + }, + { + "epoch": 0.39569100962665255, + "grad_norm": 0.09423828125, + "learning_rate": 0.0011866825072502129, + "loss": 0.1113, + "step": 45584 + }, + { + "epoch": 0.39569969010685674, + "grad_norm": 0.09521484375, + "learning_rate": 0.0011866528831114642, + "loss": 0.0972, + "step": 45585 + }, + { + "epoch": 0.3957083705870609, + "grad_norm": 0.3515625, + "learning_rate": 0.0011866232588779533, + "loss": 0.1201, + "step": 45586 + }, + { + "epoch": 0.39571705106726507, + "grad_norm": 0.154296875, + "learning_rate": 0.0011865936345497121, + "loss": 0.1025, + "step": 45587 + }, + { + "epoch": 
0.3957257315474692, + "grad_norm": 0.212890625, + "learning_rate": 0.0011865640101267737, + "loss": 0.0957, + "step": 45588 + }, + { + "epoch": 0.39573441202767334, + "grad_norm": 0.271484375, + "learning_rate": 0.0011865343856091704, + "loss": 0.1118, + "step": 45589 + }, + { + "epoch": 0.39574309250787754, + "grad_norm": 0.232421875, + "learning_rate": 0.0011865047609969344, + "loss": 0.1094, + "step": 45590 + }, + { + "epoch": 0.3957517729880817, + "grad_norm": 0.12890625, + "learning_rate": 0.0011864751362900977, + "loss": 0.1748, + "step": 45591 + }, + { + "epoch": 0.39576045346828587, + "grad_norm": 0.1025390625, + "learning_rate": 0.0011864455114886931, + "loss": 0.1104, + "step": 45592 + }, + { + "epoch": 0.39576913394849, + "grad_norm": 0.16796875, + "learning_rate": 0.0011864158865927535, + "loss": 0.1055, + "step": 45593 + }, + { + "epoch": 0.3957778144286942, + "grad_norm": 0.439453125, + "learning_rate": 0.0011863862616023103, + "loss": 0.0938, + "step": 45594 + }, + { + "epoch": 0.39578649490889833, + "grad_norm": 0.12353515625, + "learning_rate": 0.0011863566365173963, + "loss": 0.0728, + "step": 45595 + }, + { + "epoch": 0.3957951753891025, + "grad_norm": 0.224609375, + "learning_rate": 0.0011863270113380447, + "loss": 0.0928, + "step": 45596 + }, + { + "epoch": 0.39580385586930666, + "grad_norm": 0.515625, + "learning_rate": 0.0011862973860642864, + "loss": 0.0942, + "step": 45597 + }, + { + "epoch": 0.39581253634951086, + "grad_norm": 0.73828125, + "learning_rate": 0.001186267760696155, + "loss": 0.1074, + "step": 45598 + }, + { + "epoch": 0.395821216829715, + "grad_norm": 0.353515625, + "learning_rate": 0.0011862381352336824, + "loss": 0.1001, + "step": 45599 + }, + { + "epoch": 0.3958298973099192, + "grad_norm": 0.1103515625, + "learning_rate": 0.001186208509676901, + "loss": 0.1113, + "step": 45600 + }, + { + "epoch": 0.3958385777901233, + "grad_norm": 0.11474609375, + "learning_rate": 0.0011861788840258432, + "loss": 0.0908, + "step": 45601 + }, + { + "epoch": 0.3958472582703275, + "grad_norm": 0.099609375, + "learning_rate": 0.0011861492582805415, + "loss": 0.0923, + "step": 45602 + }, + { + "epoch": 0.39585593875053166, + "grad_norm": 0.095703125, + "learning_rate": 0.0011861196324410284, + "loss": 0.0986, + "step": 45603 + }, + { + "epoch": 0.39586461923073585, + "grad_norm": 0.10498046875, + "learning_rate": 0.0011860900065073358, + "loss": 0.1348, + "step": 45604 + }, + { + "epoch": 0.39587329971094, + "grad_norm": 0.1328125, + "learning_rate": 0.001186060380479497, + "loss": 0.085, + "step": 45605 + }, + { + "epoch": 0.3958819801911442, + "grad_norm": 0.478515625, + "learning_rate": 0.0011860307543575437, + "loss": 0.1348, + "step": 45606 + }, + { + "epoch": 0.3958906606713483, + "grad_norm": 0.73046875, + "learning_rate": 0.0011860011281415084, + "loss": 0.0898, + "step": 45607 + }, + { + "epoch": 0.3958993411515525, + "grad_norm": 0.5234375, + "learning_rate": 0.0011859715018314235, + "loss": 0.0884, + "step": 45608 + }, + { + "epoch": 0.39590802163175665, + "grad_norm": 0.232421875, + "learning_rate": 0.0011859418754273214, + "loss": 0.127, + "step": 45609 + }, + { + "epoch": 0.39591670211196084, + "grad_norm": 0.1064453125, + "learning_rate": 0.0011859122489292348, + "loss": 0.0908, + "step": 45610 + }, + { + "epoch": 0.395925382592165, + "grad_norm": 0.09912109375, + "learning_rate": 0.0011858826223371955, + "loss": 0.1211, + "step": 45611 + }, + { + "epoch": 0.39593406307236917, + "grad_norm": 0.15234375, + "learning_rate": 0.0011858529956512364, + "loss": 
0.127, + "step": 45612 + }, + { + "epoch": 0.3959427435525733, + "grad_norm": 0.10986328125, + "learning_rate": 0.0011858233688713897, + "loss": 0.0938, + "step": 45613 + }, + { + "epoch": 0.3959514240327775, + "grad_norm": 0.0625, + "learning_rate": 0.0011857937419976882, + "loss": 0.0645, + "step": 45614 + }, + { + "epoch": 0.39596010451298164, + "grad_norm": 0.10595703125, + "learning_rate": 0.0011857641150301637, + "loss": 0.1055, + "step": 45615 + }, + { + "epoch": 0.39596878499318583, + "grad_norm": 0.5234375, + "learning_rate": 0.0011857344879688488, + "loss": 0.1104, + "step": 45616 + }, + { + "epoch": 0.39597746547338997, + "grad_norm": 0.10205078125, + "learning_rate": 0.0011857048608137758, + "loss": 0.0664, + "step": 45617 + }, + { + "epoch": 0.39598614595359416, + "grad_norm": 0.4921875, + "learning_rate": 0.0011856752335649777, + "loss": 0.1201, + "step": 45618 + }, + { + "epoch": 0.3959948264337983, + "grad_norm": 1.40625, + "learning_rate": 0.0011856456062224862, + "loss": 0.1855, + "step": 45619 + }, + { + "epoch": 0.3960035069140025, + "grad_norm": 0.6015625, + "learning_rate": 0.0011856159787863338, + "loss": 0.0752, + "step": 45620 + }, + { + "epoch": 0.3960121873942066, + "grad_norm": 0.51953125, + "learning_rate": 0.0011855863512565533, + "loss": 0.2295, + "step": 45621 + }, + { + "epoch": 0.3960208678744108, + "grad_norm": 1.6484375, + "learning_rate": 0.0011855567236331768, + "loss": 0.2158, + "step": 45622 + }, + { + "epoch": 0.39602954835461496, + "grad_norm": 0.17578125, + "learning_rate": 0.0011855270959162368, + "loss": 0.1055, + "step": 45623 + }, + { + "epoch": 0.39603822883481915, + "grad_norm": 0.14453125, + "learning_rate": 0.0011854974681057657, + "loss": 0.1191, + "step": 45624 + }, + { + "epoch": 0.3960469093150233, + "grad_norm": 0.13671875, + "learning_rate": 0.0011854678402017955, + "loss": 0.0972, + "step": 45625 + }, + { + "epoch": 0.3960555897952275, + "grad_norm": 0.3046875, + "learning_rate": 0.0011854382122043591, + "loss": 0.0757, + "step": 45626 + }, + { + "epoch": 0.3960642702754316, + "grad_norm": 0.13671875, + "learning_rate": 0.001185408584113489, + "loss": 0.1055, + "step": 45627 + }, + { + "epoch": 0.3960729507556358, + "grad_norm": 0.1572265625, + "learning_rate": 0.0011853789559292172, + "loss": 0.1152, + "step": 45628 + }, + { + "epoch": 0.39608163123583995, + "grad_norm": 0.58203125, + "learning_rate": 0.0011853493276515763, + "loss": 0.105, + "step": 45629 + }, + { + "epoch": 0.39609031171604414, + "grad_norm": 0.1318359375, + "learning_rate": 0.0011853196992805985, + "loss": 0.0894, + "step": 45630 + }, + { + "epoch": 0.3960989921962483, + "grad_norm": 0.4296875, + "learning_rate": 0.0011852900708163167, + "loss": 0.103, + "step": 45631 + }, + { + "epoch": 0.39610767267645247, + "grad_norm": 0.09423828125, + "learning_rate": 0.0011852604422587626, + "loss": 0.105, + "step": 45632 + }, + { + "epoch": 0.3961163531566566, + "grad_norm": 0.5859375, + "learning_rate": 0.0011852308136079692, + "loss": 0.0972, + "step": 45633 + }, + { + "epoch": 0.3961250336368608, + "grad_norm": 0.1337890625, + "learning_rate": 0.0011852011848639684, + "loss": 0.0771, + "step": 45634 + }, + { + "epoch": 0.39613371411706494, + "grad_norm": 0.65625, + "learning_rate": 0.001185171556026793, + "loss": 0.1206, + "step": 45635 + }, + { + "epoch": 0.39614239459726913, + "grad_norm": 0.427734375, + "learning_rate": 0.0011851419270964751, + "loss": 0.127, + "step": 45636 + }, + { + "epoch": 0.39615107507747327, + "grad_norm": 0.2236328125, + "learning_rate": 
0.0011851122980730477, + "loss": 0.0957, + "step": 45637 + }, + { + "epoch": 0.39615975555767746, + "grad_norm": 0.1796875, + "learning_rate": 0.0011850826689565426, + "loss": 0.124, + "step": 45638 + }, + { + "epoch": 0.3961684360378816, + "grad_norm": 0.1142578125, + "learning_rate": 0.0011850530397469923, + "loss": 0.1299, + "step": 45639 + }, + { + "epoch": 0.3961771165180858, + "grad_norm": 0.328125, + "learning_rate": 0.001185023410444429, + "loss": 0.0933, + "step": 45640 + }, + { + "epoch": 0.39618579699828993, + "grad_norm": 0.296875, + "learning_rate": 0.0011849937810488858, + "loss": 0.1289, + "step": 45641 + }, + { + "epoch": 0.3961944774784941, + "grad_norm": 0.08642578125, + "learning_rate": 0.0011849641515603948, + "loss": 0.0996, + "step": 45642 + }, + { + "epoch": 0.39620315795869826, + "grad_norm": 0.1689453125, + "learning_rate": 0.0011849345219789877, + "loss": 0.1445, + "step": 45643 + }, + { + "epoch": 0.39621183843890245, + "grad_norm": 0.234375, + "learning_rate": 0.0011849048923046979, + "loss": 0.1064, + "step": 45644 + }, + { + "epoch": 0.3962205189191066, + "grad_norm": 0.109375, + "learning_rate": 0.0011848752625375572, + "loss": 0.1055, + "step": 45645 + }, + { + "epoch": 0.3962291993993108, + "grad_norm": 0.71875, + "learning_rate": 0.0011848456326775982, + "loss": 0.1729, + "step": 45646 + }, + { + "epoch": 0.3962378798795149, + "grad_norm": 0.203125, + "learning_rate": 0.0011848160027248533, + "loss": 0.1128, + "step": 45647 + }, + { + "epoch": 0.3962465603597191, + "grad_norm": 0.16015625, + "learning_rate": 0.0011847863726793548, + "loss": 0.0776, + "step": 45648 + }, + { + "epoch": 0.39625524083992325, + "grad_norm": 0.2890625, + "learning_rate": 0.0011847567425411357, + "loss": 0.1074, + "step": 45649 + }, + { + "epoch": 0.39626392132012744, + "grad_norm": 0.1708984375, + "learning_rate": 0.0011847271123102275, + "loss": 0.1348, + "step": 45650 + }, + { + "epoch": 0.3962726018003316, + "grad_norm": 0.322265625, + "learning_rate": 0.0011846974819866632, + "loss": 0.0781, + "step": 45651 + }, + { + "epoch": 0.3962812822805358, + "grad_norm": 0.42578125, + "learning_rate": 0.0011846678515704747, + "loss": 0.0859, + "step": 45652 + }, + { + "epoch": 0.3962899627607399, + "grad_norm": 0.310546875, + "learning_rate": 0.0011846382210616948, + "loss": 0.1196, + "step": 45653 + }, + { + "epoch": 0.3962986432409441, + "grad_norm": 0.158203125, + "learning_rate": 0.0011846085904603561, + "loss": 0.0894, + "step": 45654 + }, + { + "epoch": 0.39630732372114824, + "grad_norm": 0.10888671875, + "learning_rate": 0.0011845789597664902, + "loss": 0.1064, + "step": 45655 + }, + { + "epoch": 0.39631600420135243, + "grad_norm": 0.2197265625, + "learning_rate": 0.0011845493289801303, + "loss": 0.1006, + "step": 45656 + }, + { + "epoch": 0.39632468468155657, + "grad_norm": 0.34375, + "learning_rate": 0.0011845196981013087, + "loss": 0.1035, + "step": 45657 + }, + { + "epoch": 0.39633336516176076, + "grad_norm": 0.07568359375, + "learning_rate": 0.0011844900671300574, + "loss": 0.0933, + "step": 45658 + }, + { + "epoch": 0.3963420456419649, + "grad_norm": 0.34765625, + "learning_rate": 0.0011844604360664094, + "loss": 0.0869, + "step": 45659 + }, + { + "epoch": 0.3963507261221691, + "grad_norm": 0.498046875, + "learning_rate": 0.001184430804910396, + "loss": 0.1426, + "step": 45660 + }, + { + "epoch": 0.39635940660237323, + "grad_norm": 0.6796875, + "learning_rate": 0.0011844011736620507, + "loss": 0.1001, + "step": 45661 + }, + { + "epoch": 0.3963680870825774, + "grad_norm": 
0.1005859375, + "learning_rate": 0.001184371542321406, + "loss": 0.1289, + "step": 45662 + }, + { + "epoch": 0.39637676756278156, + "grad_norm": 0.5390625, + "learning_rate": 0.001184341910888493, + "loss": 0.1328, + "step": 45663 + }, + { + "epoch": 0.39638544804298576, + "grad_norm": 0.2177734375, + "learning_rate": 0.0011843122793633457, + "loss": 0.1182, + "step": 45664 + }, + { + "epoch": 0.3963941285231899, + "grad_norm": 0.12890625, + "learning_rate": 0.0011842826477459953, + "loss": 0.1074, + "step": 45665 + }, + { + "epoch": 0.3964028090033941, + "grad_norm": 0.1083984375, + "learning_rate": 0.0011842530160364748, + "loss": 0.0913, + "step": 45666 + }, + { + "epoch": 0.3964114894835982, + "grad_norm": 0.08251953125, + "learning_rate": 0.0011842233842348166, + "loss": 0.0991, + "step": 45667 + }, + { + "epoch": 0.3964201699638024, + "grad_norm": 0.10595703125, + "learning_rate": 0.0011841937523410528, + "loss": 0.1099, + "step": 45668 + }, + { + "epoch": 0.39642885044400655, + "grad_norm": 0.24609375, + "learning_rate": 0.001184164120355216, + "loss": 0.0835, + "step": 45669 + }, + { + "epoch": 0.39643753092421075, + "grad_norm": 0.263671875, + "learning_rate": 0.0011841344882773388, + "loss": 0.0928, + "step": 45670 + }, + { + "epoch": 0.3964462114044149, + "grad_norm": 0.78125, + "learning_rate": 0.0011841048561074534, + "loss": 0.1416, + "step": 45671 + }, + { + "epoch": 0.3964548918846191, + "grad_norm": 0.625, + "learning_rate": 0.001184075223845592, + "loss": 0.1016, + "step": 45672 + }, + { + "epoch": 0.3964635723648232, + "grad_norm": 0.171875, + "learning_rate": 0.001184045591491787, + "loss": 0.1006, + "step": 45673 + }, + { + "epoch": 0.3964722528450274, + "grad_norm": 0.53515625, + "learning_rate": 0.0011840159590460716, + "loss": 0.167, + "step": 45674 + }, + { + "epoch": 0.39648093332523154, + "grad_norm": 1.7890625, + "learning_rate": 0.0011839863265084769, + "loss": 0.1582, + "step": 45675 + }, + { + "epoch": 0.39648961380543574, + "grad_norm": 0.52734375, + "learning_rate": 0.0011839566938790362, + "loss": 0.0908, + "step": 45676 + }, + { + "epoch": 0.3964982942856399, + "grad_norm": 0.283203125, + "learning_rate": 0.0011839270611577822, + "loss": 0.0869, + "step": 45677 + }, + { + "epoch": 0.39650697476584407, + "grad_norm": 0.1318359375, + "learning_rate": 0.0011838974283447461, + "loss": 0.1338, + "step": 45678 + }, + { + "epoch": 0.3965156552460482, + "grad_norm": 0.1767578125, + "learning_rate": 0.0011838677954399616, + "loss": 0.0977, + "step": 45679 + }, + { + "epoch": 0.3965243357262524, + "grad_norm": 0.1689453125, + "learning_rate": 0.0011838381624434604, + "loss": 0.105, + "step": 45680 + }, + { + "epoch": 0.39653301620645653, + "grad_norm": 0.10205078125, + "learning_rate": 0.001183808529355275, + "loss": 0.0835, + "step": 45681 + }, + { + "epoch": 0.3965416966866607, + "grad_norm": 0.419921875, + "learning_rate": 0.001183778896175438, + "loss": 0.1328, + "step": 45682 + }, + { + "epoch": 0.39655037716686486, + "grad_norm": 0.189453125, + "learning_rate": 0.0011837492629039814, + "loss": 0.0986, + "step": 45683 + }, + { + "epoch": 0.39655905764706906, + "grad_norm": 0.28125, + "learning_rate": 0.001183719629540938, + "loss": 0.1147, + "step": 45684 + }, + { + "epoch": 0.3965677381272732, + "grad_norm": 0.3515625, + "learning_rate": 0.00118368999608634, + "loss": 0.1016, + "step": 45685 + }, + { + "epoch": 0.3965764186074774, + "grad_norm": 0.2470703125, + "learning_rate": 0.0011836603625402199, + "loss": 0.1152, + "step": 45686 + }, + { + "epoch": 
0.3965850990876815, + "grad_norm": 0.1337890625, + "learning_rate": 0.0011836307289026103, + "loss": 0.0718, + "step": 45687 + }, + { + "epoch": 0.3965937795678857, + "grad_norm": 0.47265625, + "learning_rate": 0.0011836010951735431, + "loss": 0.1216, + "step": 45688 + }, + { + "epoch": 0.39660246004808986, + "grad_norm": 0.52734375, + "learning_rate": 0.001183571461353051, + "loss": 0.0938, + "step": 45689 + }, + { + "epoch": 0.39661114052829405, + "grad_norm": 0.1220703125, + "learning_rate": 0.0011835418274411668, + "loss": 0.0762, + "step": 45690 + }, + { + "epoch": 0.3966198210084982, + "grad_norm": 0.10546875, + "learning_rate": 0.001183512193437922, + "loss": 0.1533, + "step": 45691 + }, + { + "epoch": 0.3966285014887024, + "grad_norm": 0.2734375, + "learning_rate": 0.0011834825593433499, + "loss": 0.0918, + "step": 45692 + }, + { + "epoch": 0.3966371819689065, + "grad_norm": 0.1220703125, + "learning_rate": 0.001183452925157482, + "loss": 0.0898, + "step": 45693 + }, + { + "epoch": 0.3966458624491107, + "grad_norm": 0.1943359375, + "learning_rate": 0.001183423290880352, + "loss": 0.0967, + "step": 45694 + }, + { + "epoch": 0.39665454292931485, + "grad_norm": 0.2255859375, + "learning_rate": 0.0011833936565119909, + "loss": 0.1172, + "step": 45695 + }, + { + "epoch": 0.39666322340951904, + "grad_norm": 0.1982421875, + "learning_rate": 0.0011833640220524323, + "loss": 0.0811, + "step": 45696 + }, + { + "epoch": 0.3966719038897232, + "grad_norm": 0.306640625, + "learning_rate": 0.0011833343875017074, + "loss": 0.1118, + "step": 45697 + }, + { + "epoch": 0.39668058436992737, + "grad_norm": 0.412109375, + "learning_rate": 0.00118330475285985, + "loss": 0.0889, + "step": 45698 + }, + { + "epoch": 0.3966892648501315, + "grad_norm": 0.189453125, + "learning_rate": 0.001183275118126891, + "loss": 0.1157, + "step": 45699 + }, + { + "epoch": 0.3966979453303357, + "grad_norm": 0.33984375, + "learning_rate": 0.0011832454833028642, + "loss": 0.1094, + "step": 45700 + }, + { + "epoch": 0.39670662581053984, + "grad_norm": 0.111328125, + "learning_rate": 0.0011832158483878012, + "loss": 0.1055, + "step": 45701 + }, + { + "epoch": 0.39671530629074403, + "grad_norm": 0.77734375, + "learning_rate": 0.0011831862133817347, + "loss": 0.1016, + "step": 45702 + }, + { + "epoch": 0.39672398677094817, + "grad_norm": 0.21484375, + "learning_rate": 0.0011831565782846967, + "loss": 0.1143, + "step": 45703 + }, + { + "epoch": 0.39673266725115236, + "grad_norm": 0.091796875, + "learning_rate": 0.00118312694309672, + "loss": 0.0776, + "step": 45704 + }, + { + "epoch": 0.3967413477313565, + "grad_norm": 0.2373046875, + "learning_rate": 0.0011830973078178371, + "loss": 0.1309, + "step": 45705 + }, + { + "epoch": 0.3967500282115607, + "grad_norm": 0.3984375, + "learning_rate": 0.0011830676724480802, + "loss": 0.0742, + "step": 45706 + }, + { + "epoch": 0.3967587086917648, + "grad_norm": 0.1513671875, + "learning_rate": 0.0011830380369874818, + "loss": 0.0894, + "step": 45707 + }, + { + "epoch": 0.396767389171969, + "grad_norm": 0.08740234375, + "learning_rate": 0.001183008401436074, + "loss": 0.083, + "step": 45708 + }, + { + "epoch": 0.39677606965217316, + "grad_norm": 0.9609375, + "learning_rate": 0.0011829787657938896, + "loss": 0.0874, + "step": 45709 + }, + { + "epoch": 0.39678475013237735, + "grad_norm": 0.228515625, + "learning_rate": 0.001182949130060961, + "loss": 0.1309, + "step": 45710 + }, + { + "epoch": 0.3967934306125815, + "grad_norm": 0.07373046875, + "learning_rate": 0.0011829194942373205, + "loss": 
0.0747, + "step": 45711 + }, + { + "epoch": 0.3968021110927856, + "grad_norm": 0.150390625, + "learning_rate": 0.001182889858323, + "loss": 0.0977, + "step": 45712 + }, + { + "epoch": 0.3968107915729898, + "grad_norm": 0.455078125, + "learning_rate": 0.0011828602223180327, + "loss": 0.1084, + "step": 45713 + }, + { + "epoch": 0.39681947205319396, + "grad_norm": 0.181640625, + "learning_rate": 0.001182830586222451, + "loss": 0.127, + "step": 45714 + }, + { + "epoch": 0.39682815253339815, + "grad_norm": 0.07958984375, + "learning_rate": 0.0011828009500362865, + "loss": 0.1226, + "step": 45715 + }, + { + "epoch": 0.3968368330136023, + "grad_norm": 0.091796875, + "learning_rate": 0.0011827713137595724, + "loss": 0.1128, + "step": 45716 + }, + { + "epoch": 0.3968455134938065, + "grad_norm": 0.515625, + "learning_rate": 0.0011827416773923407, + "loss": 0.1006, + "step": 45717 + }, + { + "epoch": 0.3968541939740106, + "grad_norm": 0.267578125, + "learning_rate": 0.0011827120409346242, + "loss": 0.1196, + "step": 45718 + }, + { + "epoch": 0.3968628744542148, + "grad_norm": 0.267578125, + "learning_rate": 0.001182682404386455, + "loss": 0.0957, + "step": 45719 + }, + { + "epoch": 0.39687155493441895, + "grad_norm": 0.08837890625, + "learning_rate": 0.0011826527677478655, + "loss": 0.103, + "step": 45720 + }, + { + "epoch": 0.39688023541462314, + "grad_norm": 0.435546875, + "learning_rate": 0.0011826231310188883, + "loss": 0.1006, + "step": 45721 + }, + { + "epoch": 0.3968889158948273, + "grad_norm": 0.2255859375, + "learning_rate": 0.0011825934941995554, + "loss": 0.1201, + "step": 45722 + }, + { + "epoch": 0.39689759637503147, + "grad_norm": 0.28125, + "learning_rate": 0.0011825638572898994, + "loss": 0.1133, + "step": 45723 + }, + { + "epoch": 0.3969062768552356, + "grad_norm": 0.1337890625, + "learning_rate": 0.0011825342202899532, + "loss": 0.0742, + "step": 45724 + }, + { + "epoch": 0.3969149573354398, + "grad_norm": 0.197265625, + "learning_rate": 0.0011825045831997485, + "loss": 0.0957, + "step": 45725 + }, + { + "epoch": 0.39692363781564394, + "grad_norm": 0.10107421875, + "learning_rate": 0.0011824749460193182, + "loss": 0.1035, + "step": 45726 + }, + { + "epoch": 0.39693231829584813, + "grad_norm": 0.4453125, + "learning_rate": 0.0011824453087486946, + "loss": 0.0942, + "step": 45727 + }, + { + "epoch": 0.39694099877605227, + "grad_norm": 0.26953125, + "learning_rate": 0.0011824156713879101, + "loss": 0.0889, + "step": 45728 + }, + { + "epoch": 0.39694967925625646, + "grad_norm": 1.2109375, + "learning_rate": 0.0011823860339369968, + "loss": 0.125, + "step": 45729 + }, + { + "epoch": 0.3969583597364606, + "grad_norm": 0.19140625, + "learning_rate": 0.0011823563963959875, + "loss": 0.0688, + "step": 45730 + }, + { + "epoch": 0.3969670402166648, + "grad_norm": 0.2001953125, + "learning_rate": 0.0011823267587649144, + "loss": 0.1143, + "step": 45731 + }, + { + "epoch": 0.39697572069686893, + "grad_norm": 0.228515625, + "learning_rate": 0.0011822971210438103, + "loss": 0.1089, + "step": 45732 + }, + { + "epoch": 0.3969844011770731, + "grad_norm": 0.6875, + "learning_rate": 0.0011822674832327072, + "loss": 0.1357, + "step": 45733 + }, + { + "epoch": 0.39699308165727726, + "grad_norm": 0.083984375, + "learning_rate": 0.0011822378453316372, + "loss": 0.1001, + "step": 45734 + }, + { + "epoch": 0.39700176213748145, + "grad_norm": 0.111328125, + "learning_rate": 0.0011822082073406335, + "loss": 0.0679, + "step": 45735 + }, + { + "epoch": 0.3970104426176856, + "grad_norm": 0.1884765625, + 
"learning_rate": 0.001182178569259728, + "loss": 0.1123, + "step": 45736 + }, + { + "epoch": 0.3970191230978898, + "grad_norm": 0.474609375, + "learning_rate": 0.0011821489310889534, + "loss": 0.1426, + "step": 45737 + }, + { + "epoch": 0.3970278035780939, + "grad_norm": 0.2119140625, + "learning_rate": 0.0011821192928283418, + "loss": 0.0669, + "step": 45738 + }, + { + "epoch": 0.3970364840582981, + "grad_norm": 0.0693359375, + "learning_rate": 0.0011820896544779257, + "loss": 0.064, + "step": 45739 + }, + { + "epoch": 0.39704516453850225, + "grad_norm": 0.130859375, + "learning_rate": 0.001182060016037738, + "loss": 0.0957, + "step": 45740 + }, + { + "epoch": 0.39705384501870644, + "grad_norm": 0.375, + "learning_rate": 0.0011820303775078103, + "loss": 0.1318, + "step": 45741 + }, + { + "epoch": 0.3970625254989106, + "grad_norm": 0.318359375, + "learning_rate": 0.0011820007388881756, + "loss": 0.0898, + "step": 45742 + }, + { + "epoch": 0.39707120597911477, + "grad_norm": 0.298828125, + "learning_rate": 0.001181971100178866, + "loss": 0.0898, + "step": 45743 + }, + { + "epoch": 0.3970798864593189, + "grad_norm": 0.2470703125, + "learning_rate": 0.0011819414613799142, + "loss": 0.1416, + "step": 45744 + }, + { + "epoch": 0.3970885669395231, + "grad_norm": 0.365234375, + "learning_rate": 0.0011819118224913525, + "loss": 0.1025, + "step": 45745 + }, + { + "epoch": 0.39709724741972724, + "grad_norm": 0.1904296875, + "learning_rate": 0.0011818821835132133, + "loss": 0.1021, + "step": 45746 + }, + { + "epoch": 0.39710592789993143, + "grad_norm": 0.275390625, + "learning_rate": 0.0011818525444455285, + "loss": 0.0947, + "step": 45747 + }, + { + "epoch": 0.39711460838013557, + "grad_norm": 0.1806640625, + "learning_rate": 0.0011818229052883314, + "loss": 0.0957, + "step": 45748 + }, + { + "epoch": 0.39712328886033976, + "grad_norm": 0.4375, + "learning_rate": 0.001181793266041654, + "loss": 0.1177, + "step": 45749 + }, + { + "epoch": 0.3971319693405439, + "grad_norm": 0.578125, + "learning_rate": 0.0011817636267055286, + "loss": 0.0908, + "step": 45750 + }, + { + "epoch": 0.3971406498207481, + "grad_norm": 0.55078125, + "learning_rate": 0.0011817339872799875, + "loss": 0.0869, + "step": 45751 + }, + { + "epoch": 0.39714933030095223, + "grad_norm": 0.201171875, + "learning_rate": 0.0011817043477650635, + "loss": 0.1777, + "step": 45752 + }, + { + "epoch": 0.3971580107811564, + "grad_norm": 0.1591796875, + "learning_rate": 0.001181674708160789, + "loss": 0.061, + "step": 45753 + }, + { + "epoch": 0.39716669126136056, + "grad_norm": 0.1376953125, + "learning_rate": 0.0011816450684671962, + "loss": 0.1162, + "step": 45754 + }, + { + "epoch": 0.39717537174156475, + "grad_norm": 0.162109375, + "learning_rate": 0.0011816154286843174, + "loss": 0.0693, + "step": 45755 + }, + { + "epoch": 0.3971840522217689, + "grad_norm": 0.38671875, + "learning_rate": 0.0011815857888121855, + "loss": 0.1216, + "step": 45756 + }, + { + "epoch": 0.3971927327019731, + "grad_norm": 0.1298828125, + "learning_rate": 0.0011815561488508323, + "loss": 0.0869, + "step": 45757 + }, + { + "epoch": 0.3972014131821772, + "grad_norm": 0.1875, + "learning_rate": 0.001181526508800291, + "loss": 0.1426, + "step": 45758 + }, + { + "epoch": 0.3972100936623814, + "grad_norm": 0.4375, + "learning_rate": 0.001181496868660593, + "loss": 0.1475, + "step": 45759 + }, + { + "epoch": 0.39721877414258555, + "grad_norm": 0.3203125, + "learning_rate": 0.0011814672284317718, + "loss": 0.1445, + "step": 45760 + }, + { + "epoch": 0.39722745462278974, + 
"grad_norm": 0.58203125, + "learning_rate": 0.0011814375881138586, + "loss": 0.1133, + "step": 45761 + }, + { + "epoch": 0.3972361351029939, + "grad_norm": 0.3984375, + "learning_rate": 0.001181407947706887, + "loss": 0.0874, + "step": 45762 + }, + { + "epoch": 0.3972448155831981, + "grad_norm": 0.5859375, + "learning_rate": 0.0011813783072108887, + "loss": 0.1387, + "step": 45763 + }, + { + "epoch": 0.3972534960634022, + "grad_norm": 0.1650390625, + "learning_rate": 0.0011813486666258962, + "loss": 0.0908, + "step": 45764 + }, + { + "epoch": 0.3972621765436064, + "grad_norm": 0.66015625, + "learning_rate": 0.0011813190259519419, + "loss": 0.0889, + "step": 45765 + }, + { + "epoch": 0.39727085702381054, + "grad_norm": 0.310546875, + "learning_rate": 0.001181289385189059, + "loss": 0.0933, + "step": 45766 + }, + { + "epoch": 0.39727953750401473, + "grad_norm": 0.09765625, + "learning_rate": 0.0011812597443372785, + "loss": 0.0938, + "step": 45767 + }, + { + "epoch": 0.39728821798421887, + "grad_norm": 0.46875, + "learning_rate": 0.001181230103396634, + "loss": 0.2168, + "step": 45768 + }, + { + "epoch": 0.39729689846442307, + "grad_norm": 0.44921875, + "learning_rate": 0.0011812004623671575, + "loss": 0.1113, + "step": 45769 + }, + { + "epoch": 0.3973055789446272, + "grad_norm": 0.111328125, + "learning_rate": 0.001181170821248881, + "loss": 0.0608, + "step": 45770 + }, + { + "epoch": 0.3973142594248314, + "grad_norm": 0.2333984375, + "learning_rate": 0.0011811411800418377, + "loss": 0.127, + "step": 45771 + }, + { + "epoch": 0.39732293990503553, + "grad_norm": 0.10986328125, + "learning_rate": 0.0011811115387460594, + "loss": 0.1387, + "step": 45772 + }, + { + "epoch": 0.3973316203852397, + "grad_norm": 0.54296875, + "learning_rate": 0.0011810818973615788, + "loss": 0.0757, + "step": 45773 + }, + { + "epoch": 0.39734030086544386, + "grad_norm": 0.34765625, + "learning_rate": 0.0011810522558884282, + "loss": 0.0947, + "step": 45774 + }, + { + "epoch": 0.39734898134564806, + "grad_norm": 0.671875, + "learning_rate": 0.0011810226143266404, + "loss": 0.0835, + "step": 45775 + }, + { + "epoch": 0.3973576618258522, + "grad_norm": 0.30078125, + "learning_rate": 0.0011809929726762471, + "loss": 0.0737, + "step": 45776 + }, + { + "epoch": 0.3973663423060564, + "grad_norm": 0.19921875, + "learning_rate": 0.0011809633309372814, + "loss": 0.125, + "step": 45777 + }, + { + "epoch": 0.3973750227862605, + "grad_norm": 0.2392578125, + "learning_rate": 0.001180933689109775, + "loss": 0.1152, + "step": 45778 + }, + { + "epoch": 0.3973837032664647, + "grad_norm": 0.53515625, + "learning_rate": 0.001180904047193761, + "loss": 0.0693, + "step": 45779 + }, + { + "epoch": 0.39739238374666885, + "grad_norm": 0.32421875, + "learning_rate": 0.0011808744051892716, + "loss": 0.1006, + "step": 45780 + }, + { + "epoch": 0.39740106422687305, + "grad_norm": 0.1474609375, + "learning_rate": 0.001180844763096339, + "loss": 0.0977, + "step": 45781 + }, + { + "epoch": 0.3974097447070772, + "grad_norm": 0.4609375, + "learning_rate": 0.0011808151209149959, + "loss": 0.0898, + "step": 45782 + }, + { + "epoch": 0.3974184251872814, + "grad_norm": 0.283203125, + "learning_rate": 0.0011807854786452748, + "loss": 0.1011, + "step": 45783 + }, + { + "epoch": 0.3974271056674855, + "grad_norm": 0.482421875, + "learning_rate": 0.0011807558362872076, + "loss": 0.1221, + "step": 45784 + }, + { + "epoch": 0.3974357861476897, + "grad_norm": 0.13671875, + "learning_rate": 0.0011807261938408273, + "loss": 0.0903, + "step": 45785 + }, + { + 
"epoch": 0.39744446662789384, + "grad_norm": 0.53515625, + "learning_rate": 0.0011806965513061656, + "loss": 0.1484, + "step": 45786 + }, + { + "epoch": 0.39745314710809804, + "grad_norm": 0.2001953125, + "learning_rate": 0.0011806669086832557, + "loss": 0.1465, + "step": 45787 + }, + { + "epoch": 0.3974618275883022, + "grad_norm": 0.9375, + "learning_rate": 0.0011806372659721297, + "loss": 0.1328, + "step": 45788 + }, + { + "epoch": 0.39747050806850637, + "grad_norm": 0.53515625, + "learning_rate": 0.00118060762317282, + "loss": 0.1001, + "step": 45789 + }, + { + "epoch": 0.3974791885487105, + "grad_norm": 0.2119140625, + "learning_rate": 0.0011805779802853587, + "loss": 0.106, + "step": 45790 + }, + { + "epoch": 0.3974878690289147, + "grad_norm": 0.0849609375, + "learning_rate": 0.0011805483373097789, + "loss": 0.1074, + "step": 45791 + }, + { + "epoch": 0.39749654950911883, + "grad_norm": 0.08154296875, + "learning_rate": 0.0011805186942461126, + "loss": 0.085, + "step": 45792 + }, + { + "epoch": 0.39750522998932303, + "grad_norm": 0.2138671875, + "learning_rate": 0.0011804890510943922, + "loss": 0.0967, + "step": 45793 + }, + { + "epoch": 0.39751391046952717, + "grad_norm": 0.423828125, + "learning_rate": 0.00118045940785465, + "loss": 0.1318, + "step": 45794 + }, + { + "epoch": 0.39752259094973136, + "grad_norm": 0.16796875, + "learning_rate": 0.0011804297645269188, + "loss": 0.1055, + "step": 45795 + }, + { + "epoch": 0.3975312714299355, + "grad_norm": 0.51171875, + "learning_rate": 0.0011804001211112307, + "loss": 0.1279, + "step": 45796 + }, + { + "epoch": 0.3975399519101397, + "grad_norm": 0.376953125, + "learning_rate": 0.0011803704776076186, + "loss": 0.0986, + "step": 45797 + }, + { + "epoch": 0.3975486323903438, + "grad_norm": 0.13671875, + "learning_rate": 0.0011803408340161142, + "loss": 0.1006, + "step": 45798 + }, + { + "epoch": 0.397557312870548, + "grad_norm": 0.2373046875, + "learning_rate": 0.0011803111903367503, + "loss": 0.0767, + "step": 45799 + }, + { + "epoch": 0.39756599335075216, + "grad_norm": 0.19140625, + "learning_rate": 0.0011802815465695594, + "loss": 0.1211, + "step": 45800 + }, + { + "epoch": 0.39757467383095635, + "grad_norm": 0.5703125, + "learning_rate": 0.0011802519027145737, + "loss": 0.1348, + "step": 45801 + }, + { + "epoch": 0.3975833543111605, + "grad_norm": 0.2890625, + "learning_rate": 0.0011802222587718259, + "loss": 0.1162, + "step": 45802 + }, + { + "epoch": 0.3975920347913647, + "grad_norm": 0.15234375, + "learning_rate": 0.0011801926147413477, + "loss": 0.0913, + "step": 45803 + }, + { + "epoch": 0.3976007152715688, + "grad_norm": 0.59375, + "learning_rate": 0.0011801629706231728, + "loss": 0.1006, + "step": 45804 + }, + { + "epoch": 0.397609395751773, + "grad_norm": 0.478515625, + "learning_rate": 0.0011801333264173325, + "loss": 0.0898, + "step": 45805 + }, + { + "epoch": 0.39761807623197715, + "grad_norm": 0.5859375, + "learning_rate": 0.0011801036821238596, + "loss": 0.1191, + "step": 45806 + }, + { + "epoch": 0.39762675671218134, + "grad_norm": 0.578125, + "learning_rate": 0.0011800740377427865, + "loss": 0.0742, + "step": 45807 + }, + { + "epoch": 0.3976354371923855, + "grad_norm": 0.328125, + "learning_rate": 0.0011800443932741458, + "loss": 0.0869, + "step": 45808 + }, + { + "epoch": 0.39764411767258967, + "grad_norm": 0.57421875, + "learning_rate": 0.0011800147487179696, + "loss": 0.0996, + "step": 45809 + }, + { + "epoch": 0.3976527981527938, + "grad_norm": 0.345703125, + "learning_rate": 0.0011799851040742903, + "loss": 0.084, 
+ "step": 45810 + }, + { + "epoch": 0.397661478632998, + "grad_norm": 0.373046875, + "learning_rate": 0.0011799554593431407, + "loss": 0.124, + "step": 45811 + }, + { + "epoch": 0.39767015911320214, + "grad_norm": 0.51953125, + "learning_rate": 0.0011799258145245531, + "loss": 0.0952, + "step": 45812 + }, + { + "epoch": 0.39767883959340633, + "grad_norm": 0.1435546875, + "learning_rate": 0.00117989616961856, + "loss": 0.0991, + "step": 45813 + }, + { + "epoch": 0.39768752007361047, + "grad_norm": 0.43359375, + "learning_rate": 0.0011798665246251934, + "loss": 0.0928, + "step": 45814 + }, + { + "epoch": 0.39769620055381466, + "grad_norm": 0.796875, + "learning_rate": 0.001179836879544486, + "loss": 0.1445, + "step": 45815 + }, + { + "epoch": 0.3977048810340188, + "grad_norm": 0.11328125, + "learning_rate": 0.0011798072343764698, + "loss": 0.0894, + "step": 45816 + }, + { + "epoch": 0.397713561514223, + "grad_norm": 0.765625, + "learning_rate": 0.0011797775891211782, + "loss": 0.1045, + "step": 45817 + }, + { + "epoch": 0.39772224199442713, + "grad_norm": 0.6640625, + "learning_rate": 0.0011797479437786428, + "loss": 0.209, + "step": 45818 + }, + { + "epoch": 0.3977309224746313, + "grad_norm": 0.32421875, + "learning_rate": 0.0011797182983488963, + "loss": 0.1157, + "step": 45819 + }, + { + "epoch": 0.39773960295483546, + "grad_norm": 0.26171875, + "learning_rate": 0.0011796886528319707, + "loss": 0.1064, + "step": 45820 + }, + { + "epoch": 0.39774828343503965, + "grad_norm": 0.400390625, + "learning_rate": 0.0011796590072278993, + "loss": 0.0972, + "step": 45821 + }, + { + "epoch": 0.3977569639152438, + "grad_norm": 0.271484375, + "learning_rate": 0.001179629361536714, + "loss": 0.1338, + "step": 45822 + }, + { + "epoch": 0.397765644395448, + "grad_norm": 0.1552734375, + "learning_rate": 0.0011795997157584468, + "loss": 0.1934, + "step": 45823 + }, + { + "epoch": 0.3977743248756521, + "grad_norm": 0.53515625, + "learning_rate": 0.0011795700698931308, + "loss": 0.085, + "step": 45824 + }, + { + "epoch": 0.3977830053558563, + "grad_norm": 0.2314453125, + "learning_rate": 0.0011795404239407982, + "loss": 0.0928, + "step": 45825 + }, + { + "epoch": 0.39779168583606045, + "grad_norm": 0.11181640625, + "learning_rate": 0.0011795107779014812, + "loss": 0.0737, + "step": 45826 + }, + { + "epoch": 0.39780036631626464, + "grad_norm": 0.71875, + "learning_rate": 0.0011794811317752127, + "loss": 0.0894, + "step": 45827 + }, + { + "epoch": 0.3978090467964688, + "grad_norm": 0.6015625, + "learning_rate": 0.0011794514855620246, + "loss": 0.1006, + "step": 45828 + }, + { + "epoch": 0.39781772727667297, + "grad_norm": 0.52734375, + "learning_rate": 0.0011794218392619496, + "loss": 0.1138, + "step": 45829 + }, + { + "epoch": 0.3978264077568771, + "grad_norm": 1.0546875, + "learning_rate": 0.0011793921928750202, + "loss": 0.1118, + "step": 45830 + }, + { + "epoch": 0.3978350882370813, + "grad_norm": 0.2099609375, + "learning_rate": 0.0011793625464012686, + "loss": 0.1113, + "step": 45831 + }, + { + "epoch": 0.39784376871728544, + "grad_norm": 0.11962890625, + "learning_rate": 0.0011793328998407273, + "loss": 0.083, + "step": 45832 + }, + { + "epoch": 0.39785244919748963, + "grad_norm": 0.201171875, + "learning_rate": 0.0011793032531934287, + "loss": 0.104, + "step": 45833 + }, + { + "epoch": 0.39786112967769377, + "grad_norm": 0.314453125, + "learning_rate": 0.0011792736064594055, + "loss": 0.124, + "step": 45834 + }, + { + "epoch": 0.3978698101578979, + "grad_norm": 0.2314453125, + "learning_rate": 
0.0011792439596386894, + "loss": 0.1504, + "step": 45835 + }, + { + "epoch": 0.3978784906381021, + "grad_norm": 0.0966796875, + "learning_rate": 0.0011792143127313138, + "loss": 0.105, + "step": 45836 + }, + { + "epoch": 0.39788717111830624, + "grad_norm": 0.625, + "learning_rate": 0.0011791846657373101, + "loss": 0.1089, + "step": 45837 + }, + { + "epoch": 0.39789585159851043, + "grad_norm": 0.62109375, + "learning_rate": 0.0011791550186567115, + "loss": 0.1396, + "step": 45838 + }, + { + "epoch": 0.39790453207871457, + "grad_norm": 0.1767578125, + "learning_rate": 0.0011791253714895503, + "loss": 0.0591, + "step": 45839 + }, + { + "epoch": 0.39791321255891876, + "grad_norm": 0.208984375, + "learning_rate": 0.0011790957242358584, + "loss": 0.1328, + "step": 45840 + }, + { + "epoch": 0.3979218930391229, + "grad_norm": 0.109375, + "learning_rate": 0.0011790660768956692, + "loss": 0.0815, + "step": 45841 + }, + { + "epoch": 0.3979305735193271, + "grad_norm": 0.52734375, + "learning_rate": 0.0011790364294690138, + "loss": 0.0898, + "step": 45842 + }, + { + "epoch": 0.39793925399953123, + "grad_norm": 0.1513671875, + "learning_rate": 0.0011790067819559258, + "loss": 0.1143, + "step": 45843 + }, + { + "epoch": 0.3979479344797354, + "grad_norm": 0.1171875, + "learning_rate": 0.0011789771343564373, + "loss": 0.0786, + "step": 45844 + }, + { + "epoch": 0.39795661495993956, + "grad_norm": 0.51171875, + "learning_rate": 0.0011789474866705802, + "loss": 0.1143, + "step": 45845 + }, + { + "epoch": 0.39796529544014375, + "grad_norm": 0.6640625, + "learning_rate": 0.0011789178388983876, + "loss": 0.1357, + "step": 45846 + }, + { + "epoch": 0.3979739759203479, + "grad_norm": 0.1357421875, + "learning_rate": 0.0011788881910398915, + "loss": 0.1133, + "step": 45847 + }, + { + "epoch": 0.3979826564005521, + "grad_norm": 2.03125, + "learning_rate": 0.0011788585430951245, + "loss": 0.0977, + "step": 45848 + }, + { + "epoch": 0.3979913368807562, + "grad_norm": 0.203125, + "learning_rate": 0.0011788288950641188, + "loss": 0.1309, + "step": 45849 + }, + { + "epoch": 0.3980000173609604, + "grad_norm": 0.1962890625, + "learning_rate": 0.0011787992469469073, + "loss": 0.105, + "step": 45850 + }, + { + "epoch": 0.39800869784116455, + "grad_norm": 0.2373046875, + "learning_rate": 0.001178769598743522, + "loss": 0.1318, + "step": 45851 + }, + { + "epoch": 0.39801737832136874, + "grad_norm": 0.15234375, + "learning_rate": 0.0011787399504539955, + "loss": 0.1279, + "step": 45852 + }, + { + "epoch": 0.3980260588015729, + "grad_norm": 0.10498046875, + "learning_rate": 0.0011787103020783603, + "loss": 0.1172, + "step": 45853 + }, + { + "epoch": 0.3980347392817771, + "grad_norm": 0.80859375, + "learning_rate": 0.0011786806536166485, + "loss": 0.1128, + "step": 45854 + }, + { + "epoch": 0.3980434197619812, + "grad_norm": 0.10009765625, + "learning_rate": 0.0011786510050688925, + "loss": 0.1152, + "step": 45855 + }, + { + "epoch": 0.3980521002421854, + "grad_norm": 0.2314453125, + "learning_rate": 0.0011786213564351255, + "loss": 0.1572, + "step": 45856 + }, + { + "epoch": 0.39806078072238954, + "grad_norm": 0.7265625, + "learning_rate": 0.001178591707715379, + "loss": 0.0957, + "step": 45857 + }, + { + "epoch": 0.39806946120259373, + "grad_norm": 0.1337890625, + "learning_rate": 0.001178562058909686, + "loss": 0.1016, + "step": 45858 + }, + { + "epoch": 0.39807814168279787, + "grad_norm": 0.0927734375, + "learning_rate": 0.0011785324100180782, + "loss": 0.125, + "step": 45859 + }, + { + "epoch": 0.39808682216300206, + 
"grad_norm": 0.361328125, + "learning_rate": 0.0011785027610405892, + "loss": 0.0928, + "step": 45860 + }, + { + "epoch": 0.3980955026432062, + "grad_norm": 0.341796875, + "learning_rate": 0.0011784731119772506, + "loss": 0.0776, + "step": 45861 + }, + { + "epoch": 0.3981041831234104, + "grad_norm": 1.046875, + "learning_rate": 0.0011784434628280946, + "loss": 0.1035, + "step": 45862 + }, + { + "epoch": 0.39811286360361453, + "grad_norm": 0.515625, + "learning_rate": 0.0011784138135931541, + "loss": 0.0767, + "step": 45863 + }, + { + "epoch": 0.3981215440838187, + "grad_norm": 0.26171875, + "learning_rate": 0.0011783841642724617, + "loss": 0.0879, + "step": 45864 + }, + { + "epoch": 0.39813022456402286, + "grad_norm": 0.34765625, + "learning_rate": 0.0011783545148660495, + "loss": 0.1162, + "step": 45865 + }, + { + "epoch": 0.39813890504422705, + "grad_norm": 0.142578125, + "learning_rate": 0.0011783248653739497, + "loss": 0.1064, + "step": 45866 + }, + { + "epoch": 0.3981475855244312, + "grad_norm": 0.369140625, + "learning_rate": 0.0011782952157961956, + "loss": 0.1465, + "step": 45867 + }, + { + "epoch": 0.3981562660046354, + "grad_norm": 0.076171875, + "learning_rate": 0.0011782655661328183, + "loss": 0.0898, + "step": 45868 + }, + { + "epoch": 0.3981649464848395, + "grad_norm": 0.64453125, + "learning_rate": 0.0011782359163838513, + "loss": 0.1191, + "step": 45869 + }, + { + "epoch": 0.3981736269650437, + "grad_norm": 0.353515625, + "learning_rate": 0.0011782062665493267, + "loss": 0.1016, + "step": 45870 + }, + { + "epoch": 0.39818230744524785, + "grad_norm": 0.130859375, + "learning_rate": 0.0011781766166292771, + "loss": 0.0977, + "step": 45871 + }, + { + "epoch": 0.39819098792545204, + "grad_norm": 1.1875, + "learning_rate": 0.0011781469666237345, + "loss": 0.1089, + "step": 45872 + }, + { + "epoch": 0.3981996684056562, + "grad_norm": 0.5234375, + "learning_rate": 0.0011781173165327315, + "loss": 0.1104, + "step": 45873 + }, + { + "epoch": 0.3982083488858604, + "grad_norm": 0.1943359375, + "learning_rate": 0.0011780876663563006, + "loss": 0.1011, + "step": 45874 + }, + { + "epoch": 0.3982170293660645, + "grad_norm": 0.3203125, + "learning_rate": 0.0011780580160944743, + "loss": 0.1035, + "step": 45875 + }, + { + "epoch": 0.3982257098462687, + "grad_norm": 0.166015625, + "learning_rate": 0.001178028365747285, + "loss": 0.126, + "step": 45876 + }, + { + "epoch": 0.39823439032647284, + "grad_norm": 0.2314453125, + "learning_rate": 0.0011779987153147646, + "loss": 0.0981, + "step": 45877 + }, + { + "epoch": 0.39824307080667704, + "grad_norm": 0.2138671875, + "learning_rate": 0.0011779690647969465, + "loss": 0.0947, + "step": 45878 + }, + { + "epoch": 0.3982517512868812, + "grad_norm": 0.451171875, + "learning_rate": 0.0011779394141938624, + "loss": 0.1147, + "step": 45879 + }, + { + "epoch": 0.39826043176708537, + "grad_norm": 0.5625, + "learning_rate": 0.0011779097635055452, + "loss": 0.1221, + "step": 45880 + }, + { + "epoch": 0.3982691122472895, + "grad_norm": 0.671875, + "learning_rate": 0.0011778801127320265, + "loss": 0.1367, + "step": 45881 + }, + { + "epoch": 0.3982777927274937, + "grad_norm": 0.1875, + "learning_rate": 0.0011778504618733397, + "loss": 0.123, + "step": 45882 + }, + { + "epoch": 0.39828647320769783, + "grad_norm": 0.12890625, + "learning_rate": 0.0011778208109295165, + "loss": 0.1055, + "step": 45883 + }, + { + "epoch": 0.398295153687902, + "grad_norm": 0.1416015625, + "learning_rate": 0.0011777911599005902, + "loss": 0.1167, + "step": 45884 + }, + { + "epoch": 
0.39830383416810616, + "grad_norm": 0.1611328125, + "learning_rate": 0.0011777615087865921, + "loss": 0.1177, + "step": 45885 + }, + { + "epoch": 0.39831251464831036, + "grad_norm": 0.11083984375, + "learning_rate": 0.001177731857587555, + "loss": 0.1035, + "step": 45886 + }, + { + "epoch": 0.3983211951285145, + "grad_norm": 0.11669921875, + "learning_rate": 0.001177702206303512, + "loss": 0.0957, + "step": 45887 + }, + { + "epoch": 0.3983298756087187, + "grad_norm": 0.2373046875, + "learning_rate": 0.0011776725549344949, + "loss": 0.0952, + "step": 45888 + }, + { + "epoch": 0.3983385560889228, + "grad_norm": 0.11474609375, + "learning_rate": 0.001177642903480536, + "loss": 0.1123, + "step": 45889 + }, + { + "epoch": 0.398347236569127, + "grad_norm": 0.09423828125, + "learning_rate": 0.0011776132519416681, + "loss": 0.1143, + "step": 45890 + }, + { + "epoch": 0.39835591704933115, + "grad_norm": 2.484375, + "learning_rate": 0.001177583600317924, + "loss": 0.3984, + "step": 45891 + }, + { + "epoch": 0.39836459752953535, + "grad_norm": 0.283203125, + "learning_rate": 0.0011775539486093352, + "loss": 0.061, + "step": 45892 + }, + { + "epoch": 0.3983732780097395, + "grad_norm": 0.228515625, + "learning_rate": 0.0011775242968159348, + "loss": 0.0659, + "step": 45893 + }, + { + "epoch": 0.3983819584899437, + "grad_norm": 0.392578125, + "learning_rate": 0.0011774946449377547, + "loss": 0.1328, + "step": 45894 + }, + { + "epoch": 0.3983906389701478, + "grad_norm": 0.298828125, + "learning_rate": 0.0011774649929748278, + "loss": 0.0908, + "step": 45895 + }, + { + "epoch": 0.398399319450352, + "grad_norm": 0.1611328125, + "learning_rate": 0.0011774353409271863, + "loss": 0.1299, + "step": 45896 + }, + { + "epoch": 0.39840799993055614, + "grad_norm": 0.9296875, + "learning_rate": 0.0011774056887948628, + "loss": 0.0908, + "step": 45897 + }, + { + "epoch": 0.39841668041076034, + "grad_norm": 0.1865234375, + "learning_rate": 0.0011773760365778893, + "loss": 0.1182, + "step": 45898 + }, + { + "epoch": 0.3984253608909645, + "grad_norm": 0.111328125, + "learning_rate": 0.0011773463842762988, + "loss": 0.0791, + "step": 45899 + }, + { + "epoch": 0.39843404137116867, + "grad_norm": 0.13671875, + "learning_rate": 0.0011773167318901232, + "loss": 0.085, + "step": 45900 + }, + { + "epoch": 0.3984427218513728, + "grad_norm": 0.6328125, + "learning_rate": 0.0011772870794193955, + "loss": 0.1099, + "step": 45901 + }, + { + "epoch": 0.398451402331577, + "grad_norm": 0.1787109375, + "learning_rate": 0.0011772574268641477, + "loss": 0.1074, + "step": 45902 + }, + { + "epoch": 0.39846008281178114, + "grad_norm": 0.5625, + "learning_rate": 0.001177227774224412, + "loss": 0.0996, + "step": 45903 + }, + { + "epoch": 0.39846876329198533, + "grad_norm": 0.2431640625, + "learning_rate": 0.0011771981215002217, + "loss": 0.1289, + "step": 45904 + }, + { + "epoch": 0.39847744377218947, + "grad_norm": 0.48828125, + "learning_rate": 0.0011771684686916084, + "loss": 0.1094, + "step": 45905 + }, + { + "epoch": 0.39848612425239366, + "grad_norm": 0.3125, + "learning_rate": 0.001177138815798605, + "loss": 0.1084, + "step": 45906 + }, + { + "epoch": 0.3984948047325978, + "grad_norm": 0.1435546875, + "learning_rate": 0.0011771091628212437, + "loss": 0.1011, + "step": 45907 + }, + { + "epoch": 0.398503485212802, + "grad_norm": 1.765625, + "learning_rate": 0.0011770795097595568, + "loss": 0.3047, + "step": 45908 + }, + { + "epoch": 0.3985121656930061, + "grad_norm": 0.7265625, + "learning_rate": 0.001177049856613577, + "loss": 0.0923, + 
"step": 45909 + }, + { + "epoch": 0.3985208461732103, + "grad_norm": 0.1376953125, + "learning_rate": 0.0011770202033833368, + "loss": 0.1201, + "step": 45910 + }, + { + "epoch": 0.39852952665341446, + "grad_norm": 0.265625, + "learning_rate": 0.001176990550068868, + "loss": 0.1318, + "step": 45911 + }, + { + "epoch": 0.39853820713361865, + "grad_norm": 0.2421875, + "learning_rate": 0.001176960896670204, + "loss": 0.127, + "step": 45912 + }, + { + "epoch": 0.3985468876138228, + "grad_norm": 0.451171875, + "learning_rate": 0.0011769312431873767, + "loss": 0.0786, + "step": 45913 + }, + { + "epoch": 0.398555568094027, + "grad_norm": 0.384765625, + "learning_rate": 0.001176901589620418, + "loss": 0.1016, + "step": 45914 + }, + { + "epoch": 0.3985642485742311, + "grad_norm": 0.1787109375, + "learning_rate": 0.0011768719359693614, + "loss": 0.0942, + "step": 45915 + }, + { + "epoch": 0.3985729290544353, + "grad_norm": 0.0751953125, + "learning_rate": 0.0011768422822342386, + "loss": 0.1074, + "step": 45916 + }, + { + "epoch": 0.39858160953463945, + "grad_norm": 0.38671875, + "learning_rate": 0.0011768126284150825, + "loss": 0.1245, + "step": 45917 + }, + { + "epoch": 0.39859029001484364, + "grad_norm": 0.216796875, + "learning_rate": 0.0011767829745119248, + "loss": 0.0654, + "step": 45918 + }, + { + "epoch": 0.3985989704950478, + "grad_norm": 0.365234375, + "learning_rate": 0.001176753320524799, + "loss": 0.0977, + "step": 45919 + }, + { + "epoch": 0.39860765097525197, + "grad_norm": 0.07861328125, + "learning_rate": 0.0011767236664537361, + "loss": 0.082, + "step": 45920 + }, + { + "epoch": 0.3986163314554561, + "grad_norm": 0.279296875, + "learning_rate": 0.0011766940122987701, + "loss": 0.1104, + "step": 45921 + }, + { + "epoch": 0.3986250119356603, + "grad_norm": 0.67578125, + "learning_rate": 0.0011766643580599324, + "loss": 0.0981, + "step": 45922 + }, + { + "epoch": 0.39863369241586444, + "grad_norm": 0.1630859375, + "learning_rate": 0.0011766347037372555, + "loss": 0.1426, + "step": 45923 + }, + { + "epoch": 0.39864237289606863, + "grad_norm": 0.1474609375, + "learning_rate": 0.0011766050493307726, + "loss": 0.0752, + "step": 45924 + }, + { + "epoch": 0.39865105337627277, + "grad_norm": 0.255859375, + "learning_rate": 0.001176575394840515, + "loss": 0.1196, + "step": 45925 + }, + { + "epoch": 0.39865973385647696, + "grad_norm": 0.734375, + "learning_rate": 0.001176545740266516, + "loss": 0.1143, + "step": 45926 + }, + { + "epoch": 0.3986684143366811, + "grad_norm": 0.349609375, + "learning_rate": 0.0011765160856088076, + "loss": 0.0845, + "step": 45927 + }, + { + "epoch": 0.3986770948168853, + "grad_norm": 0.0966796875, + "learning_rate": 0.0011764864308674223, + "loss": 0.1318, + "step": 45928 + }, + { + "epoch": 0.39868577529708943, + "grad_norm": 0.458984375, + "learning_rate": 0.0011764567760423929, + "loss": 0.085, + "step": 45929 + }, + { + "epoch": 0.3986944557772936, + "grad_norm": 0.1748046875, + "learning_rate": 0.0011764271211337513, + "loss": 0.0923, + "step": 45930 + }, + { + "epoch": 0.39870313625749776, + "grad_norm": 0.1455078125, + "learning_rate": 0.0011763974661415301, + "loss": 0.0923, + "step": 45931 + }, + { + "epoch": 0.39871181673770195, + "grad_norm": 0.1533203125, + "learning_rate": 0.0011763678110657619, + "loss": 0.0986, + "step": 45932 + }, + { + "epoch": 0.3987204972179061, + "grad_norm": 0.11474609375, + "learning_rate": 0.001176338155906479, + "loss": 0.0571, + "step": 45933 + }, + { + "epoch": 0.3987291776981103, + "grad_norm": 0.2216796875, + 
"learning_rate": 0.0011763085006637137, + "loss": 0.0796, + "step": 45934 + }, + { + "epoch": 0.3987378581783144, + "grad_norm": 0.1103515625, + "learning_rate": 0.0011762788453374986, + "loss": 0.0874, + "step": 45935 + }, + { + "epoch": 0.3987465386585186, + "grad_norm": 0.1298828125, + "learning_rate": 0.0011762491899278663, + "loss": 0.0884, + "step": 45936 + }, + { + "epoch": 0.39875521913872275, + "grad_norm": 0.1376953125, + "learning_rate": 0.0011762195344348489, + "loss": 0.166, + "step": 45937 + }, + { + "epoch": 0.39876389961892694, + "grad_norm": 0.349609375, + "learning_rate": 0.0011761898788584788, + "loss": 0.1006, + "step": 45938 + }, + { + "epoch": 0.3987725800991311, + "grad_norm": 0.10107421875, + "learning_rate": 0.001176160223198789, + "loss": 0.0996, + "step": 45939 + }, + { + "epoch": 0.3987812605793353, + "grad_norm": 0.2236328125, + "learning_rate": 0.0011761305674558111, + "loss": 0.1001, + "step": 45940 + }, + { + "epoch": 0.3987899410595394, + "grad_norm": 0.81640625, + "learning_rate": 0.001176100911629578, + "loss": 0.1187, + "step": 45941 + }, + { + "epoch": 0.3987986215397436, + "grad_norm": 0.326171875, + "learning_rate": 0.0011760712557201223, + "loss": 0.0845, + "step": 45942 + }, + { + "epoch": 0.39880730201994774, + "grad_norm": 0.173828125, + "learning_rate": 0.001176041599727476, + "loss": 0.103, + "step": 45943 + }, + { + "epoch": 0.39881598250015193, + "grad_norm": 0.099609375, + "learning_rate": 0.001176011943651672, + "loss": 0.082, + "step": 45944 + }, + { + "epoch": 0.39882466298035607, + "grad_norm": 0.31640625, + "learning_rate": 0.0011759822874927422, + "loss": 0.1006, + "step": 45945 + }, + { + "epoch": 0.39883334346056026, + "grad_norm": 0.279296875, + "learning_rate": 0.0011759526312507195, + "loss": 0.1357, + "step": 45946 + }, + { + "epoch": 0.3988420239407644, + "grad_norm": 0.380859375, + "learning_rate": 0.0011759229749256361, + "loss": 0.0972, + "step": 45947 + }, + { + "epoch": 0.3988507044209686, + "grad_norm": 0.291015625, + "learning_rate": 0.001175893318517525, + "loss": 0.1016, + "step": 45948 + }, + { + "epoch": 0.39885938490117273, + "grad_norm": 0.5546875, + "learning_rate": 0.0011758636620264173, + "loss": 0.1172, + "step": 45949 + }, + { + "epoch": 0.3988680653813769, + "grad_norm": 0.087890625, + "learning_rate": 0.0011758340054523464, + "loss": 0.0757, + "step": 45950 + }, + { + "epoch": 0.39887674586158106, + "grad_norm": 0.1357421875, + "learning_rate": 0.001175804348795345, + "loss": 0.1035, + "step": 45951 + }, + { + "epoch": 0.39888542634178525, + "grad_norm": 0.38671875, + "learning_rate": 0.0011757746920554447, + "loss": 0.1572, + "step": 45952 + }, + { + "epoch": 0.3988941068219894, + "grad_norm": 0.25, + "learning_rate": 0.0011757450352326785, + "loss": 0.123, + "step": 45953 + }, + { + "epoch": 0.3989027873021936, + "grad_norm": 0.095703125, + "learning_rate": 0.0011757153783270784, + "loss": 0.1006, + "step": 45954 + }, + { + "epoch": 0.3989114677823977, + "grad_norm": 0.47265625, + "learning_rate": 0.0011756857213386773, + "loss": 0.104, + "step": 45955 + }, + { + "epoch": 0.3989201482626019, + "grad_norm": 0.109375, + "learning_rate": 0.0011756560642675075, + "loss": 0.1191, + "step": 45956 + }, + { + "epoch": 0.39892882874280605, + "grad_norm": 0.27734375, + "learning_rate": 0.0011756264071136017, + "loss": 0.1108, + "step": 45957 + }, + { + "epoch": 0.3989375092230102, + "grad_norm": 0.146484375, + "learning_rate": 0.0011755967498769913, + "loss": 0.1084, + "step": 45958 + }, + { + "epoch": 
0.3989461897032144, + "grad_norm": 0.244140625, + "learning_rate": 0.0011755670925577097, + "loss": 0.1118, + "step": 45959 + }, + { + "epoch": 0.3989548701834185, + "grad_norm": 0.7421875, + "learning_rate": 0.001175537435155789, + "loss": 0.1367, + "step": 45960 + }, + { + "epoch": 0.3989635506636227, + "grad_norm": 0.1494140625, + "learning_rate": 0.0011755077776712621, + "loss": 0.1094, + "step": 45961 + }, + { + "epoch": 0.39897223114382685, + "grad_norm": 0.1708984375, + "learning_rate": 0.0011754781201041605, + "loss": 0.083, + "step": 45962 + }, + { + "epoch": 0.39898091162403104, + "grad_norm": 0.26171875, + "learning_rate": 0.0011754484624545173, + "loss": 0.1172, + "step": 45963 + }, + { + "epoch": 0.3989895921042352, + "grad_norm": 0.1201171875, + "learning_rate": 0.0011754188047223648, + "loss": 0.1133, + "step": 45964 + }, + { + "epoch": 0.3989982725844394, + "grad_norm": 0.59375, + "learning_rate": 0.0011753891469077358, + "loss": 0.1455, + "step": 45965 + }, + { + "epoch": 0.3990069530646435, + "grad_norm": 0.263671875, + "learning_rate": 0.001175359489010662, + "loss": 0.1025, + "step": 45966 + }, + { + "epoch": 0.3990156335448477, + "grad_norm": 0.16015625, + "learning_rate": 0.0011753298310311763, + "loss": 0.1089, + "step": 45967 + }, + { + "epoch": 0.39902431402505184, + "grad_norm": 0.2109375, + "learning_rate": 0.0011753001729693112, + "loss": 0.1006, + "step": 45968 + }, + { + "epoch": 0.39903299450525603, + "grad_norm": 0.1220703125, + "learning_rate": 0.0011752705148250985, + "loss": 0.0605, + "step": 45969 + }, + { + "epoch": 0.39904167498546017, + "grad_norm": 0.341796875, + "learning_rate": 0.0011752408565985717, + "loss": 0.1104, + "step": 45970 + }, + { + "epoch": 0.39905035546566436, + "grad_norm": 0.466796875, + "learning_rate": 0.001175211198289762, + "loss": 0.0977, + "step": 45971 + }, + { + "epoch": 0.3990590359458685, + "grad_norm": 0.10009765625, + "learning_rate": 0.0011751815398987028, + "loss": 0.127, + "step": 45972 + }, + { + "epoch": 0.3990677164260727, + "grad_norm": 0.75, + "learning_rate": 0.001175151881425426, + "loss": 0.1709, + "step": 45973 + }, + { + "epoch": 0.39907639690627683, + "grad_norm": 0.546875, + "learning_rate": 0.0011751222228699645, + "loss": 0.1123, + "step": 45974 + }, + { + "epoch": 0.399085077386481, + "grad_norm": 0.412109375, + "learning_rate": 0.0011750925642323505, + "loss": 0.0889, + "step": 45975 + }, + { + "epoch": 0.39909375786668516, + "grad_norm": 0.287109375, + "learning_rate": 0.001175062905512616, + "loss": 0.1543, + "step": 45976 + }, + { + "epoch": 0.39910243834688935, + "grad_norm": 0.4140625, + "learning_rate": 0.0011750332467107942, + "loss": 0.1387, + "step": 45977 + }, + { + "epoch": 0.3991111188270935, + "grad_norm": 0.19140625, + "learning_rate": 0.0011750035878269171, + "loss": 0.1562, + "step": 45978 + }, + { + "epoch": 0.3991197993072977, + "grad_norm": 0.1298828125, + "learning_rate": 0.0011749739288610171, + "loss": 0.1006, + "step": 45979 + }, + { + "epoch": 0.3991284797875018, + "grad_norm": 0.255859375, + "learning_rate": 0.0011749442698131267, + "loss": 0.0845, + "step": 45980 + }, + { + "epoch": 0.399137160267706, + "grad_norm": 0.361328125, + "learning_rate": 0.0011749146106832787, + "loss": 0.1025, + "step": 45981 + }, + { + "epoch": 0.39914584074791015, + "grad_norm": 0.1572265625, + "learning_rate": 0.0011748849514715046, + "loss": 0.0952, + "step": 45982 + }, + { + "epoch": 0.39915452122811435, + "grad_norm": 0.40234375, + "learning_rate": 0.001174855292177838, + "loss": 0.1318, + 
"step": 45983 + }, + { + "epoch": 0.3991632017083185, + "grad_norm": 0.84765625, + "learning_rate": 0.0011748256328023103, + "loss": 0.1416, + "step": 45984 + }, + { + "epoch": 0.3991718821885227, + "grad_norm": 0.314453125, + "learning_rate": 0.001174795973344955, + "loss": 0.1021, + "step": 45985 + }, + { + "epoch": 0.3991805626687268, + "grad_norm": 0.8359375, + "learning_rate": 0.0011747663138058037, + "loss": 0.1367, + "step": 45986 + }, + { + "epoch": 0.399189243148931, + "grad_norm": 0.57421875, + "learning_rate": 0.0011747366541848892, + "loss": 0.1191, + "step": 45987 + }, + { + "epoch": 0.39919792362913514, + "grad_norm": 0.50390625, + "learning_rate": 0.0011747069944822437, + "loss": 0.082, + "step": 45988 + }, + { + "epoch": 0.39920660410933934, + "grad_norm": 0.59375, + "learning_rate": 0.0011746773346978994, + "loss": 0.0898, + "step": 45989 + }, + { + "epoch": 0.3992152845895435, + "grad_norm": 0.4765625, + "learning_rate": 0.0011746476748318896, + "loss": 0.1152, + "step": 45990 + }, + { + "epoch": 0.39922396506974767, + "grad_norm": 0.19140625, + "learning_rate": 0.0011746180148842458, + "loss": 0.0884, + "step": 45991 + }, + { + "epoch": 0.3992326455499518, + "grad_norm": 0.2265625, + "learning_rate": 0.0011745883548550011, + "loss": 0.1152, + "step": 45992 + }, + { + "epoch": 0.399241326030156, + "grad_norm": 0.2001953125, + "learning_rate": 0.0011745586947441876, + "loss": 0.0972, + "step": 45993 + }, + { + "epoch": 0.39925000651036013, + "grad_norm": 1.2109375, + "learning_rate": 0.0011745290345518378, + "loss": 0.5625, + "step": 45994 + }, + { + "epoch": 0.3992586869905643, + "grad_norm": 0.296875, + "learning_rate": 0.0011744993742779845, + "loss": 0.0815, + "step": 45995 + }, + { + "epoch": 0.39926736747076846, + "grad_norm": 0.373046875, + "learning_rate": 0.0011744697139226595, + "loss": 0.2578, + "step": 45996 + }, + { + "epoch": 0.39927604795097266, + "grad_norm": 0.138671875, + "learning_rate": 0.0011744400534858957, + "loss": 0.1172, + "step": 45997 + }, + { + "epoch": 0.3992847284311768, + "grad_norm": 0.4765625, + "learning_rate": 0.0011744103929677253, + "loss": 0.0962, + "step": 45998 + }, + { + "epoch": 0.399293408911381, + "grad_norm": 0.140625, + "learning_rate": 0.0011743807323681807, + "loss": 0.1367, + "step": 45999 + }, + { + "epoch": 0.3993020893915851, + "grad_norm": 0.71875, + "learning_rate": 0.0011743510716872945, + "loss": 0.0859, + "step": 46000 + }, + { + "epoch": 0.3993107698717893, + "grad_norm": 0.6484375, + "learning_rate": 0.0011743214109250992, + "loss": 0.125, + "step": 46001 + }, + { + "epoch": 0.39931945035199345, + "grad_norm": 0.20703125, + "learning_rate": 0.001174291750081627, + "loss": 0.1055, + "step": 46002 + }, + { + "epoch": 0.39932813083219765, + "grad_norm": 0.396484375, + "learning_rate": 0.0011742620891569107, + "loss": 0.0918, + "step": 46003 + }, + { + "epoch": 0.3993368113124018, + "grad_norm": 0.09619140625, + "learning_rate": 0.0011742324281509822, + "loss": 0.0918, + "step": 46004 + }, + { + "epoch": 0.399345491792606, + "grad_norm": 0.1298828125, + "learning_rate": 0.0011742027670638745, + "loss": 0.1104, + "step": 46005 + }, + { + "epoch": 0.3993541722728101, + "grad_norm": 0.21484375, + "learning_rate": 0.0011741731058956195, + "loss": 0.1123, + "step": 46006 + }, + { + "epoch": 0.3993628527530143, + "grad_norm": 0.57421875, + "learning_rate": 0.00117414344464625, + "loss": 0.1089, + "step": 46007 + }, + { + "epoch": 0.39937153323321845, + "grad_norm": 0.279296875, + "learning_rate": 0.0011741137833157984, + 
"loss": 0.0981, + "step": 46008 + }, + { + "epoch": 0.39938021371342264, + "grad_norm": 0.79296875, + "learning_rate": 0.0011740841219042971, + "loss": 0.0869, + "step": 46009 + }, + { + "epoch": 0.3993888941936268, + "grad_norm": 0.1181640625, + "learning_rate": 0.0011740544604117783, + "loss": 0.0923, + "step": 46010 + }, + { + "epoch": 0.39939757467383097, + "grad_norm": 0.53125, + "learning_rate": 0.0011740247988382747, + "loss": 0.1221, + "step": 46011 + }, + { + "epoch": 0.3994062551540351, + "grad_norm": 0.7109375, + "learning_rate": 0.001173995137183819, + "loss": 0.1299, + "step": 46012 + }, + { + "epoch": 0.3994149356342393, + "grad_norm": 0.341796875, + "learning_rate": 0.001173965475448443, + "loss": 0.062, + "step": 46013 + }, + { + "epoch": 0.39942361611444344, + "grad_norm": 0.2490234375, + "learning_rate": 0.0011739358136321797, + "loss": 0.104, + "step": 46014 + }, + { + "epoch": 0.39943229659464763, + "grad_norm": 0.259765625, + "learning_rate": 0.001173906151735061, + "loss": 0.1367, + "step": 46015 + }, + { + "epoch": 0.39944097707485177, + "grad_norm": 1.1640625, + "learning_rate": 0.0011738764897571202, + "loss": 0.3691, + "step": 46016 + }, + { + "epoch": 0.39944965755505596, + "grad_norm": 0.1552734375, + "learning_rate": 0.0011738468276983887, + "loss": 0.0781, + "step": 46017 + }, + { + "epoch": 0.3994583380352601, + "grad_norm": 0.48828125, + "learning_rate": 0.0011738171655588997, + "loss": 0.1895, + "step": 46018 + }, + { + "epoch": 0.3994670185154643, + "grad_norm": 0.23828125, + "learning_rate": 0.0011737875033386849, + "loss": 0.1006, + "step": 46019 + }, + { + "epoch": 0.3994756989956684, + "grad_norm": 0.1220703125, + "learning_rate": 0.0011737578410377773, + "loss": 0.0962, + "step": 46020 + }, + { + "epoch": 0.3994843794758726, + "grad_norm": 0.12109375, + "learning_rate": 0.0011737281786562098, + "loss": 0.0742, + "step": 46021 + }, + { + "epoch": 0.39949305995607676, + "grad_norm": 0.1328125, + "learning_rate": 0.0011736985161940137, + "loss": 0.0923, + "step": 46022 + }, + { + "epoch": 0.39950174043628095, + "grad_norm": 0.298828125, + "learning_rate": 0.0011736688536512223, + "loss": 0.0903, + "step": 46023 + }, + { + "epoch": 0.3995104209164851, + "grad_norm": 0.439453125, + "learning_rate": 0.0011736391910278675, + "loss": 0.0918, + "step": 46024 + }, + { + "epoch": 0.3995191013966893, + "grad_norm": 0.087890625, + "learning_rate": 0.0011736095283239826, + "loss": 0.0786, + "step": 46025 + }, + { + "epoch": 0.3995277818768934, + "grad_norm": 0.4140625, + "learning_rate": 0.0011735798655395989, + "loss": 0.0972, + "step": 46026 + }, + { + "epoch": 0.3995364623570976, + "grad_norm": 0.1669921875, + "learning_rate": 0.0011735502026747496, + "loss": 0.084, + "step": 46027 + }, + { + "epoch": 0.39954514283730175, + "grad_norm": 0.1123046875, + "learning_rate": 0.0011735205397294664, + "loss": 0.082, + "step": 46028 + }, + { + "epoch": 0.39955382331750594, + "grad_norm": 0.21484375, + "learning_rate": 0.0011734908767037826, + "loss": 0.0918, + "step": 46029 + }, + { + "epoch": 0.3995625037977101, + "grad_norm": 0.2490234375, + "learning_rate": 0.0011734612135977302, + "loss": 0.1123, + "step": 46030 + }, + { + "epoch": 0.39957118427791427, + "grad_norm": 0.08984375, + "learning_rate": 0.0011734315504113419, + "loss": 0.0928, + "step": 46031 + }, + { + "epoch": 0.3995798647581184, + "grad_norm": 0.98828125, + "learning_rate": 0.00117340188714465, + "loss": 0.2891, + "step": 46032 + }, + { + "epoch": 0.3995885452383226, + "grad_norm": 0.2158203125, + 
"learning_rate": 0.0011733722237976865, + "loss": 0.1182, + "step": 46033 + }, + { + "epoch": 0.39959722571852674, + "grad_norm": 0.08642578125, + "learning_rate": 0.0011733425603704846, + "loss": 0.1006, + "step": 46034 + }, + { + "epoch": 0.39960590619873093, + "grad_norm": 0.267578125, + "learning_rate": 0.0011733128968630763, + "loss": 0.0732, + "step": 46035 + }, + { + "epoch": 0.39961458667893507, + "grad_norm": 0.154296875, + "learning_rate": 0.0011732832332754942, + "loss": 0.1055, + "step": 46036 + }, + { + "epoch": 0.39962326715913926, + "grad_norm": 0.2275390625, + "learning_rate": 0.0011732535696077704, + "loss": 0.0947, + "step": 46037 + }, + { + "epoch": 0.3996319476393434, + "grad_norm": 0.31640625, + "learning_rate": 0.0011732239058599378, + "loss": 0.0933, + "step": 46038 + }, + { + "epoch": 0.3996406281195476, + "grad_norm": 0.52734375, + "learning_rate": 0.0011731942420320283, + "loss": 0.1108, + "step": 46039 + }, + { + "epoch": 0.39964930859975173, + "grad_norm": 0.4609375, + "learning_rate": 0.0011731645781240753, + "loss": 0.0786, + "step": 46040 + }, + { + "epoch": 0.3996579890799559, + "grad_norm": 0.1181640625, + "learning_rate": 0.00117313491413611, + "loss": 0.1191, + "step": 46041 + }, + { + "epoch": 0.39966666956016006, + "grad_norm": 0.2431640625, + "learning_rate": 0.0011731052500681659, + "loss": 0.1406, + "step": 46042 + }, + { + "epoch": 0.39967535004036425, + "grad_norm": 0.08447265625, + "learning_rate": 0.001173075585920275, + "loss": 0.084, + "step": 46043 + }, + { + "epoch": 0.3996840305205684, + "grad_norm": 0.314453125, + "learning_rate": 0.0011730459216924695, + "loss": 0.0938, + "step": 46044 + }, + { + "epoch": 0.3996927110007726, + "grad_norm": 0.435546875, + "learning_rate": 0.0011730162573847817, + "loss": 0.1162, + "step": 46045 + }, + { + "epoch": 0.3997013914809767, + "grad_norm": 0.55859375, + "learning_rate": 0.0011729865929972453, + "loss": 0.1104, + "step": 46046 + }, + { + "epoch": 0.3997100719611809, + "grad_norm": 0.5859375, + "learning_rate": 0.0011729569285298914, + "loss": 0.1279, + "step": 46047 + }, + { + "epoch": 0.39971875244138505, + "grad_norm": 0.33203125, + "learning_rate": 0.0011729272639827526, + "loss": 0.1426, + "step": 46048 + }, + { + "epoch": 0.39972743292158924, + "grad_norm": 0.34765625, + "learning_rate": 0.0011728975993558623, + "loss": 0.1123, + "step": 46049 + }, + { + "epoch": 0.3997361134017934, + "grad_norm": 0.92578125, + "learning_rate": 0.0011728679346492516, + "loss": 0.1309, + "step": 46050 + }, + { + "epoch": 0.3997447938819976, + "grad_norm": 0.1279296875, + "learning_rate": 0.001172838269862954, + "loss": 0.1064, + "step": 46051 + }, + { + "epoch": 0.3997534743622017, + "grad_norm": 0.07373046875, + "learning_rate": 0.0011728086049970015, + "loss": 0.0635, + "step": 46052 + }, + { + "epoch": 0.3997621548424059, + "grad_norm": 0.294921875, + "learning_rate": 0.0011727789400514266, + "loss": 0.085, + "step": 46053 + }, + { + "epoch": 0.39977083532261004, + "grad_norm": 0.234375, + "learning_rate": 0.0011727492750262618, + "loss": 0.1187, + "step": 46054 + }, + { + "epoch": 0.39977951580281423, + "grad_norm": 0.1279296875, + "learning_rate": 0.0011727196099215395, + "loss": 0.0986, + "step": 46055 + }, + { + "epoch": 0.39978819628301837, + "grad_norm": 0.267578125, + "learning_rate": 0.0011726899447372923, + "loss": 0.0967, + "step": 46056 + }, + { + "epoch": 0.39979687676322256, + "grad_norm": 0.11376953125, + "learning_rate": 0.001172660279473552, + "loss": 0.0693, + "step": 46057 + }, + { + 
"epoch": 0.3998055572434267, + "grad_norm": 0.302734375, + "learning_rate": 0.0011726306141303519, + "loss": 0.1689, + "step": 46058 + }, + { + "epoch": 0.3998142377236309, + "grad_norm": 0.2158203125, + "learning_rate": 0.0011726009487077237, + "loss": 0.0776, + "step": 46059 + }, + { + "epoch": 0.39982291820383503, + "grad_norm": 0.47265625, + "learning_rate": 0.0011725712832057005, + "loss": 0.0947, + "step": 46060 + }, + { + "epoch": 0.3998315986840392, + "grad_norm": 0.27734375, + "learning_rate": 0.0011725416176243144, + "loss": 0.1084, + "step": 46061 + }, + { + "epoch": 0.39984027916424336, + "grad_norm": 0.359375, + "learning_rate": 0.0011725119519635976, + "loss": 0.1084, + "step": 46062 + }, + { + "epoch": 0.39984895964444755, + "grad_norm": 0.671875, + "learning_rate": 0.0011724822862235831, + "loss": 0.1196, + "step": 46063 + }, + { + "epoch": 0.3998576401246517, + "grad_norm": 0.1357421875, + "learning_rate": 0.001172452620404303, + "loss": 0.0923, + "step": 46064 + }, + { + "epoch": 0.3998663206048559, + "grad_norm": 0.68359375, + "learning_rate": 0.0011724229545057897, + "loss": 0.1025, + "step": 46065 + }, + { + "epoch": 0.39987500108506, + "grad_norm": 0.32421875, + "learning_rate": 0.0011723932885280759, + "loss": 0.0845, + "step": 46066 + }, + { + "epoch": 0.3998836815652642, + "grad_norm": 0.4765625, + "learning_rate": 0.001172363622471194, + "loss": 0.1279, + "step": 46067 + }, + { + "epoch": 0.39989236204546835, + "grad_norm": 4.0, + "learning_rate": 0.001172333956335176, + "loss": 0.2256, + "step": 46068 + }, + { + "epoch": 0.39990104252567255, + "grad_norm": 0.10595703125, + "learning_rate": 0.0011723042901200548, + "loss": 0.1162, + "step": 46069 + }, + { + "epoch": 0.3999097230058767, + "grad_norm": 0.2333984375, + "learning_rate": 0.0011722746238258628, + "loss": 0.0762, + "step": 46070 + }, + { + "epoch": 0.3999184034860809, + "grad_norm": 0.16796875, + "learning_rate": 0.0011722449574526324, + "loss": 0.0801, + "step": 46071 + }, + { + "epoch": 0.399927083966285, + "grad_norm": 0.30078125, + "learning_rate": 0.0011722152910003958, + "loss": 0.083, + "step": 46072 + }, + { + "epoch": 0.3999357644464892, + "grad_norm": 0.2099609375, + "learning_rate": 0.001172185624469186, + "loss": 0.1064, + "step": 46073 + }, + { + "epoch": 0.39994444492669334, + "grad_norm": 0.267578125, + "learning_rate": 0.0011721559578590347, + "loss": 0.0505, + "step": 46074 + }, + { + "epoch": 0.39995312540689754, + "grad_norm": 0.69140625, + "learning_rate": 0.0011721262911699745, + "loss": 0.1201, + "step": 46075 + }, + { + "epoch": 0.3999618058871017, + "grad_norm": 0.51953125, + "learning_rate": 0.0011720966244020388, + "loss": 0.0703, + "step": 46076 + }, + { + "epoch": 0.39997048636730587, + "grad_norm": 0.349609375, + "learning_rate": 0.0011720669575552589, + "loss": 0.1201, + "step": 46077 + }, + { + "epoch": 0.39997916684751, + "grad_norm": 0.2060546875, + "learning_rate": 0.0011720372906296676, + "loss": 0.1074, + "step": 46078 + }, + { + "epoch": 0.3999878473277142, + "grad_norm": 0.11376953125, + "learning_rate": 0.0011720076236252975, + "loss": 0.1191, + "step": 46079 + }, + { + "epoch": 0.39999652780791833, + "grad_norm": 0.8671875, + "learning_rate": 0.001171977956542181, + "loss": 0.1221, + "step": 46080 + }, + { + "epoch": 0.40000520828812247, + "grad_norm": 0.265625, + "learning_rate": 0.0011719482893803503, + "loss": 0.0981, + "step": 46081 + }, + { + "epoch": 0.40001388876832666, + "grad_norm": 0.1767578125, + "learning_rate": 0.0011719186221398384, + "loss": 0.0776, 
+ "step": 46082 + }, + { + "epoch": 0.4000225692485308, + "grad_norm": 0.2021484375, + "learning_rate": 0.001171888954820677, + "loss": 0.1191, + "step": 46083 + }, + { + "epoch": 0.400031249728735, + "grad_norm": 0.33984375, + "learning_rate": 0.001171859287422899, + "loss": 0.0884, + "step": 46084 + }, + { + "epoch": 0.40003993020893913, + "grad_norm": 0.419921875, + "learning_rate": 0.0011718296199465371, + "loss": 0.1045, + "step": 46085 + }, + { + "epoch": 0.4000486106891433, + "grad_norm": 0.55078125, + "learning_rate": 0.001171799952391623, + "loss": 0.0747, + "step": 46086 + }, + { + "epoch": 0.40005729116934746, + "grad_norm": 0.1875, + "learning_rate": 0.0011717702847581896, + "loss": 0.0811, + "step": 46087 + }, + { + "epoch": 0.40006597164955165, + "grad_norm": 0.2099609375, + "learning_rate": 0.0011717406170462692, + "loss": 0.1182, + "step": 46088 + }, + { + "epoch": 0.4000746521297558, + "grad_norm": 0.1552734375, + "learning_rate": 0.0011717109492558947, + "loss": 0.1348, + "step": 46089 + }, + { + "epoch": 0.40008333260996, + "grad_norm": 0.1376953125, + "learning_rate": 0.001171681281387098, + "loss": 0.1602, + "step": 46090 + }, + { + "epoch": 0.4000920130901641, + "grad_norm": 0.2041015625, + "learning_rate": 0.001171651613439912, + "loss": 0.0923, + "step": 46091 + }, + { + "epoch": 0.4001006935703683, + "grad_norm": 0.93359375, + "learning_rate": 0.0011716219454143684, + "loss": 0.1162, + "step": 46092 + }, + { + "epoch": 0.40010937405057245, + "grad_norm": 0.2373046875, + "learning_rate": 0.0011715922773105002, + "loss": 0.0903, + "step": 46093 + }, + { + "epoch": 0.40011805453077665, + "grad_norm": 0.64453125, + "learning_rate": 0.00117156260912834, + "loss": 0.1426, + "step": 46094 + }, + { + "epoch": 0.4001267350109808, + "grad_norm": 0.09521484375, + "learning_rate": 0.0011715329408679201, + "loss": 0.084, + "step": 46095 + }, + { + "epoch": 0.400135415491185, + "grad_norm": 0.0947265625, + "learning_rate": 0.0011715032725292725, + "loss": 0.0874, + "step": 46096 + }, + { + "epoch": 0.4001440959713891, + "grad_norm": 0.2734375, + "learning_rate": 0.00117147360411243, + "loss": 0.127, + "step": 46097 + }, + { + "epoch": 0.4001527764515933, + "grad_norm": 0.2138671875, + "learning_rate": 0.0011714439356174252, + "loss": 0.1089, + "step": 46098 + }, + { + "epoch": 0.40016145693179744, + "grad_norm": 0.23828125, + "learning_rate": 0.0011714142670442905, + "loss": 0.1035, + "step": 46099 + }, + { + "epoch": 0.40017013741200164, + "grad_norm": 0.29296875, + "learning_rate": 0.001171384598393058, + "loss": 0.0923, + "step": 46100 + }, + { + "epoch": 0.4001788178922058, + "grad_norm": 0.47265625, + "learning_rate": 0.0011713549296637601, + "loss": 0.0806, + "step": 46101 + }, + { + "epoch": 0.40018749837240997, + "grad_norm": 0.1513671875, + "learning_rate": 0.0011713252608564302, + "loss": 0.1099, + "step": 46102 + }, + { + "epoch": 0.4001961788526141, + "grad_norm": 0.1025390625, + "learning_rate": 0.0011712955919711, + "loss": 0.0938, + "step": 46103 + }, + { + "epoch": 0.4002048593328183, + "grad_norm": 0.75, + "learning_rate": 0.0011712659230078018, + "loss": 0.1699, + "step": 46104 + }, + { + "epoch": 0.40021353981302243, + "grad_norm": 0.2412109375, + "learning_rate": 0.001171236253966568, + "loss": 0.0732, + "step": 46105 + }, + { + "epoch": 0.4002222202932266, + "grad_norm": 0.78515625, + "learning_rate": 0.0011712065848474315, + "loss": 0.1187, + "step": 46106 + }, + { + "epoch": 0.40023090077343076, + "grad_norm": 0.228515625, + "learning_rate": 
0.0011711769156504243, + "loss": 0.0962, + "step": 46107 + }, + { + "epoch": 0.40023958125363496, + "grad_norm": 0.1240234375, + "learning_rate": 0.0011711472463755796, + "loss": 0.0684, + "step": 46108 + }, + { + "epoch": 0.4002482617338391, + "grad_norm": 0.19140625, + "learning_rate": 0.0011711175770229292, + "loss": 0.1011, + "step": 46109 + }, + { + "epoch": 0.4002569422140433, + "grad_norm": 0.341796875, + "learning_rate": 0.0011710879075925054, + "loss": 0.0771, + "step": 46110 + }, + { + "epoch": 0.4002656226942474, + "grad_norm": 0.0693359375, + "learning_rate": 0.001171058238084341, + "loss": 0.0698, + "step": 46111 + }, + { + "epoch": 0.4002743031744516, + "grad_norm": 0.6875, + "learning_rate": 0.001171028568498469, + "loss": 0.1021, + "step": 46112 + }, + { + "epoch": 0.40028298365465576, + "grad_norm": 0.1083984375, + "learning_rate": 0.0011709988988349207, + "loss": 0.0938, + "step": 46113 + }, + { + "epoch": 0.40029166413485995, + "grad_norm": 0.447265625, + "learning_rate": 0.001170969229093729, + "loss": 0.1108, + "step": 46114 + }, + { + "epoch": 0.4003003446150641, + "grad_norm": 0.06494140625, + "learning_rate": 0.0011709395592749265, + "loss": 0.0898, + "step": 46115 + }, + { + "epoch": 0.4003090250952683, + "grad_norm": 0.267578125, + "learning_rate": 0.0011709098893785456, + "loss": 0.0713, + "step": 46116 + }, + { + "epoch": 0.4003177055754724, + "grad_norm": 0.275390625, + "learning_rate": 0.0011708802194046184, + "loss": 0.1533, + "step": 46117 + }, + { + "epoch": 0.4003263860556766, + "grad_norm": 0.306640625, + "learning_rate": 0.001170850549353178, + "loss": 0.0742, + "step": 46118 + }, + { + "epoch": 0.40033506653588075, + "grad_norm": 0.181640625, + "learning_rate": 0.0011708208792242565, + "loss": 0.1167, + "step": 46119 + }, + { + "epoch": 0.40034374701608494, + "grad_norm": 0.6484375, + "learning_rate": 0.0011707912090178866, + "loss": 0.0894, + "step": 46120 + }, + { + "epoch": 0.4003524274962891, + "grad_norm": 0.67578125, + "learning_rate": 0.0011707615387341, + "loss": 0.0835, + "step": 46121 + }, + { + "epoch": 0.40036110797649327, + "grad_norm": 0.61328125, + "learning_rate": 0.00117073186837293, + "loss": 0.1367, + "step": 46122 + }, + { + "epoch": 0.4003697884566974, + "grad_norm": 0.58984375, + "learning_rate": 0.0011707021979344083, + "loss": 0.1035, + "step": 46123 + }, + { + "epoch": 0.4003784689369016, + "grad_norm": 0.19140625, + "learning_rate": 0.001170672527418568, + "loss": 0.0786, + "step": 46124 + }, + { + "epoch": 0.40038714941710574, + "grad_norm": 0.419921875, + "learning_rate": 0.0011706428568254413, + "loss": 0.0996, + "step": 46125 + }, + { + "epoch": 0.40039582989730993, + "grad_norm": 0.9140625, + "learning_rate": 0.0011706131861550606, + "loss": 0.0908, + "step": 46126 + }, + { + "epoch": 0.40040451037751407, + "grad_norm": 0.29296875, + "learning_rate": 0.0011705835154074582, + "loss": 0.085, + "step": 46127 + }, + { + "epoch": 0.40041319085771826, + "grad_norm": 0.203125, + "learning_rate": 0.0011705538445826672, + "loss": 0.1279, + "step": 46128 + }, + { + "epoch": 0.4004218713379224, + "grad_norm": 0.0849609375, + "learning_rate": 0.0011705241736807192, + "loss": 0.0967, + "step": 46129 + }, + { + "epoch": 0.4004305518181266, + "grad_norm": 0.212890625, + "learning_rate": 0.0011704945027016472, + "loss": 0.1035, + "step": 46130 + }, + { + "epoch": 0.4004392322983307, + "grad_norm": 0.267578125, + "learning_rate": 0.0011704648316454831, + "loss": 0.084, + "step": 46131 + }, + { + "epoch": 0.4004479127785349, + "grad_norm": 
0.1171875, + "learning_rate": 0.00117043516051226, + "loss": 0.1074, + "step": 46132 + }, + { + "epoch": 0.40045659325873906, + "grad_norm": 0.125, + "learning_rate": 0.0011704054893020102, + "loss": 0.1143, + "step": 46133 + }, + { + "epoch": 0.40046527373894325, + "grad_norm": 0.166015625, + "learning_rate": 0.001170375818014766, + "loss": 0.0977, + "step": 46134 + }, + { + "epoch": 0.4004739542191474, + "grad_norm": 0.0986328125, + "learning_rate": 0.0011703461466505596, + "loss": 0.0742, + "step": 46135 + }, + { + "epoch": 0.4004826346993516, + "grad_norm": 0.25390625, + "learning_rate": 0.0011703164752094237, + "loss": 0.124, + "step": 46136 + }, + { + "epoch": 0.4004913151795557, + "grad_norm": 0.423828125, + "learning_rate": 0.001170286803691391, + "loss": 0.0908, + "step": 46137 + }, + { + "epoch": 0.4004999956597599, + "grad_norm": 0.251953125, + "learning_rate": 0.0011702571320964938, + "loss": 0.0908, + "step": 46138 + }, + { + "epoch": 0.40050867613996405, + "grad_norm": 1.078125, + "learning_rate": 0.0011702274604247643, + "loss": 0.1157, + "step": 46139 + }, + { + "epoch": 0.40051735662016824, + "grad_norm": 0.11767578125, + "learning_rate": 0.0011701977886762348, + "loss": 0.1206, + "step": 46140 + }, + { + "epoch": 0.4005260371003724, + "grad_norm": 0.166015625, + "learning_rate": 0.0011701681168509383, + "loss": 0.1338, + "step": 46141 + }, + { + "epoch": 0.40053471758057657, + "grad_norm": 0.6484375, + "learning_rate": 0.0011701384449489072, + "loss": 0.0898, + "step": 46142 + }, + { + "epoch": 0.4005433980607807, + "grad_norm": 0.390625, + "learning_rate": 0.0011701087729701735, + "loss": 0.1084, + "step": 46143 + }, + { + "epoch": 0.4005520785409849, + "grad_norm": 0.2265625, + "learning_rate": 0.0011700791009147697, + "loss": 0.1191, + "step": 46144 + }, + { + "epoch": 0.40056075902118904, + "grad_norm": 0.2060546875, + "learning_rate": 0.0011700494287827288, + "loss": 0.0894, + "step": 46145 + }, + { + "epoch": 0.40056943950139323, + "grad_norm": 0.36328125, + "learning_rate": 0.001170019756574083, + "loss": 0.1123, + "step": 46146 + }, + { + "epoch": 0.40057811998159737, + "grad_norm": 0.267578125, + "learning_rate": 0.0011699900842888644, + "loss": 0.1182, + "step": 46147 + }, + { + "epoch": 0.40058680046180156, + "grad_norm": 0.11376953125, + "learning_rate": 0.001169960411927106, + "loss": 0.083, + "step": 46148 + }, + { + "epoch": 0.4005954809420057, + "grad_norm": 0.1806640625, + "learning_rate": 0.0011699307394888395, + "loss": 0.0952, + "step": 46149 + }, + { + "epoch": 0.4006041614222099, + "grad_norm": 0.4765625, + "learning_rate": 0.001169901066974098, + "loss": 0.1201, + "step": 46150 + }, + { + "epoch": 0.40061284190241403, + "grad_norm": 0.82421875, + "learning_rate": 0.0011698713943829138, + "loss": 0.0967, + "step": 46151 + }, + { + "epoch": 0.4006215223826182, + "grad_norm": 0.6015625, + "learning_rate": 0.0011698417217153195, + "loss": 0.1016, + "step": 46152 + }, + { + "epoch": 0.40063020286282236, + "grad_norm": 0.640625, + "learning_rate": 0.0011698120489713468, + "loss": 0.1309, + "step": 46153 + }, + { + "epoch": 0.40063888334302655, + "grad_norm": 0.138671875, + "learning_rate": 0.001169782376151029, + "loss": 0.1162, + "step": 46154 + }, + { + "epoch": 0.4006475638232307, + "grad_norm": 0.1923828125, + "learning_rate": 0.001169752703254398, + "loss": 0.0913, + "step": 46155 + }, + { + "epoch": 0.4006562443034349, + "grad_norm": 0.62890625, + "learning_rate": 0.001169723030281487, + "loss": 0.1074, + "step": 46156 + }, + { + "epoch": 
0.400664924783639, + "grad_norm": 0.67578125, + "learning_rate": 0.0011696933572323276, + "loss": 0.1172, + "step": 46157 + }, + { + "epoch": 0.4006736052638432, + "grad_norm": 0.1875, + "learning_rate": 0.0011696636841069525, + "loss": 0.1226, + "step": 46158 + }, + { + "epoch": 0.40068228574404735, + "grad_norm": 0.0888671875, + "learning_rate": 0.0011696340109053946, + "loss": 0.1162, + "step": 46159 + }, + { + "epoch": 0.40069096622425154, + "grad_norm": 0.1435546875, + "learning_rate": 0.0011696043376276859, + "loss": 0.0986, + "step": 46160 + }, + { + "epoch": 0.4006996467044557, + "grad_norm": 0.11328125, + "learning_rate": 0.0011695746642738586, + "loss": 0.0713, + "step": 46161 + }, + { + "epoch": 0.4007083271846599, + "grad_norm": 0.2255859375, + "learning_rate": 0.0011695449908439457, + "loss": 0.0742, + "step": 46162 + }, + { + "epoch": 0.400717007664864, + "grad_norm": 0.2197265625, + "learning_rate": 0.0011695153173379796, + "loss": 0.0869, + "step": 46163 + }, + { + "epoch": 0.4007256881450682, + "grad_norm": 0.150390625, + "learning_rate": 0.0011694856437559921, + "loss": 0.0874, + "step": 46164 + }, + { + "epoch": 0.40073436862527234, + "grad_norm": 0.10595703125, + "learning_rate": 0.0011694559700980167, + "loss": 0.0972, + "step": 46165 + }, + { + "epoch": 0.40074304910547653, + "grad_norm": 0.1708984375, + "learning_rate": 0.0011694262963640849, + "loss": 0.1084, + "step": 46166 + }, + { + "epoch": 0.40075172958568067, + "grad_norm": 0.09033203125, + "learning_rate": 0.0011693966225542298, + "loss": 0.0645, + "step": 46167 + }, + { + "epoch": 0.40076041006588486, + "grad_norm": 0.193359375, + "learning_rate": 0.0011693669486684836, + "loss": 0.1025, + "step": 46168 + }, + { + "epoch": 0.400769090546089, + "grad_norm": 0.26171875, + "learning_rate": 0.0011693372747068787, + "loss": 0.126, + "step": 46169 + }, + { + "epoch": 0.4007777710262932, + "grad_norm": 0.23828125, + "learning_rate": 0.0011693076006694474, + "loss": 0.0928, + "step": 46170 + }, + { + "epoch": 0.40078645150649733, + "grad_norm": 0.478515625, + "learning_rate": 0.0011692779265562226, + "loss": 0.0864, + "step": 46171 + }, + { + "epoch": 0.4007951319867015, + "grad_norm": 0.1455078125, + "learning_rate": 0.0011692482523672366, + "loss": 0.0737, + "step": 46172 + }, + { + "epoch": 0.40080381246690566, + "grad_norm": 0.1572265625, + "learning_rate": 0.0011692185781025212, + "loss": 0.1152, + "step": 46173 + }, + { + "epoch": 0.40081249294710986, + "grad_norm": 0.19140625, + "learning_rate": 0.00116918890376211, + "loss": 0.1084, + "step": 46174 + }, + { + "epoch": 0.400821173427314, + "grad_norm": 0.314453125, + "learning_rate": 0.0011691592293460345, + "loss": 0.1108, + "step": 46175 + }, + { + "epoch": 0.4008298539075182, + "grad_norm": 0.640625, + "learning_rate": 0.0011691295548543277, + "loss": 0.1328, + "step": 46176 + }, + { + "epoch": 0.4008385343877223, + "grad_norm": 0.369140625, + "learning_rate": 0.0011690998802870221, + "loss": 0.0889, + "step": 46177 + }, + { + "epoch": 0.4008472148679265, + "grad_norm": 0.138671875, + "learning_rate": 0.0011690702056441494, + "loss": 0.0703, + "step": 46178 + }, + { + "epoch": 0.40085589534813065, + "grad_norm": 0.33984375, + "learning_rate": 0.0011690405309257425, + "loss": 0.0679, + "step": 46179 + }, + { + "epoch": 0.40086457582833485, + "grad_norm": 0.11083984375, + "learning_rate": 0.0011690108561318345, + "loss": 0.0889, + "step": 46180 + }, + { + "epoch": 0.400873256308539, + "grad_norm": 0.35546875, + "learning_rate": 0.0011689811812624568, + 
"loss": 0.1367, + "step": 46181 + }, + { + "epoch": 0.4008819367887432, + "grad_norm": 0.486328125, + "learning_rate": 0.0011689515063176426, + "loss": 0.1035, + "step": 46182 + }, + { + "epoch": 0.4008906172689473, + "grad_norm": 0.447265625, + "learning_rate": 0.001168921831297424, + "loss": 0.0928, + "step": 46183 + }, + { + "epoch": 0.4008992977491515, + "grad_norm": 0.416015625, + "learning_rate": 0.0011688921562018333, + "loss": 0.0796, + "step": 46184 + }, + { + "epoch": 0.40090797822935564, + "grad_norm": 0.1630859375, + "learning_rate": 0.0011688624810309032, + "loss": 0.1123, + "step": 46185 + }, + { + "epoch": 0.40091665870955984, + "grad_norm": 0.234375, + "learning_rate": 0.0011688328057846665, + "loss": 0.1074, + "step": 46186 + }, + { + "epoch": 0.400925339189764, + "grad_norm": 0.384765625, + "learning_rate": 0.0011688031304631548, + "loss": 0.1104, + "step": 46187 + }, + { + "epoch": 0.40093401966996817, + "grad_norm": 0.1328125, + "learning_rate": 0.0011687734550664013, + "loss": 0.0703, + "step": 46188 + }, + { + "epoch": 0.4009427001501723, + "grad_norm": 0.33984375, + "learning_rate": 0.0011687437795944381, + "loss": 0.0977, + "step": 46189 + }, + { + "epoch": 0.4009513806303765, + "grad_norm": 0.734375, + "learning_rate": 0.0011687141040472979, + "loss": 0.0923, + "step": 46190 + }, + { + "epoch": 0.40096006111058063, + "grad_norm": 0.3203125, + "learning_rate": 0.001168684428425013, + "loss": 0.1289, + "step": 46191 + }, + { + "epoch": 0.4009687415907848, + "grad_norm": 0.12353515625, + "learning_rate": 0.0011686547527276156, + "loss": 0.125, + "step": 46192 + }, + { + "epoch": 0.40097742207098896, + "grad_norm": 0.220703125, + "learning_rate": 0.0011686250769551384, + "loss": 0.1079, + "step": 46193 + }, + { + "epoch": 0.40098610255119316, + "grad_norm": 0.107421875, + "learning_rate": 0.0011685954011076138, + "loss": 0.1289, + "step": 46194 + }, + { + "epoch": 0.4009947830313973, + "grad_norm": 0.07958984375, + "learning_rate": 0.0011685657251850746, + "loss": 0.082, + "step": 46195 + }, + { + "epoch": 0.4010034635116015, + "grad_norm": 0.220703125, + "learning_rate": 0.0011685360491875527, + "loss": 0.0933, + "step": 46196 + }, + { + "epoch": 0.4010121439918056, + "grad_norm": 0.13671875, + "learning_rate": 0.001168506373115081, + "loss": 0.1104, + "step": 46197 + }, + { + "epoch": 0.4010208244720098, + "grad_norm": 0.45703125, + "learning_rate": 0.0011684766969676918, + "loss": 0.0864, + "step": 46198 + }, + { + "epoch": 0.40102950495221396, + "grad_norm": 0.306640625, + "learning_rate": 0.0011684470207454175, + "loss": 0.1011, + "step": 46199 + }, + { + "epoch": 0.40103818543241815, + "grad_norm": 0.1748046875, + "learning_rate": 0.0011684173444482901, + "loss": 0.1235, + "step": 46200 + }, + { + "epoch": 0.4010468659126223, + "grad_norm": 0.43359375, + "learning_rate": 0.0011683876680763428, + "loss": 0.0947, + "step": 46201 + }, + { + "epoch": 0.4010555463928265, + "grad_norm": 0.62109375, + "learning_rate": 0.001168357991629608, + "loss": 0.0884, + "step": 46202 + }, + { + "epoch": 0.4010642268730306, + "grad_norm": 0.2353515625, + "learning_rate": 0.0011683283151081177, + "loss": 0.105, + "step": 46203 + }, + { + "epoch": 0.40107290735323475, + "grad_norm": 0.310546875, + "learning_rate": 0.0011682986385119047, + "loss": 0.1191, + "step": 46204 + }, + { + "epoch": 0.40108158783343895, + "grad_norm": 0.357421875, + "learning_rate": 0.0011682689618410012, + "loss": 0.1074, + "step": 46205 + }, + { + "epoch": 0.4010902683136431, + "grad_norm": 0.54296875, + 
"learning_rate": 0.00116823928509544, + "loss": 0.1133, + "step": 46206 + }, + { + "epoch": 0.4010989487938473, + "grad_norm": 0.1474609375, + "learning_rate": 0.0011682096082752532, + "loss": 0.0723, + "step": 46207 + }, + { + "epoch": 0.4011076292740514, + "grad_norm": 0.3671875, + "learning_rate": 0.0011681799313804735, + "loss": 0.1455, + "step": 46208 + }, + { + "epoch": 0.4011163097542556, + "grad_norm": 0.40625, + "learning_rate": 0.0011681502544111332, + "loss": 0.1357, + "step": 46209 + }, + { + "epoch": 0.40112499023445974, + "grad_norm": 0.10595703125, + "learning_rate": 0.0011681205773672648, + "loss": 0.124, + "step": 46210 + }, + { + "epoch": 0.40113367071466394, + "grad_norm": 0.201171875, + "learning_rate": 0.0011680909002489005, + "loss": 0.0957, + "step": 46211 + }, + { + "epoch": 0.4011423511948681, + "grad_norm": 0.234375, + "learning_rate": 0.0011680612230560731, + "loss": 0.1079, + "step": 46212 + }, + { + "epoch": 0.40115103167507227, + "grad_norm": 0.435546875, + "learning_rate": 0.0011680315457888151, + "loss": 0.1001, + "step": 46213 + }, + { + "epoch": 0.4011597121552764, + "grad_norm": 0.609375, + "learning_rate": 0.001168001868447159, + "loss": 0.1328, + "step": 46214 + }, + { + "epoch": 0.4011683926354806, + "grad_norm": 0.236328125, + "learning_rate": 0.0011679721910311366, + "loss": 0.0938, + "step": 46215 + }, + { + "epoch": 0.40117707311568473, + "grad_norm": 0.37109375, + "learning_rate": 0.0011679425135407815, + "loss": 0.0928, + "step": 46216 + }, + { + "epoch": 0.4011857535958889, + "grad_norm": 0.51171875, + "learning_rate": 0.0011679128359761252, + "loss": 0.1108, + "step": 46217 + }, + { + "epoch": 0.40119443407609306, + "grad_norm": 0.1669921875, + "learning_rate": 0.0011678831583372003, + "loss": 0.1113, + "step": 46218 + }, + { + "epoch": 0.40120311455629726, + "grad_norm": 0.294921875, + "learning_rate": 0.0011678534806240394, + "loss": 0.1387, + "step": 46219 + }, + { + "epoch": 0.4012117950365014, + "grad_norm": 0.0927734375, + "learning_rate": 0.001167823802836675, + "loss": 0.1143, + "step": 46220 + }, + { + "epoch": 0.4012204755167056, + "grad_norm": 0.1611328125, + "learning_rate": 0.0011677941249751396, + "loss": 0.0894, + "step": 46221 + }, + { + "epoch": 0.4012291559969097, + "grad_norm": 0.09912109375, + "learning_rate": 0.0011677644470394655, + "loss": 0.0864, + "step": 46222 + }, + { + "epoch": 0.4012378364771139, + "grad_norm": 0.8125, + "learning_rate": 0.0011677347690296851, + "loss": 0.1172, + "step": 46223 + }, + { + "epoch": 0.40124651695731806, + "grad_norm": 0.1875, + "learning_rate": 0.0011677050909458314, + "loss": 0.1289, + "step": 46224 + }, + { + "epoch": 0.40125519743752225, + "grad_norm": 0.0576171875, + "learning_rate": 0.0011676754127879362, + "loss": 0.0786, + "step": 46225 + }, + { + "epoch": 0.4012638779177264, + "grad_norm": 0.177734375, + "learning_rate": 0.0011676457345560318, + "loss": 0.1187, + "step": 46226 + }, + { + "epoch": 0.4012725583979306, + "grad_norm": 0.1103515625, + "learning_rate": 0.0011676160562501512, + "loss": 0.1064, + "step": 46227 + }, + { + "epoch": 0.4012812388781347, + "grad_norm": 0.21484375, + "learning_rate": 0.001167586377870327, + "loss": 0.126, + "step": 46228 + }, + { + "epoch": 0.4012899193583389, + "grad_norm": 0.314453125, + "learning_rate": 0.0011675566994165912, + "loss": 0.0996, + "step": 46229 + }, + { + "epoch": 0.40129859983854305, + "grad_norm": 0.10888671875, + "learning_rate": 0.0011675270208889763, + "loss": 0.0806, + "step": 46230 + }, + { + "epoch": 
0.40130728031874724, + "grad_norm": 0.201171875, + "learning_rate": 0.001167497342287515, + "loss": 0.0972, + "step": 46231 + }, + { + "epoch": 0.4013159607989514, + "grad_norm": 0.271484375, + "learning_rate": 0.0011674676636122398, + "loss": 0.1182, + "step": 46232 + }, + { + "epoch": 0.40132464127915557, + "grad_norm": 0.1513671875, + "learning_rate": 0.0011674379848631827, + "loss": 0.0977, + "step": 46233 + }, + { + "epoch": 0.4013333217593597, + "grad_norm": 0.2265625, + "learning_rate": 0.0011674083060403764, + "loss": 0.1006, + "step": 46234 + }, + { + "epoch": 0.4013420022395639, + "grad_norm": 0.2177734375, + "learning_rate": 0.0011673786271438535, + "loss": 0.0879, + "step": 46235 + }, + { + "epoch": 0.40135068271976804, + "grad_norm": 0.41015625, + "learning_rate": 0.0011673489481736463, + "loss": 0.0928, + "step": 46236 + }, + { + "epoch": 0.40135936319997223, + "grad_norm": 0.50390625, + "learning_rate": 0.0011673192691297877, + "loss": 0.0889, + "step": 46237 + }, + { + "epoch": 0.40136804368017637, + "grad_norm": 0.193359375, + "learning_rate": 0.0011672895900123094, + "loss": 0.1094, + "step": 46238 + }, + { + "epoch": 0.40137672416038056, + "grad_norm": 0.40625, + "learning_rate": 0.0011672599108212439, + "loss": 0.0957, + "step": 46239 + }, + { + "epoch": 0.4013854046405847, + "grad_norm": 0.326171875, + "learning_rate": 0.0011672302315566244, + "loss": 0.085, + "step": 46240 + }, + { + "epoch": 0.4013940851207889, + "grad_norm": 0.234375, + "learning_rate": 0.0011672005522184825, + "loss": 0.0972, + "step": 46241 + }, + { + "epoch": 0.40140276560099303, + "grad_norm": 0.10107421875, + "learning_rate": 0.0011671708728068517, + "loss": 0.0737, + "step": 46242 + }, + { + "epoch": 0.4014114460811972, + "grad_norm": 0.10302734375, + "learning_rate": 0.0011671411933217636, + "loss": 0.0967, + "step": 46243 + }, + { + "epoch": 0.40142012656140136, + "grad_norm": 0.24609375, + "learning_rate": 0.0011671115137632507, + "loss": 0.0869, + "step": 46244 + }, + { + "epoch": 0.40142880704160555, + "grad_norm": 0.08642578125, + "learning_rate": 0.0011670818341313455, + "loss": 0.0752, + "step": 46245 + }, + { + "epoch": 0.4014374875218097, + "grad_norm": 0.5, + "learning_rate": 0.0011670521544260813, + "loss": 0.1167, + "step": 46246 + }, + { + "epoch": 0.4014461680020139, + "grad_norm": 0.33984375, + "learning_rate": 0.0011670224746474893, + "loss": 0.0815, + "step": 46247 + }, + { + "epoch": 0.401454848482218, + "grad_norm": 0.2314453125, + "learning_rate": 0.0011669927947956028, + "loss": 0.0703, + "step": 46248 + }, + { + "epoch": 0.4014635289624222, + "grad_norm": 0.220703125, + "learning_rate": 0.0011669631148704539, + "loss": 0.0854, + "step": 46249 + }, + { + "epoch": 0.40147220944262635, + "grad_norm": 0.1220703125, + "learning_rate": 0.001166933434872075, + "loss": 0.0957, + "step": 46250 + }, + { + "epoch": 0.40148088992283054, + "grad_norm": 0.099609375, + "learning_rate": 0.001166903754800499, + "loss": 0.125, + "step": 46251 + }, + { + "epoch": 0.4014895704030347, + "grad_norm": 0.29296875, + "learning_rate": 0.0011668740746557577, + "loss": 0.0801, + "step": 46252 + }, + { + "epoch": 0.40149825088323887, + "grad_norm": 0.609375, + "learning_rate": 0.0011668443944378845, + "loss": 0.1279, + "step": 46253 + }, + { + "epoch": 0.401506931363443, + "grad_norm": 0.07421875, + "learning_rate": 0.0011668147141469108, + "loss": 0.0874, + "step": 46254 + }, + { + "epoch": 0.4015156118436472, + "grad_norm": 0.30078125, + "learning_rate": 0.0011667850337828696, + "loss": 0.0967, + 
"step": 46255 + }, + { + "epoch": 0.40152429232385134, + "grad_norm": 0.08154296875, + "learning_rate": 0.0011667553533457934, + "loss": 0.0854, + "step": 46256 + }, + { + "epoch": 0.40153297280405553, + "grad_norm": 0.54296875, + "learning_rate": 0.0011667256728357142, + "loss": 0.1157, + "step": 46257 + }, + { + "epoch": 0.40154165328425967, + "grad_norm": 0.09130859375, + "learning_rate": 0.0011666959922526652, + "loss": 0.0913, + "step": 46258 + }, + { + "epoch": 0.40155033376446386, + "grad_norm": 0.28515625, + "learning_rate": 0.0011666663115966784, + "loss": 0.0986, + "step": 46259 + }, + { + "epoch": 0.401559014244668, + "grad_norm": 0.1005859375, + "learning_rate": 0.0011666366308677864, + "loss": 0.0977, + "step": 46260 + }, + { + "epoch": 0.4015676947248722, + "grad_norm": 0.58984375, + "learning_rate": 0.001166606950066021, + "loss": 0.0889, + "step": 46261 + }, + { + "epoch": 0.40157637520507633, + "grad_norm": 0.1572265625, + "learning_rate": 0.001166577269191416, + "loss": 0.084, + "step": 46262 + }, + { + "epoch": 0.4015850556852805, + "grad_norm": 0.57421875, + "learning_rate": 0.0011665475882440025, + "loss": 0.1045, + "step": 46263 + }, + { + "epoch": 0.40159373616548466, + "grad_norm": 0.20703125, + "learning_rate": 0.001166517907223814, + "loss": 0.1045, + "step": 46264 + }, + { + "epoch": 0.40160241664568885, + "grad_norm": 0.126953125, + "learning_rate": 0.0011664882261308823, + "loss": 0.0942, + "step": 46265 + }, + { + "epoch": 0.401611097125893, + "grad_norm": 0.11767578125, + "learning_rate": 0.00116645854496524, + "loss": 0.1221, + "step": 46266 + }, + { + "epoch": 0.4016197776060972, + "grad_norm": 0.12109375, + "learning_rate": 0.00116642886372692, + "loss": 0.1094, + "step": 46267 + }, + { + "epoch": 0.4016284580863013, + "grad_norm": 0.13671875, + "learning_rate": 0.0011663991824159541, + "loss": 0.1099, + "step": 46268 + }, + { + "epoch": 0.4016371385665055, + "grad_norm": 0.189453125, + "learning_rate": 0.0011663695010323749, + "loss": 0.1094, + "step": 46269 + }, + { + "epoch": 0.40164581904670965, + "grad_norm": 0.1103515625, + "learning_rate": 0.0011663398195762154, + "loss": 0.0801, + "step": 46270 + }, + { + "epoch": 0.40165449952691384, + "grad_norm": 0.1064453125, + "learning_rate": 0.0011663101380475075, + "loss": 0.1064, + "step": 46271 + }, + { + "epoch": 0.401663180007118, + "grad_norm": 0.125, + "learning_rate": 0.001166280456446284, + "loss": 0.1055, + "step": 46272 + }, + { + "epoch": 0.4016718604873222, + "grad_norm": 0.369140625, + "learning_rate": 0.001166250774772577, + "loss": 0.1494, + "step": 46273 + }, + { + "epoch": 0.4016805409675263, + "grad_norm": 0.828125, + "learning_rate": 0.0011662210930264189, + "loss": 0.1221, + "step": 46274 + }, + { + "epoch": 0.4016892214477305, + "grad_norm": 0.302734375, + "learning_rate": 0.0011661914112078427, + "loss": 0.1191, + "step": 46275 + }, + { + "epoch": 0.40169790192793464, + "grad_norm": 0.228515625, + "learning_rate": 0.0011661617293168805, + "loss": 0.125, + "step": 46276 + }, + { + "epoch": 0.40170658240813883, + "grad_norm": 0.369140625, + "learning_rate": 0.0011661320473535651, + "loss": 0.1484, + "step": 46277 + }, + { + "epoch": 0.40171526288834297, + "grad_norm": 0.46484375, + "learning_rate": 0.001166102365317928, + "loss": 0.1211, + "step": 46278 + }, + { + "epoch": 0.40172394336854717, + "grad_norm": 0.96484375, + "learning_rate": 0.001166072683210003, + "loss": 0.1025, + "step": 46279 + }, + { + "epoch": 0.4017326238487513, + "grad_norm": 0.1171875, + "learning_rate": 
0.0011660430010298217, + "loss": 0.1196, + "step": 46280 + }, + { + "epoch": 0.4017413043289555, + "grad_norm": 0.224609375, + "learning_rate": 0.0011660133187774166, + "loss": 0.124, + "step": 46281 + }, + { + "epoch": 0.40174998480915963, + "grad_norm": 0.328125, + "learning_rate": 0.0011659836364528206, + "loss": 0.0781, + "step": 46282 + }, + { + "epoch": 0.4017586652893638, + "grad_norm": 0.1748046875, + "learning_rate": 0.0011659539540560657, + "loss": 0.083, + "step": 46283 + }, + { + "epoch": 0.40176734576956796, + "grad_norm": 0.1142578125, + "learning_rate": 0.0011659242715871846, + "loss": 0.0811, + "step": 46284 + }, + { + "epoch": 0.40177602624977216, + "grad_norm": 0.09912109375, + "learning_rate": 0.0011658945890462098, + "loss": 0.0713, + "step": 46285 + }, + { + "epoch": 0.4017847067299763, + "grad_norm": 0.2421875, + "learning_rate": 0.001165864906433174, + "loss": 0.082, + "step": 46286 + }, + { + "epoch": 0.4017933872101805, + "grad_norm": 0.435546875, + "learning_rate": 0.0011658352237481084, + "loss": 0.1191, + "step": 46287 + }, + { + "epoch": 0.4018020676903846, + "grad_norm": 0.44140625, + "learning_rate": 0.0011658055409910473, + "loss": 0.0732, + "step": 46288 + }, + { + "epoch": 0.4018107481705888, + "grad_norm": 0.51171875, + "learning_rate": 0.0011657758581620214, + "loss": 0.126, + "step": 46289 + }, + { + "epoch": 0.40181942865079295, + "grad_norm": 0.306640625, + "learning_rate": 0.0011657461752610648, + "loss": 0.061, + "step": 46290 + }, + { + "epoch": 0.40182810913099715, + "grad_norm": 0.296875, + "learning_rate": 0.0011657164922882087, + "loss": 0.0688, + "step": 46291 + }, + { + "epoch": 0.4018367896112013, + "grad_norm": 0.3125, + "learning_rate": 0.0011656868092434862, + "loss": 0.1094, + "step": 46292 + }, + { + "epoch": 0.4018454700914055, + "grad_norm": 0.1806640625, + "learning_rate": 0.0011656571261269295, + "loss": 0.0796, + "step": 46293 + }, + { + "epoch": 0.4018541505716096, + "grad_norm": 0.6171875, + "learning_rate": 0.0011656274429385713, + "loss": 0.1084, + "step": 46294 + }, + { + "epoch": 0.4018628310518138, + "grad_norm": 0.447265625, + "learning_rate": 0.001165597759678444, + "loss": 0.0942, + "step": 46295 + }, + { + "epoch": 0.40187151153201794, + "grad_norm": 0.470703125, + "learning_rate": 0.00116556807634658, + "loss": 0.0928, + "step": 46296 + }, + { + "epoch": 0.40188019201222214, + "grad_norm": 0.31640625, + "learning_rate": 0.0011655383929430113, + "loss": 0.0718, + "step": 46297 + }, + { + "epoch": 0.4018888724924263, + "grad_norm": 0.2373046875, + "learning_rate": 0.001165508709467771, + "loss": 0.0962, + "step": 46298 + }, + { + "epoch": 0.40189755297263047, + "grad_norm": 0.146484375, + "learning_rate": 0.0011654790259208914, + "loss": 0.1172, + "step": 46299 + }, + { + "epoch": 0.4019062334528346, + "grad_norm": 0.42578125, + "learning_rate": 0.0011654493423024048, + "loss": 0.0815, + "step": 46300 + }, + { + "epoch": 0.4019149139330388, + "grad_norm": 0.259765625, + "learning_rate": 0.001165419658612344, + "loss": 0.0928, + "step": 46301 + }, + { + "epoch": 0.40192359441324294, + "grad_norm": 0.5078125, + "learning_rate": 0.0011653899748507411, + "loss": 0.1318, + "step": 46302 + }, + { + "epoch": 0.40193227489344713, + "grad_norm": 0.10546875, + "learning_rate": 0.001165360291017629, + "loss": 0.0996, + "step": 46303 + }, + { + "epoch": 0.40194095537365127, + "grad_norm": 0.404296875, + "learning_rate": 0.0011653306071130393, + "loss": 0.0854, + "step": 46304 + }, + { + "epoch": 0.40194963585385546, + "grad_norm": 
0.1953125, + "learning_rate": 0.0011653009231370056, + "loss": 0.1377, + "step": 46305 + }, + { + "epoch": 0.4019583163340596, + "grad_norm": 0.310546875, + "learning_rate": 0.0011652712390895594, + "loss": 0.0776, + "step": 46306 + }, + { + "epoch": 0.4019669968142638, + "grad_norm": 0.177734375, + "learning_rate": 0.0011652415549707339, + "loss": 0.0684, + "step": 46307 + }, + { + "epoch": 0.4019756772944679, + "grad_norm": 0.1142578125, + "learning_rate": 0.0011652118707805607, + "loss": 0.0811, + "step": 46308 + }, + { + "epoch": 0.4019843577746721, + "grad_norm": 0.412109375, + "learning_rate": 0.0011651821865190733, + "loss": 0.0898, + "step": 46309 + }, + { + "epoch": 0.40199303825487626, + "grad_norm": 0.453125, + "learning_rate": 0.0011651525021863034, + "loss": 0.0747, + "step": 46310 + }, + { + "epoch": 0.40200171873508045, + "grad_norm": 0.1416015625, + "learning_rate": 0.0011651228177822837, + "loss": 0.1182, + "step": 46311 + }, + { + "epoch": 0.4020103992152846, + "grad_norm": 0.2255859375, + "learning_rate": 0.001165093133307047, + "loss": 0.0762, + "step": 46312 + }, + { + "epoch": 0.4020190796954888, + "grad_norm": 0.330078125, + "learning_rate": 0.0011650634487606248, + "loss": 0.1123, + "step": 46313 + }, + { + "epoch": 0.4020277601756929, + "grad_norm": 0.09423828125, + "learning_rate": 0.0011650337641430507, + "loss": 0.0898, + "step": 46314 + }, + { + "epoch": 0.4020364406558971, + "grad_norm": 0.35546875, + "learning_rate": 0.0011650040794543566, + "loss": 0.1245, + "step": 46315 + }, + { + "epoch": 0.40204512113610125, + "grad_norm": 0.357421875, + "learning_rate": 0.0011649743946945748, + "loss": 0.1045, + "step": 46316 + }, + { + "epoch": 0.40205380161630544, + "grad_norm": 0.255859375, + "learning_rate": 0.001164944709863738, + "loss": 0.127, + "step": 46317 + }, + { + "epoch": 0.4020624820965096, + "grad_norm": 0.146484375, + "learning_rate": 0.0011649150249618786, + "loss": 0.1108, + "step": 46318 + }, + { + "epoch": 0.40207116257671377, + "grad_norm": 0.26171875, + "learning_rate": 0.0011648853399890292, + "loss": 0.1484, + "step": 46319 + }, + { + "epoch": 0.4020798430569179, + "grad_norm": 0.271484375, + "learning_rate": 0.001164855654945222, + "loss": 0.0952, + "step": 46320 + }, + { + "epoch": 0.4020885235371221, + "grad_norm": 0.19921875, + "learning_rate": 0.00116482596983049, + "loss": 0.083, + "step": 46321 + }, + { + "epoch": 0.40209720401732624, + "grad_norm": 0.330078125, + "learning_rate": 0.001164796284644865, + "loss": 0.1187, + "step": 46322 + }, + { + "epoch": 0.40210588449753043, + "grad_norm": 0.21875, + "learning_rate": 0.00116476659938838, + "loss": 0.0811, + "step": 46323 + }, + { + "epoch": 0.40211456497773457, + "grad_norm": 0.353515625, + "learning_rate": 0.001164736914061067, + "loss": 0.0869, + "step": 46324 + }, + { + "epoch": 0.40212324545793876, + "grad_norm": 0.15234375, + "learning_rate": 0.001164707228662959, + "loss": 0.0786, + "step": 46325 + }, + { + "epoch": 0.4021319259381429, + "grad_norm": 0.5703125, + "learning_rate": 0.0011646775431940878, + "loss": 0.1001, + "step": 46326 + }, + { + "epoch": 0.40214060641834704, + "grad_norm": 0.53125, + "learning_rate": 0.001164647857654486, + "loss": 0.168, + "step": 46327 + }, + { + "epoch": 0.40214928689855123, + "grad_norm": 0.1630859375, + "learning_rate": 0.001164618172044187, + "loss": 0.1328, + "step": 46328 + }, + { + "epoch": 0.40215796737875537, + "grad_norm": 0.27734375, + "learning_rate": 0.001164588486363222, + "loss": 0.0767, + "step": 46329 + }, + { + "epoch": 
0.40216664785895956, + "grad_norm": 0.10302734375, + "learning_rate": 0.0011645588006116239, + "loss": 0.1162, + "step": 46330 + }, + { + "epoch": 0.4021753283391637, + "grad_norm": 0.171875, + "learning_rate": 0.0011645291147894257, + "loss": 0.0879, + "step": 46331 + }, + { + "epoch": 0.4021840088193679, + "grad_norm": 0.22265625, + "learning_rate": 0.0011644994288966593, + "loss": 0.1182, + "step": 46332 + }, + { + "epoch": 0.402192689299572, + "grad_norm": 0.2421875, + "learning_rate": 0.0011644697429333572, + "loss": 0.1133, + "step": 46333 + }, + { + "epoch": 0.4022013697797762, + "grad_norm": 0.1181640625, + "learning_rate": 0.001164440056899552, + "loss": 0.1157, + "step": 46334 + }, + { + "epoch": 0.40221005025998036, + "grad_norm": 0.20703125, + "learning_rate": 0.001164410370795276, + "loss": 0.0898, + "step": 46335 + }, + { + "epoch": 0.40221873074018455, + "grad_norm": 0.30078125, + "learning_rate": 0.001164380684620562, + "loss": 0.1348, + "step": 46336 + }, + { + "epoch": 0.4022274112203887, + "grad_norm": 0.244140625, + "learning_rate": 0.0011643509983754422, + "loss": 0.1035, + "step": 46337 + }, + { + "epoch": 0.4022360917005929, + "grad_norm": 0.306640625, + "learning_rate": 0.001164321312059949, + "loss": 0.1021, + "step": 46338 + }, + { + "epoch": 0.402244772180797, + "grad_norm": 0.1162109375, + "learning_rate": 0.001164291625674115, + "loss": 0.1221, + "step": 46339 + }, + { + "epoch": 0.4022534526610012, + "grad_norm": 0.126953125, + "learning_rate": 0.0011642619392179725, + "loss": 0.1084, + "step": 46340 + }, + { + "epoch": 0.40226213314120535, + "grad_norm": 0.1025390625, + "learning_rate": 0.0011642322526915546, + "loss": 0.1055, + "step": 46341 + }, + { + "epoch": 0.40227081362140954, + "grad_norm": 0.09765625, + "learning_rate": 0.0011642025660948928, + "loss": 0.0776, + "step": 46342 + }, + { + "epoch": 0.4022794941016137, + "grad_norm": 0.263671875, + "learning_rate": 0.0011641728794280202, + "loss": 0.1084, + "step": 46343 + }, + { + "epoch": 0.40228817458181787, + "grad_norm": 0.36328125, + "learning_rate": 0.001164143192690969, + "loss": 0.0732, + "step": 46344 + }, + { + "epoch": 0.402296855062022, + "grad_norm": 0.126953125, + "learning_rate": 0.0011641135058837722, + "loss": 0.0981, + "step": 46345 + }, + { + "epoch": 0.4023055355422262, + "grad_norm": 0.1484375, + "learning_rate": 0.0011640838190064614, + "loss": 0.127, + "step": 46346 + }, + { + "epoch": 0.40231421602243034, + "grad_norm": 0.2578125, + "learning_rate": 0.0011640541320590695, + "loss": 0.082, + "step": 46347 + }, + { + "epoch": 0.40232289650263453, + "grad_norm": 0.27734375, + "learning_rate": 0.001164024445041629, + "loss": 0.127, + "step": 46348 + }, + { + "epoch": 0.40233157698283867, + "grad_norm": 0.291015625, + "learning_rate": 0.0011639947579541723, + "loss": 0.1113, + "step": 46349 + }, + { + "epoch": 0.40234025746304286, + "grad_norm": 0.318359375, + "learning_rate": 0.0011639650707967323, + "loss": 0.1187, + "step": 46350 + }, + { + "epoch": 0.402348937943247, + "grad_norm": 0.2255859375, + "learning_rate": 0.0011639353835693404, + "loss": 0.0728, + "step": 46351 + }, + { + "epoch": 0.4023576184234512, + "grad_norm": 0.1376953125, + "learning_rate": 0.00116390569627203, + "loss": 0.0938, + "step": 46352 + }, + { + "epoch": 0.40236629890365533, + "grad_norm": 0.255859375, + "learning_rate": 0.0011638760089048336, + "loss": 0.1064, + "step": 46353 + }, + { + "epoch": 0.4023749793838595, + "grad_norm": 0.37890625, + "learning_rate": 0.0011638463214677832, + "loss": 0.1099, + 
"step": 46354 + }, + { + "epoch": 0.40238365986406366, + "grad_norm": 0.2265625, + "learning_rate": 0.0011638166339609114, + "loss": 0.1611, + "step": 46355 + }, + { + "epoch": 0.40239234034426785, + "grad_norm": 0.109375, + "learning_rate": 0.0011637869463842505, + "loss": 0.0854, + "step": 46356 + }, + { + "epoch": 0.402401020824472, + "grad_norm": 0.365234375, + "learning_rate": 0.001163757258737833, + "loss": 0.124, + "step": 46357 + }, + { + "epoch": 0.4024097013046762, + "grad_norm": 0.2470703125, + "learning_rate": 0.001163727571021692, + "loss": 0.0957, + "step": 46358 + }, + { + "epoch": 0.4024183817848803, + "grad_norm": 0.158203125, + "learning_rate": 0.0011636978832358594, + "loss": 0.1309, + "step": 46359 + }, + { + "epoch": 0.4024270622650845, + "grad_norm": 0.2060546875, + "learning_rate": 0.0011636681953803673, + "loss": 0.1035, + "step": 46360 + }, + { + "epoch": 0.40243574274528865, + "grad_norm": 0.298828125, + "learning_rate": 0.001163638507455249, + "loss": 0.0933, + "step": 46361 + }, + { + "epoch": 0.40244442322549284, + "grad_norm": 0.337890625, + "learning_rate": 0.0011636088194605368, + "loss": 0.1147, + "step": 46362 + }, + { + "epoch": 0.402453103705697, + "grad_norm": 0.1826171875, + "learning_rate": 0.0011635791313962627, + "loss": 0.0967, + "step": 46363 + }, + { + "epoch": 0.4024617841859012, + "grad_norm": 0.173828125, + "learning_rate": 0.0011635494432624593, + "loss": 0.1387, + "step": 46364 + }, + { + "epoch": 0.4024704646661053, + "grad_norm": 0.48046875, + "learning_rate": 0.0011635197550591593, + "loss": 0.1226, + "step": 46365 + }, + { + "epoch": 0.4024791451463095, + "grad_norm": 0.1904296875, + "learning_rate": 0.001163490066786395, + "loss": 0.0854, + "step": 46366 + }, + { + "epoch": 0.40248782562651364, + "grad_norm": 0.455078125, + "learning_rate": 0.001163460378444199, + "loss": 0.1182, + "step": 46367 + }, + { + "epoch": 0.40249650610671783, + "grad_norm": 0.87109375, + "learning_rate": 0.0011634306900326038, + "loss": 0.0918, + "step": 46368 + }, + { + "epoch": 0.40250518658692197, + "grad_norm": 0.5078125, + "learning_rate": 0.0011634010015516415, + "loss": 0.1025, + "step": 46369 + }, + { + "epoch": 0.40251386706712616, + "grad_norm": 0.416015625, + "learning_rate": 0.0011633713130013447, + "loss": 0.1182, + "step": 46370 + }, + { + "epoch": 0.4025225475473303, + "grad_norm": 0.3359375, + "learning_rate": 0.0011633416243817464, + "loss": 0.0991, + "step": 46371 + }, + { + "epoch": 0.4025312280275345, + "grad_norm": 0.181640625, + "learning_rate": 0.0011633119356928784, + "loss": 0.1045, + "step": 46372 + }, + { + "epoch": 0.40253990850773863, + "grad_norm": 0.921875, + "learning_rate": 0.0011632822469347737, + "loss": 0.1104, + "step": 46373 + }, + { + "epoch": 0.4025485889879428, + "grad_norm": 0.447265625, + "learning_rate": 0.0011632525581074641, + "loss": 0.0845, + "step": 46374 + }, + { + "epoch": 0.40255726946814696, + "grad_norm": 0.328125, + "learning_rate": 0.0011632228692109825, + "loss": 0.1445, + "step": 46375 + }, + { + "epoch": 0.40256594994835115, + "grad_norm": 0.29296875, + "learning_rate": 0.0011631931802453615, + "loss": 0.0879, + "step": 46376 + }, + { + "epoch": 0.4025746304285553, + "grad_norm": 0.080078125, + "learning_rate": 0.0011631634912106332, + "loss": 0.0928, + "step": 46377 + }, + { + "epoch": 0.4025833109087595, + "grad_norm": 0.2578125, + "learning_rate": 0.0011631338021068302, + "loss": 0.0854, + "step": 46378 + }, + { + "epoch": 0.4025919913889636, + "grad_norm": 0.412109375, + "learning_rate": 
0.0011631041129339856, + "loss": 0.0728, + "step": 46379 + }, + { + "epoch": 0.4026006718691678, + "grad_norm": 1.046875, + "learning_rate": 0.0011630744236921307, + "loss": 0.1084, + "step": 46380 + }, + { + "epoch": 0.40260935234937195, + "grad_norm": 0.189453125, + "learning_rate": 0.0011630447343812988, + "loss": 0.0918, + "step": 46381 + }, + { + "epoch": 0.40261803282957614, + "grad_norm": 0.61328125, + "learning_rate": 0.0011630150450015221, + "loss": 0.0938, + "step": 46382 + }, + { + "epoch": 0.4026267133097803, + "grad_norm": 0.2001953125, + "learning_rate": 0.001162985355552833, + "loss": 0.0801, + "step": 46383 + }, + { + "epoch": 0.4026353937899845, + "grad_norm": 0.2041015625, + "learning_rate": 0.0011629556660352643, + "loss": 0.1147, + "step": 46384 + }, + { + "epoch": 0.4026440742701886, + "grad_norm": 0.466796875, + "learning_rate": 0.0011629259764488478, + "loss": 0.0767, + "step": 46385 + }, + { + "epoch": 0.4026527547503928, + "grad_norm": 0.451171875, + "learning_rate": 0.0011628962867936165, + "loss": 0.1025, + "step": 46386 + }, + { + "epoch": 0.40266143523059694, + "grad_norm": 0.0859375, + "learning_rate": 0.001162866597069603, + "loss": 0.1182, + "step": 46387 + }, + { + "epoch": 0.40267011571080114, + "grad_norm": 0.10888671875, + "learning_rate": 0.0011628369072768394, + "loss": 0.1138, + "step": 46388 + }, + { + "epoch": 0.4026787961910053, + "grad_norm": 0.2470703125, + "learning_rate": 0.0011628072174153586, + "loss": 0.0898, + "step": 46389 + }, + { + "epoch": 0.40268747667120947, + "grad_norm": 0.185546875, + "learning_rate": 0.0011627775274851927, + "loss": 0.0713, + "step": 46390 + }, + { + "epoch": 0.4026961571514136, + "grad_norm": 0.443359375, + "learning_rate": 0.0011627478374863739, + "loss": 0.124, + "step": 46391 + }, + { + "epoch": 0.4027048376316178, + "grad_norm": 0.287109375, + "learning_rate": 0.0011627181474189352, + "loss": 0.1006, + "step": 46392 + }, + { + "epoch": 0.40271351811182193, + "grad_norm": 0.1923828125, + "learning_rate": 0.0011626884572829089, + "loss": 0.1406, + "step": 46393 + }, + { + "epoch": 0.4027221985920261, + "grad_norm": 0.09912109375, + "learning_rate": 0.001162658767078327, + "loss": 0.1196, + "step": 46394 + }, + { + "epoch": 0.40273087907223026, + "grad_norm": 0.130859375, + "learning_rate": 0.0011626290768052229, + "loss": 0.0791, + "step": 46395 + }, + { + "epoch": 0.40273955955243446, + "grad_norm": 0.92578125, + "learning_rate": 0.0011625993864636285, + "loss": 0.106, + "step": 46396 + }, + { + "epoch": 0.4027482400326386, + "grad_norm": 0.1884765625, + "learning_rate": 0.0011625696960535766, + "loss": 0.0967, + "step": 46397 + }, + { + "epoch": 0.4027569205128428, + "grad_norm": 0.1005859375, + "learning_rate": 0.001162540005575099, + "loss": 0.0879, + "step": 46398 + }, + { + "epoch": 0.4027656009930469, + "grad_norm": 0.13671875, + "learning_rate": 0.0011625103150282286, + "loss": 0.1279, + "step": 46399 + }, + { + "epoch": 0.4027742814732511, + "grad_norm": 0.28515625, + "learning_rate": 0.0011624806244129981, + "loss": 0.1396, + "step": 46400 + }, + { + "epoch": 0.40278296195345525, + "grad_norm": 0.279296875, + "learning_rate": 0.0011624509337294397, + "loss": 0.0869, + "step": 46401 + }, + { + "epoch": 0.40279164243365945, + "grad_norm": 0.279296875, + "learning_rate": 0.001162421242977586, + "loss": 0.0957, + "step": 46402 + }, + { + "epoch": 0.4028003229138636, + "grad_norm": 0.419921875, + "learning_rate": 0.001162391552157469, + "loss": 0.1025, + "step": 46403 + }, + { + "epoch": 
0.4028090033940678, + "grad_norm": 0.2001953125, + "learning_rate": 0.0011623618612691216, + "loss": 0.0981, + "step": 46404 + }, + { + "epoch": 0.4028176838742719, + "grad_norm": 0.263671875, + "learning_rate": 0.0011623321703125764, + "loss": 0.1221, + "step": 46405 + }, + { + "epoch": 0.4028263643544761, + "grad_norm": 0.2578125, + "learning_rate": 0.0011623024792878657, + "loss": 0.1602, + "step": 46406 + }, + { + "epoch": 0.40283504483468024, + "grad_norm": 0.240234375, + "learning_rate": 0.0011622727881950218, + "loss": 0.0801, + "step": 46407 + }, + { + "epoch": 0.40284372531488444, + "grad_norm": 0.76953125, + "learning_rate": 0.0011622430970340775, + "loss": 0.3555, + "step": 46408 + }, + { + "epoch": 0.4028524057950886, + "grad_norm": 0.51171875, + "learning_rate": 0.001162213405805065, + "loss": 0.1348, + "step": 46409 + }, + { + "epoch": 0.40286108627529277, + "grad_norm": 0.1640625, + "learning_rate": 0.0011621837145080168, + "loss": 0.1055, + "step": 46410 + }, + { + "epoch": 0.4028697667554969, + "grad_norm": 0.33203125, + "learning_rate": 0.0011621540231429655, + "loss": 0.1504, + "step": 46411 + }, + { + "epoch": 0.4028784472357011, + "grad_norm": 0.40234375, + "learning_rate": 0.0011621243317099432, + "loss": 0.1084, + "step": 46412 + }, + { + "epoch": 0.40288712771590524, + "grad_norm": 0.1640625, + "learning_rate": 0.0011620946402089829, + "loss": 0.0967, + "step": 46413 + }, + { + "epoch": 0.40289580819610943, + "grad_norm": 0.33203125, + "learning_rate": 0.001162064948640117, + "loss": 0.0593, + "step": 46414 + }, + { + "epoch": 0.40290448867631357, + "grad_norm": 0.146484375, + "learning_rate": 0.0011620352570033776, + "loss": 0.1226, + "step": 46415 + }, + { + "epoch": 0.40291316915651776, + "grad_norm": 0.12890625, + "learning_rate": 0.0011620055652987972, + "loss": 0.0933, + "step": 46416 + }, + { + "epoch": 0.4029218496367219, + "grad_norm": 0.08251953125, + "learning_rate": 0.0011619758735264087, + "loss": 0.1289, + "step": 46417 + }, + { + "epoch": 0.4029305301169261, + "grad_norm": 0.181640625, + "learning_rate": 0.0011619461816862442, + "loss": 0.0864, + "step": 46418 + }, + { + "epoch": 0.4029392105971302, + "grad_norm": 0.21484375, + "learning_rate": 0.0011619164897783366, + "loss": 0.0986, + "step": 46419 + }, + { + "epoch": 0.4029478910773344, + "grad_norm": 0.0810546875, + "learning_rate": 0.001161886797802718, + "loss": 0.0806, + "step": 46420 + }, + { + "epoch": 0.40295657155753856, + "grad_norm": 0.41796875, + "learning_rate": 0.0011618571057594204, + "loss": 0.103, + "step": 46421 + }, + { + "epoch": 0.40296525203774275, + "grad_norm": 0.1220703125, + "learning_rate": 0.0011618274136484773, + "loss": 0.1074, + "step": 46422 + }, + { + "epoch": 0.4029739325179469, + "grad_norm": 0.1376953125, + "learning_rate": 0.0011617977214699203, + "loss": 0.0898, + "step": 46423 + }, + { + "epoch": 0.4029826129981511, + "grad_norm": 0.482421875, + "learning_rate": 0.0011617680292237828, + "loss": 0.104, + "step": 46424 + }, + { + "epoch": 0.4029912934783552, + "grad_norm": 0.26171875, + "learning_rate": 0.0011617383369100958, + "loss": 0.105, + "step": 46425 + }, + { + "epoch": 0.4029999739585594, + "grad_norm": 0.205078125, + "learning_rate": 0.0011617086445288936, + "loss": 0.1191, + "step": 46426 + }, + { + "epoch": 0.40300865443876355, + "grad_norm": 0.189453125, + "learning_rate": 0.0011616789520802074, + "loss": 0.1338, + "step": 46427 + }, + { + "epoch": 0.40301733491896774, + "grad_norm": 0.1298828125, + "learning_rate": 0.0011616492595640702, + "loss": 
0.1172, + "step": 46428 + }, + { + "epoch": 0.4030260153991719, + "grad_norm": 0.322265625, + "learning_rate": 0.0011616195669805138, + "loss": 0.1094, + "step": 46429 + }, + { + "epoch": 0.40303469587937607, + "grad_norm": 0.349609375, + "learning_rate": 0.0011615898743295716, + "loss": 0.0928, + "step": 46430 + }, + { + "epoch": 0.4030433763595802, + "grad_norm": 0.318359375, + "learning_rate": 0.0011615601816112756, + "loss": 0.084, + "step": 46431 + }, + { + "epoch": 0.4030520568397844, + "grad_norm": 0.6640625, + "learning_rate": 0.001161530488825658, + "loss": 0.1021, + "step": 46432 + }, + { + "epoch": 0.40306073731998854, + "grad_norm": 0.1298828125, + "learning_rate": 0.001161500795972752, + "loss": 0.1094, + "step": 46433 + }, + { + "epoch": 0.40306941780019273, + "grad_norm": 0.333984375, + "learning_rate": 0.0011614711030525893, + "loss": 0.0742, + "step": 46434 + }, + { + "epoch": 0.40307809828039687, + "grad_norm": 0.294921875, + "learning_rate": 0.001161441410065203, + "loss": 0.1123, + "step": 46435 + }, + { + "epoch": 0.40308677876060106, + "grad_norm": 0.146484375, + "learning_rate": 0.0011614117170106251, + "loss": 0.0947, + "step": 46436 + }, + { + "epoch": 0.4030954592408052, + "grad_norm": 0.111328125, + "learning_rate": 0.0011613820238888886, + "loss": 0.0908, + "step": 46437 + }, + { + "epoch": 0.4031041397210094, + "grad_norm": 0.1142578125, + "learning_rate": 0.0011613523307000254, + "loss": 0.1113, + "step": 46438 + }, + { + "epoch": 0.40311282020121353, + "grad_norm": 0.09521484375, + "learning_rate": 0.001161322637444068, + "loss": 0.0967, + "step": 46439 + }, + { + "epoch": 0.4031215006814177, + "grad_norm": 0.255859375, + "learning_rate": 0.0011612929441210493, + "loss": 0.1094, + "step": 46440 + }, + { + "epoch": 0.40313018116162186, + "grad_norm": 0.12158203125, + "learning_rate": 0.0011612632507310018, + "loss": 0.1045, + "step": 46441 + }, + { + "epoch": 0.40313886164182605, + "grad_norm": 0.1826171875, + "learning_rate": 0.0011612335572739575, + "loss": 0.0752, + "step": 46442 + }, + { + "epoch": 0.4031475421220302, + "grad_norm": 0.455078125, + "learning_rate": 0.0011612038637499487, + "loss": 0.1162, + "step": 46443 + }, + { + "epoch": 0.4031562226022344, + "grad_norm": 0.1982421875, + "learning_rate": 0.001161174170159009, + "loss": 0.1016, + "step": 46444 + }, + { + "epoch": 0.4031649030824385, + "grad_norm": 0.138671875, + "learning_rate": 0.0011611444765011697, + "loss": 0.1143, + "step": 46445 + }, + { + "epoch": 0.4031735835626427, + "grad_norm": 0.62109375, + "learning_rate": 0.0011611147827764639, + "loss": 0.0698, + "step": 46446 + }, + { + "epoch": 0.40318226404284685, + "grad_norm": 0.302734375, + "learning_rate": 0.0011610850889849237, + "loss": 0.0996, + "step": 46447 + }, + { + "epoch": 0.40319094452305104, + "grad_norm": 0.1689453125, + "learning_rate": 0.0011610553951265819, + "loss": 0.1094, + "step": 46448 + }, + { + "epoch": 0.4031996250032552, + "grad_norm": 0.09814453125, + "learning_rate": 0.0011610257012014708, + "loss": 0.0981, + "step": 46449 + }, + { + "epoch": 0.4032083054834594, + "grad_norm": 0.1484375, + "learning_rate": 0.0011609960072096232, + "loss": 0.1099, + "step": 46450 + }, + { + "epoch": 0.4032169859636635, + "grad_norm": 0.09765625, + "learning_rate": 0.0011609663131510707, + "loss": 0.1006, + "step": 46451 + }, + { + "epoch": 0.40322566644386765, + "grad_norm": 3.125, + "learning_rate": 0.0011609366190258466, + "loss": 0.4023, + "step": 46452 + }, + { + "epoch": 0.40323434692407184, + "grad_norm": 0.2294921875, 
+ "learning_rate": 0.0011609069248339832, + "loss": 0.0825, + "step": 46453 + }, + { + "epoch": 0.403243027404276, + "grad_norm": 0.12158203125, + "learning_rate": 0.0011608772305755132, + "loss": 0.1016, + "step": 46454 + }, + { + "epoch": 0.40325170788448017, + "grad_norm": 0.134765625, + "learning_rate": 0.0011608475362504683, + "loss": 0.1191, + "step": 46455 + }, + { + "epoch": 0.4032603883646843, + "grad_norm": 0.5546875, + "learning_rate": 0.0011608178418588814, + "loss": 0.1396, + "step": 46456 + }, + { + "epoch": 0.4032690688448885, + "grad_norm": 0.287109375, + "learning_rate": 0.0011607881474007852, + "loss": 0.1016, + "step": 46457 + }, + { + "epoch": 0.40327774932509264, + "grad_norm": 0.328125, + "learning_rate": 0.0011607584528762122, + "loss": 0.0947, + "step": 46458 + }, + { + "epoch": 0.40328642980529683, + "grad_norm": 0.169921875, + "learning_rate": 0.0011607287582851948, + "loss": 0.1108, + "step": 46459 + }, + { + "epoch": 0.40329511028550097, + "grad_norm": 0.40234375, + "learning_rate": 0.0011606990636277648, + "loss": 0.0962, + "step": 46460 + }, + { + "epoch": 0.40330379076570516, + "grad_norm": 0.53515625, + "learning_rate": 0.0011606693689039556, + "loss": 0.1289, + "step": 46461 + }, + { + "epoch": 0.4033124712459093, + "grad_norm": 0.271484375, + "learning_rate": 0.001160639674113799, + "loss": 0.1226, + "step": 46462 + }, + { + "epoch": 0.4033211517261135, + "grad_norm": 0.1884765625, + "learning_rate": 0.0011606099792573282, + "loss": 0.1396, + "step": 46463 + }, + { + "epoch": 0.40332983220631763, + "grad_norm": 0.220703125, + "learning_rate": 0.0011605802843345748, + "loss": 0.0894, + "step": 46464 + }, + { + "epoch": 0.4033385126865218, + "grad_norm": 0.09423828125, + "learning_rate": 0.001160550589345572, + "loss": 0.085, + "step": 46465 + }, + { + "epoch": 0.40334719316672596, + "grad_norm": 0.166015625, + "learning_rate": 0.0011605208942903516, + "loss": 0.1221, + "step": 46466 + }, + { + "epoch": 0.40335587364693015, + "grad_norm": 0.4453125, + "learning_rate": 0.0011604911991689468, + "loss": 0.1152, + "step": 46467 + }, + { + "epoch": 0.4033645541271343, + "grad_norm": 0.181640625, + "learning_rate": 0.0011604615039813898, + "loss": 0.1367, + "step": 46468 + }, + { + "epoch": 0.4033732346073385, + "grad_norm": 0.2451171875, + "learning_rate": 0.0011604318087277126, + "loss": 0.0928, + "step": 46469 + }, + { + "epoch": 0.4033819150875426, + "grad_norm": 0.39453125, + "learning_rate": 0.0011604021134079484, + "loss": 0.0967, + "step": 46470 + }, + { + "epoch": 0.4033905955677468, + "grad_norm": 0.8359375, + "learning_rate": 0.001160372418022129, + "loss": 0.084, + "step": 46471 + }, + { + "epoch": 0.40339927604795095, + "grad_norm": 0.13671875, + "learning_rate": 0.0011603427225702879, + "loss": 0.1289, + "step": 46472 + }, + { + "epoch": 0.40340795652815514, + "grad_norm": 0.2734375, + "learning_rate": 0.0011603130270524565, + "loss": 0.1025, + "step": 46473 + }, + { + "epoch": 0.4034166370083593, + "grad_norm": 0.294921875, + "learning_rate": 0.0011602833314686676, + "loss": 0.1582, + "step": 46474 + }, + { + "epoch": 0.4034253174885635, + "grad_norm": 0.10888671875, + "learning_rate": 0.0011602536358189539, + "loss": 0.0942, + "step": 46475 + }, + { + "epoch": 0.4034339979687676, + "grad_norm": 0.2490234375, + "learning_rate": 0.0011602239401033478, + "loss": 0.0967, + "step": 46476 + }, + { + "epoch": 0.4034426784489718, + "grad_norm": 0.203125, + "learning_rate": 0.0011601942443218813, + "loss": 0.1074, + "step": 46477 + }, + { + "epoch": 
0.40345135892917594, + "grad_norm": 0.357421875, + "learning_rate": 0.001160164548474588, + "loss": 0.0977, + "step": 46478 + }, + { + "epoch": 0.40346003940938013, + "grad_norm": 0.11767578125, + "learning_rate": 0.0011601348525614992, + "loss": 0.1104, + "step": 46479 + }, + { + "epoch": 0.40346871988958427, + "grad_norm": 1.265625, + "learning_rate": 0.0011601051565826477, + "loss": 0.1182, + "step": 46480 + }, + { + "epoch": 0.40347740036978846, + "grad_norm": 0.0830078125, + "learning_rate": 0.0011600754605380664, + "loss": 0.0947, + "step": 46481 + }, + { + "epoch": 0.4034860808499926, + "grad_norm": 0.380859375, + "learning_rate": 0.0011600457644277873, + "loss": 0.0933, + "step": 46482 + }, + { + "epoch": 0.4034947613301968, + "grad_norm": 0.1669921875, + "learning_rate": 0.001160016068251843, + "loss": 0.1104, + "step": 46483 + }, + { + "epoch": 0.40350344181040093, + "grad_norm": 0.279296875, + "learning_rate": 0.0011599863720102661, + "loss": 0.082, + "step": 46484 + }, + { + "epoch": 0.4035121222906051, + "grad_norm": 0.15234375, + "learning_rate": 0.0011599566757030894, + "loss": 0.0801, + "step": 46485 + }, + { + "epoch": 0.40352080277080926, + "grad_norm": 0.1279296875, + "learning_rate": 0.0011599269793303442, + "loss": 0.1016, + "step": 46486 + }, + { + "epoch": 0.40352948325101345, + "grad_norm": 0.671875, + "learning_rate": 0.0011598972828920644, + "loss": 0.1318, + "step": 46487 + }, + { + "epoch": 0.4035381637312176, + "grad_norm": 0.443359375, + "learning_rate": 0.0011598675863882814, + "loss": 0.123, + "step": 46488 + }, + { + "epoch": 0.4035468442114218, + "grad_norm": 0.2060546875, + "learning_rate": 0.0011598378898190285, + "loss": 0.0693, + "step": 46489 + }, + { + "epoch": 0.4035555246916259, + "grad_norm": 0.12158203125, + "learning_rate": 0.0011598081931843377, + "loss": 0.0811, + "step": 46490 + }, + { + "epoch": 0.4035642051718301, + "grad_norm": 0.08642578125, + "learning_rate": 0.001159778496484241, + "loss": 0.0908, + "step": 46491 + }, + { + "epoch": 0.40357288565203425, + "grad_norm": 0.1640625, + "learning_rate": 0.0011597487997187718, + "loss": 0.1025, + "step": 46492 + }, + { + "epoch": 0.40358156613223845, + "grad_norm": 0.0732421875, + "learning_rate": 0.0011597191028879625, + "loss": 0.0752, + "step": 46493 + }, + { + "epoch": 0.4035902466124426, + "grad_norm": 0.125, + "learning_rate": 0.0011596894059918452, + "loss": 0.083, + "step": 46494 + }, + { + "epoch": 0.4035989270926468, + "grad_norm": 0.080078125, + "learning_rate": 0.0011596597090304524, + "loss": 0.0869, + "step": 46495 + }, + { + "epoch": 0.4036076075728509, + "grad_norm": 0.154296875, + "learning_rate": 0.0011596300120038166, + "loss": 0.0825, + "step": 46496 + }, + { + "epoch": 0.4036162880530551, + "grad_norm": 0.34765625, + "learning_rate": 0.0011596003149119702, + "loss": 0.1221, + "step": 46497 + }, + { + "epoch": 0.40362496853325924, + "grad_norm": 0.1123046875, + "learning_rate": 0.0011595706177549461, + "loss": 0.0879, + "step": 46498 + }, + { + "epoch": 0.40363364901346344, + "grad_norm": 0.2353515625, + "learning_rate": 0.001159540920532776, + "loss": 0.084, + "step": 46499 + }, + { + "epoch": 0.4036423294936676, + "grad_norm": 0.138671875, + "learning_rate": 0.001159511223245493, + "loss": 0.104, + "step": 46500 + }, + { + "epoch": 0.40365100997387177, + "grad_norm": 0.28515625, + "learning_rate": 0.0011594815258931296, + "loss": 0.1367, + "step": 46501 + }, + { + "epoch": 0.4036596904540759, + "grad_norm": 0.609375, + "learning_rate": 0.001159451828475718, + "loss": 
0.2949, + "step": 46502 + }, + { + "epoch": 0.4036683709342801, + "grad_norm": 0.408203125, + "learning_rate": 0.0011594221309932907, + "loss": 0.1182, + "step": 46503 + }, + { + "epoch": 0.40367705141448423, + "grad_norm": 0.208984375, + "learning_rate": 0.0011593924334458805, + "loss": 0.0884, + "step": 46504 + }, + { + "epoch": 0.4036857318946884, + "grad_norm": 0.2119140625, + "learning_rate": 0.0011593627358335195, + "loss": 0.1064, + "step": 46505 + }, + { + "epoch": 0.40369441237489256, + "grad_norm": 0.23828125, + "learning_rate": 0.0011593330381562401, + "loss": 0.1592, + "step": 46506 + }, + { + "epoch": 0.40370309285509676, + "grad_norm": 0.205078125, + "learning_rate": 0.0011593033404140752, + "loss": 0.1211, + "step": 46507 + }, + { + "epoch": 0.4037117733353009, + "grad_norm": 0.515625, + "learning_rate": 0.0011592736426070571, + "loss": 0.1025, + "step": 46508 + }, + { + "epoch": 0.4037204538155051, + "grad_norm": 0.09765625, + "learning_rate": 0.001159243944735218, + "loss": 0.0615, + "step": 46509 + }, + { + "epoch": 0.4037291342957092, + "grad_norm": 0.103515625, + "learning_rate": 0.0011592142467985909, + "loss": 0.0771, + "step": 46510 + }, + { + "epoch": 0.4037378147759134, + "grad_norm": 0.3046875, + "learning_rate": 0.0011591845487972077, + "loss": 0.1079, + "step": 46511 + }, + { + "epoch": 0.40374649525611755, + "grad_norm": 0.451171875, + "learning_rate": 0.0011591548507311011, + "loss": 0.0791, + "step": 46512 + }, + { + "epoch": 0.40375517573632175, + "grad_norm": 0.07275390625, + "learning_rate": 0.0011591251526003041, + "loss": 0.0698, + "step": 46513 + }, + { + "epoch": 0.4037638562165259, + "grad_norm": 0.169921875, + "learning_rate": 0.0011590954544048483, + "loss": 0.1162, + "step": 46514 + }, + { + "epoch": 0.4037725366967301, + "grad_norm": 0.2275390625, + "learning_rate": 0.0011590657561447672, + "loss": 0.105, + "step": 46515 + }, + { + "epoch": 0.4037812171769342, + "grad_norm": 0.138671875, + "learning_rate": 0.001159036057820092, + "loss": 0.1182, + "step": 46516 + }, + { + "epoch": 0.4037898976571384, + "grad_norm": 0.333984375, + "learning_rate": 0.0011590063594308561, + "loss": 0.0752, + "step": 46517 + }, + { + "epoch": 0.40379857813734255, + "grad_norm": 0.2138671875, + "learning_rate": 0.001158976660977092, + "loss": 0.1133, + "step": 46518 + }, + { + "epoch": 0.40380725861754674, + "grad_norm": 0.2060546875, + "learning_rate": 0.0011589469624588313, + "loss": 0.0908, + "step": 46519 + }, + { + "epoch": 0.4038159390977509, + "grad_norm": 0.28515625, + "learning_rate": 0.0011589172638761074, + "loss": 0.0962, + "step": 46520 + }, + { + "epoch": 0.40382461957795507, + "grad_norm": 0.9609375, + "learning_rate": 0.0011588875652289526, + "loss": 0.1064, + "step": 46521 + }, + { + "epoch": 0.4038333000581592, + "grad_norm": 0.09814453125, + "learning_rate": 0.0011588578665173992, + "loss": 0.1011, + "step": 46522 + }, + { + "epoch": 0.4038419805383634, + "grad_norm": 0.15234375, + "learning_rate": 0.0011588281677414795, + "loss": 0.1182, + "step": 46523 + }, + { + "epoch": 0.40385066101856754, + "grad_norm": 0.1708984375, + "learning_rate": 0.0011587984689012264, + "loss": 0.0703, + "step": 46524 + }, + { + "epoch": 0.40385934149877173, + "grad_norm": 0.1435546875, + "learning_rate": 0.0011587687699966722, + "loss": 0.0884, + "step": 46525 + }, + { + "epoch": 0.40386802197897587, + "grad_norm": 0.11376953125, + "learning_rate": 0.0011587390710278494, + "loss": 0.1006, + "step": 46526 + }, + { + "epoch": 0.40387670245918006, + "grad_norm": 
0.298828125, + "learning_rate": 0.0011587093719947904, + "loss": 0.1426, + "step": 46527 + }, + { + "epoch": 0.4038853829393842, + "grad_norm": 0.51953125, + "learning_rate": 0.0011586796728975275, + "loss": 0.1309, + "step": 46528 + }, + { + "epoch": 0.4038940634195884, + "grad_norm": 0.10498046875, + "learning_rate": 0.0011586499737360936, + "loss": 0.0986, + "step": 46529 + }, + { + "epoch": 0.4039027438997925, + "grad_norm": 0.24609375, + "learning_rate": 0.001158620274510521, + "loss": 0.0889, + "step": 46530 + }, + { + "epoch": 0.4039114243799967, + "grad_norm": 0.1923828125, + "learning_rate": 0.0011585905752208418, + "loss": 0.1377, + "step": 46531 + }, + { + "epoch": 0.40392010486020086, + "grad_norm": 0.1953125, + "learning_rate": 0.0011585608758670895, + "loss": 0.1235, + "step": 46532 + }, + { + "epoch": 0.40392878534040505, + "grad_norm": 0.0849609375, + "learning_rate": 0.0011585311764492952, + "loss": 0.0801, + "step": 46533 + }, + { + "epoch": 0.4039374658206092, + "grad_norm": 0.66015625, + "learning_rate": 0.0011585014769674923, + "loss": 0.1475, + "step": 46534 + }, + { + "epoch": 0.4039461463008134, + "grad_norm": 0.451171875, + "learning_rate": 0.0011584717774217133, + "loss": 0.1514, + "step": 46535 + }, + { + "epoch": 0.4039548267810175, + "grad_norm": 0.41796875, + "learning_rate": 0.0011584420778119901, + "loss": 0.1309, + "step": 46536 + }, + { + "epoch": 0.4039635072612217, + "grad_norm": 0.095703125, + "learning_rate": 0.0011584123781383559, + "loss": 0.1221, + "step": 46537 + }, + { + "epoch": 0.40397218774142585, + "grad_norm": 0.134765625, + "learning_rate": 0.0011583826784008423, + "loss": 0.1123, + "step": 46538 + }, + { + "epoch": 0.40398086822163004, + "grad_norm": 0.6640625, + "learning_rate": 0.0011583529785994826, + "loss": 0.1162, + "step": 46539 + }, + { + "epoch": 0.4039895487018342, + "grad_norm": 0.1083984375, + "learning_rate": 0.001158323278734309, + "loss": 0.0967, + "step": 46540 + }, + { + "epoch": 0.40399822918203837, + "grad_norm": 0.11376953125, + "learning_rate": 0.001158293578805354, + "loss": 0.0894, + "step": 46541 + }, + { + "epoch": 0.4040069096622425, + "grad_norm": 0.435546875, + "learning_rate": 0.00115826387881265, + "loss": 0.0854, + "step": 46542 + }, + { + "epoch": 0.4040155901424467, + "grad_norm": 0.3515625, + "learning_rate": 0.0011582341787562294, + "loss": 0.1094, + "step": 46543 + }, + { + "epoch": 0.40402427062265084, + "grad_norm": 0.2060546875, + "learning_rate": 0.0011582044786361248, + "loss": 0.103, + "step": 46544 + }, + { + "epoch": 0.40403295110285503, + "grad_norm": 0.11767578125, + "learning_rate": 0.0011581747784523686, + "loss": 0.084, + "step": 46545 + }, + { + "epoch": 0.40404163158305917, + "grad_norm": 0.326171875, + "learning_rate": 0.0011581450782049934, + "loss": 0.1543, + "step": 46546 + }, + { + "epoch": 0.40405031206326336, + "grad_norm": 0.333984375, + "learning_rate": 0.0011581153778940317, + "loss": 0.1484, + "step": 46547 + }, + { + "epoch": 0.4040589925434675, + "grad_norm": 0.267578125, + "learning_rate": 0.0011580856775195156, + "loss": 0.0869, + "step": 46548 + }, + { + "epoch": 0.4040676730236717, + "grad_norm": 0.185546875, + "learning_rate": 0.001158055977081478, + "loss": 0.0713, + "step": 46549 + }, + { + "epoch": 0.40407635350387583, + "grad_norm": 0.2421875, + "learning_rate": 0.0011580262765799511, + "loss": 0.0986, + "step": 46550 + }, + { + "epoch": 0.40408503398408, + "grad_norm": 0.162109375, + "learning_rate": 0.0011579965760149678, + "loss": 0.0967, + "step": 46551 + }, + { + 
"epoch": 0.40409371446428416, + "grad_norm": 0.86328125, + "learning_rate": 0.0011579668753865606, + "loss": 0.1211, + "step": 46552 + }, + { + "epoch": 0.40410239494448835, + "grad_norm": 0.98828125, + "learning_rate": 0.0011579371746947614, + "loss": 0.1328, + "step": 46553 + }, + { + "epoch": 0.4041110754246925, + "grad_norm": 0.08056640625, + "learning_rate": 0.001157907473939603, + "loss": 0.0776, + "step": 46554 + }, + { + "epoch": 0.4041197559048967, + "grad_norm": 0.259765625, + "learning_rate": 0.0011578777731211173, + "loss": 0.0781, + "step": 46555 + }, + { + "epoch": 0.4041284363851008, + "grad_norm": 0.28125, + "learning_rate": 0.0011578480722393382, + "loss": 0.0991, + "step": 46556 + }, + { + "epoch": 0.404137116865305, + "grad_norm": 0.330078125, + "learning_rate": 0.0011578183712942969, + "loss": 0.1235, + "step": 46557 + }, + { + "epoch": 0.40414579734550915, + "grad_norm": 0.126953125, + "learning_rate": 0.0011577886702860262, + "loss": 0.0972, + "step": 46558 + }, + { + "epoch": 0.40415447782571334, + "grad_norm": 0.1875, + "learning_rate": 0.0011577589692145587, + "loss": 0.1523, + "step": 46559 + }, + { + "epoch": 0.4041631583059175, + "grad_norm": 0.87109375, + "learning_rate": 0.0011577292680799269, + "loss": 0.0977, + "step": 46560 + }, + { + "epoch": 0.4041718387861217, + "grad_norm": 0.197265625, + "learning_rate": 0.0011576995668821635, + "loss": 0.1045, + "step": 46561 + }, + { + "epoch": 0.4041805192663258, + "grad_norm": 0.314453125, + "learning_rate": 0.0011576698656213004, + "loss": 0.0762, + "step": 46562 + }, + { + "epoch": 0.40418919974653, + "grad_norm": 0.35546875, + "learning_rate": 0.0011576401642973706, + "loss": 0.0947, + "step": 46563 + }, + { + "epoch": 0.40419788022673414, + "grad_norm": 0.1533203125, + "learning_rate": 0.0011576104629104065, + "loss": 0.1235, + "step": 46564 + }, + { + "epoch": 0.40420656070693833, + "grad_norm": 0.189453125, + "learning_rate": 0.0011575807614604403, + "loss": 0.0781, + "step": 46565 + }, + { + "epoch": 0.40421524118714247, + "grad_norm": 0.38671875, + "learning_rate": 0.0011575510599475048, + "loss": 0.1104, + "step": 46566 + }, + { + "epoch": 0.40422392166734666, + "grad_norm": 0.078125, + "learning_rate": 0.001157521358371632, + "loss": 0.1074, + "step": 46567 + }, + { + "epoch": 0.4042326021475508, + "grad_norm": 0.267578125, + "learning_rate": 0.0011574916567328546, + "loss": 0.0781, + "step": 46568 + }, + { + "epoch": 0.404241282627755, + "grad_norm": 0.197265625, + "learning_rate": 0.0011574619550312058, + "loss": 0.2051, + "step": 46569 + }, + { + "epoch": 0.40424996310795913, + "grad_norm": 0.162109375, + "learning_rate": 0.001157432253266717, + "loss": 0.0889, + "step": 46570 + }, + { + "epoch": 0.4042586435881633, + "grad_norm": 0.5, + "learning_rate": 0.0011574025514394213, + "loss": 0.0654, + "step": 46571 + }, + { + "epoch": 0.40426732406836746, + "grad_norm": 0.3125, + "learning_rate": 0.001157372849549351, + "loss": 0.083, + "step": 46572 + }, + { + "epoch": 0.40427600454857165, + "grad_norm": 0.181640625, + "learning_rate": 0.0011573431475965386, + "loss": 0.1123, + "step": 46573 + }, + { + "epoch": 0.4042846850287758, + "grad_norm": 0.1142578125, + "learning_rate": 0.0011573134455810168, + "loss": 0.1113, + "step": 46574 + }, + { + "epoch": 0.40429336550897993, + "grad_norm": 0.1943359375, + "learning_rate": 0.0011572837435028177, + "loss": 0.1494, + "step": 46575 + }, + { + "epoch": 0.4043020459891841, + "grad_norm": 0.94140625, + "learning_rate": 0.0011572540413619737, + "loss": 0.1226, + 
"step": 46576 + }, + { + "epoch": 0.40431072646938826, + "grad_norm": 0.458984375, + "learning_rate": 0.0011572243391585178, + "loss": 0.0947, + "step": 46577 + }, + { + "epoch": 0.40431940694959245, + "grad_norm": 0.77734375, + "learning_rate": 0.0011571946368924822, + "loss": 0.1406, + "step": 46578 + }, + { + "epoch": 0.4043280874297966, + "grad_norm": 0.130859375, + "learning_rate": 0.0011571649345638994, + "loss": 0.0947, + "step": 46579 + }, + { + "epoch": 0.4043367679100008, + "grad_norm": 0.107421875, + "learning_rate": 0.0011571352321728021, + "loss": 0.126, + "step": 46580 + }, + { + "epoch": 0.4043454483902049, + "grad_norm": 0.2314453125, + "learning_rate": 0.0011571055297192221, + "loss": 0.1035, + "step": 46581 + }, + { + "epoch": 0.4043541288704091, + "grad_norm": 0.515625, + "learning_rate": 0.0011570758272031927, + "loss": 0.0869, + "step": 46582 + }, + { + "epoch": 0.40436280935061325, + "grad_norm": 0.1767578125, + "learning_rate": 0.0011570461246247462, + "loss": 0.0928, + "step": 46583 + }, + { + "epoch": 0.40437148983081744, + "grad_norm": 0.90234375, + "learning_rate": 0.0011570164219839147, + "loss": 0.1279, + "step": 46584 + }, + { + "epoch": 0.4043801703110216, + "grad_norm": 0.6484375, + "learning_rate": 0.0011569867192807307, + "loss": 0.1094, + "step": 46585 + }, + { + "epoch": 0.4043888507912258, + "grad_norm": 0.265625, + "learning_rate": 0.001156957016515227, + "loss": 0.0752, + "step": 46586 + }, + { + "epoch": 0.4043975312714299, + "grad_norm": 0.33984375, + "learning_rate": 0.001156927313687436, + "loss": 0.0684, + "step": 46587 + }, + { + "epoch": 0.4044062117516341, + "grad_norm": 0.1826171875, + "learning_rate": 0.0011568976107973901, + "loss": 0.0889, + "step": 46588 + }, + { + "epoch": 0.40441489223183824, + "grad_norm": 0.53515625, + "learning_rate": 0.0011568679078451218, + "loss": 0.0669, + "step": 46589 + }, + { + "epoch": 0.40442357271204243, + "grad_norm": 0.12451171875, + "learning_rate": 0.0011568382048306638, + "loss": 0.1079, + "step": 46590 + }, + { + "epoch": 0.40443225319224657, + "grad_norm": 0.134765625, + "learning_rate": 0.0011568085017540483, + "loss": 0.0713, + "step": 46591 + }, + { + "epoch": 0.40444093367245076, + "grad_norm": 0.169921875, + "learning_rate": 0.001156778798615308, + "loss": 0.0967, + "step": 46592 + }, + { + "epoch": 0.4044496141526549, + "grad_norm": 0.6171875, + "learning_rate": 0.0011567490954144753, + "loss": 0.084, + "step": 46593 + }, + { + "epoch": 0.4044582946328591, + "grad_norm": 0.388671875, + "learning_rate": 0.0011567193921515823, + "loss": 0.0928, + "step": 46594 + }, + { + "epoch": 0.40446697511306323, + "grad_norm": 0.169921875, + "learning_rate": 0.001156689688826662, + "loss": 0.1582, + "step": 46595 + }, + { + "epoch": 0.4044756555932674, + "grad_norm": 0.1630859375, + "learning_rate": 0.0011566599854397465, + "loss": 0.1182, + "step": 46596 + }, + { + "epoch": 0.40448433607347156, + "grad_norm": 0.65625, + "learning_rate": 0.0011566302819908687, + "loss": 0.0918, + "step": 46597 + }, + { + "epoch": 0.40449301655367575, + "grad_norm": 0.203125, + "learning_rate": 0.0011566005784800608, + "loss": 0.166, + "step": 46598 + }, + { + "epoch": 0.4045016970338799, + "grad_norm": 0.177734375, + "learning_rate": 0.0011565708749073555, + "loss": 0.1387, + "step": 46599 + }, + { + "epoch": 0.4045103775140841, + "grad_norm": 0.1982421875, + "learning_rate": 0.001156541171272785, + "loss": 0.0762, + "step": 46600 + }, + { + "epoch": 0.4045190579942882, + "grad_norm": 0.169921875, + "learning_rate": 
0.0011565114675763822, + "loss": 0.1069, + "step": 46601 + }, + { + "epoch": 0.4045277384744924, + "grad_norm": 0.431640625, + "learning_rate": 0.001156481763818179, + "loss": 0.0835, + "step": 46602 + }, + { + "epoch": 0.40453641895469655, + "grad_norm": 0.18359375, + "learning_rate": 0.0011564520599982085, + "loss": 0.0928, + "step": 46603 + }, + { + "epoch": 0.40454509943490075, + "grad_norm": 0.373046875, + "learning_rate": 0.0011564223561165027, + "loss": 0.1211, + "step": 46604 + }, + { + "epoch": 0.4045537799151049, + "grad_norm": 0.3359375, + "learning_rate": 0.001156392652173094, + "loss": 0.1143, + "step": 46605 + }, + { + "epoch": 0.4045624603953091, + "grad_norm": 0.275390625, + "learning_rate": 0.0011563629481680154, + "loss": 0.085, + "step": 46606 + }, + { + "epoch": 0.4045711408755132, + "grad_norm": 0.55078125, + "learning_rate": 0.0011563332441012991, + "loss": 0.1094, + "step": 46607 + }, + { + "epoch": 0.4045798213557174, + "grad_norm": 0.166015625, + "learning_rate": 0.0011563035399729776, + "loss": 0.1089, + "step": 46608 + }, + { + "epoch": 0.40458850183592154, + "grad_norm": 0.1953125, + "learning_rate": 0.0011562738357830833, + "loss": 0.083, + "step": 46609 + }, + { + "epoch": 0.40459718231612574, + "grad_norm": 0.29296875, + "learning_rate": 0.001156244131531649, + "loss": 0.1338, + "step": 46610 + }, + { + "epoch": 0.4046058627963299, + "grad_norm": 0.43359375, + "learning_rate": 0.0011562144272187068, + "loss": 0.0957, + "step": 46611 + }, + { + "epoch": 0.40461454327653407, + "grad_norm": 0.216796875, + "learning_rate": 0.0011561847228442894, + "loss": 0.0957, + "step": 46612 + }, + { + "epoch": 0.4046232237567382, + "grad_norm": 0.26171875, + "learning_rate": 0.0011561550184084294, + "loss": 0.0889, + "step": 46613 + }, + { + "epoch": 0.4046319042369424, + "grad_norm": 0.92578125, + "learning_rate": 0.0011561253139111588, + "loss": 0.1533, + "step": 46614 + }, + { + "epoch": 0.40464058471714653, + "grad_norm": 0.055419921875, + "learning_rate": 0.0011560956093525105, + "loss": 0.0562, + "step": 46615 + }, + { + "epoch": 0.4046492651973507, + "grad_norm": 0.2890625, + "learning_rate": 0.0011560659047325167, + "loss": 0.0728, + "step": 46616 + }, + { + "epoch": 0.40465794567755486, + "grad_norm": 0.107421875, + "learning_rate": 0.0011560362000512104, + "loss": 0.0737, + "step": 46617 + }, + { + "epoch": 0.40466662615775906, + "grad_norm": 0.3125, + "learning_rate": 0.0011560064953086238, + "loss": 0.1094, + "step": 46618 + }, + { + "epoch": 0.4046753066379632, + "grad_norm": 0.2080078125, + "learning_rate": 0.0011559767905047892, + "loss": 0.1191, + "step": 46619 + }, + { + "epoch": 0.4046839871181674, + "grad_norm": 0.318359375, + "learning_rate": 0.0011559470856397388, + "loss": 0.0918, + "step": 46620 + }, + { + "epoch": 0.4046926675983715, + "grad_norm": 0.376953125, + "learning_rate": 0.001155917380713506, + "loss": 0.1128, + "step": 46621 + }, + { + "epoch": 0.4047013480785757, + "grad_norm": 0.1376953125, + "learning_rate": 0.001155887675726123, + "loss": 0.1416, + "step": 46622 + }, + { + "epoch": 0.40471002855877986, + "grad_norm": 0.09814453125, + "learning_rate": 0.0011558579706776218, + "loss": 0.0742, + "step": 46623 + }, + { + "epoch": 0.40471870903898405, + "grad_norm": 0.232421875, + "learning_rate": 0.001155828265568035, + "loss": 0.0869, + "step": 46624 + }, + { + "epoch": 0.4047273895191882, + "grad_norm": 0.3515625, + "learning_rate": 0.0011557985603973954, + "loss": 0.1357, + "step": 46625 + }, + { + "epoch": 0.4047360699993924, + 
"grad_norm": 0.1826171875, + "learning_rate": 0.0011557688551657353, + "loss": 0.1001, + "step": 46626 + }, + { + "epoch": 0.4047447504795965, + "grad_norm": 0.796875, + "learning_rate": 0.001155739149873087, + "loss": 0.0801, + "step": 46627 + }, + { + "epoch": 0.4047534309598007, + "grad_norm": 0.412109375, + "learning_rate": 0.0011557094445194835, + "loss": 0.0918, + "step": 46628 + }, + { + "epoch": 0.40476211144000485, + "grad_norm": 0.26953125, + "learning_rate": 0.001155679739104957, + "loss": 0.1118, + "step": 46629 + }, + { + "epoch": 0.40477079192020904, + "grad_norm": 0.32421875, + "learning_rate": 0.0011556500336295402, + "loss": 0.0962, + "step": 46630 + }, + { + "epoch": 0.4047794724004132, + "grad_norm": 0.7578125, + "learning_rate": 0.0011556203280932648, + "loss": 0.1191, + "step": 46631 + }, + { + "epoch": 0.40478815288061737, + "grad_norm": 0.427734375, + "learning_rate": 0.0011555906224961644, + "loss": 0.0991, + "step": 46632 + }, + { + "epoch": 0.4047968333608215, + "grad_norm": 0.400390625, + "learning_rate": 0.0011555609168382704, + "loss": 0.0649, + "step": 46633 + }, + { + "epoch": 0.4048055138410257, + "grad_norm": 0.373046875, + "learning_rate": 0.001155531211119616, + "loss": 0.0898, + "step": 46634 + }, + { + "epoch": 0.40481419432122984, + "grad_norm": 0.26171875, + "learning_rate": 0.0011555015053402336, + "loss": 0.0869, + "step": 46635 + }, + { + "epoch": 0.40482287480143403, + "grad_norm": 0.357421875, + "learning_rate": 0.0011554717995001557, + "loss": 0.0801, + "step": 46636 + }, + { + "epoch": 0.40483155528163817, + "grad_norm": 0.328125, + "learning_rate": 0.0011554420935994143, + "loss": 0.0806, + "step": 46637 + }, + { + "epoch": 0.40484023576184236, + "grad_norm": 0.1416015625, + "learning_rate": 0.0011554123876380427, + "loss": 0.1143, + "step": 46638 + }, + { + "epoch": 0.4048489162420465, + "grad_norm": 0.146484375, + "learning_rate": 0.0011553826816160725, + "loss": 0.1094, + "step": 46639 + }, + { + "epoch": 0.4048575967222507, + "grad_norm": 0.12353515625, + "learning_rate": 0.001155352975533537, + "loss": 0.1104, + "step": 46640 + }, + { + "epoch": 0.4048662772024548, + "grad_norm": 0.232421875, + "learning_rate": 0.001155323269390468, + "loss": 0.0752, + "step": 46641 + }, + { + "epoch": 0.404874957682659, + "grad_norm": 0.279296875, + "learning_rate": 0.0011552935631868985, + "loss": 0.1064, + "step": 46642 + }, + { + "epoch": 0.40488363816286316, + "grad_norm": 0.474609375, + "learning_rate": 0.0011552638569228611, + "loss": 0.106, + "step": 46643 + }, + { + "epoch": 0.40489231864306735, + "grad_norm": 0.1591796875, + "learning_rate": 0.0011552341505983875, + "loss": 0.1167, + "step": 46644 + }, + { + "epoch": 0.4049009991232715, + "grad_norm": 0.1337890625, + "learning_rate": 0.0011552044442135105, + "loss": 0.1104, + "step": 46645 + }, + { + "epoch": 0.4049096796034757, + "grad_norm": 0.10791015625, + "learning_rate": 0.0011551747377682633, + "loss": 0.0791, + "step": 46646 + }, + { + "epoch": 0.4049183600836798, + "grad_norm": 0.4296875, + "learning_rate": 0.0011551450312626773, + "loss": 0.1855, + "step": 46647 + }, + { + "epoch": 0.404927040563884, + "grad_norm": 0.291015625, + "learning_rate": 0.0011551153246967861, + "loss": 0.1084, + "step": 46648 + }, + { + "epoch": 0.40493572104408815, + "grad_norm": 0.5390625, + "learning_rate": 0.0011550856180706215, + "loss": 0.1001, + "step": 46649 + }, + { + "epoch": 0.40494440152429234, + "grad_norm": 0.328125, + "learning_rate": 0.0011550559113842156, + "loss": 0.1494, + "step": 46650 + 
}, + { + "epoch": 0.4049530820044965, + "grad_norm": 0.78125, + "learning_rate": 0.001155026204637602, + "loss": 0.0996, + "step": 46651 + }, + { + "epoch": 0.40496176248470067, + "grad_norm": 0.126953125, + "learning_rate": 0.0011549964978308123, + "loss": 0.1318, + "step": 46652 + }, + { + "epoch": 0.4049704429649048, + "grad_norm": 0.076171875, + "learning_rate": 0.001154966790963879, + "loss": 0.1152, + "step": 46653 + }, + { + "epoch": 0.404979123445109, + "grad_norm": 0.322265625, + "learning_rate": 0.001154937084036835, + "loss": 0.1123, + "step": 46654 + }, + { + "epoch": 0.40498780392531314, + "grad_norm": 0.12060546875, + "learning_rate": 0.0011549073770497128, + "loss": 0.1104, + "step": 46655 + }, + { + "epoch": 0.40499648440551733, + "grad_norm": 0.55859375, + "learning_rate": 0.0011548776700025444, + "loss": 0.1021, + "step": 46656 + }, + { + "epoch": 0.40500516488572147, + "grad_norm": 0.3203125, + "learning_rate": 0.0011548479628953631, + "loss": 0.1152, + "step": 46657 + }, + { + "epoch": 0.40501384536592566, + "grad_norm": 0.421875, + "learning_rate": 0.0011548182557282005, + "loss": 0.0835, + "step": 46658 + }, + { + "epoch": 0.4050225258461298, + "grad_norm": 0.341796875, + "learning_rate": 0.0011547885485010895, + "loss": 0.1064, + "step": 46659 + }, + { + "epoch": 0.405031206326334, + "grad_norm": 0.546875, + "learning_rate": 0.001154758841214063, + "loss": 0.0889, + "step": 46660 + }, + { + "epoch": 0.40503988680653813, + "grad_norm": 0.4609375, + "learning_rate": 0.0011547291338671526, + "loss": 0.0918, + "step": 46661 + }, + { + "epoch": 0.4050485672867423, + "grad_norm": 0.294921875, + "learning_rate": 0.0011546994264603914, + "loss": 0.0811, + "step": 46662 + }, + { + "epoch": 0.40505724776694646, + "grad_norm": 0.2109375, + "learning_rate": 0.0011546697189938116, + "loss": 0.0791, + "step": 46663 + }, + { + "epoch": 0.40506592824715065, + "grad_norm": 0.2490234375, + "learning_rate": 0.0011546400114674459, + "loss": 0.0869, + "step": 46664 + }, + { + "epoch": 0.4050746087273548, + "grad_norm": 0.091796875, + "learning_rate": 0.0011546103038813267, + "loss": 0.1011, + "step": 46665 + }, + { + "epoch": 0.405083289207559, + "grad_norm": 0.11279296875, + "learning_rate": 0.0011545805962354864, + "loss": 0.1211, + "step": 46666 + }, + { + "epoch": 0.4050919696877631, + "grad_norm": 0.203125, + "learning_rate": 0.0011545508885299578, + "loss": 0.0967, + "step": 46667 + }, + { + "epoch": 0.4051006501679673, + "grad_norm": 0.28515625, + "learning_rate": 0.0011545211807647729, + "loss": 0.1035, + "step": 46668 + }, + { + "epoch": 0.40510933064817145, + "grad_norm": 0.65234375, + "learning_rate": 0.0011544914729399646, + "loss": 0.1348, + "step": 46669 + }, + { + "epoch": 0.40511801112837564, + "grad_norm": 0.54296875, + "learning_rate": 0.0011544617650555654, + "loss": 0.104, + "step": 46670 + }, + { + "epoch": 0.4051266916085798, + "grad_norm": 0.08544921875, + "learning_rate": 0.0011544320571116073, + "loss": 0.0713, + "step": 46671 + }, + { + "epoch": 0.405135372088784, + "grad_norm": 0.1669921875, + "learning_rate": 0.0011544023491081232, + "loss": 0.0767, + "step": 46672 + }, + { + "epoch": 0.4051440525689881, + "grad_norm": 0.1708984375, + "learning_rate": 0.0011543726410451456, + "loss": 0.0952, + "step": 46673 + }, + { + "epoch": 0.4051527330491923, + "grad_norm": 0.341796875, + "learning_rate": 0.001154342932922707, + "loss": 0.0938, + "step": 46674 + }, + { + "epoch": 0.40516141352939644, + "grad_norm": 0.1923828125, + "learning_rate": 0.0011543132247408398, + 
"loss": 0.1377, + "step": 46675 + }, + { + "epoch": 0.40517009400960063, + "grad_norm": 0.30078125, + "learning_rate": 0.0011542835164995763, + "loss": 0.1357, + "step": 46676 + }, + { + "epoch": 0.40517877448980477, + "grad_norm": 0.265625, + "learning_rate": 0.0011542538081989492, + "loss": 0.1377, + "step": 46677 + }, + { + "epoch": 0.40518745497000896, + "grad_norm": 0.39453125, + "learning_rate": 0.001154224099838991, + "loss": 0.1069, + "step": 46678 + }, + { + "epoch": 0.4051961354502131, + "grad_norm": 0.2421875, + "learning_rate": 0.0011541943914197342, + "loss": 0.0732, + "step": 46679 + }, + { + "epoch": 0.4052048159304173, + "grad_norm": 0.55078125, + "learning_rate": 0.0011541646829412112, + "loss": 0.1128, + "step": 46680 + }, + { + "epoch": 0.40521349641062143, + "grad_norm": 0.2109375, + "learning_rate": 0.0011541349744034546, + "loss": 0.1074, + "step": 46681 + }, + { + "epoch": 0.4052221768908256, + "grad_norm": 0.150390625, + "learning_rate": 0.0011541052658064963, + "loss": 0.0918, + "step": 46682 + }, + { + "epoch": 0.40523085737102976, + "grad_norm": 0.224609375, + "learning_rate": 0.0011540755571503697, + "loss": 0.0923, + "step": 46683 + }, + { + "epoch": 0.40523953785123396, + "grad_norm": 0.09619140625, + "learning_rate": 0.001154045848435107, + "loss": 0.083, + "step": 46684 + }, + { + "epoch": 0.4052482183314381, + "grad_norm": 0.212890625, + "learning_rate": 0.0011540161396607403, + "loss": 0.0977, + "step": 46685 + }, + { + "epoch": 0.4052568988116423, + "grad_norm": 0.07080078125, + "learning_rate": 0.0011539864308273025, + "loss": 0.0674, + "step": 46686 + }, + { + "epoch": 0.4052655792918464, + "grad_norm": 0.2236328125, + "learning_rate": 0.001153956721934826, + "loss": 0.1211, + "step": 46687 + }, + { + "epoch": 0.4052742597720506, + "grad_norm": 0.2001953125, + "learning_rate": 0.0011539270129833433, + "loss": 0.0859, + "step": 46688 + }, + { + "epoch": 0.40528294025225475, + "grad_norm": 0.515625, + "learning_rate": 0.0011538973039728864, + "loss": 0.083, + "step": 46689 + }, + { + "epoch": 0.40529162073245895, + "grad_norm": 0.71875, + "learning_rate": 0.0011538675949034887, + "loss": 0.1104, + "step": 46690 + }, + { + "epoch": 0.4053003012126631, + "grad_norm": 0.392578125, + "learning_rate": 0.0011538378857751822, + "loss": 0.1436, + "step": 46691 + }, + { + "epoch": 0.4053089816928673, + "grad_norm": 0.123046875, + "learning_rate": 0.0011538081765879989, + "loss": 0.0972, + "step": 46692 + }, + { + "epoch": 0.4053176621730714, + "grad_norm": 0.203125, + "learning_rate": 0.001153778467341972, + "loss": 0.1348, + "step": 46693 + }, + { + "epoch": 0.4053263426532756, + "grad_norm": 0.484375, + "learning_rate": 0.001153748758037134, + "loss": 0.0894, + "step": 46694 + }, + { + "epoch": 0.40533502313347974, + "grad_norm": 0.50390625, + "learning_rate": 0.0011537190486735171, + "loss": 0.1084, + "step": 46695 + }, + { + "epoch": 0.40534370361368394, + "grad_norm": 0.10009765625, + "learning_rate": 0.001153689339251154, + "loss": 0.0996, + "step": 46696 + }, + { + "epoch": 0.4053523840938881, + "grad_norm": 0.1552734375, + "learning_rate": 0.0011536596297700766, + "loss": 0.1406, + "step": 46697 + }, + { + "epoch": 0.4053610645740922, + "grad_norm": 0.294921875, + "learning_rate": 0.001153629920230318, + "loss": 0.0898, + "step": 46698 + }, + { + "epoch": 0.4053697450542964, + "grad_norm": 0.484375, + "learning_rate": 0.0011536002106319109, + "loss": 0.1309, + "step": 46699 + }, + { + "epoch": 0.40537842553450054, + "grad_norm": 0.39453125, + 
"learning_rate": 0.001153570500974887, + "loss": 0.1299, + "step": 46700 + }, + { + "epoch": 0.40538710601470473, + "grad_norm": 0.134765625, + "learning_rate": 0.0011535407912592792, + "loss": 0.1338, + "step": 46701 + }, + { + "epoch": 0.40539578649490887, + "grad_norm": 0.298828125, + "learning_rate": 0.00115351108148512, + "loss": 0.1006, + "step": 46702 + }, + { + "epoch": 0.40540446697511306, + "grad_norm": 0.263671875, + "learning_rate": 0.0011534813716524421, + "loss": 0.1338, + "step": 46703 + }, + { + "epoch": 0.4054131474553172, + "grad_norm": 0.19921875, + "learning_rate": 0.0011534516617612775, + "loss": 0.1064, + "step": 46704 + }, + { + "epoch": 0.4054218279355214, + "grad_norm": 0.69921875, + "learning_rate": 0.0011534219518116594, + "loss": 0.1426, + "step": 46705 + }, + { + "epoch": 0.40543050841572553, + "grad_norm": 0.2138671875, + "learning_rate": 0.0011533922418036193, + "loss": 0.0713, + "step": 46706 + }, + { + "epoch": 0.4054391888959297, + "grad_norm": 0.1220703125, + "learning_rate": 0.0011533625317371907, + "loss": 0.1216, + "step": 46707 + }, + { + "epoch": 0.40544786937613386, + "grad_norm": 0.0869140625, + "learning_rate": 0.0011533328216124057, + "loss": 0.1064, + "step": 46708 + }, + { + "epoch": 0.40545654985633806, + "grad_norm": 0.6484375, + "learning_rate": 0.0011533031114292964, + "loss": 0.1104, + "step": 46709 + }, + { + "epoch": 0.4054652303365422, + "grad_norm": 0.0751953125, + "learning_rate": 0.0011532734011878956, + "loss": 0.064, + "step": 46710 + }, + { + "epoch": 0.4054739108167464, + "grad_norm": 0.1064453125, + "learning_rate": 0.0011532436908882361, + "loss": 0.0605, + "step": 46711 + }, + { + "epoch": 0.4054825912969505, + "grad_norm": 0.09228515625, + "learning_rate": 0.0011532139805303498, + "loss": 0.0947, + "step": 46712 + }, + { + "epoch": 0.4054912717771547, + "grad_norm": 0.3671875, + "learning_rate": 0.0011531842701142697, + "loss": 0.1035, + "step": 46713 + }, + { + "epoch": 0.40549995225735885, + "grad_norm": 0.2099609375, + "learning_rate": 0.001153154559640028, + "loss": 0.0908, + "step": 46714 + }, + { + "epoch": 0.40550863273756305, + "grad_norm": 0.189453125, + "learning_rate": 0.0011531248491076574, + "loss": 0.0742, + "step": 46715 + }, + { + "epoch": 0.4055173132177672, + "grad_norm": 0.10546875, + "learning_rate": 0.0011530951385171902, + "loss": 0.1143, + "step": 46716 + }, + { + "epoch": 0.4055259936979714, + "grad_norm": 0.1611328125, + "learning_rate": 0.001153065427868659, + "loss": 0.0933, + "step": 46717 + }, + { + "epoch": 0.4055346741781755, + "grad_norm": 0.494140625, + "learning_rate": 0.001153035717162096, + "loss": 0.0859, + "step": 46718 + }, + { + "epoch": 0.4055433546583797, + "grad_norm": 0.6484375, + "learning_rate": 0.001153006006397534, + "loss": 0.1123, + "step": 46719 + }, + { + "epoch": 0.40555203513858384, + "grad_norm": 0.298828125, + "learning_rate": 0.0011529762955750057, + "loss": 0.085, + "step": 46720 + }, + { + "epoch": 0.40556071561878804, + "grad_norm": 0.2451171875, + "learning_rate": 0.001152946584694543, + "loss": 0.083, + "step": 46721 + }, + { + "epoch": 0.4055693960989922, + "grad_norm": 0.287109375, + "learning_rate": 0.0011529168737561789, + "loss": 0.0952, + "step": 46722 + }, + { + "epoch": 0.40557807657919637, + "grad_norm": 0.107421875, + "learning_rate": 0.0011528871627599453, + "loss": 0.0874, + "step": 46723 + }, + { + "epoch": 0.4055867570594005, + "grad_norm": 0.431640625, + "learning_rate": 0.0011528574517058756, + "loss": 0.1104, + "step": 46724 + }, + { + "epoch": 
0.4055954375396047, + "grad_norm": 0.103515625, + "learning_rate": 0.0011528277405940019, + "loss": 0.0596, + "step": 46725 + }, + { + "epoch": 0.40560411801980883, + "grad_norm": 0.2109375, + "learning_rate": 0.001152798029424356, + "loss": 0.1211, + "step": 46726 + }, + { + "epoch": 0.405612798500013, + "grad_norm": 0.53125, + "learning_rate": 0.0011527683181969718, + "loss": 0.1143, + "step": 46727 + }, + { + "epoch": 0.40562147898021716, + "grad_norm": 0.28515625, + "learning_rate": 0.0011527386069118799, + "loss": 0.0898, + "step": 46728 + }, + { + "epoch": 0.40563015946042136, + "grad_norm": 0.138671875, + "learning_rate": 0.0011527088955691144, + "loss": 0.1016, + "step": 46729 + }, + { + "epoch": 0.4056388399406255, + "grad_norm": 0.287109375, + "learning_rate": 0.0011526791841687074, + "loss": 0.0747, + "step": 46730 + }, + { + "epoch": 0.4056475204208297, + "grad_norm": 0.3359375, + "learning_rate": 0.001152649472710691, + "loss": 0.1099, + "step": 46731 + }, + { + "epoch": 0.4056562009010338, + "grad_norm": 0.498046875, + "learning_rate": 0.0011526197611950977, + "loss": 0.0947, + "step": 46732 + }, + { + "epoch": 0.405664881381238, + "grad_norm": 0.310546875, + "learning_rate": 0.0011525900496219606, + "loss": 0.0879, + "step": 46733 + }, + { + "epoch": 0.40567356186144216, + "grad_norm": 0.29296875, + "learning_rate": 0.001152560337991312, + "loss": 0.1299, + "step": 46734 + }, + { + "epoch": 0.40568224234164635, + "grad_norm": 0.54296875, + "learning_rate": 0.001152530626303184, + "loss": 0.083, + "step": 46735 + }, + { + "epoch": 0.4056909228218505, + "grad_norm": 0.423828125, + "learning_rate": 0.0011525009145576088, + "loss": 0.1064, + "step": 46736 + }, + { + "epoch": 0.4056996033020547, + "grad_norm": 0.255859375, + "learning_rate": 0.0011524712027546201, + "loss": 0.1426, + "step": 46737 + }, + { + "epoch": 0.4057082837822588, + "grad_norm": 0.18359375, + "learning_rate": 0.0011524414908942492, + "loss": 0.0859, + "step": 46738 + }, + { + "epoch": 0.405716964262463, + "grad_norm": 0.53125, + "learning_rate": 0.0011524117789765294, + "loss": 0.1182, + "step": 46739 + }, + { + "epoch": 0.40572564474266715, + "grad_norm": 0.142578125, + "learning_rate": 0.0011523820670014927, + "loss": 0.1216, + "step": 46740 + }, + { + "epoch": 0.40573432522287134, + "grad_norm": 0.154296875, + "learning_rate": 0.0011523523549691714, + "loss": 0.0869, + "step": 46741 + }, + { + "epoch": 0.4057430057030755, + "grad_norm": 0.1318359375, + "learning_rate": 0.001152322642879599, + "loss": 0.0864, + "step": 46742 + }, + { + "epoch": 0.40575168618327967, + "grad_norm": 0.1865234375, + "learning_rate": 0.001152292930732807, + "loss": 0.126, + "step": 46743 + }, + { + "epoch": 0.4057603666634838, + "grad_norm": 2.0, + "learning_rate": 0.0011522632185288287, + "loss": 0.2207, + "step": 46744 + }, + { + "epoch": 0.405769047143688, + "grad_norm": 0.5078125, + "learning_rate": 0.0011522335062676954, + "loss": 0.0815, + "step": 46745 + }, + { + "epoch": 0.40577772762389214, + "grad_norm": 0.2333984375, + "learning_rate": 0.0011522037939494408, + "loss": 0.1152, + "step": 46746 + }, + { + "epoch": 0.40578640810409633, + "grad_norm": 0.373046875, + "learning_rate": 0.0011521740815740968, + "loss": 0.0684, + "step": 46747 + }, + { + "epoch": 0.40579508858430047, + "grad_norm": 0.162109375, + "learning_rate": 0.0011521443691416963, + "loss": 0.0879, + "step": 46748 + }, + { + "epoch": 0.40580376906450466, + "grad_norm": 0.2001953125, + "learning_rate": 0.0011521146566522708, + "loss": 0.1328, + "step": 
46749 + }, + { + "epoch": 0.4058124495447088, + "grad_norm": 0.17578125, + "learning_rate": 0.0011520849441058539, + "loss": 0.1426, + "step": 46750 + }, + { + "epoch": 0.405821130024913, + "grad_norm": 0.3203125, + "learning_rate": 0.0011520552315024777, + "loss": 0.1182, + "step": 46751 + }, + { + "epoch": 0.40582981050511713, + "grad_norm": 0.6640625, + "learning_rate": 0.0011520255188421748, + "loss": 0.1523, + "step": 46752 + }, + { + "epoch": 0.4058384909853213, + "grad_norm": 0.12451171875, + "learning_rate": 0.0011519958061249774, + "loss": 0.0903, + "step": 46753 + }, + { + "epoch": 0.40584717146552546, + "grad_norm": 0.053466796875, + "learning_rate": 0.001151966093350918, + "loss": 0.0654, + "step": 46754 + }, + { + "epoch": 0.40585585194572965, + "grad_norm": 0.671875, + "learning_rate": 0.0011519363805200297, + "loss": 0.1475, + "step": 46755 + }, + { + "epoch": 0.4058645324259338, + "grad_norm": 0.296875, + "learning_rate": 0.0011519066676323442, + "loss": 0.167, + "step": 46756 + }, + { + "epoch": 0.405873212906138, + "grad_norm": 0.53515625, + "learning_rate": 0.0011518769546878945, + "loss": 0.0977, + "step": 46757 + }, + { + "epoch": 0.4058818933863421, + "grad_norm": 0.15625, + "learning_rate": 0.0011518472416867126, + "loss": 0.1445, + "step": 46758 + }, + { + "epoch": 0.4058905738665463, + "grad_norm": 0.32421875, + "learning_rate": 0.0011518175286288318, + "loss": 0.084, + "step": 46759 + }, + { + "epoch": 0.40589925434675045, + "grad_norm": 0.166015625, + "learning_rate": 0.001151787815514284, + "loss": 0.1172, + "step": 46760 + }, + { + "epoch": 0.40590793482695464, + "grad_norm": 0.11962890625, + "learning_rate": 0.0011517581023431018, + "loss": 0.123, + "step": 46761 + }, + { + "epoch": 0.4059166153071588, + "grad_norm": 0.0869140625, + "learning_rate": 0.0011517283891153175, + "loss": 0.0938, + "step": 46762 + }, + { + "epoch": 0.40592529578736297, + "grad_norm": 0.09375, + "learning_rate": 0.0011516986758309642, + "loss": 0.0723, + "step": 46763 + }, + { + "epoch": 0.4059339762675671, + "grad_norm": 0.2333984375, + "learning_rate": 0.0011516689624900738, + "loss": 0.1416, + "step": 46764 + }, + { + "epoch": 0.4059426567477713, + "grad_norm": 0.419921875, + "learning_rate": 0.0011516392490926793, + "loss": 0.1465, + "step": 46765 + }, + { + "epoch": 0.40595133722797544, + "grad_norm": 0.396484375, + "learning_rate": 0.0011516095356388122, + "loss": 0.0859, + "step": 46766 + }, + { + "epoch": 0.40596001770817963, + "grad_norm": 0.1376953125, + "learning_rate": 0.0011515798221285063, + "loss": 0.0996, + "step": 46767 + }, + { + "epoch": 0.40596869818838377, + "grad_norm": 0.06689453125, + "learning_rate": 0.0011515501085617933, + "loss": 0.0737, + "step": 46768 + }, + { + "epoch": 0.40597737866858796, + "grad_norm": 0.2236328125, + "learning_rate": 0.001151520394938706, + "loss": 0.1113, + "step": 46769 + }, + { + "epoch": 0.4059860591487921, + "grad_norm": 0.68359375, + "learning_rate": 0.0011514906812592767, + "loss": 0.123, + "step": 46770 + }, + { + "epoch": 0.4059947396289963, + "grad_norm": 0.373046875, + "learning_rate": 0.0011514609675235374, + "loss": 0.1138, + "step": 46771 + }, + { + "epoch": 0.40600342010920043, + "grad_norm": 0.3984375, + "learning_rate": 0.001151431253731522, + "loss": 0.0898, + "step": 46772 + }, + { + "epoch": 0.4060121005894046, + "grad_norm": 0.1669921875, + "learning_rate": 0.0011514015398832618, + "loss": 0.0903, + "step": 46773 + }, + { + "epoch": 0.40602078106960876, + "grad_norm": 0.294921875, + "learning_rate": 
0.0011513718259787897, + "loss": 0.0928, + "step": 46774 + }, + { + "epoch": 0.40602946154981295, + "grad_norm": 0.2412109375, + "learning_rate": 0.001151342112018138, + "loss": 0.0879, + "step": 46775 + }, + { + "epoch": 0.4060381420300171, + "grad_norm": 0.111328125, + "learning_rate": 0.0011513123980013396, + "loss": 0.0977, + "step": 46776 + }, + { + "epoch": 0.4060468225102213, + "grad_norm": 0.494140625, + "learning_rate": 0.0011512826839284265, + "loss": 0.0977, + "step": 46777 + }, + { + "epoch": 0.4060555029904254, + "grad_norm": 0.08740234375, + "learning_rate": 0.0011512529697994315, + "loss": 0.1055, + "step": 46778 + }, + { + "epoch": 0.4060641834706296, + "grad_norm": 0.1142578125, + "learning_rate": 0.0011512232556143867, + "loss": 0.1123, + "step": 46779 + }, + { + "epoch": 0.40607286395083375, + "grad_norm": 0.578125, + "learning_rate": 0.0011511935413733255, + "loss": 0.1035, + "step": 46780 + }, + { + "epoch": 0.40608154443103794, + "grad_norm": 0.2041015625, + "learning_rate": 0.0011511638270762796, + "loss": 0.1484, + "step": 46781 + }, + { + "epoch": 0.4060902249112421, + "grad_norm": 0.328125, + "learning_rate": 0.0011511341127232817, + "loss": 0.1191, + "step": 46782 + }, + { + "epoch": 0.4060989053914463, + "grad_norm": 0.26171875, + "learning_rate": 0.0011511043983143644, + "loss": 0.1348, + "step": 46783 + }, + { + "epoch": 0.4061075858716504, + "grad_norm": 0.181640625, + "learning_rate": 0.0011510746838495596, + "loss": 0.0791, + "step": 46784 + }, + { + "epoch": 0.4061162663518546, + "grad_norm": 0.70703125, + "learning_rate": 0.001151044969328901, + "loss": 0.0898, + "step": 46785 + }, + { + "epoch": 0.40612494683205874, + "grad_norm": 0.1728515625, + "learning_rate": 0.00115101525475242, + "loss": 0.0723, + "step": 46786 + }, + { + "epoch": 0.40613362731226293, + "grad_norm": 0.36328125, + "learning_rate": 0.0011509855401201495, + "loss": 0.0718, + "step": 46787 + }, + { + "epoch": 0.40614230779246707, + "grad_norm": 0.1513671875, + "learning_rate": 0.0011509558254321217, + "loss": 0.0679, + "step": 46788 + }, + { + "epoch": 0.40615098827267127, + "grad_norm": 0.0654296875, + "learning_rate": 0.0011509261106883697, + "loss": 0.0752, + "step": 46789 + }, + { + "epoch": 0.4061596687528754, + "grad_norm": 0.09326171875, + "learning_rate": 0.0011508963958889258, + "loss": 0.0908, + "step": 46790 + }, + { + "epoch": 0.4061683492330796, + "grad_norm": 0.3359375, + "learning_rate": 0.0011508666810338222, + "loss": 0.1875, + "step": 46791 + }, + { + "epoch": 0.40617702971328373, + "grad_norm": 0.2265625, + "learning_rate": 0.0011508369661230916, + "loss": 0.1162, + "step": 46792 + }, + { + "epoch": 0.4061857101934879, + "grad_norm": 0.75, + "learning_rate": 0.0011508072511567664, + "loss": 0.0918, + "step": 46793 + }, + { + "epoch": 0.40619439067369206, + "grad_norm": 0.404296875, + "learning_rate": 0.0011507775361348793, + "loss": 0.0884, + "step": 46794 + }, + { + "epoch": 0.40620307115389626, + "grad_norm": 0.162109375, + "learning_rate": 0.0011507478210574626, + "loss": 0.1279, + "step": 46795 + }, + { + "epoch": 0.4062117516341004, + "grad_norm": 0.12255859375, + "learning_rate": 0.0011507181059245487, + "loss": 0.1128, + "step": 46796 + }, + { + "epoch": 0.4062204321143046, + "grad_norm": 0.208984375, + "learning_rate": 0.0011506883907361702, + "loss": 0.1187, + "step": 46797 + }, + { + "epoch": 0.4062291125945087, + "grad_norm": 0.34765625, + "learning_rate": 0.0011506586754923597, + "loss": 0.0923, + "step": 46798 + }, + { + "epoch": 0.4062377930747129, + 
"grad_norm": 0.25, + "learning_rate": 0.00115062896019315, + "loss": 0.1113, + "step": 46799 + }, + { + "epoch": 0.40624647355491705, + "grad_norm": 0.26953125, + "learning_rate": 0.001150599244838573, + "loss": 0.1367, + "step": 46800 + }, + { + "epoch": 0.40625515403512125, + "grad_norm": 0.376953125, + "learning_rate": 0.0011505695294286608, + "loss": 0.1367, + "step": 46801 + }, + { + "epoch": 0.4062638345153254, + "grad_norm": 0.2734375, + "learning_rate": 0.0011505398139634473, + "loss": 0.1045, + "step": 46802 + }, + { + "epoch": 0.4062725149955296, + "grad_norm": 0.279296875, + "learning_rate": 0.001150510098442964, + "loss": 0.0767, + "step": 46803 + }, + { + "epoch": 0.4062811954757337, + "grad_norm": 0.361328125, + "learning_rate": 0.0011504803828672437, + "loss": 0.0737, + "step": 46804 + }, + { + "epoch": 0.4062898759559379, + "grad_norm": 0.8359375, + "learning_rate": 0.0011504506672363186, + "loss": 0.1387, + "step": 46805 + }, + { + "epoch": 0.40629855643614204, + "grad_norm": 0.6015625, + "learning_rate": 0.0011504209515502217, + "loss": 0.0791, + "step": 46806 + }, + { + "epoch": 0.40630723691634624, + "grad_norm": 0.08203125, + "learning_rate": 0.0011503912358089847, + "loss": 0.0889, + "step": 46807 + }, + { + "epoch": 0.4063159173965504, + "grad_norm": 0.138671875, + "learning_rate": 0.0011503615200126412, + "loss": 0.0884, + "step": 46808 + }, + { + "epoch": 0.40632459787675457, + "grad_norm": 0.54296875, + "learning_rate": 0.001150331804161223, + "loss": 0.0908, + "step": 46809 + }, + { + "epoch": 0.4063332783569587, + "grad_norm": 0.0830078125, + "learning_rate": 0.0011503020882547622, + "loss": 0.0859, + "step": 46810 + }, + { + "epoch": 0.4063419588371629, + "grad_norm": 0.294921875, + "learning_rate": 0.0011502723722932923, + "loss": 0.1328, + "step": 46811 + }, + { + "epoch": 0.40635063931736704, + "grad_norm": 0.70703125, + "learning_rate": 0.001150242656276845, + "loss": 0.209, + "step": 46812 + }, + { + "epoch": 0.40635931979757123, + "grad_norm": 0.11083984375, + "learning_rate": 0.0011502129402054533, + "loss": 0.1035, + "step": 46813 + }, + { + "epoch": 0.40636800027777537, + "grad_norm": 0.345703125, + "learning_rate": 0.0011501832240791493, + "loss": 0.0811, + "step": 46814 + }, + { + "epoch": 0.40637668075797956, + "grad_norm": 0.228515625, + "learning_rate": 0.0011501535078979658, + "loss": 0.0859, + "step": 46815 + }, + { + "epoch": 0.4063853612381837, + "grad_norm": 0.78125, + "learning_rate": 0.001150123791661935, + "loss": 0.0806, + "step": 46816 + }, + { + "epoch": 0.4063940417183879, + "grad_norm": 0.62109375, + "learning_rate": 0.0011500940753710897, + "loss": 0.1035, + "step": 46817 + }, + { + "epoch": 0.406402722198592, + "grad_norm": 0.376953125, + "learning_rate": 0.0011500643590254623, + "loss": 0.0796, + "step": 46818 + }, + { + "epoch": 0.4064114026787962, + "grad_norm": 0.173828125, + "learning_rate": 0.0011500346426250852, + "loss": 0.1309, + "step": 46819 + }, + { + "epoch": 0.40642008315900036, + "grad_norm": 0.30859375, + "learning_rate": 0.0011500049261699912, + "loss": 0.1553, + "step": 46820 + }, + { + "epoch": 0.4064287636392045, + "grad_norm": 0.310546875, + "learning_rate": 0.0011499752096602123, + "loss": 0.1387, + "step": 46821 + }, + { + "epoch": 0.4064374441194087, + "grad_norm": 0.1875, + "learning_rate": 0.0011499454930957815, + "loss": 0.0903, + "step": 46822 + }, + { + "epoch": 0.4064461245996128, + "grad_norm": 0.19140625, + "learning_rate": 0.0011499157764767308, + "loss": 0.0864, + "step": 46823 + }, + { + "epoch": 
0.406454805079817, + "grad_norm": 0.08154296875, + "learning_rate": 0.001149886059803093, + "loss": 0.0947, + "step": 46824 + }, + { + "epoch": 0.40646348556002115, + "grad_norm": 0.404296875, + "learning_rate": 0.0011498563430749006, + "loss": 0.1025, + "step": 46825 + }, + { + "epoch": 0.40647216604022535, + "grad_norm": 0.10888671875, + "learning_rate": 0.0011498266262921861, + "loss": 0.1123, + "step": 46826 + }, + { + "epoch": 0.4064808465204295, + "grad_norm": 0.142578125, + "learning_rate": 0.0011497969094549816, + "loss": 0.0698, + "step": 46827 + }, + { + "epoch": 0.4064895270006337, + "grad_norm": 0.07275390625, + "learning_rate": 0.0011497671925633204, + "loss": 0.0908, + "step": 46828 + }, + { + "epoch": 0.4064982074808378, + "grad_norm": 0.42578125, + "learning_rate": 0.0011497374756172345, + "loss": 0.0645, + "step": 46829 + }, + { + "epoch": 0.406506887961042, + "grad_norm": 0.09814453125, + "learning_rate": 0.0011497077586167567, + "loss": 0.1553, + "step": 46830 + }, + { + "epoch": 0.40651556844124614, + "grad_norm": 1.0703125, + "learning_rate": 0.0011496780415619185, + "loss": 0.3672, + "step": 46831 + }, + { + "epoch": 0.40652424892145034, + "grad_norm": 0.177734375, + "learning_rate": 0.0011496483244527536, + "loss": 0.0986, + "step": 46832 + }, + { + "epoch": 0.4065329294016545, + "grad_norm": 2.265625, + "learning_rate": 0.001149618607289294, + "loss": 0.2383, + "step": 46833 + }, + { + "epoch": 0.40654160988185867, + "grad_norm": 0.375, + "learning_rate": 0.0011495888900715724, + "loss": 0.1572, + "step": 46834 + }, + { + "epoch": 0.4065502903620628, + "grad_norm": 0.216796875, + "learning_rate": 0.0011495591727996208, + "loss": 0.1035, + "step": 46835 + }, + { + "epoch": 0.406558970842267, + "grad_norm": 0.11279296875, + "learning_rate": 0.0011495294554734721, + "loss": 0.0908, + "step": 46836 + }, + { + "epoch": 0.40656765132247114, + "grad_norm": 2.265625, + "learning_rate": 0.0011494997380931589, + "loss": 0.2051, + "step": 46837 + }, + { + "epoch": 0.40657633180267533, + "grad_norm": 0.1923828125, + "learning_rate": 0.0011494700206587133, + "loss": 0.0908, + "step": 46838 + }, + { + "epoch": 0.40658501228287947, + "grad_norm": 0.287109375, + "learning_rate": 0.0011494403031701683, + "loss": 0.1191, + "step": 46839 + }, + { + "epoch": 0.40659369276308366, + "grad_norm": 0.26171875, + "learning_rate": 0.0011494105856275558, + "loss": 0.125, + "step": 46840 + }, + { + "epoch": 0.4066023732432878, + "grad_norm": 0.7109375, + "learning_rate": 0.001149380868030909, + "loss": 0.1045, + "step": 46841 + }, + { + "epoch": 0.406611053723492, + "grad_norm": 0.1708984375, + "learning_rate": 0.0011493511503802597, + "loss": 0.0562, + "step": 46842 + }, + { + "epoch": 0.4066197342036961, + "grad_norm": 0.1064453125, + "learning_rate": 0.0011493214326756412, + "loss": 0.0806, + "step": 46843 + }, + { + "epoch": 0.4066284146839003, + "grad_norm": 0.353515625, + "learning_rate": 0.001149291714917085, + "loss": 0.0957, + "step": 46844 + }, + { + "epoch": 0.40663709516410446, + "grad_norm": 0.3671875, + "learning_rate": 0.0011492619971046243, + "loss": 0.106, + "step": 46845 + }, + { + "epoch": 0.40664577564430865, + "grad_norm": 0.85546875, + "learning_rate": 0.0011492322792382914, + "loss": 0.0996, + "step": 46846 + }, + { + "epoch": 0.4066544561245128, + "grad_norm": 0.326171875, + "learning_rate": 0.0011492025613181187, + "loss": 0.0732, + "step": 46847 + }, + { + "epoch": 0.406663136604717, + "grad_norm": 0.07373046875, + "learning_rate": 0.001149172843344139, + "loss": 
0.0908, + "step": 46848 + }, + { + "epoch": 0.4066718170849211, + "grad_norm": 0.38671875, + "learning_rate": 0.0011491431253163847, + "loss": 0.1016, + "step": 46849 + }, + { + "epoch": 0.4066804975651253, + "grad_norm": 0.1669921875, + "learning_rate": 0.0011491134072348884, + "loss": 0.0669, + "step": 46850 + }, + { + "epoch": 0.40668917804532945, + "grad_norm": 0.1533203125, + "learning_rate": 0.001149083689099682, + "loss": 0.1299, + "step": 46851 + }, + { + "epoch": 0.40669785852553364, + "grad_norm": 0.150390625, + "learning_rate": 0.0011490539709107985, + "loss": 0.1094, + "step": 46852 + }, + { + "epoch": 0.4067065390057378, + "grad_norm": 0.2041015625, + "learning_rate": 0.0011490242526682702, + "loss": 0.1182, + "step": 46853 + }, + { + "epoch": 0.40671521948594197, + "grad_norm": 0.427734375, + "learning_rate": 0.00114899453437213, + "loss": 0.1094, + "step": 46854 + }, + { + "epoch": 0.4067238999661461, + "grad_norm": 0.1806640625, + "learning_rate": 0.00114896481602241, + "loss": 0.1016, + "step": 46855 + }, + { + "epoch": 0.4067325804463503, + "grad_norm": 0.10791015625, + "learning_rate": 0.0011489350976191426, + "loss": 0.1045, + "step": 46856 + }, + { + "epoch": 0.40674126092655444, + "grad_norm": 0.08740234375, + "learning_rate": 0.0011489053791623608, + "loss": 0.0957, + "step": 46857 + }, + { + "epoch": 0.40674994140675863, + "grad_norm": 0.2158203125, + "learning_rate": 0.0011488756606520968, + "loss": 0.124, + "step": 46858 + }, + { + "epoch": 0.40675862188696277, + "grad_norm": 0.205078125, + "learning_rate": 0.001148845942088383, + "loss": 0.0996, + "step": 46859 + }, + { + "epoch": 0.40676730236716696, + "grad_norm": 0.271484375, + "learning_rate": 0.0011488162234712521, + "loss": 0.0938, + "step": 46860 + }, + { + "epoch": 0.4067759828473711, + "grad_norm": 0.09912109375, + "learning_rate": 0.0011487865048007365, + "loss": 0.1016, + "step": 46861 + }, + { + "epoch": 0.4067846633275753, + "grad_norm": 0.322265625, + "learning_rate": 0.0011487567860768687, + "loss": 0.0708, + "step": 46862 + }, + { + "epoch": 0.40679334380777943, + "grad_norm": 0.171875, + "learning_rate": 0.001148727067299681, + "loss": 0.1719, + "step": 46863 + }, + { + "epoch": 0.4068020242879836, + "grad_norm": 0.0859375, + "learning_rate": 0.0011486973484692064, + "loss": 0.0713, + "step": 46864 + }, + { + "epoch": 0.40681070476818776, + "grad_norm": 0.369140625, + "learning_rate": 0.001148667629585477, + "loss": 0.1289, + "step": 46865 + }, + { + "epoch": 0.40681938524839195, + "grad_norm": 0.4453125, + "learning_rate": 0.0011486379106485252, + "loss": 0.1738, + "step": 46866 + }, + { + "epoch": 0.4068280657285961, + "grad_norm": 0.1708984375, + "learning_rate": 0.001148608191658384, + "loss": 0.0942, + "step": 46867 + }, + { + "epoch": 0.4068367462088003, + "grad_norm": 0.375, + "learning_rate": 0.0011485784726150857, + "loss": 0.0918, + "step": 46868 + }, + { + "epoch": 0.4068454266890044, + "grad_norm": 0.37890625, + "learning_rate": 0.0011485487535186625, + "loss": 0.0957, + "step": 46869 + }, + { + "epoch": 0.4068541071692086, + "grad_norm": 0.37109375, + "learning_rate": 0.001148519034369147, + "loss": 0.1162, + "step": 46870 + }, + { + "epoch": 0.40686278764941275, + "grad_norm": 0.26953125, + "learning_rate": 0.001148489315166572, + "loss": 0.0869, + "step": 46871 + }, + { + "epoch": 0.40687146812961694, + "grad_norm": 0.1962890625, + "learning_rate": 0.0011484595959109698, + "loss": 0.0884, + "step": 46872 + }, + { + "epoch": 0.4068801486098211, + "grad_norm": 0.6484375, + 
"learning_rate": 0.001148429876602373, + "loss": 0.0889, + "step": 46873 + }, + { + "epoch": 0.4068888290900253, + "grad_norm": 0.10791015625, + "learning_rate": 0.0011484001572408138, + "loss": 0.127, + "step": 46874 + }, + { + "epoch": 0.4068975095702294, + "grad_norm": 0.36328125, + "learning_rate": 0.0011483704378263248, + "loss": 0.0889, + "step": 46875 + }, + { + "epoch": 0.4069061900504336, + "grad_norm": 0.19921875, + "learning_rate": 0.001148340718358939, + "loss": 0.1777, + "step": 46876 + }, + { + "epoch": 0.40691487053063774, + "grad_norm": 0.46484375, + "learning_rate": 0.0011483109988386885, + "loss": 0.1143, + "step": 46877 + }, + { + "epoch": 0.40692355101084193, + "grad_norm": 0.10791015625, + "learning_rate": 0.0011482812792656056, + "loss": 0.0986, + "step": 46878 + }, + { + "epoch": 0.40693223149104607, + "grad_norm": 0.27734375, + "learning_rate": 0.0011482515596397229, + "loss": 0.1621, + "step": 46879 + }, + { + "epoch": 0.40694091197125026, + "grad_norm": 0.22265625, + "learning_rate": 0.0011482218399610734, + "loss": 0.082, + "step": 46880 + }, + { + "epoch": 0.4069495924514544, + "grad_norm": 0.255859375, + "learning_rate": 0.001148192120229689, + "loss": 0.0918, + "step": 46881 + }, + { + "epoch": 0.4069582729316586, + "grad_norm": 0.099609375, + "learning_rate": 0.0011481624004456026, + "loss": 0.084, + "step": 46882 + }, + { + "epoch": 0.40696695341186273, + "grad_norm": 0.298828125, + "learning_rate": 0.0011481326806088463, + "loss": 0.126, + "step": 46883 + }, + { + "epoch": 0.4069756338920669, + "grad_norm": 0.2734375, + "learning_rate": 0.0011481029607194526, + "loss": 0.1084, + "step": 46884 + }, + { + "epoch": 0.40698431437227106, + "grad_norm": 0.515625, + "learning_rate": 0.0011480732407774547, + "loss": 0.0967, + "step": 46885 + }, + { + "epoch": 0.40699299485247525, + "grad_norm": 0.12158203125, + "learning_rate": 0.0011480435207828845, + "loss": 0.0933, + "step": 46886 + }, + { + "epoch": 0.4070016753326794, + "grad_norm": 0.8359375, + "learning_rate": 0.0011480138007357746, + "loss": 0.1074, + "step": 46887 + }, + { + "epoch": 0.4070103558128836, + "grad_norm": 0.46875, + "learning_rate": 0.0011479840806361574, + "loss": 0.085, + "step": 46888 + }, + { + "epoch": 0.4070190362930877, + "grad_norm": 0.474609375, + "learning_rate": 0.0011479543604840658, + "loss": 0.0884, + "step": 46889 + }, + { + "epoch": 0.4070277167732919, + "grad_norm": 0.55078125, + "learning_rate": 0.001147924640279532, + "loss": 0.0859, + "step": 46890 + }, + { + "epoch": 0.40703639725349605, + "grad_norm": 0.12890625, + "learning_rate": 0.0011478949200225883, + "loss": 0.0645, + "step": 46891 + }, + { + "epoch": 0.40704507773370024, + "grad_norm": 0.158203125, + "learning_rate": 0.0011478651997132675, + "loss": 0.1021, + "step": 46892 + }, + { + "epoch": 0.4070537582139044, + "grad_norm": 0.11279296875, + "learning_rate": 0.001147835479351602, + "loss": 0.1396, + "step": 46893 + }, + { + "epoch": 0.4070624386941086, + "grad_norm": 0.5, + "learning_rate": 0.0011478057589376246, + "loss": 0.0977, + "step": 46894 + }, + { + "epoch": 0.4070711191743127, + "grad_norm": 0.0732421875, + "learning_rate": 0.0011477760384713672, + "loss": 0.0698, + "step": 46895 + }, + { + "epoch": 0.4070797996545169, + "grad_norm": 0.828125, + "learning_rate": 0.0011477463179528627, + "loss": 0.0728, + "step": 46896 + }, + { + "epoch": 0.40708848013472104, + "grad_norm": 0.2138671875, + "learning_rate": 0.0011477165973821438, + "loss": 0.1104, + "step": 46897 + }, + { + "epoch": 0.40709716061492524, + 
"grad_norm": 0.0849609375, + "learning_rate": 0.0011476868767592427, + "loss": 0.082, + "step": 46898 + }, + { + "epoch": 0.4071058410951294, + "grad_norm": 0.08935546875, + "learning_rate": 0.0011476571560841916, + "loss": 0.0791, + "step": 46899 + }, + { + "epoch": 0.40711452157533357, + "grad_norm": 0.267578125, + "learning_rate": 0.0011476274353570235, + "loss": 0.0732, + "step": 46900 + }, + { + "epoch": 0.4071232020555377, + "grad_norm": 0.12060546875, + "learning_rate": 0.0011475977145777713, + "loss": 0.1113, + "step": 46901 + }, + { + "epoch": 0.4071318825357419, + "grad_norm": 0.1435546875, + "learning_rate": 0.0011475679937464664, + "loss": 0.0938, + "step": 46902 + }, + { + "epoch": 0.40714056301594603, + "grad_norm": 0.55078125, + "learning_rate": 0.0011475382728631416, + "loss": 0.0889, + "step": 46903 + }, + { + "epoch": 0.4071492434961502, + "grad_norm": 0.09033203125, + "learning_rate": 0.00114750855192783, + "loss": 0.0732, + "step": 46904 + }, + { + "epoch": 0.40715792397635436, + "grad_norm": 0.162109375, + "learning_rate": 0.001147478830940564, + "loss": 0.0986, + "step": 46905 + }, + { + "epoch": 0.40716660445655856, + "grad_norm": 0.35546875, + "learning_rate": 0.0011474491099013756, + "loss": 0.127, + "step": 46906 + }, + { + "epoch": 0.4071752849367627, + "grad_norm": 0.482421875, + "learning_rate": 0.0011474193888102975, + "loss": 0.083, + "step": 46907 + }, + { + "epoch": 0.4071839654169669, + "grad_norm": 0.61328125, + "learning_rate": 0.0011473896676673625, + "loss": 0.0825, + "step": 46908 + }, + { + "epoch": 0.407192645897171, + "grad_norm": 0.578125, + "learning_rate": 0.0011473599464726025, + "loss": 0.0605, + "step": 46909 + }, + { + "epoch": 0.4072013263773752, + "grad_norm": 0.625, + "learning_rate": 0.0011473302252260507, + "loss": 0.0801, + "step": 46910 + }, + { + "epoch": 0.40721000685757935, + "grad_norm": 0.64453125, + "learning_rate": 0.0011473005039277393, + "loss": 0.1221, + "step": 46911 + }, + { + "epoch": 0.40721868733778355, + "grad_norm": 0.2099609375, + "learning_rate": 0.0011472707825777007, + "loss": 0.0718, + "step": 46912 + }, + { + "epoch": 0.4072273678179877, + "grad_norm": 0.185546875, + "learning_rate": 0.0011472410611759673, + "loss": 0.1201, + "step": 46913 + }, + { + "epoch": 0.4072360482981919, + "grad_norm": 0.201171875, + "learning_rate": 0.0011472113397225719, + "loss": 0.1084, + "step": 46914 + }, + { + "epoch": 0.407244728778396, + "grad_norm": 0.93359375, + "learning_rate": 0.001147181618217547, + "loss": 0.1177, + "step": 46915 + }, + { + "epoch": 0.4072534092586002, + "grad_norm": 0.447265625, + "learning_rate": 0.001147151896660925, + "loss": 0.1182, + "step": 46916 + }, + { + "epoch": 0.40726208973880434, + "grad_norm": 0.5390625, + "learning_rate": 0.0011471221750527383, + "loss": 0.0801, + "step": 46917 + }, + { + "epoch": 0.40727077021900854, + "grad_norm": 0.46875, + "learning_rate": 0.0011470924533930197, + "loss": 0.1162, + "step": 46918 + }, + { + "epoch": 0.4072794506992127, + "grad_norm": 0.359375, + "learning_rate": 0.0011470627316818012, + "loss": 0.0732, + "step": 46919 + }, + { + "epoch": 0.40728813117941687, + "grad_norm": 0.16015625, + "learning_rate": 0.0011470330099191157, + "loss": 0.1523, + "step": 46920 + }, + { + "epoch": 0.407296811659621, + "grad_norm": 0.283203125, + "learning_rate": 0.0011470032881049958, + "loss": 0.1172, + "step": 46921 + }, + { + "epoch": 0.4073054921398252, + "grad_norm": 0.37890625, + "learning_rate": 0.0011469735662394734, + "loss": 0.1377, + "step": 46922 + }, + { + 
"epoch": 0.40731417262002934, + "grad_norm": 0.408203125, + "learning_rate": 0.0011469438443225816, + "loss": 0.0894, + "step": 46923 + }, + { + "epoch": 0.40732285310023353, + "grad_norm": 0.28515625, + "learning_rate": 0.0011469141223543527, + "loss": 0.1143, + "step": 46924 + }, + { + "epoch": 0.40733153358043767, + "grad_norm": 0.6875, + "learning_rate": 0.0011468844003348193, + "loss": 0.1104, + "step": 46925 + }, + { + "epoch": 0.40734021406064186, + "grad_norm": 0.296875, + "learning_rate": 0.0011468546782640138, + "loss": 0.1074, + "step": 46926 + }, + { + "epoch": 0.407348894540846, + "grad_norm": 0.3515625, + "learning_rate": 0.0011468249561419688, + "loss": 0.1001, + "step": 46927 + }, + { + "epoch": 0.4073575750210502, + "grad_norm": 0.1923828125, + "learning_rate": 0.0011467952339687167, + "loss": 0.124, + "step": 46928 + }, + { + "epoch": 0.4073662555012543, + "grad_norm": 0.41015625, + "learning_rate": 0.00114676551174429, + "loss": 0.127, + "step": 46929 + }, + { + "epoch": 0.4073749359814585, + "grad_norm": 0.27734375, + "learning_rate": 0.0011467357894687214, + "loss": 0.1221, + "step": 46930 + }, + { + "epoch": 0.40738361646166266, + "grad_norm": 0.1923828125, + "learning_rate": 0.0011467060671420432, + "loss": 0.1172, + "step": 46931 + }, + { + "epoch": 0.40739229694186685, + "grad_norm": 0.2734375, + "learning_rate": 0.0011466763447642873, + "loss": 0.125, + "step": 46932 + }, + { + "epoch": 0.407400977422071, + "grad_norm": 0.1650390625, + "learning_rate": 0.0011466466223354877, + "loss": 0.0645, + "step": 46933 + }, + { + "epoch": 0.4074096579022752, + "grad_norm": 0.56640625, + "learning_rate": 0.0011466168998556757, + "loss": 0.126, + "step": 46934 + }, + { + "epoch": 0.4074183383824793, + "grad_norm": 0.41015625, + "learning_rate": 0.001146587177324884, + "loss": 0.0952, + "step": 46935 + }, + { + "epoch": 0.4074270188626835, + "grad_norm": 0.177734375, + "learning_rate": 0.0011465574547431456, + "loss": 0.125, + "step": 46936 + }, + { + "epoch": 0.40743569934288765, + "grad_norm": 0.68359375, + "learning_rate": 0.0011465277321104925, + "loss": 0.1963, + "step": 46937 + }, + { + "epoch": 0.40744437982309184, + "grad_norm": 0.37890625, + "learning_rate": 0.0011464980094269574, + "loss": 0.127, + "step": 46938 + }, + { + "epoch": 0.407453060303296, + "grad_norm": 0.28515625, + "learning_rate": 0.0011464682866925728, + "loss": 0.0728, + "step": 46939 + }, + { + "epoch": 0.40746174078350017, + "grad_norm": 0.1279296875, + "learning_rate": 0.0011464385639073712, + "loss": 0.1201, + "step": 46940 + }, + { + "epoch": 0.4074704212637043, + "grad_norm": 0.1015625, + "learning_rate": 0.001146408841071385, + "loss": 0.0889, + "step": 46941 + }, + { + "epoch": 0.4074791017439085, + "grad_norm": 0.107421875, + "learning_rate": 0.0011463791181846469, + "loss": 0.0854, + "step": 46942 + }, + { + "epoch": 0.40748778222411264, + "grad_norm": 0.1611328125, + "learning_rate": 0.001146349395247189, + "loss": 0.1006, + "step": 46943 + }, + { + "epoch": 0.4074964627043168, + "grad_norm": 0.2216796875, + "learning_rate": 0.0011463196722590442, + "loss": 0.0986, + "step": 46944 + }, + { + "epoch": 0.40750514318452097, + "grad_norm": 0.2490234375, + "learning_rate": 0.0011462899492202451, + "loss": 0.1123, + "step": 46945 + }, + { + "epoch": 0.4075138236647251, + "grad_norm": 0.16796875, + "learning_rate": 0.0011462602261308242, + "loss": 0.1445, + "step": 46946 + }, + { + "epoch": 0.4075225041449293, + "grad_norm": 0.220703125, + "learning_rate": 0.0011462305029908135, + "loss": 0.0811, 
+ "step": 46947 + }, + { + "epoch": 0.40753118462513344, + "grad_norm": 0.251953125, + "learning_rate": 0.0011462007798002459, + "loss": 0.1221, + "step": 46948 + }, + { + "epoch": 0.40753986510533763, + "grad_norm": 0.11572265625, + "learning_rate": 0.0011461710565591537, + "loss": 0.0986, + "step": 46949 + }, + { + "epoch": 0.40754854558554177, + "grad_norm": 0.1103515625, + "learning_rate": 0.0011461413332675696, + "loss": 0.1055, + "step": 46950 + }, + { + "epoch": 0.40755722606574596, + "grad_norm": 0.2021484375, + "learning_rate": 0.0011461116099255257, + "loss": 0.0977, + "step": 46951 + }, + { + "epoch": 0.4075659065459501, + "grad_norm": 0.515625, + "learning_rate": 0.0011460818865330553, + "loss": 0.0903, + "step": 46952 + }, + { + "epoch": 0.4075745870261543, + "grad_norm": 0.6875, + "learning_rate": 0.0011460521630901903, + "loss": 0.085, + "step": 46953 + }, + { + "epoch": 0.4075832675063584, + "grad_norm": 0.2138671875, + "learning_rate": 0.0011460224395969634, + "loss": 0.0928, + "step": 46954 + }, + { + "epoch": 0.4075919479865626, + "grad_norm": 0.11376953125, + "learning_rate": 0.001145992716053407, + "loss": 0.1143, + "step": 46955 + }, + { + "epoch": 0.40760062846676676, + "grad_norm": 0.171875, + "learning_rate": 0.0011459629924595541, + "loss": 0.103, + "step": 46956 + }, + { + "epoch": 0.40760930894697095, + "grad_norm": 0.451171875, + "learning_rate": 0.001145933268815436, + "loss": 0.0908, + "step": 46957 + }, + { + "epoch": 0.4076179894271751, + "grad_norm": 0.12451171875, + "learning_rate": 0.0011459035451210863, + "loss": 0.1074, + "step": 46958 + }, + { + "epoch": 0.4076266699073793, + "grad_norm": 0.060302734375, + "learning_rate": 0.0011458738213765373, + "loss": 0.0737, + "step": 46959 + }, + { + "epoch": 0.4076353503875834, + "grad_norm": 0.177734375, + "learning_rate": 0.001145844097581821, + "loss": 0.0825, + "step": 46960 + }, + { + "epoch": 0.4076440308677876, + "grad_norm": 0.3984375, + "learning_rate": 0.0011458143737369704, + "loss": 0.1328, + "step": 46961 + }, + { + "epoch": 0.40765271134799175, + "grad_norm": 0.203125, + "learning_rate": 0.001145784649842018, + "loss": 0.1377, + "step": 46962 + }, + { + "epoch": 0.40766139182819594, + "grad_norm": 0.095703125, + "learning_rate": 0.0011457549258969966, + "loss": 0.105, + "step": 46963 + }, + { + "epoch": 0.4076700723084001, + "grad_norm": 0.1162109375, + "learning_rate": 0.0011457252019019377, + "loss": 0.0894, + "step": 46964 + }, + { + "epoch": 0.40767875278860427, + "grad_norm": 0.08154296875, + "learning_rate": 0.0011456954778568745, + "loss": 0.0918, + "step": 46965 + }, + { + "epoch": 0.4076874332688084, + "grad_norm": 0.96484375, + "learning_rate": 0.0011456657537618398, + "loss": 0.1162, + "step": 46966 + }, + { + "epoch": 0.4076961137490126, + "grad_norm": 0.75390625, + "learning_rate": 0.0011456360296168656, + "loss": 0.083, + "step": 46967 + }, + { + "epoch": 0.40770479422921674, + "grad_norm": 0.1689453125, + "learning_rate": 0.0011456063054219841, + "loss": 0.1221, + "step": 46968 + }, + { + "epoch": 0.40771347470942093, + "grad_norm": 0.193359375, + "learning_rate": 0.0011455765811772285, + "loss": 0.1143, + "step": 46969 + }, + { + "epoch": 0.40772215518962507, + "grad_norm": 0.181640625, + "learning_rate": 0.0011455468568826311, + "loss": 0.1279, + "step": 46970 + }, + { + "epoch": 0.40773083566982926, + "grad_norm": 0.306640625, + "learning_rate": 0.001145517132538224, + "loss": 0.0835, + "step": 46971 + }, + { + "epoch": 0.4077395161500334, + "grad_norm": 0.2294921875, + 
"learning_rate": 0.0011454874081440406, + "loss": 0.0947, + "step": 46972 + }, + { + "epoch": 0.4077481966302376, + "grad_norm": 0.1962890625, + "learning_rate": 0.0011454576837001127, + "loss": 0.1226, + "step": 46973 + }, + { + "epoch": 0.40775687711044173, + "grad_norm": 0.2080078125, + "learning_rate": 0.0011454279592064726, + "loss": 0.1084, + "step": 46974 + }, + { + "epoch": 0.4077655575906459, + "grad_norm": 0.2255859375, + "learning_rate": 0.0011453982346631534, + "loss": 0.103, + "step": 46975 + }, + { + "epoch": 0.40777423807085006, + "grad_norm": 0.1455078125, + "learning_rate": 0.0011453685100701876, + "loss": 0.0952, + "step": 46976 + }, + { + "epoch": 0.40778291855105425, + "grad_norm": 0.1015625, + "learning_rate": 0.0011453387854276072, + "loss": 0.0908, + "step": 46977 + }, + { + "epoch": 0.4077915990312584, + "grad_norm": 0.392578125, + "learning_rate": 0.001145309060735445, + "loss": 0.0767, + "step": 46978 + }, + { + "epoch": 0.4078002795114626, + "grad_norm": 0.10498046875, + "learning_rate": 0.0011452793359937334, + "loss": 0.103, + "step": 46979 + }, + { + "epoch": 0.4078089599916667, + "grad_norm": 0.208984375, + "learning_rate": 0.001145249611202505, + "loss": 0.125, + "step": 46980 + }, + { + "epoch": 0.4078176404718709, + "grad_norm": 0.2109375, + "learning_rate": 0.0011452198863617926, + "loss": 0.1162, + "step": 46981 + }, + { + "epoch": 0.40782632095207505, + "grad_norm": 0.1552734375, + "learning_rate": 0.0011451901614716285, + "loss": 0.0588, + "step": 46982 + }, + { + "epoch": 0.40783500143227924, + "grad_norm": 0.1240234375, + "learning_rate": 0.0011451604365320448, + "loss": 0.0918, + "step": 46983 + }, + { + "epoch": 0.4078436819124834, + "grad_norm": 0.2890625, + "learning_rate": 0.0011451307115430745, + "loss": 0.0952, + "step": 46984 + }, + { + "epoch": 0.4078523623926876, + "grad_norm": 0.158203125, + "learning_rate": 0.0011451009865047497, + "loss": 0.1758, + "step": 46985 + }, + { + "epoch": 0.4078610428728917, + "grad_norm": 0.51171875, + "learning_rate": 0.0011450712614171034, + "loss": 0.1396, + "step": 46986 + }, + { + "epoch": 0.4078697233530959, + "grad_norm": 0.2373046875, + "learning_rate": 0.0011450415362801675, + "loss": 0.1138, + "step": 46987 + }, + { + "epoch": 0.40787840383330004, + "grad_norm": 0.1025390625, + "learning_rate": 0.0011450118110939752, + "loss": 0.123, + "step": 46988 + }, + { + "epoch": 0.40788708431350423, + "grad_norm": 0.50390625, + "learning_rate": 0.0011449820858585582, + "loss": 0.127, + "step": 46989 + }, + { + "epoch": 0.40789576479370837, + "grad_norm": 0.21875, + "learning_rate": 0.0011449523605739501, + "loss": 0.1084, + "step": 46990 + }, + { + "epoch": 0.40790444527391256, + "grad_norm": 0.419921875, + "learning_rate": 0.0011449226352401823, + "loss": 0.0889, + "step": 46991 + }, + { + "epoch": 0.4079131257541167, + "grad_norm": 0.1337890625, + "learning_rate": 0.001144892909857288, + "loss": 0.1172, + "step": 46992 + }, + { + "epoch": 0.4079218062343209, + "grad_norm": 0.228515625, + "learning_rate": 0.0011448631844252996, + "loss": 0.0854, + "step": 46993 + }, + { + "epoch": 0.40793048671452503, + "grad_norm": 0.2451171875, + "learning_rate": 0.0011448334589442492, + "loss": 0.0752, + "step": 46994 + }, + { + "epoch": 0.4079391671947292, + "grad_norm": 0.1328125, + "learning_rate": 0.00114480373341417, + "loss": 0.1045, + "step": 46995 + }, + { + "epoch": 0.40794784767493336, + "grad_norm": 0.1259765625, + "learning_rate": 0.0011447740078350936, + "loss": 0.1064, + "step": 46996 + }, + { + "epoch": 
0.40795652815513755, + "grad_norm": 1.140625, + "learning_rate": 0.0011447442822070534, + "loss": 0.1104, + "step": 46997 + }, + { + "epoch": 0.4079652086353417, + "grad_norm": 0.56640625, + "learning_rate": 0.0011447145565300812, + "loss": 0.0889, + "step": 46998 + }, + { + "epoch": 0.4079738891155459, + "grad_norm": 0.1318359375, + "learning_rate": 0.0011446848308042101, + "loss": 0.126, + "step": 46999 + }, + { + "epoch": 0.40798256959575, + "grad_norm": 0.67578125, + "learning_rate": 0.001144655105029472, + "loss": 0.1445, + "step": 47000 + }, + { + "epoch": 0.4079912500759542, + "grad_norm": 0.5625, + "learning_rate": 0.0011446253792059, + "loss": 0.1348, + "step": 47001 + }, + { + "epoch": 0.40799993055615835, + "grad_norm": 0.5859375, + "learning_rate": 0.0011445956533335264, + "loss": 0.0757, + "step": 47002 + }, + { + "epoch": 0.40800861103636255, + "grad_norm": 0.328125, + "learning_rate": 0.0011445659274123838, + "loss": 0.1133, + "step": 47003 + }, + { + "epoch": 0.4080172915165667, + "grad_norm": 0.09130859375, + "learning_rate": 0.0011445362014425042, + "loss": 0.1025, + "step": 47004 + }, + { + "epoch": 0.4080259719967709, + "grad_norm": 0.62109375, + "learning_rate": 0.001144506475423921, + "loss": 0.0654, + "step": 47005 + }, + { + "epoch": 0.408034652476975, + "grad_norm": 0.79296875, + "learning_rate": 0.0011444767493566657, + "loss": 0.0918, + "step": 47006 + }, + { + "epoch": 0.4080433329571792, + "grad_norm": 0.099609375, + "learning_rate": 0.0011444470232407715, + "loss": 0.125, + "step": 47007 + }, + { + "epoch": 0.40805201343738334, + "grad_norm": 0.306640625, + "learning_rate": 0.0011444172970762707, + "loss": 0.0991, + "step": 47008 + }, + { + "epoch": 0.40806069391758754, + "grad_norm": 0.76953125, + "learning_rate": 0.0011443875708631957, + "loss": 0.3223, + "step": 47009 + }, + { + "epoch": 0.4080693743977917, + "grad_norm": 0.345703125, + "learning_rate": 0.0011443578446015791, + "loss": 0.1191, + "step": 47010 + }, + { + "epoch": 0.40807805487799587, + "grad_norm": 0.484375, + "learning_rate": 0.0011443281182914535, + "loss": 0.0732, + "step": 47011 + }, + { + "epoch": 0.4080867353582, + "grad_norm": 0.28125, + "learning_rate": 0.0011442983919328516, + "loss": 0.1074, + "step": 47012 + }, + { + "epoch": 0.4080954158384042, + "grad_norm": 0.291015625, + "learning_rate": 0.0011442686655258052, + "loss": 0.0645, + "step": 47013 + }, + { + "epoch": 0.40810409631860833, + "grad_norm": 0.390625, + "learning_rate": 0.0011442389390703476, + "loss": 0.1084, + "step": 47014 + }, + { + "epoch": 0.4081127767988125, + "grad_norm": 0.1650390625, + "learning_rate": 0.0011442092125665106, + "loss": 0.1309, + "step": 47015 + }, + { + "epoch": 0.40812145727901666, + "grad_norm": 0.08203125, + "learning_rate": 0.0011441794860143275, + "loss": 0.0825, + "step": 47016 + }, + { + "epoch": 0.40813013775922086, + "grad_norm": 0.1953125, + "learning_rate": 0.0011441497594138301, + "loss": 0.1035, + "step": 47017 + }, + { + "epoch": 0.408138818239425, + "grad_norm": 0.455078125, + "learning_rate": 0.001144120032765051, + "loss": 0.1309, + "step": 47018 + }, + { + "epoch": 0.4081474987196292, + "grad_norm": 0.337890625, + "learning_rate": 0.0011440903060680233, + "loss": 0.0791, + "step": 47019 + }, + { + "epoch": 0.4081561791998333, + "grad_norm": 0.1357421875, + "learning_rate": 0.0011440605793227791, + "loss": 0.1006, + "step": 47020 + }, + { + "epoch": 0.4081648596800375, + "grad_norm": 0.3203125, + "learning_rate": 0.0011440308525293506, + "loss": 0.123, + "step": 47021 + }, + { 
+ "epoch": 0.40817354016024165, + "grad_norm": 0.10498046875, + "learning_rate": 0.0011440011256877707, + "loss": 0.0874, + "step": 47022 + }, + { + "epoch": 0.40818222064044585, + "grad_norm": 0.27734375, + "learning_rate": 0.0011439713987980722, + "loss": 0.1064, + "step": 47023 + }, + { + "epoch": 0.40819090112065, + "grad_norm": 0.1845703125, + "learning_rate": 0.001143941671860287, + "loss": 0.1045, + "step": 47024 + }, + { + "epoch": 0.4081995816008542, + "grad_norm": 0.134765625, + "learning_rate": 0.0011439119448744478, + "loss": 0.1035, + "step": 47025 + }, + { + "epoch": 0.4082082620810583, + "grad_norm": 0.283203125, + "learning_rate": 0.001143882217840587, + "loss": 0.0889, + "step": 47026 + }, + { + "epoch": 0.4082169425612625, + "grad_norm": 0.3828125, + "learning_rate": 0.0011438524907587374, + "loss": 0.1001, + "step": 47027 + }, + { + "epoch": 0.40822562304146665, + "grad_norm": 0.283203125, + "learning_rate": 0.0011438227636289314, + "loss": 0.0928, + "step": 47028 + }, + { + "epoch": 0.40823430352167084, + "grad_norm": 0.33984375, + "learning_rate": 0.0011437930364512014, + "loss": 0.1582, + "step": 47029 + }, + { + "epoch": 0.408242984001875, + "grad_norm": 0.6171875, + "learning_rate": 0.00114376330922558, + "loss": 0.0874, + "step": 47030 + }, + { + "epoch": 0.40825166448207917, + "grad_norm": 0.07666015625, + "learning_rate": 0.0011437335819521, + "loss": 0.1006, + "step": 47031 + }, + { + "epoch": 0.4082603449622833, + "grad_norm": 0.09619140625, + "learning_rate": 0.0011437038546307935, + "loss": 0.0879, + "step": 47032 + }, + { + "epoch": 0.4082690254424875, + "grad_norm": 0.578125, + "learning_rate": 0.0011436741272616928, + "loss": 0.1016, + "step": 47033 + }, + { + "epoch": 0.40827770592269164, + "grad_norm": 0.77734375, + "learning_rate": 0.001143644399844831, + "loss": 0.0898, + "step": 47034 + }, + { + "epoch": 0.40828638640289583, + "grad_norm": 0.09033203125, + "learning_rate": 0.0011436146723802403, + "loss": 0.1045, + "step": 47035 + }, + { + "epoch": 0.40829506688309997, + "grad_norm": 0.15234375, + "learning_rate": 0.0011435849448679534, + "loss": 0.1084, + "step": 47036 + }, + { + "epoch": 0.40830374736330416, + "grad_norm": 0.18359375, + "learning_rate": 0.0011435552173080022, + "loss": 0.0884, + "step": 47037 + }, + { + "epoch": 0.4083124278435083, + "grad_norm": 0.17578125, + "learning_rate": 0.0011435254897004202, + "loss": 0.085, + "step": 47038 + }, + { + "epoch": 0.4083211083237125, + "grad_norm": 0.72265625, + "learning_rate": 0.0011434957620452386, + "loss": 0.1069, + "step": 47039 + }, + { + "epoch": 0.4083297888039166, + "grad_norm": 0.392578125, + "learning_rate": 0.0011434660343424914, + "loss": 0.1089, + "step": 47040 + }, + { + "epoch": 0.4083384692841208, + "grad_norm": 0.2197265625, + "learning_rate": 0.0011434363065922104, + "loss": 0.1152, + "step": 47041 + }, + { + "epoch": 0.40834714976432496, + "grad_norm": 0.166015625, + "learning_rate": 0.0011434065787944282, + "loss": 0.1074, + "step": 47042 + }, + { + "epoch": 0.40835583024452915, + "grad_norm": 0.279296875, + "learning_rate": 0.0011433768509491766, + "loss": 0.104, + "step": 47043 + }, + { + "epoch": 0.4083645107247333, + "grad_norm": 0.42578125, + "learning_rate": 0.0011433471230564891, + "loss": 0.1445, + "step": 47044 + }, + { + "epoch": 0.4083731912049375, + "grad_norm": 0.212890625, + "learning_rate": 0.0011433173951163977, + "loss": 0.1328, + "step": 47045 + }, + { + "epoch": 0.4083818716851416, + "grad_norm": 0.0810546875, + "learning_rate": 0.0011432876671289354, + 
"loss": 0.1367, + "step": 47046 + }, + { + "epoch": 0.4083905521653458, + "grad_norm": 0.310546875, + "learning_rate": 0.001143257939094134, + "loss": 0.1162, + "step": 47047 + }, + { + "epoch": 0.40839923264554995, + "grad_norm": 0.3515625, + "learning_rate": 0.0011432282110120263, + "loss": 0.1143, + "step": 47048 + }, + { + "epoch": 0.40840791312575414, + "grad_norm": 0.388671875, + "learning_rate": 0.0011431984828826451, + "loss": 0.1064, + "step": 47049 + }, + { + "epoch": 0.4084165936059583, + "grad_norm": 0.49609375, + "learning_rate": 0.0011431687547060227, + "loss": 0.0708, + "step": 47050 + }, + { + "epoch": 0.40842527408616247, + "grad_norm": 0.150390625, + "learning_rate": 0.0011431390264821917, + "loss": 0.0869, + "step": 47051 + }, + { + "epoch": 0.4084339545663666, + "grad_norm": 0.1923828125, + "learning_rate": 0.001143109298211184, + "loss": 0.0894, + "step": 47052 + }, + { + "epoch": 0.4084426350465708, + "grad_norm": 0.1904296875, + "learning_rate": 0.0011430795698930331, + "loss": 0.1201, + "step": 47053 + }, + { + "epoch": 0.40845131552677494, + "grad_norm": 0.443359375, + "learning_rate": 0.0011430498415277712, + "loss": 0.0869, + "step": 47054 + }, + { + "epoch": 0.40845999600697913, + "grad_norm": 0.1689453125, + "learning_rate": 0.0011430201131154302, + "loss": 0.0977, + "step": 47055 + }, + { + "epoch": 0.40846867648718327, + "grad_norm": 0.12109375, + "learning_rate": 0.0011429903846560435, + "loss": 0.0996, + "step": 47056 + }, + { + "epoch": 0.40847735696738746, + "grad_norm": 0.1572265625, + "learning_rate": 0.0011429606561496426, + "loss": 0.0698, + "step": 47057 + }, + { + "epoch": 0.4084860374475916, + "grad_norm": 0.13671875, + "learning_rate": 0.0011429309275962608, + "loss": 0.0713, + "step": 47058 + }, + { + "epoch": 0.4084947179277958, + "grad_norm": 0.41015625, + "learning_rate": 0.0011429011989959304, + "loss": 0.1172, + "step": 47059 + }, + { + "epoch": 0.40850339840799993, + "grad_norm": 0.08935546875, + "learning_rate": 0.0011428714703486838, + "loss": 0.0791, + "step": 47060 + }, + { + "epoch": 0.4085120788882041, + "grad_norm": 0.11669921875, + "learning_rate": 0.0011428417416545539, + "loss": 0.085, + "step": 47061 + }, + { + "epoch": 0.40852075936840826, + "grad_norm": 0.248046875, + "learning_rate": 0.0011428120129135728, + "loss": 0.0859, + "step": 47062 + }, + { + "epoch": 0.40852943984861245, + "grad_norm": 0.51171875, + "learning_rate": 0.0011427822841257732, + "loss": 0.1436, + "step": 47063 + }, + { + "epoch": 0.4085381203288166, + "grad_norm": 0.30078125, + "learning_rate": 0.0011427525552911873, + "loss": 0.1094, + "step": 47064 + }, + { + "epoch": 0.4085468008090208, + "grad_norm": 0.1865234375, + "learning_rate": 0.0011427228264098478, + "loss": 0.0967, + "step": 47065 + }, + { + "epoch": 0.4085554812892249, + "grad_norm": 0.09619140625, + "learning_rate": 0.0011426930974817874, + "loss": 0.1084, + "step": 47066 + }, + { + "epoch": 0.40856416176942906, + "grad_norm": 0.1591796875, + "learning_rate": 0.0011426633685070384, + "loss": 0.1006, + "step": 47067 + }, + { + "epoch": 0.40857284224963325, + "grad_norm": 0.08447265625, + "learning_rate": 0.0011426336394856334, + "loss": 0.1025, + "step": 47068 + }, + { + "epoch": 0.4085815227298374, + "grad_norm": 0.2138671875, + "learning_rate": 0.0011426039104176049, + "loss": 0.0747, + "step": 47069 + }, + { + "epoch": 0.4085902032100416, + "grad_norm": 0.08837890625, + "learning_rate": 0.0011425741813029856, + "loss": 0.1128, + "step": 47070 + }, + { + "epoch": 0.4085988836902457, + 
"grad_norm": 0.1494140625, + "learning_rate": 0.0011425444521418076, + "loss": 0.1162, + "step": 47071 + }, + { + "epoch": 0.4086075641704499, + "grad_norm": 0.134765625, + "learning_rate": 0.0011425147229341037, + "loss": 0.1084, + "step": 47072 + }, + { + "epoch": 0.40861624465065405, + "grad_norm": 0.244140625, + "learning_rate": 0.0011424849936799062, + "loss": 0.1045, + "step": 47073 + }, + { + "epoch": 0.40862492513085824, + "grad_norm": 0.287109375, + "learning_rate": 0.0011424552643792478, + "loss": 0.0879, + "step": 47074 + }, + { + "epoch": 0.4086336056110624, + "grad_norm": 0.1298828125, + "learning_rate": 0.0011424255350321612, + "loss": 0.1006, + "step": 47075 + }, + { + "epoch": 0.40864228609126657, + "grad_norm": 0.12158203125, + "learning_rate": 0.0011423958056386782, + "loss": 0.106, + "step": 47076 + }, + { + "epoch": 0.4086509665714707, + "grad_norm": 0.189453125, + "learning_rate": 0.0011423660761988323, + "loss": 0.0815, + "step": 47077 + }, + { + "epoch": 0.4086596470516749, + "grad_norm": 0.12158203125, + "learning_rate": 0.0011423363467126552, + "loss": 0.1226, + "step": 47078 + }, + { + "epoch": 0.40866832753187904, + "grad_norm": 0.302734375, + "learning_rate": 0.0011423066171801798, + "loss": 0.0811, + "step": 47079 + }, + { + "epoch": 0.40867700801208323, + "grad_norm": 0.439453125, + "learning_rate": 0.001142276887601439, + "loss": 0.1162, + "step": 47080 + }, + { + "epoch": 0.40868568849228737, + "grad_norm": 0.341796875, + "learning_rate": 0.0011422471579764643, + "loss": 0.0981, + "step": 47081 + }, + { + "epoch": 0.40869436897249156, + "grad_norm": 0.138671875, + "learning_rate": 0.0011422174283052884, + "loss": 0.1006, + "step": 47082 + }, + { + "epoch": 0.4087030494526957, + "grad_norm": 0.427734375, + "learning_rate": 0.0011421876985879447, + "loss": 0.0913, + "step": 47083 + }, + { + "epoch": 0.4087117299328999, + "grad_norm": 0.34375, + "learning_rate": 0.0011421579688244653, + "loss": 0.1152, + "step": 47084 + }, + { + "epoch": 0.40872041041310403, + "grad_norm": 0.287109375, + "learning_rate": 0.001142128239014882, + "loss": 0.0903, + "step": 47085 + }, + { + "epoch": 0.4087290908933082, + "grad_norm": 0.60546875, + "learning_rate": 0.0011420985091592278, + "loss": 0.1016, + "step": 47086 + }, + { + "epoch": 0.40873777137351236, + "grad_norm": 0.271484375, + "learning_rate": 0.0011420687792575359, + "loss": 0.0791, + "step": 47087 + }, + { + "epoch": 0.40874645185371655, + "grad_norm": 0.443359375, + "learning_rate": 0.001142039049309838, + "loss": 0.1079, + "step": 47088 + }, + { + "epoch": 0.4087551323339207, + "grad_norm": 0.388671875, + "learning_rate": 0.0011420093193161669, + "loss": 0.1143, + "step": 47089 + }, + { + "epoch": 0.4087638128141249, + "grad_norm": 0.3125, + "learning_rate": 0.001141979589276555, + "loss": 0.124, + "step": 47090 + }, + { + "epoch": 0.408772493294329, + "grad_norm": 0.130859375, + "learning_rate": 0.001141949859191035, + "loss": 0.0967, + "step": 47091 + }, + { + "epoch": 0.4087811737745332, + "grad_norm": 0.2255859375, + "learning_rate": 0.0011419201290596389, + "loss": 0.1055, + "step": 47092 + }, + { + "epoch": 0.40878985425473735, + "grad_norm": 0.154296875, + "learning_rate": 0.0011418903988823998, + "loss": 0.1006, + "step": 47093 + }, + { + "epoch": 0.40879853473494154, + "grad_norm": 0.70703125, + "learning_rate": 0.0011418606686593499, + "loss": 0.0845, + "step": 47094 + }, + { + "epoch": 0.4088072152151457, + "grad_norm": 0.1396484375, + "learning_rate": 0.001141830938390522, + "loss": 0.0698, + "step": 
47095 + }, + { + "epoch": 0.4088158956953499, + "grad_norm": 0.1689453125, + "learning_rate": 0.001141801208075948, + "loss": 0.0791, + "step": 47096 + }, + { + "epoch": 0.408824576175554, + "grad_norm": 0.2353515625, + "learning_rate": 0.001141771477715661, + "loss": 0.0942, + "step": 47097 + }, + { + "epoch": 0.4088332566557582, + "grad_norm": 0.49609375, + "learning_rate": 0.0011417417473096937, + "loss": 0.0889, + "step": 47098 + }, + { + "epoch": 0.40884193713596234, + "grad_norm": 0.53515625, + "learning_rate": 0.0011417120168580777, + "loss": 0.0703, + "step": 47099 + }, + { + "epoch": 0.40885061761616653, + "grad_norm": 0.09375, + "learning_rate": 0.0011416822863608465, + "loss": 0.0854, + "step": 47100 + }, + { + "epoch": 0.40885929809637067, + "grad_norm": 0.46875, + "learning_rate": 0.001141652555818032, + "loss": 0.1196, + "step": 47101 + }, + { + "epoch": 0.40886797857657486, + "grad_norm": 0.1767578125, + "learning_rate": 0.001141622825229667, + "loss": 0.0679, + "step": 47102 + }, + { + "epoch": 0.408876659056779, + "grad_norm": 0.26171875, + "learning_rate": 0.0011415930945957839, + "loss": 0.0684, + "step": 47103 + }, + { + "epoch": 0.4088853395369832, + "grad_norm": 0.287109375, + "learning_rate": 0.001141563363916415, + "loss": 0.0913, + "step": 47104 + }, + { + "epoch": 0.40889402001718733, + "grad_norm": 0.2353515625, + "learning_rate": 0.0011415336331915928, + "loss": 0.1182, + "step": 47105 + }, + { + "epoch": 0.4089027004973915, + "grad_norm": 0.09765625, + "learning_rate": 0.0011415039024213506, + "loss": 0.0874, + "step": 47106 + }, + { + "epoch": 0.40891138097759566, + "grad_norm": 0.17578125, + "learning_rate": 0.0011414741716057203, + "loss": 0.0967, + "step": 47107 + }, + { + "epoch": 0.40892006145779985, + "grad_norm": 0.43359375, + "learning_rate": 0.0011414444407447343, + "loss": 0.1309, + "step": 47108 + }, + { + "epoch": 0.408928741938004, + "grad_norm": 0.291015625, + "learning_rate": 0.0011414147098384253, + "loss": 0.1162, + "step": 47109 + }, + { + "epoch": 0.4089374224182082, + "grad_norm": 0.2216796875, + "learning_rate": 0.0011413849788868258, + "loss": 0.1147, + "step": 47110 + }, + { + "epoch": 0.4089461028984123, + "grad_norm": 0.11865234375, + "learning_rate": 0.0011413552478899682, + "loss": 0.0908, + "step": 47111 + }, + { + "epoch": 0.4089547833786165, + "grad_norm": 0.15625, + "learning_rate": 0.0011413255168478853, + "loss": 0.0996, + "step": 47112 + }, + { + "epoch": 0.40896346385882065, + "grad_norm": 0.44140625, + "learning_rate": 0.0011412957857606094, + "loss": 0.1602, + "step": 47113 + }, + { + "epoch": 0.40897214433902485, + "grad_norm": 0.197265625, + "learning_rate": 0.001141266054628173, + "loss": 0.0752, + "step": 47114 + }, + { + "epoch": 0.408980824819229, + "grad_norm": 0.193359375, + "learning_rate": 0.0011412363234506086, + "loss": 0.0991, + "step": 47115 + }, + { + "epoch": 0.4089895052994332, + "grad_norm": 0.18359375, + "learning_rate": 0.001141206592227949, + "loss": 0.0962, + "step": 47116 + }, + { + "epoch": 0.4089981857796373, + "grad_norm": 0.294921875, + "learning_rate": 0.0011411768609602265, + "loss": 0.1206, + "step": 47117 + }, + { + "epoch": 0.4090068662598415, + "grad_norm": 0.57421875, + "learning_rate": 0.0011411471296474732, + "loss": 0.1191, + "step": 47118 + }, + { + "epoch": 0.40901554674004564, + "grad_norm": 0.08544921875, + "learning_rate": 0.0011411173982897225, + "loss": 0.1035, + "step": 47119 + }, + { + "epoch": 0.40902422722024984, + "grad_norm": 0.232421875, + "learning_rate": 
0.0011410876668870062, + "loss": 0.1182, + "step": 47120 + }, + { + "epoch": 0.409032907700454, + "grad_norm": 0.12060546875, + "learning_rate": 0.0011410579354393568, + "loss": 0.0918, + "step": 47121 + }, + { + "epoch": 0.40904158818065817, + "grad_norm": 0.2373046875, + "learning_rate": 0.0011410282039468075, + "loss": 0.0918, + "step": 47122 + }, + { + "epoch": 0.4090502686608623, + "grad_norm": 0.11669921875, + "learning_rate": 0.0011409984724093903, + "loss": 0.0737, + "step": 47123 + }, + { + "epoch": 0.4090589491410665, + "grad_norm": 0.1376953125, + "learning_rate": 0.0011409687408271375, + "loss": 0.1055, + "step": 47124 + }, + { + "epoch": 0.40906762962127063, + "grad_norm": 0.48046875, + "learning_rate": 0.001140939009200082, + "loss": 0.0942, + "step": 47125 + }, + { + "epoch": 0.4090763101014748, + "grad_norm": 0.140625, + "learning_rate": 0.0011409092775282563, + "loss": 0.1143, + "step": 47126 + }, + { + "epoch": 0.40908499058167896, + "grad_norm": 0.1201171875, + "learning_rate": 0.0011408795458116927, + "loss": 0.0967, + "step": 47127 + }, + { + "epoch": 0.40909367106188316, + "grad_norm": 0.31640625, + "learning_rate": 0.0011408498140504243, + "loss": 0.082, + "step": 47128 + }, + { + "epoch": 0.4091023515420873, + "grad_norm": 0.0849609375, + "learning_rate": 0.0011408200822444826, + "loss": 0.0898, + "step": 47129 + }, + { + "epoch": 0.4091110320222915, + "grad_norm": 0.0947265625, + "learning_rate": 0.0011407903503939008, + "loss": 0.0972, + "step": 47130 + }, + { + "epoch": 0.4091197125024956, + "grad_norm": 0.279296875, + "learning_rate": 0.0011407606184987117, + "loss": 0.0894, + "step": 47131 + }, + { + "epoch": 0.4091283929826998, + "grad_norm": 0.14453125, + "learning_rate": 0.0011407308865589469, + "loss": 0.1128, + "step": 47132 + }, + { + "epoch": 0.40913707346290396, + "grad_norm": 0.373046875, + "learning_rate": 0.0011407011545746394, + "loss": 0.083, + "step": 47133 + }, + { + "epoch": 0.40914575394310815, + "grad_norm": 0.197265625, + "learning_rate": 0.001140671422545822, + "loss": 0.1562, + "step": 47134 + }, + { + "epoch": 0.4091544344233123, + "grad_norm": 0.2265625, + "learning_rate": 0.0011406416904725268, + "loss": 0.1143, + "step": 47135 + }, + { + "epoch": 0.4091631149035165, + "grad_norm": 0.375, + "learning_rate": 0.0011406119583547865, + "loss": 0.0928, + "step": 47136 + }, + { + "epoch": 0.4091717953837206, + "grad_norm": 0.373046875, + "learning_rate": 0.0011405822261926336, + "loss": 0.0879, + "step": 47137 + }, + { + "epoch": 0.4091804758639248, + "grad_norm": 0.5390625, + "learning_rate": 0.0011405524939861005, + "loss": 0.1074, + "step": 47138 + }, + { + "epoch": 0.40918915634412895, + "grad_norm": 0.423828125, + "learning_rate": 0.00114052276173522, + "loss": 0.0928, + "step": 47139 + }, + { + "epoch": 0.40919783682433314, + "grad_norm": 0.11767578125, + "learning_rate": 0.0011404930294400244, + "loss": 0.0752, + "step": 47140 + }, + { + "epoch": 0.4092065173045373, + "grad_norm": 0.24609375, + "learning_rate": 0.0011404632971005462, + "loss": 0.0796, + "step": 47141 + }, + { + "epoch": 0.40921519778474147, + "grad_norm": 0.189453125, + "learning_rate": 0.0011404335647168175, + "loss": 0.0869, + "step": 47142 + }, + { + "epoch": 0.4092238782649456, + "grad_norm": 0.138671875, + "learning_rate": 0.0011404038322888716, + "loss": 0.1084, + "step": 47143 + }, + { + "epoch": 0.4092325587451498, + "grad_norm": 0.11474609375, + "learning_rate": 0.0011403740998167408, + "loss": 0.084, + "step": 47144 + }, + { + "epoch": 0.40924123922535394, + 
"grad_norm": 0.216796875, + "learning_rate": 0.0011403443673004574, + "loss": 0.0889, + "step": 47145 + }, + { + "epoch": 0.40924991970555813, + "grad_norm": 0.1435546875, + "learning_rate": 0.001140314634740054, + "loss": 0.0869, + "step": 47146 + }, + { + "epoch": 0.40925860018576227, + "grad_norm": 0.1533203125, + "learning_rate": 0.001140284902135563, + "loss": 0.1182, + "step": 47147 + }, + { + "epoch": 0.40926728066596646, + "grad_norm": 0.169921875, + "learning_rate": 0.0011402551694870173, + "loss": 0.1138, + "step": 47148 + }, + { + "epoch": 0.4092759611461706, + "grad_norm": 0.2080078125, + "learning_rate": 0.001140225436794449, + "loss": 0.085, + "step": 47149 + }, + { + "epoch": 0.4092846416263748, + "grad_norm": 0.17578125, + "learning_rate": 0.001140195704057891, + "loss": 0.0981, + "step": 47150 + }, + { + "epoch": 0.4092933221065789, + "grad_norm": 0.171875, + "learning_rate": 0.001140165971277375, + "loss": 0.1318, + "step": 47151 + }, + { + "epoch": 0.4093020025867831, + "grad_norm": 0.189453125, + "learning_rate": 0.0011401362384529348, + "loss": 0.1279, + "step": 47152 + }, + { + "epoch": 0.40931068306698726, + "grad_norm": 0.224609375, + "learning_rate": 0.0011401065055846016, + "loss": 0.1211, + "step": 47153 + }, + { + "epoch": 0.40931936354719145, + "grad_norm": 0.6640625, + "learning_rate": 0.0011400767726724088, + "loss": 0.168, + "step": 47154 + }, + { + "epoch": 0.4093280440273956, + "grad_norm": 0.455078125, + "learning_rate": 0.0011400470397163888, + "loss": 0.0874, + "step": 47155 + }, + { + "epoch": 0.4093367245075998, + "grad_norm": 0.09765625, + "learning_rate": 0.0011400173067165737, + "loss": 0.1084, + "step": 47156 + }, + { + "epoch": 0.4093454049878039, + "grad_norm": 0.45703125, + "learning_rate": 0.0011399875736729963, + "loss": 0.1016, + "step": 47157 + }, + { + "epoch": 0.4093540854680081, + "grad_norm": 0.1630859375, + "learning_rate": 0.0011399578405856892, + "loss": 0.0801, + "step": 47158 + }, + { + "epoch": 0.40936276594821225, + "grad_norm": 0.359375, + "learning_rate": 0.001139928107454685, + "loss": 0.084, + "step": 47159 + }, + { + "epoch": 0.40937144642841644, + "grad_norm": 0.330078125, + "learning_rate": 0.0011398983742800155, + "loss": 0.1094, + "step": 47160 + }, + { + "epoch": 0.4093801269086206, + "grad_norm": 0.07958984375, + "learning_rate": 0.001139868641061714, + "loss": 0.0771, + "step": 47161 + }, + { + "epoch": 0.40938880738882477, + "grad_norm": 0.1474609375, + "learning_rate": 0.001139838907799813, + "loss": 0.0703, + "step": 47162 + }, + { + "epoch": 0.4093974878690289, + "grad_norm": 0.38671875, + "learning_rate": 0.0011398091744943444, + "loss": 0.1445, + "step": 47163 + }, + { + "epoch": 0.4094061683492331, + "grad_norm": 0.41796875, + "learning_rate": 0.001139779441145341, + "loss": 0.1309, + "step": 47164 + }, + { + "epoch": 0.40941484882943724, + "grad_norm": 0.43359375, + "learning_rate": 0.001139749707752836, + "loss": 0.0674, + "step": 47165 + }, + { + "epoch": 0.40942352930964143, + "grad_norm": 0.09228515625, + "learning_rate": 0.0011397199743168609, + "loss": 0.0654, + "step": 47166 + }, + { + "epoch": 0.40943220978984557, + "grad_norm": 0.451171875, + "learning_rate": 0.0011396902408374488, + "loss": 0.1177, + "step": 47167 + }, + { + "epoch": 0.40944089027004976, + "grad_norm": 0.265625, + "learning_rate": 0.0011396605073146317, + "loss": 0.1011, + "step": 47168 + }, + { + "epoch": 0.4094495707502539, + "grad_norm": 0.1533203125, + "learning_rate": 0.001139630773748443, + "loss": 0.0869, + "step": 47169 + }, 
+ { + "epoch": 0.4094582512304581, + "grad_norm": 0.3203125, + "learning_rate": 0.0011396010401389144, + "loss": 0.1108, + "step": 47170 + }, + { + "epoch": 0.40946693171066223, + "grad_norm": 0.5234375, + "learning_rate": 0.0011395713064860785, + "loss": 0.085, + "step": 47171 + }, + { + "epoch": 0.4094756121908664, + "grad_norm": 0.2138671875, + "learning_rate": 0.001139541572789968, + "loss": 0.1035, + "step": 47172 + }, + { + "epoch": 0.40948429267107056, + "grad_norm": 0.1162109375, + "learning_rate": 0.0011395118390506156, + "loss": 0.0737, + "step": 47173 + }, + { + "epoch": 0.40949297315127475, + "grad_norm": 0.0830078125, + "learning_rate": 0.0011394821052680536, + "loss": 0.0732, + "step": 47174 + }, + { + "epoch": 0.4095016536314789, + "grad_norm": 0.4296875, + "learning_rate": 0.001139452371442315, + "loss": 0.0957, + "step": 47175 + }, + { + "epoch": 0.4095103341116831, + "grad_norm": 0.10791015625, + "learning_rate": 0.0011394226375734315, + "loss": 0.1211, + "step": 47176 + }, + { + "epoch": 0.4095190145918872, + "grad_norm": 0.212890625, + "learning_rate": 0.001139392903661436, + "loss": 0.0713, + "step": 47177 + }, + { + "epoch": 0.4095276950720914, + "grad_norm": 0.185546875, + "learning_rate": 0.001139363169706361, + "loss": 0.1025, + "step": 47178 + }, + { + "epoch": 0.40953637555229555, + "grad_norm": 0.33984375, + "learning_rate": 0.001139333435708239, + "loss": 0.1309, + "step": 47179 + }, + { + "epoch": 0.40954505603249974, + "grad_norm": 0.40234375, + "learning_rate": 0.0011393037016671028, + "loss": 0.0913, + "step": 47180 + }, + { + "epoch": 0.4095537365127039, + "grad_norm": 0.59765625, + "learning_rate": 0.0011392739675829841, + "loss": 0.1172, + "step": 47181 + }, + { + "epoch": 0.4095624169929081, + "grad_norm": 0.5859375, + "learning_rate": 0.0011392442334559163, + "loss": 0.0972, + "step": 47182 + }, + { + "epoch": 0.4095710974731122, + "grad_norm": 0.6015625, + "learning_rate": 0.0011392144992859318, + "loss": 0.0884, + "step": 47183 + }, + { + "epoch": 0.4095797779533164, + "grad_norm": 0.267578125, + "learning_rate": 0.001139184765073063, + "loss": 0.1279, + "step": 47184 + }, + { + "epoch": 0.40958845843352054, + "grad_norm": 0.16796875, + "learning_rate": 0.001139155030817342, + "loss": 0.0991, + "step": 47185 + }, + { + "epoch": 0.40959713891372473, + "grad_norm": 0.431640625, + "learning_rate": 0.0011391252965188014, + "loss": 0.0996, + "step": 47186 + }, + { + "epoch": 0.40960581939392887, + "grad_norm": 0.58984375, + "learning_rate": 0.0011390955621774746, + "loss": 0.1167, + "step": 47187 + }, + { + "epoch": 0.40961449987413306, + "grad_norm": 0.7734375, + "learning_rate": 0.0011390658277933932, + "loss": 0.0869, + "step": 47188 + }, + { + "epoch": 0.4096231803543372, + "grad_norm": 0.734375, + "learning_rate": 0.00113903609336659, + "loss": 0.0986, + "step": 47189 + }, + { + "epoch": 0.40963186083454134, + "grad_norm": 0.1103515625, + "learning_rate": 0.001139006358897097, + "loss": 0.1055, + "step": 47190 + }, + { + "epoch": 0.40964054131474553, + "grad_norm": 0.197265625, + "learning_rate": 0.001138976624384948, + "loss": 0.1318, + "step": 47191 + }, + { + "epoch": 0.40964922179494967, + "grad_norm": 0.62890625, + "learning_rate": 0.0011389468898301742, + "loss": 0.0908, + "step": 47192 + }, + { + "epoch": 0.40965790227515386, + "grad_norm": 0.12353515625, + "learning_rate": 0.0011389171552328092, + "loss": 0.1191, + "step": 47193 + }, + { + "epoch": 0.409666582755358, + "grad_norm": 0.2236328125, + "learning_rate": 0.0011388874205928846, + 
"loss": 0.082, + "step": 47194 + }, + { + "epoch": 0.4096752632355622, + "grad_norm": 0.2490234375, + "learning_rate": 0.0011388576859104333, + "loss": 0.0757, + "step": 47195 + }, + { + "epoch": 0.40968394371576633, + "grad_norm": 0.447265625, + "learning_rate": 0.0011388279511854883, + "loss": 0.1025, + "step": 47196 + }, + { + "epoch": 0.4096926241959705, + "grad_norm": 0.14453125, + "learning_rate": 0.001138798216418081, + "loss": 0.1123, + "step": 47197 + }, + { + "epoch": 0.40970130467617466, + "grad_norm": 0.119140625, + "learning_rate": 0.0011387684816082451, + "loss": 0.0859, + "step": 47198 + }, + { + "epoch": 0.40970998515637885, + "grad_norm": 0.103515625, + "learning_rate": 0.0011387387467560119, + "loss": 0.1143, + "step": 47199 + }, + { + "epoch": 0.409718665636583, + "grad_norm": 0.29296875, + "learning_rate": 0.0011387090118614154, + "loss": 0.124, + "step": 47200 + }, + { + "epoch": 0.4097273461167872, + "grad_norm": 0.6015625, + "learning_rate": 0.0011386792769244867, + "loss": 0.1064, + "step": 47201 + }, + { + "epoch": 0.4097360265969913, + "grad_norm": 0.2236328125, + "learning_rate": 0.0011386495419452591, + "loss": 0.1006, + "step": 47202 + }, + { + "epoch": 0.4097447070771955, + "grad_norm": 0.27734375, + "learning_rate": 0.0011386198069237648, + "loss": 0.1191, + "step": 47203 + }, + { + "epoch": 0.40975338755739965, + "grad_norm": 0.09765625, + "learning_rate": 0.0011385900718600367, + "loss": 0.0859, + "step": 47204 + }, + { + "epoch": 0.40976206803760384, + "grad_norm": 0.1142578125, + "learning_rate": 0.001138560336754107, + "loss": 0.1367, + "step": 47205 + }, + { + "epoch": 0.409770748517808, + "grad_norm": 0.2333984375, + "learning_rate": 0.0011385306016060085, + "loss": 0.1104, + "step": 47206 + }, + { + "epoch": 0.4097794289980122, + "grad_norm": 0.4375, + "learning_rate": 0.001138500866415773, + "loss": 0.1182, + "step": 47207 + }, + { + "epoch": 0.4097881094782163, + "grad_norm": 0.3046875, + "learning_rate": 0.001138471131183434, + "loss": 0.0884, + "step": 47208 + }, + { + "epoch": 0.4097967899584205, + "grad_norm": 0.7265625, + "learning_rate": 0.0011384413959090235, + "loss": 0.0947, + "step": 47209 + }, + { + "epoch": 0.40980547043862464, + "grad_norm": 0.490234375, + "learning_rate": 0.001138411660592574, + "loss": 0.0854, + "step": 47210 + }, + { + "epoch": 0.40981415091882883, + "grad_norm": 0.1416015625, + "learning_rate": 0.0011383819252341178, + "loss": 0.1426, + "step": 47211 + }, + { + "epoch": 0.40982283139903297, + "grad_norm": 0.2236328125, + "learning_rate": 0.0011383521898336879, + "loss": 0.0928, + "step": 47212 + }, + { + "epoch": 0.40983151187923716, + "grad_norm": 0.265625, + "learning_rate": 0.0011383224543913167, + "loss": 0.0771, + "step": 47213 + }, + { + "epoch": 0.4098401923594413, + "grad_norm": 0.78515625, + "learning_rate": 0.0011382927189070367, + "loss": 0.0996, + "step": 47214 + }, + { + "epoch": 0.4098488728396455, + "grad_norm": 0.09814453125, + "learning_rate": 0.0011382629833808801, + "loss": 0.1074, + "step": 47215 + }, + { + "epoch": 0.40985755331984963, + "grad_norm": 0.47265625, + "learning_rate": 0.0011382332478128798, + "loss": 0.0879, + "step": 47216 + }, + { + "epoch": 0.4098662338000538, + "grad_norm": 0.421875, + "learning_rate": 0.0011382035122030683, + "loss": 0.1699, + "step": 47217 + }, + { + "epoch": 0.40987491428025796, + "grad_norm": 0.232421875, + "learning_rate": 0.0011381737765514782, + "loss": 0.123, + "step": 47218 + }, + { + "epoch": 0.40988359476046216, + "grad_norm": 0.1650390625, + 
"learning_rate": 0.001138144040858141, + "loss": 0.1504, + "step": 47219 + }, + { + "epoch": 0.4098922752406663, + "grad_norm": 0.25390625, + "learning_rate": 0.0011381143051230908, + "loss": 0.0781, + "step": 47220 + }, + { + "epoch": 0.4099009557208705, + "grad_norm": 0.384765625, + "learning_rate": 0.0011380845693463593, + "loss": 0.0747, + "step": 47221 + }, + { + "epoch": 0.4099096362010746, + "grad_norm": 0.279296875, + "learning_rate": 0.0011380548335279787, + "loss": 0.0801, + "step": 47222 + }, + { + "epoch": 0.4099183166812788, + "grad_norm": 0.33984375, + "learning_rate": 0.0011380250976679824, + "loss": 0.0757, + "step": 47223 + }, + { + "epoch": 0.40992699716148295, + "grad_norm": 0.1611328125, + "learning_rate": 0.0011379953617664024, + "loss": 0.0693, + "step": 47224 + }, + { + "epoch": 0.40993567764168715, + "grad_norm": 0.375, + "learning_rate": 0.0011379656258232707, + "loss": 0.1221, + "step": 47225 + }, + { + "epoch": 0.4099443581218913, + "grad_norm": 0.466796875, + "learning_rate": 0.0011379358898386208, + "loss": 0.123, + "step": 47226 + }, + { + "epoch": 0.4099530386020955, + "grad_norm": 0.1982421875, + "learning_rate": 0.0011379061538124846, + "loss": 0.0942, + "step": 47227 + }, + { + "epoch": 0.4099617190822996, + "grad_norm": 0.267578125, + "learning_rate": 0.001137876417744895, + "loss": 0.0928, + "step": 47228 + }, + { + "epoch": 0.4099703995625038, + "grad_norm": 0.36328125, + "learning_rate": 0.0011378466816358843, + "loss": 0.1289, + "step": 47229 + }, + { + "epoch": 0.40997908004270794, + "grad_norm": 0.23046875, + "learning_rate": 0.0011378169454854847, + "loss": 0.0918, + "step": 47230 + }, + { + "epoch": 0.40998776052291214, + "grad_norm": 0.46875, + "learning_rate": 0.0011377872092937294, + "loss": 0.0991, + "step": 47231 + }, + { + "epoch": 0.4099964410031163, + "grad_norm": 0.30078125, + "learning_rate": 0.0011377574730606504, + "loss": 0.0947, + "step": 47232 + }, + { + "epoch": 0.41000512148332047, + "grad_norm": 0.345703125, + "learning_rate": 0.0011377277367862805, + "loss": 0.0894, + "step": 47233 + }, + { + "epoch": 0.4100138019635246, + "grad_norm": 0.7578125, + "learning_rate": 0.001137698000470652, + "loss": 0.1045, + "step": 47234 + }, + { + "epoch": 0.4100224824437288, + "grad_norm": 1.375, + "learning_rate": 0.0011376682641137978, + "loss": 0.1113, + "step": 47235 + }, + { + "epoch": 0.41003116292393293, + "grad_norm": 0.279296875, + "learning_rate": 0.00113763852771575, + "loss": 0.1445, + "step": 47236 + }, + { + "epoch": 0.4100398434041371, + "grad_norm": 0.2109375, + "learning_rate": 0.0011376087912765413, + "loss": 0.1094, + "step": 47237 + }, + { + "epoch": 0.41004852388434126, + "grad_norm": 0.09521484375, + "learning_rate": 0.001137579054796204, + "loss": 0.0986, + "step": 47238 + }, + { + "epoch": 0.41005720436454546, + "grad_norm": 0.69921875, + "learning_rate": 0.0011375493182747708, + "loss": 0.1191, + "step": 47239 + }, + { + "epoch": 0.4100658848447496, + "grad_norm": 0.0712890625, + "learning_rate": 0.0011375195817122745, + "loss": 0.1064, + "step": 47240 + }, + { + "epoch": 0.4100745653249538, + "grad_norm": 0.13671875, + "learning_rate": 0.0011374898451087473, + "loss": 0.1494, + "step": 47241 + }, + { + "epoch": 0.4100832458051579, + "grad_norm": 0.2119140625, + "learning_rate": 0.0011374601084642218, + "loss": 0.0908, + "step": 47242 + }, + { + "epoch": 0.4100919262853621, + "grad_norm": 0.1259765625, + "learning_rate": 0.0011374303717787302, + "loss": 0.084, + "step": 47243 + }, + { + "epoch": 0.41010060676556626, + 
"grad_norm": 0.51171875, + "learning_rate": 0.0011374006350523057, + "loss": 0.0898, + "step": 47244 + }, + { + "epoch": 0.41010928724577045, + "grad_norm": 0.267578125, + "learning_rate": 0.00113737089828498, + "loss": 0.0972, + "step": 47245 + }, + { + "epoch": 0.4101179677259746, + "grad_norm": 0.1318359375, + "learning_rate": 0.0011373411614767862, + "loss": 0.1074, + "step": 47246 + }, + { + "epoch": 0.4101266482061788, + "grad_norm": 0.1513671875, + "learning_rate": 0.001137311424627757, + "loss": 0.0986, + "step": 47247 + }, + { + "epoch": 0.4101353286863829, + "grad_norm": 0.400390625, + "learning_rate": 0.0011372816877379243, + "loss": 0.0952, + "step": 47248 + }, + { + "epoch": 0.4101440091665871, + "grad_norm": 0.2119140625, + "learning_rate": 0.0011372519508073206, + "loss": 0.1475, + "step": 47249 + }, + { + "epoch": 0.41015268964679125, + "grad_norm": 0.22265625, + "learning_rate": 0.001137222213835979, + "loss": 0.0928, + "step": 47250 + }, + { + "epoch": 0.41016137012699544, + "grad_norm": 0.41015625, + "learning_rate": 0.0011371924768239319, + "loss": 0.105, + "step": 47251 + }, + { + "epoch": 0.4101700506071996, + "grad_norm": 0.2265625, + "learning_rate": 0.0011371627397712117, + "loss": 0.127, + "step": 47252 + }, + { + "epoch": 0.41017873108740377, + "grad_norm": 0.21484375, + "learning_rate": 0.0011371330026778506, + "loss": 0.0791, + "step": 47253 + }, + { + "epoch": 0.4101874115676079, + "grad_norm": 0.65625, + "learning_rate": 0.0011371032655438818, + "loss": 0.1182, + "step": 47254 + }, + { + "epoch": 0.4101960920478121, + "grad_norm": 0.78125, + "learning_rate": 0.001137073528369337, + "loss": 0.1084, + "step": 47255 + }, + { + "epoch": 0.41020477252801624, + "grad_norm": 1.09375, + "learning_rate": 0.0011370437911542494, + "loss": 0.1309, + "step": 47256 + }, + { + "epoch": 0.41021345300822043, + "grad_norm": 0.443359375, + "learning_rate": 0.0011370140538986514, + "loss": 0.082, + "step": 47257 + }, + { + "epoch": 0.41022213348842457, + "grad_norm": 0.3359375, + "learning_rate": 0.0011369843166025748, + "loss": 0.1084, + "step": 47258 + }, + { + "epoch": 0.41023081396862876, + "grad_norm": 0.2734375, + "learning_rate": 0.0011369545792660532, + "loss": 0.1182, + "step": 47259 + }, + { + "epoch": 0.4102394944488329, + "grad_norm": 0.234375, + "learning_rate": 0.0011369248418891182, + "loss": 0.1465, + "step": 47260 + }, + { + "epoch": 0.4102481749290371, + "grad_norm": 0.1328125, + "learning_rate": 0.0011368951044718033, + "loss": 0.1143, + "step": 47261 + }, + { + "epoch": 0.41025685540924123, + "grad_norm": 0.10888671875, + "learning_rate": 0.0011368653670141404, + "loss": 0.1201, + "step": 47262 + }, + { + "epoch": 0.4102655358894454, + "grad_norm": 1.015625, + "learning_rate": 0.0011368356295161617, + "loss": 0.3262, + "step": 47263 + }, + { + "epoch": 0.41027421636964956, + "grad_norm": 0.2255859375, + "learning_rate": 0.0011368058919779005, + "loss": 0.0903, + "step": 47264 + }, + { + "epoch": 0.41028289684985375, + "grad_norm": 0.171875, + "learning_rate": 0.0011367761543993888, + "loss": 0.0889, + "step": 47265 + }, + { + "epoch": 0.4102915773300579, + "grad_norm": 0.263671875, + "learning_rate": 0.0011367464167806591, + "loss": 0.1074, + "step": 47266 + }, + { + "epoch": 0.4103002578102621, + "grad_norm": 0.267578125, + "learning_rate": 0.0011367166791217442, + "loss": 0.1279, + "step": 47267 + }, + { + "epoch": 0.4103089382904662, + "grad_norm": 0.388671875, + "learning_rate": 0.0011366869414226764, + "loss": 0.1348, + "step": 47268 + }, + { + "epoch": 
0.4103176187706704, + "grad_norm": 0.283203125, + "learning_rate": 0.0011366572036834884, + "loss": 0.125, + "step": 47269 + }, + { + "epoch": 0.41032629925087455, + "grad_norm": 0.41796875, + "learning_rate": 0.0011366274659042126, + "loss": 0.1006, + "step": 47270 + }, + { + "epoch": 0.41033497973107874, + "grad_norm": 0.138671875, + "learning_rate": 0.0011365977280848817, + "loss": 0.1113, + "step": 47271 + }, + { + "epoch": 0.4103436602112829, + "grad_norm": 0.73828125, + "learning_rate": 0.0011365679902255277, + "loss": 0.0942, + "step": 47272 + }, + { + "epoch": 0.41035234069148707, + "grad_norm": 0.58203125, + "learning_rate": 0.0011365382523261841, + "loss": 0.1006, + "step": 47273 + }, + { + "epoch": 0.4103610211716912, + "grad_norm": 0.2890625, + "learning_rate": 0.0011365085143868823, + "loss": 0.126, + "step": 47274 + }, + { + "epoch": 0.4103697016518954, + "grad_norm": 0.1083984375, + "learning_rate": 0.0011364787764076556, + "loss": 0.103, + "step": 47275 + }, + { + "epoch": 0.41037838213209954, + "grad_norm": 0.23828125, + "learning_rate": 0.001136449038388536, + "loss": 0.1055, + "step": 47276 + }, + { + "epoch": 0.41038706261230373, + "grad_norm": 0.78515625, + "learning_rate": 0.0011364193003295564, + "loss": 0.0908, + "step": 47277 + }, + { + "epoch": 0.41039574309250787, + "grad_norm": 0.169921875, + "learning_rate": 0.0011363895622307492, + "loss": 0.0791, + "step": 47278 + }, + { + "epoch": 0.41040442357271206, + "grad_norm": 0.1767578125, + "learning_rate": 0.001136359824092147, + "loss": 0.082, + "step": 47279 + }, + { + "epoch": 0.4104131040529162, + "grad_norm": 0.58203125, + "learning_rate": 0.0011363300859137822, + "loss": 0.0928, + "step": 47280 + }, + { + "epoch": 0.4104217845331204, + "grad_norm": 0.1279296875, + "learning_rate": 0.0011363003476956872, + "loss": 0.0664, + "step": 47281 + }, + { + "epoch": 0.41043046501332453, + "grad_norm": 0.11083984375, + "learning_rate": 0.0011362706094378949, + "loss": 0.1084, + "step": 47282 + }, + { + "epoch": 0.4104391454935287, + "grad_norm": 0.5078125, + "learning_rate": 0.0011362408711404377, + "loss": 0.1006, + "step": 47283 + }, + { + "epoch": 0.41044782597373286, + "grad_norm": 0.189453125, + "learning_rate": 0.0011362111328033478, + "loss": 0.1143, + "step": 47284 + }, + { + "epoch": 0.41045650645393705, + "grad_norm": 0.330078125, + "learning_rate": 0.001136181394426658, + "loss": 0.0679, + "step": 47285 + }, + { + "epoch": 0.4104651869341412, + "grad_norm": 0.1650390625, + "learning_rate": 0.001136151656010401, + "loss": 0.1309, + "step": 47286 + }, + { + "epoch": 0.4104738674143454, + "grad_norm": 0.357421875, + "learning_rate": 0.0011361219175546084, + "loss": 0.1006, + "step": 47287 + }, + { + "epoch": 0.4104825478945495, + "grad_norm": 0.212890625, + "learning_rate": 0.0011360921790593143, + "loss": 0.1514, + "step": 47288 + }, + { + "epoch": 0.4104912283747537, + "grad_norm": 0.462890625, + "learning_rate": 0.0011360624405245497, + "loss": 0.1797, + "step": 47289 + }, + { + "epoch": 0.41049990885495785, + "grad_norm": 0.2080078125, + "learning_rate": 0.001136032701950348, + "loss": 0.0952, + "step": 47290 + }, + { + "epoch": 0.41050858933516204, + "grad_norm": 0.103515625, + "learning_rate": 0.0011360029633367414, + "loss": 0.0981, + "step": 47291 + }, + { + "epoch": 0.4105172698153662, + "grad_norm": 0.57421875, + "learning_rate": 0.0011359732246837627, + "loss": 0.0767, + "step": 47292 + }, + { + "epoch": 0.4105259502955704, + "grad_norm": 0.43359375, + "learning_rate": 0.001135943485991444, + "loss": 
0.1035, + "step": 47293 + }, + { + "epoch": 0.4105346307757745, + "grad_norm": 1.140625, + "learning_rate": 0.0011359137472598184, + "loss": 0.1318, + "step": 47294 + }, + { + "epoch": 0.4105433112559787, + "grad_norm": 0.2470703125, + "learning_rate": 0.0011358840084889177, + "loss": 0.1465, + "step": 47295 + }, + { + "epoch": 0.41055199173618284, + "grad_norm": 0.1376953125, + "learning_rate": 0.001135854269678775, + "loss": 0.1133, + "step": 47296 + }, + { + "epoch": 0.41056067221638703, + "grad_norm": 0.1650390625, + "learning_rate": 0.0011358245308294224, + "loss": 0.0879, + "step": 47297 + }, + { + "epoch": 0.41056935269659117, + "grad_norm": 0.1064453125, + "learning_rate": 0.0011357947919408927, + "loss": 0.0645, + "step": 47298 + }, + { + "epoch": 0.41057803317679537, + "grad_norm": 0.345703125, + "learning_rate": 0.0011357650530132183, + "loss": 0.1055, + "step": 47299 + }, + { + "epoch": 0.4105867136569995, + "grad_norm": 0.4765625, + "learning_rate": 0.001135735314046432, + "loss": 0.1113, + "step": 47300 + }, + { + "epoch": 0.4105953941372037, + "grad_norm": 0.76171875, + "learning_rate": 0.0011357055750405658, + "loss": 0.0903, + "step": 47301 + }, + { + "epoch": 0.41060407461740783, + "grad_norm": 0.10986328125, + "learning_rate": 0.0011356758359956527, + "loss": 0.0874, + "step": 47302 + }, + { + "epoch": 0.410612755097612, + "grad_norm": 0.125, + "learning_rate": 0.001135646096911725, + "loss": 0.0791, + "step": 47303 + }, + { + "epoch": 0.41062143557781616, + "grad_norm": 0.271484375, + "learning_rate": 0.0011356163577888154, + "loss": 0.1152, + "step": 47304 + }, + { + "epoch": 0.41063011605802036, + "grad_norm": 0.404296875, + "learning_rate": 0.0011355866186269564, + "loss": 0.0918, + "step": 47305 + }, + { + "epoch": 0.4106387965382245, + "grad_norm": 0.72265625, + "learning_rate": 0.00113555687942618, + "loss": 0.1099, + "step": 47306 + }, + { + "epoch": 0.4106474770184287, + "grad_norm": 0.490234375, + "learning_rate": 0.001135527140186519, + "loss": 0.1895, + "step": 47307 + }, + { + "epoch": 0.4106561574986328, + "grad_norm": 0.173828125, + "learning_rate": 0.0011354974009080065, + "loss": 0.0996, + "step": 47308 + }, + { + "epoch": 0.410664837978837, + "grad_norm": 0.65234375, + "learning_rate": 0.0011354676615906744, + "loss": 0.1011, + "step": 47309 + }, + { + "epoch": 0.41067351845904115, + "grad_norm": 0.1669921875, + "learning_rate": 0.0011354379222345554, + "loss": 0.1016, + "step": 47310 + }, + { + "epoch": 0.41068219893924535, + "grad_norm": 0.263671875, + "learning_rate": 0.0011354081828396823, + "loss": 0.0869, + "step": 47311 + }, + { + "epoch": 0.4106908794194495, + "grad_norm": 0.91015625, + "learning_rate": 0.001135378443406087, + "loss": 0.1079, + "step": 47312 + }, + { + "epoch": 0.4106995598996536, + "grad_norm": 0.2333984375, + "learning_rate": 0.0011353487039338023, + "loss": 0.0791, + "step": 47313 + }, + { + "epoch": 0.4107082403798578, + "grad_norm": 0.1181640625, + "learning_rate": 0.0011353189644228612, + "loss": 0.0957, + "step": 47314 + }, + { + "epoch": 0.41071692086006195, + "grad_norm": 0.28515625, + "learning_rate": 0.0011352892248732954, + "loss": 0.0703, + "step": 47315 + }, + { + "epoch": 0.41072560134026614, + "grad_norm": 0.24609375, + "learning_rate": 0.0011352594852851378, + "loss": 0.0693, + "step": 47316 + }, + { + "epoch": 0.4107342818204703, + "grad_norm": 1.3515625, + "learning_rate": 0.0011352297456584215, + "loss": 0.1416, + "step": 47317 + }, + { + "epoch": 0.4107429623006745, + "grad_norm": 0.2412109375, + 
"learning_rate": 0.001135200005993178, + "loss": 0.0938, + "step": 47318 + }, + { + "epoch": 0.4107516427808786, + "grad_norm": 0.15234375, + "learning_rate": 0.0011351702662894406, + "loss": 0.0845, + "step": 47319 + }, + { + "epoch": 0.4107603232610828, + "grad_norm": 0.318359375, + "learning_rate": 0.0011351405265472411, + "loss": 0.0801, + "step": 47320 + }, + { + "epoch": 0.41076900374128694, + "grad_norm": 0.21875, + "learning_rate": 0.0011351107867666127, + "loss": 0.084, + "step": 47321 + }, + { + "epoch": 0.41077768422149114, + "grad_norm": 0.10400390625, + "learning_rate": 0.0011350810469475878, + "loss": 0.085, + "step": 47322 + }, + { + "epoch": 0.4107863647016953, + "grad_norm": 0.796875, + "learning_rate": 0.001135051307090199, + "loss": 0.1055, + "step": 47323 + }, + { + "epoch": 0.41079504518189947, + "grad_norm": 0.421875, + "learning_rate": 0.0011350215671944782, + "loss": 0.1108, + "step": 47324 + }, + { + "epoch": 0.4108037256621036, + "grad_norm": 0.39453125, + "learning_rate": 0.0011349918272604584, + "loss": 0.0825, + "step": 47325 + }, + { + "epoch": 0.4108124061423078, + "grad_norm": 0.2138671875, + "learning_rate": 0.0011349620872881721, + "loss": 0.0791, + "step": 47326 + }, + { + "epoch": 0.41082108662251193, + "grad_norm": 0.1318359375, + "learning_rate": 0.001134932347277652, + "loss": 0.1162, + "step": 47327 + }, + { + "epoch": 0.4108297671027161, + "grad_norm": 0.271484375, + "learning_rate": 0.0011349026072289296, + "loss": 0.1035, + "step": 47328 + }, + { + "epoch": 0.41083844758292026, + "grad_norm": 0.177734375, + "learning_rate": 0.001134872867142039, + "loss": 0.1025, + "step": 47329 + }, + { + "epoch": 0.41084712806312446, + "grad_norm": 0.07275390625, + "learning_rate": 0.0011348431270170119, + "loss": 0.0859, + "step": 47330 + }, + { + "epoch": 0.4108558085433286, + "grad_norm": 0.455078125, + "learning_rate": 0.0011348133868538809, + "loss": 0.0913, + "step": 47331 + }, + { + "epoch": 0.4108644890235328, + "grad_norm": 0.1728515625, + "learning_rate": 0.0011347836466526783, + "loss": 0.0996, + "step": 47332 + }, + { + "epoch": 0.4108731695037369, + "grad_norm": 0.390625, + "learning_rate": 0.001134753906413437, + "loss": 0.125, + "step": 47333 + }, + { + "epoch": 0.4108818499839411, + "grad_norm": 0.193359375, + "learning_rate": 0.0011347241661361892, + "loss": 0.0654, + "step": 47334 + }, + { + "epoch": 0.41089053046414525, + "grad_norm": 0.23046875, + "learning_rate": 0.0011346944258209676, + "loss": 0.1387, + "step": 47335 + }, + { + "epoch": 0.41089921094434945, + "grad_norm": 0.22265625, + "learning_rate": 0.0011346646854678043, + "loss": 0.1182, + "step": 47336 + }, + { + "epoch": 0.4109078914245536, + "grad_norm": 0.55078125, + "learning_rate": 0.0011346349450767328, + "loss": 0.2188, + "step": 47337 + }, + { + "epoch": 0.4109165719047578, + "grad_norm": 0.34375, + "learning_rate": 0.0011346052046477847, + "loss": 0.0908, + "step": 47338 + }, + { + "epoch": 0.4109252523849619, + "grad_norm": 0.201171875, + "learning_rate": 0.0011345754641809932, + "loss": 0.1006, + "step": 47339 + }, + { + "epoch": 0.4109339328651661, + "grad_norm": 0.515625, + "learning_rate": 0.0011345457236763906, + "loss": 0.1226, + "step": 47340 + }, + { + "epoch": 0.41094261334537024, + "grad_norm": 0.12109375, + "learning_rate": 0.0011345159831340089, + "loss": 0.064, + "step": 47341 + }, + { + "epoch": 0.41095129382557444, + "grad_norm": 0.263671875, + "learning_rate": 0.0011344862425538811, + "loss": 0.0654, + "step": 47342 + }, + { + "epoch": 0.4109599743057786, + 
"grad_norm": 0.10302734375, + "learning_rate": 0.00113445650193604, + "loss": 0.1021, + "step": 47343 + }, + { + "epoch": 0.41096865478598277, + "grad_norm": 0.076171875, + "learning_rate": 0.0011344267612805174, + "loss": 0.1279, + "step": 47344 + }, + { + "epoch": 0.4109773352661869, + "grad_norm": 0.12158203125, + "learning_rate": 0.0011343970205873464, + "loss": 0.1074, + "step": 47345 + }, + { + "epoch": 0.4109860157463911, + "grad_norm": 0.126953125, + "learning_rate": 0.0011343672798565592, + "loss": 0.1025, + "step": 47346 + }, + { + "epoch": 0.41099469622659524, + "grad_norm": 0.6171875, + "learning_rate": 0.0011343375390881884, + "loss": 0.1357, + "step": 47347 + }, + { + "epoch": 0.41100337670679943, + "grad_norm": 0.3203125, + "learning_rate": 0.0011343077982822669, + "loss": 0.0991, + "step": 47348 + }, + { + "epoch": 0.41101205718700357, + "grad_norm": 0.12158203125, + "learning_rate": 0.001134278057438827, + "loss": 0.1582, + "step": 47349 + }, + { + "epoch": 0.41102073766720776, + "grad_norm": 0.330078125, + "learning_rate": 0.0011342483165579006, + "loss": 0.0967, + "step": 47350 + }, + { + "epoch": 0.4110294181474119, + "grad_norm": 0.29296875, + "learning_rate": 0.001134218575639521, + "loss": 0.0938, + "step": 47351 + }, + { + "epoch": 0.4110380986276161, + "grad_norm": 0.16796875, + "learning_rate": 0.0011341888346837205, + "loss": 0.085, + "step": 47352 + }, + { + "epoch": 0.4110467791078202, + "grad_norm": 0.376953125, + "learning_rate": 0.0011341590936905317, + "loss": 0.0762, + "step": 47353 + }, + { + "epoch": 0.4110554595880244, + "grad_norm": 0.1201171875, + "learning_rate": 0.0011341293526599867, + "loss": 0.085, + "step": 47354 + }, + { + "epoch": 0.41106414006822856, + "grad_norm": 0.1640625, + "learning_rate": 0.0011340996115921185, + "loss": 0.0654, + "step": 47355 + }, + { + "epoch": 0.41107282054843275, + "grad_norm": 0.330078125, + "learning_rate": 0.0011340698704869596, + "loss": 0.1025, + "step": 47356 + }, + { + "epoch": 0.4110815010286369, + "grad_norm": 0.091796875, + "learning_rate": 0.001134040129344543, + "loss": 0.1074, + "step": 47357 + }, + { + "epoch": 0.4110901815088411, + "grad_norm": 0.10400390625, + "learning_rate": 0.0011340103881648997, + "loss": 0.0791, + "step": 47358 + }, + { + "epoch": 0.4110988619890452, + "grad_norm": 0.66015625, + "learning_rate": 0.0011339806469480634, + "loss": 0.1074, + "step": 47359 + }, + { + "epoch": 0.4111075424692494, + "grad_norm": 0.173828125, + "learning_rate": 0.0011339509056940665, + "loss": 0.1543, + "step": 47360 + }, + { + "epoch": 0.41111622294945355, + "grad_norm": 1.0390625, + "learning_rate": 0.0011339211644029413, + "loss": 0.0903, + "step": 47361 + }, + { + "epoch": 0.41112490342965774, + "grad_norm": 0.375, + "learning_rate": 0.0011338914230747206, + "loss": 0.1113, + "step": 47362 + }, + { + "epoch": 0.4111335839098619, + "grad_norm": 0.1142578125, + "learning_rate": 0.0011338616817094362, + "loss": 0.0854, + "step": 47363 + }, + { + "epoch": 0.41114226439006607, + "grad_norm": 0.1474609375, + "learning_rate": 0.0011338319403071215, + "loss": 0.0835, + "step": 47364 + }, + { + "epoch": 0.4111509448702702, + "grad_norm": 0.5078125, + "learning_rate": 0.0011338021988678089, + "loss": 0.0991, + "step": 47365 + }, + { + "epoch": 0.4111596253504744, + "grad_norm": 0.6875, + "learning_rate": 0.0011337724573915306, + "loss": 0.0864, + "step": 47366 + }, + { + "epoch": 0.41116830583067854, + "grad_norm": 0.1259765625, + "learning_rate": 0.0011337427158783192, + "loss": 0.1221, + "step": 47367 + }, 
+ { + "epoch": 0.41117698631088273, + "grad_norm": 0.56640625, + "learning_rate": 0.0011337129743282074, + "loss": 0.0771, + "step": 47368 + }, + { + "epoch": 0.41118566679108687, + "grad_norm": 0.271484375, + "learning_rate": 0.0011336832327412277, + "loss": 0.0991, + "step": 47369 + }, + { + "epoch": 0.41119434727129106, + "grad_norm": 0.1298828125, + "learning_rate": 0.0011336534911174122, + "loss": 0.125, + "step": 47370 + }, + { + "epoch": 0.4112030277514952, + "grad_norm": 0.484375, + "learning_rate": 0.0011336237494567938, + "loss": 0.0962, + "step": 47371 + }, + { + "epoch": 0.4112117082316994, + "grad_norm": 0.416015625, + "learning_rate": 0.0011335940077594052, + "loss": 0.0977, + "step": 47372 + }, + { + "epoch": 0.41122038871190353, + "grad_norm": 0.546875, + "learning_rate": 0.0011335642660252785, + "loss": 0.127, + "step": 47373 + }, + { + "epoch": 0.4112290691921077, + "grad_norm": 0.11767578125, + "learning_rate": 0.0011335345242544463, + "loss": 0.0723, + "step": 47374 + }, + { + "epoch": 0.41123774967231186, + "grad_norm": 0.18359375, + "learning_rate": 0.0011335047824469415, + "loss": 0.0967, + "step": 47375 + }, + { + "epoch": 0.41124643015251605, + "grad_norm": 0.0673828125, + "learning_rate": 0.0011334750406027962, + "loss": 0.0898, + "step": 47376 + }, + { + "epoch": 0.4112551106327202, + "grad_norm": 0.1953125, + "learning_rate": 0.0011334452987220433, + "loss": 0.1611, + "step": 47377 + }, + { + "epoch": 0.4112637911129244, + "grad_norm": 0.14453125, + "learning_rate": 0.001133415556804715, + "loss": 0.0874, + "step": 47378 + }, + { + "epoch": 0.4112724715931285, + "grad_norm": 0.248046875, + "learning_rate": 0.0011333858148508443, + "loss": 0.0684, + "step": 47379 + }, + { + "epoch": 0.4112811520733327, + "grad_norm": 0.11767578125, + "learning_rate": 0.0011333560728604625, + "loss": 0.0664, + "step": 47380 + }, + { + "epoch": 0.41128983255353685, + "grad_norm": 0.279296875, + "learning_rate": 0.0011333263308336038, + "loss": 0.1377, + "step": 47381 + }, + { + "epoch": 0.41129851303374104, + "grad_norm": 0.267578125, + "learning_rate": 0.0011332965887702997, + "loss": 0.0918, + "step": 47382 + }, + { + "epoch": 0.4113071935139452, + "grad_norm": 0.2265625, + "learning_rate": 0.0011332668466705828, + "loss": 0.0825, + "step": 47383 + }, + { + "epoch": 0.4113158739941494, + "grad_norm": 0.2255859375, + "learning_rate": 0.0011332371045344857, + "loss": 0.1523, + "step": 47384 + }, + { + "epoch": 0.4113245544743535, + "grad_norm": 0.2021484375, + "learning_rate": 0.0011332073623620414, + "loss": 0.0938, + "step": 47385 + }, + { + "epoch": 0.4113332349545577, + "grad_norm": 0.1650390625, + "learning_rate": 0.001133177620153282, + "loss": 0.0957, + "step": 47386 + }, + { + "epoch": 0.41134191543476184, + "grad_norm": 0.5234375, + "learning_rate": 0.0011331478779082395, + "loss": 0.1211, + "step": 47387 + }, + { + "epoch": 0.41135059591496603, + "grad_norm": 0.4375, + "learning_rate": 0.0011331181356269478, + "loss": 0.1064, + "step": 47388 + }, + { + "epoch": 0.41135927639517017, + "grad_norm": 0.55078125, + "learning_rate": 0.0011330883933094377, + "loss": 0.0894, + "step": 47389 + }, + { + "epoch": 0.41136795687537436, + "grad_norm": 0.119140625, + "learning_rate": 0.0011330586509557432, + "loss": 0.1099, + "step": 47390 + }, + { + "epoch": 0.4113766373555785, + "grad_norm": 1.2265625, + "learning_rate": 0.0011330289085658962, + "loss": 0.2002, + "step": 47391 + }, + { + "epoch": 0.4113853178357827, + "grad_norm": 0.10986328125, + "learning_rate": 
0.0011329991661399293, + "loss": 0.0874, + "step": 47392 + }, + { + "epoch": 0.41139399831598683, + "grad_norm": 0.150390625, + "learning_rate": 0.0011329694236778748, + "loss": 0.0825, + "step": 47393 + }, + { + "epoch": 0.411402678796191, + "grad_norm": 0.60546875, + "learning_rate": 0.0011329396811797657, + "loss": 0.1099, + "step": 47394 + }, + { + "epoch": 0.41141135927639516, + "grad_norm": 0.23046875, + "learning_rate": 0.0011329099386456342, + "loss": 0.1338, + "step": 47395 + }, + { + "epoch": 0.41142003975659935, + "grad_norm": 0.134765625, + "learning_rate": 0.0011328801960755128, + "loss": 0.0894, + "step": 47396 + }, + { + "epoch": 0.4114287202368035, + "grad_norm": 0.0673828125, + "learning_rate": 0.001132850453469434, + "loss": 0.0815, + "step": 47397 + }, + { + "epoch": 0.4114374007170077, + "grad_norm": 0.455078125, + "learning_rate": 0.0011328207108274306, + "loss": 0.106, + "step": 47398 + }, + { + "epoch": 0.4114460811972118, + "grad_norm": 0.2275390625, + "learning_rate": 0.001132790968149535, + "loss": 0.1387, + "step": 47399 + }, + { + "epoch": 0.411454761677416, + "grad_norm": 0.39453125, + "learning_rate": 0.0011327612254357798, + "loss": 0.0913, + "step": 47400 + }, + { + "epoch": 0.41146344215762015, + "grad_norm": 0.09375, + "learning_rate": 0.0011327314826861969, + "loss": 0.0972, + "step": 47401 + }, + { + "epoch": 0.41147212263782434, + "grad_norm": 0.486328125, + "learning_rate": 0.00113270173990082, + "loss": 0.1123, + "step": 47402 + }, + { + "epoch": 0.4114808031180285, + "grad_norm": 0.4140625, + "learning_rate": 0.0011326719970796803, + "loss": 0.1182, + "step": 47403 + }, + { + "epoch": 0.4114894835982327, + "grad_norm": 0.263671875, + "learning_rate": 0.0011326422542228113, + "loss": 0.1025, + "step": 47404 + }, + { + "epoch": 0.4114981640784368, + "grad_norm": 0.17578125, + "learning_rate": 0.0011326125113302454, + "loss": 0.1416, + "step": 47405 + }, + { + "epoch": 0.411506844558641, + "grad_norm": 0.169921875, + "learning_rate": 0.0011325827684020148, + "loss": 0.0762, + "step": 47406 + }, + { + "epoch": 0.41151552503884514, + "grad_norm": 0.13671875, + "learning_rate": 0.0011325530254381521, + "loss": 0.0693, + "step": 47407 + }, + { + "epoch": 0.41152420551904934, + "grad_norm": 0.263671875, + "learning_rate": 0.0011325232824386901, + "loss": 0.0957, + "step": 47408 + }, + { + "epoch": 0.4115328859992535, + "grad_norm": 0.625, + "learning_rate": 0.001132493539403661, + "loss": 0.106, + "step": 47409 + }, + { + "epoch": 0.41154156647945767, + "grad_norm": 0.10302734375, + "learning_rate": 0.0011324637963330973, + "loss": 0.1035, + "step": 47410 + }, + { + "epoch": 0.4115502469596618, + "grad_norm": 0.251953125, + "learning_rate": 0.001132434053227032, + "loss": 0.0864, + "step": 47411 + }, + { + "epoch": 0.411558927439866, + "grad_norm": 0.08984375, + "learning_rate": 0.001132404310085497, + "loss": 0.0967, + "step": 47412 + }, + { + "epoch": 0.41156760792007013, + "grad_norm": 0.14453125, + "learning_rate": 0.0011323745669085256, + "loss": 0.1299, + "step": 47413 + }, + { + "epoch": 0.4115762884002743, + "grad_norm": 0.1669921875, + "learning_rate": 0.0011323448236961492, + "loss": 0.1504, + "step": 47414 + }, + { + "epoch": 0.41158496888047846, + "grad_norm": 0.11376953125, + "learning_rate": 0.0011323150804484015, + "loss": 0.0981, + "step": 47415 + }, + { + "epoch": 0.41159364936068266, + "grad_norm": 0.29296875, + "learning_rate": 0.0011322853371653144, + "loss": 0.0718, + "step": 47416 + }, + { + "epoch": 0.4116023298408868, + "grad_norm": 
0.2197265625, + "learning_rate": 0.0011322555938469204, + "loss": 0.104, + "step": 47417 + }, + { + "epoch": 0.411611010321091, + "grad_norm": 0.0947265625, + "learning_rate": 0.0011322258504932524, + "loss": 0.0825, + "step": 47418 + }, + { + "epoch": 0.4116196908012951, + "grad_norm": 0.29296875, + "learning_rate": 0.0011321961071043423, + "loss": 0.1084, + "step": 47419 + }, + { + "epoch": 0.4116283712814993, + "grad_norm": 0.5390625, + "learning_rate": 0.0011321663636802235, + "loss": 0.1074, + "step": 47420 + }, + { + "epoch": 0.41163705176170345, + "grad_norm": 0.69140625, + "learning_rate": 0.0011321366202209275, + "loss": 0.0786, + "step": 47421 + }, + { + "epoch": 0.41164573224190765, + "grad_norm": 0.232421875, + "learning_rate": 0.0011321068767264878, + "loss": 0.0732, + "step": 47422 + }, + { + "epoch": 0.4116544127221118, + "grad_norm": 0.37109375, + "learning_rate": 0.0011320771331969361, + "loss": 0.0752, + "step": 47423 + }, + { + "epoch": 0.411663093202316, + "grad_norm": 0.09814453125, + "learning_rate": 0.0011320473896323058, + "loss": 0.0835, + "step": 47424 + }, + { + "epoch": 0.4116717736825201, + "grad_norm": 0.07861328125, + "learning_rate": 0.001132017646032629, + "loss": 0.0786, + "step": 47425 + }, + { + "epoch": 0.4116804541627243, + "grad_norm": 0.11767578125, + "learning_rate": 0.001131987902397938, + "loss": 0.1201, + "step": 47426 + }, + { + "epoch": 0.41168913464292844, + "grad_norm": 0.1123046875, + "learning_rate": 0.0011319581587282655, + "loss": 0.104, + "step": 47427 + }, + { + "epoch": 0.41169781512313264, + "grad_norm": 1.9921875, + "learning_rate": 0.0011319284150236439, + "loss": 0.209, + "step": 47428 + }, + { + "epoch": 0.4117064956033368, + "grad_norm": 0.3984375, + "learning_rate": 0.001131898671284106, + "loss": 0.0938, + "step": 47429 + }, + { + "epoch": 0.41171517608354097, + "grad_norm": 0.318359375, + "learning_rate": 0.0011318689275096844, + "loss": 0.0933, + "step": 47430 + }, + { + "epoch": 0.4117238565637451, + "grad_norm": 0.271484375, + "learning_rate": 0.001131839183700411, + "loss": 0.1182, + "step": 47431 + }, + { + "epoch": 0.4117325370439493, + "grad_norm": 0.10205078125, + "learning_rate": 0.0011318094398563188, + "loss": 0.0972, + "step": 47432 + }, + { + "epoch": 0.41174121752415344, + "grad_norm": 0.349609375, + "learning_rate": 0.0011317796959774403, + "loss": 0.1064, + "step": 47433 + }, + { + "epoch": 0.41174989800435763, + "grad_norm": 0.1865234375, + "learning_rate": 0.0011317499520638083, + "loss": 0.0977, + "step": 47434 + }, + { + "epoch": 0.41175857848456177, + "grad_norm": 0.267578125, + "learning_rate": 0.001131720208115455, + "loss": 0.0957, + "step": 47435 + }, + { + "epoch": 0.4117672589647659, + "grad_norm": 0.0986328125, + "learning_rate": 0.0011316904641324127, + "loss": 0.0864, + "step": 47436 + }, + { + "epoch": 0.4117759394449701, + "grad_norm": 0.1806640625, + "learning_rate": 0.001131660720114714, + "loss": 0.0879, + "step": 47437 + }, + { + "epoch": 0.41178461992517423, + "grad_norm": 0.2373046875, + "learning_rate": 0.0011316309760623922, + "loss": 0.1396, + "step": 47438 + }, + { + "epoch": 0.4117933004053784, + "grad_norm": 0.0859375, + "learning_rate": 0.0011316012319754788, + "loss": 0.0791, + "step": 47439 + }, + { + "epoch": 0.41180198088558256, + "grad_norm": 0.34765625, + "learning_rate": 0.0011315714878540068, + "loss": 0.0747, + "step": 47440 + }, + { + "epoch": 0.41181066136578676, + "grad_norm": 0.27734375, + "learning_rate": 0.0011315417436980086, + "loss": 0.1348, + "step": 47441 + }, + 
{ + "epoch": 0.4118193418459909, + "grad_norm": 0.11669921875, + "learning_rate": 0.001131511999507517, + "loss": 0.0967, + "step": 47442 + }, + { + "epoch": 0.4118280223261951, + "grad_norm": 0.2119140625, + "learning_rate": 0.0011314822552825645, + "loss": 0.0981, + "step": 47443 + }, + { + "epoch": 0.4118367028063992, + "grad_norm": 0.30859375, + "learning_rate": 0.0011314525110231836, + "loss": 0.1128, + "step": 47444 + }, + { + "epoch": 0.4118453832866034, + "grad_norm": 0.23046875, + "learning_rate": 0.001131422766729406, + "loss": 0.1094, + "step": 47445 + }, + { + "epoch": 0.41185406376680755, + "grad_norm": 0.1513671875, + "learning_rate": 0.0011313930224012656, + "loss": 0.1035, + "step": 47446 + }, + { + "epoch": 0.41186274424701175, + "grad_norm": 0.08935546875, + "learning_rate": 0.0011313632780387943, + "loss": 0.0957, + "step": 47447 + }, + { + "epoch": 0.4118714247272159, + "grad_norm": 0.08447265625, + "learning_rate": 0.0011313335336420242, + "loss": 0.0684, + "step": 47448 + }, + { + "epoch": 0.4118801052074201, + "grad_norm": 0.1455078125, + "learning_rate": 0.0011313037892109886, + "loss": 0.1089, + "step": 47449 + }, + { + "epoch": 0.4118887856876242, + "grad_norm": 0.328125, + "learning_rate": 0.0011312740447457193, + "loss": 0.0859, + "step": 47450 + }, + { + "epoch": 0.4118974661678284, + "grad_norm": 0.09521484375, + "learning_rate": 0.001131244300246249, + "loss": 0.0874, + "step": 47451 + }, + { + "epoch": 0.41190614664803255, + "grad_norm": 0.56640625, + "learning_rate": 0.0011312145557126109, + "loss": 0.1396, + "step": 47452 + }, + { + "epoch": 0.41191482712823674, + "grad_norm": 0.67578125, + "learning_rate": 0.0011311848111448368, + "loss": 0.1494, + "step": 47453 + }, + { + "epoch": 0.4119235076084409, + "grad_norm": 0.2578125, + "learning_rate": 0.0011311550665429593, + "loss": 0.1328, + "step": 47454 + }, + { + "epoch": 0.41193218808864507, + "grad_norm": 0.10205078125, + "learning_rate": 0.0011311253219070114, + "loss": 0.1035, + "step": 47455 + }, + { + "epoch": 0.4119408685688492, + "grad_norm": 0.11328125, + "learning_rate": 0.0011310955772370253, + "loss": 0.1138, + "step": 47456 + }, + { + "epoch": 0.4119495490490534, + "grad_norm": 0.083984375, + "learning_rate": 0.0011310658325330335, + "loss": 0.1143, + "step": 47457 + }, + { + "epoch": 0.41195822952925754, + "grad_norm": 0.11669921875, + "learning_rate": 0.0011310360877950683, + "loss": 0.083, + "step": 47458 + }, + { + "epoch": 0.41196691000946173, + "grad_norm": 0.14453125, + "learning_rate": 0.0011310063430231628, + "loss": 0.0664, + "step": 47459 + }, + { + "epoch": 0.41197559048966587, + "grad_norm": 0.15625, + "learning_rate": 0.0011309765982173488, + "loss": 0.1025, + "step": 47460 + }, + { + "epoch": 0.41198427096987006, + "grad_norm": 0.1318359375, + "learning_rate": 0.0011309468533776596, + "loss": 0.1167, + "step": 47461 + }, + { + "epoch": 0.4119929514500742, + "grad_norm": 0.169921875, + "learning_rate": 0.0011309171085041275, + "loss": 0.0747, + "step": 47462 + }, + { + "epoch": 0.4120016319302784, + "grad_norm": 0.177734375, + "learning_rate": 0.0011308873635967847, + "loss": 0.0879, + "step": 47463 + }, + { + "epoch": 0.4120103124104825, + "grad_norm": 0.1630859375, + "learning_rate": 0.0011308576186556641, + "loss": 0.1118, + "step": 47464 + }, + { + "epoch": 0.4120189928906867, + "grad_norm": 0.09033203125, + "learning_rate": 0.001130827873680798, + "loss": 0.0996, + "step": 47465 + }, + { + "epoch": 0.41202767337089086, + "grad_norm": 0.14453125, + "learning_rate": 
0.001130798128672219, + "loss": 0.104, + "step": 47466 + }, + { + "epoch": 0.41203635385109505, + "grad_norm": 0.2431640625, + "learning_rate": 0.0011307683836299595, + "loss": 0.1006, + "step": 47467 + }, + { + "epoch": 0.4120450343312992, + "grad_norm": 0.314453125, + "learning_rate": 0.0011307386385540524, + "loss": 0.0713, + "step": 47468 + }, + { + "epoch": 0.4120537148115034, + "grad_norm": 0.1884765625, + "learning_rate": 0.0011307088934445297, + "loss": 0.0913, + "step": 47469 + }, + { + "epoch": 0.4120623952917075, + "grad_norm": 0.29296875, + "learning_rate": 0.0011306791483014244, + "loss": 0.1172, + "step": 47470 + }, + { + "epoch": 0.4120710757719117, + "grad_norm": 0.31640625, + "learning_rate": 0.0011306494031247687, + "loss": 0.1104, + "step": 47471 + }, + { + "epoch": 0.41207975625211585, + "grad_norm": 0.1298828125, + "learning_rate": 0.0011306196579145954, + "loss": 0.0889, + "step": 47472 + }, + { + "epoch": 0.41208843673232004, + "grad_norm": 0.142578125, + "learning_rate": 0.001130589912670937, + "loss": 0.085, + "step": 47473 + }, + { + "epoch": 0.4120971172125242, + "grad_norm": 0.09716796875, + "learning_rate": 0.0011305601673938259, + "loss": 0.0859, + "step": 47474 + }, + { + "epoch": 0.41210579769272837, + "grad_norm": 0.244140625, + "learning_rate": 0.0011305304220832948, + "loss": 0.1069, + "step": 47475 + }, + { + "epoch": 0.4121144781729325, + "grad_norm": 0.1640625, + "learning_rate": 0.0011305006767393757, + "loss": 0.1338, + "step": 47476 + }, + { + "epoch": 0.4121231586531367, + "grad_norm": 0.482421875, + "learning_rate": 0.0011304709313621018, + "loss": 0.0996, + "step": 47477 + }, + { + "epoch": 0.41213183913334084, + "grad_norm": 0.1845703125, + "learning_rate": 0.0011304411859515053, + "loss": 0.085, + "step": 47478 + }, + { + "epoch": 0.41214051961354503, + "grad_norm": 0.369140625, + "learning_rate": 0.0011304114405076187, + "loss": 0.125, + "step": 47479 + }, + { + "epoch": 0.41214920009374917, + "grad_norm": 0.162109375, + "learning_rate": 0.0011303816950304743, + "loss": 0.1001, + "step": 47480 + }, + { + "epoch": 0.41215788057395336, + "grad_norm": 0.259765625, + "learning_rate": 0.0011303519495201055, + "loss": 0.0977, + "step": 47481 + }, + { + "epoch": 0.4121665610541575, + "grad_norm": 0.365234375, + "learning_rate": 0.0011303222039765442, + "loss": 0.1338, + "step": 47482 + }, + { + "epoch": 0.4121752415343617, + "grad_norm": 0.5546875, + "learning_rate": 0.0011302924583998228, + "loss": 0.0947, + "step": 47483 + }, + { + "epoch": 0.41218392201456583, + "grad_norm": 0.2041015625, + "learning_rate": 0.001130262712789974, + "loss": 0.1104, + "step": 47484 + }, + { + "epoch": 0.41219260249477, + "grad_norm": 0.416015625, + "learning_rate": 0.0011302329671470303, + "loss": 0.0845, + "step": 47485 + }, + { + "epoch": 0.41220128297497416, + "grad_norm": 0.08642578125, + "learning_rate": 0.0011302032214710244, + "loss": 0.0586, + "step": 47486 + }, + { + "epoch": 0.41220996345517835, + "grad_norm": 0.263671875, + "learning_rate": 0.0011301734757619888, + "loss": 0.0859, + "step": 47487 + }, + { + "epoch": 0.4122186439353825, + "grad_norm": 0.353515625, + "learning_rate": 0.0011301437300199558, + "loss": 0.1108, + "step": 47488 + }, + { + "epoch": 0.4122273244155867, + "grad_norm": 0.1396484375, + "learning_rate": 0.001130113984244958, + "loss": 0.0869, + "step": 47489 + }, + { + "epoch": 0.4122360048957908, + "grad_norm": 0.298828125, + "learning_rate": 0.0011300842384370283, + "loss": 0.1074, + "step": 47490 + }, + { + "epoch": 
0.412244685375995, + "grad_norm": 0.287109375, + "learning_rate": 0.0011300544925961986, + "loss": 0.1328, + "step": 47491 + }, + { + "epoch": 0.41225336585619915, + "grad_norm": 0.419921875, + "learning_rate": 0.0011300247467225018, + "loss": 0.1172, + "step": 47492 + }, + { + "epoch": 0.41226204633640334, + "grad_norm": 0.09814453125, + "learning_rate": 0.0011299950008159705, + "loss": 0.1123, + "step": 47493 + }, + { + "epoch": 0.4122707268166075, + "grad_norm": 0.423828125, + "learning_rate": 0.001129965254876637, + "loss": 0.0854, + "step": 47494 + }, + { + "epoch": 0.4122794072968117, + "grad_norm": 0.302734375, + "learning_rate": 0.0011299355089045343, + "loss": 0.0874, + "step": 47495 + }, + { + "epoch": 0.4122880877770158, + "grad_norm": 0.478515625, + "learning_rate": 0.0011299057628996942, + "loss": 0.0938, + "step": 47496 + }, + { + "epoch": 0.41229676825722, + "grad_norm": 0.1767578125, + "learning_rate": 0.0011298760168621498, + "loss": 0.1001, + "step": 47497 + }, + { + "epoch": 0.41230544873742414, + "grad_norm": 0.1513671875, + "learning_rate": 0.0011298462707919335, + "loss": 0.1113, + "step": 47498 + }, + { + "epoch": 0.41231412921762833, + "grad_norm": 0.267578125, + "learning_rate": 0.0011298165246890772, + "loss": 0.1143, + "step": 47499 + }, + { + "epoch": 0.41232280969783247, + "grad_norm": 0.482421875, + "learning_rate": 0.0011297867785536146, + "loss": 0.1455, + "step": 47500 + }, + { + "epoch": 0.41233149017803666, + "grad_norm": 0.89453125, + "learning_rate": 0.0011297570323855774, + "loss": 0.1123, + "step": 47501 + }, + { + "epoch": 0.4123401706582408, + "grad_norm": 0.119140625, + "learning_rate": 0.0011297272861849985, + "loss": 0.0996, + "step": 47502 + }, + { + "epoch": 0.412348851138445, + "grad_norm": 0.322265625, + "learning_rate": 0.00112969753995191, + "loss": 0.0986, + "step": 47503 + }, + { + "epoch": 0.41235753161864913, + "grad_norm": 1.0390625, + "learning_rate": 0.0011296677936863454, + "loss": 0.1016, + "step": 47504 + }, + { + "epoch": 0.4123662120988533, + "grad_norm": 0.142578125, + "learning_rate": 0.0011296380473883357, + "loss": 0.0898, + "step": 47505 + }, + { + "epoch": 0.41237489257905746, + "grad_norm": 0.193359375, + "learning_rate": 0.0011296083010579148, + "loss": 0.0732, + "step": 47506 + }, + { + "epoch": 0.41238357305926165, + "grad_norm": 0.6640625, + "learning_rate": 0.0011295785546951147, + "loss": 0.1172, + "step": 47507 + }, + { + "epoch": 0.4123922535394658, + "grad_norm": 0.1474609375, + "learning_rate": 0.0011295488082999677, + "loss": 0.125, + "step": 47508 + }, + { + "epoch": 0.41240093401967, + "grad_norm": 0.1357421875, + "learning_rate": 0.0011295190618725065, + "loss": 0.1006, + "step": 47509 + }, + { + "epoch": 0.4124096144998741, + "grad_norm": 0.408203125, + "learning_rate": 0.0011294893154127636, + "loss": 0.0723, + "step": 47510 + }, + { + "epoch": 0.4124182949800783, + "grad_norm": 0.20703125, + "learning_rate": 0.001129459568920772, + "loss": 0.0635, + "step": 47511 + }, + { + "epoch": 0.41242697546028245, + "grad_norm": 0.146484375, + "learning_rate": 0.0011294298223965638, + "loss": 0.0947, + "step": 47512 + }, + { + "epoch": 0.41243565594048665, + "grad_norm": 0.33984375, + "learning_rate": 0.0011294000758401714, + "loss": 0.1387, + "step": 47513 + }, + { + "epoch": 0.4124443364206908, + "grad_norm": 0.0859375, + "learning_rate": 0.0011293703292516277, + "loss": 0.0742, + "step": 47514 + }, + { + "epoch": 0.412453016900895, + "grad_norm": 0.62890625, + "learning_rate": 0.0011293405826309651, + "loss": 
0.0698, + "step": 47515 + }, + { + "epoch": 0.4124616973810991, + "grad_norm": 0.734375, + "learning_rate": 0.001129310835978216, + "loss": 0.1484, + "step": 47516 + }, + { + "epoch": 0.4124703778613033, + "grad_norm": 0.59375, + "learning_rate": 0.001129281089293413, + "loss": 0.064, + "step": 47517 + }, + { + "epoch": 0.41247905834150744, + "grad_norm": 0.2080078125, + "learning_rate": 0.0011292513425765881, + "loss": 0.123, + "step": 47518 + }, + { + "epoch": 0.41248773882171164, + "grad_norm": 0.408203125, + "learning_rate": 0.0011292215958277753, + "loss": 0.1221, + "step": 47519 + }, + { + "epoch": 0.4124964193019158, + "grad_norm": 0.1328125, + "learning_rate": 0.0011291918490470058, + "loss": 0.0854, + "step": 47520 + }, + { + "epoch": 0.41250509978211997, + "grad_norm": 0.1240234375, + "learning_rate": 0.0011291621022343123, + "loss": 0.0757, + "step": 47521 + }, + { + "epoch": 0.4125137802623241, + "grad_norm": 0.07373046875, + "learning_rate": 0.0011291323553897282, + "loss": 0.0986, + "step": 47522 + }, + { + "epoch": 0.4125224607425283, + "grad_norm": 0.51171875, + "learning_rate": 0.0011291026085132848, + "loss": 0.1016, + "step": 47523 + }, + { + "epoch": 0.41253114122273243, + "grad_norm": 0.20703125, + "learning_rate": 0.0011290728616050155, + "loss": 0.1006, + "step": 47524 + }, + { + "epoch": 0.4125398217029366, + "grad_norm": 0.71875, + "learning_rate": 0.0011290431146649524, + "loss": 0.1099, + "step": 47525 + }, + { + "epoch": 0.41254850218314076, + "grad_norm": 0.1669921875, + "learning_rate": 0.001129013367693128, + "loss": 0.1094, + "step": 47526 + }, + { + "epoch": 0.41255718266334496, + "grad_norm": 0.1865234375, + "learning_rate": 0.0011289836206895753, + "loss": 0.0811, + "step": 47527 + }, + { + "epoch": 0.4125658631435491, + "grad_norm": 0.1123046875, + "learning_rate": 0.0011289538736543266, + "loss": 0.0703, + "step": 47528 + }, + { + "epoch": 0.4125745436237533, + "grad_norm": 0.08984375, + "learning_rate": 0.001128924126587414, + "loss": 0.083, + "step": 47529 + }, + { + "epoch": 0.4125832241039574, + "grad_norm": 0.1318359375, + "learning_rate": 0.0011288943794888712, + "loss": 0.1089, + "step": 47530 + }, + { + "epoch": 0.4125919045841616, + "grad_norm": 0.498046875, + "learning_rate": 0.001128864632358729, + "loss": 0.125, + "step": 47531 + }, + { + "epoch": 0.41260058506436575, + "grad_norm": 0.111328125, + "learning_rate": 0.0011288348851970216, + "loss": 0.1216, + "step": 47532 + }, + { + "epoch": 0.41260926554456995, + "grad_norm": 0.080078125, + "learning_rate": 0.0011288051380037806, + "loss": 0.0859, + "step": 47533 + }, + { + "epoch": 0.4126179460247741, + "grad_norm": 0.1171875, + "learning_rate": 0.0011287753907790385, + "loss": 0.0933, + "step": 47534 + }, + { + "epoch": 0.4126266265049783, + "grad_norm": 0.267578125, + "learning_rate": 0.0011287456435228283, + "loss": 0.0957, + "step": 47535 + }, + { + "epoch": 0.4126353069851824, + "grad_norm": 0.07373046875, + "learning_rate": 0.0011287158962351822, + "loss": 0.0737, + "step": 47536 + }, + { + "epoch": 0.4126439874653866, + "grad_norm": 0.21484375, + "learning_rate": 0.001128686148916133, + "loss": 0.1221, + "step": 47537 + }, + { + "epoch": 0.41265266794559075, + "grad_norm": 0.59375, + "learning_rate": 0.0011286564015657126, + "loss": 0.1079, + "step": 47538 + }, + { + "epoch": 0.41266134842579494, + "grad_norm": 0.412109375, + "learning_rate": 0.0011286266541839542, + "loss": 0.0898, + "step": 47539 + }, + { + "epoch": 0.4126700289059991, + "grad_norm": 0.64453125, + "learning_rate": 
0.0011285969067708903, + "loss": 0.1045, + "step": 47540 + }, + { + "epoch": 0.41267870938620327, + "grad_norm": 0.10986328125, + "learning_rate": 0.0011285671593265533, + "loss": 0.0796, + "step": 47541 + }, + { + "epoch": 0.4126873898664074, + "grad_norm": 0.365234375, + "learning_rate": 0.0011285374118509754, + "loss": 0.1279, + "step": 47542 + }, + { + "epoch": 0.4126960703466116, + "grad_norm": 0.08935546875, + "learning_rate": 0.0011285076643441896, + "loss": 0.0801, + "step": 47543 + }, + { + "epoch": 0.41270475082681574, + "grad_norm": 0.1923828125, + "learning_rate": 0.001128477916806228, + "loss": 0.1396, + "step": 47544 + }, + { + "epoch": 0.41271343130701993, + "grad_norm": 0.6171875, + "learning_rate": 0.0011284481692371239, + "loss": 0.0967, + "step": 47545 + }, + { + "epoch": 0.41272211178722407, + "grad_norm": 0.37109375, + "learning_rate": 0.0011284184216369087, + "loss": 0.1416, + "step": 47546 + }, + { + "epoch": 0.41273079226742826, + "grad_norm": 0.4375, + "learning_rate": 0.001128388674005616, + "loss": 0.064, + "step": 47547 + }, + { + "epoch": 0.4127394727476324, + "grad_norm": 0.27734375, + "learning_rate": 0.0011283589263432776, + "loss": 0.1133, + "step": 47548 + }, + { + "epoch": 0.4127481532278366, + "grad_norm": 0.09326171875, + "learning_rate": 0.0011283291786499264, + "loss": 0.1235, + "step": 47549 + }, + { + "epoch": 0.4127568337080407, + "grad_norm": 0.2255859375, + "learning_rate": 0.001128299430925595, + "loss": 0.0825, + "step": 47550 + }, + { + "epoch": 0.4127655141882449, + "grad_norm": 0.171875, + "learning_rate": 0.0011282696831703153, + "loss": 0.0947, + "step": 47551 + }, + { + "epoch": 0.41277419466844906, + "grad_norm": 0.2294921875, + "learning_rate": 0.0011282399353841211, + "loss": 0.1143, + "step": 47552 + }, + { + "epoch": 0.41278287514865325, + "grad_norm": 0.2109375, + "learning_rate": 0.0011282101875670434, + "loss": 0.0703, + "step": 47553 + }, + { + "epoch": 0.4127915556288574, + "grad_norm": 0.09619140625, + "learning_rate": 0.0011281804397191158, + "loss": 0.0977, + "step": 47554 + }, + { + "epoch": 0.4128002361090616, + "grad_norm": 0.462890625, + "learning_rate": 0.0011281506918403704, + "loss": 0.0928, + "step": 47555 + }, + { + "epoch": 0.4128089165892657, + "grad_norm": 0.296875, + "learning_rate": 0.0011281209439308398, + "loss": 0.0732, + "step": 47556 + }, + { + "epoch": 0.4128175970694699, + "grad_norm": 0.306640625, + "learning_rate": 0.0011280911959905564, + "loss": 0.0801, + "step": 47557 + }, + { + "epoch": 0.41282627754967405, + "grad_norm": 0.0771484375, + "learning_rate": 0.001128061448019553, + "loss": 0.0811, + "step": 47558 + }, + { + "epoch": 0.4128349580298782, + "grad_norm": 0.1201171875, + "learning_rate": 0.0011280317000178623, + "loss": 0.0923, + "step": 47559 + }, + { + "epoch": 0.4128436385100824, + "grad_norm": 0.310546875, + "learning_rate": 0.0011280019519855165, + "loss": 0.1113, + "step": 47560 + }, + { + "epoch": 0.4128523189902865, + "grad_norm": 0.4140625, + "learning_rate": 0.0011279722039225482, + "loss": 0.1484, + "step": 47561 + }, + { + "epoch": 0.4128609994704907, + "grad_norm": 0.5859375, + "learning_rate": 0.0011279424558289894, + "loss": 0.082, + "step": 47562 + }, + { + "epoch": 0.41286967995069485, + "grad_norm": 0.3671875, + "learning_rate": 0.0011279127077048737, + "loss": 0.0781, + "step": 47563 + }, + { + "epoch": 0.41287836043089904, + "grad_norm": 0.2314453125, + "learning_rate": 0.0011278829595502329, + "loss": 0.0947, + "step": 47564 + }, + { + "epoch": 0.4128870409111032, + 
"grad_norm": 0.95703125, + "learning_rate": 0.0011278532113650995, + "loss": 0.125, + "step": 47565 + }, + { + "epoch": 0.41289572139130737, + "grad_norm": 0.1396484375, + "learning_rate": 0.0011278234631495064, + "loss": 0.0664, + "step": 47566 + }, + { + "epoch": 0.4129044018715115, + "grad_norm": 1.875, + "learning_rate": 0.001127793714903486, + "loss": 0.1514, + "step": 47567 + }, + { + "epoch": 0.4129130823517157, + "grad_norm": 0.267578125, + "learning_rate": 0.0011277639666270709, + "loss": 0.0771, + "step": 47568 + }, + { + "epoch": 0.41292176283191984, + "grad_norm": 0.18359375, + "learning_rate": 0.0011277342183202935, + "loss": 0.0776, + "step": 47569 + }, + { + "epoch": 0.41293044331212403, + "grad_norm": 0.376953125, + "learning_rate": 0.0011277044699831863, + "loss": 0.1094, + "step": 47570 + }, + { + "epoch": 0.41293912379232817, + "grad_norm": 0.080078125, + "learning_rate": 0.001127674721615782, + "loss": 0.0747, + "step": 47571 + }, + { + "epoch": 0.41294780427253236, + "grad_norm": 0.44921875, + "learning_rate": 0.001127644973218113, + "loss": 0.1152, + "step": 47572 + }, + { + "epoch": 0.4129564847527365, + "grad_norm": 0.1005859375, + "learning_rate": 0.001127615224790212, + "loss": 0.1182, + "step": 47573 + }, + { + "epoch": 0.4129651652329407, + "grad_norm": 0.6328125, + "learning_rate": 0.001127585476332111, + "loss": 0.0874, + "step": 47574 + }, + { + "epoch": 0.4129738457131448, + "grad_norm": 1.015625, + "learning_rate": 0.0011275557278438435, + "loss": 0.1611, + "step": 47575 + }, + { + "epoch": 0.412982526193349, + "grad_norm": 0.11669921875, + "learning_rate": 0.0011275259793254412, + "loss": 0.0869, + "step": 47576 + }, + { + "epoch": 0.41299120667355316, + "grad_norm": 0.09619140625, + "learning_rate": 0.0011274962307769368, + "loss": 0.0894, + "step": 47577 + }, + { + "epoch": 0.41299988715375735, + "grad_norm": 0.08251953125, + "learning_rate": 0.0011274664821983632, + "loss": 0.1074, + "step": 47578 + }, + { + "epoch": 0.4130085676339615, + "grad_norm": 0.279296875, + "learning_rate": 0.0011274367335897522, + "loss": 0.1182, + "step": 47579 + }, + { + "epoch": 0.4130172481141657, + "grad_norm": 0.12158203125, + "learning_rate": 0.0011274069849511372, + "loss": 0.1211, + "step": 47580 + }, + { + "epoch": 0.4130259285943698, + "grad_norm": 0.1162109375, + "learning_rate": 0.0011273772362825503, + "loss": 0.0771, + "step": 47581 + }, + { + "epoch": 0.413034609074574, + "grad_norm": 0.435546875, + "learning_rate": 0.0011273474875840243, + "loss": 0.1113, + "step": 47582 + }, + { + "epoch": 0.41304328955477815, + "grad_norm": 0.201171875, + "learning_rate": 0.001127317738855591, + "loss": 0.0776, + "step": 47583 + }, + { + "epoch": 0.41305197003498234, + "grad_norm": 0.26953125, + "learning_rate": 0.0011272879900972835, + "loss": 0.1206, + "step": 47584 + }, + { + "epoch": 0.4130606505151865, + "grad_norm": 0.12158203125, + "learning_rate": 0.0011272582413091343, + "loss": 0.1191, + "step": 47585 + }, + { + "epoch": 0.41306933099539067, + "grad_norm": 0.1298828125, + "learning_rate": 0.001127228492491176, + "loss": 0.1133, + "step": 47586 + }, + { + "epoch": 0.4130780114755948, + "grad_norm": 0.10498046875, + "learning_rate": 0.001127198743643441, + "loss": 0.1123, + "step": 47587 + }, + { + "epoch": 0.413086691955799, + "grad_norm": 0.251953125, + "learning_rate": 0.0011271689947659619, + "loss": 0.1167, + "step": 47588 + }, + { + "epoch": 0.41309537243600314, + "grad_norm": 0.1279296875, + "learning_rate": 0.001127139245858771, + "loss": 0.0894, + "step": 
47589 + }, + { + "epoch": 0.41310405291620733, + "grad_norm": 0.6953125, + "learning_rate": 0.0011271094969219013, + "loss": 0.1514, + "step": 47590 + }, + { + "epoch": 0.41311273339641147, + "grad_norm": 0.146484375, + "learning_rate": 0.0011270797479553851, + "loss": 0.0903, + "step": 47591 + }, + { + "epoch": 0.41312141387661566, + "grad_norm": 0.07177734375, + "learning_rate": 0.0011270499989592547, + "loss": 0.085, + "step": 47592 + }, + { + "epoch": 0.4131300943568198, + "grad_norm": 0.322265625, + "learning_rate": 0.0011270202499335429, + "loss": 0.0669, + "step": 47593 + }, + { + "epoch": 0.413138774837024, + "grad_norm": 0.2294921875, + "learning_rate": 0.0011269905008782822, + "loss": 0.0996, + "step": 47594 + }, + { + "epoch": 0.41314745531722813, + "grad_norm": 0.107421875, + "learning_rate": 0.001126960751793505, + "loss": 0.1196, + "step": 47595 + }, + { + "epoch": 0.4131561357974323, + "grad_norm": 0.08544921875, + "learning_rate": 0.0011269310026792438, + "loss": 0.082, + "step": 47596 + }, + { + "epoch": 0.41316481627763646, + "grad_norm": 0.4296875, + "learning_rate": 0.0011269012535355314, + "loss": 0.0947, + "step": 47597 + }, + { + "epoch": 0.41317349675784065, + "grad_norm": 0.8125, + "learning_rate": 0.0011268715043624006, + "loss": 0.0977, + "step": 47598 + }, + { + "epoch": 0.4131821772380448, + "grad_norm": 0.08056640625, + "learning_rate": 0.0011268417551598831, + "loss": 0.1094, + "step": 47599 + }, + { + "epoch": 0.413190857718249, + "grad_norm": 0.2177734375, + "learning_rate": 0.001126812005928012, + "loss": 0.0923, + "step": 47600 + }, + { + "epoch": 0.4131995381984531, + "grad_norm": 0.1591796875, + "learning_rate": 0.0011267822566668193, + "loss": 0.0708, + "step": 47601 + }, + { + "epoch": 0.4132082186786573, + "grad_norm": 0.32421875, + "learning_rate": 0.0011267525073763386, + "loss": 0.0996, + "step": 47602 + }, + { + "epoch": 0.41321689915886145, + "grad_norm": 0.435546875, + "learning_rate": 0.0011267227580566011, + "loss": 0.1826, + "step": 47603 + }, + { + "epoch": 0.41322557963906564, + "grad_norm": 0.10986328125, + "learning_rate": 0.0011266930087076403, + "loss": 0.0762, + "step": 47604 + }, + { + "epoch": 0.4132342601192698, + "grad_norm": 0.5, + "learning_rate": 0.0011266632593294882, + "loss": 0.0977, + "step": 47605 + }, + { + "epoch": 0.413242940599474, + "grad_norm": 0.150390625, + "learning_rate": 0.0011266335099221778, + "loss": 0.0654, + "step": 47606 + }, + { + "epoch": 0.4132516210796781, + "grad_norm": 0.30859375, + "learning_rate": 0.0011266037604857415, + "loss": 0.0967, + "step": 47607 + }, + { + "epoch": 0.4132603015598823, + "grad_norm": 0.482421875, + "learning_rate": 0.0011265740110202116, + "loss": 0.124, + "step": 47608 + }, + { + "epoch": 0.41326898204008644, + "grad_norm": 0.439453125, + "learning_rate": 0.0011265442615256208, + "loss": 0.1289, + "step": 47609 + }, + { + "epoch": 0.41327766252029063, + "grad_norm": 0.134765625, + "learning_rate": 0.0011265145120020015, + "loss": 0.0908, + "step": 47610 + }, + { + "epoch": 0.41328634300049477, + "grad_norm": 0.703125, + "learning_rate": 0.0011264847624493863, + "loss": 0.1045, + "step": 47611 + }, + { + "epoch": 0.41329502348069896, + "grad_norm": 0.1767578125, + "learning_rate": 0.001126455012867808, + "loss": 0.1035, + "step": 47612 + }, + { + "epoch": 0.4133037039609031, + "grad_norm": 0.197265625, + "learning_rate": 0.0011264252632572985, + "loss": 0.1494, + "step": 47613 + }, + { + "epoch": 0.4133123844411073, + "grad_norm": 0.11376953125, + "learning_rate": 
0.001126395513617891, + "loss": 0.085, + "step": 47614 + }, + { + "epoch": 0.41332106492131143, + "grad_norm": 0.287109375, + "learning_rate": 0.0011263657639496176, + "loss": 0.084, + "step": 47615 + }, + { + "epoch": 0.4133297454015156, + "grad_norm": 0.232421875, + "learning_rate": 0.0011263360142525112, + "loss": 0.1182, + "step": 47616 + }, + { + "epoch": 0.41333842588171976, + "grad_norm": 0.470703125, + "learning_rate": 0.0011263062645266043, + "loss": 0.0786, + "step": 47617 + }, + { + "epoch": 0.41334710636192395, + "grad_norm": 0.486328125, + "learning_rate": 0.0011262765147719287, + "loss": 0.082, + "step": 47618 + }, + { + "epoch": 0.4133557868421281, + "grad_norm": 0.30078125, + "learning_rate": 0.0011262467649885179, + "loss": 0.0806, + "step": 47619 + }, + { + "epoch": 0.4133644673223323, + "grad_norm": 0.06298828125, + "learning_rate": 0.001126217015176404, + "loss": 0.0732, + "step": 47620 + }, + { + "epoch": 0.4133731478025364, + "grad_norm": 0.31640625, + "learning_rate": 0.0011261872653356196, + "loss": 0.1162, + "step": 47621 + }, + { + "epoch": 0.4133818282827406, + "grad_norm": 0.2099609375, + "learning_rate": 0.0011261575154661969, + "loss": 0.0654, + "step": 47622 + }, + { + "epoch": 0.41339050876294475, + "grad_norm": 0.58984375, + "learning_rate": 0.0011261277655681687, + "loss": 0.1035, + "step": 47623 + }, + { + "epoch": 0.41339918924314895, + "grad_norm": 0.39453125, + "learning_rate": 0.0011260980156415682, + "loss": 0.0938, + "step": 47624 + }, + { + "epoch": 0.4134078697233531, + "grad_norm": 0.11083984375, + "learning_rate": 0.0011260682656864267, + "loss": 0.0918, + "step": 47625 + }, + { + "epoch": 0.4134165502035573, + "grad_norm": 0.2119140625, + "learning_rate": 0.0011260385157027776, + "loss": 0.0728, + "step": 47626 + }, + { + "epoch": 0.4134252306837614, + "grad_norm": 0.37890625, + "learning_rate": 0.001126008765690653, + "loss": 0.0967, + "step": 47627 + }, + { + "epoch": 0.4134339111639656, + "grad_norm": 0.439453125, + "learning_rate": 0.001125979015650086, + "loss": 0.1055, + "step": 47628 + }, + { + "epoch": 0.41344259164416974, + "grad_norm": 0.6796875, + "learning_rate": 0.0011259492655811085, + "loss": 0.1182, + "step": 47629 + }, + { + "epoch": 0.41345127212437394, + "grad_norm": 0.6953125, + "learning_rate": 0.0011259195154837533, + "loss": 0.1406, + "step": 47630 + }, + { + "epoch": 0.4134599526045781, + "grad_norm": 0.07958984375, + "learning_rate": 0.0011258897653580527, + "loss": 0.0972, + "step": 47631 + }, + { + "epoch": 0.41346863308478227, + "grad_norm": 0.16015625, + "learning_rate": 0.0011258600152040396, + "loss": 0.1104, + "step": 47632 + }, + { + "epoch": 0.4134773135649864, + "grad_norm": 0.5859375, + "learning_rate": 0.0011258302650217463, + "loss": 0.104, + "step": 47633 + }, + { + "epoch": 0.4134859940451906, + "grad_norm": 0.10595703125, + "learning_rate": 0.0011258005148112053, + "loss": 0.1113, + "step": 47634 + }, + { + "epoch": 0.41349467452539473, + "grad_norm": 0.115234375, + "learning_rate": 0.0011257707645724493, + "loss": 0.0752, + "step": 47635 + }, + { + "epoch": 0.4135033550055989, + "grad_norm": 0.37109375, + "learning_rate": 0.0011257410143055112, + "loss": 0.1162, + "step": 47636 + }, + { + "epoch": 0.41351203548580306, + "grad_norm": 0.2265625, + "learning_rate": 0.0011257112640104228, + "loss": 0.1177, + "step": 47637 + }, + { + "epoch": 0.41352071596600726, + "grad_norm": 0.263671875, + "learning_rate": 0.001125681513687217, + "loss": 0.0981, + "step": 47638 + }, + { + "epoch": 0.4135293964462114, + 
"grad_norm": 0.37890625, + "learning_rate": 0.001125651763335926, + "loss": 0.1216, + "step": 47639 + }, + { + "epoch": 0.4135380769264156, + "grad_norm": 0.10302734375, + "learning_rate": 0.001125622012956583, + "loss": 0.0908, + "step": 47640 + }, + { + "epoch": 0.4135467574066197, + "grad_norm": 0.263671875, + "learning_rate": 0.0011255922625492201, + "loss": 0.0854, + "step": 47641 + }, + { + "epoch": 0.4135554378868239, + "grad_norm": 0.2294921875, + "learning_rate": 0.0011255625121138696, + "loss": 0.1123, + "step": 47642 + }, + { + "epoch": 0.41356411836702806, + "grad_norm": 0.16015625, + "learning_rate": 0.0011255327616505647, + "loss": 0.1562, + "step": 47643 + }, + { + "epoch": 0.41357279884723225, + "grad_norm": 0.15234375, + "learning_rate": 0.0011255030111593373, + "loss": 0.0815, + "step": 47644 + }, + { + "epoch": 0.4135814793274364, + "grad_norm": 0.255859375, + "learning_rate": 0.0011254732606402202, + "loss": 0.0903, + "step": 47645 + }, + { + "epoch": 0.4135901598076406, + "grad_norm": 0.0830078125, + "learning_rate": 0.0011254435100932461, + "loss": 0.0811, + "step": 47646 + }, + { + "epoch": 0.4135988402878447, + "grad_norm": 0.08544921875, + "learning_rate": 0.0011254137595184473, + "loss": 0.0996, + "step": 47647 + }, + { + "epoch": 0.4136075207680489, + "grad_norm": 0.373046875, + "learning_rate": 0.0011253840089158562, + "loss": 0.1465, + "step": 47648 + }, + { + "epoch": 0.41361620124825305, + "grad_norm": 0.59375, + "learning_rate": 0.0011253542582855057, + "loss": 0.0635, + "step": 47649 + }, + { + "epoch": 0.41362488172845724, + "grad_norm": 0.443359375, + "learning_rate": 0.0011253245076274282, + "loss": 0.0996, + "step": 47650 + }, + { + "epoch": 0.4136335622086614, + "grad_norm": 0.09423828125, + "learning_rate": 0.001125294756941656, + "loss": 0.1543, + "step": 47651 + }, + { + "epoch": 0.41364224268886557, + "grad_norm": 0.345703125, + "learning_rate": 0.0011252650062282222, + "loss": 0.1523, + "step": 47652 + }, + { + "epoch": 0.4136509231690697, + "grad_norm": 0.318359375, + "learning_rate": 0.0011252352554871584, + "loss": 0.0859, + "step": 47653 + }, + { + "epoch": 0.4136596036492739, + "grad_norm": 0.56640625, + "learning_rate": 0.0011252055047184981, + "loss": 0.1299, + "step": 47654 + }, + { + "epoch": 0.41366828412947804, + "grad_norm": 0.640625, + "learning_rate": 0.0011251757539222735, + "loss": 0.1104, + "step": 47655 + }, + { + "epoch": 0.41367696460968223, + "grad_norm": 0.2138671875, + "learning_rate": 0.001125146003098517, + "loss": 0.1191, + "step": 47656 + }, + { + "epoch": 0.41368564508988637, + "grad_norm": 0.494140625, + "learning_rate": 0.001125116252247261, + "loss": 0.083, + "step": 47657 + }, + { + "epoch": 0.41369432557009056, + "grad_norm": 0.1357421875, + "learning_rate": 0.0011250865013685385, + "loss": 0.106, + "step": 47658 + }, + { + "epoch": 0.4137030060502947, + "grad_norm": 0.259765625, + "learning_rate": 0.0011250567504623818, + "loss": 0.0566, + "step": 47659 + }, + { + "epoch": 0.4137116865304989, + "grad_norm": 0.1298828125, + "learning_rate": 0.001125026999528823, + "loss": 0.0913, + "step": 47660 + }, + { + "epoch": 0.413720367010703, + "grad_norm": 0.498046875, + "learning_rate": 0.0011249972485678954, + "loss": 0.1138, + "step": 47661 + }, + { + "epoch": 0.4137290474909072, + "grad_norm": 0.16796875, + "learning_rate": 0.001124967497579631, + "loss": 0.0854, + "step": 47662 + }, + { + "epoch": 0.41373772797111136, + "grad_norm": 0.43359375, + "learning_rate": 0.0011249377465640626, + "loss": 0.084, + "step": 47663 
+ }, + { + "epoch": 0.41374640845131555, + "grad_norm": 0.251953125, + "learning_rate": 0.0011249079955212228, + "loss": 0.0732, + "step": 47664 + }, + { + "epoch": 0.4137550889315197, + "grad_norm": 0.671875, + "learning_rate": 0.0011248782444511437, + "loss": 0.1143, + "step": 47665 + }, + { + "epoch": 0.4137637694117239, + "grad_norm": 0.58203125, + "learning_rate": 0.0011248484933538585, + "loss": 0.0879, + "step": 47666 + }, + { + "epoch": 0.413772449891928, + "grad_norm": 0.1142578125, + "learning_rate": 0.0011248187422293993, + "loss": 0.0693, + "step": 47667 + }, + { + "epoch": 0.4137811303721322, + "grad_norm": 0.6171875, + "learning_rate": 0.0011247889910777989, + "loss": 0.1021, + "step": 47668 + }, + { + "epoch": 0.41378981085233635, + "grad_norm": 0.09326171875, + "learning_rate": 0.0011247592398990892, + "loss": 0.083, + "step": 47669 + }, + { + "epoch": 0.41379849133254054, + "grad_norm": 0.271484375, + "learning_rate": 0.0011247294886933034, + "loss": 0.1162, + "step": 47670 + }, + { + "epoch": 0.4138071718127447, + "grad_norm": 0.61328125, + "learning_rate": 0.0011246997374604732, + "loss": 0.1143, + "step": 47671 + }, + { + "epoch": 0.41381585229294887, + "grad_norm": 0.1357421875, + "learning_rate": 0.0011246699862006325, + "loss": 0.0869, + "step": 47672 + }, + { + "epoch": 0.413824532773153, + "grad_norm": 0.1162109375, + "learning_rate": 0.0011246402349138127, + "loss": 0.0879, + "step": 47673 + }, + { + "epoch": 0.4138332132533572, + "grad_norm": 0.1884765625, + "learning_rate": 0.0011246104836000466, + "loss": 0.0967, + "step": 47674 + }, + { + "epoch": 0.41384189373356134, + "grad_norm": 0.08056640625, + "learning_rate": 0.001124580732259367, + "loss": 0.0898, + "step": 47675 + }, + { + "epoch": 0.41385057421376553, + "grad_norm": 0.14453125, + "learning_rate": 0.0011245509808918066, + "loss": 0.1074, + "step": 47676 + }, + { + "epoch": 0.41385925469396967, + "grad_norm": 0.150390625, + "learning_rate": 0.0011245212294973973, + "loss": 0.1201, + "step": 47677 + }, + { + "epoch": 0.41386793517417386, + "grad_norm": 0.345703125, + "learning_rate": 0.001124491478076172, + "loss": 0.1167, + "step": 47678 + }, + { + "epoch": 0.413876615654378, + "grad_norm": 0.83203125, + "learning_rate": 0.0011244617266281634, + "loss": 0.1162, + "step": 47679 + }, + { + "epoch": 0.4138852961345822, + "grad_norm": 0.40234375, + "learning_rate": 0.0011244319751534035, + "loss": 0.0811, + "step": 47680 + }, + { + "epoch": 0.41389397661478633, + "grad_norm": 0.34375, + "learning_rate": 0.0011244022236519251, + "loss": 0.1504, + "step": 47681 + }, + { + "epoch": 0.4139026570949905, + "grad_norm": 0.244140625, + "learning_rate": 0.001124372472123761, + "loss": 0.1016, + "step": 47682 + }, + { + "epoch": 0.41391133757519466, + "grad_norm": 0.10888671875, + "learning_rate": 0.0011243427205689435, + "loss": 0.1099, + "step": 47683 + }, + { + "epoch": 0.4139200180553988, + "grad_norm": 0.357421875, + "learning_rate": 0.0011243129689875052, + "loss": 0.084, + "step": 47684 + }, + { + "epoch": 0.413928698535603, + "grad_norm": 0.34375, + "learning_rate": 0.0011242832173794786, + "loss": 0.0923, + "step": 47685 + }, + { + "epoch": 0.4139373790158071, + "grad_norm": 0.55859375, + "learning_rate": 0.0011242534657448962, + "loss": 0.0957, + "step": 47686 + }, + { + "epoch": 0.4139460594960113, + "grad_norm": 0.310546875, + "learning_rate": 0.0011242237140837903, + "loss": 0.1089, + "step": 47687 + }, + { + "epoch": 0.41395473997621546, + "grad_norm": 0.45703125, + "learning_rate": 
0.0011241939623961943, + "loss": 0.1123, + "step": 47688 + }, + { + "epoch": 0.41396342045641965, + "grad_norm": 0.2421875, + "learning_rate": 0.00112416421068214, + "loss": 0.1162, + "step": 47689 + }, + { + "epoch": 0.4139721009366238, + "grad_norm": 0.56640625, + "learning_rate": 0.0011241344589416596, + "loss": 0.1377, + "step": 47690 + }, + { + "epoch": 0.413980781416828, + "grad_norm": 0.298828125, + "learning_rate": 0.0011241047071747863, + "loss": 0.1309, + "step": 47691 + }, + { + "epoch": 0.4139894618970321, + "grad_norm": 0.330078125, + "learning_rate": 0.0011240749553815526, + "loss": 0.0933, + "step": 47692 + }, + { + "epoch": 0.4139981423772363, + "grad_norm": 0.267578125, + "learning_rate": 0.001124045203561991, + "loss": 0.0903, + "step": 47693 + }, + { + "epoch": 0.41400682285744045, + "grad_norm": 0.11328125, + "learning_rate": 0.0011240154517161338, + "loss": 0.1221, + "step": 47694 + }, + { + "epoch": 0.41401550333764464, + "grad_norm": 0.2265625, + "learning_rate": 0.0011239856998440136, + "loss": 0.083, + "step": 47695 + }, + { + "epoch": 0.4140241838178488, + "grad_norm": 0.1181640625, + "learning_rate": 0.0011239559479456632, + "loss": 0.0874, + "step": 47696 + }, + { + "epoch": 0.41403286429805297, + "grad_norm": 0.08935546875, + "learning_rate": 0.0011239261960211146, + "loss": 0.0791, + "step": 47697 + }, + { + "epoch": 0.4140415447782571, + "grad_norm": 1.1875, + "learning_rate": 0.001123896444070401, + "loss": 0.1143, + "step": 47698 + }, + { + "epoch": 0.4140502252584613, + "grad_norm": 0.12255859375, + "learning_rate": 0.0011238666920935544, + "loss": 0.103, + "step": 47699 + }, + { + "epoch": 0.41405890573866544, + "grad_norm": 0.1123046875, + "learning_rate": 0.0011238369400906076, + "loss": 0.1123, + "step": 47700 + }, + { + "epoch": 0.41406758621886963, + "grad_norm": 0.1552734375, + "learning_rate": 0.001123807188061593, + "loss": 0.0845, + "step": 47701 + }, + { + "epoch": 0.41407626669907377, + "grad_norm": 0.07763671875, + "learning_rate": 0.0011237774360065434, + "loss": 0.0947, + "step": 47702 + }, + { + "epoch": 0.41408494717927796, + "grad_norm": 0.22265625, + "learning_rate": 0.0011237476839254911, + "loss": 0.1123, + "step": 47703 + }, + { + "epoch": 0.4140936276594821, + "grad_norm": 0.1162109375, + "learning_rate": 0.0011237179318184685, + "loss": 0.0898, + "step": 47704 + }, + { + "epoch": 0.4141023081396863, + "grad_norm": 0.2255859375, + "learning_rate": 0.0011236881796855087, + "loss": 0.1128, + "step": 47705 + }, + { + "epoch": 0.41411098861989043, + "grad_norm": 0.10009765625, + "learning_rate": 0.0011236584275266437, + "loss": 0.1128, + "step": 47706 + }, + { + "epoch": 0.4141196691000946, + "grad_norm": 0.55859375, + "learning_rate": 0.0011236286753419063, + "loss": 0.0752, + "step": 47707 + }, + { + "epoch": 0.41412834958029876, + "grad_norm": 0.33203125, + "learning_rate": 0.001123598923131329, + "loss": 0.123, + "step": 47708 + }, + { + "epoch": 0.41413703006050295, + "grad_norm": 0.10791015625, + "learning_rate": 0.0011235691708949436, + "loss": 0.1123, + "step": 47709 + }, + { + "epoch": 0.4141457105407071, + "grad_norm": 0.1455078125, + "learning_rate": 0.0011235394186327836, + "loss": 0.0942, + "step": 47710 + }, + { + "epoch": 0.4141543910209113, + "grad_norm": 0.1015625, + "learning_rate": 0.0011235096663448816, + "loss": 0.1167, + "step": 47711 + }, + { + "epoch": 0.4141630715011154, + "grad_norm": 0.515625, + "learning_rate": 0.0011234799140312697, + "loss": 0.1006, + "step": 47712 + }, + { + "epoch": 0.4141717519813196, + 
"grad_norm": 0.486328125, + "learning_rate": 0.0011234501616919801, + "loss": 0.0903, + "step": 47713 + }, + { + "epoch": 0.41418043246152375, + "grad_norm": 0.419921875, + "learning_rate": 0.0011234204093270464, + "loss": 0.1191, + "step": 47714 + }, + { + "epoch": 0.41418911294172794, + "grad_norm": 0.298828125, + "learning_rate": 0.0011233906569365, + "loss": 0.0938, + "step": 47715 + }, + { + "epoch": 0.4141977934219321, + "grad_norm": 0.71484375, + "learning_rate": 0.0011233609045203743, + "loss": 0.0981, + "step": 47716 + }, + { + "epoch": 0.4142064739021363, + "grad_norm": 0.1279296875, + "learning_rate": 0.001123331152078701, + "loss": 0.0737, + "step": 47717 + }, + { + "epoch": 0.4142151543823404, + "grad_norm": 0.83203125, + "learning_rate": 0.0011233013996115135, + "loss": 0.1152, + "step": 47718 + }, + { + "epoch": 0.4142238348625446, + "grad_norm": 0.5703125, + "learning_rate": 0.0011232716471188437, + "loss": 0.0845, + "step": 47719 + }, + { + "epoch": 0.41423251534274874, + "grad_norm": 0.1474609375, + "learning_rate": 0.0011232418946007245, + "loss": 0.0986, + "step": 47720 + }, + { + "epoch": 0.41424119582295293, + "grad_norm": 0.33203125, + "learning_rate": 0.0011232121420571884, + "loss": 0.1084, + "step": 47721 + }, + { + "epoch": 0.41424987630315707, + "grad_norm": 0.169921875, + "learning_rate": 0.0011231823894882675, + "loss": 0.0889, + "step": 47722 + }, + { + "epoch": 0.41425855678336126, + "grad_norm": 0.150390625, + "learning_rate": 0.001123152636893995, + "loss": 0.0854, + "step": 47723 + }, + { + "epoch": 0.4142672372635654, + "grad_norm": 0.1083984375, + "learning_rate": 0.0011231228842744033, + "loss": 0.0942, + "step": 47724 + }, + { + "epoch": 0.4142759177437696, + "grad_norm": 0.2060546875, + "learning_rate": 0.0011230931316295243, + "loss": 0.0898, + "step": 47725 + }, + { + "epoch": 0.41428459822397373, + "grad_norm": 0.1767578125, + "learning_rate": 0.001123063378959391, + "loss": 0.1162, + "step": 47726 + }, + { + "epoch": 0.4142932787041779, + "grad_norm": 0.177734375, + "learning_rate": 0.0011230336262640364, + "loss": 0.0947, + "step": 47727 + }, + { + "epoch": 0.41430195918438206, + "grad_norm": 0.33984375, + "learning_rate": 0.0011230038735434921, + "loss": 0.0811, + "step": 47728 + }, + { + "epoch": 0.41431063966458626, + "grad_norm": 0.2177734375, + "learning_rate": 0.0011229741207977916, + "loss": 0.0967, + "step": 47729 + }, + { + "epoch": 0.4143193201447904, + "grad_norm": 3.765625, + "learning_rate": 0.0011229443680269664, + "loss": 0.3203, + "step": 47730 + }, + { + "epoch": 0.4143280006249946, + "grad_norm": 0.1767578125, + "learning_rate": 0.0011229146152310502, + "loss": 0.1011, + "step": 47731 + }, + { + "epoch": 0.4143366811051987, + "grad_norm": 0.1875, + "learning_rate": 0.0011228848624100746, + "loss": 0.0928, + "step": 47732 + }, + { + "epoch": 0.4143453615854029, + "grad_norm": 0.1220703125, + "learning_rate": 0.0011228551095640723, + "loss": 0.0947, + "step": 47733 + }, + { + "epoch": 0.41435404206560705, + "grad_norm": 0.30078125, + "learning_rate": 0.001122825356693076, + "loss": 0.0742, + "step": 47734 + }, + { + "epoch": 0.41436272254581125, + "grad_norm": 0.11572265625, + "learning_rate": 0.0011227956037971182, + "loss": 0.1206, + "step": 47735 + }, + { + "epoch": 0.4143714030260154, + "grad_norm": 0.298828125, + "learning_rate": 0.0011227658508762316, + "loss": 0.0991, + "step": 47736 + }, + { + "epoch": 0.4143800835062196, + "grad_norm": 0.287109375, + "learning_rate": 0.0011227360979304492, + "loss": 0.1035, + "step": 
47737 + }, + { + "epoch": 0.4143887639864237, + "grad_norm": 0.0791015625, + "learning_rate": 0.001122706344959802, + "loss": 0.0708, + "step": 47738 + }, + { + "epoch": 0.4143974444666279, + "grad_norm": 0.18359375, + "learning_rate": 0.0011226765919643237, + "loss": 0.0884, + "step": 47739 + }, + { + "epoch": 0.41440612494683204, + "grad_norm": 0.09423828125, + "learning_rate": 0.001122646838944047, + "loss": 0.0776, + "step": 47740 + }, + { + "epoch": 0.41441480542703624, + "grad_norm": 0.5546875, + "learning_rate": 0.0011226170858990039, + "loss": 0.1895, + "step": 47741 + }, + { + "epoch": 0.4144234859072404, + "grad_norm": 0.294921875, + "learning_rate": 0.001122587332829227, + "loss": 0.1299, + "step": 47742 + }, + { + "epoch": 0.41443216638744457, + "grad_norm": 0.65625, + "learning_rate": 0.0011225575797347489, + "loss": 0.0938, + "step": 47743 + }, + { + "epoch": 0.4144408468676487, + "grad_norm": 0.33203125, + "learning_rate": 0.0011225278266156023, + "loss": 0.1035, + "step": 47744 + }, + { + "epoch": 0.4144495273478529, + "grad_norm": 0.345703125, + "learning_rate": 0.0011224980734718195, + "loss": 0.0791, + "step": 47745 + }, + { + "epoch": 0.41445820782805703, + "grad_norm": 0.326171875, + "learning_rate": 0.0011224683203034335, + "loss": 0.126, + "step": 47746 + }, + { + "epoch": 0.4144668883082612, + "grad_norm": 0.353515625, + "learning_rate": 0.001122438567110476, + "loss": 0.0933, + "step": 47747 + }, + { + "epoch": 0.41447556878846536, + "grad_norm": 0.2890625, + "learning_rate": 0.00112240881389298, + "loss": 0.1069, + "step": 47748 + }, + { + "epoch": 0.41448424926866956, + "grad_norm": 0.1142578125, + "learning_rate": 0.0011223790606509783, + "loss": 0.124, + "step": 47749 + }, + { + "epoch": 0.4144929297488737, + "grad_norm": 0.19921875, + "learning_rate": 0.0011223493073845032, + "loss": 0.1133, + "step": 47750 + }, + { + "epoch": 0.4145016102290779, + "grad_norm": 0.2578125, + "learning_rate": 0.001122319554093587, + "loss": 0.1118, + "step": 47751 + }, + { + "epoch": 0.414510290709282, + "grad_norm": 0.09228515625, + "learning_rate": 0.0011222898007782628, + "loss": 0.1074, + "step": 47752 + }, + { + "epoch": 0.4145189711894862, + "grad_norm": 0.326171875, + "learning_rate": 0.0011222600474385628, + "loss": 0.1152, + "step": 47753 + }, + { + "epoch": 0.41452765166969036, + "grad_norm": 0.259765625, + "learning_rate": 0.0011222302940745192, + "loss": 0.1426, + "step": 47754 + }, + { + "epoch": 0.41453633214989455, + "grad_norm": 0.388671875, + "learning_rate": 0.0011222005406861652, + "loss": 0.1152, + "step": 47755 + }, + { + "epoch": 0.4145450126300987, + "grad_norm": 0.3125, + "learning_rate": 0.0011221707872735328, + "loss": 0.1406, + "step": 47756 + }, + { + "epoch": 0.4145536931103029, + "grad_norm": 0.26171875, + "learning_rate": 0.0011221410338366552, + "loss": 0.1621, + "step": 47757 + }, + { + "epoch": 0.414562373590507, + "grad_norm": 0.294921875, + "learning_rate": 0.0011221112803755638, + "loss": 0.1011, + "step": 47758 + }, + { + "epoch": 0.4145710540707112, + "grad_norm": 0.1669921875, + "learning_rate": 0.0011220815268902925, + "loss": 0.1543, + "step": 47759 + }, + { + "epoch": 0.41457973455091535, + "grad_norm": 0.8046875, + "learning_rate": 0.0011220517733808726, + "loss": 0.0908, + "step": 47760 + }, + { + "epoch": 0.41458841503111954, + "grad_norm": 0.396484375, + "learning_rate": 0.0011220220198473376, + "loss": 0.0635, + "step": 47761 + }, + { + "epoch": 0.4145970955113237, + "grad_norm": 0.466796875, + "learning_rate": 
0.0011219922662897195, + "loss": 0.1182, + "step": 47762 + }, + { + "epoch": 0.41460577599152787, + "grad_norm": 0.0732421875, + "learning_rate": 0.0011219625127080514, + "loss": 0.083, + "step": 47763 + }, + { + "epoch": 0.414614456471732, + "grad_norm": 0.1259765625, + "learning_rate": 0.001121932759102365, + "loss": 0.0908, + "step": 47764 + }, + { + "epoch": 0.4146231369519362, + "grad_norm": 0.296875, + "learning_rate": 0.001121903005472693, + "loss": 0.1064, + "step": 47765 + }, + { + "epoch": 0.41463181743214034, + "grad_norm": 0.369140625, + "learning_rate": 0.0011218732518190686, + "loss": 0.1133, + "step": 47766 + }, + { + "epoch": 0.41464049791234453, + "grad_norm": 0.23046875, + "learning_rate": 0.0011218434981415239, + "loss": 0.083, + "step": 47767 + }, + { + "epoch": 0.41464917839254867, + "grad_norm": 0.140625, + "learning_rate": 0.0011218137444400915, + "loss": 0.0957, + "step": 47768 + }, + { + "epoch": 0.41465785887275286, + "grad_norm": 0.19140625, + "learning_rate": 0.0011217839907148037, + "loss": 0.1133, + "step": 47769 + }, + { + "epoch": 0.414666539352957, + "grad_norm": 0.09423828125, + "learning_rate": 0.0011217542369656934, + "loss": 0.1172, + "step": 47770 + }, + { + "epoch": 0.4146752198331612, + "grad_norm": 0.3515625, + "learning_rate": 0.001121724483192793, + "loss": 0.1309, + "step": 47771 + }, + { + "epoch": 0.41468390031336533, + "grad_norm": 0.51953125, + "learning_rate": 0.0011216947293961352, + "loss": 0.0967, + "step": 47772 + }, + { + "epoch": 0.4146925807935695, + "grad_norm": 0.107421875, + "learning_rate": 0.001121664975575752, + "loss": 0.1143, + "step": 47773 + }, + { + "epoch": 0.41470126127377366, + "grad_norm": 0.20703125, + "learning_rate": 0.0011216352217316768, + "loss": 0.0908, + "step": 47774 + }, + { + "epoch": 0.41470994175397785, + "grad_norm": 0.1728515625, + "learning_rate": 0.0011216054678639413, + "loss": 0.1221, + "step": 47775 + }, + { + "epoch": 0.414718622234182, + "grad_norm": 0.058837890625, + "learning_rate": 0.0011215757139725784, + "loss": 0.0737, + "step": 47776 + }, + { + "epoch": 0.4147273027143862, + "grad_norm": 0.2236328125, + "learning_rate": 0.0011215459600576207, + "loss": 0.1143, + "step": 47777 + }, + { + "epoch": 0.4147359831945903, + "grad_norm": 0.1982421875, + "learning_rate": 0.0011215162061191004, + "loss": 0.0742, + "step": 47778 + }, + { + "epoch": 0.4147446636747945, + "grad_norm": 0.10205078125, + "learning_rate": 0.0011214864521570506, + "loss": 0.1152, + "step": 47779 + }, + { + "epoch": 0.41475334415499865, + "grad_norm": 0.326171875, + "learning_rate": 0.0011214566981715035, + "loss": 0.1162, + "step": 47780 + }, + { + "epoch": 0.41476202463520284, + "grad_norm": 0.2392578125, + "learning_rate": 0.001121426944162492, + "loss": 0.1084, + "step": 47781 + }, + { + "epoch": 0.414770705115407, + "grad_norm": 0.2353515625, + "learning_rate": 0.0011213971901300477, + "loss": 0.1187, + "step": 47782 + }, + { + "epoch": 0.41477938559561117, + "grad_norm": 0.5234375, + "learning_rate": 0.0011213674360742042, + "loss": 0.1387, + "step": 47783 + }, + { + "epoch": 0.4147880660758153, + "grad_norm": 0.1318359375, + "learning_rate": 0.0011213376819949935, + "loss": 0.0591, + "step": 47784 + }, + { + "epoch": 0.4147967465560195, + "grad_norm": 0.330078125, + "learning_rate": 0.001121307927892448, + "loss": 0.0938, + "step": 47785 + }, + { + "epoch": 0.41480542703622364, + "grad_norm": 0.291015625, + "learning_rate": 0.0011212781737666004, + "loss": 0.0869, + "step": 47786 + }, + { + "epoch": 0.41481410751642783, 
+ "grad_norm": 0.1357421875, + "learning_rate": 0.0011212484196174834, + "loss": 0.106, + "step": 47787 + }, + { + "epoch": 0.41482278799663197, + "grad_norm": 0.10595703125, + "learning_rate": 0.0011212186654451298, + "loss": 0.1162, + "step": 47788 + }, + { + "epoch": 0.41483146847683616, + "grad_norm": 0.1357421875, + "learning_rate": 0.0011211889112495716, + "loss": 0.0811, + "step": 47789 + }, + { + "epoch": 0.4148401489570403, + "grad_norm": 0.478515625, + "learning_rate": 0.0011211591570308415, + "loss": 0.1221, + "step": 47790 + }, + { + "epoch": 0.4148488294372445, + "grad_norm": 0.55078125, + "learning_rate": 0.001121129402788972, + "loss": 0.1396, + "step": 47791 + }, + { + "epoch": 0.41485750991744863, + "grad_norm": 0.2470703125, + "learning_rate": 0.001121099648523996, + "loss": 0.0732, + "step": 47792 + }, + { + "epoch": 0.4148661903976528, + "grad_norm": 0.4453125, + "learning_rate": 0.0011210698942359452, + "loss": 0.1021, + "step": 47793 + }, + { + "epoch": 0.41487487087785696, + "grad_norm": 0.326171875, + "learning_rate": 0.0011210401399248534, + "loss": 0.1445, + "step": 47794 + }, + { + "epoch": 0.41488355135806115, + "grad_norm": 0.703125, + "learning_rate": 0.0011210103855907516, + "loss": 0.1055, + "step": 47795 + }, + { + "epoch": 0.4148922318382653, + "grad_norm": 0.0927734375, + "learning_rate": 0.0011209806312336733, + "loss": 0.1069, + "step": 47796 + }, + { + "epoch": 0.4149009123184695, + "grad_norm": 0.21484375, + "learning_rate": 0.0011209508768536514, + "loss": 0.1157, + "step": 47797 + }, + { + "epoch": 0.4149095927986736, + "grad_norm": 0.1162109375, + "learning_rate": 0.001120921122450718, + "loss": 0.0903, + "step": 47798 + }, + { + "epoch": 0.4149182732788778, + "grad_norm": 0.5234375, + "learning_rate": 0.001120891368024905, + "loss": 0.0967, + "step": 47799 + }, + { + "epoch": 0.41492695375908195, + "grad_norm": 0.1689453125, + "learning_rate": 0.0011208616135762457, + "loss": 0.0996, + "step": 47800 + }, + { + "epoch": 0.41493563423928614, + "grad_norm": 0.3046875, + "learning_rate": 0.0011208318591047726, + "loss": 0.0986, + "step": 47801 + }, + { + "epoch": 0.4149443147194903, + "grad_norm": 0.2734375, + "learning_rate": 0.0011208021046105181, + "loss": 0.0703, + "step": 47802 + }, + { + "epoch": 0.4149529951996945, + "grad_norm": 0.310546875, + "learning_rate": 0.0011207723500935144, + "loss": 0.1191, + "step": 47803 + }, + { + "epoch": 0.4149616756798986, + "grad_norm": 0.34765625, + "learning_rate": 0.0011207425955537947, + "loss": 0.0957, + "step": 47804 + }, + { + "epoch": 0.4149703561601028, + "grad_norm": 0.18359375, + "learning_rate": 0.0011207128409913912, + "loss": 0.1055, + "step": 47805 + }, + { + "epoch": 0.41497903664030694, + "grad_norm": 0.1884765625, + "learning_rate": 0.0011206830864063364, + "loss": 0.1191, + "step": 47806 + }, + { + "epoch": 0.4149877171205111, + "grad_norm": 0.1376953125, + "learning_rate": 0.0011206533317986629, + "loss": 0.1035, + "step": 47807 + }, + { + "epoch": 0.41499639760071527, + "grad_norm": 0.283203125, + "learning_rate": 0.001120623577168403, + "loss": 0.0737, + "step": 47808 + }, + { + "epoch": 0.4150050780809194, + "grad_norm": 0.53515625, + "learning_rate": 0.0011205938225155895, + "loss": 0.0986, + "step": 47809 + }, + { + "epoch": 0.4150137585611236, + "grad_norm": 0.326171875, + "learning_rate": 0.0011205640678402552, + "loss": 0.1611, + "step": 47810 + }, + { + "epoch": 0.41502243904132774, + "grad_norm": 0.484375, + "learning_rate": 0.0011205343131424324, + "loss": 0.1582, + "step": 47811 
+ }, + { + "epoch": 0.41503111952153193, + "grad_norm": 0.1494140625, + "learning_rate": 0.0011205045584221532, + "loss": 0.0762, + "step": 47812 + }, + { + "epoch": 0.41503980000173607, + "grad_norm": 0.0986328125, + "learning_rate": 0.0011204748036794509, + "loss": 0.0771, + "step": 47813 + }, + { + "epoch": 0.41504848048194026, + "grad_norm": 0.1328125, + "learning_rate": 0.0011204450489143575, + "loss": 0.0698, + "step": 47814 + }, + { + "epoch": 0.4150571609621444, + "grad_norm": 0.1181640625, + "learning_rate": 0.0011204152941269054, + "loss": 0.0977, + "step": 47815 + }, + { + "epoch": 0.4150658414423486, + "grad_norm": 0.1689453125, + "learning_rate": 0.0011203855393171278, + "loss": 0.1113, + "step": 47816 + }, + { + "epoch": 0.41507452192255273, + "grad_norm": 0.53515625, + "learning_rate": 0.001120355784485057, + "loss": 0.1387, + "step": 47817 + }, + { + "epoch": 0.4150832024027569, + "grad_norm": 0.423828125, + "learning_rate": 0.001120326029630725, + "loss": 0.0815, + "step": 47818 + }, + { + "epoch": 0.41509188288296106, + "grad_norm": 0.328125, + "learning_rate": 0.0011202962747541652, + "loss": 0.0859, + "step": 47819 + }, + { + "epoch": 0.41510056336316525, + "grad_norm": 0.1259765625, + "learning_rate": 0.0011202665198554093, + "loss": 0.0947, + "step": 47820 + }, + { + "epoch": 0.4151092438433694, + "grad_norm": 0.08447265625, + "learning_rate": 0.0011202367649344905, + "loss": 0.0991, + "step": 47821 + }, + { + "epoch": 0.4151179243235736, + "grad_norm": 0.1494140625, + "learning_rate": 0.001120207009991441, + "loss": 0.1201, + "step": 47822 + }, + { + "epoch": 0.4151266048037777, + "grad_norm": 0.203125, + "learning_rate": 0.0011201772550262935, + "loss": 0.0684, + "step": 47823 + }, + { + "epoch": 0.4151352852839819, + "grad_norm": 0.59375, + "learning_rate": 0.0011201475000390804, + "loss": 0.1377, + "step": 47824 + }, + { + "epoch": 0.41514396576418605, + "grad_norm": 0.283203125, + "learning_rate": 0.0011201177450298342, + "loss": 0.0688, + "step": 47825 + }, + { + "epoch": 0.41515264624439024, + "grad_norm": 0.314453125, + "learning_rate": 0.0011200879899985873, + "loss": 0.1035, + "step": 47826 + }, + { + "epoch": 0.4151613267245944, + "grad_norm": 0.8359375, + "learning_rate": 0.001120058234945373, + "loss": 0.1084, + "step": 47827 + }, + { + "epoch": 0.4151700072047986, + "grad_norm": 0.2431640625, + "learning_rate": 0.0011200284798702231, + "loss": 0.0708, + "step": 47828 + }, + { + "epoch": 0.4151786876850027, + "grad_norm": 0.126953125, + "learning_rate": 0.0011199987247731704, + "loss": 0.0942, + "step": 47829 + }, + { + "epoch": 0.4151873681652069, + "grad_norm": 0.2265625, + "learning_rate": 0.0011199689696542471, + "loss": 0.0889, + "step": 47830 + }, + { + "epoch": 0.41519604864541104, + "grad_norm": 0.28125, + "learning_rate": 0.0011199392145134863, + "loss": 0.0732, + "step": 47831 + }, + { + "epoch": 0.41520472912561524, + "grad_norm": 0.2470703125, + "learning_rate": 0.0011199094593509205, + "loss": 0.126, + "step": 47832 + }, + { + "epoch": 0.4152134096058194, + "grad_norm": 0.19140625, + "learning_rate": 0.0011198797041665817, + "loss": 0.1309, + "step": 47833 + }, + { + "epoch": 0.41522209008602357, + "grad_norm": 0.30859375, + "learning_rate": 0.0011198499489605026, + "loss": 0.127, + "step": 47834 + }, + { + "epoch": 0.4152307705662277, + "grad_norm": 0.11669921875, + "learning_rate": 0.0011198201937327162, + "loss": 0.0859, + "step": 47835 + }, + { + "epoch": 0.4152394510464319, + "grad_norm": 0.2158203125, + "learning_rate": 
0.0011197904384832548, + "loss": 0.1201, + "step": 47836 + }, + { + "epoch": 0.41524813152663603, + "grad_norm": 0.09228515625, + "learning_rate": 0.0011197606832121507, + "loss": 0.0767, + "step": 47837 + }, + { + "epoch": 0.4152568120068402, + "grad_norm": 0.6484375, + "learning_rate": 0.0011197309279194366, + "loss": 0.125, + "step": 47838 + }, + { + "epoch": 0.41526549248704436, + "grad_norm": 0.380859375, + "learning_rate": 0.0011197011726051449, + "loss": 0.0898, + "step": 47839 + }, + { + "epoch": 0.41527417296724856, + "grad_norm": 0.78125, + "learning_rate": 0.0011196714172693088, + "loss": 0.1436, + "step": 47840 + }, + { + "epoch": 0.4152828534474527, + "grad_norm": 0.16015625, + "learning_rate": 0.00111964166191196, + "loss": 0.1357, + "step": 47841 + }, + { + "epoch": 0.4152915339276569, + "grad_norm": 0.14453125, + "learning_rate": 0.001119611906533131, + "loss": 0.0957, + "step": 47842 + }, + { + "epoch": 0.415300214407861, + "grad_norm": 0.10693359375, + "learning_rate": 0.0011195821511328554, + "loss": 0.1074, + "step": 47843 + }, + { + "epoch": 0.4153088948880652, + "grad_norm": 0.1943359375, + "learning_rate": 0.0011195523957111646, + "loss": 0.1006, + "step": 47844 + }, + { + "epoch": 0.41531757536826935, + "grad_norm": 0.19140625, + "learning_rate": 0.0011195226402680918, + "loss": 0.1523, + "step": 47845 + }, + { + "epoch": 0.41532625584847355, + "grad_norm": 0.27734375, + "learning_rate": 0.0011194928848036695, + "loss": 0.0977, + "step": 47846 + }, + { + "epoch": 0.4153349363286777, + "grad_norm": 0.275390625, + "learning_rate": 0.0011194631293179296, + "loss": 0.1758, + "step": 47847 + }, + { + "epoch": 0.4153436168088819, + "grad_norm": 0.08642578125, + "learning_rate": 0.0011194333738109054, + "loss": 0.0938, + "step": 47848 + }, + { + "epoch": 0.415352297289086, + "grad_norm": 0.2158203125, + "learning_rate": 0.0011194036182826292, + "loss": 0.0879, + "step": 47849 + }, + { + "epoch": 0.4153609777692902, + "grad_norm": 0.6953125, + "learning_rate": 0.0011193738627331335, + "loss": 0.1104, + "step": 47850 + }, + { + "epoch": 0.41536965824949434, + "grad_norm": 0.16015625, + "learning_rate": 0.0011193441071624506, + "loss": 0.1104, + "step": 47851 + }, + { + "epoch": 0.41537833872969854, + "grad_norm": 0.3125, + "learning_rate": 0.0011193143515706135, + "loss": 0.0977, + "step": 47852 + }, + { + "epoch": 0.4153870192099027, + "grad_norm": 0.11083984375, + "learning_rate": 0.0011192845959576542, + "loss": 0.0957, + "step": 47853 + }, + { + "epoch": 0.41539569969010687, + "grad_norm": 0.341796875, + "learning_rate": 0.001119254840323606, + "loss": 0.1016, + "step": 47854 + }, + { + "epoch": 0.415404380170311, + "grad_norm": 0.2373046875, + "learning_rate": 0.001119225084668501, + "loss": 0.0791, + "step": 47855 + }, + { + "epoch": 0.4154130606505152, + "grad_norm": 0.205078125, + "learning_rate": 0.0011191953289923712, + "loss": 0.1406, + "step": 47856 + }, + { + "epoch": 0.41542174113071934, + "grad_norm": 1.0390625, + "learning_rate": 0.0011191655732952501, + "loss": 0.1406, + "step": 47857 + }, + { + "epoch": 0.41543042161092353, + "grad_norm": 0.236328125, + "learning_rate": 0.00111913581757717, + "loss": 0.0928, + "step": 47858 + }, + { + "epoch": 0.41543910209112767, + "grad_norm": 0.458984375, + "learning_rate": 0.001119106061838163, + "loss": 0.1221, + "step": 47859 + }, + { + "epoch": 0.41544778257133186, + "grad_norm": 0.44921875, + "learning_rate": 0.0011190763060782616, + "loss": 0.0679, + "step": 47860 + }, + { + "epoch": 0.415456463051536, + 
"grad_norm": 0.177734375, + "learning_rate": 0.001119046550297499, + "loss": 0.1152, + "step": 47861 + }, + { + "epoch": 0.4154651435317402, + "grad_norm": 0.095703125, + "learning_rate": 0.0011190167944959072, + "loss": 0.0859, + "step": 47862 + }, + { + "epoch": 0.4154738240119443, + "grad_norm": 0.291015625, + "learning_rate": 0.0011189870386735191, + "loss": 0.0957, + "step": 47863 + }, + { + "epoch": 0.4154825044921485, + "grad_norm": 0.5390625, + "learning_rate": 0.0011189572828303668, + "loss": 0.1001, + "step": 47864 + }, + { + "epoch": 0.41549118497235266, + "grad_norm": 0.142578125, + "learning_rate": 0.0011189275269664833, + "loss": 0.1011, + "step": 47865 + }, + { + "epoch": 0.41549986545255685, + "grad_norm": 0.1923828125, + "learning_rate": 0.001118897771081901, + "loss": 0.1162, + "step": 47866 + }, + { + "epoch": 0.415508545932761, + "grad_norm": 0.796875, + "learning_rate": 0.0011188680151766525, + "loss": 0.373, + "step": 47867 + }, + { + "epoch": 0.4155172264129652, + "grad_norm": 0.69921875, + "learning_rate": 0.0011188382592507698, + "loss": 0.1099, + "step": 47868 + }, + { + "epoch": 0.4155259068931693, + "grad_norm": 0.45703125, + "learning_rate": 0.0011188085033042863, + "loss": 0.0835, + "step": 47869 + }, + { + "epoch": 0.4155345873733735, + "grad_norm": 0.5, + "learning_rate": 0.001118778747337234, + "loss": 0.1123, + "step": 47870 + }, + { + "epoch": 0.41554326785357765, + "grad_norm": 0.10400390625, + "learning_rate": 0.0011187489913496453, + "loss": 0.1035, + "step": 47871 + }, + { + "epoch": 0.41555194833378184, + "grad_norm": 0.115234375, + "learning_rate": 0.001118719235341553, + "loss": 0.0889, + "step": 47872 + }, + { + "epoch": 0.415560628813986, + "grad_norm": 0.2041015625, + "learning_rate": 0.0011186894793129897, + "loss": 0.123, + "step": 47873 + }, + { + "epoch": 0.41556930929419017, + "grad_norm": 1.0703125, + "learning_rate": 0.001118659723263988, + "loss": 0.1689, + "step": 47874 + }, + { + "epoch": 0.4155779897743943, + "grad_norm": 0.09423828125, + "learning_rate": 0.0011186299671945804, + "loss": 0.0977, + "step": 47875 + }, + { + "epoch": 0.4155866702545985, + "grad_norm": 0.0986328125, + "learning_rate": 0.001118600211104799, + "loss": 0.1211, + "step": 47876 + }, + { + "epoch": 0.41559535073480264, + "grad_norm": 0.27734375, + "learning_rate": 0.0011185704549946766, + "loss": 0.1152, + "step": 47877 + }, + { + "epoch": 0.41560403121500683, + "grad_norm": 0.3125, + "learning_rate": 0.0011185406988642463, + "loss": 0.1582, + "step": 47878 + }, + { + "epoch": 0.41561271169521097, + "grad_norm": 1.203125, + "learning_rate": 0.0011185109427135401, + "loss": 0.3301, + "step": 47879 + }, + { + "epoch": 0.41562139217541516, + "grad_norm": 0.287109375, + "learning_rate": 0.0011184811865425903, + "loss": 0.0996, + "step": 47880 + }, + { + "epoch": 0.4156300726556193, + "grad_norm": 0.1044921875, + "learning_rate": 0.00111845143035143, + "loss": 0.0942, + "step": 47881 + }, + { + "epoch": 0.4156387531358235, + "grad_norm": 0.08837890625, + "learning_rate": 0.0011184216741400915, + "loss": 0.1025, + "step": 47882 + }, + { + "epoch": 0.41564743361602763, + "grad_norm": 0.2451171875, + "learning_rate": 0.001118391917908607, + "loss": 0.0967, + "step": 47883 + }, + { + "epoch": 0.4156561140962318, + "grad_norm": 0.279296875, + "learning_rate": 0.0011183621616570097, + "loss": 0.1465, + "step": 47884 + }, + { + "epoch": 0.41566479457643596, + "grad_norm": 0.09423828125, + "learning_rate": 0.0011183324053853317, + "loss": 0.1309, + "step": 47885 + }, + { + 
"epoch": 0.41567347505664015, + "grad_norm": 0.08544921875, + "learning_rate": 0.0011183026490936057, + "loss": 0.0977, + "step": 47886 + }, + { + "epoch": 0.4156821555368443, + "grad_norm": 0.23046875, + "learning_rate": 0.0011182728927818643, + "loss": 0.105, + "step": 47887 + }, + { + "epoch": 0.4156908360170485, + "grad_norm": 0.10693359375, + "learning_rate": 0.0011182431364501397, + "loss": 0.0781, + "step": 47888 + }, + { + "epoch": 0.4156995164972526, + "grad_norm": 0.173828125, + "learning_rate": 0.0011182133800984651, + "loss": 0.0889, + "step": 47889 + }, + { + "epoch": 0.4157081969774568, + "grad_norm": 0.1337890625, + "learning_rate": 0.001118183623726872, + "loss": 0.1074, + "step": 47890 + }, + { + "epoch": 0.41571687745766095, + "grad_norm": 0.1337890625, + "learning_rate": 0.001118153867335394, + "loss": 0.083, + "step": 47891 + }, + { + "epoch": 0.41572555793786514, + "grad_norm": 0.142578125, + "learning_rate": 0.001118124110924063, + "loss": 0.0781, + "step": 47892 + }, + { + "epoch": 0.4157342384180693, + "grad_norm": 0.54296875, + "learning_rate": 0.0011180943544929118, + "loss": 0.1074, + "step": 47893 + }, + { + "epoch": 0.4157429188982735, + "grad_norm": 0.44921875, + "learning_rate": 0.0011180645980419727, + "loss": 0.1011, + "step": 47894 + }, + { + "epoch": 0.4157515993784776, + "grad_norm": 0.09716796875, + "learning_rate": 0.001118034841571279, + "loss": 0.1191, + "step": 47895 + }, + { + "epoch": 0.4157602798586818, + "grad_norm": 0.46875, + "learning_rate": 0.0011180050850808623, + "loss": 0.0908, + "step": 47896 + }, + { + "epoch": 0.41576896033888594, + "grad_norm": 0.337890625, + "learning_rate": 0.0011179753285707557, + "loss": 0.1084, + "step": 47897 + }, + { + "epoch": 0.41577764081909013, + "grad_norm": 0.404296875, + "learning_rate": 0.0011179455720409911, + "loss": 0.0781, + "step": 47898 + }, + { + "epoch": 0.41578632129929427, + "grad_norm": 0.359375, + "learning_rate": 0.0011179158154916021, + "loss": 0.1211, + "step": 47899 + }, + { + "epoch": 0.41579500177949846, + "grad_norm": 0.482421875, + "learning_rate": 0.00111788605892262, + "loss": 0.0947, + "step": 47900 + }, + { + "epoch": 0.4158036822597026, + "grad_norm": 0.0751953125, + "learning_rate": 0.001117856302334078, + "loss": 0.1128, + "step": 47901 + }, + { + "epoch": 0.4158123627399068, + "grad_norm": 0.1630859375, + "learning_rate": 0.0011178265457260092, + "loss": 0.0879, + "step": 47902 + }, + { + "epoch": 0.41582104322011093, + "grad_norm": 0.08642578125, + "learning_rate": 0.0011177967890984451, + "loss": 0.1035, + "step": 47903 + }, + { + "epoch": 0.4158297237003151, + "grad_norm": 0.212890625, + "learning_rate": 0.0011177670324514191, + "loss": 0.0713, + "step": 47904 + }, + { + "epoch": 0.41583840418051926, + "grad_norm": 0.283203125, + "learning_rate": 0.0011177372757849632, + "loss": 0.082, + "step": 47905 + }, + { + "epoch": 0.41584708466072345, + "grad_norm": 0.08349609375, + "learning_rate": 0.00111770751909911, + "loss": 0.1523, + "step": 47906 + }, + { + "epoch": 0.4158557651409276, + "grad_norm": 0.30859375, + "learning_rate": 0.001117677762393892, + "loss": 0.0981, + "step": 47907 + }, + { + "epoch": 0.4158644456211318, + "grad_norm": 0.09765625, + "learning_rate": 0.001117648005669342, + "loss": 0.0903, + "step": 47908 + }, + { + "epoch": 0.4158731261013359, + "grad_norm": 0.12060546875, + "learning_rate": 0.0011176182489254925, + "loss": 0.083, + "step": 47909 + }, + { + "epoch": 0.4158818065815401, + "grad_norm": 0.4140625, + "learning_rate": 0.0011175884921623758, + 
"loss": 0.0908, + "step": 47910 + }, + { + "epoch": 0.41589048706174425, + "grad_norm": 0.87109375, + "learning_rate": 0.0011175587353800245, + "loss": 0.0869, + "step": 47911 + }, + { + "epoch": 0.41589916754194844, + "grad_norm": 0.314453125, + "learning_rate": 0.0011175289785784713, + "loss": 0.1338, + "step": 47912 + }, + { + "epoch": 0.4159078480221526, + "grad_norm": 0.08740234375, + "learning_rate": 0.0011174992217577485, + "loss": 0.0923, + "step": 47913 + }, + { + "epoch": 0.4159165285023568, + "grad_norm": 0.55859375, + "learning_rate": 0.0011174694649178892, + "loss": 0.1221, + "step": 47914 + }, + { + "epoch": 0.4159252089825609, + "grad_norm": 0.185546875, + "learning_rate": 0.0011174397080589255, + "loss": 0.0986, + "step": 47915 + }, + { + "epoch": 0.4159338894627651, + "grad_norm": 0.12451171875, + "learning_rate": 0.0011174099511808897, + "loss": 0.1289, + "step": 47916 + }, + { + "epoch": 0.41594256994296924, + "grad_norm": 0.3828125, + "learning_rate": 0.0011173801942838148, + "loss": 0.0889, + "step": 47917 + }, + { + "epoch": 0.41595125042317344, + "grad_norm": 0.251953125, + "learning_rate": 0.0011173504373677334, + "loss": 0.1045, + "step": 47918 + }, + { + "epoch": 0.4159599309033776, + "grad_norm": 0.142578125, + "learning_rate": 0.0011173206804326776, + "loss": 0.1084, + "step": 47919 + }, + { + "epoch": 0.41596861138358177, + "grad_norm": 0.189453125, + "learning_rate": 0.0011172909234786802, + "loss": 0.0884, + "step": 47920 + }, + { + "epoch": 0.4159772918637859, + "grad_norm": 0.36328125, + "learning_rate": 0.0011172611665057734, + "loss": 0.0923, + "step": 47921 + }, + { + "epoch": 0.4159859723439901, + "grad_norm": 0.52734375, + "learning_rate": 0.0011172314095139904, + "loss": 0.0977, + "step": 47922 + }, + { + "epoch": 0.41599465282419423, + "grad_norm": 0.2236328125, + "learning_rate": 0.0011172016525033636, + "loss": 0.106, + "step": 47923 + }, + { + "epoch": 0.4160033333043984, + "grad_norm": 0.2470703125, + "learning_rate": 0.001117171895473925, + "loss": 0.0884, + "step": 47924 + }, + { + "epoch": 0.41601201378460256, + "grad_norm": 0.2265625, + "learning_rate": 0.0011171421384257075, + "loss": 0.0938, + "step": 47925 + }, + { + "epoch": 0.41602069426480676, + "grad_norm": 0.31640625, + "learning_rate": 0.0011171123813587436, + "loss": 0.0908, + "step": 47926 + }, + { + "epoch": 0.4160293747450109, + "grad_norm": 0.173828125, + "learning_rate": 0.001117082624273066, + "loss": 0.0835, + "step": 47927 + }, + { + "epoch": 0.4160380552252151, + "grad_norm": 0.09326171875, + "learning_rate": 0.001117052867168707, + "loss": 0.0967, + "step": 47928 + }, + { + "epoch": 0.4160467357054192, + "grad_norm": 0.166015625, + "learning_rate": 0.001117023110045699, + "loss": 0.1533, + "step": 47929 + }, + { + "epoch": 0.41605541618562336, + "grad_norm": 0.74609375, + "learning_rate": 0.0011169933529040753, + "loss": 0.0908, + "step": 47930 + }, + { + "epoch": 0.41606409666582755, + "grad_norm": 0.1416015625, + "learning_rate": 0.0011169635957438675, + "loss": 0.1133, + "step": 47931 + }, + { + "epoch": 0.4160727771460317, + "grad_norm": 0.30078125, + "learning_rate": 0.0011169338385651087, + "loss": 0.1035, + "step": 47932 + }, + { + "epoch": 0.4160814576262359, + "grad_norm": 0.306640625, + "learning_rate": 0.001116904081367831, + "loss": 0.0776, + "step": 47933 + }, + { + "epoch": 0.41609013810644, + "grad_norm": 0.244140625, + "learning_rate": 0.0011168743241520677, + "loss": 0.0996, + "step": 47934 + }, + { + "epoch": 0.4160988185866442, + "grad_norm": 
0.263671875, + "learning_rate": 0.0011168445669178509, + "loss": 0.1367, + "step": 47935 + }, + { + "epoch": 0.41610749906684835, + "grad_norm": 0.1484375, + "learning_rate": 0.0011168148096652132, + "loss": 0.1094, + "step": 47936 + }, + { + "epoch": 0.41611617954705254, + "grad_norm": 0.244140625, + "learning_rate": 0.001116785052394187, + "loss": 0.0942, + "step": 47937 + }, + { + "epoch": 0.4161248600272567, + "grad_norm": 0.734375, + "learning_rate": 0.0011167552951048046, + "loss": 0.0977, + "step": 47938 + }, + { + "epoch": 0.4161335405074609, + "grad_norm": 0.12158203125, + "learning_rate": 0.001116725537797099, + "loss": 0.1025, + "step": 47939 + }, + { + "epoch": 0.416142220987665, + "grad_norm": 0.103515625, + "learning_rate": 0.0011166957804711027, + "loss": 0.0825, + "step": 47940 + }, + { + "epoch": 0.4161509014678692, + "grad_norm": 0.1416015625, + "learning_rate": 0.0011166660231268482, + "loss": 0.1118, + "step": 47941 + }, + { + "epoch": 0.41615958194807334, + "grad_norm": 0.400390625, + "learning_rate": 0.0011166362657643676, + "loss": 0.1162, + "step": 47942 + }, + { + "epoch": 0.41616826242827754, + "grad_norm": 0.3203125, + "learning_rate": 0.0011166065083836943, + "loss": 0.1123, + "step": 47943 + }, + { + "epoch": 0.4161769429084817, + "grad_norm": 0.10400390625, + "learning_rate": 0.0011165767509848604, + "loss": 0.0747, + "step": 47944 + }, + { + "epoch": 0.41618562338868587, + "grad_norm": 0.1884765625, + "learning_rate": 0.001116546993567898, + "loss": 0.0894, + "step": 47945 + }, + { + "epoch": 0.41619430386889, + "grad_norm": 0.38671875, + "learning_rate": 0.00111651723613284, + "loss": 0.0664, + "step": 47946 + }, + { + "epoch": 0.4162029843490942, + "grad_norm": 0.19140625, + "learning_rate": 0.0011164874786797198, + "loss": 0.0991, + "step": 47947 + }, + { + "epoch": 0.41621166482929833, + "grad_norm": 0.1865234375, + "learning_rate": 0.0011164577212085685, + "loss": 0.1064, + "step": 47948 + }, + { + "epoch": 0.4162203453095025, + "grad_norm": 0.5546875, + "learning_rate": 0.0011164279637194193, + "loss": 0.123, + "step": 47949 + }, + { + "epoch": 0.41622902578970666, + "grad_norm": 0.53125, + "learning_rate": 0.0011163982062123048, + "loss": 0.1475, + "step": 47950 + }, + { + "epoch": 0.41623770626991086, + "grad_norm": 0.326171875, + "learning_rate": 0.0011163684486872574, + "loss": 0.0957, + "step": 47951 + }, + { + "epoch": 0.416246386750115, + "grad_norm": 0.43359375, + "learning_rate": 0.0011163386911443099, + "loss": 0.1748, + "step": 47952 + }, + { + "epoch": 0.4162550672303192, + "grad_norm": 0.1455078125, + "learning_rate": 0.0011163089335834945, + "loss": 0.1035, + "step": 47953 + }, + { + "epoch": 0.4162637477105233, + "grad_norm": 0.123046875, + "learning_rate": 0.001116279176004844, + "loss": 0.1289, + "step": 47954 + }, + { + "epoch": 0.4162724281907275, + "grad_norm": 0.458984375, + "learning_rate": 0.0011162494184083905, + "loss": 0.1152, + "step": 47955 + }, + { + "epoch": 0.41628110867093165, + "grad_norm": 0.09814453125, + "learning_rate": 0.0011162196607941675, + "loss": 0.061, + "step": 47956 + }, + { + "epoch": 0.41628978915113585, + "grad_norm": 0.0810546875, + "learning_rate": 0.0011161899031622066, + "loss": 0.0776, + "step": 47957 + }, + { + "epoch": 0.41629846963134, + "grad_norm": 0.59375, + "learning_rate": 0.0011161601455125405, + "loss": 0.0835, + "step": 47958 + }, + { + "epoch": 0.4163071501115442, + "grad_norm": 2.734375, + "learning_rate": 0.001116130387845202, + "loss": 0.2363, + "step": 47959 + }, + { + "epoch": 
0.4163158305917483, + "grad_norm": 0.1494140625, + "learning_rate": 0.0011161006301602233, + "loss": 0.0732, + "step": 47960 + }, + { + "epoch": 0.4163245110719525, + "grad_norm": 0.150390625, + "learning_rate": 0.0011160708724576379, + "loss": 0.1035, + "step": 47961 + }, + { + "epoch": 0.41633319155215665, + "grad_norm": 0.12158203125, + "learning_rate": 0.0011160411147374772, + "loss": 0.0859, + "step": 47962 + }, + { + "epoch": 0.41634187203236084, + "grad_norm": 0.357421875, + "learning_rate": 0.001116011356999774, + "loss": 0.125, + "step": 47963 + }, + { + "epoch": 0.416350552512565, + "grad_norm": 0.1640625, + "learning_rate": 0.0011159815992445614, + "loss": 0.0864, + "step": 47964 + }, + { + "epoch": 0.41635923299276917, + "grad_norm": 0.21875, + "learning_rate": 0.0011159518414718714, + "loss": 0.0825, + "step": 47965 + }, + { + "epoch": 0.4163679134729733, + "grad_norm": 0.3046875, + "learning_rate": 0.0011159220836817368, + "loss": 0.0874, + "step": 47966 + }, + { + "epoch": 0.4163765939531775, + "grad_norm": 0.390625, + "learning_rate": 0.0011158923258741899, + "loss": 0.0933, + "step": 47967 + }, + { + "epoch": 0.41638527443338164, + "grad_norm": 0.23828125, + "learning_rate": 0.0011158625680492635, + "loss": 0.0889, + "step": 47968 + }, + { + "epoch": 0.41639395491358583, + "grad_norm": 0.1201171875, + "learning_rate": 0.0011158328102069896, + "loss": 0.0781, + "step": 47969 + }, + { + "epoch": 0.41640263539378997, + "grad_norm": 0.38671875, + "learning_rate": 0.0011158030523474017, + "loss": 0.103, + "step": 47970 + }, + { + "epoch": 0.41641131587399416, + "grad_norm": 0.224609375, + "learning_rate": 0.001115773294470532, + "loss": 0.1143, + "step": 47971 + }, + { + "epoch": 0.4164199963541983, + "grad_norm": 0.306640625, + "learning_rate": 0.0011157435365764122, + "loss": 0.0864, + "step": 47972 + }, + { + "epoch": 0.4164286768344025, + "grad_norm": 0.12158203125, + "learning_rate": 0.001115713778665076, + "loss": 0.1143, + "step": 47973 + }, + { + "epoch": 0.4164373573146066, + "grad_norm": 0.69921875, + "learning_rate": 0.0011156840207365553, + "loss": 0.0898, + "step": 47974 + }, + { + "epoch": 0.4164460377948108, + "grad_norm": 0.51953125, + "learning_rate": 0.0011156542627908828, + "loss": 0.1113, + "step": 47975 + }, + { + "epoch": 0.41645471827501496, + "grad_norm": 0.08251953125, + "learning_rate": 0.001115624504828091, + "loss": 0.082, + "step": 47976 + }, + { + "epoch": 0.41646339875521915, + "grad_norm": 0.1025390625, + "learning_rate": 0.0011155947468482123, + "loss": 0.0562, + "step": 47977 + }, + { + "epoch": 0.4164720792354233, + "grad_norm": 0.08642578125, + "learning_rate": 0.0011155649888512796, + "loss": 0.1328, + "step": 47978 + }, + { + "epoch": 0.4164807597156275, + "grad_norm": 0.267578125, + "learning_rate": 0.0011155352308373253, + "loss": 0.0762, + "step": 47979 + }, + { + "epoch": 0.4164894401958316, + "grad_norm": 0.080078125, + "learning_rate": 0.001115505472806382, + "loss": 0.0811, + "step": 47980 + }, + { + "epoch": 0.4164981206760358, + "grad_norm": 0.0859375, + "learning_rate": 0.0011154757147584819, + "loss": 0.1167, + "step": 47981 + }, + { + "epoch": 0.41650680115623995, + "grad_norm": 0.2890625, + "learning_rate": 0.0011154459566936582, + "loss": 0.0693, + "step": 47982 + }, + { + "epoch": 0.41651548163644414, + "grad_norm": 0.087890625, + "learning_rate": 0.0011154161986119426, + "loss": 0.0938, + "step": 47983 + }, + { + "epoch": 0.4165241621166483, + "grad_norm": 0.3203125, + "learning_rate": 0.0011153864405133683, + "loss": 
0.0913, + "step": 47984 + }, + { + "epoch": 0.41653284259685247, + "grad_norm": 0.27734375, + "learning_rate": 0.0011153566823979672, + "loss": 0.0894, + "step": 47985 + }, + { + "epoch": 0.4165415230770566, + "grad_norm": 0.4296875, + "learning_rate": 0.001115326924265773, + "loss": 0.0791, + "step": 47986 + }, + { + "epoch": 0.4165502035572608, + "grad_norm": 0.09521484375, + "learning_rate": 0.001115297166116817, + "loss": 0.0898, + "step": 47987 + }, + { + "epoch": 0.41655888403746494, + "grad_norm": 0.1689453125, + "learning_rate": 0.0011152674079511322, + "loss": 0.104, + "step": 47988 + }, + { + "epoch": 0.41656756451766913, + "grad_norm": 0.1259765625, + "learning_rate": 0.001115237649768751, + "loss": 0.0767, + "step": 47989 + }, + { + "epoch": 0.41657624499787327, + "grad_norm": 0.1337890625, + "learning_rate": 0.0011152078915697067, + "loss": 0.124, + "step": 47990 + }, + { + "epoch": 0.41658492547807746, + "grad_norm": 0.25390625, + "learning_rate": 0.0011151781333540312, + "loss": 0.1235, + "step": 47991 + }, + { + "epoch": 0.4165936059582816, + "grad_norm": 0.130859375, + "learning_rate": 0.001115148375121757, + "loss": 0.1123, + "step": 47992 + }, + { + "epoch": 0.4166022864384858, + "grad_norm": 0.09912109375, + "learning_rate": 0.001115118616872917, + "loss": 0.0854, + "step": 47993 + }, + { + "epoch": 0.41661096691868993, + "grad_norm": 0.34765625, + "learning_rate": 0.001115088858607543, + "loss": 0.0967, + "step": 47994 + }, + { + "epoch": 0.4166196473988941, + "grad_norm": 0.21875, + "learning_rate": 0.0011150591003256682, + "loss": 0.0713, + "step": 47995 + }, + { + "epoch": 0.41662832787909826, + "grad_norm": 0.220703125, + "learning_rate": 0.0011150293420273253, + "loss": 0.1226, + "step": 47996 + }, + { + "epoch": 0.41663700835930245, + "grad_norm": 0.15625, + "learning_rate": 0.0011149995837125464, + "loss": 0.1074, + "step": 47997 + }, + { + "epoch": 0.4166456888395066, + "grad_norm": 0.41015625, + "learning_rate": 0.001114969825381364, + "loss": 0.1201, + "step": 47998 + }, + { + "epoch": 0.4166543693197108, + "grad_norm": 0.1083984375, + "learning_rate": 0.001114940067033811, + "loss": 0.0854, + "step": 47999 + }, + { + "epoch": 0.4166630497999149, + "grad_norm": 0.1455078125, + "learning_rate": 0.0011149103086699197, + "loss": 0.1045, + "step": 48000 + }, + { + "epoch": 0.4166717302801191, + "grad_norm": 0.46484375, + "learning_rate": 0.0011148805502897228, + "loss": 0.1035, + "step": 48001 + }, + { + "epoch": 0.41668041076032325, + "grad_norm": 0.76953125, + "learning_rate": 0.0011148507918932527, + "loss": 0.1426, + "step": 48002 + }, + { + "epoch": 0.41668909124052744, + "grad_norm": 0.3203125, + "learning_rate": 0.0011148210334805417, + "loss": 0.0986, + "step": 48003 + }, + { + "epoch": 0.4166977717207316, + "grad_norm": 0.1884765625, + "learning_rate": 0.0011147912750516231, + "loss": 0.0645, + "step": 48004 + }, + { + "epoch": 0.4167064522009358, + "grad_norm": 0.294921875, + "learning_rate": 0.001114761516606529, + "loss": 0.0645, + "step": 48005 + }, + { + "epoch": 0.4167151326811399, + "grad_norm": 0.8046875, + "learning_rate": 0.0011147317581452917, + "loss": 0.0791, + "step": 48006 + }, + { + "epoch": 0.4167238131613441, + "grad_norm": 0.36328125, + "learning_rate": 0.001114701999667944, + "loss": 0.1113, + "step": 48007 + }, + { + "epoch": 0.41673249364154824, + "grad_norm": 0.09423828125, + "learning_rate": 0.0011146722411745182, + "loss": 0.1074, + "step": 48008 + }, + { + "epoch": 0.41674117412175243, + "grad_norm": 0.197265625, + 
"learning_rate": 0.0011146424826650471, + "loss": 0.0864, + "step": 48009 + }, + { + "epoch": 0.41674985460195657, + "grad_norm": 0.60546875, + "learning_rate": 0.0011146127241395637, + "loss": 0.1074, + "step": 48010 + }, + { + "epoch": 0.41675853508216076, + "grad_norm": 0.875, + "learning_rate": 0.0011145829655980996, + "loss": 0.1201, + "step": 48011 + }, + { + "epoch": 0.4167672155623649, + "grad_norm": 0.1455078125, + "learning_rate": 0.0011145532070406879, + "loss": 0.1001, + "step": 48012 + }, + { + "epoch": 0.4167758960425691, + "grad_norm": 0.20703125, + "learning_rate": 0.0011145234484673612, + "loss": 0.0957, + "step": 48013 + }, + { + "epoch": 0.41678457652277323, + "grad_norm": 0.09619140625, + "learning_rate": 0.0011144936898781516, + "loss": 0.0923, + "step": 48014 + }, + { + "epoch": 0.4167932570029774, + "grad_norm": 0.0830078125, + "learning_rate": 0.001114463931273092, + "loss": 0.0967, + "step": 48015 + }, + { + "epoch": 0.41680193748318156, + "grad_norm": 0.1943359375, + "learning_rate": 0.0011144341726522149, + "loss": 0.0859, + "step": 48016 + }, + { + "epoch": 0.41681061796338575, + "grad_norm": 0.185546875, + "learning_rate": 0.0011144044140155528, + "loss": 0.0771, + "step": 48017 + }, + { + "epoch": 0.4168192984435899, + "grad_norm": 0.1982421875, + "learning_rate": 0.001114374655363138, + "loss": 0.0991, + "step": 48018 + }, + { + "epoch": 0.4168279789237941, + "grad_norm": 0.1787109375, + "learning_rate": 0.0011143448966950038, + "loss": 0.0913, + "step": 48019 + }, + { + "epoch": 0.4168366594039982, + "grad_norm": 0.29296875, + "learning_rate": 0.0011143151380111817, + "loss": 0.0767, + "step": 48020 + }, + { + "epoch": 0.4168453398842024, + "grad_norm": 0.09375, + "learning_rate": 0.001114285379311705, + "loss": 0.0996, + "step": 48021 + }, + { + "epoch": 0.41685402036440655, + "grad_norm": 0.400390625, + "learning_rate": 0.0011142556205966064, + "loss": 0.1211, + "step": 48022 + }, + { + "epoch": 0.41686270084461075, + "grad_norm": 0.453125, + "learning_rate": 0.0011142258618659178, + "loss": 0.1543, + "step": 48023 + }, + { + "epoch": 0.4168713813248149, + "grad_norm": 0.07470703125, + "learning_rate": 0.0011141961031196717, + "loss": 0.0566, + "step": 48024 + }, + { + "epoch": 0.4168800618050191, + "grad_norm": 0.1162109375, + "learning_rate": 0.0011141663443579013, + "loss": 0.0947, + "step": 48025 + }, + { + "epoch": 0.4168887422852232, + "grad_norm": 0.26171875, + "learning_rate": 0.0011141365855806386, + "loss": 0.0947, + "step": 48026 + }, + { + "epoch": 0.4168974227654274, + "grad_norm": 0.271484375, + "learning_rate": 0.0011141068267879165, + "loss": 0.0752, + "step": 48027 + }, + { + "epoch": 0.41690610324563154, + "grad_norm": 0.12109375, + "learning_rate": 0.0011140770679797672, + "loss": 0.1191, + "step": 48028 + }, + { + "epoch": 0.41691478372583574, + "grad_norm": 0.271484375, + "learning_rate": 0.0011140473091562238, + "loss": 0.1133, + "step": 48029 + }, + { + "epoch": 0.4169234642060399, + "grad_norm": 0.10302734375, + "learning_rate": 0.0011140175503173183, + "loss": 0.1357, + "step": 48030 + }, + { + "epoch": 0.41693214468624407, + "grad_norm": 0.255859375, + "learning_rate": 0.001113987791463083, + "loss": 0.1016, + "step": 48031 + }, + { + "epoch": 0.4169408251664482, + "grad_norm": 0.2099609375, + "learning_rate": 0.0011139580325935517, + "loss": 0.0767, + "step": 48032 + }, + { + "epoch": 0.4169495056466524, + "grad_norm": 0.10595703125, + "learning_rate": 0.0011139282737087555, + "loss": 0.0986, + "step": 48033 + }, + { + "epoch": 
0.41695818612685653, + "grad_norm": 0.294921875, + "learning_rate": 0.0011138985148087277, + "loss": 0.1064, + "step": 48034 + }, + { + "epoch": 0.4169668666070607, + "grad_norm": 0.349609375, + "learning_rate": 0.0011138687558935009, + "loss": 0.0771, + "step": 48035 + }, + { + "epoch": 0.41697554708726486, + "grad_norm": 0.337890625, + "learning_rate": 0.001113838996963107, + "loss": 0.0913, + "step": 48036 + }, + { + "epoch": 0.41698422756746906, + "grad_norm": 0.33984375, + "learning_rate": 0.0011138092380175788, + "loss": 0.0835, + "step": 48037 + }, + { + "epoch": 0.4169929080476732, + "grad_norm": 0.1357421875, + "learning_rate": 0.0011137794790569496, + "loss": 0.0718, + "step": 48038 + }, + { + "epoch": 0.4170015885278774, + "grad_norm": 0.6171875, + "learning_rate": 0.0011137497200812512, + "loss": 0.1113, + "step": 48039 + }, + { + "epoch": 0.4170102690080815, + "grad_norm": 0.32421875, + "learning_rate": 0.0011137199610905164, + "loss": 0.0835, + "step": 48040 + }, + { + "epoch": 0.4170189494882857, + "grad_norm": 0.244140625, + "learning_rate": 0.0011136902020847772, + "loss": 0.1465, + "step": 48041 + }, + { + "epoch": 0.41702762996848985, + "grad_norm": 0.1025390625, + "learning_rate": 0.0011136604430640672, + "loss": 0.083, + "step": 48042 + }, + { + "epoch": 0.41703631044869405, + "grad_norm": 0.328125, + "learning_rate": 0.0011136306840284181, + "loss": 0.0796, + "step": 48043 + }, + { + "epoch": 0.4170449909288982, + "grad_norm": 0.265625, + "learning_rate": 0.001113600924977863, + "loss": 0.0889, + "step": 48044 + }, + { + "epoch": 0.4170536714091024, + "grad_norm": 0.11083984375, + "learning_rate": 0.0011135711659124334, + "loss": 0.0933, + "step": 48045 + }, + { + "epoch": 0.4170623518893065, + "grad_norm": 0.0771484375, + "learning_rate": 0.001113541406832163, + "loss": 0.0908, + "step": 48046 + }, + { + "epoch": 0.4170710323695107, + "grad_norm": 0.2197265625, + "learning_rate": 0.001113511647737084, + "loss": 0.1367, + "step": 48047 + }, + { + "epoch": 0.41707971284971485, + "grad_norm": 0.59375, + "learning_rate": 0.0011134818886272288, + "loss": 0.0918, + "step": 48048 + }, + { + "epoch": 0.41708839332991904, + "grad_norm": 0.244140625, + "learning_rate": 0.00111345212950263, + "loss": 0.1289, + "step": 48049 + }, + { + "epoch": 0.4170970738101232, + "grad_norm": 0.283203125, + "learning_rate": 0.00111342237036332, + "loss": 0.1016, + "step": 48050 + }, + { + "epoch": 0.41710575429032737, + "grad_norm": 0.322265625, + "learning_rate": 0.0011133926112093318, + "loss": 0.0972, + "step": 48051 + }, + { + "epoch": 0.4171144347705315, + "grad_norm": 0.400390625, + "learning_rate": 0.0011133628520406974, + "loss": 0.1221, + "step": 48052 + }, + { + "epoch": 0.41712311525073564, + "grad_norm": 0.32421875, + "learning_rate": 0.0011133330928574498, + "loss": 0.082, + "step": 48053 + }, + { + "epoch": 0.41713179573093984, + "grad_norm": 0.80859375, + "learning_rate": 0.001113303333659621, + "loss": 0.0713, + "step": 48054 + }, + { + "epoch": 0.417140476211144, + "grad_norm": 0.62109375, + "learning_rate": 0.0011132735744472442, + "loss": 0.1162, + "step": 48055 + }, + { + "epoch": 0.41714915669134817, + "grad_norm": 0.06494140625, + "learning_rate": 0.0011132438152203512, + "loss": 0.0776, + "step": 48056 + }, + { + "epoch": 0.4171578371715523, + "grad_norm": 0.447265625, + "learning_rate": 0.0011132140559789755, + "loss": 0.1143, + "step": 48057 + }, + { + "epoch": 0.4171665176517565, + "grad_norm": 0.35546875, + "learning_rate": 0.001113184296723149, + "loss": 0.0923, + 
"step": 48058 + }, + { + "epoch": 0.41717519813196063, + "grad_norm": 0.58203125, + "learning_rate": 0.0011131545374529041, + "loss": 0.1348, + "step": 48059 + }, + { + "epoch": 0.4171838786121648, + "grad_norm": 0.154296875, + "learning_rate": 0.0011131247781682736, + "loss": 0.1494, + "step": 48060 + }, + { + "epoch": 0.41719255909236896, + "grad_norm": 0.1884765625, + "learning_rate": 0.0011130950188692904, + "loss": 0.125, + "step": 48061 + }, + { + "epoch": 0.41720123957257316, + "grad_norm": 0.1728515625, + "learning_rate": 0.0011130652595559864, + "loss": 0.125, + "step": 48062 + }, + { + "epoch": 0.4172099200527773, + "grad_norm": 0.353515625, + "learning_rate": 0.0011130355002283944, + "loss": 0.1504, + "step": 48063 + }, + { + "epoch": 0.4172186005329815, + "grad_norm": 0.1513671875, + "learning_rate": 0.001113005740886547, + "loss": 0.0864, + "step": 48064 + }, + { + "epoch": 0.4172272810131856, + "grad_norm": 0.25390625, + "learning_rate": 0.0011129759815304765, + "loss": 0.0654, + "step": 48065 + }, + { + "epoch": 0.4172359614933898, + "grad_norm": 0.421875, + "learning_rate": 0.0011129462221602163, + "loss": 0.1611, + "step": 48066 + }, + { + "epoch": 0.41724464197359395, + "grad_norm": 0.1279296875, + "learning_rate": 0.0011129164627757978, + "loss": 0.0693, + "step": 48067 + }, + { + "epoch": 0.41725332245379815, + "grad_norm": 0.07666015625, + "learning_rate": 0.001112886703377254, + "loss": 0.0703, + "step": 48068 + }, + { + "epoch": 0.4172620029340023, + "grad_norm": 0.140625, + "learning_rate": 0.0011128569439646178, + "loss": 0.0845, + "step": 48069 + }, + { + "epoch": 0.4172706834142065, + "grad_norm": 0.1728515625, + "learning_rate": 0.0011128271845379212, + "loss": 0.0618, + "step": 48070 + }, + { + "epoch": 0.4172793638944106, + "grad_norm": 0.2734375, + "learning_rate": 0.0011127974250971972, + "loss": 0.0728, + "step": 48071 + }, + { + "epoch": 0.4172880443746148, + "grad_norm": 0.5859375, + "learning_rate": 0.001112767665642478, + "loss": 0.0835, + "step": 48072 + }, + { + "epoch": 0.41729672485481895, + "grad_norm": 0.1630859375, + "learning_rate": 0.0011127379061737963, + "loss": 0.1094, + "step": 48073 + }, + { + "epoch": 0.41730540533502314, + "grad_norm": 0.392578125, + "learning_rate": 0.0011127081466911844, + "loss": 0.1221, + "step": 48074 + }, + { + "epoch": 0.4173140858152273, + "grad_norm": 0.5, + "learning_rate": 0.0011126783871946754, + "loss": 0.1094, + "step": 48075 + }, + { + "epoch": 0.41732276629543147, + "grad_norm": 0.2236328125, + "learning_rate": 0.0011126486276843012, + "loss": 0.0996, + "step": 48076 + }, + { + "epoch": 0.4173314467756356, + "grad_norm": 0.49609375, + "learning_rate": 0.0011126188681600951, + "loss": 0.0767, + "step": 48077 + }, + { + "epoch": 0.4173401272558398, + "grad_norm": 0.435546875, + "learning_rate": 0.0011125891086220887, + "loss": 0.0942, + "step": 48078 + }, + { + "epoch": 0.41734880773604394, + "grad_norm": 0.14453125, + "learning_rate": 0.0011125593490703153, + "loss": 0.0737, + "step": 48079 + }, + { + "epoch": 0.41735748821624813, + "grad_norm": 0.83984375, + "learning_rate": 0.001112529589504807, + "loss": 0.0913, + "step": 48080 + }, + { + "epoch": 0.41736616869645227, + "grad_norm": 0.388671875, + "learning_rate": 0.0011124998299255969, + "loss": 0.0864, + "step": 48081 + }, + { + "epoch": 0.41737484917665646, + "grad_norm": 0.5625, + "learning_rate": 0.0011124700703327168, + "loss": 0.0791, + "step": 48082 + }, + { + "epoch": 0.4173835296568606, + "grad_norm": 0.35546875, + "learning_rate": 
0.0011124403107261997, + "loss": 0.0889, + "step": 48083 + }, + { + "epoch": 0.4173922101370648, + "grad_norm": 0.36328125, + "learning_rate": 0.0011124105511060782, + "loss": 0.0864, + "step": 48084 + }, + { + "epoch": 0.4174008906172689, + "grad_norm": 0.80859375, + "learning_rate": 0.0011123807914723846, + "loss": 0.1113, + "step": 48085 + }, + { + "epoch": 0.4174095710974731, + "grad_norm": 0.275390625, + "learning_rate": 0.0011123510318251516, + "loss": 0.1191, + "step": 48086 + }, + { + "epoch": 0.41741825157767726, + "grad_norm": 0.130859375, + "learning_rate": 0.0011123212721644118, + "loss": 0.0801, + "step": 48087 + }, + { + "epoch": 0.41742693205788145, + "grad_norm": 0.0517578125, + "learning_rate": 0.0011122915124901975, + "loss": 0.0664, + "step": 48088 + }, + { + "epoch": 0.4174356125380856, + "grad_norm": 0.279296875, + "learning_rate": 0.0011122617528025415, + "loss": 0.0762, + "step": 48089 + }, + { + "epoch": 0.4174442930182898, + "grad_norm": 0.515625, + "learning_rate": 0.0011122319931014763, + "loss": 0.0928, + "step": 48090 + }, + { + "epoch": 0.4174529734984939, + "grad_norm": 0.06298828125, + "learning_rate": 0.001112202233387034, + "loss": 0.0718, + "step": 48091 + }, + { + "epoch": 0.4174616539786981, + "grad_norm": 0.09423828125, + "learning_rate": 0.0011121724736592477, + "loss": 0.1143, + "step": 48092 + }, + { + "epoch": 0.41747033445890225, + "grad_norm": 0.171875, + "learning_rate": 0.0011121427139181498, + "loss": 0.0786, + "step": 48093 + }, + { + "epoch": 0.41747901493910644, + "grad_norm": 0.11572265625, + "learning_rate": 0.0011121129541637727, + "loss": 0.1299, + "step": 48094 + }, + { + "epoch": 0.4174876954193106, + "grad_norm": 0.181640625, + "learning_rate": 0.0011120831943961494, + "loss": 0.0933, + "step": 48095 + }, + { + "epoch": 0.41749637589951477, + "grad_norm": 0.10107421875, + "learning_rate": 0.0011120534346153116, + "loss": 0.1099, + "step": 48096 + }, + { + "epoch": 0.4175050563797189, + "grad_norm": 0.46484375, + "learning_rate": 0.0011120236748212926, + "loss": 0.0796, + "step": 48097 + }, + { + "epoch": 0.4175137368599231, + "grad_norm": 0.68359375, + "learning_rate": 0.0011119939150141247, + "loss": 0.0996, + "step": 48098 + }, + { + "epoch": 0.41752241734012724, + "grad_norm": 0.4921875, + "learning_rate": 0.0011119641551938406, + "loss": 0.103, + "step": 48099 + }, + { + "epoch": 0.41753109782033143, + "grad_norm": 0.17578125, + "learning_rate": 0.0011119343953604726, + "loss": 0.0938, + "step": 48100 + }, + { + "epoch": 0.41753977830053557, + "grad_norm": 0.439453125, + "learning_rate": 0.0011119046355140532, + "loss": 0.0718, + "step": 48101 + }, + { + "epoch": 0.41754845878073976, + "grad_norm": 0.34375, + "learning_rate": 0.0011118748756546151, + "loss": 0.0786, + "step": 48102 + }, + { + "epoch": 0.4175571392609439, + "grad_norm": 0.48046875, + "learning_rate": 0.0011118451157821907, + "loss": 0.1011, + "step": 48103 + }, + { + "epoch": 0.4175658197411481, + "grad_norm": 0.193359375, + "learning_rate": 0.0011118153558968126, + "loss": 0.1318, + "step": 48104 + }, + { + "epoch": 0.41757450022135223, + "grad_norm": 0.263671875, + "learning_rate": 0.001111785595998514, + "loss": 0.0957, + "step": 48105 + }, + { + "epoch": 0.4175831807015564, + "grad_norm": 0.22265625, + "learning_rate": 0.0011117558360873264, + "loss": 0.1201, + "step": 48106 + }, + { + "epoch": 0.41759186118176056, + "grad_norm": 0.07275390625, + "learning_rate": 0.0011117260761632827, + "loss": 0.0771, + "step": 48107 + }, + { + "epoch": 0.41760054166196475, 
+ "grad_norm": 0.294921875, + "learning_rate": 0.0011116963162264158, + "loss": 0.127, + "step": 48108 + }, + { + "epoch": 0.4176092221421689, + "grad_norm": 0.0966796875, + "learning_rate": 0.0011116665562767578, + "loss": 0.0962, + "step": 48109 + }, + { + "epoch": 0.4176179026223731, + "grad_norm": 0.1328125, + "learning_rate": 0.0011116367963143416, + "loss": 0.1064, + "step": 48110 + }, + { + "epoch": 0.4176265831025772, + "grad_norm": 0.091796875, + "learning_rate": 0.0011116070363391994, + "loss": 0.1104, + "step": 48111 + }, + { + "epoch": 0.4176352635827814, + "grad_norm": 0.349609375, + "learning_rate": 0.0011115772763513638, + "loss": 0.2207, + "step": 48112 + }, + { + "epoch": 0.41764394406298555, + "grad_norm": 0.578125, + "learning_rate": 0.0011115475163508677, + "loss": 0.0977, + "step": 48113 + }, + { + "epoch": 0.41765262454318974, + "grad_norm": 0.2421875, + "learning_rate": 0.0011115177563377435, + "loss": 0.0977, + "step": 48114 + }, + { + "epoch": 0.4176613050233939, + "grad_norm": 0.224609375, + "learning_rate": 0.0011114879963120233, + "loss": 0.0752, + "step": 48115 + }, + { + "epoch": 0.4176699855035981, + "grad_norm": 0.4609375, + "learning_rate": 0.0011114582362737404, + "loss": 0.0791, + "step": 48116 + }, + { + "epoch": 0.4176786659838022, + "grad_norm": 0.26171875, + "learning_rate": 0.0011114284762229268, + "loss": 0.1406, + "step": 48117 + }, + { + "epoch": 0.4176873464640064, + "grad_norm": 0.19140625, + "learning_rate": 0.0011113987161596153, + "loss": 0.1001, + "step": 48118 + }, + { + "epoch": 0.41769602694421054, + "grad_norm": 0.1044921875, + "learning_rate": 0.001111368956083838, + "loss": 0.0835, + "step": 48119 + }, + { + "epoch": 0.41770470742441473, + "grad_norm": 0.302734375, + "learning_rate": 0.0011113391959956277, + "loss": 0.0684, + "step": 48120 + }, + { + "epoch": 0.41771338790461887, + "grad_norm": 0.2001953125, + "learning_rate": 0.0011113094358950176, + "loss": 0.1006, + "step": 48121 + }, + { + "epoch": 0.41772206838482306, + "grad_norm": 0.123046875, + "learning_rate": 0.0011112796757820392, + "loss": 0.1172, + "step": 48122 + }, + { + "epoch": 0.4177307488650272, + "grad_norm": 0.36328125, + "learning_rate": 0.0011112499156567255, + "loss": 0.125, + "step": 48123 + }, + { + "epoch": 0.4177394293452314, + "grad_norm": 0.8203125, + "learning_rate": 0.0011112201555191095, + "loss": 0.1211, + "step": 48124 + }, + { + "epoch": 0.41774810982543553, + "grad_norm": 0.298828125, + "learning_rate": 0.001111190395369223, + "loss": 0.0762, + "step": 48125 + }, + { + "epoch": 0.4177567903056397, + "grad_norm": 0.3828125, + "learning_rate": 0.0011111606352070989, + "loss": 0.1133, + "step": 48126 + }, + { + "epoch": 0.41776547078584386, + "grad_norm": 0.17578125, + "learning_rate": 0.0011111308750327699, + "loss": 0.125, + "step": 48127 + }, + { + "epoch": 0.41777415126604805, + "grad_norm": 0.375, + "learning_rate": 0.0011111011148462679, + "loss": 0.0869, + "step": 48128 + }, + { + "epoch": 0.4177828317462522, + "grad_norm": 0.1796875, + "learning_rate": 0.001111071354647626, + "loss": 0.1426, + "step": 48129 + }, + { + "epoch": 0.4177915122264564, + "grad_norm": 0.10546875, + "learning_rate": 0.001111041594436877, + "loss": 0.1133, + "step": 48130 + }, + { + "epoch": 0.4178001927066605, + "grad_norm": 0.419921875, + "learning_rate": 0.0011110118342140527, + "loss": 0.1289, + "step": 48131 + }, + { + "epoch": 0.4178088731868647, + "grad_norm": 0.1015625, + "learning_rate": 0.0011109820739791862, + "loss": 0.1133, + "step": 48132 + }, + { + 
"epoch": 0.41781755366706885, + "grad_norm": 0.189453125, + "learning_rate": 0.0011109523137323096, + "loss": 0.1064, + "step": 48133 + }, + { + "epoch": 0.41782623414727305, + "grad_norm": 0.2265625, + "learning_rate": 0.0011109225534734562, + "loss": 0.168, + "step": 48134 + }, + { + "epoch": 0.4178349146274772, + "grad_norm": 0.1298828125, + "learning_rate": 0.001110892793202658, + "loss": 0.1182, + "step": 48135 + }, + { + "epoch": 0.4178435951076814, + "grad_norm": 0.07421875, + "learning_rate": 0.0011108630329199468, + "loss": 0.0664, + "step": 48136 + }, + { + "epoch": 0.4178522755878855, + "grad_norm": 0.1591796875, + "learning_rate": 0.0011108332726253566, + "loss": 0.0723, + "step": 48137 + }, + { + "epoch": 0.4178609560680897, + "grad_norm": 0.0888671875, + "learning_rate": 0.0011108035123189195, + "loss": 0.1094, + "step": 48138 + }, + { + "epoch": 0.41786963654829384, + "grad_norm": 0.169921875, + "learning_rate": 0.0011107737520006675, + "loss": 0.0977, + "step": 48139 + }, + { + "epoch": 0.41787831702849804, + "grad_norm": 0.267578125, + "learning_rate": 0.0011107439916706338, + "loss": 0.1118, + "step": 48140 + }, + { + "epoch": 0.4178869975087022, + "grad_norm": 0.33203125, + "learning_rate": 0.0011107142313288502, + "loss": 0.0845, + "step": 48141 + }, + { + "epoch": 0.41789567798890637, + "grad_norm": 0.0869140625, + "learning_rate": 0.0011106844709753499, + "loss": 0.0933, + "step": 48142 + }, + { + "epoch": 0.4179043584691105, + "grad_norm": 0.38671875, + "learning_rate": 0.0011106547106101653, + "loss": 0.0933, + "step": 48143 + }, + { + "epoch": 0.4179130389493147, + "grad_norm": 0.4375, + "learning_rate": 0.0011106249502333288, + "loss": 0.1533, + "step": 48144 + }, + { + "epoch": 0.41792171942951883, + "grad_norm": 0.2109375, + "learning_rate": 0.0011105951898448727, + "loss": 0.0742, + "step": 48145 + }, + { + "epoch": 0.417930399909723, + "grad_norm": 0.2177734375, + "learning_rate": 0.0011105654294448304, + "loss": 0.1348, + "step": 48146 + }, + { + "epoch": 0.41793908038992716, + "grad_norm": 0.1982421875, + "learning_rate": 0.0011105356690332337, + "loss": 0.0811, + "step": 48147 + }, + { + "epoch": 0.41794776087013136, + "grad_norm": 0.5234375, + "learning_rate": 0.0011105059086101152, + "loss": 0.0845, + "step": 48148 + }, + { + "epoch": 0.4179564413503355, + "grad_norm": 0.17578125, + "learning_rate": 0.0011104761481755079, + "loss": 0.0952, + "step": 48149 + }, + { + "epoch": 0.4179651218305397, + "grad_norm": 0.7265625, + "learning_rate": 0.0011104463877294438, + "loss": 0.0908, + "step": 48150 + }, + { + "epoch": 0.4179738023107438, + "grad_norm": 0.15234375, + "learning_rate": 0.0011104166272719557, + "loss": 0.1191, + "step": 48151 + }, + { + "epoch": 0.417982482790948, + "grad_norm": 0.2138671875, + "learning_rate": 0.001110386866803076, + "loss": 0.1045, + "step": 48152 + }, + { + "epoch": 0.41799116327115216, + "grad_norm": 0.51171875, + "learning_rate": 0.0011103571063228377, + "loss": 0.0996, + "step": 48153 + }, + { + "epoch": 0.41799984375135635, + "grad_norm": 0.259765625, + "learning_rate": 0.0011103273458312728, + "loss": 0.105, + "step": 48154 + }, + { + "epoch": 0.4180085242315605, + "grad_norm": 0.25390625, + "learning_rate": 0.001110297585328414, + "loss": 0.0879, + "step": 48155 + }, + { + "epoch": 0.4180172047117647, + "grad_norm": 0.322265625, + "learning_rate": 0.0011102678248142944, + "loss": 0.0986, + "step": 48156 + }, + { + "epoch": 0.4180258851919688, + "grad_norm": 0.1025390625, + "learning_rate": 0.0011102380642889456, + 
"loss": 0.1016, + "step": 48157 + }, + { + "epoch": 0.418034565672173, + "grad_norm": 0.1171875, + "learning_rate": 0.0011102083037524007, + "loss": 0.0635, + "step": 48158 + }, + { + "epoch": 0.41804324615237715, + "grad_norm": 0.78125, + "learning_rate": 0.0011101785432046922, + "loss": 0.1465, + "step": 48159 + }, + { + "epoch": 0.41805192663258134, + "grad_norm": 0.2255859375, + "learning_rate": 0.0011101487826458525, + "loss": 0.1011, + "step": 48160 + }, + { + "epoch": 0.4180606071127855, + "grad_norm": 0.06982421875, + "learning_rate": 0.0011101190220759144, + "loss": 0.0835, + "step": 48161 + }, + { + "epoch": 0.41806928759298967, + "grad_norm": 0.45703125, + "learning_rate": 0.0011100892614949098, + "loss": 0.1006, + "step": 48162 + }, + { + "epoch": 0.4180779680731938, + "grad_norm": 0.0849609375, + "learning_rate": 0.0011100595009028724, + "loss": 0.1069, + "step": 48163 + }, + { + "epoch": 0.418086648553398, + "grad_norm": 0.208984375, + "learning_rate": 0.0011100297402998338, + "loss": 0.1191, + "step": 48164 + }, + { + "epoch": 0.41809532903360214, + "grad_norm": 0.1435546875, + "learning_rate": 0.0011099999796858268, + "loss": 0.1084, + "step": 48165 + }, + { + "epoch": 0.41810400951380633, + "grad_norm": 0.12060546875, + "learning_rate": 0.0011099702190608843, + "loss": 0.1113, + "step": 48166 + }, + { + "epoch": 0.41811268999401047, + "grad_norm": 0.1787109375, + "learning_rate": 0.001109940458425038, + "loss": 0.0786, + "step": 48167 + }, + { + "epoch": 0.41812137047421466, + "grad_norm": 0.11767578125, + "learning_rate": 0.0011099106977783214, + "loss": 0.0835, + "step": 48168 + }, + { + "epoch": 0.4181300509544188, + "grad_norm": 0.234375, + "learning_rate": 0.0011098809371207663, + "loss": 0.0669, + "step": 48169 + }, + { + "epoch": 0.418138731434623, + "grad_norm": 0.1708984375, + "learning_rate": 0.0011098511764524054, + "loss": 0.124, + "step": 48170 + }, + { + "epoch": 0.4181474119148271, + "grad_norm": 0.236328125, + "learning_rate": 0.0011098214157732717, + "loss": 0.1465, + "step": 48171 + }, + { + "epoch": 0.4181560923950313, + "grad_norm": 0.14453125, + "learning_rate": 0.0011097916550833976, + "loss": 0.125, + "step": 48172 + }, + { + "epoch": 0.41816477287523546, + "grad_norm": 0.396484375, + "learning_rate": 0.0011097618943828151, + "loss": 0.1133, + "step": 48173 + }, + { + "epoch": 0.41817345335543965, + "grad_norm": 0.55859375, + "learning_rate": 0.0011097321336715578, + "loss": 0.124, + "step": 48174 + }, + { + "epoch": 0.4181821338356438, + "grad_norm": 0.1513671875, + "learning_rate": 0.001109702372949657, + "loss": 0.0908, + "step": 48175 + }, + { + "epoch": 0.4181908143158479, + "grad_norm": 0.11328125, + "learning_rate": 0.0011096726122171458, + "loss": 0.0928, + "step": 48176 + }, + { + "epoch": 0.4181994947960521, + "grad_norm": 0.11767578125, + "learning_rate": 0.0011096428514740572, + "loss": 0.0698, + "step": 48177 + }, + { + "epoch": 0.41820817527625626, + "grad_norm": 0.0927734375, + "learning_rate": 0.001109613090720423, + "loss": 0.0781, + "step": 48178 + }, + { + "epoch": 0.41821685575646045, + "grad_norm": 0.400390625, + "learning_rate": 0.001109583329956276, + "loss": 0.1484, + "step": 48179 + }, + { + "epoch": 0.4182255362366646, + "grad_norm": 0.15234375, + "learning_rate": 0.001109553569181649, + "loss": 0.0889, + "step": 48180 + }, + { + "epoch": 0.4182342167168688, + "grad_norm": 0.322265625, + "learning_rate": 0.0011095238083965741, + "loss": 0.0869, + "step": 48181 + }, + { + "epoch": 0.4182428971970729, + "grad_norm": 
0.07568359375, + "learning_rate": 0.0011094940476010845, + "loss": 0.0864, + "step": 48182 + }, + { + "epoch": 0.4182515776772771, + "grad_norm": 0.275390625, + "learning_rate": 0.0011094642867952123, + "loss": 0.0854, + "step": 48183 + }, + { + "epoch": 0.41826025815748125, + "grad_norm": 0.15234375, + "learning_rate": 0.0011094345259789898, + "loss": 0.0918, + "step": 48184 + }, + { + "epoch": 0.41826893863768544, + "grad_norm": 0.1123046875, + "learning_rate": 0.00110940476515245, + "loss": 0.0747, + "step": 48185 + }, + { + "epoch": 0.4182776191178896, + "grad_norm": 0.1826171875, + "learning_rate": 0.0011093750043156254, + "loss": 0.0977, + "step": 48186 + }, + { + "epoch": 0.41828629959809377, + "grad_norm": 0.34765625, + "learning_rate": 0.0011093452434685486, + "loss": 0.1108, + "step": 48187 + }, + { + "epoch": 0.4182949800782979, + "grad_norm": 0.2431640625, + "learning_rate": 0.0011093154826112514, + "loss": 0.1074, + "step": 48188 + }, + { + "epoch": 0.4183036605585021, + "grad_norm": 0.11474609375, + "learning_rate": 0.0011092857217437674, + "loss": 0.0762, + "step": 48189 + }, + { + "epoch": 0.41831234103870624, + "grad_norm": 0.11865234375, + "learning_rate": 0.0011092559608661282, + "loss": 0.1221, + "step": 48190 + }, + { + "epoch": 0.41832102151891043, + "grad_norm": 0.4375, + "learning_rate": 0.0011092261999783672, + "loss": 0.1216, + "step": 48191 + }, + { + "epoch": 0.41832970199911457, + "grad_norm": 0.51171875, + "learning_rate": 0.0011091964390805167, + "loss": 0.1133, + "step": 48192 + }, + { + "epoch": 0.41833838247931876, + "grad_norm": 0.123046875, + "learning_rate": 0.0011091666781726085, + "loss": 0.082, + "step": 48193 + }, + { + "epoch": 0.4183470629595229, + "grad_norm": 0.1953125, + "learning_rate": 0.0011091369172546764, + "loss": 0.0879, + "step": 48194 + }, + { + "epoch": 0.4183557434397271, + "grad_norm": 0.2294921875, + "learning_rate": 0.0011091071563267521, + "loss": 0.0889, + "step": 48195 + }, + { + "epoch": 0.4183644239199312, + "grad_norm": 0.361328125, + "learning_rate": 0.0011090773953888683, + "loss": 0.1133, + "step": 48196 + }, + { + "epoch": 0.4183731044001354, + "grad_norm": 0.388671875, + "learning_rate": 0.0011090476344410576, + "loss": 0.1426, + "step": 48197 + }, + { + "epoch": 0.41838178488033956, + "grad_norm": 0.09716796875, + "learning_rate": 0.0011090178734833528, + "loss": 0.1289, + "step": 48198 + }, + { + "epoch": 0.41839046536054375, + "grad_norm": 0.333984375, + "learning_rate": 0.0011089881125157857, + "loss": 0.1348, + "step": 48199 + }, + { + "epoch": 0.4183991458407479, + "grad_norm": 0.181640625, + "learning_rate": 0.00110895835153839, + "loss": 0.0918, + "step": 48200 + }, + { + "epoch": 0.4184078263209521, + "grad_norm": 0.1572265625, + "learning_rate": 0.0011089285905511968, + "loss": 0.0957, + "step": 48201 + }, + { + "epoch": 0.4184165068011562, + "grad_norm": 0.0625, + "learning_rate": 0.0011088988295542398, + "loss": 0.0525, + "step": 48202 + }, + { + "epoch": 0.4184251872813604, + "grad_norm": 0.216796875, + "learning_rate": 0.0011088690685475513, + "loss": 0.1108, + "step": 48203 + }, + { + "epoch": 0.41843386776156455, + "grad_norm": 0.5390625, + "learning_rate": 0.0011088393075311636, + "loss": 0.0962, + "step": 48204 + }, + { + "epoch": 0.41844254824176874, + "grad_norm": 0.3984375, + "learning_rate": 0.001108809546505109, + "loss": 0.0928, + "step": 48205 + }, + { + "epoch": 0.4184512287219729, + "grad_norm": 0.1826171875, + "learning_rate": 0.0011087797854694211, + "loss": 0.0942, + "step": 48206 + }, + { 
+ "epoch": 0.41845990920217707, + "grad_norm": 0.09619140625, + "learning_rate": 0.0011087500244241315, + "loss": 0.0986, + "step": 48207 + }, + { + "epoch": 0.4184685896823812, + "grad_norm": 0.1923828125, + "learning_rate": 0.001108720263369273, + "loss": 0.1069, + "step": 48208 + }, + { + "epoch": 0.4184772701625854, + "grad_norm": 0.10693359375, + "learning_rate": 0.0011086905023048779, + "loss": 0.0933, + "step": 48209 + }, + { + "epoch": 0.41848595064278954, + "grad_norm": 0.57421875, + "learning_rate": 0.0011086607412309792, + "loss": 0.1143, + "step": 48210 + }, + { + "epoch": 0.41849463112299373, + "grad_norm": 0.83203125, + "learning_rate": 0.0011086309801476093, + "loss": 0.125, + "step": 48211 + }, + { + "epoch": 0.41850331160319787, + "grad_norm": 0.416015625, + "learning_rate": 0.0011086012190548004, + "loss": 0.1104, + "step": 48212 + }, + { + "epoch": 0.41851199208340206, + "grad_norm": 0.404296875, + "learning_rate": 0.0011085714579525858, + "loss": 0.0879, + "step": 48213 + }, + { + "epoch": 0.4185206725636062, + "grad_norm": 0.08203125, + "learning_rate": 0.0011085416968409972, + "loss": 0.1104, + "step": 48214 + }, + { + "epoch": 0.4185293530438104, + "grad_norm": 0.423828125, + "learning_rate": 0.0011085119357200678, + "loss": 0.0972, + "step": 48215 + }, + { + "epoch": 0.41853803352401453, + "grad_norm": 0.099609375, + "learning_rate": 0.0011084821745898298, + "loss": 0.1118, + "step": 48216 + }, + { + "epoch": 0.4185467140042187, + "grad_norm": 0.140625, + "learning_rate": 0.0011084524134503159, + "loss": 0.0928, + "step": 48217 + }, + { + "epoch": 0.41855539448442286, + "grad_norm": 0.1650390625, + "learning_rate": 0.0011084226523015582, + "loss": 0.1133, + "step": 48218 + }, + { + "epoch": 0.41856407496462705, + "grad_norm": 0.173828125, + "learning_rate": 0.0011083928911435898, + "loss": 0.127, + "step": 48219 + }, + { + "epoch": 0.4185727554448312, + "grad_norm": 0.265625, + "learning_rate": 0.0011083631299764432, + "loss": 0.1328, + "step": 48220 + }, + { + "epoch": 0.4185814359250354, + "grad_norm": 0.55078125, + "learning_rate": 0.0011083333688001508, + "loss": 0.1113, + "step": 48221 + }, + { + "epoch": 0.4185901164052395, + "grad_norm": 0.5390625, + "learning_rate": 0.0011083036076147451, + "loss": 0.085, + "step": 48222 + }, + { + "epoch": 0.4185987968854437, + "grad_norm": 0.0810546875, + "learning_rate": 0.0011082738464202584, + "loss": 0.082, + "step": 48223 + }, + { + "epoch": 0.41860747736564785, + "grad_norm": 0.375, + "learning_rate": 0.0011082440852167239, + "loss": 0.106, + "step": 48224 + }, + { + "epoch": 0.41861615784585204, + "grad_norm": 0.10400390625, + "learning_rate": 0.0011082143240041737, + "loss": 0.1021, + "step": 48225 + }, + { + "epoch": 0.4186248383260562, + "grad_norm": 0.1669921875, + "learning_rate": 0.0011081845627826405, + "loss": 0.1035, + "step": 48226 + }, + { + "epoch": 0.4186335188062604, + "grad_norm": 0.10498046875, + "learning_rate": 0.0011081548015521568, + "loss": 0.0913, + "step": 48227 + }, + { + "epoch": 0.4186421992864645, + "grad_norm": 0.1728515625, + "learning_rate": 0.001108125040312755, + "loss": 0.0933, + "step": 48228 + }, + { + "epoch": 0.4186508797666687, + "grad_norm": 0.099609375, + "learning_rate": 0.0011080952790644676, + "loss": 0.0869, + "step": 48229 + }, + { + "epoch": 0.41865956024687284, + "grad_norm": 0.134765625, + "learning_rate": 0.0011080655178073277, + "loss": 0.124, + "step": 48230 + }, + { + "epoch": 0.41866824072707703, + "grad_norm": 0.296875, + "learning_rate": 0.0011080357565413671, + 
"loss": 0.0889, + "step": 48231 + }, + { + "epoch": 0.41867692120728117, + "grad_norm": 0.796875, + "learning_rate": 0.0011080059952666194, + "loss": 0.103, + "step": 48232 + }, + { + "epoch": 0.41868560168748536, + "grad_norm": 0.734375, + "learning_rate": 0.001107976233983116, + "loss": 0.0879, + "step": 48233 + }, + { + "epoch": 0.4186942821676895, + "grad_norm": 0.34765625, + "learning_rate": 0.00110794647269089, + "loss": 0.0996, + "step": 48234 + }, + { + "epoch": 0.4187029626478937, + "grad_norm": 0.2177734375, + "learning_rate": 0.0011079167113899736, + "loss": 0.0967, + "step": 48235 + }, + { + "epoch": 0.41871164312809783, + "grad_norm": 0.169921875, + "learning_rate": 0.0011078869500803995, + "loss": 0.0952, + "step": 48236 + }, + { + "epoch": 0.418720323608302, + "grad_norm": 0.1435546875, + "learning_rate": 0.0011078571887622008, + "loss": 0.0879, + "step": 48237 + }, + { + "epoch": 0.41872900408850616, + "grad_norm": 0.4765625, + "learning_rate": 0.0011078274274354094, + "loss": 0.1182, + "step": 48238 + }, + { + "epoch": 0.41873768456871036, + "grad_norm": 0.54296875, + "learning_rate": 0.0011077976661000581, + "loss": 0.0952, + "step": 48239 + }, + { + "epoch": 0.4187463650489145, + "grad_norm": 0.51953125, + "learning_rate": 0.0011077679047561792, + "loss": 0.0723, + "step": 48240 + }, + { + "epoch": 0.4187550455291187, + "grad_norm": 0.1005859375, + "learning_rate": 0.0011077381434038058, + "loss": 0.1035, + "step": 48241 + }, + { + "epoch": 0.4187637260093228, + "grad_norm": 0.51171875, + "learning_rate": 0.00110770838204297, + "loss": 0.0996, + "step": 48242 + }, + { + "epoch": 0.418772406489527, + "grad_norm": 0.107421875, + "learning_rate": 0.0011076786206737042, + "loss": 0.0957, + "step": 48243 + }, + { + "epoch": 0.41878108696973115, + "grad_norm": 0.3203125, + "learning_rate": 0.0011076488592960414, + "loss": 0.0635, + "step": 48244 + }, + { + "epoch": 0.41878976744993535, + "grad_norm": 0.3671875, + "learning_rate": 0.0011076190979100139, + "loss": 0.0593, + "step": 48245 + }, + { + "epoch": 0.4187984479301395, + "grad_norm": 0.08447265625, + "learning_rate": 0.0011075893365156543, + "loss": 0.0903, + "step": 48246 + }, + { + "epoch": 0.4188071284103437, + "grad_norm": 0.259765625, + "learning_rate": 0.0011075595751129945, + "loss": 0.0996, + "step": 48247 + }, + { + "epoch": 0.4188158088905478, + "grad_norm": 0.109375, + "learning_rate": 0.0011075298137020685, + "loss": 0.1143, + "step": 48248 + }, + { + "epoch": 0.418824489370752, + "grad_norm": 0.146484375, + "learning_rate": 0.0011075000522829076, + "loss": 0.0737, + "step": 48249 + }, + { + "epoch": 0.41883316985095614, + "grad_norm": 0.107421875, + "learning_rate": 0.0011074702908555447, + "loss": 0.0825, + "step": 48250 + }, + { + "epoch": 0.41884185033116034, + "grad_norm": 0.087890625, + "learning_rate": 0.0011074405294200125, + "loss": 0.0996, + "step": 48251 + }, + { + "epoch": 0.4188505308113645, + "grad_norm": 0.5859375, + "learning_rate": 0.0011074107679763436, + "loss": 0.0908, + "step": 48252 + }, + { + "epoch": 0.41885921129156867, + "grad_norm": 0.2353515625, + "learning_rate": 0.0011073810065245701, + "loss": 0.0703, + "step": 48253 + }, + { + "epoch": 0.4188678917717728, + "grad_norm": 0.2734375, + "learning_rate": 0.0011073512450647251, + "loss": 0.1113, + "step": 48254 + }, + { + "epoch": 0.418876572251977, + "grad_norm": 0.10986328125, + "learning_rate": 0.0011073214835968408, + "loss": 0.0664, + "step": 48255 + }, + { + "epoch": 0.41888525273218113, + "grad_norm": 0.35546875, + 
"learning_rate": 0.0011072917221209497, + "loss": 0.1416, + "step": 48256 + }, + { + "epoch": 0.4188939332123853, + "grad_norm": 0.83203125, + "learning_rate": 0.0011072619606370849, + "loss": 0.1016, + "step": 48257 + }, + { + "epoch": 0.41890261369258946, + "grad_norm": 0.35546875, + "learning_rate": 0.0011072321991452778, + "loss": 0.0952, + "step": 48258 + }, + { + "epoch": 0.41891129417279366, + "grad_norm": 0.328125, + "learning_rate": 0.0011072024376455622, + "loss": 0.1289, + "step": 48259 + }, + { + "epoch": 0.4189199746529978, + "grad_norm": 0.1494140625, + "learning_rate": 0.0011071726761379703, + "loss": 0.0913, + "step": 48260 + }, + { + "epoch": 0.418928655133202, + "grad_norm": 0.1279296875, + "learning_rate": 0.0011071429146225342, + "loss": 0.1309, + "step": 48261 + }, + { + "epoch": 0.4189373356134061, + "grad_norm": 0.53125, + "learning_rate": 0.0011071131530992866, + "loss": 0.1108, + "step": 48262 + }, + { + "epoch": 0.4189460160936103, + "grad_norm": 0.087890625, + "learning_rate": 0.0011070833915682603, + "loss": 0.1025, + "step": 48263 + }, + { + "epoch": 0.41895469657381446, + "grad_norm": 0.2216796875, + "learning_rate": 0.0011070536300294878, + "loss": 0.0825, + "step": 48264 + }, + { + "epoch": 0.41896337705401865, + "grad_norm": 0.1162109375, + "learning_rate": 0.0011070238684830016, + "loss": 0.1182, + "step": 48265 + }, + { + "epoch": 0.4189720575342228, + "grad_norm": 0.185546875, + "learning_rate": 0.0011069941069288339, + "loss": 0.0952, + "step": 48266 + }, + { + "epoch": 0.418980738014427, + "grad_norm": 0.16796875, + "learning_rate": 0.0011069643453670175, + "loss": 0.0879, + "step": 48267 + }, + { + "epoch": 0.4189894184946311, + "grad_norm": 0.09130859375, + "learning_rate": 0.0011069345837975855, + "loss": 0.127, + "step": 48268 + }, + { + "epoch": 0.4189980989748353, + "grad_norm": 0.51953125, + "learning_rate": 0.0011069048222205695, + "loss": 0.166, + "step": 48269 + }, + { + "epoch": 0.41900677945503945, + "grad_norm": 0.111328125, + "learning_rate": 0.0011068750606360025, + "loss": 0.084, + "step": 48270 + }, + { + "epoch": 0.41901545993524364, + "grad_norm": 0.294921875, + "learning_rate": 0.0011068452990439172, + "loss": 0.1504, + "step": 48271 + }, + { + "epoch": 0.4190241404154478, + "grad_norm": 0.134765625, + "learning_rate": 0.0011068155374443463, + "loss": 0.0972, + "step": 48272 + }, + { + "epoch": 0.41903282089565197, + "grad_norm": 0.41015625, + "learning_rate": 0.0011067857758373214, + "loss": 0.1045, + "step": 48273 + }, + { + "epoch": 0.4190415013758561, + "grad_norm": 0.52734375, + "learning_rate": 0.0011067560142228763, + "loss": 0.1074, + "step": 48274 + }, + { + "epoch": 0.4190501818560603, + "grad_norm": 0.228515625, + "learning_rate": 0.0011067262526010424, + "loss": 0.0635, + "step": 48275 + }, + { + "epoch": 0.41905886233626444, + "grad_norm": 0.1669921875, + "learning_rate": 0.0011066964909718527, + "loss": 0.0796, + "step": 48276 + }, + { + "epoch": 0.41906754281646863, + "grad_norm": 0.8125, + "learning_rate": 0.0011066667293353402, + "loss": 0.1289, + "step": 48277 + }, + { + "epoch": 0.41907622329667277, + "grad_norm": 0.408203125, + "learning_rate": 0.0011066369676915372, + "loss": 0.0957, + "step": 48278 + }, + { + "epoch": 0.41908490377687696, + "grad_norm": 0.26171875, + "learning_rate": 0.0011066072060404758, + "loss": 0.1123, + "step": 48279 + }, + { + "epoch": 0.4190935842570811, + "grad_norm": 0.10498046875, + "learning_rate": 0.0011065774443821887, + "loss": 0.1172, + "step": 48280 + }, + { + "epoch": 
0.4191022647372853, + "grad_norm": 0.1416015625, + "learning_rate": 0.001106547682716709, + "loss": 0.0996, + "step": 48281 + }, + { + "epoch": 0.41911094521748943, + "grad_norm": 0.12890625, + "learning_rate": 0.0011065179210440687, + "loss": 0.1074, + "step": 48282 + }, + { + "epoch": 0.4191196256976936, + "grad_norm": 0.2890625, + "learning_rate": 0.0011064881593643003, + "loss": 0.085, + "step": 48283 + }, + { + "epoch": 0.41912830617789776, + "grad_norm": 0.11865234375, + "learning_rate": 0.0011064583976774368, + "loss": 0.0781, + "step": 48284 + }, + { + "epoch": 0.41913698665810195, + "grad_norm": 0.2578125, + "learning_rate": 0.0011064286359835102, + "loss": 0.1348, + "step": 48285 + }, + { + "epoch": 0.4191456671383061, + "grad_norm": 0.33984375, + "learning_rate": 0.0011063988742825537, + "loss": 0.1021, + "step": 48286 + }, + { + "epoch": 0.4191543476185103, + "grad_norm": 1.140625, + "learning_rate": 0.0011063691125745994, + "loss": 0.0938, + "step": 48287 + }, + { + "epoch": 0.4191630280987144, + "grad_norm": 0.70703125, + "learning_rate": 0.0011063393508596797, + "loss": 0.125, + "step": 48288 + }, + { + "epoch": 0.4191717085789186, + "grad_norm": 0.337890625, + "learning_rate": 0.0011063095891378277, + "loss": 0.1504, + "step": 48289 + }, + { + "epoch": 0.41918038905912275, + "grad_norm": 0.2734375, + "learning_rate": 0.0011062798274090754, + "loss": 0.1035, + "step": 48290 + }, + { + "epoch": 0.41918906953932694, + "grad_norm": 0.72265625, + "learning_rate": 0.001106250065673456, + "loss": 0.1484, + "step": 48291 + }, + { + "epoch": 0.4191977500195311, + "grad_norm": 0.58203125, + "learning_rate": 0.001106220303931001, + "loss": 0.0747, + "step": 48292 + }, + { + "epoch": 0.41920643049973527, + "grad_norm": 0.072265625, + "learning_rate": 0.001106190542181744, + "loss": 0.0967, + "step": 48293 + }, + { + "epoch": 0.4192151109799394, + "grad_norm": 0.10400390625, + "learning_rate": 0.001106160780425717, + "loss": 0.0762, + "step": 48294 + }, + { + "epoch": 0.4192237914601436, + "grad_norm": 0.52734375, + "learning_rate": 0.0011061310186629525, + "loss": 0.082, + "step": 48295 + }, + { + "epoch": 0.41923247194034774, + "grad_norm": 0.138671875, + "learning_rate": 0.0011061012568934832, + "loss": 0.0986, + "step": 48296 + }, + { + "epoch": 0.41924115242055193, + "grad_norm": 0.189453125, + "learning_rate": 0.0011060714951173418, + "loss": 0.1104, + "step": 48297 + }, + { + "epoch": 0.41924983290075607, + "grad_norm": 0.11328125, + "learning_rate": 0.0011060417333345607, + "loss": 0.0933, + "step": 48298 + }, + { + "epoch": 0.4192585133809602, + "grad_norm": 0.1865234375, + "learning_rate": 0.0011060119715451727, + "loss": 0.0894, + "step": 48299 + }, + { + "epoch": 0.4192671938611644, + "grad_norm": 0.498046875, + "learning_rate": 0.00110598220974921, + "loss": 0.0806, + "step": 48300 + }, + { + "epoch": 0.41927587434136854, + "grad_norm": 0.42578125, + "learning_rate": 0.001105952447946705, + "loss": 0.0942, + "step": 48301 + }, + { + "epoch": 0.41928455482157273, + "grad_norm": 0.138671875, + "learning_rate": 0.0011059226861376906, + "loss": 0.0947, + "step": 48302 + }, + { + "epoch": 0.41929323530177687, + "grad_norm": 0.140625, + "learning_rate": 0.0011058929243221993, + "loss": 0.1318, + "step": 48303 + }, + { + "epoch": 0.41930191578198106, + "grad_norm": 0.173828125, + "learning_rate": 0.0011058631625002634, + "loss": 0.0972, + "step": 48304 + }, + { + "epoch": 0.4193105962621852, + "grad_norm": 0.1435546875, + "learning_rate": 0.0011058334006719154, + "loss": 0.1387, + 
"step": 48305 + }, + { + "epoch": 0.4193192767423894, + "grad_norm": 0.283203125, + "learning_rate": 0.0011058036388371887, + "loss": 0.0967, + "step": 48306 + }, + { + "epoch": 0.41932795722259353, + "grad_norm": 0.2265625, + "learning_rate": 0.0011057738769961148, + "loss": 0.1177, + "step": 48307 + }, + { + "epoch": 0.4193366377027977, + "grad_norm": 0.44921875, + "learning_rate": 0.001105744115148727, + "loss": 0.082, + "step": 48308 + }, + { + "epoch": 0.41934531818300186, + "grad_norm": 0.146484375, + "learning_rate": 0.001105714353295057, + "loss": 0.0664, + "step": 48309 + }, + { + "epoch": 0.41935399866320605, + "grad_norm": 0.06884765625, + "learning_rate": 0.0011056845914351385, + "loss": 0.0669, + "step": 48310 + }, + { + "epoch": 0.4193626791434102, + "grad_norm": 0.10595703125, + "learning_rate": 0.0011056548295690033, + "loss": 0.1016, + "step": 48311 + }, + { + "epoch": 0.4193713596236144, + "grad_norm": 0.390625, + "learning_rate": 0.0011056250676966836, + "loss": 0.1074, + "step": 48312 + }, + { + "epoch": 0.4193800401038185, + "grad_norm": 0.271484375, + "learning_rate": 0.0011055953058182128, + "loss": 0.0942, + "step": 48313 + }, + { + "epoch": 0.4193887205840227, + "grad_norm": 0.439453125, + "learning_rate": 0.0011055655439336232, + "loss": 0.1016, + "step": 48314 + }, + { + "epoch": 0.41939740106422685, + "grad_norm": 0.21875, + "learning_rate": 0.0011055357820429468, + "loss": 0.083, + "step": 48315 + }, + { + "epoch": 0.41940608154443104, + "grad_norm": 0.1923828125, + "learning_rate": 0.0011055060201462165, + "loss": 0.1318, + "step": 48316 + }, + { + "epoch": 0.4194147620246352, + "grad_norm": 0.37109375, + "learning_rate": 0.0011054762582434654, + "loss": 0.123, + "step": 48317 + }, + { + "epoch": 0.41942344250483937, + "grad_norm": 1.0234375, + "learning_rate": 0.001105446496334725, + "loss": 0.1084, + "step": 48318 + }, + { + "epoch": 0.4194321229850435, + "grad_norm": 0.10009765625, + "learning_rate": 0.001105416734420029, + "loss": 0.0781, + "step": 48319 + }, + { + "epoch": 0.4194408034652477, + "grad_norm": 0.2197265625, + "learning_rate": 0.001105386972499409, + "loss": 0.1152, + "step": 48320 + }, + { + "epoch": 0.41944948394545184, + "grad_norm": 0.1943359375, + "learning_rate": 0.001105357210572898, + "loss": 0.0928, + "step": 48321 + }, + { + "epoch": 0.41945816442565603, + "grad_norm": 0.06787109375, + "learning_rate": 0.001105327448640528, + "loss": 0.0664, + "step": 48322 + }, + { + "epoch": 0.41946684490586017, + "grad_norm": 0.1181640625, + "learning_rate": 0.0011052976867023325, + "loss": 0.1367, + "step": 48323 + }, + { + "epoch": 0.41947552538606436, + "grad_norm": 0.2041015625, + "learning_rate": 0.0011052679247583433, + "loss": 0.1045, + "step": 48324 + }, + { + "epoch": 0.4194842058662685, + "grad_norm": 0.310546875, + "learning_rate": 0.0011052381628085931, + "loss": 0.1069, + "step": 48325 + }, + { + "epoch": 0.4194928863464727, + "grad_norm": 0.390625, + "learning_rate": 0.0011052084008531147, + "loss": 0.0923, + "step": 48326 + }, + { + "epoch": 0.41950156682667683, + "grad_norm": 0.1357421875, + "learning_rate": 0.0011051786388919403, + "loss": 0.125, + "step": 48327 + }, + { + "epoch": 0.419510247306881, + "grad_norm": 0.23828125, + "learning_rate": 0.001105148876925103, + "loss": 0.1226, + "step": 48328 + }, + { + "epoch": 0.41951892778708516, + "grad_norm": 0.38671875, + "learning_rate": 0.0011051191149526348, + "loss": 0.0933, + "step": 48329 + }, + { + "epoch": 0.41952760826728935, + "grad_norm": 0.0908203125, + "learning_rate": 
0.0011050893529745683, + "loss": 0.0977, + "step": 48330 + }, + { + "epoch": 0.4195362887474935, + "grad_norm": 0.07373046875, + "learning_rate": 0.001105059590990936, + "loss": 0.0752, + "step": 48331 + }, + { + "epoch": 0.4195449692276977, + "grad_norm": 0.1865234375, + "learning_rate": 0.0011050298290017709, + "loss": 0.0991, + "step": 48332 + }, + { + "epoch": 0.4195536497079018, + "grad_norm": 0.1064453125, + "learning_rate": 0.001105000067007105, + "loss": 0.1084, + "step": 48333 + }, + { + "epoch": 0.419562330188106, + "grad_norm": 0.23046875, + "learning_rate": 0.0011049703050069711, + "loss": 0.1348, + "step": 48334 + }, + { + "epoch": 0.41957101066831015, + "grad_norm": 0.28125, + "learning_rate": 0.0011049405430014019, + "loss": 0.1113, + "step": 48335 + }, + { + "epoch": 0.41957969114851434, + "grad_norm": 0.10888671875, + "learning_rate": 0.0011049107809904298, + "loss": 0.1729, + "step": 48336 + }, + { + "epoch": 0.4195883716287185, + "grad_norm": 0.70703125, + "learning_rate": 0.0011048810189740875, + "loss": 0.0991, + "step": 48337 + }, + { + "epoch": 0.4195970521089227, + "grad_norm": 0.1396484375, + "learning_rate": 0.0011048512569524073, + "loss": 0.0977, + "step": 48338 + }, + { + "epoch": 0.4196057325891268, + "grad_norm": 0.138671875, + "learning_rate": 0.0011048214949254216, + "loss": 0.1118, + "step": 48339 + }, + { + "epoch": 0.419614413069331, + "grad_norm": 0.0673828125, + "learning_rate": 0.0011047917328931632, + "loss": 0.0918, + "step": 48340 + }, + { + "epoch": 0.41962309354953514, + "grad_norm": 0.232421875, + "learning_rate": 0.0011047619708556649, + "loss": 0.1113, + "step": 48341 + }, + { + "epoch": 0.41963177402973934, + "grad_norm": 0.357421875, + "learning_rate": 0.001104732208812959, + "loss": 0.1523, + "step": 48342 + }, + { + "epoch": 0.4196404545099435, + "grad_norm": 0.1875, + "learning_rate": 0.0011047024467650778, + "loss": 0.0825, + "step": 48343 + }, + { + "epoch": 0.41964913499014767, + "grad_norm": 0.0986328125, + "learning_rate": 0.0011046726847120539, + "loss": 0.0986, + "step": 48344 + }, + { + "epoch": 0.4196578154703518, + "grad_norm": 0.146484375, + "learning_rate": 0.0011046429226539204, + "loss": 0.0811, + "step": 48345 + }, + { + "epoch": 0.419666495950556, + "grad_norm": 0.2431640625, + "learning_rate": 0.0011046131605907095, + "loss": 0.0762, + "step": 48346 + }, + { + "epoch": 0.41967517643076013, + "grad_norm": 0.26953125, + "learning_rate": 0.0011045833985224537, + "loss": 0.0908, + "step": 48347 + }, + { + "epoch": 0.4196838569109643, + "grad_norm": 0.1142578125, + "learning_rate": 0.0011045536364491851, + "loss": 0.0952, + "step": 48348 + }, + { + "epoch": 0.41969253739116846, + "grad_norm": 0.2119140625, + "learning_rate": 0.0011045238743709372, + "loss": 0.125, + "step": 48349 + }, + { + "epoch": 0.41970121787137266, + "grad_norm": 0.28125, + "learning_rate": 0.001104494112287742, + "loss": 0.0869, + "step": 48350 + }, + { + "epoch": 0.4197098983515768, + "grad_norm": 0.2041015625, + "learning_rate": 0.001104464350199632, + "loss": 0.127, + "step": 48351 + }, + { + "epoch": 0.419718578831781, + "grad_norm": 0.177734375, + "learning_rate": 0.0011044345881066398, + "loss": 0.106, + "step": 48352 + }, + { + "epoch": 0.4197272593119851, + "grad_norm": 0.322265625, + "learning_rate": 0.0011044048260087978, + "loss": 0.0635, + "step": 48353 + }, + { + "epoch": 0.4197359397921893, + "grad_norm": 0.1611328125, + "learning_rate": 0.0011043750639061392, + "loss": 0.0708, + "step": 48354 + }, + { + "epoch": 0.41974462027239345, + 
"grad_norm": 0.68359375, + "learning_rate": 0.0011043453017986957, + "loss": 0.0762, + "step": 48355 + }, + { + "epoch": 0.41975330075259765, + "grad_norm": 0.1083984375, + "learning_rate": 0.0011043155396865006, + "loss": 0.1367, + "step": 48356 + }, + { + "epoch": 0.4197619812328018, + "grad_norm": 0.91015625, + "learning_rate": 0.0011042857775695857, + "loss": 0.1992, + "step": 48357 + }, + { + "epoch": 0.419770661713006, + "grad_norm": 0.1533203125, + "learning_rate": 0.0011042560154479843, + "loss": 0.0947, + "step": 48358 + }, + { + "epoch": 0.4197793421932101, + "grad_norm": 0.11962890625, + "learning_rate": 0.0011042262533217286, + "loss": 0.1035, + "step": 48359 + }, + { + "epoch": 0.4197880226734143, + "grad_norm": 0.09619140625, + "learning_rate": 0.0011041964911908508, + "loss": 0.1099, + "step": 48360 + }, + { + "epoch": 0.41979670315361844, + "grad_norm": 0.0966796875, + "learning_rate": 0.0011041667290553842, + "loss": 0.1089, + "step": 48361 + }, + { + "epoch": 0.41980538363382264, + "grad_norm": 0.1318359375, + "learning_rate": 0.0011041369669153604, + "loss": 0.0752, + "step": 48362 + }, + { + "epoch": 0.4198140641140268, + "grad_norm": 0.2255859375, + "learning_rate": 0.001104107204770813, + "loss": 0.0703, + "step": 48363 + }, + { + "epoch": 0.41982274459423097, + "grad_norm": 0.5625, + "learning_rate": 0.0011040774426217738, + "loss": 0.1162, + "step": 48364 + }, + { + "epoch": 0.4198314250744351, + "grad_norm": 0.244140625, + "learning_rate": 0.0011040476804682754, + "loss": 0.085, + "step": 48365 + }, + { + "epoch": 0.4198401055546393, + "grad_norm": 0.138671875, + "learning_rate": 0.0011040179183103506, + "loss": 0.0957, + "step": 48366 + }, + { + "epoch": 0.41984878603484344, + "grad_norm": 0.13671875, + "learning_rate": 0.001103988156148032, + "loss": 0.0684, + "step": 48367 + }, + { + "epoch": 0.41985746651504763, + "grad_norm": 0.12890625, + "learning_rate": 0.0011039583939813523, + "loss": 0.0996, + "step": 48368 + }, + { + "epoch": 0.41986614699525177, + "grad_norm": 0.119140625, + "learning_rate": 0.0011039286318103435, + "loss": 0.0986, + "step": 48369 + }, + { + "epoch": 0.41987482747545596, + "grad_norm": 0.65234375, + "learning_rate": 0.001103898869635038, + "loss": 0.1367, + "step": 48370 + }, + { + "epoch": 0.4198835079556601, + "grad_norm": 0.126953125, + "learning_rate": 0.0011038691074554692, + "loss": 0.082, + "step": 48371 + }, + { + "epoch": 0.4198921884358643, + "grad_norm": 0.18359375, + "learning_rate": 0.0011038393452716688, + "loss": 0.1396, + "step": 48372 + }, + { + "epoch": 0.4199008689160684, + "grad_norm": 0.08349609375, + "learning_rate": 0.0011038095830836704, + "loss": 0.1035, + "step": 48373 + }, + { + "epoch": 0.4199095493962726, + "grad_norm": 0.09814453125, + "learning_rate": 0.0011037798208915052, + "loss": 0.0923, + "step": 48374 + }, + { + "epoch": 0.41991822987647676, + "grad_norm": 0.49609375, + "learning_rate": 0.001103750058695207, + "loss": 0.1084, + "step": 48375 + }, + { + "epoch": 0.41992691035668095, + "grad_norm": 0.310546875, + "learning_rate": 0.0011037202964948077, + "loss": 0.1011, + "step": 48376 + }, + { + "epoch": 0.4199355908368851, + "grad_norm": 0.1904296875, + "learning_rate": 0.0011036905342903399, + "loss": 0.0942, + "step": 48377 + }, + { + "epoch": 0.4199442713170893, + "grad_norm": 0.26953125, + "learning_rate": 0.0011036607720818358, + "loss": 0.1133, + "step": 48378 + }, + { + "epoch": 0.4199529517972934, + "grad_norm": 0.08251953125, + "learning_rate": 0.0011036310098693285, + "loss": 0.0757, + 
"step": 48379 + }, + { + "epoch": 0.4199616322774976, + "grad_norm": 0.40234375, + "learning_rate": 0.0011036012476528508, + "loss": 0.0786, + "step": 48380 + }, + { + "epoch": 0.41997031275770175, + "grad_norm": 0.1728515625, + "learning_rate": 0.0011035714854324343, + "loss": 0.1172, + "step": 48381 + }, + { + "epoch": 0.41997899323790594, + "grad_norm": 0.197265625, + "learning_rate": 0.0011035417232081125, + "loss": 0.1582, + "step": 48382 + }, + { + "epoch": 0.4199876737181101, + "grad_norm": 0.10498046875, + "learning_rate": 0.0011035119609799171, + "loss": 0.1211, + "step": 48383 + }, + { + "epoch": 0.41999635419831427, + "grad_norm": 0.279296875, + "learning_rate": 0.0011034821987478815, + "loss": 0.0947, + "step": 48384 + }, + { + "epoch": 0.4200050346785184, + "grad_norm": 0.263671875, + "learning_rate": 0.0011034524365120374, + "loss": 0.0615, + "step": 48385 + }, + { + "epoch": 0.4200137151587226, + "grad_norm": 0.390625, + "learning_rate": 0.0011034226742724185, + "loss": 0.0728, + "step": 48386 + }, + { + "epoch": 0.42002239563892674, + "grad_norm": 1.7109375, + "learning_rate": 0.0011033929120290557, + "loss": 0.2715, + "step": 48387 + }, + { + "epoch": 0.42003107611913093, + "grad_norm": 0.326171875, + "learning_rate": 0.0011033631497819833, + "loss": 0.1162, + "step": 48388 + }, + { + "epoch": 0.42003975659933507, + "grad_norm": 0.279296875, + "learning_rate": 0.0011033333875312323, + "loss": 0.0889, + "step": 48389 + }, + { + "epoch": 0.42004843707953926, + "grad_norm": 0.486328125, + "learning_rate": 0.0011033036252768363, + "loss": 0.1104, + "step": 48390 + }, + { + "epoch": 0.4200571175597434, + "grad_norm": 0.263671875, + "learning_rate": 0.0011032738630188275, + "loss": 0.0942, + "step": 48391 + }, + { + "epoch": 0.4200657980399476, + "grad_norm": 0.08544921875, + "learning_rate": 0.0011032441007572382, + "loss": 0.1035, + "step": 48392 + }, + { + "epoch": 0.42007447852015173, + "grad_norm": 0.197265625, + "learning_rate": 0.0011032143384921015, + "loss": 0.1094, + "step": 48393 + }, + { + "epoch": 0.4200831590003559, + "grad_norm": 0.71875, + "learning_rate": 0.0011031845762234495, + "loss": 0.0947, + "step": 48394 + }, + { + "epoch": 0.42009183948056006, + "grad_norm": 0.83203125, + "learning_rate": 0.001103154813951315, + "loss": 0.2344, + "step": 48395 + }, + { + "epoch": 0.42010051996076425, + "grad_norm": 0.09326171875, + "learning_rate": 0.0011031250516757305, + "loss": 0.127, + "step": 48396 + }, + { + "epoch": 0.4201092004409684, + "grad_norm": 0.427734375, + "learning_rate": 0.0011030952893967283, + "loss": 0.1064, + "step": 48397 + }, + { + "epoch": 0.4201178809211726, + "grad_norm": 0.107421875, + "learning_rate": 0.0011030655271143412, + "loss": 0.0918, + "step": 48398 + }, + { + "epoch": 0.4201265614013767, + "grad_norm": 0.287109375, + "learning_rate": 0.001103035764828602, + "loss": 0.0874, + "step": 48399 + }, + { + "epoch": 0.4201352418815809, + "grad_norm": 0.75, + "learning_rate": 0.0011030060025395423, + "loss": 0.249, + "step": 48400 + }, + { + "epoch": 0.42014392236178505, + "grad_norm": 0.23046875, + "learning_rate": 0.0011029762402471957, + "loss": 0.0718, + "step": 48401 + }, + { + "epoch": 0.42015260284198924, + "grad_norm": 0.06689453125, + "learning_rate": 0.0011029464779515944, + "loss": 0.0596, + "step": 48402 + }, + { + "epoch": 0.4201612833221934, + "grad_norm": 0.408203125, + "learning_rate": 0.0011029167156527706, + "loss": 0.0786, + "step": 48403 + }, + { + "epoch": 0.4201699638023976, + "grad_norm": 0.90234375, + "learning_rate": 
0.001102886953350757, + "loss": 0.2012, + "step": 48404 + }, + { + "epoch": 0.4201786442826017, + "grad_norm": 0.8515625, + "learning_rate": 0.0011028571910455867, + "loss": 0.1338, + "step": 48405 + }, + { + "epoch": 0.4201873247628059, + "grad_norm": 0.2080078125, + "learning_rate": 0.0011028274287372918, + "loss": 0.085, + "step": 48406 + }, + { + "epoch": 0.42019600524301004, + "grad_norm": 0.30078125, + "learning_rate": 0.0011027976664259047, + "loss": 0.1064, + "step": 48407 + }, + { + "epoch": 0.42020468572321423, + "grad_norm": 0.2373046875, + "learning_rate": 0.0011027679041114582, + "loss": 0.0923, + "step": 48408 + }, + { + "epoch": 0.42021336620341837, + "grad_norm": 0.07275390625, + "learning_rate": 0.0011027381417939846, + "loss": 0.064, + "step": 48409 + }, + { + "epoch": 0.42022204668362256, + "grad_norm": 0.09228515625, + "learning_rate": 0.0011027083794735164, + "loss": 0.1001, + "step": 48410 + }, + { + "epoch": 0.4202307271638267, + "grad_norm": 0.123046875, + "learning_rate": 0.0011026786171500868, + "loss": 0.0879, + "step": 48411 + }, + { + "epoch": 0.4202394076440309, + "grad_norm": 0.13671875, + "learning_rate": 0.001102648854823728, + "loss": 0.1079, + "step": 48412 + }, + { + "epoch": 0.42024808812423503, + "grad_norm": 0.4609375, + "learning_rate": 0.001102619092494472, + "loss": 0.1641, + "step": 48413 + }, + { + "epoch": 0.4202567686044392, + "grad_norm": 0.296875, + "learning_rate": 0.001102589330162352, + "loss": 0.0811, + "step": 48414 + }, + { + "epoch": 0.42026544908464336, + "grad_norm": 0.373046875, + "learning_rate": 0.0011025595678274003, + "loss": 0.0957, + "step": 48415 + }, + { + "epoch": 0.42027412956484755, + "grad_norm": 0.1943359375, + "learning_rate": 0.0011025298054896495, + "loss": 0.0728, + "step": 48416 + }, + { + "epoch": 0.4202828100450517, + "grad_norm": 0.10693359375, + "learning_rate": 0.0011025000431491322, + "loss": 0.1006, + "step": 48417 + }, + { + "epoch": 0.4202914905252559, + "grad_norm": 0.1552734375, + "learning_rate": 0.001102470280805881, + "loss": 0.1074, + "step": 48418 + }, + { + "epoch": 0.42030017100546, + "grad_norm": 0.404296875, + "learning_rate": 0.0011024405184599283, + "loss": 0.1201, + "step": 48419 + }, + { + "epoch": 0.4203088514856642, + "grad_norm": 5.46875, + "learning_rate": 0.0011024107561113064, + "loss": 0.3164, + "step": 48420 + }, + { + "epoch": 0.42031753196586835, + "grad_norm": 0.23828125, + "learning_rate": 0.0011023809937600485, + "loss": 0.1152, + "step": 48421 + }, + { + "epoch": 0.4203262124460725, + "grad_norm": 0.1171875, + "learning_rate": 0.0011023512314061866, + "loss": 0.0635, + "step": 48422 + }, + { + "epoch": 0.4203348929262767, + "grad_norm": 0.2119140625, + "learning_rate": 0.0011023214690497533, + "loss": 0.0938, + "step": 48423 + }, + { + "epoch": 0.4203435734064808, + "grad_norm": 0.26953125, + "learning_rate": 0.0011022917066907817, + "loss": 0.1523, + "step": 48424 + }, + { + "epoch": 0.420352253886685, + "grad_norm": 0.1494140625, + "learning_rate": 0.0011022619443293036, + "loss": 0.1279, + "step": 48425 + }, + { + "epoch": 0.42036093436688915, + "grad_norm": 0.1865234375, + "learning_rate": 0.0011022321819653515, + "loss": 0.0986, + "step": 48426 + }, + { + "epoch": 0.42036961484709334, + "grad_norm": 3.1875, + "learning_rate": 0.0011022024195989591, + "loss": 0.1719, + "step": 48427 + }, + { + "epoch": 0.4203782953272975, + "grad_norm": 0.2275390625, + "learning_rate": 0.0011021726572301579, + "loss": 0.1357, + "step": 48428 + }, + { + "epoch": 0.4203869758075017, + 
"grad_norm": 0.2265625, + "learning_rate": 0.0011021428948589805, + "loss": 0.105, + "step": 48429 + }, + { + "epoch": 0.4203956562877058, + "grad_norm": 0.1865234375, + "learning_rate": 0.0011021131324854595, + "loss": 0.1143, + "step": 48430 + }, + { + "epoch": 0.42040433676791, + "grad_norm": 0.20703125, + "learning_rate": 0.001102083370109628, + "loss": 0.0898, + "step": 48431 + }, + { + "epoch": 0.42041301724811414, + "grad_norm": 0.2197265625, + "learning_rate": 0.0011020536077315183, + "loss": 0.1104, + "step": 48432 + }, + { + "epoch": 0.42042169772831833, + "grad_norm": 0.0869140625, + "learning_rate": 0.0011020238453511625, + "loss": 0.0908, + "step": 48433 + }, + { + "epoch": 0.42043037820852247, + "grad_norm": 0.181640625, + "learning_rate": 0.0011019940829685934, + "loss": 0.1143, + "step": 48434 + }, + { + "epoch": 0.42043905868872666, + "grad_norm": 0.14453125, + "learning_rate": 0.0011019643205838437, + "loss": 0.0552, + "step": 48435 + }, + { + "epoch": 0.4204477391689308, + "grad_norm": 0.18359375, + "learning_rate": 0.001101934558196946, + "loss": 0.0869, + "step": 48436 + }, + { + "epoch": 0.420456419649135, + "grad_norm": 0.75390625, + "learning_rate": 0.0011019047958079325, + "loss": 0.0952, + "step": 48437 + }, + { + "epoch": 0.42046510012933913, + "grad_norm": 0.34375, + "learning_rate": 0.001101875033416836, + "loss": 0.083, + "step": 48438 + }, + { + "epoch": 0.4204737806095433, + "grad_norm": 0.255859375, + "learning_rate": 0.0011018452710236891, + "loss": 0.0889, + "step": 48439 + }, + { + "epoch": 0.42048246108974746, + "grad_norm": 0.298828125, + "learning_rate": 0.0011018155086285238, + "loss": 0.1025, + "step": 48440 + }, + { + "epoch": 0.42049114156995165, + "grad_norm": 0.490234375, + "learning_rate": 0.0011017857462313734, + "loss": 0.1211, + "step": 48441 + }, + { + "epoch": 0.4204998220501558, + "grad_norm": 0.294921875, + "learning_rate": 0.0011017559838322705, + "loss": 0.0791, + "step": 48442 + }, + { + "epoch": 0.42050850253036, + "grad_norm": 0.75, + "learning_rate": 0.0011017262214312467, + "loss": 0.103, + "step": 48443 + }, + { + "epoch": 0.4205171830105641, + "grad_norm": 0.4609375, + "learning_rate": 0.0011016964590283354, + "loss": 0.1104, + "step": 48444 + }, + { + "epoch": 0.4205258634907683, + "grad_norm": 0.26953125, + "learning_rate": 0.0011016666966235688, + "loss": 0.0903, + "step": 48445 + }, + { + "epoch": 0.42053454397097245, + "grad_norm": 0.1064453125, + "learning_rate": 0.0011016369342169798, + "loss": 0.0928, + "step": 48446 + }, + { + "epoch": 0.42054322445117664, + "grad_norm": 0.1826171875, + "learning_rate": 0.0011016071718086, + "loss": 0.1094, + "step": 48447 + }, + { + "epoch": 0.4205519049313808, + "grad_norm": 0.296875, + "learning_rate": 0.0011015774093984633, + "loss": 0.125, + "step": 48448 + }, + { + "epoch": 0.420560585411585, + "grad_norm": 0.2265625, + "learning_rate": 0.0011015476469866013, + "loss": 0.1113, + "step": 48449 + }, + { + "epoch": 0.4205692658917891, + "grad_norm": 0.08447265625, + "learning_rate": 0.001101517884573047, + "loss": 0.0747, + "step": 48450 + }, + { + "epoch": 0.4205779463719933, + "grad_norm": 0.369140625, + "learning_rate": 0.0011014881221578325, + "loss": 0.0811, + "step": 48451 + }, + { + "epoch": 0.42058662685219744, + "grad_norm": 0.13671875, + "learning_rate": 0.0011014583597409907, + "loss": 0.1035, + "step": 48452 + }, + { + "epoch": 0.42059530733240164, + "grad_norm": 0.703125, + "learning_rate": 0.001101428597322554, + "loss": 0.0811, + "step": 48453 + }, + { + "epoch": 
0.4206039878126058, + "grad_norm": 0.369140625, + "learning_rate": 0.0011013988349025553, + "loss": 0.0938, + "step": 48454 + }, + { + "epoch": 0.42061266829280997, + "grad_norm": 0.5234375, + "learning_rate": 0.0011013690724810267, + "loss": 0.0938, + "step": 48455 + }, + { + "epoch": 0.4206213487730141, + "grad_norm": 0.5703125, + "learning_rate": 0.0011013393100580008, + "loss": 0.0967, + "step": 48456 + }, + { + "epoch": 0.4206300292532183, + "grad_norm": 0.58984375, + "learning_rate": 0.0011013095476335101, + "loss": 0.0732, + "step": 48457 + }, + { + "epoch": 0.42063870973342243, + "grad_norm": 0.2431640625, + "learning_rate": 0.0011012797852075872, + "loss": 0.1113, + "step": 48458 + }, + { + "epoch": 0.4206473902136266, + "grad_norm": 0.1337890625, + "learning_rate": 0.0011012500227802653, + "loss": 0.0918, + "step": 48459 + }, + { + "epoch": 0.42065607069383076, + "grad_norm": 0.498046875, + "learning_rate": 0.0011012202603515762, + "loss": 0.1357, + "step": 48460 + }, + { + "epoch": 0.42066475117403496, + "grad_norm": 0.259765625, + "learning_rate": 0.0011011904979215527, + "loss": 0.0708, + "step": 48461 + }, + { + "epoch": 0.4206734316542391, + "grad_norm": 0.1767578125, + "learning_rate": 0.0011011607354902268, + "loss": 0.1025, + "step": 48462 + }, + { + "epoch": 0.4206821121344433, + "grad_norm": 0.181640625, + "learning_rate": 0.0011011309730576324, + "loss": 0.165, + "step": 48463 + }, + { + "epoch": 0.4206907926146474, + "grad_norm": 0.32421875, + "learning_rate": 0.0011011012106238007, + "loss": 0.125, + "step": 48464 + }, + { + "epoch": 0.4206994730948516, + "grad_norm": 0.09228515625, + "learning_rate": 0.0011010714481887644, + "loss": 0.0825, + "step": 48465 + }, + { + "epoch": 0.42070815357505575, + "grad_norm": 0.2734375, + "learning_rate": 0.0011010416857525568, + "loss": 0.1729, + "step": 48466 + }, + { + "epoch": 0.42071683405525995, + "grad_norm": 0.0986328125, + "learning_rate": 0.0011010119233152101, + "loss": 0.1045, + "step": 48467 + }, + { + "epoch": 0.4207255145354641, + "grad_norm": 0.1064453125, + "learning_rate": 0.0011009821608767565, + "loss": 0.1357, + "step": 48468 + }, + { + "epoch": 0.4207341950156683, + "grad_norm": 0.439453125, + "learning_rate": 0.0011009523984372286, + "loss": 0.126, + "step": 48469 + }, + { + "epoch": 0.4207428754958724, + "grad_norm": 0.53125, + "learning_rate": 0.0011009226359966595, + "loss": 0.167, + "step": 48470 + }, + { + "epoch": 0.4207515559760766, + "grad_norm": 0.31640625, + "learning_rate": 0.0011008928735550818, + "loss": 0.1074, + "step": 48471 + }, + { + "epoch": 0.42076023645628075, + "grad_norm": 0.1279296875, + "learning_rate": 0.0011008631111125273, + "loss": 0.1113, + "step": 48472 + }, + { + "epoch": 0.42076891693648494, + "grad_norm": 0.0849609375, + "learning_rate": 0.0011008333486690288, + "loss": 0.0894, + "step": 48473 + }, + { + "epoch": 0.4207775974166891, + "grad_norm": 0.12158203125, + "learning_rate": 0.001100803586224619, + "loss": 0.1045, + "step": 48474 + }, + { + "epoch": 0.42078627789689327, + "grad_norm": 0.1572265625, + "learning_rate": 0.0011007738237793308, + "loss": 0.0879, + "step": 48475 + }, + { + "epoch": 0.4207949583770974, + "grad_norm": 0.263671875, + "learning_rate": 0.0011007440613331958, + "loss": 0.1035, + "step": 48476 + }, + { + "epoch": 0.4208036388573016, + "grad_norm": 0.76953125, + "learning_rate": 0.0011007142988862474, + "loss": 0.1016, + "step": 48477 + }, + { + "epoch": 0.42081231933750574, + "grad_norm": 0.27734375, + "learning_rate": 0.0011006845364385177, + 
"loss": 0.1084, + "step": 48478 + }, + { + "epoch": 0.42082099981770993, + "grad_norm": 0.2451171875, + "learning_rate": 0.0011006547739900393, + "loss": 0.1133, + "step": 48479 + }, + { + "epoch": 0.42082968029791407, + "grad_norm": 0.2119140625, + "learning_rate": 0.0011006250115408451, + "loss": 0.084, + "step": 48480 + }, + { + "epoch": 0.42083836077811826, + "grad_norm": 0.46484375, + "learning_rate": 0.0011005952490909677, + "loss": 0.104, + "step": 48481 + }, + { + "epoch": 0.4208470412583224, + "grad_norm": 0.08544921875, + "learning_rate": 0.0011005654866404386, + "loss": 0.0879, + "step": 48482 + }, + { + "epoch": 0.4208557217385266, + "grad_norm": 0.1474609375, + "learning_rate": 0.0011005357241892918, + "loss": 0.0659, + "step": 48483 + }, + { + "epoch": 0.4208644022187307, + "grad_norm": 0.265625, + "learning_rate": 0.0011005059617375588, + "loss": 0.1123, + "step": 48484 + }, + { + "epoch": 0.4208730826989349, + "grad_norm": 0.1083984375, + "learning_rate": 0.0011004761992852725, + "loss": 0.1084, + "step": 48485 + }, + { + "epoch": 0.42088176317913906, + "grad_norm": 0.421875, + "learning_rate": 0.001100446436832465, + "loss": 0.0742, + "step": 48486 + }, + { + "epoch": 0.42089044365934325, + "grad_norm": 0.10546875, + "learning_rate": 0.0011004166743791702, + "loss": 0.0977, + "step": 48487 + }, + { + "epoch": 0.4208991241395474, + "grad_norm": 0.431640625, + "learning_rate": 0.0011003869119254191, + "loss": 0.0908, + "step": 48488 + }, + { + "epoch": 0.4209078046197516, + "grad_norm": 0.1923828125, + "learning_rate": 0.001100357149471245, + "loss": 0.123, + "step": 48489 + }, + { + "epoch": 0.4209164850999557, + "grad_norm": 0.7421875, + "learning_rate": 0.0011003273870166803, + "loss": 0.1123, + "step": 48490 + }, + { + "epoch": 0.4209251655801599, + "grad_norm": 0.87890625, + "learning_rate": 0.0011002976245617574, + "loss": 0.1211, + "step": 48491 + }, + { + "epoch": 0.42093384606036405, + "grad_norm": 0.1201171875, + "learning_rate": 0.0011002678621065097, + "loss": 0.1064, + "step": 48492 + }, + { + "epoch": 0.42094252654056824, + "grad_norm": 0.1923828125, + "learning_rate": 0.0011002380996509684, + "loss": 0.1748, + "step": 48493 + }, + { + "epoch": 0.4209512070207724, + "grad_norm": 0.107421875, + "learning_rate": 0.0011002083371951673, + "loss": 0.1602, + "step": 48494 + }, + { + "epoch": 0.42095988750097657, + "grad_norm": 0.62890625, + "learning_rate": 0.0011001785747391378, + "loss": 0.0835, + "step": 48495 + }, + { + "epoch": 0.4209685679811807, + "grad_norm": 0.37890625, + "learning_rate": 0.001100148812282913, + "loss": 0.0864, + "step": 48496 + }, + { + "epoch": 0.4209772484613849, + "grad_norm": 0.08984375, + "learning_rate": 0.0011001190498265256, + "loss": 0.0894, + "step": 48497 + }, + { + "epoch": 0.42098592894158904, + "grad_norm": 0.201171875, + "learning_rate": 0.001100089287370008, + "loss": 0.1416, + "step": 48498 + }, + { + "epoch": 0.42099460942179323, + "grad_norm": 0.671875, + "learning_rate": 0.001100059524913393, + "loss": 0.0732, + "step": 48499 + }, + { + "epoch": 0.42100328990199737, + "grad_norm": 0.1962890625, + "learning_rate": 0.0011000297624567128, + "loss": 0.1221, + "step": 48500 + }, + { + "epoch": 0.42101197038220156, + "grad_norm": 0.1708984375, + "learning_rate": 0.0011, + "loss": 0.0698, + "step": 48501 + }, + { + "epoch": 0.4210206508624057, + "grad_norm": 0.11669921875, + "learning_rate": 0.0010999702375432874, + "loss": 0.1006, + "step": 48502 + }, + { + "epoch": 0.4210293313426099, + "grad_norm": 0.498046875, + 
"learning_rate": 0.0010999404750866072, + "loss": 0.1387, + "step": 48503 + }, + { + "epoch": 0.42103801182281403, + "grad_norm": 0.12890625, + "learning_rate": 0.001099910712629992, + "loss": 0.1387, + "step": 48504 + }, + { + "epoch": 0.4210466923030182, + "grad_norm": 0.1484375, + "learning_rate": 0.0010998809501734745, + "loss": 0.1309, + "step": 48505 + }, + { + "epoch": 0.42105537278322236, + "grad_norm": 0.09716796875, + "learning_rate": 0.001099851187717087, + "loss": 0.1104, + "step": 48506 + }, + { + "epoch": 0.42106405326342655, + "grad_norm": 0.08984375, + "learning_rate": 0.0010998214252608625, + "loss": 0.0908, + "step": 48507 + }, + { + "epoch": 0.4210727337436307, + "grad_norm": 0.4375, + "learning_rate": 0.001099791662804833, + "loss": 0.1553, + "step": 48508 + }, + { + "epoch": 0.4210814142238349, + "grad_norm": 0.5546875, + "learning_rate": 0.0010997619003490318, + "loss": 0.1201, + "step": 48509 + }, + { + "epoch": 0.421090094704039, + "grad_norm": 0.15234375, + "learning_rate": 0.0010997321378934909, + "loss": 0.126, + "step": 48510 + }, + { + "epoch": 0.4210987751842432, + "grad_norm": 0.19140625, + "learning_rate": 0.0010997023754382427, + "loss": 0.0996, + "step": 48511 + }, + { + "epoch": 0.42110745566444735, + "grad_norm": 0.30859375, + "learning_rate": 0.00109967261298332, + "loss": 0.0903, + "step": 48512 + }, + { + "epoch": 0.42111613614465154, + "grad_norm": 0.1806640625, + "learning_rate": 0.0010996428505287553, + "loss": 0.1011, + "step": 48513 + }, + { + "epoch": 0.4211248166248557, + "grad_norm": 0.10498046875, + "learning_rate": 0.001099613088074581, + "loss": 0.1152, + "step": 48514 + }, + { + "epoch": 0.4211334971050599, + "grad_norm": 0.166015625, + "learning_rate": 0.0010995833256208301, + "loss": 0.0811, + "step": 48515 + }, + { + "epoch": 0.421142177585264, + "grad_norm": 0.3828125, + "learning_rate": 0.0010995535631675348, + "loss": 0.0918, + "step": 48516 + }, + { + "epoch": 0.4211508580654682, + "grad_norm": 0.345703125, + "learning_rate": 0.0010995238007147276, + "loss": 0.1025, + "step": 48517 + }, + { + "epoch": 0.42115953854567234, + "grad_norm": 0.3359375, + "learning_rate": 0.0010994940382624417, + "loss": 0.0894, + "step": 48518 + }, + { + "epoch": 0.42116821902587653, + "grad_norm": 0.296875, + "learning_rate": 0.0010994642758107086, + "loss": 0.1143, + "step": 48519 + }, + { + "epoch": 0.42117689950608067, + "grad_norm": 0.1259765625, + "learning_rate": 0.0010994345133595615, + "loss": 0.1182, + "step": 48520 + }, + { + "epoch": 0.42118557998628486, + "grad_norm": 0.09033203125, + "learning_rate": 0.0010994047509090329, + "loss": 0.104, + "step": 48521 + }, + { + "epoch": 0.421194260466489, + "grad_norm": 0.154296875, + "learning_rate": 0.001099374988459155, + "loss": 0.083, + "step": 48522 + }, + { + "epoch": 0.4212029409466932, + "grad_norm": 0.1572265625, + "learning_rate": 0.0010993452260099606, + "loss": 0.123, + "step": 48523 + }, + { + "epoch": 0.42121162142689733, + "grad_norm": 0.1103515625, + "learning_rate": 0.0010993154635614824, + "loss": 0.0986, + "step": 48524 + }, + { + "epoch": 0.4212203019071015, + "grad_norm": 0.337890625, + "learning_rate": 0.001099285701113753, + "loss": 0.0859, + "step": 48525 + }, + { + "epoch": 0.42122898238730566, + "grad_norm": 0.306640625, + "learning_rate": 0.0010992559386668044, + "loss": 0.0781, + "step": 48526 + }, + { + "epoch": 0.42123766286750985, + "grad_norm": 0.1357421875, + "learning_rate": 0.0010992261762206696, + "loss": 0.0923, + "step": 48527 + }, + { + "epoch": 
0.421246343347714, + "grad_norm": 0.17578125, + "learning_rate": 0.0010991964137753811, + "loss": 0.0996, + "step": 48528 + }, + { + "epoch": 0.4212550238279182, + "grad_norm": 0.251953125, + "learning_rate": 0.0010991666513309715, + "loss": 0.1152, + "step": 48529 + }, + { + "epoch": 0.4212637043081223, + "grad_norm": 0.361328125, + "learning_rate": 0.0010991368888874733, + "loss": 0.1006, + "step": 48530 + }, + { + "epoch": 0.4212723847883265, + "grad_norm": 0.09619140625, + "learning_rate": 0.0010991071264449188, + "loss": 0.0791, + "step": 48531 + }, + { + "epoch": 0.42128106526853065, + "grad_norm": 0.125, + "learning_rate": 0.0010990773640033402, + "loss": 0.1108, + "step": 48532 + }, + { + "epoch": 0.42128974574873485, + "grad_norm": 0.07275390625, + "learning_rate": 0.0010990476015627713, + "loss": 0.0708, + "step": 48533 + }, + { + "epoch": 0.421298426228939, + "grad_norm": 0.427734375, + "learning_rate": 0.0010990178391232436, + "loss": 0.0859, + "step": 48534 + }, + { + "epoch": 0.4213071067091432, + "grad_norm": 0.3984375, + "learning_rate": 0.0010989880766847902, + "loss": 0.1387, + "step": 48535 + }, + { + "epoch": 0.4213157871893473, + "grad_norm": 0.09716796875, + "learning_rate": 0.0010989583142474433, + "loss": 0.1074, + "step": 48536 + }, + { + "epoch": 0.4213244676695515, + "grad_norm": 0.1796875, + "learning_rate": 0.0010989285518112357, + "loss": 0.0898, + "step": 48537 + }, + { + "epoch": 0.42133314814975564, + "grad_norm": 0.2158203125, + "learning_rate": 0.0010988987893761999, + "loss": 0.0796, + "step": 48538 + }, + { + "epoch": 0.42134182862995984, + "grad_norm": 0.08642578125, + "learning_rate": 0.0010988690269423682, + "loss": 0.0913, + "step": 48539 + }, + { + "epoch": 0.421350509110164, + "grad_norm": 0.234375, + "learning_rate": 0.001098839264509773, + "loss": 0.1113, + "step": 48540 + }, + { + "epoch": 0.42135918959036817, + "grad_norm": 0.296875, + "learning_rate": 0.0010988095020784475, + "loss": 0.0918, + "step": 48541 + }, + { + "epoch": 0.4213678700705723, + "grad_norm": 0.09521484375, + "learning_rate": 0.0010987797396484241, + "loss": 0.0767, + "step": 48542 + }, + { + "epoch": 0.4213765505507765, + "grad_norm": 0.1572265625, + "learning_rate": 0.0010987499772197346, + "loss": 0.0732, + "step": 48543 + }, + { + "epoch": 0.42138523103098063, + "grad_norm": 0.234375, + "learning_rate": 0.0010987202147924127, + "loss": 0.0762, + "step": 48544 + }, + { + "epoch": 0.42139391151118477, + "grad_norm": 0.43359375, + "learning_rate": 0.00109869045236649, + "loss": 0.0859, + "step": 48545 + }, + { + "epoch": 0.42140259199138896, + "grad_norm": 0.68359375, + "learning_rate": 0.0010986606899419996, + "loss": 0.1016, + "step": 48546 + }, + { + "epoch": 0.4214112724715931, + "grad_norm": 0.259765625, + "learning_rate": 0.001098630927518974, + "loss": 0.0898, + "step": 48547 + }, + { + "epoch": 0.4214199529517973, + "grad_norm": 0.25, + "learning_rate": 0.0010986011650974453, + "loss": 0.1138, + "step": 48548 + }, + { + "epoch": 0.42142863343200143, + "grad_norm": 0.203125, + "learning_rate": 0.001098571402677446, + "loss": 0.0601, + "step": 48549 + }, + { + "epoch": 0.4214373139122056, + "grad_norm": 0.2734375, + "learning_rate": 0.0010985416402590094, + "loss": 0.125, + "step": 48550 + }, + { + "epoch": 0.42144599439240976, + "grad_norm": 0.5078125, + "learning_rate": 0.0010985118778421676, + "loss": 0.0938, + "step": 48551 + }, + { + "epoch": 0.42145467487261395, + "grad_norm": 0.1064453125, + "learning_rate": 0.0010984821154269534, + "loss": 0.1367, + "step": 
48552 + }, + { + "epoch": 0.4214633553528181, + "grad_norm": 0.5390625, + "learning_rate": 0.0010984523530133989, + "loss": 0.1133, + "step": 48553 + }, + { + "epoch": 0.4214720358330223, + "grad_norm": 0.2353515625, + "learning_rate": 0.0010984225906015368, + "loss": 0.0942, + "step": 48554 + }, + { + "epoch": 0.4214807163132264, + "grad_norm": 0.318359375, + "learning_rate": 0.0010983928281914, + "loss": 0.1084, + "step": 48555 + }, + { + "epoch": 0.4214893967934306, + "grad_norm": 0.0927734375, + "learning_rate": 0.0010983630657830205, + "loss": 0.1094, + "step": 48556 + }, + { + "epoch": 0.42149807727363475, + "grad_norm": 0.2265625, + "learning_rate": 0.0010983333033764316, + "loss": 0.124, + "step": 48557 + }, + { + "epoch": 0.42150675775383895, + "grad_norm": 0.201171875, + "learning_rate": 0.0010983035409716647, + "loss": 0.0801, + "step": 48558 + }, + { + "epoch": 0.4215154382340431, + "grad_norm": 0.06689453125, + "learning_rate": 0.0010982737785687534, + "loss": 0.0776, + "step": 48559 + }, + { + "epoch": 0.4215241187142473, + "grad_norm": 0.125, + "learning_rate": 0.0010982440161677299, + "loss": 0.1143, + "step": 48560 + }, + { + "epoch": 0.4215327991944514, + "grad_norm": 0.2451171875, + "learning_rate": 0.0010982142537686267, + "loss": 0.0835, + "step": 48561 + }, + { + "epoch": 0.4215414796746556, + "grad_norm": 0.39453125, + "learning_rate": 0.0010981844913714762, + "loss": 0.125, + "step": 48562 + }, + { + "epoch": 0.42155016015485974, + "grad_norm": 0.5234375, + "learning_rate": 0.0010981547289763112, + "loss": 0.1016, + "step": 48563 + }, + { + "epoch": 0.42155884063506394, + "grad_norm": 0.30859375, + "learning_rate": 0.0010981249665831642, + "loss": 0.1221, + "step": 48564 + }, + { + "epoch": 0.4215675211152681, + "grad_norm": 0.15234375, + "learning_rate": 0.001098095204192068, + "loss": 0.123, + "step": 48565 + }, + { + "epoch": 0.42157620159547227, + "grad_norm": 0.255859375, + "learning_rate": 0.001098065441803054, + "loss": 0.084, + "step": 48566 + }, + { + "epoch": 0.4215848820756764, + "grad_norm": 0.80078125, + "learning_rate": 0.0010980356794161564, + "loss": 0.0703, + "step": 48567 + }, + { + "epoch": 0.4215935625558806, + "grad_norm": 0.08447265625, + "learning_rate": 0.0010980059170314067, + "loss": 0.0796, + "step": 48568 + }, + { + "epoch": 0.42160224303608473, + "grad_norm": 0.71484375, + "learning_rate": 0.0010979761546488379, + "loss": 0.0928, + "step": 48569 + }, + { + "epoch": 0.4216109235162889, + "grad_norm": 0.17578125, + "learning_rate": 0.001097946392268482, + "loss": 0.1289, + "step": 48570 + }, + { + "epoch": 0.42161960399649306, + "grad_norm": 0.330078125, + "learning_rate": 0.0010979166298903723, + "loss": 0.1152, + "step": 48571 + }, + { + "epoch": 0.42162828447669726, + "grad_norm": 0.1083984375, + "learning_rate": 0.0010978868675145406, + "loss": 0.1094, + "step": 48572 + }, + { + "epoch": 0.4216369649569014, + "grad_norm": 0.1357421875, + "learning_rate": 0.0010978571051410198, + "loss": 0.0723, + "step": 48573 + }, + { + "epoch": 0.4216456454371056, + "grad_norm": 0.0986328125, + "learning_rate": 0.0010978273427698427, + "loss": 0.0674, + "step": 48574 + }, + { + "epoch": 0.4216543259173097, + "grad_norm": 0.0732421875, + "learning_rate": 0.001097797580401041, + "loss": 0.0757, + "step": 48575 + }, + { + "epoch": 0.4216630063975139, + "grad_norm": 0.06494140625, + "learning_rate": 0.0010977678180346482, + "loss": 0.0737, + "step": 48576 + }, + { + "epoch": 0.42167168687771805, + "grad_norm": 0.07373046875, + "learning_rate": 
0.0010977380556706966, + "loss": 0.0933, + "step": 48577 + }, + { + "epoch": 0.42168036735792225, + "grad_norm": 0.115234375, + "learning_rate": 0.0010977082933092187, + "loss": 0.1191, + "step": 48578 + }, + { + "epoch": 0.4216890478381264, + "grad_norm": 0.2578125, + "learning_rate": 0.0010976785309502468, + "loss": 0.0737, + "step": 48579 + }, + { + "epoch": 0.4216977283183306, + "grad_norm": 0.376953125, + "learning_rate": 0.0010976487685938137, + "loss": 0.0996, + "step": 48580 + }, + { + "epoch": 0.4217064087985347, + "grad_norm": 0.44921875, + "learning_rate": 0.0010976190062399521, + "loss": 0.0596, + "step": 48581 + }, + { + "epoch": 0.4217150892787389, + "grad_norm": 0.302734375, + "learning_rate": 0.001097589243888694, + "loss": 0.0791, + "step": 48582 + }, + { + "epoch": 0.42172376975894305, + "grad_norm": 0.263671875, + "learning_rate": 0.0010975594815400723, + "loss": 0.1094, + "step": 48583 + }, + { + "epoch": 0.42173245023914724, + "grad_norm": 0.1943359375, + "learning_rate": 0.001097529719194119, + "loss": 0.1001, + "step": 48584 + }, + { + "epoch": 0.4217411307193514, + "grad_norm": 0.10302734375, + "learning_rate": 0.001097499956850868, + "loss": 0.1094, + "step": 48585 + }, + { + "epoch": 0.42174981119955557, + "grad_norm": 0.30859375, + "learning_rate": 0.0010974701945103504, + "loss": 0.1211, + "step": 48586 + }, + { + "epoch": 0.4217584916797597, + "grad_norm": 0.2578125, + "learning_rate": 0.0010974404321725997, + "loss": 0.1045, + "step": 48587 + }, + { + "epoch": 0.4217671721599639, + "grad_norm": 0.453125, + "learning_rate": 0.0010974106698376483, + "loss": 0.0947, + "step": 48588 + }, + { + "epoch": 0.42177585264016804, + "grad_norm": 0.2275390625, + "learning_rate": 0.0010973809075055284, + "loss": 0.0879, + "step": 48589 + }, + { + "epoch": 0.42178453312037223, + "grad_norm": 0.130859375, + "learning_rate": 0.0010973511451762726, + "loss": 0.1162, + "step": 48590 + }, + { + "epoch": 0.42179321360057637, + "grad_norm": 0.1826171875, + "learning_rate": 0.0010973213828499133, + "loss": 0.0972, + "step": 48591 + }, + { + "epoch": 0.42180189408078056, + "grad_norm": 0.435546875, + "learning_rate": 0.0010972916205264835, + "loss": 0.126, + "step": 48592 + }, + { + "epoch": 0.4218105745609847, + "grad_norm": 0.248046875, + "learning_rate": 0.0010972618582060156, + "loss": 0.0957, + "step": 48593 + }, + { + "epoch": 0.4218192550411889, + "grad_norm": 0.1025390625, + "learning_rate": 0.001097232095888542, + "loss": 0.1074, + "step": 48594 + }, + { + "epoch": 0.421827935521393, + "grad_norm": 0.27734375, + "learning_rate": 0.0010972023335740955, + "loss": 0.0781, + "step": 48595 + }, + { + "epoch": 0.4218366160015972, + "grad_norm": 0.1767578125, + "learning_rate": 0.0010971725712627086, + "loss": 0.0947, + "step": 48596 + }, + { + "epoch": 0.42184529648180136, + "grad_norm": 0.125, + "learning_rate": 0.0010971428089544136, + "loss": 0.0996, + "step": 48597 + }, + { + "epoch": 0.42185397696200555, + "grad_norm": 0.1611328125, + "learning_rate": 0.001097113046649243, + "loss": 0.0913, + "step": 48598 + }, + { + "epoch": 0.4218626574422097, + "grad_norm": 0.361328125, + "learning_rate": 0.0010970832843472298, + "loss": 0.0957, + "step": 48599 + }, + { + "epoch": 0.4218713379224139, + "grad_norm": 0.34765625, + "learning_rate": 0.001097053522048406, + "loss": 0.0918, + "step": 48600 + }, + { + "epoch": 0.421880018402618, + "grad_norm": 0.1552734375, + "learning_rate": 0.0010970237597528044, + "loss": 0.0835, + "step": 48601 + }, + { + "epoch": 0.4218886988828222, + 
"grad_norm": 0.7421875, + "learning_rate": 0.0010969939974604576, + "loss": 0.127, + "step": 48602 + }, + { + "epoch": 0.42189737936302635, + "grad_norm": 0.470703125, + "learning_rate": 0.0010969642351713982, + "loss": 0.064, + "step": 48603 + }, + { + "epoch": 0.42190605984323054, + "grad_norm": 0.435546875, + "learning_rate": 0.001096934472885659, + "loss": 0.1035, + "step": 48604 + }, + { + "epoch": 0.4219147403234347, + "grad_norm": 0.53125, + "learning_rate": 0.0010969047106032719, + "loss": 0.0796, + "step": 48605 + }, + { + "epoch": 0.42192342080363887, + "grad_norm": 0.296875, + "learning_rate": 0.0010968749483242697, + "loss": 0.0986, + "step": 48606 + }, + { + "epoch": 0.421932101283843, + "grad_norm": 0.12060546875, + "learning_rate": 0.0010968451860486853, + "loss": 0.0767, + "step": 48607 + }, + { + "epoch": 0.4219407817640472, + "grad_norm": 0.1796875, + "learning_rate": 0.0010968154237765509, + "loss": 0.0874, + "step": 48608 + }, + { + "epoch": 0.42194946224425134, + "grad_norm": 0.2236328125, + "learning_rate": 0.0010967856615078989, + "loss": 0.127, + "step": 48609 + }, + { + "epoch": 0.42195814272445553, + "grad_norm": 0.150390625, + "learning_rate": 0.0010967558992427617, + "loss": 0.1035, + "step": 48610 + }, + { + "epoch": 0.42196682320465967, + "grad_norm": 0.16796875, + "learning_rate": 0.0010967261369811728, + "loss": 0.1094, + "step": 48611 + }, + { + "epoch": 0.42197550368486386, + "grad_norm": 0.384765625, + "learning_rate": 0.001096696374723164, + "loss": 0.0859, + "step": 48612 + }, + { + "epoch": 0.421984184165068, + "grad_norm": 0.1337890625, + "learning_rate": 0.0010966666124687678, + "loss": 0.0923, + "step": 48613 + }, + { + "epoch": 0.4219928646452722, + "grad_norm": 0.8515625, + "learning_rate": 0.0010966368502180173, + "loss": 0.1074, + "step": 48614 + }, + { + "epoch": 0.42200154512547633, + "grad_norm": 0.5234375, + "learning_rate": 0.0010966070879709444, + "loss": 0.1484, + "step": 48615 + }, + { + "epoch": 0.4220102256056805, + "grad_norm": 0.10400390625, + "learning_rate": 0.0010965773257275821, + "loss": 0.0977, + "step": 48616 + }, + { + "epoch": 0.42201890608588466, + "grad_norm": 0.2001953125, + "learning_rate": 0.0010965475634879627, + "loss": 0.1133, + "step": 48617 + }, + { + "epoch": 0.42202758656608885, + "grad_norm": 0.169921875, + "learning_rate": 0.0010965178012521187, + "loss": 0.0938, + "step": 48618 + }, + { + "epoch": 0.422036267046293, + "grad_norm": 0.09716796875, + "learning_rate": 0.0010964880390200828, + "loss": 0.0767, + "step": 48619 + }, + { + "epoch": 0.4220449475264972, + "grad_norm": 0.64453125, + "learning_rate": 0.0010964582767918878, + "loss": 0.082, + "step": 48620 + }, + { + "epoch": 0.4220536280067013, + "grad_norm": 0.412109375, + "learning_rate": 0.0010964285145675658, + "loss": 0.0703, + "step": 48621 + }, + { + "epoch": 0.4220623084869055, + "grad_norm": 0.083984375, + "learning_rate": 0.0010963987523471494, + "loss": 0.0889, + "step": 48622 + }, + { + "epoch": 0.42207098896710965, + "grad_norm": 0.470703125, + "learning_rate": 0.0010963689901306716, + "loss": 0.0815, + "step": 48623 + }, + { + "epoch": 0.42207966944731384, + "grad_norm": 0.18359375, + "learning_rate": 0.0010963392279181644, + "loss": 0.0781, + "step": 48624 + }, + { + "epoch": 0.422088349927518, + "grad_norm": 0.3515625, + "learning_rate": 0.0010963094657096607, + "loss": 0.124, + "step": 48625 + }, + { + "epoch": 0.4220970304077222, + "grad_norm": 0.373046875, + "learning_rate": 0.0010962797035051928, + "loss": 0.0649, + "step": 48626 + }, + 
{ + "epoch": 0.4221057108879263, + "grad_norm": 0.126953125, + "learning_rate": 0.001096249941304793, + "loss": 0.0933, + "step": 48627 + }, + { + "epoch": 0.4221143913681305, + "grad_norm": 0.10498046875, + "learning_rate": 0.0010962201791084947, + "loss": 0.0957, + "step": 48628 + }, + { + "epoch": 0.42212307184833464, + "grad_norm": 0.212890625, + "learning_rate": 0.00109619041691633, + "loss": 0.0996, + "step": 48629 + }, + { + "epoch": 0.42213175232853883, + "grad_norm": 0.146484375, + "learning_rate": 0.0010961606547283311, + "loss": 0.0732, + "step": 48630 + }, + { + "epoch": 0.42214043280874297, + "grad_norm": 0.203125, + "learning_rate": 0.001096130892544531, + "loss": 0.1191, + "step": 48631 + }, + { + "epoch": 0.42214911328894716, + "grad_norm": 0.08935546875, + "learning_rate": 0.0010961011303649622, + "loss": 0.0811, + "step": 48632 + }, + { + "epoch": 0.4221577937691513, + "grad_norm": 0.2158203125, + "learning_rate": 0.001096071368189657, + "loss": 0.1211, + "step": 48633 + }, + { + "epoch": 0.4221664742493555, + "grad_norm": 0.515625, + "learning_rate": 0.0010960416060186483, + "loss": 0.0903, + "step": 48634 + }, + { + "epoch": 0.42217515472955963, + "grad_norm": 0.10009765625, + "learning_rate": 0.001096011843851968, + "loss": 0.1406, + "step": 48635 + }, + { + "epoch": 0.4221838352097638, + "grad_norm": 0.1650390625, + "learning_rate": 0.0010959820816896493, + "loss": 0.1289, + "step": 48636 + }, + { + "epoch": 0.42219251568996796, + "grad_norm": 0.263671875, + "learning_rate": 0.0010959523195317247, + "loss": 0.0986, + "step": 48637 + }, + { + "epoch": 0.42220119617017215, + "grad_norm": 0.09521484375, + "learning_rate": 0.0010959225573782265, + "loss": 0.0791, + "step": 48638 + }, + { + "epoch": 0.4222098766503763, + "grad_norm": 0.408203125, + "learning_rate": 0.0010958927952291872, + "loss": 0.0996, + "step": 48639 + }, + { + "epoch": 0.4222185571305805, + "grad_norm": 0.396484375, + "learning_rate": 0.0010958630330846395, + "loss": 0.0986, + "step": 48640 + }, + { + "epoch": 0.4222272376107846, + "grad_norm": 0.2734375, + "learning_rate": 0.0010958332709446162, + "loss": 0.1001, + "step": 48641 + }, + { + "epoch": 0.4222359180909888, + "grad_norm": 0.2216796875, + "learning_rate": 0.0010958035088091493, + "loss": 0.1387, + "step": 48642 + }, + { + "epoch": 0.42224459857119295, + "grad_norm": 0.26171875, + "learning_rate": 0.0010957737466782717, + "loss": 0.1064, + "step": 48643 + }, + { + "epoch": 0.42225327905139715, + "grad_norm": 0.376953125, + "learning_rate": 0.0010957439845520156, + "loss": 0.1465, + "step": 48644 + }, + { + "epoch": 0.4222619595316013, + "grad_norm": 0.1943359375, + "learning_rate": 0.0010957142224304142, + "loss": 0.0986, + "step": 48645 + }, + { + "epoch": 0.4222706400118055, + "grad_norm": 0.1259765625, + "learning_rate": 0.0010956844603134998, + "loss": 0.0791, + "step": 48646 + }, + { + "epoch": 0.4222793204920096, + "grad_norm": 0.734375, + "learning_rate": 0.0010956546982013044, + "loss": 0.1309, + "step": 48647 + }, + { + "epoch": 0.4222880009722138, + "grad_norm": 0.46484375, + "learning_rate": 0.0010956249360938612, + "loss": 0.0977, + "step": 48648 + }, + { + "epoch": 0.42229668145241794, + "grad_norm": 0.5078125, + "learning_rate": 0.0010955951739912025, + "loss": 0.0977, + "step": 48649 + }, + { + "epoch": 0.42230536193262214, + "grad_norm": 0.208984375, + "learning_rate": 0.0010955654118933607, + "loss": 0.1289, + "step": 48650 + }, + { + "epoch": 0.4223140424128263, + "grad_norm": 0.443359375, + "learning_rate": 
0.0010955356498003686, + "loss": 0.0742, + "step": 48651 + }, + { + "epoch": 0.42232272289303047, + "grad_norm": 0.494140625, + "learning_rate": 0.0010955058877122586, + "loss": 0.1172, + "step": 48652 + }, + { + "epoch": 0.4223314033732346, + "grad_norm": 0.2734375, + "learning_rate": 0.0010954761256290629, + "loss": 0.1289, + "step": 48653 + }, + { + "epoch": 0.4223400838534388, + "grad_norm": 0.240234375, + "learning_rate": 0.001095446363550815, + "loss": 0.083, + "step": 48654 + }, + { + "epoch": 0.42234876433364293, + "grad_norm": 0.1201171875, + "learning_rate": 0.001095416601477547, + "loss": 0.1035, + "step": 48655 + }, + { + "epoch": 0.4223574448138471, + "grad_norm": 0.060546875, + "learning_rate": 0.001095386839409291, + "loss": 0.0625, + "step": 48656 + }, + { + "epoch": 0.42236612529405126, + "grad_norm": 0.3984375, + "learning_rate": 0.00109535707734608, + "loss": 0.085, + "step": 48657 + }, + { + "epoch": 0.42237480577425546, + "grad_norm": 0.412109375, + "learning_rate": 0.0010953273152879462, + "loss": 0.1011, + "step": 48658 + }, + { + "epoch": 0.4223834862544596, + "grad_norm": 0.490234375, + "learning_rate": 0.0010952975532349226, + "loss": 0.0605, + "step": 48659 + }, + { + "epoch": 0.4223921667346638, + "grad_norm": 0.181640625, + "learning_rate": 0.0010952677911870415, + "loss": 0.1396, + "step": 48660 + }, + { + "epoch": 0.4224008472148679, + "grad_norm": 0.4375, + "learning_rate": 0.0010952380291443353, + "loss": 0.1289, + "step": 48661 + }, + { + "epoch": 0.4224095276950721, + "grad_norm": 0.2177734375, + "learning_rate": 0.0010952082671068367, + "loss": 0.1211, + "step": 48662 + }, + { + "epoch": 0.42241820817527626, + "grad_norm": 0.0888671875, + "learning_rate": 0.0010951785050745783, + "loss": 0.084, + "step": 48663 + }, + { + "epoch": 0.42242688865548045, + "grad_norm": 0.9921875, + "learning_rate": 0.001095148743047593, + "loss": 0.0913, + "step": 48664 + }, + { + "epoch": 0.4224355691356846, + "grad_norm": 0.08251953125, + "learning_rate": 0.0010951189810259128, + "loss": 0.1108, + "step": 48665 + }, + { + "epoch": 0.4224442496158888, + "grad_norm": 0.345703125, + "learning_rate": 0.0010950892190095705, + "loss": 0.0884, + "step": 48666 + }, + { + "epoch": 0.4224529300960929, + "grad_norm": 0.451171875, + "learning_rate": 0.0010950594569985983, + "loss": 0.1162, + "step": 48667 + }, + { + "epoch": 0.42246161057629705, + "grad_norm": 0.62890625, + "learning_rate": 0.001095029694993029, + "loss": 0.0752, + "step": 48668 + }, + { + "epoch": 0.42247029105650125, + "grad_norm": 0.32421875, + "learning_rate": 0.0010949999329928953, + "loss": 0.1182, + "step": 48669 + }, + { + "epoch": 0.4224789715367054, + "grad_norm": 0.1474609375, + "learning_rate": 0.0010949701709982295, + "loss": 0.1211, + "step": 48670 + }, + { + "epoch": 0.4224876520169096, + "grad_norm": 0.73828125, + "learning_rate": 0.001094940409009064, + "loss": 0.1348, + "step": 48671 + }, + { + "epoch": 0.4224963324971137, + "grad_norm": 0.341796875, + "learning_rate": 0.0010949106470254319, + "loss": 0.125, + "step": 48672 + }, + { + "epoch": 0.4225050129773179, + "grad_norm": 0.1259765625, + "learning_rate": 0.0010948808850473656, + "loss": 0.085, + "step": 48673 + }, + { + "epoch": 0.42251369345752204, + "grad_norm": 0.1201171875, + "learning_rate": 0.0010948511230748975, + "loss": 0.1191, + "step": 48674 + }, + { + "epoch": 0.42252237393772624, + "grad_norm": 0.1884765625, + "learning_rate": 0.0010948213611080598, + "loss": 0.085, + "step": 48675 + }, + { + "epoch": 0.4225310544179304, + 
"grad_norm": 0.34375, + "learning_rate": 0.0010947915991468854, + "loss": 0.1279, + "step": 48676 + }, + { + "epoch": 0.42253973489813457, + "grad_norm": 0.095703125, + "learning_rate": 0.0010947618371914072, + "loss": 0.1138, + "step": 48677 + }, + { + "epoch": 0.4225484153783387, + "grad_norm": 0.208984375, + "learning_rate": 0.001094732075241657, + "loss": 0.0928, + "step": 48678 + }, + { + "epoch": 0.4225570958585429, + "grad_norm": 0.279296875, + "learning_rate": 0.0010947023132976677, + "loss": 0.0874, + "step": 48679 + }, + { + "epoch": 0.42256577633874703, + "grad_norm": 0.578125, + "learning_rate": 0.001094672551359472, + "loss": 0.1318, + "step": 48680 + }, + { + "epoch": 0.4225744568189512, + "grad_norm": 0.205078125, + "learning_rate": 0.0010946427894271023, + "loss": 0.0923, + "step": 48681 + }, + { + "epoch": 0.42258313729915536, + "grad_norm": 0.59375, + "learning_rate": 0.0010946130275005911, + "loss": 0.0986, + "step": 48682 + }, + { + "epoch": 0.42259181777935956, + "grad_norm": 0.63671875, + "learning_rate": 0.0010945832655799715, + "loss": 0.1816, + "step": 48683 + }, + { + "epoch": 0.4226004982595637, + "grad_norm": 0.134765625, + "learning_rate": 0.001094553503665275, + "loss": 0.0972, + "step": 48684 + }, + { + "epoch": 0.4226091787397679, + "grad_norm": 0.1728515625, + "learning_rate": 0.0010945237417565351, + "loss": 0.1064, + "step": 48685 + }, + { + "epoch": 0.422617859219972, + "grad_norm": 0.314453125, + "learning_rate": 0.0010944939798537836, + "loss": 0.1318, + "step": 48686 + }, + { + "epoch": 0.4226265397001762, + "grad_norm": 0.255859375, + "learning_rate": 0.0010944642179570533, + "loss": 0.1064, + "step": 48687 + }, + { + "epoch": 0.42263522018038036, + "grad_norm": 1.0078125, + "learning_rate": 0.001094434456066377, + "loss": 0.123, + "step": 48688 + }, + { + "epoch": 0.42264390066058455, + "grad_norm": 0.36328125, + "learning_rate": 0.0010944046941817873, + "loss": 0.1113, + "step": 48689 + }, + { + "epoch": 0.4226525811407887, + "grad_norm": 0.376953125, + "learning_rate": 0.0010943749323033163, + "loss": 0.1191, + "step": 48690 + }, + { + "epoch": 0.4226612616209929, + "grad_norm": 0.1533203125, + "learning_rate": 0.001094345170430997, + "loss": 0.0674, + "step": 48691 + }, + { + "epoch": 0.422669942101197, + "grad_norm": 0.35546875, + "learning_rate": 0.0010943154085648616, + "loss": 0.1484, + "step": 48692 + }, + { + "epoch": 0.4226786225814012, + "grad_norm": 0.154296875, + "learning_rate": 0.0010942856467049428, + "loss": 0.0752, + "step": 48693 + }, + { + "epoch": 0.42268730306160535, + "grad_norm": 0.2041015625, + "learning_rate": 0.0010942558848512733, + "loss": 0.1069, + "step": 48694 + }, + { + "epoch": 0.42269598354180954, + "grad_norm": 0.158203125, + "learning_rate": 0.0010942261230038855, + "loss": 0.0938, + "step": 48695 + }, + { + "epoch": 0.4227046640220137, + "grad_norm": 0.4609375, + "learning_rate": 0.0010941963611628114, + "loss": 0.0659, + "step": 48696 + }, + { + "epoch": 0.42271334450221787, + "grad_norm": 0.08447265625, + "learning_rate": 0.0010941665993280845, + "loss": 0.1006, + "step": 48697 + }, + { + "epoch": 0.422722024982422, + "grad_norm": 0.162109375, + "learning_rate": 0.001094136837499737, + "loss": 0.1055, + "step": 48698 + }, + { + "epoch": 0.4227307054626262, + "grad_norm": 0.294921875, + "learning_rate": 0.001094107075677801, + "loss": 0.104, + "step": 48699 + }, + { + "epoch": 0.42273938594283034, + "grad_norm": 0.1494140625, + "learning_rate": 0.0010940773138623098, + "loss": 0.0933, + "step": 48700 + }, + { + 
"epoch": 0.42274806642303453, + "grad_norm": 0.6171875, + "learning_rate": 0.0010940475520532953, + "loss": 0.1113, + "step": 48701 + }, + { + "epoch": 0.42275674690323867, + "grad_norm": 0.546875, + "learning_rate": 0.0010940177902507907, + "loss": 0.0654, + "step": 48702 + }, + { + "epoch": 0.42276542738344286, + "grad_norm": 0.306640625, + "learning_rate": 0.0010939880284548279, + "loss": 0.083, + "step": 48703 + }, + { + "epoch": 0.422774107863647, + "grad_norm": 0.5234375, + "learning_rate": 0.0010939582666654394, + "loss": 0.083, + "step": 48704 + }, + { + "epoch": 0.4227827883438512, + "grad_norm": 0.255859375, + "learning_rate": 0.0010939285048826581, + "loss": 0.1367, + "step": 48705 + }, + { + "epoch": 0.4227914688240553, + "grad_norm": 0.09765625, + "learning_rate": 0.001093898743106517, + "loss": 0.1426, + "step": 48706 + }, + { + "epoch": 0.4228001493042595, + "grad_norm": 0.203125, + "learning_rate": 0.0010938689813370477, + "loss": 0.0879, + "step": 48707 + }, + { + "epoch": 0.42280882978446366, + "grad_norm": 0.1611328125, + "learning_rate": 0.0010938392195742833, + "loss": 0.1055, + "step": 48708 + }, + { + "epoch": 0.42281751026466785, + "grad_norm": 0.150390625, + "learning_rate": 0.0010938094578182562, + "loss": 0.1289, + "step": 48709 + }, + { + "epoch": 0.422826190744872, + "grad_norm": 0.1923828125, + "learning_rate": 0.0010937796960689991, + "loss": 0.1211, + "step": 48710 + }, + { + "epoch": 0.4228348712250762, + "grad_norm": 0.36328125, + "learning_rate": 0.0010937499343265445, + "loss": 0.0986, + "step": 48711 + }, + { + "epoch": 0.4228435517052803, + "grad_norm": 0.1767578125, + "learning_rate": 0.001093720172590925, + "loss": 0.1025, + "step": 48712 + }, + { + "epoch": 0.4228522321854845, + "grad_norm": 0.1416015625, + "learning_rate": 0.0010936904108621724, + "loss": 0.0879, + "step": 48713 + }, + { + "epoch": 0.42286091266568865, + "grad_norm": 0.283203125, + "learning_rate": 0.0010936606491403203, + "loss": 0.0977, + "step": 48714 + }, + { + "epoch": 0.42286959314589284, + "grad_norm": 0.2373046875, + "learning_rate": 0.001093630887425401, + "loss": 0.0996, + "step": 48715 + }, + { + "epoch": 0.422878273626097, + "grad_norm": 0.392578125, + "learning_rate": 0.0010936011257174465, + "loss": 0.0654, + "step": 48716 + }, + { + "epoch": 0.42288695410630117, + "grad_norm": 0.125, + "learning_rate": 0.00109357136401649, + "loss": 0.1074, + "step": 48717 + }, + { + "epoch": 0.4228956345865053, + "grad_norm": 0.09228515625, + "learning_rate": 0.0010935416023225633, + "loss": 0.1328, + "step": 48718 + }, + { + "epoch": 0.4229043150667095, + "grad_norm": 0.447265625, + "learning_rate": 0.0010935118406356999, + "loss": 0.1021, + "step": 48719 + }, + { + "epoch": 0.42291299554691364, + "grad_norm": 0.1318359375, + "learning_rate": 0.0010934820789559316, + "loss": 0.0854, + "step": 48720 + }, + { + "epoch": 0.42292167602711783, + "grad_norm": 0.390625, + "learning_rate": 0.0010934523172832916, + "loss": 0.1045, + "step": 48721 + }, + { + "epoch": 0.42293035650732197, + "grad_norm": 0.2119140625, + "learning_rate": 0.0010934225556178112, + "loss": 0.1934, + "step": 48722 + }, + { + "epoch": 0.42293903698752616, + "grad_norm": 0.2353515625, + "learning_rate": 0.0010933927939595245, + "loss": 0.1084, + "step": 48723 + }, + { + "epoch": 0.4229477174677303, + "grad_norm": 0.515625, + "learning_rate": 0.0010933630323084632, + "loss": 0.1187, + "step": 48724 + }, + { + "epoch": 0.4229563979479345, + "grad_norm": 0.10693359375, + "learning_rate": 0.0010933332706646599, + "loss": 
0.1055, + "step": 48725 + }, + { + "epoch": 0.42296507842813863, + "grad_norm": 0.134765625, + "learning_rate": 0.0010933035090281474, + "loss": 0.0835, + "step": 48726 + }, + { + "epoch": 0.4229737589083428, + "grad_norm": 0.31640625, + "learning_rate": 0.001093273747398958, + "loss": 0.1035, + "step": 48727 + }, + { + "epoch": 0.42298243938854696, + "grad_norm": 0.10302734375, + "learning_rate": 0.0010932439857771242, + "loss": 0.0942, + "step": 48728 + }, + { + "epoch": 0.42299111986875115, + "grad_norm": 0.263671875, + "learning_rate": 0.0010932142241626788, + "loss": 0.0654, + "step": 48729 + }, + { + "epoch": 0.4229998003489553, + "grad_norm": 0.2080078125, + "learning_rate": 0.0010931844625556542, + "loss": 0.084, + "step": 48730 + }, + { + "epoch": 0.4230084808291595, + "grad_norm": 0.431640625, + "learning_rate": 0.0010931547009560827, + "loss": 0.1021, + "step": 48731 + }, + { + "epoch": 0.4230171613093636, + "grad_norm": 0.2890625, + "learning_rate": 0.0010931249393639974, + "loss": 0.1025, + "step": 48732 + }, + { + "epoch": 0.4230258417895678, + "grad_norm": 0.212890625, + "learning_rate": 0.0010930951777794306, + "loss": 0.0996, + "step": 48733 + }, + { + "epoch": 0.42303452226977195, + "grad_norm": 0.1533203125, + "learning_rate": 0.001093065416202415, + "loss": 0.064, + "step": 48734 + }, + { + "epoch": 0.42304320274997614, + "grad_norm": 0.1162109375, + "learning_rate": 0.0010930356546329828, + "loss": 0.1104, + "step": 48735 + }, + { + "epoch": 0.4230518832301803, + "grad_norm": 0.482421875, + "learning_rate": 0.0010930058930711665, + "loss": 0.1055, + "step": 48736 + }, + { + "epoch": 0.4230605637103845, + "grad_norm": 0.37890625, + "learning_rate": 0.0010929761315169987, + "loss": 0.1064, + "step": 48737 + }, + { + "epoch": 0.4230692441905886, + "grad_norm": 0.2275390625, + "learning_rate": 0.0010929463699705124, + "loss": 0.1406, + "step": 48738 + }, + { + "epoch": 0.4230779246707928, + "grad_norm": 0.1787109375, + "learning_rate": 0.0010929166084317397, + "loss": 0.1064, + "step": 48739 + }, + { + "epoch": 0.42308660515099694, + "grad_norm": 0.67578125, + "learning_rate": 0.0010928868469007133, + "loss": 0.1045, + "step": 48740 + }, + { + "epoch": 0.42309528563120113, + "grad_norm": 0.19140625, + "learning_rate": 0.001092857085377466, + "loss": 0.1191, + "step": 48741 + }, + { + "epoch": 0.42310396611140527, + "grad_norm": 0.10400390625, + "learning_rate": 0.0010928273238620299, + "loss": 0.1187, + "step": 48742 + }, + { + "epoch": 0.42311264659160946, + "grad_norm": 0.376953125, + "learning_rate": 0.001092797562354438, + "loss": 0.1118, + "step": 48743 + }, + { + "epoch": 0.4231213270718136, + "grad_norm": 0.4296875, + "learning_rate": 0.001092767800854722, + "loss": 0.1201, + "step": 48744 + }, + { + "epoch": 0.4231300075520178, + "grad_norm": 1.1015625, + "learning_rate": 0.0010927380393629155, + "loss": 0.0835, + "step": 48745 + }, + { + "epoch": 0.42313868803222193, + "grad_norm": 0.28515625, + "learning_rate": 0.0010927082778790505, + "loss": 0.0879, + "step": 48746 + }, + { + "epoch": 0.4231473685124261, + "grad_norm": 0.326171875, + "learning_rate": 0.0010926785164031595, + "loss": 0.127, + "step": 48747 + }, + { + "epoch": 0.42315604899263026, + "grad_norm": 0.134765625, + "learning_rate": 0.001092648754935275, + "loss": 0.0967, + "step": 48748 + }, + { + "epoch": 0.42316472947283446, + "grad_norm": 0.1171875, + "learning_rate": 0.0010926189934754298, + "loss": 0.0874, + "step": 48749 + }, + { + "epoch": 0.4231734099530386, + "grad_norm": 0.140625, + 
"learning_rate": 0.0010925892320236565, + "loss": 0.051, + "step": 48750 + }, + { + "epoch": 0.4231820904332428, + "grad_norm": 0.11181640625, + "learning_rate": 0.0010925594705799876, + "loss": 0.0986, + "step": 48751 + }, + { + "epoch": 0.4231907709134469, + "grad_norm": 0.380859375, + "learning_rate": 0.0010925297091444555, + "loss": 0.0957, + "step": 48752 + }, + { + "epoch": 0.4231994513936511, + "grad_norm": 0.1650390625, + "learning_rate": 0.001092499947717093, + "loss": 0.1035, + "step": 48753 + }, + { + "epoch": 0.42320813187385525, + "grad_norm": 0.52734375, + "learning_rate": 0.001092470186297932, + "loss": 0.1094, + "step": 48754 + }, + { + "epoch": 0.42321681235405945, + "grad_norm": 0.0859375, + "learning_rate": 0.0010924404248870054, + "loss": 0.1084, + "step": 48755 + }, + { + "epoch": 0.4232254928342636, + "grad_norm": 0.1875, + "learning_rate": 0.0010924106634843463, + "loss": 0.1035, + "step": 48756 + }, + { + "epoch": 0.4232341733144678, + "grad_norm": 0.4296875, + "learning_rate": 0.0010923809020899863, + "loss": 0.0806, + "step": 48757 + }, + { + "epoch": 0.4232428537946719, + "grad_norm": 0.267578125, + "learning_rate": 0.0010923511407039588, + "loss": 0.082, + "step": 48758 + }, + { + "epoch": 0.4232515342748761, + "grad_norm": 0.1904296875, + "learning_rate": 0.001092321379326296, + "loss": 0.0767, + "step": 48759 + }, + { + "epoch": 0.42326021475508024, + "grad_norm": 0.2138671875, + "learning_rate": 0.0010922916179570304, + "loss": 0.0898, + "step": 48760 + }, + { + "epoch": 0.42326889523528444, + "grad_norm": 0.388671875, + "learning_rate": 0.0010922618565961946, + "loss": 0.0957, + "step": 48761 + }, + { + "epoch": 0.4232775757154886, + "grad_norm": 0.291015625, + "learning_rate": 0.001092232095243821, + "loss": 0.1377, + "step": 48762 + }, + { + "epoch": 0.42328625619569277, + "grad_norm": 0.2578125, + "learning_rate": 0.0010922023338999422, + "loss": 0.106, + "step": 48763 + }, + { + "epoch": 0.4232949366758969, + "grad_norm": 0.83203125, + "learning_rate": 0.0010921725725645908, + "loss": 0.082, + "step": 48764 + }, + { + "epoch": 0.4233036171561011, + "grad_norm": 0.298828125, + "learning_rate": 0.0010921428112377993, + "loss": 0.0957, + "step": 48765 + }, + { + "epoch": 0.42331229763630523, + "grad_norm": 0.431640625, + "learning_rate": 0.0010921130499196006, + "loss": 0.1074, + "step": 48766 + }, + { + "epoch": 0.4233209781165094, + "grad_norm": 0.306640625, + "learning_rate": 0.0010920832886100265, + "loss": 0.1108, + "step": 48767 + }, + { + "epoch": 0.42332965859671357, + "grad_norm": 0.08642578125, + "learning_rate": 0.0010920535273091104, + "loss": 0.0859, + "step": 48768 + }, + { + "epoch": 0.42333833907691776, + "grad_norm": 0.09765625, + "learning_rate": 0.0010920237660168844, + "loss": 0.0771, + "step": 48769 + }, + { + "epoch": 0.4233470195571219, + "grad_norm": 0.0908203125, + "learning_rate": 0.001091994004733381, + "loss": 0.1055, + "step": 48770 + }, + { + "epoch": 0.4233557000373261, + "grad_norm": 0.07275390625, + "learning_rate": 0.001091964243458633, + "loss": 0.0898, + "step": 48771 + }, + { + "epoch": 0.4233643805175302, + "grad_norm": 0.2158203125, + "learning_rate": 0.0010919344821926726, + "loss": 0.1279, + "step": 48772 + }, + { + "epoch": 0.4233730609977344, + "grad_norm": 0.1298828125, + "learning_rate": 0.0010919047209355323, + "loss": 0.0835, + "step": 48773 + }, + { + "epoch": 0.42338174147793856, + "grad_norm": 0.302734375, + "learning_rate": 0.001091874959687245, + "loss": 0.125, + "step": 48774 + }, + { + "epoch": 
0.42339042195814275, + "grad_norm": 0.08349609375, + "learning_rate": 0.0010918451984478434, + "loss": 0.1099, + "step": 48775 + }, + { + "epoch": 0.4233991024383469, + "grad_norm": 0.8515625, + "learning_rate": 0.0010918154372173596, + "loss": 0.1191, + "step": 48776 + }, + { + "epoch": 0.4234077829185511, + "grad_norm": 0.1376953125, + "learning_rate": 0.0010917856759958264, + "loss": 0.0874, + "step": 48777 + }, + { + "epoch": 0.4234164633987552, + "grad_norm": 0.2236328125, + "learning_rate": 0.0010917559147832763, + "loss": 0.083, + "step": 48778 + }, + { + "epoch": 0.4234251438789594, + "grad_norm": 0.09521484375, + "learning_rate": 0.001091726153579742, + "loss": 0.0977, + "step": 48779 + }, + { + "epoch": 0.42343382435916355, + "grad_norm": 0.25390625, + "learning_rate": 0.0010916963923852554, + "loss": 0.0693, + "step": 48780 + }, + { + "epoch": 0.42344250483936774, + "grad_norm": 0.201171875, + "learning_rate": 0.0010916666311998498, + "loss": 0.0996, + "step": 48781 + }, + { + "epoch": 0.4234511853195719, + "grad_norm": 0.2333984375, + "learning_rate": 0.0010916368700235569, + "loss": 0.0933, + "step": 48782 + }, + { + "epoch": 0.42345986579977607, + "grad_norm": 0.119140625, + "learning_rate": 0.0010916071088564103, + "loss": 0.1064, + "step": 48783 + }, + { + "epoch": 0.4234685462799802, + "grad_norm": 0.423828125, + "learning_rate": 0.001091577347698442, + "loss": 0.124, + "step": 48784 + }, + { + "epoch": 0.4234772267601844, + "grad_norm": 0.2080078125, + "learning_rate": 0.0010915475865496845, + "loss": 0.1211, + "step": 48785 + }, + { + "epoch": 0.42348590724038854, + "grad_norm": 0.67578125, + "learning_rate": 0.0010915178254101705, + "loss": 0.1045, + "step": 48786 + }, + { + "epoch": 0.42349458772059273, + "grad_norm": 0.12890625, + "learning_rate": 0.0010914880642799326, + "loss": 0.0884, + "step": 48787 + }, + { + "epoch": 0.42350326820079687, + "grad_norm": 0.1123046875, + "learning_rate": 0.001091458303159003, + "loss": 0.0903, + "step": 48788 + }, + { + "epoch": 0.42351194868100106, + "grad_norm": 0.0712890625, + "learning_rate": 0.0010914285420474145, + "loss": 0.1157, + "step": 48789 + }, + { + "epoch": 0.4235206291612052, + "grad_norm": 0.19140625, + "learning_rate": 0.0010913987809451997, + "loss": 0.0942, + "step": 48790 + }, + { + "epoch": 0.4235293096414094, + "grad_norm": 0.1396484375, + "learning_rate": 0.0010913690198523908, + "loss": 0.1104, + "step": 48791 + }, + { + "epoch": 0.42353799012161353, + "grad_norm": 0.140625, + "learning_rate": 0.001091339258769021, + "loss": 0.1084, + "step": 48792 + }, + { + "epoch": 0.42354667060181767, + "grad_norm": 0.66796875, + "learning_rate": 0.0010913094976951223, + "loss": 0.1211, + "step": 48793 + }, + { + "epoch": 0.42355535108202186, + "grad_norm": 0.06884765625, + "learning_rate": 0.0010912797366307275, + "loss": 0.0918, + "step": 48794 + }, + { + "epoch": 0.423564031562226, + "grad_norm": 0.89453125, + "learning_rate": 0.0010912499755758687, + "loss": 0.0928, + "step": 48795 + }, + { + "epoch": 0.4235727120424302, + "grad_norm": 0.52734375, + "learning_rate": 0.0010912202145305792, + "loss": 0.0898, + "step": 48796 + }, + { + "epoch": 0.4235813925226343, + "grad_norm": 0.419921875, + "learning_rate": 0.0010911904534948909, + "loss": 0.1152, + "step": 48797 + }, + { + "epoch": 0.4235900730028385, + "grad_norm": 0.263671875, + "learning_rate": 0.001091160692468837, + "loss": 0.083, + "step": 48798 + }, + { + "epoch": 0.42359875348304266, + "grad_norm": 0.1943359375, + "learning_rate": 0.001091130931452449, + 
"loss": 0.0913, + "step": 48799 + }, + { + "epoch": 0.42360743396324685, + "grad_norm": 0.150390625, + "learning_rate": 0.0010911011704457603, + "loss": 0.0825, + "step": 48800 + }, + { + "epoch": 0.423616114443451, + "grad_norm": 0.53125, + "learning_rate": 0.0010910714094488033, + "loss": 0.1328, + "step": 48801 + }, + { + "epoch": 0.4236247949236552, + "grad_norm": 0.330078125, + "learning_rate": 0.0010910416484616106, + "loss": 0.1035, + "step": 48802 + }, + { + "epoch": 0.4236334754038593, + "grad_norm": 0.21484375, + "learning_rate": 0.0010910118874842146, + "loss": 0.1157, + "step": 48803 + }, + { + "epoch": 0.4236421558840635, + "grad_norm": 0.10693359375, + "learning_rate": 0.0010909821265166475, + "loss": 0.0918, + "step": 48804 + }, + { + "epoch": 0.42365083636426765, + "grad_norm": 0.259765625, + "learning_rate": 0.0010909523655589426, + "loss": 0.0947, + "step": 48805 + }, + { + "epoch": 0.42365951684447184, + "grad_norm": 0.125, + "learning_rate": 0.0010909226046111318, + "loss": 0.0698, + "step": 48806 + }, + { + "epoch": 0.423668197324676, + "grad_norm": 0.330078125, + "learning_rate": 0.0010908928436732482, + "loss": 0.0703, + "step": 48807 + }, + { + "epoch": 0.42367687780488017, + "grad_norm": 0.314453125, + "learning_rate": 0.0010908630827453237, + "loss": 0.0771, + "step": 48808 + }, + { + "epoch": 0.4236855582850843, + "grad_norm": 1.0, + "learning_rate": 0.0010908333218273914, + "loss": 0.1133, + "step": 48809 + }, + { + "epoch": 0.4236942387652885, + "grad_norm": 0.09765625, + "learning_rate": 0.0010908035609194836, + "loss": 0.0806, + "step": 48810 + }, + { + "epoch": 0.42370291924549264, + "grad_norm": 0.236328125, + "learning_rate": 0.0010907738000216329, + "loss": 0.1611, + "step": 48811 + }, + { + "epoch": 0.42371159972569683, + "grad_norm": 0.298828125, + "learning_rate": 0.0010907440391338722, + "loss": 0.0859, + "step": 48812 + }, + { + "epoch": 0.42372028020590097, + "grad_norm": 0.53515625, + "learning_rate": 0.001090714278256233, + "loss": 0.1367, + "step": 48813 + }, + { + "epoch": 0.42372896068610516, + "grad_norm": 0.52734375, + "learning_rate": 0.0010906845173887488, + "loss": 0.0874, + "step": 48814 + }, + { + "epoch": 0.4237376411663093, + "grad_norm": 0.173828125, + "learning_rate": 0.001090654756531452, + "loss": 0.1021, + "step": 48815 + }, + { + "epoch": 0.4237463216465135, + "grad_norm": 0.47265625, + "learning_rate": 0.0010906249956843751, + "loss": 0.0781, + "step": 48816 + }, + { + "epoch": 0.42375500212671763, + "grad_norm": 0.5859375, + "learning_rate": 0.00109059523484755, + "loss": 0.0796, + "step": 48817 + }, + { + "epoch": 0.4237636826069218, + "grad_norm": 0.29296875, + "learning_rate": 0.0010905654740210103, + "loss": 0.1079, + "step": 48818 + }, + { + "epoch": 0.42377236308712596, + "grad_norm": 0.271484375, + "learning_rate": 0.0010905357132047879, + "loss": 0.0781, + "step": 48819 + }, + { + "epoch": 0.42378104356733015, + "grad_norm": 0.0791015625, + "learning_rate": 0.0010905059523989159, + "loss": 0.1484, + "step": 48820 + }, + { + "epoch": 0.4237897240475343, + "grad_norm": 0.09423828125, + "learning_rate": 0.001090476191603426, + "loss": 0.0884, + "step": 48821 + }, + { + "epoch": 0.4237984045277385, + "grad_norm": 0.177734375, + "learning_rate": 0.0010904464308183514, + "loss": 0.0986, + "step": 48822 + }, + { + "epoch": 0.4238070850079426, + "grad_norm": 0.2451171875, + "learning_rate": 0.0010904166700437241, + "loss": 0.1328, + "step": 48823 + }, + { + "epoch": 0.4238157654881468, + "grad_norm": 0.46875, + "learning_rate": 
0.0010903869092795775, + "loss": 0.1206, + "step": 48824 + }, + { + "epoch": 0.42382444596835095, + "grad_norm": 0.404296875, + "learning_rate": 0.0010903571485259434, + "loss": 0.0762, + "step": 48825 + }, + { + "epoch": 0.42383312644855514, + "grad_norm": 0.3203125, + "learning_rate": 0.001090327387782854, + "loss": 0.0913, + "step": 48826 + }, + { + "epoch": 0.4238418069287593, + "grad_norm": 0.205078125, + "learning_rate": 0.0010902976270503431, + "loss": 0.0928, + "step": 48827 + }, + { + "epoch": 0.42385048740896347, + "grad_norm": 0.1767578125, + "learning_rate": 0.0010902678663284426, + "loss": 0.1045, + "step": 48828 + }, + { + "epoch": 0.4238591678891676, + "grad_norm": 0.09716796875, + "learning_rate": 0.0010902381056171848, + "loss": 0.0981, + "step": 48829 + }, + { + "epoch": 0.4238678483693718, + "grad_norm": 0.197265625, + "learning_rate": 0.0010902083449166027, + "loss": 0.0742, + "step": 48830 + }, + { + "epoch": 0.42387652884957594, + "grad_norm": 0.09521484375, + "learning_rate": 0.0010901785842267284, + "loss": 0.0737, + "step": 48831 + }, + { + "epoch": 0.42388520932978013, + "grad_norm": 1.1953125, + "learning_rate": 0.0010901488235475945, + "loss": 0.1777, + "step": 48832 + }, + { + "epoch": 0.42389388980998427, + "grad_norm": 0.10693359375, + "learning_rate": 0.001090119062879234, + "loss": 0.0801, + "step": 48833 + }, + { + "epoch": 0.42390257029018846, + "grad_norm": 0.1298828125, + "learning_rate": 0.001090089302221679, + "loss": 0.0923, + "step": 48834 + }, + { + "epoch": 0.4239112507703926, + "grad_norm": 0.11279296875, + "learning_rate": 0.001090059541574962, + "loss": 0.0786, + "step": 48835 + }, + { + "epoch": 0.4239199312505968, + "grad_norm": 0.1142578125, + "learning_rate": 0.001090029780939116, + "loss": 0.0815, + "step": 48836 + }, + { + "epoch": 0.42392861173080093, + "grad_norm": 0.10693359375, + "learning_rate": 0.0010900000203141733, + "loss": 0.1108, + "step": 48837 + }, + { + "epoch": 0.4239372922110051, + "grad_norm": 1.015625, + "learning_rate": 0.0010899702597001663, + "loss": 0.1523, + "step": 48838 + }, + { + "epoch": 0.42394597269120926, + "grad_norm": 0.27734375, + "learning_rate": 0.0010899404990971277, + "loss": 0.104, + "step": 48839 + }, + { + "epoch": 0.42395465317141345, + "grad_norm": 0.130859375, + "learning_rate": 0.0010899107385050903, + "loss": 0.0786, + "step": 48840 + }, + { + "epoch": 0.4239633336516176, + "grad_norm": 0.6953125, + "learning_rate": 0.001089880977924086, + "loss": 0.0947, + "step": 48841 + }, + { + "epoch": 0.4239720141318218, + "grad_norm": 0.314453125, + "learning_rate": 0.0010898512173541477, + "loss": 0.0708, + "step": 48842 + }, + { + "epoch": 0.4239806946120259, + "grad_norm": 0.291015625, + "learning_rate": 0.0010898214567953081, + "loss": 0.1074, + "step": 48843 + }, + { + "epoch": 0.4239893750922301, + "grad_norm": 0.2392578125, + "learning_rate": 0.0010897916962475994, + "loss": 0.1016, + "step": 48844 + }, + { + "epoch": 0.42399805557243425, + "grad_norm": 0.126953125, + "learning_rate": 0.0010897619357110547, + "loss": 0.1123, + "step": 48845 + }, + { + "epoch": 0.42400673605263844, + "grad_norm": 0.228515625, + "learning_rate": 0.001089732175185706, + "loss": 0.0815, + "step": 48846 + }, + { + "epoch": 0.4240154165328426, + "grad_norm": 0.1650390625, + "learning_rate": 0.0010897024146715862, + "loss": 0.1416, + "step": 48847 + }, + { + "epoch": 0.4240240970130468, + "grad_norm": 0.08447265625, + "learning_rate": 0.0010896726541687277, + "loss": 0.1045, + "step": 48848 + }, + { + "epoch": 
0.4240327774932509, + "grad_norm": 0.12890625, + "learning_rate": 0.0010896428936771627, + "loss": 0.1001, + "step": 48849 + }, + { + "epoch": 0.4240414579734551, + "grad_norm": 0.1025390625, + "learning_rate": 0.001089613133196924, + "loss": 0.0562, + "step": 48850 + }, + { + "epoch": 0.42405013845365924, + "grad_norm": 0.103515625, + "learning_rate": 0.0010895833727280447, + "loss": 0.0908, + "step": 48851 + }, + { + "epoch": 0.42405881893386344, + "grad_norm": 0.11572265625, + "learning_rate": 0.0010895536122705566, + "loss": 0.0894, + "step": 48852 + }, + { + "epoch": 0.4240674994140676, + "grad_norm": 0.490234375, + "learning_rate": 0.0010895238518244922, + "loss": 0.0898, + "step": 48853 + }, + { + "epoch": 0.42407617989427177, + "grad_norm": 0.12353515625, + "learning_rate": 0.0010894940913898847, + "loss": 0.0796, + "step": 48854 + }, + { + "epoch": 0.4240848603744759, + "grad_norm": 0.1435546875, + "learning_rate": 0.0010894643309667666, + "loss": 0.1201, + "step": 48855 + }, + { + "epoch": 0.4240935408546801, + "grad_norm": 0.1640625, + "learning_rate": 0.0010894345705551698, + "loss": 0.1318, + "step": 48856 + }, + { + "epoch": 0.42410222133488423, + "grad_norm": 0.0966796875, + "learning_rate": 0.0010894048101551272, + "loss": 0.1152, + "step": 48857 + }, + { + "epoch": 0.4241109018150884, + "grad_norm": 0.1044921875, + "learning_rate": 0.0010893750497666718, + "loss": 0.1191, + "step": 48858 + }, + { + "epoch": 0.42411958229529256, + "grad_norm": 0.11474609375, + "learning_rate": 0.0010893452893898353, + "loss": 0.1328, + "step": 48859 + }, + { + "epoch": 0.42412826277549676, + "grad_norm": 0.318359375, + "learning_rate": 0.0010893155290246502, + "loss": 0.0938, + "step": 48860 + }, + { + "epoch": 0.4241369432557009, + "grad_norm": 0.2255859375, + "learning_rate": 0.00108928576867115, + "loss": 0.0996, + "step": 48861 + }, + { + "epoch": 0.4241456237359051, + "grad_norm": 0.1494140625, + "learning_rate": 0.0010892560083293663, + "loss": 0.1377, + "step": 48862 + }, + { + "epoch": 0.4241543042161092, + "grad_norm": 0.2119140625, + "learning_rate": 0.0010892262479993324, + "loss": 0.1348, + "step": 48863 + }, + { + "epoch": 0.4241629846963134, + "grad_norm": 0.361328125, + "learning_rate": 0.0010891964876810808, + "loss": 0.1006, + "step": 48864 + }, + { + "epoch": 0.42417166517651755, + "grad_norm": 0.11572265625, + "learning_rate": 0.0010891667273746435, + "loss": 0.1025, + "step": 48865 + }, + { + "epoch": 0.42418034565672175, + "grad_norm": 0.1025390625, + "learning_rate": 0.0010891369670800533, + "loss": 0.0947, + "step": 48866 + }, + { + "epoch": 0.4241890261369259, + "grad_norm": 0.251953125, + "learning_rate": 0.0010891072067973427, + "loss": 0.0723, + "step": 48867 + }, + { + "epoch": 0.4241977066171301, + "grad_norm": 0.259765625, + "learning_rate": 0.0010890774465265442, + "loss": 0.084, + "step": 48868 + }, + { + "epoch": 0.4242063870973342, + "grad_norm": 0.439453125, + "learning_rate": 0.0010890476862676903, + "loss": 0.0928, + "step": 48869 + }, + { + "epoch": 0.4242150675775384, + "grad_norm": 0.1455078125, + "learning_rate": 0.0010890179260208141, + "loss": 0.1133, + "step": 48870 + }, + { + "epoch": 0.42422374805774254, + "grad_norm": 0.2041015625, + "learning_rate": 0.0010889881657859474, + "loss": 0.0825, + "step": 48871 + }, + { + "epoch": 0.42423242853794674, + "grad_norm": 0.86328125, + "learning_rate": 0.0010889584055631231, + "loss": 0.1025, + "step": 48872 + }, + { + "epoch": 0.4242411090181509, + "grad_norm": 0.478515625, + "learning_rate": 
0.001088928645352374, + "loss": 0.1523, + "step": 48873 + }, + { + "epoch": 0.42424978949835507, + "grad_norm": 0.12890625, + "learning_rate": 0.0010888988851537325, + "loss": 0.1074, + "step": 48874 + }, + { + "epoch": 0.4242584699785592, + "grad_norm": 0.6171875, + "learning_rate": 0.0010888691249672307, + "loss": 0.0898, + "step": 48875 + }, + { + "epoch": 0.4242671504587634, + "grad_norm": 0.11181640625, + "learning_rate": 0.0010888393647929015, + "loss": 0.0898, + "step": 48876 + }, + { + "epoch": 0.42427583093896754, + "grad_norm": 0.26953125, + "learning_rate": 0.0010888096046307771, + "loss": 0.0938, + "step": 48877 + }, + { + "epoch": 0.42428451141917173, + "grad_norm": 0.4375, + "learning_rate": 0.0010887798444808908, + "loss": 0.0967, + "step": 48878 + }, + { + "epoch": 0.42429319189937587, + "grad_norm": 0.466796875, + "learning_rate": 0.0010887500843432744, + "loss": 0.0977, + "step": 48879 + }, + { + "epoch": 0.42430187237958006, + "grad_norm": 0.49609375, + "learning_rate": 0.0010887203242179611, + "loss": 0.1387, + "step": 48880 + }, + { + "epoch": 0.4243105528597842, + "grad_norm": 1.28125, + "learning_rate": 0.0010886905641049828, + "loss": 0.1289, + "step": 48881 + }, + { + "epoch": 0.4243192333399884, + "grad_norm": 0.09912109375, + "learning_rate": 0.0010886608040043722, + "loss": 0.0879, + "step": 48882 + }, + { + "epoch": 0.4243279138201925, + "grad_norm": 0.296875, + "learning_rate": 0.0010886310439161624, + "loss": 0.1162, + "step": 48883 + }, + { + "epoch": 0.4243365943003967, + "grad_norm": 0.3046875, + "learning_rate": 0.0010886012838403855, + "loss": 0.1069, + "step": 48884 + }, + { + "epoch": 0.42434527478060086, + "grad_norm": 0.08203125, + "learning_rate": 0.0010885715237770738, + "loss": 0.0654, + "step": 48885 + }, + { + "epoch": 0.42435395526080505, + "grad_norm": 0.1865234375, + "learning_rate": 0.0010885417637262597, + "loss": 0.0996, + "step": 48886 + }, + { + "epoch": 0.4243626357410092, + "grad_norm": 0.16015625, + "learning_rate": 0.0010885120036879768, + "loss": 0.125, + "step": 48887 + }, + { + "epoch": 0.4243713162212134, + "grad_norm": 0.265625, + "learning_rate": 0.0010884822436622568, + "loss": 0.1016, + "step": 48888 + }, + { + "epoch": 0.4243799967014175, + "grad_norm": 0.19921875, + "learning_rate": 0.0010884524836491324, + "loss": 0.0986, + "step": 48889 + }, + { + "epoch": 0.4243886771816217, + "grad_norm": 0.205078125, + "learning_rate": 0.0010884227236486362, + "loss": 0.084, + "step": 48890 + }, + { + "epoch": 0.42439735766182585, + "grad_norm": 0.1015625, + "learning_rate": 0.001088392963660801, + "loss": 0.0977, + "step": 48891 + }, + { + "epoch": 0.42440603814203004, + "grad_norm": 0.482421875, + "learning_rate": 0.0010883632036856588, + "loss": 0.083, + "step": 48892 + }, + { + "epoch": 0.4244147186222342, + "grad_norm": 0.1396484375, + "learning_rate": 0.0010883334437232425, + "loss": 0.1035, + "step": 48893 + }, + { + "epoch": 0.42442339910243837, + "grad_norm": 0.58203125, + "learning_rate": 0.0010883036837735847, + "loss": 0.0908, + "step": 48894 + }, + { + "epoch": 0.4244320795826425, + "grad_norm": 0.89453125, + "learning_rate": 0.0010882739238367173, + "loss": 0.1221, + "step": 48895 + }, + { + "epoch": 0.4244407600628467, + "grad_norm": 0.51171875, + "learning_rate": 0.001088244163912674, + "loss": 0.1172, + "step": 48896 + }, + { + "epoch": 0.42444944054305084, + "grad_norm": 0.3203125, + "learning_rate": 0.0010882144040014862, + "loss": 0.1465, + "step": 48897 + }, + { + "epoch": 0.42445812102325503, + "grad_norm": 
0.314453125, + "learning_rate": 0.0010881846441031873, + "loss": 0.0869, + "step": 48898 + }, + { + "epoch": 0.42446680150345917, + "grad_norm": 0.1611328125, + "learning_rate": 0.0010881548842178095, + "loss": 0.1465, + "step": 48899 + }, + { + "epoch": 0.42447548198366336, + "grad_norm": 0.2890625, + "learning_rate": 0.0010881251243453852, + "loss": 0.082, + "step": 48900 + }, + { + "epoch": 0.4244841624638675, + "grad_norm": 0.5078125, + "learning_rate": 0.0010880953644859471, + "loss": 0.123, + "step": 48901 + }, + { + "epoch": 0.4244928429440717, + "grad_norm": 0.08837890625, + "learning_rate": 0.001088065604639528, + "loss": 0.085, + "step": 48902 + }, + { + "epoch": 0.42450152342427583, + "grad_norm": 0.294921875, + "learning_rate": 0.0010880358448061593, + "loss": 0.0859, + "step": 48903 + }, + { + "epoch": 0.42451020390448, + "grad_norm": 0.158203125, + "learning_rate": 0.0010880060849858752, + "loss": 0.1318, + "step": 48904 + }, + { + "epoch": 0.42451888438468416, + "grad_norm": 0.380859375, + "learning_rate": 0.0010879763251787074, + "loss": 0.0898, + "step": 48905 + }, + { + "epoch": 0.42452756486488835, + "grad_norm": 0.1337890625, + "learning_rate": 0.0010879465653846885, + "loss": 0.0918, + "step": 48906 + }, + { + "epoch": 0.4245362453450925, + "grad_norm": 0.267578125, + "learning_rate": 0.001087916805603851, + "loss": 0.0771, + "step": 48907 + }, + { + "epoch": 0.4245449258252967, + "grad_norm": 0.1748046875, + "learning_rate": 0.0010878870458362274, + "loss": 0.051, + "step": 48908 + }, + { + "epoch": 0.4245536063055008, + "grad_norm": 0.26953125, + "learning_rate": 0.0010878572860818505, + "loss": 0.0688, + "step": 48909 + }, + { + "epoch": 0.424562286785705, + "grad_norm": 0.359375, + "learning_rate": 0.0010878275263407526, + "loss": 0.1396, + "step": 48910 + }, + { + "epoch": 0.42457096726590915, + "grad_norm": 0.326171875, + "learning_rate": 0.0010877977666129664, + "loss": 0.0977, + "step": 48911 + }, + { + "epoch": 0.42457964774611334, + "grad_norm": 0.75390625, + "learning_rate": 0.001087768006898524, + "loss": 0.1172, + "step": 48912 + }, + { + "epoch": 0.4245883282263175, + "grad_norm": 0.09619140625, + "learning_rate": 0.0010877382471974587, + "loss": 0.0977, + "step": 48913 + }, + { + "epoch": 0.4245970087065217, + "grad_norm": 0.455078125, + "learning_rate": 0.0010877084875098027, + "loss": 0.1201, + "step": 48914 + }, + { + "epoch": 0.4246056891867258, + "grad_norm": 0.11181640625, + "learning_rate": 0.0010876787278355884, + "loss": 0.1128, + "step": 48915 + }, + { + "epoch": 0.42461436966692995, + "grad_norm": 0.326171875, + "learning_rate": 0.0010876489681748485, + "loss": 0.1177, + "step": 48916 + }, + { + "epoch": 0.42462305014713414, + "grad_norm": 0.1552734375, + "learning_rate": 0.0010876192085276157, + "loss": 0.1055, + "step": 48917 + }, + { + "epoch": 0.4246317306273383, + "grad_norm": 0.32421875, + "learning_rate": 0.001087589448893922, + "loss": 0.0874, + "step": 48918 + }, + { + "epoch": 0.42464041110754247, + "grad_norm": 0.208984375, + "learning_rate": 0.0010875596892738005, + "loss": 0.127, + "step": 48919 + }, + { + "epoch": 0.4246490915877466, + "grad_norm": 0.69921875, + "learning_rate": 0.0010875299296672833, + "loss": 0.0913, + "step": 48920 + }, + { + "epoch": 0.4246577720679508, + "grad_norm": 0.609375, + "learning_rate": 0.001087500170074403, + "loss": 0.1182, + "step": 48921 + }, + { + "epoch": 0.42466645254815494, + "grad_norm": 0.52734375, + "learning_rate": 0.0010874704104951928, + "loss": 0.0894, + "step": 48922 + }, + { + 
"epoch": 0.42467513302835913, + "grad_norm": 0.244140625, + "learning_rate": 0.0010874406509296848, + "loss": 0.0879, + "step": 48923 + }, + { + "epoch": 0.42468381350856327, + "grad_norm": 0.2138671875, + "learning_rate": 0.0010874108913779114, + "loss": 0.0928, + "step": 48924 + }, + { + "epoch": 0.42469249398876746, + "grad_norm": 0.75, + "learning_rate": 0.0010873811318399054, + "loss": 0.1582, + "step": 48925 + }, + { + "epoch": 0.4247011744689716, + "grad_norm": 0.455078125, + "learning_rate": 0.001087351372315699, + "loss": 0.1167, + "step": 48926 + }, + { + "epoch": 0.4247098549491758, + "grad_norm": 0.099609375, + "learning_rate": 0.0010873216128053247, + "loss": 0.1035, + "step": 48927 + }, + { + "epoch": 0.42471853542937993, + "grad_norm": 0.1494140625, + "learning_rate": 0.0010872918533088155, + "loss": 0.1069, + "step": 48928 + }, + { + "epoch": 0.4247272159095841, + "grad_norm": 0.11669921875, + "learning_rate": 0.0010872620938262038, + "loss": 0.0972, + "step": 48929 + }, + { + "epoch": 0.42473589638978826, + "grad_norm": 0.330078125, + "learning_rate": 0.0010872323343575222, + "loss": 0.0894, + "step": 48930 + }, + { + "epoch": 0.42474457686999245, + "grad_norm": 0.1962890625, + "learning_rate": 0.0010872025749028029, + "loss": 0.1377, + "step": 48931 + }, + { + "epoch": 0.4247532573501966, + "grad_norm": 0.09423828125, + "learning_rate": 0.001087172815462079, + "loss": 0.0894, + "step": 48932 + }, + { + "epoch": 0.4247619378304008, + "grad_norm": 0.091796875, + "learning_rate": 0.0010871430560353825, + "loss": 0.0913, + "step": 48933 + }, + { + "epoch": 0.4247706183106049, + "grad_norm": 0.2265625, + "learning_rate": 0.0010871132966227461, + "loss": 0.1011, + "step": 48934 + }, + { + "epoch": 0.4247792987908091, + "grad_norm": 0.1650390625, + "learning_rate": 0.0010870835372242028, + "loss": 0.0869, + "step": 48935 + }, + { + "epoch": 0.42478797927101325, + "grad_norm": 0.412109375, + "learning_rate": 0.0010870537778397845, + "loss": 0.0908, + "step": 48936 + }, + { + "epoch": 0.42479665975121744, + "grad_norm": 1.2109375, + "learning_rate": 0.0010870240184695235, + "loss": 0.1387, + "step": 48937 + }, + { + "epoch": 0.4248053402314216, + "grad_norm": 0.212890625, + "learning_rate": 0.0010869942591134531, + "loss": 0.1226, + "step": 48938 + }, + { + "epoch": 0.4248140207116258, + "grad_norm": 0.55078125, + "learning_rate": 0.001086964499771606, + "loss": 0.1021, + "step": 48939 + }, + { + "epoch": 0.4248227011918299, + "grad_norm": 0.375, + "learning_rate": 0.0010869347404440137, + "loss": 0.1016, + "step": 48940 + }, + { + "epoch": 0.4248313816720341, + "grad_norm": 0.84765625, + "learning_rate": 0.00108690498113071, + "loss": 0.1445, + "step": 48941 + }, + { + "epoch": 0.42484006215223824, + "grad_norm": 0.41796875, + "learning_rate": 0.0010868752218317263, + "loss": 0.0942, + "step": 48942 + }, + { + "epoch": 0.42484874263244243, + "grad_norm": 0.2470703125, + "learning_rate": 0.0010868454625470962, + "loss": 0.1182, + "step": 48943 + }, + { + "epoch": 0.42485742311264657, + "grad_norm": 0.11328125, + "learning_rate": 0.0010868157032768516, + "loss": 0.0967, + "step": 48944 + }, + { + "epoch": 0.42486610359285076, + "grad_norm": 0.1630859375, + "learning_rate": 0.0010867859440210248, + "loss": 0.084, + "step": 48945 + }, + { + "epoch": 0.4248747840730549, + "grad_norm": 0.51953125, + "learning_rate": 0.0010867561847796488, + "loss": 0.1011, + "step": 48946 + }, + { + "epoch": 0.4248834645532591, + "grad_norm": 0.1494140625, + "learning_rate": 0.0010867264255527561, + 
"loss": 0.082, + "step": 48947 + }, + { + "epoch": 0.42489214503346323, + "grad_norm": 0.44140625, + "learning_rate": 0.001086696666340379, + "loss": 0.1211, + "step": 48948 + }, + { + "epoch": 0.4249008255136674, + "grad_norm": 0.1552734375, + "learning_rate": 0.0010866669071425503, + "loss": 0.0991, + "step": 48949 + }, + { + "epoch": 0.42490950599387156, + "grad_norm": 0.1015625, + "learning_rate": 0.0010866371479593027, + "loss": 0.1211, + "step": 48950 + }, + { + "epoch": 0.42491818647407575, + "grad_norm": 0.33203125, + "learning_rate": 0.0010866073887906683, + "loss": 0.0957, + "step": 48951 + }, + { + "epoch": 0.4249268669542799, + "grad_norm": 0.2333984375, + "learning_rate": 0.00108657762963668, + "loss": 0.0889, + "step": 48952 + }, + { + "epoch": 0.4249355474344841, + "grad_norm": 0.201171875, + "learning_rate": 0.0010865478704973704, + "loss": 0.0522, + "step": 48953 + }, + { + "epoch": 0.4249442279146882, + "grad_norm": 0.208984375, + "learning_rate": 0.0010865181113727715, + "loss": 0.0908, + "step": 48954 + }, + { + "epoch": 0.4249529083948924, + "grad_norm": 2.046875, + "learning_rate": 0.0010864883522629161, + "loss": 0.209, + "step": 48955 + }, + { + "epoch": 0.42496158887509655, + "grad_norm": 0.640625, + "learning_rate": 0.0010864585931678371, + "loss": 0.1167, + "step": 48956 + }, + { + "epoch": 0.42497026935530074, + "grad_norm": 0.1240234375, + "learning_rate": 0.0010864288340875668, + "loss": 0.0669, + "step": 48957 + }, + { + "epoch": 0.4249789498355049, + "grad_norm": 0.11767578125, + "learning_rate": 0.0010863990750221372, + "loss": 0.1572, + "step": 48958 + }, + { + "epoch": 0.4249876303157091, + "grad_norm": 0.71484375, + "learning_rate": 0.0010863693159715822, + "loss": 0.1016, + "step": 48959 + }, + { + "epoch": 0.4249963107959132, + "grad_norm": 0.1142578125, + "learning_rate": 0.0010863395569359329, + "loss": 0.105, + "step": 48960 + }, + { + "epoch": 0.4250049912761174, + "grad_norm": 0.5703125, + "learning_rate": 0.0010863097979152228, + "loss": 0.1543, + "step": 48961 + }, + { + "epoch": 0.42501367175632154, + "grad_norm": 0.1259765625, + "learning_rate": 0.001086280038909484, + "loss": 0.125, + "step": 48962 + }, + { + "epoch": 0.42502235223652574, + "grad_norm": 0.1142578125, + "learning_rate": 0.0010862502799187492, + "loss": 0.083, + "step": 48963 + }, + { + "epoch": 0.4250310327167299, + "grad_norm": 0.29296875, + "learning_rate": 0.0010862205209430504, + "loss": 0.1196, + "step": 48964 + }, + { + "epoch": 0.42503971319693407, + "grad_norm": 0.1845703125, + "learning_rate": 0.001086190761982421, + "loss": 0.1221, + "step": 48965 + }, + { + "epoch": 0.4250483936771382, + "grad_norm": 0.09912109375, + "learning_rate": 0.0010861610030368934, + "loss": 0.1006, + "step": 48966 + }, + { + "epoch": 0.4250570741573424, + "grad_norm": 0.400390625, + "learning_rate": 0.0010861312441064995, + "loss": 0.0874, + "step": 48967 + }, + { + "epoch": 0.42506575463754653, + "grad_norm": 0.10400390625, + "learning_rate": 0.0010861014851912725, + "loss": 0.0752, + "step": 48968 + }, + { + "epoch": 0.4250744351177507, + "grad_norm": 0.93359375, + "learning_rate": 0.0010860717262912447, + "loss": 0.0972, + "step": 48969 + }, + { + "epoch": 0.42508311559795486, + "grad_norm": 0.447265625, + "learning_rate": 0.0010860419674064488, + "loss": 0.123, + "step": 48970 + }, + { + "epoch": 0.42509179607815906, + "grad_norm": 0.111328125, + "learning_rate": 0.001086012208536917, + "loss": 0.1025, + "step": 48971 + }, + { + "epoch": 0.4251004765583632, + "grad_norm": 0.3515625, + 
"learning_rate": 0.0010859824496826822, + "loss": 0.0771, + "step": 48972 + }, + { + "epoch": 0.4251091570385674, + "grad_norm": 0.27734375, + "learning_rate": 0.0010859526908437763, + "loss": 0.1025, + "step": 48973 + }, + { + "epoch": 0.4251178375187715, + "grad_norm": 0.2578125, + "learning_rate": 0.001085922932020233, + "loss": 0.0591, + "step": 48974 + }, + { + "epoch": 0.4251265179989757, + "grad_norm": 0.08447265625, + "learning_rate": 0.0010858931732120837, + "loss": 0.0791, + "step": 48975 + }, + { + "epoch": 0.42513519847917985, + "grad_norm": 0.150390625, + "learning_rate": 0.0010858634144193615, + "loss": 0.0957, + "step": 48976 + }, + { + "epoch": 0.42514387895938405, + "grad_norm": 0.10986328125, + "learning_rate": 0.001085833655642099, + "loss": 0.1396, + "step": 48977 + }, + { + "epoch": 0.4251525594395882, + "grad_norm": 0.15234375, + "learning_rate": 0.0010858038968803285, + "loss": 0.0933, + "step": 48978 + }, + { + "epoch": 0.4251612399197924, + "grad_norm": 0.2158203125, + "learning_rate": 0.0010857741381340826, + "loss": 0.1006, + "step": 48979 + }, + { + "epoch": 0.4251699203999965, + "grad_norm": 0.44140625, + "learning_rate": 0.0010857443794033942, + "loss": 0.0928, + "step": 48980 + }, + { + "epoch": 0.4251786008802007, + "grad_norm": 0.1337890625, + "learning_rate": 0.0010857146206882948, + "loss": 0.1289, + "step": 48981 + }, + { + "epoch": 0.42518728136040485, + "grad_norm": 0.12890625, + "learning_rate": 0.0010856848619888182, + "loss": 0.0747, + "step": 48982 + }, + { + "epoch": 0.42519596184060904, + "grad_norm": 0.1923828125, + "learning_rate": 0.0010856551033049963, + "loss": 0.1758, + "step": 48983 + }, + { + "epoch": 0.4252046423208132, + "grad_norm": 0.53515625, + "learning_rate": 0.001085625344636862, + "loss": 0.1465, + "step": 48984 + }, + { + "epoch": 0.42521332280101737, + "grad_norm": 0.5078125, + "learning_rate": 0.0010855955859844476, + "loss": 0.1113, + "step": 48985 + }, + { + "epoch": 0.4252220032812215, + "grad_norm": 0.1904296875, + "learning_rate": 0.0010855658273477853, + "loss": 0.0801, + "step": 48986 + }, + { + "epoch": 0.4252306837614257, + "grad_norm": 0.08056640625, + "learning_rate": 0.0010855360687269083, + "loss": 0.082, + "step": 48987 + }, + { + "epoch": 0.42523936424162984, + "grad_norm": 0.1630859375, + "learning_rate": 0.0010855063101218487, + "loss": 0.0908, + "step": 48988 + }, + { + "epoch": 0.42524804472183403, + "grad_norm": 0.484375, + "learning_rate": 0.0010854765515326394, + "loss": 0.0928, + "step": 48989 + }, + { + "epoch": 0.42525672520203817, + "grad_norm": 0.10205078125, + "learning_rate": 0.001085446792959312, + "loss": 0.0908, + "step": 48990 + }, + { + "epoch": 0.42526540568224236, + "grad_norm": 0.0654296875, + "learning_rate": 0.0010854170344019006, + "loss": 0.0923, + "step": 48991 + }, + { + "epoch": 0.4252740861624465, + "grad_norm": 0.19921875, + "learning_rate": 0.0010853872758604366, + "loss": 0.0972, + "step": 48992 + }, + { + "epoch": 0.4252827666426507, + "grad_norm": 0.1484375, + "learning_rate": 0.001085357517334953, + "loss": 0.1221, + "step": 48993 + }, + { + "epoch": 0.4252914471228548, + "grad_norm": 0.3515625, + "learning_rate": 0.0010853277588254821, + "loss": 0.1016, + "step": 48994 + }, + { + "epoch": 0.425300127603059, + "grad_norm": 0.14453125, + "learning_rate": 0.0010852980003320564, + "loss": 0.0869, + "step": 48995 + }, + { + "epoch": 0.42530880808326316, + "grad_norm": 0.87890625, + "learning_rate": 0.0010852682418547087, + "loss": 0.1943, + "step": 48996 + }, + { + "epoch": 
0.42531748856346735, + "grad_norm": 0.359375, + "learning_rate": 0.0010852384833934714, + "loss": 0.1172, + "step": 48997 + }, + { + "epoch": 0.4253261690436715, + "grad_norm": 0.1083984375, + "learning_rate": 0.0010852087249483768, + "loss": 0.0908, + "step": 48998 + }, + { + "epoch": 0.4253348495238757, + "grad_norm": 0.10595703125, + "learning_rate": 0.0010851789665194582, + "loss": 0.082, + "step": 48999 + }, + { + "epoch": 0.4253435300040798, + "grad_norm": 0.333984375, + "learning_rate": 0.0010851492081067476, + "loss": 0.0986, + "step": 49000 + }, + { + "epoch": 0.425352210484284, + "grad_norm": 0.251953125, + "learning_rate": 0.0010851194497102773, + "loss": 0.0986, + "step": 49001 + }, + { + "epoch": 0.42536089096448815, + "grad_norm": 0.55078125, + "learning_rate": 0.0010850896913300804, + "loss": 0.1045, + "step": 49002 + }, + { + "epoch": 0.42536957144469234, + "grad_norm": 0.2734375, + "learning_rate": 0.0010850599329661893, + "loss": 0.0996, + "step": 49003 + }, + { + "epoch": 0.4253782519248965, + "grad_norm": 0.6015625, + "learning_rate": 0.0010850301746186365, + "loss": 0.1309, + "step": 49004 + }, + { + "epoch": 0.42538693240510067, + "grad_norm": 0.3125, + "learning_rate": 0.0010850004162874538, + "loss": 0.0869, + "step": 49005 + }, + { + "epoch": 0.4253956128853048, + "grad_norm": 0.32421875, + "learning_rate": 0.001084970657972675, + "loss": 0.1357, + "step": 49006 + }, + { + "epoch": 0.425404293365509, + "grad_norm": 0.169921875, + "learning_rate": 0.0010849408996743317, + "loss": 0.062, + "step": 49007 + }, + { + "epoch": 0.42541297384571314, + "grad_norm": 0.12353515625, + "learning_rate": 0.001084911141392457, + "loss": 0.0864, + "step": 49008 + }, + { + "epoch": 0.42542165432591733, + "grad_norm": 0.65625, + "learning_rate": 0.0010848813831270832, + "loss": 0.103, + "step": 49009 + }, + { + "epoch": 0.42543033480612147, + "grad_norm": 0.08740234375, + "learning_rate": 0.001084851624878243, + "loss": 0.0762, + "step": 49010 + }, + { + "epoch": 0.42543901528632566, + "grad_norm": 0.4609375, + "learning_rate": 0.001084821866645969, + "loss": 0.0938, + "step": 49011 + }, + { + "epoch": 0.4254476957665298, + "grad_norm": 0.1689453125, + "learning_rate": 0.0010847921084302934, + "loss": 0.0815, + "step": 49012 + }, + { + "epoch": 0.425456376246734, + "grad_norm": 0.3046875, + "learning_rate": 0.001084762350231249, + "loss": 0.0752, + "step": 49013 + }, + { + "epoch": 0.42546505672693813, + "grad_norm": 0.287109375, + "learning_rate": 0.001084732592048868, + "loss": 0.0923, + "step": 49014 + }, + { + "epoch": 0.4254737372071423, + "grad_norm": 0.095703125, + "learning_rate": 0.0010847028338831835, + "loss": 0.0947, + "step": 49015 + }, + { + "epoch": 0.42548241768734646, + "grad_norm": 0.240234375, + "learning_rate": 0.0010846730757342275, + "loss": 0.0898, + "step": 49016 + }, + { + "epoch": 0.42549109816755065, + "grad_norm": 0.65625, + "learning_rate": 0.0010846433176020327, + "loss": 0.0996, + "step": 49017 + }, + { + "epoch": 0.4254997786477548, + "grad_norm": 0.1533203125, + "learning_rate": 0.001084613559486632, + "loss": 0.1465, + "step": 49018 + }, + { + "epoch": 0.425508459127959, + "grad_norm": 0.306640625, + "learning_rate": 0.0010845838013880578, + "loss": 0.0898, + "step": 49019 + }, + { + "epoch": 0.4255171396081631, + "grad_norm": 0.29296875, + "learning_rate": 0.0010845540433063424, + "loss": 0.1113, + "step": 49020 + }, + { + "epoch": 0.4255258200883673, + "grad_norm": 0.41796875, + "learning_rate": 0.0010845242852415185, + "loss": 0.0898, + "step": 
49021 + }, + { + "epoch": 0.42553450056857145, + "grad_norm": 0.09814453125, + "learning_rate": 0.0010844945271936185, + "loss": 0.0981, + "step": 49022 + }, + { + "epoch": 0.42554318104877564, + "grad_norm": 0.2470703125, + "learning_rate": 0.0010844647691626748, + "loss": 0.0845, + "step": 49023 + }, + { + "epoch": 0.4255518615289798, + "grad_norm": 0.10205078125, + "learning_rate": 0.0010844350111487205, + "loss": 0.0854, + "step": 49024 + }, + { + "epoch": 0.425560542009184, + "grad_norm": 0.203125, + "learning_rate": 0.0010844052531517878, + "loss": 0.1064, + "step": 49025 + }, + { + "epoch": 0.4255692224893881, + "grad_norm": 0.2265625, + "learning_rate": 0.0010843754951719092, + "loss": 0.0859, + "step": 49026 + }, + { + "epoch": 0.4255779029695923, + "grad_norm": 0.20703125, + "learning_rate": 0.0010843457372091176, + "loss": 0.1025, + "step": 49027 + }, + { + "epoch": 0.42558658344979644, + "grad_norm": 0.291015625, + "learning_rate": 0.001084315979263445, + "loss": 0.124, + "step": 49028 + }, + { + "epoch": 0.42559526393000063, + "grad_norm": 0.185546875, + "learning_rate": 0.0010842862213349242, + "loss": 0.1123, + "step": 49029 + }, + { + "epoch": 0.42560394441020477, + "grad_norm": 0.1787109375, + "learning_rate": 0.0010842564634235882, + "loss": 0.0781, + "step": 49030 + }, + { + "epoch": 0.42561262489040896, + "grad_norm": 0.10595703125, + "learning_rate": 0.0010842267055294686, + "loss": 0.0767, + "step": 49031 + }, + { + "epoch": 0.4256213053706131, + "grad_norm": 0.23828125, + "learning_rate": 0.0010841969476525983, + "loss": 0.1177, + "step": 49032 + }, + { + "epoch": 0.4256299858508173, + "grad_norm": 0.6015625, + "learning_rate": 0.0010841671897930103, + "loss": 0.0967, + "step": 49033 + }, + { + "epoch": 0.42563866633102143, + "grad_norm": 0.3203125, + "learning_rate": 0.0010841374319507369, + "loss": 0.0898, + "step": 49034 + }, + { + "epoch": 0.4256473468112256, + "grad_norm": 0.10107421875, + "learning_rate": 0.0010841076741258102, + "loss": 0.1084, + "step": 49035 + }, + { + "epoch": 0.42565602729142976, + "grad_norm": 0.138671875, + "learning_rate": 0.0010840779163182634, + "loss": 0.084, + "step": 49036 + }, + { + "epoch": 0.42566470777163395, + "grad_norm": 0.392578125, + "learning_rate": 0.0010840481585281288, + "loss": 0.1079, + "step": 49037 + }, + { + "epoch": 0.4256733882518381, + "grad_norm": 0.10009765625, + "learning_rate": 0.001084018400755439, + "loss": 0.1182, + "step": 49038 + }, + { + "epoch": 0.42568206873204223, + "grad_norm": 0.11474609375, + "learning_rate": 0.0010839886430002261, + "loss": 0.0986, + "step": 49039 + }, + { + "epoch": 0.4256907492122464, + "grad_norm": 2.453125, + "learning_rate": 0.0010839588852625233, + "loss": 0.2812, + "step": 49040 + }, + { + "epoch": 0.42569942969245056, + "grad_norm": 0.1181640625, + "learning_rate": 0.0010839291275423627, + "loss": 0.0986, + "step": 49041 + }, + { + "epoch": 0.42570811017265475, + "grad_norm": 0.2412109375, + "learning_rate": 0.0010838993698397766, + "loss": 0.1396, + "step": 49042 + }, + { + "epoch": 0.4257167906528589, + "grad_norm": 0.154296875, + "learning_rate": 0.001083869612154798, + "loss": 0.0908, + "step": 49043 + }, + { + "epoch": 0.4257254711330631, + "grad_norm": 0.2197265625, + "learning_rate": 0.0010838398544874596, + "loss": 0.1484, + "step": 49044 + }, + { + "epoch": 0.4257341516132672, + "grad_norm": 0.07763671875, + "learning_rate": 0.0010838100968377938, + "loss": 0.0781, + "step": 49045 + }, + { + "epoch": 0.4257428320934714, + "grad_norm": 0.484375, + 
"learning_rate": 0.0010837803392058328, + "loss": 0.1006, + "step": 49046 + }, + { + "epoch": 0.42575151257367555, + "grad_norm": 0.244140625, + "learning_rate": 0.0010837505815916096, + "loss": 0.1016, + "step": 49047 + }, + { + "epoch": 0.42576019305387974, + "grad_norm": 0.3203125, + "learning_rate": 0.0010837208239951564, + "loss": 0.0918, + "step": 49048 + }, + { + "epoch": 0.4257688735340839, + "grad_norm": 0.1953125, + "learning_rate": 0.0010836910664165059, + "loss": 0.0591, + "step": 49049 + }, + { + "epoch": 0.4257775540142881, + "grad_norm": 0.86328125, + "learning_rate": 0.0010836613088556902, + "loss": 0.1133, + "step": 49050 + }, + { + "epoch": 0.4257862344944922, + "grad_norm": 0.08154296875, + "learning_rate": 0.0010836315513127427, + "loss": 0.0781, + "step": 49051 + }, + { + "epoch": 0.4257949149746964, + "grad_norm": 0.294921875, + "learning_rate": 0.0010836017937876953, + "loss": 0.082, + "step": 49052 + }, + { + "epoch": 0.42580359545490054, + "grad_norm": 0.259765625, + "learning_rate": 0.001083572036280581, + "loss": 0.0996, + "step": 49053 + }, + { + "epoch": 0.42581227593510473, + "grad_norm": 0.1943359375, + "learning_rate": 0.0010835422787914318, + "loss": 0.0898, + "step": 49054 + }, + { + "epoch": 0.42582095641530887, + "grad_norm": 0.15625, + "learning_rate": 0.0010835125213202808, + "loss": 0.1211, + "step": 49055 + }, + { + "epoch": 0.42582963689551306, + "grad_norm": 0.67578125, + "learning_rate": 0.00108348276386716, + "loss": 0.0811, + "step": 49056 + }, + { + "epoch": 0.4258383173757172, + "grad_norm": 0.294921875, + "learning_rate": 0.0010834530064321023, + "loss": 0.1123, + "step": 49057 + }, + { + "epoch": 0.4258469978559214, + "grad_norm": 0.2470703125, + "learning_rate": 0.0010834232490151403, + "loss": 0.062, + "step": 49058 + }, + { + "epoch": 0.42585567833612553, + "grad_norm": 0.302734375, + "learning_rate": 0.0010833934916163058, + "loss": 0.1445, + "step": 49059 + }, + { + "epoch": 0.4258643588163297, + "grad_norm": 0.07080078125, + "learning_rate": 0.0010833637342356323, + "loss": 0.0742, + "step": 49060 + }, + { + "epoch": 0.42587303929653386, + "grad_norm": 0.2177734375, + "learning_rate": 0.0010833339768731522, + "loss": 0.1289, + "step": 49061 + }, + { + "epoch": 0.42588171977673805, + "grad_norm": 0.1416015625, + "learning_rate": 0.0010833042195288977, + "loss": 0.1436, + "step": 49062 + }, + { + "epoch": 0.4258904002569422, + "grad_norm": 0.359375, + "learning_rate": 0.0010832744622029011, + "loss": 0.1084, + "step": 49063 + }, + { + "epoch": 0.4258990807371464, + "grad_norm": 0.33984375, + "learning_rate": 0.0010832447048951957, + "loss": 0.1035, + "step": 49064 + }, + { + "epoch": 0.4259077612173505, + "grad_norm": 0.181640625, + "learning_rate": 0.0010832149476058136, + "loss": 0.0981, + "step": 49065 + }, + { + "epoch": 0.4259164416975547, + "grad_norm": 0.154296875, + "learning_rate": 0.0010831851903347874, + "loss": 0.0996, + "step": 49066 + }, + { + "epoch": 0.42592512217775885, + "grad_norm": 0.115234375, + "learning_rate": 0.0010831554330821493, + "loss": 0.1055, + "step": 49067 + }, + { + "epoch": 0.42593380265796305, + "grad_norm": 0.2451171875, + "learning_rate": 0.0010831256758479322, + "loss": 0.1099, + "step": 49068 + }, + { + "epoch": 0.4259424831381672, + "grad_norm": 0.173828125, + "learning_rate": 0.0010830959186321689, + "loss": 0.0933, + "step": 49069 + }, + { + "epoch": 0.4259511636183714, + "grad_norm": 0.890625, + "learning_rate": 0.0010830661614348916, + "loss": 0.3242, + "step": 49070 + }, + { + "epoch": 
0.4259598440985755, + "grad_norm": 0.1865234375, + "learning_rate": 0.0010830364042561328, + "loss": 0.0801, + "step": 49071 + }, + { + "epoch": 0.4259685245787797, + "grad_norm": 0.197265625, + "learning_rate": 0.001083006647095925, + "loss": 0.0918, + "step": 49072 + }, + { + "epoch": 0.42597720505898384, + "grad_norm": 0.404296875, + "learning_rate": 0.001082976889954301, + "loss": 0.0874, + "step": 49073 + }, + { + "epoch": 0.42598588553918804, + "grad_norm": 0.2294921875, + "learning_rate": 0.0010829471328312934, + "loss": 0.0791, + "step": 49074 + }, + { + "epoch": 0.4259945660193922, + "grad_norm": 0.55078125, + "learning_rate": 0.0010829173757269347, + "loss": 0.0972, + "step": 49075 + }, + { + "epoch": 0.42600324649959637, + "grad_norm": 0.216796875, + "learning_rate": 0.0010828876186412563, + "loss": 0.1025, + "step": 49076 + }, + { + "epoch": 0.4260119269798005, + "grad_norm": 0.173828125, + "learning_rate": 0.0010828578615742926, + "loss": 0.0918, + "step": 49077 + }, + { + "epoch": 0.4260206074600047, + "grad_norm": 0.130859375, + "learning_rate": 0.0010828281045260752, + "loss": 0.0645, + "step": 49078 + }, + { + "epoch": 0.42602928794020883, + "grad_norm": 0.2021484375, + "learning_rate": 0.0010827983474966368, + "loss": 0.1973, + "step": 49079 + }, + { + "epoch": 0.426037968420413, + "grad_norm": 0.236328125, + "learning_rate": 0.00108276859048601, + "loss": 0.0879, + "step": 49080 + }, + { + "epoch": 0.42604664890061716, + "grad_norm": 0.07080078125, + "learning_rate": 0.0010827388334942267, + "loss": 0.0723, + "step": 49081 + }, + { + "epoch": 0.42605532938082136, + "grad_norm": 0.1474609375, + "learning_rate": 0.0010827090765213202, + "loss": 0.0938, + "step": 49082 + }, + { + "epoch": 0.4260640098610255, + "grad_norm": 0.2333984375, + "learning_rate": 0.0010826793195673227, + "loss": 0.1011, + "step": 49083 + }, + { + "epoch": 0.4260726903412297, + "grad_norm": 0.171875, + "learning_rate": 0.001082649562632267, + "loss": 0.1055, + "step": 49084 + }, + { + "epoch": 0.4260813708214338, + "grad_norm": 0.09033203125, + "learning_rate": 0.0010826198057161851, + "loss": 0.0771, + "step": 49085 + }, + { + "epoch": 0.426090051301638, + "grad_norm": 0.5, + "learning_rate": 0.0010825900488191102, + "loss": 0.1172, + "step": 49086 + }, + { + "epoch": 0.42609873178184215, + "grad_norm": 0.16796875, + "learning_rate": 0.0010825602919410746, + "loss": 0.1011, + "step": 49087 + }, + { + "epoch": 0.42610741226204635, + "grad_norm": 0.2373046875, + "learning_rate": 0.0010825305350821112, + "loss": 0.1196, + "step": 49088 + }, + { + "epoch": 0.4261160927422505, + "grad_norm": 0.51953125, + "learning_rate": 0.0010825007782422516, + "loss": 0.1182, + "step": 49089 + }, + { + "epoch": 0.4261247732224547, + "grad_norm": 0.1572265625, + "learning_rate": 0.001082471021421529, + "loss": 0.1113, + "step": 49090 + }, + { + "epoch": 0.4261334537026588, + "grad_norm": 0.1259765625, + "learning_rate": 0.0010824412646199756, + "loss": 0.1328, + "step": 49091 + }, + { + "epoch": 0.426142134182863, + "grad_norm": 0.8125, + "learning_rate": 0.0010824115078376247, + "loss": 0.0889, + "step": 49092 + }, + { + "epoch": 0.42615081466306715, + "grad_norm": 0.46875, + "learning_rate": 0.001082381751074508, + "loss": 0.0928, + "step": 49093 + }, + { + "epoch": 0.42615949514327134, + "grad_norm": 0.296875, + "learning_rate": 0.001082351994330658, + "loss": 0.0996, + "step": 49094 + }, + { + "epoch": 0.4261681756234755, + "grad_norm": 0.10009765625, + "learning_rate": 0.0010823222376061081, + "loss": 0.0825, + 
"step": 49095 + }, + { + "epoch": 0.42617685610367967, + "grad_norm": 0.921875, + "learning_rate": 0.0010822924809008902, + "loss": 0.1387, + "step": 49096 + }, + { + "epoch": 0.4261855365838838, + "grad_norm": 0.0869140625, + "learning_rate": 0.0010822627242150372, + "loss": 0.0815, + "step": 49097 + }, + { + "epoch": 0.426194217064088, + "grad_norm": 0.54296875, + "learning_rate": 0.0010822329675485812, + "loss": 0.1025, + "step": 49098 + }, + { + "epoch": 0.42620289754429214, + "grad_norm": 0.376953125, + "learning_rate": 0.001082203210901555, + "loss": 0.1074, + "step": 49099 + }, + { + "epoch": 0.42621157802449633, + "grad_norm": 0.1796875, + "learning_rate": 0.001082173454273991, + "loss": 0.0908, + "step": 49100 + }, + { + "epoch": 0.42622025850470047, + "grad_norm": 0.2353515625, + "learning_rate": 0.0010821436976659218, + "loss": 0.0801, + "step": 49101 + }, + { + "epoch": 0.42622893898490466, + "grad_norm": 0.61328125, + "learning_rate": 0.0010821139410773799, + "loss": 0.0854, + "step": 49102 + }, + { + "epoch": 0.4262376194651088, + "grad_norm": 0.28515625, + "learning_rate": 0.0010820841845083982, + "loss": 0.124, + "step": 49103 + }, + { + "epoch": 0.426246299945313, + "grad_norm": 0.0947265625, + "learning_rate": 0.0010820544279590088, + "loss": 0.105, + "step": 49104 + }, + { + "epoch": 0.4262549804255171, + "grad_norm": 0.10205078125, + "learning_rate": 0.0010820246714292445, + "loss": 0.1436, + "step": 49105 + }, + { + "epoch": 0.4262636609057213, + "grad_norm": 0.1708984375, + "learning_rate": 0.001081994914919138, + "loss": 0.0669, + "step": 49106 + }, + { + "epoch": 0.42627234138592546, + "grad_norm": 0.10205078125, + "learning_rate": 0.0010819651584287214, + "loss": 0.1177, + "step": 49107 + }, + { + "epoch": 0.42628102186612965, + "grad_norm": 0.236328125, + "learning_rate": 0.0010819354019580272, + "loss": 0.0957, + "step": 49108 + }, + { + "epoch": 0.4262897023463338, + "grad_norm": 0.4765625, + "learning_rate": 0.0010819056455070885, + "loss": 0.0957, + "step": 49109 + }, + { + "epoch": 0.426298382826538, + "grad_norm": 0.375, + "learning_rate": 0.0010818758890759372, + "loss": 0.0718, + "step": 49110 + }, + { + "epoch": 0.4263070633067421, + "grad_norm": 0.44921875, + "learning_rate": 0.001081846132664606, + "loss": 0.1309, + "step": 49111 + }, + { + "epoch": 0.4263157437869463, + "grad_norm": 0.376953125, + "learning_rate": 0.0010818163762731279, + "loss": 0.0762, + "step": 49112 + }, + { + "epoch": 0.42632442426715045, + "grad_norm": 0.232421875, + "learning_rate": 0.0010817866199015352, + "loss": 0.084, + "step": 49113 + }, + { + "epoch": 0.42633310474735464, + "grad_norm": 0.1630859375, + "learning_rate": 0.0010817568635498602, + "loss": 0.0972, + "step": 49114 + }, + { + "epoch": 0.4263417852275588, + "grad_norm": 0.44140625, + "learning_rate": 0.001081727107218136, + "loss": 0.1602, + "step": 49115 + }, + { + "epoch": 0.42635046570776297, + "grad_norm": 0.162109375, + "learning_rate": 0.0010816973509063946, + "loss": 0.1406, + "step": 49116 + }, + { + "epoch": 0.4263591461879671, + "grad_norm": 0.2138671875, + "learning_rate": 0.0010816675946146686, + "loss": 0.1021, + "step": 49117 + }, + { + "epoch": 0.4263678266681713, + "grad_norm": 0.314453125, + "learning_rate": 0.0010816378383429908, + "loss": 0.0742, + "step": 49118 + }, + { + "epoch": 0.42637650714837544, + "grad_norm": 0.2158203125, + "learning_rate": 0.0010816080820913929, + "loss": 0.0933, + "step": 49119 + }, + { + "epoch": 0.42638518762857963, + "grad_norm": 0.1171875, + "learning_rate": 
0.0010815783258599089, + "loss": 0.0986, + "step": 49120 + }, + { + "epoch": 0.42639386810878377, + "grad_norm": 0.205078125, + "learning_rate": 0.0010815485696485704, + "loss": 0.1113, + "step": 49121 + }, + { + "epoch": 0.42640254858898796, + "grad_norm": 0.154296875, + "learning_rate": 0.0010815188134574098, + "loss": 0.1074, + "step": 49122 + }, + { + "epoch": 0.4264112290691921, + "grad_norm": 0.22265625, + "learning_rate": 0.0010814890572864602, + "loss": 0.0908, + "step": 49123 + }, + { + "epoch": 0.4264199095493963, + "grad_norm": 0.65234375, + "learning_rate": 0.0010814593011357538, + "loss": 0.1104, + "step": 49124 + }, + { + "epoch": 0.42642859002960043, + "grad_norm": 0.2578125, + "learning_rate": 0.0010814295450053235, + "loss": 0.0742, + "step": 49125 + }, + { + "epoch": 0.4264372705098046, + "grad_norm": 0.1318359375, + "learning_rate": 0.0010813997888952014, + "loss": 0.0854, + "step": 49126 + }, + { + "epoch": 0.42644595099000876, + "grad_norm": 0.185546875, + "learning_rate": 0.0010813700328054202, + "loss": 0.0825, + "step": 49127 + }, + { + "epoch": 0.42645463147021295, + "grad_norm": 0.62109375, + "learning_rate": 0.001081340276736012, + "loss": 0.0928, + "step": 49128 + }, + { + "epoch": 0.4264633119504171, + "grad_norm": 0.546875, + "learning_rate": 0.0010813105206870105, + "loss": 0.0781, + "step": 49129 + }, + { + "epoch": 0.4264719924306213, + "grad_norm": 0.306640625, + "learning_rate": 0.0010812807646584472, + "loss": 0.084, + "step": 49130 + }, + { + "epoch": 0.4264806729108254, + "grad_norm": 0.419921875, + "learning_rate": 0.0010812510086503548, + "loss": 0.0654, + "step": 49131 + }, + { + "epoch": 0.4264893533910296, + "grad_norm": 0.55859375, + "learning_rate": 0.0010812212526627662, + "loss": 0.1006, + "step": 49132 + }, + { + "epoch": 0.42649803387123375, + "grad_norm": 0.482421875, + "learning_rate": 0.0010811914966957139, + "loss": 0.1123, + "step": 49133 + }, + { + "epoch": 0.42650671435143794, + "grad_norm": 0.271484375, + "learning_rate": 0.0010811617407492304, + "loss": 0.1338, + "step": 49134 + }, + { + "epoch": 0.4265153948316421, + "grad_norm": 0.46484375, + "learning_rate": 0.001081131984823348, + "loss": 0.1436, + "step": 49135 + }, + { + "epoch": 0.4265240753118463, + "grad_norm": 0.1298828125, + "learning_rate": 0.0010811022289180993, + "loss": 0.0938, + "step": 49136 + }, + { + "epoch": 0.4265327557920504, + "grad_norm": 0.373046875, + "learning_rate": 0.0010810724730335166, + "loss": 0.1162, + "step": 49137 + }, + { + "epoch": 0.4265414362722546, + "grad_norm": 0.3515625, + "learning_rate": 0.001081042717169633, + "loss": 0.1016, + "step": 49138 + }, + { + "epoch": 0.42655011675245874, + "grad_norm": 0.6171875, + "learning_rate": 0.001081012961326481, + "loss": 0.1553, + "step": 49139 + }, + { + "epoch": 0.42655879723266293, + "grad_norm": 0.216796875, + "learning_rate": 0.001080983205504093, + "loss": 0.0864, + "step": 49140 + }, + { + "epoch": 0.42656747771286707, + "grad_norm": 0.1376953125, + "learning_rate": 0.0010809534497025013, + "loss": 0.0898, + "step": 49141 + }, + { + "epoch": 0.42657615819307126, + "grad_norm": 0.154296875, + "learning_rate": 0.0010809236939217386, + "loss": 0.0713, + "step": 49142 + }, + { + "epoch": 0.4265848386732754, + "grad_norm": 0.2392578125, + "learning_rate": 0.0010808939381618374, + "loss": 0.1758, + "step": 49143 + }, + { + "epoch": 0.4265935191534796, + "grad_norm": 0.08642578125, + "learning_rate": 0.0010808641824228305, + "loss": 0.125, + "step": 49144 + }, + { + "epoch": 0.42660219963368373, + 
"grad_norm": 0.85546875, + "learning_rate": 0.0010808344267047498, + "loss": 0.207, + "step": 49145 + }, + { + "epoch": 0.4266108801138879, + "grad_norm": 0.10498046875, + "learning_rate": 0.0010808046710076287, + "loss": 0.063, + "step": 49146 + }, + { + "epoch": 0.42661956059409206, + "grad_norm": 0.12060546875, + "learning_rate": 0.0010807749153314992, + "loss": 0.1143, + "step": 49147 + }, + { + "epoch": 0.42662824107429625, + "grad_norm": 0.2314453125, + "learning_rate": 0.0010807451596763942, + "loss": 0.1094, + "step": 49148 + }, + { + "epoch": 0.4266369215545004, + "grad_norm": 0.318359375, + "learning_rate": 0.001080715404042346, + "loss": 0.1152, + "step": 49149 + }, + { + "epoch": 0.4266456020347046, + "grad_norm": 0.2333984375, + "learning_rate": 0.0010806856484293869, + "loss": 0.085, + "step": 49150 + }, + { + "epoch": 0.4266542825149087, + "grad_norm": 0.5625, + "learning_rate": 0.0010806558928375496, + "loss": 0.1357, + "step": 49151 + }, + { + "epoch": 0.4266629629951129, + "grad_norm": 0.154296875, + "learning_rate": 0.0010806261372668668, + "loss": 0.126, + "step": 49152 + }, + { + "epoch": 0.42667164347531705, + "grad_norm": 0.2275390625, + "learning_rate": 0.0010805963817173711, + "loss": 0.085, + "step": 49153 + }, + { + "epoch": 0.42668032395552125, + "grad_norm": 0.2333984375, + "learning_rate": 0.0010805666261890945, + "loss": 0.1084, + "step": 49154 + }, + { + "epoch": 0.4266890044357254, + "grad_norm": 0.1787109375, + "learning_rate": 0.0010805368706820705, + "loss": 0.0977, + "step": 49155 + }, + { + "epoch": 0.4266976849159296, + "grad_norm": 0.1455078125, + "learning_rate": 0.0010805071151963309, + "loss": 0.1436, + "step": 49156 + }, + { + "epoch": 0.4267063653961337, + "grad_norm": 0.4765625, + "learning_rate": 0.0010804773597319084, + "loss": 0.1406, + "step": 49157 + }, + { + "epoch": 0.4267150458763379, + "grad_norm": 0.08154296875, + "learning_rate": 0.0010804476042888356, + "loss": 0.0962, + "step": 49158 + }, + { + "epoch": 0.42672372635654204, + "grad_norm": 0.1484375, + "learning_rate": 0.0010804178488671447, + "loss": 0.1069, + "step": 49159 + }, + { + "epoch": 0.42673240683674624, + "grad_norm": 0.142578125, + "learning_rate": 0.001080388093466869, + "loss": 0.0962, + "step": 49160 + }, + { + "epoch": 0.4267410873169504, + "grad_norm": 0.5625, + "learning_rate": 0.0010803583380880404, + "loss": 0.1167, + "step": 49161 + }, + { + "epoch": 0.4267497677971545, + "grad_norm": 0.1025390625, + "learning_rate": 0.0010803285827306916, + "loss": 0.0889, + "step": 49162 + }, + { + "epoch": 0.4267584482773587, + "grad_norm": 0.2099609375, + "learning_rate": 0.001080298827394855, + "loss": 0.0581, + "step": 49163 + }, + { + "epoch": 0.42676712875756284, + "grad_norm": 0.359375, + "learning_rate": 0.0010802690720805638, + "loss": 0.1309, + "step": 49164 + }, + { + "epoch": 0.42677580923776703, + "grad_norm": 0.4609375, + "learning_rate": 0.0010802393167878494, + "loss": 0.0752, + "step": 49165 + }, + { + "epoch": 0.42678448971797117, + "grad_norm": 0.427734375, + "learning_rate": 0.0010802095615167456, + "loss": 0.0889, + "step": 49166 + }, + { + "epoch": 0.42679317019817536, + "grad_norm": 0.234375, + "learning_rate": 0.0010801798062672842, + "loss": 0.0752, + "step": 49167 + }, + { + "epoch": 0.4268018506783795, + "grad_norm": 0.578125, + "learning_rate": 0.0010801500510394975, + "loss": 0.1455, + "step": 49168 + }, + { + "epoch": 0.4268105311585837, + "grad_norm": 0.1884765625, + "learning_rate": 0.0010801202958334187, + "loss": 0.0967, + "step": 49169 + }, 
+ { + "epoch": 0.42681921163878783, + "grad_norm": 0.52734375, + "learning_rate": 0.0010800905406490799, + "loss": 0.1133, + "step": 49170 + }, + { + "epoch": 0.426827892118992, + "grad_norm": 0.1884765625, + "learning_rate": 0.0010800607854865136, + "loss": 0.0957, + "step": 49171 + }, + { + "epoch": 0.42683657259919616, + "grad_norm": 0.3046875, + "learning_rate": 0.0010800310303457528, + "loss": 0.0874, + "step": 49172 + }, + { + "epoch": 0.42684525307940036, + "grad_norm": 0.298828125, + "learning_rate": 0.0010800012752268297, + "loss": 0.0811, + "step": 49173 + }, + { + "epoch": 0.4268539335596045, + "grad_norm": 0.7109375, + "learning_rate": 0.001079971520129777, + "loss": 0.082, + "step": 49174 + }, + { + "epoch": 0.4268626140398087, + "grad_norm": 0.19140625, + "learning_rate": 0.0010799417650546273, + "loss": 0.1016, + "step": 49175 + }, + { + "epoch": 0.4268712945200128, + "grad_norm": 0.0732421875, + "learning_rate": 0.0010799120100014128, + "loss": 0.0654, + "step": 49176 + }, + { + "epoch": 0.426879975000217, + "grad_norm": 0.64453125, + "learning_rate": 0.0010798822549701662, + "loss": 0.125, + "step": 49177 + }, + { + "epoch": 0.42688865548042115, + "grad_norm": 0.53125, + "learning_rate": 0.00107985249996092, + "loss": 0.0859, + "step": 49178 + }, + { + "epoch": 0.42689733596062535, + "grad_norm": 0.15625, + "learning_rate": 0.001079822744973707, + "loss": 0.082, + "step": 49179 + }, + { + "epoch": 0.4269060164408295, + "grad_norm": 0.103515625, + "learning_rate": 0.001079792990008559, + "loss": 0.1152, + "step": 49180 + }, + { + "epoch": 0.4269146969210337, + "grad_norm": 0.359375, + "learning_rate": 0.0010797632350655095, + "loss": 0.1299, + "step": 49181 + }, + { + "epoch": 0.4269233774012378, + "grad_norm": 0.44140625, + "learning_rate": 0.0010797334801445908, + "loss": 0.1504, + "step": 49182 + }, + { + "epoch": 0.426932057881442, + "grad_norm": 0.248046875, + "learning_rate": 0.001079703725245835, + "loss": 0.1328, + "step": 49183 + }, + { + "epoch": 0.42694073836164614, + "grad_norm": 0.21484375, + "learning_rate": 0.0010796739703692751, + "loss": 0.1113, + "step": 49184 + }, + { + "epoch": 0.42694941884185034, + "grad_norm": 0.36328125, + "learning_rate": 0.0010796442155149436, + "loss": 0.0991, + "step": 49185 + }, + { + "epoch": 0.4269580993220545, + "grad_norm": 0.1142578125, + "learning_rate": 0.0010796144606828727, + "loss": 0.0806, + "step": 49186 + }, + { + "epoch": 0.42696677980225867, + "grad_norm": 0.51171875, + "learning_rate": 0.0010795847058730948, + "loss": 0.0996, + "step": 49187 + }, + { + "epoch": 0.4269754602824628, + "grad_norm": 0.08984375, + "learning_rate": 0.001079554951085643, + "loss": 0.0957, + "step": 49188 + }, + { + "epoch": 0.426984140762667, + "grad_norm": 0.6171875, + "learning_rate": 0.0010795251963205492, + "loss": 0.1299, + "step": 49189 + }, + { + "epoch": 0.42699282124287113, + "grad_norm": 0.07421875, + "learning_rate": 0.0010794954415778465, + "loss": 0.1021, + "step": 49190 + }, + { + "epoch": 0.4270015017230753, + "grad_norm": 0.578125, + "learning_rate": 0.001079465686857568, + "loss": 0.0801, + "step": 49191 + }, + { + "epoch": 0.42701018220327946, + "grad_norm": 0.2890625, + "learning_rate": 0.001079435932159745, + "loss": 0.084, + "step": 49192 + }, + { + "epoch": 0.42701886268348366, + "grad_norm": 0.1640625, + "learning_rate": 0.0010794061774844104, + "loss": 0.0918, + "step": 49193 + }, + { + "epoch": 0.4270275431636878, + "grad_norm": 0.0751953125, + "learning_rate": 0.0010793764228315972, + "loss": 0.0742, + "step": 
49194 + }, + { + "epoch": 0.427036223643892, + "grad_norm": 0.1572265625, + "learning_rate": 0.0010793466682013375, + "loss": 0.0771, + "step": 49195 + }, + { + "epoch": 0.4270449041240961, + "grad_norm": 0.08740234375, + "learning_rate": 0.0010793169135936637, + "loss": 0.0854, + "step": 49196 + }, + { + "epoch": 0.4270535846043003, + "grad_norm": 0.1669921875, + "learning_rate": 0.0010792871590086088, + "loss": 0.106, + "step": 49197 + }, + { + "epoch": 0.42706226508450446, + "grad_norm": 0.1572265625, + "learning_rate": 0.0010792574044462056, + "loss": 0.1021, + "step": 49198 + }, + { + "epoch": 0.42707094556470865, + "grad_norm": 0.5234375, + "learning_rate": 0.0010792276499064855, + "loss": 0.0684, + "step": 49199 + }, + { + "epoch": 0.4270796260449128, + "grad_norm": 0.337890625, + "learning_rate": 0.001079197895389482, + "loss": 0.2266, + "step": 49200 + }, + { + "epoch": 0.427088306525117, + "grad_norm": 0.19921875, + "learning_rate": 0.0010791681408952277, + "loss": 0.0757, + "step": 49201 + }, + { + "epoch": 0.4270969870053211, + "grad_norm": 0.09423828125, + "learning_rate": 0.0010791383864237544, + "loss": 0.0791, + "step": 49202 + }, + { + "epoch": 0.4271056674855253, + "grad_norm": 0.11279296875, + "learning_rate": 0.0010791086319750954, + "loss": 0.126, + "step": 49203 + }, + { + "epoch": 0.42711434796572945, + "grad_norm": 0.1591796875, + "learning_rate": 0.0010790788775492828, + "loss": 0.0977, + "step": 49204 + }, + { + "epoch": 0.42712302844593364, + "grad_norm": 0.14453125, + "learning_rate": 0.001079049123146349, + "loss": 0.0952, + "step": 49205 + }, + { + "epoch": 0.4271317089261378, + "grad_norm": 0.126953125, + "learning_rate": 0.0010790193687663266, + "loss": 0.1123, + "step": 49206 + }, + { + "epoch": 0.42714038940634197, + "grad_norm": 0.10107421875, + "learning_rate": 0.0010789896144092486, + "loss": 0.1016, + "step": 49207 + }, + { + "epoch": 0.4271490698865461, + "grad_norm": 0.1318359375, + "learning_rate": 0.001078959860075147, + "loss": 0.1289, + "step": 49208 + }, + { + "epoch": 0.4271577503667503, + "grad_norm": 0.62109375, + "learning_rate": 0.0010789301057640547, + "loss": 0.1021, + "step": 49209 + }, + { + "epoch": 0.42716643084695444, + "grad_norm": 0.2041015625, + "learning_rate": 0.0010789003514760046, + "loss": 0.1104, + "step": 49210 + }, + { + "epoch": 0.42717511132715863, + "grad_norm": 0.1640625, + "learning_rate": 0.0010788705972110282, + "loss": 0.0977, + "step": 49211 + }, + { + "epoch": 0.42718379180736277, + "grad_norm": 0.78515625, + "learning_rate": 0.0010788408429691588, + "loss": 0.1494, + "step": 49212 + }, + { + "epoch": 0.42719247228756696, + "grad_norm": 0.1904296875, + "learning_rate": 0.0010788110887504287, + "loss": 0.0879, + "step": 49213 + }, + { + "epoch": 0.4272011527677711, + "grad_norm": 0.07275390625, + "learning_rate": 0.0010787813345548708, + "loss": 0.125, + "step": 49214 + }, + { + "epoch": 0.4272098332479753, + "grad_norm": 0.41796875, + "learning_rate": 0.0010787515803825165, + "loss": 0.0801, + "step": 49215 + }, + { + "epoch": 0.4272185137281794, + "grad_norm": 0.322265625, + "learning_rate": 0.0010787218262333996, + "loss": 0.0801, + "step": 49216 + }, + { + "epoch": 0.4272271942083836, + "grad_norm": 0.390625, + "learning_rate": 0.0010786920721075522, + "loss": 0.1074, + "step": 49217 + }, + { + "epoch": 0.42723587468858776, + "grad_norm": 0.08447265625, + "learning_rate": 0.0010786623180050069, + "loss": 0.0752, + "step": 49218 + }, + { + "epoch": 0.42724455516879195, + "grad_norm": 0.27734375, + 
"learning_rate": 0.0010786325639257962, + "loss": 0.1162, + "step": 49219 + }, + { + "epoch": 0.4272532356489961, + "grad_norm": 0.169921875, + "learning_rate": 0.0010786028098699525, + "loss": 0.1143, + "step": 49220 + }, + { + "epoch": 0.4272619161292003, + "grad_norm": 0.3671875, + "learning_rate": 0.0010785730558375087, + "loss": 0.1025, + "step": 49221 + }, + { + "epoch": 0.4272705966094044, + "grad_norm": 0.6015625, + "learning_rate": 0.0010785433018284966, + "loss": 0.1172, + "step": 49222 + }, + { + "epoch": 0.4272792770896086, + "grad_norm": 0.42578125, + "learning_rate": 0.0010785135478429493, + "loss": 0.124, + "step": 49223 + }, + { + "epoch": 0.42728795756981275, + "grad_norm": 0.275390625, + "learning_rate": 0.0010784837938808995, + "loss": 0.1138, + "step": 49224 + }, + { + "epoch": 0.42729663805001694, + "grad_norm": 0.6875, + "learning_rate": 0.0010784540399423794, + "loss": 0.0879, + "step": 49225 + }, + { + "epoch": 0.4273053185302211, + "grad_norm": 0.08544921875, + "learning_rate": 0.0010784242860274217, + "loss": 0.0825, + "step": 49226 + }, + { + "epoch": 0.42731399901042527, + "grad_norm": 0.0947265625, + "learning_rate": 0.0010783945321360588, + "loss": 0.1162, + "step": 49227 + }, + { + "epoch": 0.4273226794906294, + "grad_norm": 0.0947265625, + "learning_rate": 0.0010783647782683235, + "loss": 0.0986, + "step": 49228 + }, + { + "epoch": 0.4273313599708336, + "grad_norm": 0.3984375, + "learning_rate": 0.001078335024424248, + "loss": 0.0601, + "step": 49229 + }, + { + "epoch": 0.42734004045103774, + "grad_norm": 0.26953125, + "learning_rate": 0.0010783052706038652, + "loss": 0.0962, + "step": 49230 + }, + { + "epoch": 0.42734872093124193, + "grad_norm": 0.31640625, + "learning_rate": 0.0010782755168072071, + "loss": 0.1113, + "step": 49231 + }, + { + "epoch": 0.42735740141144607, + "grad_norm": 0.1826171875, + "learning_rate": 0.0010782457630343066, + "loss": 0.0742, + "step": 49232 + }, + { + "epoch": 0.42736608189165026, + "grad_norm": 0.4921875, + "learning_rate": 0.0010782160092851962, + "loss": 0.085, + "step": 49233 + }, + { + "epoch": 0.4273747623718544, + "grad_norm": 0.396484375, + "learning_rate": 0.0010781862555599086, + "loss": 0.0977, + "step": 49234 + }, + { + "epoch": 0.4273834428520586, + "grad_norm": 0.154296875, + "learning_rate": 0.0010781565018584765, + "loss": 0.1055, + "step": 49235 + }, + { + "epoch": 0.42739212333226273, + "grad_norm": 0.419921875, + "learning_rate": 0.0010781267481809315, + "loss": 0.0918, + "step": 49236 + }, + { + "epoch": 0.4274008038124669, + "grad_norm": 0.251953125, + "learning_rate": 0.001078096994527307, + "loss": 0.0791, + "step": 49237 + }, + { + "epoch": 0.42740948429267106, + "grad_norm": 0.203125, + "learning_rate": 0.0010780672408976354, + "loss": 0.0933, + "step": 49238 + }, + { + "epoch": 0.42741816477287525, + "grad_norm": 0.4375, + "learning_rate": 0.0010780374872919492, + "loss": 0.123, + "step": 49239 + }, + { + "epoch": 0.4274268452530794, + "grad_norm": 0.1728515625, + "learning_rate": 0.0010780077337102804, + "loss": 0.1099, + "step": 49240 + }, + { + "epoch": 0.4274355257332836, + "grad_norm": 0.1259765625, + "learning_rate": 0.0010779779801526625, + "loss": 0.1201, + "step": 49241 + }, + { + "epoch": 0.4274442062134877, + "grad_norm": 0.359375, + "learning_rate": 0.0010779482266191273, + "loss": 0.0786, + "step": 49242 + }, + { + "epoch": 0.4274528866936919, + "grad_norm": 0.103515625, + "learning_rate": 0.0010779184731097079, + "loss": 0.0669, + "step": 49243 + }, + { + "epoch": 
0.42746156717389605, + "grad_norm": 0.1298828125, + "learning_rate": 0.0010778887196244361, + "loss": 0.1084, + "step": 49244 + }, + { + "epoch": 0.42747024765410024, + "grad_norm": 0.1123046875, + "learning_rate": 0.0010778589661633452, + "loss": 0.1055, + "step": 49245 + }, + { + "epoch": 0.4274789281343044, + "grad_norm": 0.375, + "learning_rate": 0.0010778292127264673, + "loss": 0.1079, + "step": 49246 + }, + { + "epoch": 0.4274876086145086, + "grad_norm": 0.298828125, + "learning_rate": 0.0010777994593138352, + "loss": 0.1157, + "step": 49247 + }, + { + "epoch": 0.4274962890947127, + "grad_norm": 0.81640625, + "learning_rate": 0.001077769705925481, + "loss": 0.1069, + "step": 49248 + }, + { + "epoch": 0.4275049695749169, + "grad_norm": 0.380859375, + "learning_rate": 0.0010777399525614374, + "loss": 0.0835, + "step": 49249 + }, + { + "epoch": 0.42751365005512104, + "grad_norm": 0.109375, + "learning_rate": 0.0010777101992217374, + "loss": 0.0977, + "step": 49250 + }, + { + "epoch": 0.42752233053532523, + "grad_norm": 0.16015625, + "learning_rate": 0.0010776804459064128, + "loss": 0.1138, + "step": 49251 + }, + { + "epoch": 0.42753101101552937, + "grad_norm": 0.46875, + "learning_rate": 0.001077650692615497, + "loss": 0.1045, + "step": 49252 + }, + { + "epoch": 0.42753969149573356, + "grad_norm": 0.3203125, + "learning_rate": 0.001077620939349022, + "loss": 0.1094, + "step": 49253 + }, + { + "epoch": 0.4275483719759377, + "grad_norm": 0.337890625, + "learning_rate": 0.0010775911861070201, + "loss": 0.0967, + "step": 49254 + }, + { + "epoch": 0.4275570524561419, + "grad_norm": 0.2353515625, + "learning_rate": 0.0010775614328895245, + "loss": 0.1162, + "step": 49255 + }, + { + "epoch": 0.42756573293634603, + "grad_norm": 0.146484375, + "learning_rate": 0.001077531679696567, + "loss": 0.0898, + "step": 49256 + }, + { + "epoch": 0.4275744134165502, + "grad_norm": 0.1279296875, + "learning_rate": 0.0010775019265281808, + "loss": 0.0947, + "step": 49257 + }, + { + "epoch": 0.42758309389675436, + "grad_norm": 0.07470703125, + "learning_rate": 0.0010774721733843977, + "loss": 0.0972, + "step": 49258 + }, + { + "epoch": 0.42759177437695856, + "grad_norm": 0.283203125, + "learning_rate": 0.001077442420265251, + "loss": 0.0684, + "step": 49259 + }, + { + "epoch": 0.4276004548571627, + "grad_norm": 0.44921875, + "learning_rate": 0.001077412667170773, + "loss": 0.124, + "step": 49260 + }, + { + "epoch": 0.4276091353373669, + "grad_norm": 0.244140625, + "learning_rate": 0.0010773829141009963, + "loss": 0.1064, + "step": 49261 + }, + { + "epoch": 0.427617815817571, + "grad_norm": 0.4375, + "learning_rate": 0.0010773531610559534, + "loss": 0.1094, + "step": 49262 + }, + { + "epoch": 0.4276264962977752, + "grad_norm": 2.125, + "learning_rate": 0.0010773234080356765, + "loss": 0.4629, + "step": 49263 + }, + { + "epoch": 0.42763517677797935, + "grad_norm": 0.05322265625, + "learning_rate": 0.001077293655040198, + "loss": 0.0571, + "step": 49264 + }, + { + "epoch": 0.42764385725818355, + "grad_norm": 0.28125, + "learning_rate": 0.0010772639020695514, + "loss": 0.0713, + "step": 49265 + }, + { + "epoch": 0.4276525377383877, + "grad_norm": 0.169921875, + "learning_rate": 0.0010772341491237683, + "loss": 0.106, + "step": 49266 + }, + { + "epoch": 0.4276612182185919, + "grad_norm": 0.1298828125, + "learning_rate": 0.0010772043962028817, + "loss": 0.0986, + "step": 49267 + }, + { + "epoch": 0.427669898698796, + "grad_norm": 0.173828125, + "learning_rate": 0.001077174643306924, + "loss": 0.0762, + "step": 49268 
+ }, + { + "epoch": 0.4276785791790002, + "grad_norm": 0.1220703125, + "learning_rate": 0.001077144890435928, + "loss": 0.1143, + "step": 49269 + }, + { + "epoch": 0.42768725965920434, + "grad_norm": 0.91796875, + "learning_rate": 0.001077115137589926, + "loss": 0.1855, + "step": 49270 + }, + { + "epoch": 0.42769594013940854, + "grad_norm": 1.1796875, + "learning_rate": 0.0010770853847689504, + "loss": 0.2012, + "step": 49271 + }, + { + "epoch": 0.4277046206196127, + "grad_norm": 0.11865234375, + "learning_rate": 0.0010770556319730337, + "loss": 0.1328, + "step": 49272 + }, + { + "epoch": 0.42771330109981687, + "grad_norm": 0.169921875, + "learning_rate": 0.0010770258792022088, + "loss": 0.124, + "step": 49273 + }, + { + "epoch": 0.427721981580021, + "grad_norm": 0.1845703125, + "learning_rate": 0.001076996126456508, + "loss": 0.1147, + "step": 49274 + }, + { + "epoch": 0.4277306620602252, + "grad_norm": 0.44921875, + "learning_rate": 0.0010769663737359638, + "loss": 0.0874, + "step": 49275 + }, + { + "epoch": 0.42773934254042933, + "grad_norm": 0.65625, + "learning_rate": 0.0010769366210406088, + "loss": 0.1484, + "step": 49276 + }, + { + "epoch": 0.4277480230206335, + "grad_norm": 0.359375, + "learning_rate": 0.0010769068683704759, + "loss": 0.1143, + "step": 49277 + }, + { + "epoch": 0.42775670350083767, + "grad_norm": 0.326171875, + "learning_rate": 0.001076877115725597, + "loss": 0.1152, + "step": 49278 + }, + { + "epoch": 0.42776538398104186, + "grad_norm": 1.2421875, + "learning_rate": 0.0010768473631060052, + "loss": 0.123, + "step": 49279 + }, + { + "epoch": 0.427774064461246, + "grad_norm": 0.32421875, + "learning_rate": 0.0010768176105117326, + "loss": 0.1084, + "step": 49280 + }, + { + "epoch": 0.4277827449414502, + "grad_norm": 0.13671875, + "learning_rate": 0.0010767878579428121, + "loss": 0.0947, + "step": 49281 + }, + { + "epoch": 0.4277914254216543, + "grad_norm": 0.216796875, + "learning_rate": 0.0010767581053992758, + "loss": 0.1216, + "step": 49282 + }, + { + "epoch": 0.4278001059018585, + "grad_norm": 0.138671875, + "learning_rate": 0.0010767283528811567, + "loss": 0.1099, + "step": 49283 + }, + { + "epoch": 0.42780878638206266, + "grad_norm": 0.1259765625, + "learning_rate": 0.0010766986003884868, + "loss": 0.1133, + "step": 49284 + }, + { + "epoch": 0.4278174668622668, + "grad_norm": 0.294921875, + "learning_rate": 0.001076668847921299, + "loss": 0.0918, + "step": 49285 + }, + { + "epoch": 0.427826147342471, + "grad_norm": 0.1650390625, + "learning_rate": 0.0010766390954796258, + "loss": 0.0967, + "step": 49286 + }, + { + "epoch": 0.4278348278226751, + "grad_norm": 0.2158203125, + "learning_rate": 0.0010766093430635, + "loss": 0.0796, + "step": 49287 + }, + { + "epoch": 0.4278435083028793, + "grad_norm": 0.150390625, + "learning_rate": 0.001076579590672954, + "loss": 0.0933, + "step": 49288 + }, + { + "epoch": 0.42785218878308345, + "grad_norm": 0.2216796875, + "learning_rate": 0.0010765498383080198, + "loss": 0.0845, + "step": 49289 + }, + { + "epoch": 0.42786086926328765, + "grad_norm": 0.1416015625, + "learning_rate": 0.0010765200859687307, + "loss": 0.105, + "step": 49290 + }, + { + "epoch": 0.4278695497434918, + "grad_norm": 0.26953125, + "learning_rate": 0.0010764903336551188, + "loss": 0.0693, + "step": 49291 + }, + { + "epoch": 0.427878230223696, + "grad_norm": 0.1259765625, + "learning_rate": 0.0010764605813672163, + "loss": 0.0952, + "step": 49292 + }, + { + "epoch": 0.4278869107039001, + "grad_norm": 0.146484375, + "learning_rate": 0.0010764308291050567, + 
"loss": 0.1006, + "step": 49293 + }, + { + "epoch": 0.4278955911841043, + "grad_norm": 0.302734375, + "learning_rate": 0.0010764010768686714, + "loss": 0.1162, + "step": 49294 + }, + { + "epoch": 0.42790427166430844, + "grad_norm": 0.625, + "learning_rate": 0.001076371324658094, + "loss": 0.0928, + "step": 49295 + }, + { + "epoch": 0.42791295214451264, + "grad_norm": 0.146484375, + "learning_rate": 0.0010763415724733565, + "loss": 0.1035, + "step": 49296 + }, + { + "epoch": 0.4279216326247168, + "grad_norm": 0.11279296875, + "learning_rate": 0.0010763118203144914, + "loss": 0.1143, + "step": 49297 + }, + { + "epoch": 0.42793031310492097, + "grad_norm": 0.3203125, + "learning_rate": 0.0010762820681815316, + "loss": 0.0898, + "step": 49298 + }, + { + "epoch": 0.4279389935851251, + "grad_norm": 0.625, + "learning_rate": 0.0010762523160745092, + "loss": 0.0737, + "step": 49299 + }, + { + "epoch": 0.4279476740653293, + "grad_norm": 0.154296875, + "learning_rate": 0.0010762225639934571, + "loss": 0.1001, + "step": 49300 + }, + { + "epoch": 0.42795635454553344, + "grad_norm": 0.08642578125, + "learning_rate": 0.001076192811938407, + "loss": 0.0933, + "step": 49301 + }, + { + "epoch": 0.42796503502573763, + "grad_norm": 0.10009765625, + "learning_rate": 0.0010761630599093925, + "loss": 0.0952, + "step": 49302 + }, + { + "epoch": 0.42797371550594177, + "grad_norm": 0.466796875, + "learning_rate": 0.0010761333079064457, + "loss": 0.0718, + "step": 49303 + }, + { + "epoch": 0.42798239598614596, + "grad_norm": 0.298828125, + "learning_rate": 0.0010761035559295991, + "loss": 0.0732, + "step": 49304 + }, + { + "epoch": 0.4279910764663501, + "grad_norm": 0.07421875, + "learning_rate": 0.0010760738039788855, + "loss": 0.0918, + "step": 49305 + }, + { + "epoch": 0.4279997569465543, + "grad_norm": 0.2275390625, + "learning_rate": 0.0010760440520543371, + "loss": 0.0928, + "step": 49306 + }, + { + "epoch": 0.4280084374267584, + "grad_norm": 0.294921875, + "learning_rate": 0.0010760143001559866, + "loss": 0.0884, + "step": 49307 + }, + { + "epoch": 0.4280171179069626, + "grad_norm": 0.2333984375, + "learning_rate": 0.0010759845482838665, + "loss": 0.0703, + "step": 49308 + }, + { + "epoch": 0.42802579838716676, + "grad_norm": 0.1572265625, + "learning_rate": 0.0010759547964380094, + "loss": 0.1079, + "step": 49309 + }, + { + "epoch": 0.42803447886737095, + "grad_norm": 0.14453125, + "learning_rate": 0.0010759250446184473, + "loss": 0.0986, + "step": 49310 + }, + { + "epoch": 0.4280431593475751, + "grad_norm": 0.333984375, + "learning_rate": 0.0010758952928252136, + "loss": 0.1055, + "step": 49311 + }, + { + "epoch": 0.4280518398277793, + "grad_norm": 0.34375, + "learning_rate": 0.0010758655410583408, + "loss": 0.1143, + "step": 49312 + }, + { + "epoch": 0.4280605203079834, + "grad_norm": 0.0634765625, + "learning_rate": 0.0010758357893178602, + "loss": 0.0588, + "step": 49313 + }, + { + "epoch": 0.4280692007881876, + "grad_norm": 0.208984375, + "learning_rate": 0.0010758060376038058, + "loss": 0.0918, + "step": 49314 + }, + { + "epoch": 0.42807788126839175, + "grad_norm": 0.1328125, + "learning_rate": 0.0010757762859162096, + "loss": 0.1299, + "step": 49315 + }, + { + "epoch": 0.42808656174859594, + "grad_norm": 0.10546875, + "learning_rate": 0.0010757465342551042, + "loss": 0.1011, + "step": 49316 + }, + { + "epoch": 0.4280952422288001, + "grad_norm": 0.4609375, + "learning_rate": 0.0010757167826205218, + "loss": 0.0996, + "step": 49317 + }, + { + "epoch": 0.42810392270900427, + "grad_norm": 0.67578125, + 
"learning_rate": 0.001075687031012495, + "loss": 0.0894, + "step": 49318 + }, + { + "epoch": 0.4281126031892084, + "grad_norm": 0.1572265625, + "learning_rate": 0.0010756572794310567, + "loss": 0.1133, + "step": 49319 + }, + { + "epoch": 0.4281212836694126, + "grad_norm": 0.107421875, + "learning_rate": 0.001075627527876239, + "loss": 0.0654, + "step": 49320 + }, + { + "epoch": 0.42812996414961674, + "grad_norm": 0.099609375, + "learning_rate": 0.0010755977763480752, + "loss": 0.0957, + "step": 49321 + }, + { + "epoch": 0.42813864462982093, + "grad_norm": 0.7734375, + "learning_rate": 0.0010755680248465966, + "loss": 0.127, + "step": 49322 + }, + { + "epoch": 0.42814732511002507, + "grad_norm": 0.271484375, + "learning_rate": 0.0010755382733718371, + "loss": 0.1094, + "step": 49323 + }, + { + "epoch": 0.42815600559022926, + "grad_norm": 0.224609375, + "learning_rate": 0.0010755085219238284, + "loss": 0.1069, + "step": 49324 + }, + { + "epoch": 0.4281646860704334, + "grad_norm": 0.09033203125, + "learning_rate": 0.001075478770502603, + "loss": 0.084, + "step": 49325 + }, + { + "epoch": 0.4281733665506376, + "grad_norm": 0.1962890625, + "learning_rate": 0.0010754490191081938, + "loss": 0.1123, + "step": 49326 + }, + { + "epoch": 0.42818204703084173, + "grad_norm": 0.08740234375, + "learning_rate": 0.0010754192677406328, + "loss": 0.0894, + "step": 49327 + }, + { + "epoch": 0.4281907275110459, + "grad_norm": 0.15625, + "learning_rate": 0.0010753895163999531, + "loss": 0.0967, + "step": 49328 + }, + { + "epoch": 0.42819940799125006, + "grad_norm": 0.4453125, + "learning_rate": 0.0010753597650861875, + "loss": 0.0957, + "step": 49329 + }, + { + "epoch": 0.42820808847145425, + "grad_norm": 0.259765625, + "learning_rate": 0.0010753300137993679, + "loss": 0.1006, + "step": 49330 + }, + { + "epoch": 0.4282167689516584, + "grad_norm": 0.158203125, + "learning_rate": 0.001075300262539527, + "loss": 0.1387, + "step": 49331 + }, + { + "epoch": 0.4282254494318626, + "grad_norm": 0.1171875, + "learning_rate": 0.0010752705113066972, + "loss": 0.1011, + "step": 49332 + }, + { + "epoch": 0.4282341299120667, + "grad_norm": 0.11083984375, + "learning_rate": 0.0010752407601009113, + "loss": 0.1045, + "step": 49333 + }, + { + "epoch": 0.4282428103922709, + "grad_norm": 0.365234375, + "learning_rate": 0.0010752110089222017, + "loss": 0.1104, + "step": 49334 + }, + { + "epoch": 0.42825149087247505, + "grad_norm": 0.36328125, + "learning_rate": 0.001075181257770601, + "loss": 0.1025, + "step": 49335 + }, + { + "epoch": 0.42826017135267924, + "grad_norm": 0.10498046875, + "learning_rate": 0.0010751515066461414, + "loss": 0.0786, + "step": 49336 + }, + { + "epoch": 0.4282688518328834, + "grad_norm": 0.12255859375, + "learning_rate": 0.0010751217555488562, + "loss": 0.1104, + "step": 49337 + }, + { + "epoch": 0.42827753231308757, + "grad_norm": 0.2158203125, + "learning_rate": 0.001075092004478777, + "loss": 0.1221, + "step": 49338 + }, + { + "epoch": 0.4282862127932917, + "grad_norm": 0.8203125, + "learning_rate": 0.0010750622534359373, + "loss": 0.123, + "step": 49339 + }, + { + "epoch": 0.4282948932734959, + "grad_norm": 0.0869140625, + "learning_rate": 0.001075032502420369, + "loss": 0.0771, + "step": 49340 + }, + { + "epoch": 0.42830357375370004, + "grad_norm": 0.58203125, + "learning_rate": 0.0010750027514321047, + "loss": 0.0942, + "step": 49341 + }, + { + "epoch": 0.42831225423390423, + "grad_norm": 0.2109375, + "learning_rate": 0.001074973000471177, + "loss": 0.0742, + "step": 49342 + }, + { + "epoch": 
0.42832093471410837, + "grad_norm": 0.33203125, + "learning_rate": 0.0010749432495376187, + "loss": 0.082, + "step": 49343 + }, + { + "epoch": 0.42832961519431256, + "grad_norm": 0.322265625, + "learning_rate": 0.0010749134986314615, + "loss": 0.1543, + "step": 49344 + }, + { + "epoch": 0.4283382956745167, + "grad_norm": 0.134765625, + "learning_rate": 0.0010748837477527389, + "loss": 0.0737, + "step": 49345 + }, + { + "epoch": 0.4283469761547209, + "grad_norm": 0.1328125, + "learning_rate": 0.0010748539969014832, + "loss": 0.1123, + "step": 49346 + }, + { + "epoch": 0.42835565663492503, + "grad_norm": 0.2734375, + "learning_rate": 0.0010748242460777268, + "loss": 0.1191, + "step": 49347 + }, + { + "epoch": 0.4283643371151292, + "grad_norm": 0.0771484375, + "learning_rate": 0.001074794495281502, + "loss": 0.0869, + "step": 49348 + }, + { + "epoch": 0.42837301759533336, + "grad_norm": 0.1923828125, + "learning_rate": 0.0010747647445128418, + "loss": 0.125, + "step": 49349 + }, + { + "epoch": 0.42838169807553755, + "grad_norm": 0.1630859375, + "learning_rate": 0.0010747349937717781, + "loss": 0.1123, + "step": 49350 + }, + { + "epoch": 0.4283903785557417, + "grad_norm": 0.140625, + "learning_rate": 0.001074705243058344, + "loss": 0.1113, + "step": 49351 + }, + { + "epoch": 0.4283990590359459, + "grad_norm": 0.2138671875, + "learning_rate": 0.0010746754923725722, + "loss": 0.1768, + "step": 49352 + }, + { + "epoch": 0.42840773951615, + "grad_norm": 0.1884765625, + "learning_rate": 0.0010746457417144945, + "loss": 0.6328, + "step": 49353 + }, + { + "epoch": 0.4284164199963542, + "grad_norm": 0.3203125, + "learning_rate": 0.0010746159910841437, + "loss": 0.1006, + "step": 49354 + }, + { + "epoch": 0.42842510047655835, + "grad_norm": 0.1357421875, + "learning_rate": 0.001074586240481553, + "loss": 0.1416, + "step": 49355 + }, + { + "epoch": 0.42843378095676254, + "grad_norm": 0.57421875, + "learning_rate": 0.0010745564899067542, + "loss": 0.1094, + "step": 49356 + }, + { + "epoch": 0.4284424614369667, + "grad_norm": 0.181640625, + "learning_rate": 0.00107452673935978, + "loss": 0.1094, + "step": 49357 + }, + { + "epoch": 0.4284511419171709, + "grad_norm": 0.11669921875, + "learning_rate": 0.001074496988840663, + "loss": 0.0972, + "step": 49358 + }, + { + "epoch": 0.428459822397375, + "grad_norm": 0.1083984375, + "learning_rate": 0.0010744672383494358, + "loss": 0.1055, + "step": 49359 + }, + { + "epoch": 0.4284685028775792, + "grad_norm": 0.1748046875, + "learning_rate": 0.0010744374878861305, + "loss": 0.0996, + "step": 49360 + }, + { + "epoch": 0.42847718335778334, + "grad_norm": 0.318359375, + "learning_rate": 0.0010744077374507802, + "loss": 0.123, + "step": 49361 + }, + { + "epoch": 0.42848586383798754, + "grad_norm": 0.447265625, + "learning_rate": 0.001074377987043417, + "loss": 0.0928, + "step": 49362 + }, + { + "epoch": 0.4284945443181917, + "grad_norm": 0.130859375, + "learning_rate": 0.001074348236664074, + "loss": 0.0845, + "step": 49363 + }, + { + "epoch": 0.42850322479839587, + "grad_norm": 0.1611328125, + "learning_rate": 0.0010743184863127833, + "loss": 0.0776, + "step": 49364 + }, + { + "epoch": 0.4285119052786, + "grad_norm": 0.29296875, + "learning_rate": 0.0010742887359895775, + "loss": 0.1406, + "step": 49365 + }, + { + "epoch": 0.4285205857588042, + "grad_norm": 0.1982421875, + "learning_rate": 0.0010742589856944892, + "loss": 0.0845, + "step": 49366 + }, + { + "epoch": 0.42852926623900833, + "grad_norm": 0.1083984375, + "learning_rate": 0.0010742292354275506, + "loss": 
0.1133, + "step": 49367 + }, + { + "epoch": 0.4285379467192125, + "grad_norm": 0.142578125, + "learning_rate": 0.0010741994851887948, + "loss": 0.0889, + "step": 49368 + }, + { + "epoch": 0.42854662719941666, + "grad_norm": 0.470703125, + "learning_rate": 0.001074169734978254, + "loss": 0.1094, + "step": 49369 + }, + { + "epoch": 0.42855530767962086, + "grad_norm": 0.5, + "learning_rate": 0.0010741399847959605, + "loss": 0.1006, + "step": 49370 + }, + { + "epoch": 0.428563988159825, + "grad_norm": 0.4921875, + "learning_rate": 0.0010741102346419477, + "loss": 0.0884, + "step": 49371 + }, + { + "epoch": 0.4285726686400292, + "grad_norm": 0.09765625, + "learning_rate": 0.0010740804845162468, + "loss": 0.0854, + "step": 49372 + }, + { + "epoch": 0.4285813491202333, + "grad_norm": 0.625, + "learning_rate": 0.0010740507344188918, + "loss": 0.1099, + "step": 49373 + }, + { + "epoch": 0.4285900296004375, + "grad_norm": 0.2216796875, + "learning_rate": 0.0010740209843499143, + "loss": 0.0835, + "step": 49374 + }, + { + "epoch": 0.42859871008064165, + "grad_norm": 0.19921875, + "learning_rate": 0.001073991234309347, + "loss": 0.1094, + "step": 49375 + }, + { + "epoch": 0.42860739056084585, + "grad_norm": 0.59765625, + "learning_rate": 0.0010739614842972228, + "loss": 0.1113, + "step": 49376 + }, + { + "epoch": 0.42861607104105, + "grad_norm": 0.27734375, + "learning_rate": 0.0010739317343135735, + "loss": 0.1045, + "step": 49377 + }, + { + "epoch": 0.4286247515212542, + "grad_norm": 0.240234375, + "learning_rate": 0.0010739019843584322, + "loss": 0.0996, + "step": 49378 + }, + { + "epoch": 0.4286334320014583, + "grad_norm": 0.255859375, + "learning_rate": 0.0010738722344318312, + "loss": 0.105, + "step": 49379 + }, + { + "epoch": 0.4286421124816625, + "grad_norm": 0.1328125, + "learning_rate": 0.0010738424845338033, + "loss": 0.1016, + "step": 49380 + }, + { + "epoch": 0.42865079296186664, + "grad_norm": 0.140625, + "learning_rate": 0.0010738127346643807, + "loss": 0.0791, + "step": 49381 + }, + { + "epoch": 0.42865947344207084, + "grad_norm": 0.6640625, + "learning_rate": 0.0010737829848235963, + "loss": 0.1328, + "step": 49382 + }, + { + "epoch": 0.428668153922275, + "grad_norm": 0.2060546875, + "learning_rate": 0.0010737532350114823, + "loss": 0.0957, + "step": 49383 + }, + { + "epoch": 0.42867683440247917, + "grad_norm": 0.10595703125, + "learning_rate": 0.0010737234852280714, + "loss": 0.1064, + "step": 49384 + }, + { + "epoch": 0.4286855148826833, + "grad_norm": 0.1025390625, + "learning_rate": 0.0010736937354733963, + "loss": 0.0977, + "step": 49385 + }, + { + "epoch": 0.4286941953628875, + "grad_norm": 0.11669921875, + "learning_rate": 0.0010736639857474892, + "loss": 0.1562, + "step": 49386 + }, + { + "epoch": 0.42870287584309164, + "grad_norm": 0.11572265625, + "learning_rate": 0.0010736342360503823, + "loss": 0.1055, + "step": 49387 + }, + { + "epoch": 0.42871155632329583, + "grad_norm": 0.15625, + "learning_rate": 0.0010736044863821092, + "loss": 0.1011, + "step": 49388 + }, + { + "epoch": 0.42872023680349997, + "grad_norm": 0.154296875, + "learning_rate": 0.0010735747367427017, + "loss": 0.1113, + "step": 49389 + }, + { + "epoch": 0.42872891728370416, + "grad_norm": 0.83984375, + "learning_rate": 0.0010735449871321922, + "loss": 0.0801, + "step": 49390 + }, + { + "epoch": 0.4287375977639083, + "grad_norm": 0.298828125, + "learning_rate": 0.0010735152375506139, + "loss": 0.1045, + "step": 49391 + }, + { + "epoch": 0.4287462782441125, + "grad_norm": 0.263671875, + "learning_rate": 
0.0010734854879979987, + "loss": 0.1079, + "step": 49392 + }, + { + "epoch": 0.4287549587243166, + "grad_norm": 0.11328125, + "learning_rate": 0.0010734557384743796, + "loss": 0.0645, + "step": 49393 + }, + { + "epoch": 0.4287636392045208, + "grad_norm": 0.6640625, + "learning_rate": 0.0010734259889797887, + "loss": 0.1357, + "step": 49394 + }, + { + "epoch": 0.42877231968472496, + "grad_norm": 0.1982421875, + "learning_rate": 0.0010733962395142588, + "loss": 0.1445, + "step": 49395 + }, + { + "epoch": 0.42878100016492915, + "grad_norm": 0.12109375, + "learning_rate": 0.0010733664900778222, + "loss": 0.063, + "step": 49396 + }, + { + "epoch": 0.4287896806451333, + "grad_norm": 0.40625, + "learning_rate": 0.001073336740670512, + "loss": 0.1138, + "step": 49397 + }, + { + "epoch": 0.4287983611253375, + "grad_norm": 0.62890625, + "learning_rate": 0.00107330699129236, + "loss": 0.1484, + "step": 49398 + }, + { + "epoch": 0.4288070416055416, + "grad_norm": 0.361328125, + "learning_rate": 0.0010732772419433992, + "loss": 0.1133, + "step": 49399 + }, + { + "epoch": 0.4288157220857458, + "grad_norm": 0.296875, + "learning_rate": 0.001073247492623662, + "loss": 0.0703, + "step": 49400 + }, + { + "epoch": 0.42882440256594995, + "grad_norm": 0.19140625, + "learning_rate": 0.0010732177433331808, + "loss": 0.1235, + "step": 49401 + }, + { + "epoch": 0.42883308304615414, + "grad_norm": 0.45703125, + "learning_rate": 0.0010731879940719885, + "loss": 0.125, + "step": 49402 + }, + { + "epoch": 0.4288417635263583, + "grad_norm": 0.52734375, + "learning_rate": 0.0010731582448401174, + "loss": 0.1084, + "step": 49403 + }, + { + "epoch": 0.42885044400656247, + "grad_norm": 0.89453125, + "learning_rate": 0.0010731284956376, + "loss": 0.1064, + "step": 49404 + }, + { + "epoch": 0.4288591244867666, + "grad_norm": 0.53125, + "learning_rate": 0.0010730987464644686, + "loss": 0.1172, + "step": 49405 + }, + { + "epoch": 0.4288678049669708, + "grad_norm": 0.12158203125, + "learning_rate": 0.0010730689973207563, + "loss": 0.1138, + "step": 49406 + }, + { + "epoch": 0.42887648544717494, + "grad_norm": 0.134765625, + "learning_rate": 0.0010730392482064953, + "loss": 0.0737, + "step": 49407 + }, + { + "epoch": 0.4288851659273791, + "grad_norm": 1.4765625, + "learning_rate": 0.0010730094991217181, + "loss": 0.373, + "step": 49408 + }, + { + "epoch": 0.42889384640758327, + "grad_norm": 0.138671875, + "learning_rate": 0.0010729797500664572, + "loss": 0.0781, + "step": 49409 + }, + { + "epoch": 0.4289025268877874, + "grad_norm": 0.181640625, + "learning_rate": 0.0010729500010407456, + "loss": 0.1001, + "step": 49410 + }, + { + "epoch": 0.4289112073679916, + "grad_norm": 0.54296875, + "learning_rate": 0.0010729202520446152, + "loss": 0.0835, + "step": 49411 + }, + { + "epoch": 0.42891988784819574, + "grad_norm": 0.318359375, + "learning_rate": 0.0010728905030780989, + "loss": 0.0854, + "step": 49412 + }, + { + "epoch": 0.42892856832839993, + "grad_norm": 0.5, + "learning_rate": 0.001072860754141229, + "loss": 0.0967, + "step": 49413 + }, + { + "epoch": 0.42893724880860407, + "grad_norm": 0.6796875, + "learning_rate": 0.0010728310052340382, + "loss": 0.0664, + "step": 49414 + }, + { + "epoch": 0.42894592928880826, + "grad_norm": 0.177734375, + "learning_rate": 0.001072801256356559, + "loss": 0.1201, + "step": 49415 + }, + { + "epoch": 0.4289546097690124, + "grad_norm": 0.27734375, + "learning_rate": 0.0010727715075088242, + "loss": 0.1167, + "step": 49416 + }, + { + "epoch": 0.4289632902492166, + "grad_norm": 0.1533203125, + 
"learning_rate": 0.0010727417586908659, + "loss": 0.1069, + "step": 49417 + }, + { + "epoch": 0.4289719707294207, + "grad_norm": 0.2236328125, + "learning_rate": 0.0010727120099027169, + "loss": 0.0918, + "step": 49418 + }, + { + "epoch": 0.4289806512096249, + "grad_norm": 0.244140625, + "learning_rate": 0.0010726822611444094, + "loss": 0.125, + "step": 49419 + }, + { + "epoch": 0.42898933168982906, + "grad_norm": 0.1982421875, + "learning_rate": 0.0010726525124159765, + "loss": 0.0757, + "step": 49420 + }, + { + "epoch": 0.42899801217003325, + "grad_norm": 0.1865234375, + "learning_rate": 0.0010726227637174502, + "loss": 0.0923, + "step": 49421 + }, + { + "epoch": 0.4290066926502374, + "grad_norm": 0.119140625, + "learning_rate": 0.0010725930150488627, + "loss": 0.1006, + "step": 49422 + }, + { + "epoch": 0.4290153731304416, + "grad_norm": 0.1201171875, + "learning_rate": 0.0010725632664102477, + "loss": 0.0791, + "step": 49423 + }, + { + "epoch": 0.4290240536106457, + "grad_norm": 0.20703125, + "learning_rate": 0.0010725335178016372, + "loss": 0.0781, + "step": 49424 + }, + { + "epoch": 0.4290327340908499, + "grad_norm": 0.6796875, + "learning_rate": 0.0010725037692230635, + "loss": 0.1768, + "step": 49425 + }, + { + "epoch": 0.42904141457105405, + "grad_norm": 0.3359375, + "learning_rate": 0.0010724740206745592, + "loss": 0.0889, + "step": 49426 + }, + { + "epoch": 0.42905009505125824, + "grad_norm": 0.1005859375, + "learning_rate": 0.0010724442721561566, + "loss": 0.1113, + "step": 49427 + }, + { + "epoch": 0.4290587755314624, + "grad_norm": 0.2353515625, + "learning_rate": 0.001072414523667889, + "loss": 0.0977, + "step": 49428 + }, + { + "epoch": 0.42906745601166657, + "grad_norm": 0.2373046875, + "learning_rate": 0.0010723847752097883, + "loss": 0.1118, + "step": 49429 + }, + { + "epoch": 0.4290761364918707, + "grad_norm": 0.28125, + "learning_rate": 0.0010723550267818874, + "loss": 0.1089, + "step": 49430 + }, + { + "epoch": 0.4290848169720749, + "grad_norm": 0.23828125, + "learning_rate": 0.0010723252783842179, + "loss": 0.1069, + "step": 49431 + }, + { + "epoch": 0.42909349745227904, + "grad_norm": 0.330078125, + "learning_rate": 0.0010722955300168139, + "loss": 0.0742, + "step": 49432 + }, + { + "epoch": 0.42910217793248323, + "grad_norm": 0.7578125, + "learning_rate": 0.0010722657816797066, + "loss": 0.1089, + "step": 49433 + }, + { + "epoch": 0.42911085841268737, + "grad_norm": 0.416015625, + "learning_rate": 0.0010722360333729293, + "loss": 0.1035, + "step": 49434 + }, + { + "epoch": 0.42911953889289156, + "grad_norm": 0.244140625, + "learning_rate": 0.0010722062850965141, + "loss": 0.1025, + "step": 49435 + }, + { + "epoch": 0.4291282193730957, + "grad_norm": 0.2041015625, + "learning_rate": 0.001072176536850494, + "loss": 0.1182, + "step": 49436 + }, + { + "epoch": 0.4291368998532999, + "grad_norm": 0.486328125, + "learning_rate": 0.0010721467886349008, + "loss": 0.0923, + "step": 49437 + }, + { + "epoch": 0.42914558033350403, + "grad_norm": 0.078125, + "learning_rate": 0.0010721170404497675, + "loss": 0.0874, + "step": 49438 + }, + { + "epoch": 0.4291542608137082, + "grad_norm": 0.337890625, + "learning_rate": 0.0010720872922951265, + "loss": 0.1338, + "step": 49439 + }, + { + "epoch": 0.42916294129391236, + "grad_norm": 0.1376953125, + "learning_rate": 0.0010720575441710105, + "loss": 0.1045, + "step": 49440 + }, + { + "epoch": 0.42917162177411655, + "grad_norm": 0.490234375, + "learning_rate": 0.0010720277960774524, + "loss": 0.1045, + "step": 49441 + }, + { + "epoch": 
0.4291803022543207, + "grad_norm": 0.359375, + "learning_rate": 0.0010719980480144837, + "loss": 0.0898, + "step": 49442 + }, + { + "epoch": 0.4291889827345249, + "grad_norm": 0.59765625, + "learning_rate": 0.0010719682999821378, + "loss": 0.1533, + "step": 49443 + }, + { + "epoch": 0.429197663214729, + "grad_norm": 0.69140625, + "learning_rate": 0.0010719385519804472, + "loss": 0.0845, + "step": 49444 + }, + { + "epoch": 0.4292063436949332, + "grad_norm": 0.478515625, + "learning_rate": 0.0010719088040094438, + "loss": 0.1055, + "step": 49445 + }, + { + "epoch": 0.42921502417513735, + "grad_norm": 0.328125, + "learning_rate": 0.0010718790560691606, + "loss": 0.1016, + "step": 49446 + }, + { + "epoch": 0.42922370465534154, + "grad_norm": 0.09912109375, + "learning_rate": 0.00107184930815963, + "loss": 0.085, + "step": 49447 + }, + { + "epoch": 0.4292323851355457, + "grad_norm": 0.0830078125, + "learning_rate": 0.0010718195602808845, + "loss": 0.0918, + "step": 49448 + }, + { + "epoch": 0.4292410656157499, + "grad_norm": 0.291015625, + "learning_rate": 0.0010717898124329565, + "loss": 0.1436, + "step": 49449 + }, + { + "epoch": 0.429249746095954, + "grad_norm": 0.1513671875, + "learning_rate": 0.0010717600646158792, + "loss": 0.1016, + "step": 49450 + }, + { + "epoch": 0.4292584265761582, + "grad_norm": 0.1123046875, + "learning_rate": 0.0010717303168296846, + "loss": 0.1143, + "step": 49451 + }, + { + "epoch": 0.42926710705636234, + "grad_norm": 0.126953125, + "learning_rate": 0.0010717005690744053, + "loss": 0.1426, + "step": 49452 + }, + { + "epoch": 0.42927578753656653, + "grad_norm": 0.220703125, + "learning_rate": 0.0010716708213500737, + "loss": 0.105, + "step": 49453 + }, + { + "epoch": 0.42928446801677067, + "grad_norm": 0.37890625, + "learning_rate": 0.0010716410736567226, + "loss": 0.1016, + "step": 49454 + }, + { + "epoch": 0.42929314849697486, + "grad_norm": 0.34765625, + "learning_rate": 0.0010716113259943844, + "loss": 0.0781, + "step": 49455 + }, + { + "epoch": 0.429301828977179, + "grad_norm": 0.1337890625, + "learning_rate": 0.0010715815783630914, + "loss": 0.0786, + "step": 49456 + }, + { + "epoch": 0.4293105094573832, + "grad_norm": 0.671875, + "learning_rate": 0.0010715518307628765, + "loss": 0.1904, + "step": 49457 + }, + { + "epoch": 0.42931918993758733, + "grad_norm": 0.3203125, + "learning_rate": 0.0010715220831937718, + "loss": 0.0918, + "step": 49458 + }, + { + "epoch": 0.4293278704177915, + "grad_norm": 0.1669921875, + "learning_rate": 0.0010714923356558105, + "loss": 0.1045, + "step": 49459 + }, + { + "epoch": 0.42933655089799566, + "grad_norm": 0.07861328125, + "learning_rate": 0.0010714625881490247, + "loss": 0.0918, + "step": 49460 + }, + { + "epoch": 0.42934523137819985, + "grad_norm": 0.1318359375, + "learning_rate": 0.001071432840673447, + "loss": 0.0679, + "step": 49461 + }, + { + "epoch": 0.429353911858404, + "grad_norm": 0.56640625, + "learning_rate": 0.0010714030932291098, + "loss": 0.1138, + "step": 49462 + }, + { + "epoch": 0.4293625923386082, + "grad_norm": 0.26953125, + "learning_rate": 0.001071373345816046, + "loss": 0.1357, + "step": 49463 + }, + { + "epoch": 0.4293712728188123, + "grad_norm": 0.115234375, + "learning_rate": 0.0010713435984342878, + "loss": 0.1147, + "step": 49464 + }, + { + "epoch": 0.4293799532990165, + "grad_norm": 0.1455078125, + "learning_rate": 0.0010713138510838674, + "loss": 0.0967, + "step": 49465 + }, + { + "epoch": 0.42938863377922065, + "grad_norm": 0.072265625, + "learning_rate": 0.001071284103764818, + "loss": 
0.0889, + "step": 49466 + }, + { + "epoch": 0.42939731425942484, + "grad_norm": 0.3828125, + "learning_rate": 0.001071254356477172, + "loss": 0.0986, + "step": 49467 + }, + { + "epoch": 0.429405994739629, + "grad_norm": 0.5859375, + "learning_rate": 0.0010712246092209616, + "loss": 0.0938, + "step": 49468 + }, + { + "epoch": 0.4294146752198332, + "grad_norm": 0.28125, + "learning_rate": 0.0010711948619962198, + "loss": 0.0972, + "step": 49469 + }, + { + "epoch": 0.4294233557000373, + "grad_norm": 0.15625, + "learning_rate": 0.0010711651148029785, + "loss": 0.0947, + "step": 49470 + }, + { + "epoch": 0.4294320361802415, + "grad_norm": 0.11572265625, + "learning_rate": 0.001071135367641271, + "loss": 0.0845, + "step": 49471 + }, + { + "epoch": 0.42944071666044564, + "grad_norm": 0.123046875, + "learning_rate": 0.0010711056205111294, + "loss": 0.0967, + "step": 49472 + }, + { + "epoch": 0.42944939714064984, + "grad_norm": 0.1767578125, + "learning_rate": 0.001071075873412586, + "loss": 0.0991, + "step": 49473 + }, + { + "epoch": 0.429458077620854, + "grad_norm": 0.1162109375, + "learning_rate": 0.0010710461263456735, + "loss": 0.0732, + "step": 49474 + }, + { + "epoch": 0.42946675810105817, + "grad_norm": 0.205078125, + "learning_rate": 0.0010710163793104248, + "loss": 0.1201, + "step": 49475 + }, + { + "epoch": 0.4294754385812623, + "grad_norm": 0.48828125, + "learning_rate": 0.001070986632306872, + "loss": 0.0762, + "step": 49476 + }, + { + "epoch": 0.4294841190614665, + "grad_norm": 0.23828125, + "learning_rate": 0.0010709568853350478, + "loss": 0.1064, + "step": 49477 + }, + { + "epoch": 0.42949279954167063, + "grad_norm": 0.08740234375, + "learning_rate": 0.0010709271383949849, + "loss": 0.1104, + "step": 49478 + }, + { + "epoch": 0.4295014800218748, + "grad_norm": 0.171875, + "learning_rate": 0.0010708973914867156, + "loss": 0.0898, + "step": 49479 + }, + { + "epoch": 0.42951016050207896, + "grad_norm": 0.109375, + "learning_rate": 0.0010708676446102724, + "loss": 0.0938, + "step": 49480 + }, + { + "epoch": 0.42951884098228316, + "grad_norm": 0.1416015625, + "learning_rate": 0.0010708378977656878, + "loss": 0.0957, + "step": 49481 + }, + { + "epoch": 0.4295275214624873, + "grad_norm": 0.11962890625, + "learning_rate": 0.0010708081509529944, + "loss": 0.124, + "step": 49482 + }, + { + "epoch": 0.4295362019426915, + "grad_norm": 0.09619140625, + "learning_rate": 0.0010707784041722249, + "loss": 0.0898, + "step": 49483 + }, + { + "epoch": 0.4295448824228956, + "grad_norm": 0.1728515625, + "learning_rate": 0.0010707486574234118, + "loss": 0.125, + "step": 49484 + }, + { + "epoch": 0.4295535629030998, + "grad_norm": 0.2119140625, + "learning_rate": 0.0010707189107065874, + "loss": 0.0713, + "step": 49485 + }, + { + "epoch": 0.42956224338330395, + "grad_norm": 0.490234375, + "learning_rate": 0.001070689164021784, + "loss": 0.0815, + "step": 49486 + }, + { + "epoch": 0.42957092386350815, + "grad_norm": 0.099609375, + "learning_rate": 0.0010706594173690352, + "loss": 0.0903, + "step": 49487 + }, + { + "epoch": 0.4295796043437123, + "grad_norm": 0.51953125, + "learning_rate": 0.0010706296707483725, + "loss": 0.0977, + "step": 49488 + }, + { + "epoch": 0.4295882848239165, + "grad_norm": 0.11865234375, + "learning_rate": 0.0010705999241598287, + "loss": 0.126, + "step": 49489 + }, + { + "epoch": 0.4295969653041206, + "grad_norm": 0.2373046875, + "learning_rate": 0.0010705701776034364, + "loss": 0.104, + "step": 49490 + }, + { + "epoch": 0.4296056457843248, + "grad_norm": 0.08935546875, + 
"learning_rate": 0.001070540431079228, + "loss": 0.1055, + "step": 49491 + }, + { + "epoch": 0.42961432626452895, + "grad_norm": 0.6953125, + "learning_rate": 0.0010705106845872363, + "loss": 0.1104, + "step": 49492 + }, + { + "epoch": 0.42962300674473314, + "grad_norm": 0.2060546875, + "learning_rate": 0.0010704809381274936, + "loss": 0.1172, + "step": 49493 + }, + { + "epoch": 0.4296316872249373, + "grad_norm": 0.1748046875, + "learning_rate": 0.0010704511917000328, + "loss": 0.0898, + "step": 49494 + }, + { + "epoch": 0.42964036770514147, + "grad_norm": 0.75, + "learning_rate": 0.0010704214453048856, + "loss": 0.1475, + "step": 49495 + }, + { + "epoch": 0.4296490481853456, + "grad_norm": 0.28515625, + "learning_rate": 0.0010703916989420854, + "loss": 0.0723, + "step": 49496 + }, + { + "epoch": 0.4296577286655498, + "grad_norm": 0.1455078125, + "learning_rate": 0.0010703619526116646, + "loss": 0.0732, + "step": 49497 + }, + { + "epoch": 0.42966640914575394, + "grad_norm": 0.1875, + "learning_rate": 0.0010703322063136552, + "loss": 0.1001, + "step": 49498 + }, + { + "epoch": 0.42967508962595813, + "grad_norm": 0.10107421875, + "learning_rate": 0.00107030246004809, + "loss": 0.1123, + "step": 49499 + }, + { + "epoch": 0.42968377010616227, + "grad_norm": 0.1181640625, + "learning_rate": 0.0010702727138150014, + "loss": 0.1094, + "step": 49500 + }, + { + "epoch": 0.42969245058636646, + "grad_norm": 0.28125, + "learning_rate": 0.0010702429676144225, + "loss": 0.104, + "step": 49501 + }, + { + "epoch": 0.4297011310665706, + "grad_norm": 0.259765625, + "learning_rate": 0.0010702132214463855, + "loss": 0.1094, + "step": 49502 + }, + { + "epoch": 0.4297098115467748, + "grad_norm": 0.484375, + "learning_rate": 0.001070183475310923, + "loss": 0.085, + "step": 49503 + }, + { + "epoch": 0.4297184920269789, + "grad_norm": 0.416015625, + "learning_rate": 0.0010701537292080667, + "loss": 0.1641, + "step": 49504 + }, + { + "epoch": 0.4297271725071831, + "grad_norm": 0.1865234375, + "learning_rate": 0.0010701239831378505, + "loss": 0.0859, + "step": 49505 + }, + { + "epoch": 0.42973585298738726, + "grad_norm": 0.0927734375, + "learning_rate": 0.001070094237100306, + "loss": 0.1035, + "step": 49506 + }, + { + "epoch": 0.42974453346759145, + "grad_norm": 0.11376953125, + "learning_rate": 0.0010700644910954663, + "loss": 0.0889, + "step": 49507 + }, + { + "epoch": 0.4297532139477956, + "grad_norm": 0.53515625, + "learning_rate": 0.001070034745123363, + "loss": 0.126, + "step": 49508 + }, + { + "epoch": 0.4297618944279998, + "grad_norm": 0.330078125, + "learning_rate": 0.0010700049991840294, + "loss": 0.1006, + "step": 49509 + }, + { + "epoch": 0.4297705749082039, + "grad_norm": 0.2353515625, + "learning_rate": 0.0010699752532774983, + "loss": 0.0596, + "step": 49510 + }, + { + "epoch": 0.4297792553884081, + "grad_norm": 0.10546875, + "learning_rate": 0.0010699455074038016, + "loss": 0.0928, + "step": 49511 + }, + { + "epoch": 0.42978793586861225, + "grad_norm": 0.146484375, + "learning_rate": 0.001069915761562972, + "loss": 0.126, + "step": 49512 + }, + { + "epoch": 0.42979661634881644, + "grad_norm": 0.0888671875, + "learning_rate": 0.0010698860157550424, + "loss": 0.1133, + "step": 49513 + }, + { + "epoch": 0.4298052968290206, + "grad_norm": 0.2314453125, + "learning_rate": 0.0010698562699800445, + "loss": 0.1289, + "step": 49514 + }, + { + "epoch": 0.42981397730922477, + "grad_norm": 0.380859375, + "learning_rate": 0.0010698265242380114, + "loss": 0.1289, + "step": 49515 + }, + { + "epoch": 
0.4298226577894289, + "grad_norm": 0.1767578125, + "learning_rate": 0.001069796778528976, + "loss": 0.1562, + "step": 49516 + }, + { + "epoch": 0.4298313382696331, + "grad_norm": 0.7734375, + "learning_rate": 0.0010697670328529696, + "loss": 0.0801, + "step": 49517 + }, + { + "epoch": 0.42984001874983724, + "grad_norm": 0.4375, + "learning_rate": 0.0010697372872100262, + "loss": 0.1221, + "step": 49518 + }, + { + "epoch": 0.42984869923004143, + "grad_norm": 0.2578125, + "learning_rate": 0.0010697075416001773, + "loss": 0.082, + "step": 49519 + }, + { + "epoch": 0.42985737971024557, + "grad_norm": 0.21875, + "learning_rate": 0.0010696777960234561, + "loss": 0.0791, + "step": 49520 + }, + { + "epoch": 0.42986606019044976, + "grad_norm": 0.1337890625, + "learning_rate": 0.0010696480504798947, + "loss": 0.083, + "step": 49521 + }, + { + "epoch": 0.4298747406706539, + "grad_norm": 0.359375, + "learning_rate": 0.0010696183049695256, + "loss": 0.0986, + "step": 49522 + }, + { + "epoch": 0.4298834211508581, + "grad_norm": 0.111328125, + "learning_rate": 0.0010695885594923815, + "loss": 0.1113, + "step": 49523 + }, + { + "epoch": 0.42989210163106223, + "grad_norm": 0.1650390625, + "learning_rate": 0.001069558814048495, + "loss": 0.0757, + "step": 49524 + }, + { + "epoch": 0.4299007821112664, + "grad_norm": 0.259765625, + "learning_rate": 0.0010695290686378984, + "loss": 0.0693, + "step": 49525 + }, + { + "epoch": 0.42990946259147056, + "grad_norm": 0.1220703125, + "learning_rate": 0.0010694993232606242, + "loss": 0.1182, + "step": 49526 + }, + { + "epoch": 0.42991814307167475, + "grad_norm": 0.384765625, + "learning_rate": 0.0010694695779167054, + "loss": 0.1035, + "step": 49527 + }, + { + "epoch": 0.4299268235518789, + "grad_norm": 0.12451171875, + "learning_rate": 0.001069439832606174, + "loss": 0.0762, + "step": 49528 + }, + { + "epoch": 0.4299355040320831, + "grad_norm": 0.6484375, + "learning_rate": 0.001069410087329063, + "loss": 0.1328, + "step": 49529 + }, + { + "epoch": 0.4299441845122872, + "grad_norm": 0.41796875, + "learning_rate": 0.0010693803420854047, + "loss": 0.1108, + "step": 49530 + }, + { + "epoch": 0.42995286499249136, + "grad_norm": 0.48828125, + "learning_rate": 0.0010693505968752313, + "loss": 0.1162, + "step": 49531 + }, + { + "epoch": 0.42996154547269555, + "grad_norm": 0.11767578125, + "learning_rate": 0.0010693208516985757, + "loss": 0.083, + "step": 49532 + }, + { + "epoch": 0.4299702259528997, + "grad_norm": 0.28125, + "learning_rate": 0.0010692911065554704, + "loss": 0.0752, + "step": 49533 + }, + { + "epoch": 0.4299789064331039, + "grad_norm": 0.419921875, + "learning_rate": 0.0010692613614459478, + "loss": 0.106, + "step": 49534 + }, + { + "epoch": 0.429987586913308, + "grad_norm": 0.390625, + "learning_rate": 0.0010692316163700404, + "loss": 0.0767, + "step": 49535 + }, + { + "epoch": 0.4299962673935122, + "grad_norm": 0.380859375, + "learning_rate": 0.001069201871327781, + "loss": 0.1104, + "step": 49536 + }, + { + "epoch": 0.43000494787371635, + "grad_norm": 0.29296875, + "learning_rate": 0.001069172126319202, + "loss": 0.1758, + "step": 49537 + }, + { + "epoch": 0.43001362835392054, + "grad_norm": 0.1162109375, + "learning_rate": 0.0010691423813443362, + "loss": 0.0898, + "step": 49538 + }, + { + "epoch": 0.4300223088341247, + "grad_norm": 0.09814453125, + "learning_rate": 0.0010691126364032156, + "loss": 0.1152, + "step": 49539 + }, + { + "epoch": 0.43003098931432887, + "grad_norm": 0.216796875, + "learning_rate": 0.0010690828914958728, + "loss": 0.1338, + 
"step": 49540 + }, + { + "epoch": 0.430039669794533, + "grad_norm": 0.5390625, + "learning_rate": 0.0010690531466223405, + "loss": 0.0879, + "step": 49541 + }, + { + "epoch": 0.4300483502747372, + "grad_norm": 0.1484375, + "learning_rate": 0.0010690234017826513, + "loss": 0.083, + "step": 49542 + }, + { + "epoch": 0.43005703075494134, + "grad_norm": 0.1572265625, + "learning_rate": 0.0010689936569768376, + "loss": 0.1025, + "step": 49543 + }, + { + "epoch": 0.43006571123514553, + "grad_norm": 0.177734375, + "learning_rate": 0.0010689639122049316, + "loss": 0.1074, + "step": 49544 + }, + { + "epoch": 0.43007439171534967, + "grad_norm": 0.478515625, + "learning_rate": 0.0010689341674669668, + "loss": 0.1006, + "step": 49545 + }, + { + "epoch": 0.43008307219555386, + "grad_norm": 0.375, + "learning_rate": 0.0010689044227629749, + "loss": 0.0898, + "step": 49546 + }, + { + "epoch": 0.430091752675758, + "grad_norm": 0.2373046875, + "learning_rate": 0.0010688746780929887, + "loss": 0.1133, + "step": 49547 + }, + { + "epoch": 0.4301004331559622, + "grad_norm": 0.154296875, + "learning_rate": 0.0010688449334570408, + "loss": 0.1504, + "step": 49548 + }, + { + "epoch": 0.43010911363616633, + "grad_norm": 0.7109375, + "learning_rate": 0.0010688151888551636, + "loss": 0.0864, + "step": 49549 + }, + { + "epoch": 0.4301177941163705, + "grad_norm": 0.2578125, + "learning_rate": 0.0010687854442873897, + "loss": 0.0742, + "step": 49550 + }, + { + "epoch": 0.43012647459657466, + "grad_norm": 0.58984375, + "learning_rate": 0.001068755699753751, + "loss": 0.3906, + "step": 49551 + }, + { + "epoch": 0.43013515507677885, + "grad_norm": 0.4140625, + "learning_rate": 0.001068725955254281, + "loss": 0.1025, + "step": 49552 + }, + { + "epoch": 0.430143835556983, + "grad_norm": 0.1572265625, + "learning_rate": 0.001068696210789012, + "loss": 0.1001, + "step": 49553 + }, + { + "epoch": 0.4301525160371872, + "grad_norm": 0.41015625, + "learning_rate": 0.001068666466357976, + "loss": 0.1064, + "step": 49554 + }, + { + "epoch": 0.4301611965173913, + "grad_norm": 0.2109375, + "learning_rate": 0.0010686367219612059, + "loss": 0.0762, + "step": 49555 + }, + { + "epoch": 0.4301698769975955, + "grad_norm": 0.1591796875, + "learning_rate": 0.0010686069775987343, + "loss": 0.1016, + "step": 49556 + }, + { + "epoch": 0.43017855747779965, + "grad_norm": 0.130859375, + "learning_rate": 0.0010685772332705938, + "loss": 0.1113, + "step": 49557 + }, + { + "epoch": 0.43018723795800384, + "grad_norm": 0.1396484375, + "learning_rate": 0.001068547488976817, + "loss": 0.0898, + "step": 49558 + }, + { + "epoch": 0.430195918438208, + "grad_norm": 0.8203125, + "learning_rate": 0.0010685177447174357, + "loss": 0.2539, + "step": 49559 + }, + { + "epoch": 0.4302045989184122, + "grad_norm": 0.1728515625, + "learning_rate": 0.0010684880004924828, + "loss": 0.0957, + "step": 49560 + }, + { + "epoch": 0.4302132793986163, + "grad_norm": 0.1259765625, + "learning_rate": 0.0010684582563019913, + "loss": 0.085, + "step": 49561 + }, + { + "epoch": 0.4302219598788205, + "grad_norm": 0.326171875, + "learning_rate": 0.0010684285121459933, + "loss": 0.1172, + "step": 49562 + }, + { + "epoch": 0.43023064035902464, + "grad_norm": 0.140625, + "learning_rate": 0.0010683987680245213, + "loss": 0.0776, + "step": 49563 + }, + { + "epoch": 0.43023932083922883, + "grad_norm": 0.1025390625, + "learning_rate": 0.0010683690239376082, + "loss": 0.124, + "step": 49564 + }, + { + "epoch": 0.43024800131943297, + "grad_norm": 0.1123046875, + "learning_rate": 
0.001068339279885286, + "loss": 0.083, + "step": 49565 + }, + { + "epoch": 0.43025668179963716, + "grad_norm": 0.1630859375, + "learning_rate": 0.0010683095358675879, + "loss": 0.0693, + "step": 49566 + }, + { + "epoch": 0.4302653622798413, + "grad_norm": 0.3359375, + "learning_rate": 0.0010682797918845457, + "loss": 0.0962, + "step": 49567 + }, + { + "epoch": 0.4302740427600455, + "grad_norm": 0.458984375, + "learning_rate": 0.0010682500479361923, + "loss": 0.0977, + "step": 49568 + }, + { + "epoch": 0.43028272324024963, + "grad_norm": 0.07861328125, + "learning_rate": 0.0010682203040225594, + "loss": 0.0796, + "step": 49569 + }, + { + "epoch": 0.4302914037204538, + "grad_norm": 0.10400390625, + "learning_rate": 0.0010681905601436813, + "loss": 0.0977, + "step": 49570 + }, + { + "epoch": 0.43030008420065796, + "grad_norm": 0.283203125, + "learning_rate": 0.0010681608162995892, + "loss": 0.0928, + "step": 49571 + }, + { + "epoch": 0.43030876468086215, + "grad_norm": 0.1728515625, + "learning_rate": 0.001068131072490316, + "loss": 0.1162, + "step": 49572 + }, + { + "epoch": 0.4303174451610663, + "grad_norm": 0.1865234375, + "learning_rate": 0.0010681013287158943, + "loss": 0.0845, + "step": 49573 + }, + { + "epoch": 0.4303261256412705, + "grad_norm": 0.251953125, + "learning_rate": 0.0010680715849763563, + "loss": 0.0864, + "step": 49574 + }, + { + "epoch": 0.4303348061214746, + "grad_norm": 0.08935546875, + "learning_rate": 0.001068041841271735, + "loss": 0.0767, + "step": 49575 + }, + { + "epoch": 0.4303434866016788, + "grad_norm": 0.291015625, + "learning_rate": 0.0010680120976020624, + "loss": 0.0986, + "step": 49576 + }, + { + "epoch": 0.43035216708188295, + "grad_norm": 0.291015625, + "learning_rate": 0.0010679823539673713, + "loss": 0.0669, + "step": 49577 + }, + { + "epoch": 0.43036084756208715, + "grad_norm": 0.1669921875, + "learning_rate": 0.0010679526103676942, + "loss": 0.1621, + "step": 49578 + }, + { + "epoch": 0.4303695280422913, + "grad_norm": 0.11181640625, + "learning_rate": 0.0010679228668030636, + "loss": 0.1084, + "step": 49579 + }, + { + "epoch": 0.4303782085224955, + "grad_norm": 0.384765625, + "learning_rate": 0.0010678931232735124, + "loss": 0.0957, + "step": 49580 + }, + { + "epoch": 0.4303868890026996, + "grad_norm": 0.2451171875, + "learning_rate": 0.0010678633797790726, + "loss": 0.0752, + "step": 49581 + }, + { + "epoch": 0.4303955694829038, + "grad_norm": 0.10791015625, + "learning_rate": 0.0010678336363197768, + "loss": 0.0918, + "step": 49582 + }, + { + "epoch": 0.43040424996310794, + "grad_norm": 0.2578125, + "learning_rate": 0.001067803892895658, + "loss": 0.1152, + "step": 49583 + }, + { + "epoch": 0.43041293044331214, + "grad_norm": 0.4609375, + "learning_rate": 0.0010677741495067482, + "loss": 0.1572, + "step": 49584 + }, + { + "epoch": 0.4304216109235163, + "grad_norm": 0.125, + "learning_rate": 0.00106774440615308, + "loss": 0.0889, + "step": 49585 + }, + { + "epoch": 0.43043029140372047, + "grad_norm": 0.208984375, + "learning_rate": 0.0010677146628346856, + "loss": 0.0835, + "step": 49586 + }, + { + "epoch": 0.4304389718839246, + "grad_norm": 0.28125, + "learning_rate": 0.0010676849195515987, + "loss": 0.103, + "step": 49587 + }, + { + "epoch": 0.4304476523641288, + "grad_norm": 0.5390625, + "learning_rate": 0.001067655176303851, + "loss": 0.0864, + "step": 49588 + }, + { + "epoch": 0.43045633284433293, + "grad_norm": 0.06884765625, + "learning_rate": 0.001067625433091475, + "loss": 0.0645, + "step": 49589 + }, + { + "epoch": 0.4304650133245371, + 
"grad_norm": 0.07958984375, + "learning_rate": 0.001067595689914503, + "loss": 0.0679, + "step": 49590 + }, + { + "epoch": 0.43047369380474126, + "grad_norm": 0.30859375, + "learning_rate": 0.001067565946772968, + "loss": 0.084, + "step": 49591 + }, + { + "epoch": 0.43048237428494546, + "grad_norm": 0.439453125, + "learning_rate": 0.0010675362036669028, + "loss": 0.1084, + "step": 49592 + }, + { + "epoch": 0.4304910547651496, + "grad_norm": 0.154296875, + "learning_rate": 0.0010675064605963395, + "loss": 0.0879, + "step": 49593 + }, + { + "epoch": 0.4304997352453538, + "grad_norm": 0.3828125, + "learning_rate": 0.0010674767175613102, + "loss": 0.1196, + "step": 49594 + }, + { + "epoch": 0.4305084157255579, + "grad_norm": 0.1396484375, + "learning_rate": 0.0010674469745618478, + "loss": 0.0688, + "step": 49595 + }, + { + "epoch": 0.4305170962057621, + "grad_norm": 0.58984375, + "learning_rate": 0.0010674172315979853, + "loss": 0.1162, + "step": 49596 + }, + { + "epoch": 0.43052577668596625, + "grad_norm": 0.1572265625, + "learning_rate": 0.0010673874886697548, + "loss": 0.0991, + "step": 49597 + }, + { + "epoch": 0.43053445716617045, + "grad_norm": 0.205078125, + "learning_rate": 0.0010673577457771888, + "loss": 0.1045, + "step": 49598 + }, + { + "epoch": 0.4305431376463746, + "grad_norm": 0.30078125, + "learning_rate": 0.0010673280029203198, + "loss": 0.1064, + "step": 49599 + }, + { + "epoch": 0.4305518181265788, + "grad_norm": 0.26953125, + "learning_rate": 0.0010672982600991804, + "loss": 0.1138, + "step": 49600 + }, + { + "epoch": 0.4305604986067829, + "grad_norm": 0.578125, + "learning_rate": 0.0010672685173138032, + "loss": 0.1387, + "step": 49601 + }, + { + "epoch": 0.4305691790869871, + "grad_norm": 0.0927734375, + "learning_rate": 0.0010672387745642208, + "loss": 0.1328, + "step": 49602 + }, + { + "epoch": 0.43057785956719125, + "grad_norm": 0.875, + "learning_rate": 0.001067209031850465, + "loss": 0.0962, + "step": 49603 + }, + { + "epoch": 0.43058654004739544, + "grad_norm": 0.33984375, + "learning_rate": 0.0010671792891725693, + "loss": 0.106, + "step": 49604 + }, + { + "epoch": 0.4305952205275996, + "grad_norm": 0.314453125, + "learning_rate": 0.0010671495465305661, + "loss": 0.0645, + "step": 49605 + }, + { + "epoch": 0.43060390100780377, + "grad_norm": 0.11962890625, + "learning_rate": 0.0010671198039244874, + "loss": 0.085, + "step": 49606 + }, + { + "epoch": 0.4306125814880079, + "grad_norm": 0.4296875, + "learning_rate": 0.0010670900613543662, + "loss": 0.1523, + "step": 49607 + }, + { + "epoch": 0.4306212619682121, + "grad_norm": 0.31640625, + "learning_rate": 0.0010670603188202346, + "loss": 0.1099, + "step": 49608 + }, + { + "epoch": 0.43062994244841624, + "grad_norm": 0.130859375, + "learning_rate": 0.0010670305763221253, + "loss": 0.1055, + "step": 49609 + }, + { + "epoch": 0.43063862292862043, + "grad_norm": 0.26171875, + "learning_rate": 0.0010670008338600709, + "loss": 0.0933, + "step": 49610 + }, + { + "epoch": 0.43064730340882457, + "grad_norm": 0.2158203125, + "learning_rate": 0.0010669710914341039, + "loss": 0.1201, + "step": 49611 + }, + { + "epoch": 0.43065598388902876, + "grad_norm": 0.412109375, + "learning_rate": 0.0010669413490442567, + "loss": 0.082, + "step": 49612 + }, + { + "epoch": 0.4306646643692329, + "grad_norm": 0.365234375, + "learning_rate": 0.0010669116066905622, + "loss": 0.0908, + "step": 49613 + }, + { + "epoch": 0.4306733448494371, + "grad_norm": 0.1572265625, + "learning_rate": 0.0010668818643730525, + "loss": 0.0718, + "step": 49614 + 
}, + { + "epoch": 0.4306820253296412, + "grad_norm": 1.6015625, + "learning_rate": 0.0010668521220917604, + "loss": 0.1611, + "step": 49615 + }, + { + "epoch": 0.4306907058098454, + "grad_norm": 0.2255859375, + "learning_rate": 0.0010668223798467186, + "loss": 0.1006, + "step": 49616 + }, + { + "epoch": 0.43069938629004956, + "grad_norm": 0.1572265625, + "learning_rate": 0.001066792637637959, + "loss": 0.0874, + "step": 49617 + }, + { + "epoch": 0.43070806677025375, + "grad_norm": 0.6875, + "learning_rate": 0.0010667628954655144, + "loss": 0.0908, + "step": 49618 + }, + { + "epoch": 0.4307167472504579, + "grad_norm": 0.76171875, + "learning_rate": 0.0010667331533294174, + "loss": 0.1162, + "step": 49619 + }, + { + "epoch": 0.4307254277306621, + "grad_norm": 0.1513671875, + "learning_rate": 0.0010667034112297007, + "loss": 0.1123, + "step": 49620 + }, + { + "epoch": 0.4307341082108662, + "grad_norm": 0.287109375, + "learning_rate": 0.0010666736691663965, + "loss": 0.0801, + "step": 49621 + }, + { + "epoch": 0.4307427886910704, + "grad_norm": 0.515625, + "learning_rate": 0.0010666439271395374, + "loss": 0.0938, + "step": 49622 + }, + { + "epoch": 0.43075146917127455, + "grad_norm": 0.095703125, + "learning_rate": 0.0010666141851491562, + "loss": 0.0693, + "step": 49623 + }, + { + "epoch": 0.43076014965147874, + "grad_norm": 0.35546875, + "learning_rate": 0.0010665844431952851, + "loss": 0.1201, + "step": 49624 + }, + { + "epoch": 0.4307688301316829, + "grad_norm": 0.2470703125, + "learning_rate": 0.001066554701277957, + "loss": 0.1016, + "step": 49625 + }, + { + "epoch": 0.43077751061188707, + "grad_norm": 0.376953125, + "learning_rate": 0.001066524959397204, + "loss": 0.0791, + "step": 49626 + }, + { + "epoch": 0.4307861910920912, + "grad_norm": 0.359375, + "learning_rate": 0.0010664952175530589, + "loss": 0.0869, + "step": 49627 + }, + { + "epoch": 0.4307948715722954, + "grad_norm": 0.3515625, + "learning_rate": 0.0010664654757455538, + "loss": 0.1035, + "step": 49628 + }, + { + "epoch": 0.43080355205249954, + "grad_norm": 0.61328125, + "learning_rate": 0.0010664357339747216, + "loss": 0.1875, + "step": 49629 + }, + { + "epoch": 0.43081223253270373, + "grad_norm": 0.142578125, + "learning_rate": 0.0010664059922405951, + "loss": 0.123, + "step": 49630 + }, + { + "epoch": 0.43082091301290787, + "grad_norm": 0.09130859375, + "learning_rate": 0.0010663762505432061, + "loss": 0.0879, + "step": 49631 + }, + { + "epoch": 0.43082959349311206, + "grad_norm": 0.27734375, + "learning_rate": 0.001066346508882588, + "loss": 0.1406, + "step": 49632 + }, + { + "epoch": 0.4308382739733162, + "grad_norm": 0.12109375, + "learning_rate": 0.0010663167672587729, + "loss": 0.0967, + "step": 49633 + }, + { + "epoch": 0.4308469544535204, + "grad_norm": 0.09619140625, + "learning_rate": 0.001066287025671793, + "loss": 0.1309, + "step": 49634 + }, + { + "epoch": 0.43085563493372453, + "grad_norm": 0.1455078125, + "learning_rate": 0.0010662572841216812, + "loss": 0.0645, + "step": 49635 + }, + { + "epoch": 0.4308643154139287, + "grad_norm": 0.1396484375, + "learning_rate": 0.0010662275426084698, + "loss": 0.0879, + "step": 49636 + }, + { + "epoch": 0.43087299589413286, + "grad_norm": 0.10693359375, + "learning_rate": 0.0010661978011321912, + "loss": 0.0918, + "step": 49637 + }, + { + "epoch": 0.43088167637433705, + "grad_norm": 0.103515625, + "learning_rate": 0.0010661680596928784, + "loss": 0.1045, + "step": 49638 + }, + { + "epoch": 0.4308903568545412, + "grad_norm": 0.15234375, + "learning_rate": 
0.0010661383182905637, + "loss": 0.0811, + "step": 49639 + }, + { + "epoch": 0.4308990373347454, + "grad_norm": 0.54296875, + "learning_rate": 0.0010661085769252795, + "loss": 0.0879, + "step": 49640 + }, + { + "epoch": 0.4309077178149495, + "grad_norm": 0.34375, + "learning_rate": 0.001066078835597059, + "loss": 0.1055, + "step": 49641 + }, + { + "epoch": 0.4309163982951537, + "grad_norm": 0.265625, + "learning_rate": 0.0010660490943059338, + "loss": 0.1172, + "step": 49642 + }, + { + "epoch": 0.43092507877535785, + "grad_norm": 0.5234375, + "learning_rate": 0.001066019353051937, + "loss": 0.1211, + "step": 49643 + }, + { + "epoch": 0.43093375925556204, + "grad_norm": 0.12060546875, + "learning_rate": 0.0010659896118351006, + "loss": 0.0889, + "step": 49644 + }, + { + "epoch": 0.4309424397357662, + "grad_norm": 0.3046875, + "learning_rate": 0.0010659598706554579, + "loss": 0.0981, + "step": 49645 + }, + { + "epoch": 0.4309511202159704, + "grad_norm": 0.1611328125, + "learning_rate": 0.0010659301295130406, + "loss": 0.0898, + "step": 49646 + }, + { + "epoch": 0.4309598006961745, + "grad_norm": 0.15625, + "learning_rate": 0.0010659003884078812, + "loss": 0.0693, + "step": 49647 + }, + { + "epoch": 0.4309684811763787, + "grad_norm": 0.14453125, + "learning_rate": 0.0010658706473400132, + "loss": 0.0889, + "step": 49648 + }, + { + "epoch": 0.43097716165658284, + "grad_norm": 0.1884765625, + "learning_rate": 0.0010658409063094684, + "loss": 0.0747, + "step": 49649 + }, + { + "epoch": 0.43098584213678703, + "grad_norm": 0.267578125, + "learning_rate": 0.0010658111653162796, + "loss": 0.1064, + "step": 49650 + }, + { + "epoch": 0.43099452261699117, + "grad_norm": 0.16796875, + "learning_rate": 0.001065781424360479, + "loss": 0.1426, + "step": 49651 + }, + { + "epoch": 0.43100320309719536, + "grad_norm": 0.1416015625, + "learning_rate": 0.0010657516834420998, + "loss": 0.0825, + "step": 49652 + }, + { + "epoch": 0.4310118835773995, + "grad_norm": 0.1484375, + "learning_rate": 0.0010657219425611738, + "loss": 0.0952, + "step": 49653 + }, + { + "epoch": 0.43102056405760364, + "grad_norm": 0.150390625, + "learning_rate": 0.0010656922017177335, + "loss": 0.0918, + "step": 49654 + }, + { + "epoch": 0.43102924453780783, + "grad_norm": 0.12890625, + "learning_rate": 0.0010656624609118115, + "loss": 0.1367, + "step": 49655 + }, + { + "epoch": 0.43103792501801197, + "grad_norm": 0.61328125, + "learning_rate": 0.0010656327201434407, + "loss": 0.1113, + "step": 49656 + }, + { + "epoch": 0.43104660549821616, + "grad_norm": 0.171875, + "learning_rate": 0.0010656029794126538, + "loss": 0.126, + "step": 49657 + }, + { + "epoch": 0.4310552859784203, + "grad_norm": 0.1015625, + "learning_rate": 0.001065573238719483, + "loss": 0.0635, + "step": 49658 + }, + { + "epoch": 0.4310639664586245, + "grad_norm": 0.294921875, + "learning_rate": 0.0010655434980639603, + "loss": 0.0913, + "step": 49659 + }, + { + "epoch": 0.43107264693882863, + "grad_norm": 0.470703125, + "learning_rate": 0.0010655137574461192, + "loss": 0.1104, + "step": 49660 + }, + { + "epoch": 0.4310813274190328, + "grad_norm": 0.25, + "learning_rate": 0.0010654840168659915, + "loss": 0.1328, + "step": 49661 + }, + { + "epoch": 0.43109000789923696, + "grad_norm": 0.130859375, + "learning_rate": 0.00106545427632361, + "loss": 0.0835, + "step": 49662 + }, + { + "epoch": 0.43109868837944115, + "grad_norm": 0.24609375, + "learning_rate": 0.0010654245358190071, + "loss": 0.1113, + "step": 49663 + }, + { + "epoch": 0.4311073688596453, + "grad_norm": 
0.1962890625, + "learning_rate": 0.0010653947953522152, + "loss": 0.1318, + "step": 49664 + }, + { + "epoch": 0.4311160493398495, + "grad_norm": 0.1650390625, + "learning_rate": 0.0010653650549232675, + "loss": 0.1113, + "step": 49665 + }, + { + "epoch": 0.4311247298200536, + "grad_norm": 0.1015625, + "learning_rate": 0.0010653353145321956, + "loss": 0.0623, + "step": 49666 + }, + { + "epoch": 0.4311334103002578, + "grad_norm": 0.06689453125, + "learning_rate": 0.001065305574179033, + "loss": 0.0781, + "step": 49667 + }, + { + "epoch": 0.43114209078046195, + "grad_norm": 0.625, + "learning_rate": 0.0010652758338638112, + "loss": 0.1055, + "step": 49668 + }, + { + "epoch": 0.43115077126066614, + "grad_norm": 0.1064453125, + "learning_rate": 0.0010652460935865636, + "loss": 0.0752, + "step": 49669 + }, + { + "epoch": 0.4311594517408703, + "grad_norm": 0.345703125, + "learning_rate": 0.001065216353347322, + "loss": 0.1426, + "step": 49670 + }, + { + "epoch": 0.4311681322210745, + "grad_norm": 0.16796875, + "learning_rate": 0.0010651866131461197, + "loss": 0.0854, + "step": 49671 + }, + { + "epoch": 0.4311768127012786, + "grad_norm": 0.640625, + "learning_rate": 0.0010651568729829887, + "loss": 0.0991, + "step": 49672 + }, + { + "epoch": 0.4311854931814828, + "grad_norm": 0.189453125, + "learning_rate": 0.001065127132857961, + "loss": 0.0815, + "step": 49673 + }, + { + "epoch": 0.43119417366168694, + "grad_norm": 0.296875, + "learning_rate": 0.0010650973927710703, + "loss": 0.1299, + "step": 49674 + }, + { + "epoch": 0.43120285414189113, + "grad_norm": 0.306640625, + "learning_rate": 0.0010650676527223487, + "loss": 0.1074, + "step": 49675 + }, + { + "epoch": 0.43121153462209527, + "grad_norm": 0.3046875, + "learning_rate": 0.0010650379127118284, + "loss": 0.0815, + "step": 49676 + }, + { + "epoch": 0.43122021510229946, + "grad_norm": 0.06689453125, + "learning_rate": 0.001065008172739542, + "loss": 0.0713, + "step": 49677 + }, + { + "epoch": 0.4312288955825036, + "grad_norm": 0.185546875, + "learning_rate": 0.0010649784328055222, + "loss": 0.1338, + "step": 49678 + }, + { + "epoch": 0.4312375760627078, + "grad_norm": 0.10400390625, + "learning_rate": 0.0010649486929098016, + "loss": 0.1177, + "step": 49679 + }, + { + "epoch": 0.43124625654291193, + "grad_norm": 0.42578125, + "learning_rate": 0.0010649189530524126, + "loss": 0.1465, + "step": 49680 + }, + { + "epoch": 0.4312549370231161, + "grad_norm": 0.2578125, + "learning_rate": 0.0010648892132333872, + "loss": 0.1104, + "step": 49681 + }, + { + "epoch": 0.43126361750332026, + "grad_norm": 0.68359375, + "learning_rate": 0.0010648594734527588, + "loss": 0.1201, + "step": 49682 + }, + { + "epoch": 0.43127229798352446, + "grad_norm": 0.43359375, + "learning_rate": 0.0010648297337105597, + "loss": 0.1211, + "step": 49683 + }, + { + "epoch": 0.4312809784637286, + "grad_norm": 0.234375, + "learning_rate": 0.0010647999940068221, + "loss": 0.0889, + "step": 49684 + }, + { + "epoch": 0.4312896589439328, + "grad_norm": 0.29296875, + "learning_rate": 0.0010647702543415789, + "loss": 0.1211, + "step": 49685 + }, + { + "epoch": 0.4312983394241369, + "grad_norm": 0.74609375, + "learning_rate": 0.0010647405147148623, + "loss": 0.1045, + "step": 49686 + }, + { + "epoch": 0.4313070199043411, + "grad_norm": 0.1806640625, + "learning_rate": 0.0010647107751267048, + "loss": 0.123, + "step": 49687 + }, + { + "epoch": 0.43131570038454525, + "grad_norm": 0.1650390625, + "learning_rate": 0.001064681035577139, + "loss": 0.1064, + "step": 49688 + }, + { + "epoch": 
0.43132438086474945, + "grad_norm": 0.2021484375, + "learning_rate": 0.0010646512960661978, + "loss": 0.0811, + "step": 49689 + }, + { + "epoch": 0.4313330613449536, + "grad_norm": 0.11279296875, + "learning_rate": 0.001064621556593913, + "loss": 0.0801, + "step": 49690 + }, + { + "epoch": 0.4313417418251578, + "grad_norm": 0.412109375, + "learning_rate": 0.0010645918171603179, + "loss": 0.064, + "step": 49691 + }, + { + "epoch": 0.4313504223053619, + "grad_norm": 0.1376953125, + "learning_rate": 0.0010645620777654445, + "loss": 0.123, + "step": 49692 + }, + { + "epoch": 0.4313591027855661, + "grad_norm": 0.1611328125, + "learning_rate": 0.0010645323384093257, + "loss": 0.1074, + "step": 49693 + }, + { + "epoch": 0.43136778326577024, + "grad_norm": 0.427734375, + "learning_rate": 0.0010645025990919936, + "loss": 0.1113, + "step": 49694 + }, + { + "epoch": 0.43137646374597444, + "grad_norm": 0.09228515625, + "learning_rate": 0.0010644728598134812, + "loss": 0.083, + "step": 49695 + }, + { + "epoch": 0.4313851442261786, + "grad_norm": 0.1396484375, + "learning_rate": 0.0010644431205738202, + "loss": 0.0723, + "step": 49696 + }, + { + "epoch": 0.43139382470638277, + "grad_norm": 0.73046875, + "learning_rate": 0.0010644133813730442, + "loss": 0.1211, + "step": 49697 + }, + { + "epoch": 0.4314025051865869, + "grad_norm": 0.1875, + "learning_rate": 0.001064383642211185, + "loss": 0.1006, + "step": 49698 + }, + { + "epoch": 0.4314111856667911, + "grad_norm": 0.1455078125, + "learning_rate": 0.001064353903088275, + "loss": 0.0737, + "step": 49699 + }, + { + "epoch": 0.43141986614699523, + "grad_norm": 0.162109375, + "learning_rate": 0.0010643241640043474, + "loss": 0.0967, + "step": 49700 + }, + { + "epoch": 0.4314285466271994, + "grad_norm": 0.287109375, + "learning_rate": 0.001064294424959434, + "loss": 0.1172, + "step": 49701 + }, + { + "epoch": 0.43143722710740356, + "grad_norm": 0.11474609375, + "learning_rate": 0.0010642646859535683, + "loss": 0.1084, + "step": 49702 + }, + { + "epoch": 0.43144590758760776, + "grad_norm": 0.8046875, + "learning_rate": 0.001064234946986782, + "loss": 0.1406, + "step": 49703 + }, + { + "epoch": 0.4314545880678119, + "grad_norm": 0.1337890625, + "learning_rate": 0.0010642052080591078, + "loss": 0.1172, + "step": 49704 + }, + { + "epoch": 0.4314632685480161, + "grad_norm": 0.1455078125, + "learning_rate": 0.001064175469170578, + "loss": 0.1143, + "step": 49705 + }, + { + "epoch": 0.4314719490282202, + "grad_norm": 0.10498046875, + "learning_rate": 0.0010641457303212254, + "loss": 0.0986, + "step": 49706 + }, + { + "epoch": 0.4314806295084244, + "grad_norm": 0.412109375, + "learning_rate": 0.0010641159915110825, + "loss": 0.0991, + "step": 49707 + }, + { + "epoch": 0.43148930998862856, + "grad_norm": 0.357421875, + "learning_rate": 0.0010640862527401818, + "loss": 0.1011, + "step": 49708 + }, + { + "epoch": 0.43149799046883275, + "grad_norm": 0.10791015625, + "learning_rate": 0.001064056514008556, + "loss": 0.1113, + "step": 49709 + }, + { + "epoch": 0.4315066709490369, + "grad_norm": 0.2294921875, + "learning_rate": 0.0010640267753162375, + "loss": 0.0742, + "step": 49710 + }, + { + "epoch": 0.4315153514292411, + "grad_norm": 0.099609375, + "learning_rate": 0.0010639970366632585, + "loss": 0.1128, + "step": 49711 + }, + { + "epoch": 0.4315240319094452, + "grad_norm": 0.11279296875, + "learning_rate": 0.001063967298049652, + "loss": 0.166, + "step": 49712 + }, + { + "epoch": 0.4315327123896494, + "grad_norm": 0.400390625, + "learning_rate": 0.0010639375594754506, 
+ "loss": 0.123, + "step": 49713 + }, + { + "epoch": 0.43154139286985355, + "grad_norm": 0.30078125, + "learning_rate": 0.001063907820940686, + "loss": 0.103, + "step": 49714 + }, + { + "epoch": 0.43155007335005774, + "grad_norm": 0.44921875, + "learning_rate": 0.0010638780824453915, + "loss": 0.0938, + "step": 49715 + }, + { + "epoch": 0.4315587538302619, + "grad_norm": 0.267578125, + "learning_rate": 0.0010638483439895994, + "loss": 0.1104, + "step": 49716 + }, + { + "epoch": 0.43156743431046607, + "grad_norm": 0.341796875, + "learning_rate": 0.0010638186055733421, + "loss": 0.0928, + "step": 49717 + }, + { + "epoch": 0.4315761147906702, + "grad_norm": 0.318359375, + "learning_rate": 0.0010637888671966524, + "loss": 0.1377, + "step": 49718 + }, + { + "epoch": 0.4315847952708744, + "grad_norm": 0.20703125, + "learning_rate": 0.0010637591288595624, + "loss": 0.0874, + "step": 49719 + }, + { + "epoch": 0.43159347575107854, + "grad_norm": 0.380859375, + "learning_rate": 0.0010637293905621052, + "loss": 0.0859, + "step": 49720 + }, + { + "epoch": 0.43160215623128273, + "grad_norm": 0.578125, + "learning_rate": 0.001063699652304313, + "loss": 0.0635, + "step": 49721 + }, + { + "epoch": 0.43161083671148687, + "grad_norm": 0.35546875, + "learning_rate": 0.0010636699140862183, + "loss": 0.0713, + "step": 49722 + }, + { + "epoch": 0.43161951719169106, + "grad_norm": 0.359375, + "learning_rate": 0.0010636401759078534, + "loss": 0.1475, + "step": 49723 + }, + { + "epoch": 0.4316281976718952, + "grad_norm": 0.18359375, + "learning_rate": 0.0010636104377692507, + "loss": 0.1104, + "step": 49724 + }, + { + "epoch": 0.4316368781520994, + "grad_norm": 0.1416015625, + "learning_rate": 0.0010635806996704437, + "loss": 0.083, + "step": 49725 + }, + { + "epoch": 0.4316455586323035, + "grad_norm": 0.62109375, + "learning_rate": 0.0010635509616114642, + "loss": 0.1699, + "step": 49726 + }, + { + "epoch": 0.4316542391125077, + "grad_norm": 0.263671875, + "learning_rate": 0.0010635212235923447, + "loss": 0.1475, + "step": 49727 + }, + { + "epoch": 0.43166291959271186, + "grad_norm": 0.11474609375, + "learning_rate": 0.0010634914856131178, + "loss": 0.0737, + "step": 49728 + }, + { + "epoch": 0.43167160007291605, + "grad_norm": 0.091796875, + "learning_rate": 0.0010634617476738164, + "loss": 0.083, + "step": 49729 + }, + { + "epoch": 0.4316802805531202, + "grad_norm": 0.1328125, + "learning_rate": 0.0010634320097744725, + "loss": 0.1289, + "step": 49730 + }, + { + "epoch": 0.4316889610333244, + "grad_norm": 0.369140625, + "learning_rate": 0.0010634022719151186, + "loss": 0.1074, + "step": 49731 + }, + { + "epoch": 0.4316976415135285, + "grad_norm": 0.09912109375, + "learning_rate": 0.001063372534095788, + "loss": 0.0811, + "step": 49732 + }, + { + "epoch": 0.4317063219937327, + "grad_norm": 0.1630859375, + "learning_rate": 0.0010633427963165118, + "loss": 0.0854, + "step": 49733 + }, + { + "epoch": 0.43171500247393685, + "grad_norm": 0.2099609375, + "learning_rate": 0.0010633130585773237, + "loss": 0.1006, + "step": 49734 + }, + { + "epoch": 0.43172368295414104, + "grad_norm": 0.302734375, + "learning_rate": 0.0010632833208782562, + "loss": 0.0952, + "step": 49735 + }, + { + "epoch": 0.4317323634343452, + "grad_norm": 0.55859375, + "learning_rate": 0.001063253583219341, + "loss": 0.0889, + "step": 49736 + }, + { + "epoch": 0.43174104391454937, + "grad_norm": 0.1083984375, + "learning_rate": 0.0010632238456006113, + "loss": 0.0986, + "step": 49737 + }, + { + "epoch": 0.4317497243947535, + "grad_norm": 0.158203125, 
+ "learning_rate": 0.0010631941080220997, + "loss": 0.1426, + "step": 49738 + }, + { + "epoch": 0.4317584048749577, + "grad_norm": 0.341796875, + "learning_rate": 0.0010631643704838386, + "loss": 0.1152, + "step": 49739 + }, + { + "epoch": 0.43176708535516184, + "grad_norm": 0.087890625, + "learning_rate": 0.00106313463298586, + "loss": 0.0811, + "step": 49740 + }, + { + "epoch": 0.43177576583536603, + "grad_norm": 0.291015625, + "learning_rate": 0.001063104895528197, + "loss": 0.0762, + "step": 49741 + }, + { + "epoch": 0.43178444631557017, + "grad_norm": 0.322265625, + "learning_rate": 0.0010630751581108815, + "loss": 0.1504, + "step": 49742 + }, + { + "epoch": 0.43179312679577436, + "grad_norm": 0.5234375, + "learning_rate": 0.001063045420733947, + "loss": 0.0791, + "step": 49743 + }, + { + "epoch": 0.4318018072759785, + "grad_norm": 0.1796875, + "learning_rate": 0.0010630156833974253, + "loss": 0.0801, + "step": 49744 + }, + { + "epoch": 0.4318104877561827, + "grad_norm": 0.193359375, + "learning_rate": 0.001062985946101349, + "loss": 0.0986, + "step": 49745 + }, + { + "epoch": 0.43181916823638683, + "grad_norm": 0.50390625, + "learning_rate": 0.001062956208845751, + "loss": 0.0864, + "step": 49746 + }, + { + "epoch": 0.431827848716591, + "grad_norm": 0.294921875, + "learning_rate": 0.0010629264716306632, + "loss": 0.1094, + "step": 49747 + }, + { + "epoch": 0.43183652919679516, + "grad_norm": 0.3125, + "learning_rate": 0.0010628967344561188, + "loss": 0.125, + "step": 49748 + }, + { + "epoch": 0.43184520967699935, + "grad_norm": 0.240234375, + "learning_rate": 0.0010628669973221495, + "loss": 0.1001, + "step": 49749 + }, + { + "epoch": 0.4318538901572035, + "grad_norm": 0.1005859375, + "learning_rate": 0.0010628372602287885, + "loss": 0.0977, + "step": 49750 + }, + { + "epoch": 0.4318625706374077, + "grad_norm": 0.07275390625, + "learning_rate": 0.001062807523176068, + "loss": 0.0781, + "step": 49751 + }, + { + "epoch": 0.4318712511176118, + "grad_norm": 0.1494140625, + "learning_rate": 0.0010627777861640208, + "loss": 0.0923, + "step": 49752 + }, + { + "epoch": 0.431879931597816, + "grad_norm": 0.1904296875, + "learning_rate": 0.0010627480491926795, + "loss": 0.0957, + "step": 49753 + }, + { + "epoch": 0.43188861207802015, + "grad_norm": 0.15625, + "learning_rate": 0.001062718312262076, + "loss": 0.1172, + "step": 49754 + }, + { + "epoch": 0.43189729255822434, + "grad_norm": 0.060791015625, + "learning_rate": 0.0010626885753722433, + "loss": 0.0674, + "step": 49755 + }, + { + "epoch": 0.4319059730384285, + "grad_norm": 0.32421875, + "learning_rate": 0.001062658838523214, + "loss": 0.0752, + "step": 49756 + }, + { + "epoch": 0.4319146535186327, + "grad_norm": 0.373046875, + "learning_rate": 0.0010626291017150204, + "loss": 0.0645, + "step": 49757 + }, + { + "epoch": 0.4319233339988368, + "grad_norm": 0.333984375, + "learning_rate": 0.001062599364947695, + "loss": 0.1123, + "step": 49758 + }, + { + "epoch": 0.431932014479041, + "grad_norm": 0.6484375, + "learning_rate": 0.00106256962822127, + "loss": 0.0859, + "step": 49759 + }, + { + "epoch": 0.43194069495924514, + "grad_norm": 0.80859375, + "learning_rate": 0.0010625398915357785, + "loss": 0.0664, + "step": 49760 + }, + { + "epoch": 0.43194937543944933, + "grad_norm": 0.12451171875, + "learning_rate": 0.0010625101548912528, + "loss": 0.082, + "step": 49761 + }, + { + "epoch": 0.43195805591965347, + "grad_norm": 0.1259765625, + "learning_rate": 0.0010624804182877256, + "loss": 0.0952, + "step": 49762 + }, + { + "epoch": 
0.43196673639985766, + "grad_norm": 0.2236328125, + "learning_rate": 0.0010624506817252294, + "loss": 0.0859, + "step": 49763 + }, + { + "epoch": 0.4319754168800618, + "grad_norm": 0.1181640625, + "learning_rate": 0.0010624209452037962, + "loss": 0.1187, + "step": 49764 + }, + { + "epoch": 0.431984097360266, + "grad_norm": 0.1982421875, + "learning_rate": 0.001062391208723459, + "loss": 0.0781, + "step": 49765 + }, + { + "epoch": 0.43199277784047013, + "grad_norm": 0.318359375, + "learning_rate": 0.0010623614722842503, + "loss": 0.0967, + "step": 49766 + }, + { + "epoch": 0.4320014583206743, + "grad_norm": 0.103515625, + "learning_rate": 0.0010623317358862026, + "loss": 0.0908, + "step": 49767 + }, + { + "epoch": 0.43201013880087846, + "grad_norm": 0.3359375, + "learning_rate": 0.0010623019995293478, + "loss": 0.0854, + "step": 49768 + }, + { + "epoch": 0.43201881928108266, + "grad_norm": 0.0693359375, + "learning_rate": 0.0010622722632137197, + "loss": 0.0913, + "step": 49769 + }, + { + "epoch": 0.4320274997612868, + "grad_norm": 0.455078125, + "learning_rate": 0.0010622425269393497, + "loss": 0.1074, + "step": 49770 + }, + { + "epoch": 0.432036180241491, + "grad_norm": 0.146484375, + "learning_rate": 0.001062212790706271, + "loss": 0.085, + "step": 49771 + }, + { + "epoch": 0.4320448607216951, + "grad_norm": 0.35546875, + "learning_rate": 0.0010621830545145155, + "loss": 0.0947, + "step": 49772 + }, + { + "epoch": 0.4320535412018993, + "grad_norm": 0.13671875, + "learning_rate": 0.0010621533183641158, + "loss": 0.0952, + "step": 49773 + }, + { + "epoch": 0.43206222168210345, + "grad_norm": 0.380859375, + "learning_rate": 0.0010621235822551051, + "loss": 0.1211, + "step": 49774 + }, + { + "epoch": 0.43207090216230765, + "grad_norm": 0.330078125, + "learning_rate": 0.0010620938461875155, + "loss": 0.0957, + "step": 49775 + }, + { + "epoch": 0.4320795826425118, + "grad_norm": 0.28515625, + "learning_rate": 0.001062064110161379, + "loss": 0.0845, + "step": 49776 + }, + { + "epoch": 0.4320882631227159, + "grad_norm": 0.251953125, + "learning_rate": 0.0010620343741767294, + "loss": 0.082, + "step": 49777 + }, + { + "epoch": 0.4320969436029201, + "grad_norm": 0.447265625, + "learning_rate": 0.001062004638233598, + "loss": 0.1055, + "step": 49778 + }, + { + "epoch": 0.43210562408312425, + "grad_norm": 0.421875, + "learning_rate": 0.0010619749023320177, + "loss": 0.126, + "step": 49779 + }, + { + "epoch": 0.43211430456332844, + "grad_norm": 0.26953125, + "learning_rate": 0.0010619451664720212, + "loss": 0.0894, + "step": 49780 + }, + { + "epoch": 0.4321229850435326, + "grad_norm": 0.2109375, + "learning_rate": 0.001061915430653641, + "loss": 0.0654, + "step": 49781 + }, + { + "epoch": 0.4321316655237368, + "grad_norm": 0.1953125, + "learning_rate": 0.0010618856948769093, + "loss": 0.1162, + "step": 49782 + }, + { + "epoch": 0.4321403460039409, + "grad_norm": 0.80078125, + "learning_rate": 0.001061855959141859, + "loss": 0.0898, + "step": 49783 + }, + { + "epoch": 0.4321490264841451, + "grad_norm": 0.11669921875, + "learning_rate": 0.0010618262234485226, + "loss": 0.082, + "step": 49784 + }, + { + "epoch": 0.43215770696434924, + "grad_norm": 0.36328125, + "learning_rate": 0.0010617964877969318, + "loss": 0.1006, + "step": 49785 + }, + { + "epoch": 0.43216638744455343, + "grad_norm": 0.3515625, + "learning_rate": 0.0010617667521871201, + "loss": 0.1543, + "step": 49786 + }, + { + "epoch": 0.43217506792475757, + "grad_norm": 0.2353515625, + "learning_rate": 0.00106173701661912, + "loss": 0.1309, + 
"step": 49787 + }, + { + "epoch": 0.43218374840496177, + "grad_norm": 0.4375, + "learning_rate": 0.0010617072810929639, + "loss": 0.0986, + "step": 49788 + }, + { + "epoch": 0.4321924288851659, + "grad_norm": 0.3359375, + "learning_rate": 0.0010616775456086835, + "loss": 0.1001, + "step": 49789 + }, + { + "epoch": 0.4322011093653701, + "grad_norm": 0.208984375, + "learning_rate": 0.0010616478101663125, + "loss": 0.106, + "step": 49790 + }, + { + "epoch": 0.43220978984557423, + "grad_norm": 0.19140625, + "learning_rate": 0.0010616180747658826, + "loss": 0.1211, + "step": 49791 + }, + { + "epoch": 0.4322184703257784, + "grad_norm": 0.57421875, + "learning_rate": 0.0010615883394074267, + "loss": 0.1211, + "step": 49792 + }, + { + "epoch": 0.43222715080598256, + "grad_norm": 0.1572265625, + "learning_rate": 0.001061558604090977, + "loss": 0.1191, + "step": 49793 + }, + { + "epoch": 0.43223583128618676, + "grad_norm": 0.46875, + "learning_rate": 0.0010615288688165662, + "loss": 0.0835, + "step": 49794 + }, + { + "epoch": 0.4322445117663909, + "grad_norm": 0.81640625, + "learning_rate": 0.001061499133584227, + "loss": 0.0874, + "step": 49795 + }, + { + "epoch": 0.4322531922465951, + "grad_norm": 0.33984375, + "learning_rate": 0.0010614693983939918, + "loss": 0.0903, + "step": 49796 + }, + { + "epoch": 0.4322618727267992, + "grad_norm": 0.1484375, + "learning_rate": 0.001061439663245893, + "loss": 0.0898, + "step": 49797 + }, + { + "epoch": 0.4322705532070034, + "grad_norm": 0.45703125, + "learning_rate": 0.0010614099281399637, + "loss": 0.0986, + "step": 49798 + }, + { + "epoch": 0.43227923368720755, + "grad_norm": 0.265625, + "learning_rate": 0.0010613801930762356, + "loss": 0.0918, + "step": 49799 + }, + { + "epoch": 0.43228791416741175, + "grad_norm": 0.232421875, + "learning_rate": 0.0010613504580547412, + "loss": 0.127, + "step": 49800 + }, + { + "epoch": 0.4322965946476159, + "grad_norm": 0.083984375, + "learning_rate": 0.0010613207230755135, + "loss": 0.0898, + "step": 49801 + }, + { + "epoch": 0.4323052751278201, + "grad_norm": 0.1640625, + "learning_rate": 0.0010612909881385847, + "loss": 0.0918, + "step": 49802 + }, + { + "epoch": 0.4323139556080242, + "grad_norm": 0.2490234375, + "learning_rate": 0.001061261253243988, + "loss": 0.0869, + "step": 49803 + }, + { + "epoch": 0.4323226360882284, + "grad_norm": 0.251953125, + "learning_rate": 0.0010612315183917552, + "loss": 0.1011, + "step": 49804 + }, + { + "epoch": 0.43233131656843254, + "grad_norm": 0.25390625, + "learning_rate": 0.001061201783581919, + "loss": 0.1094, + "step": 49805 + }, + { + "epoch": 0.43233999704863674, + "grad_norm": 0.11376953125, + "learning_rate": 0.001061172048814512, + "loss": 0.0947, + "step": 49806 + }, + { + "epoch": 0.4323486775288409, + "grad_norm": 0.055908203125, + "learning_rate": 0.0010611423140895666, + "loss": 0.084, + "step": 49807 + }, + { + "epoch": 0.43235735800904507, + "grad_norm": 0.2099609375, + "learning_rate": 0.0010611125794071157, + "loss": 0.0928, + "step": 49808 + }, + { + "epoch": 0.4323660384892492, + "grad_norm": 0.8828125, + "learning_rate": 0.0010610828447671912, + "loss": 0.1245, + "step": 49809 + }, + { + "epoch": 0.4323747189694534, + "grad_norm": 0.498046875, + "learning_rate": 0.0010610531101698257, + "loss": 0.082, + "step": 49810 + }, + { + "epoch": 0.43238339944965754, + "grad_norm": 0.2216796875, + "learning_rate": 0.0010610233756150522, + "loss": 0.1182, + "step": 49811 + }, + { + "epoch": 0.43239207992986173, + "grad_norm": 0.3125, + "learning_rate": 
0.0010609936411029028, + "loss": 0.1123, + "step": 49812 + }, + { + "epoch": 0.43240076041006587, + "grad_norm": 0.52734375, + "learning_rate": 0.0010609639066334103, + "loss": 0.1406, + "step": 49813 + }, + { + "epoch": 0.43240944089027006, + "grad_norm": 0.2060546875, + "learning_rate": 0.001060934172206607, + "loss": 0.0947, + "step": 49814 + }, + { + "epoch": 0.4324181213704742, + "grad_norm": 0.10400390625, + "learning_rate": 0.0010609044378225257, + "loss": 0.0728, + "step": 49815 + }, + { + "epoch": 0.4324268018506784, + "grad_norm": 0.291015625, + "learning_rate": 0.0010608747034811987, + "loss": 0.0908, + "step": 49816 + }, + { + "epoch": 0.4324354823308825, + "grad_norm": 0.46484375, + "learning_rate": 0.0010608449691826584, + "loss": 0.1543, + "step": 49817 + }, + { + "epoch": 0.4324441628110867, + "grad_norm": 0.1962890625, + "learning_rate": 0.0010608152349269376, + "loss": 0.1133, + "step": 49818 + }, + { + "epoch": 0.43245284329129086, + "grad_norm": 0.330078125, + "learning_rate": 0.0010607855007140686, + "loss": 0.1328, + "step": 49819 + }, + { + "epoch": 0.43246152377149505, + "grad_norm": 0.2412109375, + "learning_rate": 0.0010607557665440837, + "loss": 0.0996, + "step": 49820 + }, + { + "epoch": 0.4324702042516992, + "grad_norm": 0.73828125, + "learning_rate": 0.0010607260324170158, + "loss": 0.0884, + "step": 49821 + }, + { + "epoch": 0.4324788847319034, + "grad_norm": 0.10888671875, + "learning_rate": 0.0010606962983328973, + "loss": 0.1055, + "step": 49822 + }, + { + "epoch": 0.4324875652121075, + "grad_norm": 0.6640625, + "learning_rate": 0.001060666564291761, + "loss": 0.0654, + "step": 49823 + }, + { + "epoch": 0.4324962456923117, + "grad_norm": 0.40625, + "learning_rate": 0.0010606368302936392, + "loss": 0.1104, + "step": 49824 + }, + { + "epoch": 0.43250492617251585, + "grad_norm": 0.1123046875, + "learning_rate": 0.0010606070963385644, + "loss": 0.1416, + "step": 49825 + }, + { + "epoch": 0.43251360665272004, + "grad_norm": 0.140625, + "learning_rate": 0.0010605773624265688, + "loss": 0.0908, + "step": 49826 + }, + { + "epoch": 0.4325222871329242, + "grad_norm": 0.57421875, + "learning_rate": 0.0010605476285576853, + "loss": 0.1484, + "step": 49827 + }, + { + "epoch": 0.43253096761312837, + "grad_norm": 0.240234375, + "learning_rate": 0.0010605178947319463, + "loss": 0.0791, + "step": 49828 + }, + { + "epoch": 0.4325396480933325, + "grad_norm": 0.11865234375, + "learning_rate": 0.0010604881609493843, + "loss": 0.1113, + "step": 49829 + }, + { + "epoch": 0.4325483285735367, + "grad_norm": 0.2080078125, + "learning_rate": 0.001060458427210032, + "loss": 0.1006, + "step": 49830 + }, + { + "epoch": 0.43255700905374084, + "grad_norm": 0.1533203125, + "learning_rate": 0.0010604286935139218, + "loss": 0.0889, + "step": 49831 + }, + { + "epoch": 0.43256568953394503, + "grad_norm": 0.158203125, + "learning_rate": 0.001060398959861086, + "loss": 0.0801, + "step": 49832 + }, + { + "epoch": 0.43257437001414917, + "grad_norm": 0.08837890625, + "learning_rate": 0.0010603692262515574, + "loss": 0.0835, + "step": 49833 + }, + { + "epoch": 0.43258305049435336, + "grad_norm": 0.48046875, + "learning_rate": 0.0010603394926853684, + "loss": 0.0991, + "step": 49834 + }, + { + "epoch": 0.4325917309745575, + "grad_norm": 0.056884765625, + "learning_rate": 0.0010603097591625518, + "loss": 0.0884, + "step": 49835 + }, + { + "epoch": 0.4326004114547617, + "grad_norm": 0.259765625, + "learning_rate": 0.0010602800256831395, + "loss": 0.0835, + "step": 49836 + }, + { + "epoch": 
0.43260909193496583, + "grad_norm": 0.296875, + "learning_rate": 0.0010602502922471642, + "loss": 0.083, + "step": 49837 + }, + { + "epoch": 0.43261777241517, + "grad_norm": 0.1201171875, + "learning_rate": 0.0010602205588546587, + "loss": 0.0913, + "step": 49838 + }, + { + "epoch": 0.43262645289537416, + "grad_norm": 0.96875, + "learning_rate": 0.0010601908255056555, + "loss": 0.0811, + "step": 49839 + }, + { + "epoch": 0.43263513337557835, + "grad_norm": 0.1455078125, + "learning_rate": 0.0010601610922001874, + "loss": 0.0684, + "step": 49840 + }, + { + "epoch": 0.4326438138557825, + "grad_norm": 0.62109375, + "learning_rate": 0.001060131358938286, + "loss": 0.0938, + "step": 49841 + }, + { + "epoch": 0.4326524943359867, + "grad_norm": 0.3671875, + "learning_rate": 0.0010601016257199844, + "loss": 0.126, + "step": 49842 + }, + { + "epoch": 0.4326611748161908, + "grad_norm": 0.369140625, + "learning_rate": 0.0010600718925453155, + "loss": 0.0972, + "step": 49843 + }, + { + "epoch": 0.432669855296395, + "grad_norm": 0.21875, + "learning_rate": 0.001060042159414311, + "loss": 0.0991, + "step": 49844 + }, + { + "epoch": 0.43267853577659915, + "grad_norm": 0.11279296875, + "learning_rate": 0.0010600124263270036, + "loss": 0.1128, + "step": 49845 + }, + { + "epoch": 0.43268721625680334, + "grad_norm": 0.32421875, + "learning_rate": 0.0010599826932834264, + "loss": 0.1123, + "step": 49846 + }, + { + "epoch": 0.4326958967370075, + "grad_norm": 0.09716796875, + "learning_rate": 0.0010599529602836114, + "loss": 0.0767, + "step": 49847 + }, + { + "epoch": 0.43270457721721167, + "grad_norm": 0.474609375, + "learning_rate": 0.0010599232273275914, + "loss": 0.0884, + "step": 49848 + }, + { + "epoch": 0.4327132576974158, + "grad_norm": 0.08935546875, + "learning_rate": 0.0010598934944153987, + "loss": 0.1084, + "step": 49849 + }, + { + "epoch": 0.43272193817762, + "grad_norm": 0.1171875, + "learning_rate": 0.0010598637615470655, + "loss": 0.0649, + "step": 49850 + }, + { + "epoch": 0.43273061865782414, + "grad_norm": 0.322265625, + "learning_rate": 0.001059834028722625, + "loss": 0.0728, + "step": 49851 + }, + { + "epoch": 0.43273929913802833, + "grad_norm": 0.1142578125, + "learning_rate": 0.0010598042959421095, + "loss": 0.0664, + "step": 49852 + }, + { + "epoch": 0.43274797961823247, + "grad_norm": 0.1611328125, + "learning_rate": 0.0010597745632055513, + "loss": 0.1279, + "step": 49853 + }, + { + "epoch": 0.43275666009843666, + "grad_norm": 0.11376953125, + "learning_rate": 0.0010597448305129827, + "loss": 0.0908, + "step": 49854 + }, + { + "epoch": 0.4327653405786408, + "grad_norm": 0.498046875, + "learning_rate": 0.001059715097864437, + "loss": 0.0898, + "step": 49855 + }, + { + "epoch": 0.432774021058845, + "grad_norm": 0.11181640625, + "learning_rate": 0.001059685365259946, + "loss": 0.1025, + "step": 49856 + }, + { + "epoch": 0.43278270153904913, + "grad_norm": 0.1904296875, + "learning_rate": 0.0010596556326995428, + "loss": 0.083, + "step": 49857 + }, + { + "epoch": 0.4327913820192533, + "grad_norm": 0.2060546875, + "learning_rate": 0.0010596259001832596, + "loss": 0.0894, + "step": 49858 + }, + { + "epoch": 0.43280006249945746, + "grad_norm": 0.2431640625, + "learning_rate": 0.0010595961677111283, + "loss": 0.1104, + "step": 49859 + }, + { + "epoch": 0.43280874297966165, + "grad_norm": 0.2490234375, + "learning_rate": 0.0010595664352831826, + "loss": 0.1299, + "step": 49860 + }, + { + "epoch": 0.4328174234598658, + "grad_norm": 0.470703125, + "learning_rate": 0.0010595367028994542, + 
"loss": 0.1216, + "step": 49861 + }, + { + "epoch": 0.43282610394007, + "grad_norm": 0.072265625, + "learning_rate": 0.0010595069705599763, + "loss": 0.0703, + "step": 49862 + }, + { + "epoch": 0.4328347844202741, + "grad_norm": 0.5546875, + "learning_rate": 0.0010594772382647802, + "loss": 0.0723, + "step": 49863 + }, + { + "epoch": 0.4328434649004783, + "grad_norm": 0.07470703125, + "learning_rate": 0.0010594475060138994, + "loss": 0.0957, + "step": 49864 + }, + { + "epoch": 0.43285214538068245, + "grad_norm": 0.47265625, + "learning_rate": 0.0010594177738073665, + "loss": 0.1133, + "step": 49865 + }, + { + "epoch": 0.43286082586088664, + "grad_norm": 2.0625, + "learning_rate": 0.0010593880416452138, + "loss": 0.2949, + "step": 49866 + }, + { + "epoch": 0.4328695063410908, + "grad_norm": 0.19140625, + "learning_rate": 0.0010593583095274736, + "loss": 0.1006, + "step": 49867 + }, + { + "epoch": 0.432878186821295, + "grad_norm": 0.431640625, + "learning_rate": 0.0010593285774541782, + "loss": 0.123, + "step": 49868 + }, + { + "epoch": 0.4328868673014991, + "grad_norm": 0.44921875, + "learning_rate": 0.0010592988454253608, + "loss": 0.1079, + "step": 49869 + }, + { + "epoch": 0.4328955477817033, + "grad_norm": 0.2333984375, + "learning_rate": 0.0010592691134410535, + "loss": 0.0957, + "step": 49870 + }, + { + "epoch": 0.43290422826190744, + "grad_norm": 0.103515625, + "learning_rate": 0.0010592393815012887, + "loss": 0.0771, + "step": 49871 + }, + { + "epoch": 0.43291290874211164, + "grad_norm": 0.10498046875, + "learning_rate": 0.0010592096496060991, + "loss": 0.0762, + "step": 49872 + }, + { + "epoch": 0.4329215892223158, + "grad_norm": 0.2314453125, + "learning_rate": 0.0010591799177555176, + "loss": 0.1113, + "step": 49873 + }, + { + "epoch": 0.43293026970251997, + "grad_norm": 0.29296875, + "learning_rate": 0.0010591501859495763, + "loss": 0.0869, + "step": 49874 + }, + { + "epoch": 0.4329389501827241, + "grad_norm": 0.30078125, + "learning_rate": 0.0010591204541883074, + "loss": 0.0952, + "step": 49875 + }, + { + "epoch": 0.4329476306629283, + "grad_norm": 0.263671875, + "learning_rate": 0.0010590907224717438, + "loss": 0.1602, + "step": 49876 + }, + { + "epoch": 0.43295631114313243, + "grad_norm": 0.515625, + "learning_rate": 0.0010590609907999181, + "loss": 0.0903, + "step": 49877 + }, + { + "epoch": 0.4329649916233366, + "grad_norm": 0.396484375, + "learning_rate": 0.0010590312591728628, + "loss": 0.0859, + "step": 49878 + }, + { + "epoch": 0.43297367210354076, + "grad_norm": 0.173828125, + "learning_rate": 0.0010590015275906103, + "loss": 0.1172, + "step": 49879 + }, + { + "epoch": 0.43298235258374496, + "grad_norm": 0.1552734375, + "learning_rate": 0.0010589717960531927, + "loss": 0.0908, + "step": 49880 + }, + { + "epoch": 0.4329910330639491, + "grad_norm": 0.796875, + "learning_rate": 0.001058942064560643, + "loss": 0.1226, + "step": 49881 + }, + { + "epoch": 0.4329997135441533, + "grad_norm": 0.197265625, + "learning_rate": 0.001058912333112994, + "loss": 0.0664, + "step": 49882 + }, + { + "epoch": 0.4330083940243574, + "grad_norm": 0.1611328125, + "learning_rate": 0.0010588826017102777, + "loss": 0.083, + "step": 49883 + }, + { + "epoch": 0.4330170745045616, + "grad_norm": 0.1044921875, + "learning_rate": 0.001058852870352527, + "loss": 0.0938, + "step": 49884 + }, + { + "epoch": 0.43302575498476575, + "grad_norm": 0.349609375, + "learning_rate": 0.001058823139039774, + "loss": 0.1162, + "step": 49885 + }, + { + "epoch": 0.43303443546496995, + "grad_norm": 0.1826171875, + 
"learning_rate": 0.0010587934077720513, + "loss": 0.1089, + "step": 49886 + }, + { + "epoch": 0.4330431159451741, + "grad_norm": 0.7421875, + "learning_rate": 0.0010587636765493915, + "loss": 0.1089, + "step": 49887 + }, + { + "epoch": 0.4330517964253783, + "grad_norm": 0.10546875, + "learning_rate": 0.0010587339453718271, + "loss": 0.0815, + "step": 49888 + }, + { + "epoch": 0.4330604769055824, + "grad_norm": 0.515625, + "learning_rate": 0.0010587042142393907, + "loss": 0.0996, + "step": 49889 + }, + { + "epoch": 0.4330691573857866, + "grad_norm": 0.462890625, + "learning_rate": 0.0010586744831521146, + "loss": 0.1118, + "step": 49890 + }, + { + "epoch": 0.43307783786599074, + "grad_norm": 0.1708984375, + "learning_rate": 0.0010586447521100317, + "loss": 0.0962, + "step": 49891 + }, + { + "epoch": 0.43308651834619494, + "grad_norm": 0.166015625, + "learning_rate": 0.0010586150211131745, + "loss": 0.0728, + "step": 49892 + }, + { + "epoch": 0.4330951988263991, + "grad_norm": 0.328125, + "learning_rate": 0.0010585852901615748, + "loss": 0.0825, + "step": 49893 + }, + { + "epoch": 0.43310387930660327, + "grad_norm": 0.29296875, + "learning_rate": 0.001058555559255266, + "loss": 0.0845, + "step": 49894 + }, + { + "epoch": 0.4331125597868074, + "grad_norm": 0.146484375, + "learning_rate": 0.00105852582839428, + "loss": 0.083, + "step": 49895 + }, + { + "epoch": 0.4331212402670116, + "grad_norm": 0.119140625, + "learning_rate": 0.0010584960975786497, + "loss": 0.0884, + "step": 49896 + }, + { + "epoch": 0.43312992074721574, + "grad_norm": 0.103515625, + "learning_rate": 0.0010584663668084069, + "loss": 0.0588, + "step": 49897 + }, + { + "epoch": 0.43313860122741993, + "grad_norm": 0.2890625, + "learning_rate": 0.0010584366360835854, + "loss": 0.1035, + "step": 49898 + }, + { + "epoch": 0.43314728170762407, + "grad_norm": 0.1376953125, + "learning_rate": 0.0010584069054042162, + "loss": 0.0981, + "step": 49899 + }, + { + "epoch": 0.4331559621878282, + "grad_norm": 0.17578125, + "learning_rate": 0.0010583771747703333, + "loss": 0.1318, + "step": 49900 + }, + { + "epoch": 0.4331646426680324, + "grad_norm": 0.125, + "learning_rate": 0.0010583474441819682, + "loss": 0.1123, + "step": 49901 + }, + { + "epoch": 0.43317332314823653, + "grad_norm": 0.1611328125, + "learning_rate": 0.0010583177136391537, + "loss": 0.1147, + "step": 49902 + }, + { + "epoch": 0.4331820036284407, + "grad_norm": 0.38671875, + "learning_rate": 0.0010582879831419225, + "loss": 0.083, + "step": 49903 + }, + { + "epoch": 0.43319068410864486, + "grad_norm": 0.2314453125, + "learning_rate": 0.001058258252690307, + "loss": 0.0776, + "step": 49904 + }, + { + "epoch": 0.43319936458884906, + "grad_norm": 0.462890625, + "learning_rate": 0.0010582285222843393, + "loss": 0.0903, + "step": 49905 + }, + { + "epoch": 0.4332080450690532, + "grad_norm": 0.2578125, + "learning_rate": 0.0010581987919240522, + "loss": 0.0898, + "step": 49906 + }, + { + "epoch": 0.4332167255492574, + "grad_norm": 0.228515625, + "learning_rate": 0.0010581690616094784, + "loss": 0.0752, + "step": 49907 + }, + { + "epoch": 0.4332254060294615, + "grad_norm": 0.23046875, + "learning_rate": 0.0010581393313406505, + "loss": 0.0874, + "step": 49908 + }, + { + "epoch": 0.4332340865096657, + "grad_norm": 0.26171875, + "learning_rate": 0.0010581096011176004, + "loss": 0.0781, + "step": 49909 + }, + { + "epoch": 0.43324276698986985, + "grad_norm": 0.5234375, + "learning_rate": 0.0010580798709403615, + "loss": 0.0928, + "step": 49910 + }, + { + "epoch": 0.43325144747007405, + 
"grad_norm": 0.083984375, + "learning_rate": 0.0010580501408089654, + "loss": 0.1055, + "step": 49911 + }, + { + "epoch": 0.4332601279502782, + "grad_norm": 0.515625, + "learning_rate": 0.0010580204107234455, + "loss": 0.085, + "step": 49912 + }, + { + "epoch": 0.4332688084304824, + "grad_norm": 0.11181640625, + "learning_rate": 0.0010579906806838335, + "loss": 0.0869, + "step": 49913 + }, + { + "epoch": 0.4332774889106865, + "grad_norm": 0.41796875, + "learning_rate": 0.0010579609506901624, + "loss": 0.1143, + "step": 49914 + }, + { + "epoch": 0.4332861693908907, + "grad_norm": 0.1328125, + "learning_rate": 0.001057931220742464, + "loss": 0.1006, + "step": 49915 + }, + { + "epoch": 0.43329484987109484, + "grad_norm": 0.71875, + "learning_rate": 0.001057901490840772, + "loss": 0.1025, + "step": 49916 + }, + { + "epoch": 0.43330353035129904, + "grad_norm": 0.12060546875, + "learning_rate": 0.0010578717609851182, + "loss": 0.1279, + "step": 49917 + }, + { + "epoch": 0.4333122108315032, + "grad_norm": 0.376953125, + "learning_rate": 0.0010578420311755353, + "loss": 0.0811, + "step": 49918 + }, + { + "epoch": 0.43332089131170737, + "grad_norm": 0.17578125, + "learning_rate": 0.0010578123014120555, + "loss": 0.0957, + "step": 49919 + }, + { + "epoch": 0.4333295717919115, + "grad_norm": 0.197265625, + "learning_rate": 0.001057782571694712, + "loss": 0.1079, + "step": 49920 + }, + { + "epoch": 0.4333382522721157, + "grad_norm": 0.0810546875, + "learning_rate": 0.0010577528420235363, + "loss": 0.1348, + "step": 49921 + }, + { + "epoch": 0.43334693275231984, + "grad_norm": 0.482421875, + "learning_rate": 0.0010577231123985618, + "loss": 0.0879, + "step": 49922 + }, + { + "epoch": 0.43335561323252403, + "grad_norm": 0.1748046875, + "learning_rate": 0.00105769338281982, + "loss": 0.1084, + "step": 49923 + }, + { + "epoch": 0.43336429371272817, + "grad_norm": 0.1064453125, + "learning_rate": 0.0010576636532873447, + "loss": 0.1191, + "step": 49924 + }, + { + "epoch": 0.43337297419293236, + "grad_norm": 0.10498046875, + "learning_rate": 0.0010576339238011678, + "loss": 0.0967, + "step": 49925 + }, + { + "epoch": 0.4333816546731365, + "grad_norm": 0.25390625, + "learning_rate": 0.0010576041943613217, + "loss": 0.0981, + "step": 49926 + }, + { + "epoch": 0.4333903351533407, + "grad_norm": 0.07275390625, + "learning_rate": 0.001057574464967839, + "loss": 0.0884, + "step": 49927 + }, + { + "epoch": 0.4333990156335448, + "grad_norm": 0.140625, + "learning_rate": 0.0010575447356207521, + "loss": 0.0986, + "step": 49928 + }, + { + "epoch": 0.433407696113749, + "grad_norm": 0.107421875, + "learning_rate": 0.001057515006320094, + "loss": 0.0742, + "step": 49929 + }, + { + "epoch": 0.43341637659395316, + "grad_norm": 0.2109375, + "learning_rate": 0.0010574852770658966, + "loss": 0.1289, + "step": 49930 + }, + { + "epoch": 0.43342505707415735, + "grad_norm": 0.2275390625, + "learning_rate": 0.001057455547858193, + "loss": 0.0537, + "step": 49931 + }, + { + "epoch": 0.4334337375543615, + "grad_norm": 0.130859375, + "learning_rate": 0.0010574258186970146, + "loss": 0.0928, + "step": 49932 + }, + { + "epoch": 0.4334424180345657, + "grad_norm": 0.26953125, + "learning_rate": 0.001057396089582395, + "loss": 0.1162, + "step": 49933 + }, + { + "epoch": 0.4334510985147698, + "grad_norm": 0.1435546875, + "learning_rate": 0.001057366360514367, + "loss": 0.0947, + "step": 49934 + }, + { + "epoch": 0.433459778994974, + "grad_norm": 0.67578125, + "learning_rate": 0.0010573366314929617, + "loss": 0.1094, + "step": 49935 + }, + 
{ + "epoch": 0.43346845947517815, + "grad_norm": 0.09326171875, + "learning_rate": 0.001057306902518213, + "loss": 0.0962, + "step": 49936 + }, + { + "epoch": 0.43347713995538234, + "grad_norm": 0.486328125, + "learning_rate": 0.0010572771735901526, + "loss": 0.1504, + "step": 49937 + }, + { + "epoch": 0.4334858204355865, + "grad_norm": 0.380859375, + "learning_rate": 0.0010572474447088133, + "loss": 0.0903, + "step": 49938 + }, + { + "epoch": 0.43349450091579067, + "grad_norm": 0.1357421875, + "learning_rate": 0.0010572177158742276, + "loss": 0.0811, + "step": 49939 + }, + { + "epoch": 0.4335031813959948, + "grad_norm": 0.0859375, + "learning_rate": 0.0010571879870864278, + "loss": 0.0879, + "step": 49940 + }, + { + "epoch": 0.433511861876199, + "grad_norm": 0.1435546875, + "learning_rate": 0.0010571582583454463, + "loss": 0.0957, + "step": 49941 + }, + { + "epoch": 0.43352054235640314, + "grad_norm": 0.380859375, + "learning_rate": 0.0010571285296513161, + "loss": 0.1338, + "step": 49942 + }, + { + "epoch": 0.43352922283660733, + "grad_norm": 0.435546875, + "learning_rate": 0.0010570988010040697, + "loss": 0.1138, + "step": 49943 + }, + { + "epoch": 0.43353790331681147, + "grad_norm": 0.400390625, + "learning_rate": 0.0010570690724037396, + "loss": 0.1367, + "step": 49944 + }, + { + "epoch": 0.43354658379701566, + "grad_norm": 0.1015625, + "learning_rate": 0.0010570393438503578, + "loss": 0.0962, + "step": 49945 + }, + { + "epoch": 0.4335552642772198, + "grad_norm": 0.36328125, + "learning_rate": 0.001057009615343957, + "loss": 0.1089, + "step": 49946 + }, + { + "epoch": 0.433563944757424, + "grad_norm": 0.1494140625, + "learning_rate": 0.0010569798868845701, + "loss": 0.0986, + "step": 49947 + }, + { + "epoch": 0.43357262523762813, + "grad_norm": 0.1884765625, + "learning_rate": 0.0010569501584722291, + "loss": 0.1045, + "step": 49948 + }, + { + "epoch": 0.4335813057178323, + "grad_norm": 0.435546875, + "learning_rate": 0.0010569204301069668, + "loss": 0.0898, + "step": 49949 + }, + { + "epoch": 0.43358998619803646, + "grad_norm": 0.341796875, + "learning_rate": 0.0010568907017888159, + "loss": 0.1064, + "step": 49950 + }, + { + "epoch": 0.43359866667824065, + "grad_norm": 0.330078125, + "learning_rate": 0.0010568609735178084, + "loss": 0.1167, + "step": 49951 + }, + { + "epoch": 0.4336073471584448, + "grad_norm": 0.1201171875, + "learning_rate": 0.0010568312452939775, + "loss": 0.1025, + "step": 49952 + }, + { + "epoch": 0.433616027638649, + "grad_norm": 0.1474609375, + "learning_rate": 0.001056801517117355, + "loss": 0.0874, + "step": 49953 + }, + { + "epoch": 0.4336247081188531, + "grad_norm": 0.8828125, + "learning_rate": 0.0010567717889879737, + "loss": 0.1367, + "step": 49954 + }, + { + "epoch": 0.4336333885990573, + "grad_norm": 0.5, + "learning_rate": 0.0010567420609058661, + "loss": 0.1406, + "step": 49955 + }, + { + "epoch": 0.43364206907926145, + "grad_norm": 0.2080078125, + "learning_rate": 0.0010567123328710651, + "loss": 0.0894, + "step": 49956 + }, + { + "epoch": 0.43365074955946564, + "grad_norm": 0.1953125, + "learning_rate": 0.0010566826048836024, + "loss": 0.1338, + "step": 49957 + }, + { + "epoch": 0.4336594300396698, + "grad_norm": 0.5390625, + "learning_rate": 0.0010566528769435108, + "loss": 0.0859, + "step": 49958 + }, + { + "epoch": 0.433668110519874, + "grad_norm": 0.078125, + "learning_rate": 0.0010566231490508234, + "loss": 0.1133, + "step": 49959 + }, + { + "epoch": 0.4336767910000781, + "grad_norm": 0.1689453125, + "learning_rate": 0.0010565934212055724, + 
"loss": 0.1104, + "step": 49960 + }, + { + "epoch": 0.4336854714802823, + "grad_norm": 0.1953125, + "learning_rate": 0.00105656369340779, + "loss": 0.1035, + "step": 49961 + }, + { + "epoch": 0.43369415196048644, + "grad_norm": 0.275390625, + "learning_rate": 0.0010565339656575087, + "loss": 0.126, + "step": 49962 + }, + { + "epoch": 0.43370283244069063, + "grad_norm": 0.27734375, + "learning_rate": 0.0010565042379547614, + "loss": 0.1045, + "step": 49963 + }, + { + "epoch": 0.43371151292089477, + "grad_norm": 0.1552734375, + "learning_rate": 0.00105647451029958, + "loss": 0.126, + "step": 49964 + }, + { + "epoch": 0.43372019340109896, + "grad_norm": 0.515625, + "learning_rate": 0.0010564447826919982, + "loss": 0.168, + "step": 49965 + }, + { + "epoch": 0.4337288738813031, + "grad_norm": 0.203125, + "learning_rate": 0.0010564150551320471, + "loss": 0.1602, + "step": 49966 + }, + { + "epoch": 0.4337375543615073, + "grad_norm": 0.306640625, + "learning_rate": 0.0010563853276197598, + "loss": 0.1592, + "step": 49967 + }, + { + "epoch": 0.43374623484171143, + "grad_norm": 0.08349609375, + "learning_rate": 0.001056355600155169, + "loss": 0.0835, + "step": 49968 + }, + { + "epoch": 0.4337549153219156, + "grad_norm": 0.1279296875, + "learning_rate": 0.0010563258727383073, + "loss": 0.0947, + "step": 49969 + }, + { + "epoch": 0.43376359580211976, + "grad_norm": 0.6328125, + "learning_rate": 0.0010562961453692069, + "loss": 0.2139, + "step": 49970 + }, + { + "epoch": 0.43377227628232395, + "grad_norm": 0.1533203125, + "learning_rate": 0.0010562664180479005, + "loss": 0.1377, + "step": 49971 + }, + { + "epoch": 0.4337809567625281, + "grad_norm": 0.212890625, + "learning_rate": 0.00105623669077442, + "loss": 0.1113, + "step": 49972 + }, + { + "epoch": 0.4337896372427323, + "grad_norm": 0.66796875, + "learning_rate": 0.001056206963548799, + "loss": 0.1152, + "step": 49973 + }, + { + "epoch": 0.4337983177229364, + "grad_norm": 0.2177734375, + "learning_rate": 0.001056177236371069, + "loss": 0.1318, + "step": 49974 + }, + { + "epoch": 0.4338069982031406, + "grad_norm": 0.1298828125, + "learning_rate": 0.0010561475092412628, + "loss": 0.0698, + "step": 49975 + }, + { + "epoch": 0.43381567868334475, + "grad_norm": 0.283203125, + "learning_rate": 0.001056117782159413, + "loss": 0.126, + "step": 49976 + }, + { + "epoch": 0.43382435916354894, + "grad_norm": 0.09521484375, + "learning_rate": 0.0010560880551255525, + "loss": 0.1245, + "step": 49977 + }, + { + "epoch": 0.4338330396437531, + "grad_norm": 0.384765625, + "learning_rate": 0.0010560583281397134, + "loss": 0.1094, + "step": 49978 + }, + { + "epoch": 0.4338417201239573, + "grad_norm": 0.21484375, + "learning_rate": 0.0010560286012019282, + "loss": 0.0786, + "step": 49979 + }, + { + "epoch": 0.4338504006041614, + "grad_norm": 0.412109375, + "learning_rate": 0.0010559988743122292, + "loss": 0.1162, + "step": 49980 + }, + { + "epoch": 0.4338590810843656, + "grad_norm": 0.072265625, + "learning_rate": 0.0010559691474706495, + "loss": 0.0786, + "step": 49981 + }, + { + "epoch": 0.43386776156456974, + "grad_norm": 0.453125, + "learning_rate": 0.0010559394206772215, + "loss": 0.1123, + "step": 49982 + }, + { + "epoch": 0.43387644204477394, + "grad_norm": 0.08642578125, + "learning_rate": 0.001055909693931977, + "loss": 0.054, + "step": 49983 + }, + { + "epoch": 0.4338851225249781, + "grad_norm": 0.287109375, + "learning_rate": 0.0010558799672349488, + "loss": 0.0918, + "step": 49984 + }, + { + "epoch": 0.43389380300518227, + "grad_norm": 0.39453125, + 
"learning_rate": 0.00105585024058617, + "loss": 0.1064, + "step": 49985 + }, + { + "epoch": 0.4339024834853864, + "grad_norm": 0.93359375, + "learning_rate": 0.0010558205139856727, + "loss": 0.0991, + "step": 49986 + }, + { + "epoch": 0.4339111639655906, + "grad_norm": 0.12255859375, + "learning_rate": 0.0010557907874334893, + "loss": 0.085, + "step": 49987 + }, + { + "epoch": 0.43391984444579473, + "grad_norm": 0.07763671875, + "learning_rate": 0.0010557610609296527, + "loss": 0.0771, + "step": 49988 + }, + { + "epoch": 0.4339285249259989, + "grad_norm": 0.09765625, + "learning_rate": 0.001055731334474195, + "loss": 0.1021, + "step": 49989 + }, + { + "epoch": 0.43393720540620306, + "grad_norm": 0.9609375, + "learning_rate": 0.001055701608067149, + "loss": 0.1807, + "step": 49990 + }, + { + "epoch": 0.43394588588640726, + "grad_norm": 1.1484375, + "learning_rate": 0.0010556718817085469, + "loss": 0.1123, + "step": 49991 + }, + { + "epoch": 0.4339545663666114, + "grad_norm": 0.248046875, + "learning_rate": 0.0010556421553984208, + "loss": 0.0737, + "step": 49992 + }, + { + "epoch": 0.4339632468468156, + "grad_norm": 0.353515625, + "learning_rate": 0.0010556124291368043, + "loss": 0.0889, + "step": 49993 + }, + { + "epoch": 0.4339719273270197, + "grad_norm": 1.1171875, + "learning_rate": 0.0010555827029237294, + "loss": 0.1445, + "step": 49994 + }, + { + "epoch": 0.4339806078072239, + "grad_norm": 0.125, + "learning_rate": 0.0010555529767592286, + "loss": 0.0854, + "step": 49995 + }, + { + "epoch": 0.43398928828742805, + "grad_norm": 0.69140625, + "learning_rate": 0.0010555232506433343, + "loss": 0.1157, + "step": 49996 + }, + { + "epoch": 0.43399796876763225, + "grad_norm": 0.1455078125, + "learning_rate": 0.0010554935245760794, + "loss": 0.1064, + "step": 49997 + }, + { + "epoch": 0.4340066492478364, + "grad_norm": 0.294921875, + "learning_rate": 0.001055463798557496, + "loss": 0.0938, + "step": 49998 + }, + { + "epoch": 0.4340153297280406, + "grad_norm": 0.1435546875, + "learning_rate": 0.0010554340725876165, + "loss": 0.1113, + "step": 49999 + }, + { + "epoch": 0.4340240102082447, + "grad_norm": 0.5234375, + "learning_rate": 0.0010554043466664738, + "loss": 0.0613, + "step": 50000 + } + ], + "logging_steps": 1, + "max_steps": 96000, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 5000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.1221338733472563e+18, + "train_batch_size": 12, + "trial_name": null, + "trial_params": null +}